Merge branch 'linux-4.7' of git://github.com/skeggsb/linux into drm-fixes
author Dave Airlie <airlied@redhat.com>
Wed, 15 Jun 2016 06:58:32 +0000 (16:58 +1000)
committer Dave Airlie <airlied@redhat.com>
Wed, 15 Jun 2016 06:58:32 +0000 (16:58 +1000)
* 'linux-4.7' of git://github.com/skeggsb/linux:
  drm/nouveau/iccsense: fix memory leak
  drm/nouveau/Revert "drm/nouveau/device/pci: set as non-CPU-coherent on ARM64"

1518 files changed:
Documentation/ABI/stable/sysfs-class-ubi
Documentation/DocBook/device-drivers.tmpl
Documentation/DocBook/gpu.tmpl
Documentation/arm64/silicon-errata.txt
Documentation/devicetree/bindings/bus/ti-gpmc.txt [deleted file]
Documentation/devicetree/bindings/display/imx/ldb.txt
Documentation/devicetree/bindings/gpio/microchip,pic32-gpio.txt
Documentation/devicetree/bindings/memory-controllers/omap-gpmc.txt [new file with mode: 0644]
Documentation/devicetree/bindings/mips/cpu_irq.txt
Documentation/devicetree/bindings/mmc/microchip,sdhci-pic32.txt
Documentation/devicetree/bindings/mtd/atmel-nand.txt
Documentation/devicetree/bindings/mtd/brcm,brcmnand.txt
Documentation/devicetree/bindings/mtd/gpmc-nand.txt
Documentation/devicetree/bindings/mtd/nand.txt
Documentation/devicetree/bindings/pinctrl/microchip,pic32-pinctrl.txt
Documentation/devicetree/bindings/serial/microchip,pic32-uart.txt
Documentation/devicetree/bindings/sound/max98371.txt [new file with mode: 0644]
Documentation/devicetree/bindings/sound/mt8173-rt5650-rt5676.txt
Documentation/devicetree/bindings/sound/mt8173-rt5650.txt
Documentation/devicetree/bindings/sound/st,sti-asoc-card.txt
Documentation/devicetree/bindings/sound/tas571x.txt
Documentation/devicetree/bindings/sound/tas5720.txt [new file with mode: 0644]
Documentation/devicetree/bindings/spi/microchip,spi-pic32.txt [new file with mode: 0644]
Documentation/devicetree/bindings/spi/spi-fsl-dspi.txt
Documentation/devicetree/bindings/spi/sqi-pic32.txt [new file with mode: 0644]
Documentation/devicetree/bindings/thermal/nvidia,tegra124-soctherm.txt
Documentation/devicetree/bindings/thermal/rcar-thermal.txt
Documentation/devicetree/bindings/thermal/tango-thermal.txt [new file with mode: 0644]
Documentation/devicetree/bindings/thermal/thermal-generic-adc.txt [new file with mode: 0644]
Documentation/devicetree/bindings/watchdog/fsl-imx-wdt.txt
Documentation/devicetree/bindings/watchdog/microchip,pic32-dmt.txt
Documentation/devicetree/bindings/watchdog/microchip,pic32-wdt.txt
Documentation/devicetree/bindings/watchdog/renesas-wdt.txt [new file with mode: 0644]
Documentation/filesystems/dax.txt
Documentation/filesystems/devpts.txt
Documentation/filesystems/directory-locking
Documentation/filesystems/overlayfs.txt
Documentation/filesystems/porting
Documentation/hwmon/max34440
Documentation/infiniband/sysfs.txt
Documentation/kbuild/kconfig-language.txt
Documentation/kdump/gdbmacros.txt
Documentation/networking/dsa/dsa.txt
Documentation/networking/ip-sysctl.txt
Documentation/pwm.txt
Documentation/scsi/tcm_qla2xxx.txt [new file with mode: 0644]
Documentation/security/keys.txt
Documentation/sysctl/kernel.txt
Documentation/target/tcm_mod_builder.py
Documentation/thermal/sysfs-api.txt
Documentation/watchdog/hpwdt.txt
Documentation/watchdog/watchdog-parameters.txt
MAINTAINERS
Makefile
arch/Kconfig
arch/arc/include/uapi/asm/unistd.h
arch/arc/kernel/perf_event.c
arch/arm/boot/Makefile
arch/arm/boot/bootp/Makefile
arch/arm/boot/dts/Makefile
arch/arm/boot/dts/exynos3250-monk.dts
arch/arm/boot/dts/exynos3250-rinato.dts
arch/arm/boot/dts/exynos3250.dtsi
arch/arm/boot/dts/exynos4210.dtsi
arch/arm/boot/dts/exynos4412-odroid-common.dtsi
arch/arm/boot/dts/exynos4412-ppmu-common.dtsi [new file with mode: 0644]
arch/arm/boot/dts/exynos4412-trats2.dts
arch/arm/boot/dts/exynos4x12.dtsi
arch/arm/boot/dts/exynos5420.dtsi
arch/arm/boot/dts/exynos5422-odroidxu3-common.dtsi
arch/arm/boot/dts/imx7d-nitrogen7.dts [new file with mode: 0644]
arch/arm/boot/dts/imx7d.dtsi
arch/arm/boot/dts/r8a7779.dtsi
arch/arm/boot/dts/r8a7790.dtsi
arch/arm/boot/dts/r8a7791.dtsi
arch/arm/boot/dts/r8a7793.dtsi
arch/arm/boot/dts/r8a7794.dtsi
arch/arm/boot/dts/tegra124-jetson-tk1.dts
arch/arm/boot/dts/tegra124-nyan.dtsi
arch/arm/boot/dts/tegra124-venice2.dts
arch/arm/boot/dts/tegra124.dtsi
arch/arm/boot/dts/vf-colibri-eval-v3.dtsi
arch/arm/boot/dts/vf-colibri.dtsi
arch/arm/boot/dts/vfxxx.dtsi
arch/arm/include/asm/kvm_host.h
arch/arm/include/asm/kvm_mmio.h
arch/arm/kernel/perf_callchain.c
arch/arm/kernel/ptrace.c
arch/arm/kvm/Kconfig
arch/arm/kvm/Makefile
arch/arm/kvm/arm.c
arch/arm/kvm/mmio.c
arch/arm/mach-lpc32xx/Makefile
arch/arm/mach-lpc32xx/include/mach/irqs.h
arch/arm/mach-lpc32xx/irq.c [deleted file]
arch/arm/mach-omap2/gpmc-nand.c
arch/arm/mach-pxa/Kconfig
arch/arm/mach-pxa/eseries.c
arch/arm/mach-pxa/spitz.c
arch/arm/mach-s3c24xx/mach-rx1950.c
arch/arm/vdso/Makefile
arch/arm64/Kconfig
arch/arm64/Kconfig.debug
arch/arm64/Makefile
arch/arm64/boot/dts/renesas/r8a7795.dtsi
arch/arm64/configs/defconfig
arch/arm64/include/asm/elf.h
arch/arm64/include/asm/kvm_host.h
arch/arm64/include/asm/kvm_mmio.h
arch/arm64/include/asm/memory.h
arch/arm64/include/asm/page.h
arch/arm64/include/asm/uaccess.h
arch/arm64/include/asm/unistd.h
arch/arm64/include/asm/unistd32.h
arch/arm64/include/uapi/asm/unistd.h
arch/arm64/kernel/cpuinfo.c
arch/arm64/kernel/perf_callchain.c
arch/arm64/kernel/traps.c
arch/arm64/kvm/Kconfig
arch/arm64/kvm/Makefile
arch/arm64/kvm/hyp/vgic-v3-sr.c
arch/arm64/kvm/inject_fault.c
arch/arm64/kvm/sys_regs.c
arch/arm64/mm/dump.c
arch/arm64/mm/hugetlbpage.c
arch/c6x/include/uapi/asm/unistd.h
arch/cris/arch-v32/drivers/mach-a3/nandflash.c
arch/cris/arch-v32/drivers/mach-fs/nandflash.c
arch/h8300/Kconfig
arch/h8300/boot/compressed/Makefile
arch/h8300/include/asm/hash.h [new file with mode: 0644]
arch/h8300/include/uapi/asm/unistd.h
arch/hexagon/include/uapi/asm/unistd.h
arch/ia64/Makefile
arch/m32r/boot/compressed/Makefile
arch/m68k/Kconfig.cpu
arch/m68k/include/asm/hash.h [new file with mode: 0644]
arch/metag/include/uapi/asm/unistd.h
arch/metag/kernel/perf_callchain.c
arch/microblaze/Kconfig
arch/microblaze/include/asm/hash.h [new file with mode: 0644]
arch/microblaze/include/asm/unistd.h
arch/microblaze/include/uapi/asm/unistd.h
arch/microblaze/kernel/syscall_table.S
arch/microblaze/pci/pci-common.c
arch/mips/Kconfig
arch/mips/boot/dts/ingenic/jz4740.dtsi
arch/mips/boot/dts/ralink/mt7620a.dtsi
arch/mips/boot/dts/ralink/rt2880.dtsi
arch/mips/boot/dts/ralink/rt3050.dtsi
arch/mips/boot/dts/ralink/rt3883.dtsi
arch/mips/boot/dts/xilfpga/nexys4ddr.dts
arch/mips/cavium-octeon/smp.c
arch/mips/include/asm/asmmacro.h
arch/mips/include/asm/hazards.h
arch/mips/include/asm/mach-au1x00/au1xxx_dbdma.h
arch/mips/include/asm/mach-au1x00/gpio-au1300.h
arch/mips/include/asm/mach-bcm63xx/bcm63xx_dev_enet.h
arch/mips/include/asm/mach-ip27/dma-coherence.h
arch/mips/include/asm/mach-ip32/dma-coherence.h
arch/mips/include/asm/mach-jz4740/jz4740_nand.h
arch/mips/include/asm/mach-lantiq/falcon/lantiq_soc.h
arch/mips/include/asm/mach-lantiq/xway/lantiq_soc.h
arch/mips/include/asm/mach-loongson64/loongson_hwmon.h
arch/mips/include/asm/mach-malta/kernel-entry-init.h
arch/mips/include/asm/mips_mt.h
arch/mips/include/asm/mipsregs.h
arch/mips/include/asm/msa.h
arch/mips/include/asm/octeon/cvmx-cmd-queue.h
arch/mips/include/asm/octeon/cvmx-helper-board.h
arch/mips/include/asm/octeon/cvmx-ipd.h
arch/mips/include/asm/octeon/cvmx-pow.h
arch/mips/include/asm/sgi/hpc3.h
arch/mips/jz4740/board-qi_lb60.c
arch/mips/kernel/branch.c
arch/mips/kernel/cps-vec.S
arch/mips/kernel/cpu-probe.c
arch/mips/kernel/elf.c
arch/mips/kernel/irq.c
arch/mips/kernel/mips-r2-to-r6-emul.c
arch/mips/kernel/perf_event.c
arch/mips/kernel/process.c
arch/mips/kernel/signal.c
arch/mips/kernel/smp-cps.c
arch/mips/lasat/picvue_proc.c
arch/mips/lib/ashldi3.c
arch/mips/lib/ashrdi3.c
arch/mips/lib/bswapdi.c
arch/mips/lib/bswapsi.c
arch/mips/lib/cmpdi2.c
arch/mips/lib/lshrdi3.c
arch/mips/lib/memcpy.S
arch/mips/lib/ucmpdi2.c
arch/mips/loongson64/loongson-3/hpet.c
arch/mips/math-emu/dsemul.c
arch/mips/mm/tlbex.c
arch/mips/oprofile/op_impl.h
arch/mips/pci/ops-bridge.c
arch/mips/pistachio/init.c
arch/mips/ralink/mt7620.c
arch/mips/sgi-ip27/ip27-hubio.c
arch/mips/sgi-ip27/ip27-nmi.c
arch/mips/sgi-ip27/ip27-xtalk.c
arch/mips/sni/rm200.c
arch/mips/vdso/Makefile
arch/mips/vr41xx/common/cmu.c
arch/mn10300/boot/compressed/Makefile
arch/nios2/Makefile
arch/nios2/boot/compressed/Makefile
arch/nios2/include/uapi/asm/unistd.h
arch/openrisc/include/uapi/asm/unistd.h
arch/parisc/Kconfig
arch/parisc/include/asm/cmpxchg.h
arch/parisc/include/asm/eisa_eeprom.h
arch/parisc/include/asm/ftrace.h
arch/parisc/include/asm/futex.h
arch/parisc/include/asm/ldcw.h
arch/parisc/include/asm/syscall.h
arch/parisc/include/asm/thread_info.h
arch/parisc/include/asm/traps.h
arch/parisc/include/asm/uaccess.h
arch/parisc/include/uapi/asm/pdc.h
arch/parisc/include/uapi/asm/ptrace.h
arch/parisc/include/uapi/asm/unistd.h
arch/parisc/kernel/entry.S
arch/parisc/kernel/ftrace.c
arch/parisc/kernel/processor.c
arch/parisc/kernel/ptrace.c
arch/parisc/kernel/syscall.S
arch/parisc/kernel/time.c
arch/parisc/kernel/unaligned.c
arch/parisc/kernel/unwind.c
arch/parisc/lib/bitops.c
arch/parisc/math-emu/fpudispatch.c
arch/powerpc/include/asm/reg.h
arch/powerpc/kernel/prom_init.c
arch/powerpc/mm/hash_utils_64.c
arch/powerpc/mm/pgtable-book3s64.c
arch/powerpc/mm/pgtable-radix.c
arch/powerpc/perf/callchain.c
arch/powerpc/platforms/pseries/eeh_pseries.c
arch/powerpc/sysdev/axonram.c
arch/s390/boot/compressed/Makefile
arch/s390/configs/default_defconfig
arch/s390/configs/gcov_defconfig
arch/s390/configs/performance_defconfig
arch/s390/configs/zfcpdump_defconfig
arch/s390/defconfig
arch/s390/kernel/perf_event.c
arch/s390/mm/fault.c
arch/s390/net/bpf_jit.h
arch/s390/net/bpf_jit_comp.c
arch/score/include/uapi/asm/unistd.h
arch/sh/boot/compressed/Makefile
arch/sh/boot/romimage/Makefile
arch/sh/kernel/perf_callchain.c
arch/sparc/include/asm/head_64.h
arch/sparc/include/asm/ttable.h
arch/sparc/kernel/Makefile
arch/sparc/kernel/perf_event.c
arch/sparc/kernel/rtrap_64.S
arch/sparc/kernel/signal32.c
arch/sparc/kernel/signal_32.c
arch/sparc/kernel/signal_64.c
arch/sparc/kernel/sigutil_32.c
arch/sparc/kernel/sigutil_64.c
arch/sparc/kernel/urtt_fill.S [new file with mode: 0644]
arch/sparc/mm/init_64.c
arch/tile/include/uapi/asm/unistd.h
arch/tile/kernel/perf_event.c
arch/um/include/shared/registers.h
arch/um/kernel/process.c
arch/um/os-Linux/signal.c
arch/unicore32/boot/Makefile
arch/unicore32/boot/compressed/Makefile
arch/unicore32/include/uapi/asm/unistd.h
arch/x86/boot/compressed/Makefile
arch/x86/entry/thunk_64.S
arch/x86/entry/vdso/Makefile
arch/x86/events/core.c
arch/x86/events/intel/p4.c
arch/x86/events/intel/uncore.c
arch/x86/ia32/ia32_aout.c
arch/x86/include/asm/bugs.h
arch/x86/include/asm/cpufeature.h
arch/x86/include/asm/disabled-features.h
arch/x86/include/asm/intel_telemetry.h
arch/x86/include/asm/pmc_core.h [new file with mode: 0644]
arch/x86/include/uapi/asm/svm.h
arch/x86/kernel/cpu/common.c
arch/x86/kernel/cpu/intel.c
arch/x86/kernel/process_64.c
arch/x86/kernel/ptrace.c
arch/x86/kernel/tsc_msr.c
arch/x86/kvm/cpuid.c
arch/x86/kvm/mmu.c
arch/x86/kvm/svm.c
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
arch/x86/mm/fault.c
arch/x86/pci/xen.c
arch/x86/platform/efi/efi_stub_64.S
arch/x86/purgatory/Makefile
arch/x86/realmode/rm/Makefile
arch/x86/um/os-Linux/registers.c
arch/x86/um/ptrace_32.c
arch/x86/um/ptrace_64.c
arch/x86/um/shared/sysdep/ptrace_64.h
arch/x86/um/signal.c
arch/x86/um/user-offsets.c
arch/x86/xen/setup.c
arch/x86/xen/time.c
arch/xtensa/kernel/perf_event.c
block/blk-mq.c
block/ioctl.c
crypto/asymmetric_keys/Kconfig
drivers/acpi/acpi_dbg.c
drivers/acpi/acpi_processor.c
drivers/acpi/acpi_video.c
drivers/acpi/acpica/hwregs.c
drivers/acpi/battery.c
drivers/acpi/device_pm.c
drivers/acpi/processor_throttling.c
drivers/ata/sata_highbank.c
drivers/atm/firestream.c
drivers/atm/iphase.c
drivers/base/power/main.c
drivers/bcma/driver_chipcommon_sflash.c
drivers/block/brd.c
drivers/block/rbd.c
drivers/clk/clk-pwm.c
drivers/clk/tegra/clk-tegra210.c
drivers/cpufreq/cpufreq.c
drivers/cpufreq/intel_pstate.c
drivers/cpufreq/mt8173-cpufreq.c
drivers/cpufreq/omap-cpufreq.c
drivers/cpuidle/cpuidle.c
drivers/crypto/caam/ctrl.c
drivers/crypto/ccp/ccp-crypto-aes-xts.c
drivers/crypto/omap-sham.c
drivers/dma-buf/dma-buf.c
drivers/dma-buf/reservation.c
drivers/dma/sun4i-dma.c
drivers/gpio/gpio-lpc32xx.c
drivers/gpio/gpio-xlp.c
drivers/gpio/gpiolib.c
drivers/gpu/drm/Makefile
drivers/gpu/drm/amd/acp/Kconfig
drivers/gpu/drm/amd/amdgpu/amdgpu.h
drivers/gpu/drm/amd/amdgpu/amdgpu_cgs.c
drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
drivers/gpu/drm/amd/amdgpu/amdgpu_powerplay.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
drivers/gpu/drm/amd/amdgpu/amdgpu_sa.c
drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
drivers/gpu/drm/amd/amdgpu/ci_dpm.c
drivers/gpu/drm/amd/amdgpu/cik_ih.c
drivers/gpu/drm/amd/amdgpu/cik_sdma.c
drivers/gpu/drm/amd/amdgpu/cz_dpm.c
drivers/gpu/drm/amd/amdgpu/cz_ih.c
drivers/gpu/drm/amd/amdgpu/dce_v11_0.c
drivers/gpu/drm/amd/amdgpu/fiji_dpm.c
drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
drivers/gpu/drm/amd/amdgpu/iceland_dpm.c
drivers/gpu/drm/amd/amdgpu/iceland_ih.c
drivers/gpu/drm/amd/amdgpu/kv_dpm.c
drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
drivers/gpu/drm/amd/amdgpu/tonga_dpm.c
drivers/gpu/drm/amd/amdgpu/tonga_ih.c
drivers/gpu/drm/amd/amdkfd/kfd_process.c
drivers/gpu/drm/amd/amdkfd/kfd_topology.c
drivers/gpu/drm/amd/include/amd_shared.h
drivers/gpu/drm/amd/include/cgs_common.h
drivers/gpu/drm/amd/powerplay/amd_powerplay.c
drivers/gpu/drm/amd/powerplay/eventmgr/eventmgr.c
drivers/gpu/drm/amd/powerplay/hwmgr/fiji_hwmgr.c
drivers/gpu/drm/amd/powerplay/hwmgr/hwmgr.c
drivers/gpu/drm/amd/powerplay/hwmgr/polaris10_hwmgr.c
drivers/gpu/drm/amd/powerplay/hwmgr/polaris10_powertune.c
drivers/gpu/drm/amd/powerplay/hwmgr/tonga_hwmgr.c
drivers/gpu/drm/amd/powerplay/hwmgr/tonga_processpptables.c
drivers/gpu/drm/amd/powerplay/inc/hwmgr.h
drivers/gpu/drm/amd/powerplay/smumgr/cz_smumgr.c
drivers/gpu/drm/amd/powerplay/smumgr/fiji_smumgr.c
drivers/gpu/drm/amd/powerplay/smumgr/polaris10_smumgr.c
drivers/gpu/drm/amd/powerplay/smumgr/smumgr.c
drivers/gpu/drm/amd/powerplay/smumgr/tonga_smumgr.c
drivers/gpu/drm/arm/hdlcd_crtc.c
drivers/gpu/drm/arm/hdlcd_drv.c
drivers/gpu/drm/arm/hdlcd_drv.h
drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c
drivers/gpu/drm/drm_atomic.c
drivers/gpu/drm/drm_crtc.c
drivers/gpu/drm/drm_crtc_helper.c
drivers/gpu/drm/drm_dp_dual_mode_helper.c [new file with mode: 0644]
drivers/gpu/drm/drm_dp_mst_topology.c
drivers/gpu/drm/drm_fb_cma_helper.c
drivers/gpu/drm/drm_gem_cma_helper.c
drivers/gpu/drm/drm_modes.c
drivers/gpu/drm/fsl-dcu/fsl_dcu_drm_drv.c
drivers/gpu/drm/i915/i915_dma.c
drivers/gpu/drm/i915/i915_drv.c
drivers/gpu/drm/i915/i915_drv.h
drivers/gpu/drm/i915/i915_gem.c
drivers/gpu/drm/i915/i915_gem_gtt.c
drivers/gpu/drm/i915/i915_gem_gtt.h
drivers/gpu/drm/i915/intel_bios.c
drivers/gpu/drm/i915/intel_ddi.c
drivers/gpu/drm/i915/intel_display.c
drivers/gpu/drm/i915/intel_dp.c
drivers/gpu/drm/i915/intel_dpll_mgr.c
drivers/gpu/drm/i915/intel_drv.h
drivers/gpu/drm/i915/intel_dsi.c
drivers/gpu/drm/i915/intel_hdmi.c
drivers/gpu/drm/i915/intel_lrc.c
drivers/gpu/drm/i915/intel_lrc.h
drivers/gpu/drm/i915/intel_lvds.c
drivers/gpu/drm/i915/intel_mocs.c
drivers/gpu/drm/i915/intel_panel.c
drivers/gpu/drm/i915/intel_pm.c
drivers/gpu/drm/i915/intel_psr.c
drivers/gpu/drm/i915/intel_ringbuffer.c
drivers/gpu/drm/i915/intel_ringbuffer.h
drivers/gpu/drm/i915/intel_vbt_defs.h
drivers/gpu/drm/imx/imx-drm-core.c
drivers/gpu/drm/imx/imx-drm.h
drivers/gpu/drm/imx/imx-ldb.c
drivers/gpu/drm/imx/imx-tve.c
drivers/gpu/drm/imx/ipuv3-crtc.c
drivers/gpu/drm/imx/ipuv3-plane.c
drivers/gpu/drm/imx/parallel-display.c
drivers/gpu/drm/mediatek/mtk_dpi.c
drivers/gpu/drm/mediatek/mtk_dsi.c
drivers/gpu/drm/mgag200/mgag200_mode.c
drivers/gpu/drm/msm/adreno/adreno_gpu.c
drivers/gpu/drm/msm/msm_fbdev.c
drivers/gpu/drm/msm/msm_gem.c
drivers/gpu/drm/msm/msm_gem_submit.c
drivers/gpu/drm/msm/msm_rd.c
drivers/gpu/drm/msm/msm_ringbuffer.c
drivers/gpu/drm/omapdrm/Kconfig
drivers/gpu/drm/omapdrm/displays/connector-hdmi.c
drivers/gpu/drm/omapdrm/displays/encoder-opa362.c
drivers/gpu/drm/omapdrm/displays/encoder-tfp410.c
drivers/gpu/drm/omapdrm/displays/panel-dpi.c
drivers/gpu/drm/omapdrm/displays/panel-dsi-cm.c
drivers/gpu/drm/omapdrm/displays/panel-lgphilips-lb035q02.c
drivers/gpu/drm/omapdrm/displays/panel-nec-nl8048hl11.c
drivers/gpu/drm/omapdrm/displays/panel-sharp-ls037v7dw01.c
drivers/gpu/drm/omapdrm/displays/panel-sony-acx565akm.c
drivers/gpu/drm/omapdrm/displays/panel-tpo-td043mtea1.c
drivers/gpu/drm/omapdrm/dss/dsi.c
drivers/gpu/drm/omapdrm/dss/dss.c
drivers/gpu/drm/omapdrm/dss/hdmi4.c
drivers/gpu/drm/omapdrm/dss/hdmi4_core.c
drivers/gpu/drm/omapdrm/dss/hdmi5.c
drivers/gpu/drm/omapdrm/dss/hdmi5_core.c
drivers/gpu/drm/omapdrm/dss/hdmi_phy.c
drivers/gpu/drm/omapdrm/dss/hdmi_pll.c
drivers/gpu/drm/omapdrm/dss/hdmi_wp.c
drivers/gpu/drm/omapdrm/omap_debugfs.c
drivers/gpu/drm/omapdrm/omap_dmm_tiler.c
drivers/gpu/drm/omapdrm/omap_fb.c
drivers/gpu/drm/omapdrm/omap_gem.c
drivers/gpu/drm/radeon/kv_dpm.c
drivers/gpu/drm/sti/sti_crtc.c
drivers/gpu/drm/sti/sti_vtg.c
drivers/gpu/drm/tilcdc/tilcdc_tfp410.c
drivers/gpu/drm/vc4/vc4_crtc.c
drivers/gpu/drm/vc4/vc4_drv.c
drivers/gpu/drm/vc4/vc4_kms.c
drivers/gpu/drm/vc4/vc4_regs.h
drivers/gpu/host1x/hw/intr_hw.c
drivers/gpu/ipu-v3/ipu-common.c
drivers/hwmon/emc2103.c
drivers/hwmon/lm75.c
drivers/hwmon/ntc_thermistor.c
drivers/hwmon/pwm-fan.c
drivers/hwmon/scpi-hwmon.c
drivers/hwmon/tmp102.c
drivers/i2c/busses/Kconfig
drivers/i2c/busses/i2c-at91.c
drivers/i2c/busses/i2c-rcar.c
drivers/i2c/i2c-dev.c
drivers/infiniband/Kconfig
drivers/infiniband/core/Makefile
drivers/infiniband/core/addr.c
drivers/infiniband/core/core_priv.h
drivers/infiniband/core/device.c
drivers/infiniband/core/mad.c
drivers/infiniband/core/multicast.c
drivers/infiniband/core/sa_query.c
drivers/infiniband/core/sysfs.c
drivers/infiniband/hw/Makefile
drivers/infiniband/hw/cxgb3/cxio_hal.c
drivers/infiniband/hw/cxgb3/iwch_provider.c
drivers/infiniband/hw/cxgb4/provider.c
drivers/infiniband/hw/hfi1/Kconfig [new file with mode: 0644]
drivers/infiniband/hw/hfi1/Makefile [new file with mode: 0644]
drivers/infiniband/hw/hfi1/affinity.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/affinity.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/aspm.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/chip.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/chip.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/chip_registers.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/common.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/debugfs.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/debugfs.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/device.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/device.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/dma.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/driver.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/efivar.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/efivar.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/eprom.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/eprom.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/file_ops.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/firmware.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/hfi.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/init.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/intr.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/iowait.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/mad.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/mad.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/mmu_rb.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/mmu_rb.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/opa_compat.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/pcie.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/pio.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/pio.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/pio_copy.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/platform.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/platform.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/qp.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/qp.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/qsfp.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/qsfp.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/rc.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/ruc.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/sdma.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/sdma.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/sdma_txreq.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/sysfs.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/trace.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/trace.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/twsi.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/twsi.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/uc.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/ud.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/user_exp_rcv.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/user_exp_rcv.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/user_pages.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/user_sdma.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/user_sdma.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/verbs.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/verbs.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/verbs_txreq.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/verbs_txreq.h [new file with mode: 0644]
drivers/infiniband/hw/i40iw/i40iw_verbs.c
drivers/infiniband/hw/qib/qib_iba7322.c
drivers/infiniband/hw/qib/qib_mad.c
drivers/infiniband/hw/qib/qib_verbs.h
drivers/infiniband/sw/rdmavt/cq.c
drivers/infiniband/sw/rdmavt/mr.c
drivers/infiniband/sw/rdmavt/qp.c
drivers/infiniband/ulp/ipoib/ipoib.h
drivers/infiniband/ulp/ipoib/ipoib_ib.c
drivers/infiniband/ulp/ipoib/ipoib_main.c
drivers/infiniband/ulp/ipoib/ipoib_multicast.c
drivers/infiniband/ulp/ipoib/ipoib_verbs.c
drivers/infiniband/ulp/ipoib/ipoib_vlan.c
drivers/infiniband/ulp/isert/ib_isert.c
drivers/infiniband/ulp/srpt/ib_srpt.c
drivers/input/joystick/xpad.c
drivers/input/misc/max77693-haptic.c
drivers/input/misc/max8997_haptic.c
drivers/input/misc/pwm-beeper.c
drivers/input/misc/uinput.c
drivers/input/touchscreen/sun4i-ts.c
drivers/iommu/arm-smmu-v3.c
drivers/iommu/arm-smmu.c
drivers/iommu/intel-iommu.c
drivers/iommu/iova.c
drivers/irqchip/irq-clps711x.c
drivers/irqchip/irq-gic-v3-its.c
drivers/irqchip/irq-gic-v3.c
drivers/irqchip/irq-gic.c
drivers/irqchip/irq-hip04.c
drivers/irqchip/irq-mips-gic.c
drivers/irqchip/irq-pic32-evic.c
drivers/irqchip/spear-shirq.c
drivers/leds/leds-pwm.c
drivers/md/bcache/alloc.c
drivers/md/bcache/btree.c
drivers/md/bcache/writeback.c
drivers/media/i2c/adp1653.c
drivers/media/platform/s5p-tv/mixer_drv.c
drivers/media/usb/dvb-usb-v2/af9015.c
drivers/memory/Kconfig
drivers/memory/fsl_ifc.c
drivers/memory/omap-gpmc.c
drivers/mfd/twl4030-irq.c
drivers/mmc/card/block.c
drivers/mmc/core/core.c
drivers/mmc/core/host.c
drivers/mmc/core/mmc.c
drivers/mmc/host/dw_mmc-rockchip.c
drivers/mmc/host/dw_mmc.c
drivers/mmc/host/sdhci-acpi.c
drivers/mmc/host/sdhci-esdhc-imx.c
drivers/mmc/host/sdhci-of-at91.c
drivers/mmc/host/sdhci-pci-core.c
drivers/mmc/host/sdhci.c
drivers/mmc/host/sunxi-mmc.c
drivers/mtd/chips/Kconfig
drivers/mtd/devices/bcm47xxsflash.c
drivers/mtd/devices/bcm47xxsflash.h
drivers/mtd/devices/docg3.c
drivers/mtd/devices/m25p80.c
drivers/mtd/devices/pmc551.c
drivers/mtd/maps/ck804xrom.c
drivers/mtd/maps/esb2rom.c
drivers/mtd/maps/ichxrom.c
drivers/mtd/maps/uclinux.c
drivers/mtd/mtdchar.c
drivers/mtd/mtdconcat.c
drivers/mtd/mtdcore.c
drivers/mtd/mtdpart.c
drivers/mtd/nand/ams-delta.c
drivers/mtd/nand/atmel_nand.c
drivers/mtd/nand/atmel_nand_nfc.h
drivers/mtd/nand/au1550nd.c
drivers/mtd/nand/bf5xx_nand.c
drivers/mtd/nand/brcmnand/brcmnand.c
drivers/mtd/nand/cafe_nand.c
drivers/mtd/nand/cmx270_nand.c
drivers/mtd/nand/davinci_nand.c
drivers/mtd/nand/denali.c
drivers/mtd/nand/diskonchip.c
drivers/mtd/nand/docg4.c
drivers/mtd/nand/fsl_elbc_nand.c
drivers/mtd/nand/fsl_ifc_nand.c
drivers/mtd/nand/fsl_upm.c
drivers/mtd/nand/fsmc_nand.c
drivers/mtd/nand/gpio.c
drivers/mtd/nand/gpmi-nand/gpmi-nand.c
drivers/mtd/nand/hisi504_nand.c
drivers/mtd/nand/jz4740_nand.c
drivers/mtd/nand/jz4780_bch.c
drivers/mtd/nand/jz4780_nand.c
drivers/mtd/nand/lpc32xx_mlc.c
drivers/mtd/nand/lpc32xx_slc.c
drivers/mtd/nand/mpc5121_nfc.c
drivers/mtd/nand/mxc_nand.c
drivers/mtd/nand/nand_base.c
drivers/mtd/nand/nand_bch.c
drivers/mtd/nand/nandsim.c
drivers/mtd/nand/nuc900_nand.c
drivers/mtd/nand/omap2.c
drivers/mtd/nand/orion_nand.c
drivers/mtd/nand/pasemi_nand.c
drivers/mtd/nand/plat_nand.c
drivers/mtd/nand/pxa3xx_nand.c
drivers/mtd/nand/qcom_nandc.c
drivers/mtd/nand/s3c2410.c
drivers/mtd/nand/sh_flctl.c
drivers/mtd/nand/sharpsl.c
drivers/mtd/nand/sm_common.c
drivers/mtd/nand/socrates_nand.c
drivers/mtd/nand/sunxi_nand.c
drivers/mtd/nand/vf610_nfc.c
drivers/mtd/onenand/onenand_base.c
drivers/mtd/spi-nor/spi-nor.c
drivers/mtd/ubi/build.c
drivers/mtd/ubi/debug.c
drivers/mtd/ubi/eba.c
drivers/mtd/ubi/fastmap.c
drivers/mtd/ubi/kapi.c
drivers/mtd/ubi/ubi.h
drivers/mtd/ubi/vmt.c
drivers/mtd/ubi/wl.c
drivers/net/ethernet/arc/emac_mdio.c
drivers/net/ethernet/atheros/alx/alx.h
drivers/net/ethernet/atheros/alx/main.c
drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
drivers/net/ethernet/ezchip/nps_enet.c
drivers/net/ethernet/freescale/fec_main.c
drivers/net/ethernet/freescale/fman/fman.c
drivers/net/ethernet/freescale/fman/fman_muram.c
drivers/net/ethernet/freescale/fman/fman_muram.h
drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
drivers/net/ethernet/marvell/mvneta_bm.c
drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
drivers/net/ethernet/mellanox/mlx4/en_netdev.c
drivers/net/ethernet/mellanox/mlx4/en_port.c
drivers/net/ethernet/mellanox/mlx4/en_tx.c
drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
drivers/net/ethernet/qlogic/qed/qed_dcbx.c
drivers/net/ethernet/qlogic/qed/qed_dev.c
drivers/net/ethernet/qlogic/qed/qed_main.c
drivers/net/ethernet/qlogic/qede/qede_ethtool.c
drivers/net/ethernet/qlogic/qede/qede_main.c
drivers/net/ethernet/qlogic/qlge/qlge_main.c
drivers/net/ethernet/sfc/ef10.c
drivers/net/ethernet/sfc/efx.c
drivers/net/ethernet/sfc/net_driver.h
drivers/net/ethernet/sfc/rx.c
drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
drivers/net/team/team.c
drivers/net/usb/pegasus.c
drivers/net/usb/smsc95xx.c
drivers/net/virtio_net.c
drivers/net/vxlan.c
drivers/net/wireless/ti/wlcore/spi.c
drivers/nvdimm/pmem.c
drivers/nvme/host/core.c
drivers/nvme/host/nvme.h
drivers/nvme/host/pci.c
drivers/nvmem/core.c
drivers/of/Makefile
drivers/of/of_mtd.c [deleted file]
drivers/perf/arm_pmu.c
drivers/pinctrl/intel/pinctrl-baytrail.c
drivers/pinctrl/mediatek/pinctrl-mtk-common.c
drivers/pinctrl/nomadik/pinctrl-nomadik.c
drivers/platform/chrome/Kconfig
drivers/platform/chrome/Makefile
drivers/platform/chrome/chromeos_laptop.c
drivers/platform/chrome/chromeos_pstore.c
drivers/platform/chrome/cros_ec_dev.c
drivers/platform/chrome/cros_ec_lightbar.c
drivers/platform/chrome/cros_ec_proto.c
drivers/platform/chrome/cros_kbd_led_backlight.c [new file with mode: 0644]
drivers/platform/x86/Kconfig
drivers/platform/x86/Makefile
drivers/platform/x86/asus-laptop.c
drivers/platform/x86/asus-wmi.c
drivers/platform/x86/dell-rbtn.c
drivers/platform/x86/fujitsu-laptop.c
drivers/platform/x86/ideapad-laptop.c
drivers/platform/x86/intel_menlow.c
drivers/platform/x86/intel_pmc_core.c [new file with mode: 0644]
drivers/platform/x86/intel_pmc_core.h [new file with mode: 0644]
drivers/platform/x86/intel_telemetry_core.c
drivers/platform/x86/intel_telemetry_pltdrv.c
drivers/platform/x86/sony-laptop.c
drivers/platform/x86/surfacepro3_button.c
drivers/platform/x86/thinkpad_acpi.c
drivers/ptp/ptp_chardev.c
drivers/pwm/core.c
drivers/pwm/pwm-crc.c
drivers/pwm/pwm-lpc18xx-sct.c
drivers/pwm/pwm-omap-dmtimer.c
drivers/pwm/pwm-rcar.c
drivers/pwm/pwm-sun4i.c
drivers/pwm/sysfs.c
drivers/s390/block/dcssblk.c
drivers/scsi/aacraid/aacraid.h
drivers/scsi/aacraid/linit.c
drivers/scsi/mpt3sas/mpt3sas_scsih.c
drivers/scsi/qla2xxx/Kconfig
drivers/scsi/qla2xxx/qla_target.c
drivers/scsi/qla2xxx/qla_target.h
drivers/scsi/qla2xxx/tcm_qla2xxx.c
drivers/scsi/qla2xxx/tcm_qla2xxx.h
drivers/scsi/scsi_lib.c
drivers/scsi/sd.c
drivers/soc/mediatek/mtk-pmic-wrap.c
drivers/spi/Kconfig
drivers/spi/Makefile
drivers/spi/spi-axi-spi-engine.c
drivers/spi/spi-bcm53xx.c
drivers/spi/spi-cadence.c
drivers/spi/spi-davinci.c
drivers/spi/spi-dln2.c
drivers/spi/spi-dw-pci.c
drivers/spi/spi-ep93xx.c
drivers/spi/spi-fsl-dspi.c
drivers/spi/spi-fsl-espi.c
drivers/spi/spi-octeon.c
drivers/spi/spi-omap2-mcspi.c
drivers/spi/spi-pic32-sqi.c [new file with mode: 0644]
drivers/spi/spi-pic32.c [new file with mode: 0644]
drivers/spi/spi-pxa2xx-dma.c
drivers/spi/spi-pxa2xx-pci.c
drivers/spi/spi-pxa2xx.c
drivers/spi/spi-pxa2xx.h
drivers/spi/spi-qup.c
drivers/spi/spi-rockchip.c
drivers/spi/spi-st-ssc4.c
drivers/spi/spi-zynqmp-gqspi.c
drivers/spi/spi.c
drivers/staging/Kconfig
drivers/staging/Makefile
drivers/staging/lustre/lustre/llite/llite_internal.h
drivers/staging/lustre/lustre/llite/xattr.c
drivers/staging/mt29f_spinand/mt29f_spinand.c
drivers/staging/rdma/Kconfig [deleted file]
drivers/staging/rdma/Makefile [deleted file]
drivers/staging/rdma/hfi1/Kconfig [deleted file]
drivers/staging/rdma/hfi1/Makefile [deleted file]
drivers/staging/rdma/hfi1/TODO [deleted file]
drivers/staging/rdma/hfi1/affinity.c [deleted file]
drivers/staging/rdma/hfi1/affinity.h [deleted file]
drivers/staging/rdma/hfi1/aspm.h [deleted file]
drivers/staging/rdma/hfi1/chip.c [deleted file]
drivers/staging/rdma/hfi1/chip.h [deleted file]
drivers/staging/rdma/hfi1/chip_registers.h [deleted file]
drivers/staging/rdma/hfi1/common.h [deleted file]
drivers/staging/rdma/hfi1/debugfs.c [deleted file]
drivers/staging/rdma/hfi1/debugfs.h [deleted file]
drivers/staging/rdma/hfi1/device.c [deleted file]
drivers/staging/rdma/hfi1/device.h [deleted file]
drivers/staging/rdma/hfi1/diag.c [deleted file]
drivers/staging/rdma/hfi1/dma.c [deleted file]
drivers/staging/rdma/hfi1/driver.c [deleted file]
drivers/staging/rdma/hfi1/efivar.c [deleted file]
drivers/staging/rdma/hfi1/efivar.h [deleted file]
drivers/staging/rdma/hfi1/eprom.c [deleted file]
drivers/staging/rdma/hfi1/eprom.h [deleted file]
drivers/staging/rdma/hfi1/file_ops.c [deleted file]
drivers/staging/rdma/hfi1/firmware.c [deleted file]
drivers/staging/rdma/hfi1/hfi.h [deleted file]
drivers/staging/rdma/hfi1/init.c [deleted file]
drivers/staging/rdma/hfi1/intr.c [deleted file]
drivers/staging/rdma/hfi1/iowait.h [deleted file]
drivers/staging/rdma/hfi1/mad.c [deleted file]
drivers/staging/rdma/hfi1/mad.h [deleted file]
drivers/staging/rdma/hfi1/mmu_rb.c [deleted file]
drivers/staging/rdma/hfi1/mmu_rb.h [deleted file]
drivers/staging/rdma/hfi1/opa_compat.h [deleted file]
drivers/staging/rdma/hfi1/pcie.c [deleted file]
drivers/staging/rdma/hfi1/pio.c [deleted file]
drivers/staging/rdma/hfi1/pio.h [deleted file]
drivers/staging/rdma/hfi1/pio_copy.c [deleted file]
drivers/staging/rdma/hfi1/platform.c [deleted file]
drivers/staging/rdma/hfi1/platform.h [deleted file]
drivers/staging/rdma/hfi1/qp.c [deleted file]
drivers/staging/rdma/hfi1/qp.h [deleted file]
drivers/staging/rdma/hfi1/qsfp.c [deleted file]
drivers/staging/rdma/hfi1/qsfp.h [deleted file]
drivers/staging/rdma/hfi1/rc.c [deleted file]
drivers/staging/rdma/hfi1/ruc.c [deleted file]
drivers/staging/rdma/hfi1/sdma.c [deleted file]
drivers/staging/rdma/hfi1/sdma.h [deleted file]
drivers/staging/rdma/hfi1/sdma_txreq.h [deleted file]
drivers/staging/rdma/hfi1/sysfs.c [deleted file]
drivers/staging/rdma/hfi1/trace.c [deleted file]
drivers/staging/rdma/hfi1/trace.h [deleted file]
drivers/staging/rdma/hfi1/twsi.c [deleted file]
drivers/staging/rdma/hfi1/twsi.h [deleted file]
drivers/staging/rdma/hfi1/uc.c [deleted file]
drivers/staging/rdma/hfi1/ud.c [deleted file]
drivers/staging/rdma/hfi1/user_exp_rcv.c [deleted file]
drivers/staging/rdma/hfi1/user_exp_rcv.h [deleted file]
drivers/staging/rdma/hfi1/user_pages.c [deleted file]
drivers/staging/rdma/hfi1/user_sdma.c [deleted file]
drivers/staging/rdma/hfi1/user_sdma.h [deleted file]
drivers/staging/rdma/hfi1/verbs.c [deleted file]
drivers/staging/rdma/hfi1/verbs.h [deleted file]
drivers/staging/rdma/hfi1/verbs_txreq.c [deleted file]
drivers/staging/rdma/hfi1/verbs_txreq.h [deleted file]
drivers/target/iscsi/Kconfig
drivers/target/iscsi/Makefile
drivers/target/iscsi/cxgbit/Kconfig [new file with mode: 0644]
drivers/target/iscsi/cxgbit/Makefile [new file with mode: 0644]
drivers/target/iscsi/cxgbit/cxgbit.h [new file with mode: 0644]
drivers/target/iscsi/cxgbit/cxgbit_cm.c [new file with mode: 0644]
drivers/target/iscsi/cxgbit/cxgbit_ddp.c [new file with mode: 0644]
drivers/target/iscsi/cxgbit/cxgbit_lro.h [new file with mode: 0644]
drivers/target/iscsi/cxgbit/cxgbit_main.c [new file with mode: 0644]
drivers/target/iscsi/cxgbit/cxgbit_target.c [new file with mode: 0644]
drivers/target/iscsi/iscsi_target.c
drivers/target/iscsi/iscsi_target_configfs.c
drivers/target/iscsi/iscsi_target_datain_values.c
drivers/target/iscsi/iscsi_target_erl0.c
drivers/target/iscsi/iscsi_target_login.c
drivers/target/iscsi/iscsi_target_nego.c
drivers/target/iscsi/iscsi_target_parameters.c
drivers/target/iscsi/iscsi_target_util.c
drivers/target/loopback/tcm_loop.c
drivers/target/sbp/sbp_target.c
drivers/target/target_core_alua.c
drivers/target/target_core_configfs.c
drivers/target/target_core_internal.h
drivers/target/target_core_pr.c
drivers/target/target_core_rd.c
drivers/target/target_core_tpg.c
drivers/target/target_core_transport.c
drivers/target/tcm_fc/tcm_fc.h
drivers/target/tcm_fc/tfc_conf.c
drivers/target/tcm_fc/tfc_sess.c
drivers/thermal/Kconfig
drivers/thermal/Makefile
drivers/thermal/gov_bang_bang.c
drivers/thermal/hisi_thermal.c
drivers/thermal/int340x_thermal/int3406_thermal.c
drivers/thermal/int340x_thermal/processor_thermal_device.c
drivers/thermal/intel_powerclamp.c
drivers/thermal/mtk_thermal.c
drivers/thermal/of-thermal.c
drivers/thermal/qcom-spmi-temp-alarm.c
drivers/thermal/rcar_thermal.c
drivers/thermal/rockchip_thermal.c
drivers/thermal/tango_thermal.c [new file with mode: 0644]
drivers/thermal/tegra/Kconfig [new file with mode: 0644]
drivers/thermal/tegra/Makefile [new file with mode: 0644]
drivers/thermal/tegra/soctherm-fuse.c [new file with mode: 0644]
drivers/thermal/tegra/soctherm.c [new file with mode: 0644]
drivers/thermal/tegra/soctherm.h [new file with mode: 0644]
drivers/thermal/tegra/tegra124-soctherm.c [new file with mode: 0644]
drivers/thermal/tegra/tegra132-soctherm.c [new file with mode: 0644]
drivers/thermal/tegra/tegra210-soctherm.c [new file with mode: 0644]
drivers/thermal/tegra_soctherm.c [deleted file]
drivers/thermal/thermal-generic-adc.c [new file with mode: 0644]
drivers/thermal/ti-soc-thermal/ti-thermal-common.c
drivers/thermal/x86_pkg_temp_thermal.c
drivers/tty/Kconfig
drivers/tty/pty.c
drivers/tty/serial/amba-pl011.c
drivers/tty/serial/sprd_serial.c
drivers/usb/gadget/function/f_tcm.c
drivers/vfio/pci/vfio_pci.c
drivers/vfio/pci/vfio_pci_config.c
drivers/vfio/pci/vfio_pci_intrs.c
drivers/vfio/pci/vfio_pci_private.h
drivers/vfio/vfio_iommu_spapr_tce.c
drivers/vfio/vfio_iommu_type1.c
drivers/vhost/scsi.c
drivers/video/backlight/lm3630a_bl.c
drivers/video/backlight/lp855x_bl.c
drivers/video/backlight/lp8788_bl.c
drivers/video/backlight/pwm_bl.c
drivers/video/fbdev/da8xx-fb.c
drivers/video/fbdev/omap2/omapfb/dss/hdmi5_core.c
drivers/video/fbdev/ssd1307fb.c
drivers/virtio/virtio_balloon.c
drivers/watchdog/Kconfig
drivers/watchdog/Makefile
drivers/watchdog/cpwd.c
drivers/watchdog/f71808e_wdt.c
drivers/watchdog/imx2_wdt.c
drivers/watchdog/jz4740_wdt.c
drivers/watchdog/octeon-wdt-main.c
drivers/watchdog/qcom-wdt.c
drivers/watchdog/renesas_wdt.c [new file with mode: 0644]
drivers/watchdog/shwdt.c
drivers/watchdog/sp5100_tco.c
drivers/watchdog/watchdog_core.c
drivers/watchdog/watchdog_dev.c
drivers/xen/Makefile
drivers/xen/events/events_base.c
drivers/xen/gntdev.c
drivers/xen/xen-scsiback.c
fs/9p/acl.c
fs/9p/xattr.c
fs/Kconfig
fs/affs/super.c
fs/afs/write.c
fs/bad_inode.c
fs/binfmt_aout.c
fs/binfmt_elf.c
fs/binfmt_flat.c
fs/block_dev.c
fs/btrfs/backref.c
fs/btrfs/btrfs_inode.h
fs/btrfs/check-integrity.c
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/delayed-ref.h
fs/btrfs/dev-replace.c
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/extent_map.c
fs/btrfs/file-item.c
fs/btrfs/file.c
fs/btrfs/free-space-cache.c
fs/btrfs/free-space-cache.h
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/ordered-data.c
fs/btrfs/ordered-data.h
fs/btrfs/qgroup.c
fs/btrfs/raid56.c
fs/btrfs/reada.c
fs/btrfs/relocation.c
fs/btrfs/root-tree.c
fs/btrfs/scrub.c
fs/btrfs/send.c
fs/btrfs/struct-funcs.c
fs/btrfs/super.c
fs/btrfs/tests/extent-io-tests.c
fs/btrfs/tests/free-space-tests.c
fs/btrfs/tests/inode-tests.c
fs/btrfs/tests/qgroup-tests.c
fs/btrfs/transaction.c
fs/btrfs/transaction.h
fs/btrfs/tree-log.c
fs/btrfs/ulist.c
fs/btrfs/volumes.c
fs/btrfs/xattr.c
fs/cachefiles/interface.c
fs/ceph/addr.c
fs/ceph/cache.c
fs/ceph/cache.h
fs/ceph/caps.c
fs/ceph/debugfs.c
fs/ceph/dir.c
fs/ceph/file.c
fs/ceph/inode.c
fs/ceph/ioctl.c
fs/ceph/mds_client.c
fs/ceph/mds_client.h
fs/ceph/mdsmap.c
fs/ceph/super.c
fs/ceph/super.h
fs/ceph/xattr.c
fs/cifs/xattr.c
fs/compat.c
fs/dax.c
fs/dcache.c
fs/devpts/inode.c
fs/direct-io.c
fs/ecryptfs/crypto.c
fs/ecryptfs/ecryptfs_kernel.h
fs/ecryptfs/inode.c
fs/ecryptfs/mmap.c
fs/ext2/file.c
fs/ext2/inode.c
fs/ext2/super.c
fs/ext2/xattr_security.c
fs/ext2/xattr_trusted.c
fs/ext2/xattr_user.c
fs/ext4/balloc.c
fs/ext4/dir.c
fs/ext4/ext4.h
fs/ext4/ext4_jbd2.h
fs/ext4/extents.c
fs/ext4/extents_status.c
fs/ext4/file.c
fs/ext4/ialloc.c
fs/ext4/indirect.c
fs/ext4/inline.c
fs/ext4/inode.c
fs/ext4/ioctl.c
fs/ext4/mballoc.c
fs/ext4/mmp.c
fs/ext4/move_extent.c
fs/ext4/namei.c
fs/ext4/page-io.c
fs/ext4/resize.c
fs/ext4/super.c
fs/ext4/xattr_security.c
fs/ext4/xattr_trusted.c
fs/ext4/xattr_user.c
fs/f2fs/xattr.c
fs/fscache/page.c
fs/fuse/dir.c
fs/gfs2/dir.c
fs/gfs2/xattr.c
fs/hfs/attr.c
fs/hfs/hfs_fs.h
fs/hfsplus/xattr.c
fs/hfsplus/xattr.h
fs/hfsplus/xattr_security.c
fs/hfsplus/xattr_trusted.c
fs/hfsplus/xattr_user.c
fs/hpfs/super.c
fs/jbd2/commit.c
fs/jbd2/journal.c
fs/jbd2/transaction.c
fs/jffs2/security.c
fs/jffs2/xattr_trusted.c
fs/jffs2/xattr_user.c
fs/jfs/xattr.c
fs/kernfs/inode.c
fs/kernfs/kernfs-internal.h
fs/libfs.c
fs/namei.c
fs/nfs/callback_proc.c
fs/nfs/callback_xdr.c
fs/nfs/delegation.c
fs/nfs/delegation.h
fs/nfs/direct.c
fs/nfs/filelayout/filelayout.c
fs/nfs/flexfilelayout/flexfilelayout.c
fs/nfs/flexfilelayout/flexfilelayout.h
fs/nfs/flexfilelayout/flexfilelayoutdev.c
fs/nfs/internal.h
fs/nfs/nfs42.h
fs/nfs/nfs42proc.c
fs/nfs/nfs42xdr.c
fs/nfs/nfs4_fs.h
fs/nfs/nfs4file.c
fs/nfs/nfs4proc.c
fs/nfs/nfs4state.c
fs/nfs/nfs4trace.h
fs/nfs/nfs4xdr.c
fs/nfs/pagelist.c
fs/nfs/pnfs.c
fs/nfs/pnfs.h
fs/nfs/pnfs_nfs.c
fs/nfs/super.c
fs/nfs/write.c
fs/nfsd/nfs3xdr.c
fs/nfsd/nfs4layouts.c
fs/nfsd/nfs4state.c
fs/nfsd/state.h
fs/ocfs2/cluster/heartbeat.c
fs/ocfs2/cluster/tcp_internal.h
fs/ocfs2/inode.c
fs/ocfs2/journal.h
fs/ocfs2/xattr.c
fs/orangefs/xattr.c
fs/overlayfs/copy_up.c
fs/overlayfs/dir.c
fs/overlayfs/inode.c
fs/overlayfs/overlayfs.h
fs/overlayfs/readdir.c
fs/overlayfs/super.c
fs/posix_acl.c
fs/readdir.c
fs/reiserfs/xattr_security.c
fs/reiserfs/xattr_trusted.c
fs/reiserfs/xattr_user.c
fs/ubifs/debug.c
fs/ubifs/xattr.c
fs/xattr.c
fs/xfs/kmem.c
fs/xfs/kmem.h
fs/xfs/libxfs/xfs_attr.c
fs/xfs/libxfs/xfs_bmap.c
fs/xfs/libxfs/xfs_dir2_sf.c
fs/xfs/libxfs/xfs_inode_fork.c
fs/xfs/libxfs/xfs_inode_fork.h
fs/xfs/libxfs/xfs_log_format.h
fs/xfs/libxfs/xfs_sb.c
fs/xfs/libxfs/xfs_shared.h
fs/xfs/xfs_aops.c
fs/xfs/xfs_aops.h
fs/xfs/xfs_attr.h
fs/xfs/xfs_attr_inactive.c
fs/xfs/xfs_attr_list.c
fs/xfs/xfs_bmap_util.c
fs/xfs/xfs_buf.c
fs/xfs/xfs_buf.h
fs/xfs/xfs_buf_item.c
fs/xfs/xfs_dquot.c
fs/xfs/xfs_file.c
fs/xfs/xfs_fsops.c
fs/xfs/xfs_icache.c
fs/xfs/xfs_inode.c
fs/xfs/xfs_inode.h
fs/xfs/xfs_inode_item.c
fs/xfs/xfs_ioctl.c
fs/xfs/xfs_iomap.c
fs/xfs/xfs_iops.c
fs/xfs/xfs_log.c
fs/xfs/xfs_log.h
fs/xfs/xfs_log_cil.c
fs/xfs/xfs_log_priv.h
fs/xfs/xfs_log_recover.c
fs/xfs/xfs_mount.c
fs/xfs/xfs_mount.h
fs/xfs/xfs_pnfs.c
fs/xfs/xfs_qm.c
fs/xfs/xfs_qm_syscalls.c
fs/xfs/xfs_rtalloc.c
fs/xfs/xfs_super.c
fs/xfs/xfs_symlink.c
fs/xfs/xfs_sysfs.c
fs/xfs/xfs_sysfs.h
fs/xfs/xfs_trace.h
fs/xfs/xfs_trans.c
fs/xfs/xfs_trans.h
fs/xfs/xfs_xattr.c
include/acpi/video.h
include/asm-generic/preempt.h
include/drm/drm_dp_dual_mode_helper.h [new file with mode: 0644]
include/dt-bindings/thermal/tegra124-soctherm.h
include/kvm/arm_arch_timer.h
include/kvm/arm_vgic.h
include/kvm/vgic/vgic.h [new file with mode: 0644]
include/linux/bcma/bcma_driver_chipcommon.h
include/linux/blkdev.h
include/linux/ceph/ceph_frag.h
include/linux/ceph/ceph_fs.h
include/linux/ceph/decode.h
include/linux/ceph/libceph.h
include/linux/ceph/mon_client.h
include/linux/ceph/osd_client.h
include/linux/ceph/osdmap.h
include/linux/ceph/rados.h
include/linux/dax.h
include/linux/dcache.h
include/linux/devpts_fs.h
include/linux/dma-buf.h
include/linux/err.h
include/linux/errno.h
include/linux/export.h
include/linux/fence.h
include/linux/fs.h
include/linux/fscache-cache.h
include/linux/fsl_ifc.h
include/linux/hash.h
include/linux/iova.h
include/linux/irqchip/arm-gic-v3.h
include/linux/irqchip/arm-gic.h
include/linux/jbd2.h
include/linux/kvm_host.h
include/linux/memory_hotplug.h
include/linux/mfd/cros_ec.h
include/linux/mfd/twl6040.h
include/linux/mm.h
include/linux/mm_types.h
include/linux/mmc/host.h
include/linux/mtd/fsmc.h
include/linux/mtd/map.h
include/linux/mtd/mtd.h
include/linux/mtd/nand.h
include/linux/mtd/onenand.h
include/linux/mtd/sharpsl.h
include/linux/mtd/spi-nor.h
include/linux/namei.h
include/linux/nfs4.h
include/linux/nfs_fs_sb.h
include/linux/nfs_xdr.h
include/linux/of_mtd.h [deleted file]
include/linux/omap-gpmc.h
include/linux/page_idle.h
include/linux/perf_event.h
include/linux/platform_data/at24.h
include/linux/platform_data/gpmc-omap.h [new file with mode: 0644]
include/linux/platform_data/mtd-nand-omap2.h
include/linux/pwm.h
include/linux/reservation.h
include/linux/rwsem.h
include/linux/sched.h
include/linux/sctp.h
include/linux/seqlock.h
include/linux/slub_def.h
include/linux/spi/spi.h
include/linux/stringhash.h [new file with mode: 0644]
include/linux/sunrpc/auth.h
include/linux/sunrpc/clnt.h
include/linux/sunrpc/msg_prot.h
include/linux/sunrpc/svc_rdma.h
include/linux/sunrpc/svcauth.h
include/linux/sunrpc/xprt.h
include/linux/sunrpc/xprtrdma.h
include/linux/thermal.h
include/linux/timekeeping.h
include/linux/xattr.h
include/net/ip6_tunnel.h
include/net/pkt_sched.h
include/rdma/ib_mad.h
include/rdma/ib_pack.h
include/rdma/ib_sa.h
include/rdma/ib_verbs.h
include/rdma/rdma_vt.h
include/rdma/rdmavt_qp.h
include/target/iscsi/iscsi_target_core.h
include/target/iscsi/iscsi_transport.h
include/target/target_core_base.h
include/target/target_core_fabric.h
include/trace/events/kvm.h
include/uapi/asm-generic/unistd.h
include/uapi/linux/ethtool.h
include/uapi/linux/nvme_ioctl.h
include/uapi/linux/perf_event.h
include/uapi/linux/pkt_cls.h
include/uapi/mtd/mtd-abi.h
include/uapi/rdma/hfi/hfi1_user.h
include/uapi/rdma/rdma_netlink.h
include/uapi/sound/asoc.h
include/video/imx-ipu-v3.h
init/Kconfig
init/main.c
kernel/bpf/inode.c
kernel/bpf/stackmap.c
kernel/events/callchain.c
kernel/fork.c
kernel/gcov/Kconfig
kernel/irq/ipi.c
kernel/locking/percpu-rwsem.c
kernel/locking/rwsem.c
kernel/pid.c
kernel/sched/core.c
kernel/sched/cpufreq_schedutil.c
kernel/sysctl.c
kernel/time/hrtimer.c
lib/Kconfig.debug
lib/Makefile
lib/dma-debug.c
lib/iov_iter.c
lib/test_hash.c [new file with mode: 0644]
lib/test_uuid.c [new file with mode: 0644]
lib/uuid.c
mm/Kconfig
mm/cma.c
mm/filemap.c
mm/kasan/kasan.h
mm/memcontrol.c
mm/memory.c
mm/memory_hotplug.c
mm/mmap.c
mm/nommu.c
mm/oom_kill.c
mm/page_alloc.c
mm/page_ext.c
mm/page_owner.c
mm/page_poison.c
mm/rmap.c
mm/shmem.c
mm/truncate.c
mm/vmalloc.c
mm/vmstat.c
mm/z3fold.c
mm/zsmalloc.c
net/8021q/vlan.c
net/8021q/vlan.h
net/8021q/vlan_dev.c
net/9p/client.c
net/atm/signaling.c
net/atm/svc.c
net/ceph/ceph_common.c
net/ceph/ceph_strings.c
net/ceph/debugfs.c
net/ceph/mon_client.c
net/ceph/osd_client.c
net/ceph/osdmap.c
net/core/hwbm.c
net/core/pktgen.c
net/ieee802154/nl802154.c
net/ipv4/af_inet.c
net/ipv4/sysctl_net_ipv4.c
net/ipv6/Kconfig
net/ipv6/Makefile
net/ipv6/fou6.c
net/ipv6/ip6_gre.c
net/l2tp/l2tp_ip6.c
net/lapb/lapb_in.c
net/lapb/lapb_out.c
net/lapb/lapb_subr.c
net/openvswitch/actions.c
net/sched/act_police.c
net/sched/sch_api.c
net/sched/sch_htb.c
net/sctp/sctp_diag.c
net/sctp/socket.c
net/sunrpc/auth.c
net/sunrpc/auth_generic.c
net/sunrpc/auth_gss/auth_gss.c
net/sunrpc/auth_gss/svcauth_gss.c
net/sunrpc/auth_unix.c
net/sunrpc/clnt.c
net/sunrpc/svc_xprt.c
net/sunrpc/xdr.c
net/sunrpc/xprtrdma/backchannel.c
net/sunrpc/xprtrdma/fmr_ops.c
net/sunrpc/xprtrdma/frwr_ops.c
net/sunrpc/xprtrdma/physical_ops.c
net/sunrpc/xprtrdma/rpc_rdma.c
net/sunrpc/xprtrdma/svc_rdma_marshal.c
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
net/sunrpc/xprtrdma/svc_rdma_sendto.c
net/sunrpc/xprtrdma/svc_rdma_transport.c
net/sunrpc/xprtrdma/transport.c
net/sunrpc/xprtrdma/verbs.c
net/sunrpc/xprtrdma/xprt_rdma.h
net/sunrpc/xprtsock.c
net/tipc/netlink_compat.c
scripts/Kbuild.include
scripts/Makefile.build
scripts/Makefile.extrawarn
scripts/Makefile.lib
scripts/adjust_autoksyms.sh [new file with mode: 0755]
scripts/basic/fixdep.c
scripts/checkpatch.pl
scripts/coccicheck
scripts/coccinelle/api/setup_timer.cocci
scripts/coccinelle/misc/compare_const_fl.cocci [deleted file]
scripts/genksyms/genksyms.c
scripts/kconfig/confdata.c
scripts/kconfig/symbol.c
scripts/package/Makefile
scripts/package/builddeb
scripts/package/mkspec
security/keys/compat.c
security/keys/dh.c
security/keys/internal.h
security/keys/keyctl.c
security/smack/smack_lsm.c
security/yama/yama_lsm.c
sound/pci/hda/patch_realtek.c
sound/soc/codecs/Kconfig
sound/soc/codecs/Makefile
sound/soc/codecs/ak4642.c
sound/soc/codecs/max98371.c [new file with mode: 0644]
sound/soc/codecs/max98371.h [new file with mode: 0644]
sound/soc/codecs/rt298.c
sound/soc/codecs/rt298.h
sound/soc/codecs/rt5677.c
sound/soc/codecs/tas571x.c
sound/soc/codecs/tas571x.h
sound/soc/codecs/tas5720.c [new file with mode: 0644]
sound/soc/codecs/tas5720.h [new file with mode: 0644]
sound/soc/codecs/tlv320aic31xx.c
sound/soc/codecs/tlv320aic32x4-i2c.c [new file with mode: 0644]
sound/soc/codecs/tlv320aic32x4-spi.c [new file with mode: 0644]
sound/soc/codecs/tlv320aic32x4.c
sound/soc/codecs/tlv320aic32x4.h
sound/soc/codecs/twl6040.c
sound/soc/codecs/wm8962.c
sound/soc/codecs/wm8962.h
sound/soc/generic/simple-card.c
sound/soc/kirkwood/Kconfig
sound/soc/mediatek/Kconfig
sound/soc/mediatek/mt8173-rt5650-rt5676.c
sound/soc/mediatek/mt8173-rt5650.c
sound/soc/mediatek/mtk-afe-pcm.c
sound/soc/omap/mcbsp.c
sound/soc/omap/omap-pcm.c
sound/soc/pxa/brownstone.c
sound/soc/pxa/mioa701_wm9713.c
sound/soc/pxa/mmp-pcm.c
sound/soc/pxa/mmp-sspa.c
sound/soc/pxa/palm27x.c
sound/soc/pxa/pxa-ssp.c
sound/soc/pxa/pxa2xx-ac97.c
sound/soc/pxa/pxa2xx-pcm.c
sound/soc/qcom/lpass-platform.c
sound/soc/sh/rcar/adg.c
sound/soc/sh/rcar/dma.c
sound/soc/sh/rcar/rsnd.h
sound/soc/sh/rcar/src.c
sound/soc/soc-topology.c
sound/soc/sti/sti_uniperif.c
sound/soc/sti/uniperif.h
sound/soc/sti/uniperif_player.c
sound/soc/sti/uniperif_reader.c
tools/Makefile
tools/build/Makefile.build
tools/kvm/kvm_stat/Makefile [new file with mode: 0644]
tools/kvm/kvm_stat/kvm_stat [new file with mode: 0755]
tools/kvm/kvm_stat/kvm_stat.txt [new file with mode: 0644]
tools/objtool/Makefile
tools/objtool/elf.h
tools/perf/Documentation/perf-report.txt
tools/perf/Documentation/perf-script.txt
tools/perf/Documentation/perf-trace.txt
tools/perf/builtin-annotate.c
tools/perf/builtin-buildid-cache.c
tools/perf/builtin-diff.c
tools/perf/builtin-record.c
tools/perf/builtin-report.c
tools/perf/builtin-script.c
tools/perf/builtin-stat.c
tools/perf/builtin-timechart.c
tools/perf/builtin-top.c
tools/perf/builtin-trace.c
tools/perf/perf.c
tools/perf/util/annotate.c
tools/perf/util/build-id.c
tools/perf/util/db-export.c
tools/perf/util/dso.c
tools/perf/util/evlist.c
tools/perf/util/evlist.h
tools/perf/util/evsel.c
tools/perf/util/evsel.h
tools/perf/util/hist.c
tools/perf/util/hist.h
tools/perf/util/machine.c
tools/perf/util/machine.h
tools/perf/util/scripting-engines/trace-event-perl.c
tools/perf/util/sort.c
tools/perf/util/sort.h
tools/perf/util/stat-shadow.c
tools/perf/util/symbol.c
tools/perf/util/symbol.h
tools/perf/util/top.h
tools/perf/util/util.c
tools/perf/util/util.h
tools/testing/selftests/seccomp/seccomp_bpf.c
tools/testing/selftests/vm/thuge-gen.c
tools/virtio/ringtest/Makefile
tools/virtio/ringtest/main.c
tools/virtio/ringtest/virtio_ring_0_9.c
tools/virtio/ringtest/virtio_ring_inorder.c [new file with mode: 0644]
virt/kvm/arm/arch_timer.c
virt/kvm/arm/hyp/timer-sr.c
virt/kvm/arm/hyp/vgic-v2-sr.c
virt/kvm/arm/pmu.c
virt/kvm/arm/vgic-v2.c
virt/kvm/arm/vgic-v3.c
virt/kvm/arm/vgic.c
virt/kvm/arm/vgic/vgic-init.c [new file with mode: 0644]
virt/kvm/arm/vgic/vgic-irqfd.c [new file with mode: 0644]
virt/kvm/arm/vgic/vgic-kvm-device.c [new file with mode: 0644]
virt/kvm/arm/vgic/vgic-mmio-v2.c [new file with mode: 0644]
virt/kvm/arm/vgic/vgic-mmio-v3.c [new file with mode: 0644]
virt/kvm/arm/vgic/vgic-mmio.c [new file with mode: 0644]
virt/kvm/arm/vgic/vgic-mmio.h [new file with mode: 0644]
virt/kvm/arm/vgic/vgic-v2.c [new file with mode: 0644]
virt/kvm/arm/vgic/vgic-v3.c [new file with mode: 0644]
virt/kvm/arm/vgic/vgic.c [new file with mode: 0644]
virt/kvm/arm/vgic/vgic.h [new file with mode: 0644]
virt/kvm/irqchip.c
virt/kvm/kvm_main.c

diff --git a/Documentation/ABI/stable/sysfs-class-ubi b/Documentation/ABI/stable/sysfs-class-ubi
index 18d471d..a6b3240 100644
@@ -107,6 +107,15 @@ Contact:   Artem Bityutskiy <dedekind@infradead.org>
 Description:
                Number of physical eraseblocks reserved for bad block handling.
 
+What:          /sys/class/ubi/ubiX/ro_mode
+Date:          April 2016
+KernelVersion: 4.7
+Contact:       linux-mtd@lists.infradead.org
+Description:
+               Contains ASCII "1\n" if the read-only flag is set on this
+               device, and "0\n" if it is cleared. UBI devices mark themselves
+               as read-only when they detect an unrecoverable error.
+
 What:          /sys/class/ubi/ubiX/total_eraseblocks
 Date:          July 2006
 KernelVersion: 2.6.22
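
As a quick illustration of how the new ro_mode attribute documented above could be consumed, here is a minimal userspace sketch; the device name ubi0 and the bare-bones error handling are assumptions for the example, not part of the patch:

/* Hypothetical check of the UBI read-only flag exposed via sysfs. */
#include <stdio.h>

int main(void)
{
	char buf[8] = "";
	FILE *f = fopen("/sys/class/ubi/ubi0/ro_mode", "r");	/* ubi0 is an assumed device */

	if (!f) {
		perror("ro_mode");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("ubi0 is %s\n", buf[0] == '1' ? "read-only" : "read-write");
	fclose(f);
	return 0;
}
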
diff --git a/Documentation/DocBook/device-drivers.tmpl b/Documentation/DocBook/device-drivers.tmpl
index de79efd..8c68768 100644
@@ -128,16 +128,44 @@ X!Edrivers/base/interface.c
 !Edrivers/base/platform.c
 !Edrivers/base/bus.c
      </sect1>
-     <sect1><title>Device Drivers DMA Management</title>
+     <sect1>
+       <title>Buffer Sharing and Synchronization</title>
+       <para>
+         The dma-buf subsystem provides the framework for sharing buffers
+         for hardware (DMA) access across multiple device drivers and
+         subsystems, and for synchronizing asynchronous hardware access.
+       </para>
+       <para>
+         This is used, for example, by drm "prime" multi-GPU support, but
+         is of course not limited to GPU use cases.
+       </para>
+       <para>
+         The three main components of this are: (1) dma-buf, representing
+         a sg_table and exposed to userspace as a file descriptor to allow
+         passing between devices, (2) fence, which provides a mechanism
+         to signal when one device has finished access, and (3) reservation,
+         which manages the shared or exclusive fence(s) associated with
+         the buffer.
+       </para>
+       <sect2><title>dma-buf</title>
 !Edrivers/dma-buf/dma-buf.c
+!Iinclude/linux/dma-buf.h
+       </sect2>
+       <sect2><title>reservation</title>
+!Pdrivers/dma-buf/reservation.c Reservation Object Overview
+!Edrivers/dma-buf/reservation.c
+!Iinclude/linux/reservation.h
+       </sect2>
+       <sect2><title>fence</title>
 !Edrivers/dma-buf/fence.c
-!Edrivers/dma-buf/seqno-fence.c
 !Iinclude/linux/fence.h
+!Edrivers/dma-buf/seqno-fence.c
 !Iinclude/linux/seqno-fence.h
-!Edrivers/dma-buf/reservation.c
-!Iinclude/linux/reservation.h
 !Edrivers/dma-buf/sync_file.c
 !Iinclude/linux/sync_file.h
+       </sect2>
+     </sect1>
+     <sect1><title>Device Drivers DMA Management</title>
 !Edrivers/base/dma-coherent.c
 !Edrivers/base/dma-mapping.c
      </sect1>
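
To make the three components described above concrete, the following is a minimal sketch of the importer side of the dma-buf flow, using only the long-standing core calls (dma_buf_get, dma_buf_attach, dma_buf_map_attachment and their teardown counterparts); the function name example_import and the surrounding driver context are assumptions for illustration:

#include <linux/device.h>
#include <linux/dma-buf.h>
#include <linux/dma-direction.h>
#include <linux/err.h>

/* Import a buffer shared by another driver, given the fd passed in from userspace. */
static int example_import(struct device *dev, int fd)
{
	struct dma_buf *buf;
	struct dma_buf_attachment *attach;
	struct sg_table *sgt;

	buf = dma_buf_get(fd);			/* translate the fd into a dma_buf */
	if (IS_ERR(buf))
		return PTR_ERR(buf);

	attach = dma_buf_attach(buf, dev);	/* register this device as a user */
	if (IS_ERR(attach)) {
		dma_buf_put(buf);
		return PTR_ERR(attach);
	}

	/* Map the backing storage into an sg_table suitable for DMA. */
	sgt = dma_buf_map_attachment(attach, DMA_BIDIRECTIONAL);
	if (IS_ERR(sgt)) {
		dma_buf_detach(buf, attach);
		dma_buf_put(buf);
		return PTR_ERR(sgt);
	}

	/*
	 * Program the hardware with the sg_table here, consulting the
	 * buffer's reservation object (buf->resv) for shared/exclusive
	 * fences before touching the memory.
	 */

	dma_buf_unmap_attachment(attach, sgt, DMA_BIDIRECTIONAL);
	dma_buf_detach(buf, attach);
	dma_buf_put(buf);
	return 0;
}

The reservation object referenced in the comment is where the fences from component (2) live, which is how the shared or exclusive access described in the paragraph above is actually enforced.
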
diff --git a/Documentation/DocBook/gpu.tmpl b/Documentation/DocBook/gpu.tmpl
index 4a0c599..7586bf7 100644
@@ -1626,6 +1626,12 @@ void intel_crt_init(struct drm_device *dev)
 !Pdrivers/gpu/drm/drm_dp_helper.c dp helpers
 !Iinclude/drm/drm_dp_helper.h
 !Edrivers/gpu/drm/drm_dp_helper.c
+    </sect2>
+    <sect2>
+      <title>Display Port Dual Mode Adaptor Helper Functions Reference</title>
+!Pdrivers/gpu/drm/drm_dp_dual_mode_helper.c dp dual mode helpers
+!Iinclude/drm/drm_dp_dual_mode_helper.h
+!Edrivers/gpu/drm/drm_dp_dual_mode_helper.c
     </sect2>
     <sect2>
       <title>Display Port MST Helper Functions Reference</title>
diff --git a/Documentation/arm64/silicon-errata.txt b/Documentation/arm64/silicon-errata.txt
index c6938e5..4da60b4 100644
@@ -56,6 +56,7 @@ stable kernels.
 | ARM            | MMU-500         | #841119,#826419 | N/A                     |
 |                |                 |                 |                         |
 | Cavium         | ThunderX ITS    | #22375, #24313  | CAVIUM_ERRATUM_22375    |
+| Cavium         | ThunderX ITS    | #23144          | CAVIUM_ERRATUM_23144    |
 | Cavium         | ThunderX GICv3  | #23154          | CAVIUM_ERRATUM_23154    |
 | Cavium         | ThunderX Core   | #27456          | CAVIUM_ERRATUM_27456    |
| Cavium         | ThunderX SMMUv2 | #27704          | N/A                     |
diff --git a/Documentation/devicetree/bindings/bus/ti-gpmc.txt b/Documentation/devicetree/bindings/bus/ti-gpmc.txt
deleted file mode 100644
index 0168370..0000000
+++ /dev/null
@@ -1,135 +0,0 @@
-Device tree bindings for OMAP general purpose memory controllers (GPMC)
-
-The actual devices are instantiated from the child nodes of a GPMC node.
-
-Required properties:
-
- - compatible:         Should be set to one of the following:
-
-                       ti,omap2420-gpmc (omap2420)
-                       ti,omap2430-gpmc (omap2430)
-                       ti,omap3430-gpmc (omap3430 & omap3630)
-                       ti,omap4430-gpmc (omap4430 & omap4460 & omap543x)
-                       ti,am3352-gpmc   (am335x devices)
-
- - reg:                        A resource specifier for the register space
-                       (see the example below)
- - ti,hwmods:          Should be set to "ti,gpmc" until the DT transition is
-                       completed.
- - #address-cells:     Must be set to 2 to allow memory address translation
- - #size-cells:                Must be set to 1 to allow CS address passing
- - gpmc,num-cs:                The maximum number of chip-select lines that controller
-                       can support.
- - gpmc,num-waitpins:  The maximum number of wait pins that controller can
-                       support.
- - ranges:             Must be set up to reflect the memory layout with four
-                       integer values for each chip-select line in use:
-
-                          <cs-number> 0 <physical address of mapping> <size>
-
-                       Currently, calculated values derived from the contents
-                       of the per-CS register GPMC_CONFIG7 (as set up by the
-                       bootloader) are used for the physical address decoding.
-                       As this will change in the future, filling correct
-                       values here is a requirement.
-
-Timing properties for child nodes. All are optional and default to 0.
-
- - gpmc,sync-clk-ps:   Minimum clock period for synchronous mode, in picoseconds
-
- Chip-select signal timings (in nanoseconds) corresponding to GPMC_CONFIG2:
- - gpmc,cs-on-ns:      Assertion time
- - gpmc,cs-rd-off-ns:  Read deassertion time
- - gpmc,cs-wr-off-ns:  Write deassertion time
-
- ADV signal timings (in nanoseconds) corresponding to GPMC_CONFIG3:
- - gpmc,adv-on-ns:     Assertion time
- - gpmc,adv-rd-off-ns: Read deassertion time
- - gpmc,adv-wr-off-ns: Write deassertion time
- - gpmc,adv-aad-mux-on-ns:     Assertion time for AAD
- - gpmc,adv-aad-mux-rd-off-ns: Read deassertion time for AAD
- - gpmc,adv-aad-mux-wr-off-ns: Write deassertion time for AAD
-
- WE signals timings (in nanoseconds) corresponding to GPMC_CONFIG4:
- - gpmc,we-on-ns       Assertion time
- - gpmc,we-off-ns:     Deassertion time
-
- OE signals timings (in nanoseconds) corresponding to GPMC_CONFIG4:
- - gpmc,oe-on-ns:      Assertion time
- - gpmc,oe-off-ns:     Deassertion time
- - gpmc,oe-aad-mux-on-ns:      Assertion time for AAD
- - gpmc,oe-aad-mux-off-ns:     Deassertion time for AAD
-
- Access time and cycle time timings (in nanoseconds) corresponding to
- GPMC_CONFIG5:
- - gpmc,page-burst-access-ns:  Multiple access word delay
- - gpmc,access-ns:             Start-cycle to first data valid delay
- - gpmc,rd-cycle-ns:           Total read cycle time
- - gpmc,wr-cycle-ns:           Total write cycle time
- - gpmc,bus-turnaround-ns:     Turn-around time between successive accesses
- - gpmc,cycle2cycle-delay-ns:  Delay between chip-select pulses
- - gpmc,clk-activation-ns:     GPMC clock activation time
- - gpmc,wait-monitoring-ns:    Start of wait monitoring with regard to valid
-                               data
-
-Boolean timing parameters. If property is present parameter enabled and
-disabled if omitted:
- - gpmc,adv-extra-delay:       ADV signal is delayed by half GPMC clock
- - gpmc,cs-extra-delay:                CS signal is delayed by half GPMC clock
- - gpmc,cycle2cycle-diffcsen:  Add "cycle2cycle-delay" between successive
-                               accesses to a different CS
- - gpmc,cycle2cycle-samecsen:  Add "cycle2cycle-delay" between successive
-                               accesses to the same CS
- - gpmc,oe-extra-delay:                OE signal is delayed by half GPMC clock
- - gpmc,we-extra-delay:                WE signal is delayed by half GPMC clock
- - gpmc,time-para-granularity: Multiply all access times by 2
-
-The following are only applicable to OMAP3+ and AM335x:
- - gpmc,wr-access-ns:          In synchronous write mode, for single or
-                               burst accesses, defines the number of
-                               GPMC_FCLK cycles from start access time
-                               to the GPMC_CLK rising edge used by the
-                               memory device for the first data capture.
- - gpmc,wr-data-mux-bus-ns:    In address-data multiplex mode, specifies
-                               the time when the first data is driven on
-                               the address-data bus.
-
-GPMC chip-select settings properties for child nodes. All are optional.
-
-- gpmc,burst-length    Page/burst length. Must be 4, 8 or 16.
-- gpmc,burst-wrap      Enables wrap bursting
-- gpmc,burst-read      Enables read page/burst mode
-- gpmc,burst-write     Enables write page/burst mode
-- gpmc,device-width    Total width of device(s) connected to a GPMC
-                       chip-select in bytes. The GPMC supports 8-bit
-                       and 16-bit devices and so this property must be
-                       1 or 2.
-- gpmc,mux-add-data    Address and data multiplexing configuration.
-                       Valid values are 1 for address-address-data
-                       multiplexing mode and 2 for address-data
-                       multiplexing mode.
-- gpmc,sync-read       Enables synchronous read. Defaults to asynchronous
-                       is this is not set.
-- gpmc,sync-write      Enables synchronous writes. Defaults to asynchronous
-                       is this is not set.
-- gpmc,wait-pin                Wait-pin used by client. Must be less than
-                       "gpmc,num-waitpins".
-- gpmc,wait-on-read    Enables wait monitoring on reads.
-- gpmc,wait-on-write   Enables wait monitoring on writes.
-
-Example for an AM33xx board:
-
-       gpmc: gpmc@50000000 {
-               compatible = "ti,am3352-gpmc";
-               ti,hwmods = "gpmc";
-               reg = <0x50000000 0x2000>;
-               interrupts = <100>;
-
-               gpmc,num-cs = <8>;
-               gpmc,num-waitpins = <2>;
-               #address-cells = <2>;
-               #size-cells = <1>;
-               ranges = <0 0 0x08000000 0x10000000>; /* CS0 @addr 0x8000000, size 0x10000000 */
-
-               /* child nodes go here */
-       };
index 0a175d9..a407462 100644 (file)
@@ -62,6 +62,7 @@ Required properties:
    display-timings are used instead.
 
 Optional properties (required if display-timings are used):
+ - ddc-i2c-bus: phandle of an I2C controller used for DDC EDID probing
  - display-timings : A node that describes the display timings as defined in
    Documentation/devicetree/bindings/display/display-timing.txt.
  - fsl,data-mapping : should be "spwg" or "jeida"
index ef37528..dd031fc 100644 (file)
@@ -33,7 +33,7 @@ gpio0: gpio0@1f860000 {
        gpio-controller;
        interrupt-controller;
        #interrupt-cells = <2>;
-       clocks = <&PBCLK4>;
+       clocks = <&rootclk PB4CLK>;
        microchip,gpio-bank = <0>;
        gpio-ranges = <&pic32_pinctrl 0 0 16>;
 };
diff --git a/Documentation/devicetree/bindings/memory-controllers/omap-gpmc.txt b/Documentation/devicetree/bindings/memory-controllers/omap-gpmc.txt
new file mode 100644 (file)
index 0000000..21055e2
--- /dev/null
@@ -0,0 +1,152 @@
+Device tree bindings for OMAP general purpose memory controllers (GPMC)
+
+The actual devices are instantiated from the child nodes of a GPMC node.
+
+Required properties:
+
+ - compatible:         Should be set to one of the following:
+
+                       ti,omap2420-gpmc (omap2420)
+                       ti,omap2430-gpmc (omap2430)
+                       ti,omap3430-gpmc (omap3430 & omap3630)
+                       ti,omap4430-gpmc (omap4430 & omap4460 & omap543x)
+                       ti,am3352-gpmc   (am335x devices)
+
+ - reg:                        A resource specifier for the register space
+                       (see the example below)
+ - ti,hwmods:          Should be set to "ti,gpmc" until the DT transition is
+                       completed.
+ - #address-cells:     Must be set to 2 to allow memory address translation
+ - #size-cells:                Must be set to 1 to allow CS address passing
+ - gpmc,num-cs:                The maximum number of chip-select lines that controller
+                       can support.
+ - gpmc,num-waitpins:  The maximum number of wait pins that controller can
+                       support.
+ - ranges:             Must be set up to reflect the memory layout with four
+                       integer values for each chip-select line in use:
+
+                          <cs-number> 0 <physical address of mapping> <size>
+
+                       Currently, calculated values derived from the contents
+                       of the per-CS register GPMC_CONFIG7 (as set up by the
+                       bootloader) are used for the physical address decoding.
+                       As this will change in the future, filling correct
+                       values here is a requirement.
+ - interrupt-controller: The GPMC driver implements an interrupt controller for
+                       the NAND events "fifoevent" and "termcount" plus the
+                       rising/falling edges on the GPMC_WAIT pins.
+                       The interrupt number mapping is as follows
+                       0 - NAND_fifoevent
+                       1 - NAND_termcount
+                       2 - GPMC_WAIT0 pin edge
+                       3 - GPMC_WAIT1 pin edge, and so on.
+ - #interrupt-cells:   Must be set to 2
+ - gpio-controller:    The GPMC driver implements a GPIO controller for the
+                       GPMC WAIT pins that can be used as general purpose inputs.
+                       0 maps to GPMC_WAIT0 pin.
+ - #gpio-cells:                Must be set to 2
+
+Timing properties for child nodes. All are optional and default to 0.
+
+ - gpmc,sync-clk-ps:   Minimum clock period for synchronous mode, in picoseconds
+
+ Chip-select signal timings (in nanoseconds) corresponding to GPMC_CONFIG2:
+ - gpmc,cs-on-ns:      Assertion time
+ - gpmc,cs-rd-off-ns:  Read deassertion time
+ - gpmc,cs-wr-off-ns:  Write deassertion time
+
+ ADV signal timings (in nanoseconds) corresponding to GPMC_CONFIG3:
+ - gpmc,adv-on-ns:     Assertion time
+ - gpmc,adv-rd-off-ns: Read deassertion time
+ - gpmc,adv-wr-off-ns: Write deassertion time
+ - gpmc,adv-aad-mux-on-ns:     Assertion time for AAD
+ - gpmc,adv-aad-mux-rd-off-ns: Read deassertion time for AAD
+ - gpmc,adv-aad-mux-wr-off-ns: Write deassertion time for AAD
+
+ WE signals timings (in nanoseconds) corresponding to GPMC_CONFIG4:
+ - gpmc,we-on-ns       Assertion time
+ - gpmc,we-off-ns:     Deassertion time
+
+ OE signals timings (in nanoseconds) corresponding to GPMC_CONFIG4:
+ - gpmc,oe-on-ns:      Assertion time
+ - gpmc,oe-off-ns:     Deassertion time
+ - gpmc,oe-aad-mux-on-ns:      Assertion time for AAD
+ - gpmc,oe-aad-mux-off-ns:     Deassertion time for AAD
+
+ Access time and cycle time timings (in nanoseconds) corresponding to
+ GPMC_CONFIG5:
+ - gpmc,page-burst-access-ns:  Multiple access word delay
+ - gpmc,access-ns:             Start-cycle to first data valid delay
+ - gpmc,rd-cycle-ns:           Total read cycle time
+ - gpmc,wr-cycle-ns:           Total write cycle time
+ - gpmc,bus-turnaround-ns:     Turn-around time between successive accesses
+ - gpmc,cycle2cycle-delay-ns:  Delay between chip-select pulses
+ - gpmc,clk-activation-ns:     GPMC clock activation time
+ - gpmc,wait-monitoring-ns:    Start of wait monitoring with regard to valid
+                               data
+
+Boolean timing parameters. If a property is present the parameter is enabled,
+and it is disabled if omitted:
+ - gpmc,adv-extra-delay:       ADV signal is delayed by half GPMC clock
+ - gpmc,cs-extra-delay:                CS signal is delayed by half GPMC clock
+ - gpmc,cycle2cycle-diffcsen:  Add "cycle2cycle-delay" between successive
+                               accesses to a different CS
+ - gpmc,cycle2cycle-samecsen:  Add "cycle2cycle-delay" between successive
+                               accesses to the same CS
+ - gpmc,oe-extra-delay:                OE signal is delayed by half GPMC clock
+ - gpmc,we-extra-delay:                WE signal is delayed by half GPMC clock
+ - gpmc,time-para-granularity: Multiply all access times by 2
+
+The following are only applicable to OMAP3+ and AM335x:
+ - gpmc,wr-access-ns:          In synchronous write mode, for single or
+                               burst accesses, defines the number of
+                               GPMC_FCLK cycles from start access time
+                               to the GPMC_CLK rising edge used by the
+                               memory device for the first data capture.
+ - gpmc,wr-data-mux-bus-ns:    In address-data multiplex mode, specifies
+                               the time when the first data is driven on
+                               the address-data bus.
+
+GPMC chip-select settings properties for child nodes. All are optional.
+
+- gpmc,burst-length    Page/burst length. Must be 4, 8 or 16.
+- gpmc,burst-wrap      Enables wrap bursting
+- gpmc,burst-read      Enables read page/burst mode
+- gpmc,burst-write     Enables write page/burst mode
+- gpmc,device-width    Total width of device(s) connected to a GPMC
+                       chip-select in bytes. The GPMC supports 8-bit
+                       and 16-bit devices and so this property must be
+                       1 or 2.
+- gpmc,mux-add-data    Address and data multiplexing configuration.
+                       Valid values are 1 for address-address-data
+                       multiplexing mode and 2 for address-data
+                       multiplexing mode.
+- gpmc,sync-read       Enables synchronous read. Defaults to asynchronous
+                       if this is not set.
+- gpmc,sync-write      Enables synchronous writes. Defaults to asynchronous
+                       if this is not set.
+- gpmc,wait-pin                Wait-pin used by client. Must be less than
+                       "gpmc,num-waitpins".
+- gpmc,wait-on-read    Enables wait monitoring on reads.
+- gpmc,wait-on-write   Enables wait monitoring on writes.
+
+Example for an AM33xx board:
+
+       gpmc: gpmc@50000000 {
+               compatible = "ti,am3352-gpmc";
+               ti,hwmods = "gpmc";
+               reg = <0x50000000 0x2000>;
+               interrupts = <100>;
+
+               gpmc,num-cs = <8>;
+               gpmc,num-waitpins = <2>;
+               #address-cells = <2>;
+               #size-cells = <1>;
+               ranges = <0 0 0x08000000 0x10000000>; /* CS0 @addr 0x8000000, size 0x10000000 */
+               interrupt-controller;
+               #interrupt-cells = <2>;
+               gpio-controller;
+               #gpio-cells = <2>;
+
+               /* child nodes go here */
+       };
index fc149f3..f080f06 100644 (file)
@@ -13,7 +13,7 @@ Required properties:
 - compatible : Should be "mti,cpu-interrupt-controller"
 
 Example devicetree:
-       cpu-irq: cpu-irq@0 {
+       cpu-irq: cpu-irq {
                #address-cells = <0>;
 
                interrupt-controller;
index 71ad57e..3149297 100644 (file)
@@ -20,7 +20,7 @@ Example:
                compatible = "microchip,pic32mzda-sdhci";
                reg = <0x1f8ec000 0x100>;
                interrupts = <191 IRQ_TYPE_LEVEL_HIGH>;
-               clocks = <&REFCLKO4>, <&PBCLK5>;
+               clocks = <&rootclk REF4CLK>, <&rootclk PB5CLK>;
                clock-names = "base_clk", "sys_clk";
                bus-width = <4>;
                cap-sd-highspeed;
index d53aba9..3e7ee99 100644 (file)
@@ -39,7 +39,7 @@ Optional properties:
 
 Nand Flash Controller(NFC) is an optional sub-node
 Required properties:
-- compatible : "atmel,sama5d3-nfc" or "atmel,sama5d4-nfc".
+- compatible : "atmel,sama5d3-nfc".
 - reg : should specify the address and size used for NFC command registers,
         NFC registers and NFC SRAM. NFC SRAM address and size can be absent
         if you don't want to use it.
index 0f6985b..7066597 100644 (file)
@@ -24,6 +24,7 @@ Required properties:
                          brcm,brcmnand-v5.0
                          brcm,brcmnand-v6.0
                          brcm,brcmnand-v6.1
+                         brcm,brcmnand-v6.2
                          brcm,brcmnand-v7.0
                          brcm,brcmnand-v7.1
                          brcm,brcmnand
index fb733c4..3ee7e20 100644 (file)
@@ -13,7 +13,11 @@ Documentation/devicetree/bindings/mtd/nand.txt
 
 Required properties:
 
- - reg:                The CS line the peripheral is connected to
+ - compatible: "ti,omap2-nand"
+ - reg:                range id (CS number), base offset and length of the
+               NAND I/O space
+ - interrupt-parent: must point to gpmc node
+ - interrupts: Two interrupt specifiers, one for fifoevent, one for termcount.
 
 Optional properties:
 
@@ -44,6 +48,7 @@ Optional properties:
                locating ECC errors for BCHx algorithms. SoC devices which have
                ELM hardware engines should specify this device node in .dtsi
                Using ELM for ECC error correction frees some CPU cycles.
+ - rb-gpios:   GPIO specifier for the ready/busy# pin.
 
 For inline partition table parsing (optional):
 
@@ -55,20 +60,26 @@ Example for an AM33xx board:
        gpmc: gpmc@50000000 {
                compatible = "ti,am3352-gpmc";
                ti,hwmods = "gpmc";
-               reg = <0x50000000 0x1000000>;
+               reg = <0x50000000 0x36c>;
                interrupts = <100>;
                gpmc,num-cs = <8>;
                gpmc,num-waitpins = <2>;
                #address-cells = <2>;
                #size-cells = <1>;
-               ranges = <0 0 0x08000000 0x2000>;       /* CS0: NAND */
+               ranges = <0 0 0x08000000 0x1000000>;    /* CS0 space, 16MB */
                elm_id = <&elm>;
+               interrupt-controller;
+               #interrupt-cells = <2>;
 
                nand@0,0 {
-                       reg = <0 0 0>; /* CS0, offset 0 */
+                       compatible = "ti,omap2-nand";
+                       reg = <0 0 4>;          /* CS0, offset 0, NAND I/O window 4 */
+                       interrupt-parent = <&gpmc>;
+                       interrupts = <0 IRQ_TYPE_NONE>, <1 IRQ_TYPE_NONE>;
                        nand-bus-width = <16>;
                        ti,nand-ecc-opt = "bch8";
                        ti,nand-xfer-type = "polled";
+                       rb-gpios = <&gpmc 0 GPIO_ACTIVE_HIGH>; /* gpmc_wait0 */
 
                        gpmc,sync-clk-ps = <0>;
                        gpmc,cs-on-ns = <0>;
index b53f92e..3733300 100644 (file)
@@ -1,8 +1,31 @@
-* MTD generic binding
+* NAND chip and NAND controller generic binding
+
+NAND controller/NAND chip representation:
+
+The NAND controller should be represented with its own DT node, and all
+NAND chips attached to this controller should be defined as children nodes
+of the NAND controller. This representation should be enforced even for
+simple controllers supporting only one chip.
+
+Mandatory NAND controller properties:
+- #address-cells: depends on your controller. Should at least be 1 to
+                 encode the CS line id.
+- #size-cells: depends on your controller. Put zero unless you need a
+              mapping between CS lines and dedicated memory regions
+
+Optional NAND controller properties:
+- ranges: only needed if you need to define a mapping between CS lines and
+         memory regions
+
+Optional NAND chip properties:
 
 - nand-ecc-mode : String, operation mode of the NAND ecc mode.
-  Supported values are: "none", "soft", "hw", "hw_syndrome", "hw_oob_first",
-  "soft_bch".
+                 Supported values are: "none", "soft", "hw", "hw_syndrome",
+                 "hw_oob_first".
+                 Deprecated values:
+                 "soft_bch": use "soft" and nand-ecc-algo instead
+- nand-ecc-algo: string, algorithm of NAND ECC.
+                Supported values are: "hamming", "bch".
 - nand-bus-width : 8 or 16 bus width if not present 8
 - nand-on-flash-bbt: boolean to enable on flash bbt option if not present false
 
@@ -19,3 +42,20 @@ errors per {size} bytes".
 The interpretation of these parameters is implementation-defined, so not all
 implementations must support all possible combinations. However, implementations
 are encouraged to further specify the value(s) they support.
+
+Example:
+
+       nand-controller {
+               #address-cells = <1>;
+               #size-cells = <0>;
+
+               /* controller specific properties */
+
+               nand@0 {
+                       reg = <0>;
+                       nand-ecc-mode = "soft";
+                       nand-ecc-algo = "bch";
+
+                       /* NAND chip specific properties */
+               };
+       };
index 4b5efa5..29b72e3 100644 (file)
@@ -34,7 +34,7 @@ pic32_pinctrl: pinctrl@1f801400{
        #size-cells = <1>;
        compatible = "microchip,pic32mzda-pinctrl";
        reg = <0x1f801400 0x400>;
-       clocks = <&PBCLK1>;
+       clocks = <&rootclk PB1CLK>;
 
        pinctrl_uart2: pinctrl_uart2 {
                uart2-tx {
index 65b38bf..7a34345 100644 (file)
@@ -20,7 +20,7 @@ Example:
                interrupts = <112 IRQ_TYPE_LEVEL_HIGH>,
                        <113 IRQ_TYPE_LEVEL_HIGH>,
                        <114 IRQ_TYPE_LEVEL_HIGH>;
-               clocks = <&PBCLK2>;
+               clocks = <&rootclk PB2CLK>;
                pinctrl-names = "default";
                pinctrl-0 = <&pinctrl_uart1
                                &pinctrl_uart1_cts
diff --git a/Documentation/devicetree/bindings/sound/max98371.txt b/Documentation/devicetree/bindings/sound/max98371.txt
new file mode 100644 (file)
index 0000000..6c28523
--- /dev/null
@@ -0,0 +1,17 @@
+max98371 codec
+
+This device supports I2C mode only.
+
+Required properties:
+
+- compatible : "maxim,max98371"
+- reg : The I2C address of the device
+
+Example:
+
+&i2c {
+       max98371: max98371@31 {
+               compatible = "maxim,max98371";
+               reg = <0x31>;
+       };
+};
index f205ce9..ac28cdb 100644 (file)
@@ -1,15 +1,16 @@
-MT8173 with RT5650 RT5676 CODECS
+MT8173 with RT5650 RT5676 CODECS and HDMI via I2S
 
 Required properties:
 - compatible : "mediatek,mt8173-rt5650-rt5676"
 - mediatek,audio-codec: the phandles of rt5650 and rt5676 codecs
+                       and of the hdmi encoder node
 - mediatek,platform: the phandle of MT8173 ASoC platform
 
 Example:
 
        sound {
                compatible = "mediatek,mt8173-rt5650-rt5676";
-               mediatek,audio-codec = <&rt5650 &rt5676>;
+               mediatek,audio-codec = <&rt5650 &rt5676 &hdmi0>;
                mediatek,platform = <&afe>;
        };
 
index fe5a5ef..5bfa6b6 100644 (file)
@@ -5,11 +5,21 @@ Required properties:
 - mediatek,audio-codec: the phandles of rt5650 codecs
 - mediatek,platform: the phandle of MT8173 ASoC platform
 
+Optional subnodes:
+- codec-capture : the subnode of rt5650 codec capture
+Required codec-capture subnode properties:
+- sound-dai: audio codec dai name on capture path
+  <&rt5650 0> : Default setting. Connect rt5650 I2S1 for capture. (dai_name = rt5645-aif1)
+  <&rt5650 1> : Connect rt5650 I2S2 for capture. (dai_name = rt5645-aif2)
+
 Example:
 
        sound {
                compatible = "mediatek,mt8173-rt5650";
                mediatek,audio-codec = <&rt5650>;
                mediatek,platform = <&afe>;
+               codec-capture {
+                       sound-dai = <&rt5650 1>;
+               };
        };
 
index 028fa1c..4d9a83d 100644 (file)
@@ -37,17 +37,18 @@ Required properties:
 
   - dai-name: DAI name that describes the IP.
 
+  - IP mode: IP working mode depending on associated codec.
+       "HDMI" connected to an HDMI codec and supports IEC HDMI formats (player only).
+       "SPDIF" connected to a SPDIF codec and supports SPDIF formats (player only).
+       "PCM" PCM standard mode for I2S or TDM bus.
+       "TDM" TDM mode for TDM bus.
+
 Required properties ("st,sti-uni-player" compatibility only):
   - clocks: CPU_DAI IP clock source, listed in the same order as the
            CPU_DAI properties.
 
   - uniperiph-id: internal SOC IP instance ID.
 
-  - IP mode: IP working mode depending on associated codec.
-       "HDMI" connected to HDMI codec IP and IEC HDMI formats.
-       "SPDIF"connected to SPDIF codec and support SPDIF formats.
-       "PCM"  PCM standard mode for I2S or TDM bus.
-
 Optional properties:
   - pinctrl-0: defined for CPU_DAI@1 and CPU_DAI@4 to describe I2S PIOs for
               external codecs connection.
@@ -56,6 +57,22 @@ Optional properties:
 
 Example:
 
+       sti_uni_player1: sti-uni-player@1 {
+               compatible = "st,sti-uni-player";
+               status = "okay";
+               #sound-dai-cells = <0>;
+               st,syscfg = <&syscfg_core>;
+               clocks = <&clk_s_d0_flexgen CLK_PCM_1>;
+               reg = <0x8D81000 0x158>;
+               interrupts = <GIC_SPI 85 IRQ_TYPE_NONE>;
+               dmas = <&fdma0 3 0 1>;
+               st,dai-name = "Uni Player #1 (I2S)";
+               dma-names = "tx";
+               st,uniperiph-id = <1>;
+               st,version = <5>;
+               st,mode = "TDM";
+       };
+
        sti_uni_player2: sti-uni-player@2 {
                compatible = "st,sti-uni-player";
                status = "okay";
@@ -65,7 +82,7 @@ Example:
                reg = <0x8D82000 0x158>;
                interrupts = <GIC_SPI 86 IRQ_TYPE_NONE>;
                dmas = <&fdma0 4 0 1>;
-               dai-name = "Uni Player #1 (DAC)";
+               dai-name = "Uni Player #2 (DAC)";
                dma-names = "tx";
                uniperiph-id = <2>;
                version = <5>;
@@ -82,7 +99,7 @@ Example:
                interrupts = <GIC_SPI 89 IRQ_TYPE_NONE>;
                dmas = <&fdma0 7 0 1>;
                dma-names = "tx";
-               dai-name = "Uni Player #1 (PIO)";
+               dai-name = "Uni Player #3 (SPDIF)";
                uniperiph-id = <3>;
                version = <5>;
                mode = "SPDIF";
@@ -99,6 +116,7 @@ Example:
                dma-names = "rx";
                dai-name = "Uni Reader #1 (HDMI RX)";
                version = <3>;
+               st,mode = "PCM";
        };
 
 2) sti-sas-codec: internal audio codec IPs driver
@@ -152,4 +170,20 @@ Example of audio card declaration:
                                sound-dai = <&sti_sasg_codec 0>;
                        };
                };
+               simple-audio-card,dai-link@2 {
+                       /* TDM playback  */
+                       format = "left_j";
+                       frame-inversion = <1>;
+                       cpu {
+                               sound-dai = <&sti_uni_player1>;
+                               dai-tdm-slot-num = <16>;
+                               dai-tdm-slot-width = <16>;
+                               dai-tdm-slot-tx-mask =
+                                       <1 1 1 1 0 0 0 0 0 0 1 1 0 0 1 1>;
+                       };
+
+                       codec {
+                               sound-dai = <&sti_sasg_codec 3>;
+                       };
+               };
        };
index 0ac31d8..b4959f1 100644 (file)
@@ -1,4 +1,4 @@
-Texas Instruments TAS5711/TAS5717/TAS5719 stereo power amplifiers
+Texas Instruments TAS5711/TAS5717/TAS5719/TAS5721 stereo power amplifiers
 
 The codec is controlled through an I2C interface.  It also has two other
 signals that can be wired up to GPIOs: reset (strongly recommended), and
@@ -6,7 +6,11 @@ powerdown (optional).
 
 Required properties:
 
-- compatible: "ti,tas5711", "ti,tas5717", or "ti,tas5719"
+- compatible: should be one of the following:
+  - "ti,tas5711",
+  - "ti,tas5717",
+  - "ti,tas5719",
+  - "ti,tas5721"
 - reg: The I2C address of the device
 - #sound-dai-cells: must be equal to 0
 
@@ -25,6 +29,8 @@ Optional properties:
 - PVDD_B-supply: regulator phandle for the PVDD_B supply (5711)
 - PVDD_C-supply: regulator phandle for the PVDD_C supply (5711)
 - PVDD_D-supply: regulator phandle for the PVDD_D supply (5711)
+- DRVDD-supply: regulator phandle for the DRVDD supply (5721)
+- PVDD-supply: regulator phandle for the PVDD supply (5721)
 
 Example:
 
diff --git a/Documentation/devicetree/bindings/sound/tas5720.txt b/Documentation/devicetree/bindings/sound/tas5720.txt
new file mode 100644 (file)
index 0000000..806ea73
--- /dev/null
@@ -0,0 +1,25 @@
+Texas Instruments TAS5720 Mono Audio amplifier
+
+The TAS5720 serial control bus communicates through the I2C protocol only. The
+serial bus is also used for periodic codec fault checking/reporting during
+audio playback. For more product information please see the links below:
+
+http://www.ti.com/product/TAS5720L
+http://www.ti.com/product/TAS5720M
+
+Required properties:
+
+- compatible : "ti,tas5720"
+- reg : I2C slave address
+- dvdd-supply : phandle to a 3.3-V supply for the digital circuitry
+- pvdd-supply : phandle to a supply used for the Class-D amp and the analog circuitry
+
+Example:
+
+tas5720: tas5720@6c {
+       status = "okay";
+       compatible = "ti,tas5720";
+       reg = <0x6c>;
+       dvdd-supply = <&vdd_3v3_reg>;
+       pvdd-supply = <&amp_supply_reg>;
+};
diff --git a/Documentation/devicetree/bindings/spi/microchip,spi-pic32.txt b/Documentation/devicetree/bindings/spi/microchip,spi-pic32.txt
new file mode 100644 (file)
index 0000000..79de379
--- /dev/null
@@ -0,0 +1,34 @@
+Microchip PIC32 SPI Master controller
+
+Required properties:
+- compatible: Should be "microchip,pic32mzda-spi".
+- reg: Address and length of register space for the device.
+- interrupts: Should contain all three spi interrupts in sequence
+              of <fault-irq>, <receive-irq>, <transmit-irq>.
+- interrupt-names: Should be "fault", "rx", "tx" in order.
+- clocks: Phandle of the clock generating SPI clock on the bus.
+- clock-names: Should be "mck0".
+- cs-gpios: Specifies the gpio pins to be used for chipselects.
+            See: Documentation/devicetree/bindings/spi/spi-bus.txt
+
+Optional properties:
+- dmas: Two or more DMA channel specifiers following the convention outlined
+        in Documentation/devicetree/bindings/dma/dma.txt
+- dma-names: Names for the dma channels. There must be at least one channel
+             named "spi-tx" for transmit and one named "spi-rx" for receive.
+
+Example:
+
+spi1: spi@1f821000 {
+        compatible = "microchip,pic32mzda-spi";
+        reg = <0x1f821000 0x200>;
+        interrupts = <109 IRQ_TYPE_LEVEL_HIGH>,
+                     <110 IRQ_TYPE_LEVEL_HIGH>,
+                     <111 IRQ_TYPE_LEVEL_HIGH>;
+        interrupt-names = "fault", "rx", "tx";
+        clocks = <&PBCLK2>;
+        clock-names = "mck0";
+        cs-gpios = <&gpio3 4 GPIO_ACTIVE_LOW>;
+        dmas = <&dma 134>, <&dma 135>;
+        dma-names = "spi-rx", "spi-tx";
+};
index 1ad0fe3..ff5893d 100644 (file)
@@ -16,8 +16,7 @@ Required properties:
 
 Optional property:
 - big-endian: If present the dspi device's registers are implemented
-  in big endian mode, otherwise in native mode(same with CPU), for more
-  detail please see: Documentation/devicetree/bindings/regmap/regmap.txt.
+  in big endian mode.
 
 Optional SPI slave node properties:
 - fsl,spi-cs-sck-delay: a delay in nanoseconds between activating chip
diff --git a/Documentation/devicetree/bindings/spi/sqi-pic32.txt b/Documentation/devicetree/bindings/spi/sqi-pic32.txt
new file mode 100644 (file)
index 0000000..c82d021
--- /dev/null
@@ -0,0 +1,18 @@
+Microchip PIC32 Quad SPI controller
+-----------------------------------
+Required properties:
+- compatible: Should be "microchip,pic32mzda-sqi".
+- reg: Address and length of SQI controller register space.
+- interrupts: Should contain SQI interrupt.
+- clocks: Should contain phandles of two clocks in sequence, one that drives
+          the clock on the SPI bus and another that drives the SQI controller.
+- clock-names: Should be "spi_ck" and "reg_ck" in order.
+
+Example:
+       sqi1: spi@1f8e2000 {
+               compatible = "microchip,pic32mzda-sqi";
+               reg = <0x1f8e2000 0x200>;
+               clocks = <&rootclk REF2CLK>, <&rootclk PB5CLK>;
+               clock-names = "spi_ck", "reg_ck";
+               interrupts = <169 IRQ_TYPE_LEVEL_HIGH>;
+       };
index 6908d3a..edebfa0 100644 (file)
@@ -26,6 +26,10 @@ Required properties :
     of this property. See <dt-bindings/thermal/tegra124-soctherm.h> for a
     list of valid values when referring to thermal sensors.
 
+Note:
+- trip points of the "critical" type are programmed into the SOC_THERM hardware
+as the shutdown temperature. Once the temperature of this thermal zone rises
+above it, the system will be shut down or reset by hardware.
 
 Example :
 
@@ -51,5 +55,13 @@ Example: referring to thermal sensors :
 
                         thermal-sensors =
                                 <&soctherm TEGRA124_SOCTHERM_SENSOR_CPU>;
+
+                       trips {
+                               cpu_shutdown_trip: shutdown-trip {
+                                       temperature = <102500>;
+                                       hysteresis = <1000>;
+                                       type = "critical";
+                               };
+                       };
                 };
        };
index e5ee3f1..a8e52c8 100644 (file)
@@ -11,7 +11,6 @@ Required properties:
                            - "renesas,thermal-r8a7791" (R-Car M2-W)
                            - "renesas,thermal-r8a7792" (R-Car V2H)
                            - "renesas,thermal-r8a7793" (R-Car M2-N)
-                           - "renesas,thermal-r8a7794" (R-Car E2)
 - reg                  : Address range of the thermal registers.
                          The 1st reg will be recognized as common register
                          if it has "interrupts".
diff --git a/Documentation/devicetree/bindings/thermal/tango-thermal.txt b/Documentation/devicetree/bindings/thermal/tango-thermal.txt
new file mode 100644 (file)
index 0000000..212198d
--- /dev/null
@@ -0,0 +1,17 @@
+* Tango Thermal
+
+The SMP8758 SoC includes 3 instances of this temperature sensor
+(in the CPU, video decoder, and PCIe controller).
+
+Required properties:
+- #thermal-sensor-cells: Should be 0 (see thermal.txt)
+- compatible: "sigma,smp8758-thermal"
+- reg: Address range of the thermal registers
+
+Example:
+
+       cpu_temp: thermal@920100 {
+               #thermal-sensor-cells = <0>;
+               compatible = "sigma,smp8758-thermal";
+               reg = <0x920100 12>;
+       };
diff --git a/Documentation/devicetree/bindings/thermal/thermal-generic-adc.txt b/Documentation/devicetree/bindings/thermal/thermal-generic-adc.txt
new file mode 100644 (file)
index 0000000..d723555
--- /dev/null
@@ -0,0 +1,89 @@
+General Purpose Analog To Digital Converter (ADC) based thermal sensor.
+
+On some platforms, a thermal sensor such as a thermistor is connected to one
+of the ADC channels, and the sensor resistance is read via the voltage across
+the sensor resistor. The voltage read across the sensor is mapped to
+temperature using a voltage-temperature lookup table.
+
+Required properties:
+===================
+- compatible:               Must be "generic-adc-thermal".
+- temperature-lookup-table:  Two-dimensional array of integers; a lookup table
+                            mapping ADC values to temperatures. When the ADC
+                            is read, the value is looked up in the table to
+                            get the equivalent temperature.
+                            The first value of each row is the temperature in
+                            millicelsius and the second value of each row is
+                            the ADC read value.
+- #thermal-sensor-cells:     Should be 1. See ./thermal.txt for a description
+                            of this property.
+
+Example :
+#include <dt-bindings/thermal/thermal.h>
+
+i2c@7000c400 {
+       ads1015: ads1015@4a {
+               reg = <0x4a>;
+               compatible = "ads1015";
+               sampling-frequency = <3300>;
+               #io-channel-cells = <1>;
+       };
+};
+
+tboard_thermistor: thermal-sensor {
+       compatible = "generic-adc-thermal";
+       #thermal-sensor-cells = <0>;
+       io-channels = <&ads1015 1>;
+       io-channel-names = "sensor-channel";
+       temperature-lookup-table = <    (-40000) 2578
+                                       (-39000) 2577
+                                       (-38000) 2576
+                                       (-37000) 2575
+                                       (-36000) 2574
+                                       (-35000) 2573
+                                       (-34000) 2572
+                                       (-33000) 2571
+                                       (-32000) 2569
+                                       (-31000) 2568
+                                       (-30000) 2567
+                                       ::::::::::
+                                       118000 254
+                                       119000 247
+                                       120000 240
+                                       121000 233
+                                       122000 226
+                                       123000 220
+                                       124000 214
+                                       125000 208>;
+};
+
+dummy_cool_dev: dummy-cool-dev {
+       compatible = "dummy-cooling-dev";
+       #cooling-cells = <2>; /* min followed by max */
+};
+
+thermal-zones {
+       Tboard {
+               polling-delay = <15000>; /* milliseconds */
+               polling-delay-passive = <0>; /* milliseconds */
+               thermal-sensors = <&tboard_thermistor>;
+
+               trips {
+                       therm_est_trip: therm_est_trip {
+                               temperature = <40000>;
+                               type = "active";
+                               hysteresis = <1000>;
+                       };
+               };
+
+               cooling-maps {
+                       map0 {
+                               trip = <&therm_est_trip>;
+                               cooling-device = <&dummy_cool_dev THERMAL_NO_LIMIT THERMAL_NO_LIMIT>;
+                               contribution = <100>;
+                       };
+
+               };
+       };
+};
index 8dab6fd..107280e 100644 (file)
@@ -5,10 +5,12 @@ Required properties:
 - reg : Should contain WDT registers location and length
 - interrupts : Should contain WDT interrupt
 
-Optional property:
+Optional properties:
 - big-endian: If present the watchdog device's registers are implemented
   in big endian mode, otherwise in native mode (same as the CPU), for more
   detail please see: Documentation/devicetree/bindings/regmap/regmap.txt.
+- fsl,ext-reset-output: If present the watchdog device is configured to
+  assert its external reset (WDOG_B) instead of issuing a software reset.
 
 Examples:
 
index 852f694..49485f8 100644 (file)
@@ -8,12 +8,12 @@ Required properties:
 - compatible: must be "microchip,pic32mzda-dmt".
 - reg: physical base address of the controller and length of memory mapped
   region.
-- clocks: phandle of parent clock (should be &PBCLK7).
+- clocks: phandle of source clk. Should be <&rootclk PB7CLK>.
 
 Example:
 
        watchdog@1f800a00 {
                compatible = "microchip,pic32mzda-dmt";
                reg = <0x1f800a00 0x80>;
-               clocks = <&PBCLK7>;
+               clocks = <&rootclk PB7CLK>;
        };
index d140103..f03a29a 100644 (file)
@@ -7,12 +7,12 @@ Required properties:
 - compatible: must be "microchip,pic32mzda-wdt".
 - reg: physical base address of the controller and length of memory mapped
   region.
-- clocks: phandle of source clk. should be <&LPRC> clk.
+- clocks: phandle of source clk. Should be <&rootclk LPRCCLK>.
 
 Example:
 
        watchdog@1f800800 {
                compatible = "microchip,pic32mzda-wdt";
                reg = <0x1f800800 0x200>;
-               clocks = <&LPRC>;
+               clocks = <&rootclk LPRCCLK>;
        };
diff --git a/Documentation/devicetree/bindings/watchdog/renesas-wdt.txt b/Documentation/devicetree/bindings/watchdog/renesas-wdt.txt
new file mode 100644 (file)
index 0000000..b9512f1
--- /dev/null
@@ -0,0 +1,25 @@
+Renesas Watchdog Timer (WDT) Controller
+
+Required properties:
+- compatible : Should be "renesas,r8a7795-wdt", or "renesas,rcar-gen3-wdt"
+
+  When compatible with the generic version, nodes must list the SoC-specific
+  version corresponding to the platform first, followed by the generic
+  version.
+
+- reg : Should contain WDT registers location and length
+- clocks : the clock feeding the watchdog timer.
+
+Optional properties:
+- timeout-sec : Contains the watchdog timeout in seconds
+- power-domains : the power domain the WDT belongs to
+
+Examples:
+
+       wdt0: watchdog@e6020000 {
+               compatible = "renesas,r8a7795-wdt", "renesas,rcar-gen3-wdt";
+               reg = <0 0xe6020000 0 0x0c>;
+               clocks = <&cpg CPG_MOD 402>;
+               power-domains = <&cpg>;
+               timeout-sec = <60>;
+       };
index 7bde640..ce4587d 100644 (file)
@@ -79,6 +79,38 @@ These filesystems may be used for inspiration:
 - ext4: the fourth extended filesystem, see Documentation/filesystems/ext4.txt
 
 
+Handling Media Errors
+---------------------
+
+The libnvdimm subsystem stores a record of known media error locations for
+each pmem block device (in gendisk->badblocks). If we fault at such a location,
+or one with a latent error not yet discovered, the application can expect
+to receive a SIGBUS. Libnvdimm also allows clearing of these errors by simply
+writing the affected sectors (through the pmem driver, and if the underlying
+NVDIMM supports the clear_poison DSM defined by ACPI).
+
+Since DAX IO normally doesn't go through the driver/bio path, applications or
+sysadmins have an option to restore the lost data from a prior backup/inbuilt
+redundancy in the following ways:
+
+1. Delete the affected file, and restore from a backup (sysadmin route):
+   This will free the file system blocks that were being used by the file,
+   and the next time they're allocated, they will be zeroed first, which
+   happens through the driver, and will clear bad sectors.
+
+2. Truncate or hole-punch the part of the file that has a bad-block (at least
+   an entire aligned sector has to be hole-punched, but not necessarily an
+   entire filesystem block).
+
+These are the two basic paths that allow DAX filesystems to continue operating
+in the presence of media errors. More robust error recovery mechanisms can be
+built on top of this in the future, for example, involving redundancy/mirroring
+provided at the block layer through DM, or additionally, at the filesystem
+level. These would have to rely on the above two tenets, that error clearing
+can happen either by sending an IO through the driver, or zeroing (also through
+the driver).
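
For illustration, a minimal userspace sketch of recovery path 2 above: punching
a hole over a known-bad range with fallocate(2). The file path, offset and
length are hypothetical and must be aligned to the filesystem's sector size.

	/* Illustrative sketch: free the blocks backing a bad range so the
	 * next allocation re-zeroes them through the pmem driver. */
	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <linux/falloc.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/mnt/dax/data.bin", O_RDWR);	/* hypothetical path */

		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* Punch a 4 KiB hole at offset 1 MiB, keeping the file size. */
		if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			      1024 * 1024, 4096) < 0)
			perror("fallocate");
		close(fd);
		return 0;
	}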
+
+
 Shortcomings
 ------------
 
index 30d2fcb..9f94fe2 100644 (file)
+Each mount of the devpts filesystem is now distinct such that ptys
+and their indices allocated in one mount are independent of ptys
+and their indices in all other mounts.
 
-To support containers, we now allow multiple instances of devpts filesystem,
-such that indices of ptys allocated in one instance are independent of indices
-allocated in other instances of devpts.
+All mounts of the devpts filesystem now create a /dev/pts/ptmx node
+with permissions 0000.
 
-To preserve backward compatibility, this support for multiple instances is
-enabled only if:
+To retain backwards compatibility, a ptmx device node (i.e. any node
+created with "mknod name c 5 2"), when opened, will look for an instance
+of devpts under the name "pts" in the same directory as the ptmx device
+node.
 
-       - CONFIG_DEVPTS_MULTIPLE_INSTANCES=y, and
-       - '-o newinstance' mount option is specified while mounting devpts
-
-IOW, devpts now supports both single-instance and multi-instance semantics.
-
-If CONFIG_DEVPTS_MULTIPLE_INSTANCES=n, there is no change in behavior and
-this referred to as the "legacy" mode. In this mode, the new mount options
-(-o newinstance and -o ptmxmode) will be ignored with a 'bogus option' message
-on console.
-
-If CONFIG_DEVPTS_MULTIPLE_INSTANCES=y and devpts is mounted without the
-'newinstance' option (as in current start-up scripts) the new mount binds
-to the initial kernel mount of devpts. This mode is referred to as the
-'single-instance' mode and the current, single-instance semantics are
-preserved, i.e PTYs are common across the system.
-
-The only difference between this single-instance mode and the legacy mode
-is the presence of new, '/dev/pts/ptmx' node with permissions 0000, which
-can safely be ignored.
-
-If CONFIG_DEVPTS_MULTIPLE_INSTANCES=y and 'newinstance' option is specified,
-the mount is considered to be in the multi-instance mode and a new instance
-of the devpts fs is created. Any ptys created in this instance are independent
-of ptys in other instances of devpts. Like in the single-instance mode, the
-/dev/pts/ptmx node is present. To effectively use the multi-instance mode,
-open of /dev/ptmx must be a redirected to '/dev/pts/ptmx' using a symlink or
-bind-mount.
-
-Eg: A container startup script could do the following:
-
-       $ chmod 0666 /dev/pts/ptmx
-       $ rm /dev/ptmx
-       $ ln -s pts/ptmx /dev/ptmx
-       $ ns_exec -cm /bin/bash
-
-       # We are now in new container
-
-       $ umount /dev/pts
-       $ mount -t devpts -o newinstance lxcpts /dev/pts
-       $ sshd -p 1234
-
-where 'ns_exec -cm /bin/bash' calls clone() with CLONE_NEWNS flag and execs
-/bin/bash in the child process.  A pty created by the sshd is not visible in
-the original mount of /dev/pts.
+As an option, instead of placing a ptmx device node at /dev/ptmx,
+it is possible to place a symlink to /dev/pts/ptmx at /dev/ptmx or
+to bind mount /dev/pts/ptmx to /dev/ptmx.  If you opt for using
+the devpts filesystem in this manner, devpts should be mounted with
+ptmxmode=0666, or "chmod 0666 /dev/pts/ptmx" should be called.
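
For illustration, a minimal C sketch of that setup (roughly what a container
init might do); error handling is trimmed and the max= value is only an
example:

	/* Illustrative sketch: mount a fresh devpts instance with a usable
	 * ptmx node and redirect /dev/ptmx to it. */
	#include <stdio.h>
	#include <sys/mount.h>
	#include <unistd.h>

	int main(void)
	{
		/* ptmxmode=0666 avoids a separate chmod of /dev/pts/ptmx. */
		if (mount("devpts", "/dev/pts", "devpts", 0,
			  "ptmxmode=0666,max=1024") < 0)
			perror("mount devpts");

		/* Opens of /dev/ptmx now reach this instance's ptmx node. */
		unlink("/dev/ptmx");
		if (symlink("pts/ptmx", "/dev/ptmx") < 0)
			perror("symlink");
		return 0;
	}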
 
 Total count of pty pairs in all instances is limited by sysctls:
 kernel.pty.max = 4096          - global limit
-kernel.pty.reserve = 1024      - reserve for initial instance
+kernel.pty.reserve = 1024      - reserved for filesystems mounted from the initial mount namespace
 kernel.pty.nr                  - current count of ptys
 
 Per-instance limit could be set by adding mount option "max=<count>".
 This feature was added in kernel 3.4 together with sysctl kernel.pty.reserve.
 In kernels older than 3.4 sysctl kernel.pty.max works as per-instance limit.
-
-User-space changes
-------------------
-
-In multi-instance mode (i.e '-o newinstance' mount option is specified at least
-once), following user-space issues should be noted.
-
-1. If -o newinstance mount option is never used, /dev/pts/ptmx can be ignored
-   and no change is needed to system-startup scripts.
-
-2. To effectively use multi-instance mode (i.e -o newinstance is specified)
-   administrators or startup scripts should "redirect" open of /dev/ptmx to
-   /dev/pts/ptmx using either a bind mount or symlink.
-
-       $ mount -t devpts -o newinstance devpts /dev/pts
-
-   followed by either
-
-       $ rm /dev/ptmx
-       $ ln -s pts/ptmx /dev/ptmx
-       $ chmod 666 /dev/pts/ptmx
-   or
-       $ mount -o bind /dev/pts/ptmx /dev/ptmx
-
-3. The '/dev/ptmx -> pts/ptmx' symlink is the preferred method since it
-   enables better error-reporting and treats both single-instance and
-   multi-instance mounts similarly.
-
-   But this method requires that system-startup scripts set the mode of
-   /dev/pts/ptmx correctly (default mode is 0000). The scripts can set the
-   mode by, either
-
-       - adding ptmxmode mount option to devpts entry in /etc/fstab, or
-       - using 'chmod 0666 /dev/pts/ptmx'
-
-4. If multi-instance mode mount is needed for containers, but the system
-   startup scripts have not yet been updated, container-startup scripts
-   should bind mount /dev/ptmx to /dev/pts/ptmx to avoid breaking single-
-   instance mounts.
-
-   Or, in general, container-startup scripts should use:
-
-       mount -t devpts -o newinstance -o ptmxmode=0666 devpts /dev/pts
-       if [ ! -L /dev/ptmx ]; then
-               mount -o bind /dev/pts/ptmx /dev/ptmx
-       fi
-
-   When all devpts mounts are multi-instance, /dev/ptmx can permanently be
-   a symlink to pts/ptmx and the bind mount can be ignored.
-
-5. A multi-instance mount that is not accompanied by the /dev/ptmx to
-   /dev/pts/ptmx redirection would result in an unusable/unreachable pty.
-
-       mount -t devpts -o newinstance lxcpts /dev/pts
-
-   immediately followed by:
-
-       open("/dev/ptmx")
-
-    would create a pty, say /dev/pts/7, in the initial kernel mount.
-    But /dev/pts/7 would be invisible in the new mount.
-
-6. The permissions for /dev/pts/ptmx node should be specified when mounting
-   /dev/pts, using the '-o ptmxmode=%o' mount option (default is 0000).
-
-       mount -t devpts -o newinstance -o ptmxmode=0644 devpts /dev/pts
-
-   The permissions can be later be changed as usual with 'chmod'.
-
-       chmod 666 /dev/pts/ptmx
-
-7. A mount of devpts without the 'newinstance' option results in binding to
-   initial kernel mount.  This behavior while preserving legacy semantics,
-   does not provide strict isolation in a container environment. i.e by
-   mounting devpts without the 'newinstance' option, a container could
-   get visibility into the 'host' or root container's devpts.
-   
-   To workaround this and have strict isolation, all mounts of devpts,
-   including the mount in the root container, should use the newinstance
-   option.
index 09bbf9a..c314bad 100644 (file)
@@ -1,30 +1,37 @@
        Locking scheme used for directory operations is based on two
-kinds of locks - per-inode (->i_mutex) and per-filesystem
+kinds of locks - per-inode (->i_rwsem) and per-filesystem
 (->s_vfs_rename_mutex).
 
-       When taking the i_mutex on multiple non-directory objects, we
+       When taking the i_rwsem on multiple non-directory objects, we
 always acquire the locks in order by increasing address.  We'll call
 that "inode pointer" order in the following.
 
 	For our purposes all operations fall in 6 classes:
 
 1) read access.  Locking rules: caller locks directory we are accessing.
+The lock is taken shared.
 
-2) object creation.  Locking rules: same as above.
+2) object creation.  Locking rules: same as above, but the lock is taken
+exclusive.
 
 3) object removal.  Locking rules: caller locks parent, finds victim,
-locks victim and calls the method.
+locks victim and calls the method.  Locks are exclusive.
 
 4) rename() that is _not_ cross-directory.  Locking rules: caller locks
-the parent and finds source and target.  If target already exists, lock
-it.  If source is a non-directory, lock it.  If that means we need to
-lock both, lock them in inode pointer order.
+the parent and finds source and target.  In case of exchange (with
+RENAME_EXCHANGE in rename2() flags argument) lock both.  In any case,
+if the target already exists, lock it.  If the source is a non-directory,
+lock it.  If we need to lock both, lock them in inode pointer order.
+Then call the method.  All locks are exclusive.
+NB: we might get away with locking the source (and target in exchange
+case) shared.
 
 5) link creation.  Locking rules:
        * lock parent
        * check that source is not a directory
        * lock source
        * call the method.
+All locks are exclusive.
 
 6) cross-directory rename.  The trickiest in the whole bunch.  Locking
 rules:
@@ -35,11 +42,12 @@ rules:
                fail with -ENOTEMPTY
        * if new parent is equal to or is a descendent of source
                fail with -ELOOP
-       * If target exists, lock it.  If source is a non-directory, lock
-         it.  In case that means we need to lock both source and target,
-         do so in inode pointer order.
+       * If it's an exchange, lock both the source and the target.
+       * If the target exists, lock it.  If the source is a non-directory,
+         lock it.  If we need to lock both, do so in inode pointer order.
        * call the method.
-
+All ->i_rwsem are taken exclusive.  Again, we might get away with locking
+the source (and target in exchange case) shared.
 
 The rules above obviously guarantee that all directories that are going to be
 read, modified or removed by method will be locked by caller.
@@ -73,7 +81,7 @@ objects - A < B iff A is an ancestor of B.
 attempt to acquire some lock and already holds at least one lock.  Let's
 consider the set of contended locks.  First of all, filesystem lock is
 not contended, since any process blocked on it is not holding any locks.
-Thus all processes are blocked on ->i_mutex.
+Thus all processes are blocked on ->i_rwsem.
 
        By (3), any process holding a non-directory lock can only be
 waiting on another non-directory lock with a larger address.  Therefore
index 2809145..d6259c7 100644 (file)
@@ -194,15 +194,6 @@ If a file with multiple hard links is copied up, then this will
 "break" the link.  Changes will not be propagated to other names
 referring to the same inode.
 
-Symlinks in /proc/PID/ and /proc/PID/fd which point to a non-directory
-object in overlayfs will not contain valid absolute paths, only
-relative paths leading up to the filesystem's root.  This will be
-fixed in the future.
-
-Some operations are not atomic, for example a crash during copy_up or
-rename will leave the filesystem in an inconsistent state.  This will
-be addressed in the future.
-
 Changes to underlying filesystems
 ---------------------------------
 
index 46f3bb7..a5fb89c 100644 (file)
@@ -578,3 +578,10 @@ in your dentry operations instead.
 --
 [mandatory]
        ->atomic_open() calls without O_CREAT may happen in parallel.
+--
+[mandatory]
+       ->setxattr() and xattr_handler.set() get dentry and inode passed separately.
+       dentry might be yet to be attached to inode, so do _not_ use its ->d_inode
+       in the instances.  Rationale: !@#!@# security_d_instantiate() needs to be
+       called before we attach dentry to inode and !@#!@##!@$!$#!@#$!@$!@$ smack
+       ->d_instantiate() uses not just ->getxattr() but ->setxattr() as well.
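
For illustration, a sketch of what this change implies for a filesystem's
handler; the prototype below is assumed from the note above and the helper
examplefs_do_set() is hypothetical:

	/* Sketch: operate on the inode passed to .set(); the dentry may not
	 * be attached to it yet, so do not dereference dentry->d_inode. */
	static int examplefs_xattr_set(const struct xattr_handler *handler,
				       struct dentry *dentry, struct inode *inode,
				       const char *name, const void *value,
				       size_t size, int flags)
	{
		/* examplefs_do_set() is a hypothetical helper taking the inode */
		return examplefs_do_set(inode, name, value, size, flags);
	}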
index f5b1fca..9ba6587 100644 (file)
@@ -5,17 +5,17 @@ Supported chips:
   * Maxim MAX34440
     Prefixes: 'max34440'
     Addresses scanned: -
-    Datasheet: http://datasheets.maxim-ic.com/en/ds/MAX34440.pdf
+    Datasheet: http://datasheets.maximintegrated.com/en/ds/MAX34440.pdf
   * Maxim MAX34441
     PMBus 5-Channel Power-Supply Manager and Intelligent Fan Controller
     Prefixes: 'max34441'
     Addresses scanned: -
-    Datasheet: http://datasheets.maxim-ic.com/en/ds/MAX34441.pdf
+    Datasheet: http://datasheets.maximintegrated.com/en/ds/MAX34441.pdf
   * Maxim MAX34446
     PMBus Power-Supply Data Logger
     Prefixes: 'max34446'
     Addresses scanned: -
-    Datasheet: http://datasheets.maxim-ic.com/en/ds/MAX34446.pdf
+    Datasheet: http://datasheets.maximintegrated.com/en/ds/MAX34446.pdf
   * Maxim MAX34460
     PMBus 12-Channel Voltage Monitor & Sequencer
     Prefix: 'max34460'
index 3ecf0c3..45bcafe 100644 (file)
@@ -56,6 +56,18 @@ SYSFS FILES
   ports/1/pkeys/10 contains the value at index 10 in port 1's P_Key
   table.
 
+  There is an optional "hw_counters" subdirectory that may be under either
+  the parent device or the port subdirectories or both.  If present,
+  there is a list of counters provided by the hardware.  They may match
+  some of the counters in the counters directory, but they often include
+  many other counters.  In addition to the various counters, there will
+  be a file named "lifespan" that configures how frequently the core
+  should update the counters when they are being accessed (counters are
+  not updated if they are not being accessed).  The lifespan is in milli-
+  seconds and defaults to 10 unless set to something else by the driver.
+  Users may echo a value between 0 and 10000 to the lifespan file to set
+  the length of time between updates in milliseconds.
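
For illustration, a small C sketch that sets the update interval to 400 ms;
the device name and port in the sysfs path are hypothetical:

	/* Illustrative sketch: write a new lifespan (in milliseconds) for
	 * the hw_counters of port 1 of a hypothetical device "mlx4_0". */
	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/sys/class/infiniband/mlx4_0/ports/1/hw_counters/lifespan", "w");

		if (!f) {
			perror("fopen");
			return 1;
		}
		fprintf(f, "400\n");	/* any value from 0 to 10000 */
		fclose(f);
		return 0;
	}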
+
 MTHCA
 
   The Mellanox HCA driver also creates the files:
index c52856d..db10185 100644 (file)
@@ -241,9 +241,8 @@ comment "module support disabled"
        depends on !MODULES
 
 MODVERSIONS directly depends on MODULES, this means it's only visible if
-MODULES is different from 'n'. The comment on the other hand is always
-visible when MODULES is visible (the (empty) dependency of MODULES is
-also part of the comment dependencies).
+MODULES is different from 'n'. The comment on the other hand is only
+visible when MODULES is set to 'n'.
 
 
 Kconfig syntax
@@ -285,12 +284,17 @@ choices:
        "endchoice"
 
 This defines a choice group and accepts any of the above attributes as
-options. A choice can only be of type bool or tristate, while a boolean
-choice only allows a single config entry to be selected, a tristate
-choice also allows any number of config entries to be set to 'm'. This
-can be used if multiple drivers for a single hardware exists and only a
-single driver can be compiled/loaded into the kernel, but all drivers
-can be compiled as modules.
+options. A choice can only be of type bool or tristate.  If no type is
+specified for a choice, its type will be determined by the type of
+the first choice element in the group, or remain unknown if none of
+the choice elements have a type specified either.
+
+While a boolean choice only allows a single config entry to be
+selected, a tristate choice also allows any number of config entries
+to be set to 'm'. This can be used if multiple drivers for a single
+hardware exists and only a single driver can be compiled/loaded into
+the kernel, but all drivers can be compiled as modules.
+
 A choice accepts another option "optional", which allows to set the
 choice to 'n' and no entry needs to be selected.
 If no [symbol] is associated with a choice, then you can not have multiple
index 35f6a98..220d0a8 100644 (file)
@@ -170,21 +170,92 @@ document trapinfo
        address the kernel panicked.
 end
 
+define dump_log_idx
+       set $idx = $arg0
+       if ($argc > 1)
+               set $prev_flags = $arg1
+       else
+               set $prev_flags = 0
+       end
+       set $msg = ((struct printk_log *) (log_buf + $idx))
+       set $prefix = 1
+       set $newline = 1
+       set $log = log_buf + $idx + sizeof(*$msg)
 
-define dmesg
-       set $i = 0
-       set $end_idx = (log_end - 1) & (log_buf_len - 1)
+       # prev & LOG_CONT && !(msg->flags & LOG_PREFIX)
+       if (($prev_flags & 8) && !($msg->flags & 4))
+               set $prefix = 0
+       end
+
+       # msg->flags & LOG_CONT
+       if ($msg->flags & 8)
+               # (prev & LOG_CONT && !(prev & LOG_NEWLINE))
+               if (($prev_flags & 8) && !($prev_flags & 2))
+                       set $prefix = 0
+               end
+               # (!(msg->flags & LOG_NEWLINE))
+               if (!($msg->flags & 2))
+                       set $newline = 0
+               end
+       end
+
+       if ($prefix)
+               printf "[%5lu.%06lu] ", $msg->ts_nsec / 1000000000, $msg->ts_nsec % 1000000000
+       end
+       if ($msg->text_len != 0)
+               eval "printf \"%%%d.%ds\", $log", $msg->text_len, $msg->text_len
+       end
+       if ($newline)
+               printf "\n"
+       end
+       if ($msg->dict_len > 0)
+               set $dict = $log + $msg->text_len
+               set $idx = 0
+               set $line = 1
+               while ($idx < $msg->dict_len)
+                       if ($line)
+                               printf " "
+                               set $line = 0
+                       end
+                       set $c = $dict[$idx]
+                       if ($c == '\0')
+                               printf "\n"
+                               set $line = 1
+                       else
+                               if ($c < ' ' || $c >= 127 || $c == '\\')
+                                       printf "\\x%02x", $c
+                               else
+                                       printf "%c", $c
+                               end
+                       end
+                       set $idx = $idx + 1
+               end
+               printf "\n"
+       end
+end
+document dump_log_idx
+       Dump a single log given its index in the log buffer.  The first
+       parameter is the index into log_buf, the second is optional and
+       specifies the previous log entry's flags, used for properly
+       formatting continued lines.
+end
 
-       while ($i < logged_chars)
-               set $idx = (log_end - 1 - logged_chars + $i) & (log_buf_len - 1)
+define dmesg
+       set $i = log_first_idx
+       set $end_idx = log_first_idx
+       set $prev_flags = 0
 
-               if ($idx + 100 <= $end_idx) || \
-                  ($end_idx <= $idx && $idx + 100 < log_buf_len)
-                       printf "%.100s", &log_buf[$idx]
-                       set $i = $i + 100
+       while (1)
+               set $msg = ((struct printk_log *) (log_buf + $i))
+               if ($msg->len == 0)
+                       set $i = 0
                else
-                       printf "%c", log_buf[$idx]
-                       set $i = $i + 1
+                       dump_log_idx $i $prev_flags
+                       set $i = $i + $msg->len
+                       set $prev_flags = $msg->flags
+               end
+               if ($i == $end_idx)
+                       loop_break
                end
        end
 end
index 631b0f7..9d05ed7 100644 (file)
@@ -369,8 +369,6 @@ does not allocate any driver private context space.
 Switch configuration
 --------------------
 
-- priv_size: additional size needed by the switch driver for its private context
-
 - tag_protocol: this is to indicate what kind of tagging protocol is supported,
   should be a valid value from the dsa_tag_protocol enum
 
@@ -416,11 +414,6 @@ PHY devices and link management
   to the switch port MDIO registers. If unavailable return a negative error
   code.
 
-- poll_link: Function invoked by DSA to query the link state of the switch
-  builtin Ethernet PHYs, per port. This function is responsible for calling
-  netif_carrier_{on,off} when appropriate, and can be used to poll all ports in a
-  single call. Executes from workqueue context.
-
 - adjust_link: Function invoked by the PHY library when a slave network device
   is attached to a PHY device. This function is responsible for appropriately
   configuring the switch port link parameters: speed, duplex, pause based on
@@ -542,6 +535,16 @@ Bridge layer
 Bridge VLAN filtering
 ---------------------
 
+- port_vlan_filtering: bridge layer function invoked when the bridge gets
+  configured for turning on or off VLAN filtering. If nothing specific needs to
+  be done at the hardware level, this callback does not need to be implemented.
+  When VLAN filtering is turned on, the hardware must be programmed to reject
+  802.1Q frames whose VLAN IDs fall outside of the programmed allowed VLAN ID
+  map/rules.  If there is no PVID programmed into the switch port, untagged
+  frames must be rejected as well. When turned off, the switch must accept
+  any 802.1Q frame irrespective of its VLAN ID, and untagged frames are
+  allowed.
+
 - port_vlan_prepare: bridge layer function invoked when the bridge prepares the
   configuration of a VLAN on the given port. If the operation is not supported
   by the hardware, this function should return -EOPNOTSUPP to inform the bridge
index 6c7f365..9ae9293 100644 (file)
@@ -1036,15 +1036,17 @@ proxy_arp_pvlan - BOOLEAN
 
 shared_media - BOOLEAN
        Send(router) or accept(host) RFC1620 shared media redirects.
-       Overrides ip_secure_redirects.
+       Overrides secure_redirects.
        shared_media for the interface will be enabled if at least one of
        conf/{all,interface}/shared_media is set to TRUE,
        it will be disabled otherwise
        default TRUE
 
 secure_redirects - BOOLEAN
-       Accept ICMP redirect messages only for gateways,
-       listed in default gateway list.
+       Accept ICMP redirect messages only to gateways listed in the
+       interface's current gateway list. Even if disabled, RFC1122 redirect
+       rules still apply.
+       Overridden by shared_media.
        secure_redirects for the interface will be enabled if at least one of
        conf/{all,interface}/secure_redirects is set to TRUE,
        it will be disabled otherwise
index ca895fd..789b27c 100644 (file)
@@ -42,9 +42,26 @@ variants of these functions, devm_pwm_get() and devm_pwm_put(), also exist.
 
 After being requested, a PWM has to be configured using:
 
-int pwm_config(struct pwm_device *pwm, int duty_ns, int period_ns);
+int pwm_apply_state(struct pwm_device *pwm, struct pwm_state *state);
 
-To start/stop toggling the PWM output use pwm_enable()/pwm_disable().
+This API controls both the PWM period/duty_cycle config and the
+enable/disable state.
+
+The pwm_config(), pwm_enable() and pwm_disable() functions are just wrappers
+around pwm_apply_state() and should not be used if the user wants to change
+several parameters at once. For example, if you see pwm_config() and
+pwm_{enable,disable}() calls in the same function, this probably means you
+should switch to pwm_apply_state().
+
+The PWM user API also allows one to query the PWM state with pwm_get_state().
+
+In addition to the PWM state, the PWM API also exposes PWM arguments, which
+are the reference PWM config one should use on this PWM.
+PWM arguments are usually platform-specific and allow the PWM user to only
+care about the duty cycle relative to the full period (e.g. duty = 50% of
+the period). struct pwm_args contains 2 fields (period and polarity) and
+should be used to set the initial PWM config (usually done in the probe
+function of the PWM user). PWM arguments are retrieved with pwm_get_args().
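
A minimal consumer sketch of the calls above, assuming a struct pwm_device
that was already obtained with pwm_get() (the function name below is
illustrative, not taken from the patch):

    #include <linux/pwm.h>

    static int pwm_user_example(struct pwm_device *pwm)
    {
            struct pwm_args args;
            struct pwm_state state;

            pwm_get_args(pwm, &args);       /* platform reference config */
            pwm_get_state(pwm, &state);     /* current state as a base */

            state.period = args.period;
            state.polarity = args.polarity;
            state.duty_cycle = args.period / 2;     /* 50% duty cycle */
            state.enabled = true;

            /* Apply period, duty cycle and enable state in a single call. */
            return pwm_apply_state(pwm, &state);
    }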
 
 Using PWMs with the sysfs interface
 -----------------------------------
@@ -105,6 +122,15 @@ goes low for the remainder of the period. Conversely, a signal with inversed
 polarity starts low for the duration of the duty cycle and goes high for the
 remainder of the period.
 
+Drivers are encouraged to implement ->apply() instead of the legacy
+->enable(), ->disable() and ->config() methods. Doing that should provide
+atomicity in the PWM config workflow, which is required when the PWM controls
+a critical device (like a regulator).
+
+The implementation of ->get_state() (a method used to retrieve initial PWM
+state) is also encouraged for the same reason: letting the PWM user know
+about the current PWM state allows it to avoid glitches.
+
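
A hedged driver-side sketch (the names are illustrative and the hardware
programming is only indicated by a comment), assuming ->apply() receives the
chip, the device and the requested state:

    #include <linux/module.h>
    #include <linux/pwm.h>

    static int example_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
                                 struct pwm_state *state)
    {
            /*
             * A real driver would translate state->period, state->duty_cycle,
             * state->polarity and state->enabled into register writes here,
             * committing everything at once so the output never goes through
             * an intermediate, glitchy configuration.
             */
            return 0;
    }

    static const struct pwm_ops example_pwm_ops = {
            .apply = example_pwm_apply,
            .owner = THIS_MODULE,
    };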
 Locking
 -------
 
diff --git a/Documentation/scsi/tcm_qla2xxx.txt b/Documentation/scsi/tcm_qla2xxx.txt
new file mode 100644 (file)
index 0000000..c3a670a
--- /dev/null
@@ -0,0 +1,22 @@
+tcm_qla2xxx jam_host attribute
+------------------------------
+There is now a new module endpoint attribute called jam_host
+attribute: jam_host: boolean=0/1
+This attribute and the accompanying code are only included if the
+Kconfig parameter TCM_QLA2XXX_DEBUG is set to Y.
+By default this jammer code and functionality is disabled.
+
+Use this attribute to control the discarding of SCSI commands to a
+selected host.
+This may be useful for testing error handling and simulating slow drain
+and other fabric issues.
+
+Setting a boolean of 1 for the jam_host attribute of a particular host
+will discard the commands for that host.
+Reset it back to 0 to stop the jamming.
+
+Enable host 4 to be jammed
+echo 1 > /sys/kernel/config/target/qla2xxx/21:00:00:24:ff:27:8f:ae/tpgt_1/attrib/jam_host
+
+Disable jamming on host 4
+echo 0 > /sys/kernel/config/target/qla2xxx/21:00:00:24:ff:27:8f:ae/tpgt_1/attrib/jam_host
index 20d0571..3849814 100644 (file)
@@ -826,7 +826,8 @@ The keyctl syscall functions are:
  (*) Compute a Diffie-Hellman shared secret or public key
 
        long keyctl(KEYCTL_DH_COMPUTE, struct keyctl_dh_params *params,
-                  char *buffer, size_t buflen);
+                  char *buffer, size_t buflen,
+                  void *reserved);
 
      The params struct contains serial numbers for three keys:
 
@@ -843,6 +844,8 @@ The keyctl syscall functions are:
      public key.  If the base is the remote public key, the result is
      the shared secret.
 
+     The reserved argument must be set to NULL.
+
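
     A minimal userspace sketch of this call (the function name is
     illustrative and the key serial numbers are assumed to have been
     created beforehand, e.g. with add_key()), showing the mandatory NULL
     reserved argument:

        #include <linux/keyctl.h>
        #include <stddef.h>
        #include <stdint.h>
        #include <sys/syscall.h>
        #include <unistd.h>

        static long dh_compute(int32_t priv, int32_t prime, int32_t base,
                               char *buf, size_t buflen)
        {
                struct keyctl_dh_params params = {
                        .private = priv,
                        .prime   = prime,
                        .base    = base,
                };

                /* The final (reserved) argument must be NULL. */
                return syscall(__NR_keyctl, KEYCTL_DH_COMPUTE, &params,
                               buf, buflen, NULL);
        }
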
      The buffer length must be at least the length of the prime, or zero.
 
      If the buffer length is nonzero, the length of the result is
index daabdd7..a3683ce 100644 (file)
@@ -61,6 +61,7 @@ show up in /proc/sys/kernel:
 - perf_cpu_time_max_percent
 - perf_event_paranoid
 - perf_event_max_stack
+- perf_event_max_contexts_per_stack
 - pid_max
 - powersave-nap               [ PPC only ]
 - printk
@@ -668,6 +669,19 @@ The default value is 127.
 
 ==============================================================
 
+perf_event_max_contexts_per_stack:
+
+Controls the maximum number of stack frame context entries for
+(attr.sample_type & PERF_SAMPLE_CALLCHAIN) configured events, for
+instance, when using 'perf record -g' or 'perf trace --call-graph fp'.
+
+This can only be done when no events are in use that have callchains
+enabled, otherwise writing to this file will return -EBUSY.
+
+The default value is 8.
+
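For example, assuming the usual procfs mount point, the limit could be
raised to 16 entries (while no callchain events are in use) with:

    echo 16 > /proc/sys/kernel/perf_event_max_contexts_per_stack
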
+==============================================================
+
 pid_max:
 
 PID allocation wrap value.  When the kernel's next PID value
index 7d370c9..94bf694 100755 (executable)
@@ -294,8 +294,6 @@ def tcm_mod_build_configfs(proto_ident, fabric_mod_dir_var, fabric_mod_name):
        buf += "        .tpg_check_prod_mode_write_protect = " + fabric_mod_name + "_check_false,\n"
        buf += "        .tpg_get_inst_index             = " + fabric_mod_name + "_tpg_get_inst_index,\n"
        buf += "        .release_cmd                    = " + fabric_mod_name + "_release_cmd,\n"
-       buf += "        .shutdown_session               = " + fabric_mod_name + "_shutdown_session,\n"
-       buf += "        .close_session                  = " + fabric_mod_name + "_close_session,\n"
        buf += "        .sess_get_index                 = " + fabric_mod_name + "_sess_get_index,\n"
        buf += "        .sess_get_initiator_sid         = NULL,\n"
        buf += "        .write_pending                  = " + fabric_mod_name + "_write_pending,\n"
@@ -467,20 +465,6 @@ def tcm_mod_dump_fabric_ops(proto_ident, fabric_mod_dir_var, fabric_mod_name):
                        buf += "}\n\n"
                        bufi += "void " + fabric_mod_name + "_release_cmd(struct se_cmd *);\n"
 
-               if re.search('shutdown_session\)\(', fo):
-                       buf += "int " + fabric_mod_name + "_shutdown_session(struct se_session *se_sess)\n"
-                       buf += "{\n"
-                       buf += "        return 0;\n"
-                       buf += "}\n\n"
-                       bufi += "int " + fabric_mod_name + "_shutdown_session(struct se_session *);\n"
-
-               if re.search('close_session\)\(', fo):
-                       buf += "void " + fabric_mod_name + "_close_session(struct se_session *se_sess)\n"
-                       buf += "{\n"
-                       buf += "        return;\n"
-                       buf += "}\n\n"
-                       bufi += "void " + fabric_mod_name + "_close_session(struct se_session *);\n"
-
                if re.search('sess_get_index\)\(', fo):
                        buf += "u32 " + fabric_mod_name + "_sess_get_index(struct se_session *se_sess)\n"
                        buf += "{\n"
index ed419d6..efc3f3d 100644 (file)
@@ -69,8 +69,8 @@ temperature) and throttle appropriate devices.
 1.1.2 void thermal_zone_device_unregister(struct thermal_zone_device *tz)
 
     This interface function removes the thermal zone device.
-    It deletes the corresponding entry form /sys/class/thermal folder and
-    unbind all the thermal cooling devices it uses.
+    It deletes the corresponding entry from /sys/class/thermal folder and
+    unbinds all the thermal cooling devices it uses.
 
 1.1.3 struct thermal_zone_device *thermal_zone_of_sensor_register(
                struct device *dev, int sensor_id, void *data,
@@ -146,32 +146,32 @@ temperature) and throttle appropriate devices.
 
     This interface function adds a new thermal cooling device (fan/processor/...)
     to /sys/class/thermal/ folder as cooling_device[0-*]. It tries to bind itself
-    to all the thermal zone devices register at the same time.
+    to all the thermal zone devices registered at the same time.
     name: the cooling device name.
     devdata: device private data.
     ops: thermal cooling devices call-backs.
        .get_max_state: get the Maximum throttle state of the cooling device.
-       .get_cur_state: get the Current throttle state of the cooling device.
+       .get_cur_state: get the Currently requested throttle state of the cooling device.
        .set_cur_state: set the Current throttle state of the cooling device.
 
 1.2.2 void thermal_cooling_device_unregister(struct thermal_cooling_device *cdev)
 
-    This interface function remove the thermal cooling device.
-    It deletes the corresponding entry form /sys/class/thermal folder and
-    unbind itself from all the thermal zone devices using it.
+    This interface function removes the thermal cooling device.
+    It deletes the corresponding entry from /sys/class/thermal folder and
+    unbinds itself from all the thermal zone devices using it.
 
 1.3 interface for binding a thermal zone device with a thermal cooling device
 1.3.1 int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz,
        int trip, struct thermal_cooling_device *cdev,
        unsigned long upper, unsigned long lower, unsigned int weight);
 
-    This interface function bind a thermal cooling device to the certain trip
+    This interface function binds a thermal cooling device to a particular trip
     point of a thermal zone device.
     This function is usually called in the thermal zone device .bind callback.
     tz: the thermal zone device
     cdev: thermal cooling device
-    trip: indicates which trip point the cooling devices is associated with
-         in this thermal zone.
+    trip: indicates which trip point in this thermal zone the cooling device
+          is associated with.
     upper:the Maximum cooling state for this trip point.
           THERMAL_NO_LIMIT means no upper limit,
          and the cooling device can be in max_state.
@@ -184,13 +184,13 @@ temperature) and throttle appropriate devices.
 1.3.2 int thermal_zone_unbind_cooling_device(struct thermal_zone_device *tz,
                int trip, struct thermal_cooling_device *cdev);
 
-    This interface function unbind a thermal cooling device from the certain
+    This interface function unbinds a thermal cooling device from a particular
     trip point of a thermal zone device. This function is usually called in
     the thermal zone device .unbind callback.
     tz: the thermal zone device
     cdev: thermal cooling device
-    trip: indicates which trip point the cooling devices is associated with
-         in this thermal zone.
+    trip: indicates which trip point in this thermal zone the cooling device
+          is associated with.
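
    A minimal sketch of a thermal zone .bind callback using the interface
    from 1.3.1, assuming every cooling device should be bound to trip point
    0 with no state limits and the default weight (the function name is
    illustrative):

        #include <linux/thermal.h>

        static int example_tz_bind(struct thermal_zone_device *tz,
                                   struct thermal_cooling_device *cdev)
        {
                /* Trip 0, unrestricted cooling states, default weight. */
                return thermal_zone_bind_cooling_device(tz, 0, cdev,
                                                        THERMAL_NO_LIMIT,
                                                        THERMAL_NO_LIMIT,
                                                        THERMAL_WEIGHT_DEFAULT);
        }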
 
 1.4 Thermal Zone Parameters
 1.4.1 struct thermal_bind_params
@@ -210,13 +210,13 @@ temperature) and throttle appropriate devices.
                this thermal zone and cdev, for a particular trip point.
                If nth bit is set, then the cdev and thermal zone are bound
                for trip point n.
-    .limits: This is an array of cooling state limits. Must have exactly
-         2 * thermal_zone.number_of_trip_points. It is an array consisting
-         of tuples <lower-state upper-state> of state limits. Each trip
-         will be associated with one state limit tuple when binding.
-         A NULL pointer means <THERMAL_NO_LIMITS THERMAL_NO_LIMITS>
-         on all trips. These limits are used when binding a cdev to a
-         trip point.
+    .binding_limits: This is an array of cooling state limits. Must have
+                     exactly 2 * thermal_zone.number_of_trip_points. It is an
+                     array consisting of tuples <lower-state upper-state> of
+                     state limits. Each trip will be associated with one state
+                     limit tuple when binding. A NULL pointer means
+                     <THERMAL_NO_LIMITS THERMAL_NO_LIMITS> on all trips.
+                     These limits are used when binding a cdev to a trip point.
     .match: This call back returns success(0) if the 'tz and cdev' need to
            be bound, as per platform data.
 1.4.2 struct thermal_zone_params
@@ -351,8 +351,8 @@ cdev[0-*]
        RO, Optional
 
 cdev[0-*]_trip_point
-       The trip point with which cdev[0-*] is associated in this thermal
-       zone; -1 means the cooling device is not associated with any trip
+       The trip point in this thermal zone which cdev[0-*] is associated
+       with; -1 means the cooling device is not associated with any trip
        point.
        RO, Optional
 
index 9488078..a40398c 100644 (file)
@@ -1,64 +1,67 @@
-Last reviewed: 06/02/2009
+Last reviewed: 04/04/2016
 
-                     HP iLO2 NMI Watchdog Driver
-              NMI sourcing for iLO2 based ProLiant Servers
+                     HPE iLO NMI Watchdog Driver
+              NMI sourcing for iLO based ProLiant Servers
                      Documentation and Driver by
-              Thomas Mingarelli <thomas.mingarelli@hp.com>
+              Thomas Mingarelli <thomas.mingarelli@hpe.com>
 
- The HP iLO2 NMI Watchdog driver is a kernel module that provides basic
+ The HPE iLO NMI Watchdog driver is a kernel module that provides basic
  watchdog functionality and the added benefit of NMI sourcing. Both the
  watchdog functionality and the NMI sourcing capability need to be enabled
  by the user. Remember that the two modes are not dependent on one another.
  A user can have the NMI sourcing without the watchdog timer and vice-versa.
+ All references to iLO in this document imply it also works on iLO2 and all
+ subsequent generations.
 
  Watchdog functionality is enabled like any other common watchdog driver. That
  is, an application needs to be started that kicks off the watchdog timer. A
  basic application exists in the Documentation/watchdog/src directory called
  watchdog-test.c. Simply compile the C file and kick it off. If the system
- gets into a bad state and hangs, the HP ProLiant iLO 2 timer register will
+ gets into a bad state and hangs, the HPE ProLiant iLO timer register will
  not be updated in a timely fashion and a hardware system reset (also known as
  an Automatic Server Recovery (ASR)) event will occur.
 
- The hpwdt driver also has four (4) module parameters. They are the following:
+ The hpwdt driver also has three (3) module parameters. They are the following:
 
- soft_margin - allows the user to set the watchdog timer value
- allow_kdump - allows the user to save off a kernel dump image after an NMI
+ soft_margin - allows the user to set the watchdog timer value.
+               Default value is 30 seconds.
+ allow_kdump - allows the user to save off a kernel dump image after an NMI.
+               Default value is 1/ON.
  nowayout    - basic watchdog parameter that does not allow the timer to
                be restarted or an impending ASR to be escaped.
- priority    - determines whether or not the hpwdt driver is first on the
-               die_notify list to handle NMIs or last. The default value
-               for this module parameter is 0 or LAST. If the user wants to
-               enable NMI sourcing then reload the hpwdt driver with
-               priority=1 (and boot with nmi_watchdog=0).
+               Default value is set when compiling the kernel. If it is set
+               to "Y", then there is no way of disabling the watchdog once
+               it has been started.
 
  NOTE: More information about watchdog drivers in general, including the ioctl
        interface to /dev/watchdog can be found in
        Documentation/watchdog/watchdog-api.txt and Documentation/IPMI.txt.
 
- The priority parameter was introduced due to other kernel software that relied
- on handling NMIs (like oprofile). Keeping hpwdt's priority at 0 (or LAST)
- enables the users of NMIs for non critical events to be work as expected.
-
  The NMI sourcing capability is disabled by default due to the inability to
  distinguish between "NMI Watchdog Ticks" and "HW generated NMI events" in the
  Linux kernel. What this means is that the hpwdt nmi handler code is called
  each time the NMI signal fires off. This could amount to several thousands of
  NMIs in a matter of seconds. If a user sees the Linux kernel's "dazed and
  confused" message in the logs or if the system gets into a hung state, then
- the hpwdt driver can be reloaded with the "priority" module parameter set
- (priority=1).
+ the hpwdt driver can be reloaded.
 
  1. If the kernel has not been booted with nmi_watchdog turned off then
-    edit /boot/grub/menu.lst and place the nmi_watchdog=0 at the end of the
-    currently booting kernel line.
+    edit the boot loader configuration and place nmi_watchdog=0 at the end
+    of the currently booting kernel line. Depending on your Linux
+    distribution and platform setup, this file is:
+    For non-UEFI systems
+       /boot/grub/grub.conf   or
+       /boot/grub/menu.lst
+    For UEFI systems
+      /boot/efi/EFI/distroname/grub.conf   or
+      /boot/efi/efi/distroname/elilo.conf
 2. reboot the server
- 3. Once the system comes up perform a rmmod hpwdt
- 4. insmod /lib/modules/`uname -r`/kernel/drivers/char/watchdog/hpwdt.ko priority=1
+ 3. Once the system comes up, perform a modprobe -r hpwdt
+ 4. modprobe hpwdt
 
  Now, the hpwdt can successfully receive and source the NMI and provide a log
- message that details the reason for the NMI (as determined by the HP BIOS).
+ message that details the reason for the NMI (as determined by the HPE BIOS).
 
- Below is a list of NMIs the HP BIOS understands along with the associated
+ Below is a list of NMIs the HPE BIOS understands along with the associated
  code (reason):
 
        No source found                00h
@@ -92,4 +95,4 @@ Last reviewed: 06/02/2009
 
 
  -- Tom Mingarelli
-    (thomas.mingarelli@hp.com)
+    (thomas.mingarelli@hpe.com)
index c161399..a8d3642 100644 (file)
@@ -86,6 +86,10 @@ nowayout: Watchdog cannot be stopped once started
 davinci_wdt:
 heartbeat: Watchdog heartbeat period in seconds from 1 to 600, default 60
 -------------------------------------------------
+ebc-c384_wdt:
+timeout: Watchdog timeout in seconds. (1<=timeout<=15300, default=60)
+nowayout: Watchdog cannot be stopped once started
+-------------------------------------------------
 ep93xx_wdt:
 nowayout: Watchdog cannot be stopped once started
 timeout: Watchdog timeout in seconds. (1<=timeout<=3600, default=TBD)
index 3302006..ed42cb6 100644 (file)
@@ -2304,7 +2304,7 @@ BCACHE (BLOCK LAYER CACHE)
 M:     Kent Overstreet <kent.overstreet@gmail.com>
 L:     linux-bcache@vger.kernel.org
 W:     http://bcache.evilpiepirate.org
-S:     Maintained
+S:     Orphan
 F:     drivers/md/bcache/
 
 BDISP ST MEDIA DRIVER
@@ -2505,6 +2505,7 @@ M:        Hauke Mehrtens <hauke@hauke-m.de>
 M:     Rafał Miłecki <zajec5@gmail.com>
 L:     linux-mips@linux-mips.org
 S:     Maintained
+F:     Documentation/devicetree/bindings/mips/brcm/
 F:     arch/mips/bcm47xx/*
 F:     arch/mips/include/asm/mach-bcm47xx/*
 
@@ -5308,6 +5309,13 @@ F:       drivers/block/cciss*
 F:     include/linux/cciss_ioctl.h
 F:     include/uapi/linux/cciss_ioctl.h
 
+HFI1 DRIVER
+M:     Mike Marciniszyn <mike.marciniszyn@intel.com>
+M:     Dennis Dalessandro <dennis.dalessandro@intel.com>
+L:     linux-rdma@vger.kernel.org
+S:     Supported
+F:     drivers/infiniband/hw/hfi1
+
 HFS FILESYSTEM
 L:     linux-fsdevel@vger.kernel.org
 S:     Orphan
@@ -5837,7 +5845,6 @@ T:        git git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma.git
 S:     Supported
 F:     Documentation/infiniband/
 F:     drivers/infiniband/
-F:     drivers/staging/rdma/
 F:     include/uapi/linux/if_infiniband.h
 F:     include/uapi/rdma/
 F:     include/rdma/
@@ -6096,6 +6103,14 @@ S:       Maintained
 F:     arch/x86/include/asm/intel_telemetry.h
 F:     drivers/platform/x86/intel_telemetry*
 
+INTEL PMC CORE DRIVER
+M:     Rajneesh Bhardwaj <rajneesh.bhardwaj@intel.com>
+M:     Vishwanath Somayaji <vishwanath.somayaji@intel.com>
+L:     platform-driver-x86@vger.kernel.org
+S:     Maintained
+F:     arch/x86/include/asm/pmc_core.h
+F:     drivers/platform/x86/intel_pmc_core*
+
 IOC3 ETHERNET DRIVER
 M:     Ralf Baechle <ralf@linux-mips.org>
 L:     linux-mips@linux-mips.org
@@ -6413,8 +6428,9 @@ F:        Documentation/kbuild/kconfig-language.txt
 F:     scripts/kconfig/
 
 KDUMP
-M:     Vivek Goyal <vgoyal@redhat.com>
-M:     Haren Myneni <hbabu@us.ibm.com>
+M:     Dave Young <dyoung@redhat.com>
+M:     Baoquan He <bhe@redhat.com>
+R:     Vivek Goyal <vgoyal@redhat.com>
 L:     kexec@lists.infradead.org
 W:     http://lse.sourceforge.net/kdump/
 S:     Maintained
@@ -6491,6 +6507,7 @@ F:        arch/*/include/asm/kvm*
 F:     include/linux/kvm*
 F:     include/uapi/linux/kvm*
 F:     virt/kvm/
+F:     tools/kvm/
 
 KERNEL VIRTUAL MACHINE (KVM) FOR AMD-V
 M:     Joerg Roedel <joro@8bytes.org>
@@ -6559,7 +6576,7 @@ L:        kexec@lists.infradead.org
 S:     Maintained
 F:     include/linux/kexec.h
 F:     include/uapi/linux/kexec.h
-F:     kernel/kexec.c
+F:     kernel/kexec*
 
 KEYS/KEYRINGS:
 M:     David Howells <dhowells@redhat.com>
@@ -7505,6 +7522,7 @@ W:        http://www.linux-mips.org/
 T:     git git://git.linux-mips.org/pub/scm/ralf/linux.git
 Q:     http://patchwork.linux-mips.org/project/linux-mips/list/
 S:     Supported
+F:     Documentation/devicetree/bindings/mips/
 F:     Documentation/mips/
 F:     arch/mips/
 
@@ -7971,6 +7989,7 @@ Q:        http://patchwork.ozlabs.org/project/netdev/list/
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git
 S:     Odd Fixes
+F:     Documentation/devicetree/bindings/net/
 F:     drivers/net/
 F:     include/linux/if_*
 F:     include/linux/netdevice.h
@@ -8881,6 +8900,7 @@ F:        arch/*/kernel/*/perf_event*.c
 F:     arch/*/kernel/*/*/perf_event*.c
 F:     arch/*/include/asm/perf_event.h
 F:     arch/*/kernel/perf_callchain.c
+F:     arch/*/events/*
 F:     tools/perf/
 
 PERSONALITY HANDLING
@@ -8925,6 +8945,7 @@ M:        Linus Walleij <linus.walleij@linaro.org>
 L:     linux-gpio@vger.kernel.org
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-pinctrl.git
 S:     Maintained
+F:     Documentation/devicetree/bindings/pinctrl/
 F:     drivers/pinctrl/
 F:     include/linux/pinctrl/
 
@@ -10909,12 +10930,6 @@ M:     Arnaud Patard <arnaud.patard@rtp-net.org>
 S:     Odd Fixes
 F:     drivers/staging/xgifb/
 
-HFI1 DRIVER
-M:     Mike Marciniszyn <infinipath@intel.com>
-L:     linux-rdma@vger.kernel.org
-S:     Supported
-F:     drivers/staging/rdma/hfi1
-
 STARFIRE/DURALAN NETWORK DRIVER
 M:     Ion Badulescu <ionut@badula.org>
 S:     Odd Fixes
@@ -11295,6 +11310,7 @@ F:      drivers/platform/x86/thinkpad_acpi.c
 
 TI BANDGAP AND THERMAL DRIVER
 M:     Eduardo Valentin <edubezval@gmail.com>
+M:     Keerthy <j-keerthy@ti.com>
 L:     linux-pm@vger.kernel.org
 L:     linux-omap@vger.kernel.org
 S:     Maintained
@@ -12345,6 +12361,7 @@ L:      linux-watchdog@vger.kernel.org
 W:     http://www.linux-watchdog.org/
 T:     git git://www.linux-watchdog.org/linux-watchdog.git
 S:     Maintained
+F:     Documentation/devicetree/bindings/watchdog/
 F:     Documentation/watchdog/
 F:     drivers/watchdog/
 F:     include/linux/watchdog.h
index 0f9cb36..8d1301a 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,8 @@
 VERSION = 4
-PATCHLEVEL = 6
+PATCHLEVEL = 7
 SUBLEVEL = 0
-EXTRAVERSION =
-NAME = Charred Weasel
+EXTRAVERSION = -rc2
+NAME = Psychotic Stoned Sheep
 
 # *DOCUMENTATION*
 # To see a list of typical targets execute "make help"
@@ -128,6 +128,10 @@ _all:
 # Cancel implicit rules on top Makefile
 $(CURDIR)/Makefile Makefile: ;
 
+ifneq ($(words $(subst :, ,$(CURDIR))), 1)
+  $(error main directory cannot contain spaces nor colons)
+endif
+
 ifneq ($(KBUILD_OUTPUT),)
 # Invoke a second make in the output directory, passing relevant variables
 # check that the output directory actually exists
@@ -142,7 +146,7 @@ PHONY += $(MAKECMDGOALS) sub-make
 $(filter-out _all sub-make $(CURDIR)/Makefile, $(MAKECMDGOALS)) _all: sub-make
        @:
 
-sub-make: FORCE
+sub-make:
        $(Q)$(MAKE) -C $(KBUILD_OUTPUT) KBUILD_SRC=$(CURDIR) \
        -f $(CURDIR)/Makefile $(filter-out _all sub-make,$(MAKECMDGOALS))
 
@@ -364,7 +368,7 @@ AFLAGS_MODULE   =
 LDFLAGS_MODULE  =
 CFLAGS_KERNEL  =
 AFLAGS_KERNEL  =
-CFLAGS_GCOV    = -fprofile-arcs -ftest-coverage
+CFLAGS_GCOV    = -fprofile-arcs -ftest-coverage -fno-tree-loop-im -Wno-maybe-uninitialized
 CFLAGS_KCOV    = -fsanitize-coverage=trace-pc
 
 
@@ -617,7 +621,11 @@ KBUILD_CFLAGS      += $(call cc-option,-fno-delete-null-pointer-checks,)
 ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
 KBUILD_CFLAGS  += -Os $(call cc-disable-warning,maybe-uninitialized,)
 else
-KBUILD_CFLAGS  += -O2
+ifdef CONFIG_PROFILE_ALL_BRANCHES
+KBUILD_CFLAGS  += -O2 $(call cc-disable-warning,maybe-uninitialized,)
+else
+KBUILD_CFLAGS   += -O2
+endif
 endif
 
 # Tell gcc to never replace conditional load with a non-conditional one
@@ -697,9 +705,10 @@ KBUILD_CFLAGS += $(call cc-option, -mno-global-merge,)
 KBUILD_CFLAGS += $(call cc-option, -fcatch-undefined-behavior)
 else
 
-# This warning generated too much noise in a regular build.
-# Use make W=1 to enable this warning (see scripts/Makefile.build)
+# These warnings generated too much noise in a regular build.
+# Use make W=1 to enable them (see scripts/Makefile.build)
 KBUILD_CFLAGS += $(call cc-disable-warning, unused-but-set-variable)
+KBUILD_CFLAGS += $(call cc-disable-warning, unused-const-variable)
 endif
 
 ifdef CONFIG_FRAME_POINTER
@@ -926,27 +935,41 @@ export KBUILD_ALLDIRS := $(sort $(filter-out arch/%,$(vmlinux-alldirs)) arch Doc
 
 vmlinux-deps := $(KBUILD_LDS) $(KBUILD_VMLINUX_INIT) $(KBUILD_VMLINUX_MAIN)
 
-# Final link of vmlinux
-      cmd_link-vmlinux = $(CONFIG_SHELL) $< $(LD) $(LDFLAGS) $(LDFLAGS_vmlinux)
-quiet_cmd_link-vmlinux = LINK    $@
-
-# Include targets which we want to
-# execute if the rest of the kernel build went well.
-vmlinux: scripts/link-vmlinux.sh $(vmlinux-deps) FORCE
+# Include targets which we want to execute sequentially if the rest of the
+# kernel build went well. If CONFIG_TRIM_UNUSED_KSYMS is set, this might be
+# evaluated more than once.
+PHONY += vmlinux_prereq
+vmlinux_prereq: $(vmlinux-deps) FORCE
 ifdef CONFIG_HEADERS_CHECK
        $(Q)$(MAKE) -f $(srctree)/Makefile headers_check
 endif
-ifdef CONFIG_SAMPLES
-       $(Q)$(MAKE) $(build)=samples
-endif
 ifdef CONFIG_BUILD_DOCSRC
        $(Q)$(MAKE) $(build)=Documentation
 endif
 ifdef CONFIG_GDB_SCRIPTS
        $(Q)ln -fsn `cd $(srctree) && /bin/pwd`/scripts/gdb/vmlinux-gdb.py
 endif
+ifdef CONFIG_TRIM_UNUSED_KSYMS
+       $(Q)$(CONFIG_SHELL) $(srctree)/scripts/adjust_autoksyms.sh \
+         "$(MAKE) KBUILD_MODULES=1 -f $(srctree)/Makefile vmlinux_prereq"
+endif
+
+# standalone target for easier testing
+include/generated/autoksyms.h: FORCE
+       $(Q)$(CONFIG_SHELL) $(srctree)/scripts/adjust_autoksyms.sh true
+
+# Final link of vmlinux
+      cmd_link-vmlinux = $(CONFIG_SHELL) $< $(LD) $(LDFLAGS) $(LDFLAGS_vmlinux)
+quiet_cmd_link-vmlinux = LINK    $@
+
+vmlinux: scripts/link-vmlinux.sh vmlinux_prereq $(vmlinux-deps) FORCE
        +$(call if_changed,link-vmlinux)
 
+# Build samples along the rest of the kernel
+ifdef CONFIG_SAMPLES
+vmlinux-dirs += samples
+endif
+
 # The actual objects are generated when descending,
 # make sure no implicit rule kicks in
 $(sort $(vmlinux-deps)): $(vmlinux-dirs) ;
@@ -998,10 +1021,12 @@ prepare2: prepare3 outputmakefile asm-generic
 prepare1: prepare2 $(version_h) include/generated/utsrelease.h \
                    include/config/auto.conf
        $(cmd_crmodverdir)
+       $(Q)test -e include/generated/autoksyms.h || \
+           touch   include/generated/autoksyms.h
 
 archprepare: archheaders archscripts prepare1 scripts_basic
 
-prepare0: archprepare FORCE
+prepare0: archprepare
        $(Q)$(MAKE) $(build)=.
 
 # All the preparing..
@@ -1061,7 +1086,7 @@ INSTALL_FW_PATH=$(INSTALL_MOD_PATH)/lib/firmware
 export INSTALL_FW_PATH
 
 PHONY += firmware_install
-firmware_install: FORCE
+firmware_install:
        @mkdir -p $(objtree)/firmware
        $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.fwinst obj=firmware __fw_install
 
@@ -1081,7 +1106,7 @@ PHONY += archscripts
 archscripts:
 
 PHONY += __headers
-__headers: $(version_h) scripts_basic asm-generic archheaders archscripts FORCE
+__headers: $(version_h) scripts_basic asm-generic archheaders archscripts
        $(Q)$(MAKE) $(build)=scripts build_unifdef
 
 PHONY += headers_install_all
@@ -1192,7 +1217,8 @@ else # CONFIG_MODULES
 # Modules not configured
 # ---------------------------------------------------------------------------
 
-modules modules_install: FORCE
+PHONY += modules modules_install
+modules modules_install:
        @echo >&2
        @echo >&2 "The present kernel configuration has modules disabled."
        @echo >&2 "Type 'make config' and enable loadable module support."
@@ -1283,6 +1309,7 @@ boards := $(sort $(notdir $(boards)))
 board-dirs := $(dir $(wildcard $(srctree)/arch/$(SRCARCH)/configs/*/*_defconfig))
 board-dirs := $(sort $(notdir $(board-dirs:/=)))
 
+PHONY += help
 help:
        @echo  'Cleaning targets:'
        @echo  '  clean           - Remove most generated files but keep the config and'
@@ -1453,6 +1480,7 @@ $(clean-dirs):
 clean: rm-dirs := $(MODVERDIR)
 clean: rm-files := $(KBUILD_EXTMOD)/Module.symvers
 
+PHONY += help
 help:
        @echo  '  Building external modules.'
        @echo  '  Syntax: make -C path/to/kernel/src M=$$PWD target'
index b16e74e..d794384 100644 (file)
@@ -598,6 +598,14 @@ config HAVE_STACK_VALIDATION
          Architecture supports the 'objtool check' host tool command, which
          performs compile-time stack metadata validation.
 
+config HAVE_ARCH_HASH
+       bool
+       default n
+       help
+         If this is set, the architecture provides an <asm/hash.h>
+         file which provides platform-specific implementations of some
+         functions in <linux/hash.h> or fs/namei.c.
+
 #
 # ABI hall of shame
 #
index 39e58d1..41fa2ec 100644 (file)
@@ -15,6 +15,7 @@
 #if !defined(_UAPI_ASM_ARC_UNISTD_H) || defined(__SYSCALL)
 #define _UAPI_ASM_ARC_UNISTD_H
 
+#define __ARCH_WANT_RENAMEAT
 #define __ARCH_WANT_SYS_EXECVE
 #define __ARCH_WANT_SYS_CLONE
 #define __ARCH_WANT_SYS_VFORK
index 8b134cf..6fd4802 100644 (file)
@@ -48,7 +48,7 @@ struct arc_callchain_trace {
 static int callchain_trace(unsigned int addr, void *data)
 {
        struct arc_callchain_trace *ctrl = data;
-       struct perf_callchain_entry *entry = ctrl->perf_stuff;
+       struct perf_callchain_entry_ctx *entry = ctrl->perf_stuff;
        perf_callchain_store(entry, addr);
 
        if (ctrl->depth++ < 3)
@@ -58,7 +58,7 @@ static int callchain_trace(unsigned int addr, void *data)
 }
 
 void
-perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
        struct arc_callchain_trace ctrl = {
                .depth = 0,
@@ -69,7 +69,7 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
 }
 
 void
-perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
        /*
         * User stack can't be unwound trivially with kernel dwarf unwinder
index 446705a..5be33a2 100644 (file)
@@ -82,7 +82,6 @@ $(obj)/uImage:        $(obj)/zImage FORCE
 
 $(obj)/bootp/bootp: $(obj)/zImage initrd FORCE
        $(Q)$(MAKE) $(build)=$(obj)/bootp $@
-       @:
 
 $(obj)/bootpImage: $(obj)/bootp/bootp FORCE
        $(call if_changed,objcopy)
index 5761f00..5e4acd2 100644 (file)
@@ -17,7 +17,6 @@ targets       := bootp init.o kernel.o initrd.o
 # Note that bootp.lds picks up kernel.o and initrd.o
 $(obj)/bootp:  $(src)/bootp.lds $(addprefix $(obj)/,init.o kernel.o initrd.o) FORCE
        $(call if_changed,ld)
-       @:
 
 # kernel.o and initrd.o includes a binary image using
 # .incbin, a dependency which is not tracked automatically
@@ -26,4 +25,4 @@ $(obj)/kernel.o: arch/arm/boot/zImage FORCE
 
 $(obj)/initrd.o: $(INITRD) FORCE
 
-PHONY += $(INITRD) FORCE
+PHONY += $(INITRD)
index 0f89d87..06b6c2d 100644 (file)
@@ -399,6 +399,7 @@ dtb-$(CONFIG_SOC_IMX6UL) += \
        imx6ul-tx6ul-mainboard.dtb
 dtb-$(CONFIG_SOC_IMX7D) += \
        imx7d-cl-som-imx7.dtb \
+       imx7d-nitrogen7.dtb \
        imx7d-sbc-imx7.dtb \
        imx7d-sdb.dtb
 dtb-$(CONFIG_SOC_LS1021A) += \
index 267f81a..8c89062 100644 (file)
@@ -14,6 +14,7 @@
 
 /dts-v1/;
 #include "exynos3250.dtsi"
+#include "exynos4412-ppmu-common.dtsi"
 #include <dt-bindings/input/input.h>
 #include <dt-bindings/gpio/gpio.h>
 #include <dt-bindings/clock/samsung,s2mps11.h>
        };
 };
 
+&bus_dmc {
+       devfreq-events = <&ppmu_dmc0_3>, <&ppmu_dmc1_3>;
+       vdd-supply = <&buck1_reg>;
+       status = "okay";
+};
+
 &cpu0 {
        cpu0-supply = <&buck2_reg>;
 };
        status = "okay";
 };
 
-&ppmu_dmc0 {
-       status = "okay";
-
-       events {
-               ppmu_dmc0_3: ppmu-event3-dmc0 {
-                       event-name = "ppmu-event3-dmc0";
-               };
-       };
-};
-
-&ppmu_dmc1 {
-       status = "okay";
-
-       events {
-               ppmu_dmc1_3: ppmu-event3-dmc1 {
-                       event-name = "ppmu-event3-dmc1";
-               };
-       };
-};
-
-&ppmu_leftbus {
-       status = "okay";
-
-       events {
-               ppmu_leftbus_3: ppmu-event3-leftbus {
-                       event-name = "ppmu-event3-leftbus";
-               };
-       };
-};
-
-&ppmu_rightbus {
-       status = "okay";
-
-       events {
-               ppmu_rightbus_3: ppmu-event3-rightbus {
-                       event-name = "ppmu-event3-rightbus";
-               };
-       };
-};
-
 &xusbxti {
        clock-frequency = <24000000>;
 };
index 31eb09b..e422819 100644 (file)
@@ -14,6 +14,7 @@
 
 /dts-v1/;
 #include "exynos3250.dtsi"
+#include "exynos4412-ppmu-common.dtsi"
 #include <dt-bindings/input/input.h>
 #include <dt-bindings/gpio/gpio.h>
 #include <dt-bindings/clock/samsung,s2mps11.h>
        };
 };
 
+&bus_dmc {
+       devfreq-events = <&ppmu_dmc0_3>, <&ppmu_dmc1_3>;
+       vdd-supply = <&buck1_reg>;
+       status = "okay";
+};
+
+&bus_leftbus {
+       devfreq-events = <&ppmu_leftbus_3>, <&ppmu_rightbus_3>;
+       vdd-supply = <&buck3_reg>;
+       status = "okay";
+};
+
+&bus_rightbus {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
+&bus_lcd0 {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
+&bus_fsys {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
+&bus_mcuisp {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
+&bus_isp {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
+&bus_peril {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
+&bus_mfc {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
 &cpu0 {
        cpu0-supply = <&buck2_reg>;
 };
        status = "okay";
 };
 
-&ppmu_dmc0 {
-       status = "okay";
-
-       events {
-               ppmu_dmc0_3: ppmu-event3-dmc0 {
-                       event-name = "ppmu-event3-dmc0";
-               };
-       };
-};
-
-&ppmu_dmc1 {
-       status = "okay";
-
-       events {
-               ppmu_dmc1_3: ppmu-event3-dmc1 {
-                       event-name = "ppmu-event3-dmc1";
-               };
-       };
-};
-
-&ppmu_leftbus {
-       status = "okay";
-
-       events {
-               ppmu_leftbus_3: ppmu-event3-leftbus {
-                       event-name = "ppmu-event3-leftbus";
-               };
-       };
-};
-
-&ppmu_rightbus {
-       status = "okay";
-
-       events {
-               ppmu_rightbus_3: ppmu-event3-rightbus {
-                       event-name = "ppmu-event3-rightbus";
-               };
-       };
-};
-
 &xusbxti {
        clock-frequency = <24000000>;
 };
index 094782b..62f3dcd 100644 (file)
                        clock-names = "ppmu";
                        status = "disabled";
                };
+
+               bus_dmc: bus_dmc {
+                       compatible = "samsung,exynos-bus";
+                       clocks = <&cmu_dmc CLK_DIV_DMC>;
+                       clock-names = "bus";
+                       operating-points-v2 = <&bus_dmc_opp_table>;
+                       status = "disabled";
+               };
+
+               bus_dmc_opp_table: opp_table1 {
+                       compatible = "operating-points-v2";
+                       opp-shared;
+
+                       opp@50000000 {
+                               opp-hz = /bits/ 64 <50000000>;
+                               opp-microvolt = <800000>;
+                       };
+                       opp@100000000 {
+                               opp-hz = /bits/ 64 <100000000>;
+                               opp-microvolt = <800000>;
+                       };
+                       opp@134000000 {
+                               opp-hz = /bits/ 64 <134000000>;
+                               opp-microvolt = <800000>;
+                       };
+                       opp@200000000 {
+                               opp-hz = /bits/ 64 <200000000>;
+                               opp-microvolt = <825000>;
+                       };
+                       opp@400000000 {
+                               opp-hz = /bits/ 64 <400000000>;
+                               opp-microvolt = <875000>;
+                       };
+               };
+
+               bus_leftbus: bus_leftbus {
+                       compatible = "samsung,exynos-bus";
+                       clocks = <&cmu CLK_DIV_GDL>;
+                       clock-names = "bus";
+                       operating-points-v2 = <&bus_leftbus_opp_table>;
+                       status = "disabled";
+               };
+
+               bus_rightbus: bus_rightbus {
+                       compatible = "samsung,exynos-bus";
+                       clocks = <&cmu CLK_DIV_GDR>;
+                       clock-names = "bus";
+                       operating-points-v2 = <&bus_leftbus_opp_table>;
+                       status = "disabled";
+               };
+
+               bus_lcd0: bus_lcd0 {
+                       compatible = "samsung,exynos-bus";
+                       clocks = <&cmu CLK_DIV_ACLK_160>;
+                       clock-names = "bus";
+                       operating-points-v2 = <&bus_leftbus_opp_table>;
+                       status = "disabled";
+               };
+
+               bus_fsys: bus_fsys {
+                       compatible = "samsung,exynos-bus";
+                       clocks = <&cmu CLK_DIV_ACLK_200>;
+                       clock-names = "bus";
+                       operating-points-v2 = <&bus_leftbus_opp_table>;
+                       status = "disabled";
+               };
+
+               bus_mcuisp: bus_mcuisp {
+                       compatible = "samsung,exynos-bus";
+                       clocks = <&cmu CLK_DIV_ACLK_400_MCUISP>;
+                       clock-names = "bus";
+                       operating-points-v2 = <&bus_mcuisp_opp_table>;
+                       status = "disabled";
+               };
+
+               bus_isp: bus_isp {
+                       compatible = "samsung,exynos-bus";
+                       clocks = <&cmu CLK_DIV_ACLK_266>;
+                       clock-names = "bus";
+                       operating-points-v2 = <&bus_isp_opp_table>;
+                       status = "disabled";
+               };
+
+               bus_peril: bus_peril {
+                       compatible = "samsung,exynos-bus";
+                       clocks = <&cmu CLK_DIV_ACLK_100>;
+                       clock-names = "bus";
+                       operating-points-v2 = <&bus_peril_opp_table>;
+                       status = "disabled";
+               };
+
+               bus_mfc: bus_mfc {
+                       compatible = "samsung,exynos-bus";
+                       clocks = <&cmu CLK_SCLK_MFC>;
+                       clock-names = "bus";
+                       operating-points-v2 = <&bus_leftbus_opp_table>;
+                       status = "disabled";
+               };
+
+               bus_leftbus_opp_table: opp_table2 {
+                       compatible = "operating-points-v2";
+                       opp-shared;
+
+                       opp@50000000 {
+                               opp-hz = /bits/ 64 <50000000>;
+                               opp-microvolt = <900000>;
+                       };
+                       opp@80000000 {
+                               opp-hz = /bits/ 64 <80000000>;
+                               opp-microvolt = <900000>;
+                       };
+                       opp@100000000 {
+                               opp-hz = /bits/ 64 <100000000>;
+                               opp-microvolt = <1000000>;
+                       };
+                       opp@134000000 {
+                               opp-hz = /bits/ 64 <134000000>;
+                               opp-microvolt = <1000000>;
+                       };
+                       opp@200000000 {
+                               opp-hz = /bits/ 64 <200000000>;
+                               opp-microvolt = <1000000>;
+                       };
+               };
+
+               bus_mcuisp_opp_table: opp_table3 {
+                       compatible = "operating-points-v2";
+                       opp-shared;
+
+                       opp@50000000 {
+                               opp-hz = /bits/ 64 <50000000>;
+                       };
+                       opp@80000000 {
+                               opp-hz = /bits/ 64 <80000000>;
+                       };
+                       opp@100000000 {
+                               opp-hz = /bits/ 64 <100000000>;
+                       };
+                       opp@200000000 {
+                               opp-hz = /bits/ 64 <200000000>;
+                       };
+                       opp@400000000 {
+                               opp-hz = /bits/ 64 <400000000>;
+                       };
+               };
+
+               bus_isp_opp_table: opp_table4 {
+                       compatible = "operating-points-v2";
+                       opp-shared;
+
+                       opp@50000000 {
+                               opp-hz = /bits/ 64 <50000000>;
+                       };
+                       opp@80000000 {
+                               opp-hz = /bits/ 64 <80000000>;
+                       };
+                       opp@100000000 {
+                               opp-hz = /bits/ 64 <100000000>;
+                       };
+                       opp@200000000 {
+                               opp-hz = /bits/ 64 <200000000>;
+                       };
+                       opp@300000000 {
+                               opp-hz = /bits/ 64 <300000000>;
+                       };
+               };
+
+               bus_peril_opp_table: opp_table5 {
+                       compatible = "operating-points-v2";
+                       opp-shared;
+
+                       opp@50000000 {
+                               opp-hz = /bits/ 64 <50000000>;
+                       };
+                       opp@80000000 {
+                               opp-hz = /bits/ 64 <80000000>;
+                       };
+                       opp@100000000 {
+                               opp-hz = /bits/ 64 <100000000>;
+                       };
+               };
        };
 };
 
index c1cb8df..2d9b029 100644 (file)
                power-domains = <&pd_lcd1>;
                #iommu-cells = <0>;
        };
+
+       bus_dmc: bus_dmc {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DIV_DMC>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_dmc_opp_table>;
+               status = "disabled";
+       };
+
+       bus_acp: bus_acp {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DIV_ACP>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_acp_opp_table>;
+               status = "disabled";
+       };
+
+       bus_peri: bus_peri {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_ACLK100>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_peri_opp_table>;
+               status = "disabled";
+       };
+
+       bus_fsys: bus_fsys {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_ACLK133>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_fsys_opp_table>;
+               status = "disabled";
+       };
+
+       bus_display: bus_display {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_ACLK160>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_display_opp_table>;
+               status = "disabled";
+       };
+
+       bus_lcd0: bus_lcd0 {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_ACLK200>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_leftbus_opp_table>;
+               status = "disabled";
+       };
+
+       bus_leftbus: bus_leftbus {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DIV_GDL>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_leftbus_opp_table>;
+               status = "disabled";
+       };
+
+       bus_rightbus: bus_rightbus {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DIV_GDR>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_leftbus_opp_table>;
+               status = "disabled";
+       };
+
+       bus_mfc: bus_mfc {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_SCLK_MFC>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_leftbus_opp_table>;
+               status = "disabled";
+       };
+
+       bus_dmc_opp_table: opp_table1 {
+               compatible = "operating-points-v2";
+               opp-shared;
+
+               opp@134000000 {
+                       opp-hz = /bits/ 64 <134000000>;
+                       opp-microvolt = <1025000>;
+               };
+               opp@267000000 {
+                       opp-hz = /bits/ 64 <267000000>;
+                       opp-microvolt = <1050000>;
+               };
+               opp@400000000 {
+                       opp-hz = /bits/ 64 <400000000>;
+                       opp-microvolt = <1150000>;
+               };
+       };
+
+       bus_acp_opp_table: opp_table2 {
+               compatible = "operating-points-v2";
+               opp-shared;
+
+               opp@134000000 {
+                       opp-hz = /bits/ 64 <134000000>;
+               };
+               opp@160000000 {
+                       opp-hz = /bits/ 64 <160000000>;
+               };
+               opp@200000000 {
+                       opp-hz = /bits/ 64 <200000000>;
+               };
+       };
+
+       bus_peri_opp_table: opp_table3 {
+               compatible = "operating-points-v2";
+               opp-shared;
+
+               opp@5000000 {
+                       opp-hz = /bits/ 64 <5000000>;
+               };
+               opp@100000000 {
+                       opp-hz = /bits/ 64 <100000000>;
+               };
+       };
+
+       bus_fsys_opp_table: opp_table4 {
+               compatible = "operating-points-v2";
+               opp-shared;
+
+               opp@10000000 {
+                       opp-hz = /bits/ 64 <10000000>;
+               };
+               opp@134000000 {
+                       opp-hz = /bits/ 64 <134000000>;
+               };
+       };
+
+       bus_display_opp_table: opp_table5 {
+               compatible = "operating-points-v2";
+               opp-shared;
+
+               opp@100000000 {
+                       opp-hz = /bits/ 64 <100000000>;
+               };
+               opp@134000000 {
+                       opp-hz = /bits/ 64 <134000000>;
+               };
+               opp@160000000 {
+                       opp-hz = /bits/ 64 <160000000>;
+               };
+       };
+
+       bus_leftbus_opp_table: opp_table6 {
+               compatible = "operating-points-v2";
+               opp-shared;
+
+               opp@100000000 {
+                       opp-hz = /bits/ 64 <100000000>;
+               };
+               opp@160000000 {
+                       opp-hz = /bits/ 64 <160000000>;
+               };
+               opp@200000000 {
+                       opp-hz = /bits/ 64 <200000000>;
+               };
+       };
 };
 
 &gic {
index cab0f07..ec7619a 100644 (file)
@@ -11,6 +11,7 @@
 #include <dt-bindings/input/input.h>
 #include <dt-bindings/clock/maxim,max77686.h>
 #include "exynos4412.dtsi"
+#include "exynos4412-ppmu-common.dtsi"
 #include <dt-bindings/gpio/gpio.h>
 
 / {
        };
 };
 
+&bus_dmc {
+       devfreq-events = <&ppmu_dmc0_3>, <&ppmu_dmc1_3>;
+       vdd-supply = <&buck1_reg>;
+       status = "okay";
+};
+
+&bus_acp {
+       devfreq = <&bus_dmc>;
+       status = "okay";
+};
+
+&bus_c2c {
+       devfreq = <&bus_dmc>;
+       status = "okay";
+};
+
+&bus_leftbus {
+       devfreq-events = <&ppmu_leftbus_3>, <&ppmu_rightbus_3>;
+       vdd-supply = <&buck3_reg>;
+       status = "okay";
+};
+
+&bus_rightbus {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
+&bus_display {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
+&bus_fsys {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
+&bus_peri {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
+&bus_mfc {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
 &cpu0 {
        cpu0-supply = <&buck2_reg>;
 };
 
                        buck1_reg: BUCK1 {
                                regulator-name = "vdd_mif";
-                               regulator-min-microvolt = <1000000>;
-                               regulator-max-microvolt = <1000000>;
+                               regulator-min-microvolt = <900000>;
+                               regulator-max-microvolt = <1100000>;
                                regulator-always-on;
                                regulator-boot-on;
                        };
 
                        buck3_reg: BUCK3 {
                                regulator-name = "vdd_int";
-                               regulator-min-microvolt = <1000000>;
-                               regulator-max-microvolt = <1000000>;
+                               regulator-min-microvolt = <900000>;
+                               regulator-max-microvolt = <1050000>;
                                regulator-always-on;
                                regulator-boot-on;
                        };
diff --git a/arch/arm/boot/dts/exynos4412-ppmu-common.dtsi b/arch/arm/boot/dts/exynos4412-ppmu-common.dtsi
new file mode 100644 (file)
index 0000000..16e4b77
--- /dev/null
@@ -0,0 +1,50 @@
+/*
+ * Device tree sources for Exynos4412 PPMU common device tree
+ *
+ * Copyright (C) 2015 Samsung Electronics
+ * Author: Chanwoo Choi <cw00.choi@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+&ppmu_dmc0 {
+       status = "okay";
+
+       events {
+              ppmu_dmc0_3: ppmu-event3-dmc0 {
+                      event-name = "ppmu-event3-dmc0";
+              };
+       };
+};
+
+&ppmu_dmc1 {
+       status = "okay";
+
+       events {
+              ppmu_dmc1_3: ppmu-event3-dmc1 {
+                      event-name = "ppmu-event3-dmc1";
+              };
+       };
+};
+
+&ppmu_leftbus {
+       status = "okay";
+
+       events {
+              ppmu_leftbus_3: ppmu-event3-leftbus {
+                      event-name = "ppmu-event3-leftbus";
+              };
+       };
+};
+
+&ppmu_rightbus {
+       status = "okay";
+
+       events {
+              ppmu_rightbus_3: ppmu-event3-rightbus {
+                      event-name = "ppmu-event3-rightbus";
+              };
+       };
+};
index 5d1eaea..9336fd4 100644 (file)
@@ -14,6 +14,7 @@
 
 /dts-v1/;
 #include "exynos4412.dtsi"
+#include "exynos4412-ppmu-common.dtsi"
 #include <dt-bindings/gpio/gpio.h>
 #include <dt-bindings/interrupt-controller/irq.h>
 #include <dt-bindings/clock/maxim,max77686.h>
        status = "okay";
 };
 
+&bus_dmc {
+       devfreq-events = <&ppmu_dmc0_3>, <&ppmu_dmc1_3>;
+       vdd-supply = <&buck1_reg>;
+       status = "okay";
+};
+
+&bus_acp {
+       devfreq = <&bus_dmc>;
+       status = "okay";
+};
+
+&bus_c2c {
+       devfreq = <&bus_dmc>;
+       status = "okay";
+};
+
+&bus_leftbus {
+       devfreq-events = <&ppmu_leftbus_3>, <&ppmu_rightbus_3>;
+       vdd-supply = <&buck3_reg>;
+       status = "okay";
+};
+
+&bus_rightbus {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
+&bus_display {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
+&bus_fsys {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
+&bus_peri {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
+&bus_mfc {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
 &cpu0 {
        cpu0-supply = <&buck2_reg>;
 };
        assigned-clock-parents =  <&clock CLK_XUSBXTI>;
 };
 
-&ppmu_dmc0 {
-       status = "okay";
-
-       events {
-               ppmu_dmc0_3: ppmu-event3-dmc0 {
-                       event-name = "ppmu-event3-dmc0";
-               };
-       };
-};
-
-&ppmu_dmc1 {
-       status = "okay";
-
-       events {
-               ppmu_dmc1_3: ppmu-event3-dmc1 {
-                       event-name = "ppmu-event3-dmc1";
-               };
-       };
-};
-
-&ppmu_leftbus {
-       status = "okay";
-
-       events {
-               ppmu_leftbus_3: ppmu-event3-leftbus {
-                       event-name = "ppmu-event3-leftbus";
-               };
-       };
-};
-
-&ppmu_rightbus {
-       status = "okay";
-
-       events {
-               ppmu_rightbus_3: ppmu-event3-rightbus {
-                       event-name = "ppmu-event3-rightbus";
-               };
-       };
-};
-
 &pinctrl_0 {
        pinctrl-names = "default";
        pinctrl-0 = <&sleep0>;
index b7490ea..c452499 100644 (file)
                clocks = <&clock CLK_SMMU_LITE1>, <&clock CLK_FIMC_LITE1>;
                #iommu-cells = <0>;
        };
+
+       bus_dmc: bus_dmc {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DIV_DMC>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_dmc_opp_table>;
+               status = "disabled";
+       };
+
+       bus_acp: bus_acp {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DIV_ACP>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_acp_opp_table>;
+               status = "disabled";
+       };
+
+       bus_c2c: bus_c2c {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DIV_C2C>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_dmc_opp_table>;
+               status = "disabled";
+       };
+
+       bus_dmc_opp_table: opp_table1 {
+               compatible = "operating-points-v2";
+               opp-shared;
+
+               opp@100000000 {
+                       opp-hz = /bits/ 64 <100000000>;
+                       opp-microvolt = <900000>;
+               };
+               opp@134000000 {
+                       opp-hz = /bits/ 64 <134000000>;
+                       opp-microvolt = <900000>;
+               };
+               opp@160000000 {
+                       opp-hz = /bits/ 64 <160000000>;
+                       opp-microvolt = <900000>;
+               };
+               opp@267000000 {
+                       opp-hz = /bits/ 64 <267000000>;
+                       opp-microvolt = <950000>;
+               };
+               opp@400000000 {
+                       opp-hz = /bits/ 64 <400000000>;
+                       opp-microvolt = <1050000>;
+               };
+       };
+
+       bus_acp_opp_table: opp_table2 {
+               compatible = "operating-points-v2";
+               opp-shared;
+
+               opp@100000000 {
+                       opp-hz = /bits/ 64 <100000000>;
+               };
+               opp@134000000 {
+                       opp-hz = /bits/ 64 <134000000>;
+               };
+               opp@160000000 {
+                       opp-hz = /bits/ 64 <160000000>;
+               };
+               opp@267000000 {
+                       opp-hz = /bits/ 64 <267000000>;
+               };
+       };
+
+       bus_leftbus: bus_leftbus {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DIV_GDL>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_leftbus_opp_table>;
+               status = "disabled";
+       };
+
+       bus_rightbus: bus_rightbus {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DIV_GDR>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_leftbus_opp_table>;
+               status = "disabled";
+       };
+
+       bus_display: bus_display {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_ACLK160>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_display_opp_table>;
+               status = "disabled";
+       };
+
+       bus_fsys: bus_fsys {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_ACLK133>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_fsys_opp_table>;
+               status = "disabled";
+       };
+
+       bus_peri: bus_peri {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_ACLK100>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_peri_opp_table>;
+               status = "disabled";
+       };
+
+       bus_mfc: bus_mfc {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_SCLK_MFC>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_leftbus_opp_table>;
+               status = "disabled";
+       };
+
+       bus_leftbus_opp_table: opp_table3 {
+               compatible = "operating-points-v2";
+               opp-shared;
+
+               opp@100000000 {
+                       opp-hz = /bits/ 64 <100000000>;
+                       opp-microvolt = <900000>;
+               };
+               opp@134000000 {
+                       opp-hz = /bits/ 64 <134000000>;
+                       opp-microvolt = <925000>;
+               };
+               opp@160000000 {
+                       opp-hz = /bits/ 64 <160000000>;
+                       opp-microvolt = <950000>;
+               };
+               opp@200000000 {
+                       opp-hz = /bits/ 64 <200000000>;
+                       opp-microvolt = <1000000>;
+               };
+       };
+
+       bus_display_opp_table: opp_table4 {
+               compatible = "operating-points-v2";
+               opp-shared;
+
+               opp@160000000 {
+                       opp-hz = /bits/ 64 <160000000>;
+               };
+               opp@200000000 {
+                       opp-hz = /bits/ 64 <200000000>;
+               };
+       };
+
+       bus_fsys_opp_table: opp_table5 {
+               compatible = "operating-points-v2";
+               opp-shared;
+
+               opp@100000000 {
+                       opp-hz = /bits/ 64 <100000000>;
+               };
+               opp@134000000 {
+                       opp-hz = /bits/ 64 <134000000>;
+               };
+       };
+
+       bus_peri_opp_table: opp_table6 {
+               compatible = "operating-points-v2";
+               opp-shared;
+
+               opp@50000000 {
+                       opp-hz = /bits/ 64 <50000000>;
+               };
+               opp@100000000 {
+                       opp-hz = /bits/ 64 <100000000>;
+               };
+       };
 };
 
 &combiner {
index 4c85234..c6e05eb 100644 (file)
                };
        };
 
+       nocp_mem0_0: nocp@10CA1000 {
+               compatible = "samsung,exynos5420-nocp";
+               reg = <0x10CA1000 0x200>;
+               status = "disabled";
+       };
+
+       nocp_mem0_1: nocp@10CA1400 {
+               compatible = "samsung,exynos5420-nocp";
+               reg = <0x10CA1400 0x200>;
+               status = "disabled";
+       };
+
+       nocp_mem1_0: nocp@10CA1800 {
+               compatible = "samsung,exynos5420-nocp";
+               reg = <0x10CA1800 0x200>;
+               status = "disabled";
+       };
+
+       nocp_mem1_1: nocp@10CA1C00 {
+               compatible = "samsung,exynos5420-nocp";
+               reg = <0x10CA1C00 0x200>;
+               status = "disabled";
+       };
+
+       nocp_g3d_0: nocp@11A51000 {
+               compatible = "samsung,exynos5420-nocp";
+               reg = <0x11A51000 0x200>;
+               status = "disabled";
+       };
+
+       nocp_g3d_1: nocp@11A51400 {
+               compatible = "samsung,exynos5420-nocp";
+               reg = <0x11A51400 0x200>;
+               status = "disabled";
+       };
+
        gsc_pd: power-domain@10044000 {
                compatible = "samsung,exynos4210-pd";
                reg = <0x10044000 0x20>;
                power-domains = <&disp_pd>;
                #iommu-cells = <0>;
        };
+
+       bus_wcore: bus_wcore {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DOUT_ACLK400_WCORE>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_wcore_opp_table>;
+               status = "disabled";
+       };
+
+       bus_noc: bus_noc {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DOUT_ACLK100_NOC>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_noc_opp_table>;
+               status = "disabled";
+       };
+
+       bus_fsys_apb: bus_fsys_apb {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DOUT_PCLK200_FSYS>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_fsys_apb_opp_table>;
+               status = "disabled";
+       };
+
+       bus_fsys: bus_fsys {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DOUT_ACLK200_FSYS>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_fsys_apb_opp_table>;
+               status = "disabled";
+       };
+
+       bus_fsys2: bus_fsys2 {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DOUT_ACLK200_FSYS2>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_fsys2_opp_table>;
+               status = "disabled";
+       };
+
+       bus_mfc: bus_mfc {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DOUT_ACLK333>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_mfc_opp_table>;
+               status = "disabled";
+       };
+
+       bus_gen: bus_gen {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DOUT_ACLK266>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_gen_opp_table>;
+               status = "disabled";
+       };
+
+       bus_peri: bus_peri {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DOUT_ACLK66>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_peri_opp_table>;
+               status = "disabled";
+       };
+
+       bus_g2d: bus_g2d {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DOUT_ACLK333_G2D>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_g2d_opp_table>;
+               status = "disabled";
+       };
+
+       bus_g2d_acp: bus_g2d_acp {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DOUT_ACLK266_G2D>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_g2d_acp_opp_table>;
+               status = "disabled";
+       };
+
+       bus_jpeg: bus_jpeg {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DOUT_ACLK300_JPEG>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_jpeg_opp_table>;
+               status = "disabled";
+       };
+
+       bus_jpeg_apb: bus_jpeg_apb {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DOUT_ACLK166>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_jpeg_apb_opp_table>;
+               status = "disabled";
+       };
+
+       bus_disp1_fimd: bus_disp1_fimd {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DOUT_ACLK300_DISP1>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_disp1_fimd_opp_table>;
+               status = "disabled";
+       };
+
+       bus_disp1: bus_disp1 {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DOUT_ACLK400_DISP1>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_disp1_opp_table>;
+               status = "disabled";
+       };
+
+       bus_gscl_scaler: bus_gscl_scaler {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DOUT_ACLK300_GSCL>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_gscl_opp_table>;
+               status = "disabled";
+       };
+
+       bus_mscl: bus_mscl {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DOUT_ACLK400_MSCL>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_mscl_opp_table>;
+               status = "disabled";
+       };
+
+       bus_wcore_opp_table: opp_table2 {
+               compatible = "operating-points-v2";
+
+               opp00 {
+                       opp-hz = /bits/ 64 <84000000>;
+                       opp-microvolt = <925000>;
+               };
+               opp01 {
+                       opp-hz = /bits/ 64 <111000000>;
+                       opp-microvolt = <950000>;
+               };
+               opp02 {
+                       opp-hz = /bits/ 64 <222000000>;
+                       opp-microvolt = <950000>;
+               };
+               opp03 {
+                       opp-hz = /bits/ 64 <333000000>;
+                       opp-microvolt = <950000>;
+               };
+               opp04 {
+                       opp-hz = /bits/ 64 <400000000>;
+                       opp-microvolt = <987500>;
+               };
+       };
+
+       bus_noc_opp_table: opp_table3 {
+               compatible = "operating-points-v2";
+
+               opp00 {
+                       opp-hz = /bits/ 64 <67000000>;
+               };
+               opp01 {
+                       opp-hz = /bits/ 64 <75000000>;
+               };
+               opp02 {
+                       opp-hz = /bits/ 64 <86000000>;
+               };
+               opp03 {
+                       opp-hz = /bits/ 64 <100000000>;
+               };
+       };
+
+       bus_fsys_apb_opp_table: opp_table4 {
+               compatible = "operating-points-v2";
+               opp-shared;
+
+               opp00 {
+                       opp-hz = /bits/ 64 <100000000>;
+               };
+               opp01 {
+                       opp-hz = /bits/ 64 <200000000>;
+               };
+       };
+
+       bus_fsys2_opp_table: opp_table5 {
+               compatible = "operating-points-v2";
+
+               opp00 {
+                       opp-hz = /bits/ 64 <75000000>;
+               };
+               opp01 {
+                       opp-hz = /bits/ 64 <100000000>;
+               };
+               opp02 {
+                       opp-hz = /bits/ 64 <150000000>;
+               };
+       };
+
+       bus_mfc_opp_table: opp_table6 {
+               compatible = "operating-points-v2";
+
+               opp00 {
+                       opp-hz = /bits/ 64 <96000000>;
+               };
+               opp01 {
+                       opp-hz = /bits/ 64 <111000000>;
+               };
+               opp02 {
+                       opp-hz = /bits/ 64 <167000000>;
+               };
+               opp03 {
+                       opp-hz = /bits/ 64 <222000000>;
+               };
+               opp04 {
+                       opp-hz = /bits/ 64 <333000000>;
+               };
+       };
+
+       bus_gen_opp_table: opp_table7 {
+               compatible = "operating-points-v2";
+
+               opp00 {
+                       opp-hz = /bits/ 64 <89000000>;
+               };
+               opp01 {
+                       opp-hz = /bits/ 64 <133000000>;
+               };
+               opp02 {
+                       opp-hz = /bits/ 64 <178000000>;
+               };
+               opp03 {
+                       opp-hz = /bits/ 64 <267000000>;
+               };
+       };
+
+       bus_peri_opp_table: opp_table8 {
+               compatible = "operating-points-v2";
+
+               opp00 {
+                       opp-hz = /bits/ 64 <67000000>;
+               };
+       };
+
+       bus_g2d_opp_table: opp_table9 {
+               compatible = "operating-points-v2";
+
+               opp00 {
+                       opp-hz = /bits/ 64 <84000000>;
+               };
+               opp01 {
+                       opp-hz = /bits/ 64 <167000000>;
+               };
+               opp02 {
+                       opp-hz = /bits/ 64 <222000000>;
+               };
+               opp03 {
+                       opp-hz = /bits/ 64 <300000000>;
+               };
+               opp04 {
+                       opp-hz = /bits/ 64 <333000000>;
+               };
+       };
+
+       bus_g2d_acp_opp_table: opp_table10 {
+               compatible = "operating-points-v2";
+
+               opp00 {
+                       opp-hz = /bits/ 64 <67000000>;
+               };
+               opp01 {
+                       opp-hz = /bits/ 64 <133000000>;
+               };
+               opp02 {
+                       opp-hz = /bits/ 64 <178000000>;
+               };
+               opp03 {
+                       opp-hz = /bits/ 64 <267000000>;
+               };
+       };
+
+       bus_jpeg_opp_table: opp_table11 {
+               compatible = "operating-points-v2";
+
+               opp00 {
+                       opp-hz = /bits/ 64 <75000000>;
+               };
+               opp01 {
+                       opp-hz = /bits/ 64 <150000000>;
+               };
+               opp02 {
+                       opp-hz = /bits/ 64 <200000000>;
+               };
+               opp03 {
+                       opp-hz = /bits/ 64 <300000000>;
+               };
+       };
+
+       bus_jpeg_apb_opp_table: opp_table12 {
+               compatible = "operating-points-v2";
+
+               opp00 {
+                       opp-hz = /bits/ 64 <84000000>;
+               };
+               opp01 {
+                       opp-hz = /bits/ 64 <111000000>;
+               };
+               opp02 {
+                       opp-hz = /bits/ 64 <134000000>;
+               };
+               opp03 {
+                       opp-hz = /bits/ 64 <167000000>;
+               };
+       };
+
+       bus_disp1_fimd_opp_table: opp_table13 {
+               compatible = "operating-points-v2";
+
+               opp00 {
+                       opp-hz = /bits/ 64 <120000000>;
+               };
+               opp01 {
+                       opp-hz = /bits/ 64 <200000000>;
+               };
+       };
+
+       bus_disp1_opp_table: opp_table14 {
+               compatible = "operating-points-v2";
+
+               opp00 {
+                       opp-hz = /bits/ 64 <120000000>;
+               };
+               opp01 {
+                       opp-hz = /bits/ 64 <200000000>;
+               };
+               opp02 {
+                       opp-hz = /bits/ 64 <300000000>;
+               };
+       };
+
+       bus_gscl_opp_table: opp_table15 {
+               compatible = "operating-points-v2";
+
+               opp00 {
+                       opp-hz = /bits/ 64 <150000000>;
+               };
+               opp01 {
+                       opp-hz = /bits/ 64 <200000000>;
+               };
+               opp02 {
+                       opp-hz = /bits/ 64 <300000000>;
+               };
+       };
+
+       bus_mscl_opp_table: opp_table16 {
+               compatible = "operating-points-v2";
+
+               opp00 {
+                       opp-hz = /bits/ 64 <84000000>;
+               };
+               opp01 {
+                       opp-hz = /bits/ 64 <167000000>;
+               };
+               opp02 {
+                       opp-hz = /bits/ 64 <222000000>;
+               };
+               opp03 {
+                       opp-hz = /bits/ 64 <333000000>;
+               };
+               opp04 {
+                       opp-hz = /bits/ 64 <400000000>;
+               };
+       };
 };
 
 &dp {
index 20fa761..2a4e10b 100644 (file)
        };
 };
 
+&bus_wcore {
+       devfreq-events = <&nocp_mem0_0>, <&nocp_mem0_1>,
+                       <&nocp_mem1_0>, <&nocp_mem1_1>;
+       vdd-supply = <&buck3_reg>;
+       exynos,saturation-ratio = <100>;
+       status = "okay";
+};
+
+&bus_noc {
+       devfreq = <&bus_wcore>;
+       status = "okay";
+};
+
+&bus_fsys_apb {
+       devfreq = <&bus_wcore>;
+       status = "okay";
+};
+
+&bus_fsys {
+       devfreq = <&bus_wcore>;
+       status = "okay";
+};
+
+&bus_fsys2 {
+       devfreq = <&bus_wcore>;
+       status = "okay";
+};
+
+&bus_mfc {
+       devfreq = <&bus_wcore>;
+       status = "okay";
+};
+
+&bus_gen {
+       devfreq = <&bus_wcore>;
+       status = "okay";
+};
+
+&bus_peri {
+       devfreq = <&bus_wcore>;
+       status = "okay";
+};
+
+&bus_g2d {
+       devfreq = <&bus_wcore>;
+       status = "okay";
+};
+
+&bus_g2d_acp {
+       devfreq = <&bus_wcore>;
+       status = "okay";
+};
+
+&bus_jpeg {
+       devfreq = <&bus_wcore>;
+       status = "okay";
+};
+
+&bus_jpeg_apb {
+       devfreq = <&bus_wcore>;
+       status = "okay";
+};
+
+&bus_disp1_fimd {
+       devfreq = <&bus_wcore>;
+       status = "okay";
+};
+
+&bus_disp1 {
+       devfreq = <&bus_wcore>;
+       status = "okay";
+};
+
+&bus_gscl_scaler {
+       devfreq = <&bus_wcore>;
+       status = "okay";
+};
+
+&bus_mscl {
+       devfreq = <&bus_wcore>;
+       status = "okay";
+};
+
 &clock_audss {
        assigned-clocks = <&clock_audss EXYNOS_MOUT_AUDSS>,
                        <&clock_audss EXYNOS_MOUT_I2S>,
        vqmmc-supply = <&ldo13_reg>;
 };
 
+&nocp_mem0_0 {
+       status = "okay";
+};
+
+&nocp_mem0_1 {
+       status = "okay";
+};
+
+&nocp_mem1_0 {
+       status = "okay";
+};
+
+&nocp_mem1_1 {
+       status = "okay";
+};
+
 &pinctrl_0 {
        hdmi_hpd_irq: hdmi-hpd-irq {
                samsung,pins = "gpx3-7";
diff --git a/arch/arm/boot/dts/imx7d-nitrogen7.dts b/arch/arm/boot/dts/imx7d-nitrogen7.dts
new file mode 100644 (file)
index 0000000..1ce9780
--- /dev/null
@@ -0,0 +1,745 @@
+/*
+ * Copyright 2016 Boundary Devices, Inc.
+ *
+ * This file is dual-licensed: you can use it either under the terms
+ * of the GPL or the X11 license, at your option. Note that this dual
+ * licensing only applies to this file, and not this project as a
+ * whole.
+ *
+ *  a) This file is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of the
+ *     License, or (at your option) any later version.
+ *
+ *     This file is distributed in the hope that it will be useful,
+ *     but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *     GNU General Public License for more details.
+ *
+ * Or, alternatively,
+ *
+ *  b) Permission is hereby granted, free of charge, to any person
+ *     obtaining a copy of this software and associated documentation
+ *     files (the "Software"), to deal in the Software without
+ *     restriction, including without limitation the rights to use,
+ *     copy, modify, merge, publish, distribute, sublicense, and/or
+ *     sell copies of the Software, and to permit persons to whom the
+ *     Software is furnished to do so, subject to the following
+ *     conditions:
+ *
+ *     The above copyright notice and this permission notice shall be
+ *     included in all copies or substantial portions of the Software.
+ *
+ *     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *     EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ *     OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *     NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ *     HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ *     WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ *     OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/dts-v1/;
+
+#include <dt-bindings/input/input.h>
+#include "imx7d.dtsi"
+
+/ {
+       model = "Boundary Devices i.MX7 Nitrogen7 Board";
+       compatible = "boundary,imx7d-nitrogen7", "fsl,imx7d";
+
+       aliases {
+               fb_lcd = &lcdif;
+               t_lcd = &t_lcd;
+       };
+
+       memory {
+               reg = <0x80000000 0x40000000>;
+       };
+
+       backlight-j9 {
+               compatible = "gpio-backlight";
+               pinctrl-names = "default";
+               pinctrl-0 = <&pinctrl_backlight_j9>;
+               gpios = <&gpio1 7 GPIO_ACTIVE_HIGH>;
+               default-on;
+       };
+
+       backlight-j20 {
+               compatible = "pwm-backlight";
+               pwms = <&pwm1 0 5000000>;
+               brightness-levels = <0 4 8 16 32 64 128 255>;
+               default-brightness-level = <6>;
+               status = "okay";
+       };
+
+       reg_usb_otg1_vbus: regulator-usb-otg1-vbus {
+               compatible = "regulator-fixed";
+               regulator-name = "usb_otg1_vbus";
+               regulator-min-microvolt = <5000000>;
+               regulator-max-microvolt = <5000000>;
+               gpio = <&gpio1 5 GPIO_ACTIVE_HIGH>;
+               enable-active-high;
+       };
+
+       reg_usb_otg2_vbus: regulator-usb-otg2-vbus {
+               compatible = "regulator-fixed";
+               regulator-name = "usb_otg2_vbus";
+               regulator-min-microvolt = <5000000>;
+               regulator-max-microvolt = <5000000>;
+               gpio = <&gpio4 7 GPIO_ACTIVE_HIGH>;
+               enable-active-high;
+       };
+
+       reg_can2_3v3: regulator-can2-3v3 {
+               compatible = "regulator-fixed";
+               regulator-name = "can2-3v3";
+               regulator-min-microvolt = <3300000>;
+               regulator-max-microvolt = <3300000>;
+               gpio = <&gpio2 14 GPIO_ACTIVE_LOW>;
+       };
+
+       reg_vref_1v8: regulator-vref-1v8 {
+               compatible = "regulator-fixed";
+               regulator-name = "vref-1v8";
+               regulator-min-microvolt = <1800000>;
+               regulator-max-microvolt = <1800000>;
+       };
+
+       reg_vref_3v3: regulator-vref-3v3 {
+               compatible = "regulator-fixed";
+               regulator-name = "vref-3v3";
+               regulator-min-microvolt = <3300000>;
+               regulator-max-microvolt = <3300000>;
+       };
+
+       reg_wlan: regulator-wlan {
+               compatible = "regulator-fixed";
+               regulator-min-microvolt = <3300000>;
+               regulator-max-microvolt = <3300000>;
+               clocks = <&clks IMX7D_CLKO2_ROOT_DIV>;
+               clock-names = "slow";
+               regulator-name = "reg_wlan";
+               startup-delay-us = <70000>;
+               gpio = <&gpio4 21 GPIO_ACTIVE_HIGH>;
+               enable-active-high;
+       };
+};
+
+&adc1 {
+       vref-supply = <&reg_vref_1v8>;
+       status = "okay";
+};
+
+&adc2 {
+       vref-supply = <&reg_vref_1v8>;
+       status = "okay";
+};
+
+&clks {
+       assigned-clocks = <&clks IMX7D_CLKO2_ROOT_SRC>,
+                         <&clks IMX7D_CLKO2_ROOT_DIV>;
+       assigned-clock-parents = <&clks IMX7D_CKIL>;
+       assigned-clock-rates = <0>, <32768>;
+};
+
+&cpu0 {
+       arm-supply = <&sw1a_reg>;
+};
+
+&fec1 {
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_enet1>;
+       assigned-clocks = <&clks IMX7D_ENET1_TIME_ROOT_SRC>,
+                         <&clks IMX7D_ENET1_TIME_ROOT_CLK>;
+       assigned-clock-parents = <&clks IMX7D_PLL_ENET_MAIN_100M_CLK>;
+       assigned-clock-rates = <0>, <100000000>;
+       phy-mode = "rgmii";
+       phy-handle = <&ethphy0>;
+       fsl,magic-packet;
+       status = "okay";
+
+       mdio {
+               #address-cells = <1>;
+               #size-cells = <0>;
+
+               ethphy0: ethernet-phy@4 {
+                       reg = <4>;
+               };
+       };
+};
+
+&flexcan2 {
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_flexcan2>;
+       xceiver-supply = <&reg_can2_3v3>;
+       status = "okay";
+};
+
+&i2c1 {
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_i2c1>;
+       status = "okay";
+
+       pmic: pfuze3000@08 {
+               compatible = "fsl,pfuze3000";
+               reg = <0x08>;
+
+               regulators {
+                       sw1a_reg: sw1a {
+                               regulator-min-microvolt = <700000>;
+                               regulator-max-microvolt = <1475000>;
+                               regulator-boot-on;
+                               regulator-always-on;
+                               regulator-ramp-delay = <6250>;
+                       };
+
+                       /* use sw1c_reg to align with pfuze100/pfuze200 */
+                       sw1c_reg: sw1b {
+                               regulator-min-microvolt = <700000>;
+                               regulator-max-microvolt = <1475000>;
+                               regulator-boot-on;
+                               regulator-always-on;
+                               regulator-ramp-delay = <6250>;
+                       };
+
+                       sw2_reg: sw2 {
+                               regulator-min-microvolt = <1500000>;
+                               regulator-max-microvolt = <1850000>;
+                               regulator-boot-on;
+                               regulator-always-on;
+                       };
+
+                       sw3a_reg: sw3 {
+                               regulator-min-microvolt = <900000>;
+                               regulator-max-microvolt = <1650000>;
+                               regulator-boot-on;
+                               regulator-always-on;
+                       };
+
+                       swbst_reg: swbst {
+                               regulator-min-microvolt = <5000000>;
+                               regulator-max-microvolt = <5150000>;
+                       };
+
+                       snvs_reg: vsnvs {
+                               regulator-min-microvolt = <1000000>;
+                               regulator-max-microvolt = <3000000>;
+                               regulator-boot-on;
+                               regulator-always-on;
+                       };
+
+                       vref_reg: vrefddr {
+                               regulator-boot-on;
+                               regulator-always-on;
+                       };
+
+                       vgen1_reg: vldo1 {
+                               regulator-min-microvolt = <1800000>;
+                               regulator-max-microvolt = <3300000>;
+                               regulator-always-on;
+                       };
+
+                       vgen2_reg: vldo2 {
+                               regulator-min-microvolt = <800000>;
+                               regulator-max-microvolt = <1550000>;
+                               regulator-always-on;
+                       };
+
+                       vgen3_reg: vccsd {
+                               regulator-min-microvolt = <2850000>;
+                               regulator-max-microvolt = <3300000>;
+                               regulator-always-on;
+                       };
+
+                       vgen4_reg: v33 {
+                               regulator-min-microvolt = <2850000>;
+                               regulator-max-microvolt = <3300000>;
+                               regulator-always-on;
+                       };
+
+                       vgen5_reg: vldo3 {
+                               regulator-min-microvolt = <1800000>;
+                               regulator-max-microvolt = <3300000>;
+                               regulator-always-on;
+                       };
+
+                       vgen6_reg: vldo4 {
+                               regulator-min-microvolt = <1800000>;
+                               regulator-max-microvolt = <3300000>;
+                               regulator-always-on;
+                       };
+               };
+       };
+};
+
+&i2c2 {
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_i2c2>;
+       status = "okay";
+
+       rtc@68 {
+               compatible = "rv4162";
+               pinctrl-names = "default";
+               pinctrl-0 = <&pinctrl_i2c2_rv4162>;
+               reg = <0x68>;
+               interrupts-extended = <&gpio2 15 IRQ_TYPE_LEVEL_LOW>;
+       };
+};
+
+&i2c3 {
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_i2c3>;
+       status = "okay";
+
+       touch@48 {
+               compatible = "ti,tsc2004";
+               reg = <0x48>;
+               pinctrl-names = "default";
+               pinctrl-0 = <&pinctrl_i2c3_tsc2004>;
+               interrupts-extended = <&gpio3 4 IRQ_TYPE_EDGE_FALLING>;
+               wakeup-gpios = <&gpio3 4 GPIO_ACTIVE_LOW>;
+       };
+};
+
+&i2c4 {
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_i2c4>;
+       status = "okay";
+
+       codec: wm8960@1a {
+               compatible = "wlf,wm8960";
+               reg = <0x1a>;
+               clocks = <&clks IMX7D_AUDIO_MCLK_ROOT_CLK>;
+               clock-names = "mclk";
+               wlf,shared-lrclk;
+       };
+};
+
+&lcdif {
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_lcdif_dat
+                    &pinctrl_lcdif_ctrl>;
+       lcd-supply = <&reg_vref_3v3>;
+       display = <&display0>;
+       status = "okay";
+
+       display0: lcd-display {
+               bits-per-pixel = <16>;
+               bus-width = <18>;
+
+               display-timings {
+                       native-mode = <&t_lcd>;
+                       t_lcd: t_lcd_default {
+                               /* default to Okaya display */
+                               clock-frequency = <30000000>;
+                               hactive = <800>;
+                               vactive = <480>;
+                               hfront-porch = <40>;
+                               hback-porch = <40>;
+                               hsync-len = <48>;
+                               vback-porch = <29>;
+                               vfront-porch = <13>;
+                               vsync-len = <3>;
+                               hsync-active = <0>;
+                               vsync-active = <0>;
+                               de-active = <1>;
+                               pixelclk-active = <0>;
+                       };
+               };
+       };
+};
+
+&pwm1 {
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_pwm1>;
+       status = "okay";
+};
+
+&pwm2 {
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_pwm2>;
+       status = "okay";
+};
+
+&uart1 {
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_uart1>;
+       assigned-clocks = <&clks IMX7D_UART1_ROOT_SRC>;
+       assigned-clock-parents = <&clks IMX7D_OSC_24M_CLK>;
+       status = "okay";
+};
+
+&uart2 {
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_uart2>;
+       assigned-clocks = <&clks IMX7D_UART2_ROOT_SRC>;
+       assigned-clock-parents = <&clks IMX7D_OSC_24M_CLK>;
+       status = "okay";
+};
+
+&uart3 {
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_uart3>;
+       assigned-clocks = <&clks IMX7D_UART3_ROOT_SRC>;
+       assigned-clock-parents = <&clks IMX7D_OSC_24M_CLK>;
+       status = "okay";
+};
+
+&uart6 {
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_uart6>;
+       assigned-clocks = <&clks IMX7D_UART6_ROOT_SRC>;
+       assigned-clock-parents = <&clks IMX7D_PLL_SYS_MAIN_240M_CLK>;
+       fsl,uart-has-rtscts;
+       status = "okay";
+};
+
+&usbotg1 {
+       vbus-supply = <&reg_usb_otg1_vbus>;
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_usbotg1>;
+       status = "okay";
+};
+
+&usbotg2 {
+       vbus-supply = <&reg_usb_otg2_vbus>;
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_usbotg2>;
+       dr_mode = "host";
+       status = "okay";
+};
+
+&usdhc1 {
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_usdhc1>;
+       cd-gpios = <&gpio5 0 GPIO_ACTIVE_LOW>;
+       vmmc-supply = <&vgen3_reg>;
+       bus-width = <4>;
+       fsl,tuning-step = <2>;
+       wakeup-source;
+       keep-power-in-suspend;
+       status = "okay";
+};
+
+&usdhc2 {
+       #address-cells = <1>;
+       #size-cells = <0>;
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_usdhc2>;
+       bus-width = <4>;
+       non-removable;
+       vmmc-supply = <&reg_wlan>;
+       cap-power-off-card;
+       keep-power-in-suspend;
+       status = "okay";
+
+       wlcore: wlcore@2 {
+               compatible = "ti,wl1271";
+               reg = <2>;
+               interrupt-parent = <&gpio4>;
+               interrupts = <20 IRQ_TYPE_LEVEL_HIGH>;
+               ref-clock-frequency = <38400000>;
+       };
+};
+
+&usdhc3 {
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_usdhc3>;
+       assigned-clocks = <&clks IMX7D_USDHC3_ROOT_CLK>;
+       assigned-clock-rates = <400000000>;
+       bus-width = <8>;
+       fsl,tuning-step = <2>;
+       non-removable;
+       status = "okay";
+};
+
+&wdog1 {
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_wdog1>;
+       status = "okay";
+};
+
+&iomuxc {
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_hog_1 &pinctrl_j2>;
+
+       pinctrl_hog_1: hoggrp-1 {
+               fsl,pins = <
+                       MX7D_PAD_SD3_RESET_B__GPIO6_IO11        0x5d
+                       MX7D_PAD_GPIO1_IO13__GPIO1_IO13         0x7d
+                       MX7D_PAD_ECSPI2_MISO__GPIO4_IO22        0x7d
+               >;
+       };
+
+       pinctrl_enet1: enet1grp {
+               fsl,pins = <
+                       MX7D_PAD_GPIO1_IO10__ENET1_MDIO                 0x3
+                       MX7D_PAD_GPIO1_IO11__ENET1_MDC                  0x3
+                       MX7D_PAD_GPIO1_IO12__CCM_ENET_REF_CLK1          0x3
+                       MX7D_PAD_ENET1_RGMII_TXC__ENET1_RGMII_TXC       0x71
+                       MX7D_PAD_ENET1_RGMII_TD0__ENET1_RGMII_TD0       0x71
+                       MX7D_PAD_ENET1_RGMII_TD1__ENET1_RGMII_TD1       0x71
+                       MX7D_PAD_ENET1_RGMII_TD2__ENET1_RGMII_TD2       0x71
+                       MX7D_PAD_ENET1_RGMII_TD3__ENET1_RGMII_TD3       0x71
+                       MX7D_PAD_ENET1_RGMII_TX_CTL__ENET1_RGMII_TX_CTL 0x71
+                       MX7D_PAD_ENET1_RGMII_RXC__ENET1_RGMII_RXC       0x71
+                       MX7D_PAD_ENET1_RGMII_RD0__ENET1_RGMII_RD0       0x11
+                       MX7D_PAD_ENET1_RGMII_RD1__ENET1_RGMII_RD1       0x11
+                       MX7D_PAD_ENET1_RGMII_RD2__ENET1_RGMII_RD2       0x11
+                       MX7D_PAD_ENET1_RGMII_RD3__ENET1_RGMII_RD3       0x71
+                       MX7D_PAD_ENET1_RGMII_RX_CTL__ENET1_RGMII_RX_CTL 0x11
+                       MX7D_PAD_SD3_STROBE__GPIO6_IO10                 0x75
+               >;
+       };
+
+       pinctrl_flexcan2: flexcan2grp {
+               fsl,pins = <
+                       MX7D_PAD_GPIO1_IO14__FLEXCAN2_RX        0x7d
+                       MX7D_PAD_GPIO1_IO15__FLEXCAN2_TX        0x7d
+                       MX7D_PAD_EPDC_DATA14__GPIO2_IO14        0x7d
+               >;
+       };
+
+       pinctrl_i2c1: i2c1grp {
+               fsl,pins = <
+                       MX7D_PAD_I2C1_SDA__I2C1_SDA             0x4000007f
+                       MX7D_PAD_I2C1_SCL__I2C1_SCL             0x4000007f
+               >;
+       };
+
+       pinctrl_i2c2: i2c2grp {
+               fsl,pins = <
+                       MX7D_PAD_I2C2_SDA__I2C2_SDA             0x4000007f
+                       MX7D_PAD_I2C2_SCL__I2C2_SCL             0x4000007f
+               >;
+       };
+
+       pinctrl_i2c2_rv4162: i2c2-rv4162grp {
+               fsl,pins = <
+                       MX7D_PAD_EPDC_DATA15__GPIO2_IO15        0x7d
+               >;
+       };
+
+       pinctrl_i2c3: i2c3grp {
+               fsl,pins = <
+                       MX7D_PAD_I2C3_SDA__I2C3_SDA             0x4000007f
+                       MX7D_PAD_I2C3_SCL__I2C3_SCL             0x4000007f
+               >;
+       };
+
+       pinctrl_i2c3_tsc2004: i2c3tsc2004grp {
+               fsl,pins = <
+                       MX7D_PAD_LCD_RESET__GPIO3_IO4           0x79
+                       MX7D_PAD_SD2_WP__GPIO5_IO10             0x7d
+               >;
+       };
+
+       pinctrl_i2c4: i2c4grp {
+               fsl,pins = <
+                       MX7D_PAD_I2C4_SDA__I2C4_SDA             0x4000007f
+                       MX7D_PAD_I2C4_SCL__I2C4_SCL             0x4000007f
+               >;
+       };
+
+       pinctrl_j2: j2grp {
+               fsl,pins = <
+                       MX7D_PAD_SAI1_TX_DATA__GPIO6_IO15       0x7d
+                       MX7D_PAD_EPDC_BDR0__GPIO2_IO28          0x7d
+                       MX7D_PAD_SAI1_RX_DATA__GPIO6_IO12       0x7d
+                       MX7D_PAD_EPDC_BDR1__GPIO2_IO29          0x7d
+                       MX7D_PAD_SD1_WP__GPIO5_IO1              0x7d
+                       MX7D_PAD_EPDC_SDSHR__GPIO2_IO19         0x7d
+                       MX7D_PAD_SD1_RESET_B__GPIO5_IO2         0x7d
+                       MX7D_PAD_SD2_RESET_B__GPIO5_IO11        0x7d
+                       MX7D_PAD_EPDC_DATA07__GPIO2_IO7         0x7d
+                       MX7D_PAD_EPDC_DATA08__GPIO2_IO8         0x7d
+                       MX7D_PAD_EPDC_DATA09__GPIO2_IO9         0x7d
+                       MX7D_PAD_EPDC_DATA10__GPIO2_IO10        0x7d
+                       MX7D_PAD_EPDC_DATA11__GPIO2_IO11        0x7d
+                       MX7D_PAD_EPDC_DATA12__GPIO2_IO12        0x7d
+                       MX7D_PAD_SAI1_TX_SYNC__GPIO6_IO14       0x7d
+                       MX7D_PAD_EPDC_DATA13__GPIO2_IO13        0x7d
+                       MX7D_PAD_SAI1_TX_BCLK__GPIO6_IO13       0x7d
+                       MX7D_PAD_SD2_CD_B__GPIO5_IO9            0x7d
+                       MX7D_PAD_EPDC_GDCLK__GPIO2_IO24         0x7d
+                       MX7D_PAD_SAI2_RX_DATA__GPIO6_IO21       0x7d
+                       MX7D_PAD_EPDC_GDOE__GPIO2_IO25          0x7d
+                       MX7D_PAD_EPDC_GDRL__GPIO2_IO26          0x7d
+                       MX7D_PAD_SAI2_TX_DATA__GPIO6_IO22       0x7d
+                       MX7D_PAD_EPDC_SDCE0__GPIO2_IO20         0x7d
+                       MX7D_PAD_SAI2_TX_BCLK__GPIO6_IO20       0x7d
+                       MX7D_PAD_EPDC_SDCE1__GPIO2_IO21         0x7d
+                       MX7D_PAD_SAI2_TX_SYNC__GPIO6_IO19       0x7d
+                       MX7D_PAD_EPDC_SDCE2__GPIO2_IO22         0x7d
+                       MX7D_PAD_EPDC_SDCE3__GPIO2_IO23         0x7d
+                       MX7D_PAD_EPDC_GDSP__GPIO2_IO27          0x7d
+                       MX7D_PAD_EPDC_SDCLK__GPIO2_IO16         0x7d
+                       MX7D_PAD_EPDC_SDLE__GPIO2_IO17          0x7d
+                       MX7D_PAD_EPDC_SDOE__GPIO2_IO18          0x7d
+                       MX7D_PAD_EPDC_PWR_COM__GPIO2_IO30       0x7d
+                       MX7D_PAD_EPDC_PWR_STAT__GPIO2_IO31      0x7d
+               >;
+       };
+
+       pinctrl_lcdif_dat: lcdifdatgrp {
+               fsl,pins = <
+                       MX7D_PAD_LCD_DATA00__LCD_DATA0          0x79
+                       MX7D_PAD_LCD_DATA01__LCD_DATA1          0x79
+                       MX7D_PAD_LCD_DATA02__LCD_DATA2          0x79
+                       MX7D_PAD_LCD_DATA03__LCD_DATA3          0x79
+                       MX7D_PAD_LCD_DATA04__LCD_DATA4          0x79
+                       MX7D_PAD_LCD_DATA05__LCD_DATA5          0x79
+                       MX7D_PAD_LCD_DATA06__LCD_DATA6          0x79
+                       MX7D_PAD_LCD_DATA07__LCD_DATA7          0x79
+                       MX7D_PAD_LCD_DATA08__LCD_DATA8          0x79
+                       MX7D_PAD_LCD_DATA09__LCD_DATA9          0x79
+                       MX7D_PAD_LCD_DATA10__LCD_DATA10         0x79
+                       MX7D_PAD_LCD_DATA11__LCD_DATA11         0x79
+                       MX7D_PAD_LCD_DATA12__LCD_DATA12         0x79
+                       MX7D_PAD_LCD_DATA13__LCD_DATA13         0x79
+                       MX7D_PAD_LCD_DATA14__LCD_DATA14         0x79
+                       MX7D_PAD_LCD_DATA15__LCD_DATA15         0x79
+                       MX7D_PAD_LCD_DATA16__LCD_DATA16         0x79
+                       MX7D_PAD_LCD_DATA17__LCD_DATA17         0x79
+                       MX7D_PAD_LCD_DATA18__LCD_DATA18         0x79
+                       MX7D_PAD_LCD_DATA19__LCD_DATA19         0x79
+                       MX7D_PAD_LCD_DATA20__LCD_DATA20         0x79
+                       MX7D_PAD_LCD_DATA21__LCD_DATA21         0x79
+                       MX7D_PAD_LCD_DATA22__LCD_DATA22         0x79
+                       MX7D_PAD_LCD_DATA23__LCD_DATA23         0x79
+               >;
+       };
+
+       pinctrl_lcdif_ctrl: lcdifctrlgrp {
+               fsl,pins = <
+                       MX7D_PAD_LCD_CLK__LCD_CLK               0x79
+                       MX7D_PAD_LCD_ENABLE__LCD_ENABLE         0x79
+                       MX7D_PAD_LCD_VSYNC__LCD_VSYNC           0x79
+                       MX7D_PAD_LCD_HSYNC__LCD_HSYNC           0x79
+               >;
+       };
+
+       pinctrl_pwm2: pwm2grp {
+               fsl,pins = <
+                       MX7D_PAD_GPIO1_IO09__PWM2_OUT           0x7d
+               >;
+       };
+
+       pinctrl_uart1: uart1grp {
+               fsl,pins = <
+                       MX7D_PAD_UART1_TX_DATA__UART1_DCE_TX    0x79
+                       MX7D_PAD_UART1_RX_DATA__UART1_DCE_RX    0x79
+               >;
+       };
+
+       pinctrl_uart2: uart2grp {
+               fsl,pins = <
+                       MX7D_PAD_UART2_TX_DATA__UART2_DCE_TX    0x79
+                       MX7D_PAD_UART2_RX_DATA__UART2_DCE_RX    0x79
+               >;
+       };
+
+       pinctrl_uart3: uart3grp {
+               fsl,pins = <
+                       MX7D_PAD_UART3_TX_DATA__UART3_DCE_TX    0x79
+                       MX7D_PAD_UART3_RX_DATA__UART3_DCE_RX    0x79
+                       MX7D_PAD_EPDC_DATA04__GPIO2_IO4         0x7d
+               >;
+       };
+
+       pinctrl_uart6: uart6grp {
+               fsl,pins = <
+                       MX7D_PAD_ECSPI1_MOSI__UART6_DCE_TX      0x79
+                       MX7D_PAD_ECSPI1_SCLK__UART6_DCE_RX      0x79
+                       MX7D_PAD_ECSPI1_SS0__UART6_DCE_CTS      0x79
+                       MX7D_PAD_ECSPI1_MISO__UART6_DCE_RTS     0x79
+               >;
+       };
+
+       pinctrl_usbotg2: usbotg2grp {
+               fsl,pins = <
+                       MX7D_PAD_UART3_RTS_B__USB_OTG2_OC       0x7d
+                       MX7D_PAD_UART3_CTS_B__GPIO4_IO7         0x14
+               >;
+       };
+
+       pinctrl_usdhc1: usdhc1grp {
+               fsl,pins = <
+                       MX7D_PAD_SD1_CMD__SD1_CMD               0x59
+                       MX7D_PAD_SD1_CLK__SD1_CLK               0x19
+                       MX7D_PAD_SD1_DATA0__SD1_DATA0           0x59
+                       MX7D_PAD_SD1_DATA1__SD1_DATA1           0x59
+                       MX7D_PAD_SD1_DATA2__SD1_DATA2           0x59
+                       MX7D_PAD_SD1_DATA3__SD1_DATA3           0x59
+                       MX7D_PAD_GPIO1_IO08__SD1_VSELECT        0x75
+                       MX7D_PAD_SD1_CD_B__GPIO5_IO0            0x75
+               >;
+       };
+
+       pinctrl_usdhc2: usdhc2grp {
+               fsl,pins = <
+                       MX7D_PAD_SD2_CMD__SD2_CMD               0x59
+                       MX7D_PAD_SD2_CLK__SD2_CLK               0x19
+                       MX7D_PAD_SD2_DATA0__SD2_DATA0           0x59
+                       MX7D_PAD_SD2_DATA1__SD2_DATA1           0x59
+                       MX7D_PAD_SD2_DATA2__SD2_DATA2           0x59
+                       MX7D_PAD_SD2_DATA3__SD2_DATA3           0x59
+                       MX7D_PAD_ECSPI2_SCLK__GPIO4_IO20        0x59
+                       MX7D_PAD_ECSPI2_MOSI__GPIO4_IO21        0x59
+               >;
+       };
+
+       pinctrl_usdhc3: usdhc3grp {
+               fsl,pins = <
+                       MX7D_PAD_SD3_CMD__SD3_CMD               0x59
+                       MX7D_PAD_SD3_CLK__SD3_CLK               0x19
+                       MX7D_PAD_SD3_DATA0__SD3_DATA0           0x59
+                       MX7D_PAD_SD3_DATA1__SD3_DATA1           0x59
+                       MX7D_PAD_SD3_DATA2__SD3_DATA2           0x59
+                       MX7D_PAD_SD3_DATA3__SD3_DATA3           0x59
+                       MX7D_PAD_SD3_DATA4__SD3_DATA4           0x59
+                       MX7D_PAD_SD3_DATA5__SD3_DATA5           0x59
+                       MX7D_PAD_SD3_DATA6__SD3_DATA6           0x59
+                       MX7D_PAD_SD3_DATA7__SD3_DATA7           0x59
+               >;
+       };
+};
+
+&iomuxc_lpsr {
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_hog_2>;
+
+       pinctrl_hog_2: hoggrp-2 {
+               fsl,pins = <
+                       MX7D_PAD_GPIO1_IO02__GPIO1_IO2          0x7d
+                       MX7D_PAD_GPIO1_IO03__CCM_CLKO2          0x7d
+               >;
+       };
+
+       pinctrl_backlight_j9: backlightj9grp {
+               fsl,pins = <
+                       MX7D_PAD_GPIO1_IO07__GPIO1_IO7          0x7d
+               >;
+       };
+
+       pinctrl_pwm1: pwm1grp {
+               fsl,pins = <
+                       MX7D_PAD_GPIO1_IO01__PWM1_OUT           0x7d
+               >;
+       };
+
+       pinctrl_usbotg1: usbotg1grp {
+               fsl,pins = <
+                       MX7D_PAD_GPIO1_IO04__USB_OTG1_OC        0x7d
+                       MX7D_PAD_GPIO1_IO05__GPIO1_IO5          0x14
+               >;
+       };
+
+       pinctrl_wdog1: wdog1grp {
+               fsl,pins = <
+                       MX7D_PAD_GPIO1_IO00__WDOD1_WDOG_B       0x75
+               >;
+       };
+};
index b5a50e0..6b3faa2 100644 (file)
                                #pwm-cells = <2>;
                                status = "disabled";
                        };
+
+                       lcdif: lcdif@30730000 {
+                               compatible = "fsl,imx7d-lcdif", "fsl,imx28-lcdif";
+                               reg = <0x30730000 0x10000>;
+                               interrupts = <GIC_SPI 5 IRQ_TYPE_LEVEL_HIGH>;
+                               clocks = <&clks IMX7D_LCDIF_PIXEL_ROOT_CLK>,
+                                       <&clks IMX7D_CLK_DUMMY>,
+                                       <&clks IMX7D_CLK_DUMMY>;
+                               clock-names = "pix", "axi", "disp_axi";
+                               status = "disabled";
+                       };
                };
 
                aips3: aips-bus@30800000 {
                                status = "disabled";
                        };
 
+                       flexcan1: can@30a00000 {
+                               compatible = "fsl,imx7d-flexcan", "fsl,imx6q-flexcan";
+                               reg = <0x30a00000 0x10000>;
+                               interrupts = <GIC_SPI 110 IRQ_TYPE_LEVEL_HIGH>;
+                               clocks = <&clks IMX7D_CLK_DUMMY>,
+                                       <&clks IMX7D_CAN1_ROOT_CLK>;
+                               clock-names = "ipg", "per";
+                               status = "disabled";
+                       };
+
+                       flexcan2: can@30a10000 {
+                               compatible = "fsl,imx7d-flexcan", "fsl,imx6q-flexcan";
+                               reg = <0x30a10000 0x10000>;
+                               interrupts = <GIC_SPI 111 IRQ_TYPE_LEVEL_HIGH>;
+                               clocks = <&clks IMX7D_CLK_DUMMY>,
+                                       <&clks IMX7D_CAN2_ROOT_CLK>;
+                               clock-names = "ipg", "per";
+                               status = "disabled";
+                       };
+
                        i2c1: i2c@30a20000 {
                                #address-cells = <1>;
                                #size-cells = <0>;
index 0c82097..b9bbcce 100644 (file)
@@ -14,6 +14,7 @@
 #include <dt-bindings/clock/r8a7779-clock.h>
 #include <dt-bindings/interrupt-controller/arm-gic.h>
 #include <dt-bindings/interrupt-controller/irq.h>
+#include <dt-bindings/power/r8a7779-sysc.h>
 
 / {
        compatible = "renesas,r8a7779";
                        compatible = "arm,cortex-a9";
                        reg = <1>;
                        clock-frequency = <1000000000>;
+                       power-domains = <&sysc R8A7779_PD_ARM1>;
                };
                cpu@2 {
                        device_type = "cpu";
                        compatible = "arm,cortex-a9";
                        reg = <2>;
                        clock-frequency = <1000000000>;
+                       power-domains = <&sysc R8A7779_PD_ARM2>;
                };
                cpu@3 {
                        device_type = "cpu";
                        compatible = "arm,cortex-a9";
                        reg = <3>;
                        clock-frequency = <1000000000>;
+                       power-domains = <&sysc R8A7779_PD_ARM3>;
                };
        };
 
                reg = <0xffc70000 0x1000>;
                interrupts = <GIC_SPI 79 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp0_clks R8A7779_CLK_I2C0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0xffc71000 0x1000>;
                interrupts = <GIC_SPI 82 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp0_clks R8A7779_CLK_I2C1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0xffc72000 0x1000>;
                interrupts = <GIC_SPI 80 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp0_clks R8A7779_CLK_I2C2>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0xffc73000 0x1000>;
                interrupts = <GIC_SPI 81 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp0_clks R8A7779_CLK_I2C3>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp0_clks R8A7779_CLK_SCIF0>,
                         <&cpg_clocks R8A7779_CLK_S1>, <&scif_clk>;
                clock-names = "fck", "brg_int", "scif_clk";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp0_clks R8A7779_CLK_SCIF1>,
                         <&cpg_clocks R8A7779_CLK_S1>, <&scif_clk>;
                clock-names = "fck", "brg_int", "scif_clk";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp0_clks R8A7779_CLK_SCIF2>,
                         <&cpg_clocks R8A7779_CLK_S1>, <&scif_clk>;
                clock-names = "fck", "brg_int", "scif_clk";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp0_clks R8A7779_CLK_SCIF3>,
                         <&cpg_clocks R8A7779_CLK_S1>, <&scif_clk>;
                clock-names = "fck", "brg_int", "scif_clk";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp0_clks R8A7779_CLK_SCIF4>,
                         <&cpg_clocks R8A7779_CLK_S1>, <&scif_clk>;
                clock-names = "fck", "brg_int", "scif_clk";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp0_clks R8A7779_CLK_SCIF5>,
                         <&cpg_clocks R8A7779_CLK_S1>, <&scif_clk>;
                clock-names = "fck", "brg_int", "scif_clk";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                             <GIC_SPI 34 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp0_clks R8A7779_CLK_TMU0>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
 
                #renesas,channels = <3>;
 
                             <GIC_SPI 38 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp0_clks R8A7779_CLK_TMU1>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
 
                #renesas,channels = <3>;
 
                             <GIC_SPI 42 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp0_clks R8A7779_CLK_TMU2>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
 
                #renesas,channels = <3>;
 
                reg = <0xfc600000 0x2000>;
                interrupts = <GIC_SPI 100 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp1_clks R8A7779_CLK_SATA>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
        };
 
        sdhi0: sd@ffe4c000 {
                reg = <0xffe4c000 0x100>;
                interrupts = <GIC_SPI 104 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp3_clks R8A7779_CLK_SDHI0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0xffe4d000 0x100>;
                interrupts = <GIC_SPI 105 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp3_clks R8A7779_CLK_SDHI1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0xffe4e000 0x100>;
                interrupts = <GIC_SPI 107 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp3_clks R8A7779_CLK_SDHI2>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0xffe4f000 0x100>;
                interrupts = <GIC_SPI 106 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp3_clks R8A7779_CLK_SDHI3>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                #address-cells = <1>;
                #size-cells = <0>;
                clocks = <&mstp0_clks R8A7779_CLK_HSPI>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                #address-cells = <1>;
                #size-cells = <0>;
                clocks = <&mstp0_clks R8A7779_CLK_HSPI>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                #address-cells = <1>;
                #size-cells = <0>;
                clocks = <&mstp0_clks R8A7779_CLK_HSPI>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xfff80000 0 0x40000>;
                interrupts = <GIC_SPI 31 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp1_clks R8A7779_CLK_DU>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
 
                ports {
                                "mmc1", "mmc0";
                };
        };
+
+       sysc: system-controller@ffd85000 {
+               compatible = "renesas,r8a7779-sysc";
+               reg = <0xffd85000 0x0200>;
+               #power-domain-cells = <1>;
+       };
 };
index 935064f..83cf23c 100644 (file)
@@ -13,6 +13,7 @@
 #include <dt-bindings/clock/r8a7790-clock.h>
 #include <dt-bindings/interrupt-controller/arm-gic.h>
 #include <dt-bindings/interrupt-controller/irq.h>
+#include <dt-bindings/power/r8a7790-sysc.h>
 
 / {
        compatible = "renesas,r8a7790";
@@ -52,6 +53,7 @@
                        voltage-tolerance = <1>; /* 1% */
                        clocks = <&cpg_clocks R8A7790_CLK_Z>;
                        clock-latency = <300000>; /* 300 us */
+                       power-domains = <&sysc R8A7790_PD_CA15_CPU0>;
                        next-level-cache = <&L2_CA15>;
 
                        /* kHz - uV - OPPs unknown yet */
@@ -68,6 +70,7 @@
                        compatible = "arm,cortex-a15";
                        reg = <1>;
                        clock-frequency = <1300000000>;
+                       power-domains = <&sysc R8A7790_PD_CA15_CPU1>;
                        next-level-cache = <&L2_CA15>;
                };
 
@@ -76,6 +79,7 @@
                        compatible = "arm,cortex-a15";
                        reg = <2>;
                        clock-frequency = <1300000000>;
+                       power-domains = <&sysc R8A7790_PD_CA15_CPU2>;
                        next-level-cache = <&L2_CA15>;
                };
 
@@ -84,6 +88,7 @@
                        compatible = "arm,cortex-a15";
                        reg = <3>;
                        clock-frequency = <1300000000>;
+                       power-domains = <&sysc R8A7790_PD_CA15_CPU3>;
                        next-level-cache = <&L2_CA15>;
                };
 
@@ -92,6 +97,7 @@
                        compatible = "arm,cortex-a7";
                        reg = <0x100>;
                        clock-frequency = <780000000>;
+                       power-domains = <&sysc R8A7790_PD_CA7_CPU0>;
                        next-level-cache = <&L2_CA7>;
                };
 
                        compatible = "arm,cortex-a7";
                        reg = <0x101>;
                        clock-frequency = <780000000>;
+                       power-domains = <&sysc R8A7790_PD_CA7_CPU1>;
                        next-level-cache = <&L2_CA7>;
                };
 
                        compatible = "arm,cortex-a7";
                        reg = <0x102>;
                        clock-frequency = <780000000>;
+                       power-domains = <&sysc R8A7790_PD_CA7_CPU2>;
                        next-level-cache = <&L2_CA7>;
                };
 
                        compatible = "arm,cortex-a7";
                        reg = <0x103>;
                        clock-frequency = <780000000>;
+                       power-domains = <&sysc R8A7790_PD_CA7_CPU3>;
                        next-level-cache = <&L2_CA7>;
                };
        };
 
        L2_CA15: cache-controller@0 {
                compatible = "cache";
+               power-domains = <&sysc R8A7790_PD_CA15_SCU>;
                cache-unified;
                cache-level = <2>;
        };
 
        L2_CA7: cache-controller@1 {
                compatible = "cache";
+               power-domains = <&sysc R8A7790_PD_CA7_SCU>;
                cache-unified;
                cache-level = <2>;
        };
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7790_CLK_GPIO0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
        };
 
        gpio1: gpio@e6051000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7790_CLK_GPIO1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
        };
 
        gpio2: gpio@e6052000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7790_CLK_GPIO2>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
        };
 
        gpio3: gpio@e6053000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7790_CLK_GPIO3>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
        };
 
        gpio4: gpio@e6054000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7790_CLK_GPIO4>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
        };
 
        gpio5: gpio@e6055000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7790_CLK_GPIO5>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
        };
 
        thermal: thermal@e61f0000 {
                reg = <0 0xe61f0000 0 0x14>, <0 0xe61f0100 0 0x38>;
                interrupts = <GIC_SPI 69 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp5_clks R8A7790_CLK_THERMAL>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                #thermal-sensor-cells = <0>;
        };
 
                             <GIC_SPI 143 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp1_clks R8A7790_CLK_CMT0>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
 
                renesas,channels-mask = <0x60>;
 
                             <GIC_SPI 127 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp3_clks R8A7790_CLK_CMT1>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
 
                renesas,channels-mask = <0xff>;
 
                             <GIC_SPI 2 IRQ_TYPE_LEVEL_HIGH>,
                             <GIC_SPI 3 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp4_clks R8A7790_CLK_IRQC>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
        };
 
        dmac0: dma-controller@e6700000 {
                                "ch12", "ch13", "ch14";
                clocks = <&mstp2_clks R8A7790_CLK_SYS_DMAC0>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <15>;
        };
                                "ch12", "ch13", "ch14";
                clocks = <&mstp2_clks R8A7790_CLK_SYS_DMAC1>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <15>;
        };
                                "ch12";
                clocks = <&mstp5_clks R8A7790_CLK_AUDIO_DMAC0>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <13>;
        };
                                "ch12";
                clocks = <&mstp5_clks R8A7790_CLK_AUDIO_DMAC1>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <13>;
        };
                              GIC_SPI 109 IRQ_TYPE_LEVEL_HIGH>;
                interrupt-names = "ch0", "ch1";
                clocks = <&mstp3_clks R8A7790_CLK_USBDMAC0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <2>;
        };
                              GIC_SPI 110 IRQ_TYPE_LEVEL_HIGH>;
                interrupt-names = "ch0", "ch1";
                clocks = <&mstp3_clks R8A7790_CLK_USBDMAC1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <2>;
        };
                reg = <0 0xe6508000 0 0x40>;
                interrupts = <GIC_SPI 287 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7790_CLK_I2C0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                i2c-scl-internal-delay-ns = <110>;
                status = "disabled";
        };
                reg = <0 0xe6518000 0 0x40>;
                interrupts = <GIC_SPI 288 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7790_CLK_I2C1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                i2c-scl-internal-delay-ns = <6>;
                status = "disabled";
        };
                reg = <0 0xe6530000 0 0x40>;
                interrupts = <GIC_SPI 286 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7790_CLK_I2C2>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                i2c-scl-internal-delay-ns = <6>;
                status = "disabled";
        };
                reg = <0 0xe6540000 0 0x40>;
                interrupts = <GIC_SPI 290 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7790_CLK_I2C3>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                i2c-scl-internal-delay-ns = <110>;
                status = "disabled";
        };
                clocks = <&mstp3_clks R8A7790_CLK_IIC0>;
                dmas = <&dmac0 0x61>, <&dmac0 0x62>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp3_clks R8A7790_CLK_IIC1>;
                dmas = <&dmac0 0x65>, <&dmac0 0x66>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp3_clks R8A7790_CLK_IIC2>;
                dmas = <&dmac0 0x69>, <&dmac0 0x6a>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp9_clks R8A7790_CLK_IICDVFS>;
                dmas = <&dmac0 0x77>, <&dmac0 0x78>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp3_clks R8A7790_CLK_MMCIF0>;
                dmas = <&dmac0 0xd1>, <&dmac0 0xd2>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                reg-io-width = <4>;
                status = "disabled";
                max-frequency = <97500000>;
                clocks = <&mstp3_clks R8A7790_CLK_MMCIF1>;
                dmas = <&dmac0 0xe1>, <&dmac0 0xe2>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                reg-io-width = <4>;
                status = "disabled";
                max-frequency = <97500000>;
                dmas = <&dmac1 0xcd>, <&dmac1 0xce>;
                dma-names = "tx", "rx";
                max-frequency = <195000000>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                dmas = <&dmac1 0xc9>, <&dmac1 0xca>;
                dma-names = "tx", "rx";
                max-frequency = <195000000>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                dmas = <&dmac1 0xc1>, <&dmac1 0xc2>;
                dma-names = "tx", "rx";
                max-frequency = <97500000>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                dmas = <&dmac1 0xd3>, <&dmac1 0xd4>;
                dma-names = "tx", "rx";
                max-frequency = <97500000>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x21>, <&dmac0 0x22>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x25>, <&dmac0 0x26>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x27>, <&dmac0 0x28>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x3d>, <&dmac0 0x3e>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x19>, <&dmac0 0x1a>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x1d>, <&dmac0 0x1e>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x29>, <&dmac0 0x2a>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x2d>, <&dmac0 0x2e>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x2b>, <&dmac0 0x2c>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x39>, <&dmac0 0x3a>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x4d>, <&dmac0 0x4e>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xee700000 0 0x400>;
                interrupts = <GIC_SPI 162 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7790_CLK_ETHER>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                phy-mode = "rmii";
                #address-cells = <1>;
                #size-cells = <0>;
                reg = <0 0xe6800000 0 0x800>, <0 0xee0e8000 0 0x4000>;
                interrupts = <GIC_SPI 163 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7790_CLK_ETHERAVB>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                status = "disabled";
                reg = <0 0xee300000 0 0x2000>;
                interrupts = <GIC_SPI 105 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7790_CLK_SATA0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xee500000 0 0x2000>;
                interrupts = <GIC_SPI 106 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7790_CLK_SATA1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                dmas = <&usb_dmac0 0>, <&usb_dmac0 1>,
                       <&usb_dmac1 0>, <&usb_dmac1 1>;
                dma-names = "ch0", "ch1", "ch2", "ch3";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                renesas,buswait = <4>;
                phys = <&usb0 1>;
                phy-names = "usb";
                #size-cells = <0>;
                clocks = <&mstp7_clks R8A7790_CLK_HSUSB>;
                clock-names = "usbhs";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
 
                usb0: usb-channel@0 {
                reg = <0 0xe6ef0000 0 0x1000>;
                interrupts = <GIC_SPI 188 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7790_CLK_VIN0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xe6ef1000 0 0x1000>;
                interrupts = <GIC_SPI 189 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7790_CLK_VIN1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xe6ef2000 0 0x1000>;
                interrupts = <GIC_SPI 190 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7790_CLK_VIN2>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xe6ef3000 0 0x1000>;
                interrupts = <GIC_SPI 191 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7790_CLK_VIN3>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xfe920000 0 0x8000>;
                interrupts = <GIC_SPI 266 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp1_clks R8A7790_CLK_VSP1_R>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
 
                renesas,has-sru;
                renesas,#rpf = <5>;
                reg = <0 0xfe928000 0 0x8000>;
                interrupts = <GIC_SPI 267 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp1_clks R8A7790_CLK_VSP1_S>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
 
                renesas,has-lut;
                renesas,has-sru;
                reg = <0 0xfe930000 0 0x8000>;
                interrupts = <GIC_SPI 246 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp1_clks R8A7790_CLK_VSP1_DU0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
 
                renesas,has-lif;
                renesas,has-lut;
                reg = <0 0xfe938000 0 0x8000>;
                interrupts = <GIC_SPI 247 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp1_clks R8A7790_CLK_VSP1_DU1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
 
                renesas,has-lif;
                renesas,has-lut;
                clocks = <&mstp9_clks R8A7790_CLK_RCAN0>,
                         <&cpg_clocks R8A7790_CLK_RCAN>, <&can_clk>;
                clock-names = "clkp1", "clkp2", "can_clk";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp9_clks R8A7790_CLK_RCAN1>,
                         <&cpg_clocks R8A7790_CLK_RCAN>, <&can_clk>;
                clock-names = "clkp1", "clkp2", "can_clk";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xfe980000 0 0x10300>;
                interrupts = <GIC_SPI 272 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp1_clks R8A7790_CLK_JPU>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
        };
 
        clocks {
                };
        };
 
+       sysc: system-controller@e6180000 {
+               compatible = "renesas,r8a7790-sysc";
+               reg = <0 0xe6180000 0 0x0200>;
+               #power-domain-cells = <1>;
+       };
+
        qspi: spi@e6b10000 {
                compatible = "renesas,qspi-r8a7790", "renesas,qspi";
                reg = <0 0xe6b10000 0 0x2c>;
                clocks = <&mstp9_clks R8A7790_CLK_QSPI_MOD>;
                dmas = <&dmac0 0x17>, <&dmac0 0x18>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                num-cs = <1>;
                #address-cells = <1>;
                #size-cells = <0>;
                clocks = <&mstp0_clks R8A7790_CLK_MSIOF0>;
                dmas = <&dmac0 0x51>, <&dmac0 0x52>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                status = "disabled";
                clocks = <&mstp2_clks R8A7790_CLK_MSIOF1>;
                dmas = <&dmac0 0x55>, <&dmac0 0x56>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                status = "disabled";
                clocks = <&mstp2_clks R8A7790_CLK_MSIOF2>;
                dmas = <&dmac0 0x41>, <&dmac0 0x42>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                status = "disabled";
                clocks = <&mstp2_clks R8A7790_CLK_MSIOF3>;
                dmas = <&dmac0 0x45>, <&dmac0 0x46>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                status = "disabled";
                reg = <0 0xee000000 0 0xc00>;
                interrupts = <GIC_SPI 101 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp3_clks R8A7790_CLK_SSUSB>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                phys = <&usb2 1>;
                phy-names = "usb";
                status = "disabled";
                      <0 0xee080000 0 0x1100>;
                interrupts = <GIC_SPI 108 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp7_clks R8A7790_CLK_EHCI>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
 
                bus-range = <0 0>;
                      <0 0xee0a0000 0 0x1100>;
                interrupts = <GIC_SPI 112 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp7_clks R8A7790_CLK_EHCI>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
 
                bus-range = <1 1>;
                compatible = "renesas,pci-r8a7790", "renesas,pci-rcar-gen2";
                device_type = "pci";
                clocks = <&mstp7_clks R8A7790_CLK_EHCI>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                reg = <0 0xee0d0000 0 0xc00>,
                      <0 0xee0c0000 0 0x1100>;
                interrupts = <GIC_SPI 113 IRQ_TYPE_LEVEL_HIGH>;
                interrupt-map = <0 0 0 0 &gic GIC_SPI 116 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp3_clks R8A7790_CLK_PCIEC>, <&pcie_bus_clk>;
                clock-names = "pcie", "pcie_bus";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                                "mix.0", "mix.1",
                                "dvc.0", "dvc.1",
                                "clk_a", "clk_b", "clk_c", "clk_i";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
 
                status = "disabled";
 
index 565c270..db67e34 100644 (file)
@@ -13,6 +13,7 @@
 #include <dt-bindings/clock/r8a7791-clock.h>
 #include <dt-bindings/interrupt-controller/arm-gic.h>
 #include <dt-bindings/interrupt-controller/irq.h>
+#include <dt-bindings/power/r8a7791-sysc.h>
 
 / {
        compatible = "renesas,r8a7791";
@@ -51,6 +52,7 @@
                        voltage-tolerance = <1>; /* 1% */
                        clocks = <&cpg_clocks R8A7791_CLK_Z>;
                        clock-latency = <300000>; /* 300 us */
+                       power-domains = <&sysc R8A7791_PD_CA15_CPU0>;
                        next-level-cache = <&L2_CA15>;
 
                        /* kHz - uV - OPPs unknown yet */
@@ -67,6 +69,7 @@
                        compatible = "arm,cortex-a15";
                        reg = <1>;
                        clock-frequency = <1500000000>;
+                       power-domains = <&sysc R8A7791_PD_CA15_CPU1>;
                        next-level-cache = <&L2_CA15>;
                };
        };
@@ -92,6 +95,7 @@
 
        L2_CA15: cache-controller@0 {
                compatible = "cache";
+               power-domains = <&sysc R8A7791_PD_CA15_SCU>;
                cache-unified;
                cache-level = <2>;
        };
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7791_CLK_GPIO0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
        };
 
        gpio1: gpio@e6051000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7791_CLK_GPIO1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
        };
 
        gpio2: gpio@e6052000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7791_CLK_GPIO2>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
        };
 
        gpio3: gpio@e6053000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7791_CLK_GPIO3>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
        };
 
        gpio4: gpio@e6054000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7791_CLK_GPIO4>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
        };
 
        gpio5: gpio@e6055000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7791_CLK_GPIO5>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
        };
 
        gpio6: gpio@e6055400 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7791_CLK_GPIO6>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
        };
 
        gpio7: gpio@e6055800 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7791_CLK_GPIO7>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
        };
 
        thermal: thermal@e61f0000 {
                reg = <0 0xe61f0000 0 0x14>, <0 0xe61f0100 0 0x38>;
                interrupts = <GIC_SPI 69 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp5_clks R8A7791_CLK_THERMAL>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                #thermal-sensor-cells = <0>;
        };
 
                             <GIC_SPI 143 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp1_clks R8A7791_CLK_CMT0>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
 
                renesas,channels-mask = <0x60>;
 
                             <GIC_SPI 127 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp3_clks R8A7791_CLK_CMT1>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
 
                renesas,channels-mask = <0xff>;
 
                             <GIC_SPI 16 IRQ_TYPE_LEVEL_HIGH>,
                             <GIC_SPI 17 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp4_clks R8A7791_CLK_IRQC>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
        };
 
        dmac0: dma-controller@e6700000 {
                                "ch12", "ch13", "ch14";
                clocks = <&mstp2_clks R8A7791_CLK_SYS_DMAC0>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <15>;
        };
                                "ch12", "ch13", "ch14";
                clocks = <&mstp2_clks R8A7791_CLK_SYS_DMAC1>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <15>;
        };
                                "ch12";
                clocks = <&mstp5_clks R8A7791_CLK_AUDIO_DMAC0>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <13>;
        };
                                "ch12";
                clocks = <&mstp5_clks R8A7791_CLK_AUDIO_DMAC1>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <13>;
        };
                              GIC_SPI 109 IRQ_TYPE_LEVEL_HIGH>;
                interrupt-names = "ch0", "ch1";
                clocks = <&mstp3_clks R8A7791_CLK_USBDMAC0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <2>;
        };
                              GIC_SPI 110 IRQ_TYPE_LEVEL_HIGH>;
                interrupt-names = "ch0", "ch1";
                clocks = <&mstp3_clks R8A7791_CLK_USBDMAC1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <2>;
        };
                reg = <0 0xe6508000 0 0x40>;
                interrupts = <GIC_SPI 287 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7791_CLK_I2C0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                i2c-scl-internal-delay-ns = <6>;
                status = "disabled";
        };
                reg = <0 0xe6518000 0 0x40>;
                interrupts = <GIC_SPI 288 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7791_CLK_I2C1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                i2c-scl-internal-delay-ns = <6>;
                status = "disabled";
        };
                reg = <0 0xe6530000 0 0x40>;
                interrupts = <GIC_SPI 286 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7791_CLK_I2C2>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                i2c-scl-internal-delay-ns = <6>;
                status = "disabled";
        };
                reg = <0 0xe6540000 0 0x40>;
                interrupts = <GIC_SPI 290 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7791_CLK_I2C3>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                i2c-scl-internal-delay-ns = <6>;
                status = "disabled";
        };
                reg = <0 0xe6520000 0 0x40>;
                interrupts = <GIC_SPI 19 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7791_CLK_I2C4>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                i2c-scl-internal-delay-ns = <6>;
                status = "disabled";
        };
                reg = <0 0xe6528000 0 0x40>;
                interrupts = <GIC_SPI 20 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7791_CLK_I2C5>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                i2c-scl-internal-delay-ns = <110>;
                status = "disabled";
        };
                clocks = <&mstp9_clks R8A7791_CLK_IICDVFS>;
                dmas = <&dmac0 0x77>, <&dmac0 0x78>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp3_clks R8A7791_CLK_IIC0>;
                dmas = <&dmac0 0x61>, <&dmac0 0x62>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp3_clks R8A7791_CLK_IIC1>;
                dmas = <&dmac0 0x65>, <&dmac0 0x66>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp3_clks R8A7791_CLK_MMCIF0>;
                dmas = <&dmac0 0xd1>, <&dmac0 0xd2>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                reg-io-width = <4>;
                status = "disabled";
                max-frequency = <97500000>;
                clocks = <&mstp3_clks R8A7791_CLK_SDHI0>;
                dmas = <&dmac1 0xcd>, <&dmac1 0xce>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp3_clks R8A7791_CLK_SDHI1>;
                dmas = <&dmac1 0xc1>, <&dmac1 0xc2>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp3_clks R8A7791_CLK_SDHI2>;
                dmas = <&dmac1 0xd3>, <&dmac1 0xd4>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x21>, <&dmac0 0x22>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x25>, <&dmac0 0x26>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x27>, <&dmac0 0x28>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x1b>, <&dmac0 0x1c>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x1f>, <&dmac0 0x20>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x23>, <&dmac0 0x24>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x3d>, <&dmac0 0x3e>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x19>, <&dmac0 0x1a>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x1d>, <&dmac0 0x1e>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x29>, <&dmac0 0x2a>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x2d>, <&dmac0 0x2e>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x2b>, <&dmac0 0x2c>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x2f>, <&dmac0 0x30>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0xfb>, <&dmac0 0xfc>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0xfd>, <&dmac0 0xfe>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x39>, <&dmac0 0x3a>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x4d>, <&dmac0 0x4e>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x3b>, <&dmac0 0x3c>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xee700000 0 0x400>;
                interrupts = <GIC_SPI 162 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7791_CLK_ETHER>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                phy-mode = "rmii";
                #address-cells = <1>;
                #size-cells = <0>;
                reg = <0 0xe6800000 0 0x800>, <0 0xee0e8000 0 0x4000>;
                interrupts = <GIC_SPI 163 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7791_CLK_ETHERAVB>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                status = "disabled";
                reg = <0 0xee300000 0 0x2000>;
                interrupts = <GIC_SPI 105 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7791_CLK_SATA0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xee500000 0 0x2000>;
                interrupts = <GIC_SPI 106 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7791_CLK_SATA1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                dmas = <&usb_dmac0 0>, <&usb_dmac0 1>,
                       <&usb_dmac1 0>, <&usb_dmac1 1>;
                dma-names = "ch0", "ch1", "ch2", "ch3";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                renesas,buswait = <4>;
                phys = <&usb0 1>;
                phy-names = "usb";
                #size-cells = <0>;
                clocks = <&mstp7_clks R8A7791_CLK_HSUSB>;
                clock-names = "usbhs";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
 
                usb0: usb-channel@0 {
                reg = <0 0xe6ef0000 0 0x1000>;
                interrupts = <GIC_SPI 188 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7791_CLK_VIN0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xe6ef1000 0 0x1000>;
                interrupts = <GIC_SPI 189 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7791_CLK_VIN1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xe6ef2000 0 0x1000>;
                interrupts = <GIC_SPI 190 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7791_CLK_VIN2>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xfe928000 0 0x8000>;
                interrupts = <GIC_SPI 267 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp1_clks R8A7791_CLK_VSP1_S>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
 
                renesas,has-lut;
                renesas,has-sru;
                reg = <0 0xfe930000 0 0x8000>;
                interrupts = <GIC_SPI 246 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp1_clks R8A7791_CLK_VSP1_DU0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
 
                renesas,has-lif;
                renesas,has-lut;
                reg = <0 0xfe938000 0 0x8000>;
                interrupts = <GIC_SPI 247 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp1_clks R8A7791_CLK_VSP1_DU1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
 
                renesas,has-lif;
                renesas,has-lut;
                clocks = <&mstp9_clks R8A7791_CLK_RCAN0>,
                         <&cpg_clocks R8A7791_CLK_RCAN>, <&can_clk>;
                clock-names = "clkp1", "clkp2", "can_clk";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp9_clks R8A7791_CLK_RCAN1>,
                         <&cpg_clocks R8A7791_CLK_RCAN>, <&can_clk>;
                clock-names = "clkp1", "clkp2", "can_clk";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xfe980000 0 0x10300>;
                interrupts = <GIC_SPI 272 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp1_clks R8A7791_CLK_JPU>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
        };
 
        clocks {
                };
        };
 
+       sysc: system-controller@e6180000 {
+               compatible = "renesas,r8a7791-sysc";
+               reg = <0 0xe6180000 0 0x0200>;
+               #power-domain-cells = <1>;
+       };
+
        qspi: spi@e6b10000 {
                compatible = "renesas,qspi-r8a7791", "renesas,qspi";
                reg = <0 0xe6b10000 0 0x2c>;
                clocks = <&mstp9_clks R8A7791_CLK_QSPI_MOD>;
                dmas = <&dmac0 0x17>, <&dmac0 0x18>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                num-cs = <1>;
                #address-cells = <1>;
                #size-cells = <0>;
                clocks = <&mstp0_clks R8A7791_CLK_MSIOF0>;
                dmas = <&dmac0 0x51>, <&dmac0 0x52>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                status = "disabled";
                clocks = <&mstp2_clks R8A7791_CLK_MSIOF1>;
                dmas = <&dmac0 0x55>, <&dmac0 0x56>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                status = "disabled";
                clocks = <&mstp2_clks R8A7791_CLK_MSIOF2>;
                dmas = <&dmac0 0x41>, <&dmac0 0x42>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                status = "disabled";
                reg = <0 0xee000000 0 0xc00>;
                interrupts = <GIC_SPI 101 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp3_clks R8A7791_CLK_SSUSB>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                phys = <&usb2 1>;
                phy-names = "usb";
                status = "disabled";
                      <0 0xee080000 0 0x1100>;
                interrupts = <GIC_SPI 108 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp7_clks R8A7791_CLK_EHCI>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
 
                bus-range = <0 0>;
                      <0 0xee0c0000 0 0x1100>;
                interrupts = <GIC_SPI 113 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp7_clks R8A7791_CLK_EHCI>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
 
                bus-range = <1 1>;
                interrupt-map = <0 0 0 0 &gic GIC_SPI 116 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp3_clks R8A7791_CLK_PCIEC>, <&pcie_bus_clk>;
                clock-names = "pcie", "pcie_bus";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                                "mix.0", "mix.1",
                                "dvc.0", "dvc.1",
                                "clk_a", "clk_b", "clk_c", "clk_i";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
 
                status = "disabled";
 
index cf6dc2a..1dd6d20 100644 (file)
@@ -11,6 +11,7 @@
 #include <dt-bindings/clock/r8a7793-clock.h>
 #include <dt-bindings/interrupt-controller/arm-gic.h>
 #include <dt-bindings/interrupt-controller/irq.h>
+#include <dt-bindings/power/r8a7793-sysc.h>
 
 / {
        compatible = "renesas,r8a7793";
@@ -43,6 +44,7 @@
                        voltage-tolerance = <1>; /* 1% */
                        clocks = <&cpg_clocks R8A7793_CLK_Z>;
                        clock-latency = <300000>; /* 300 us */
+                       power-domains = <&sysc R8A7793_PD_CA15_CPU0>;
 
                        /* kHz - uV - OPPs unknown yet */
                        operating-points = <1500000 1000000>,
@@ -76,6 +78,7 @@
 
        L2_CA15: cache-controller@0 {
                compatible = "cache";
+               power-domains = <&sysc R8A7793_PD_CA15_SCU>;
                cache-unified;
                cache-level = <2>;
        };
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7793_CLK_GPIO0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
        };
 
        gpio1: gpio@e6051000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7793_CLK_GPIO1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
        };
 
        gpio2: gpio@e6052000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7793_CLK_GPIO2>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
        };
 
        gpio3: gpio@e6053000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7793_CLK_GPIO3>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
        };
 
        gpio4: gpio@e6054000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7793_CLK_GPIO4>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
        };
 
        gpio5: gpio@e6055000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7793_CLK_GPIO5>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
        };
 
        gpio6: gpio@e6055400 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7793_CLK_GPIO6>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
        };
 
        gpio7: gpio@e6055800 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7793_CLK_GPIO7>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
        };
 
        thermal: thermal@e61f0000 {
                reg = <0 0xe61f0000 0 0x14>, <0 0xe61f0100 0 0x38>;
                interrupts = <GIC_SPI 69 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp5_clks R8A7793_CLK_THERMAL>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                #thermal-sensor-cells = <0>;
        };
 
                             <GIC_SPI 143 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp1_clks R8A7793_CLK_CMT0>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
 
                renesas,channels-mask = <0x60>;
 
                             <GIC_SPI 127 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp3_clks R8A7793_CLK_CMT1>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
 
                renesas,channels-mask = <0xff>;
 
                             <GIC_SPI 16 IRQ_TYPE_LEVEL_HIGH>,
                             <GIC_SPI 17 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp4_clks R8A7793_CLK_IRQC>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
        };
 
        dmac0: dma-controller@e6700000 {
                                "ch12", "ch13", "ch14";
                clocks = <&mstp2_clks R8A7793_CLK_SYS_DMAC0>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <15>;
        };
                                "ch12", "ch13", "ch14";
                clocks = <&mstp2_clks R8A7793_CLK_SYS_DMAC1>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <15>;
        };
                                "ch12";
                clocks = <&mstp5_clks R8A7793_CLK_AUDIO_DMAC0>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <13>;
        };
                                "ch12";
                clocks = <&mstp5_clks R8A7793_CLK_AUDIO_DMAC1>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <13>;
        };
                reg = <0 0xe6508000 0 0x40>;
                interrupts = <GIC_SPI 287 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7793_CLK_I2C0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                i2c-scl-internal-delay-ns = <6>;
                status = "disabled";
        };
                reg = <0 0xe6518000 0 0x40>;
                interrupts = <GIC_SPI 288 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7793_CLK_I2C1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                i2c-scl-internal-delay-ns = <6>;
                status = "disabled";
        };
                reg = <0 0xe6530000 0 0x40>;
                interrupts = <GIC_SPI 286 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7793_CLK_I2C2>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                i2c-scl-internal-delay-ns = <6>;
                status = "disabled";
        };
                reg = <0 0xe6540000 0 0x40>;
                interrupts = <GIC_SPI 290 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7793_CLK_I2C3>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                i2c-scl-internal-delay-ns = <6>;
                status = "disabled";
        };
                reg = <0 0xe6520000 0 0x40>;
                interrupts = <GIC_SPI 19 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7793_CLK_I2C4>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                i2c-scl-internal-delay-ns = <6>;
                status = "disabled";
        };
                reg = <0 0xe6528000 0 0x40>;
                interrupts = <GIC_SPI 20 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7793_CLK_I2C5>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                i2c-scl-internal-delay-ns = <110>;
                status = "disabled";
        };
                clocks = <&mstp9_clks R8A7793_CLK_IICDVFS>;
                dmas = <&dmac0 0x77>, <&dmac0 0x78>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp3_clks R8A7793_CLK_IIC0>;
                dmas = <&dmac0 0x61>, <&dmac0 0x62>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp3_clks R8A7793_CLK_IIC1>;
                dmas = <&dmac0 0x65>, <&dmac0 0x66>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp3_clks R8A7793_CLK_SDHI0>;
                dmas = <&dmac0 0xcd>, <&dmac0 0xce>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp3_clks R8A7793_CLK_SDHI1>;
                dmas = <&dmac0 0xc1>, <&dmac0 0xc2>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp3_clks R8A7793_CLK_SDHI2>;
                dmas = <&dmac0 0xd3>, <&dmac0 0xd4>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x21>, <&dmac0 0x22>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x25>, <&dmac0 0x26>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x27>, <&dmac0 0x28>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x1b>, <&dmac0 0x1c>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x1f>, <&dmac0 0x20>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x23>, <&dmac0 0x24>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x3d>, <&dmac0 0x3e>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x19>, <&dmac0 0x1a>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x1d>, <&dmac0 0x1e>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x29>, <&dmac0 0x2a>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x2d>, <&dmac0 0x2e>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x2b>, <&dmac0 0x2c>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x2f>, <&dmac0 0x30>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0xfb>, <&dmac0 0xfc>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0xfd>, <&dmac0 0xfe>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x39>, <&dmac0 0x3a>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x4d>, <&dmac0 0x4e>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x3b>, <&dmac0 0x3c>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xee700000 0 0x400>;
                interrupts = <GIC_SPI 162 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7793_CLK_ETHER>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                phy-mode = "rmii";
                #address-cells = <1>;
                #size-cells = <0>;
                clocks = <&mstp9_clks R8A7793_CLK_QSPI_MOD>;
                dmas = <&dmac0 0x17>, <&dmac0 0x18>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                num-cs = <1>;
                #address-cells = <1>;
                #size-cells = <0>;
                clocks = <&mstp9_clks R8A7793_CLK_RCAN0>,
                         <&cpg_clocks R8A7793_CLK_RCAN>, <&can_clk>;
                clock-names = "clkp1", "clkp2", "can_clk";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp9_clks R8A7793_CLK_RCAN1>,
                         <&cpg_clocks R8A7793_CLK_RCAN>, <&can_clk>;
                clock-names = "clkp1", "clkp2", "can_clk";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                };
        };
 
+       sysc: system-controller@e6180000 {
+               compatible = "renesas,r8a7793-sysc";
+               reg = <0 0xe6180000 0 0x0200>;
+               #power-domain-cells = <1>;
+       };
+
        ipmmu_sy0: mmu@e6280000 {
                compatible = "renesas,ipmmu-r8a7793", "renesas,ipmmu-vmsa";
                reg = <0 0xe6280000 0 0x1000>;
                                "src.4", "src.3", "src.2", "src.1", "src.0",
                                "dvc.0", "dvc.1",
                                "clk_a", "clk_b", "clk_c", "clk_i";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
 
                status = "disabled";
 
index e45b23f..f334a3a 100644 (file)
@@ -12,6 +12,7 @@
 #include <dt-bindings/clock/r8a7794-clock.h>
 #include <dt-bindings/interrupt-controller/arm-gic.h>
 #include <dt-bindings/interrupt-controller/irq.h>
+#include <dt-bindings/power/r8a7794-sysc.h>
 
 / {
        compatible = "renesas,r8a7794";
@@ -42,6 +43,7 @@
                        compatible = "arm,cortex-a7";
                        reg = <0>;
                        clock-frequency = <1000000000>;
+                       power-domains = <&sysc R8A7794_PD_CA7_CPU0>;
                        next-level-cache = <&L2_CA7>;
                };
 
                        compatible = "arm,cortex-a7";
                        reg = <1>;
                        clock-frequency = <1000000000>;
+                       power-domains = <&sysc R8A7794_PD_CA7_CPU1>;
                        next-level-cache = <&L2_CA7>;
                };
        };
 
        L2_CA7: cache-controller@1 {
                compatible = "cache";
+               power-domains = <&sysc R8A7794_PD_CA7_SCU>;
                cache-unified;
                cache-level = <2>;
        };
@@ -82,7 +86,7 @@
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7794_CLK_GPIO0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
        };
 
        gpio1: gpio@e6051000 {
@@ -95,7 +99,7 @@
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7794_CLK_GPIO1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
        };
 
        gpio2: gpio@e6052000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7794_CLK_GPIO2>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
        };
 
        gpio3: gpio@e6053000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7794_CLK_GPIO3>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
        };
 
        gpio4: gpio@e6054000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7794_CLK_GPIO4>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
        };
 
        gpio5: gpio@e6055000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7794_CLK_GPIO5>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
        };
 
        gpio6: gpio@e6055400 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7794_CLK_GPIO6>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
        };
 
        cmt0: timer@ffca0000 {
                             <GIC_SPI 143 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp1_clks R8A7794_CLK_CMT0>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
 
                renesas,channels-mask = <0x60>;
 
                             <GIC_SPI 127 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp3_clks R8A7794_CLK_CMT1>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
 
                renesas,channels-mask = <0xff>;
 
                             <GIC_SPI 16 IRQ_TYPE_LEVEL_HIGH>,
                             <GIC_SPI 17 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp4_clks R8A7794_CLK_IRQC>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
        };
 
        pfc: pin-controller@e6060000 {
                                "ch12", "ch13", "ch14";
                clocks = <&mstp2_clks R8A7794_CLK_SYS_DMAC0>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <15>;
        };
                                "ch12", "ch13", "ch14";
                clocks = <&mstp2_clks R8A7794_CLK_SYS_DMAC1>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <15>;
        };
                clock-names = "fck";
                dmas = <&dmac0 0x21>, <&dmac0 0x22>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x25>, <&dmac0 0x26>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x27>, <&dmac0 0x28>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x1b>, <&dmac0 0x1c>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x1f>, <&dmac0 0x20>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x23>, <&dmac0 0x24>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x3d>, <&dmac0 0x3e>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x19>, <&dmac0 0x1a>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x1d>, <&dmac0 0x1e>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x29>, <&dmac0 0x2a>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x2d>, <&dmac0 0x2e>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x2b>, <&dmac0 0x2c>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x2f>, <&dmac0 0x30>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0xfb>, <&dmac0 0xfc>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0xfd>, <&dmac0 0xfe>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x39>, <&dmac0 0x3a>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x4d>, <&dmac0 0x4e>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x3b>, <&dmac0 0x3c>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xee700000 0 0x400>;
                interrupts = <GIC_SPI 162 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7794_CLK_ETHER>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                phy-mode = "rmii";
                #address-cells = <1>;
                #size-cells = <0>;
                reg = <0 0xe6800000 0 0x800>, <0 0xee0e8000 0 0x4000>;
                interrupts = <GIC_SPI 163 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7794_CLK_ETHERAVB>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                status = "disabled";
                reg = <0 0xe6508000 0 0x40>;
                interrupts = <GIC_SPI 287 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7794_CLK_I2C0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                i2c-scl-internal-delay-ns = <6>;
                reg = <0 0xe6518000 0 0x40>;
                interrupts = <GIC_SPI 288 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7794_CLK_I2C1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                i2c-scl-internal-delay-ns = <6>;
                reg = <0 0xe6530000 0 0x40>;
                interrupts = <GIC_SPI 286 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7794_CLK_I2C2>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                i2c-scl-internal-delay-ns = <6>;
                reg = <0 0xe6540000 0 0x40>;
                interrupts = <GIC_SPI 290 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7794_CLK_I2C3>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                i2c-scl-internal-delay-ns = <6>;
                reg = <0 0xe6520000 0 0x40>;
                interrupts = <GIC_SPI 19 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7794_CLK_I2C4>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                i2c-scl-internal-delay-ns = <6>;
                reg = <0 0xe6528000 0 0x40>;
                interrupts = <GIC_SPI 20 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7794_CLK_I2C5>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                i2c-scl-internal-delay-ns = <6>;
                clocks = <&mstp3_clks R8A7794_CLK_IIC0>;
                dmas = <&dmac0 0x61>, <&dmac0 0x62>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                status = "disabled";
                clocks = <&mstp3_clks R8A7794_CLK_IIC1>;
                dmas = <&dmac0 0x65>, <&dmac0 0x66>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                status = "disabled";
                clocks = <&mstp3_clks R8A7794_CLK_MMCIF0>;
                dmas = <&dmac0 0xd1>, <&dmac0 0xd2>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                reg-io-width = <4>;
                status = "disabled";
        };
                reg = <0 0xee100000 0 0x200>;
                interrupts = <GIC_SPI 165 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp3_clks R8A7794_CLK_SDHI0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xee140000 0 0x100>;
                interrupts = <GIC_SPI 167 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp3_clks R8A7794_CLK_SDHI1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xee160000 0 0x100>;
                interrupts = <GIC_SPI 168 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp3_clks R8A7794_CLK_SDHI2>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp9_clks R8A7794_CLK_QSPI_MOD>;
                dmas = <&dmac0 0x17>, <&dmac0 0x18>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                num-cs = <1>;
                #address-cells = <1>;
                #size-cells = <0>;
                reg = <0 0xe6ef0000 0 0x1000>;
                interrupts = <GIC_SPI 188 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7794_CLK_VIN0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xe6ef1000 0 0x1000>;
                interrupts = <GIC_SPI 189 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7794_CLK_VIN1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                      <0 0xee080000 0 0x1100>;
                interrupts = <GIC_SPI 108 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp7_clks R8A7794_CLK_EHCI>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
 
                bus-range = <0 0>;
                      <0 0xee0c0000 0 0x1100>;
                interrupts = <GIC_SPI 113 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp7_clks R8A7794_CLK_EHCI>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
 
                bus-range = <1 1>;
                reg = <0 0xe6590000 0 0x100>;
                interrupts = <GIC_SPI 107 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp7_clks R8A7794_CLK_HSUSB>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                renesas,buswait = <4>;
                phys = <&usb0 1>;
                phy-names = "usb";
                #size-cells = <0>;
                clocks = <&mstp7_clks R8A7794_CLK_HSUSB>;
                clock-names = "usbhs";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
 
                usb0: usb-channel@0 {
                clocks = <&mstp9_clks R8A7794_CLK_RCAN0>,
                         <&cpg_clocks R8A7794_CLK_RCAN>, <&can_clk>;
                clock-names = "clkp1", "clkp2", "can_clk";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp9_clks R8A7794_CLK_RCAN1>,
                         <&cpg_clocks R8A7794_CLK_RCAN>, <&can_clk>;
                clock-names = "clkp1", "clkp2", "can_clk";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                };
        };
 
+       sysc: system-controller@e6180000 {
+               compatible = "renesas,r8a7794-sysc";
+               reg = <0 0xe6180000 0 0x0200>;
+               #power-domain-cells = <1>;
+       };
+
        ipmmu_sy0: mmu@e6280000 {
                compatible = "renesas,ipmmu-r8a7794", "renesas,ipmmu-vmsa";
                reg = <0 0xe6280000 0 0x1000>;
index a99f07a..941f362 100644 (file)
                vddio-pex-ctl-supply = <&vdd_3v3_lp0>;
                avdd-pll-erefe-supply = <&avdd_1v05_run>;
 
+               /* Mini PCIe */
                pci@1,0 {
+                       phys = <&{/padctl@0,7009f000/pads/pcie/lanes/pcie-4}>;
+                       phy-names = "pcie-0";
                        status = "okay";
                };
 
+               /* Gigabit Ethernet */
                pci@2,0 {
+                       phys = <&{/padctl@0,7009f000/pads/pcie/lanes/pcie-2}>;
+                       phy-names = "pcie-0";
                        status = "okay";
                };
        };
        sata@0,70020000 {
                status = "okay";
 
+               phys = <&{/padctl@0,7009f000/pads/sata/lanes/sata-0}>;
+               phy-names = "sata-0";
+
                hvdd-supply = <&vdd_3v3_lp0>;
                vddio-supply = <&vdd_1v05_run>;
                avdd-supply = <&vdd_1v05_run>;
                status = "okay";
        };
 
+       usb@0,70090000 {
+               phys = <&{/padctl@0,7009f000/pads/usb2/lanes/usb2-0}>, /* Micro A/B */
+                      <&{/padctl@0,7009f000/pads/usb2/lanes/usb2-1}>, /* Mini PCIe */
+                      <&{/padctl@0,7009f000/pads/usb2/lanes/usb2-2}>, /* USB3 */
+                      <&{/padctl@0,7009f000/pads/pcie/lanes/pcie-0}>; /* USB3 */
+               phy-names = "usb2-0", "usb2-1", "usb2-2", "usb3-0";
+
+               avddio-pex-supply = <&vdd_1v05_run>;
+               dvddio-pex-supply = <&vdd_1v05_run>;
+               avdd-usb-supply = <&vdd_3v3_lp0>;
+               avdd-pll-utmip-supply = <&vddio_1v8>;
+               avdd-pll-erefe-supply = <&avdd_1v05_run>;
+               avdd-usb-ss-pll-supply = <&vdd_1v05_run>;
+               hvdd-usb-ss-supply = <&vdd_3v3_lp0>;
+               hvdd-usb-ss-pll-e-supply = <&vdd_3v3_lp0>;
+
+               status = "okay";
+       };
+
        padctl@0,7009f000 {
-               pinctrl-0 = <&padctl_default>;
-               pinctrl-names = "default";
+               status = "okay";
 
-               padctl_default: pinmux {
-                       usb3 {
-                               nvidia,lanes = "pcie-0", "pcie-1";
-                               nvidia,function = "usb3";
-                               nvidia,iddq = <0>;
+               pads {
+                       usb2 {
+                               status = "okay";
+
+                               lanes {
+                                       usb2-0 {
+                                               nvidia,function = "xusb";
+                                               status = "okay";
+                                       };
+
+                                       usb2-1 {
+                                               nvidia,function = "xusb";
+                                               status = "okay";
+                                       };
+
+                                       usb2-2 {
+                                               nvidia,function = "xusb";
+                                               status = "okay";
+                                       };
+                               };
                        };
 
                        pcie {
-                               nvidia,lanes = "pcie-2", "pcie-3",
-                                              "pcie-4";
-                               nvidia,function = "pcie";
-                               nvidia,iddq = <0>;
+                               status = "okay";
+
+                               lanes {
+                                       pcie-0 {
+                                               nvidia,function = "usb3-ss";
+                                               status = "okay";
+                                       };
+
+                                       pcie-2 {
+                                               nvidia,function = "pcie";
+                                               status = "okay";
+                                       };
+
+                                       pcie-4 {
+                                               nvidia,function = "pcie";
+                                               status = "okay";
+                                       };
+                               };
                        };
 
                        sata {
-                               nvidia,lanes = "sata-0";
-                               nvidia,function = "sata";
-                               nvidia,iddq = <0>;
+                               status = "okay";
+
+                               lanes {
+                                       sata-0 {
+                                               nvidia,function = "sata";
+                                               status = "okay";
+                                       };
+                               };
+                       };
+               };
+
+               ports {
+                       /* Micro A/B */
+                       usb2-0 {
+                               status = "okay";
+                               mode = "otg";
+                       };
+
+                       /* Mini PCIe */
+                       usb2-1 {
+                               status = "okay";
+                               mode = "host";
+                       };
+
+                       /* USB3 */
+                       usb2-2 {
+                               status = "okay";
+                               mode = "host";
+
+                               vbus-supply = <&vdd_usb3_vbus>;
+                       };
+
+                       usb3-0 {
+                               nvidia,usb2-companion = <2>;
+                               status = "okay";
                        };
                };
        };
index 5f1fc14..0710a60 100644 (file)
                                        regulator-always-on;
                                };
 
-                               ldo0 {
+                               avdd_1v05_run: ldo0 {
                                        regulator-name = "+1.05V_RUN_AVDD";
                                        regulator-min-microvolt = <1050000>;
                                        regulator-max-microvolt = <1050000>;
                status = "okay";
        };
 
+       usb@0,70090000 {
+               phys = <&{/padctl@0,7009f000/pads/usb2/lanes/usb2-0}>, /* 1st USB A */
+                      <&{/padctl@0,7009f000/pads/usb2/lanes/usb2-1}>, /* Internal USB */
+                      <&{/padctl@0,7009f000/pads/usb2/lanes/usb2-2}>, /* 2nd USB A */
+                      <&{/padctl@0,7009f000/pads/pcie/lanes/pcie-0}>, /* 1st USB A */
+                      <&{/padctl@0,7009f000/pads/pcie/lanes/pcie-1}>; /* 2nd USB A */
+               phy-names = "usb2-0", "usb2-1", "usb2-2", "usb3-0", "usb3-1";
+
+               avddio-pex-supply = <&vdd_1v05_run>;
+               dvddio-pex-supply = <&vdd_1v05_run>;
+               avdd-usb-supply = <&vdd_3v3_lp0>;
+               avdd-pll-utmip-supply = <&vddio_1v8>;
+               avdd-pll-erefe-supply = <&avdd_1v05_run>;
+               avdd-usb-ss-pll-supply = <&vdd_1v05_run>;
+               hvdd-usb-ss-supply = <&vdd_3v3_lp0>;
+               hvdd-usb-ss-pll-e-supply = <&vdd_3v3_lp0>;
+
+               status = "okay";
+       };
+
+       padctl@0,7009f000 {
+               status = "okay";
+
+               pads {
+                       usb2 {
+                               status = "okay";
+
+                               lanes {
+                                       usb2-0 {
+                                               nvidia,function = "xusb";
+                                               status = "okay";
+                                       };
+
+                                       usb2-1 {
+                                               nvidia,function = "xusb";
+                                               status = "okay";
+                                       };
+
+                                       usb2-2 {
+                                               nvidia,function = "xusb";
+                                               status = "okay";
+                                       };
+                               };
+                       };
+
+                       pcie {
+                               status = "okay";
+
+                               lanes {
+                                       pcie-0 {
+                                               nvidia,function = "usb3-ss";
+                                               status = "okay";
+                                       };
+
+                                       pcie-1 {
+                                               nvidia,function = "usb3-ss";
+                                               status = "okay";
+                                       };
+                               };
+                       };
+               };
+
+               ports {
+                       usb2-0 {
+                               vbus-supply = <&vdd_usb1_vbus>;
+                               status = "okay";
+                               mode = "otg";
+                       };
+
+                       usb2-1 {
+                               vbus-supply = <&vdd_run_cam>;
+                               status = "okay";
+                               mode = "host";
+                       };
+
+                       usb2-2 {
+                               vbus-supply = <&vdd_usb3_vbus>;
+                               status = "okay";
+                               mode = "host";
+                       };
+
+                       usb3-0 {
+                               nvidia,usb2-companion = <0>;
+                               status = "okay";
+                       };
+
+                       usb3-1 {
+                               nvidia,usb2-companion = <1>;
+                               status = "okay";
+                       };
+               };
+       };
+
        sdhci0_pwrseq: sdhci0_pwrseq {
                compatible = "mmc-pwrseq-simple";
 
                };
        };
 
-       usb@0,7d000000 { /* Rear external USB port. */
-               status = "okay";
-       };
-
-       usb-phy@0,7d000000 {
-               status = "okay";
-               vbus-supply = <&vdd_usb1_vbus>;
-       };
-
-       usb@0,7d004000 { /* Internal webcam. */
-               status = "okay";
-       };
-
-       usb-phy@0,7d004000 {
-               status = "okay";
-               vbus-supply = <&vdd_run_cam>;
-       };
-
-       usb@0,7d008000 { /* Left external USB port. */
-               status = "okay";
-       };
-
-       usb-phy@0,7d008000 {
-               status = "okay";
-               vbus-supply = <&vdd_usb3_vbus>;
-       };
-
        backlight: backlight {
                compatible = "pwm-backlight";
 
index 0318258..973446d 100644 (file)
                                        regulator-always-on;
                                };
 
-                               ldo0 {
+                               avdd_1v05_run: ldo0 {
                                        regulator-name = "+1.05V_RUN_AVDD";
                                        regulator-min-microvolt = <1050000>;
                                        regulator-max-microvolt = <1050000>;
                status = "okay";
        };
 
+       usb@0,70090000 {
+               phys = <&{/padctl@0,7009f000/pads/usb2/lanes/usb2-0}>, /* 1st USB A */
+                      <&{/padctl@0,7009f000/pads/usb2/lanes/usb2-1}>, /* Internal USB */
+                      <&{/padctl@0,7009f000/pads/usb2/lanes/usb2-2}>, /* 2nd USB A */
+                      <&{/padctl@0,7009f000/pads/pcie/lanes/pcie-0}>, /* 1st USB A */
+                      <&{/padctl@0,7009f000/pads/pcie/lanes/pcie-1}>; /* 2nd USB A */
+               phy-names = "usb2-0", "usb2-1", "usb2-2", "usb3-0", "usb3-1";
+
+               avddio-pex-supply = <&vdd_1v05_run>;
+               dvddio-pex-supply = <&vdd_1v05_run>;
+               avdd-usb-supply = <&vdd_3v3_lp0>;
+               avdd-pll-utmip-supply = <&vddio_1v8>;
+               avdd-pll-erefe-supply = <&avdd_1v05_run>;
+               avdd-usb-ss-pll-supply = <&vdd_1v05_run>;
+               hvdd-usb-ss-supply = <&vdd_3v3_lp0>;
+               hvdd-usb-ss-pll-e-supply = <&vdd_3v3_lp0>;
+
+               status = "okay";
+       };
+
+       padctl@0,7009f000 {
+               pads {
+                       usb2 {
+                               status = "okay";
+
+                               lanes {
+                                       usb2-0 {
+                                               nvidia,function = "xusb";
+                                               status = "okay";
+                                       };
+
+                                       usb2-1 {
+                                               nvidia,function = "xusb";
+                                               status = "okay";
+                                       };
+
+                                       usb2-2 {
+                                               nvidia,function = "xusb";
+                                               status = "okay";
+                                       };
+                               };
+                       };
+
+                       pcie {
+                               status = "okay";
+
+                               lanes {
+                                       pcie-0 {
+                                               nvidia,function = "usb3-ss";
+                                               status = "okay";
+                                       };
+
+                                       pcie-1 {
+                                               nvidia,function = "usb3-ss";
+                                               status = "okay";
+                                       };
+                               };
+                       };
+               };
+
+               ports {
+                       usb2-0 {
+                               status = "okay";
+                               mode = "otg";
+
+                               vbus-supply = <&vdd_usb1_vbus>;
+                       };
+
+                       usb2-1 {
+                               status = "okay";
+                               mode = "host";
+
+                               vbus-supply = <&vdd_run_cam>;
+                       };
+
+                       usb2-2 {
+                               status = "okay";
+                               mode = "host";
+
+                               vbus-supply = <&vdd_usb3_vbus>;
+                       };
+
+                       usb3-0 {
+                               nvidia,usb2-companion = <0>;
+                               status = "okay";
+                       };
+
+                       usb3-1 {
+                               nvidia,usb2-companion = <2>;
+                               status = "okay";
+                       };
+               };
+       };
+
        sdhci@0,700b0400 {
                cd-gpios = <&gpio TEGRA_GPIO(V, 2) GPIO_ACTIVE_HIGH>;
                power-gpios = <&gpio TEGRA_GPIO(R, 0) GPIO_ACTIVE_HIGH>;
index e4eac1f..ea48118 100644 (file)
@@ -2,7 +2,6 @@
 #include <dt-bindings/gpio/tegra-gpio.h>
 #include <dt-bindings/memory/tegra124-mc.h>
 #include <dt-bindings/pinctrl/pinctrl-tegra.h>
-#include <dt-bindings/pinctrl/pinctrl-tegra-xusb.h>
 #include <dt-bindings/interrupt-controller/arm-gic.h>
 #include <dt-bindings/reset/tegra124-car.h>
 #include <dt-bindings/thermal/tegra124-soctherm.h>
@@ -51,9 +50,6 @@
                reset-names = "pex", "afi", "pcie_x";
                status = "disabled";
 
-               phys = <&padctl TEGRA_XUSB_PADCTL_PCIE>;
-               phy-names = "pcie";
-
                pci@1,0 {
                        device_type = "pci";
                        assigned-addresses = <0x82000800 0 0x01000000 0 0x1000>;
                         <&tegra_car 123>,
                         <&tegra_car 129>;
                reset-names = "sata", "sata-oob", "sata-cold";
-               phys = <&padctl TEGRA_XUSB_PADCTL_SATA>;
-               phy-names = "sata-phy";
                status = "disabled";
        };
 
                status = "disabled";
        };
 
+       usb@0,70090000 {
+               compatible = "nvidia,tegra124-xusb";
+               reg = <0x0 0x70090000 0x0 0x8000>,
+                     <0x0 0x70098000 0x0 0x1000>,
+                     <0x0 0x70099000 0x0 0x1000>;
+               reg-names = "hcd", "fpci", "ipfs";
+
+               interrupts = <GIC_SPI 39 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 40 IRQ_TYPE_LEVEL_HIGH>;
+
+               clocks = <&tegra_car TEGRA124_CLK_XUSB_HOST>,
+                        <&tegra_car TEGRA124_CLK_XUSB_HOST_SRC>,
+                        <&tegra_car TEGRA124_CLK_XUSB_FALCON_SRC>,
+                        <&tegra_car TEGRA124_CLK_XUSB_SS>,
+                        <&tegra_car TEGRA124_CLK_XUSB_SS_DIV2>,
+                        <&tegra_car TEGRA124_CLK_XUSB_SS_SRC>,
+                        <&tegra_car TEGRA124_CLK_XUSB_HS_SRC>,
+                        <&tegra_car TEGRA124_CLK_XUSB_FS_SRC>,
+                        <&tegra_car TEGRA124_CLK_PLL_U_480M>,
+                        <&tegra_car TEGRA124_CLK_CLK_M>,
+                        <&tegra_car TEGRA124_CLK_PLL_E>;
+               clock-names = "xusb_host", "xusb_host_src",
+                             "xusb_falcon_src", "xusb_ss",
+                             "xusb_ss_div2", "xusb_ss_src",
+                             "xusb_hs_src", "xusb_fs_src",
+                             "pll_u_480m", "clk_m", "pll_e";
+               resets = <&tegra_car 89>, <&tegra_car 156>,
+                        <&tegra_car 143>;
+               reset-names = "xusb_host", "xusb_ss", "xusb_src";
+
+               nvidia,xusb-padctl = <&padctl>;
+
+               status = "disabled";
+       };
+
        padctl: padctl@0,7009f000 {
                compatible = "nvidia,tegra124-xusb-padctl";
                reg = <0x0 0x7009f000 0x0 0x1000>;
                resets = <&tegra_car 142>;
                reset-names = "padctl";
 
-               #phy-cells = <1>;
+               pads {
+                       usb2 {
+                               status = "disabled";
+
+                               lanes {
+                                       usb2-0 {
+                                               status = "disabled";
+                                               #phy-cells = <0>;
+                                       };
+
+                                       usb2-1 {
+                                               status = "disabled";
+                                               #phy-cells = <0>;
+                                       };
+
+                                       usb2-2 {
+                                               status = "disabled";
+                                               #phy-cells = <0>;
+                                       };
+                               };
+                       };
+
+                       ulpi {
+                               status = "disabled";
+
+                               lanes {
+                                       ulpi-0 {
+                                               status = "disabled";
+                                               #phy-cells = <0>;
+                                       };
+                               };
+                       };
+
+                       hsic {
+                               status = "disabled";
+
+                               lanes {
+                                       hsic-0 {
+                                               status = "disabled";
+                                               #phy-cells = <0>;
+                                       };
+
+                                       hsic-1 {
+                                               status = "disabled";
+                                               #phy-cells = <0>;
+                                       };
+                               };
+                       };
+
+                       pcie {
+                               status = "disabled";
+
+                               lanes {
+                                       pcie-0 {
+                                               status = "disabled";
+                                               #phy-cells = <0>;
+                                       };
+
+                                       pcie-1 {
+                                               status = "disabled";
+                                               #phy-cells = <0>;
+                                       };
+
+                                       pcie-2 {
+                                               status = "disabled";
+                                               #phy-cells = <0>;
+                                       };
+
+                                       pcie-3 {
+                                               status = "disabled";
+                                               #phy-cells = <0>;
+                                       };
+
+                                       pcie-4 {
+                                               status = "disabled";
+                                               #phy-cells = <0>;
+                                       };
+                               };
+                       };
+
+                       sata {
+                               status = "disabled";
+
+                               lanes {
+                                       sata-0 {
+                                               status = "disabled";
+                                               #phy-cells = <0>;
+                                       };
+                               };
+                       };
+               };
+
+               ports {
+                       usb2-0 {
+                               status = "disabled";
+                       };
+
+                       usb2-1 {
+                               status = "disabled";
+                       };
+
+                       usb2-2 {
+                               status = "disabled";
+                       };
+
+                       ulpi-0 {
+                               status = "disabled";
+                       };
+
+                       hsic-0 {
+                               status = "disabled";
+                       };
+
+                       hsic-1 {
+                               status = "disabled";
+                       };
+
+                       usb3-0 {
+                               status = "disabled";
+                       };
+
+                       usb3-1 {
+                               status = "disabled";
+                       };
+               };
        };
 
        sdhci@0,700b0000 {
index 4d8b7f6..a8a8e43 100644 (file)
                clock-frequency = <16000000>;
        };
 
+       panel: panel {
+               compatible = "edt,et057090dhu";
+               backlight = <&bl>;
+       };
+
        reg_3v3: regulator-3v3 {
                compatible = "regulator-fixed";
                regulator-name = "3.3V";
        status  = "okay";
 };
 
+&dcu0 {
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_dcu0_1>;
+       fsl,panel = <&panel>;
+       status = "okay";
+};
+
 &dspi1 {
        status = "okay";
 
        vin-supply = <&reg_3v3>;
 };
 
+&tcon0 {
+       status = "okay";
+};
+
 &uart0 {
        status = "okay";
 };
index 226a86f..b741709 100644 (file)
                        >;
                };
 
+               pinctrl_dcu0_1: dcu0grp_1 {
+                       fsl,pins = <
+                               VF610_PAD_PTE0__DCU0_HSYNC      0x1902
+                               VF610_PAD_PTE1__DCU0_VSYNC      0x1902
+                               VF610_PAD_PTE2__DCU0_PCLK       0x1902
+                               VF610_PAD_PTE4__DCU0_DE         0x1902
+                               VF610_PAD_PTE5__DCU0_R0         0x1902
+                               VF610_PAD_PTE6__DCU0_R1         0x1902
+                               VF610_PAD_PTE7__DCU0_R2         0x1902
+                               VF610_PAD_PTE8__DCU0_R3         0x1902
+                               VF610_PAD_PTE9__DCU0_R4         0x1902
+                               VF610_PAD_PTE10__DCU0_R5        0x1902
+                               VF610_PAD_PTE11__DCU0_R6        0x1902
+                               VF610_PAD_PTE12__DCU0_R7        0x1902
+                               VF610_PAD_PTE13__DCU0_G0        0x1902
+                               VF610_PAD_PTE14__DCU0_G1        0x1902
+                               VF610_PAD_PTE15__DCU0_G2        0x1902
+                               VF610_PAD_PTE16__DCU0_G3        0x1902
+                               VF610_PAD_PTE17__DCU0_G4        0x1902
+                               VF610_PAD_PTE18__DCU0_G5        0x1902
+                               VF610_PAD_PTE19__DCU0_G6        0x1902
+                               VF610_PAD_PTE20__DCU0_G7        0x1902
+                               VF610_PAD_PTE21__DCU0_B0        0x1902
+                               VF610_PAD_PTE22__DCU0_B1        0x1902
+                               VF610_PAD_PTE23__DCU0_B2        0x1902
+                               VF610_PAD_PTE24__DCU0_B3        0x1902
+                               VF610_PAD_PTE25__DCU0_B4        0x1902
+                               VF610_PAD_PTE26__DCU0_B5        0x1902
+                               VF610_PAD_PTE27__DCU0_B6        0x1902
+                               VF610_PAD_PTE28__DCU0_B7        0x1902
+                       >;
+               };
+
                pinctrl_dspi1: dspi1grp {
                        fsl,pins = <
                                VF610_PAD_PTD5__DSPI1_CS0               0x33e2
index 04ef54d..2c13ec6 100644 (file)
                                                        <20000000>;
                        };
 
+                       tcon0: timing-controller@4003d000 {
+                               compatible = "fsl,vf610-tcon";
+                               reg = <0x4003d000 0x1000>;
+                               clocks = <&clks VF610_CLK_TCON0>;
+                               clock-names = "ipg";
+                               status = "disabled";
+                       };
+
                        wdoga5: wdog@4003e000 {
                                compatible = "fsl,vf610-wdt", "fsl,imx21-wdt";
                                reg = <0x4003e000 0x1000>;
                                status = "disabled";
                        };
 
+                       dcu0: dcu@40058000 {
+                               compatible = "fsl,vf610-dcu";
+                               reg = <0x40058000 0x1200>;
+                               interrupts = <30 IRQ_TYPE_LEVEL_HIGH>;
+                               clocks = <&clks VF610_CLK_DCU0>,
+                                       <&clks VF610_CLK_DCU0_DIV>;
+                               clock-names = "dcu", "pix";
+                               fsl,tcon = <&tcon0>;
+                               status = "disabled";
+                       };
+
                        i2c0: i2c@40066000 {
                                #address-cells = <1>;
                                #size-cells = <0>;
index 0df6b1f..96387d4 100644 (file)
@@ -41,6 +41,8 @@
 
 #define KVM_MAX_VCPUS VGIC_V2_MAX_CPUS
 
+#define KVM_REQ_VCPU_EXIT      8
+
 u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode);
 int __attribute_const__ kvm_target_cpu(void);
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
@@ -226,6 +228,10 @@ static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
 
 struct kvm_vcpu *kvm_arm_get_running_vcpu(void);
 struct kvm_vcpu __percpu **kvm_get_running_vcpus(void);
+void kvm_arm_halt_guest(struct kvm *kvm);
+void kvm_arm_resume_guest(struct kvm *kvm);
+void kvm_arm_halt_vcpu(struct kvm_vcpu *vcpu);
+void kvm_arm_resume_vcpu(struct kvm_vcpu *vcpu);
 
 int kvm_arm_copy_coproc_indices(struct kvm_vcpu *vcpu, u64 __user *uindices);
 unsigned long kvm_arm_num_coproc_regs(struct kvm_vcpu *vcpu);
index d8e90c8..f3a7de7 100644 (file)
@@ -28,6 +28,9 @@ struct kvm_decode {
        bool sign_extend;
 };
 
+void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data);
+unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len);
+
 int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run);
 int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
                 phys_addr_t fault_ipa);
index 27563be..22bf1f6 100644 (file)
@@ -31,7 +31,7 @@ struct frame_tail {
  */
 static struct frame_tail __user *
 user_backtrace(struct frame_tail __user *tail,
-              struct perf_callchain_entry *entry)
+              struct perf_callchain_entry_ctx *entry)
 {
        struct frame_tail buftail;
        unsigned long err;
@@ -59,7 +59,7 @@ user_backtrace(struct frame_tail __user *tail,
 }
 
 void
-perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
        struct frame_tail __user *tail;
 
@@ -75,7 +75,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 
        tail = (struct frame_tail __user *)regs->ARM_fp - 1;
 
-       while ((entry->nr < sysctl_perf_event_max_stack) &&
+       while ((entry->nr < entry->max_stack) &&
               tail && !((unsigned long)tail & 0x3))
                tail = user_backtrace(tail, entry);
 }
@@ -89,13 +89,13 @@ static int
 callchain_trace(struct stackframe *fr,
                void *data)
 {
-       struct perf_callchain_entry *entry = data;
+       struct perf_callchain_entry_ctx *entry = data;
        perf_callchain_store(entry, fr->pc);
        return 0;
 }
 
 void
-perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
        struct stackframe fr;
 
index ef9119f..4d93758 100644 (file)
@@ -733,8 +733,8 @@ static int vfp_set(struct task_struct *target,
        if (ret)
                return ret;
 
-       vfp_flush_hwstate(thread);
        thread->vfpstate.hard = new_vfp;
+       vfp_flush_hwstate(thread);
 
        return 0;
 }
index 95a0005..02abfff 100644 (file)
@@ -46,6 +46,13 @@ config KVM_ARM_HOST
        ---help---
          Provides host support for ARM processors.
 
+config KVM_NEW_VGIC
+       bool "New VGIC implementation"
+       depends on KVM
+       default y
+       ---help---
+         Uses the new VGIC implementation.
+
 source drivers/vhost/Kconfig
 
 endif # VIRTUALIZATION
index eb1bf43..a596b58 100644 (file)
@@ -21,7 +21,18 @@ obj-$(CONFIG_KVM_ARM_HOST) += hyp/
 obj-y += kvm-arm.o init.o interrupts.o
 obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o
 obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o
+
+ifeq ($(CONFIG_KVM_NEW_VGIC),y)
+obj-y += $(KVM)/arm/vgic/vgic.o
+obj-y += $(KVM)/arm/vgic/vgic-init.o
+obj-y += $(KVM)/arm/vgic/vgic-irqfd.o
+obj-y += $(KVM)/arm/vgic/vgic-v2.o
+obj-y += $(KVM)/arm/vgic/vgic-mmio.o
+obj-y += $(KVM)/arm/vgic/vgic-mmio-v2.o
+obj-y += $(KVM)/arm/vgic/vgic-kvm-device.o
+else
 obj-y += $(KVM)/arm/vgic.o
 obj-y += $(KVM)/arm/vgic-v2.o
 obj-y += $(KVM)/arm/vgic-v2-emul.o
+endif
 obj-y += $(KVM)/arm/arch_timer.o
index 237d5d8..893941e 100644 (file)
@@ -455,7 +455,7 @@ static void update_vttbr(struct kvm *kvm)
 static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
 {
        struct kvm *kvm = vcpu->kvm;
-       int ret;
+       int ret = 0;
 
        if (likely(vcpu->arch.has_run_once))
                return 0;
@@ -478,9 +478,9 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
         * interrupts from the virtual timer with a userspace gic.
         */
        if (irqchip_in_kernel(kvm) && vgic_initialized(kvm))
-               kvm_timer_enable(kvm);
+               ret = kvm_timer_enable(vcpu);
 
-       return 0;
+       return ret;
 }
 
 bool kvm_arch_intc_initialized(struct kvm *kvm)
@@ -488,30 +488,37 @@ bool kvm_arch_intc_initialized(struct kvm *kvm)
        return vgic_initialized(kvm);
 }
 
-static void kvm_arm_halt_guest(struct kvm *kvm) __maybe_unused;
-static void kvm_arm_resume_guest(struct kvm *kvm) __maybe_unused;
-
-static void kvm_arm_halt_guest(struct kvm *kvm)
+void kvm_arm_halt_guest(struct kvm *kvm)
 {
        int i;
        struct kvm_vcpu *vcpu;
 
        kvm_for_each_vcpu(i, vcpu, kvm)
                vcpu->arch.pause = true;
-       force_vm_exit(cpu_all_mask);
+       kvm_make_all_cpus_request(kvm, KVM_REQ_VCPU_EXIT);
+}
+
+void kvm_arm_halt_vcpu(struct kvm_vcpu *vcpu)
+{
+       vcpu->arch.pause = true;
+       kvm_vcpu_kick(vcpu);
 }
 
-static void kvm_arm_resume_guest(struct kvm *kvm)
+void kvm_arm_resume_vcpu(struct kvm_vcpu *vcpu)
+{
+       struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
+
+       vcpu->arch.pause = false;
+       swake_up(wq);
+}
+
+void kvm_arm_resume_guest(struct kvm *kvm)
 {
        int i;
        struct kvm_vcpu *vcpu;
 
-       kvm_for_each_vcpu(i, vcpu, kvm) {
-               struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
-
-               vcpu->arch.pause = false;
-               swake_up(wq);
-       }
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               kvm_arm_resume_vcpu(vcpu);
 }
 
 static void vcpu_sleep(struct kvm_vcpu *vcpu)
index 0f6600f..10f80a6 100644 (file)
@@ -23,7 +23,7 @@
 
 #include "trace.h"
 
-static void mmio_write_buf(char *buf, unsigned int len, unsigned long data)
+void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data)
 {
        void *datap = NULL;
        union {
@@ -55,7 +55,7 @@ static void mmio_write_buf(char *buf, unsigned int len, unsigned long data)
        memcpy(buf, datap, len);
 }
 
-static unsigned long mmio_read_buf(char *buf, unsigned int len)
+unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len)
 {
        unsigned long data = 0;
        union {
@@ -66,7 +66,7 @@ static unsigned long mmio_read_buf(char *buf, unsigned int len)
 
        switch (len) {
        case 1:
-               data = buf[0];
+               data = *(u8 *)buf;
                break;
        case 2:
                memcpy(&tmp.hword, buf, len);
@@ -87,11 +87,10 @@ static unsigned long mmio_read_buf(char *buf, unsigned int len)
 
 /**
  * kvm_handle_mmio_return -- Handle MMIO loads after user space emulation
+ *                          or in-kernel IO emulation
+ *
  * @vcpu: The VCPU pointer
  * @run:  The VCPU run struct containing the mmio data
- *
- * This should only be called after returning from userspace for MMIO load
- * emulation.
  */
 int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
@@ -104,7 +103,7 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run)
                if (len > sizeof(unsigned long))
                        return -EINVAL;
 
-               data = mmio_read_buf(run->mmio.data, len);
+               data = kvm_mmio_read_buf(run->mmio.data, len);
 
                if (vcpu->arch.mmio_decode.sign_extend &&
                    len < sizeof(unsigned long)) {
@@ -190,7 +189,7 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
                                               len);
 
                trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, data);
-               mmio_write_buf(data_buf, len, data);
+               kvm_mmio_write_buf(data_buf, len, data);
 
                ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, fault_ipa, len,
                                       data_buf);
@@ -206,18 +205,19 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
        run->mmio.is_write      = is_write;
        run->mmio.phys_addr     = fault_ipa;
        run->mmio.len           = len;
-       if (is_write)
-               memcpy(run->mmio.data, data_buf, len);
 
        if (!ret) {
                /* We handled the access successfully in the kernel. */
+               if (!is_write)
+                       memcpy(run->mmio.data, data_buf, len);
                vcpu->stat.mmio_exit_kernel++;
                kvm_handle_mmio_return(vcpu, run);
                return 1;
-       } else {
-               vcpu->stat.mmio_exit_user++;
        }
 
+       if (is_write)
+               memcpy(run->mmio.data, data_buf, len);
+       vcpu->stat.mmio_exit_user++;
        run->exit_reason        = KVM_EXIT_MMIO;
        return 0;
 }
index c70709a..79b6b07 100644 (file)
@@ -2,6 +2,6 @@
 # Makefile for the linux kernel.
 #
 
-obj-y  := irq.o common.o serial.o
+obj-y  := common.o serial.o
 obj-y  += pm.o suspend.o
 obj-y  += phy3250.o
index 9e3b90d..0019053 100644 (file)
 #define IRQ_LPC32XX_GPI_06             LPC32XX_SIC2_IRQ(28)
 #define IRQ_LPC32XX_SYSCLK             LPC32XX_SIC2_IRQ(31)
 
-#define NR_IRQS                                96
+#define LPC32XX_NR_IRQS                        96
 
 #endif
diff --git a/arch/arm/mach-lpc32xx/irq.c b/arch/arm/mach-lpc32xx/irq.c
deleted file mode 100644 (file)
index 2ae431e..0000000
+++ /dev/null
@@ -1,477 +0,0 @@
-/*
- * arch/arm/mach-lpc32xx/irq.c
- *
- * Author: Kevin Wells <kevin.wells@nxp.com>
- *
- * Copyright (C) 2010 NXP Semiconductors
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/err.h>
-#include <linux/io.h>
-#include <linux/of.h>
-#include <linux/of_address.h>
-#include <linux/of_irq.h>
-#include <linux/irqdomain.h>
-#include <linux/module.h>
-
-#include <mach/irqs.h>
-#include <mach/hardware.h>
-#include <mach/platform.h>
-#include "common.h"
-
-/*
- * Default value representing the Activation polarity of all internal
- * interrupt sources
- */
-#define MIC_APR_DEFAULT                0x3FF0EFE0
-#define SIC1_APR_DEFAULT       0xFBD27186
-#define SIC2_APR_DEFAULT       0x801810C0
-
-/*
- * Default value representing the Activation Type of all internal
- * interrupt sources. All are level sensitive.
- */
-#define MIC_ATR_DEFAULT                0x00000000
-#define SIC1_ATR_DEFAULT       0x00026000
-#define SIC2_ATR_DEFAULT       0x00000000
-
-static struct irq_domain *lpc32xx_mic_domain;
-static struct device_node *lpc32xx_mic_np;
-
-struct lpc32xx_event_group_regs {
-       void __iomem *enab_reg;
-       void __iomem *edge_reg;
-       void __iomem *maskstat_reg;
-       void __iomem *rawstat_reg;
-};
-
-static const struct lpc32xx_event_group_regs lpc32xx_event_int_regs = {
-       .enab_reg = LPC32XX_CLKPWR_INT_ER,
-       .edge_reg = LPC32XX_CLKPWR_INT_AP,
-       .maskstat_reg = LPC32XX_CLKPWR_INT_SR,
-       .rawstat_reg = LPC32XX_CLKPWR_INT_RS,
-};
-
-static const struct lpc32xx_event_group_regs lpc32xx_event_pin_regs = {
-       .enab_reg = LPC32XX_CLKPWR_PIN_ER,
-       .edge_reg = LPC32XX_CLKPWR_PIN_AP,
-       .maskstat_reg = LPC32XX_CLKPWR_PIN_SR,
-       .rawstat_reg = LPC32XX_CLKPWR_PIN_RS,
-};
-
-struct lpc32xx_event_info {
-       const struct lpc32xx_event_group_regs *event_group;
-       u32 mask;
-};
-
-/*
- * Maps an IRQ number to and event mask and register
- */
-static const struct lpc32xx_event_info lpc32xx_events[NR_IRQS] = {
-       [IRQ_LPC32XX_GPI_08] = {
-               .event_group = &lpc32xx_event_pin_regs,
-               .mask = LPC32XX_CLKPWR_EXTSRC_GPI_08_BIT,
-       },
-       [IRQ_LPC32XX_GPI_09] = {
-               .event_group = &lpc32xx_event_pin_regs,
-               .mask = LPC32XX_CLKPWR_EXTSRC_GPI_09_BIT,
-       },
-       [IRQ_LPC32XX_GPI_19] = {
-               .event_group = &lpc32xx_event_pin_regs,
-               .mask = LPC32XX_CLKPWR_EXTSRC_GPI_19_BIT,
-       },
-       [IRQ_LPC32XX_GPI_07] = {
-               .event_group = &lpc32xx_event_pin_regs,
-               .mask = LPC32XX_CLKPWR_EXTSRC_GPI_07_BIT,
-       },
-       [IRQ_LPC32XX_GPI_00] = {
-               .event_group = &lpc32xx_event_pin_regs,
-               .mask = LPC32XX_CLKPWR_EXTSRC_GPI_00_BIT,
-       },
-       [IRQ_LPC32XX_GPI_01] = {
-               .event_group = &lpc32xx_event_pin_regs,
-               .mask = LPC32XX_CLKPWR_EXTSRC_GPI_01_BIT,
-       },
-       [IRQ_LPC32XX_GPI_02] = {
-               .event_group = &lpc32xx_event_pin_regs,
-               .mask = LPC32XX_CLKPWR_EXTSRC_GPI_02_BIT,
-       },
-       [IRQ_LPC32XX_GPI_03] = {
-               .event_group = &lpc32xx_event_pin_regs,
-               .mask = LPC32XX_CLKPWR_EXTSRC_GPI_03_BIT,
-       },
-       [IRQ_LPC32XX_GPI_04] = {
-               .event_group = &lpc32xx_event_pin_regs,
-               .mask = LPC32XX_CLKPWR_EXTSRC_GPI_04_BIT,
-       },
-       [IRQ_LPC32XX_GPI_05] = {
-               .event_group = &lpc32xx_event_pin_regs,
-               .mask = LPC32XX_CLKPWR_EXTSRC_GPI_05_BIT,
-       },
-       [IRQ_LPC32XX_GPI_06] = {
-               .event_group = &lpc32xx_event_pin_regs,
-               .mask = LPC32XX_CLKPWR_EXTSRC_GPI_06_BIT,
-       },
-       [IRQ_LPC32XX_GPI_28] = {
-               .event_group = &lpc32xx_event_pin_regs,
-               .mask = LPC32XX_CLKPWR_EXTSRC_GPI_28_BIT,
-       },
-       [IRQ_LPC32XX_GPIO_00] = {
-               .event_group = &lpc32xx_event_int_regs,
-               .mask = LPC32XX_CLKPWR_INTSRC_GPIO_00_BIT,
-       },
-       [IRQ_LPC32XX_GPIO_01] = {
-               .event_group = &lpc32xx_event_int_regs,
-               .mask = LPC32XX_CLKPWR_INTSRC_GPIO_01_BIT,
-       },
-       [IRQ_LPC32XX_GPIO_02] = {
-               .event_group = &lpc32xx_event_int_regs,
-               .mask = LPC32XX_CLKPWR_INTSRC_GPIO_02_BIT,
-       },
-       [IRQ_LPC32XX_GPIO_03] = {
-               .event_group = &lpc32xx_event_int_regs,
-               .mask = LPC32XX_CLKPWR_INTSRC_GPIO_03_BIT,
-       },
-       [IRQ_LPC32XX_GPIO_04] = {
-               .event_group = &lpc32xx_event_int_regs,
-               .mask = LPC32XX_CLKPWR_INTSRC_GPIO_04_BIT,
-       },
-       [IRQ_LPC32XX_GPIO_05] = {
-               .event_group = &lpc32xx_event_int_regs,
-               .mask = LPC32XX_CLKPWR_INTSRC_GPIO_05_BIT,
-       },
-       [IRQ_LPC32XX_KEY] = {
-               .event_group = &lpc32xx_event_int_regs,
-               .mask = LPC32XX_CLKPWR_INTSRC_KEY_BIT,
-       },
-       [IRQ_LPC32XX_ETHERNET] = {
-               .event_group = &lpc32xx_event_int_regs,
-               .mask = LPC32XX_CLKPWR_INTSRC_MAC_BIT,
-       },
-       [IRQ_LPC32XX_USB_OTG_ATX] = {
-               .event_group = &lpc32xx_event_int_regs,
-               .mask = LPC32XX_CLKPWR_INTSRC_USBATXINT_BIT,
-       },
-       [IRQ_LPC32XX_USB_HOST] = {
-               .event_group = &lpc32xx_event_int_regs,
-               .mask = LPC32XX_CLKPWR_INTSRC_USB_BIT,
-       },
-       [IRQ_LPC32XX_RTC] = {
-               .event_group = &lpc32xx_event_int_regs,
-               .mask = LPC32XX_CLKPWR_INTSRC_RTC_BIT,
-       },
-       [IRQ_LPC32XX_MSTIMER] = {
-               .event_group = &lpc32xx_event_int_regs,
-               .mask = LPC32XX_CLKPWR_INTSRC_MSTIMER_BIT,
-       },
-       [IRQ_LPC32XX_TS_AUX] = {
-               .event_group = &lpc32xx_event_int_regs,
-               .mask = LPC32XX_CLKPWR_INTSRC_TS_AUX_BIT,
-       },
-       [IRQ_LPC32XX_TS_P] = {
-               .event_group = &lpc32xx_event_int_regs,
-               .mask = LPC32XX_CLKPWR_INTSRC_TS_P_BIT,
-       },
-       [IRQ_LPC32XX_TS_IRQ] = {
-               .event_group = &lpc32xx_event_int_regs,
-               .mask = LPC32XX_CLKPWR_INTSRC_ADC_BIT,
-       },
-};
-
-static void get_controller(unsigned int irq, unsigned int *base,
-       unsigned int *irqbit)
-{
-       if (irq < 32) {
-               *base = LPC32XX_MIC_BASE;
-               *irqbit = 1 << irq;
-       } else if (irq < 64) {
-               *base = LPC32XX_SIC1_BASE;
-               *irqbit = 1 << (irq - 32);
-       } else {
-               *base = LPC32XX_SIC2_BASE;
-               *irqbit = 1 << (irq - 64);
-       }
-}
-
-static void lpc32xx_mask_irq(struct irq_data *d)
-{
-       unsigned int reg, ctrl, mask;
-
-       get_controller(d->hwirq, &ctrl, &mask);
-
-       reg = __raw_readl(LPC32XX_INTC_MASK(ctrl)) & ~mask;
-       __raw_writel(reg, LPC32XX_INTC_MASK(ctrl));
-}
-
-static void lpc32xx_unmask_irq(struct irq_data *d)
-{
-       unsigned int reg, ctrl, mask;
-
-       get_controller(d->hwirq, &ctrl, &mask);
-
-       reg = __raw_readl(LPC32XX_INTC_MASK(ctrl)) | mask;
-       __raw_writel(reg, LPC32XX_INTC_MASK(ctrl));
-}
-
-static void lpc32xx_ack_irq(struct irq_data *d)
-{
-       unsigned int ctrl, mask;
-
-       get_controller(d->hwirq, &ctrl, &mask);
-
-       __raw_writel(mask, LPC32XX_INTC_RAW_STAT(ctrl));
-
-       /* Also need to clear pending wake event */
-       if (lpc32xx_events[d->hwirq].mask != 0)
-               __raw_writel(lpc32xx_events[d->hwirq].mask,
-                       lpc32xx_events[d->hwirq].event_group->rawstat_reg);
-}
-
-static void __lpc32xx_set_irq_type(unsigned int irq, int use_high_level,
-       int use_edge)
-{
-       unsigned int reg, ctrl, mask;
-
-       get_controller(irq, &ctrl, &mask);
-
-       /* Activation level, high or low */
-       reg = __raw_readl(LPC32XX_INTC_POLAR(ctrl));
-       if (use_high_level)
-               reg |= mask;
-       else
-               reg &= ~mask;
-       __raw_writel(reg, LPC32XX_INTC_POLAR(ctrl));
-
-       /* Activation type, edge or level */
-       reg = __raw_readl(LPC32XX_INTC_ACT_TYPE(ctrl));
-       if (use_edge)
-               reg |= mask;
-       else
-               reg &= ~mask;
-       __raw_writel(reg, LPC32XX_INTC_ACT_TYPE(ctrl));
-
-       /* Use same polarity for the wake events */
-       if (lpc32xx_events[irq].mask != 0) {
-               reg = __raw_readl(lpc32xx_events[irq].event_group->edge_reg);
-
-               if (use_high_level)
-                       reg |= lpc32xx_events[irq].mask;
-               else
-                       reg &= ~lpc32xx_events[irq].mask;
-
-               __raw_writel(reg, lpc32xx_events[irq].event_group->edge_reg);
-       }
-}
-
-static int lpc32xx_set_irq_type(struct irq_data *d, unsigned int type)
-{
-       switch (type) {
-       case IRQ_TYPE_EDGE_RISING:
-               /* Rising edge sensitive */
-               __lpc32xx_set_irq_type(d->hwirq, 1, 1);
-               irq_set_handler_locked(d, handle_edge_irq);
-               break;
-
-       case IRQ_TYPE_EDGE_FALLING:
-               /* Falling edge sensitive */
-               __lpc32xx_set_irq_type(d->hwirq, 0, 1);
-               irq_set_handler_locked(d, handle_edge_irq);
-               break;
-
-       case IRQ_TYPE_LEVEL_LOW:
-               /* Low level sensitive */
-               __lpc32xx_set_irq_type(d->hwirq, 0, 0);
-               irq_set_handler_locked(d, handle_level_irq);
-               break;
-
-       case IRQ_TYPE_LEVEL_HIGH:
-               /* High level sensitive */
-               __lpc32xx_set_irq_type(d->hwirq, 1, 0);
-               irq_set_handler_locked(d, handle_level_irq);
-               break;
-
-       /* Other modes are not supported */
-       default:
-               return -EINVAL;
-       }
-
-       return 0;
-}
-
-static int lpc32xx_irq_wake(struct irq_data *d, unsigned int state)
-{
-       unsigned long eventreg;
-
-       if (lpc32xx_events[d->hwirq].mask != 0) {
-               eventreg = __raw_readl(lpc32xx_events[d->hwirq].
-                       event_group->enab_reg);
-
-               if (state)
-                       eventreg |= lpc32xx_events[d->hwirq].mask;
-               else {
-                       eventreg &= ~lpc32xx_events[d->hwirq].mask;
-
-                       /*
-                        * When disabling the wakeup, clear the latched
-                        * event
-                        */
-                       __raw_writel(lpc32xx_events[d->hwirq].mask,
-                               lpc32xx_events[d->hwirq].
-                               event_group->rawstat_reg);
-               }
-
-               __raw_writel(eventreg,
-                       lpc32xx_events[d->hwirq].event_group->enab_reg);
-
-               return 0;
-       }
-
-       /* Clear event */
-       __raw_writel(lpc32xx_events[d->hwirq].mask,
-               lpc32xx_events[d->hwirq].event_group->rawstat_reg);
-
-       return -ENODEV;
-}
-
-static void __init lpc32xx_set_default_mappings(unsigned int apr,
-       unsigned int atr, unsigned int offset)
-{
-       unsigned int i;
-
-       /* Set activation levels for each interrupt */
-       i = 0;
-       while (i < 32) {
-               __lpc32xx_set_irq_type(offset + i, ((apr >> i) & 0x1),
-                       ((atr >> i) & 0x1));
-               i++;
-       }
-}
-
-static struct irq_chip lpc32xx_irq_chip = {
-       .name = "MIC",
-       .irq_ack = lpc32xx_ack_irq,
-       .irq_mask = lpc32xx_mask_irq,
-       .irq_unmask = lpc32xx_unmask_irq,
-       .irq_set_type = lpc32xx_set_irq_type,
-       .irq_set_wake = lpc32xx_irq_wake
-};
-
-static void lpc32xx_sic1_handler(struct irq_desc *desc)
-{
-       unsigned long ints = __raw_readl(LPC32XX_INTC_STAT(LPC32XX_SIC1_BASE));
-
-       while (ints != 0) {
-               int irqno = fls(ints) - 1;
-
-               ints &= ~(1 << irqno);
-
-               generic_handle_irq(LPC32XX_SIC1_IRQ(irqno));
-       }
-}
-
-static void lpc32xx_sic2_handler(struct irq_desc *desc)
-{
-       unsigned long ints = __raw_readl(LPC32XX_INTC_STAT(LPC32XX_SIC2_BASE));
-
-       while (ints != 0) {
-               int irqno = fls(ints) - 1;
-
-               ints &= ~(1 << irqno);
-
-               generic_handle_irq(LPC32XX_SIC2_IRQ(irqno));
-       }
-}
-
-static int __init __lpc32xx_mic_of_init(struct device_node *node,
-                                       struct device_node *parent)
-{
-       lpc32xx_mic_np = node;
-
-       return 0;
-}
-
-static const struct of_device_id mic_of_match[] __initconst = {
-       { .compatible = "nxp,lpc3220-mic", .data = __lpc32xx_mic_of_init },
-       { }
-};
-
-void __init lpc32xx_init_irq(void)
-{
-       unsigned int i;
-
-       /* Setup MIC */
-       __raw_writel(0, LPC32XX_INTC_MASK(LPC32XX_MIC_BASE));
-       __raw_writel(MIC_APR_DEFAULT, LPC32XX_INTC_POLAR(LPC32XX_MIC_BASE));
-       __raw_writel(MIC_ATR_DEFAULT, LPC32XX_INTC_ACT_TYPE(LPC32XX_MIC_BASE));
-
-       /* Setup SIC1 */
-       __raw_writel(0, LPC32XX_INTC_MASK(LPC32XX_SIC1_BASE));
-       __raw_writel(SIC1_APR_DEFAULT, LPC32XX_INTC_POLAR(LPC32XX_SIC1_BASE));
-       __raw_writel(SIC1_ATR_DEFAULT,
-                               LPC32XX_INTC_ACT_TYPE(LPC32XX_SIC1_BASE));
-
-       /* Setup SIC2 */
-       __raw_writel(0, LPC32XX_INTC_MASK(LPC32XX_SIC2_BASE));
-       __raw_writel(SIC2_APR_DEFAULT, LPC32XX_INTC_POLAR(LPC32XX_SIC2_BASE));
-       __raw_writel(SIC2_ATR_DEFAULT,
-                               LPC32XX_INTC_ACT_TYPE(LPC32XX_SIC2_BASE));
-
-       /* Configure supported IRQ's */
-       for (i = 0; i < NR_IRQS; i++) {
-               irq_set_chip_and_handler(i, &lpc32xx_irq_chip,
-                                        handle_level_irq);
-               irq_clear_status_flags(i, IRQ_NOREQUEST);
-       }
-
-       /* Set default mappings */
-       lpc32xx_set_default_mappings(MIC_APR_DEFAULT, MIC_ATR_DEFAULT, 0);
-       lpc32xx_set_default_mappings(SIC1_APR_DEFAULT, SIC1_ATR_DEFAULT, 32);
-       lpc32xx_set_default_mappings(SIC2_APR_DEFAULT, SIC2_ATR_DEFAULT, 64);
-
-       /* Initially disable all wake events */
-       __raw_writel(0, LPC32XX_CLKPWR_P01_ER);
-       __raw_writel(0, LPC32XX_CLKPWR_INT_ER);
-       __raw_writel(0, LPC32XX_CLKPWR_PIN_ER);
-
-       /*
-        * Default wake activation polarities, all pin sources are low edge
-        * triggered
-        */
-       __raw_writel(LPC32XX_CLKPWR_INTSRC_TS_P_BIT |
-               LPC32XX_CLKPWR_INTSRC_MSTIMER_BIT |
-               LPC32XX_CLKPWR_INTSRC_RTC_BIT,
-               LPC32XX_CLKPWR_INT_AP);
-       __raw_writel(0, LPC32XX_CLKPWR_PIN_AP);
-
-       /* Clear latched wake event states */
-       __raw_writel(__raw_readl(LPC32XX_CLKPWR_PIN_RS),
-               LPC32XX_CLKPWR_PIN_RS);
-       __raw_writel(__raw_readl(LPC32XX_CLKPWR_INT_RS),
-               LPC32XX_CLKPWR_INT_RS);
-
-       of_irq_init(mic_of_match);
-
-       lpc32xx_mic_domain = irq_domain_add_legacy(lpc32xx_mic_np, NR_IRQS,
-                                                  0, 0, &irq_domain_simple_ops,
-                                                  NULL);
-       if (!lpc32xx_mic_domain)
-               panic("Unable to add MIC irq domain\n");
-
-       /* MIC SUBIRQx interrupts will route handling to the chain handlers */
-       irq_set_chained_handler(IRQ_LPC32XX_SUB1IRQ, lpc32xx_sic1_handler);
-       irq_set_chained_handler(IRQ_LPC32XX_SUB2IRQ, lpc32xx_sic2_handler);
-}
index 72918c4..f6ac027 100644 (file)
@@ -97,10 +97,7 @@ int gpmc_nand_init(struct omap_nand_platform_data *gpmc_nand_data,
        gpmc_nand_res[2].start = gpmc_get_client_irq(GPMC_IRQ_COUNT_EVENT);
 
        memset(&s, 0, sizeof(struct gpmc_settings));
-       if (gpmc_nand_data->of_node)
-               gpmc_read_settings_dt(gpmc_nand_data->of_node, &s);
-       else
-               gpmc_set_legacy(gpmc_nand_data, &s);
+       gpmc_set_legacy(gpmc_nand_data, &s);
 
        s.device_nand = true;
 
@@ -121,8 +118,6 @@ int gpmc_nand_init(struct omap_nand_platform_data *gpmc_nand_data,
        if (err < 0)
                goto out_free_cs;
 
-       gpmc_update_nand_reg(&gpmc_nand_data->reg, gpmc_nand_data->cs);
-
        if (!gpmc_hwecc_bch_capable(gpmc_nand_data->ecc_opt)) {
                pr_err("omap2-nand: Unsupported NAND ECC scheme selected\n");
                err = -EINVAL;
index 7ee4652..cd894d6 100644 (file)
@@ -6,6 +6,7 @@ comment "Intel/Marvell Dev Platforms (sorted by hardware release time)"
 
 config MACH_PXA27X_DT
        bool "Support PXA27x platforms from device tree"
+       select PINCTRL
        select POWER_SUPPLY
        select PXA27x
        select USE_OF
@@ -17,6 +18,7 @@ config MACH_PXA27X_DT
 config MACH_PXA3XX_DT
        bool "Support PXA3xx platforms from device tree"
        select CPU_PXA300
+       select PINCTRL
        select POWER_SUPPLY
        select PXA3xx
        select USE_OF
index e838b11..fa9d71d 100644 (file)
@@ -128,7 +128,7 @@ struct resource eseries_tmio_resources[] = {
 /* Some e-series hardware cannot control the 32K clock */
 static void __init __maybe_unused eseries_register_clks(void)
 {
-       clk_register_fixed_rate(NULL, "CLK_CK32K", NULL, CLK_IS_ROOT, 32768);
+       clk_register_fixed_rate(NULL, "CLK_CK32K", NULL, 0, 32768);
 }
 
 #ifdef CONFIG_MACH_E330
index d9578bc..bd7cd8b 100644 (file)
@@ -763,14 +763,49 @@ static struct nand_bbt_descr spitz_nand_bbt = {
        .pattern        = scan_ff_pattern
 };
 
-static struct nand_ecclayout akita_oobinfo = {
-       .oobfree        = { {0x08, 0x09} },
-       .eccbytes       = 24,
-       .eccpos         = {
-                       0x05, 0x01, 0x02, 0x03, 0x06, 0x07, 0x15, 0x11,
-                       0x12, 0x13, 0x16, 0x17, 0x25, 0x21, 0x22, 0x23,
-                       0x26, 0x27, 0x35, 0x31, 0x32, 0x33, 0x36, 0x37,
-       },
+static int akita_ooblayout_ecc(struct mtd_info *mtd, int section,
+                              struct mtd_oob_region *oobregion)
+{
+       if (section > 12)
+               return -ERANGE;
+
+       switch (section % 3) {
+       case 0:
+               oobregion->offset = 5;
+               oobregion->length = 1;
+               break;
+
+       case 1:
+               oobregion->offset = 1;
+               oobregion->length = 3;
+               break;
+
+       case 2:
+               oobregion->offset = 6;
+               oobregion->length = 2;
+               break;
+       }
+
+       oobregion->offset += (section / 3) * 0x10;
+
+       return 0;
+}
+
+static int akita_ooblayout_free(struct mtd_info *mtd, int section,
+                               struct mtd_oob_region *oobregion)
+{
+       if (section)
+               return -ERANGE;
+
+       oobregion->offset = 8;
+       oobregion->length = 9;
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops akita_ooblayout_ops = {
+       .ecc = akita_ooblayout_ecc,
+       .free = akita_ooblayout_free,
 };
 
 static struct sharpsl_nand_platform_data spitz_nand_pdata = {
@@ -804,11 +839,11 @@ static void __init spitz_nand_init(void)
        } else if (machine_is_akita()) {
                spitz_nand_partitions[1].size = 58 * 1024 * 1024;
                spitz_nand_bbt.len = 1;
-               spitz_nand_pdata.ecc_layout = &akita_oobinfo;
+               spitz_nand_pdata.ecc_layout = &akita_ooblayout_ops;
        } else if (machine_is_borzoi()) {
                spitz_nand_partitions[1].size = 32 * 1024 * 1024;
                spitz_nand_bbt.len = 1;
-               spitz_nand_pdata.ecc_layout = &akita_oobinfo;
+               spitz_nand_pdata.ecc_layout = &akita_ooblayout_ops;
        }
 
        platform_device_register(&spitz_nand_device);
index 774c982..25a139b 100644 (file)
@@ -496,6 +496,12 @@ static int rx1950_backlight_init(struct device *dev)
                return PTR_ERR(lcd_pwm);
        }
 
+       /*
+        * FIXME: pwm_apply_args() should be removed when switching to
+        * the atomic PWM API.
+        */
+       pwm_apply_args(lcd_pwm);
+
        rx1950_lcd_power(1);
        rx1950_bl_power(1);
 
index 1160434..59a8fa7 100644 (file)
@@ -74,5 +74,5 @@ $(MODLIB)/vdso: FORCE
        @mkdir -p $(MODLIB)/vdso
 
 PHONY += vdso_install
-vdso_install: $(obj)/vdso.so.dbg $(MODLIB)/vdso FORCE
+vdso_install: $(obj)/vdso.so.dbg $(MODLIB)/vdso
        $(call cmd,vdso_install)
index 76747d9..5a0a691 100644 (file)
@@ -113,6 +113,18 @@ config ARCH_PHYS_ADDR_T_64BIT
 config MMU
        def_bool y
 
+config ARM64_PAGE_SHIFT
+       int
+       default 16 if ARM64_64K_PAGES
+       default 14 if ARM64_16K_PAGES
+       default 12
+
+config ARM64_CONT_SHIFT
+       int
+       default 5 if ARM64_64K_PAGES
+       default 7 if ARM64_16K_PAGES
+       default 4
+
 config ARCH_MMAP_RND_BITS_MIN
        default 14 if ARM64_64K_PAGES
        default 16 if ARM64_16K_PAGES
@@ -426,6 +438,15 @@ config CAVIUM_ERRATUM_22375
 
          If unsure, say Y.
 
+config CAVIUM_ERRATUM_23144
+       bool "Cavium erratum 23144: ITS SYNC hang on dual socket system"
+       depends on NUMA
+       default y
+       help
+         The ITS SYNC command can hang for cross-node I/O and collection/CPU mappings.
+
+         If unsure, say Y.
+
 config CAVIUM_ERRATUM_23154
        bool "Cavium erratum 23154: Access to ICC_IAR1_EL1 is not sync'ed"
        default y
index 710fde4..0cc758c 100644 (file)
@@ -12,7 +12,8 @@ config ARM64_PTDUMP
          who are working in architecture specific areas of the kernel.
          It is probably not a good idea to enable this feature in a production
          kernel.
-         If in doubt, say "N"
+
+         If in doubt, say N.
 
 config PID_IN_CONTEXTIDR
        bool "Write the current PID to the CONTEXTIDR register"
@@ -38,15 +39,15 @@ config ARM64_RANDOMIZE_TEXT_OFFSET
          value.
 
 config DEBUG_SET_MODULE_RONX
-        bool "Set loadable kernel module data as NX and text as RO"
-        depends on MODULES
-        help
-          This option helps catch unintended modifications to loadable
-          kernel module's text and read-only data. It also prevents execution
-          of module data. Such protection may interfere with run-time code
-          patching and dynamic kernel tracing - and they might also protect
-          against certain classes of kernel exploits.
-          If in doubt, say "N".
+       bool "Set loadable kernel module data as NX and text as RO"
+       depends on MODULES
+       default y
+       help
+         If this is set, kernel module text and rodata will be made read-only.
+         This is to help catch accidental or malicious attempts to change the
+         kernel's executable code.
+
+         If in doubt, say Y.
 
 config DEBUG_RODATA
        bool "Make kernel text and rodata read-only"
@@ -56,7 +57,7 @@ config DEBUG_RODATA
          is to help catch accidental or malicious attempts to change the
          kernel's executable code.
 
-         If in doubt, say Y
+         If in doubt, say Y.
 
 config DEBUG_ALIGN_RODATA
        depends on DEBUG_RODATA
@@ -69,7 +70,7 @@ config DEBUG_ALIGN_RODATA
          alignment and potentially wasted space. Turn on this option if
          performance is more important than memory pressure.
 
-         If in doubt, say N
+         If in doubt, say N.
 
 source "drivers/hwtracing/coresight/Kconfig"
 
index 354d754..7085e32 100644 (file)
@@ -60,7 +60,9 @@ head-y                := arch/arm64/kernel/head.o
 
 # The byte offset of the kernel image in RAM from the start of RAM.
 ifeq ($(CONFIG_ARM64_RANDOMIZE_TEXT_OFFSET), y)
-TEXT_OFFSET := $(shell awk 'BEGIN {srand(); printf "0x%03x000\n", int(512 * rand())}')
+TEXT_OFFSET := $(shell awk "BEGIN {srand(); printf \"0x%06x\n\", \
+                int(2 * 1024 * 1024 / (2 ^ $(CONFIG_ARM64_PAGE_SHIFT)) * \
+                rand()) * (2 ^ $(CONFIG_ARM64_PAGE_SHIFT))}")
 else
 TEXT_OFFSET := 0x00080000
 endif
index 7cb2d72..3285a92 100644 (file)
@@ -10,6 +10,7 @@
 
 #include <dt-bindings/clock/r8a7795-cpg-mssr.h>
 #include <dt-bindings/interrupt-controller/arm-gic.h>
+#include <dt-bindings/power/r8a7795-sysc.h>
 
 / {
        compatible = "renesas,r8a7795";
@@ -39,6 +40,7 @@
                        compatible = "arm,cortex-a57", "arm,armv8";
                        reg = <0x0>;
                        device_type = "cpu";
+                       power-domains = <&sysc R8A7795_PD_CA57_CPU0>;
                        next-level-cache = <&L2_CA57>;
                        enable-method = "psci";
                };
@@ -47,6 +49,7 @@
                        compatible = "arm,cortex-a57","arm,armv8";
                        reg = <0x1>;
                        device_type = "cpu";
+                       power-domains = <&sysc R8A7795_PD_CA57_CPU1>;
                        next-level-cache = <&L2_CA57>;
                        enable-method = "psci";
                };
@@ -54,6 +57,7 @@
                        compatible = "arm,cortex-a57","arm,armv8";
                        reg = <0x2>;
                        device_type = "cpu";
+                       power-domains = <&sysc R8A7795_PD_CA57_CPU2>;
                        next-level-cache = <&L2_CA57>;
                        enable-method = "psci";
                };
@@ -61,6 +65,7 @@
                        compatible = "arm,cortex-a57","arm,armv8";
                        reg = <0x3>;
                        device_type = "cpu";
+                       power-domains = <&sysc R8A7795_PD_CA57_CPU3>;
                        next-level-cache = <&L2_CA57>;
                        enable-method = "psci";
                };
 
        L2_CA57: cache-controller@0 {
                compatible = "cache";
+               power-domains = <&sysc R8A7795_PD_CA57_SCU>;
                cache-unified;
                cache-level = <2>;
        };
 
        L2_CA53: cache-controller@1 {
                compatible = "cache";
+               power-domains = <&sysc R8A7795_PD_CA53_SCU>;
                cache-unified;
                cache-level = <2>;
        };
                        #interrupt-cells = <2>;
                        interrupt-controller;
                        clocks = <&cpg CPG_MOD 912>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                };
 
                gpio1: gpio@e6051000 {
                        #interrupt-cells = <2>;
                        interrupt-controller;
                        clocks = <&cpg CPG_MOD 911>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                };
 
                gpio2: gpio@e6052000 {
                        #interrupt-cells = <2>;
                        interrupt-controller;
                        clocks = <&cpg CPG_MOD 910>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                };
 
                gpio3: gpio@e6053000 {
                        #interrupt-cells = <2>;
                        interrupt-controller;
                        clocks = <&cpg CPG_MOD 909>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                };
 
                gpio4: gpio@e6054000 {
                        #interrupt-cells = <2>;
                        interrupt-controller;
                        clocks = <&cpg CPG_MOD 908>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                };
 
                gpio5: gpio@e6055000 {
                        #interrupt-cells = <2>;
                        interrupt-controller;
                        clocks = <&cpg CPG_MOD 907>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                };
 
                gpio6: gpio@e6055400 {
                        #interrupt-cells = <2>;
                        interrupt-controller;
                        clocks = <&cpg CPG_MOD 906>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                };
 
                gpio7: gpio@e6055800 {
                        #interrupt-cells = <2>;
                        interrupt-controller;
                        clocks = <&cpg CPG_MOD 905>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                };
 
                pmu_a57 {
                        #power-domain-cells = <0>;
                };
 
+               sysc: system-controller@e6180000 {
+                       compatible = "renesas,r8a7795-sysc";
+                       reg = <0 0xe6180000 0 0x0400>;
+                       #power-domain-cells = <1>;
+               };
+
                audma0: dma-controller@ec700000 {
                        compatible = "renesas,rcar-dmac";
                        reg = <0 0xec700000 0 0x10000>;
                                        "ch12", "ch13", "ch14", "ch15";
                        clocks = <&cpg CPG_MOD 502>;
                        clock-names = "fck";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        #dma-cells = <1>;
                        dma-channels = <16>;
                };
                                        "ch12", "ch13", "ch14", "ch15";
                        clocks = <&cpg CPG_MOD 501>;
                        clock-names = "fck";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        #dma-cells = <1>;
                        dma-channels = <16>;
                };
                                      GIC_SPI 18 IRQ_TYPE_LEVEL_HIGH
                                      GIC_SPI 161 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 407>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                };
 
                dmac0: dma-controller@e6700000 {
                                        "ch12", "ch13", "ch14", "ch15";
                        clocks = <&cpg CPG_MOD 219>;
                        clock-names = "fck";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        #dma-cells = <1>;
                        dma-channels = <16>;
                };
                                        "ch12", "ch13", "ch14", "ch15";
                        clocks = <&cpg CPG_MOD 218>;
                        clock-names = "fck";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        #dma-cells = <1>;
                        dma-channels = <16>;
                };
                                        "ch12", "ch13", "ch14", "ch15";
                        clocks = <&cpg CPG_MOD 217>;
                        clock-names = "fck";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        #dma-cells = <1>;
                        dma-channels = <16>;
                };
                                          "ch20", "ch21", "ch22", "ch23",
                                          "ch24";
                        clocks = <&cpg CPG_MOD 812>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        phy-mode = "rgmii-id";
                        #address-cells = <1>;
                        #size-cells = <0>;
                        clock-names = "clkp1", "clkp2", "can_clk";
                        assigned-clocks = <&cpg CPG_CORE R8A7795_CLK_CANFD>;
                        assigned-clock-rates = <40000000>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clock-names = "clkp1", "clkp2", "can_clk";
                        assigned-clocks = <&cpg CPG_CORE R8A7795_CLK_CANFD>;
                        assigned-clock-rates = <40000000>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clock-names = "fck", "brg_int", "scif_clk";
                        dmas = <&dmac1 0x31>, <&dmac1 0x30>;
                        dma-names = "tx", "rx";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clock-names = "fck", "brg_int", "scif_clk";
                        dmas = <&dmac1 0x33>, <&dmac1 0x32>;
                        dma-names = "tx", "rx";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clock-names = "fck", "brg_int", "scif_clk";
                        dmas = <&dmac1 0x35>, <&dmac1 0x34>;
                        dma-names = "tx", "rx";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clock-names = "fck", "brg_int", "scif_clk";
                        dmas = <&dmac0 0x37>, <&dmac0 0x36>;
                        dma-names = "tx", "rx";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clock-names = "fck", "brg_int", "scif_clk";
                        dmas = <&dmac0 0x39>, <&dmac0 0x38>;
                        dma-names = "tx", "rx";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clock-names = "fck", "brg_int", "scif_clk";
                        dmas = <&dmac1 0x51>, <&dmac1 0x50>;
                        dma-names = "tx", "rx";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clock-names = "fck", "brg_int", "scif_clk";
                        dmas = <&dmac1 0x53>, <&dmac1 0x52>;
                        dma-names = "tx", "rx";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clock-names = "fck", "brg_int", "scif_clk";
                        dmas = <&dmac1 0x13>, <&dmac1 0x12>;
                        dma-names = "tx", "rx";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clock-names = "fck", "brg_int", "scif_clk";
                        dmas = <&dmac0 0x57>, <&dmac0 0x56>;
                        dma-names = "tx", "rx";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clock-names = "fck", "brg_int", "scif_clk";
                        dmas = <&dmac0 0x59>, <&dmac0 0x58>;
                        dma-names = "tx", "rx";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clock-names = "fck", "brg_int", "scif_clk";
                        dmas = <&dmac1 0x5b>, <&dmac1 0x5a>;
                        dma-names = "tx", "rx";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        reg = <0 0xe6500000 0 0x40>;
                        interrupts = <GIC_SPI 287 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 931>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        i2c-scl-internal-delay-ns = <110>;
                        status = "disabled";
                };
                        reg = <0 0xe6508000 0 0x40>;
                        interrupts = <GIC_SPI 288 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 930>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        i2c-scl-internal-delay-ns = <6>;
                        status = "disabled";
                };
                        reg = <0 0xe6510000 0 0x40>;
                        interrupts = <GIC_SPI 286 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 929>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        i2c-scl-internal-delay-ns = <6>;
                        status = "disabled";
                };
                        reg = <0 0xe66d0000 0 0x40>;
                        interrupts = <GIC_SPI 290 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 928>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        i2c-scl-internal-delay-ns = <110>;
                        status = "disabled";
                };
                        reg = <0 0xe66d8000 0 0x40>;
                        interrupts = <GIC_SPI 19 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 927>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        i2c-scl-internal-delay-ns = <110>;
                        status = "disabled";
                };
                        reg = <0 0xe66e0000 0 0x40>;
                        interrupts = <GIC_SPI 20 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 919>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        i2c-scl-internal-delay-ns = <110>;
                        status = "disabled";
                };
                        reg = <0 0xe66e8000 0 0x40>;
                        interrupts = <GIC_SPI 21 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 918>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        i2c-scl-internal-delay-ns = <6>;
                        status = "disabled";
                };
                                      "src.1", "src.0",
                                      "dvc.0", "dvc.1",
                                      "clk_a", "clk_b", "clk_c", "clk_i";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
 
                        rcar_sound,dvc {
                        reg = <0 0xee000000 0 0xc00>;
                        interrupts = <GIC_SPI 102 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 328>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        reg = <0 0xee040000 0 0xc00>;
                        interrupts = <GIC_SPI 98 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 327>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                                      GIC_SPI 109 IRQ_TYPE_LEVEL_HIGH>;
                        interrupt-names = "ch0", "ch1";
                        clocks = <&cpg CPG_MOD 330>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        #dma-cells = <1>;
                        dma-channels = <2>;
                };
                                      GIC_SPI 110 IRQ_TYPE_LEVEL_HIGH>;
                        interrupt-names = "ch0", "ch1";
                        clocks = <&cpg CPG_MOD 331>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        #dma-cells = <1>;
                        dma-channels = <2>;
                };
                        reg = <0 0xee100000 0 0x2000>;
                        interrupts = <GIC_SPI 165 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 314>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        reg = <0 0xee120000 0 0x2000>;
                        interrupts = <GIC_SPI 166 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 313>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        reg = <0 0xee140000 0 0x2000>;
                        interrupts = <GIC_SPI 167 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 312>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        cap-mmc-highspeed;
                        status = "disabled";
                };
                        reg = <0 0xee160000 0 0x2000>;
                        interrupts = <GIC_SPI 168 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 311>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        cap-mmc-highspeed;
                        status = "disabled";
                };
                        reg = <0 0xee080200 0 0x700>;
                        interrupts = <GIC_SPI 108 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 703>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        #phy-cells = <0>;
                        status = "disabled";
                };
                        compatible = "renesas,usb2-phy-r8a7795";
                        reg = <0 0xee0a0200 0 0x700>;
                        clocks = <&cpg CPG_MOD 702>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        #phy-cells = <0>;
                        status = "disabled";
                };
                        compatible = "renesas,usb2-phy-r8a7795";
                        reg = <0 0xee0c0200 0 0x700>;
                        clocks = <&cpg CPG_MOD 701>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        #phy-cells = <0>;
                        status = "disabled";
                };
                        clocks = <&cpg CPG_MOD 703>;
                        phys = <&usb2_phy0>;
                        phy-names = "usb";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clocks = <&cpg CPG_MOD 702>;
                        phys = <&usb2_phy1>;
                        phy-names = "usb";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clocks = <&cpg CPG_MOD 701>;
                        phys = <&usb2_phy2>;
                        phy-names = "usb";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clocks = <&cpg CPG_MOD 703>;
                        phys = <&usb2_phy0>;
                        phy-names = "usb";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clocks = <&cpg CPG_MOD 702>;
                        phys = <&usb2_phy1>;
                        phy-names = "usb";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clocks = <&cpg CPG_MOD 701>;
                        phys = <&usb2_phy2>;
                        phy-names = "usb";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
                pciec0: pcie@fe000000 {
                        interrupt-map = <0 0 0 0 &gic GIC_SPI 116 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 319>, <&pcie_bus_clk>;
                        clock-names = "pcie", "pcie_bus";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        interrupt-map = <0 0 0 0 &gic GIC_SPI 148 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 318>, <&pcie_bus_clk>;
                        clock-names = "pcie", "pcie_bus";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
        };
index 8917150..fd2d74d 100644 (file)
@@ -200,6 +200,8 @@ CONFIG_SENSORS_INA2XX=m
 CONFIG_THERMAL=y
 CONFIG_THERMAL_EMULATION=y
 CONFIG_EXYNOS_THERMAL=y
+CONFIG_WATCHDOG=y
+CONFIG_RENESAS_WDT=y
 CONFIG_MFD_SPMI_PMIC=y
 CONFIG_MFD_SEC_CORE=y
 CONFIG_MFD_HI655X_PMIC=y
index 7a09c48..579b6e6 100644 (file)
@@ -160,14 +160,14 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm,
 #define STACK_RND_MASK                 (0x3ffff >> (PAGE_SHIFT - 12))
 #endif
 
-#ifdef CONFIG_COMPAT
-
 #ifdef __AARCH64EB__
 #define COMPAT_ELF_PLATFORM            ("v8b")
 #else
 #define COMPAT_ELF_PLATFORM            ("v8l")
 #endif
 
+#ifdef CONFIG_COMPAT
+
 #define COMPAT_ELF_ET_DYN_BASE         (2 * TASK_SIZE_32 / 3)
 
 /* AArch32 registers. */
index e63d23b..49095fc 100644 (file)
@@ -43,6 +43,8 @@
 
 #define KVM_VCPU_MAX_FEATURES 4
 
+#define KVM_REQ_VCPU_EXIT      8
+
 int __attribute_const__ kvm_target_cpu(void);
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
 int kvm_arch_dev_ioctl_check_extension(long ext);
@@ -327,6 +329,10 @@ static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
 
 struct kvm_vcpu *kvm_arm_get_running_vcpu(void);
 struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void);
+void kvm_arm_halt_guest(struct kvm *kvm);
+void kvm_arm_resume_guest(struct kvm *kvm);
+void kvm_arm_halt_vcpu(struct kvm_vcpu *vcpu);
+void kvm_arm_resume_vcpu(struct kvm_vcpu *vcpu);
 
 u64 __kvm_call_hyp(void *hypfn, ...);
 #define kvm_call_hyp(f, ...) __kvm_call_hyp(kvm_ksym_ref(f), ##__VA_ARGS__)
index fe612a9..75ea420 100644 (file)
@@ -30,6 +30,9 @@ struct kvm_decode {
        bool sign_extend;
 };
 
+void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data);
+unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len);
+
 int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run);
 int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
                 phys_addr_t fault_ipa);
index 72a3025..31b7322 100644 (file)
@@ -55,8 +55,9 @@
 #define VMEMMAP_SIZE (UL(1) << (VA_BITS - PAGE_SHIFT - 1 + STRUCT_PAGE_MAX_SHIFT))
 
 /*
- * PAGE_OFFSET - the virtual address of the start of the kernel image (top
+ * PAGE_OFFSET - the virtual address of the start of the linear map (top
  *              (VA_BITS - 1))
+ * KIMAGE_VADDR - the virtual address of the start of the kernel image
  * VA_BITS - the maximum number of bits for virtual addresses.
  * VA_START - the first kernel virtual address.
  * TASK_SIZE - the maximum size of a user space task.
index 17b45f7..8472c6d 100644 (file)
 
 /* PAGE_SHIFT determines the page size */
 /* CONT_SHIFT determines the number of pages which can be tracked together  */
-#ifdef CONFIG_ARM64_64K_PAGES
-#define PAGE_SHIFT             16
-#define CONT_SHIFT             5
-#elif defined(CONFIG_ARM64_16K_PAGES)
-#define PAGE_SHIFT             14
-#define CONT_SHIFT             7
-#else
-#define PAGE_SHIFT             12
-#define CONT_SHIFT             4
-#endif
+#define PAGE_SHIFT             CONFIG_ARM64_PAGE_SHIFT
+#define CONT_SHIFT             CONFIG_ARM64_CONT_SHIFT
 #define PAGE_SIZE              (_AC(1, UL) << PAGE_SHIFT)
 #define PAGE_MASK              (~(PAGE_SIZE-1))
 
index 0685d74..9e397a5 100644 (file)
@@ -80,19 +80,6 @@ static inline void set_fs(mm_segment_t fs)
 
 #define segment_eq(a, b)       ((a) == (b))
 
-/*
- * Return 1 if addr < current->addr_limit, 0 otherwise.
- */
-#define __addr_ok(addr)                                                        \
-({                                                                     \
-       unsigned long flag;                                             \
-       asm("cmp %1, %0; cset %0, lo"                                   \
-               : "=&r" (flag)                                          \
-               : "r" (addr), "0" (current_thread_info()->addr_limit)   \
-               : "cc");                                                \
-       flag;                                                           \
-})
-
 /*
  * Test whether a block of memory is a valid user space address.
  * Returns 1 if the range is valid, 0 otherwise.
index 41e58fe..e78ac26 100644 (file)
@@ -44,7 +44,7 @@
 #define __ARM_NR_compat_cacheflush     (__ARM_NR_COMPAT_BASE+2)
 #define __ARM_NR_compat_set_tls                (__ARM_NR_COMPAT_BASE+5)
 
-#define __NR_compat_syscalls           390
+#define __NR_compat_syscalls           394
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
index 5b925b7..b7e8ef1 100644 (file)
@@ -801,6 +801,14 @@ __SYSCALL(__NR_execveat, compat_sys_execveat)
 __SYSCALL(__NR_userfaultfd, sys_userfaultfd)
 #define __NR_membarrier 389
 __SYSCALL(__NR_membarrier, sys_membarrier)
+#define __NR_mlock2 390
+__SYSCALL(__NR_mlock2, sys_mlock2)
+#define __NR_copy_file_range 391
+__SYSCALL(__NR_copy_file_range, sys_copy_file_range)
+#define __NR_preadv2 392
+__SYSCALL(__NR_preadv2, compat_sys_preadv2)
+#define __NR_pwritev2 393
+__SYSCALL(__NR_pwritev2, compat_sys_pwritev2)
 
 /*
  * Please add new compat syscalls above this comment and update
index 1caadc2..043d17a 100644 (file)
@@ -13,4 +13,7 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
+
+#define __ARCH_WANT_RENAMEAT
+
 #include <asm-generic/unistd.h>
index 3808470..c173d32 100644 (file)
@@ -22,6 +22,8 @@
 
 #include <linux/bitops.h>
 #include <linux/bug.h>
+#include <linux/compat.h>
+#include <linux/elf.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/personality.h>
@@ -104,6 +106,7 @@ static const char *const compat_hwcap2_str[] = {
 static int c_show(struct seq_file *m, void *v)
 {
        int i, j;
+       bool compat = personality(current->personality) == PER_LINUX32;
 
        for_each_online_cpu(i) {
                struct cpuinfo_arm64 *cpuinfo = &per_cpu(cpu_data, i);
@@ -115,6 +118,9 @@ static int c_show(struct seq_file *m, void *v)
                 * "processor".  Give glibc what it expects.
                 */
                seq_printf(m, "processor\t: %d\n", i);
+               if (compat)
+                       seq_printf(m, "model name\t: ARMv8 Processor rev %d (%s)\n",
+                                  MIDR_REVISION(midr), COMPAT_ELF_PLATFORM);
 
                seq_printf(m, "BogoMIPS\t: %lu.%02lu\n",
                           loops_per_jiffy / (500000UL/HZ),
@@ -127,7 +133,7 @@ static int c_show(struct seq_file *m, void *v)
                 * software which does already (at least for 32-bit).
                 */
                seq_puts(m, "Features\t:");
-               if (personality(current->personality) == PER_LINUX32) {
+               if (compat) {
 #ifdef CONFIG_COMPAT
                        for (j = 0; compat_hwcap_str[j]; j++)
                                if (compat_elf_hwcap & (1 << j))
index 32c3c6e..713ca82 100644 (file)
@@ -31,7 +31,7 @@ struct frame_tail {
  */
 static struct frame_tail __user *
 user_backtrace(struct frame_tail __user *tail,
-              struct perf_callchain_entry *entry)
+              struct perf_callchain_entry_ctx *entry)
 {
        struct frame_tail buftail;
        unsigned long err;
@@ -76,7 +76,7 @@ struct compat_frame_tail {
 
 static struct compat_frame_tail __user *
 compat_user_backtrace(struct compat_frame_tail __user *tail,
-                     struct perf_callchain_entry *entry)
+                     struct perf_callchain_entry_ctx *entry)
 {
        struct compat_frame_tail buftail;
        unsigned long err;
@@ -106,7 +106,7 @@ compat_user_backtrace(struct compat_frame_tail __user *tail,
 }
 #endif /* CONFIG_COMPAT */
 
-void perf_callchain_user(struct perf_callchain_entry *entry,
+void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
                         struct pt_regs *regs)
 {
        if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
@@ -122,7 +122,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
 
                tail = (struct frame_tail __user *)regs->regs[29];
 
-               while (entry->nr < sysctl_perf_event_max_stack &&
+               while (entry->nr < entry->max_stack &&
                       tail && !((unsigned long)tail & 0xf))
                        tail = user_backtrace(tail, entry);
        } else {
@@ -132,7 +132,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
 
                tail = (struct compat_frame_tail __user *)regs->compat_fp - 1;
 
-               while ((entry->nr < sysctl_perf_event_max_stack) &&
+               while ((entry->nr < entry->max_stack) &&
                        tail && !((unsigned long)tail & 0x3))
                        tail = compat_user_backtrace(tail, entry);
 #endif
@@ -146,12 +146,12 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
  */
 static int callchain_trace(struct stackframe *frame, void *data)
 {
-       struct perf_callchain_entry *entry = data;
+       struct perf_callchain_entry_ctx *entry = data;
        perf_callchain_store(entry, frame->pc);
        return 0;
 }
 
-void perf_callchain_kernel(struct perf_callchain_entry *entry,
+void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
                           struct pt_regs *regs)
 {
        struct stackframe frame;
index c539208..f7cf463 100644 (file)
@@ -477,8 +477,9 @@ asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr)
        void __user *pc = (void __user *)instruction_pointer(regs);
        console_verbose();
 
-       pr_crit("Bad mode in %s handler detected, code 0x%08x -- %s\n",
-               handler[reason], esr, esr_get_class_string(esr));
+       pr_crit("Bad mode in %s handler detected on CPU%d, code 0x%08x -- %s\n",
+               handler[reason], smp_processor_id(), esr,
+               esr_get_class_string(esr));
        __show_regs(regs);
 
        info.si_signo = SIGILL;
index aa2e34e..c4f26ef 100644 (file)
@@ -54,6 +54,13 @@ config KVM_ARM_PMU
          Adds support for a virtual Performance Monitoring Unit (PMU) in
          virtual machines.
 
+config KVM_NEW_VGIC
+       bool "New VGIC implementation"
+       depends on KVM
+       default y
+        ---help---
+          uses the new VGIC implementation
+
 source drivers/vhost/Kconfig
 
 endif # VIRTUALIZATION
index 122cff4..a7a958c 100644 (file)
@@ -20,10 +20,22 @@ kvm-$(CONFIG_KVM_ARM_HOST) += emulate.o inject_fault.o regmap.o
 kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o
 kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o
 
+ifeq ($(CONFIG_KVM_NEW_VGIC),y)
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-init.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-irqfd.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-v2.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-v3.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v2.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v3.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-kvm-device.o
+else
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2-emul.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3-emul.o
+endif
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arch_timer.o
 kvm-$(CONFIG_KVM_ARM_PMU) += $(KVM)/arm/pmu.o
index fff7cd4..5f8f80b 100644 (file)
@@ -169,7 +169,8 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu)
         * Make sure stores to the GIC via the memory mapped interface
         * are now visible to the system register interface.
         */
-       dsb(st);
+       if (!cpu_if->vgic_sre)
+               dsb(st);
 
        cpu_if->vgic_vmcr  = read_gicreg(ICH_VMCR_EL2);
 
@@ -190,12 +191,11 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu)
                        if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i)))
                                continue;
 
-                       if (cpu_if->vgic_elrsr & (1 << i)) {
+                       if (cpu_if->vgic_elrsr & (1 << i))
                                cpu_if->vgic_lr[i] &= ~ICH_LR_STATE;
-                               continue;
-                       }
+                       else
+                               cpu_if->vgic_lr[i] = __gic_v3_get_lr(i);
 
-                       cpu_if->vgic_lr[i] = __gic_v3_get_lr(i);
                        __gic_v3_set_lr(0, i);
                }
 
@@ -236,8 +236,12 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu)
 
        val = read_gicreg(ICC_SRE_EL2);
        write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2);
-       isb(); /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */
-       write_gicreg(1, ICC_SRE_EL1);
+
+       if (!cpu_if->vgic_sre) {
+               /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */
+               isb();
+               write_gicreg(1, ICC_SRE_EL1);
+       }
 }
 
 void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu)
@@ -256,8 +260,10 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu)
         * been actually programmed with the value we want before
         * starting to mess with the rest of the GIC.
         */
-       write_gicreg(cpu_if->vgic_sre, ICC_SRE_EL1);
-       isb();
+       if (!cpu_if->vgic_sre) {
+               write_gicreg(0, ICC_SRE_EL1);
+               isb();
+       }
 
        val = read_gicreg(ICH_VTR_EL2);
        max_lr_idx = vtr_to_max_lr_idx(val);
@@ -306,18 +312,18 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu)
         * (re)distributors. This ensure the guest will read the
         * correct values from the memory-mapped interface.
         */
-       isb();
-       dsb(sy);
+       if (!cpu_if->vgic_sre) {
+               isb();
+               dsb(sy);
+       }
        vcpu->arch.vgic_cpu.live_lrs = live_lrs;
 
        /*
         * Prevent the guest from touching the GIC system registers if
         * SRE isn't enabled for GICv3 emulation.
         */
-       if (!cpu_if->vgic_sre) {
-               write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE,
-                            ICC_SRE_EL2);
-       }
+       write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE,
+                    ICC_SRE_EL2);
 }
 
 void __hyp_text __vgic_v3_init_lrs(void)
index 4d1ac81..e9e0e6d 100644 (file)
@@ -162,7 +162,7 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr
                esr |= (ESR_ELx_EC_IABT_CUR << ESR_ELx_EC_SHIFT);
 
        if (!is_iabt)
-               esr |= ESR_ELx_EC_DABT_LOW;
+               esr |= ESR_ELx_EC_DABT_LOW << ESR_ELx_EC_SHIFT;
 
        vcpu_sys_reg(vcpu, ESR_EL1) = esr | ESR_ELx_FSC_EXTABT;
 }
index 7bbe3ff..a57d650 100644 (file)
@@ -134,6 +134,17 @@ static bool access_gic_sgi(struct kvm_vcpu *vcpu,
        return true;
 }
 
+static bool access_gic_sre(struct kvm_vcpu *vcpu,
+                          struct sys_reg_params *p,
+                          const struct sys_reg_desc *r)
+{
+       if (p->is_write)
+               return ignore_write(vcpu, p);
+
+       p->regval = vcpu->arch.vgic_cpu.vgic_v3.vgic_sre;
+       return true;
+}
+
 static bool trap_raz_wi(struct kvm_vcpu *vcpu,
                        struct sys_reg_params *p,
                        const struct sys_reg_desc *r)
@@ -958,7 +969,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
          access_gic_sgi },
        /* ICC_SRE_EL1 */
        { Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b1100), Op2(0b101),
-         trap_raz_wi },
+         access_gic_sre },
 
        /* CONTEXTIDR_EL1 */
        { Op0(0b11), Op1(0b000), CRn(0b1101), CRm(0b0000), Op2(0b001),
index 8404190..ccfde23 100644 (file)
@@ -150,6 +150,7 @@ static const struct prot_bits pte_bits[] = {
 
 struct pg_level {
        const struct prot_bits *bits;
+       const char *name;
        size_t num;
        u64 mask;
 };
@@ -157,15 +158,19 @@ struct pg_level {
 static struct pg_level pg_level[] = {
        {
        }, { /* pgd */
+               .name   = "PGD",
                .bits   = pte_bits,
                .num    = ARRAY_SIZE(pte_bits),
        }, { /* pud */
+               .name   = (CONFIG_PGTABLE_LEVELS > 3) ? "PUD" : "PGD",
                .bits   = pte_bits,
                .num    = ARRAY_SIZE(pte_bits),
        }, { /* pmd */
+               .name   = (CONFIG_PGTABLE_LEVELS > 2) ? "PMD" : "PGD",
                .bits   = pte_bits,
                .num    = ARRAY_SIZE(pte_bits),
        }, { /* pte */
+               .name   = "PTE",
                .bits   = pte_bits,
                .num    = ARRAY_SIZE(pte_bits),
        },
@@ -214,7 +219,8 @@ static void note_page(struct pg_state *st, unsigned long addr, unsigned level,
                                delta >>= 10;
                                unit++;
                        }
-                       seq_printf(st->seq, "%9lu%c", delta, *unit);
+                       seq_printf(st->seq, "%9lu%c %s", delta, *unit,
+                                  pg_level[st->level].name);
                        if (pg_level[st->level].bits)
                                dump_prot(st, pg_level[st->level].bits,
                                          pg_level[st->level].num);
index aa8aee7..2e49bd2 100644 (file)
@@ -306,6 +306,10 @@ static __init int setup_hugepagesz(char *opt)
                hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
        } else if (ps == PUD_SIZE) {
                hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
+       } else if (ps == (PAGE_SIZE * CONT_PTES)) {
+               hugetlb_add_hstate(CONT_PTE_SHIFT);
+       } else if (ps == (PMD_SIZE * CONT_PMDS)) {
+               hugetlb_add_hstate((PMD_SHIFT + CONT_PMD_SHIFT) - PAGE_SHIFT);
        } else {
                hugetlb_bad_size();
                pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10);
@@ -314,3 +318,13 @@ static __init int setup_hugepagesz(char *opt)
        return 1;
 }
 __setup("hugepagesz=", setup_hugepagesz);
+
+#ifdef CONFIG_ARM64_64K_PAGES
+static __init int add_default_hugepagesz(void)
+{
+       if (size_to_hstate(CONT_PTES * PAGE_SIZE) == NULL)
+               hugetlb_add_hstate(CONT_PMD_SHIFT);
+       return 0;
+}
+arch_initcall(add_default_hugepagesz);
+#endif
index e7d09a6..12d73d9 100644 (file)
@@ -14,6 +14,7 @@
  *   more details.
  */
 
+#define __ARCH_WANT_RENAMEAT
 #define __ARCH_WANT_SYS_CLONE
 
 /* Use the standard ABI for syscalls. */
index 5aa3f51..3f646c7 100644 (file)
@@ -157,6 +157,7 @@ struct mtd_info *__init crisv32_nand_flash_probe(void)
        /* 20 us command delay time */
        this->chip_delay = 20;
        this->ecc.mode = NAND_ECC_SOFT;
+       this->ecc.algo = NAND_ECC_HAMMING;
 
        /* Enable the following for a flash based bad block table */
        /* this->bbt_options = NAND_BBT_USE_FLASH; */
index a7c17b0..a745405 100644 (file)
@@ -148,6 +148,7 @@ struct mtd_info *__init crisv32_nand_flash_probe(void)
        /* 20 us command delay time */
        this->chip_delay = 20;
        this->ecc.mode = NAND_ECC_SOFT;
+       this->ecc.algo = NAND_ECC_HAMMING;
 
        /* Enable the following for a flash based bad block table */
        /* this->bbt_options = NAND_BBT_USE_FLASH; */
index aa232de..3ae8525 100644 (file)
@@ -20,6 +20,7 @@ config H8300
        select HAVE_KERNEL_GZIP
        select HAVE_KERNEL_LZO
        select HAVE_ARCH_KGDB
+       select HAVE_ARCH_HASH
        select CPU_NO_EFFICIENT_FFS
 
 config RWSEM_GENERIC_SPINLOCK
index 7643633..613bfe6 100644 (file)
@@ -23,7 +23,6 @@ LDFLAGS_vmlinux := -Ttext $(IMAGE_OFFSET) -estartup -T $(obj)/vmlinux.lds \
 
 $(obj)/vmlinux: $(OBJECTS) $(obj)/piggy.o $(LIBGCC) FORCE
        $(call if_changed,ld)
-       @:
 
 $(obj)/vmlinux.bin: vmlinux FORCE
        $(call if_changed,objcopy)
diff --git a/arch/h8300/include/asm/hash.h b/arch/h8300/include/asm/hash.h
new file mode 100644 (file)
index 0000000..04cfbd2
--- /dev/null
@@ -0,0 +1,53 @@
+#ifndef _ASM_HASH_H
+#define _ASM_HASH_H
+
+/*
+ * The later H8SX models have a 32x32-bit multiply, but the H8/300H
+ * and H8S have only 16x16->32.  Since it's tolerably compact, this is
+ * basically an inlined version of the __mulsi3 code.  Since the inputs
+ * are not expected to be small, it's also simplified by skipping the
+ * early-out checks.
+ *
+ * (Since neither CPU has any multi-bit shift instructions, a
+ * shift-and-add version is a non-starter.)
+ *
+ * TODO: come up with an arch-specific version of the hashing in fs/namei.c,
+ * since that is heavily dependent on rotates.  Which, as mentioned, suck
+ * horribly on H8.
+ */
+
+#if defined(CONFIG_CPU_H300H) || defined(CONFIG_CPU_H8S)
+
+#define HAVE_ARCH__HASH_32 1
+
+/*
+ * Multiply by k = 0x61C88647.  Fitting this into three registers requires
+ * one extra instruction, but reducing register pressure will probably
+ * make that back and then some.
+ *
+ * GCC asm note: %e1 is the high half of operand %1, while %f1 is the
+ * low half.  So if %1 is er4, then %e1 is e4 and %f1 is r4.
+ *
+ * This has been designed to modify x in place, since that's the most
+ * common usage, but preserve k, since hash_64() makes two calls in
+ * quick succession.
+ */
+static inline u32 __attribute_const__ __hash_32(u32 x)
+{
+       u32 temp;
+
+       asm(   "mov.w   %e1,%f0"
+       "\n     mulxu.w %f2,%0"         /* klow * xhigh */
+       "\n     mov.w   %f0,%e1"        /* The extra instruction */
+       "\n     mov.w   %f1,%f0"
+       "\n     mulxu.w %e2,%0"         /* khigh * xlow */
+       "\n     add.w   %e1,%f0"
+       "\n     mulxu.w %f2,%1"         /* klow * xlow */
+       "\n     add.w   %f0,%e1"
+       : "=&r" (temp), "=r" (x)
+       : "%r" (GOLDEN_RATIO_32), "1" (x));
+       return x;
+}
+
+#endif
+#endif /* _ASM_HASH_H */
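
For reference only (not part of this diff): the H8 asm above is the usual decomposition of a 32x32 multiply into 16x16->32 pieces.  A portable C sketch of the same value, with the helper name invented here purely for illustration:

#include <stdint.h>

/* Illustrative only: GOLDEN_RATIO_32 * x built from 16x16->32
 * multiplies, mirroring the H8 asm above.  The khi*xhi partial
 * product and any carries above bit 31 fall outside the low 32
 * bits and are dropped. */
static uint32_t hash32_by_halves(uint32_t x)
{
        const uint32_t k = 0x61C88647;          /* GOLDEN_RATIO_32 */
        uint32_t klo = k & 0xffff, khi = k >> 16;
        uint32_t xlo = x & 0xffff, xhi = x >> 16;

        return (klo * xlo) + ((klo * xhi + khi * xlo) << 16);
}
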
index 7a2eb69..7dd20ef 100644 (file)
@@ -1,3 +1,5 @@
 #define __ARCH_NOMMU
 
+#define __ARCH_WANT_RENAMEAT
+
 #include <asm-generic/unistd.h>
index ffee405..2151760 100644 (file)
@@ -27,6 +27,7 @@
  */
 
 #define sys_mmap2 sys_mmap_pgoff
+#define __ARCH_WANT_RENAMEAT
 #define __ARCH_WANT_SYS_EXECVE
 #define __ARCH_WANT_SYS_CLONE
 #define __ARCH_WANT_SYS_VFORK
index 970d0bd..c100d78 100644 (file)
@@ -95,8 +95,8 @@ define archhelp
   echo '* unwcheck     - Check vmlinux for invalid unwind info'
 endef
 
-archprepare: make_nr_irqs_h FORCE
-PHONY += make_nr_irqs_h FORCE
+archprepare: make_nr_irqs_h
+PHONY += make_nr_irqs_h
 
-make_nr_irqs_h: FORCE
+make_nr_irqs_h:
        $(Q)$(MAKE) $(build)=arch/ia64/kernel include/generated/nr-irqs.h
index 01729c2..0606a72 100644 (file)
@@ -19,7 +19,6 @@ LDFLAGS_vmlinux := -T
 
 $(obj)/vmlinux: $(obj)/vmlinux.lds $(OBJECTS) $(obj)/piggy.o FORCE
        $(call if_changed,ld)
-       @:
 
 $(obj)/vmlinux.bin: vmlinux FORCE
        $(call if_changed,objcopy)
index 8ace920..967260f 100644 (file)
@@ -41,6 +41,7 @@ config M68000
        select CPU_HAS_NO_UNALIGNED
        select GENERIC_CSUM
        select CPU_NO_EFFICIENT_FFS
+       select HAVE_ARCH_HASH
        help
          The Freescale (was Motorola) 68000 CPU is the first generation of
          the well known M68K family of processors. The CPU core as well as
diff --git a/arch/m68k/include/asm/hash.h b/arch/m68k/include/asm/hash.h
new file mode 100644 (file)
index 0000000..6407af8
--- /dev/null
@@ -0,0 +1,59 @@
+#ifndef _ASM_HASH_H
+#define _ASM_HASH_H
+
+/*
+ * If CONFIG_M68000=y (original mc68000/010), this file is #included
+ * to work around the lack of a MULU.L instruction.
+ */
+
+#define HAVE_ARCH__HASH_32 1
+/*
+ * While it would be legal to substitute a different hash operation
+ * entirely, let's keep it simple and just use an optimized multiply
+ * by GOLDEN_RATIO_32 = 0x61C88647.
+ *
+ * The best way to do that appears to be to multiply by 0x8647 with
+ * shifts and adds, and use mulu.w to multiply the high half by 0x61C8.
+ *
+ * Because the 68000 has multi-cycle shifts, this addition chain is
+ * chosen to minimise the shift distances.
+ *
+ * Despite every attempt to spoon-feed it simple operations, GCC
+ * 6.1.1 doggedly insists on doing annoying things like converting
+ * "lsl.l #2,<reg>" (12 cycles) to two adds (8+8 cycles).
+ *
+ * It also likes to notice two shifts in a row, like "a = x << 2" and
+ * "a <<= 7", and convert that to "a = x << 9".  But shifts longer
+ * than 8 bits are extra-slow on m68k, so that's a lose.
+ *
+ * Since the 68000 is a very simple in-order processor with no
+ * instruction scheduling effects on execution time, we can safely
+ * take it out of GCC's hands and write one big asm() block.
+ *
+ * Without calling overhead, this operation is 30 bytes (14 instructions
+ * plus one immediate constant) and 166 cycles.
+ *
+ * (Because %2 is fetched twice, it can't be postincrement, and thus it
+ * can't be a fully general "g" or "m".  Register is preferred, but
+ * offsettable memory or immediate will work.)
+ */
+static inline u32 __attribute_const__ __hash_32(u32 x)
+{
+       u32 a, b;
+
+       asm(   "move.l %2,%0"   /* a = x * 0x0001 */
+       "\n     lsl.l #2,%0"    /* a = x * 0x0004 */
+       "\n     move.l %0,%1"
+       "\n     lsl.l #7,%0"    /* a = x * 0x0200 */
+       "\n     add.l %2,%0"    /* a = x * 0x0201 */
+       "\n     add.l %0,%1"    /* b = x * 0x0205 */
+       "\n     add.l %0,%0"    /* a = x * 0x0402 */
+       "\n     add.l %0,%1"    /* b = x * 0x0607 */
+       "\n     lsl.l #5,%0"    /* a = x * 0x8040 */
+       : "=&d,d" (a), "=&r,r" (b)
+       : "r,roi?" (x));        /* a+b = x*0x8647 */
+
+       return ((u16)(x*0x61c8) << 16) + a + b;
+}
+
+#endif /* _ASM_HASH_H */
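
Again purely illustrative and not part of this diff: a C model of the addition chain above, one line per commented asm step (the helper name is invented here).  a ends at x * 0x8040 and b at x * 0x0607, so a + b supplies the x * 0x8647 half of the product, and the mulu.w result provides the 0x61C8 high half:

#include <stdint.h>

static uint32_t m68k_hash_model(uint32_t x)
{
        uint32_t a, b;

        a = x << 2;                     /* x * 0x0004 */
        b = a;
        a <<= 7;                        /* x * 0x0200 */
        a += x;                         /* x * 0x0201 */
        b += a;                         /* x * 0x0205 */
        a += a;                         /* x * 0x0402 */
        b += a;                         /* x * 0x0607 */
        a <<= 5;                        /* x * 0x8040 */

        /* mulu.w supplies the x * 0x61C8 upper half */
        return ((uint32_t)(uint16_t)(x * 0x61c8) << 16) + a + b;
}
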
index b80b8e8..459b6ec 100644 (file)
@@ -7,6 +7,8 @@
  * (at your option) any later version.
  */
 
+#define __ARCH_WANT_RENAMEAT
+
 /* Use the standard ABI for syscalls. */
 #include <asm-generic/unistd.h>
 
index 252abc1..3e8e048 100644 (file)
@@ -29,7 +29,7 @@ static bool is_valid_call(unsigned long calladdr)
 
 static struct metag_frame __user *
 user_backtrace(struct metag_frame __user *user_frame,
-              struct perf_callchain_entry *entry)
+              struct perf_callchain_entry_ctx *entry)
 {
        struct metag_frame frame;
        unsigned long calladdr;
@@ -56,7 +56,7 @@ user_backtrace(struct metag_frame __user *user_frame,
 }
 
 void
-perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
        unsigned long sp = regs->ctx.AX[0].U0;
        struct metag_frame __user *frame;
@@ -65,7 +65,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 
        --frame;
 
-       while ((entry->nr < sysctl_perf_event_max_stack) && frame)
+       while ((entry->nr < entry->max_stack) && frame)
                frame = user_backtrace(frame, entry);
 }
 
@@ -78,13 +78,13 @@ static int
 callchain_trace(struct stackframe *fr,
                void *data)
 {
-       struct perf_callchain_entry *entry = data;
+       struct perf_callchain_entry_ctx *entry = data;
        perf_callchain_store(entry, fr->pc);
        return 0;
 }
 
 void
-perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
        struct stackframe fr;
 
index f17c3a4..636e072 100644 (file)
@@ -16,6 +16,7 @@ config MICROBLAZE
        select GENERIC_IRQ_SHOW
        select GENERIC_PCI_IOMAP
        select GENERIC_SCHED_CLOCK
+       select HAVE_ARCH_HASH
        select HAVE_ARCH_KGDB
        select HAVE_DEBUG_KMEMLEAK
        select HAVE_DMA_API_DEBUG
diff --git a/arch/microblaze/include/asm/hash.h b/arch/microblaze/include/asm/hash.h
new file mode 100644 (file)
index 0000000..753513a
--- /dev/null
@@ -0,0 +1,81 @@
+#ifndef _ASM_HASH_H
+#define _ASM_HASH_H
+
+/*
+ * Fortunately, most people who want to run Linux on Microblaze enable
+ * both multiplier and barrel shifter, but omitting them is technically
+ * a supported configuration.
+ *
+ * With just a barrel shifter, we can implement an efficient constant
+ * multiply using shifts and adds.  GCC can find a 9-step solution, but
+ * this 6-step solution was found by Yevgen Voronenko's implementation
+ * of the Hcub algorithm at http://spiral.ece.cmu.edu/mcm/gen.html.
+ *
+ * That software is really not designed for a single multiplier this large,
+ * but if you run it enough times with different seeds, it'll find several
+ * 6-shift, 6-add sequences for computing x * 0x61C88647.  They are all
+ *     c = (x << 19) + x;
+ *     a = (x <<  9) + c;
+ *     b = (x << 23) + a;
+ *     return (a<<11) + (b<<6) + (c<<3) - b;
+ * with variations on the order of the final add.
+ *
+ * Without even a shifter, it's hopeless; any hash function will suck.
+ */
+
+#if CONFIG_XILINX_MICROBLAZE0_USE_HW_MUL == 0
+
+#define HAVE_ARCH__HASH_32 1
+
+/* Multiply by GOLDEN_RATIO_32 = 0x61C88647 */
+static inline u32 __attribute_const__ __hash_32(u32 a)
+{
+#if CONFIG_XILINX_MICROBLAZE0_USE_BARREL
+       unsigned int b, c;
+
+       /* Phase 1: Compute three intermediate values */
+       b =  a << 23;
+       c = (a << 19) + a;
+       a = (a <<  9) + c;
+       b += a;
+
+       /* Phase 2: Compute (a << 11) + (b << 6) + (c << 3) - b */
+       a <<= 5;
+       a += b;         /* (a << 5) + b */
+       a <<= 3;
+       a += c;         /* (a << 8) + (b << 3) + c */
+       a <<= 3;
+       return a - b;   /* (a << 11) + (b << 6) + (c << 3) - b */
+#else
+       /*
+        * "This is really going to hurt."
+        *
+        * Without a barrel shifter, left shifts are implemented as
+        * repeated additions, and the best we can do is an optimal
+        * addition-subtraction chain.  This one is not known to be
+        * optimal, but at 37 steps, it's decent for a 31-bit multiplier.
+        *
+        * Question: given its size (37*4 = 148 bytes per instance),
+        * and slowness, is this worth having inline?
+        */
+       unsigned int b, c, d;
+
+       b = a << 4;     /* 4    */
+       c = b << 1;     /* 1  5 */
+       b += a;         /* 1  6 */
+       c += b;         /* 1  7 */
+       c <<= 3;        /* 3 10 */
+       c -= a;         /* 1 11 */
+       d = c << 7;     /* 7 18 */
+       d += b;         /* 1 19 */
+       d <<= 8;        /* 8 27 */
+       d += a;         /* 1 28 */
+       d <<= 1;        /* 1 29 */
+       d += b;         /* 1 30 */
+       d <<= 6;        /* 6 36 */
+       return d + c;   /* 1 37 total instructions*/
+#endif
+}
+
+#endif /* !CONFIG_XILINX_MICROBLAZE0_USE_HW_MUL */
+#endif /* _ASM_HASH_H */
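
A quick host-side sanity check (illustrative only, not part of this diff) that the 6-shift, 6-add sequence quoted in the comment above really is a multiply by GOLDEN_RATIO_32:

#include <assert.h>
#include <stdint.h>

/* The Hcub-derived sequence from the comment, written out directly. */
static uint32_t hcub_sequence(uint32_t x)
{
        uint32_t c = (x << 19) + x;
        uint32_t a = (x << 9) + c;
        uint32_t b = (x << 23) + a;

        return (a << 11) + (b << 6) + (c << 3) - b;
}

int main(void)
{
        uint32_t x;

        for (x = 0; x < 1000000; x++)
                assert(hcub_sequence(x) == x * 0x61C88647u);
        return 0;
}
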
index 76ed17b..805ae5d 100644 (file)
@@ -38,6 +38,6 @@
 
 #endif /* __ASSEMBLY__ */
 
-#define __NR_syscalls         389
+#define __NR_syscalls         392
 
 #endif /* _ASM_MICROBLAZE_UNISTD_H */
index 32850c7..a8bd3fa 100644 (file)
 #define __NR_memfd_create      386
 #define __NR_bpf               387
 #define __NR_execveat          388
+#define __NR_userfaultfd       389
+#define __NR_membarrier                390
+#define __NR_mlock2            391
 
 #endif /* _UAPI_ASM_MICROBLAZE_UNISTD_H */
index 29c8568..6b3dd99 100644 (file)
@@ -389,3 +389,6 @@ ENTRY(sys_call_table)
        .long sys_memfd_create
        .long sys_bpf
        .long sys_execveat
+       .long sys_userfaultfd
+       .long sys_membarrier            /* 390 */
+       .long sys_mlock2
index 35654be..14cba60 100644 (file)
@@ -48,6 +48,8 @@ static int global_phb_number;         /* Global phb counter */
 resource_size_t isa_mem_base;
 
 unsigned long isa_io_base;
+EXPORT_SYMBOL(isa_io_base);
+
 static int pci_bus_count;
 
 struct pci_controller *pcibios_alloc_controller(struct device_node *dev)
index 4693884..ac91939 100644 (file)
@@ -398,6 +398,7 @@ config MACH_PISTACHIO
        select SYS_SUPPORTS_LITTLE_ENDIAN
        select SYS_SUPPORTS_MIPS_CPS
        select SYS_SUPPORTS_MULTITHREADING
+       select SYS_SUPPORTS_RELOCATABLE
        select SYS_SUPPORTS_ZBOOT
        select SYS_HAS_EARLY_PRINTK
        select USE_GENERIC_EARLY_PRINTK_8250
index 4a9c8f2..f6ae6ed 100644 (file)
@@ -5,7 +5,7 @@
        #size-cells = <1>;
        compatible = "ingenic,jz4740";
 
-       cpuintc: interrupt-controller@0 {
+       cpuintc: interrupt-controller {
                #address-cells = <0>;
                #interrupt-cells = <1>;
                interrupt-controller;
index 08bf24f..793c0c7 100644 (file)
@@ -9,7 +9,7 @@
                };
        };
 
-       cpuintc: cpuintc@0 {
+       cpuintc: cpuintc {
                #address-cells = <0>;
                #interrupt-cells = <1>;
                interrupt-controller;
index 182afde..fb2faef 100644 (file)
@@ -9,7 +9,7 @@
                };
        };
 
-       cpuintc: cpuintc@0 {
+       cpuintc: cpuintc {
                #address-cells = <0>;
                #interrupt-cells = <1>;
                interrupt-controller;
index e3203d4..d3cb57f 100644 (file)
@@ -9,7 +9,7 @@
                };
        };
 
-       cpuintc: cpuintc@0 {
+       cpuintc: cpuintc {
                #address-cells = <0>;
                #interrupt-cells = <1>;
                interrupt-controller;
index 3b131dd..3d6fc9a 100644 (file)
@@ -9,7 +9,7 @@
                };
        };
 
-       cpuintc: cpuintc@0 {
+       cpuintc: cpuintc {
                #address-cells = <0>;
                #interrupt-cells = <1>;
                interrupt-controller;
index 686ebd1..48d2112 100644 (file)
@@ -10,7 +10,7 @@
                reg = <0x0 0x08000000>;
        };
 
-       cpuintc: interrupt-controller@0 {
+       cpuintc: interrupt-controller {
                #address-cells = <0>;
                #interrupt-cells = <1>;
                interrupt-controller;
index dff88aa..33aab89 100644 (file)
@@ -384,7 +384,7 @@ static int octeon_cpu_callback(struct notifier_block *nfb,
 {
        unsigned int cpu = (unsigned long)hcpu;
 
-       switch (action) {
+       switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_UP_PREPARE:
                octeon_update_boot_vector(cpu);
                break;
index 6741673..56584a6 100644 (file)
 #include <asm/asmmacro-64.h>
 #endif
 
+/*
+ * Helper macros for generating raw instruction encodings.
+ */
+#ifdef CONFIG_CPU_MICROMIPS
+       .macro  insn32_if_mm enc
+       .insn
+       .hword ((\enc) >> 16)
+       .hword ((\enc) & 0xffff)
+       .endm
+
+       .macro  insn_if_mips enc
+       .endm
+#else
+       .macro  insn32_if_mm enc
+       .endm
+
+       .macro  insn_if_mips enc
+       .insn
+       .word (\enc)
+       .endm
+#endif
+
 #if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR6)
        .macro  local_irq_enable reg=t0
        ei
        .endm
 #else
 
-#ifdef CONFIG_CPU_MICROMIPS
-#define CFC_MSA_INSN           0x587e0056
-#define CTC_MSA_INSN           0x583e0816
-#define LDB_MSA_INSN           0x58000807
-#define LDH_MSA_INSN           0x58000817
-#define LDW_MSA_INSN           0x58000827
-#define LDD_MSA_INSN           0x58000837
-#define STB_MSA_INSN           0x5800080f
-#define STH_MSA_INSN           0x5800081f
-#define STW_MSA_INSN           0x5800082f
-#define STD_MSA_INSN           0x5800083f
-#define COPY_SW_MSA_INSN       0x58b00056
-#define COPY_SD_MSA_INSN       0x58b80056
-#define INSERT_W_MSA_INSN      0x59300816
-#define INSERT_D_MSA_INSN      0x59380816
-#else
-#define CFC_MSA_INSN           0x787e0059
-#define CTC_MSA_INSN           0x783e0819
-#define LDB_MSA_INSN           0x78000820
-#define LDH_MSA_INSN           0x78000821
-#define LDW_MSA_INSN           0x78000822
-#define LDD_MSA_INSN           0x78000823
-#define STB_MSA_INSN           0x78000824
-#define STH_MSA_INSN           0x78000825
-#define STW_MSA_INSN           0x78000826
-#define STD_MSA_INSN           0x78000827
-#define COPY_SW_MSA_INSN       0x78b00059
-#define COPY_SD_MSA_INSN       0x78b80059
-#define INSERT_W_MSA_INSN      0x79300819
-#define INSERT_D_MSA_INSN      0x79380819
-#endif
-
        /*
         * Temporary until all toolchains in use include MSA support.
         */
        .set    push
        .set    noat
        SET_HARDFLOAT
-       .insn
-       .word   CFC_MSA_INSN | (\cs << 11)
+       insn_if_mips 0x787e0059 | (\cs << 11)
+       insn32_if_mm 0x587e0056 | (\cs << 11)
        move    \rd, $1
        .set    pop
        .endm
        .set    noat
        SET_HARDFLOAT
        move    $1, \rs
-       .word   CTC_MSA_INSN | (\cd << 6)
+       insn_if_mips 0x783e0819 | (\cd << 6)
+       insn32_if_mm 0x583e0816 | (\cd << 6)
        .set    pop
        .endm
 
        .set    noat
        SET_HARDFLOAT
        PTR_ADDU $1, \base, \off
-       .word   LDB_MSA_INSN | (\wd << 6)
+       insn_if_mips 0x78000820 | (\wd << 6)
+       insn32_if_mm 0x58000807 | (\wd << 6)
        .set    pop
        .endm
 
        .set    noat
        SET_HARDFLOAT
        PTR_ADDU $1, \base, \off
-       .word   LDH_MSA_INSN | (\wd << 6)
+       insn_if_mips 0x78000821 | (\wd << 6)
+       insn32_if_mm 0x58000817 | (\wd << 6)
        .set    pop
        .endm
 
        .set    noat
        SET_HARDFLOAT
        PTR_ADDU $1, \base, \off
-       .word   LDW_MSA_INSN | (\wd << 6)
+       insn_if_mips 0x78000822 | (\wd << 6)
+       insn32_if_mm 0x58000827 | (\wd << 6)
        .set    pop
        .endm
 
        .set    noat
        SET_HARDFLOAT
        PTR_ADDU $1, \base, \off
-       .word   LDD_MSA_INSN | (\wd << 6)
+       insn_if_mips 0x78000823 | (\wd << 6)
+       insn32_if_mm 0x58000837 | (\wd << 6)
        .set    pop
        .endm
 
        .set    noat
        SET_HARDFLOAT
        PTR_ADDU $1, \base, \off
-       .word   STB_MSA_INSN | (\wd << 6)
+       insn_if_mips 0x78000824 | (\wd << 6)
+       insn32_if_mm 0x5800080f | (\wd << 6)
        .set    pop
        .endm
 
        .set    noat
        SET_HARDFLOAT
        PTR_ADDU $1, \base, \off
-       .word   STH_MSA_INSN | (\wd << 6)
+       insn_if_mips 0x78000825 | (\wd << 6)
+       insn32_if_mm 0x5800081f | (\wd << 6)
        .set    pop
        .endm
 
        .set    noat
        SET_HARDFLOAT
        PTR_ADDU $1, \base, \off
-       .word   STW_MSA_INSN | (\wd << 6)
+       insn_if_mips 0x78000826 | (\wd << 6)
+       insn32_if_mm 0x5800082f | (\wd << 6)
        .set    pop
        .endm
 
        .set    noat
        SET_HARDFLOAT
        PTR_ADDU $1, \base, \off
-       .word   STD_MSA_INSN | (\wd << 6)
+       insn_if_mips 0x78000827 | (\wd << 6)
+       insn32_if_mm 0x5800083f | (\wd << 6)
        .set    pop
        .endm
 
        .set    push
        .set    noat
        SET_HARDFLOAT
-       .insn
-       .word   COPY_SW_MSA_INSN | (\n << 16) | (\ws << 11)
+       insn_if_mips 0x78b00059 | (\n << 16) | (\ws << 11)
+       insn32_if_mm 0x58b00056 | (\n << 16) | (\ws << 11)
        .set    pop
        .endm
 
        .set    push
        .set    noat
        SET_HARDFLOAT
-       .insn
-       .word   COPY_SD_MSA_INSN | (\n << 16) | (\ws << 11)
+       insn_if_mips 0x78b80059 | (\n << 16) | (\ws << 11)
+       insn32_if_mm 0x58b80056 | (\n << 16) | (\ws << 11)
        .set    pop
        .endm
 
        .set    push
        .set    noat
        SET_HARDFLOAT
-       .word   INSERT_W_MSA_INSN | (\n << 16) | (\wd << 6)
+       insn_if_mips 0x79300819 | (\n << 16) | (\wd << 6)
+       insn32_if_mm 0x59300816 | (\n << 16) | (\wd << 6)
        .set    pop
        .endm
 
        .set    push
        .set    noat
        SET_HARDFLOAT
-       .word   INSERT_D_MSA_INSN | (\n << 16) | (\wd << 6)
+       insn_if_mips 0x79380819 | (\n << 16) | (\wd << 6)
+       insn32_if_mm 0x59380816 | (\n << 16) | (\wd << 6)
        .set    pop
        .endm
 #endif
index dbb1eb6..e0fecf2 100644 (file)
@@ -58,8 +58,8 @@
  * address of a label as argument to inline assembler. Gas otoh has the
  * annoying difference between la and dla which are only usable for 32-bit
  * rsp. 64-bit code, so can't be used without conditional compilation.
- * The alterantive is switching the assembler to 64-bit code which happens
- * to work right even for 32-bit code ...
+ * The alternative is switching the assembler to 64-bit code which happens
+ * to work right even for 32-bit code...
  */
 #define instruction_hazard()                                           \
 do {                                                                   \
@@ -133,8 +133,8 @@ do {                                                                        \
  * address of a label as argument to inline assembler. Gas otoh has the
  * annoying difference between la and dla which are only usable for 32-bit
  * rsp. 64-bit code, so can't be used without conditional compilation.
- * The alterantive is switching the assembler to 64-bit code which happens
- * to work right even for 32-bit code ...
+ * The alternative is switching the assembler to 64-bit code which happens
+ * to work right even for 32-bit code...
  */
 #define __instruction_hazard()                                         \
 do {                                                                   \
index ca8077a..456ddba 100644 (file)
@@ -100,7 +100,7 @@ typedef volatile struct au1xxx_ddma_desc {
        u32     dscr_nxtptr;            /* Next descriptor pointer (mostly) */
        /*
         * First 32 bytes are HW specific!!!
-        * Lets have some SW data following -- make sure it's 32 bytes.
+        * Let's have some SW data following -- make sure it's 32 bytes.
         */
        u32     sw_status;
        u32     sw_context;
index ce02894..d607d64 100644 (file)
@@ -140,7 +140,7 @@ static inline int au1300_gpio_getinitlvl(unsigned int gpio)
 * Cases 1 and 3 are intended for boards which want to provide their own
 * GPIO namespace and -operations (i.e. for example you have 8 GPIOs
 * which are in part provided by spare Au1300 GPIO pins and in part by
-* an external FPGA but you still want them to be accssible in linux
+* an external FPGA but you still want them to be accessible in linux
 * as gpio0-7. The board can of course use the alchemy_gpioX_* functions
 * as required).
 */
index 466fc85..c4e856f 100644 (file)
@@ -22,7 +22,7 @@ struct bcm63xx_enet_platform_data {
        int has_phy_interrupt;
        int phy_interrupt;
 
-       /* if has_phy, use autonegociated pause parameters or force
+       /* if has_phy, use autonegotiated pause parameters or force
         * them */
        int pause_auto;
        int pause_rx;
index 1daa644..04d8620 100644 (file)
@@ -64,7 +64,7 @@ static inline void plat_post_dma_flush(struct device *dev)
 
 static inline int plat_device_is_coherent(struct device *dev)
 {
-       return 1;               /* IP27 non-cohernet mode is unsupported */
+       return 1;               /* IP27 non-coherent mode is unsupported */
 }
 
 #endif /* __ASM_MACH_IP27_DMA_COHERENCE_H */
index 0a0b0e2..7bdf212 100644 (file)
@@ -86,7 +86,7 @@ static inline void plat_post_dma_flush(struct device *dev)
 
 static inline int plat_device_is_coherent(struct device *dev)
 {
-       return 0;               /* IP32 is non-cohernet */
+       return 0;               /* IP32 is non-coherent */
 }
 
 #endif /* __ASM_MACH_IP32_DMA_COHERENCE_H */
index 398733e..7f7b0fc 100644 (file)
@@ -27,7 +27,7 @@ struct jz_nand_platform_data {
 
        unsigned char banks[JZ_NAND_NUM_BANKS];
 
-       void (*ident_callback)(struct platform_device *, struct nand_chip *,
+       void (*ident_callback)(struct platform_device *, struct mtd_info *,
                                struct mtd_partition **, int *num_partitions);
 };
 
index 7023883..8e9b022 100644 (file)
@@ -22,7 +22,7 @@
 
 /*
  * during early_printk no ioremap possible at this early stage
- * lets use KSEG1 instead
+ * let's use KSEG1 instead
  */
 #define LTQ_ASC0_BASE_ADDR     0x1E100C00
 #define LTQ_EARLY_ASC          KSEG1ADDR(LTQ_ASC0_BASE_ADDR)
index f873107..17b41bb 100644 (file)
@@ -75,7 +75,7 @@ extern __iomem void *ltq_cgu_membase;
 
 /*
  * during early_printk no ioremap is possible
- * lets use KSEG1 instead
+ * let's use KSEG1 instead
  */
 #define LTQ_ASC1_BASE_ADDR     0x1E100C00
 #define LTQ_EARLY_ASC          KSEG1ADDR(LTQ_ASC1_BASE_ADDR)
index 4431fc5..74230d0 100644 (file)
@@ -24,7 +24,7 @@ struct temp_range {
        u8 level;
 };
 
-#define CONSTANT_SPEED_POLICY  0  /* at constent speed */
+#define CONSTANT_SPEED_POLICY  0  /* at constant speed */
 #define STEP_SPEED_POLICY      1  /* use up/down arrays to describe policy */
 #define KERNEL_HELPER_POLICY   2  /* kernel as a helper to fan control */
 
index 0cf8622..ab03eb3 100644 (file)
@@ -56,7 +56,7 @@
                (0 << MIPS_SEGCFG_PA_SHIFT) |                           \
                (1 << MIPS_SEGCFG_EU_SHIFT)) << 16)
        or      t0, t2
-       mtc0    t0, $5, 2
+       mtc0    t0, CP0_SEGCTL0
 
        /* SegCtl1 */
        li      t0, ((MIPS_SEGCFG_MUSUK << MIPS_SEGCFG_AM_SHIFT) |      \
@@ -67,7 +67,7 @@
                (0 << MIPS_SEGCFG_PA_SHIFT) |                           \
                (1 << MIPS_SEGCFG_EU_SHIFT)) << 16)
        ins     t0, t1, 16, 3
-       mtc0    t0, $5, 3
+       mtc0    t0, CP0_SEGCTL1
 
        /* SegCtl2 */
        li      t0, ((MIPS_SEGCFG_MUSUK << MIPS_SEGCFG_AM_SHIFT) |      \
@@ -77,7 +77,7 @@
                (4 << MIPS_SEGCFG_PA_SHIFT) |                           \
                (1 << MIPS_SEGCFG_EU_SHIFT)) << 16)
        or      t0, t2
-       mtc0    t0, $5, 4
+       mtc0    t0, CP0_SEGCTL2
 
        jal     mips_ihb
        mfc0    t0, $16, 5
index f6ba004..aa4cca0 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Definitions and decalrations for MIPS MT support that are common between
+ * Definitions and declarations for MIPS MT support that are common between
  * the VSMP, and AP/SP kernel models.
  */
 #ifndef __ASM_MIPS_MT_H
index 25d0157..e1ca65c 100644 (file)
@@ -48,6 +48,9 @@
 #define CP0_CONF $3
 #define CP0_CONTEXT $4
 #define CP0_PAGEMASK $5
+#define CP0_SEGCTL0 $5, 2
+#define CP0_SEGCTL1 $5, 3
+#define CP0_SEGCTL2 $5, 4
 #define CP0_WIRED $6
 #define CP0_INFO $7
 #define CP0_HWRENA $7, 0
 #define MIPS_PWFIELD_PTEI_SHIFT        0
 #define MIPS_PWFIELD_PTEI_MASK 0x0000003f
 
+#define MIPS_PWSIZE_PS_SHIFT   30
+#define MIPS_PWSIZE_PS_MASK    0x40000000
 #define MIPS_PWSIZE_GDW_SHIFT  24
 #define MIPS_PWSIZE_GDW_MASK   0x3f000000
 #define MIPS_PWSIZE_UDW_SHIFT  18
 
 #define MIPS_PWCTL_PWEN_SHIFT  31
 #define MIPS_PWCTL_PWEN_MASK   0x80000000
+#define MIPS_PWCTL_XK_SHIFT    28
+#define MIPS_PWCTL_XK_MASK     0x10000000
+#define MIPS_PWCTL_XS_SHIFT    27
+#define MIPS_PWCTL_XS_MASK     0x08000000
+#define MIPS_PWCTL_XU_SHIFT    26
+#define MIPS_PWCTL_XU_MASK     0x04000000
 #define MIPS_PWCTL_DPH_SHIFT   7
 #define MIPS_PWCTL_DPH_MASK    0x00000080
 #define MIPS_PWCTL_HUGEPG_SHIFT        6
@@ -1045,6 +1056,33 @@ static inline int mm_insn_16bit(u16 insn)
        return (opcode >= 1 && opcode <= 3) ? 1 : 0;
 }
 
+/*
+ * Helper macros for generating raw instruction encodings in inline asm.
+ */
+#ifdef CONFIG_CPU_MICROMIPS
+#define _ASM_INSN16_IF_MM(_enc)                        \
+       ".insn\n\t"                             \
+       ".hword (" #_enc ")\n\t"
+#define _ASM_INSN32_IF_MM(_enc)                        \
+       ".insn\n\t"                             \
+       ".hword ((" #_enc ") >> 16)\n\t"        \
+       ".hword ((" #_enc ") & 0xffff)\n\t"
+#else
+#define _ASM_INSN_IF_MIPS(_enc)                        \
+       ".insn\n\t"                             \
+       ".word (" #_enc ")\n\t"
+#endif
+
+#ifndef _ASM_INSN16_IF_MM
+#define _ASM_INSN16_IF_MM(_enc)
+#endif
+#ifndef _ASM_INSN32_IF_MM
+#define _ASM_INSN32_IF_MM(_enc)
+#endif
+#ifndef _ASM_INSN_IF_MIPS
+#define _ASM_INSN_IF_MIPS(_enc)
+#endif
+
 /*
  * TLB Invalidate Flush
  */
@@ -1053,7 +1091,9 @@ static inline void tlbinvf(void)
        __asm__ __volatile__(
                ".set push\n\t"
                ".set noreorder\n\t"
-               ".word 0x42000004\n\t" /* tlbinvf */
+               "# tlbinvf\n\t"
+               _ASM_INSN_IF_MIPS(0x42000004)
+               _ASM_INSN32_IF_MM(0x0000537c)
                ".set pop");
 }
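For reference, on a microMIPS build (CONFIG_CPU_MICROMIPS set) only the _ASM_INSN32_IF_MM() variant expands to anything, so the tlbinvf() above assembles roughly as follows (a sketch derived from the macros in this hunk, not part of the patch):

        __asm__ __volatile__(
                ".set push\n\t"
                ".set noreorder\n\t"
                "# tlbinvf\n\t"
                ".insn\n\t"
                ".hword ((0x0000537c) >> 16)\n\t"       /* high half of the microMIPS encoding */
                ".hword ((0x0000537c) & 0xffff)\n\t"    /* low half */
                ".set pop");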
 
@@ -1274,9 +1314,9 @@ do {                                                                      \
        "       .set    push                                    \n"     \
        "       .set    noat                                    \n"     \
        "       .set    mips32r2                                \n"     \
-       "       .insn                                           \n"     \
        "       # mfhc0 $1, %1                                  \n"     \
-       "       .word   (0x40410000 | ((%1 & 0x1f) << 11))      \n"     \
+       _ASM_INSN_IF_MIPS(0x40410000 | ((%1 & 0x1f) << 11))             \
+       _ASM_INSN32_IF_MM(0x002000f4 | ((%1 & 0x1f) << 16))             \
        "       move    %0, $1                                  \n"     \
        "       .set    pop                                     \n"     \
        : "=r" (__res)                                                  \
@@ -1292,8 +1332,8 @@ do {                                                                      \
        "       .set    mips32r2                                \n"     \
        "       move    $1, %0                                  \n"     \
        "       # mthc0 $1, %1                                  \n"     \
-       "       .insn                                           \n"     \
-       "       .word   (0x40c10000 | ((%1 & 0x1f) << 11))      \n"     \
+       _ASM_INSN_IF_MIPS(0x40c10000 | ((%1 & 0x1f) << 11))             \
+       _ASM_INSN32_IF_MM(0x002002f4 | ((%1 & 0x1f) << 16))             \
        "       .set    pop                                     \n"     \
        :                                                               \
        : "r" (value), "i" (register));                                 \
@@ -1743,7 +1783,8 @@ do {                                                                      \
                ".set\tpush\n\t"                                        \
                ".set\tnoat\n\t"                                        \
                "# mfgc0\t$1, $%1, %2\n\t"                              \
-               ".word\t(0x40610000 | %1 << 11 | %2)\n\t"               \
+               _ASM_INSN_IF_MIPS(0x40610000 | %1 << 11 | %2)           \
+               _ASM_INSN32_IF_MM(0x002004fc | %1 << 16 | %2 << 11)     \
                "move\t%0, $1\n\t"                                      \
                ".set\tpop"                                             \
                : "=r" (__res)                                          \
@@ -1757,7 +1798,8 @@ do {                                                                      \
                ".set\tpush\n\t"                                        \
                ".set\tnoat\n\t"                                        \
                "# dmfgc0\t$1, $%1, %2\n\t"                             \
-               ".word\t(0x40610100 | %1 << 11 | %2)\n\t"               \
+               _ASM_INSN_IF_MIPS(0x40610100 | %1 << 11 | %2)           \
+               _ASM_INSN32_IF_MM(0x582004fc | %1 << 16 | %2 << 11)     \
                "move\t%0, $1\n\t"                                      \
                ".set\tpop"                                             \
                : "=r" (__res)                                          \
@@ -1770,9 +1812,10 @@ do {                                                                     \
        __asm__ __volatile__(                                           \
                ".set\tpush\n\t"                                        \
                ".set\tnoat\n\t"                                        \
-               "move\t$1, %0\n\t"                                      \
+               "move\t$1, %z0\n\t"                                     \
                "# mtgc0\t$1, $%1, %2\n\t"                              \
-               ".word\t(0x40610200 | %1 << 11 | %2)\n\t"               \
+               _ASM_INSN_IF_MIPS(0x40610200 | %1 << 11 | %2)           \
+               _ASM_INSN32_IF_MM(0x002006fc | %1 << 16 | %2 << 11)     \
                ".set\tpop"                                             \
                : : "Jr" ((unsigned int)(value)),                       \
                    "i" (register), "i" (sel));                         \
@@ -1783,9 +1826,10 @@ do {                                                                     \
        __asm__ __volatile__(                                           \
                ".set\tpush\n\t"                                        \
                ".set\tnoat\n\t"                                        \
-               "move\t$1, %0\n\t"                                      \
+               "move\t$1, %z0\n\t"                                     \
                "# dmtgc0\t$1, $%1, %2\n\t"                             \
-               ".word\t(0x40610300 | %1 << 11 | %2)\n\t"               \
+               _ASM_INSN_IF_MIPS(0x40610300 | %1 << 11 | %2)           \
+               _ASM_INSN32_IF_MM(0x582006fc | %1 << 16 | %2 << 11)     \
                ".set\tpop"                                             \
                : : "Jr" (value),                                       \
                    "i" (register), "i" (sel));                         \
@@ -2246,7 +2290,6 @@ do {                                                                      \
 
 #else
 
-#ifdef CONFIG_CPU_MICROMIPS
 #define rddsp(mask)                                                    \
 ({                                                                     \
        unsigned int __res;                                             \
@@ -2255,8 +2298,8 @@ do {                                                                      \
        "       .set    push                                    \n"     \
        "       .set    noat                                    \n"     \
        "       # rddsp $1, %x1                                 \n"     \
-       "       .hword  ((0x0020067c | (%x1 << 14)) >> 16)      \n"     \
-       "       .hword  ((0x0020067c | (%x1 << 14)) & 0xffff)   \n"     \
+       _ASM_INSN_IF_MIPS(0x7c000cb8 | (%x1 << 16))                     \
+       _ASM_INSN32_IF_MM(0x0020067c | (%x1 << 14))                     \
        "       move    %0, $1                                  \n"     \
        "       .set    pop                                     \n"     \
        : "=r" (__res)                                                  \
@@ -2271,22 +2314,22 @@ do {                                                                    \
        "       .set    noat                                    \n"     \
        "       move    $1, %0                                  \n"     \
        "       # wrdsp $1, %x1                                 \n"     \
-       "       .hword  ((0x0020167c | (%x1 << 14)) >> 16)      \n"     \
-       "       .hword  ((0x0020167c | (%x1 << 14)) & 0xffff)   \n"     \
+       _ASM_INSN_IF_MIPS(0x7c2004f8 | (%x1 << 11))                     \
+       _ASM_INSN32_IF_MM(0x0020167c | (%x1 << 14))                     \
        "       .set    pop                                     \n"     \
        :                                                               \
        : "r" (val), "i" (mask));                                       \
 } while (0)
 
-#define _umips_dsp_mfxxx(ins)                                          \
+#define _dsp_mfxxx(ins)                                                        \
 ({                                                                     \
        unsigned long __treg;                                           \
                                                                        \
        __asm__ __volatile__(                                           \
        "       .set    push                                    \n"     \
        "       .set    noat                                    \n"     \
-       "       .hword  0x0001                                  \n"     \
-       "       .hword  %x1                                     \n"     \
+       _ASM_INSN_IF_MIPS(0x00000810 | %X1)                             \
+       _ASM_INSN32_IF_MM(0x0001007c | %x1)                             \
        "       move    %0, $1                                  \n"     \
        "       .set    pop                                     \n"     \
        : "=r" (__treg)                                                 \
@@ -2294,101 +2337,28 @@ do {                                                                   \
        __treg;                                                         \
 })
 
-#define _umips_dsp_mtxxx(val, ins)                                     \
+#define _dsp_mtxxx(val, ins)                                           \
 do {                                                                   \
        __asm__ __volatile__(                                           \
        "       .set    push                                    \n"     \
        "       .set    noat                                    \n"     \
        "       move    $1, %0                                  \n"     \
-       "       .hword  0x0001                                  \n"     \
-       "       .hword  %x1                                     \n"     \
+       _ASM_INSN_IF_MIPS(0x00200011 | %X1)                             \
+       _ASM_INSN32_IF_MM(0x0001207c | %x1)                             \
        "       .set    pop                                     \n"     \
        :                                                               \
        : "r" (val), "i" (ins));                                        \
 } while (0)
 
-#define _umips_dsp_mflo(reg) _umips_dsp_mfxxx((reg << 14) | 0x107c)
-#define _umips_dsp_mfhi(reg) _umips_dsp_mfxxx((reg << 14) | 0x007c)
-
-#define _umips_dsp_mtlo(val, reg) _umips_dsp_mtxxx(val, ((reg << 14) | 0x307c))
-#define _umips_dsp_mthi(val, reg) _umips_dsp_mtxxx(val, ((reg << 14) | 0x207c))
-
-#define mflo0() _umips_dsp_mflo(0)
-#define mflo1() _umips_dsp_mflo(1)
-#define mflo2() _umips_dsp_mflo(2)
-#define mflo3() _umips_dsp_mflo(3)
-
-#define mfhi0() _umips_dsp_mfhi(0)
-#define mfhi1() _umips_dsp_mfhi(1)
-#define mfhi2() _umips_dsp_mfhi(2)
-#define mfhi3() _umips_dsp_mfhi(3)
+#ifdef CONFIG_CPU_MICROMIPS
 
-#define mtlo0(x) _umips_dsp_mtlo(x, 0)
-#define mtlo1(x) _umips_dsp_mtlo(x, 1)
-#define mtlo2(x) _umips_dsp_mtlo(x, 2)
-#define mtlo3(x) _umips_dsp_mtlo(x, 3)
+#define _dsp_mflo(reg) _dsp_mfxxx((reg << 14) | 0x1000)
+#define _dsp_mfhi(reg) _dsp_mfxxx((reg << 14) | 0x0000)
 
-#define mthi0(x) _umips_dsp_mthi(x, 0)
-#define mthi1(x) _umips_dsp_mthi(x, 1)
-#define mthi2(x) _umips_dsp_mthi(x, 2)
-#define mthi3(x) _umips_dsp_mthi(x, 3)
+#define _dsp_mtlo(val, reg) _dsp_mtxxx(val, ((reg << 14) | 0x1000))
+#define _dsp_mthi(val, reg) _dsp_mtxxx(val, ((reg << 14) | 0x0000))
 
 #else  /* !CONFIG_CPU_MICROMIPS */
-#define rddsp(mask)                                                    \
-({                                                                     \
-       unsigned int __res;                                             \
-                                                                       \
-       __asm__ __volatile__(                                           \
-       "       .set    push                            \n"             \
-       "       .set    noat                            \n"             \
-       "       # rddsp $1, %x1                         \n"             \
-       "       .word   0x7c000cb8 | (%x1 << 16)        \n"             \
-       "       move    %0, $1                          \n"             \
-       "       .set    pop                             \n"             \
-       : "=r" (__res)                                                  \
-       : "i" (mask));                                                  \
-       __res;                                                          \
-})
-
-#define wrdsp(val, mask)                                               \
-do {                                                                   \
-       __asm__ __volatile__(                                           \
-       "       .set    push                                    \n"     \
-       "       .set    noat                                    \n"     \
-       "       move    $1, %0                                  \n"     \
-       "       # wrdsp $1, %x1                                 \n"     \
-       "       .word   0x7c2004f8 | (%x1 << 11)                \n"     \
-       "       .set    pop                                     \n"     \
-        :                                                              \
-       : "r" (val), "i" (mask));                                       \
-} while (0)
-
-#define _dsp_mfxxx(ins)                                                        \
-({                                                                     \
-       unsigned long __treg;                                           \
-                                                                       \
-       __asm__ __volatile__(                                           \
-       "       .set    push                                    \n"     \
-       "       .set    noat                                    \n"     \
-       "       .word   (0x00000810 | %1)                       \n"     \
-       "       move    %0, $1                                  \n"     \
-       "       .set    pop                                     \n"     \
-       : "=r" (__treg)                                                 \
-       : "i" (ins));                                                   \
-       __treg;                                                         \
-})
-
-#define _dsp_mtxxx(val, ins)                                           \
-do {                                                                   \
-       __asm__ __volatile__(                                           \
-       "       .set    push                                    \n"     \
-       "       .set    noat                                    \n"     \
-       "       move    $1, %0                                  \n"     \
-       "       .word   (0x00200011 | %1)                       \n"     \
-       "       .set    pop                                     \n"     \
-       :                                                               \
-       : "r" (val), "i" (ins));                                        \
-} while (0)
 
 #define _dsp_mflo(reg) _dsp_mfxxx((reg << 21) | 0x0002)
 #define _dsp_mfhi(reg) _dsp_mfxxx((reg << 21) | 0x0000)
@@ -2396,6 +2366,8 @@ do {                                                                      \
 #define _dsp_mtlo(val, reg) _dsp_mtxxx(val, ((reg << 11) | 0x0002))
 #define _dsp_mthi(val, reg) _dsp_mtxxx(val, ((reg << 11) | 0x0000))
 
+#endif /* CONFIG_CPU_MICROMIPS */
+
 #define mflo0() _dsp_mflo(0)
 #define mflo1() _dsp_mflo(1)
 #define mflo2() _dsp_mflo(2)
@@ -2416,7 +2388,6 @@ do {                                                                      \
 #define mthi2(x) _dsp_mthi(x, 2)
 #define mthi3(x) _dsp_mthi(x, 3)
 
-#endif /* CONFIG_CPU_MICROMIPS */
 #endif
 
 /*
@@ -2556,28 +2527,32 @@ static inline void guest_tlb_probe(void)
 {
        __asm__ __volatile__(
                "# tlbgp\n\t"
-               ".word 0x42000010");
+               _ASM_INSN_IF_MIPS(0x42000010)
+               _ASM_INSN32_IF_MM(0x0000017c));
 }
 
 static inline void guest_tlb_read(void)
 {
        __asm__ __volatile__(
                "# tlbgr\n\t"
-               ".word 0x42000009");
+               _ASM_INSN_IF_MIPS(0x42000009)
+               _ASM_INSN32_IF_MM(0x0000117c));
 }
 
 static inline void guest_tlb_write_indexed(void)
 {
        __asm__ __volatile__(
                "# tlbgwi\n\t"
-               ".word 0x4200000a");
+               _ASM_INSN_IF_MIPS(0x4200000a)
+               _ASM_INSN32_IF_MM(0x0000217c));
 }
 
 static inline void guest_tlb_write_random(void)
 {
        __asm__ __volatile__(
                "# tlbgwr\n\t"
-               ".word 0x4200000e");
+               _ASM_INSN_IF_MIPS(0x4200000e)
+               _ASM_INSN32_IF_MM(0x0000317c));
 }
 
 /*
@@ -2587,7 +2562,8 @@ static inline void guest_tlbinvf(void)
 {
        __asm__ __volatile__(
                "# tlbginvf\n\t"
-               ".word 0x4200000c");
+               _ASM_INSN_IF_MIPS(0x4200000c)
+               _ASM_INSN32_IF_MM(0x0000517c));
 }
 
 #endif /* !TOOLCHAIN_SUPPORTS_VIRT */
index 6e4effa..ddf496c 100644 (file)
@@ -192,13 +192,6 @@ static inline void write_msa_##name(unsigned int val)              \
  * allow compilation with toolchains that do not support MSA. Once all
  * toolchains in use support MSA these can be removed.
  */
-#ifdef CONFIG_CPU_MICROMIPS
-#define CFC_MSA_INSN   0x587e0056
-#define CTC_MSA_INSN   0x583e0816
-#else
-#define CFC_MSA_INSN   0x787e0059
-#define CTC_MSA_INSN   0x783e0819
-#endif
 
 #define __BUILD_MSA_CTL_REG(name, cs)                          \
 static inline unsigned int read_msa_##name(void)               \
@@ -207,11 +200,12 @@ static inline unsigned int read_msa_##name(void)          \
        __asm__ __volatile__(                                   \
        "       .set    push\n"                                 \
        "       .set    noat\n"                                 \
-       "       .insn\n"                                        \
-       "       .word   %1 | (" #cs " << 11)\n"                 \
+       "       # cfcmsa $1, $%1\n"                             \
+       _ASM_INSN_IF_MIPS(0x787e0059 | %1 << 11)                \
+       _ASM_INSN32_IF_MM(0x587e0056 | %1 << 11)                \
        "       move    %0, $1\n"                               \
        "       .set    pop\n"                                  \
-       : "=r"(reg) : "i"(CFC_MSA_INSN));                       \
+       : "=r"(reg) : "i"(cs));                                 \
        return reg;                                             \
 }                                                              \
                                                                \
@@ -221,10 +215,11 @@ static inline void write_msa_##name(unsigned int val)             \
        "       .set    push\n"                                 \
        "       .set    noat\n"                                 \
        "       move    $1, %0\n"                               \
-       "       .insn\n"                                        \
-       "       .word   %1 | (" #cs " << 6)\n"                  \
+       "       # ctcmsa $%1, $1\n"                             \
+       _ASM_INSN_IF_MIPS(0x783e0819 | %1 << 6)                 \
+       _ASM_INSN32_IF_MM(0x583e0816 | %1 << 6)                 \
        "       .set    pop\n"                                  \
-       : : "r"(val), "i"(CTC_MSA_INSN));                       \
+       : : "r"(val), "i"(cs));                                 \
 }
 
 #endif /* !TOOLCHAIN_SUPPORTS_MSA */
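For reference, the rebuilt read accessor assembles roughly as below on a classic-MIPS kernel when the control-register index is 1 (an assumed example value; the raw word 0x787e0059 | cs << 11 comes straight from the hunk above):

        unsigned int reg;

        __asm__ __volatile__(
        "       .set    push\n"
        "       .set    noat\n"
        "       # cfcmsa $1, $1\n"
        "       .insn\n\t"
        "       .word (0x787e0059 | 1 << 11)\n\t"       /* assembles to 0x787e0859 */
        "       move    %0, $1\n"
        "       .set    pop\n"
        : "=r"(reg));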
index 8d05d90..a07a36f 100644 (file)
@@ -146,7 +146,7 @@ typedef struct {
  * This structure contains the global state of all command queues.
  * It is stored in a bootmem named block and shared by all
  * applications running on Octeon. Tickets are stored in a different
- * cahce line that queue information to reduce the contention on the
+ * cache line than queue information to reduce the contention on the
  * ll/sc used to get a ticket. If this is not the case, the update
  * of queue state causes the ll/sc to fail quite often.
  */
index 8933203..cda93ae 100644 (file)
@@ -94,7 +94,7 @@ extern int cvmx_helper_board_get_mii_address(int ipd_port);
  * @phy_addr:  The address of the PHY to program
  * @link_flags:
  *                 Flags to control autonegotiation.  Bit 0 is autonegotiation
- *                 enable/disable to maintain backware compatibility.
+ *                 enable/disable to maintain backward compatibility.
  * @link_info: Link speed to program. If the speed is zero and autonegotiation
  *                 is enabled, all possible negotiation speeds are advertised.
  *
index e13490e..cbdc14b 100644 (file)
@@ -39,7 +39,7 @@
 
 enum cvmx_ipd_mode {
    CVMX_IPD_OPC_MODE_STT = 0LL,          /* All blocks DRAM, not cached in L2 */
-   CVMX_IPD_OPC_MODE_STF = 1LL,          /* All bloccks into  L2 */
+   CVMX_IPD_OPC_MODE_STF = 1LL,          /* All blocks into  L2 */
    CVMX_IPD_OPC_MODE_STF1_STT = 2LL,   /* 1st block L2, rest DRAM */
    CVMX_IPD_OPC_MODE_STF2_STT = 3LL    /* 1st, 2nd blocks L2, rest DRAM */
 };
index 5153156..410bb70 100644 (file)
@@ -2051,7 +2051,7 @@ static inline void cvmx_pow_tag_sw_desched(uint32_t tag,
 }
 
 /**
- * Descchedules the current work queue entry.
+ * Deschedules the current work queue entry.
  *
  * @no_sched: no schedule flag value to be set on the work queue
  *           entry.  If this is set the entry will not be
index 4a9c990..c0e3dc0 100644 (file)
@@ -39,7 +39,7 @@ struct hpc3_pbus_dmacregs {
        volatile u32 pbdma_dptr;        /* pbus dma channel desc ptr */
        u32 _unused0[0x1000/4 - 2];     /* padding */
        volatile u32 pbdma_ctrl;        /* pbus dma channel control register has
-                                        * copletely different meaning for read
+                                        * completely different meaning for read
                                         * compared with write */
        /* read */
 #define HPC3_PDMACTRL_INT      0x00000001 /* interrupt (cleared after read) */
index 4e3f9b7..258fd03 100644 (file)
 #define QI_LB60_GPIO_KEYIN8            JZ_GPIO_PORTD(26)
 
 /* NAND */
-static struct nand_ecclayout qi_lb60_ecclayout_1gb = {
-       .eccbytes = 36,
-       .eccpos = {
-               6,  7,  8,  9,  10, 11, 12, 13,
-               14, 15, 16, 17, 18, 19, 20, 21,
-               22, 23, 24, 25, 26, 27, 28, 29,
-               30, 31, 32, 33, 34, 35, 36, 37,
-               38, 39, 40, 41
-       },
-       .oobfree = {
-               { .offset = 2, .length = 4 },
-               { .offset = 42, .length = 22 }
-       },
-};
 
 /* Early prototypes of the QI LB60 had only 1GB of NAND.
  * In order to support these devices as well the partition and ecc layout is
@@ -84,25 +70,6 @@ static struct mtd_partition qi_lb60_partitions_1gb[] = {
        },
 };
 
-static struct nand_ecclayout qi_lb60_ecclayout_2gb = {
-       .eccbytes = 72,
-       .eccpos = {
-               12, 13, 14, 15, 16, 17, 18, 19,
-               20, 21, 22, 23, 24, 25, 26, 27,
-               28, 29, 30, 31, 32, 33, 34, 35,
-               36, 37, 38, 39, 40, 41, 42, 43,
-               44, 45, 46, 47, 48, 49, 50, 51,
-               52, 53, 54, 55, 56, 57, 58, 59,
-               60, 61, 62, 63, 64, 65, 66, 67,
-               68, 69, 70, 71, 72, 73, 74, 75,
-               76, 77, 78, 79, 80, 81, 82, 83
-       },
-       .oobfree = {
-               { .offset = 2, .length = 10 },
-               { .offset = 84, .length = 44 },
-       },
-};
-
 static struct mtd_partition qi_lb60_partitions_2gb[] = {
        {
                .name = "NAND BOOT partition",
@@ -121,19 +88,67 @@ static struct mtd_partition qi_lb60_partitions_2gb[] = {
        },
 };
 
+static int qi_lb60_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                struct mtd_oob_region *oobregion)
+{
+       if (section)
+               return -ERANGE;
+
+       oobregion->length = 36;
+       oobregion->offset = 6;
+
+       if (mtd->oobsize == 128) {
+               oobregion->length *= 2;
+               oobregion->offset *= 2;
+       }
+
+       return 0;
+}
+
+static int qi_lb60_ooblayout_free(struct mtd_info *mtd, int section,
+                                 struct mtd_oob_region *oobregion)
+{
+       int eccbytes = 36, eccoff = 6;
+
+       if (section > 1)
+               return -ERANGE;
+
+       if (mtd->oobsize == 128) {
+               eccbytes *= 2;
+               eccoff *= 2;
+       }
+
+       if (!section) {
+               oobregion->offset = 2;
+               oobregion->length = eccoff - 2;
+       } else {
+               oobregion->offset = eccoff + eccbytes;
+               oobregion->length = mtd->oobsize - oobregion->offset;
+       }
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops qi_lb60_ooblayout_ops = {
+       .ecc = qi_lb60_ooblayout_ecc,
+       .free = qi_lb60_ooblayout_free,
+};
+
 static void qi_lb60_nand_ident(struct platform_device *pdev,
-               struct nand_chip *chip, struct mtd_partition **partitions,
+               struct mtd_info *mtd, struct mtd_partition **partitions,
                int *num_partitions)
 {
+       struct nand_chip *chip = mtd_to_nand(mtd);
+
        if (chip->page_shift == 12) {
-               chip->ecc.layout = &qi_lb60_ecclayout_2gb;
                *partitions = qi_lb60_partitions_2gb;
                *num_partitions = ARRAY_SIZE(qi_lb60_partitions_2gb);
        } else {
-               chip->ecc.layout = &qi_lb60_ecclayout_1gb;
                *partitions = qi_lb60_partitions_1gb;
                *num_partitions = ARRAY_SIZE(qi_lb60_partitions_1gb);
        }
+
+       mtd_set_ooblayout(mtd, &qi_lb60_ooblayout_ops);
 }
 
 static struct jz_nand_platform_data qi_lb60_nand_pdata = {
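A short usage sketch (caller code assumed, not part of this patch) of how the MTD core now queries the layout through the registered mtd_ooblayout_ops instead of a static nand_ecclayout table:

        struct mtd_oob_region region;
        int ret;

        /* section 0 of the ECC layout: {offset 6, len 36} on a 64-byte OOB
         * device, doubled to {12, 72} once mtd->oobsize == 128 */
        ret = mtd_ooblayout_ecc(mtd, 0, &region);
        if (ret)
                return ret;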
index ceca6cc..6dc3f1f 100644 (file)
@@ -481,7 +481,7 @@ int __compute_return_epc_for_insn(struct pt_regs *regs,
                        /*
                         * OK we are here either because we hit a NAL
                         * instruction or because we are emulating an
-                        * old bltzal{,l} one. Lets figure out what the
+                        * old bltzal{,l} one. Let's figure out what the
                         * case really is.
                         */
                        if (!insn.i_format.rs) {
@@ -515,7 +515,7 @@ int __compute_return_epc_for_insn(struct pt_regs *regs,
                        /*
                         * OK we are here either because we hit a BAL
                         * instruction or because we are emulating an
-                        * old bgezal{,l} one. Lets figure out what the
+                        * old bgezal{,l} one. Let's figure out what the
                         * case really is.
                         */
                        if (!insn.i_format.rs) {
index 51b98dc..59476a6 100644 (file)
@@ -441,6 +441,21 @@ LEAF(mips_cps_boot_vpes)
        mfc0    t0, CP0_CONFIG
        mttc0   t0, CP0_CONFIG
 
+       /*
+        * Copy the EVA config from this VPE if the CPU supports it.
+        * CONFIG3 must exist to be running MT startup - just read it.
+        */
+       mfc0    t0, CP0_CONFIG, 3
+       and     t0, t0, MIPS_CONF3_SC
+       beqz    t0, 3f
+        nop
+       mfc0    t0, CP0_SEGCTL0
+       mttc0   t0, CP0_SEGCTL0
+       mfc0    t0, CP0_SEGCTL1
+       mttc0   t0, CP0_SEGCTL1
+       mfc0    t0, CP0_SEGCTL2
+       mttc0   t0, CP0_SEGCTL2
+3:
        /* Ensure no software interrupts are pending */
        mttc0   zero, CP0_CAUSE
        mttc0   zero, CP0_STATUS
index 5ac5c3e..a88d442 100644 (file)
@@ -833,10 +833,8 @@ static inline unsigned int decode_config5(struct cpuinfo_mips *c)
                c->options |= MIPS_CPU_MAAR;
        if (config5 & MIPS_CONF5_LLB)
                c->options |= MIPS_CPU_RW_LLB;
-#ifdef CONFIG_XPA
        if (config5 & MIPS_CONF5_MVH)
-               c->options |= MIPS_CPU_XPA;
-#endif
+               c->options |= MIPS_CPU_MVH;
        if (cpu_has_mips_r6 && (config5 & MIPS_CONF5_VP))
                c->options |= MIPS_CPU_VP;
 
index c3c234d..891f5ee 100644 (file)
@@ -88,7 +88,7 @@ int arch_elf_pt_proc(void *_ehdr, void *_phdr, struct file *elf,
        elf32 = ehdr->e32.e_ident[EI_CLASS] == ELFCLASS32;
        flags = elf32 ? ehdr->e32.e_flags : ehdr->e64.e_flags;
 
-       /* Lets see if this is an O32 ELF */
+       /* Let's see if this is an O32 ELF */
        if (elf32) {
                if (flags & EF_MIPS_FP64) {
                        /*
index 8eb5af8..f25f7ea 100644 (file)
@@ -54,6 +54,9 @@ void __init init_IRQ(void)
        for (i = 0; i < NR_IRQS; i++)
                irq_set_noprobe(i);
 
+       if (cpu_has_veic)
+               clear_c0_status(ST0_IM);
+
        arch_init_irq();
 }
 
index 625ee77..7ff2a55 100644 (file)
@@ -2202,7 +2202,7 @@ fpu_emul:
        }
 
        /*
-        * Lets not return to userland just yet. It's constly and
+        * Let's not return to userland just yet. It's costly and
         * it's likely we have more R2 instructions to emulate
         */
        if (!err && (pass++ < MIPS_R2_EMUL_TOTAL_PASS)) {
index 5021c54..d64056e 100644 (file)
@@ -25,8 +25,8 @@
  * the user stack callchains, we will add it here.
  */
 
-static void save_raw_perf_callchain(struct perf_callchain_entry *entry,
-       unsigned long reg29)
+static void save_raw_perf_callchain(struct perf_callchain_entry_ctx *entry,
+                                   unsigned long reg29)
 {
        unsigned long *sp = (unsigned long *)reg29;
        unsigned long addr;
@@ -35,14 +35,14 @@ static void save_raw_perf_callchain(struct perf_callchain_entry *entry,
                addr = *sp++;
                if (__kernel_text_address(addr)) {
                        perf_callchain_store(entry, addr);
-                       if (entry->nr >= sysctl_perf_event_max_stack)
+                       if (entry->nr >= entry->max_stack)
                                break;
                }
        }
 }
 
-void perf_callchain_kernel(struct perf_callchain_entry *entry,
-                     struct pt_regs *regs)
+void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
+                          struct pt_regs *regs)
 {
        unsigned long sp = regs->regs[29];
 #ifdef CONFIG_KALLSYMS
@@ -59,7 +59,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
        }
        do {
                perf_callchain_store(entry, pc);
-               if (entry->nr >= sysctl_perf_event_max_stack)
+               if (entry->nr >= entry->max_stack)
                        break;
                pc = unwind_stack(current, &sp, pc, &ra);
        } while (pc);
index 411c971..813ed78 100644 (file)
@@ -345,7 +345,7 @@ static int get_frame_info(struct mips_frame_info *info)
                return 0;
        if (info->pc_offset < 0) /* leaf */
                return 1;
-       /* prologue seems boggus... */
+       /* prologue seems bogus... */
 err:
        return -1;
 }
index ab04229..ae42314 100644 (file)
@@ -770,15 +770,7 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs)
        sigset_t *oldset = sigmask_to_save();
        int ret;
        struct mips_abi *abi = current->thread.abi;
-#ifdef CONFIG_CPU_MICROMIPS
-       void *vdso;
-       unsigned long tmp = (unsigned long)current->mm->context.vdso;
-
-       set_isa16_mode(tmp);
-       vdso = (void *)tmp;
-#else
        void *vdso = current->mm->context.vdso;
-#endif
 
        if (regs->regs[0]) {
                switch(regs->regs[2]) {
index 1061bd2..4ed36f2 100644 (file)
@@ -359,8 +359,12 @@ static void cps_init_secondary(void)
                BUG_ON(ident != mips_cm_vp_id(smp_processor_id()));
        }
 
-       change_c0_status(ST0_IM, STATUSF_IP2 | STATUSF_IP3 | STATUSF_IP4 |
-                                STATUSF_IP5 | STATUSF_IP6 | STATUSF_IP7);
+       if (cpu_has_veic)
+               clear_c0_status(ST0_IM);
+       else
+               change_c0_status(ST0_IM, STATUSF_IP2 | STATUSF_IP3 |
+                                        STATUSF_IP4 | STATUSF_IP5 |
+                                        STATUSF_IP6 | STATUSF_IP7);
 }
 
 static void cps_smp_finish(void)
index b420958..27533c1 100644 (file)
@@ -43,7 +43,7 @@ static int pvc_line_proc_show(struct seq_file *m, void *v)
 {
        int lineno = *(int *)m->private;
 
-       if (lineno < 0 || lineno > PVC_NLINES) {
+       if (lineno < 0 || lineno >= PVC_NLINES) {
                printk(KERN_WARNING "proc_read_line: invalid lineno %d\n", lineno);
                return 0;
        }
@@ -67,7 +67,7 @@ static ssize_t pvc_line_proc_write(struct file *file, const char __user *buf,
        char kbuf[PVC_LINELEN];
        size_t len;
 
-       BUG_ON(lineno < 0 || lineno > PVC_NLINES);
+       BUG_ON(lineno < 0 || lineno >= PVC_NLINES);
 
        len = min(count, sizeof(kbuf) - 1);
        if (copy_from_user(kbuf, buf, len))
index beb80f3..927dc94 100644 (file)
@@ -2,7 +2,7 @@
 
 #include "libgcc.h"
 
-long long __ashldi3(long long u, word_type b)
+long long notrace __ashldi3(long long u, word_type b)
 {
        DWunion uu, w;
        word_type bm;
index c884a91..9fdf1a5 100644 (file)
@@ -2,7 +2,7 @@
 
 #include "libgcc.h"
 
-long long __ashrdi3(long long u, word_type b)
+long long notrace __ashrdi3(long long u, word_type b)
 {
        DWunion uu, w;
        word_type bm;
index 77e5f9c..e3e77aa 100644 (file)
@@ -1,6 +1,6 @@
 #include <linux/module.h>
 
-unsigned long long __bswapdi2(unsigned long long u)
+unsigned long long notrace __bswapdi2(unsigned long long u)
 {
        return (((u) & 0xff00000000000000ull) >> 56) |
               (((u) & 0x00ff000000000000ull) >> 40) |
index 2b302ff..530a8af 100644 (file)
@@ -1,6 +1,6 @@
 #include <linux/module.h>
 
-unsigned int __bswapsi2(unsigned int u)
+unsigned int notrace __bswapsi2(unsigned int u)
 {
        return (((u) & 0xff000000) >> 24) |
               (((u) & 0x00ff0000) >>  8) |
index 8c13064..06857da 100644 (file)
@@ -2,7 +2,7 @@
 
 #include "libgcc.h"
 
-word_type __cmpdi2(long long a, long long b)
+word_type notrace __cmpdi2(long long a, long long b)
 {
        const DWunion au = {
                .ll = a
index dcf8d68..3645474 100644 (file)
@@ -2,7 +2,7 @@
 
 #include "libgcc.h"
 
-long long __lshrdi3(long long u, word_type b)
+long long notrace __lshrdi3(long long u, word_type b)
 {
        DWunion uu, w;
        word_type bm;
index 9245e17..6c303a9 100644 (file)
 
        /*
         * Macro to build the __copy_user common code
-        * Arguements:
+        * Arguments:
         * mode : LEGACY_MODE or EVA_MODE
         * from : Source operand. USEROP or KERNELOP
         * to   : Destination operand. USEROP or KERNELOP
index bb4cb2f..bd599f5 100644 (file)
@@ -2,7 +2,7 @@
 
 #include "libgcc.h"
 
-word_type __ucmpdi2(unsigned long long a, unsigned long long b)
+word_type notrace __ucmpdi2(unsigned long long a, unsigned long long b)
 {
        const DWunion au = {.ll = a};
        const DWunion bu = {.ll = b};
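The notrace additions above keep these libgcc-style helpers out of the function tracer, so ftrace cannot recurse into a routine it may itself need for 64-bit shifts and compares on 32-bit MIPS; notrace is, roughly (paraphrased from include/linux/compiler.h), just the no-instrument attribute:

        /* paraphrased from include/linux/compiler.h */
        #define notrace __attribute__((no_instrument_function))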
index a2631a5..249039a 100644 (file)
@@ -212,7 +212,7 @@ static void hpet_setup(void)
        /* set hpet base address */
        smbus_write(SMBUS_PCI_REGB4, HPET_ADDR);
 
-       /* enable decodeing of access to HPET MMIO*/
+       /* enable decoding of access to HPET MMIO */
        smbus_enable(SMBUS_PCI_REG40, (1 << 28));
 
        /* HPET irq enable */
index d4ceacd..4707488 100644 (file)
@@ -8,7 +8,7 @@
 #include "ieee754.h"
 
 /*
- * Emulate the arbritrary instruction ir at xcp->cp0_epc.  Required when
+ * Emulate the arbitrary instruction ir at xcp->cp0_epc.  Required when
  * we have to emulate the instruction in a COP1 branch delay slot.  Do
  * not change cp0_epc due to the instruction
  *
@@ -88,7 +88,7 @@ int mips_dsemul(struct pt_regs *regs, mips_instruction ir, unsigned long cpc)
        fr = (struct emuframe __user *)
                ((regs->regs[29] - sizeof(struct emuframe)) & ~0x7);
 
-       /* Verify that the stack pointer is not competely insane */
+       /* Verify that the stack pointer is not completely insane */
        if (unlikely(!access_ok(VERIFY_WRITE, fr, sizeof(struct emuframe))))
                return SIGBUS;
 
index 274da90..4004b65 100644 (file)
@@ -2361,8 +2361,9 @@ static void print_htw_config(void)
                (config & MIPS_PWFIELD_PTEI_MASK) >> MIPS_PWFIELD_PTEI_SHIFT);
 
        config = read_c0_pwsize();
-       pr_debug("PWSize  (0x%0*lx): GDW: 0x%02lx  UDW: 0x%02lx  MDW: 0x%02lx  PTW: 0x%02lx  PTEW: 0x%02lx\n",
+       pr_debug("PWSize  (0x%0*lx): PS: 0x%lx  GDW: 0x%02lx  UDW: 0x%02lx  MDW: 0x%02lx  PTW: 0x%02lx  PTEW: 0x%02lx\n",
                field, config,
+               (config & MIPS_PWSIZE_PS_MASK) >> MIPS_PWSIZE_PS_SHIFT,
                (config & MIPS_PWSIZE_GDW_MASK) >> MIPS_PWSIZE_GDW_SHIFT,
                (config & MIPS_PWSIZE_UDW_MASK) >> MIPS_PWSIZE_UDW_SHIFT,
                (config & MIPS_PWSIZE_MDW_MASK) >> MIPS_PWSIZE_MDW_SHIFT,
@@ -2370,9 +2371,12 @@ static void print_htw_config(void)
                (config & MIPS_PWSIZE_PTEW_MASK) >> MIPS_PWSIZE_PTEW_SHIFT);
 
        pwctl = read_c0_pwctl();
-       pr_debug("PWCtl   (0x%x): PWEn: 0x%x  DPH: 0x%x  HugePg: 0x%x  Psn: 0x%x\n",
+       pr_debug("PWCtl   (0x%x): PWEn: 0x%x  XK: 0x%x  XS: 0x%x  XU: 0x%x  DPH: 0x%x  HugePg: 0x%x  Psn: 0x%x\n",
                pwctl,
                (pwctl & MIPS_PWCTL_PWEN_MASK) >> MIPS_PWCTL_PWEN_SHIFT,
+               (pwctl & MIPS_PWCTL_XK_MASK) >> MIPS_PWCTL_XK_SHIFT,
+               (pwctl & MIPS_PWCTL_XS_MASK) >> MIPS_PWCTL_XS_SHIFT,
+               (pwctl & MIPS_PWCTL_XU_MASK) >> MIPS_PWCTL_XU_SHIFT,
                (pwctl & MIPS_PWCTL_DPH_MASK) >> MIPS_PWCTL_DPH_SHIFT,
                (pwctl & MIPS_PWCTL_HUGEPG_MASK) >> MIPS_PWCTL_HUGEPG_SHIFT,
                (pwctl & MIPS_PWCTL_PSN_MASK) >> MIPS_PWCTL_PSN_SHIFT);
@@ -2427,15 +2431,25 @@ static void config_htw_params(void)
        if (CONFIG_PGTABLE_LEVELS >= 3)
                pwsize |= ilog2(PTRS_PER_PMD) << MIPS_PWSIZE_MDW_SHIFT;
 
-       pwsize |= ilog2(sizeof(pte_t)/4) << MIPS_PWSIZE_PTEW_SHIFT;
+       /* Set pointer size to size of directory pointers */
+       if (config_enabled(CONFIG_64BIT))
+               pwsize |= MIPS_PWSIZE_PS_MASK;
+       /* PTEs may be multiple pointers long (e.g. with XPA) */
+       pwsize |= ((PTE_T_LOG2 - PGD_T_LOG2) << MIPS_PWSIZE_PTEW_SHIFT)
+                       & MIPS_PWSIZE_PTEW_MASK;
 
        write_c0_pwsize(pwsize);
 
        /* Make sure everything is set before we enable the HTW */
        back_to_back_c0_hazard();
 
-       /* Enable HTW and disable the rest of the pwctl fields */
+       /*
+        * Enable HTW (and only for XUSeg on 64-bit), and disable the rest of
+        * the pwctl fields.
+        */
        config = 1 << MIPS_PWCTL_PWEN_SHIFT;
+       if (config_enabled(CONFIG_64BIT))
+               config |= MIPS_PWCTL_XU_MASK;
        write_c0_pwctl(config);
        pr_info("Hardware Page Table Walker enabled\n");
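A worked example for the PTEW computation above, with sizes assumed for illustration: under 32-bit XPA a PTE occupies 8 bytes while directory pointers are 4 bytes, so PTE_T_LOG2 = 3 and PGD_T_LOG2 = 2, and the field written tells the walker that each PTE spans two pointer-sized slots:

        /* PTE_T_LOG2 - PGD_T_LOG2 == 3 - 2 == 1 under the assumption above */
        pwsize |= ((3 - 2) << MIPS_PWSIZE_PTEW_SHIFT) & MIPS_PWSIZE_PTEW_MASK;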
 
index 7c2da27..a4e758a 100644 (file)
@@ -24,7 +24,7 @@ struct op_counter_config {
        unsigned long unit_mask;
 };
 
-/* Per-architecture configury and hooks.  */
+/* Per-architecture configuration and hooks.  */
 struct op_mips_model {
        void (*reg_setup) (struct op_counter_config *);
        void (*cpu_setup) (void *dummy);
index 4383194..57e1463 100644 (file)
@@ -33,9 +33,9 @@ static u32 emulate_ioc3_cfg(int where, int size)
  * The Bridge ASIC supports both type 0 and type 1 access.  Type 1 is
  * not really documented, so right now I can't write code which uses it.
  * Therefore we use type 0 accesses for now even though they won't work
- * correcly for PCI-to-PCI bridges.
+ * correctly for PCI-to-PCI bridges.
  *
- * The function is complicated by the ultimate brokeness of the IOC3 chip
+ * The function is complicated by the ultimate brokenness of the IOC3 chip
  * which is used in SGI systems.  The IOC3 can only handle 32-bit PCI
  * accesses and does only decode parts of its address space.
  */
index 956c92e..ab79828 100644 (file)
@@ -83,12 +83,16 @@ static void __init plat_setup_iocoherency(void)
        }
 }
 
-void __init plat_mem_setup(void)
+void __init *plat_get_fdt(void)
 {
        if (fw_arg0 != -2)
                panic("Device-tree not present");
+       return (void *)fw_arg1;
+}
 
-       __dt_setup_arch((void *)fw_arg1);
+void __init plat_mem_setup(void)
+{
+       __dt_setup_arch(plat_get_fdt());
 
        plat_setup_iocoherency();
 }
index 88b82fe..d40edda 100644 (file)
@@ -188,6 +188,41 @@ static struct rt2880_pmx_func gpio_grp_mt7628[] = {
        FUNC("gpio", 0, 11, 1),
 };
 
+static struct rt2880_pmx_func p4led_kn_grp_mt7628[] = {
+       FUNC("jtag", 3, 30, 1),
+       FUNC("util", 2, 30, 1),
+       FUNC("gpio", 1, 30, 1),
+       FUNC("p4led_kn", 0, 30, 1),
+};
+
+static struct rt2880_pmx_func p3led_kn_grp_mt7628[] = {
+       FUNC("jtag", 3, 31, 1),
+       FUNC("util", 2, 31, 1),
+       FUNC("gpio", 1, 31, 1),
+       FUNC("p3led_kn", 0, 31, 1),
+};
+
+static struct rt2880_pmx_func p2led_kn_grp_mt7628[] = {
+       FUNC("jtag", 3, 32, 1),
+       FUNC("util", 2, 32, 1),
+       FUNC("gpio", 1, 32, 1),
+       FUNC("p2led_kn", 0, 32, 1),
+};
+
+static struct rt2880_pmx_func p1led_kn_grp_mt7628[] = {
+       FUNC("jtag", 3, 33, 1),
+       FUNC("util", 2, 33, 1),
+       FUNC("gpio", 1, 33, 1),
+       FUNC("p1led_kn", 0, 33, 1),
+};
+
+static struct rt2880_pmx_func p0led_kn_grp_mt7628[] = {
+       FUNC("jtag", 3, 34, 1),
+       FUNC("rsvd", 2, 34, 1),
+       FUNC("gpio", 1, 34, 1),
+       FUNC("p0led_kn", 0, 34, 1),
+};
+
 static struct rt2880_pmx_func wled_kn_grp_mt7628[] = {
        FUNC("rsvd", 3, 35, 1),
        FUNC("rsvd", 2, 35, 1),
@@ -195,16 +230,61 @@ static struct rt2880_pmx_func wled_kn_grp_mt7628[] = {
        FUNC("wled_kn", 0, 35, 1),
 };
 
+static struct rt2880_pmx_func p4led_an_grp_mt7628[] = {
+       FUNC("jtag", 3, 39, 1),
+       FUNC("util", 2, 39, 1),
+       FUNC("gpio", 1, 39, 1),
+       FUNC("p4led_an", 0, 39, 1),
+};
+
+static struct rt2880_pmx_func p3led_an_grp_mt7628[] = {
+       FUNC("jtag", 3, 40, 1),
+       FUNC("util", 2, 40, 1),
+       FUNC("gpio", 1, 40, 1),
+       FUNC("p3led_an", 0, 40, 1),
+};
+
+static struct rt2880_pmx_func p2led_an_grp_mt7628[] = {
+       FUNC("jtag", 3, 41, 1),
+       FUNC("util", 2, 41, 1),
+       FUNC("gpio", 1, 41, 1),
+       FUNC("p2led_an", 0, 41, 1),
+};
+
+static struct rt2880_pmx_func p1led_an_grp_mt7628[] = {
+       FUNC("jtag", 3, 42, 1),
+       FUNC("util", 2, 42, 1),
+       FUNC("gpio", 1, 42, 1),
+       FUNC("p1led_an", 0, 42, 1),
+};
+
+static struct rt2880_pmx_func p0led_an_grp_mt7628[] = {
+       FUNC("jtag", 3, 43, 1),
+       FUNC("rsvd", 2, 43, 1),
+       FUNC("gpio", 1, 43, 1),
+       FUNC("p0led_an", 0, 43, 1),
+};
+
 static struct rt2880_pmx_func wled_an_grp_mt7628[] = {
-       FUNC("rsvd", 3, 35, 1),
-       FUNC("rsvd", 2, 35, 1),
-       FUNC("gpio", 1, 35, 1),
-       FUNC("wled_an", 0, 35, 1),
+       FUNC("rsvd", 3, 44, 1),
+       FUNC("rsvd", 2, 44, 1),
+       FUNC("gpio", 1, 44, 1),
+       FUNC("wled_an", 0, 44, 1),
 };
 
 #define MT7628_GPIO_MODE_MASK          0x3
 
+#define MT7628_GPIO_MODE_P4LED_KN      58
+#define MT7628_GPIO_MODE_P3LED_KN      56
+#define MT7628_GPIO_MODE_P2LED_KN      54
+#define MT7628_GPIO_MODE_P1LED_KN      52
+#define MT7628_GPIO_MODE_P0LED_KN      50
 #define MT7628_GPIO_MODE_WLED_KN       48
+#define MT7628_GPIO_MODE_P4LED_AN      42
+#define MT7628_GPIO_MODE_P3LED_AN      40
+#define MT7628_GPIO_MODE_P2LED_AN      38
+#define MT7628_GPIO_MODE_P1LED_AN      36
+#define MT7628_GPIO_MODE_P0LED_AN      34
 #define MT7628_GPIO_MODE_WLED_AN       32
 #define MT7628_GPIO_MODE_PWM1          30
 #define MT7628_GPIO_MODE_PWM0          28
@@ -223,9 +303,9 @@ static struct rt2880_pmx_func wled_an_grp_mt7628[] = {
 #define MT7628_GPIO_MODE_GPIO          0
 
 static struct rt2880_pmx_group mt7628an_pinmux_data[] = {
-       GRP_G("pmw1", pwm1_grp_mt7628, MT7628_GPIO_MODE_MASK,
+       GRP_G("pwm1", pwm1_grp_mt7628, MT7628_GPIO_MODE_MASK,
                                1, MT7628_GPIO_MODE_PWM1),
-       GRP_G("pmw0", pwm0_grp_mt7628, MT7628_GPIO_MODE_MASK,
+       GRP_G("pwm0", pwm0_grp_mt7628, MT7628_GPIO_MODE_MASK,
                                1, MT7628_GPIO_MODE_PWM0),
        GRP_G("uart2", uart2_grp_mt7628, MT7628_GPIO_MODE_MASK,
                                1, MT7628_GPIO_MODE_UART2),
@@ -251,8 +331,28 @@ static struct rt2880_pmx_group mt7628an_pinmux_data[] = {
                                1, MT7628_GPIO_MODE_GPIO),
        GRP_G("wled_an", wled_an_grp_mt7628, MT7628_GPIO_MODE_MASK,
                                1, MT7628_GPIO_MODE_WLED_AN),
+       GRP_G("p0led_an", p0led_an_grp_mt7628, MT7628_GPIO_MODE_MASK,
+                               1, MT7628_GPIO_MODE_P0LED_AN),
+       GRP_G("p1led_an", p1led_an_grp_mt7628, MT7628_GPIO_MODE_MASK,
+                               1, MT7628_GPIO_MODE_P1LED_AN),
+       GRP_G("p2led_an", p2led_an_grp_mt7628, MT7628_GPIO_MODE_MASK,
+                               1, MT7628_GPIO_MODE_P2LED_AN),
+       GRP_G("p3led_an", p3led_an_grp_mt7628, MT7628_GPIO_MODE_MASK,
+                               1, MT7628_GPIO_MODE_P3LED_AN),
+       GRP_G("p4led_an", p4led_an_grp_mt7628, MT7628_GPIO_MODE_MASK,
+                               1, MT7628_GPIO_MODE_P4LED_AN),
        GRP_G("wled_kn", wled_kn_grp_mt7628, MT7628_GPIO_MODE_MASK,
                                1, MT7628_GPIO_MODE_WLED_KN),
+       GRP_G("p0led_kn", p0led_kn_grp_mt7628, MT7628_GPIO_MODE_MASK,
+                               1, MT7628_GPIO_MODE_P0LED_KN),
+       GRP_G("p1led_kn", p1led_kn_grp_mt7628, MT7628_GPIO_MODE_MASK,
+                               1, MT7628_GPIO_MODE_P1LED_KN),
+       GRP_G("p2led_kn", p2led_kn_grp_mt7628, MT7628_GPIO_MODE_MASK,
+                               1, MT7628_GPIO_MODE_P2LED_KN),
+       GRP_G("p3led_kn", p3led_kn_grp_mt7628, MT7628_GPIO_MODE_MASK,
+                               1, MT7628_GPIO_MODE_P3LED_KN),
+       GRP_G("p4led_kn", p4led_kn_grp_mt7628, MT7628_GPIO_MODE_MASK,
+                               1, MT7628_GPIO_MODE_P4LED_KN),
        { 0 }
 };
 
index 328ceb3..2abe016 100644 (file)
@@ -105,7 +105,7 @@ static void hub_setup_prb(nasid_t nasid, int prbnum, int credits)
        prb.iprb_ff = force_fire_and_forget ? 1 : 0;
 
        /*
-        * Set the appropriate number of PIO cresits for the widget.
+        * Set the appropriate number of PIO credits for the widget.
         */
        prb.iprb_xtalkctr = credits;
 
index a2358b4..cfceaea 100644 (file)
@@ -23,7 +23,7 @@ typedef unsigned long machreg_t;
 static arch_spinlock_t nmi_lock = __ARCH_SPIN_LOCK_UNLOCKED;
 
 /*
- * Lets see what else we need to do here. Set up sp, gp?
+ * Let's see what else we need to do here. Set up sp, gp?
  */
 void nmi_dump(void)
 {
index 20f582a..4fe5678 100644 (file)
@@ -67,7 +67,7 @@ static int xbow_probe(nasid_t nasid)
                return -ENODEV;
 
        /*
-        * Okay, here's a xbow. Lets arbitrate and find
+        * Okay, here's a xbow. Let's arbitrate and find
         * out if we should initialize it. Set enabled
         * hub connected at highest or lowest widget as
         * master.
index a046b30..160b880 100644 (file)
@@ -263,7 +263,7 @@ spurious_8259A_irq:
                static int spurious_irq_mask;
                /*
                 * At this point we can be sure the IRQ is spurious,
-                * lets ACK and report it. [once per IRQ]
+                * let's ACK and report it. [once per IRQ]
                 */
                if (!(spurious_irq_mask & irqmask)) {
                        printk(KERN_DEBUG
index b369509..3b4538e 100644 (file)
@@ -5,10 +5,12 @@ obj-vdso-y := elf.o gettimeofday.o sigreturn.o
 ccflags-vdso := \
        $(filter -I%,$(KBUILD_CFLAGS)) \
        $(filter -E%,$(KBUILD_CFLAGS)) \
+       $(filter -mmicromips,$(KBUILD_CFLAGS)) \
        $(filter -march=%,$(KBUILD_CFLAGS))
 cflags-vdso := $(ccflags-vdso) \
        $(filter -W%,$(filter-out -Wa$(comma)%,$(KBUILD_CFLAGS))) \
-       -O2 -g -fPIC -fno-common -fno-builtin -G 0 -DDISABLE_BRANCH_PROFILING \
+       -O2 -g -fPIC -fno-strict-aliasing -fno-common -fno-builtin -G 0 \
+       -DDISABLE_BRANCH_PROFILING \
        $(call cc-option, -fno-stack-protector)
 aflags-vdso := $(ccflags-vdso) \
        $(filter -I%,$(KBUILD_CFLAGS)) \
index 05302bf..89bac98 100644 (file)
@@ -3,7 +3,7 @@
  *
  *  Copyright (C) 2001-2002  MontaVista Software Inc.
  *    Author: Yoichi Yuasa <source@mvista.com>
- *  Copuright (C) 2003-2005  Yoichi Yuasa <yuasa@linux-mips.org>
+ *  Copyright (C) 2003-2005  Yoichi Yuasa <yuasa@linux-mips.org>
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
index 08a95e1..5f56f9d 100644 (file)
@@ -8,7 +8,6 @@ LDFLAGS_vmlinux := -Ttext $(CONFIG_KERNEL_ZIMAGE_BASE_ADDRESS) -e startup_32
 
 $(obj)/vmlinux: $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE
        $(call if_changed,ld)
-       @:
 
 $(obj)/vmlinux.bin: vmlinux FORCE
        $(call if_changed,objcopy)
index 2328f82..e74afc1 100644 (file)
@@ -20,7 +20,7 @@ UTS_SYSNAME = Linux
 
 export MMU
 
-LIBGCC         := $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name)
+LIBGCC         := $(shell $(CC) $(KBUILD_CFLAGS) $(KCFLAGS) -print-libgcc-file-name)
 
 KBUILD_CFLAGS += -pipe -D__linux__ -D__ELF__
 KBUILD_CFLAGS += $(if $(CONFIG_NIOS2_HW_MUL_SUPPORT),-mhw-mul,-mno-hw-mul)
@@ -53,7 +53,7 @@ all: vmImage
 archclean:
        $(Q)$(MAKE) $(clean)=$(nios2-boot)
 
-%.dtb:
+%.dtb: | scripts
        $(Q)$(MAKE) $(build)=$(nios2-boot) $(nios2-boot)/$@
 
 dtbs:
index 5b0fb34..d5921c9 100644 (file)
@@ -11,7 +11,6 @@ LDFLAGS_vmlinux := -T
 
 $(obj)/vmlinux: $(obj)/vmlinux.lds $(OBJECTS) $(obj)/piggy.o FORCE
        $(call if_changed,ld)
-       @:
 
 LDFLAGS_piggy.o := -r --format binary --oformat elf32-littlenios2 -T
 
index c4bf795..51a32c7 100644 (file)
@@ -17,6 +17,8 @@
 
  #define sys_mmap2 sys_mmap_pgoff
 
+#define __ARCH_WANT_RENAMEAT
+
 /* Use the standard ABI for syscalls */
 #include <asm-generic/unistd.h>
 
index ce40b71..471905b 100644 (file)
@@ -20,6 +20,7 @@
 
 #define sys_mmap2 sys_mmap_pgoff
 
+#define __ARCH_WANT_RENAMEAT
 #define __ARCH_WANT_SYS_FORK
 #define __ARCH_WANT_SYS_CLONE
 
index 3d498a6..dc11738 100644 (file)
@@ -6,6 +6,7 @@ config PARISC
        select HAVE_OPROFILE
        select HAVE_FUNCTION_TRACER
        select HAVE_FUNCTION_GRAPH_TRACER
+       select HAVE_SYSCALL_TRACEPOINTS
        select ARCH_WANT_FRAME_POINTERS
        select RTC_CLASS
        select RTC_DRV_GENERIC
@@ -31,6 +32,8 @@ config PARISC
        select HAVE_DEBUG_STACKOVERFLOW
        select HAVE_ARCH_AUDITSYSCALL
        select HAVE_ARCH_SECCOMP_FILTER
+       select HAVE_ARCH_TRACEHOOK
+       select HAVE_UNSTABLE_SCHED_CLOCK if (SMP || !64BIT)
        select ARCH_NO_COHERENT_DMA_MMAP
        select CPU_NO_EFFICIENT_FFS
 
index 0a90b96..7ada309 100644 (file)
@@ -52,8 +52,7 @@ extern void __cmpxchg_called_with_bad_pointer(void);
 /* __cmpxchg_u32/u64 defined in arch/parisc/lib/bitops.c */
 extern unsigned long __cmpxchg_u32(volatile unsigned int *m, unsigned int old,
                                   unsigned int new_);
-extern unsigned long __cmpxchg_u64(volatile unsigned long *ptr,
-                                  unsigned long old, unsigned long new_);
+extern u64 __cmpxchg_u64(volatile u64 *ptr, u64 old, u64 new_);
 
 /* don't worry...optimizer will get rid of most of this */
 static inline unsigned long
@@ -61,7 +60,7 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new_, int size)
 {
        switch (size) {
 #ifdef CONFIG_64BIT
-       case 8: return __cmpxchg_u64((unsigned long *)ptr, old, new_);
+       case 8: return __cmpxchg_u64((u64 *)ptr, old, new_);
 #endif
        case 4: return __cmpxchg_u32((unsigned int *)ptr,
                                     (unsigned int)old, (unsigned int)new_);
@@ -86,7 +85,7 @@ static inline unsigned long __cmpxchg_local(volatile void *ptr,
 {
        switch (size) {
 #ifdef CONFIG_64BIT
-       case 8: return __cmpxchg_u64((unsigned long *)ptr, old, new_);
+       case 8: return __cmpxchg_u64((u64 *)ptr, old, new_);
 #endif
        case 4: return __cmpxchg_u32(ptr, old, new_);
        default:
@@ -111,4 +110,6 @@ static inline unsigned long __cmpxchg_local(volatile void *ptr,
 #define cmpxchg64_local(ptr, o, n) __cmpxchg64_local_generic((ptr), (o), (n))
 #endif
 
+#define cmpxchg64(ptr, o, n) __cmpxchg_u64(ptr, o, n)
+
 #endif /* _ASM_PARISC_CMPXCHG_H_ */
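With the prototype now taking u64, cmpxchg64() routes straight to __cmpxchg_u64(); a usage sketch of the usual retry loop (counter is an assumed u64 * variable, not from this patch):

        u64 old, seen;

        do {
                old = READ_ONCE(*counter);
                seen = cmpxchg64(counter, old, old + 1);
        } while (seen != old);  /* __cmpxchg_u64() returns the value it found */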
index 8ce8b85..5637ac9 100644 (file)
@@ -99,7 +99,7 @@ struct eeprom_eisa_slot_info
 #define HPEE_MEMORY_DECODE_24BITS 0x04
 #define HPEE_MEMORY_DECODE_32BITS 0x08
 /* byte 2 and 3 are a 16bit LE value
- * containging the memory size in kilobytes */
+ * containing the memory size in kilobytes */
 /* byte 4,5,6 are a 24bit LE value
  * containing the memory base address */
 
@@ -135,7 +135,7 @@ struct eeprom_eisa_slot_info
 #define HPEE_PORT_SHARED    0x40
 #define HPEE_PORT_MORE      0x80
 /* byte 1 and 2 is a 16bit LE value
- * conating the start port number */
+ * containing the start port number */
 
 #define HPEE_PORT_INIT_MAX_LEN     60 /* in bytes here */
 /* port init entry byte 0 */
index 24cd81d..d635c6b 100644 (file)
@@ -6,6 +6,8 @@ extern void mcount(void);
 
 #define MCOUNT_INSN_SIZE 4
 
+extern unsigned long sys_call_table[];
+
 extern unsigned long return_address(unsigned int);
 
 #define ftrace_return_address(n) return_address(n)
index 49df148..ac8bd58 100644 (file)
@@ -35,70 +35,57 @@ static inline int
 futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
 {
        unsigned long int flags;
-       u32 val;
        int op = (encoded_op >> 28) & 7;
        int cmp = (encoded_op >> 24) & 15;
        int oparg = (encoded_op << 8) >> 20;
        int cmparg = (encoded_op << 20) >> 20;
-       int oldval = 0, ret;
+       int oldval, ret;
+       u32 tmp;
+
        if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
                oparg = 1 << oparg;
 
        if (!access_ok(VERIFY_WRITE, uaddr, sizeof(*uaddr)))
                return -EFAULT;
 
+       _futex_spin_lock_irqsave(uaddr, &flags);
        pagefault_disable();
 
-       _futex_spin_lock_irqsave(uaddr, &flags);
+       ret = -EFAULT;
+       if (unlikely(get_user(oldval, uaddr) != 0))
+               goto out_pagefault_enable;
+
+       ret = 0;
+       tmp = oldval;
 
        switch (op) {
        case FUTEX_OP_SET:
-               /* *(int *)UADDR2 = OPARG; */
-               ret = get_user(oldval, uaddr);
-               if (!ret)
-                       ret = put_user(oparg, uaddr);
+               tmp = oparg;
                break;
        case FUTEX_OP_ADD:
-               /* *(int *)UADDR2 += OPARG; */
-               ret = get_user(oldval, uaddr);
-               if (!ret) {
-                       val = oldval + oparg;
-                       ret = put_user(val, uaddr);
-               }
+               tmp += oparg;
                break;
        case FUTEX_OP_OR:
-               /* *(int *)UADDR2 |= OPARG; */
-               ret = get_user(oldval, uaddr);
-               if (!ret) {
-                       val = oldval | oparg;
-                       ret = put_user(val, uaddr);
-               }
+               tmp |= oparg;
                break;
        case FUTEX_OP_ANDN:
-               /* *(int *)UADDR2 &= ~OPARG; */
-               ret = get_user(oldval, uaddr);
-               if (!ret) {
-                       val = oldval & ~oparg;
-                       ret = put_user(val, uaddr);
-               }
+               tmp &= ~oparg;
                break;
        case FUTEX_OP_XOR:
-               /* *(int *)UADDR2 ^= OPARG; */
-               ret = get_user(oldval, uaddr);
-               if (!ret) {
-                       val = oldval ^ oparg;
-                       ret = put_user(val, uaddr);
-               }
+               tmp ^= oparg;
                break;
        default:
                ret = -ENOSYS;
        }
 
-       _futex_spin_unlock_irqrestore(uaddr, &flags);
+       if (ret == 0 && unlikely(put_user(tmp, uaddr) != 0))
+               ret = -EFAULT;
 
+out_pagefault_enable:
        pagefault_enable();
+       _futex_spin_unlock_irqrestore(uaddr, &flags);
 
-       if (!ret) {
+       if (ret == 0) {
                switch (cmp) {
                case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
                case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
@@ -112,12 +99,10 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
        return ret;
 }
 
-/* Non-atomic version */
 static inline int
 futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
                              u32 oldval, u32 newval)
 {
-       int ret;
        u32 val;
        unsigned long flags;
 
@@ -137,17 +122,20 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
         */
 
        _futex_spin_lock_irqsave(uaddr, &flags);
+       if (unlikely(get_user(val, uaddr) != 0)) {
+               _futex_spin_unlock_irqrestore(uaddr, &flags);
+               return -EFAULT;
+       }
 
-       ret = get_user(val, uaddr);
-
-       if (!ret && val == oldval)
-               ret = put_user(newval, uaddr);
+       if (val == oldval && unlikely(put_user(newval, uaddr) != 0)) {
+               _futex_spin_unlock_irqrestore(uaddr, &flags);
+               return -EFAULT;
+       }
 
        *uval = val;
-
        _futex_spin_unlock_irqrestore(uaddr, &flags);
 
-       return ret;
+       return 0;
 }
 
 #endif /*__KERNEL__*/
index 8121aa6..8be707e 100644 (file)
@@ -40,7 +40,7 @@
    memory to indicate to the compiler that the assembly code reads
    or writes to items other than those listed in the input and output
    operands.  This may pessimize the code somewhat but __ldcw is
-   usually used within code blocks surrounded by memory barriors.  */
+   usually used within code blocks surrounded by memory barriers.  */
 #define __ldcw(a) ({                                           \
        unsigned __ret;                                         \
        __asm__ __volatile__(__LDCW " 0(%1),%0"                 \
index 637ce8d..5e0b4e6 100644 (file)
@@ -8,6 +8,8 @@
 #include <linux/err.h>
 #include <asm/ptrace.h>
 
+#define NR_syscalls (__NR_Linux_syscalls)
+
 static inline long syscall_get_nr(struct task_struct *tsk,
                                  struct pt_regs *regs)
 {
@@ -33,12 +35,19 @@ static inline void syscall_get_arguments(struct task_struct *tsk,
                args[1] = regs->gr[25];
        case 1:
                args[0] = regs->gr[26];
+       case 0:
                break;
        default:
                BUG();
        }
 }
 
+static inline long syscall_get_return_value(struct task_struct *task,
+                                               struct pt_regs *regs)
+{
+       return regs->gr[28];
+}
+
 static inline void syscall_set_return_value(struct task_struct *task,
                                            struct pt_regs *regs,
                                            int error, long val)
index e96e693..7581330 100644 (file)
@@ -55,6 +55,7 @@ struct thread_info {
 #define TIF_SINGLESTEP         9       /* single stepping? */
 #define TIF_BLOCKSTEP          10      /* branch stepping? */
 #define TIF_SECCOMP            11      /* secure computing */
+#define TIF_SYSCALL_TRACEPOINT 12      /* syscall tracepoint instrumentation */
 
 #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
 #define _TIF_SIGPENDING                (1 << TIF_SIGPENDING)
@@ -66,12 +67,13 @@ struct thread_info {
 #define _TIF_SINGLESTEP                (1 << TIF_SINGLESTEP)
 #define _TIF_BLOCKSTEP         (1 << TIF_BLOCKSTEP)
 #define _TIF_SECCOMP           (1 << TIF_SECCOMP)
+#define _TIF_SYSCALL_TRACEPOINT        (1 << TIF_SYSCALL_TRACEPOINT)
 
 #define _TIF_USER_WORK_MASK     (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | \
                                  _TIF_NEED_RESCHED)
 #define _TIF_SYSCALL_TRACE_MASK (_TIF_SYSCALL_TRACE | _TIF_SINGLESTEP |        \
                                 _TIF_BLOCKSTEP | _TIF_SYSCALL_AUDIT | \
-                                _TIF_SECCOMP)
+                                _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT)
 
 #ifdef CONFIG_64BIT
 # ifdef CONFIG_COMPAT
index 4736020..5e953ab 100644 (file)
@@ -8,6 +8,8 @@ struct pt_regs;
 void parisc_terminate(char *msg, struct pt_regs *regs,
                int code, unsigned long offset) __noreturn __cold;
 
+void die_if_kernel(char *str, struct pt_regs *regs, long err);
+
 /* mm/fault.c */
 void do_page_fault(struct pt_regs *regs, unsigned long code,
                unsigned long address);
index 7955e43..0f59fd9 100644 (file)
@@ -40,14 +40,10 @@ static inline long access_ok(int type, const void __user * addr,
 #define get_user __get_user
 
 #if !defined(CONFIG_64BIT)
-#define LDD_KERNEL(ptr)                BUILD_BUG()
-#define LDD_USER(ptr)          BUILD_BUG()
-#define STD_KERNEL(x, ptr)     __put_kernel_asm64(x, ptr)
+#define LDD_USER(ptr)          __get_user_asm64(ptr)
 #define STD_USER(x, ptr)       __put_user_asm64(x, ptr)
 #else
-#define LDD_KERNEL(ptr)                __get_kernel_asm("ldd", ptr)
 #define LDD_USER(ptr)          __get_user_asm("ldd", ptr)
-#define STD_KERNEL(x, ptr)     __put_kernel_asm("std", x, ptr)
 #define STD_USER(x, ptr)       __put_user_asm("std", x, ptr)
 #endif
 
@@ -80,70 +76,70 @@ struct exception_data {
        unsigned long fault_addr;
 };
 
+/*
+ * load_sr2() preloads the space register %%sr2 - based on the value of
+ * get_fs() - with either a value of 0 to access kernel space (KERNEL_DS which
+ * is 0), or with the current value of %%sr3 to access user space (USER_DS)
+ * memory. The following __get_user_asm() and __put_user_asm() functions have
+ * %%sr2 hard-coded to access the requested memory.
+ */
+#define load_sr2() \
+       __asm__(" or,=  %0,%%r0,%%r0\n\t"       \
+               " mfsp %%sr3,%0\n\t"            \
+               " mtsp %0,%%sr2\n\t"            \
+               : : "r"(get_fs()) : )
+
 #define __get_user(x, ptr)                               \
 ({                                                       \
        register long __gu_err __asm__ ("r8") = 0;       \
        register long __gu_val __asm__ ("r9") = 0;       \
                                                         \
-       if (segment_eq(get_fs(), KERNEL_DS)) {           \
-           switch (sizeof(*(ptr))) {                    \
-           case 1: __get_kernel_asm("ldb", ptr); break; \
-           case 2: __get_kernel_asm("ldh", ptr); break; \
-           case 4: __get_kernel_asm("ldw", ptr); break; \
-           case 8: LDD_KERNEL(ptr); break;              \
-           default: BUILD_BUG(); break;                 \
-           }                                            \
-       }                                                \
-       else {                                           \
-           switch (sizeof(*(ptr))) {                    \
+       load_sr2();                                      \
+       switch (sizeof(*(ptr))) {                        \
            case 1: __get_user_asm("ldb", ptr); break;   \
            case 2: __get_user_asm("ldh", ptr); break;   \
            case 4: __get_user_asm("ldw", ptr); break;   \
            case 8: LDD_USER(ptr);  break;               \
            default: BUILD_BUG(); break;                 \
-           }                                            \
        }                                                \
                                                         \
        (x) = (__force __typeof__(*(ptr))) __gu_val;     \
        __gu_err;                                        \
 })
 
-#define __get_kernel_asm(ldx, ptr)                      \
-       __asm__("\n1:\t" ldx "\t0(%2),%0\n\t"           \
+#define __get_user_asm(ldx, ptr)                        \
+       __asm__("\n1:\t" ldx "\t0(%%sr2,%2),%0\n\t"     \
                ASM_EXCEPTIONTABLE_ENTRY(1b, fixup_get_user_skip_1)\
                : "=r"(__gu_val), "=r"(__gu_err)        \
                : "r"(ptr), "1"(__gu_err)               \
                : "r1");
 
-#define __get_user_asm(ldx, ptr)                        \
-       __asm__("\n1:\t" ldx "\t0(%%sr3,%2),%0\n\t"     \
-               ASM_EXCEPTIONTABLE_ENTRY(1b, fixup_get_user_skip_1)\
-               : "=r"(__gu_val), "=r"(__gu_err)        \
+#if !defined(CONFIG_64BIT)
+
+#define __get_user_asm64(ptr)                          \
+       __asm__("\n1:\tldw 0(%%sr2,%2),%0"              \
+               "\n2:\tldw 4(%%sr2,%2),%R0\n\t"         \
+               ASM_EXCEPTIONTABLE_ENTRY(1b, fixup_get_user_skip_2)\
+               ASM_EXCEPTIONTABLE_ENTRY(2b, fixup_get_user_skip_1)\
+               : "=r"(__gu_val), "=r"(__gu_err)        \
                : "r"(ptr), "1"(__gu_err)               \
                : "r1");
 
+#endif /* !defined(CONFIG_64BIT) */
+
+
 #define __put_user(x, ptr)                                      \
 ({                                                             \
        register long __pu_err __asm__ ("r8") = 0;              \
         __typeof__(*(ptr)) __x = (__typeof__(*(ptr)))(x);      \
                                                                \
-       if (segment_eq(get_fs(), KERNEL_DS)) {                  \
-           switch (sizeof(*(ptr))) {                           \
-           case 1: __put_kernel_asm("stb", __x, ptr); break;   \
-           case 2: __put_kernel_asm("sth", __x, ptr); break;   \
-           case 4: __put_kernel_asm("stw", __x, ptr); break;   \
-           case 8: STD_KERNEL(__x, ptr); break;                \
-           default: BUILD_BUG(); break;                        \
-           }                                                   \
-       }                                                       \
-       else {                                                  \
-           switch (sizeof(*(ptr))) {                           \
+       load_sr2();                                             \
+       switch (sizeof(*(ptr))) {                               \
            case 1: __put_user_asm("stb", __x, ptr); break;     \
            case 2: __put_user_asm("sth", __x, ptr); break;     \
            case 4: __put_user_asm("stw", __x, ptr); break;     \
            case 8: STD_USER(__x, ptr); break;                  \
            default: BUILD_BUG(); break;                        \
-           }                                                   \
        }                                                       \
                                                                \
        __pu_err;                                               \
@@ -159,17 +155,9 @@ struct exception_data {
  * r8/r9 are already listed as err/val.
  */
 
-#define __put_kernel_asm(stx, x, ptr)                       \
-       __asm__ __volatile__ (                              \
-               "\n1:\t" stx "\t%2,0(%1)\n\t"               \
-               ASM_EXCEPTIONTABLE_ENTRY(1b, fixup_put_user_skip_1)\
-               : "=r"(__pu_err)                            \
-               : "r"(ptr), "r"(x), "0"(__pu_err)           \
-               : "r1")
-
 #define __put_user_asm(stx, x, ptr)                         \
        __asm__ __volatile__ (                              \
-               "\n1:\t" stx "\t%2,0(%%sr3,%1)\n\t"         \
+               "\n1:\t" stx "\t%2,0(%%sr2,%1)\n\t"         \
                ASM_EXCEPTIONTABLE_ENTRY(1b, fixup_put_user_skip_1)\
                : "=r"(__pu_err)                            \
                : "r"(ptr), "r"(x), "0"(__pu_err)           \
@@ -178,21 +166,10 @@ struct exception_data {
 
 #if !defined(CONFIG_64BIT)
 
-#define __put_kernel_asm64(__val, ptr) do {                \
-       __asm__ __volatile__ (                              \
-               "\n1:\tstw %2,0(%1)"                        \
-               "\n2:\tstw %R2,4(%1)\n\t"                   \
-               ASM_EXCEPTIONTABLE_ENTRY(1b, fixup_put_user_skip_2)\
-               ASM_EXCEPTIONTABLE_ENTRY(2b, fixup_put_user_skip_1)\
-               : "=r"(__pu_err)                            \
-               : "r"(ptr), "r"(__val), "0"(__pu_err) \
-               : "r1");                                    \
-} while (0)
-
 #define __put_user_asm64(__val, ptr) do {                  \
        __asm__ __volatile__ (                              \
-               "\n1:\tstw %2,0(%%sr3,%1)"                  \
-               "\n2:\tstw %R2,4(%%sr3,%1)\n\t"             \
+               "\n1:\tstw %2,0(%%sr2,%1)"                  \
+               "\n2:\tstw %R2,4(%%sr2,%1)\n\t"             \
                ASM_EXCEPTIONTABLE_ENTRY(1b, fixup_put_user_skip_2)\
                ASM_EXCEPTIONTABLE_ENTRY(2b, fixup_put_user_skip_1)\
                : "=r"(__pu_err)                            \
index 702498f..0609ff1 100644 (file)
@@ -59,7 +59,7 @@
 #define PDC_MODEL_GET_BOOT__OP 8       /* returns boot test options    */
 #define PDC_MODEL_SET_BOOT__OP 9       /* set boot test options        */
 
-#define PA89_INSTRUCTION_SET   0x4     /* capatibilies returned        */
+#define PA89_INSTRUCTION_SET   0x4     /* capabilities returned        */
 #define PA90_INSTRUCTION_SET   0x8
 
 #define PDC_CACHE      5               /* return/set cache (& TLB) info*/
index c4fa6c8..02ce2eb 100644 (file)
  * N.B. gdb/strace care about the size and offsets within this
  * structure. If you change things, you may break object compatibility
  * for those applications.
+ *
+ * Please do NOT use this structure for future programs, but use
+ * user_regs_struct (see below) instead.
+ *
+ * It can be accessed through PTRACE_PEEKUSR/PTRACE_POKEUSR only.
  */
 
 struct pt_regs {
@@ -33,6 +38,45 @@ struct pt_regs {
        unsigned long ipsw;     /* CR22 */
 };
 
+/**
+ * struct user_regs_struct - User general purpose registers
+ *
+ * This is the user-visible general purpose register state structure
+ * which is used to define the elf_gregset_t.
+ *
+ * It can be accessed through PTRACE_GETREGSET with NT_PRSTATUS
+ * and through PTRACE_GETREGS.
+ */
+struct user_regs_struct {
+       unsigned long gr[32];   /* PSW is in gr[0] */
+       unsigned long sr[8];
+       unsigned long iaoq[2];
+       unsigned long iasq[2];
+       unsigned long sar;      /* CR11 */
+       unsigned long iir;      /* CR19 */
+       unsigned long isr;      /* CR20 */
+       unsigned long ior;      /* CR21 */
+       unsigned long ipsw;     /* CR22 */
+       unsigned long cr0;
+       unsigned long cr24, cr25, cr26, cr27, cr28, cr29, cr30, cr31;
+       unsigned long cr8, cr9, cr12, cr13, cr10, cr15;
+       unsigned long _pad[80-64];      /* pad to ELF_NGREG (80) */
+};
+
+/**
+ * struct user_fp_struct - User floating point registers
+ *
+ * This is the user-visible floating point register state structure.
+ * It uses the same layout and size as elf_fpregset_t.
+ *
+ * It can be accessed through PTRACE_GETREGSET with NT_PRFPREG
+ * and through PTRACE_GETFPREGS.
+ */
+struct user_fp_struct {
+       __u64 fr[32];
+};
+
+
 /*
  * The numbers chosen here are somewhat arbitrary but absolutely MUST
  * not overlap with any of the number assigned in <linux/ptrace.h>.
@@ -43,5 +87,9 @@ struct pt_regs {
  */
 #define PTRACE_SINGLEBLOCK     12      /* resume execution until next branch */
 
+#define PTRACE_GETREGS         18
+#define PTRACE_SETREGS         19
+#define PTRACE_GETFPREGS       14
+#define PTRACE_SETFPREGS       15
 
 #endif /* _UAPI_PARISC_PTRACE_H */
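The kernel-doc added above says user_regs_struct can be fetched with the new PTRACE_GETREGS request. A hedged userspace sketch, not part of the patch; error handling is omitted, 'pid' is assumed to be an already-attached, stopped tracee, and the exact header mix may differ:

#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <asm/ptrace.h>		/* struct user_regs_struct, PTRACE_GETREGS */

static void dump_iaoq(pid_t pid)
{
	struct user_regs_struct regs;

	ptrace(PTRACE_GETREGS, pid, NULL, &regs);
	printf("iaoq[0]=%#lx iaoq[1]=%#lx\n", regs.iaoq[0], regs.iaoq[1]);
}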
index cc0ce92..a9b9407 100644 (file)
 #define __NR_uselib              (__NR_Linux + 86)
 #define __NR_swapon              (__NR_Linux + 87)
 #define __NR_reboot              (__NR_Linux + 88)
-#define __NR_mmap2             (__NR_Linux + 89)
+#define __NR_mmap2               (__NR_Linux + 89)
 #define __NR_mmap                (__NR_Linux + 90)
 #define __NR_munmap              (__NR_Linux + 91)
 #define __NR_truncate            (__NR_Linux + 92)
 #define __NR_recv                (__NR_Linux + 98)
 #define __NR_statfs              (__NR_Linux + 99)
 #define __NR_fstatfs            (__NR_Linux + 100)
-#define __NR_stat64           (__NR_Linux + 101)
+#define __NR_stat64             (__NR_Linux + 101)
 /* #define __NR_socketcall         (__NR_Linux + 102) */
 #define __NR_syslog             (__NR_Linux + 103)
 #define __NR_setitimer          (__NR_Linux + 104)
 #define __NR_adjtimex           (__NR_Linux + 124)
 #define __NR_mprotect           (__NR_Linux + 125)
 #define __NR_sigprocmask        (__NR_Linux + 126)
-#define __NR_create_module      (__NR_Linux + 127)
+#define __NR_create_module      (__NR_Linux + 127) /* not used */
 #define __NR_init_module        (__NR_Linux + 128)
 #define __NR_delete_module      (__NR_Linux + 129)
-#define __NR_get_kernel_syms    (__NR_Linux + 130)
+#define __NR_get_kernel_syms    (__NR_Linux + 130) /* not used */
 #define __NR_quotactl           (__NR_Linux + 131)
 #define __NR_getpgid            (__NR_Linux + 132)
 #define __NR_fchdir             (__NR_Linux + 133)
 #define __NR_bdflush            (__NR_Linux + 134)
 #define __NR_sysfs              (__NR_Linux + 135)
 #define __NR_personality        (__NR_Linux + 136)
-#define __NR_afs_syscall        (__NR_Linux + 137) /* Syscall for Andrew File System */
+#define __NR_afs_syscall        (__NR_Linux + 137) /* not used */
 #define __NR_setfsuid           (__NR_Linux + 138)
 #define __NR_setfsgid           (__NR_Linux + 139)
 #define __NR__llseek            (__NR_Linux + 140)
 #define __NR_setresuid          (__NR_Linux + 164)
 #define __NR_getresuid          (__NR_Linux + 165)
 #define __NR_sigaltstack        (__NR_Linux + 166)
-#define __NR_query_module       (__NR_Linux + 167)
+#define __NR_query_module       (__NR_Linux + 167) /* not used */
 #define __NR_poll               (__NR_Linux + 168)
-#define __NR_nfsservctl         (__NR_Linux + 169)
+#define __NR_nfsservctl         (__NR_Linux + 169) /* not used */
 #define __NR_setresgid          (__NR_Linux + 170)
 #define __NR_getresgid          (__NR_Linux + 171)
 #define __NR_prctl              (__NR_Linux + 172)
 #define __NR_shmdt              (__NR_Linux + 193)
 #define __NR_shmget             (__NR_Linux + 194)
 #define __NR_shmctl             (__NR_Linux + 195)
-
-#define __NR_getpmsg           (__NR_Linux + 196) /* Somebody *wants* streams? */
-#define __NR_putpmsg           (__NR_Linux + 197)
-
+#define __NR_getpmsg            (__NR_Linux + 196) /* not used */
+#define __NR_putpmsg            (__NR_Linux + 197) /* not used */
 #define __NR_lstat64            (__NR_Linux + 198)
 #define __NR_truncate64         (__NR_Linux + 199)
 #define __NR_ftruncate64        (__NR_Linux + 200)
 #define __NR_getdents64         (__NR_Linux + 201)
 #define __NR_fcntl64            (__NR_Linux + 202)
-#define __NR_attrctl            (__NR_Linux + 203)
-#define __NR_acl_get            (__NR_Linux + 204)
-#define __NR_acl_set            (__NR_Linux + 205)
+#define __NR_attrctl            (__NR_Linux + 203) /* not used */
+#define __NR_acl_get            (__NR_Linux + 204) /* not used */
+#define __NR_acl_set            (__NR_Linux + 205) /* not used */
 #define __NR_gettid             (__NR_Linux + 206)
 #define __NR_readahead          (__NR_Linux + 207)
 #define __NR_tkill              (__NR_Linux + 208)
 #define __NR_futex              (__NR_Linux + 210)
 #define __NR_sched_setaffinity  (__NR_Linux + 211)
 #define __NR_sched_getaffinity  (__NR_Linux + 212)
-#define __NR_set_thread_area    (__NR_Linux + 213)
-#define __NR_get_thread_area    (__NR_Linux + 214)
+#define __NR_set_thread_area    (__NR_Linux + 213) /* not used */
+#define __NR_get_thread_area    (__NR_Linux + 214) /* not used */
 #define __NR_io_setup           (__NR_Linux + 215)
 #define __NR_io_destroy         (__NR_Linux + 216)
 #define __NR_io_getevents       (__NR_Linux + 217)
 #define __NR_mbind             (__NR_Linux + 260)
 #define __NR_get_mempolicy     (__NR_Linux + 261)
 #define __NR_set_mempolicy     (__NR_Linux + 262)
-#define __NR_vserver           (__NR_Linux + 263)
+#define __NR_vserver           (__NR_Linux + 263) /* not used */
 #define __NR_add_key           (__NR_Linux + 264)
 #define __NR_request_key       (__NR_Linux + 265)
 #define __NR_keyctl            (__NR_Linux + 266)
 #define __NR_kexec_load                (__NR_Linux + 300)
 #define __NR_utimensat         (__NR_Linux + 301)
 #define __NR_signalfd          (__NR_Linux + 302)
-#define __NR_timerfd           (__NR_Linux + 303)
+#define __NR_timerfd           (__NR_Linux + 303) /* not used */
 #define __NR_eventfd           (__NR_Linux + 304)
 #define __NR_fallocate         (__NR_Linux + 305)
 #define __NR_timerfd_create    (__NR_Linux + 306)
index 39127d3..baa3d9d 100644 (file)
         * boundary
         */
 
-       .text
+       .section .text.hot
        .align 2048
 
 ENTRY(fault_vector_20)
@@ -2019,6 +2019,7 @@ ftrace_stub:
        .procend
 ENDPROC(mcount)
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
        .align 8
        .globl return_to_handler
        .type  return_to_handler, @function
@@ -2040,11 +2041,17 @@ parisc_return_to_handler:
 #endif
 
        /* call ftrace_return_to_handler(0) */
+       .import ftrace_return_to_handler,code
+       load32 ftrace_return_to_handler,%ret0
+       load32 .Lftrace_ret,%r2
 #ifdef CONFIG_64BIT
        ldo -16(%sp),%ret1              /* Reference param save area */
+       bve     (%ret0)
+#else
+       bv      %r0(%ret0)
 #endif
-       BL ftrace_return_to_handler,%r2
        ldi 0,%r26
+.Lftrace_ret:
        copy %ret0,%rp
 
        /* restore original return values */
@@ -2062,6 +2069,8 @@ parisc_return_to_handler:
        .procend
 ENDPROC(return_to_handler)
 
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
 #endif /* CONFIG_FUNCTION_TRACER */
 
 #ifdef CONFIG_IRQSTACKS
index b13f9ec..a828a0a 100644 (file)
 #include <asm/ftrace.h>
 
 
+#define __hot __attribute__ ((__section__ (".text.hot")))
+
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 /*
  * Hook the return address and push it in the stack of return addrs
  * in current thread info.
  */
-static void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
+static void __hot prepare_ftrace_return(unsigned long *parent,
+                                       unsigned long self_addr)
 {
        unsigned long old;
        struct ftrace_graph_ent trace;
@@ -53,7 +56,7 @@ static void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr
 }
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
-void notrace ftrace_function_trampoline(unsigned long parent,
+void notrace __hot ftrace_function_trampoline(unsigned long parent,
                                unsigned long self_addr,
                                unsigned long org_sp_gr3)
 {
index e81ccf1..5adc339 100644 (file)
@@ -324,8 +324,9 @@ int init_per_cpu(int cpunum)
                per_cpu(cpu_data, cpunum).fp_rev = coproc_cfg.revision;
                per_cpu(cpu_data, cpunum).fp_model = coproc_cfg.model;
 
-               printk(KERN_INFO  "FP[%d] enabled: Rev %ld Model %ld\n",
-                       cpunum, coproc_cfg.revision, coproc_cfg.model);
+               if (cpunum == 0)
+                       printk(KERN_INFO  "FP[%d] enabled: Rev %ld Model %ld\n",
+                               cpunum, coproc_cfg.revision, coproc_cfg.model);
 
                /*
                ** store status register to stack (hopefully aligned)
index 8fb81a3..b5458b3 100644 (file)
@@ -4,18 +4,20 @@
  * Copyright (C) 2000 Hewlett-Packard Co, Linuxcare Inc.
  * Copyright (C) 2000 Matthew Wilcox <matthew@wil.cx>
  * Copyright (C) 2000 David Huggins-Daines <dhd@debian.org>
- * Copyright (C) 2008 Helge Deller <deller@gmx.de>
+ * Copyright (C) 2008-2016 Helge Deller <deller@gmx.de>
  */
 
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
+#include <linux/elf.h>
 #include <linux/errno.h>
 #include <linux/ptrace.h>
 #include <linux/tracehook.h>
 #include <linux/user.h>
 #include <linux/personality.h>
+#include <linux/regset.h>
 #include <linux/security.h>
 #include <linux/seccomp.h>
 #include <linux/compat.h>
 /* PSW bits we allow the debugger to modify */
 #define USER_PSW_BITS  (PSW_N | PSW_B | PSW_V | PSW_CB)
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/syscalls.h>
+
+/*
+ * These are our native regset flavors.
+ */
+enum parisc_regset {
+       REGSET_GENERAL,
+       REGSET_FP
+};
+
 /*
  * Called by kernel/ptrace.c when detaching..
  *
@@ -114,6 +127,7 @@ void user_enable_block_step(struct task_struct *task)
 long arch_ptrace(struct task_struct *child, long request,
                 unsigned long addr, unsigned long data)
 {
+       unsigned long __user *datap = (unsigned long __user *)data;
        unsigned long tmp;
        long ret = -EIO;
 
@@ -126,7 +140,7 @@ long arch_ptrace(struct task_struct *child, long request,
                     addr >= sizeof(struct pt_regs))
                        break;
                tmp = *(unsigned long *) ((char *) task_regs(child) + addr);
-               ret = put_user(tmp, (unsigned long __user *) data);
+               ret = put_user(tmp, datap);
                break;
 
        /* Write the word at location addr in the USER area.  This will need
@@ -165,6 +179,34 @@ long arch_ptrace(struct task_struct *child, long request,
                }
                break;
 
+       case PTRACE_GETREGS:    /* Get all gp regs from the child. */
+               return copy_regset_to_user(child,
+                                          task_user_regset_view(current),
+                                          REGSET_GENERAL,
+                                          0, sizeof(struct user_regs_struct),
+                                          datap);
+
+       case PTRACE_SETREGS:    /* Set all gp regs in the child. */
+               return copy_regset_from_user(child,
+                                            task_user_regset_view(current),
+                                            REGSET_GENERAL,
+                                            0, sizeof(struct user_regs_struct),
+                                            datap);
+
+       case PTRACE_GETFPREGS:  /* Get the child FPU state. */
+               return copy_regset_to_user(child,
+                                          task_user_regset_view(current),
+                                          REGSET_FP,
+                                          0, sizeof(struct user_fp_struct),
+                                          datap);
+
+       case PTRACE_SETFPREGS:  /* Set the child FPU state. */
+               return copy_regset_from_user(child,
+                                            task_user_regset_view(current),
+                                            REGSET_FP,
+                                            0, sizeof(struct user_fp_struct),
+                                            datap);
+
        default:
                ret = ptrace_request(child, request, addr, data);
                break;
@@ -283,6 +325,10 @@ long do_syscall_trace_enter(struct pt_regs *regs)
                regs->gr[20] = -1UL;
                goto out;
        }
+#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
+       if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+               trace_sys_enter(regs, regs->gr[20]);
+#endif
 
 #ifdef CONFIG_64BIT
        if (!is_compat_task())
@@ -311,6 +357,324 @@ void do_syscall_trace_exit(struct pt_regs *regs)
 
        audit_syscall_exit(regs);
 
+#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
+       if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+               trace_sys_exit(regs, regs->gr[20]);
+#endif
+
        if (stepping || test_thread_flag(TIF_SYSCALL_TRACE))
                tracehook_report_syscall_exit(regs, stepping);
 }
+
+
+/*
+ * regset functions.
+ */
+
+static int fpr_get(struct task_struct *target,
+                    const struct user_regset *regset,
+                    unsigned int pos, unsigned int count,
+                    void *kbuf, void __user *ubuf)
+{
+       struct pt_regs *regs = task_regs(target);
+       __u64 *k = kbuf;
+       __u64 __user *u = ubuf;
+       __u64 reg;
+
+       pos /= sizeof(reg);
+       count /= sizeof(reg);
+
+       if (kbuf)
+               for (; count > 0 && pos < ELF_NFPREG; --count)
+                       *k++ = regs->fr[pos++];
+       else
+               for (; count > 0 && pos < ELF_NFPREG; --count)
+                       if (__put_user(regs->fr[pos++], u++))
+                               return -EFAULT;
+
+       kbuf = k;
+       ubuf = u;
+       pos *= sizeof(reg);
+       count *= sizeof(reg);
+       return user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
+                                       ELF_NFPREG * sizeof(reg), -1);
+}
+
+static int fpr_set(struct task_struct *target,
+                    const struct user_regset *regset,
+                    unsigned int pos, unsigned int count,
+                    const void *kbuf, const void __user *ubuf)
+{
+       struct pt_regs *regs = task_regs(target);
+       const __u64 *k = kbuf;
+       const __u64 __user *u = ubuf;
+       __u64 reg;
+
+       pos /= sizeof(reg);
+       count /= sizeof(reg);
+
+       if (kbuf)
+               for (; count > 0 && pos < ELF_NFPREG; --count)
+                       regs->fr[pos++] = *k++;
+       else
+               for (; count > 0 && pos < ELF_NFPREG; --count) {
+                       if (__get_user(reg, u++))
+                               return -EFAULT;
+                       regs->fr[pos++] = reg;
+               }
+
+       kbuf = k;
+       ubuf = u;
+       pos *= sizeof(reg);
+       count *= sizeof(reg);
+       return user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+                                        ELF_NFPREG * sizeof(reg), -1);
+}
+
+#define RI(reg) (offsetof(struct user_regs_struct,reg) / sizeof(long))
+
+static unsigned long get_reg(struct pt_regs *regs, int num)
+{
+       switch (num) {
+       case RI(gr[0]) ... RI(gr[31]):  return regs->gr[num - RI(gr[0])];
+       case RI(sr[0]) ... RI(sr[7]):   return regs->sr[num - RI(sr[0])];
+       case RI(iasq[0]):               return regs->iasq[0];
+       case RI(iasq[1]):               return regs->iasq[1];
+       case RI(iaoq[0]):               return regs->iaoq[0];
+       case RI(iaoq[1]):               return regs->iaoq[1];
+       case RI(sar):                   return regs->sar;
+       case RI(iir):                   return regs->iir;
+       case RI(isr):                   return regs->isr;
+       case RI(ior):                   return regs->ior;
+       case RI(ipsw):                  return regs->ipsw;
+       case RI(cr27):                  return regs->cr27;
+       case RI(cr0):                   return mfctl(0);
+       case RI(cr24):                  return mfctl(24);
+       case RI(cr25):                  return mfctl(25);
+       case RI(cr26):                  return mfctl(26);
+       case RI(cr28):                  return mfctl(28);
+       case RI(cr29):                  return mfctl(29);
+       case RI(cr30):                  return mfctl(30);
+       case RI(cr31):                  return mfctl(31);
+       case RI(cr8):                   return mfctl(8);
+       case RI(cr9):                   return mfctl(9);
+       case RI(cr12):                  return mfctl(12);
+       case RI(cr13):                  return mfctl(13);
+       case RI(cr10):                  return mfctl(10);
+       case RI(cr15):                  return mfctl(15);
+       default:                        return 0;
+       }
+}
+
+static void set_reg(struct pt_regs *regs, int num, unsigned long val)
+{
+       switch (num) {
+       case RI(gr[0]): /*
+                        * PSW is in gr[0].
+                        * Allow writing to Nullify, Divide-step-correction,
+                        * and carry/borrow bits.
+                        * BEWARE, if you set N, and then single step, it won't
+                        * stop on the nullified instruction.
+                        */
+                       val &= USER_PSW_BITS;
+                       regs->gr[0] &= ~USER_PSW_BITS;
+                       regs->gr[0] |= val;
+                       return;
+       case RI(gr[1]) ... RI(gr[31]):
+                       regs->gr[num - RI(gr[0])] = val;
+                       return;
+       case RI(iaoq[0]):
+       case RI(iaoq[1]):
+                       regs->iaoq[num - RI(iaoq[0])] = val;
+                       return;
+       case RI(sar):   regs->sar = val;
+                       return;
+       default:        return;
+#if 0
+       /* do not allow to change any of the following registers (yet) */
+       case RI(sr[0]) ... RI(sr[7]):   return regs->sr[num - RI(sr[0])];
+       case RI(iasq[0]):               return regs->iasq[0];
+       case RI(iasq[1]):               return regs->iasq[1];
+       case RI(iir):                   return regs->iir;
+       case RI(isr):                   return regs->isr;
+       case RI(ior):                   return regs->ior;
+       case RI(ipsw):                  return regs->ipsw;
+       case RI(cr27):                  return regs->cr27;
+        case cr0, cr24, cr25, cr26, cr27, cr28, cr29, cr30, cr31;
+        case cr8, cr9, cr12, cr13, cr10, cr15;
+#endif
+       }
+}
+
+static int gpr_get(struct task_struct *target,
+                    const struct user_regset *regset,
+                    unsigned int pos, unsigned int count,
+                    void *kbuf, void __user *ubuf)
+{
+       struct pt_regs *regs = task_regs(target);
+       unsigned long *k = kbuf;
+       unsigned long __user *u = ubuf;
+       unsigned long reg;
+
+       pos /= sizeof(reg);
+       count /= sizeof(reg);
+
+       if (kbuf)
+               for (; count > 0 && pos < ELF_NGREG; --count)
+                       *k++ = get_reg(regs, pos++);
+       else
+               for (; count > 0 && pos < ELF_NGREG; --count)
+                       if (__put_user(get_reg(regs, pos++), u++))
+                               return -EFAULT;
+       kbuf = k;
+       ubuf = u;
+       pos *= sizeof(reg);
+       count *= sizeof(reg);
+       return user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
+                                       ELF_NGREG * sizeof(reg), -1);
+}
+
+static int gpr_set(struct task_struct *target,
+                    const struct user_regset *regset,
+                    unsigned int pos, unsigned int count,
+                    const void *kbuf, const void __user *ubuf)
+{
+       struct pt_regs *regs = task_regs(target);
+       const unsigned long *k = kbuf;
+       const unsigned long __user *u = ubuf;
+       unsigned long reg;
+
+       pos /= sizeof(reg);
+       count /= sizeof(reg);
+
+       if (kbuf)
+               for (; count > 0 && pos < ELF_NGREG; --count)
+                       set_reg(regs, pos++, *k++);
+       else
+               for (; count > 0 && pos < ELF_NGREG; --count) {
+                       if (__get_user(reg, u++))
+                               return -EFAULT;
+                       set_reg(regs, pos++, reg);
+               }
+
+       kbuf = k;
+       ubuf = u;
+       pos *= sizeof(reg);
+       count *= sizeof(reg);
+       return user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+                                        ELF_NGREG * sizeof(reg), -1);
+}
+
+static const struct user_regset native_regsets[] = {
+       [REGSET_GENERAL] = {
+               .core_note_type = NT_PRSTATUS, .n = ELF_NGREG,
+               .size = sizeof(long), .align = sizeof(long),
+               .get = gpr_get, .set = gpr_set
+       },
+       [REGSET_FP] = {
+               .core_note_type = NT_PRFPREG, .n = ELF_NFPREG,
+               .size = sizeof(__u64), .align = sizeof(__u64),
+               .get = fpr_get, .set = fpr_set
+       }
+};
+
+static const struct user_regset_view user_parisc_native_view = {
+       .name = "parisc", .e_machine = ELF_ARCH, .ei_osabi = ELFOSABI_LINUX,
+       .regsets = native_regsets, .n = ARRAY_SIZE(native_regsets)
+};
+
+#ifdef CONFIG_64BIT
+#include <linux/compat.h>
+
+static int gpr32_get(struct task_struct *target,
+                    const struct user_regset *regset,
+                    unsigned int pos, unsigned int count,
+                    void *kbuf, void __user *ubuf)
+{
+       struct pt_regs *regs = task_regs(target);
+       compat_ulong_t *k = kbuf;
+       compat_ulong_t __user *u = ubuf;
+       compat_ulong_t reg;
+
+       pos /= sizeof(reg);
+       count /= sizeof(reg);
+
+       if (kbuf)
+               for (; count > 0 && pos < ELF_NGREG; --count)
+                       *k++ = get_reg(regs, pos++);
+       else
+               for (; count > 0 && pos < ELF_NGREG; --count)
+                       if (__put_user((compat_ulong_t) get_reg(regs, pos++), u++))
+                               return -EFAULT;
+
+       kbuf = k;
+       ubuf = u;
+       pos *= sizeof(reg);
+       count *= sizeof(reg);
+       return user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
+                                       ELF_NGREG * sizeof(reg), -1);
+}
+
+static int gpr32_set(struct task_struct *target,
+                    const struct user_regset *regset,
+                    unsigned int pos, unsigned int count,
+                    const void *kbuf, const void __user *ubuf)
+{
+       struct pt_regs *regs = task_regs(target);
+       const compat_ulong_t *k = kbuf;
+       const compat_ulong_t __user *u = ubuf;
+       compat_ulong_t reg;
+
+       pos /= sizeof(reg);
+       count /= sizeof(reg);
+
+       if (kbuf)
+               for (; count > 0 && pos < ELF_NGREG; --count)
+                       set_reg(regs, pos++, *k++);
+       else
+               for (; count > 0 && pos < ELF_NGREG; --count) {
+                       if (__get_user(reg, u++))
+                               return -EFAULT;
+                       set_reg(regs, pos++, reg);
+               }
+
+       kbuf = k;
+       ubuf = u;
+       pos *= sizeof(reg);
+       count *= sizeof(reg);
+       return user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+                                        ELF_NGREG * sizeof(reg), -1);
+}
+
+/*
+ * These are the regset flavors matching the 32bit native set.
+ */
+static const struct user_regset compat_regsets[] = {
+       [REGSET_GENERAL] = {
+               .core_note_type = NT_PRSTATUS, .n = ELF_NGREG,
+               .size = sizeof(compat_long_t), .align = sizeof(compat_long_t),
+               .get = gpr32_get, .set = gpr32_set
+       },
+       [REGSET_FP] = {
+               .core_note_type = NT_PRFPREG, .n = ELF_NFPREG,
+               .size = sizeof(__u64), .align = sizeof(__u64),
+               .get = fpr_get, .set = fpr_set
+       }
+};
+
+static const struct user_regset_view user_parisc_compat_view = {
+       .name = "parisc", .e_machine = EM_PARISC, .ei_osabi = ELFOSABI_LINUX,
+       .regsets = compat_regsets, .n = ARRAY_SIZE(compat_regsets)
+};
+#endif /* CONFIG_64BIT */
+
+const struct user_regset_view *task_user_regset_view(struct task_struct *task)
+{
+       BUILD_BUG_ON(sizeof(struct user_regs_struct)/sizeof(long) != ELF_NGREG);
+       BUILD_BUG_ON(sizeof(struct user_fp_struct)/sizeof(__u64) != ELF_NFPREG);
+#ifdef CONFIG_64BIT
+       if (is_compat_task())
+               return &user_parisc_compat_view;
+#endif
+       return &user_parisc_native_view;
+}
index 57b4836..d03422e 100644 (file)
@@ -912,6 +912,7 @@ END(lws_table)
 
        .align 8
 ENTRY(sys_call_table)
+       .export sys_call_table,data
 #include "syscall_table.S"
 END(sys_call_table)
 
index 400acac..31ec99a 100644 (file)
 
 static unsigned long clocktick __read_mostly;  /* timer cycles per tick */
 
+#ifndef CONFIG_64BIT
+/*
+ * The processor-internal cycle counter (Control Register 16) is used as the
+ * time source for the sched_clock() function.  This register is 64 bits wide
+ * on a 64-bit kernel and 32 bits wide on a 32-bit kernel. Since sched_clock()
+ * always requires a 64-bit counter, on a 32-bit kernel we emulate the upper
+ * 32 bits with a per-cpu variable which we increment every time the counter
+ * wraps around (which happens every ~4 seconds).
+ */
+static DEFINE_PER_CPU(unsigned long, cr16_high_32_bits);
+#endif
+
 /*
  * We keep time on PA-RISC Linux by using the Interval Timer which is
  * a pair of registers; one is read-only and one is write-only; both
@@ -108,6 +120,12 @@ irqreturn_t __irq_entry timer_interrupt(int irq, void *dev_id)
         */
        mtctl(next_tick, 16);
 
+#if !defined(CONFIG_64BIT)
+       /* check for overflow on a 32bit kernel (every ~4 seconds). */
+       if (unlikely(next_tick < now))
+               this_cpu_inc(cr16_high_32_bits);
+#endif
+
        /* Skip one clocktick on purpose if we missed next_tick.
         * The new CR16 must be "later" than current CR16 otherwise
         * itimer would not fire until CR16 wrapped - e.g 4 seconds
@@ -219,6 +237,12 @@ void __init start_cpu_itimer(void)
        unsigned int cpu = smp_processor_id();
        unsigned long next_tick = mfctl(16) + clocktick;
 
+#if defined(CONFIG_HAVE_UNSTABLE_SCHED_CLOCK) && defined(CONFIG_64BIT)
+       /* With multiple 64-bit CPUs online, the cr16 registers are not synchronized. */
+       if (cpu != 0)
+               clear_sched_clock_stable();
+#endif
+
        mtctl(next_tick, 16);           /* kick off Interval Timer (CR16) */
 
        per_cpu(cpu_data, cpu).it_value = next_tick;
@@ -246,15 +270,47 @@ void read_persistent_clock(struct timespec *ts)
        }
 }
 
+
+/*
+ * sched_clock() framework
+ */
+
+static u32 cyc2ns_mul __read_mostly;
+static u32 cyc2ns_shift __read_mostly;
+
+u64 sched_clock(void)
+{
+       u64 now;
+
+       /* Get current cycle counter (Control Register 16). */
+#ifdef CONFIG_64BIT
+       now = mfctl(16);
+#else
+       now = mfctl(16) + (((u64) this_cpu_read(cr16_high_32_bits)) << 32);
+#endif
+
+       /* return the value in ns (cycles_2_ns) */
+       return mul_u64_u32_shr(now, cyc2ns_mul, cyc2ns_shift);
+}
+
+
+/*
+ * timer interrupt and sched_clock() initialization
+ */
+
 void __init time_init(void)
 {
        unsigned long current_cr16_khz;
 
+       current_cr16_khz = PAGE0->mem_10msec/10;  /* kHz */
        clocktick = (100 * PAGE0->mem_10msec) / HZ;
 
+       /* calculate mult/shift values for cr16 */
+       clocks_calc_mult_shift(&cyc2ns_mul, &cyc2ns_shift, current_cr16_khz,
+                               NSEC_PER_MSEC, 0);
+
        start_cpu_itimer();     /* get CPU 0 started */
 
        /* register at clocksource framework */
-       current_cr16_khz = PAGE0->mem_10msec/10;  /* kHz */
        clocksource_register_khz(&clocksource_cr16, current_cr16_khz);
 }
index d7c0acb..2b65c01 100644 (file)
@@ -28,6 +28,7 @@
 #include <linux/ratelimit.h>
 #include <asm/uaccess.h>
 #include <asm/hardirq.h>
+#include <asm/traps.h>
 
 /* #define DEBUG_UNALIGNED 1 */
 
 
 int unaligned_enabled __read_mostly = 1;
 
-void die_if_kernel (char *str, struct pt_regs *regs, long err);
-
 static int emulate_ldh(struct pt_regs *regs, int toreg)
 {
        unsigned long saddr = regs->ior;
@@ -666,7 +665,7 @@ void handle_unaligned(struct pt_regs *regs)
                break;
        }
 
-       if (modify && R1(regs->iir))
+       if (ret == 0 && modify && R1(regs->iir))
                regs->gr[R1(regs->iir)] = newbase;
 
 
@@ -677,6 +676,14 @@ void handle_unaligned(struct pt_regs *regs)
 
        if (ret)
        {
+               /*
+                * The unaligned handler failed.
+                * If we were called from __get_user() or __put_user(), jump
+                * to its exception fixup handler instead of crashing.
+                */
+               if (!user_mode(regs) && fixup_exception(regs))
+                       return;
+
                printk(KERN_CRIT "Unaligned handler failed, ret = %d\n", ret);
                die_if_kernel("Unaligned data reference", regs, 28);
 
index ddd988b..e278a87 100644 (file)
@@ -75,7 +75,10 @@ find_unwind_entry(unsigned long addr)
        if (addr >= kernel_unwind_table.start && 
            addr <= kernel_unwind_table.end)
                e = find_unwind_entry_in_table(&kernel_unwind_table, addr);
-       else 
+       else {
+               unsigned long flags;
+
+               spin_lock_irqsave(&unwind_lock, flags);
                list_for_each_entry(table, &unwind_tables, list) {
                        if (addr >= table->start && 
                            addr <= table->end)
@@ -86,6 +89,8 @@ find_unwind_entry(unsigned long addr)
                                break;
                        }
                }
+               spin_unlock_irqrestore(&unwind_lock, flags);
+       }
 
        return e;
 }
@@ -303,18 +308,16 @@ static void unwind_frame_regs(struct unwind_frame_info *info)
 
                        insn = *(unsigned int *)npc;
 
-                       if ((insn & 0xffffc000) == 0x37de0000 ||
-                           (insn & 0xffe00000) == 0x6fc00000) {
+                       if ((insn & 0xffffc001) == 0x37de0000 ||
+                           (insn & 0xffe00001) == 0x6fc00000) {
                                /* ldo X(sp), sp, or stwm X,D(sp) */
-                               frame_size += (insn & 0x1 ? -1 << 13 : 0) | 
-                                       ((insn & 0x3fff) >> 1);
+                               frame_size += (insn & 0x3fff) >> 1;
                                dbg("analyzing func @ %lx, insn=%08x @ "
                                    "%lx, frame_size = %ld\n", info->ip,
                                    insn, npc, frame_size);
-                       } else if ((insn & 0xffe00008) == 0x73c00008) {
+                       } else if ((insn & 0xffe00009) == 0x73c00008) {
                                /* std,ma X,D(sp) */
-                               frame_size += (insn & 0x1 ? -1 << 13 : 0) | 
-                                       (((insn >> 4) & 0x3ff) << 3);
+                               frame_size += ((insn >> 4) & 0x3ff) << 3;
                                dbg("analyzing func @ %lx, insn=%08x @ "
                                    "%lx, frame_size = %ld\n", info->ip,
                                    insn, npc, frame_size);
@@ -333,6 +336,9 @@ static void unwind_frame_regs(struct unwind_frame_info *info)
                        }
                }
 
+               if (frame_size > e->Total_frame_size << 3)
+                       frame_size = e->Total_frame_size << 3;
+
                if (!unwind_special(info, e->region_start, frame_size)) {
                        info->prev_sp = info->sp - frame_size;
                        if (e->Millicode)
index 1871188..8e45b0a 100644 (file)
@@ -55,11 +55,10 @@ unsigned long __xchg8(char x, char *ptr)
 }
 
 
-#ifdef CONFIG_64BIT
-unsigned long __cmpxchg_u64(volatile unsigned long *ptr, unsigned long old, unsigned long new)
+u64 __cmpxchg_u64(volatile u64 *ptr, u64 old, u64 new)
 {
        unsigned long flags;
-       unsigned long prev;
+       u64 prev;
 
        _atomic_spin_lock_irqsave(ptr, flags);
        if ((prev = *ptr) == old)
@@ -67,7 +66,6 @@ unsigned long __cmpxchg_u64(volatile unsigned long *ptr, unsigned long old, unsi
        _atomic_spin_unlock_irqrestore(ptr, flags);
        return prev;
 }
-#endif
 
 unsigned long __cmpxchg_u32(volatile unsigned int *ptr, unsigned int old, unsigned int new)
 {
index 673b73e..18df123 100644 (file)
@@ -184,7 +184,7 @@ static void parisc_linux_get_fpu_type(u_int fpregs[])
 
 /*
  * this routine will decode the excepting floating point instruction and
- * call the approiate emulation routine.
+ * call the appropriate emulation routine.
  * It is called by decode_fpu with the following parameters:
  * fpudispatch(current_ir, unimplemented_code, 0, &Fpu_register)
  * where current_ir is the instruction to be emulated,
index c1e82e9..a0948f4 100644 (file)
 #define   MMCR0_FCWAIT 0x00000002UL /* freeze counter in WAIT state */
 #define   MMCR0_FCHV   0x00000001UL /* freeze conditions in hypervisor mode */
 #define SPRN_MMCR1     798
-#define SPRN_MMCR2     769
+#define SPRN_MMCR2     785
 #define SPRN_MMCRA     0x312
 #define   MMCRA_SDSYNC 0x80000000UL /* SDAR synced with SIAR */
 #define   MMCRA_SDAR_DCACHE_MISS 0x40000000UL
 #define SPRN_PMC6      792
 #define SPRN_PMC7      793
 #define SPRN_PMC8      794
-#define SPRN_SIAR      780
-#define SPRN_SDAR      781
 #define SPRN_SIER      784
 #define   SIER_SIPR            0x2000000       /* Sampled MSR_PR */
 #define   SIER_SIHV            0x1000000       /* Sampled MSR_HV */
 #define   SIER_SIAR_VALID      0x0400000       /* SIAR contents valid */
 #define   SIER_SDAR_VALID      0x0200000       /* SDAR contents valid */
+#define SPRN_SIAR      796
+#define SPRN_SDAR      797
 #define SPRN_TACR      888
 #define SPRN_TCSCR     889
 #define SPRN_CSIGR     890
index da51925..ccd2037 100644 (file)
@@ -656,6 +656,7 @@ unsigned char ibm_architecture_vec[] = {
        W(0xffff0000), W(0x003e0000),   /* POWER6 */
        W(0xffff0000), W(0x003f0000),   /* POWER7 */
        W(0xffff0000), W(0x004b0000),   /* POWER8E */
+       W(0xffff0000), W(0x004c0000),   /* POWER8NVL */
        W(0xffff0000), W(0x004d0000),   /* POWER8 */
        W(0xffffffff), W(0x0f000004),   /* all 2.07-compliant */
        W(0xffffffff), W(0x0f000003),   /* all 2.06-compliant */
index 5926896..b2740c6 100644 (file)
@@ -159,6 +159,19 @@ static struct mmu_psize_def mmu_psize_defaults_gp[] = {
        },
 };
 
+/*
+ * 'R' and 'C' update notes:
+ *  - Under pHyp or KVM, the updatepp path will not set C, thus it *will*
+ *    create writeable HPTEs without C set, because the hcall H_PROTECT
+ *    that we use in that case will not update C.
+ *  - That is not a problem, however, because we also don't use the fancy
+ *    "no flush" variant of eviction; we use H_REMOVE, which does the
+ *    right thing, so we avoid that race there.
+ *
+ *  - Under bare metal, we do have the race, so we need R and C set.
+ *  - We make sure R is always set and never lost.
+ *  - C is _PAGE_DIRTY, and *should* always be set for a writeable mapping.
+ */
 unsigned long htab_convert_pte_flags(unsigned long pteflags)
 {
        unsigned long rflags = 0;
@@ -186,9 +199,14 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags)
                        rflags |= 0x1;
        }
        /*
-        * Always add "C" bit for perf. Memory coherence is always enabled
+        * We can't allow hardware to update hpte bits. Hence always
+        * set 'R' bit and set 'C' if it is a write fault
+        * Memory coherence is always enabled
         */
-       rflags |=  HPTE_R_C | HPTE_R_M;
+       rflags |=  HPTE_R_R | HPTE_R_M;
+
+       if (pteflags & _PAGE_DIRTY)
+               rflags |= HPTE_R_C;
        /*
         * Add in WIG bits
         */
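The notes above lay out the R/C policy: keep the reference bit set so hardware never has to update the HPTE, and set the change bit only when the Linux PTE is already dirty. A hedged restatement of just that flag logic, not the full htab_convert_pte_flags() from the patch; the helper name is invented:

static unsigned long hpte_rc_bits(unsigned long pteflags)
{
	/* R always set (plus M for memory coherence); hardware never updates it */
	unsigned long rflags = HPTE_R_R | HPTE_R_M;

	/* C mirrors _PAGE_DIRTY: only already-dirty writeable mappings get it */
	if (pteflags & _PAGE_DIRTY)
		rflags |= HPTE_R_C;

	return rflags;
}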
index eb44511..6703187 100644 (file)
@@ -33,10 +33,7 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
        changed = !pmd_same(*(pmdp), entry);
        if (changed) {
                __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry));
-               /*
-                * Since we are not supporting SW TLB systems, we don't
-                * have any thing similar to flush_tlb_page_nohash()
-                */
+               flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
        }
        return changed;
 }
index 18b2c11..c939e6e 100644 (file)
@@ -296,11 +296,6 @@ found:
 void __init radix__early_init_mmu(void)
 {
        unsigned long lpcr;
-       /*
-        * setup LPCR UPRT based on mmu_features
-        */
-       lpcr = mfspr(SPRN_LPCR);
-       mtspr(SPRN_LPCR, lpcr | LPCR_UPRT);
 
 #ifdef CONFIG_PPC_64K_PAGES
        /* PAGE_SIZE mappings */
@@ -343,8 +338,11 @@ void __init radix__early_init_mmu(void)
        __pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT;
 
        radix_init_page_sizes();
-       if (!firmware_has_feature(FW_FEATURE_LPAR))
+       if (!firmware_has_feature(FW_FEATURE_LPAR)) {
+               lpcr = mfspr(SPRN_LPCR);
+               mtspr(SPRN_LPCR, lpcr | LPCR_UPRT);
                radix_init_partition_table();
+       }
 
        radix_init_pgtable();
 }
@@ -353,16 +351,15 @@ void radix__early_init_mmu_secondary(void)
 {
        unsigned long lpcr;
        /*
-        * setup LPCR UPRT based on mmu_features
+        * update partition table control register and UPRT
         */
-       lpcr = mfspr(SPRN_LPCR);
-       mtspr(SPRN_LPCR, lpcr | LPCR_UPRT);
-       /*
-        * update partition table control register, 64 K size.
-        */
-       if (!firmware_has_feature(FW_FEATURE_LPAR))
+       if (!firmware_has_feature(FW_FEATURE_LPAR)) {
+               lpcr = mfspr(SPRN_LPCR);
+               mtspr(SPRN_LPCR, lpcr | LPCR_UPRT);
+
                mtspr(SPRN_PTCR,
                      __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
+       }
 }
 
 void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
index 26d37e6..0fc2671 100644 (file)
@@ -47,7 +47,7 @@ static int valid_next_sp(unsigned long sp, unsigned long prev_sp)
 }
 
 void
-perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
        unsigned long sp, next_sp;
        unsigned long next_ip;
@@ -76,7 +76,7 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
                        next_ip = regs->nip;
                        lr = regs->link;
                        level = 0;
-                       perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
+                       perf_callchain_store_context(entry, PERF_CONTEXT_KERNEL);
 
                } else {
                        if (level == 0)
@@ -232,7 +232,7 @@ static int sane_signal_64_frame(unsigned long sp)
                puc == (unsigned long) &sf->uc;
 }
 
-static void perf_callchain_user_64(struct perf_callchain_entry *entry,
+static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
                                   struct pt_regs *regs)
 {
        unsigned long sp, next_sp;
@@ -247,7 +247,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
        sp = regs->gpr[1];
        perf_callchain_store(entry, next_ip);
 
-       while (entry->nr < sysctl_perf_event_max_stack) {
+       while (entry->nr < entry->max_stack) {
                fp = (unsigned long __user *) sp;
                if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp))
                        return;
@@ -274,7 +274,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
                            read_user_stack_64(&uregs[PT_R1], &sp))
                                return;
                        level = 0;
-                       perf_callchain_store(entry, PERF_CONTEXT_USER);
+                       perf_callchain_store_context(entry, PERF_CONTEXT_USER);
                        perf_callchain_store(entry, next_ip);
                        continue;
                }
@@ -319,7 +319,7 @@ static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret)
        return rc;
 }
 
-static inline void perf_callchain_user_64(struct perf_callchain_entry *entry,
+static inline void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
                                          struct pt_regs *regs)
 {
 }
@@ -439,7 +439,7 @@ static unsigned int __user *signal_frame_32_regs(unsigned int sp,
        return mctx->mc_gregs;
 }
 
-static void perf_callchain_user_32(struct perf_callchain_entry *entry,
+static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry,
                                   struct pt_regs *regs)
 {
        unsigned int sp, next_sp;
@@ -453,7 +453,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
        sp = regs->gpr[1];
        perf_callchain_store(entry, next_ip);
 
-       while (entry->nr < sysctl_perf_event_max_stack) {
+       while (entry->nr < entry->max_stack) {
                fp = (unsigned int __user *) (unsigned long) sp;
                if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp))
                        return;
@@ -473,7 +473,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
                            read_user_stack_32(&uregs[PT_R1], &sp))
                                return;
                        level = 0;
-                       perf_callchain_store(entry, PERF_CONTEXT_USER);
+                       perf_callchain_store_context(entry, PERF_CONTEXT_USER);
                        perf_callchain_store(entry, next_ip);
                        continue;
                }
@@ -487,7 +487,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
 }
 
 void
-perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
        if (current_is_64bit())
                perf_callchain_user_64(entry, regs);
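Across the callchain hunks in this merge, the arch walkers take a struct perf_callchain_entry_ctx and bound the loop with the per-event entry->max_stack instead of the global sysctl_perf_event_max_stack. A minimal sketch of the resulting loop shape, assuming a hypothetical next_frame() helper in place of the arch-specific frame read:

/*
 * Sketch only (not the powerpc code above): next_frame() is a
 * hypothetical stand-in for loading the saved frame pointer and
 * return address; perf_callchain_store() and
 * struct perf_callchain_entry_ctx are the real 4.7 interfaces,
 * with ctx->nr counting stored entries and ctx->max_stack holding
 * the per-event limit.
 */
#include <linux/perf_event.h>

static void sketch_stack_walk(struct perf_callchain_entry_ctx *entry,
			      unsigned long sp)
{
	unsigned long ip;

	while (entry->nr < entry->max_stack) {
		if (next_frame(&sp, &ip))	/* hypothetical helper */
			return;			/* unreadable or bogus frame */
		perf_callchain_store(entry, ip);
	}
}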
index ac3ffd9..3998e0f 100644 (file)
@@ -53,7 +53,6 @@ static int ibm_read_slot_reset_state2;
 static int ibm_slot_error_detail;
 static int ibm_get_config_addr_info;
 static int ibm_get_config_addr_info2;
-static int ibm_configure_bridge;
 static int ibm_configure_pe;
 
 /*
@@ -81,7 +80,14 @@ static int pseries_eeh_init(void)
        ibm_get_config_addr_info2       = rtas_token("ibm,get-config-addr-info2");
        ibm_get_config_addr_info        = rtas_token("ibm,get-config-addr-info");
        ibm_configure_pe                = rtas_token("ibm,configure-pe");
-       ibm_configure_bridge            = rtas_token("ibm,configure-bridge");
+
+       /*
+        * ibm,configure-pe and ibm,configure-bridge have the same semantics,
+        * however ibm,configure-pe can be faster.  If we can't find
+        * ibm,configure-pe then fall back to using ibm,configure-bridge.
+        */
+       if (ibm_configure_pe == RTAS_UNKNOWN_SERVICE)
+               ibm_configure_pe        = rtas_token("ibm,configure-bridge");
 
        /*
         * Necessary sanity check. We needn't check "get-config-addr-info"
@@ -93,8 +99,7 @@ static int pseries_eeh_init(void)
            (ibm_read_slot_reset_state2 == RTAS_UNKNOWN_SERVICE &&
             ibm_read_slot_reset_state == RTAS_UNKNOWN_SERVICE) ||
            ibm_slot_error_detail == RTAS_UNKNOWN_SERVICE       ||
-           (ibm_configure_pe == RTAS_UNKNOWN_SERVICE           &&
-            ibm_configure_bridge == RTAS_UNKNOWN_SERVICE)) {
+           ibm_configure_pe == RTAS_UNKNOWN_SERVICE) {
                pr_info("EEH functionality not supported\n");
                return -EINVAL;
        }
@@ -615,29 +620,41 @@ static int pseries_eeh_configure_bridge(struct eeh_pe *pe)
 {
        int config_addr;
        int ret;
+       /* Waiting 0.2s maximum before skipping configuration */
+       int max_wait = 200;
 
        /* Figure out the PE address */
        config_addr = pe->config_addr;
        if (pe->addr)
                config_addr = pe->addr;
 
-       /* Use new configure-pe function, if supported */
-       if (ibm_configure_pe != RTAS_UNKNOWN_SERVICE) {
+       while (max_wait > 0) {
                ret = rtas_call(ibm_configure_pe, 3, 1, NULL,
                                config_addr, BUID_HI(pe->phb->buid),
                                BUID_LO(pe->phb->buid));
-       } else if (ibm_configure_bridge != RTAS_UNKNOWN_SERVICE) {
-               ret = rtas_call(ibm_configure_bridge, 3, 1, NULL,
-                               config_addr, BUID_HI(pe->phb->buid),
-                               BUID_LO(pe->phb->buid));
-       } else {
-               return -EFAULT;
-       }
 
-       if (ret)
-               pr_warn("%s: Unable to configure bridge PHB#%d-PE#%x (%d)\n",
-                       __func__, pe->phb->global_number, pe->addr, ret);
+               if (!ret)
+                       return ret;
+
+               /*
+                * If RTAS returns a delay value that's above 100ms, cut it
+                * down to 100ms in case firmware made a mistake.  For more
+                * on how these delay values work see rtas_busy_delay_time
+                */
+               if (ret > RTAS_EXTENDED_DELAY_MIN+2 &&
+                   ret <= RTAS_EXTENDED_DELAY_MAX)
+                       ret = RTAS_EXTENDED_DELAY_MIN+2;
+
+               max_wait -= rtas_busy_delay_time(ret);
+
+               if (max_wait < 0)
+                       break;
+
+               rtas_busy_delay(ret);
+       }
 
+       pr_warn("%s: Unable to configure bridge PHB#%d-PE#%x (%d)\n",
+               __func__, pe->phb->global_number, pe->addr, ret);
        return ret;
 }
 
index 0d112b9..ff75d70 100644 (file)
@@ -143,7 +143,7 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio)
  */
 static long
 axon_ram_direct_access(struct block_device *device, sector_t sector,
-                      void __pmem **kaddr, pfn_t *pfn)
+                      void __pmem **kaddr, pfn_t *pfn, long size)
 {
        struct axon_ram_bank *bank = device->bd_disk->private_data;
        loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT;
index fac6ac9..1dd2103 100644 (file)
@@ -22,7 +22,6 @@ OBJECTS += $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o
 LDFLAGS_vmlinux := --oformat $(LD_BFD) -e startup -T
 $(obj)/vmlinux: $(obj)/vmlinux.lds $(OBJECTS)
        $(call if_changed,ld)
-       @:
 
 sed-sizes := -e 's/^\([0-9a-fA-F]*\) . \(__bss_start\|_end\)$$/\#define SZ\2 0x\1/p'
 
index 0ac42cc..d5ec71b 100644 (file)
@@ -1,8 +1,7 @@
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
-CONFIG_FHANDLE=y
 CONFIG_AUDIT=y
-CONFIG_NO_HZ=y
+CONFIG_NO_HZ_IDLE=y
 CONFIG_HIGH_RES_TIMERS=y
 CONFIG_BSD_PROCESS_ACCT=y
 CONFIG_BSD_PROCESS_ACCT_V3=y
@@ -13,19 +12,19 @@ CONFIG_TASK_IO_ACCOUNTING=y
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_NUMA_BALANCING=y
-CONFIG_CGROUP_FREEZER=y
-CONFIG_CGROUP_PIDS=y
-CONFIG_CGROUP_DEVICE=y
-CONFIG_CPUSETS=y
-CONFIG_CGROUP_CPUACCT=y
 CONFIG_MEMCG=y
 CONFIG_MEMCG_SWAP=y
-CONFIG_MEMCG_KMEM=y
-CONFIG_CGROUP_HUGETLB=y
-CONFIG_CGROUP_PERF=y
+CONFIG_BLK_CGROUP=y
 CONFIG_CFS_BANDWIDTH=y
 CONFIG_RT_GROUP_SCHED=y
-CONFIG_BLK_CGROUP=y
+CONFIG_CGROUP_PIDS=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_HUGETLB=y
+CONFIG_CPUSETS=y
+CONFIG_CGROUP_DEVICE=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_PERF=y
+CONFIG_CHECKPOINT_RESTORE=y
 CONFIG_NAMESPACES=y
 CONFIG_USER_NS=y
 CONFIG_SCHED_AUTOGROUP=y
@@ -55,7 +54,6 @@ CONFIG_UNIXWARE_DISKLABEL=y
 CONFIG_CFQ_GROUP_IOSCHED=y
 CONFIG_DEFAULT_DEADLINE=y
 CONFIG_LIVEPATCH=y
-CONFIG_MARCH_Z196=y
 CONFIG_TUNE_ZEC12=y
 CONFIG_NR_CPUS=256
 CONFIG_NUMA=y
@@ -65,6 +63,15 @@ CONFIG_MEMORY_HOTPLUG=y
 CONFIG_MEMORY_HOTREMOVE=y
 CONFIG_KSM=y
 CONFIG_TRANSPARENT_HUGEPAGE=y
+CONFIG_CLEANCACHE=y
+CONFIG_FRONTSWAP=y
+CONFIG_CMA=y
+CONFIG_MEM_SOFT_DIRTY=y
+CONFIG_ZPOOL=m
+CONFIG_ZBUD=m
+CONFIG_ZSMALLOC=m
+CONFIG_ZSMALLOC_STAT=y
+CONFIG_IDLE_PAGE_TRACKING=y
 CONFIG_PCI=y
 CONFIG_PCI_DEBUG=y
 CONFIG_HOTPLUG_PCI=y
@@ -452,6 +459,7 @@ CONFIG_HW_RANDOM_VIRTIO=m
 CONFIG_RAW_DRIVER=m
 CONFIG_HANGCHECK_TIMER=m
 CONFIG_TN3270_FS=y
+# CONFIG_HWMON is not set
 CONFIG_WATCHDOG=y
 CONFIG_WATCHDOG_NOWAYOUT=y
 CONFIG_SOFT_WATCHDOG=m
@@ -537,6 +545,8 @@ CONFIG_DLM=m
 CONFIG_PRINTK_TIME=y
 CONFIG_DYNAMIC_DEBUG=y
 CONFIG_DEBUG_INFO=y
+CONFIG_DEBUG_INFO_DWARF4=y
+CONFIG_GDB_SCRIPTS=y
 CONFIG_FRAME_WARN=1024
 CONFIG_READABLE_ASM=y
 CONFIG_UNUSED_SYMBOLS=y
@@ -555,13 +565,17 @@ CONFIG_SLUB_DEBUG_ON=y
 CONFIG_SLUB_STATS=y
 CONFIG_DEBUG_STACK_USAGE=y
 CONFIG_DEBUG_VM=y
+CONFIG_DEBUG_VM_VMACACHE=y
 CONFIG_DEBUG_VM_RB=y
+CONFIG_DEBUG_VM_PGFLAGS=y
 CONFIG_DEBUG_MEMORY_INIT=y
 CONFIG_MEMORY_NOTIFIER_ERROR_INJECT=m
 CONFIG_DEBUG_PER_CPU_MAPS=y
 CONFIG_DEBUG_SHIRQ=y
 CONFIG_DETECT_HUNG_TASK=y
+CONFIG_WQ_WATCHDOG=y
 CONFIG_PANIC_ON_OOPS=y
+CONFIG_DEBUG_TIMEKEEPING=y
 CONFIG_TIMER_STATS=y
 CONFIG_DEBUG_RT_MUTEXES=y
 CONFIG_DEBUG_WW_MUTEX_SLOWPATH=y
@@ -596,6 +610,8 @@ CONFIG_FTRACE_SYSCALLS=y
 CONFIG_STACK_TRACER=y
 CONFIG_BLK_DEV_IO_TRACE=y
 CONFIG_UPROBE_EVENT=y
+CONFIG_FUNCTION_PROFILER=y
+CONFIG_TRACE_ENUM_MAP_FILE=y
 CONFIG_LKDTM=m
 CONFIG_TEST_LIST_SORT=y
 CONFIG_KPROBES_SANITY_TEST=y
@@ -607,7 +623,6 @@ CONFIG_TEST_STRING_HELPERS=y
 CONFIG_TEST_KSTRTOX=y
 CONFIG_DMA_API_DEBUG=y
 CONFIG_TEST_BPF=m
-# CONFIG_STRICT_DEVMEM is not set
 CONFIG_S390_PTDUMP=y
 CONFIG_ENCRYPTED_KEYS=m
 CONFIG_SECURITY=y
@@ -651,7 +666,6 @@ CONFIG_CRYPTO_SEED=m
 CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_TEA=m
 CONFIG_CRYPTO_TWOFISH=m
-CONFIG_CRYPTO_ZLIB=y
 CONFIG_CRYPTO_LZO=m
 CONFIG_CRYPTO_LZ4=m
 CONFIG_CRYPTO_LZ4HC=m
@@ -664,7 +678,7 @@ CONFIG_CRYPTO_SHA512_S390=m
 CONFIG_CRYPTO_DES_S390=m
 CONFIG_CRYPTO_AES_S390=m
 CONFIG_CRYPTO_GHASH_S390=m
-CONFIG_ASYMMETRIC_KEY_TYPE=m
+CONFIG_ASYMMETRIC_KEY_TYPE=y
 CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=m
 CONFIG_X509_CERTIFICATE_PARSER=m
 CONFIG_CRC7=m
index a31dcd5..f46a351 100644 (file)
@@ -1,8 +1,7 @@
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
-CONFIG_FHANDLE=y
 CONFIG_AUDIT=y
-CONFIG_NO_HZ=y
+CONFIG_NO_HZ_IDLE=y
 CONFIG_HIGH_RES_TIMERS=y
 CONFIG_BSD_PROCESS_ACCT=y
 CONFIG_BSD_PROCESS_ACCT_V3=y
@@ -13,17 +12,17 @@ CONFIG_TASK_IO_ACCOUNTING=y
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_NUMA_BALANCING=y
-CONFIG_CGROUP_FREEZER=y
-CONFIG_CGROUP_PIDS=y
-CONFIG_CGROUP_DEVICE=y
-CONFIG_CPUSETS=y
-CONFIG_CGROUP_CPUACCT=y
 CONFIG_MEMCG=y
 CONFIG_MEMCG_SWAP=y
-CONFIG_MEMCG_KMEM=y
+CONFIG_BLK_CGROUP=y
+CONFIG_CGROUP_PIDS=y
+CONFIG_CGROUP_FREEZER=y
 CONFIG_CGROUP_HUGETLB=y
+CONFIG_CPUSETS=y
+CONFIG_CGROUP_DEVICE=y
+CONFIG_CGROUP_CPUACCT=y
 CONFIG_CGROUP_PERF=y
-CONFIG_BLK_CGROUP=y
+CONFIG_CHECKPOINT_RESTORE=y
 CONFIG_NAMESPACES=y
 CONFIG_USER_NS=y
 CONFIG_SCHED_AUTOGROUP=y
@@ -53,7 +52,6 @@ CONFIG_SOLARIS_X86_PARTITION=y
 CONFIG_UNIXWARE_DISKLABEL=y
 CONFIG_CFQ_GROUP_IOSCHED=y
 CONFIG_DEFAULT_DEADLINE=y
-CONFIG_MARCH_Z196=y
 CONFIG_TUNE_ZEC12=y
 CONFIG_NR_CPUS=256
 CONFIG_NUMA=y
@@ -62,6 +60,14 @@ CONFIG_MEMORY_HOTPLUG=y
 CONFIG_MEMORY_HOTREMOVE=y
 CONFIG_KSM=y
 CONFIG_TRANSPARENT_HUGEPAGE=y
+CONFIG_CLEANCACHE=y
+CONFIG_FRONTSWAP=y
+CONFIG_CMA=y
+CONFIG_ZSWAP=y
+CONFIG_ZBUD=m
+CONFIG_ZSMALLOC=m
+CONFIG_ZSMALLOC_STAT=y
+CONFIG_IDLE_PAGE_TRACKING=y
 CONFIG_PCI=y
 CONFIG_HOTPLUG_PCI=y
 CONFIG_HOTPLUG_PCI_S390=y
@@ -530,6 +536,8 @@ CONFIG_NLS_UTF8=m
 CONFIG_DLM=m
 CONFIG_PRINTK_TIME=y
 CONFIG_DEBUG_INFO=y
+CONFIG_DEBUG_INFO_DWARF4=y
+CONFIG_GDB_SCRIPTS=y
 # CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_FRAME_WARN=1024
 CONFIG_UNUSED_SYMBOLS=y
@@ -547,13 +555,13 @@ CONFIG_LATENCYTOP=y
 CONFIG_DEBUG_STRICT_USER_COPY_CHECKS=y
 CONFIG_BLK_DEV_IO_TRACE=y
 # CONFIG_KPROBE_EVENT is not set
+CONFIG_TRACE_ENUM_MAP_FILE=y
 CONFIG_LKDTM=m
 CONFIG_RBTREE_TEST=m
 CONFIG_INTERVAL_TREE_TEST=m
 CONFIG_PERCPU_TEST=m
 CONFIG_ATOMIC64_SELFTEST=y
 CONFIG_TEST_BPF=m
-# CONFIG_STRICT_DEVMEM is not set
 CONFIG_S390_PTDUMP=y
 CONFIG_ENCRYPTED_KEYS=m
 CONFIG_SECURITY=y
@@ -597,8 +605,6 @@ CONFIG_CRYPTO_SEED=m
 CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_TEA=m
 CONFIG_CRYPTO_TWOFISH=m
-CONFIG_CRYPTO_ZLIB=y
-CONFIG_CRYPTO_LZO=m
 CONFIG_CRYPTO_LZ4=m
 CONFIG_CRYPTO_LZ4HC=m
 CONFIG_CRYPTO_USER_API_HASH=m
@@ -610,7 +616,7 @@ CONFIG_CRYPTO_SHA512_S390=m
 CONFIG_CRYPTO_DES_S390=m
 CONFIG_CRYPTO_AES_S390=m
 CONFIG_CRYPTO_GHASH_S390=m
-CONFIG_ASYMMETRIC_KEY_TYPE=m
+CONFIG_ASYMMETRIC_KEY_TYPE=y
 CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=m
 CONFIG_X509_CERTIFICATE_PARSER=m
 CONFIG_CRC7=m
index 7b73bf3..ba0f2a5 100644 (file)
@@ -1,8 +1,7 @@
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
-CONFIG_FHANDLE=y
 CONFIG_AUDIT=y
-CONFIG_NO_HZ=y
+CONFIG_NO_HZ_IDLE=y
 CONFIG_HIGH_RES_TIMERS=y
 CONFIG_BSD_PROCESS_ACCT=y
 CONFIG_BSD_PROCESS_ACCT_V3=y
@@ -14,17 +13,17 @@ CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_NUMA_BALANCING=y
 # CONFIG_NUMA_BALANCING_DEFAULT_ENABLED is not set
-CONFIG_CGROUP_FREEZER=y
-CONFIG_CGROUP_PIDS=y
-CONFIG_CGROUP_DEVICE=y
-CONFIG_CPUSETS=y
-CONFIG_CGROUP_CPUACCT=y
 CONFIG_MEMCG=y
 CONFIG_MEMCG_SWAP=y
-CONFIG_MEMCG_KMEM=y
+CONFIG_BLK_CGROUP=y
+CONFIG_CGROUP_PIDS=y
+CONFIG_CGROUP_FREEZER=y
 CONFIG_CGROUP_HUGETLB=y
+CONFIG_CPUSETS=y
+CONFIG_CGROUP_DEVICE=y
+CONFIG_CGROUP_CPUACCT=y
 CONFIG_CGROUP_PERF=y
-CONFIG_BLK_CGROUP=y
+CONFIG_CHECKPOINT_RESTORE=y
 CONFIG_NAMESPACES=y
 CONFIG_USER_NS=y
 CONFIG_SCHED_AUTOGROUP=y
@@ -53,7 +52,6 @@ CONFIG_UNIXWARE_DISKLABEL=y
 CONFIG_CFQ_GROUP_IOSCHED=y
 CONFIG_DEFAULT_DEADLINE=y
 CONFIG_LIVEPATCH=y
-CONFIG_MARCH_Z196=y
 CONFIG_TUNE_ZEC12=y
 CONFIG_NR_CPUS=512
 CONFIG_NUMA=y
@@ -62,6 +60,14 @@ CONFIG_MEMORY_HOTPLUG=y
 CONFIG_MEMORY_HOTREMOVE=y
 CONFIG_KSM=y
 CONFIG_TRANSPARENT_HUGEPAGE=y
+CONFIG_CLEANCACHE=y
+CONFIG_FRONTSWAP=y
+CONFIG_CMA=y
+CONFIG_ZSWAP=y
+CONFIG_ZBUD=m
+CONFIG_ZSMALLOC=m
+CONFIG_ZSMALLOC_STAT=y
+CONFIG_IDLE_PAGE_TRACKING=y
 CONFIG_PCI=y
 CONFIG_HOTPLUG_PCI=y
 CONFIG_HOTPLUG_PCI_S390=y
@@ -447,6 +453,7 @@ CONFIG_HW_RANDOM_VIRTIO=m
 CONFIG_RAW_DRIVER=m
 CONFIG_HANGCHECK_TIMER=m
 CONFIG_TN3270_FS=y
+# CONFIG_HWMON is not set
 CONFIG_WATCHDOG=y
 CONFIG_WATCHDOG_NOWAYOUT=y
 CONFIG_SOFT_WATCHDOG=m
@@ -530,6 +537,8 @@ CONFIG_NLS_UTF8=m
 CONFIG_DLM=m
 CONFIG_PRINTK_TIME=y
 CONFIG_DEBUG_INFO=y
+CONFIG_DEBUG_INFO_DWARF4=y
+CONFIG_GDB_SCRIPTS=y
 # CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_FRAME_WARN=1024
 CONFIG_UNUSED_SYMBOLS=y
@@ -546,11 +555,12 @@ CONFIG_FTRACE_SYSCALLS=y
 CONFIG_STACK_TRACER=y
 CONFIG_BLK_DEV_IO_TRACE=y
 CONFIG_UPROBE_EVENT=y
+CONFIG_FUNCTION_PROFILER=y
+CONFIG_TRACE_ENUM_MAP_FILE=y
 CONFIG_LKDTM=m
 CONFIG_PERCPU_TEST=m
 CONFIG_ATOMIC64_SELFTEST=y
 CONFIG_TEST_BPF=m
-# CONFIG_STRICT_DEVMEM is not set
 CONFIG_S390_PTDUMP=y
 CONFIG_ENCRYPTED_KEYS=m
 CONFIG_SECURITY=y
@@ -594,8 +604,6 @@ CONFIG_CRYPTO_SEED=m
 CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_TEA=m
 CONFIG_CRYPTO_TWOFISH=m
-CONFIG_CRYPTO_ZLIB=y
-CONFIG_CRYPTO_LZO=m
 CONFIG_CRYPTO_LZ4=m
 CONFIG_CRYPTO_LZ4HC=m
 CONFIG_CRYPTO_USER_API_HASH=m
@@ -607,7 +615,7 @@ CONFIG_CRYPTO_SHA512_S390=m
 CONFIG_CRYPTO_DES_S390=m
 CONFIG_CRYPTO_AES_S390=m
 CONFIG_CRYPTO_GHASH_S390=m
-CONFIG_ASYMMETRIC_KEY_TYPE=m
+CONFIG_ASYMMETRIC_KEY_TYPE=y
 CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=m
 CONFIG_X509_CERTIFICATE_PARSER=m
 CONFIG_CRC7=m
index 1719843..4366a3e 100644 (file)
@@ -1,5 +1,5 @@
 # CONFIG_SWAP is not set
-CONFIG_NO_HZ=y
+CONFIG_NO_HZ_IDLE=y
 CONFIG_HIGH_RES_TIMERS=y
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
@@ -7,7 +7,6 @@ CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 CONFIG_PARTITION_ADVANCED=y
 CONFIG_IBM_PARTITION=y
 CONFIG_DEFAULT_DEADLINE=y
-CONFIG_MARCH_Z196=y
 CONFIG_TUNE_ZEC12=y
 # CONFIG_COMPAT is not set
 CONFIG_NR_CPUS=2
@@ -64,7 +63,6 @@ CONFIG_PANIC_ON_OOPS=y
 # CONFIG_SCHED_DEBUG is not set
 CONFIG_RCU_CPU_STALL_TIMEOUT=60
 # CONFIG_FTRACE is not set
-# CONFIG_STRICT_DEVMEM is not set
 # CONFIG_PFAULT is not set
 # CONFIG_S390_HYPFS_FS is not set
 # CONFIG_VIRTUALIZATION is not set
index e24f2af..3f571ea 100644 (file)
@@ -1,8 +1,8 @@
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
-CONFIG_FHANDLE=y
+CONFIG_USELIB=y
 CONFIG_AUDIT=y
-CONFIG_NO_HZ=y
+CONFIG_NO_HZ_IDLE=y
 CONFIG_HIGH_RES_TIMERS=y
 CONFIG_TASKSTATS=y
 CONFIG_TASK_DELAY_ACCT=y
@@ -11,19 +11,19 @@ CONFIG_TASK_IO_ACCOUNTING=y
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_CGROUPS=y
-CONFIG_CGROUP_FREEZER=y
-CONFIG_CGROUP_PIDS=y
-CONFIG_CGROUP_DEVICE=y
-CONFIG_CPUSETS=y
-CONFIG_CGROUP_CPUACCT=y
 CONFIG_MEMCG=y
 CONFIG_MEMCG_SWAP=y
-CONFIG_MEMCG_KMEM=y
-CONFIG_CGROUP_HUGETLB=y
-CONFIG_CGROUP_PERF=y
+CONFIG_BLK_CGROUP=y
 CONFIG_CGROUP_SCHED=y
 CONFIG_RT_GROUP_SCHED=y
-CONFIG_BLK_CGROUP=y
+CONFIG_CGROUP_PIDS=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_HUGETLB=y
+CONFIG_CPUSETS=y
+CONFIG_CGROUP_DEVICE=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_PERF=y
+CONFIG_CHECKPOINT_RESTORE=y
 CONFIG_NAMESPACES=y
 CONFIG_USER_NS=y
 CONFIG_BLK_DEV_INITRD=y
@@ -44,7 +44,6 @@ CONFIG_PARTITION_ADVANCED=y
 CONFIG_IBM_PARTITION=y
 CONFIG_DEFAULT_DEADLINE=y
 CONFIG_LIVEPATCH=y
-CONFIG_MARCH_Z196=y
 CONFIG_NR_CPUS=256
 CONFIG_NUMA=y
 CONFIG_HZ_100=y
@@ -52,6 +51,14 @@ CONFIG_MEMORY_HOTPLUG=y
 CONFIG_MEMORY_HOTREMOVE=y
 CONFIG_KSM=y
 CONFIG_TRANSPARENT_HUGEPAGE=y
+CONFIG_CLEANCACHE=y
+CONFIG_FRONTSWAP=y
+CONFIG_CMA=y
+CONFIG_ZSWAP=y
+CONFIG_ZBUD=m
+CONFIG_ZSMALLOC=m
+CONFIG_ZSMALLOC_STAT=y
+CONFIG_IDLE_PAGE_TRACKING=y
 CONFIG_CRASH_DUMP=y
 CONFIG_BINFMT_MISC=m
 CONFIG_HIBERNATION=y
@@ -61,7 +68,6 @@ CONFIG_UNIX=y
 CONFIG_NET_KEY=y
 CONFIG_INET=y
 CONFIG_IP_MULTICAST=y
-# CONFIG_INET_LRO is not set
 CONFIG_L2TP=m
 CONFIG_L2TP_DEBUGFS=m
 CONFIG_VLAN_8021Q=y
@@ -144,6 +150,9 @@ CONFIG_TMPFS=y
 CONFIG_TMPFS_POSIX_ACL=y
 CONFIG_HUGETLBFS=y
 # CONFIG_NETWORK_FILESYSTEMS is not set
+CONFIG_DEBUG_INFO=y
+CONFIG_DEBUG_INFO_DWARF4=y
+CONFIG_GDB_SCRIPTS=y
 CONFIG_UNUSED_SYMBOLS=y
 CONFIG_DEBUG_SECTION_MISMATCH=y
 CONFIG_DEBUG_FORCE_WEAK_PER_CPU=y
@@ -158,20 +167,21 @@ CONFIG_LOCK_STAT=y
 CONFIG_DEBUG_LOCKDEP=y
 CONFIG_DEBUG_ATOMIC_SLEEP=y
 CONFIG_DEBUG_LIST=y
-CONFIG_DEBUG_PI_LIST=y
 CONFIG_DEBUG_SG=y
 CONFIG_DEBUG_NOTIFIERS=y
 CONFIG_RCU_CPU_STALL_TIMEOUT=60
 CONFIG_RCU_TRACE=y
 CONFIG_LATENCYTOP=y
 CONFIG_DEBUG_STRICT_USER_COPY_CHECKS=y
-CONFIG_TRACER_SNAPSHOT=y
+CONFIG_SCHED_TRACER=y
+CONFIG_FTRACE_SYSCALLS=y
 CONFIG_TRACER_SNAPSHOT_PER_CPU_SWAP=y
 CONFIG_STACK_TRACER=y
 CONFIG_BLK_DEV_IO_TRACE=y
 CONFIG_UPROBE_EVENT=y
+CONFIG_FUNCTION_PROFILER=y
+CONFIG_TRACE_ENUM_MAP_FILE=y
 CONFIG_KPROBES_SANITY_TEST=y
-# CONFIG_STRICT_DEVMEM is not set
 CONFIG_S390_PTDUMP=y
 CONFIG_CRYPTO_CRYPTD=m
 CONFIG_CRYPTO_AUTHENC=m
@@ -212,8 +222,6 @@ CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_TEA=m
 CONFIG_CRYPTO_TWOFISH=m
 CONFIG_CRYPTO_DEFLATE=m
-CONFIG_CRYPTO_ZLIB=m
-CONFIG_CRYPTO_LZO=m
 CONFIG_CRYPTO_LZ4=m
 CONFIG_CRYPTO_LZ4HC=m
 CONFIG_CRYPTO_ANSI_CPRNG=m
index c3e4099..87035fa 100644 (file)
@@ -224,13 +224,13 @@ arch_initcall(service_level_perf_register);
 
 static int __perf_callchain_kernel(void *data, unsigned long address)
 {
-       struct perf_callchain_entry *entry = data;
+       struct perf_callchain_entry_ctx *entry = data;
 
        perf_callchain_store(entry, address);
        return 0;
 }
 
-void perf_callchain_kernel(struct perf_callchain_entry *entry,
+void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
                           struct pt_regs *regs)
 {
        if (user_mode(regs))
index 7a31440..19288c1 100644 (file)
@@ -250,6 +250,7 @@ static noinline void do_sigsegv(struct pt_regs *regs, int si_code)
 
        report_user_fault(regs, SIGSEGV, 1);
        si.si_signo = SIGSEGV;
+       si.si_errno = 0;
        si.si_code = si_code;
        si.si_addr = (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK);
        force_sig_info(SIGSEGV, &si, current);
index f010c93..fda605d 100644 (file)
@@ -37,7 +37,7 @@ extern u8 sk_load_word[], sk_load_half[], sk_load_byte[];
  *           |               |     |
  *           +---------------+     |
  *           | 8 byte skbp   |     |
- * R15+170 -> +---------------+     |
+ * R15+176 -> +---------------+     |
  *           | 8 byte hlen   |     |
  * R15+168 -> +---------------+     |
  *           | 4 byte align  |     |
@@ -58,7 +58,7 @@ extern u8 sk_load_word[], sk_load_half[], sk_load_byte[];
 #define STK_OFF                (STK_SPACE - STK_160_UNUSED)
 #define STK_OFF_TMP    160     /* Offset of tmp buffer on stack */
 #define STK_OFF_HLEN   168     /* Offset of SKB header length on stack */
-#define STK_OFF_SKBP   170     /* Offset of SKB pointer on stack */
+#define STK_OFF_SKBP   176     /* Offset of SKB pointer on stack */
 
 #define STK_OFF_R6     (160 - 11 * 8)  /* Offset of r6 on stack */
 #define STK_OFF_TCCNT  (160 - 12 * 8)  /* Offset of tail_call_cnt on stack */
index 9133b0e..bee281f 100644 (file)
@@ -45,7 +45,7 @@ struct bpf_jit {
        int labels[1];          /* Labels for local jumps */
 };
 
-#define BPF_SIZE_MAX   0x7ffff /* Max size for program (20 bit signed displ) */
+#define BPF_SIZE_MAX   0xffff  /* Max size for program (16 bit branches) */
 
 #define SEEN_SKB       1       /* skb access */
 #define SEEN_MEM       2       /* use mem[] for temporary storage */
@@ -450,7 +450,7 @@ static void bpf_jit_prologue(struct bpf_jit *jit)
                emit_load_skb_data_hlen(jit);
        if (jit->seen & SEEN_SKB_CHANGE)
                /* stg %b1,ST_OFF_SKBP(%r0,%r15) */
-               EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0, REG_15,
+               EMIT6_DISP_LH(0xe3000000, 0x0024, BPF_REG_1, REG_0, REG_15,
                              STK_OFF_SKBP);
 }
 
index 9cb4260..d4008c3 100644 (file)
@@ -1,5 +1,6 @@
 #define __ARCH_HAVE_MMU
 
+#define __ARCH_WANT_RENAMEAT
 #define __ARCH_WANT_SYSCALL_NO_AT
 #define __ARCH_WANT_SYSCALL_NO_FLAGS
 #define __ARCH_WANT_SYSCALL_OFF_T
index 6df826e..c4c47ea 100644 (file)
@@ -55,7 +55,6 @@ $(addprefix $(obj)/,$(lib1funcs-y)): $(obj)/%: $(lib1funcs-dir)/% FORCE
 
 $(obj)/vmlinux: $(OBJECTS) $(obj)/piggy.o $(lib1funcs-obj) FORCE
        $(call if_changed,ld)
-       @:
 
 $(obj)/vmlinux.bin: vmlinux FORCE
        $(call if_changed,objcopy)
index 2216ee5..43c4119 100644 (file)
@@ -17,7 +17,6 @@ LDFLAGS_vmlinux := --oformat $(ld-bfd) -Ttext $(load-y) -e romstart \
 
 $(obj)/vmlinux: $(obj)/head.o $(obj-y) $(obj)/piggy.o FORCE
        $(call if_changed,ld)
-       @:
 
 OBJCOPYFLAGS += -j .empty_zero_page
 
index cc80b61..fa2c0cd 100644 (file)
@@ -21,7 +21,7 @@ static int callchain_stack(void *data, char *name)
 
 static void callchain_address(void *data, unsigned long addr, int reliable)
 {
-       struct perf_callchain_entry *entry = data;
+       struct perf_callchain_entry_ctx *entry = data;
 
        if (reliable)
                perf_callchain_store(entry, addr);
@@ -33,7 +33,7 @@ static const struct stacktrace_ops callchain_ops = {
 };
 
 void
-perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
        perf_callchain_store(entry, regs->pc);
 
index 10e9dab..f0700cf 100644 (file)
 
 #define        PTREGS_OFF      (STACK_BIAS + STACKFRAME_SZ)
 
+#define        RTRAP_PSTATE            (PSTATE_TSO|PSTATE_PEF|PSTATE_PRIV|PSTATE_IE)
+#define        RTRAP_PSTATE_IRQOFF     (PSTATE_TSO|PSTATE_PEF|PSTATE_PRIV)
+#define RTRAP_PSTATE_AG_IRQOFF (PSTATE_TSO|PSTATE_PEF|PSTATE_PRIV|PSTATE_AG)
+
 #define __CHEETAH_ID   0x003e0014
 #define __JALAPENO_ID  0x003e0016
 #define __SERRANO_ID   0x003e0022
index 71b5a67..781b9f1 100644 (file)
@@ -589,8 +589,8 @@ user_rtt_fill_64bit:                                        \
         restored;                                      \
        nop; nop; nop; nop; nop; nop;                   \
        nop; nop; nop; nop; nop;                        \
-       ba,a,pt %xcc, user_rtt_fill_fixup;              \
-       ba,a,pt %xcc, user_rtt_fill_fixup;              \
+       ba,a,pt %xcc, user_rtt_fill_fixup_dax;          \
+       ba,a,pt %xcc, user_rtt_fill_fixup_mna;          \
        ba,a,pt %xcc, user_rtt_fill_fixup;
 
 
@@ -652,8 +652,8 @@ user_rtt_fill_32bit:                                        \
         restored;                                      \
        nop; nop; nop; nop; nop;                        \
        nop; nop; nop;                                  \
-       ba,a,pt %xcc, user_rtt_fill_fixup;              \
-       ba,a,pt %xcc, user_rtt_fill_fixup;              \
+       ba,a,pt %xcc, user_rtt_fill_fixup_dax;          \
+       ba,a,pt %xcc, user_rtt_fill_fixup_mna;          \
        ba,a,pt %xcc, user_rtt_fill_fixup;
 
 
index 7cf9c6e..fdb1332 100644 (file)
@@ -21,6 +21,7 @@ CFLAGS_REMOVE_perf_event.o := -pg
 CFLAGS_REMOVE_pcr.o := -pg
 endif
 
+obj-$(CONFIG_SPARC64)   += urtt_fill.o
 obj-$(CONFIG_SPARC32)   += entry.o wof.o wuf.o
 obj-$(CONFIG_SPARC32)   += etrap_32.o
 obj-$(CONFIG_SPARC32)   += rtrap_32.o
index a4b8b5a..710f327 100644 (file)
@@ -1711,7 +1711,7 @@ static int __init init_hw_perf_events(void)
 }
 pure_initcall(init_hw_perf_events);
 
-void perf_callchain_kernel(struct perf_callchain_entry *entry,
+void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
                           struct pt_regs *regs)
 {
        unsigned long ksp, fp;
@@ -1756,7 +1756,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
                        }
                }
 #endif
-       } while (entry->nr < sysctl_perf_event_max_stack);
+       } while (entry->nr < entry->max_stack);
 }
 
 static inline int
@@ -1769,7 +1769,7 @@ valid_user_frame(const void __user *fp, unsigned long size)
        return (__range_not_ok(fp, size, TASK_SIZE) == 0);
 }
 
-static void perf_callchain_user_64(struct perf_callchain_entry *entry,
+static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
                                   struct pt_regs *regs)
 {
        unsigned long ufp;
@@ -1790,10 +1790,10 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
                pc = sf.callers_pc;
                ufp = (unsigned long)sf.fp + STACK_BIAS;
                perf_callchain_store(entry, pc);
-       } while (entry->nr < sysctl_perf_event_max_stack);
+       } while (entry->nr < entry->max_stack);
 }
 
-static void perf_callchain_user_32(struct perf_callchain_entry *entry,
+static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry,
                                   struct pt_regs *regs)
 {
        unsigned long ufp;
@@ -1822,11 +1822,11 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
                        ufp = (unsigned long)sf.fp;
                }
                perf_callchain_store(entry, pc);
-       } while (entry->nr < sysctl_perf_event_max_stack);
+       } while (entry->nr < entry->max_stack);
 }
 
 void
-perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
        u64 saved_fault_address = current_thread_info()->fault_address;
        u8 saved_fault_code = get_thread_fault_code();
index d08bdaf..216948c 100644 (file)
 #include <asm/visasm.h>
 #include <asm/processor.h>
 
-#define                RTRAP_PSTATE            (PSTATE_TSO|PSTATE_PEF|PSTATE_PRIV|PSTATE_IE)
-#define                RTRAP_PSTATE_IRQOFF     (PSTATE_TSO|PSTATE_PEF|PSTATE_PRIV)
-#define                RTRAP_PSTATE_AG_IRQOFF  (PSTATE_TSO|PSTATE_PEF|PSTATE_PRIV|PSTATE_AG)
-
 #ifdef CONFIG_CONTEXT_TRACKING
 # define SCHEDULE_USER schedule_user
 #else
@@ -242,52 +238,17 @@ rt_continue:      ldx                     [%sp + PTREGS_OFF + PT_V9_G1], %g1
                 wrpr                   %g1, %cwp
                ba,a,pt                 %xcc, user_rtt_fill_64bit
 
-user_rtt_fill_fixup:
-               rdpr    %cwp, %g1
-               add     %g1, 1, %g1
-               wrpr    %g1, 0x0, %cwp
-
-               rdpr    %wstate, %g2
-               sll     %g2, 3, %g2
-               wrpr    %g2, 0x0, %wstate
-
-               /* We know %canrestore and %otherwin are both zero.  */
-
-               sethi   %hi(sparc64_kern_pri_context), %g2
-               ldx     [%g2 + %lo(sparc64_kern_pri_context)], %g2
-               mov     PRIMARY_CONTEXT, %g1
-
-661:           stxa    %g2, [%g1] ASI_DMMU
-               .section .sun4v_1insn_patch, "ax"
-               .word   661b
-               stxa    %g2, [%g1] ASI_MMU
-               .previous
-
-               sethi   %hi(KERNBASE), %g1
-               flush   %g1
+user_rtt_fill_fixup_dax:
+               ba,pt   %xcc, user_rtt_fill_fixup_common
+                mov    1, %g3
 
-               or      %g4, FAULT_CODE_WINFIXUP, %g4
-               stb     %g4, [%g6 + TI_FAULT_CODE]
-               stx     %g5, [%g6 + TI_FAULT_ADDR]
+user_rtt_fill_fixup_mna:
+               ba,pt   %xcc, user_rtt_fill_fixup_common
+                mov    2, %g3
 
-               mov     %g6, %l1
-               wrpr    %g0, 0x0, %tl
-
-661:           nop
-               .section                .sun4v_1insn_patch, "ax"
-               .word                   661b
-               SET_GL(0)
-               .previous
-
-               wrpr    %g0, RTRAP_PSTATE, %pstate
-
-               mov     %l1, %g6
-               ldx     [%g6 + TI_TASK], %g4
-               LOAD_PER_CPU_BASE(%g5, %g6, %g1, %g2, %g3)
-               call    do_sparc64_fault
-                add    %sp, PTREGS_OFF, %o0
-               ba,pt   %xcc, rtrap
-                nop
+user_rtt_fill_fixup:
+               ba,pt   %xcc, user_rtt_fill_fixup_common
+                clr    %g3
 
 user_rtt_pre_restore:
                add                     %g1, 1, %g1
index 3c25241..91cc2f4 100644 (file)
@@ -138,12 +138,24 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
        return 0;
 }
 
+/* Checks if the fp is valid.  We always build signal frames which are
+ * 16-byte aligned, therefore we can always enforce that the restore
+ * frame has that property as well.
+ */
+static bool invalid_frame_pointer(void __user *fp, int fplen)
+{
+       if ((((unsigned long) fp) & 15) ||
+           ((unsigned long)fp) > 0x100000000ULL - fplen)
+               return true;
+       return false;
+}
+
 void do_sigreturn32(struct pt_regs *regs)
 {
        struct signal_frame32 __user *sf;
        compat_uptr_t fpu_save;
        compat_uptr_t rwin_save;
-       unsigned int psr;
+       unsigned int psr, ufp;
        unsigned int pc, npc;
        sigset_t set;
        compat_sigset_t seta;
@@ -158,11 +170,16 @@ void do_sigreturn32(struct pt_regs *regs)
        sf = (struct signal_frame32 __user *) regs->u_regs[UREG_FP];
 
        /* 1. Make sure we are not getting garbage from the user */
-       if (!access_ok(VERIFY_READ, sf, sizeof(*sf)) ||
-           (((unsigned long) sf) & 3))
+       if (invalid_frame_pointer(sf, sizeof(*sf)))
+               goto segv;
+
+       if (get_user(ufp, &sf->info.si_regs.u_regs[UREG_FP]))
+               goto segv;
+
+       if (ufp & 0x7)
                goto segv;
 
-       if (get_user(pc, &sf->info.si_regs.pc) ||
+       if (__get_user(pc, &sf->info.si_regs.pc) ||
            __get_user(npc, &sf->info.si_regs.npc))
                goto segv;
 
@@ -227,7 +244,7 @@ segv:
 asmlinkage void do_rt_sigreturn32(struct pt_regs *regs)
 {
        struct rt_signal_frame32 __user *sf;
-       unsigned int psr, pc, npc;
+       unsigned int psr, pc, npc, ufp;
        compat_uptr_t fpu_save;
        compat_uptr_t rwin_save;
        sigset_t set;
@@ -242,11 +259,16 @@ asmlinkage void do_rt_sigreturn32(struct pt_regs *regs)
        sf = (struct rt_signal_frame32 __user *) regs->u_regs[UREG_FP];
 
        /* 1. Make sure we are not getting garbage from the user */
-       if (!access_ok(VERIFY_READ, sf, sizeof(*sf)) ||
-           (((unsigned long) sf) & 3))
+       if (invalid_frame_pointer(sf, sizeof(*sf)))
                goto segv;
 
-       if (get_user(pc, &sf->regs.pc) || 
+       if (get_user(ufp, &sf->regs.u_regs[UREG_FP]))
+               goto segv;
+
+       if (ufp & 0x7)
+               goto segv;
+
+       if (__get_user(pc, &sf->regs.pc) || 
            __get_user(npc, &sf->regs.npc))
                goto segv;
 
@@ -307,14 +329,6 @@ segv:
        force_sig(SIGSEGV, current);
 }
 
-/* Checks if the fp is valid */
-static int invalid_frame_pointer(void __user *fp, int fplen)
-{
-       if ((((unsigned long) fp) & 7) || ((unsigned long)fp) > 0x100000000ULL - fplen)
-               return 1;
-       return 0;
-}
-
 static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, unsigned long framesize)
 {
        unsigned long sp;
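The invalid_frame_pointer() check introduced above enforces 16-byte alignment (and, for the compat frames, a below-4GB bound) on the user-supplied frame pointer before anything is read through it. A tiny worked example of the alignment test, with purely illustrative addresses:

/*
 * Illustration only: shows which example frame pointers the "& 15"
 * alignment test above would accept or reject.  The addresses are
 * made up for the demonstration.
 */
#include <stdio.h>

int main(void)
{
	unsigned long samples[] = { 0xffeffd60UL, 0xffeffd68UL, 0xffeffd6cUL };

	for (int i = 0; i < 3; i++)
		printf("%#lx -> %s\n", samples[i],
		       (samples[i] & 15) ? "rejected (force SIGSEGV)"
					 : "accepted");
	return 0;
}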
index 52aa5e4..c3c12ef 100644 (file)
@@ -60,10 +60,22 @@ struct rt_signal_frame {
 #define SF_ALIGNEDSZ  (((sizeof(struct signal_frame) + 7) & (~7)))
 #define RT_ALIGNEDSZ  (((sizeof(struct rt_signal_frame) + 7) & (~7)))
 
+/* Checks if the fp is valid.  We always build signal frames which are
+ * 16-byte aligned, therefore we can always enforce that the restore
+ * frame has that property as well.
+ */
+static inline bool invalid_frame_pointer(void __user *fp, int fplen)
+{
+       if ((((unsigned long) fp) & 15) || !__access_ok((unsigned long)fp, fplen))
+               return true;
+
+       return false;
+}
+
 asmlinkage void do_sigreturn(struct pt_regs *regs)
 {
+       unsigned long up_psr, pc, npc, ufp;
        struct signal_frame __user *sf;
-       unsigned long up_psr, pc, npc;
        sigset_t set;
        __siginfo_fpu_t __user *fpu_save;
        __siginfo_rwin_t __user *rwin_save;
@@ -77,10 +89,13 @@ asmlinkage void do_sigreturn(struct pt_regs *regs)
        sf = (struct signal_frame __user *) regs->u_regs[UREG_FP];
 
        /* 1. Make sure we are not getting garbage from the user */
-       if (!access_ok(VERIFY_READ, sf, sizeof(*sf)))
+       if (!invalid_frame_pointer(sf, sizeof(*sf)))
+               goto segv_and_exit;
+
+       if (get_user(ufp, &sf->info.si_regs.u_regs[UREG_FP]))
                goto segv_and_exit;
 
-       if (((unsigned long) sf) & 3)
+       if (ufp & 0x7)
                goto segv_and_exit;
 
        err = __get_user(pc,  &sf->info.si_regs.pc);
@@ -127,7 +142,7 @@ segv_and_exit:
 asmlinkage void do_rt_sigreturn(struct pt_regs *regs)
 {
        struct rt_signal_frame __user *sf;
-       unsigned int psr, pc, npc;
+       unsigned int psr, pc, npc, ufp;
        __siginfo_fpu_t __user *fpu_save;
        __siginfo_rwin_t __user *rwin_save;
        sigset_t set;
@@ -135,8 +150,13 @@ asmlinkage void do_rt_sigreturn(struct pt_regs *regs)
 
        synchronize_user_stack();
        sf = (struct rt_signal_frame __user *) regs->u_regs[UREG_FP];
-       if (!access_ok(VERIFY_READ, sf, sizeof(*sf)) ||
-           (((unsigned long) sf) & 0x03))
+       if (!invalid_frame_pointer(sf, sizeof(*sf)))
+               goto segv;
+
+       if (get_user(ufp, &sf->regs.u_regs[UREG_FP]))
+               goto segv;
+
+       if (ufp & 0x7)
                goto segv;
 
        err = __get_user(pc, &sf->regs.pc);
@@ -178,15 +198,6 @@ segv:
        force_sig(SIGSEGV, current);
 }
 
-/* Checks if the fp is valid */
-static inline int invalid_frame_pointer(void __user *fp, int fplen)
-{
-       if ((((unsigned long) fp) & 7) || !__access_ok((unsigned long)fp, fplen))
-               return 1;
-
-       return 0;
-}
-
 static inline void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, unsigned long framesize)
 {
        unsigned long sp = regs->u_regs[UREG_FP];
index 39aaec1..5ee930c 100644 (file)
@@ -234,6 +234,17 @@ do_sigsegv:
        goto out;
 }
 
+/* Checks if the fp is valid.  We always build rt signal frames which
+ * are 16-byte aligned, therefore we can always enforce that the
+ * restore frame has that property as well.
+ */
+static bool invalid_frame_pointer(void __user *fp)
+{
+       if (((unsigned long) fp) & 15)
+               return true;
+       return false;
+}
+
 struct rt_signal_frame {
        struct sparc_stackf     ss;
        siginfo_t               info;
@@ -246,8 +257,8 @@ struct rt_signal_frame {
 
 void do_rt_sigreturn(struct pt_regs *regs)
 {
+       unsigned long tpc, tnpc, tstate, ufp;
        struct rt_signal_frame __user *sf;
-       unsigned long tpc, tnpc, tstate;
        __siginfo_fpu_t __user *fpu_save;
        __siginfo_rwin_t __user *rwin_save;
        sigset_t set;
@@ -261,10 +272,16 @@ void do_rt_sigreturn(struct pt_regs *regs)
                (regs->u_regs [UREG_FP] + STACK_BIAS);
 
        /* 1. Make sure we are not getting garbage from the user */
-       if (((unsigned long) sf) & 3)
+       if (invalid_frame_pointer(sf))
+               goto segv;
+
+       if (get_user(ufp, &sf->regs.u_regs[UREG_FP]))
                goto segv;
 
-       err = get_user(tpc, &sf->regs.tpc);
+       if ((ufp + STACK_BIAS) & 0x7)
+               goto segv;
+
+       err = __get_user(tpc, &sf->regs.tpc);
        err |= __get_user(tnpc, &sf->regs.tnpc);
        if (test_thread_flag(TIF_32BIT)) {
                tpc &= 0xffffffff;
@@ -308,14 +325,6 @@ segv:
        force_sig(SIGSEGV, current);
 }
 
-/* Checks if the fp is valid */
-static int invalid_frame_pointer(void __user *fp)
-{
-       if (((unsigned long) fp) & 15)
-               return 1;
-       return 0;
-}
-
 static inline void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, unsigned long framesize)
 {
        unsigned long sp = regs->u_regs[UREG_FP] + STACK_BIAS;
index 0f6eebe..e5fe8ce 100644 (file)
@@ -48,6 +48,10 @@ int save_fpu_state(struct pt_regs *regs, __siginfo_fpu_t __user *fpu)
 int restore_fpu_state(struct pt_regs *regs, __siginfo_fpu_t __user *fpu)
 {
        int err;
+
+       if (((unsigned long) fpu) & 3)
+               return -EFAULT;
+
 #ifdef CONFIG_SMP
        if (test_tsk_thread_flag(current, TIF_USEDFPU))
                regs->psr &= ~PSR_EF;
@@ -97,7 +101,10 @@ int restore_rwin_state(__siginfo_rwin_t __user *rp)
        struct thread_info *t = current_thread_info();
        int i, wsaved, err;
 
-       __get_user(wsaved, &rp->wsaved);
+       if (((unsigned long) rp) & 3)
+               return -EFAULT;
+
+       get_user(wsaved, &rp->wsaved);
        if (wsaved > NSWINS)
                return -EFAULT;
 
index 387834a..36aadcb 100644 (file)
@@ -37,7 +37,10 @@ int restore_fpu_state(struct pt_regs *regs, __siginfo_fpu_t __user *fpu)
        unsigned long fprs;
        int err;
 
-       err = __get_user(fprs, &fpu->si_fprs);
+       if (((unsigned long) fpu) & 7)
+               return -EFAULT;
+
+       err = get_user(fprs, &fpu->si_fprs);
        fprs_write(0);
        regs->tstate &= ~TSTATE_PEF;
        if (fprs & FPRS_DL)
@@ -72,7 +75,10 @@ int restore_rwin_state(__siginfo_rwin_t __user *rp)
        struct thread_info *t = current_thread_info();
        int i, wsaved, err;
 
-       __get_user(wsaved, &rp->wsaved);
+       if (((unsigned long) rp) & 7)
+               return -EFAULT;
+
+       get_user(wsaved, &rp->wsaved);
        if (wsaved > NSWINS)
                return -EFAULT;
 
diff --git a/arch/sparc/kernel/urtt_fill.S b/arch/sparc/kernel/urtt_fill.S
new file mode 100644 (file)
index 0000000..5604a2b
--- /dev/null
@@ -0,0 +1,98 @@
+#include <asm/thread_info.h>
+#include <asm/trap_block.h>
+#include <asm/spitfire.h>
+#include <asm/ptrace.h>
+#include <asm/head.h>
+
+               .text
+               .align  8
+               .globl  user_rtt_fill_fixup_common
+user_rtt_fill_fixup_common:
+               rdpr    %cwp, %g1
+               add     %g1, 1, %g1
+               wrpr    %g1, 0x0, %cwp
+
+               rdpr    %wstate, %g2
+               sll     %g2, 3, %g2
+               wrpr    %g2, 0x0, %wstate
+
+               /* We know %canrestore and %otherwin are both zero.  */
+
+               sethi   %hi(sparc64_kern_pri_context), %g2
+               ldx     [%g2 + %lo(sparc64_kern_pri_context)], %g2
+               mov     PRIMARY_CONTEXT, %g1
+
+661:           stxa    %g2, [%g1] ASI_DMMU
+               .section .sun4v_1insn_patch, "ax"
+               .word   661b
+               stxa    %g2, [%g1] ASI_MMU
+               .previous
+
+               sethi   %hi(KERNBASE), %g1
+               flush   %g1
+
+               mov     %g4, %l4
+               mov     %g5, %l5
+               brnz,pn %g3, 1f
+                mov    %g3, %l3
+
+               or      %g4, FAULT_CODE_WINFIXUP, %g4
+               stb     %g4, [%g6 + TI_FAULT_CODE]
+               stx     %g5, [%g6 + TI_FAULT_ADDR]
+1:
+               mov     %g6, %l1
+               wrpr    %g0, 0x0, %tl
+
+661:           nop
+               .section                .sun4v_1insn_patch, "ax"
+               .word                   661b
+               SET_GL(0)
+               .previous
+
+               wrpr    %g0, RTRAP_PSTATE, %pstate
+
+               mov     %l1, %g6
+               ldx     [%g6 + TI_TASK], %g4
+               LOAD_PER_CPU_BASE(%g5, %g6, %g1, %g2, %g3)
+
+               brnz,pn %l3, 1f
+                nop
+
+               call    do_sparc64_fault
+                add    %sp, PTREGS_OFF, %o0
+               ba,pt   %xcc, rtrap
+                nop
+
+1:             cmp     %g3, 2
+               bne,pn  %xcc, 2f
+                nop
+
+               sethi   %hi(tlb_type), %g1
+               lduw    [%g1 + %lo(tlb_type)], %g1
+               cmp     %g1, 3
+               bne,pt  %icc, 1f
+                add    %sp, PTREGS_OFF, %o0
+               mov     %l4, %o2
+               call    sun4v_do_mna
+                mov    %l5, %o1
+               ba,a,pt %xcc, rtrap
+1:             mov     %l4, %o1
+               mov     %l5, %o2
+               call    mem_address_unaligned
+                nop
+               ba,a,pt %xcc, rtrap
+
+2:             sethi   %hi(tlb_type), %g1
+               mov     %l4, %o1
+               lduw    [%g1 + %lo(tlb_type)], %g1
+               mov     %l5, %o2
+               cmp     %g1, 3
+               bne,pt  %icc, 1f
+                add    %sp, PTREGS_OFF, %o0
+               call    sun4v_data_access_exception
+                nop
+               ba,a,pt %xcc, rtrap
+
+1:             call    spitfire_data_access_exception
+                nop
+               ba,a,pt %xcc, rtrap
index 652683c..14bb0d5 100644 (file)
@@ -2824,9 +2824,10 @@ void hugetlb_setup(struct pt_regs *regs)
         * the Data-TLB for huge pages.
         */
        if (tlb_type == cheetah_plus) {
+               bool need_context_reload = false;
                unsigned long ctx;
 
-               spin_lock(&ctx_alloc_lock);
+               spin_lock_irq(&ctx_alloc_lock);
                ctx = mm->context.sparc64_ctx_val;
                ctx &= ~CTX_PGSZ_MASK;
                ctx |= CTX_PGSZ_BASE << CTX_PGSZ0_SHIFT;
@@ -2845,9 +2846,12 @@ void hugetlb_setup(struct pt_regs *regs)
                         * also executing in this address space.
                         */
                        mm->context.sparc64_ctx_val = ctx;
-                       on_each_cpu(context_reload, mm, 0);
+                       need_context_reload = true;
                }
-               spin_unlock(&ctx_alloc_lock);
+               spin_unlock_irq(&ctx_alloc_lock);
+
+               if (need_context_reload)
+                       on_each_cpu(context_reload, mm, 0);
        }
 }
 #endif
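The hugetlb_setup() change above follows the usual decide-under-the-lock, notify-after-unlock pattern: the context update is recorded while ctx_alloc_lock is held with IRQs off, and the cross-CPU call is issued only once the lock is dropped and interrupts are enabled again. A minimal sketch of the pattern, with a hypothetical reload_cb() standing in for context_reload():

/*
 * Sketch of the locking pattern used in hugetlb_setup() above.
 * reload_cb() is hypothetical; spin_lock_irq(), on_each_cpu() and
 * ctx_alloc_lock are the real interfaces the pattern relies on.
 */
static void update_ctx_and_notify(struct mm_struct *mm, unsigned long new_ctx)
{
	bool need_reload = false;

	spin_lock_irq(&ctx_alloc_lock);
	if (mm->context.sparc64_ctx_val != new_ctx) {
		mm->context.sparc64_ctx_val = new_ctx;
		need_reload = true;
	}
	spin_unlock_irq(&ctx_alloc_lock);

	/* Safe here: IRQs are enabled and the spinlock is released. */
	if (need_reload)
		on_each_cpu(reload_cb, mm, 0);
}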
index 3866397..24e9187 100644 (file)
@@ -12,6 +12,7 @@
  *   more details.
  */
 
+#define __ARCH_WANT_RENAMEAT
 #if !defined(__LP64__) || defined(__SYSCALL_COMPAT)
 /* Use the flavor of this syscall that matches the 32-bit API better. */
 #define __ARCH_WANT_SYNC_FILE_RANGE2
index 8767060..6394c1c 100644 (file)
@@ -941,7 +941,7 @@ arch_initcall(init_hw_perf_events);
 /*
  * Tile specific backtracing code for perf_events.
  */
-static inline void perf_callchain(struct perf_callchain_entry *entry,
+static inline void perf_callchain(struct perf_callchain_entry_ctx *entry,
                    struct pt_regs *regs)
 {
        struct KBacktraceIterator kbt;
@@ -992,13 +992,13 @@ static inline void perf_callchain(struct perf_callchain_entry *entry,
        }
 }
 
-void perf_callchain_user(struct perf_callchain_entry *entry,
+void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
                    struct pt_regs *regs)
 {
        perf_callchain(entry, regs);
 }
 
-void perf_callchain_kernel(struct perf_callchain_entry *entry,
+void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
                      struct pt_regs *regs)
 {
        perf_callchain(entry, regs);
index f5b7635..a74449b 100644 (file)
@@ -9,6 +9,8 @@
 #include <sysdep/ptrace.h>
 #include <sysdep/archsetjmp.h>
 
+extern int save_i387_registers(int pid, unsigned long *fp_regs);
+extern int restore_i387_registers(int pid, unsigned long *fp_regs);
 extern int save_fp_registers(int pid, unsigned long *fp_regs);
 extern int restore_fp_registers(int pid, unsigned long *fp_regs);
 extern int save_fpx_registers(int pid, unsigned long *fp_regs);
index 0b04711..034b42c 100644 (file)
@@ -398,6 +398,6 @@ int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu)
 {
        int cpu = current_thread_info()->cpu;
 
-       return save_fp_registers(userspace_pid[cpu], (unsigned long *) fpu);
+       return save_i387_registers(userspace_pid[cpu], (unsigned long *) fpu);
 }
 
index 7801666..8acaf4e 100644 (file)
@@ -29,23 +29,29 @@ void (*sig_info[NSIG])(int, struct siginfo *, struct uml_pt_regs *) = {
 
 static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc)
 {
-       struct uml_pt_regs r;
+       struct uml_pt_regs *r;
        int save_errno = errno;
 
-       r.is_user = 0;
+       r = malloc(sizeof(struct uml_pt_regs));
+       if (!r)
+               panic("out of memory");
+
+       r->is_user = 0;
        if (sig == SIGSEGV) {
                /* For segfaults, we want the data from the sigcontext. */
-               get_regs_from_mc(&r, mc);
-               GET_FAULTINFO_FROM_MC(r.faultinfo, mc);
+               get_regs_from_mc(r, mc);
+               GET_FAULTINFO_FROM_MC(r->faultinfo, mc);
        }
 
        /* enable signals if sig isn't IRQ signal */
        if ((sig != SIGIO) && (sig != SIGWINCH) && (sig != SIGALRM))
                unblock_signals();
 
-       (*sig_info[sig])(sig, si, &r);
+       (*sig_info[sig])(sig, si, r);
 
        errno = save_errno;
+
+       free(r);
 }
 
 /*
@@ -83,11 +89,17 @@ void sig_handler(int sig, struct siginfo *si, mcontext_t *mc)
 
 static void timer_real_alarm_handler(mcontext_t *mc)
 {
-       struct uml_pt_regs regs;
+       struct uml_pt_regs *regs;
+
+       regs = malloc(sizeof(struct uml_pt_regs));
+       if (!regs)
+               panic("out of memory");
 
        if (mc != NULL)
-               get_regs_from_mc(&regs, mc);
-       timer_handler(SIGALRM, NULL, &regs);
+               get_regs_from_mc(regs, mc);
+       timer_handler(SIGALRM, NULL, regs);
+
+       free(regs);
 }
 
 void timer_alarm_handler(int sig, struct siginfo *unused_si, mcontext_t *mc)
index ec7fb70..8288550 100644 (file)
@@ -31,7 +31,7 @@ $(obj)/uImage: $(obj)/zImage FORCE
        $(call if_changed,uimage)
        @echo '  Image $@ is ready'
 
-PHONY += initrd FORCE
+PHONY += initrd
 initrd:
        @test "$(INITRD)" != "" || \
        (echo You must specify INITRD; exit -1)
index 96494fb..9aecdd3 100644 (file)
@@ -54,7 +54,6 @@ LDFLAGS_vmlinux += -T
 $(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head.o $(obj)/piggy.o \
                $(obj)/misc.o FORCE
        $(call if_changed,ld)
-       @:
 
 # We now have a PIC decompressor implementation.  Decompressors running
 # from RAM should not define ZTEXTADDR.  Decompressors running directly
index d4cc455..1f63c47 100644 (file)
@@ -10,6 +10,8 @@
  * published by the Free Software Foundation.
  */
 
+#define __ARCH_WANT_RENAMEAT
+
 /* Use the standard ABI for syscalls. */
 #include <asm-generic/unistd.h>
 #define __ARCH_WANT_SYS_CLONE
index cfdd8c3..f135688 100644 (file)
@@ -87,7 +87,6 @@ vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o
 
 $(obj)/vmlinux: $(vmlinux-objs-y) FORCE
        $(call if_changed,ld)
-       @:
 
 OBJCOPYFLAGS_vmlinux.bin :=  -R .comment -S
 $(obj)/vmlinux.bin: vmlinux FORCE
index 98df1fa..027aec4 100644 (file)
@@ -8,16 +8,15 @@
 #include <linux/linkage.h>
 #include "calling.h"
 #include <asm/asm.h>
-#include <asm/frame.h>
 
        /* rdi: arg1 ... normal C conventions. rax is saved/restored. */
        .macro THUNK name, func, put_ret_addr_in_rdi=0
        .globl \name
        .type \name, @function
 \name:
-       FRAME_BEGIN
+       pushq %rbp
+       movq %rsp, %rbp
 
-       /* this one pushes 9 elems, the next one would be %rIP */
        pushq %rdi
        pushq %rsi
        pushq %rdx
@@ -29,8 +28,8 @@
        pushq %r11
 
        .if \put_ret_addr_in_rdi
-       /* 9*8(%rsp) is return addr on stack */
-       movq 9*8(%rsp), %rdi
+       /* 8(%rbp) is return addr on stack */
+       movq 8(%rbp), %rdi
        .endif
 
        call \func
@@ -65,7 +64,7 @@ restore:
        popq %rdx
        popq %rsi
        popq %rdi
-       FRAME_END
+       popq %rbp
        ret
        _ASM_NOKPROBE(restore)
 #endif
index 6874da5..253b72e 100644 (file)
@@ -193,10 +193,10 @@ vdso_img_insttargets := $(vdso_img_sodbg:%.dbg=install_%)
 $(MODLIB)/vdso: FORCE
        @mkdir -p $(MODLIB)/vdso
 
-$(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE
+$(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso
        $(call cmd,vdso_install)
 
 PHONY += vdso_install $(vdso_img_insttargets)
-vdso_install: $(vdso_img_insttargets) FORCE
+vdso_install: $(vdso_img_insttargets)
 
 clean-files := vdso32.so vdso32.so.dbg vdso64* vdso-image-*.c vdsox32.so*
index 73a75aa..33787ee 100644 (file)
@@ -2202,7 +2202,7 @@ static int backtrace_stack(void *data, char *name)
 
 static int backtrace_address(void *data, unsigned long addr, int reliable)
 {
-       struct perf_callchain_entry *entry = data;
+       struct perf_callchain_entry_ctx *entry = data;
 
        return perf_callchain_store(entry, addr);
 }
@@ -2214,7 +2214,7 @@ static const struct stacktrace_ops backtrace_ops = {
 };
 
 void
-perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
        if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
                /* TODO: We don't support guest os callchain now */
@@ -2268,7 +2268,7 @@ static unsigned long get_segment_base(unsigned int segment)
 #include <asm/compat.h>
 
 static inline int
-perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
+perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
 {
        /* 32-bit process in 64-bit kernel. */
        unsigned long ss_base, cs_base;
@@ -2283,7 +2283,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 
        fp = compat_ptr(ss_base + regs->bp);
        pagefault_disable();
-       while (entry->nr < sysctl_perf_event_max_stack) {
+       while (entry->nr < entry->max_stack) {
                unsigned long bytes;
                frame.next_frame     = 0;
                frame.return_address = 0;
@@ -2309,14 +2309,14 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 }
 #else
 static inline int
-perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
+perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
 {
     return 0;
 }
 #endif
 
 void
-perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
        struct stack_frame frame;
        const void __user *fp;
@@ -2343,7 +2343,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
                return;
 
        pagefault_disable();
-       while (entry->nr < sysctl_perf_event_max_stack) {
+       while (entry->nr < entry->max_stack) {
                unsigned long bytes;
                frame.next_frame             = NULL;
                frame.return_address = 0;
index 0a5ede1..eb05335 100644 (file)
@@ -826,7 +826,7 @@ static int p4_hw_config(struct perf_event *event)
                 * Clear bits we reserve to be managed by kernel itself
                 * and never allowed from a user space
                 */
-                event->attr.config &= P4_CONFIG_MASK;
+               event->attr.config &= P4_CONFIG_MASK;
 
                rc = p4_validate_raw_event(event);
                if (rc)
index 16c1789..fce7406 100644 (file)
@@ -891,7 +891,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
                return -ENODEV;
 
        pkg = topology_phys_to_logical_pkg(phys_id);
-       if (WARN_ON_ONCE(pkg < 0))
+       if (pkg < 0)
                return -EINVAL;
 
        if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) {
index f5e737f..cb26f18 100644 (file)
@@ -116,12 +116,12 @@ static struct linux_binfmt aout_format = {
        .min_coredump   = PAGE_SIZE
 };
 
-static unsigned long set_brk(unsigned long start, unsigned long end)
+static int set_brk(unsigned long start, unsigned long end)
 {
        start = PAGE_ALIGN(start);
        end = PAGE_ALIGN(end);
        if (end <= start)
-               return start;
+               return 0;
        return vm_brk(start, end - start);
 }
 
@@ -321,7 +321,7 @@ static int load_aout_binary(struct linux_binprm *bprm)
 
                error = vm_brk(text_addr & PAGE_MASK, map_size);
 
-               if (error != (text_addr & PAGE_MASK))
+               if (error)
                        return error;
 
                error = read_code(bprm->file, text_addr, 32,
@@ -350,7 +350,7 @@ static int load_aout_binary(struct linux_binprm *bprm)
 
                if (!bprm->file->f_op->mmap || (fd_offset & ~PAGE_MASK) != 0) {
                        error = vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
-                       if (IS_ERR_VALUE(error))
+                       if (error)
                                return error;
 
                        read_code(bprm->file, N_TXTADDR(ex), fd_offset,
@@ -378,7 +378,7 @@ static int load_aout_binary(struct linux_binprm *bprm)
 
 beyond_if:
        error = set_brk(current->mm->start_brk, current->mm->brk);
-       if (IS_ERR_VALUE(error))
+       if (error)
                return error;
 
        set_binfmt(&aout_format);
@@ -441,7 +441,7 @@ static int load_aout_library(struct file *file)
                }
 #endif
                retval = vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
-               if (IS_ERR_VALUE(retval))
+               if (retval)
                        goto out;
 
                read_code(file, start_addr, N_TXTOFF(ex),
@@ -461,9 +461,8 @@ static int load_aout_library(struct file *file)
        len = PAGE_ALIGN(ex.a_text + ex.a_data);
        bss = ex.a_text + ex.a_data + ex.a_bss;
        if (bss > len) {
-               error = vm_brk(start_addr + len, bss - len);
-               retval = error;
-               if (error != start_addr + len)
+               retval = vm_brk(start_addr + len, bss - len);
+               if (retval)
                        goto out;
        }
        retval = 0;
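These binfmt_aout hunks all rely on the updated vm_brk() calling convention in this merge window: it apparently returns 0 on success or a negative errno instead of the mapped start address, so callers test the result directly rather than comparing it against the requested address. A small sketch of the caller-side shape:

/*
 * Sketch of the convention the hunks above assume: vm_brk() returning
 * 0 / -errno rather than the mapped address.  map_bss() is a made-up
 * wrapper for illustration.
 */
#include <linux/mm.h>

static int map_bss(unsigned long start, unsigned long len)
{
	int err = vm_brk(start, len);

	if (err)		/* e.g. -ENOMEM */
		return err;
	return 0;		/* mapping established */
}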
index 08abf63..5490bba 100644 (file)
@@ -1,8 +1,16 @@
 #ifndef _ASM_X86_BUGS_H
 #define _ASM_X86_BUGS_H
 
+#include <asm/processor.h>
+
 extern void check_bugs(void);
 
+#if defined(CONFIG_CPU_SUP_INTEL)
+void check_mpx_erratum(struct cpuinfo_x86 *c);
+#else
+static inline void check_mpx_erratum(struct cpuinfo_x86 *c) {}
+#endif
+
 #if defined(CONFIG_CPU_SUP_INTEL) && defined(CONFIG_X86_32)
 int ppro_with_ram_bug(void);
 #else
index 25ebb54..483fb54 100644 (file)
@@ -64,9 +64,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
           (((bit)>>5)==11 && (1UL<<((bit)&31) & REQUIRED_MASK11)) ||   \
           (((bit)>>5)==12 && (1UL<<((bit)&31) & REQUIRED_MASK12)) ||   \
           (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK13)) ||   \
-          (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK14)) ||   \
-          (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK15)) ||   \
-          (((bit)>>5)==14 && (1UL<<((bit)&31) & REQUIRED_MASK16)) )
+          (((bit)>>5)==14 && (1UL<<((bit)&31) & REQUIRED_MASK14)) ||   \
+          (((bit)>>5)==15 && (1UL<<((bit)&31) & REQUIRED_MASK15)) ||   \
+          (((bit)>>5)==16 && (1UL<<((bit)&31) & REQUIRED_MASK16)) )
 
 #define DISABLED_MASK_BIT_SET(bit)                                     \
         ( (((bit)>>5)==0  && (1UL<<((bit)&31) & DISABLED_MASK0 )) ||   \
@@ -83,9 +83,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
           (((bit)>>5)==11 && (1UL<<((bit)&31) & DISABLED_MASK11)) ||   \
           (((bit)>>5)==12 && (1UL<<((bit)&31) & DISABLED_MASK12)) ||   \
           (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK13)) ||   \
-          (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK14)) ||   \
-          (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK15)) ||   \
-          (((bit)>>5)==14 && (1UL<<((bit)&31) & DISABLED_MASK16)) )
+          (((bit)>>5)==14 && (1UL<<((bit)&31) & DISABLED_MASK14)) ||   \
+          (((bit)>>5)==15 && (1UL<<((bit)&31) & DISABLED_MASK15)) ||   \
+          (((bit)>>5)==16 && (1UL<<((bit)&31) & DISABLED_MASK16)) )
 
 #define cpu_has(c, bit)                                                        \
        (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :  \
index 39343be..911e935 100644 (file)
 #endif /* CONFIG_X86_64 */
 
 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
-# define DISABLE_PKU           (1<<(X86_FEATURE_PKU))
-# define DISABLE_OSPKE         (1<<(X86_FEATURE_OSPKE))
-#else
 # define DISABLE_PKU           0
 # define DISABLE_OSPKE         0
+#else
+# define DISABLE_PKU           (1<<(X86_FEATURE_PKU & 31))
+# define DISABLE_OSPKE         (1<<(X86_FEATURE_OSPKE & 31))
 #endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
 
 /*
index ed65fe7..85029b5 100644 (file)
@@ -99,7 +99,7 @@ struct telemetry_core_ops {
        int (*reset_events)(void);
 };
 
-int telemetry_set_pltdata(struct telemetry_core_ops *ops,
+int telemetry_set_pltdata(const struct telemetry_core_ops *ops,
                          struct telemetry_plt_config *pltconfig);
 
 int telemetry_clear_pltdata(void);
diff --git a/arch/x86/include/asm/pmc_core.h b/arch/x86/include/asm/pmc_core.h
new file mode 100644 (file)
index 0000000..d4855f1
--- /dev/null
@@ -0,0 +1,27 @@
+/*
+ * Intel Core SoC Power Management Controller Header File
+ *
+ * Copyright (c) 2016, Intel Corporation.
+ * All Rights Reserved.
+ *
+ * Authors: Rajneesh Bhardwaj <rajneesh.bhardwaj@intel.com>
+ *          Vishwanath Somayaji <vishwanath.somayaji@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#ifndef _ASM_PMC_CORE_H
+#define _ASM_PMC_CORE_H
+
+/* API to read SLP_S0_RESIDENCY counter */
+int intel_pmc_slp_s0_counter_read(u32 *data);
+
+#endif /* _ASM_PMC_CORE_H */
index b9e9bb2..3725e14 100644 (file)
@@ -2,10 +2,12 @@
 #define _UAPI__SVM_H
 
 #define SVM_EXIT_READ_CR0      0x000
+#define SVM_EXIT_READ_CR2      0x002
 #define SVM_EXIT_READ_CR3      0x003
 #define SVM_EXIT_READ_CR4      0x004
 #define SVM_EXIT_READ_CR8      0x008
 #define SVM_EXIT_WRITE_CR0     0x010
+#define SVM_EXIT_WRITE_CR2     0x012
 #define SVM_EXIT_WRITE_CR3     0x013
 #define SVM_EXIT_WRITE_CR4     0x014
 #define SVM_EXIT_WRITE_CR8     0x018
 
 #define SVM_EXIT_REASONS \
        { SVM_EXIT_READ_CR0,    "read_cr0" }, \
+       { SVM_EXIT_READ_CR2,    "read_cr2" }, \
        { SVM_EXIT_READ_CR3,    "read_cr3" }, \
        { SVM_EXIT_READ_CR4,    "read_cr4" }, \
        { SVM_EXIT_READ_CR8,    "read_cr8" }, \
        { SVM_EXIT_WRITE_CR0,   "write_cr0" }, \
+       { SVM_EXIT_WRITE_CR2,   "write_cr2" }, \
        { SVM_EXIT_WRITE_CR3,   "write_cr3" }, \
        { SVM_EXIT_WRITE_CR4,   "write_cr4" }, \
        { SVM_EXIT_WRITE_CR8,   "write_cr8" }, \
        { SVM_EXIT_READ_DR1,    "read_dr1" }, \
        { SVM_EXIT_READ_DR2,    "read_dr2" }, \
        { SVM_EXIT_READ_DR3,    "read_dr3" }, \
+       { SVM_EXIT_READ_DR4,    "read_dr4" }, \
+       { SVM_EXIT_READ_DR5,    "read_dr5" }, \
+       { SVM_EXIT_READ_DR6,    "read_dr6" }, \
+       { SVM_EXIT_READ_DR7,    "read_dr7" }, \
        { SVM_EXIT_WRITE_DR0,   "write_dr0" }, \
        { SVM_EXIT_WRITE_DR1,   "write_dr1" }, \
        { SVM_EXIT_WRITE_DR2,   "write_dr2" }, \
        { SVM_EXIT_WRITE_DR3,   "write_dr3" }, \
+       { SVM_EXIT_WRITE_DR4,   "write_dr4" }, \
        { SVM_EXIT_WRITE_DR5,   "write_dr5" }, \
+       { SVM_EXIT_WRITE_DR6,   "write_dr6" }, \
        { SVM_EXIT_WRITE_DR7,   "write_dr7" }, \
+       { SVM_EXIT_EXCP_BASE + DE_VECTOR,       "DE excp" }, \
        { SVM_EXIT_EXCP_BASE + DB_VECTOR,       "DB excp" }, \
        { SVM_EXIT_EXCP_BASE + BP_VECTOR,       "BP excp" }, \
+       { SVM_EXIT_EXCP_BASE + OF_VECTOR,       "OF excp" }, \
+       { SVM_EXIT_EXCP_BASE + BR_VECTOR,       "BR excp" }, \
        { SVM_EXIT_EXCP_BASE + UD_VECTOR,       "UD excp" }, \
-       { SVM_EXIT_EXCP_BASE + PF_VECTOR,       "PF excp" }, \
        { SVM_EXIT_EXCP_BASE + NM_VECTOR,       "NM excp" }, \
+       { SVM_EXIT_EXCP_BASE + DF_VECTOR,       "DF excp" }, \
+       { SVM_EXIT_EXCP_BASE + TS_VECTOR,       "TS excp" }, \
+       { SVM_EXIT_EXCP_BASE + NP_VECTOR,       "NP excp" }, \
+       { SVM_EXIT_EXCP_BASE + SS_VECTOR,       "SS excp" }, \
+       { SVM_EXIT_EXCP_BASE + GP_VECTOR,       "GP excp" }, \
+       { SVM_EXIT_EXCP_BASE + PF_VECTOR,       "PF excp" }, \
+       { SVM_EXIT_EXCP_BASE + MF_VECTOR,       "MF excp" }, \
        { SVM_EXIT_EXCP_BASE + AC_VECTOR,       "AC excp" }, \
        { SVM_EXIT_EXCP_BASE + MC_VECTOR,       "MC excp" }, \
+       { SVM_EXIT_EXCP_BASE + XM_VECTOR,       "XF excp" }, \
        { SVM_EXIT_INTR,        "interrupt" }, \
        { SVM_EXIT_NMI,         "nmi" }, \
        { SVM_EXIT_SMI,         "smi" }, \
        { SVM_EXIT_INIT,        "init" }, \
        { SVM_EXIT_VINTR,       "vintr" }, \
        { SVM_EXIT_CR0_SEL_WRITE, "cr0_sel_write" }, \
+       { SVM_EXIT_IDTR_READ,   "read_idtr" }, \
+       { SVM_EXIT_GDTR_READ,   "read_gdtr" }, \
+       { SVM_EXIT_LDTR_READ,   "read_ldtr" }, \
+       { SVM_EXIT_TR_READ,     "read_rt" }, \
+       { SVM_EXIT_IDTR_WRITE,  "write_idtr" }, \
+       { SVM_EXIT_GDTR_WRITE,  "write_gdtr" }, \
+       { SVM_EXIT_LDTR_WRITE,  "write_ldtr" }, \
+       { SVM_EXIT_TR_WRITE,    "write_rt" }, \
+       { SVM_EXIT_RDTSC,       "rdtsc" }, \
+       { SVM_EXIT_RDPMC,       "rdpmc" }, \
+       { SVM_EXIT_PUSHF,       "pushf" }, \
+       { SVM_EXIT_POPF,        "popf" }, \
        { SVM_EXIT_CPUID,       "cpuid" }, \
+       { SVM_EXIT_RSM,         "rsm" }, \
+       { SVM_EXIT_IRET,        "iret" }, \
+       { SVM_EXIT_SWINT,       "swint" }, \
        { SVM_EXIT_INVD,        "invd" }, \
        { SVM_EXIT_PAUSE,       "pause" }, \
        { SVM_EXIT_HLT,         "hlt" }, \
        { SVM_EXIT_IOIO,        "io" }, \
        { SVM_EXIT_MSR,         "msr" }, \
        { SVM_EXIT_TASK_SWITCH, "task_switch" }, \
+       { SVM_EXIT_FERR_FREEZE, "ferr_freeze" }, \
        { SVM_EXIT_SHUTDOWN,    "shutdown" }, \
        { SVM_EXIT_VMRUN,       "vmrun" }, \
        { SVM_EXIT_VMMCALL,     "hypercall" }, \
        { SVM_EXIT_STGI,        "stgi" }, \
        { SVM_EXIT_CLGI,        "clgi" }, \
        { SVM_EXIT_SKINIT,      "skinit" }, \
+       { SVM_EXIT_RDTSCP,      "rdtscp" }, \
+       { SVM_EXIT_ICEBP,       "icebp" }, \
        { SVM_EXIT_WBINVD,      "wbinvd" }, \
        { SVM_EXIT_MONITOR,     "monitor" }, \
        { SVM_EXIT_MWAIT,       "mwait" }, \
        { SVM_EXIT_XSETBV,      "xsetbv" }, \
        { SVM_EXIT_NPF,         "npf" }, \
-       { SVM_EXIT_RSM,         "rsm" }, \
        { SVM_EXIT_AVIC_INCOMPLETE_IPI,         "avic_incomplete_ipi" }, \
-       { SVM_EXIT_AVIC_UNACCELERATED_ACCESS,   "avic_unaccelerated_access" }
+       { SVM_EXIT_AVIC_UNACCELERATED_ACCESS,   "avic_unaccelerated_access" }, \
+       { SVM_EXIT_ERR,         "invalid_guest_state" }
 
 
 #endif /* _UAPI__SVM_H */
index 6ef6ed9..0fe6953 100644 (file)
@@ -37,6 +37,7 @@
 #include <asm/mtrr.h>
 #include <linux/numa.h>
 #include <asm/asm.h>
+#include <asm/bugs.h>
 #include <asm/cpu.h>
 #include <asm/mce.h>
 #include <asm/msr.h>
@@ -270,6 +271,8 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
 static __init int setup_disable_smep(char *arg)
 {
        setup_clear_cpu_cap(X86_FEATURE_SMEP);
+       /* Check for things that depend on SMEP being enabled: */
+       check_mpx_erratum(&boot_cpu_data);
        return 1;
 }
 __setup("nosmep", setup_disable_smep);
@@ -310,6 +313,10 @@ static bool pku_disabled;
 
 static __always_inline void setup_pku(struct cpuinfo_x86 *c)
 {
+       /* check the boot processor, plus compile options for PKU: */
+       if (!cpu_feature_enabled(X86_FEATURE_PKU))
+               return;
+       /* checks the actual processor's cpuid bits: */
        if (!cpu_has(c, X86_FEATURE_PKU))
                return;
        if (pku_disabled)
index 8dae51f..6e2ffbe 100644 (file)
 #include <asm/apic.h>
 #endif
 
+/*
+ * Just in case our CPU detection goes bad, or you have a weird system,
+ * allow a way to override the automatic disabling of MPX.
+ */
+static int forcempx;
+
+static int __init forcempx_setup(char *__unused)
+{
+       forcempx = 1;
+
+       return 1;
+}
+__setup("intel-skd-046-workaround=disable", forcempx_setup);
+
+void check_mpx_erratum(struct cpuinfo_x86 *c)
+{
+       if (forcempx)
+               return;
+       /*
+        * Turn off the MPX feature on CPUs where SMEP is not
+        * available or disabled.
+        *
+        * Works around Intel Erratum SKD046: "Branch Instructions
+        * May Initialize MPX Bound Registers Incorrectly".
+        *
+        * This might falsely disable MPX on systems without
+        * SMEP, like Atom processors without SMEP.  But there
+        * is no such hardware known at the moment.
+        */
+       if (cpu_has(c, X86_FEATURE_MPX) && !cpu_has(c, X86_FEATURE_SMEP)) {
+               setup_clear_cpu_cap(X86_FEATURE_MPX);
+               pr_warn("x86/mpx: Disabling MPX since SMEP not present\n");
+       }
+}
+
 static void early_init_intel(struct cpuinfo_x86 *c)
 {
        u64 misc_enable;
@@ -173,6 +208,8 @@ static void early_init_intel(struct cpuinfo_x86 *c)
                if (edx & (1U << 28))
                        c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff);
        }
+
+       check_mpx_erratum(c);
 }
 
 #ifdef CONFIG_X86_32
index 6b16c36..6e789ca 100644 (file)
@@ -532,7 +532,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
 
        switch (code) {
        case ARCH_SET_GS:
-               if (addr >= TASK_SIZE_OF(task))
+               if (addr >= TASK_SIZE_MAX)
                        return -EPERM;
                cpu = get_cpu();
                task->thread.gsindex = 0;
@@ -546,7 +546,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
        case ARCH_SET_FS:
                /* Not strictly needed for fs, but do it for symmetry
                   with gs */
-               if (addr >= TASK_SIZE_OF(task))
+               if (addr >= TASK_SIZE_MAX)
                        return -EPERM;
                cpu = get_cpu();
                task->thread.fsindex = 0;
index e60ef91..600edd2 100644 (file)
@@ -392,7 +392,7 @@ static int putreg(struct task_struct *child,
 
 #ifdef CONFIG_X86_64
        case offsetof(struct user_regs_struct,fs_base):
-               if (value >= TASK_SIZE_OF(child))
+               if (value >= TASK_SIZE_MAX)
                        return -EIO;
                /*
                 * When changing the segment base, use do_arch_prctl
@@ -406,7 +406,7 @@ static int putreg(struct task_struct *child,
                /*
                 * Exactly the same here as the %fs handling above.
                 */
-               if (value >= TASK_SIZE_OF(child))
+               if (value >= TASK_SIZE_MAX)
                        return -EIO;
                if (child->thread.gsbase != value)
                        return do_arch_prctl(child, ARCH_SET_GS, value);
index 6aa0f4d..9911a06 100644 (file)
@@ -23,6 +23,7 @@
 #include <asm/param.h>
 
 /* CPU reference clock frequency: in KHz */
+#define FREQ_80                80000
 #define FREQ_83                83200
 #define FREQ_100       99840
 #define FREQ_133       133200
@@ -56,6 +57,8 @@ static struct freq_desc freq_desc_tables[] = {
        { 6, 0x37, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_166, 0, 0, 0, 0 } },
        /* ANN */
        { 6, 0x5a, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_100, 0, 0, 0, 0 } },
+       /* AIRMONT */
+       { 6, 0x4c, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_166, FREQ_80, 0, 0, 0 } },
 };
 
 static int match_cpu(u8 family, u8 model)
index 769af90..7597b42 100644 (file)
@@ -181,19 +181,22 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
                             struct kvm_cpuid_entry __user *entries)
 {
        int r, i;
-       struct kvm_cpuid_entry *cpuid_entries;
+       struct kvm_cpuid_entry *cpuid_entries = NULL;
 
        r = -E2BIG;
        if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
                goto out;
        r = -ENOMEM;
-       cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
-       if (!cpuid_entries)
-               goto out;
-       r = -EFAULT;
-       if (copy_from_user(cpuid_entries, entries,
-                          cpuid->nent * sizeof(struct kvm_cpuid_entry)))
-               goto out_free;
+       if (cpuid->nent) {
+               cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) *
+                                       cpuid->nent);
+               if (!cpuid_entries)
+                       goto out;
+               r = -EFAULT;
+               if (copy_from_user(cpuid_entries, entries,
+                                  cpuid->nent * sizeof(struct kvm_cpuid_entry)))
+                       goto out;
+       }
        for (i = 0; i < cpuid->nent; i++) {
                vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
                vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
@@ -212,9 +215,8 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
        kvm_x86_ops->cpuid_update(vcpu);
        r = kvm_update_cpuid(vcpu);
 
-out_free:
-       vfree(cpuid_entries);
 out:
+       vfree(cpuid_entries);
        return r;
 }
 
index 24e8001..def97b3 100644 (file)
@@ -336,12 +336,12 @@ static gfn_t pse36_gfn_delta(u32 gpte)
 #ifdef CONFIG_X86_64
 static void __set_spte(u64 *sptep, u64 spte)
 {
-       *sptep = spte;
+       WRITE_ONCE(*sptep, spte);
 }
 
 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
 {
-       *sptep = spte;
+       WRITE_ONCE(*sptep, spte);
 }
 
 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
@@ -390,7 +390,7 @@ static void __set_spte(u64 *sptep, u64 spte)
         */
        smp_wmb();
 
-       ssptep->spte_low = sspte.spte_low;
+       WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
 }
 
 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
@@ -400,7 +400,7 @@ static void __update_clear_spte_fast(u64 *sptep, u64 spte)
        ssptep = (union split_spte *)sptep;
        sspte = (union split_spte)spte;
 
-       ssptep->spte_low = sspte.spte_low;
+       WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
 
        /*
         * If we map the spte from present to nonpresent, we should clear
index 2214214..1163e81 100644 (file)
@@ -84,7 +84,7 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
 #define TSC_RATIO_MIN          0x0000000000000001ULL
 #define TSC_RATIO_MAX          0x000000ffffffffffULL
 
-#define AVIC_HPA_MASK  ~((0xFFFULL << 52) || 0xFFF)
+#define AVIC_HPA_MASK  ~((0xFFFULL << 52) | 0xFFF)
 
 /*
  * 0xff is broadcast, so the max index allowed for physical APIC ID
@@ -3597,7 +3597,7 @@ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
        u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
        u32 icrl = svm->vmcb->control.exit_info_1;
        u32 id = svm->vmcb->control.exit_info_2 >> 32;
-       u32 index = svm->vmcb->control.exit_info_2 && 0xFF;
+       u32 index = svm->vmcb->control.exit_info_2 & 0xFF;
        struct kvm_lapic *apic = svm->vcpu.arch.apic;
 
        trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index);
index e605d1e..fb93010 100644 (file)
@@ -2418,7 +2418,9 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
 
        if (is_guest_mode(vcpu))
                msr_bitmap = vmx_msr_bitmap_nested;
-       else if (vcpu->arch.apic_base & X2APIC_ENABLE) {
+       else if (cpu_has_secondary_exec_ctrls() &&
+                (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
+                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
                if (is_long_mode(vcpu))
                        msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
                else
@@ -4787,6 +4789,19 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
        vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
+       if (cpu_has_secondary_exec_ctrls()) {
+               if (kvm_vcpu_apicv_active(vcpu))
+                       vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
+                                     SECONDARY_EXEC_APIC_REGISTER_VIRT |
+                                     SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+               else
+                       vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
+                                       SECONDARY_EXEC_APIC_REGISTER_VIRT |
+                                       SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+       }
+
+       if (cpu_has_vmx_msr_bitmap())
+               vmx_set_msr_bitmap(vcpu);
 }
 
 static u32 vmx_exec_control(struct vcpu_vmx *vmx)
@@ -6333,23 +6348,20 @@ static __init int hardware_setup(void)
 
        set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
 
-       if (enable_apicv) {
-               for (msr = 0x800; msr <= 0x8ff; msr++)
-                       vmx_disable_intercept_msr_read_x2apic(msr);
-
-               /* According SDM, in x2apic mode, the whole id reg is used.
-                * But in KVM, it only use the highest eight bits. Need to
-                * intercept it */
-               vmx_enable_intercept_msr_read_x2apic(0x802);
-               /* TMCCT */
-               vmx_enable_intercept_msr_read_x2apic(0x839);
-               /* TPR */
-               vmx_disable_intercept_msr_write_x2apic(0x808);
-               /* EOI */
-               vmx_disable_intercept_msr_write_x2apic(0x80b);
-               /* SELF-IPI */
-               vmx_disable_intercept_msr_write_x2apic(0x83f);
-       }
+       for (msr = 0x800; msr <= 0x8ff; msr++)
+               vmx_disable_intercept_msr_read_x2apic(msr);
+
+       /* According SDM, in x2apic mode, the whole id reg is used.  But in
+        * KVM, it only use the highest eight bits. Need to intercept it */
+       vmx_enable_intercept_msr_read_x2apic(0x802);
+       /* TMCCT */
+       vmx_enable_intercept_msr_read_x2apic(0x839);
+       /* TPR */
+       vmx_disable_intercept_msr_write_x2apic(0x808);
+       /* EOI */
+       vmx_disable_intercept_msr_write_x2apic(0x80b);
+       /* SELF-IPI */
+       vmx_disable_intercept_msr_write_x2apic(0x83f);
 
        if (enable_ept) {
                kvm_mmu_set_mask_ptes(0ull,
index c805cf4..902d9da 100644 (file)
@@ -2314,6 +2314,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_AMD64_NB_CFG:
        case MSR_FAM10H_MMIO_CONF_BASE:
        case MSR_AMD64_BU_CFG2:
+       case MSR_IA32_PERF_CTL:
                msr_info->data = 0;
                break;
        case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
@@ -2972,6 +2973,10 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                              | KVM_VCPUEVENT_VALID_SMM))
                return -EINVAL;
 
+       if (events->exception.injected &&
+           (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
+               return -EINVAL;
+
        process_nmi(vcpu);
        vcpu->arch.exception.pending = events->exception.injected;
        vcpu->arch.exception.nr = events->exception.nr;
@@ -3036,6 +3041,11 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
        if (dbgregs->flags)
                return -EINVAL;
 
+       if (dbgregs->dr6 & ~0xffffffffull)
+               return -EINVAL;
+       if (dbgregs->dr7 & ~0xffffffffull)
+               return -EINVAL;
+
        memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
        kvm_update_dr0123(vcpu);
        vcpu->arch.dr6 = dbgregs->dr6;
@@ -7815,7 +7825,7 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
 
        slot = id_to_memslot(slots, id);
        if (size) {
-               if (WARN_ON(slot->npages))
+               if (slot->npages)
                        return -EEXIST;
 
                /*
index 5ce1ed0..7d1fa7c 100644 (file)
@@ -292,7 +292,7 @@ void vmalloc_sync_all(void)
                return;
 
        for (address = VMALLOC_START & PMD_MASK;
-            address >= TASK_SIZE && address < FIXADDR_TOP;
+            address >= TASK_SIZE_MAX && address < FIXADDR_TOP;
             address += PMD_SIZE) {
                struct page *page;
 
@@ -854,8 +854,13 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
                                return;
                }
 #endif
-               /* Kernel addresses are always protection faults: */
-               if (address >= TASK_SIZE)
+
+               /*
+                * To avoid leaking information about the kernel page table
+                * layout, pretend that user-mode accesses to kernel addresses
+                * are always protection faults.
+                */
+               if (address >= TASK_SIZE_MAX)
                        error_code |= PF_PROT;
 
                if (likely(show_unhandled_signals))
index 4bd08b0..99ddab7 100644 (file)
@@ -491,8 +491,11 @@ int __init pci_xen_initial_domain(void)
 #endif
        __acpi_register_gsi = acpi_register_gsi_xen;
        __acpi_unregister_gsi = NULL;
-       /* Pre-allocate legacy irqs */
-       for (irq = 0; irq < nr_legacy_irqs(); irq++) {
+       /*
+        * Pre-allocate the legacy IRQs.  Use NR_IRQS_LEGACY here
+        * because we don't have a PIC and thus nr_legacy_irqs() is zero.
+        */
+       for (irq = 0; irq < NR_IRQS_LEGACY; irq++) {
                int trigger, polarity;
 
                if (acpi_get_override_irq(irq, &trigger, &polarity) == -1)
index 92723ae..cd95075 100644 (file)
@@ -11,7 +11,6 @@
 #include <asm/msr.h>
 #include <asm/processor-flags.h>
 #include <asm/page_types.h>
-#include <asm/frame.h>
 
 #define SAVE_XMM                       \
        mov %rsp, %rax;                 \
        mov (%rsp), %rsp
 
 ENTRY(efi_call)
-       FRAME_BEGIN
+       pushq %rbp
+       movq %rsp, %rbp
        SAVE_XMM
-       mov (%rsp), %rax
-       mov 8(%rax), %rax
+       mov 16(%rbp), %rax
        subq $48, %rsp
        mov %r9, 32(%rsp)
        mov %rax, 40(%rsp)
@@ -53,6 +52,6 @@ ENTRY(efi_call)
        call *%rdi
        addq $48, %rsp
        RESTORE_XMM
-       FRAME_END
+       popq %rbp
        ret
 ENDPROC(efi_call)
index 92e3e1d..12734a9 100644 (file)
@@ -26,7 +26,5 @@ quiet_cmd_bin2c = BIN2C   $@
 
 $(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE
        $(call if_changed,bin2c)
-       @:
-
 
 obj-$(CONFIG_KEXEC_FILE)       += kexec-purgatory.o
index b959646..c556c5a 100644 (file)
@@ -59,7 +59,6 @@ OBJCOPYFLAGS_realmode.bin := -O binary
 targets += realmode.bin
 $(obj)/realmode.bin: $(obj)/realmode.elf $(obj)/realmode.relocs FORCE
        $(call if_changed,objcopy)
-       @:
 
 quiet_cmd_relocs = RELOCS  $@
       cmd_relocs = arch/x86/tools/relocs --realmode $< > $@
index 41bfe84..00f54a9 100644 (file)
 #endif
 #include <longjmp.h>
 #include <sysdep/ptrace_user.h>
+#include <sys/uio.h>
+#include <asm/sigcontext.h>
+#include <linux/elf.h>
 
-int save_fp_registers(int pid, unsigned long *fp_regs)
+int have_xstate_support;
+
+int save_i387_registers(int pid, unsigned long *fp_regs)
 {
        if (ptrace(PTRACE_GETFPREGS, pid, 0, fp_regs) < 0)
                return -errno;
        return 0;
 }
 
-int restore_fp_registers(int pid, unsigned long *fp_regs)
+int save_fp_registers(int pid, unsigned long *fp_regs)
+{
+       struct iovec iov;
+
+       if (have_xstate_support) {
+               iov.iov_base = fp_regs;
+               iov.iov_len = sizeof(struct _xstate);
+               if (ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov) < 0)
+                       return -errno;
+               return 0;
+       } else {
+               return save_i387_registers(pid, fp_regs);
+       }
+}
+
+int restore_i387_registers(int pid, unsigned long *fp_regs)
 {
        if (ptrace(PTRACE_SETFPREGS, pid, 0, fp_regs) < 0)
                return -errno;
        return 0;
 }
 
+int restore_fp_registers(int pid, unsigned long *fp_regs)
+{
+       struct iovec iov;
+
+       if (have_xstate_support) {
+               iov.iov_base = fp_regs;
+               iov.iov_len = sizeof(struct _xstate);
+               if (ptrace(PTRACE_SETREGSET, pid, NT_X86_XSTATE, &iov) < 0)
+                       return -errno;
+               return 0;
+       } else {
+               return restore_i387_registers(pid, fp_regs);
+       }
+}
+
 #ifdef __i386__
 int have_fpx_regs = 1;
 int save_fpx_registers(int pid, unsigned long *fp_regs)
@@ -85,6 +120,16 @@ int put_fp_registers(int pid, unsigned long *regs)
        return restore_fp_registers(pid, regs);
 }
 
+void arch_init_registers(int pid)
+{
+       struct _xstate fp_regs;
+       struct iovec iov;
+
+       iov.iov_base = &fp_regs;
+       iov.iov_len = sizeof(struct _xstate);
+       if (ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov) == 0)
+               have_xstate_support = 1;
+}
 #endif
 
 unsigned long get_thread_reg(int reg, jmp_buf *buf)
index 47c78d5..ebd4dd6 100644 (file)
@@ -194,7 +194,8 @@ static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *c
        int err, n, cpu = ((struct thread_info *) child->stack)->cpu;
        struct user_i387_struct fpregs;
 
-       err = save_fp_registers(userspace_pid[cpu], (unsigned long *) &fpregs);
+       err = save_i387_registers(userspace_pid[cpu],
+                                 (unsigned long *) &fpregs);
        if (err)
                return err;
 
@@ -214,7 +215,7 @@ static int set_fpregs(struct user_i387_struct __user *buf, struct task_struct *c
        if (n > 0)
                return -EFAULT;
 
-       return restore_fp_registers(userspace_pid[cpu],
+       return restore_i387_registers(userspace_pid[cpu],
                                    (unsigned long *) &fpregs);
 }
 
index a629694..faab418 100644 (file)
@@ -222,14 +222,14 @@ int is_syscall(unsigned long addr)
 static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *child)
 {
        int err, n, cpu = ((struct thread_info *) child->stack)->cpu;
-       long fpregs[HOST_FP_SIZE];
+       struct user_i387_struct fpregs;
 
-       BUG_ON(sizeof(*buf) != sizeof(fpregs));
-       err = save_fp_registers(userspace_pid[cpu], fpregs);
+       err = save_i387_registers(userspace_pid[cpu],
+                                 (unsigned long *) &fpregs);
        if (err)
                return err;
 
-       n = copy_to_user(buf, fpregs, sizeof(fpregs));
+       n = copy_to_user(buf, &fpregs, sizeof(fpregs));
        if (n > 0)
                return -EFAULT;
 
@@ -239,14 +239,14 @@ static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *c
 static int set_fpregs(struct user_i387_struct __user *buf, struct task_struct *child)
 {
        int n, cpu = ((struct thread_info *) child->stack)->cpu;
-       long fpregs[HOST_FP_SIZE];
+       struct user_i387_struct fpregs;
 
-       BUG_ON(sizeof(*buf) != sizeof(fpregs));
-       n = copy_from_user(fpregs, buf, sizeof(fpregs));
+       n = copy_from_user(&fpregs, buf, sizeof(fpregs));
        if (n > 0)
                return -EFAULT;
 
-       return restore_fp_registers(userspace_pid[cpu], fpregs);
+       return restore_i387_registers(userspace_pid[cpu],
+                                     (unsigned long *) &fpregs);
 }
 
 long subarch_ptrace(struct task_struct *child, long request,
index 919789f..0dc223a 100644 (file)
@@ -57,8 +57,6 @@
 #define UPT_SYSCALL_ARG5(r) UPT_R8(r)
 #define UPT_SYSCALL_ARG6(r) UPT_R9(r)
 
-static inline void arch_init_registers(int pid)
-{
-}
+extern void arch_init_registers(int pid);
 
 #endif
index 14fcd01..49e5036 100644 (file)
@@ -225,26 +225,16 @@ static int copy_sc_from_user(struct pt_regs *regs,
        } else
 #endif
        {
-               struct user_i387_struct fp;
-
-               err = copy_from_user(&fp, (void *)sc.fpstate,
-                                    sizeof(struct user_i387_struct));
+               err = copy_from_user(regs->regs.fp, (void *)sc.fpstate,
+                                    sizeof(struct _xstate));
                if (err)
                        return 1;
-
-               err = restore_fp_registers(pid, (unsigned long *) &fp);
-               if (err < 0) {
-                       printk(KERN_ERR "copy_sc_from_user - "
-                              "restore_fp_registers failed, errno = %d\n",
-                              -err);
-                       return 1;
-               }
        }
        return 0;
 }
 
 static int copy_sc_to_user(struct sigcontext __user *to,
-                          struct _fpstate __user *to_fp, struct pt_regs *regs,
+                          struct _xstate __user *to_fp, struct pt_regs *regs,
                           unsigned long mask)
 {
        struct sigcontext sc;
@@ -310,25 +300,22 @@ static int copy_sc_to_user(struct sigcontext __user *to,
                        return 1;
                }
 
-               err = convert_fxsr_to_user(to_fp, &fpx);
+               err = convert_fxsr_to_user(&to_fp->fpstate, &fpx);
                if (err)
                        return 1;
 
-               err |= __put_user(fpx.swd, &to_fp->status);
-               err |= __put_user(X86_FXSR_MAGIC, &to_fp->magic);
+               err |= __put_user(fpx.swd, &to_fp->fpstate.status);
+               err |= __put_user(X86_FXSR_MAGIC, &to_fp->fpstate.magic);
                if (err)
                        return 1;
 
-               if (copy_to_user(&to_fp->_fxsr_env[0], &fpx,
+               if (copy_to_user(&to_fp->fpstate._fxsr_env[0], &fpx,
                                 sizeof(struct user_fxsr_struct)))
                        return 1;
        } else
 #endif
        {
-               struct user_i387_struct fp;
-
-               err = save_fp_registers(pid, (unsigned long *) &fp);
-               if (copy_to_user(to_fp, &fp, sizeof(struct user_i387_struct)))
+               if (copy_to_user(to_fp, regs->regs.fp, sizeof(struct _xstate)))
                        return 1;
        }
 
@@ -337,7 +324,7 @@ static int copy_sc_to_user(struct sigcontext __user *to,
 
 #ifdef CONFIG_X86_32
 static int copy_ucontext_to_user(struct ucontext __user *uc,
-                                struct _fpstate __user *fp, sigset_t *set,
+                                struct _xstate __user *fp, sigset_t *set,
                                 unsigned long sp)
 {
        int err = 0;
@@ -353,7 +340,7 @@ struct sigframe
        char __user *pretcode;
        int sig;
        struct sigcontext sc;
-       struct _fpstate fpstate;
+       struct _xstate fpstate;
        unsigned long extramask[_NSIG_WORDS-1];
        char retcode[8];
 };
@@ -366,7 +353,7 @@ struct rt_sigframe
        void __user *puc;
        struct siginfo info;
        struct ucontext uc;
-       struct _fpstate fpstate;
+       struct _xstate fpstate;
        char retcode[8];
 };
 
@@ -495,7 +482,7 @@ struct rt_sigframe
        char __user *pretcode;
        struct ucontext uc;
        struct siginfo info;
-       struct _fpstate fpstate;
+       struct _xstate fpstate;
 };
 
 int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig,
index 470564b..cb3c223 100644 (file)
@@ -50,7 +50,7 @@ void foo(void)
        DEFINE(HOST_GS, GS);
        DEFINE(HOST_ORIG_AX, ORIG_EAX);
 #else
-       DEFINE(HOST_FP_SIZE, sizeof(struct _fpstate) / sizeof(unsigned long));
+       DEFINE(HOST_FP_SIZE, sizeof(struct _xstate) / sizeof(unsigned long));
        DEFINE_LONGS(HOST_BX, RBX);
        DEFINE_LONGS(HOST_CX, RCX);
        DEFINE_LONGS(HOST_DI, RDI);
index 7ab2951..e345891 100644 (file)
@@ -393,6 +393,9 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
        unsigned long i = 0;
        unsigned long n = end_pfn - start_pfn;
 
+       if (remap_pfn == 0)
+               remap_pfn = nr_pages;
+
        while (i < n) {
                unsigned long cur_pfn = start_pfn + i;
                unsigned long left = n - i;
@@ -438,17 +441,29 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
        return remap_pfn;
 }
 
-static void __init xen_set_identity_and_remap(unsigned long nr_pages)
+static unsigned long __init xen_count_remap_pages(
+       unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
+       unsigned long remap_pages)
+{
+       if (start_pfn >= nr_pages)
+               return remap_pages;
+
+       return remap_pages + min(end_pfn, nr_pages) - start_pfn;
+}
+
+static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages,
+       unsigned long (*func)(unsigned long start_pfn, unsigned long end_pfn,
+                             unsigned long nr_pages, unsigned long last_val))
 {
        phys_addr_t start = 0;
-       unsigned long last_pfn = nr_pages;
+       unsigned long ret_val = 0;
        const struct e820entry *entry = xen_e820_map;
        int i;
 
        /*
         * Combine non-RAM regions and gaps until a RAM region (or the
-        * end of the map) is reached, then set the 1:1 map and
-        * remap the memory in those non-RAM regions.
+        * end of the map) is reached, then call the provided function
+        * to perform its duty on the non-RAM region.
         *
         * The combined non-RAM regions are rounded to a whole number
         * of pages so any partial pages are accessible via the 1:1
@@ -466,14 +481,13 @@ static void __init xen_set_identity_and_remap(unsigned long nr_pages)
                                end_pfn = PFN_UP(entry->addr);
 
                        if (start_pfn < end_pfn)
-                               last_pfn = xen_set_identity_and_remap_chunk(
-                                               start_pfn, end_pfn, nr_pages,
-                                               last_pfn);
+                               ret_val = func(start_pfn, end_pfn, nr_pages,
+                                              ret_val);
                        start = end;
                }
        }
 
-       pr_info("Released %ld page(s)\n", xen_released_pages);
+       return ret_val;
 }
 
 /*
@@ -596,35 +610,6 @@ static void __init xen_ignore_unusable(void)
        }
 }
 
-static unsigned long __init xen_count_remap_pages(unsigned long max_pfn)
-{
-       unsigned long extra = 0;
-       unsigned long start_pfn, end_pfn;
-       const struct e820entry *entry = xen_e820_map;
-       int i;
-
-       end_pfn = 0;
-       for (i = 0; i < xen_e820_map_entries; i++, entry++) {
-               start_pfn = PFN_DOWN(entry->addr);
-               /* Adjacent regions on non-page boundaries handling! */
-               end_pfn = min(end_pfn, start_pfn);
-
-               if (start_pfn >= max_pfn)
-                       return extra + max_pfn - end_pfn;
-
-               /* Add any holes in map to result. */
-               extra += start_pfn - end_pfn;
-
-               end_pfn = PFN_UP(entry->addr + entry->size);
-               end_pfn = min(end_pfn, max_pfn);
-
-               if (entry->type != E820_RAM)
-                       extra += end_pfn - start_pfn;
-       }
-
-       return extra;
-}
-
 bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size)
 {
        struct e820entry *entry;
@@ -804,7 +789,7 @@ char * __init xen_memory_setup(void)
        max_pages = xen_get_max_pages();
 
        /* How many extra pages do we need due to remapping? */
-       max_pages += xen_count_remap_pages(max_pfn);
+       max_pages += xen_foreach_remap_area(max_pfn, xen_count_remap_pages);
 
        if (max_pages > max_pfn)
                extra_pages += max_pages - max_pfn;
@@ -922,7 +907,9 @@ char * __init xen_memory_setup(void)
         * Set identity map on non-RAM pages and prepare remapping the
         * underlying RAM.
         */
-       xen_set_identity_and_remap(max_pfn);
+       xen_foreach_remap_area(max_pfn, xen_set_identity_and_remap_chunk);
+
+       pr_info("Released %ld page(s)\n", xen_released_pages);
 
        return "Xen";
 }
index a0a4e55..6deba5b 100644 (file)
@@ -290,11 +290,11 @@ static int xen_vcpuop_set_next_event(unsigned long delta,
        WARN_ON(!clockevent_state_oneshot(evt));
 
        single.timeout_abs_ns = get_abs_timeout(delta);
-       single.flags = VCPU_SSHOTTMR_future;
+       /* Get an event anyway, even if the timeout is already expired */
+       single.flags = 0;
 
        ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);
-
-       BUG_ON(ret != 0 && ret != -ETIME);
+       BUG_ON(ret != 0);
 
        return ret;
 }
index a6b00b3..ef90479 100644 (file)
@@ -323,23 +323,23 @@ static void xtensa_pmu_read(struct perf_event *event)
 
 static int callchain_trace(struct stackframe *frame, void *data)
 {
-       struct perf_callchain_entry *entry = data;
+       struct perf_callchain_entry_ctx *entry = data;
 
        perf_callchain_store(entry, frame->pc);
        return 0;
 }
 
-void perf_callchain_kernel(struct perf_callchain_entry *entry,
+void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
                           struct pt_regs *regs)
 {
-       xtensa_backtrace_kernel(regs, sysctl_perf_event_max_stack,
+       xtensa_backtrace_kernel(regs, entry->max_stack,
                                callchain_trace, NULL, entry);
 }
 
-void perf_callchain_user(struct perf_callchain_entry *entry,
+void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
                         struct pt_regs *regs)
 {
-       xtensa_backtrace_user(regs, sysctl_perf_event_max_stack,
+       xtensa_backtrace_user(regs, entry->max_stack,
                              callchain_trace, entry);
 }
 
index 7df9c92..29cbc1b 100644 (file)
@@ -2020,7 +2020,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 
        q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
        if (!q->queue_ctx)
-               return ERR_PTR(-ENOMEM);
+               goto err_exit;
 
        q->queue_hw_ctx = kzalloc_node(nr_cpu_ids * sizeof(*(q->queue_hw_ctx)),
                                                GFP_KERNEL, set->numa_node);
@@ -2084,6 +2084,8 @@ err_map:
        kfree(q->queue_hw_ctx);
 err_percpu:
        free_percpu(q->queue_ctx);
+err_exit:
+       q->mq_ops = NULL;
        return ERR_PTR(-ENOMEM);
 }
 EXPORT_SYMBOL(blk_mq_init_allocated_queue);
index 698c793..ed2397f 100644 (file)
@@ -4,7 +4,6 @@
 #include <linux/gfp.h>
 #include <linux/blkpg.h>
 #include <linux/hdreg.h>
-#include <linux/badblocks.h>
 #include <linux/backing-dev.h>
 #include <linux/fs.h>
 #include <linux/blktrace_api.h>
index e28e912..331f6ba 100644 (file)
@@ -13,6 +13,7 @@ config ASYMMETRIC_PUBLIC_KEY_SUBTYPE
        tristate "Asymmetric public-key crypto algorithm subtype"
        select MPILIB
        select CRYPTO_HASH_INFO
+       select CRYPTO_AKCIPHER
        help
          This option provides support for asymmetric public key type handling.
          If signature generation and/or verification are to be used,
index 15e4604..1f41284 100644 (file)
@@ -265,7 +265,7 @@ static int acpi_aml_write_kern(const char *buf, int len)
        char *p;
 
        ret = acpi_aml_lock_write(crc, ACPI_AML_OUT_KERN);
-       if (IS_ERR_VALUE(ret))
+       if (ret < 0)
                return ret;
        /* sync tail before inserting logs */
        smp_mb();
@@ -286,7 +286,7 @@ static int acpi_aml_readb_kern(void)
        char *p;
 
        ret = acpi_aml_lock_read(crc, ACPI_AML_IN_KERN);
-       if (IS_ERR_VALUE(ret))
+       if (ret < 0)
                return ret;
        /* sync head before removing cmds */
        smp_rmb();
@@ -330,7 +330,7 @@ again:
                                goto again;
                        break;
                }
-               if (IS_ERR_VALUE(ret))
+               if (ret < 0)
                        break;
                size += ret;
                count -= ret;
@@ -373,7 +373,7 @@ again:
                        if (ret == 0)
                                goto again;
                }
-               if (IS_ERR_VALUE(ret))
+               if (ret < 0)
                        break;
                *(msg + size) = (char)ret;
                size++;
@@ -526,7 +526,7 @@ static int acpi_aml_open(struct inode *inode, struct file *file)
        }
        acpi_aml_io.users++;
 err_lock:
-       if (IS_ERR_VALUE(ret)) {
+       if (ret < 0) {
                if (acpi_aml_active_reader == file)
                        acpi_aml_active_reader = NULL;
        }
@@ -587,7 +587,7 @@ static int acpi_aml_read_user(char __user *buf, int len)
        char *p;
 
        ret = acpi_aml_lock_read(crc, ACPI_AML_OUT_USER);
-       if (IS_ERR_VALUE(ret))
+       if (ret < 0)
                return ret;
        /* sync head before removing logs */
        smp_rmb();
@@ -602,7 +602,7 @@ static int acpi_aml_read_user(char __user *buf, int len)
        crc->tail = (crc->tail + n) & (ACPI_AML_BUF_SIZE - 1);
        ret = n;
 out:
-       acpi_aml_unlock_fifo(ACPI_AML_OUT_USER, !IS_ERR_VALUE(ret));
+       acpi_aml_unlock_fifo(ACPI_AML_OUT_USER, !ret);
        return ret;
 }
 
@@ -634,7 +634,7 @@ again:
                                        goto again;
                        }
                }
-               if (IS_ERR_VALUE(ret)) {
+               if (ret < 0) {
                        if (!acpi_aml_running())
                                ret = 0;
                        break;
@@ -657,7 +657,7 @@ static int acpi_aml_write_user(const char __user *buf, int len)
        char *p;
 
        ret = acpi_aml_lock_write(crc, ACPI_AML_IN_USER);
-       if (IS_ERR_VALUE(ret))
+       if (ret < 0)
                return ret;
        /* sync tail before inserting cmds */
        smp_mb();
@@ -672,7 +672,7 @@ static int acpi_aml_write_user(const char __user *buf, int len)
        crc->head = (crc->head + n) & (ACPI_AML_BUF_SIZE - 1);
        ret = n;
 out:
-       acpi_aml_unlock_fifo(ACPI_AML_IN_USER, !IS_ERR_VALUE(ret));
+       acpi_aml_unlock_fifo(ACPI_AML_IN_USER, !ret);
        return n;
 }
 
@@ -704,7 +704,7 @@ again:
                                        goto again;
                        }
                }
-               if (IS_ERR_VALUE(ret)) {
+               if (ret < 0) {
                        if (!acpi_aml_running())
                                ret = 0;
                        break;
index 0d92d0f..c7ba948 100644 (file)
@@ -331,15 +331,6 @@ static int acpi_processor_get_info(struct acpi_device *device)
                pr->throttling.duty_width = acpi_gbl_FADT.duty_width;
 
                pr->pblk = object.processor.pblk_address;
-
-               /*
-                * We don't care about error returns - we just try to mark
-                * these reserved so that nobody else is confused into thinking
-                * that this region might be unused..
-                *
-                * (In particular, allocating the IO range for Cardbus)
-                */
-               request_region(pr->throttling.address, 6, "ACPI CPU throttle");
        }
 
        /*
index 3d5b8a0..c1d138e 100644 (file)
@@ -754,7 +754,8 @@ static int acpi_video_bqc_quirk(struct acpi_video_device *device,
 }
 
 int acpi_video_get_levels(struct acpi_device *device,
-                         struct acpi_video_device_brightness **dev_br)
+                         struct acpi_video_device_brightness **dev_br,
+                         int *pmax_level)
 {
        union acpi_object *obj = NULL;
        int i, max_level = 0, count = 0, level_ac_battery = 0;
@@ -841,6 +842,8 @@ int acpi_video_get_levels(struct acpi_device *device,
 
        br->count = count;
        *dev_br = br;
+       if (pmax_level)
+               *pmax_level = max_level;
 
 out:
        kfree(obj);
@@ -869,7 +872,7 @@ acpi_video_init_brightness(struct acpi_video_device *device)
        struct acpi_video_device_brightness *br = NULL;
        int result = -EINVAL;
 
-       result = acpi_video_get_levels(device->dev, &br);
+       result = acpi_video_get_levels(device->dev, &br, &max_level);
        if (result)
                return result;
        device->brightness = br;
@@ -1737,7 +1740,7 @@ static void acpi_video_run_bcl_for_osi(struct acpi_video_bus *video)
 
        mutex_lock(&video->device_list_lock);
        list_for_each_entry(dev, &video->video_device_list, entry) {
-               if (!acpi_video_device_lcd_query_levels(dev, &levels))
+               if (!acpi_video_device_lcd_query_levels(dev->dev->handle, &levels))
                        kfree(levels);
        }
        mutex_unlock(&video->device_list_lock);
index 0f18dbc..daceb80 100644 (file)
@@ -83,27 +83,22 @@ acpi_hw_write_multiple(u32 value,
 static u8
 acpi_hw_get_access_bit_width(struct acpi_generic_address *reg, u8 max_bit_width)
 {
-       u64 address;
-
        if (!reg->access_width) {
+               if (reg->space_id == ACPI_ADR_SPACE_SYSTEM_IO) {
+                       max_bit_width = 32;
+               }
+
                /*
                 * Detect old register descriptors where only the bit_width field
-                * makes senses. The target address is copied to handle possible
-                * alignment issues.
+                * makes senses.
                 */
-               ACPI_MOVE_64_TO_64(&address, &reg->address);
-               if (!reg->bit_offset && reg->bit_width &&
+               if (reg->bit_width < max_bit_width &&
+                   !reg->bit_offset && reg->bit_width &&
                    ACPI_IS_POWER_OF_TWO(reg->bit_width) &&
-                   ACPI_IS_ALIGNED(reg->bit_width, 8) &&
-                   ACPI_IS_ALIGNED(address, reg->bit_width)) {
+                   ACPI_IS_ALIGNED(reg->bit_width, 8)) {
                        return (reg->bit_width);
-               } else {
-                       if (reg->space_id == ACPI_ADR_SPACE_SYSTEM_IO) {
-                               return (32);
-                       } else {
-                               return (max_bit_width);
-                       }
                }
+               return (max_bit_width);
        } else {
                return (1 << (reg->access_width + 2));
        }
index b719ab3..ab23479 100644 (file)
@@ -1316,7 +1316,7 @@ static int __init acpi_battery_init(void)
 
 static void __exit acpi_battery_exit(void)
 {
-       async_synchronize_cookie(async_cookie);
+       async_synchronize_cookie(async_cookie + 1);
        acpi_bus_unregister_driver(&acpi_battery_driver);
 #ifdef CONFIG_ACPI_PROCFS_POWER
        acpi_unlock_battery_dir(acpi_battery_dir);
index cd2c3d6..993fd31 100644 (file)
@@ -319,6 +319,7 @@ int acpi_device_fix_up_power(struct acpi_device *device)
 
        return ret;
 }
+EXPORT_SYMBOL_GPL(acpi_device_fix_up_power);
 
 int acpi_device_update_power(struct acpi_device *device, int *state_p)
 {
index f170d74..c72e648 100644 (file)
@@ -676,6 +676,15 @@ static int acpi_processor_get_throttling_fadt(struct acpi_processor *pr)
        if (!pr->flags.throttling)
                return -ENODEV;
 
+       /*
+        * We don't care about error returns - we just try to mark
+        * these reserved so that nobody else is confused into thinking
+        * that this region might be unused..
+        *
+        * (In particular, allocating the IO range for Cardbus)
+        */
+       request_region(pr->throttling.address, 6, "ACPI CPU throttle");
+
        pr->throttling.state = 0;
 
        duty_mask = pr->throttling.state_count - 1;
index 8638d57..aafb8cc 100644 (file)
@@ -197,7 +197,7 @@ static void highbank_set_em_messages(struct device *dev,
 
        for (i = 0; i < SGPIO_PINS; i++) {
                err = of_get_named_gpio(np, "calxeda,sgpio-gpio", i);
-               if (IS_ERR_VALUE(err))
+               if (err < 0)
                        return;
 
                pdata->sgpio_gpio[i] = err;
index a969a7e..85aaf22 100644 (file)
@@ -181,13 +181,17 @@ static char *res_strings[] = {
        "reserved 27", 
        "reserved 28", 
        "reserved 29", 
-       "reserved 30", 
+       "reserved 30", /* FIXME: The strings between 30-40 might be wrong. */
        "reassembly abort: no buffers", 
        "receive buffer overflow", 
        "change in GFC", 
        "receive buffer full", 
        "low priority discard - no receive descriptor", 
        "low priority discard - missing end of packet", 
+       "reserved 37",
+       "reserved 38",
+       "reserved 39",
+       "reseverd 40",
        "reserved 41", 
        "reserved 42", 
        "reserved 43", 
index 7d00f29..809dd1e 100644 (file)
@@ -1128,7 +1128,7 @@ static int rx_pkt(struct atm_dev *dev)
        /* make the ptr point to the corresponding buffer desc entry */  
        buf_desc_ptr += desc;     
         if (!desc || (desc > iadev->num_rx_desc) || 
-                      ((buf_desc_ptr->vc_index & 0xffff) > iadev->num_vc)) { 
+                      ((buf_desc_ptr->vc_index & 0xffff) >= iadev->num_vc)) {
             free_desc(dev, desc);
             IF_ERR(printk("IA: bad descriptor desc = %d \n", desc);)
             return -1;
index c81667d..e44944f 100644 (file)
@@ -1267,14 +1267,15 @@ int dpm_suspend_late(pm_message_t state)
                error = device_suspend_late(dev);
 
                mutex_lock(&dpm_list_mtx);
+               if (!list_empty(&dev->power.entry))
+                       list_move(&dev->power.entry, &dpm_late_early_list);
+
                if (error) {
                        pm_dev_err(dev, state, " late", error);
                        dpm_save_failed_dev(dev_name(dev));
                        put_device(dev);
                        break;
                }
-               if (!list_empty(&dev->power.entry))
-                       list_move(&dev->power.entry, &dpm_late_early_list);
                put_device(dev);
 
                if (async_error)
index 04d706c..35b13a0 100644 (file)
@@ -146,7 +146,6 @@ int bcma_sflash_init(struct bcma_drv_cc *cc)
                return -ENOTSUPP;
        }
 
-       sflash->window = BCMA_SOC_FLASH2;
        sflash->blocksize = e->blocksize;
        sflash->numblocks = e->numblocks;
        sflash->size = sflash->blocksize * sflash->numblocks;
index 51a071e..c04bd9b 100644 (file)
@@ -381,7 +381,7 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,
 
 #ifdef CONFIG_BLK_DEV_RAM_DAX
 static long brd_direct_access(struct block_device *bdev, sector_t sector,
-                       void __pmem **kaddr, pfn_t *pfn)
+                       void __pmem **kaddr, pfn_t *pfn, long size)
 {
        struct brd_device *brd = bdev->bd_disk->private_data;
        struct page *page;
index 0ede6d7..81666a5 100644 (file)
@@ -350,12 +350,12 @@ struct rbd_device {
        struct rbd_spec         *spec;
        struct rbd_options      *opts;
 
-       char                    *header_name;
+       struct ceph_object_id   header_oid;
+       struct ceph_object_locator header_oloc;
 
        struct ceph_file_layout layout;
 
-       struct ceph_osd_event   *watch_event;
-       struct rbd_obj_request  *watch_request;
+       struct ceph_osd_linger_request *watch_handle;
 
        struct rbd_spec         *parent_spec;
        u64                     parent_overlap;
@@ -1596,12 +1596,6 @@ static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
        return __rbd_obj_request_wait(obj_request, 0);
 }
 
-static int rbd_obj_request_wait_timeout(struct rbd_obj_request *obj_request,
-                                       unsigned long timeout)
-{
-       return __rbd_obj_request_wait(obj_request, timeout);
-}
-
 static void rbd_img_request_complete(struct rbd_img_request *img_request)
 {
 
@@ -1751,12 +1745,6 @@ static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
                complete_all(&obj_request->completion);
 }
 
-static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
-{
-       dout("%s: obj %p\n", __func__, obj_request);
-       obj_request_done_set(obj_request);
-}
-
 static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
 {
        struct rbd_img_request *img_request = NULL;
@@ -1828,13 +1816,12 @@ static void rbd_osd_call_callback(struct rbd_obj_request *obj_request)
                obj_request_done_set(obj_request);
 }
 
-static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
-                               struct ceph_msg *msg)
+static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
 {
        struct rbd_obj_request *obj_request = osd_req->r_priv;
        u16 opcode;
 
-       dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
+       dout("%s: osd_req %p\n", __func__, osd_req);
        rbd_assert(osd_req == obj_request->osd_req);
        if (obj_request_img_data_test(obj_request)) {
                rbd_assert(obj_request->img_request);
@@ -1878,10 +1865,6 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
        case CEPH_OSD_OP_CALL:
                rbd_osd_call_callback(obj_request);
                break;
-       case CEPH_OSD_OP_NOTIFY_ACK:
-       case CEPH_OSD_OP_WATCH:
-               rbd_osd_trivial_callback(obj_request);
-               break;
        default:
                rbd_warn(NULL, "%s: unsupported op %hu",
                        obj_request->object_name, (unsigned short) opcode);
@@ -1896,27 +1879,17 @@ static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
 {
        struct rbd_img_request *img_request = obj_request->img_request;
        struct ceph_osd_request *osd_req = obj_request->osd_req;
-       u64 snap_id;
 
-       rbd_assert(osd_req != NULL);
-
-       snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
-       ceph_osdc_build_request(osd_req, obj_request->offset,
-                       NULL, snap_id, NULL);
+       if (img_request)
+               osd_req->r_snapid = img_request->snap_id;
 }
 
 static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
 {
-       struct rbd_img_request *img_request = obj_request->img_request;
        struct ceph_osd_request *osd_req = obj_request->osd_req;
-       struct ceph_snap_context *snapc;
-       struct timespec mtime = CURRENT_TIME;
 
-       rbd_assert(osd_req != NULL);
-
-       snapc = img_request ? img_request->snapc : NULL;
-       ceph_osdc_build_request(osd_req, obj_request->offset,
-                       snapc, CEPH_NOSNAP, &mtime);
+       osd_req->r_mtime = CURRENT_TIME;
+       osd_req->r_data_offset = obj_request->offset;
 }
 
 /*
@@ -1954,7 +1927,7 @@ static struct ceph_osd_request *rbd_osd_req_create(
        osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
                                          GFP_NOIO);
        if (!osd_req)
-               return NULL;    /* ENOMEM */
+               goto fail;
 
        if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
                osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
@@ -1965,9 +1938,18 @@ static struct ceph_osd_request *rbd_osd_req_create(
        osd_req->r_priv = obj_request;
 
        osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
-       ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
+       if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
+                            obj_request->object_name))
+               goto fail;
+
+       if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
+               goto fail;
 
        return osd_req;
+
+fail:
+       ceph_osdc_put_request(osd_req);
+       return NULL;
 }
 
 /*
@@ -2003,16 +1985,25 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
        osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops,
                                                false, GFP_NOIO);
        if (!osd_req)
-               return NULL;    /* ENOMEM */
+               goto fail;
 
        osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
        osd_req->r_callback = rbd_osd_req_callback;
        osd_req->r_priv = obj_request;
 
        osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
-       ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
+       if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
+                            obj_request->object_name))
+               goto fail;
+
+       if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
+               goto fail;
 
        return osd_req;
+
+fail:
+       ceph_osdc_put_request(osd_req);
+       return NULL;
 }
 
 
@@ -2973,17 +2964,20 @@ static int rbd_img_request_submit(struct rbd_img_request *img_request)
 {
        struct rbd_obj_request *obj_request;
        struct rbd_obj_request *next_obj_request;
+       int ret = 0;
 
        dout("%s: img %p\n", __func__, img_request);
-       for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
-               int ret;
 
+       rbd_img_request_get(img_request);
+       for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
                ret = rbd_img_obj_request_submit(obj_request);
                if (ret)
-                       return ret;
+                       goto out_put_ireq;
        }
 
-       return 0;
+out_put_ireq:
+       rbd_img_request_put(img_request);
+       return ret;
 }
 
 static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
@@ -3090,45 +3084,18 @@ out_err:
        obj_request_done_set(obj_request);
 }
 
-static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id)
-{
-       struct rbd_obj_request *obj_request;
-       struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
-       int ret;
-
-       obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
-                                                       OBJ_REQUEST_NODATA);
-       if (!obj_request)
-               return -ENOMEM;
-
-       ret = -ENOMEM;
-       obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
-                                                 obj_request);
-       if (!obj_request->osd_req)
-               goto out;
-
-       osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
-                                       notify_id, 0, 0);
-       rbd_osd_req_format_read(obj_request);
+static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev);
+static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev);
 
-       ret = rbd_obj_request_submit(osdc, obj_request);
-       if (ret)
-               goto out;
-       ret = rbd_obj_request_wait(obj_request);
-out:
-       rbd_obj_request_put(obj_request);
-
-       return ret;
-}
-
-static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
+static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
+                        u64 notifier_id, void *data, size_t data_len)
 {
-       struct rbd_device *rbd_dev = (struct rbd_device *)data;
+       struct rbd_device *rbd_dev = arg;
+       struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
        int ret;
 
-       dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
-               rbd_dev->header_name, (unsigned long long)notify_id,
-               (unsigned int)opcode);
+       dout("%s rbd_dev %p cookie %llu notify_id %llu\n", __func__, rbd_dev,
+            cookie, notify_id);
 
        /*
         * Until adequate refresh error handling is in place, there is
@@ -3140,63 +3107,31 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
        if (ret)
                rbd_warn(rbd_dev, "refresh failed: %d", ret);
 
-       ret = rbd_obj_notify_ack_sync(rbd_dev, notify_id);
+       ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
+                                  &rbd_dev->header_oloc, notify_id, cookie,
+                                  NULL, 0);
        if (ret)
                rbd_warn(rbd_dev, "notify_ack ret %d", ret);
 }
 
-/*
- * Send a (un)watch request and wait for the ack.  Return a request
- * with a ref held on success or error.
- */
-static struct rbd_obj_request *rbd_obj_watch_request_helper(
-                                               struct rbd_device *rbd_dev,
-                                               bool watch)
+static void rbd_watch_errcb(void *arg, u64 cookie, int err)
 {
-       struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
-       struct ceph_options *opts = osdc->client->options;
-       struct rbd_obj_request *obj_request;
+       struct rbd_device *rbd_dev = arg;
        int ret;
 
-       obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
-                                            OBJ_REQUEST_NODATA);
-       if (!obj_request)
-               return ERR_PTR(-ENOMEM);
-
-       obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_WRITE, 1,
-                                                 obj_request);
-       if (!obj_request->osd_req) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
-                             rbd_dev->watch_event->cookie, 0, watch);
-       rbd_osd_req_format_write(obj_request);
+       rbd_warn(rbd_dev, "encountered watch error: %d", err);
 
-       if (watch)
-               ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
-
-       ret = rbd_obj_request_submit(osdc, obj_request);
-       if (ret)
-               goto out;
+       __rbd_dev_header_unwatch_sync(rbd_dev);
 
-       ret = rbd_obj_request_wait_timeout(obj_request, opts->mount_timeout);
-       if (ret)
-               goto out;
-
-       ret = obj_request->result;
+       ret = rbd_dev_header_watch_sync(rbd_dev);
        if (ret) {
-               if (watch)
-                       rbd_obj_request_end(obj_request);
-               goto out;
+               rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
+               return;
        }
 
-       return obj_request;
-
-out:
-       rbd_obj_request_put(obj_request);
-       return ERR_PTR(ret);
+       ret = rbd_dev_refresh(rbd_dev);
+       if (ret)
+               rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
 }
 
 /*
@@ -3205,35 +3140,33 @@ out:
 static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
 {
        struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
-       struct rbd_obj_request *obj_request;
-       int ret;
+       struct ceph_osd_linger_request *handle;
 
-       rbd_assert(!rbd_dev->watch_event);
-       rbd_assert(!rbd_dev->watch_request);
+       rbd_assert(!rbd_dev->watch_handle);
 
-       ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
-                                    &rbd_dev->watch_event);
-       if (ret < 0)
-               return ret;
+       handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
+                                &rbd_dev->header_oloc, rbd_watch_cb,
+                                rbd_watch_errcb, rbd_dev);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
 
-       obj_request = rbd_obj_watch_request_helper(rbd_dev, true);
-       if (IS_ERR(obj_request)) {
-               ceph_osdc_cancel_event(rbd_dev->watch_event);
-               rbd_dev->watch_event = NULL;
-               return PTR_ERR(obj_request);
-       }
+       rbd_dev->watch_handle = handle;
+       return 0;
+}
 
-       /*
-        * A watch request is set to linger, so the underlying osd
-        * request won't go away until we unregister it.  We retain
-        * a pointer to the object request during that time (in
-        * rbd_dev->watch_request), so we'll keep a reference to it.
-        * We'll drop that reference after we've unregistered it in
-        * rbd_dev_header_unwatch_sync().
-        */
-       rbd_dev->watch_request = obj_request;
+static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
+{
+       struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+       int ret;
 
-       return 0;
+       if (!rbd_dev->watch_handle)
+               return;
+
+       ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
+       if (ret)
+               rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
+
+       rbd_dev->watch_handle = NULL;
 }
 
 /*
@@ -3241,24 +3174,7 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
  */
 static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
 {
-       struct rbd_obj_request *obj_request;
-
-       rbd_assert(rbd_dev->watch_event);
-       rbd_assert(rbd_dev->watch_request);
-
-       rbd_obj_request_end(rbd_dev->watch_request);
-       rbd_obj_request_put(rbd_dev->watch_request);
-       rbd_dev->watch_request = NULL;
-
-       obj_request = rbd_obj_watch_request_helper(rbd_dev, false);
-       if (!IS_ERR(obj_request))
-               rbd_obj_request_put(obj_request);
-       else
-               rbd_warn(rbd_dev, "unable to tear down watch request (%ld)",
-                        PTR_ERR(obj_request));
-
-       ceph_osdc_cancel_event(rbd_dev->watch_event);
-       rbd_dev->watch_event = NULL;
+       __rbd_dev_header_unwatch_sync(rbd_dev);
 
        dout("%s flushing notifies\n", __func__);
        ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
@@ -3591,7 +3507,7 @@ static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
                if (!ondisk)
                        return -ENOMEM;
 
-               ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
+               ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_oid.name,
                                       0, size, ondisk);
                if (ret < 0)
                        goto out;
@@ -4033,6 +3949,8 @@ static void rbd_dev_release(struct device *dev)
        struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
        bool need_put = !!rbd_dev->opts;
 
+       ceph_oid_destroy(&rbd_dev->header_oid);
+
        rbd_put_client(rbd_dev->rbd_client);
        rbd_spec_put(rbd_dev->spec);
        kfree(rbd_dev->opts);
@@ -4063,6 +3981,9 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
        INIT_LIST_HEAD(&rbd_dev->node);
        init_rwsem(&rbd_dev->header_rwsem);
 
+       ceph_oid_init(&rbd_dev->header_oid);
+       ceph_oloc_init(&rbd_dev->header_oloc);
+
        rbd_dev->dev.bus = &rbd_bus_type;
        rbd_dev->dev.type = &rbd_device_type;
        rbd_dev->dev.parent = &rbd_root_dev;
@@ -4111,7 +4032,7 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
                __le64 size;
        } __attribute__ ((packed)) size_buf = { 0 };
 
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
                                "rbd", "get_size",
                                &snapid, sizeof (snapid),
                                &size_buf, sizeof (size_buf));
@@ -4151,7 +4072,7 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
        if (!reply_buf)
                return -ENOMEM;
 
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
                                "rbd", "get_object_prefix", NULL, 0,
                                reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
@@ -4186,7 +4107,7 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
        u64 unsup;
        int ret;
 
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
                                "rbd", "get_features",
                                &snapid, sizeof (snapid),
                                &features_buf, sizeof (features_buf));
@@ -4248,7 +4169,7 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
        }
 
        snapid = cpu_to_le64(rbd_dev->spec->snap_id);
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
                                "rbd", "get_parent",
                                &snapid, sizeof (snapid),
                                reply_buf, size);
@@ -4351,7 +4272,7 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
        u64 stripe_count;
        int ret;
 
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
                                "rbd", "get_stripe_unit_count", NULL, 0,
                                (char *)&striping_info_buf, size);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
@@ -4599,7 +4520,7 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
        if (!reply_buf)
                return -ENOMEM;
 
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
                                "rbd", "get_snapcontext", NULL, 0,
                                reply_buf, size);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
@@ -4664,7 +4585,7 @@ static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
                return ERR_PTR(-ENOMEM);
 
        snapid = cpu_to_le64(snap_id);
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
                                "rbd", "get_snapshot_name",
                                &snapid, sizeof (snapid),
                                reply_buf, size);
@@ -4975,13 +4896,13 @@ static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
 again:
        ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
        if (ret == -ENOENT && tries++ < 1) {
-               ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap",
-                                              &newest_epoch);
+               ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap",
+                                           &newest_epoch);
                if (ret < 0)
                        return ret;
 
                if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
-                       ceph_monc_request_next_osdmap(&rbdc->client->monc);
+                       ceph_osdc_maybe_request_map(&rbdc->client->osdc);
                        (void) ceph_monc_wait_osdmap(&rbdc->client->monc,
                                                     newest_epoch,
                                                     opts->mount_timeout);
@@ -5260,35 +5181,26 @@ err_out_unlock:
 static int rbd_dev_header_name(struct rbd_device *rbd_dev)
 {
        struct rbd_spec *spec = rbd_dev->spec;
-       size_t size;
+       int ret;
 
        /* Record the header object name for this rbd image. */
 
        rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 
+       rbd_dev->header_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
        if (rbd_dev->image_format == 1)
-               size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
+               ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
+                                      spec->image_name, RBD_SUFFIX);
        else
-               size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
-
-       rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
-       if (!rbd_dev->header_name)
-               return -ENOMEM;
+               ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
+                                      RBD_HEADER_PREFIX, spec->image_id);
 
-       if (rbd_dev->image_format == 1)
-               sprintf(rbd_dev->header_name, "%s%s",
-                       spec->image_name, RBD_SUFFIX);
-       else
-               sprintf(rbd_dev->header_name, "%s%s",
-                       RBD_HEADER_PREFIX, spec->image_id);
-       return 0;
+       return ret;
 }
 
 static void rbd_dev_image_release(struct rbd_device *rbd_dev)
 {
        rbd_dev_unprobe(rbd_dev);
-       kfree(rbd_dev->header_name);
-       rbd_dev->header_name = NULL;
        rbd_dev->image_format = 0;
        kfree(rbd_dev->spec->image_id);
        rbd_dev->spec->image_id = NULL;
@@ -5327,7 +5239,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
                                pr_info("image %s/%s does not exist\n",
                                        rbd_dev->spec->pool_name,
                                        rbd_dev->spec->image_name);
-                       goto out_header_name;
+                       goto err_out_format;
                }
        }
 
@@ -5373,7 +5285,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
                goto err_out_probe;
 
        dout("discovered format %u image, header name is %s\n",
-               rbd_dev->image_format, rbd_dev->header_name);
+               rbd_dev->image_format, rbd_dev->header_oid.name);
        return 0;
 
 err_out_probe:
@@ -5381,9 +5293,6 @@ err_out_probe:
 err_out_watch:
        if (!depth)
                rbd_dev_header_unwatch_sync(rbd_dev);
-out_header_name:
-       kfree(rbd_dev->header_name);
-       rbd_dev->header_name = NULL;
 err_out_format:
        rbd_dev->image_format = 0;
        kfree(rbd_dev->spec->image_id);
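Taken together, the rbd hunks above replace the kmalloc'd rbd_dev->header_name string with a ceph_oid that is initialized when the device is created, formatted once the image format is known, and destroyed on release. A minimal sketch of that lifecycle, assembled only from calls visible in this diff (illustrative, not part of the patch; format-2 branch shown):

    /* sketch: header object id lifecycle after this series */
    static int example_header_oid_setup(struct rbd_device *rbd_dev,
                                        struct rbd_spec *spec)
    {
            ceph_oid_init(&rbd_dev->header_oid);            /* done in rbd_dev_create() */

            return ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
                                    RBD_HEADER_PREFIX, spec->image_id);
    }

    static void example_header_oid_teardown(struct rbd_device *rbd_dev)
    {
            ceph_oid_destroy(&rbd_dev->header_oid);         /* done in rbd_dev_release() */
    }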
index 8830458..1630a1f 100644 (file)
@@ -59,6 +59,7 @@ static int clk_pwm_probe(struct platform_device *pdev)
        struct clk_init_data init;
        struct clk_pwm *clk_pwm;
        struct pwm_device *pwm;
+       struct pwm_args pargs;
        const char *clk_name;
        struct clk *clk;
        int ret;
@@ -71,22 +72,28 @@ static int clk_pwm_probe(struct platform_device *pdev)
        if (IS_ERR(pwm))
                return PTR_ERR(pwm);
 
-       if (!pwm->period) {
+       pwm_get_args(pwm, &pargs);
+       if (!pargs.period) {
                dev_err(&pdev->dev, "invalid PWM period\n");
                return -EINVAL;
        }
 
        if (of_property_read_u32(node, "clock-frequency", &clk_pwm->fixed_rate))
-               clk_pwm->fixed_rate = NSEC_PER_SEC / pwm->period;
+               clk_pwm->fixed_rate = NSEC_PER_SEC / pargs.period;
 
-       if (pwm->period != NSEC_PER_SEC / clk_pwm->fixed_rate &&
-           pwm->period != DIV_ROUND_UP(NSEC_PER_SEC, clk_pwm->fixed_rate)) {
+       if (pargs.period != NSEC_PER_SEC / clk_pwm->fixed_rate &&
+           pargs.period != DIV_ROUND_UP(NSEC_PER_SEC, clk_pwm->fixed_rate)) {
                dev_err(&pdev->dev,
                        "clock-frequency does not match PWM period\n");
                return -EINVAL;
        }
 
-       ret = pwm_config(pwm, (pwm->period + 1) >> 1, pwm->period);
+       /*
+        * FIXME: pwm_apply_args() should be removed when switching to the
+        * atomic PWM API.
+        */
+       pwm_apply_args(pwm);
+       ret = pwm_config(pwm, (pargs.period + 1) >> 1, pargs.period);
        if (ret < 0)
                return ret;
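The hunk above moves clk-pwm from reading pwm->period directly to the pwm_args reference values plus the legacy pwm_config() call; the FIXME notes that pwm_apply_args() is only a stopgap until the driver is converted to the atomic PWM API. A condensed sketch of the pattern the patch establishes (names taken from the hunk; error handling trimmed):

    /* sketch only: configure a PWM to ~50% duty using its reference args */
    static int example_clk_pwm_setup(struct pwm_device *pwm)
    {
            struct pwm_args pargs;

            pwm_get_args(pwm, &pargs);      /* reference period/polarity from DT or lookup */
            if (!pargs.period)
                    return -EINVAL;

            /* per the FIXME: drop pwm_apply_args() once on the atomic API */
            pwm_apply_args(pwm);
            return pwm_config(pwm, (pargs.period + 1) >> 1, pargs.period);
    }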
 
index b855181..456cf58 100644 (file)
@@ -1221,7 +1221,7 @@ static int tegra210_pll_fixed_mdiv_cfg(struct clk_hw *hw,
                p = rate >= params->vco_min ? 1 : -EINVAL;
        }
 
-       if (IS_ERR_VALUE(p))
+       if (p < 0)
                return -EINVAL;
 
        cfg->m = tegra_pll_get_fixed_mdiv(hw, input_rate);
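This hunk, and several later ones (omap-cpufreq, caam, sun4i-dma, gpio-xlp), replace IS_ERR_VALUE() on plain int error codes with a direct negative check. IS_ERR_VALUE() is written for unsigned long values in the top error range, so feeding it an int relies on implicit conversions; the explicit comparison is clearer and avoids that trap. Illustrative only (the helper name is hypothetical):

    int ret = some_helper_returning_errno();        /* hypothetical helper */

    if (ret < 0)                    /* was: if (IS_ERR_VALUE(ret)) */
            return ret;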
index 035513b..9009295 100644 (file)
@@ -78,9 +78,14 @@ static int cpufreq_governor(struct cpufreq_policy *policy, unsigned int event);
 static unsigned int __cpufreq_get(struct cpufreq_policy *policy);
 static int cpufreq_start_governor(struct cpufreq_policy *policy);
 
-static inline int cpufreq_exit_governor(struct cpufreq_policy *policy)
+static inline void cpufreq_exit_governor(struct cpufreq_policy *policy)
 {
-       return cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT);
+       (void)cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT);
+}
+
+static inline void cpufreq_stop_governor(struct cpufreq_policy *policy)
+{
+       (void)cpufreq_governor(policy, CPUFREQ_GOV_STOP);
 }
 
 /**
@@ -1026,13 +1031,8 @@ static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy, unsigned int cp
                return 0;
 
        down_write(&policy->rwsem);
-       if (has_target()) {
-               ret = cpufreq_governor(policy, CPUFREQ_GOV_STOP);
-               if (ret) {
-                       pr_err("%s: Failed to stop governor\n", __func__);
-                       goto unlock;
-               }
-       }
+       if (has_target())
+               cpufreq_stop_governor(policy);
 
        cpumask_set_cpu(cpu, policy->cpus);
 
@@ -1041,8 +1041,6 @@ static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy, unsigned int cp
                if (ret)
                        pr_err("%s: Failed to start governor\n", __func__);
        }
-
-unlock:
        up_write(&policy->rwsem);
        return ret;
 }
@@ -1354,11 +1352,8 @@ static void cpufreq_offline(unsigned int cpu)
        }
 
        down_write(&policy->rwsem);
-       if (has_target()) {
-               ret = cpufreq_governor(policy, CPUFREQ_GOV_STOP);
-               if (ret)
-                       pr_err("%s: Failed to stop governor\n", __func__);
-       }
+       if (has_target())
+               cpufreq_stop_governor(policy);
 
        cpumask_clear_cpu(cpu, policy->cpus);
 
@@ -1387,12 +1382,8 @@ static void cpufreq_offline(unsigned int cpu)
        if (cpufreq_driver->stop_cpu)
                cpufreq_driver->stop_cpu(policy);
 
-       /* If cpu is last user of policy, free policy */
-       if (has_target()) {
-               ret = cpufreq_exit_governor(policy);
-               if (ret)
-                       pr_err("%s: Failed to exit governor\n", __func__);
-       }
+       if (has_target())
+               cpufreq_exit_governor(policy);
 
        /*
         * Perform the ->exit() even during light-weight tear-down,
@@ -1626,7 +1617,6 @@ EXPORT_SYMBOL(cpufreq_generic_suspend);
 void cpufreq_suspend(void)
 {
        struct cpufreq_policy *policy;
-       int ret;
 
        if (!cpufreq_driver)
                return;
@@ -1639,14 +1629,8 @@ void cpufreq_suspend(void)
        for_each_active_policy(policy) {
                if (has_target()) {
                        down_write(&policy->rwsem);
-                       ret = cpufreq_governor(policy, CPUFREQ_GOV_STOP);
+                       cpufreq_stop_governor(policy);
                        up_write(&policy->rwsem);
-
-                       if (ret) {
-                               pr_err("%s: Failed to stop governor for policy: %p\n",
-                                       __func__, policy);
-                               continue;
-                       }
                }
 
                if (cpufreq_driver->suspend && cpufreq_driver->suspend(policy))
@@ -1848,7 +1832,7 @@ EXPORT_SYMBOL(cpufreq_unregister_notifier);
 unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy,
                                        unsigned int target_freq)
 {
-       clamp_val(target_freq, policy->min, policy->max);
+       target_freq = clamp_val(target_freq, policy->min, policy->max);
 
        return cpufreq_driver->fast_switch(policy, target_freq);
 }
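The one-line fix above matters because clamp_val() returns the clamped value rather than modifying its argument in place; without the assignment the clamp was a no-op. For clarity (plain kernel macro semantics):

    unsigned int min = 800000, max = 2400000;
    unsigned int target_freq = 3000000;                     /* above max */

    target_freq = clamp_val(target_freq, min, max);         /* now 2400000 */
    /* the previous code discarded the return value, so target_freq stayed 3000000 */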
@@ -2049,16 +2033,15 @@ static int cpufreq_governor(struct cpufreq_policy *policy, unsigned int event)
 
        ret = policy->governor->governor(policy, event);
 
-       if (!ret) {
-               if (event == CPUFREQ_GOV_POLICY_INIT)
+       if (event == CPUFREQ_GOV_POLICY_INIT) {
+               if (ret)
+                       module_put(policy->governor->owner);
+               else
                        policy->governor->initialized++;
-               else if (event == CPUFREQ_GOV_POLICY_EXIT)
-                       policy->governor->initialized--;
-       }
-
-       if (((event == CPUFREQ_GOV_POLICY_INIT) && ret) ||
-                       ((event == CPUFREQ_GOV_POLICY_EXIT) && !ret))
+       } else if (event == CPUFREQ_GOV_POLICY_EXIT) {
+               policy->governor->initialized--;
                module_put(policy->governor->owner);
+       }
 
        return ret;
 }
@@ -2221,20 +2204,8 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
        old_gov = policy->governor;
        /* end old governor */
        if (old_gov) {
-               ret = cpufreq_governor(policy, CPUFREQ_GOV_STOP);
-               if (ret) {
-                       /* This can happen due to race with other operations */
-                       pr_debug("%s: Failed to Stop Governor: %s (%d)\n",
-                                __func__, old_gov->name, ret);
-                       return ret;
-               }
-
-               ret = cpufreq_exit_governor(policy);
-               if (ret) {
-                       pr_err("%s: Failed to Exit Governor: %s (%d)\n",
-                              __func__, old_gov->name, ret);
-                       return ret;
-               }
+               cpufreq_stop_governor(policy);
+               cpufreq_exit_governor(policy);
        }
 
        /* start new governor */
@@ -2495,10 +2466,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
 
        register_hotcpu_notifier(&cpufreq_cpu_notifier);
        pr_debug("driver %s up and running\n", driver_data->name);
-
-out:
-       put_online_cpus();
-       return ret;
+       goto out;
 
 err_if_unreg:
        subsys_interface_unregister(&cpufreq_interface);
@@ -2508,7 +2476,9 @@ err_null_driver:
        write_lock_irqsave(&cpufreq_driver_lock, flags);
        cpufreq_driver = NULL;
        write_unlock_irqrestore(&cpufreq_driver_lock, flags);
-       goto out;
+out:
+       put_online_cpus();
+       return ret;
 }
 EXPORT_SYMBOL_GPL(cpufreq_register_driver);
 
index b76a98d..0d159b5 100644 (file)
@@ -449,7 +449,7 @@ static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
                cpu->acpi_perf_data.states[0].core_frequency =
                                        policy->cpuinfo.max_freq / 1000;
        cpu->valid_pss_table = true;
-       pr_info("_PPC limits will be enforced\n");
+       pr_debug("_PPC limits will be enforced\n");
 
        return;
 
@@ -1461,12 +1461,11 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy)
        intel_pstate_clear_update_util_hook(policy->cpu);
 
        cpu = all_cpu_data[0];
-       if (cpu->pstate.max_pstate_physical > cpu->pstate.max_pstate) {
-               if (policy->max < policy->cpuinfo.max_freq &&
-                   policy->max > cpu->pstate.max_pstate * cpu->pstate.scaling) {
-                       pr_debug("policy->max > max non turbo frequency\n");
-                       policy->max = policy->cpuinfo.max_freq;
-               }
+       if (cpu->pstate.max_pstate_physical > cpu->pstate.max_pstate &&
+           policy->max < policy->cpuinfo.max_freq &&
+           policy->max > cpu->pstate.max_pstate * cpu->pstate.scaling) {
+               pr_debug("policy->max > max non turbo frequency\n");
+               policy->max = policy->cpuinfo.max_freq;
        }
 
        if (policy->policy == CPUFREQ_POLICY_PERFORMANCE) {
index 6f602c7..643f431 100644 (file)
@@ -307,17 +307,24 @@ static int mtk_cpufreq_set_target(struct cpufreq_policy *policy,
        return 0;
 }
 
+#define DYNAMIC_POWER "dynamic-power-coefficient"
+
 static void mtk_cpufreq_ready(struct cpufreq_policy *policy)
 {
        struct mtk_cpu_dvfs_info *info = policy->driver_data;
        struct device_node *np = of_node_get(info->cpu_dev->of_node);
+       u32 capacitance = 0;
 
        if (WARN_ON(!np))
                return;
 
        if (of_find_property(np, "#cooling-cells", NULL)) {
-               info->cdev = of_cpufreq_cooling_register(np,
-                                                        policy->related_cpus);
+               of_property_read_u32(np, DYNAMIC_POWER, &capacitance);
+
+               info->cdev = of_cpufreq_power_cooling_register(np,
+                                               policy->related_cpus,
+                                               capacitance,
+                                               NULL);
 
                if (IS_ERR(info->cdev)) {
                        dev_err(info->cpu_dev,
index cead9be..376e63c 100644 (file)
@@ -54,7 +54,7 @@ static int omap_target(struct cpufreq_policy *policy, unsigned int index)
 
        freq = new_freq * 1000;
        ret = clk_round_rate(policy->clk, freq);
-       if (IS_ERR_VALUE(ret)) {
+       if (ret < 0) {
                dev_warn(mpu_dev,
                         "CPUfreq: Cannot find matching frequency for %lu\n",
                         freq);
index 2b8e6ce..a4d0059 100644 (file)
@@ -214,7 +214,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
                tick_broadcast_exit();
        }
 
-       if (!cpuidle_state_is_coupled(drv, entered_state))
+       if (!cpuidle_state_is_coupled(drv, index))
                local_irq_enable();
 
        /*
index 44d30b4..5ad5f30 100644 (file)
@@ -402,7 +402,7 @@ int caam_get_era(void)
        ret = of_property_read_u32(caam_node, "fsl,sec-era", &prop);
        of_node_put(caam_node);
 
-       return IS_ERR_VALUE(ret) ? -ENOTSUPP : prop;
+       return ret ? -ENOTSUPP : prop;
 }
 EXPORT_SYMBOL(caam_get_era);
 
index 52c7395..0d0d452 100644 (file)
@@ -122,6 +122,7 @@ static int ccp_aes_xts_crypt(struct ablkcipher_request *req,
        struct ccp_ctx *ctx = crypto_tfm_ctx(req->base.tfm);
        struct ccp_aes_req_ctx *rctx = ablkcipher_request_ctx(req);
        unsigned int unit;
+       u32 unit_size;
        int ret;
 
        if (!ctx->u.aes.key_len)
@@ -133,11 +134,17 @@ static int ccp_aes_xts_crypt(struct ablkcipher_request *req,
        if (!req->info)
                return -EINVAL;
 
-       for (unit = 0; unit < ARRAY_SIZE(unit_size_map); unit++)
-               if (!(req->nbytes & (unit_size_map[unit].size - 1)))
-                       break;
+       unit_size = CCP_XTS_AES_UNIT_SIZE__LAST;
+       if (req->nbytes <= unit_size_map[0].size) {
+               for (unit = 0; unit < ARRAY_SIZE(unit_size_map); unit++) {
+                       if (!(req->nbytes & (unit_size_map[unit].size - 1))) {
+                               unit_size = unit_size_map[unit].value;
+                               break;
+                       }
+               }
+       }
 
-       if ((unit_size_map[unit].value == CCP_XTS_AES_UNIT_SIZE__LAST) ||
+       if ((unit_size == CCP_XTS_AES_UNIT_SIZE__LAST) ||
            (ctx->u.aes.key_len != AES_KEYSIZE_128)) {
                /* Use the fallback to process the request for any
                 * unsupported unit sizes or key sizes
@@ -158,7 +165,7 @@ static int ccp_aes_xts_crypt(struct ablkcipher_request *req,
        rctx->cmd.engine = CCP_ENGINE_XTS_AES_128;
        rctx->cmd.u.xts.action = (encrypt) ? CCP_AES_ACTION_ENCRYPT
                                           : CCP_AES_ACTION_DECRYPT;
-       rctx->cmd.u.xts.unit_size = unit_size_map[unit].value;
+       rctx->cmd.u.xts.unit_size = unit_size;
        rctx->cmd.u.xts.key = &ctx->u.aes.key_sg;
        rctx->cmd.u.xts.key_len = ctx->u.aes.key_len;
        rctx->cmd.u.xts.iv = &rctx->iv_sg;
index 6eefaa2..63464e8 100644 (file)
@@ -1986,7 +1986,7 @@ err_algs:
                                        &dd->pdata->algs_info[i].algs_list[j]);
 err_pm:
        pm_runtime_disable(dev);
-       if (dd->polling_mode)
+       if (!dd->polling_mode)
                dma_release_channel(dd->dma_lch);
 data_err:
        dev_err(dev, "initialization failed.\n");
index 4a2c07e..6355ab3 100644 (file)
@@ -33,6 +33,7 @@
 #include <linux/seq_file.h>
 #include <linux/poll.h>
 #include <linux/reservation.h>
+#include <linux/mm.h>
 
 #include <uapi/linux/dma-buf.h>
 
@@ -90,7 +91,7 @@ static int dma_buf_mmap_internal(struct file *file, struct vm_area_struct *vma)
        dmabuf = file->private_data;
 
        /* check for overflowing the buffer's size */
-       if (vma->vm_pgoff + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) >
+       if (vma->vm_pgoff + vma_pages(vma) >
            dmabuf->size >> PAGE_SHIFT)
                return -EINVAL;
 
@@ -723,11 +724,11 @@ int dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma,
                return -EINVAL;
 
        /* check for offset overflow */
-       if (pgoff + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) < pgoff)
+       if (pgoff + vma_pages(vma) < pgoff)
                return -EOVERFLOW;
 
        /* check for overflowing the buffer's size */
-       if (pgoff + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) >
+       if (pgoff + vma_pages(vma) >
            dmabuf->size >> PAGE_SHIFT)
                return -EINVAL;
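Both checks above swap the open-coded page count for the vma_pages() helper (hence the new <linux/mm.h> include); behaviour is unchanged, the helper simply names the computation:

    /* vma_pages() is defined as the same expression in <linux/mm.h> */
    unsigned long npages = vma_pages(vma);
    /* equivalent to: (vma->vm_end - vma->vm_start) >> PAGE_SHIFT */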
 
index c0bd572..9566a62 100644 (file)
 #include <linux/reservation.h>
 #include <linux/export.h>
 
+/**
+ * DOC: Reservation Object Overview
+ *
+ * The reservation object provides a mechanism to manage shared and
+ * exclusive fences associated with a buffer.  A reservation object
+ * can have one exclusive fence attached (normally associated with
+ * write operations) or N shared fences (read operations).  The RCU
+ * mechanism is used to protect read access to fences from locked
+ * write-side updates.
+ */
+
 DEFINE_WW_CLASS(reservation_ww_class);
 EXPORT_SYMBOL(reservation_ww_class);
 
@@ -43,9 +54,17 @@ EXPORT_SYMBOL(reservation_seqcount_class);
 
 const char reservation_seqcount_string[] = "reservation_seqcount";
 EXPORT_SYMBOL(reservation_seqcount_string);
-/*
- * Reserve space to add a shared fence to a reservation_object,
- * must be called with obj->lock held.
+
+/**
+ * reservation_object_reserve_shared - Reserve space to add a shared
+ * fence to a reservation_object.
+ * @obj: reservation object
+ *
+ * Should be called before reservation_object_add_shared_fence().  Must
+ * be called with obj->lock held.
+ *
+ * RETURNS
+ * Zero for success, or -errno
  */
 int reservation_object_reserve_shared(struct reservation_object *obj)
 {
@@ -180,7 +199,11 @@ done:
                fence_put(old_fence);
 }
 
-/*
+/**
+ * reservation_object_add_shared_fence - Add a fence to a shared slot
+ * @obj: the reservation object
+ * @fence: the shared fence to add
+ *
  * Add a fence to a shared slot, obj->lock must be held, and
  * reservation_object_reserve_shared_fence has been called.
  */
@@ -200,6 +223,13 @@ void reservation_object_add_shared_fence(struct reservation_object *obj,
 }
 EXPORT_SYMBOL(reservation_object_add_shared_fence);
 
+/**
+ * reservation_object_add_excl_fence - Add an exclusive fence.
+ * @obj: the reservation object
+ * @fence: the shared fence to add
+ *
+ * Add a fence to the exclusive slot.  The obj->lock must be held.
+ */
 void reservation_object_add_excl_fence(struct reservation_object *obj,
                                       struct fence *fence)
 {
@@ -233,6 +263,18 @@ void reservation_object_add_excl_fence(struct reservation_object *obj,
 }
 EXPORT_SYMBOL(reservation_object_add_excl_fence);
 
+/**
+ * reservation_object_get_fences_rcu - Get an object's shared and exclusive
+ * fences without update side lock held
+ * @obj: the reservation object
+ * @pfence_excl: the returned exclusive fence (or NULL)
+ * @pshared_count: the number of shared fences returned
+ * @pshared: the array of shared fence ptrs returned (array is krealloc'd to
+ * the required size, and must be freed by caller)
+ *
+ * RETURNS
+ * Zero or -errno
+ */
 int reservation_object_get_fences_rcu(struct reservation_object *obj,
                                      struct fence **pfence_excl,
                                      unsigned *pshared_count,
@@ -319,6 +361,18 @@ unlock:
 }
 EXPORT_SYMBOL_GPL(reservation_object_get_fences_rcu);
 
+/**
+ * reservation_object_wait_timeout_rcu - Wait on reservation's objects
+ * shared and/or exclusive fences.
+ * @obj: the reservation object
+ * @wait_all: if true, wait on all fences, else wait on just exclusive fence
+ * @intr: if true, do interruptible wait
+ * @timeout: timeout value in jiffies or zero to return immediately
+ *
+ * RETURNS
+ * Returns -ERESTARTSYS if interrupted, 0 if the wait timed out, or a
+ * value greater than zero on success.
+ */
 long reservation_object_wait_timeout_rcu(struct reservation_object *obj,
                                         bool wait_all, bool intr,
                                         unsigned long timeout)
@@ -416,6 +470,16 @@ reservation_object_test_signaled_single(struct fence *passed_fence)
        return ret;
 }
 
+/**
+ * reservation_object_test_signaled_rcu - Test if a reservation object's
+ * fences have been signaled.
+ * @obj: the reservation object
+ * @test_all: if true, test all fences, otherwise only test the exclusive
+ * fence
+ *
+ * RETURNS
+ * true if all fences signaled, else false
+ */
 bool reservation_object_test_signaled_rcu(struct reservation_object *obj,
                                          bool test_all)
 {
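The kernel-doc added throughout this file spells out the write-side contract: reserve a shared slot first, then add the fence, all with obj->lock held. A condensed usage sketch of that contract (illustrative only; the caller is assumed to already hold the reservation object's ww_mutex):

    /* sketch: publishing a read (shared) fence on a reservation object */
    static int example_publish_shared_fence(struct reservation_object *obj,
                                            struct fence *fence)
    {
            int ret;

            ret = reservation_object_reserve_shared(obj);   /* make room for one slot */
            if (ret)
                    return ret;

            reservation_object_add_shared_fence(obj, fence);
            return 0;
    }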
index e0df233..57aa227 100644 (file)
@@ -461,25 +461,25 @@ generate_ndma_promise(struct dma_chan *chan, dma_addr_t src, dma_addr_t dest,
 
        /* Source burst */
        ret = convert_burst(sconfig->src_maxburst);
-       if (IS_ERR_VALUE(ret))
+       if (ret < 0)
                goto fail;
        promise->cfg |= SUN4I_DMA_CFG_SRC_BURST_LENGTH(ret);
 
        /* Destination burst */
        ret = convert_burst(sconfig->dst_maxburst);
-       if (IS_ERR_VALUE(ret))
+       if (ret < 0)
                goto fail;
        promise->cfg |= SUN4I_DMA_CFG_DST_BURST_LENGTH(ret);
 
        /* Source bus width */
        ret = convert_buswidth(sconfig->src_addr_width);
-       if (IS_ERR_VALUE(ret))
+       if (ret < 0)
                goto fail;
        promise->cfg |= SUN4I_DMA_CFG_SRC_DATA_WIDTH(ret);
 
        /* Destination bus width */
        ret = convert_buswidth(sconfig->dst_addr_width);
-       if (IS_ERR_VALUE(ret))
+       if (ret < 0)
                goto fail;
        promise->cfg |= SUN4I_DMA_CFG_DST_DATA_WIDTH(ret);
 
@@ -518,25 +518,25 @@ generate_ddma_promise(struct dma_chan *chan, dma_addr_t src, dma_addr_t dest,
 
        /* Source burst */
        ret = convert_burst(sconfig->src_maxburst);
-       if (IS_ERR_VALUE(ret))
+       if (ret < 0)
                goto fail;
        promise->cfg |= SUN4I_DMA_CFG_SRC_BURST_LENGTH(ret);
 
        /* Destination burst */
        ret = convert_burst(sconfig->dst_maxburst);
-       if (IS_ERR_VALUE(ret))
+       if (ret < 0)
                goto fail;
        promise->cfg |= SUN4I_DMA_CFG_DST_BURST_LENGTH(ret);
 
        /* Source bus width */
        ret = convert_buswidth(sconfig->src_addr_width);
-       if (IS_ERR_VALUE(ret))
+       if (ret < 0)
                goto fail;
        promise->cfg |= SUN4I_DMA_CFG_SRC_DATA_WIDTH(ret);
 
        /* Destination bus width */
        ret = convert_buswidth(sconfig->dst_addr_width);
-       if (IS_ERR_VALUE(ret))
+       if (ret < 0)
                goto fail;
        promise->cfg |= SUN4I_DMA_CFG_DST_DATA_WIDTH(ret);
 
index d39014d..fc5f197 100644 (file)
@@ -29,7 +29,6 @@
 
 #include <mach/hardware.h>
 #include <mach/platform.h>
-#include <mach/irqs.h>
 
 #define LPC32XX_GPIO_P3_INP_STATE              _GPREG(0x000)
 #define LPC32XX_GPIO_P3_OUTP_SET               _GPREG(0x004)
@@ -371,61 +370,16 @@ static int lpc32xx_gpio_request(struct gpio_chip *chip, unsigned pin)
 
 static int lpc32xx_gpio_to_irq_p01(struct gpio_chip *chip, unsigned offset)
 {
-       return IRQ_LPC32XX_P0_P1_IRQ;
+       return -ENXIO;
 }
 
-static const char lpc32xx_gpio_to_irq_gpio_p3_table[] = {
-       IRQ_LPC32XX_GPIO_00,
-       IRQ_LPC32XX_GPIO_01,
-       IRQ_LPC32XX_GPIO_02,
-       IRQ_LPC32XX_GPIO_03,
-       IRQ_LPC32XX_GPIO_04,
-       IRQ_LPC32XX_GPIO_05,
-};
-
 static int lpc32xx_gpio_to_irq_gpio_p3(struct gpio_chip *chip, unsigned offset)
 {
-       if (offset < ARRAY_SIZE(lpc32xx_gpio_to_irq_gpio_p3_table))
-               return lpc32xx_gpio_to_irq_gpio_p3_table[offset];
        return -ENXIO;
 }
 
-static const char lpc32xx_gpio_to_irq_gpi_p3_table[] = {
-       IRQ_LPC32XX_GPI_00,
-       IRQ_LPC32XX_GPI_01,
-       IRQ_LPC32XX_GPI_02,
-       IRQ_LPC32XX_GPI_03,
-       IRQ_LPC32XX_GPI_04,
-       IRQ_LPC32XX_GPI_05,
-       IRQ_LPC32XX_GPI_06,
-       IRQ_LPC32XX_GPI_07,
-       IRQ_LPC32XX_GPI_08,
-       IRQ_LPC32XX_GPI_09,
-       -ENXIO, /* 10 */
-       -ENXIO, /* 11 */
-       -ENXIO, /* 12 */
-       -ENXIO, /* 13 */
-       -ENXIO, /* 14 */
-       -ENXIO, /* 15 */
-       -ENXIO, /* 16 */
-       -ENXIO, /* 17 */
-       -ENXIO, /* 18 */
-       IRQ_LPC32XX_GPI_19,
-       -ENXIO, /* 20 */
-       -ENXIO, /* 21 */
-       -ENXIO, /* 22 */
-       -ENXIO, /* 23 */
-       -ENXIO, /* 24 */
-       -ENXIO, /* 25 */
-       -ENXIO, /* 26 */
-       -ENXIO, /* 27 */
-       IRQ_LPC32XX_GPI_28,
-};
-
 static int lpc32xx_gpio_to_irq_gpi_p3(struct gpio_chip *chip, unsigned offset)
 {
-       if (offset < ARRAY_SIZE(lpc32xx_gpio_to_irq_gpi_p3_table))
-               return lpc32xx_gpio_to_irq_gpi_p3_table[offset];
        return -ENXIO;
 }
 
index 08897dc..1a33a19 100644 (file)
@@ -393,7 +393,7 @@ static int xlp_gpio_probe(struct platform_device *pdev)
                irq_base = irq_alloc_descs(-1, 0, gc->ngpio, 0);
        else
                irq_base = irq_alloc_descs(-1, XLP_GPIO_IRQ_BASE, gc->ngpio, 0);
-       if (IS_ERR_VALUE(irq_base)) {
+       if (irq_base < 0) {
                dev_err(&pdev->dev, "Failed to allocate IRQ numbers\n");
                return irq_base;
        }
index d407f90..24f60d2 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/cdev.h>
 #include <linux/fs.h>
 #include <linux/uaccess.h>
+#include <linux/compat.h>
 #include <uapi/linux/gpio.h>
 
 #include "gpiolib.h"
@@ -316,7 +317,7 @@ static long gpio_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
        struct gpio_device *gdev = filp->private_data;
        struct gpio_chip *chip = gdev->chip;
-       int __user *ip = (int __user *)arg;
+       void __user *ip = (void __user *)arg;
 
        /* We fail any subsequent ioctl():s when the chip is gone */
        if (!chip)
@@ -388,6 +389,14 @@ static long gpio_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
        return -EINVAL;
 }
 
+#ifdef CONFIG_COMPAT
+static long gpio_ioctl_compat(struct file *filp, unsigned int cmd,
+                             unsigned long arg)
+{
+       return gpio_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
+
 /**
  * gpio_chrdev_open() - open the chardev for ioctl operations
  * @inode: inode for this chardev
@@ -431,7 +440,9 @@ static const struct file_operations gpio_fileops = {
        .owner = THIS_MODULE,
        .llseek = noop_llseek,
        .unlocked_ioctl = gpio_ioctl,
-       .compat_ioctl = gpio_ioctl,
+#ifdef CONFIG_COMPAT
+       .compat_ioctl = gpio_ioctl_compat,
+#endif
 };
 
 static void gpiodevice_release(struct device *dev)
@@ -618,6 +629,8 @@ int gpiochip_add_data(struct gpio_chip *chip, void *data)
                goto err_free_label;
        }
 
+       spin_unlock_irqrestore(&gpio_lock, flags);
+
        for (i = 0; i < chip->ngpio; i++) {
                struct gpio_desc *desc = &gdev->descs[i];
 
@@ -649,8 +662,6 @@ int gpiochip_add_data(struct gpio_chip *chip, void *data)
                }
        }
 
-       spin_unlock_irqrestore(&gpio_lock, flags);
-
 #ifdef CONFIG_PINCTRL
        INIT_LIST_HEAD(&gdev->pin_ranges);
 #endif
@@ -1356,10 +1367,13 @@ done:
 /*
  * This descriptor validation needs to be inserted verbatim into each
  * function taking a descriptor, so we need to use a preprocessor
- * macro to avoid endless duplication.
+ * macro to avoid endless duplication. If the desc is NULL it is an
+ * optional GPIO and calls should just bail out.
  */
 #define VALIDATE_DESC(desc) do { \
-       if (!desc || !desc->gdev) { \
+       if (!desc) \
+               return 0; \
+       if (!desc->gdev) { \
                pr_warn("%s: invalid GPIO\n", __func__); \
                return -EINVAL; \
        } \
@@ -1370,7 +1384,9 @@ done:
        } } while (0)
 
 #define VALIDATE_DESC_VOID(desc) do { \
-       if (!desc || !desc->gdev) { \
+       if (!desc) \
+               return; \
+       if (!desc->gdev) { \
                pr_warn("%s: invalid GPIO\n", __func__); \
                return; \
        } \
@@ -2066,17 +2082,30 @@ EXPORT_SYMBOL_GPL(gpiod_to_irq);
  */
 int gpiochip_lock_as_irq(struct gpio_chip *chip, unsigned int offset)
 {
-       if (offset >= chip->ngpio)
-               return -EINVAL;
+       struct gpio_desc *desc;
+
+       desc = gpiochip_get_desc(chip, offset);
+       if (IS_ERR(desc))
+               return PTR_ERR(desc);
+
+       /* Flush direction if something changed behind our back */
+       if (chip->get_direction) {
+               int dir = chip->get_direction(chip, offset);
+
+               if (dir)
+                       clear_bit(FLAG_IS_OUT, &desc->flags);
+               else
+                       set_bit(FLAG_IS_OUT, &desc->flags);
+       }
 
-       if (test_bit(FLAG_IS_OUT, &chip->gpiodev->descs[offset].flags)) {
+       if (test_bit(FLAG_IS_OUT, &desc->flags)) {
                chip_err(chip,
                          "%s: tried to flag a GPIO set as output for IRQ\n",
                          __func__);
                return -EIO;
        }
 
-       set_bit(FLAG_USED_AS_IRQ, &chip->gpiodev->descs[offset].flags);
+       set_bit(FLAG_USED_AS_IRQ, &desc->flags);
        return 0;
 }
 EXPORT_SYMBOL_GPL(gpiochip_lock_as_irq);
index 2bd3e5a..be43afb 100644 (file)
@@ -23,7 +23,7 @@ drm-$(CONFIG_AGP) += drm_agpsupport.o
 
 drm_kms_helper-y := drm_crtc_helper.o drm_dp_helper.o drm_probe_helper.o \
                drm_plane_helper.o drm_dp_mst_topology.o drm_atomic_helper.o \
-               drm_kms_helper_common.o
+               drm_kms_helper_common.o drm_dp_dual_mode_helper.o
 
 drm_kms_helper-$(CONFIG_DRM_LOAD_EDID_FIRMWARE) += drm_edid_load.o
 drm_kms_helper-$(CONFIG_DRM_FBDEV_EMULATION) += drm_fb_helper.o
index ca77ec1..e503e3d 100644 (file)
@@ -2,6 +2,7 @@ menu "ACP (Audio CoProcessor) Configuration"
 
 config DRM_AMD_ACP
        bool "Enable AMD Audio CoProcessor IP support"
+       depends on DRM_AMDGPU
        select MFD_CORE
        select PM_GENERIC_DOMAINS if PM
        help
index 2a009c3..01c36b8 100644 (file)
@@ -602,6 +602,8 @@ int amdgpu_sync_wait(struct amdgpu_sync *sync);
 void amdgpu_sync_free(struct amdgpu_sync *sync);
 int amdgpu_sync_init(void);
 void amdgpu_sync_fini(void);
+int amdgpu_fence_slab_init(void);
+void amdgpu_fence_slab_fini(void);
 
 /*
  * GART structures, functions & helpers
@@ -797,6 +799,7 @@ struct amdgpu_ring {
        unsigned                cond_exe_offs;
        u64                             cond_exe_gpu_addr;
        volatile u32    *cond_exe_cpu_addr;
+       int                     vmid;
 };
 
 /*
@@ -934,7 +937,8 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring,
                    unsigned vm_id, uint64_t pd_addr,
                    uint32_t gds_base, uint32_t gds_size,
                    uint32_t gws_base, uint32_t gws_size,
-                   uint32_t oa_base, uint32_t oa_size);
+                   uint32_t oa_base, uint32_t oa_size,
+                   bool vmid_switch);
 void amdgpu_vm_reset_id(struct amdgpu_device *adev, unsigned vm_id);
 uint64_t amdgpu_vm_map_gart(const dma_addr_t *pages_addr, uint64_t addr);
 int amdgpu_vm_update_page_directory(struct amdgpu_device *adev,
index 199f76b..8943099 100644 (file)
@@ -696,6 +696,17 @@ static uint32_t fw_type_convert(struct cgs_device *cgs_device, uint32_t fw_type)
        return result;
 }
 
+static int amdgpu_cgs_rel_firmware(struct cgs_device *cgs_device, enum cgs_ucode_id type)
+{
+       CGS_FUNC_ADEV;
+       if ((CGS_UCODE_ID_SMU == type) || (CGS_UCODE_ID_SMU_SK == type)) {
+               release_firmware(adev->pm.fw);
+               return 0;
+       }
+       /* cannot release other firmware because it was not created by cgs */
+       return -EINVAL;
+}
+
 static int amdgpu_cgs_get_firmware_info(struct cgs_device *cgs_device,
                                        enum cgs_ucode_id type,
                                        struct cgs_firmware_info *info)
@@ -1125,6 +1136,7 @@ static const struct cgs_ops amdgpu_cgs_ops = {
        amdgpu_cgs_pm_query_clock_limits,
        amdgpu_cgs_set_camera_voltages,
        amdgpu_cgs_get_firmware_info,
+       amdgpu_cgs_rel_firmware,
        amdgpu_cgs_set_powergating_state,
        amdgpu_cgs_set_clockgating_state,
        amdgpu_cgs_get_active_displays_info,
index 60a0c9a..cb07da4 100644 (file)
@@ -194,12 +194,12 @@ int amdgpu_connector_get_monitor_bpc(struct drm_connector *connector)
                                bpc = 8;
                                DRM_DEBUG("%s: HDMI deep color 10 bpc exceeds max tmds clock. Using %d bpc.\n",
                                          connector->name, bpc);
-                       } else if (bpc > 8) {
-                               /* max_tmds_clock missing, but hdmi spec mandates it for deep color. */
-                               DRM_DEBUG("%s: Required max tmds clock for HDMI deep color missing. Using 8 bpc.\n",
-                                         connector->name);
-                               bpc = 8;
                        }
+               } else if (bpc > 8) {
+                       /* max_tmds_clock missing, but hdmi spec mandates it for deep color. */
+                       DRM_DEBUG("%s: Required max tmds clock for HDMI deep color missing. Using 8 bpc.\n",
+                                 connector->name);
+                       bpc = 8;
                }
        }
 
index bb8b149..964f314 100644 (file)
@@ -827,8 +827,10 @@ static uint32_t cail_ioreg_read(struct card_info *info, uint32_t reg)
  */
 static void amdgpu_atombios_fini(struct amdgpu_device *adev)
 {
-       if (adev->mode_info.atom_context)
+       if (adev->mode_info.atom_context) {
                kfree(adev->mode_info.atom_context->scratch);
+               kfree(adev->mode_info.atom_context->iio);
+       }
        kfree(adev->mode_info.atom_context);
        adev->mode_info.atom_context = NULL;
        kfree(adev->mode_info.atom_card_info);
@@ -1325,6 +1327,11 @@ static int amdgpu_fini(struct amdgpu_device *adev)
                adev->ip_block_status[i].valid = false;
        }
 
+       for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
+               if (adev->ip_blocks[i].funcs->late_fini)
+                       adev->ip_blocks[i].funcs->late_fini((void *)adev);
+       }
+
        return 0;
 }
 
@@ -1513,8 +1520,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
                amdgpu_atombios_has_gpu_virtualization_table(adev);
 
        /* Post card if necessary */
-       if (!amdgpu_card_posted(adev) ||
-           adev->virtualization.supports_sr_iov) {
+       if (!amdgpu_card_posted(adev)) {
                if (!adev->bios) {
                        dev_err(adev->dev, "Card not posted and no BIOS - ignoring\n");
                        return -EINVAL;
index 1dab5f2..f888c01 100644 (file)
  * KMS wrapper.
  * - 3.0.0 - initial driver
  * - 3.1.0 - allow reading more status registers (GRBM, SRBM, SDMA, CP)
+ * - 3.2.0 - GFX8: Uses EOP_TC_WB_ACTION_EN, so UMDs don't have to do the same
+ *           at the end of IBs.
  */
 #define KMS_DRIVER_MAJOR       3
-#define KMS_DRIVER_MINOR       1
+#define KMS_DRIVER_MINOR       2
 #define KMS_DRIVER_PATCHLEVEL  0
 
 int amdgpu_vram_limit = 0;
@@ -279,14 +281,26 @@ static const struct pci_device_id pciidlist[] = {
        {0x1002, 0x98E4, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_STONEY|AMD_IS_APU},
        /* Polaris11 */
        {0x1002, 0x67E0, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS11},
-       {0x1002, 0x67E1, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS11},
+       {0x1002, 0x67E3, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS11},
        {0x1002, 0x67E8, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS11},
-       {0x1002, 0x67E9, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS11},
        {0x1002, 0x67EB, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS11},
+       {0x1002, 0x67EF, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS11},
        {0x1002, 0x67FF, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS11},
+       {0x1002, 0x67E1, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS11},
+       {0x1002, 0x67E7, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS11},
+       {0x1002, 0x67E9, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS11},
        /* Polaris10 */
        {0x1002, 0x67C0, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10},
+       {0x1002, 0x67C1, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10},
+       {0x1002, 0x67C2, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10},
+       {0x1002, 0x67C4, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10},
+       {0x1002, 0x67C7, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10},
        {0x1002, 0x67DF, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10},
+       {0x1002, 0x67C8, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10},
+       {0x1002, 0x67C9, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10},
+       {0x1002, 0x67CA, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10},
+       {0x1002, 0x67CC, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10},
+       {0x1002, 0x67CF, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10},
 
        {0, 0, 0}
 };
@@ -563,9 +577,12 @@ static struct pci_driver amdgpu_kms_pci_driver = {
        .driver.pm = &amdgpu_pm_ops,
 };
 
+
+
 static int __init amdgpu_init(void)
 {
        amdgpu_sync_init();
+       amdgpu_fence_slab_init();
        if (vgacon_text_force()) {
                DRM_ERROR("VGACON disables amdgpu kernel modesetting.\n");
                return -EINVAL;
@@ -576,7 +593,6 @@ static int __init amdgpu_init(void)
        driver->driver_features |= DRIVER_MODESET;
        driver->num_ioctls = amdgpu_max_kms_ioctl;
        amdgpu_register_atpx_handler();
-
        /* let modprobe override vga console setting */
        return drm_pci_init(driver, pdriver);
 }
@@ -587,6 +603,7 @@ static void __exit amdgpu_exit(void)
        drm_pci_exit(driver, pdriver);
        amdgpu_unregister_atpx_handler();
        amdgpu_sync_fini();
+       amdgpu_fence_slab_fini();
 }
 
 module_init(amdgpu_init);
index ba9c042..d155876 100644 (file)
@@ -55,8 +55,21 @@ struct amdgpu_fence {
 };
 
 static struct kmem_cache *amdgpu_fence_slab;
-static atomic_t amdgpu_fence_slab_ref = ATOMIC_INIT(0);
 
+int amdgpu_fence_slab_init(void)
+{
+       amdgpu_fence_slab = kmem_cache_create(
+               "amdgpu_fence", sizeof(struct amdgpu_fence), 0,
+               SLAB_HWCACHE_ALIGN, NULL);
+       if (!amdgpu_fence_slab)
+               return -ENOMEM;
+       return 0;
+}
+
+void amdgpu_fence_slab_fini(void)
+{
+       kmem_cache_destroy(amdgpu_fence_slab);
+}
 /*
  * Cast helper
  */
@@ -396,13 +409,6 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
  */
 int amdgpu_fence_driver_init(struct amdgpu_device *adev)
 {
-       if (atomic_inc_return(&amdgpu_fence_slab_ref) == 1) {
-               amdgpu_fence_slab = kmem_cache_create(
-                       "amdgpu_fence", sizeof(struct amdgpu_fence), 0,
-                       SLAB_HWCACHE_ALIGN, NULL);
-               if (!amdgpu_fence_slab)
-                       return -ENOMEM;
-       }
        if (amdgpu_debugfs_fence_init(adev))
                dev_err(adev->dev, "fence debugfs file creation failed\n");
 
@@ -437,13 +443,10 @@ void amdgpu_fence_driver_fini(struct amdgpu_device *adev)
                amd_sched_fini(&ring->sched);
                del_timer_sync(&ring->fence_drv.fallback_timer);
                for (j = 0; j <= ring->fence_drv.num_fences_mask; ++j)
-                       fence_put(ring->fence_drv.fences[i]);
+                       fence_put(ring->fence_drv.fences[j]);
                kfree(ring->fence_drv.fences);
                ring->fence_drv.initialized = false;
        }
-
-       if (atomic_dec_and_test(&amdgpu_fence_slab_ref))
-               kmem_cache_destroy(amdgpu_fence_slab);
 }
 
 /**
index 34e3542..7a0b1e5 100644 (file)
@@ -122,6 +122,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
        bool skip_preamble, need_ctx_switch;
        unsigned patch_offset = ~0;
        struct amdgpu_vm *vm;
+       int vmid = 0, old_vmid = ring->vmid;
        struct fence *hwf;
        uint64_t ctx;
 
@@ -135,9 +136,11 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
        if (job) {
                vm = job->vm;
                ctx = job->ctx;
+               vmid = job->vm_id;
        } else {
                vm = NULL;
                ctx = 0;
+               vmid = 0;
        }
 
        if (!ring->ready) {
@@ -163,7 +166,8 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
                r = amdgpu_vm_flush(ring, job->vm_id, job->vm_pd_addr,
                                    job->gds_base, job->gds_size,
                                    job->gws_base, job->gws_size,
-                                   job->oa_base, job->oa_size);
+                                   job->oa_base, job->oa_size,
+                                   (ring->current_ctx == ctx) && (old_vmid != vmid));
                if (r) {
                        amdgpu_ring_undo(ring);
                        return r;
@@ -180,7 +184,6 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
        need_ctx_switch = ring->current_ctx != ctx;
        for (i = 0; i < num_ibs; ++i) {
                ib = &ibs[i];
-
                /* drop preamble IBs if we don't have a context switch */
                if ((ib->flags & AMDGPU_IB_FLAG_PREAMBLE) && skip_preamble)
                        continue;
@@ -188,6 +191,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
                amdgpu_ring_emit_ib(ring, ib, job ? job->vm_id : 0,
                                    need_ctx_switch);
                need_ctx_switch = false;
+               ring->vmid = vmid;
        }
 
        if (ring->funcs->emit_hdp_invalidate)
@@ -198,6 +202,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
                dev_err(adev->dev, "failed to emit fence (%d)\n", r);
                if (job && job->vm_id)
                        amdgpu_vm_reset_id(adev, job->vm_id);
+               ring->vmid = old_vmid;
                amdgpu_ring_undo(ring);
                return r;
        }
index 6bd961f..8225655 100644 (file)
@@ -183,13 +183,6 @@ static int amdgpu_pp_sw_fini(void *handle)
        if (ret)
                return ret;
 
-#ifdef CONFIG_DRM_AMD_POWERPLAY
-       if (adev->pp_enabled) {
-               amdgpu_pm_sysfs_fini(adev);
-               amd_powerplay_fini(adev->powerplay.pp_handle);
-       }
-#endif
-
        return ret;
 }
 
@@ -223,6 +216,22 @@ static int amdgpu_pp_hw_fini(void *handle)
        return ret;
 }
 
+static void amdgpu_pp_late_fini(void *handle)
+{
+#ifdef CONFIG_DRM_AMD_POWERPLAY
+       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
+       if (adev->pp_enabled) {
+               amdgpu_pm_sysfs_fini(adev);
+               amd_powerplay_fini(adev->powerplay.pp_handle);
+       }
+
+       if (adev->powerplay.ip_funcs->late_fini)
+               adev->powerplay.ip_funcs->late_fini(
+                         adev->powerplay.pp_handle);
+#endif
+}
+
 static int amdgpu_pp_suspend(void *handle)
 {
        int ret = 0;
@@ -311,6 +320,7 @@ const struct amd_ip_funcs amdgpu_pp_ip_funcs = {
        .sw_fini = amdgpu_pp_sw_fini,
        .hw_init = amdgpu_pp_hw_init,
        .hw_fini = amdgpu_pp_hw_fini,
+       .late_fini = amdgpu_pp_late_fini,
        .suspend = amdgpu_pp_suspend,
        .resume = amdgpu_pp_resume,
        .is_idle = amdgpu_pp_is_idle,

index 3b02272..870f949 100644 (file)
@@ -343,6 +343,7 @@ void amdgpu_ring_fini(struct amdgpu_ring *ring)
        ring->ring = NULL;
        ring->ring_obj = NULL;
 
+       amdgpu_wb_free(ring->adev, ring->cond_exe_offs);
        amdgpu_wb_free(ring->adev, ring->fence_offs);
        amdgpu_wb_free(ring->adev, ring->rptr_offs);
        amdgpu_wb_free(ring->adev, ring->wptr_offs);
index 8bf84ef..48618ee 100644 (file)
@@ -115,6 +115,7 @@ int amdgpu_sa_bo_manager_start(struct amdgpu_device *adev,
                return r;
        }
        r = amdgpu_bo_kmap(sa_manager->bo, &sa_manager->cpu_ptr);
+       memset(sa_manager->cpu_ptr, 0, sa_manager->size);
        amdgpu_bo_unreserve(sa_manager->bo);
        return r;
 }
index 01abfc2..e19520c 100644 (file)
@@ -253,19 +253,20 @@ int amdgpu_uvd_sw_fini(struct amdgpu_device *adev)
 {
        int r;
 
-       if (adev->uvd.vcpu_bo == NULL)
-               return 0;
+       kfree(adev->uvd.saved_bo);
 
        amd_sched_entity_fini(&adev->uvd.ring.sched, &adev->uvd.entity);
 
-       r = amdgpu_bo_reserve(adev->uvd.vcpu_bo, false);
-       if (!r) {
-               amdgpu_bo_kunmap(adev->uvd.vcpu_bo);
-               amdgpu_bo_unpin(adev->uvd.vcpu_bo);
-               amdgpu_bo_unreserve(adev->uvd.vcpu_bo);
-       }
+       if (adev->uvd.vcpu_bo) {
+               r = amdgpu_bo_reserve(adev->uvd.vcpu_bo, false);
+               if (!r) {
+                       amdgpu_bo_kunmap(adev->uvd.vcpu_bo);
+                       amdgpu_bo_unpin(adev->uvd.vcpu_bo);
+                       amdgpu_bo_unreserve(adev->uvd.vcpu_bo);
+               }
 
-       amdgpu_bo_unref(&adev->uvd.vcpu_bo);
+               amdgpu_bo_unref(&adev->uvd.vcpu_bo);
+       }
 
        amdgpu_ring_fini(&adev->uvd.ring);
 
index ea708cb..62a4c12 100644 (file)
 /* Special value that no flush is necessary */
 #define AMDGPU_VM_NO_FLUSH (~0ll)
 
+/* Local structure. Encapsulates some VM table update parameters to reduce
+ * the number of function parameters.
+ */
+struct amdgpu_vm_update_params {
+       /* address where to copy page table entries from */
+       uint64_t src;
+       /* DMA addresses to use for mapping */
+       dma_addr_t *pages_addr;
+       /* indirect buffer to fill with commands */
+       struct amdgpu_ib *ib;
+};
+
 /**
  * amdgpu_vm_num_pde - return the number of page directory entries
  *
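
The new amdgpu_vm_update_params structure bundles the src/pages_addr/ib arguments that were previously threaded through every page-table helper. A sketch of the intended call pattern, mirroring the converted call sites in the hunks that follow (a fragment, assuming the usual amdgpu job/IB context):

static void vm_update_params_example(struct amdgpu_device *adev,
                                     struct amdgpu_job *job,
                                     uint64_t pe, uint64_t addr,
                                     unsigned count, uint32_t incr,
                                     uint32_t flags)
{
        struct amdgpu_vm_update_params params;

        memset(&params, 0, sizeof(params));   /* unused fields must stay 0/NULL */
        params.ib = &job->ibs[0];             /* IB that receives the PTE commands */
        /* params.src / params.pages_addr are set only on the copy and
         * system-page paths, as in amdgpu_vm_bo_update_mapping() below */
        amdgpu_vm_update_pages(adev, &params, pe, addr, count, incr, flags);
}
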
@@ -286,7 +298,8 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring,
                    unsigned vm_id, uint64_t pd_addr,
                    uint32_t gds_base, uint32_t gds_size,
                    uint32_t gws_base, uint32_t gws_size,
-                   uint32_t oa_base, uint32_t oa_size)
+                   uint32_t oa_base, uint32_t oa_size,
+                   bool vmid_switch)
 {
        struct amdgpu_device *adev = ring->adev;
        struct amdgpu_vm_id *id = &adev->vm_manager.ids[vm_id];
@@ -300,8 +313,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring,
        int r;
 
        if (ring->funcs->emit_pipeline_sync && (
-           pd_addr != AMDGPU_VM_NO_FLUSH || gds_switch_needed ||
-                   ring->type == AMDGPU_RING_TYPE_COMPUTE))
+           pd_addr != AMDGPU_VM_NO_FLUSH || gds_switch_needed || vmid_switch))
                amdgpu_ring_emit_pipeline_sync(ring);
 
        if (ring->funcs->emit_vm_flush &&
@@ -389,9 +401,7 @@ struct amdgpu_bo_va *amdgpu_vm_bo_find(struct amdgpu_vm *vm,
  * amdgpu_vm_update_pages - helper to call the right asic function
  *
  * @adev: amdgpu_device pointer
- * @src: address where to copy page table entries from
- * @pages_addr: DMA addresses to use for mapping
- * @ib: indirect buffer to fill with commands
+ * @vm_update_params: see amdgpu_vm_update_params definition
  * @pe: addr of the page entry
  * @addr: dst addr to write into pe
  * @count: number of page entries to update
@@ -402,29 +412,29 @@ struct amdgpu_bo_va *amdgpu_vm_bo_find(struct amdgpu_vm *vm,
  * to setup the page table using the DMA.
  */
 static void amdgpu_vm_update_pages(struct amdgpu_device *adev,
-                                  uint64_t src,
-                                  dma_addr_t *pages_addr,
-                                  struct amdgpu_ib *ib,
+                                  struct amdgpu_vm_update_params
+                                       *vm_update_params,
                                   uint64_t pe, uint64_t addr,
                                   unsigned count, uint32_t incr,
                                   uint32_t flags)
 {
        trace_amdgpu_vm_set_page(pe, addr, count, incr, flags);
 
-       if (src) {
-               src += (addr >> 12) * 8;
-               amdgpu_vm_copy_pte(adev, ib, pe, src, count);
+       if (vm_update_params->src) {
+               amdgpu_vm_copy_pte(adev, vm_update_params->ib,
+                       pe, (vm_update_params->src + (addr >> 12) * 8), count);
 
-       } else if (pages_addr) {
-               amdgpu_vm_write_pte(adev, ib, pages_addr, pe, addr,
-                                   count, incr, flags);
+       } else if (vm_update_params->pages_addr) {
+               amdgpu_vm_write_pte(adev, vm_update_params->ib,
+                       vm_update_params->pages_addr,
+                       pe, addr, count, incr, flags);
 
        } else if (count < 3) {
-               amdgpu_vm_write_pte(adev, ib, NULL, pe, addr,
+               amdgpu_vm_write_pte(adev, vm_update_params->ib, NULL, pe, addr,
                                    count, incr, flags);
 
        } else {
-               amdgpu_vm_set_pte_pde(adev, ib, pe, addr,
+               amdgpu_vm_set_pte_pde(adev, vm_update_params->ib, pe, addr,
                                      count, incr, flags);
        }
 }
@@ -444,10 +454,12 @@ static int amdgpu_vm_clear_bo(struct amdgpu_device *adev,
        struct amdgpu_ring *ring;
        struct fence *fence = NULL;
        struct amdgpu_job *job;
+       struct amdgpu_vm_update_params vm_update_params;
        unsigned entries;
        uint64_t addr;
        int r;
 
+       memset(&vm_update_params, 0, sizeof(vm_update_params));
        ring = container_of(vm->entity.sched, struct amdgpu_ring, sched);
 
        r = reservation_object_reserve_shared(bo->tbo.resv);
@@ -465,7 +477,8 @@ static int amdgpu_vm_clear_bo(struct amdgpu_device *adev,
        if (r)
                goto error;
 
-       amdgpu_vm_update_pages(adev, 0, NULL, &job->ibs[0], addr, 0, entries,
+       vm_update_params.ib = &job->ibs[0];
+       amdgpu_vm_update_pages(adev, &vm_update_params, addr, 0, entries,
                               0, 0);
        amdgpu_ring_pad_ib(ring, &job->ibs[0]);
 
@@ -538,11 +551,12 @@ int amdgpu_vm_update_page_directory(struct amdgpu_device *adev,
        uint64_t last_pde = ~0, last_pt = ~0;
        unsigned count = 0, pt_idx, ndw;
        struct amdgpu_job *job;
-       struct amdgpu_ib *ib;
+       struct amdgpu_vm_update_params vm_update_params;
        struct fence *fence = NULL;
 
        int r;
 
+       memset(&vm_update_params, 0, sizeof(vm_update_params));
        ring = container_of(vm->entity.sched, struct amdgpu_ring, sched);
 
        /* padding, etc. */
@@ -555,7 +569,7 @@ int amdgpu_vm_update_page_directory(struct amdgpu_device *adev,
        if (r)
                return r;
 
-       ib = &job->ibs[0];
+       vm_update_params.ib = &job->ibs[0];
 
        /* walk over the address space and update the page directory */
        for (pt_idx = 0; pt_idx <= vm->max_pde_used; ++pt_idx) {
@@ -575,7 +589,7 @@ int amdgpu_vm_update_page_directory(struct amdgpu_device *adev,
                    ((last_pt + incr * count) != pt)) {
 
                        if (count) {
-                               amdgpu_vm_update_pages(adev, 0, NULL, ib,
+                               amdgpu_vm_update_pages(adev, &vm_update_params,
                                                       last_pde, last_pt,
                                                       count, incr,
                                                       AMDGPU_PTE_VALID);
@@ -590,14 +604,15 @@ int amdgpu_vm_update_page_directory(struct amdgpu_device *adev,
        }
 
        if (count)
-               amdgpu_vm_update_pages(adev, 0, NULL, ib, last_pde, last_pt,
-                                      count, incr, AMDGPU_PTE_VALID);
+               amdgpu_vm_update_pages(adev, &vm_update_params,
+                                       last_pde, last_pt,
+                                       count, incr, AMDGPU_PTE_VALID);
 
-       if (ib->length_dw != 0) {
-               amdgpu_ring_pad_ib(ring, ib);
+       if (vm_update_params.ib->length_dw != 0) {
+               amdgpu_ring_pad_ib(ring, vm_update_params.ib);
                amdgpu_sync_resv(adev, &job->sync, pd->tbo.resv,
                                 AMDGPU_FENCE_OWNER_VM);
-               WARN_ON(ib->length_dw > ndw);
+               WARN_ON(vm_update_params.ib->length_dw > ndw);
                r = amdgpu_job_submit(job, ring, &vm->entity,
                                      AMDGPU_FENCE_OWNER_VM, &fence);
                if (r)
@@ -623,18 +638,15 @@ error_free:
  * amdgpu_vm_frag_ptes - add fragment information to PTEs
  *
  * @adev: amdgpu_device pointer
- * @src: address where to copy page table entries from
- * @pages_addr: DMA addresses to use for mapping
- * @ib: IB for the update
+ * @vm_update_params: see amdgpu_vm_update_params definition
  * @pe_start: first PTE to handle
  * @pe_end: last PTE to handle
  * @addr: addr those PTEs should point to
  * @flags: hw mapping flags
  */
 static void amdgpu_vm_frag_ptes(struct amdgpu_device *adev,
-                               uint64_t src,
-                               dma_addr_t *pages_addr,
-                               struct amdgpu_ib *ib,
+                               struct amdgpu_vm_update_params
+                                       *vm_update_params,
                                uint64_t pe_start, uint64_t pe_end,
                                uint64_t addr, uint32_t flags)
 {
@@ -671,11 +683,11 @@ static void amdgpu_vm_frag_ptes(struct amdgpu_device *adev,
                return;
 
        /* system pages are non continuously */
-       if (src || pages_addr || !(flags & AMDGPU_PTE_VALID) ||
-           (frag_start >= frag_end)) {
+       if (vm_update_params->src || vm_update_params->pages_addr ||
+               !(flags & AMDGPU_PTE_VALID) || (frag_start >= frag_end)) {
 
                count = (pe_end - pe_start) / 8;
-               amdgpu_vm_update_pages(adev, src, pages_addr, ib, pe_start,
+               amdgpu_vm_update_pages(adev, vm_update_params, pe_start,
                                       addr, count, AMDGPU_GPU_PAGE_SIZE,
                                       flags);
                return;
@@ -684,21 +696,21 @@ static void amdgpu_vm_frag_ptes(struct amdgpu_device *adev,
        /* handle the 4K area at the beginning */
        if (pe_start != frag_start) {
                count = (frag_start - pe_start) / 8;
-               amdgpu_vm_update_pages(adev, 0, NULL, ib, pe_start, addr,
+               amdgpu_vm_update_pages(adev, vm_update_params, pe_start, addr,
                                       count, AMDGPU_GPU_PAGE_SIZE, flags);
                addr += AMDGPU_GPU_PAGE_SIZE * count;
        }
 
        /* handle the area in the middle */
        count = (frag_end - frag_start) / 8;
-       amdgpu_vm_update_pages(adev, 0, NULL, ib, frag_start, addr, count,
+       amdgpu_vm_update_pages(adev, vm_update_params, frag_start, addr, count,
                               AMDGPU_GPU_PAGE_SIZE, flags | frag_flags);
 
        /* handle the 4K area at the end */
        if (frag_end != pe_end) {
                addr += AMDGPU_GPU_PAGE_SIZE * count;
                count = (pe_end - frag_end) / 8;
-               amdgpu_vm_update_pages(adev, 0, NULL, ib, frag_end, addr,
+               amdgpu_vm_update_pages(adev, vm_update_params, frag_end, addr,
                                       count, AMDGPU_GPU_PAGE_SIZE, flags);
        }
 }
@@ -707,8 +719,7 @@ static void amdgpu_vm_frag_ptes(struct amdgpu_device *adev,
  * amdgpu_vm_update_ptes - make sure that page tables are valid
  *
  * @adev: amdgpu_device pointer
- * @src: address where to copy page table entries from
- * @pages_addr: DMA addresses to use for mapping
+ * @vm_update_params: see amdgpu_vm_update_params definition
  * @vm: requested vm
  * @start: start of GPU address range
  * @end: end of GPU address range
@@ -718,10 +729,9 @@ static void amdgpu_vm_frag_ptes(struct amdgpu_device *adev,
  * Update the page tables in the range @start - @end.
  */
 static void amdgpu_vm_update_ptes(struct amdgpu_device *adev,
-                                 uint64_t src,
-                                 dma_addr_t *pages_addr,
+                                 struct amdgpu_vm_update_params
+                                       *vm_update_params,
                                  struct amdgpu_vm *vm,
-                                 struct amdgpu_ib *ib,
                                  uint64_t start, uint64_t end,
                                  uint64_t dst, uint32_t flags)
 {
@@ -747,7 +757,7 @@ static void amdgpu_vm_update_ptes(struct amdgpu_device *adev,
 
                if (last_pe_end != pe_start) {
 
-                       amdgpu_vm_frag_ptes(adev, src, pages_addr, ib,
+                       amdgpu_vm_frag_ptes(adev, vm_update_params,
                                            last_pe_start, last_pe_end,
                                            last_dst, flags);
 
@@ -762,7 +772,7 @@ static void amdgpu_vm_update_ptes(struct amdgpu_device *adev,
                dst += nptes * AMDGPU_GPU_PAGE_SIZE;
        }
 
-       amdgpu_vm_frag_ptes(adev, src, pages_addr, ib, last_pe_start,
+       amdgpu_vm_frag_ptes(adev, vm_update_params, last_pe_start,
                            last_pe_end, last_dst, flags);
 }
 
@@ -794,11 +804,14 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
        void *owner = AMDGPU_FENCE_OWNER_VM;
        unsigned nptes, ncmds, ndw;
        struct amdgpu_job *job;
-       struct amdgpu_ib *ib;
+       struct amdgpu_vm_update_params vm_update_params;
        struct fence *f = NULL;
        int r;
 
        ring = container_of(vm->entity.sched, struct amdgpu_ring, sched);
+       memset(&vm_update_params, 0, sizeof(vm_update_params));
+       vm_update_params.src = src;
+       vm_update_params.pages_addr = pages_addr;
 
        /* sync to everything on unmapping */
        if (!(flags & AMDGPU_PTE_VALID))
@@ -815,11 +828,11 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
        /* padding, etc. */
        ndw = 64;
 
-       if (src) {
+       if (vm_update_params.src) {
                /* only copy commands needed */
                ndw += ncmds * 7;
 
-       } else if (pages_addr) {
+       } else if (vm_update_params.pages_addr) {
                /* header for write data commands */
                ndw += ncmds * 4;
 
@@ -838,7 +851,7 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
        if (r)
                return r;
 
-       ib = &job->ibs[0];
+       vm_update_params.ib = &job->ibs[0];
 
        r = amdgpu_sync_resv(adev, &job->sync, vm->page_directory->tbo.resv,
                             owner);
@@ -849,11 +862,11 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
        if (r)
                goto error_free;
 
-       amdgpu_vm_update_ptes(adev, src, pages_addr, vm, ib, start,
+       amdgpu_vm_update_ptes(adev, &vm_update_params, vm, start,
                              last + 1, addr, flags);
 
-       amdgpu_ring_pad_ib(ring, ib);
-       WARN_ON(ib->length_dw > ndw);
+       amdgpu_ring_pad_ib(ring, vm_update_params.ib);
+       WARN_ON(vm_update_params.ib->length_dw > ndw);
        r = amdgpu_job_submit(job, ring, &vm->entity,
                              AMDGPU_FENCE_OWNER_VM, &f);
        if (r)
index ea407db..5ec1f1e 100644 (file)
@@ -6221,6 +6221,9 @@ static int ci_dpm_sw_fini(void *handle)
        ci_dpm_fini(adev);
        mutex_unlock(&adev->pm.mutex);
 
+       release_firmware(adev->pm.fw);
+       adev->pm.fw = NULL;
+
        return 0;
 }
 
index 845c21b..be3d6f7 100644 (file)
@@ -103,7 +103,6 @@ static void cik_ih_disable_interrupts(struct amdgpu_device *adev)
  */
 static int cik_ih_irq_init(struct amdgpu_device *adev)
 {
-       int ret = 0;
        int rb_bufsz;
        u32 interrupt_cntl, ih_cntl, ih_rb_cntl;
        u64 wptr_off;
@@ -156,7 +155,7 @@ static int cik_ih_irq_init(struct amdgpu_device *adev)
        /* enable irqs */
        cik_ih_enable_interrupts(adev);
 
-       return ret;
+       return 0;
 }
 
 /**
index 518dca4..9dc4e24 100644 (file)
@@ -66,6 +66,16 @@ MODULE_FIRMWARE("radeon/mullins_sdma1.bin");
 
 u32 amdgpu_cik_gpu_check_soft_reset(struct amdgpu_device *adev);
 
+
+static void cik_sdma_free_microcode(struct amdgpu_device *adev)
+{
+       int i;
+       for (i = 0; i < adev->sdma.num_instances; i++) {
+               release_firmware(adev->sdma.instance[i].fw);
+               adev->sdma.instance[i].fw = NULL;
+       }
+}
+
 /*
  * sDMA - System DMA
  * Starting with CIK, the GPU has new asynchronous
@@ -419,6 +429,8 @@ static int cik_sdma_gfx_resume(struct amdgpu_device *adev)
                /* Initialize the ring buffer's read and write pointers */
                WREG32(mmSDMA0_GFX_RB_RPTR + sdma_offsets[i], 0);
                WREG32(mmSDMA0_GFX_RB_WPTR + sdma_offsets[i], 0);
+               WREG32(mmSDMA0_GFX_IB_RPTR + sdma_offsets[i], 0);
+               WREG32(mmSDMA0_GFX_IB_OFFSET + sdma_offsets[i], 0);
 
                /* set the wb address whether it's enabled or not */
                WREG32(mmSDMA0_GFX_RB_RPTR_ADDR_HI + sdma_offsets[i],
@@ -446,7 +458,12 @@ static int cik_sdma_gfx_resume(struct amdgpu_device *adev)
                WREG32(mmSDMA0_GFX_IB_CNTL + sdma_offsets[i], ib_cntl);
 
                ring->ready = true;
+       }
+
+       cik_sdma_enable(adev, true);
 
+       for (i = 0; i < adev->sdma.num_instances; i++) {
+               ring = &adev->sdma.instance[i].ring;
                r = amdgpu_ring_test_ring(ring);
                if (r) {
                        ring->ready = false;
@@ -529,8 +546,8 @@ static int cik_sdma_start(struct amdgpu_device *adev)
        if (r)
                return r;
 
-       /* unhalt the MEs */
-       cik_sdma_enable(adev, true);
+       /* halt the engine before programming */
+       cik_sdma_enable(adev, false);
 
        /* start the gfx rings and rlc compute queues */
        r = cik_sdma_gfx_resume(adev);
@@ -998,6 +1015,7 @@ static int cik_sdma_sw_fini(void *handle)
        for (i = 0; i < adev->sdma.num_instances; i++)
                amdgpu_ring_fini(&adev->sdma.instance[i].ring);
 
+       cik_sdma_free_microcode(adev);
        return 0;
 }
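
The cik_sdma hunks above change the bring-up order: halt the engine, program the ring registers for every instance, re-enable the engine, and only then run the ring tests. The same reordering is applied to sdma_v2_4 and sdma_v3_0 further down. In outline (a sketch using the function names from this file, not the literal driver code):

static int cik_sdma_start_sketch(struct amdgpu_device *adev)
{
        int i, r;

        cik_sdma_enable(adev, false);           /* halt before programming */

        /* ... program RB/IB registers for each SDMA instance ... */

        cik_sdma_enable(adev, true);            /* unhalt once fully configured */

        for (i = 0; i < adev->sdma.num_instances; i++) {
                r = amdgpu_ring_test_ring(&adev->sdma.instance[i].ring);
                if (r) {
                        adev->sdma.instance[i].ring.ready = false;
                        return r;
                }
        }
        return 0;
}
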
 
index fa4449e..933e425 100644 (file)
@@ -1579,7 +1579,6 @@ static int cz_dpm_update_sclk_limit(struct amdgpu_device *adev)
 
 static int cz_dpm_set_deep_sleep_sclk_threshold(struct amdgpu_device *adev)
 {
-       int ret = 0;
        struct cz_power_info *pi = cz_get_pi(adev);
 
        if (pi->caps_sclk_ds) {
@@ -1588,20 +1587,19 @@ static int cz_dpm_set_deep_sleep_sclk_threshold(struct amdgpu_device *adev)
                                CZ_MIN_DEEP_SLEEP_SCLK);
        }
 
-       return ret;
+       return 0;
 }
 
 /* ?? without dal support, is this still needed in setpowerstate list*/
 static int cz_dpm_set_watermark_threshold(struct amdgpu_device *adev)
 {
-       int ret = 0;
        struct cz_power_info *pi = cz_get_pi(adev);
 
        cz_send_msg_to_smc_with_parameter(adev,
                        PPSMC_MSG_SetWatermarkFrequency,
                        pi->sclk_dpm.soft_max_clk);
 
-       return ret;
+       return 0;
 }
 
 static int cz_dpm_enable_nbdpm(struct amdgpu_device *adev)
@@ -1636,7 +1634,6 @@ static void cz_dpm_nbdpm_lm_pstate_enable(struct amdgpu_device *adev,
 
 static int cz_dpm_update_low_memory_pstate(struct amdgpu_device *adev)
 {
-       int ret = 0;
        struct cz_power_info *pi = cz_get_pi(adev);
        struct cz_ps *ps = &pi->requested_ps;
 
@@ -1647,21 +1644,19 @@ static int cz_dpm_update_low_memory_pstate(struct amdgpu_device *adev)
                        cz_dpm_nbdpm_lm_pstate_enable(adev, true);
        }
 
-       return ret;
+       return 0;
 }
 
 /* with dpm enabled */
 static int cz_dpm_set_power_state(struct amdgpu_device *adev)
 {
-       int ret = 0;
-
        cz_dpm_update_sclk_limit(adev);
        cz_dpm_set_deep_sleep_sclk_threshold(adev);
        cz_dpm_set_watermark_threshold(adev);
        cz_dpm_enable_nbdpm(adev);
        cz_dpm_update_low_memory_pstate(adev);
 
-       return ret;
+       return 0;
 }
 
 static void cz_dpm_post_set_power_state(struct amdgpu_device *adev)
index 863cb16..3d23a70 100644 (file)
@@ -103,7 +103,6 @@ static void cz_ih_disable_interrupts(struct amdgpu_device *adev)
  */
 static int cz_ih_irq_init(struct amdgpu_device *adev)
 {
-       int ret = 0;
        int rb_bufsz;
        u32 interrupt_cntl, ih_cntl, ih_rb_cntl;
        u64 wptr_off;
@@ -157,7 +156,7 @@ static int cz_ih_irq_init(struct amdgpu_device *adev)
        /* enable interrupts */
        cz_ih_enable_interrupts(adev);
 
-       return ret;
+       return 0;
 }
 
 /**
index c11b600..af26ec0 100644 (file)
@@ -137,7 +137,7 @@ static const u32 polaris11_golden_settings_a11[] =
        mmDCI_CLK_CNTL, 0x00000080, 0x00000000,
        mmFBC_DEBUG_COMP, 0x000000f0, 0x00000070,
        mmFBC_DEBUG1, 0xffffffff, 0x00000008,
-       mmFBC_MISC, 0x9f313fff, 0x14300008,
+       mmFBC_MISC, 0x9f313fff, 0x14302008,
        mmHDMI_CONTROL, 0x313f031f, 0x00000011,
 };
 
@@ -145,7 +145,7 @@ static const u32 polaris10_golden_settings_a11[] =
 {
        mmDCI_CLK_CNTL, 0x00000080, 0x00000000,
        mmFBC_DEBUG_COMP, 0x000000f0, 0x00000070,
-       mmFBC_MISC, 0x9f313fff, 0x14300008,
+       mmFBC_MISC, 0x9f313fff, 0x14302008,
        mmHDMI_CONTROL, 0x313f031f, 0x00000011,
 };
 
index 245cabf..ed03b75 100644 (file)
@@ -72,6 +72,11 @@ static int fiji_dpm_sw_init(void *handle)
 
 static int fiji_dpm_sw_fini(void *handle)
 {
+       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
+       release_firmware(adev->pm.fw);
+       adev->pm.fw = NULL;
+
        return 0;
 }
 
index 7f18a53..8c6ad1e 100644 (file)
@@ -991,6 +991,22 @@ out:
        return err;
 }
 
+static void gfx_v7_0_free_microcode(struct amdgpu_device *adev)
+{
+       release_firmware(adev->gfx.pfp_fw);
+       adev->gfx.pfp_fw = NULL;
+       release_firmware(adev->gfx.me_fw);
+       adev->gfx.me_fw = NULL;
+       release_firmware(adev->gfx.ce_fw);
+       adev->gfx.ce_fw = NULL;
+       release_firmware(adev->gfx.mec_fw);
+       adev->gfx.mec_fw = NULL;
+       release_firmware(adev->gfx.mec2_fw);
+       adev->gfx.mec2_fw = NULL;
+       release_firmware(adev->gfx.rlc_fw);
+       adev->gfx.rlc_fw = NULL;
+}
+
 /**
  * gfx_v7_0_tiling_mode_table_init - init the hw tiling table
  *
@@ -4489,6 +4505,7 @@ static int gfx_v7_0_sw_fini(void *handle)
        gfx_v7_0_cp_compute_fini(adev);
        gfx_v7_0_rlc_fini(adev);
        gfx_v7_0_mec_fini(adev);
+       gfx_v7_0_free_microcode(adev);
 
        return 0;
 }
index 92647fb..9f6f866 100644 (file)
@@ -267,10 +267,13 @@ static const u32 tonga_mgcg_cgcg_init[] =
 
 static const u32 golden_settings_polaris11_a11[] =
 {
+       mmCB_HW_CONTROL, 0xfffdf3cf, 0x00006208,
        mmCB_HW_CONTROL_3, 0x000001ff, 0x00000040,
        mmDB_DEBUG2, 0xf00fffff, 0x00000400,
        mmPA_SC_ENHANCE, 0xffffffff, 0x20000001,
        mmPA_SC_LINE_STIPPLE_STATE, 0x0000ff0f, 0x00000000,
+       mmPA_SC_RASTER_CONFIG, 0x3f3fffff, 0x16000012,
+       mmPA_SC_RASTER_CONFIG_1, 0x0000003f, 0x00000000,
        mmRLC_CGCG_CGLS_CTRL, 0x00000003, 0x0001003c,
        mmRLC_CGCG_CGLS_CTRL_3D, 0xffffffff, 0x0001003c,
        mmSQ_CONFIG, 0x07f80000, 0x07180000,
@@ -284,8 +287,6 @@ static const u32 golden_settings_polaris11_a11[] =
 static const u32 polaris11_golden_common_all[] =
 {
        mmGRBM_GFX_INDEX, 0xffffffff, 0xe0000000,
-       mmPA_SC_RASTER_CONFIG, 0xffffffff, 0x16000012,
-       mmPA_SC_RASTER_CONFIG_1, 0xffffffff, 0x00000000,
        mmGB_ADDR_CONFIG, 0xffffffff, 0x22011002,
        mmSPI_RESOURCE_RESERVE_CU_0, 0xffffffff, 0x00000800,
        mmSPI_RESOURCE_RESERVE_CU_1, 0xffffffff, 0x00000800,
@@ -296,6 +297,7 @@ static const u32 polaris11_golden_common_all[] =
 static const u32 golden_settings_polaris10_a11[] =
 {
        mmATC_MISC_CG, 0x000c0fc0, 0x000c0200,
+       mmCB_HW_CONTROL, 0xfffdf3cf, 0x00006208,
        mmCB_HW_CONTROL_3, 0x000001ff, 0x00000040,
        mmDB_DEBUG2, 0xf00fffff, 0x00000400,
        mmPA_SC_ENHANCE, 0xffffffff, 0x20000001,
@@ -834,6 +836,26 @@ err1:
        return r;
 }
 
+
+static void gfx_v8_0_free_microcode(struct amdgpu_device *adev) {
+       release_firmware(adev->gfx.pfp_fw);
+       adev->gfx.pfp_fw = NULL;
+       release_firmware(adev->gfx.me_fw);
+       adev->gfx.me_fw = NULL;
+       release_firmware(adev->gfx.ce_fw);
+       adev->gfx.ce_fw = NULL;
+       release_firmware(adev->gfx.rlc_fw);
+       adev->gfx.rlc_fw = NULL;
+       release_firmware(adev->gfx.mec_fw);
+       adev->gfx.mec_fw = NULL;
+       if ((adev->asic_type != CHIP_STONEY) &&
+           (adev->asic_type != CHIP_TOPAZ))
+               release_firmware(adev->gfx.mec2_fw);
+       adev->gfx.mec2_fw = NULL;
+
+       kfree(adev->gfx.rlc.register_list_format);
+}
+
 static int gfx_v8_0_init_microcode(struct amdgpu_device *adev)
 {
        const char *chip_name;
@@ -1981,7 +2003,7 @@ static int gfx_v8_0_sw_fini(void *handle)
 
        gfx_v8_0_rlc_fini(adev);
 
-       kfree(adev->gfx.rlc.register_list_format);
+       gfx_v8_0_free_microcode(adev);
 
        return 0;
 }
@@ -3972,11 +3994,15 @@ static int gfx_v8_0_cp_gfx_start(struct amdgpu_device *adev)
                amdgpu_ring_write(ring, 0x3a00161a);
                amdgpu_ring_write(ring, 0x0000002e);
                break;
-       case CHIP_TOPAZ:
        case CHIP_CARRIZO:
                amdgpu_ring_write(ring, 0x00000002);
                amdgpu_ring_write(ring, 0x00000000);
                break;
+       case CHIP_TOPAZ:
+               amdgpu_ring_write(ring, adev->gfx.config.num_rbs == 1 ?
+                               0x00000000 : 0x00000002);
+               amdgpu_ring_write(ring, 0x00000000);
+               break;
        case CHIP_STONEY:
                amdgpu_ring_write(ring, 0x00000000);
                amdgpu_ring_write(ring, 0x00000000);
@@ -5725,6 +5751,7 @@ static void gfx_v8_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64 addr,
        amdgpu_ring_write(ring, PACKET3(PACKET3_EVENT_WRITE_EOP, 4));
        amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
                                 EOP_TC_ACTION_EN |
+                                EOP_TC_WB_ACTION_EN |
                                 EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) |
                                 EVENT_INDEX(5)));
        amdgpu_ring_write(ring, addr & 0xfffffffc);
index 460bc8a..825ccd6 100644 (file)
@@ -72,6 +72,11 @@ static int iceland_dpm_sw_init(void *handle)
 
 static int iceland_dpm_sw_fini(void *handle)
 {
+       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
+       release_firmware(adev->pm.fw);
+       adev->pm.fw = NULL;
+
        return 0;
 }
 
index 39bfc52..3b8906c 100644 (file)
@@ -103,7 +103,6 @@ static void iceland_ih_disable_interrupts(struct amdgpu_device *adev)
  */
 static int iceland_ih_irq_init(struct amdgpu_device *adev)
 {
-       int ret = 0;
        int rb_bufsz;
        u32 interrupt_cntl, ih_cntl, ih_rb_cntl;
        u64 wptr_off;
@@ -157,7 +156,7 @@ static int iceland_ih_irq_init(struct amdgpu_device *adev)
        /* enable interrupts */
        iceland_ih_enable_interrupts(adev);
 
-       return ret;
+       return 0;
 }
 
 /**
index b45f547..a789a86 100644 (file)
@@ -2252,7 +2252,7 @@ static void kv_apply_state_adjust_rules(struct amdgpu_device *adev,
        if (pi->caps_stable_p_state) {
                stable_p_state_sclk = (max_limits->sclk * 75) / 100;
 
-               for (i = table->count - 1; i >= 0; i++) {
+               for (i = table->count - 1; i >= 0; i--) {
                        if (stable_p_state_sclk >= table->entries[i].clk) {
                                stable_p_state_sclk = table->entries[i].clk;
                                break;
index f4c3130..b556bd0 100644 (file)
@@ -105,6 +105,15 @@ static void sdma_v2_4_init_golden_registers(struct amdgpu_device *adev)
        }
 }
 
+static void sdma_v2_4_free_microcode(struct amdgpu_device *adev)
+{
+       int i;
+       for (i = 0; i < adev->sdma.num_instances; i++) {
+               release_firmware(adev->sdma.instance[i].fw);
+               adev->sdma.instance[i].fw = NULL;
+       }
+}
+
 /**
  * sdma_v2_4_init_microcode - load ucode images from disk
  *
@@ -461,6 +470,8 @@ static int sdma_v2_4_gfx_resume(struct amdgpu_device *adev)
                /* Initialize the ring buffer's read and write pointers */
                WREG32(mmSDMA0_GFX_RB_RPTR + sdma_offsets[i], 0);
                WREG32(mmSDMA0_GFX_RB_WPTR + sdma_offsets[i], 0);
+               WREG32(mmSDMA0_GFX_IB_RPTR + sdma_offsets[i], 0);
+               WREG32(mmSDMA0_GFX_IB_OFFSET + sdma_offsets[i], 0);
 
                /* set the wb address whether it's enabled or not */
                WREG32(mmSDMA0_GFX_RB_RPTR_ADDR_HI + sdma_offsets[i],
@@ -489,7 +500,11 @@ static int sdma_v2_4_gfx_resume(struct amdgpu_device *adev)
                WREG32(mmSDMA0_GFX_IB_CNTL + sdma_offsets[i], ib_cntl);
 
                ring->ready = true;
+       }
 
+       sdma_v2_4_enable(adev, true);
+       for (i = 0; i < adev->sdma.num_instances; i++) {
+               ring = &adev->sdma.instance[i].ring;
                r = amdgpu_ring_test_ring(ring);
                if (r) {
                        ring->ready = false;
@@ -580,8 +595,8 @@ static int sdma_v2_4_start(struct amdgpu_device *adev)
                        return -EINVAL;
        }
 
-       /* unhalt the MEs */
-       sdma_v2_4_enable(adev, true);
+       /* halt the engine before programming */
+       sdma_v2_4_enable(adev, false);
 
        /* start the gfx rings and rlc compute queues */
        r = sdma_v2_4_gfx_resume(adev);
@@ -1012,6 +1027,7 @@ static int sdma_v2_4_sw_fini(void *handle)
        for (i = 0; i < adev->sdma.num_instances; i++)
                amdgpu_ring_fini(&adev->sdma.instance[i].ring);
 
+       sdma_v2_4_free_microcode(adev);
        return 0;
 }
 
index 063f08a..532ea88 100644 (file)
@@ -109,10 +109,12 @@ static const u32 fiji_mgcg_cgcg_init[] =
 static const u32 golden_settings_polaris11_a11[] =
 {
        mmSDMA0_CHICKEN_BITS, 0xfc910007, 0x00810007,
+       mmSDMA0_CLK_CTRL, 0xff000fff, 0x00000000,
        mmSDMA0_GFX_IB_CNTL, 0x800f0111, 0x00000100,
        mmSDMA0_RLC0_IB_CNTL, 0x800f0111, 0x00000100,
        mmSDMA0_RLC1_IB_CNTL, 0x800f0111, 0x00000100,
        mmSDMA1_CHICKEN_BITS, 0xfc910007, 0x00810007,
+       mmSDMA1_CLK_CTRL, 0xff000fff, 0x00000000,
        mmSDMA1_GFX_IB_CNTL, 0x800f0111, 0x00000100,
        mmSDMA1_RLC0_IB_CNTL, 0x800f0111, 0x00000100,
        mmSDMA1_RLC1_IB_CNTL, 0x800f0111, 0x00000100,
@@ -234,6 +236,15 @@ static void sdma_v3_0_init_golden_registers(struct amdgpu_device *adev)
        }
 }
 
+static void sdma_v3_0_free_microcode(struct amdgpu_device *adev)
+{
+       int i;
+       for (i = 0; i < adev->sdma.num_instances; i++) {
+               release_firmware(adev->sdma.instance[i].fw);
+               adev->sdma.instance[i].fw = NULL;
+       }
+}
+
 /**
  * sdma_v3_0_init_microcode - load ucode images from disk
  *
@@ -670,6 +681,8 @@ static int sdma_v3_0_gfx_resume(struct amdgpu_device *adev)
                /* Initialize the ring buffer's read and write pointers */
                WREG32(mmSDMA0_GFX_RB_RPTR + sdma_offsets[i], 0);
                WREG32(mmSDMA0_GFX_RB_WPTR + sdma_offsets[i], 0);
+               WREG32(mmSDMA0_GFX_IB_RPTR + sdma_offsets[i], 0);
+               WREG32(mmSDMA0_GFX_IB_OFFSET + sdma_offsets[i], 0);
 
                /* set the wb address whether it's enabled or not */
                WREG32(mmSDMA0_GFX_RB_RPTR_ADDR_HI + sdma_offsets[i],
@@ -709,7 +722,15 @@ static int sdma_v3_0_gfx_resume(struct amdgpu_device *adev)
                WREG32(mmSDMA0_GFX_IB_CNTL + sdma_offsets[i], ib_cntl);
 
                ring->ready = true;
+       }
+
+       /* unhalt the MEs */
+       sdma_v3_0_enable(adev, true);
+       /* enable sdma ring preemption */
+       sdma_v3_0_ctx_switch_enable(adev, true);
 
+       for (i = 0; i < adev->sdma.num_instances; i++) {
+               ring = &adev->sdma.instance[i].ring;
                r = amdgpu_ring_test_ring(ring);
                if (r) {
                        ring->ready = false;
@@ -802,10 +823,9 @@ static int sdma_v3_0_start(struct amdgpu_device *adev)
                }
        }
 
-       /* unhalt the MEs */
-       sdma_v3_0_enable(adev, true);
-       /* enable sdma ring preemption */
-       sdma_v3_0_ctx_switch_enable(adev, true);
+       /* disable the sdma engine before programming it */
+       sdma_v3_0_ctx_switch_enable(adev, false);
+       sdma_v3_0_enable(adev, false);
 
        /* start the gfx rings and rlc compute queues */
        r = sdma_v3_0_gfx_resume(adev);
@@ -1245,6 +1265,7 @@ static int sdma_v3_0_sw_fini(void *handle)
        for (i = 0; i < adev->sdma.num_instances; i++)
                amdgpu_ring_fini(&adev->sdma.instance[i].ring);
 
+       sdma_v3_0_free_microcode(adev);
        return 0;
 }
 
index b7615ce..f06f6f4 100644 (file)
@@ -71,6 +71,11 @@ static int tonga_dpm_sw_init(void *handle)
 
 static int tonga_dpm_sw_fini(void *handle)
 {
+       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
+       release_firmware(adev->pm.fw);
+       adev->pm.fw = NULL;
+
        return 0;
 }
 
index f036af9..c920558 100644 (file)
@@ -99,7 +99,6 @@ static void tonga_ih_disable_interrupts(struct amdgpu_device *adev)
  */
 static int tonga_ih_irq_init(struct amdgpu_device *adev)
 {
-       int ret = 0;
        int rb_bufsz;
        u32 interrupt_cntl, ih_rb_cntl, ih_doorbell_rtpr;
        u64 wptr_off;
@@ -165,7 +164,7 @@ static int tonga_ih_irq_init(struct amdgpu_device *adev)
        /* enable interrupts */
        tonga_ih_enable_interrupts(adev);
 
-       return ret;
+       return 0;
 }
 
 /**
index ac00579..7708d90 100644 (file)
@@ -242,13 +242,19 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
        pqm_uninit(&p->pqm);
 
        /* Iterate over all process device data structure and check
-        * if we should reset all wavefronts */
-       list_for_each_entry(pdd, &p->per_device_data, per_device_list)
+        * if we should delete debug managers and reset all wavefronts
+        */
+       list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
+               if ((pdd->dev->dbgmgr) &&
+                               (pdd->dev->dbgmgr->pasid == p->pasid))
+                       kfd_dbgmgr_destroy(pdd->dev->dbgmgr);
+
                if (pdd->reset_wavefronts) {
                        pr_warn("amdkfd: Resetting all wave fronts\n");
                        dbgdev_wave_reset_wavefronts(pdd->dev, p);
                        pdd->reset_wavefronts = false;
                }
+       }
 
        mutex_unlock(&p->mutex);
 
@@ -404,42 +410,52 @@ void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid)
 
        idx = srcu_read_lock(&kfd_processes_srcu);
 
+       /*
+        * Look for the process that matches the pasid. If there is no such
+        * process, we either released it in amdkfd's own notifier, or there
+        * is a bug. Unfortunately, there is no way to tell...
+        */
        hash_for_each_rcu(kfd_processes_table, i, p, kfd_processes)
-               if (p->pasid == pasid)
-                       break;
+               if (p->pasid == pasid) {
 
-       srcu_read_unlock(&kfd_processes_srcu, idx);
+                       srcu_read_unlock(&kfd_processes_srcu, idx);
 
-       BUG_ON(p->pasid != pasid);
+                       pr_debug("Unbinding process %d from IOMMU\n", pasid);
 
-       mutex_lock(&p->mutex);
+                       mutex_lock(&p->mutex);
 
-       if ((dev->dbgmgr) && (dev->dbgmgr->pasid == p->pasid))
-               kfd_dbgmgr_destroy(dev->dbgmgr);
+                       if ((dev->dbgmgr) && (dev->dbgmgr->pasid == p->pasid))
+                               kfd_dbgmgr_destroy(dev->dbgmgr);
 
-       pqm_uninit(&p->pqm);
+                       pqm_uninit(&p->pqm);
 
-       pdd = kfd_get_process_device_data(dev, p);
+                       pdd = kfd_get_process_device_data(dev, p);
 
-       if (!pdd) {
-               mutex_unlock(&p->mutex);
-               return;
-       }
+                       if (!pdd) {
+                               mutex_unlock(&p->mutex);
+                               return;
+                       }
 
-       if (pdd->reset_wavefronts) {
-               dbgdev_wave_reset_wavefronts(pdd->dev, p);
-               pdd->reset_wavefronts = false;
-       }
+                       if (pdd->reset_wavefronts) {
+                               dbgdev_wave_reset_wavefronts(pdd->dev, p);
+                               pdd->reset_wavefronts = false;
+                       }
 
-       /*
-        * Just mark pdd as unbound, because we still need it to call
-        * amd_iommu_unbind_pasid() in when the process exits.
-        * We don't call amd_iommu_unbind_pasid() here
-        * because the IOMMU called us.
-        */
-       pdd->bound = false;
+                       /*
+                        * Just mark pdd as unbound, because we still need it
+                        * to call amd_iommu_unbind_pasid() when the
+                        * process exits.
+                        * We don't call amd_iommu_unbind_pasid() here
+                        * because the IOMMU called us.
+                        */
+                       pdd->bound = false;
 
-       mutex_unlock(&p->mutex);
+                       mutex_unlock(&p->mutex);
+
+                       return;
+               }
+
+       srcu_read_unlock(&kfd_processes_srcu, idx);
 }
 
 struct kfd_process_device *kfd_get_first_process_device_data(struct kfd_process *p)
index 74909e7..884c96f 100644 (file)
@@ -666,7 +666,7 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
                        dev->node_props.simd_count);
 
        if (dev->mem_bank_count < dev->node_props.mem_banks_count) {
-               pr_warn("kfd: mem_banks_count truncated from %d to %d\n",
+               pr_info_once("kfd: mem_banks_count truncated from %d to %d\n",
                                dev->node_props.mem_banks_count,
                                dev->mem_bank_count);
                sysfs_show_32bit_prop(buffer, "mem_banks_count",
index 6080951..afce1ed 100644 (file)
@@ -157,6 +157,7 @@ struct amd_ip_funcs {
        int (*hw_init)(void *handle);
        /* tears down the hw state */
        int (*hw_fini)(void *handle);
+       void (*late_fini)(void *handle);
        /* handles IP specific hw/sw changes for suspend */
        int (*suspend)(void *handle);
        /* handles IP specific hw/sw changes for resume */
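
amdgpu_pp_late_fini above is registered against this new late_fini hook. A hypothetical IP block would wire it up as below; the names are illustrative only:

static void my_ip_late_fini(void *handle)
{
        /* teardown that must run after this block's hw_fini/sw_fini,
         * e.g. freeing objects that sw_fini-time code still touches */
}

static const struct amd_ip_funcs my_ip_funcs = {
        /* .early_init, .sw_init, .hw_init, .hw_fini, .sw_fini, ... */
        .late_fini = my_ip_late_fini,
};
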
index a461e15..7464daf 100644 (file)
@@ -581,6 +581,9 @@ typedef int (*cgs_get_firmware_info)(struct cgs_device *cgs_device,
                                     enum cgs_ucode_id type,
                                     struct cgs_firmware_info *info);
 
+typedef int (*cgs_rel_firmware)(struct cgs_device *cgs_device,
+                                        enum cgs_ucode_id type);
+
 typedef int(*cgs_set_powergating_state)(struct cgs_device *cgs_device,
                                  enum amd_ip_block_type block_type,
                                  enum amd_powergating_state state);
@@ -645,6 +648,7 @@ struct cgs_ops {
        cgs_set_camera_voltages_t set_camera_voltages;
        /* Firmware Info */
        cgs_get_firmware_info get_firmware_info;
+       cgs_rel_firmware rel_firmware;
        /* cg pg interface*/
        cgs_set_powergating_state set_powergating_state;
        cgs_set_clockgating_state set_clockgating_state;
@@ -738,6 +742,8 @@ struct cgs_device
        CGS_CALL(set_camera_voltages,dev,mask,voltages)
 #define cgs_get_firmware_info(dev, type, info) \
        CGS_CALL(get_firmware_info, dev, type, info)
+#define cgs_rel_firmware(dev, type)    \
+       CGS_CALL(rel_firmware, dev, type)
 #define cgs_set_powergating_state(dev, block_type, state)      \
        CGS_CALL(set_powergating_state, dev, block_type, state)
 #define cgs_set_clockgating_state(dev, block_type, state)      \
index 8e345bf..e629f8a 100644 (file)
@@ -73,11 +73,14 @@ static int pp_sw_init(void *handle)
 
        ret = hwmgr->hwmgr_func->backend_init(hwmgr);
        if (ret)
-               goto err;
+               goto err1;
 
        pr_info("amdgpu: powerplay initialized\n");
 
        return 0;
+err1:
+       if (hwmgr->pptable_func->pptable_fini)
+               hwmgr->pptable_func->pptable_fini(hwmgr);
 err:
        pr_err("amdgpu: powerplay initialization failed\n");
        return ret;
@@ -100,6 +103,9 @@ static int pp_sw_fini(void *handle)
        if (hwmgr->hwmgr_func->backend_fini != NULL)
                ret = hwmgr->hwmgr_func->backend_fini(hwmgr);
 
+       if (hwmgr->pptable_func->pptable_fini)
+               hwmgr->pptable_func->pptable_fini(hwmgr);
+
        return ret;
 }
 
index 46410e3..fb88e4e 100644 (file)
@@ -58,9 +58,6 @@ static void pem_fini(struct pp_eventmgr *eventmgr)
        pem_unregister_interrupts(eventmgr);
 
        pem_handle_event(eventmgr, AMD_PP_EVENT_UNINITIALIZE, &event_data);
-
-       if (eventmgr != NULL)
-               kfree(eventmgr);
 }
 
 int eventmgr_init(struct pp_instance *handle)
index c94f9fa..586f732 100644 (file)
@@ -1830,7 +1830,7 @@ static uint16_t fiji_find_closest_vddci(struct pp_hwmgr *hwmgr, uint16_t vddci)
 
        PP_ASSERT_WITH_CODE(false,
                        "VDDCI is larger than max VDDCI in VDDCI Voltage Table!",
-                       return vddci_table->entries[i].value);
+                       return vddci_table->entries[i-1].value);
 }
 
 static int fiji_get_dependency_volt_by_clk(struct pp_hwmgr *hwmgr,
@@ -3573,46 +3573,11 @@ static int fiji_force_dpm_highest(struct pp_hwmgr *hwmgr)
        return 0;
 }
 
-static void fiji_apply_dal_min_voltage_request(struct pp_hwmgr *hwmgr)
-{
-       struct phm_ppt_v1_information *table_info =
-                       (struct phm_ppt_v1_information *)hwmgr->pptable;
-       struct phm_clock_voltage_dependency_table *table =
-                               table_info->vddc_dep_on_dal_pwrl;
-       struct phm_ppt_v1_clock_voltage_dependency_table *vddc_table;
-       enum PP_DAL_POWERLEVEL dal_power_level = hwmgr->dal_power_level;
-       uint32_t req_vddc = 0, req_volt, i;
-
-       if (!table && !(dal_power_level >= PP_DAL_POWERLEVEL_ULTRALOW &&
-                       dal_power_level <= PP_DAL_POWERLEVEL_PERFORMANCE))
-               return;
-
-       for (i= 0; i < table->count; i++) {
-               if (dal_power_level == table->entries[i].clk) {
-                       req_vddc = table->entries[i].v;
-                       break;
-               }
-       }
-
-       vddc_table = table_info->vdd_dep_on_sclk;
-       for (i= 0; i < vddc_table->count; i++) {
-               if (req_vddc <= vddc_table->entries[i].vddc) {
-                       req_volt = (((uint32_t)vddc_table->entries[i].vddc) * VOLTAGE_SCALE)
-                                       << VDDC_SHIFT;
-                       smum_send_msg_to_smc_with_parameter(hwmgr->smumgr,
-                                       PPSMC_MSG_VddC_Request, req_volt);
-                       return;
-               }
-       }
-       printk(KERN_ERR "DAL requested level can not"
-                       " found a available voltage in VDDC DPM Table \n");
-}
-
 static int fiji_upload_dpmlevel_enable_mask(struct pp_hwmgr *hwmgr)
 {
        struct fiji_hwmgr *data = (struct fiji_hwmgr *)(hwmgr->backend);
 
-       fiji_apply_dal_min_voltage_request(hwmgr);
+       phm_apply_dal_min_voltage_request(hwmgr);
 
        if (!data->sclk_dpm_key_disabled) {
                if (data->dpm_level_enable_mask.sclk_dpm_enable_mask)
@@ -4349,7 +4314,7 @@ static int fiji_populate_and_upload_sclk_mclk_dpm_levels(
 
        if (data->need_update_smu7_dpm_table &
                        (DPMTABLE_OD_UPDATE_SCLK + DPMTABLE_UPDATE_SCLK)) {
-               result = fiji_populate_all_memory_levels(hwmgr);
+               result = fiji_populate_all_graphic_levels(hwmgr);
                PP_ASSERT_WITH_CODE((0 == result),
                                "Failed to populate SCLK during PopulateNewDPMClocksStates Function!",
                                return result);
@@ -5109,11 +5074,11 @@ static int fiji_get_pp_table(struct pp_hwmgr *hwmgr, char **table)
        struct fiji_hwmgr *data = (struct fiji_hwmgr *)(hwmgr->backend);
 
        if (!data->soft_pp_table) {
-               data->soft_pp_table = kzalloc(hwmgr->soft_pp_table_size, GFP_KERNEL);
+               data->soft_pp_table = kmemdup(hwmgr->soft_pp_table,
+                                             hwmgr->soft_pp_table_size,
+                                             GFP_KERNEL);
                if (!data->soft_pp_table)
                        return -ENOMEM;
-               memcpy(data->soft_pp_table, hwmgr->soft_pp_table,
-                               hwmgr->soft_pp_table_size);
        }
 
        *table = (char *)&data->soft_pp_table;
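
The same kzalloc()+memcpy() to kmemdup() conversion is repeated for the polaris10 and tonga soft_pp_table copies below. The idiom in isolation (generic names, not taken from this driver):

#include <linux/slab.h>
#include <linux/string.h>

static void *dup_soft_pp_table(const void *src, size_t size)
{
        /* kmemdup() allocates size bytes with the given GFP flags and copies
         * src into them, returning NULL on failure - the same net effect as
         * the removed kzalloc() + memcpy() pair, minus the redundant zeroing */
        return kmemdup(src, size, GFP_KERNEL);
}
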
index 7d69ed6..20f20e0 100644 (file)
@@ -30,6 +30,9 @@
 #include "pppcielanes.h"
 #include "pp_debug.h"
 #include "ppatomctrl.h"
+#include "ppsmc.h"
+
+#define VOLTAGE_SCALE               4
 
 extern int cz_hwmgr_init(struct pp_hwmgr *hwmgr);
 extern int tonga_hwmgr_init(struct pp_hwmgr *hwmgr);
@@ -90,6 +93,13 @@ int hwmgr_fini(struct pp_hwmgr *hwmgr)
        if (hwmgr == NULL || hwmgr->ps == NULL)
                return -EINVAL;
 
+       /* do hwmgr finish*/
+       kfree(hwmgr->backend);
+
+       kfree(hwmgr->start_thermal_controller.function_list);
+
+       kfree(hwmgr->set_temperature_range.function_list);
+
        kfree(hwmgr->ps);
        kfree(hwmgr);
        return 0;
@@ -459,7 +469,7 @@ uint16_t phm_find_closest_vddci(struct pp_atomctrl_voltage_table *vddci_table, u
 
        PP_ASSERT_WITH_CODE(false,
                        "VDDCI is larger than max VDDCI in VDDCI Voltage Table!",
-                       return vddci_table->entries[i].value);
+                       return vddci_table->entries[i-1].value);
 }
 
 int phm_find_boot_level(void *table,
@@ -566,3 +576,38 @@ uint32_t phm_get_lowest_enabled_level(struct pp_hwmgr *hwmgr, uint32_t mask)
 
        return level;
 }
+
+void phm_apply_dal_min_voltage_request(struct pp_hwmgr *hwmgr)
+{
+       struct phm_ppt_v1_information *table_info =
+                       (struct phm_ppt_v1_information *)hwmgr->pptable;
+       struct phm_clock_voltage_dependency_table *table =
+                               table_info->vddc_dep_on_dal_pwrl;
+       struct phm_ppt_v1_clock_voltage_dependency_table *vddc_table;
+       enum PP_DAL_POWERLEVEL dal_power_level = hwmgr->dal_power_level;
+       uint32_t req_vddc = 0, req_volt, i;
+
+       if (!table || table->count <= 0
+               || dal_power_level < PP_DAL_POWERLEVEL_ULTRALOW
+               || dal_power_level > PP_DAL_POWERLEVEL_PERFORMANCE)
+               return;
+
+       for (i = 0; i < table->count; i++) {
+               if (dal_power_level == table->entries[i].clk) {
+                       req_vddc = table->entries[i].v;
+                       break;
+               }
+       }
+
+       vddc_table = table_info->vdd_dep_on_sclk;
+       for (i = 0; i < vddc_table->count; i++) {
+               if (req_vddc <= vddc_table->entries[i].vddc) {
+                       req_volt = (((uint32_t)vddc_table->entries[i].vddc) * VOLTAGE_SCALE);
+                       smum_send_msg_to_smc_with_parameter(hwmgr->smumgr,
+                                       PPSMC_MSG_VddC_Request, req_volt);
+                       return;
+               }
+       }
+       printk(KERN_ERR "DAL requested level cannot"
+                       " find an available voltage in VDDC DPM Table\n");
+}
index 93768fa..aa6be03 100644 (file)
@@ -189,41 +189,6 @@ int phm_get_current_pcie_lane_number(struct pp_hwmgr *hwmgr)
        return decode_pcie_lane_width(link_width);
 }
 
-void phm_apply_dal_min_voltage_request(struct pp_hwmgr *hwmgr)
-{
-       struct phm_ppt_v1_information *table_info =
-                       (struct phm_ppt_v1_information *)hwmgr->pptable;
-       struct phm_clock_voltage_dependency_table *table =
-                               table_info->vddc_dep_on_dal_pwrl;
-       struct phm_ppt_v1_clock_voltage_dependency_table *vddc_table;
-       enum PP_DAL_POWERLEVEL dal_power_level = hwmgr->dal_power_level;
-       uint32_t req_vddc = 0, req_volt, i;
-
-       if (!table && !(dal_power_level >= PP_DAL_POWERLEVEL_ULTRALOW &&
-                       dal_power_level <= PP_DAL_POWERLEVEL_PERFORMANCE))
-               return;
-
-       for (i = 0; i < table->count; i++) {
-               if (dal_power_level == table->entries[i].clk) {
-                       req_vddc = table->entries[i].v;
-                       break;
-               }
-       }
-
-       vddc_table = table_info->vdd_dep_on_sclk;
-       for (i = 0; i < vddc_table->count; i++) {
-               if (req_vddc <= vddc_table->entries[i].vddc) {
-                       req_volt = (((uint32_t)vddc_table->entries[i].vddc) * VOLTAGE_SCALE)
-                                       << VDDC_SHIFT;
-                       smum_send_msg_to_smc_with_parameter(hwmgr->smumgr,
-                                       PPSMC_MSG_VddC_Request, req_volt);
-                       return;
-               }
-       }
-       printk(KERN_ERR "DAL requested level can not"
-                       " found a available voltage in VDDC DPM Table \n");
-}
-
 /**
 * Enable voltage control
 *
@@ -2091,7 +2056,7 @@ static int polaris10_init_smc_table(struct pp_hwmgr *hwmgr)
                                "Failed to populate Clock Stretcher Data Table!",
                                return result);
        }
-
+       table->CurrSclkPllRange = 0xff;
        table->GraphicsVoltageChangeEnable  = 1;
        table->GraphicsThermThrottleEnable  = 1;
        table->GraphicsInterval = 1;
@@ -2184,6 +2149,7 @@ static int polaris10_init_smc_table(struct pp_hwmgr *hwmgr)
        CONVERT_FROM_HOST_TO_SMC_UL(table->SmioMask1);
        CONVERT_FROM_HOST_TO_SMC_UL(table->SmioMask2);
        CONVERT_FROM_HOST_TO_SMC_UL(table->SclkStepSize);
+       CONVERT_FROM_HOST_TO_SMC_UL(table->CurrSclkPllRange);
        CONVERT_FROM_HOST_TO_SMC_US(table->TemperatureLimitHigh);
        CONVERT_FROM_HOST_TO_SMC_US(table->TemperatureLimitLow);
        CONVERT_FROM_HOST_TO_SMC_US(table->VoltageResponseTime);
@@ -4760,11 +4726,11 @@ static int polaris10_get_pp_table(struct pp_hwmgr *hwmgr, char **table)
        struct polaris10_hwmgr *data = (struct polaris10_hwmgr *)(hwmgr->backend);
 
        if (!data->soft_pp_table) {
-               data->soft_pp_table = kzalloc(hwmgr->soft_pp_table_size, GFP_KERNEL);
+               data->soft_pp_table = kmemdup(hwmgr->soft_pp_table,
+                                             hwmgr->soft_pp_table_size,
+                                             GFP_KERNEL);
                if (!data->soft_pp_table)
                        return -ENOMEM;
-               memcpy(data->soft_pp_table, hwmgr->soft_pp_table,
-                               hwmgr->soft_pp_table_size);
        }
 
        *table = (char *)&data->soft_pp_table;
index 0b99ab3..ae96f14 100644 (file)
@@ -286,7 +286,7 @@ int polaris10_populate_pm_fuses(struct pp_hwmgr *hwmgr)
 
                if (polaris10_copy_bytes_to_smc(hwmgr->smumgr, pm_fuse_table_offset,
                                (uint8_t *)&data->power_tune_table,
-                               sizeof(struct SMU74_Discrete_PmFuses), data->sram_end))
+                               (sizeof(struct SMU74_Discrete_PmFuses) - 92), data->sram_end))
                        PP_ASSERT_WITH_CODE(false,
                                        "Attempt to download PmFuseTable Failed!",
                                        return -EINVAL);
index 1faad92..d27e8c4 100644 (file)
@@ -2847,27 +2847,6 @@ static int tonga_setup_default_dpm_tables(struct pp_hwmgr *hwmgr)
                }
        }
 
-       /* Initialize Vddc DPM table based on allow Vddc values.  And populate corresponding std values. */
-       for (i = 0; i < allowed_vdd_sclk_table->count; i++) {
-               data->dpm_table.vddc_table.dpm_levels[i].value = allowed_vdd_mclk_table->entries[i].vddc;
-               /* tonga_hwmgr->dpm_table.VddcTable.dpm_levels[i].param1 = stdVoltageTable->entries[i].Leakage; */
-               /* param1 is for corresponding std voltage */
-               data->dpm_table.vddc_table.dpm_levels[i].enabled = 1;
-       }
-       data->dpm_table.vddc_table.count = allowed_vdd_sclk_table->count;
-
-       if (NULL != allowed_vdd_mclk_table) {
-               /* Initialize Vddci DPM table based on allow Mclk values */
-               for (i = 0; i < allowed_vdd_mclk_table->count; i++) {
-                       data->dpm_table.vdd_ci_table.dpm_levels[i].value = allowed_vdd_mclk_table->entries[i].vddci;
-                       data->dpm_table.vdd_ci_table.dpm_levels[i].enabled = 1;
-                       data->dpm_table.mvdd_table.dpm_levels[i].value = allowed_vdd_mclk_table->entries[i].mvdd;
-                       data->dpm_table.mvdd_table.dpm_levels[i].enabled = 1;
-               }
-               data->dpm_table.vdd_ci_table.count = allowed_vdd_mclk_table->count;
-               data->dpm_table.mvdd_table.count = allowed_vdd_mclk_table->count;
-       }
-
        /* setup PCIE gen speed levels*/
        tonga_setup_default_pcie_tables(hwmgr);
 
@@ -5331,7 +5310,7 @@ static int tonga_freeze_sclk_mclk_dpm(struct pp_hwmgr *hwmgr)
                (data->need_update_smu7_dpm_table &
                (DPMTABLE_OD_UPDATE_SCLK + DPMTABLE_UPDATE_SCLK))) {
                PP_ASSERT_WITH_CODE(
-                       true == tonga_is_dpm_running(hwmgr),
+                       0 == tonga_is_dpm_running(hwmgr),
                        "Trying to freeze SCLK DPM when DPM is disabled",
                        );
                PP_ASSERT_WITH_CODE(
@@ -5344,7 +5323,7 @@ static int tonga_freeze_sclk_mclk_dpm(struct pp_hwmgr *hwmgr)
        if ((0 == data->mclk_dpm_key_disabled) &&
                (data->need_update_smu7_dpm_table &
                 DPMTABLE_OD_UPDATE_MCLK)) {
-               PP_ASSERT_WITH_CODE(true == tonga_is_dpm_running(hwmgr),
+               PP_ASSERT_WITH_CODE(0 == tonga_is_dpm_running(hwmgr),
                        "Trying to freeze MCLK DPM when DPM is disabled",
                        );
                PP_ASSERT_WITH_CODE(
@@ -5445,7 +5424,7 @@ static int tonga_populate_and_upload_sclk_mclk_dpm_levels(struct pp_hwmgr *hwmgr
        }
 
        if (data->need_update_smu7_dpm_table & (DPMTABLE_OD_UPDATE_SCLK + DPMTABLE_UPDATE_SCLK)) {
-               result = tonga_populate_all_memory_levels(hwmgr);
+               result = tonga_populate_all_graphic_levels(hwmgr);
                PP_ASSERT_WITH_CODE((0 == result),
                        "Failed to populate SCLK during PopulateNewDPMClocksStates Function!",
                        return result);
@@ -5647,7 +5626,7 @@ static int tonga_unfreeze_sclk_mclk_dpm(struct pp_hwmgr *hwmgr)
                (data->need_update_smu7_dpm_table &
                (DPMTABLE_OD_UPDATE_SCLK + DPMTABLE_UPDATE_SCLK))) {
 
-               PP_ASSERT_WITH_CODE(true == tonga_is_dpm_running(hwmgr),
+               PP_ASSERT_WITH_CODE(0 == tonga_is_dpm_running(hwmgr),
                        "Trying to Unfreeze SCLK DPM when DPM is disabled",
                        );
                PP_ASSERT_WITH_CODE(
@@ -5661,7 +5640,7 @@ static int tonga_unfreeze_sclk_mclk_dpm(struct pp_hwmgr *hwmgr)
                (data->need_update_smu7_dpm_table & DPMTABLE_OD_UPDATE_MCLK)) {
 
                PP_ASSERT_WITH_CODE(
-                               true == tonga_is_dpm_running(hwmgr),
+                               0 == tonga_is_dpm_running(hwmgr),
                                "Trying to Unfreeze MCLK DPM when DPM is disabled",
                                );
                PP_ASSERT_WITH_CODE(
@@ -6056,11 +6035,11 @@ static int tonga_get_pp_table(struct pp_hwmgr *hwmgr, char **table)
        struct tonga_hwmgr *data = (struct tonga_hwmgr *)(hwmgr->backend);
 
        if (!data->soft_pp_table) {
-               data->soft_pp_table = kzalloc(hwmgr->soft_pp_table_size, GFP_KERNEL);
+               data->soft_pp_table = kmemdup(hwmgr->soft_pp_table,
+                                             hwmgr->soft_pp_table_size,
+                                             GFP_KERNEL);
                if (!data->soft_pp_table)
                        return -ENOMEM;
-               memcpy(data->soft_pp_table, hwmgr->soft_pp_table,
-                               hwmgr->soft_pp_table_size);
        }
 
        *table = (char *)&data->soft_pp_table;
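The hunk above swaps an open-coded allocate-then-copy for kmemdup(). As a minimal sketch of the idiom (dst, src and len are placeholder names, not taken from the patch):

	/* Open-coded: zero-allocate, then copy the source buffer over it. */
	dst = kzalloc(len, GFP_KERNEL);
	if (!dst)
		return -ENOMEM;
	memcpy(dst, src, len);

	/* Equivalent single call: kmemdup() allocates and copies in one step. */
	dst = kmemdup(src, len, GFP_KERNEL);
	if (!dst)
		return -ENOMEM;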
index 10e3630..296ec7e 100644 (file)
@@ -1040,48 +1040,44 @@ int tonga_pp_tables_uninitialize(struct pp_hwmgr *hwmgr)
        struct phm_ppt_v1_information *pp_table_information =
                (struct phm_ppt_v1_information *)(hwmgr->pptable);
 
-       if (NULL != hwmgr->soft_pp_table) {
-               kfree(hwmgr->soft_pp_table);
+       if (NULL != hwmgr->soft_pp_table)
                hwmgr->soft_pp_table = NULL;
-       }
 
-       if (NULL != pp_table_information->vdd_dep_on_sclk)
-               pp_table_information->vdd_dep_on_sclk = NULL;
+       kfree(pp_table_information->vdd_dep_on_sclk);
+       pp_table_information->vdd_dep_on_sclk = NULL;
 
-       if (NULL != pp_table_information->vdd_dep_on_mclk)
-               pp_table_information->vdd_dep_on_mclk = NULL;
+       kfree(pp_table_information->vdd_dep_on_mclk);
+       pp_table_information->vdd_dep_on_mclk = NULL;
 
-       if (NULL != pp_table_information->valid_mclk_values)
-               pp_table_information->valid_mclk_values = NULL;
+       kfree(pp_table_information->valid_mclk_values);
+       pp_table_information->valid_mclk_values = NULL;
 
-       if (NULL != pp_table_information->valid_sclk_values)
-               pp_table_information->valid_sclk_values = NULL;
+       kfree(pp_table_information->valid_sclk_values);
+       pp_table_information->valid_sclk_values = NULL;
 
-       if (NULL != pp_table_information->vddc_lookup_table)
-               pp_table_information->vddc_lookup_table = NULL;
+       kfree(pp_table_information->vddc_lookup_table);
+       pp_table_information->vddc_lookup_table = NULL;
 
-       if (NULL != pp_table_information->vddgfx_lookup_table)
-               pp_table_information->vddgfx_lookup_table = NULL;
+       kfree(pp_table_information->vddgfx_lookup_table);
+       pp_table_information->vddgfx_lookup_table = NULL;
 
-       if (NULL != pp_table_information->mm_dep_table)
-               pp_table_information->mm_dep_table = NULL;
+       kfree(pp_table_information->mm_dep_table);
+       pp_table_information->mm_dep_table = NULL;
 
-       if (NULL != pp_table_information->cac_dtp_table)
-               pp_table_information->cac_dtp_table = NULL;
+       kfree(pp_table_information->cac_dtp_table);
+       pp_table_information->cac_dtp_table = NULL;
 
-       if (NULL != hwmgr->dyn_state.cac_dtp_table)
-               hwmgr->dyn_state.cac_dtp_table = NULL;
+       kfree(hwmgr->dyn_state.cac_dtp_table);
+       hwmgr->dyn_state.cac_dtp_table = NULL;
 
-       if (NULL != pp_table_information->ppm_parameter_table)
-               pp_table_information->ppm_parameter_table = NULL;
+       kfree(pp_table_information->ppm_parameter_table);
+       pp_table_information->ppm_parameter_table = NULL;
 
-       if (NULL != pp_table_information->pcie_table)
-               pp_table_information->pcie_table = NULL;
+       kfree(pp_table_information->pcie_table);
+       pp_table_information->pcie_table = NULL;
 
-       if (NULL != hwmgr->pptable) {
-               kfree(hwmgr->pptable);
-               hwmgr->pptable = NULL;
-       }
+       kfree(hwmgr->pptable);
+       hwmgr->pptable = NULL;
 
        return result;
 }
index fd4ce7a..28f5714 100644 (file)
@@ -673,7 +673,7 @@ extern int phm_get_sclk_for_voltage_evv(struct pp_hwmgr *hwmgr, phm_ppt_v1_volta
 extern int phm_initializa_dynamic_state_adjustment_rule_settings(struct pp_hwmgr *hwmgr);
 extern int phm_hwmgr_backend_fini(struct pp_hwmgr *hwmgr);
 extern uint32_t phm_get_lowest_enabled_level(struct pp_hwmgr *hwmgr, uint32_t mask);
-
+extern void phm_apply_dal_min_voltage_request(struct pp_hwmgr *hwmgr);
 
 #define PHM_ENTIRE_REGISTER_MASK 0xFFFFFFFFU
 
index da18f44..87c023e 100644 (file)
@@ -639,7 +639,7 @@ static int cz_smu_populate_firmware_entries(struct pp_smumgr *smumgr)
 
        cz_smu->driver_buffer_length = 0;
 
-       for (i = 0; i < sizeof(firmware_list)/sizeof(*firmware_list); i++) {
+       for (i = 0; i < ARRAY_SIZE(firmware_list); i++) {
 
                firmware_type = cz_translate_firmware_enum_to_arg(smumgr,
                                        firmware_list[i]);
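For reference, the kernel's ARRAY_SIZE() macro computes the same bound the removed line spelled out by hand, essentially sizeof(arr) / sizeof((arr)[0]), and in current kernels it also carries a compile-time check that its argument is a real array rather than a pointer, so the loop count cannot silently go wrong if firmware_list ever changes type.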
index 673a75c..8e52a2e 100644 (file)
@@ -1006,10 +1006,16 @@ static int fiji_smu_init(struct pp_smumgr *smumgr)
 
 static int fiji_smu_fini(struct pp_smumgr *smumgr)
 {
+       struct fiji_smumgr *priv = (struct fiji_smumgr *)(smumgr->backend);
+
+       smu_free_memory(smumgr->device, (void *)priv->header_buffer.handle);
+
        if (smumgr->backend) {
                kfree(smumgr->backend);
                smumgr->backend = NULL;
        }
+
+       cgs_rel_firmware(smumgr->device, CGS_UCODE_ID_SMU);
        return 0;
 }
 
index de618ea..043b6ac 100644 (file)
@@ -469,6 +469,7 @@ int polaris10_smu_fini(struct pp_smumgr *smumgr)
                kfree(smumgr->backend);
                smumgr->backend = NULL;
        }
+       cgs_rel_firmware(smumgr->device, CGS_UCODE_ID_SMU);
        return 0;
 }
 
index c483baf..0728c1e 100644 (file)
@@ -81,6 +81,7 @@ int smum_init(struct amd_pp_init *pp_init, struct pp_instance *handle)
 
 int smum_fini(struct pp_smumgr *smumgr)
 {
+       kfree(smumgr->device);
        kfree(smumgr);
        return 0;
 }
index 32820b6..b22722e 100644 (file)
@@ -328,10 +328,17 @@ int tonga_write_smc_sram_dword(struct pp_smumgr *smumgr,
 
 static int tonga_smu_fini(struct pp_smumgr *smumgr)
 {
+       struct tonga_smumgr *priv = (struct tonga_smumgr *)(smumgr->backend);
+
+       smu_free_memory(smumgr->device, (void *)priv->smu_buffer.handle);
+       smu_free_memory(smumgr->device, (void *)priv->header_buffer.handle);
+
        if (smumgr->backend != NULL) {
                kfree(smumgr->backend);
                smumgr->backend = NULL;
        }
+
+       cgs_rel_firmware(smumgr->device, CGS_UCODE_ID_SMU);
        return 0;
 }
 
index fef1b04..0813c2f 100644 (file)
  *
  */
 
+static void hdlcd_crtc_cleanup(struct drm_crtc *crtc)
+{
+       struct hdlcd_drm_private *hdlcd = crtc_to_hdlcd_priv(crtc);
+
+       /* stop the controller on cleanup */
+       hdlcd_write(hdlcd, HDLCD_REG_COMMAND, 0);
+       drm_crtc_cleanup(crtc);
+}
+
 static const struct drm_crtc_funcs hdlcd_crtc_funcs = {
-       .destroy = drm_crtc_cleanup,
+       .destroy = hdlcd_crtc_cleanup,
        .set_config = drm_atomic_helper_set_config,
        .page_flip = drm_atomic_helper_page_flip,
        .reset = drm_atomic_helper_crtc_reset,
@@ -97,7 +106,7 @@ static void hdlcd_crtc_mode_set_nofb(struct drm_crtc *crtc)
        struct hdlcd_drm_private *hdlcd = crtc_to_hdlcd_priv(crtc);
        struct drm_display_mode *m = &crtc->state->adjusted_mode;
        struct videomode vm;
-       unsigned int polarities, line_length, err;
+       unsigned int polarities, err;
 
        vm.vfront_porch = m->crtc_vsync_start - m->crtc_vdisplay;
        vm.vback_porch = m->crtc_vtotal - m->crtc_vsync_end;
@@ -113,23 +122,18 @@ static void hdlcd_crtc_mode_set_nofb(struct drm_crtc *crtc)
        if (m->flags & DRM_MODE_FLAG_PVSYNC)
                polarities |= HDLCD_POLARITY_VSYNC;
 
-       line_length = crtc->primary->state->fb->pitches[0];
-
        /* Allow max number of outstanding requests and largest burst size */
        hdlcd_write(hdlcd, HDLCD_REG_BUS_OPTIONS,
                    HDLCD_BUS_MAX_OUTSTAND | HDLCD_BUS_BURST_16);
 
-       hdlcd_write(hdlcd, HDLCD_REG_FB_LINE_LENGTH, line_length);
-       hdlcd_write(hdlcd, HDLCD_REG_FB_LINE_PITCH, line_length);
-       hdlcd_write(hdlcd, HDLCD_REG_FB_LINE_COUNT, m->crtc_vdisplay - 1);
        hdlcd_write(hdlcd, HDLCD_REG_V_DATA, m->crtc_vdisplay - 1);
        hdlcd_write(hdlcd, HDLCD_REG_V_BACK_PORCH, vm.vback_porch - 1);
        hdlcd_write(hdlcd, HDLCD_REG_V_FRONT_PORCH, vm.vfront_porch - 1);
        hdlcd_write(hdlcd, HDLCD_REG_V_SYNC, vm.vsync_len - 1);
+       hdlcd_write(hdlcd, HDLCD_REG_H_DATA, m->crtc_hdisplay - 1);
        hdlcd_write(hdlcd, HDLCD_REG_H_BACK_PORCH, vm.hback_porch - 1);
        hdlcd_write(hdlcd, HDLCD_REG_H_FRONT_PORCH, vm.hfront_porch - 1);
        hdlcd_write(hdlcd, HDLCD_REG_H_SYNC, vm.hsync_len - 1);
-       hdlcd_write(hdlcd, HDLCD_REG_H_DATA, m->crtc_hdisplay - 1);
        hdlcd_write(hdlcd, HDLCD_REG_POLARITIES, polarities);
 
        err = hdlcd_set_pxl_fmt(crtc);
@@ -144,20 +148,19 @@ static void hdlcd_crtc_enable(struct drm_crtc *crtc)
        struct hdlcd_drm_private *hdlcd = crtc_to_hdlcd_priv(crtc);
 
        clk_prepare_enable(hdlcd->clk);
+       hdlcd_crtc_mode_set_nofb(crtc);
        hdlcd_write(hdlcd, HDLCD_REG_COMMAND, 1);
-       drm_crtc_vblank_on(crtc);
 }
 
 static void hdlcd_crtc_disable(struct drm_crtc *crtc)
 {
        struct hdlcd_drm_private *hdlcd = crtc_to_hdlcd_priv(crtc);
 
-       if (!crtc->primary->fb)
+       if (!crtc->state->active)
                return;
 
-       clk_disable_unprepare(hdlcd->clk);
        hdlcd_write(hdlcd, HDLCD_REG_COMMAND, 0);
-       drm_crtc_vblank_off(crtc);
+       clk_disable_unprepare(hdlcd->clk);
 }
 
 static int hdlcd_crtc_atomic_check(struct drm_crtc *crtc,
@@ -179,20 +182,17 @@ static int hdlcd_crtc_atomic_check(struct drm_crtc *crtc,
 static void hdlcd_crtc_atomic_begin(struct drm_crtc *crtc,
                                    struct drm_crtc_state *state)
 {
-       struct hdlcd_drm_private *hdlcd = crtc_to_hdlcd_priv(crtc);
-       unsigned long flags;
-
-       if (crtc->state->event) {
-               struct drm_pending_vblank_event *event = crtc->state->event;
+       struct drm_pending_vblank_event *event = crtc->state->event;
 
+       if (event) {
                crtc->state->event = NULL;
-               event->pipe = drm_crtc_index(crtc);
-
-               WARN_ON(drm_crtc_vblank_get(crtc) != 0);
 
-               spin_lock_irqsave(&crtc->dev->event_lock, flags);
-               list_add_tail(&event->base.link, &hdlcd->event_list);
-               spin_unlock_irqrestore(&crtc->dev->event_lock, flags);
+               spin_lock_irq(&crtc->dev->event_lock);
+               if (drm_crtc_vblank_get(crtc) == 0)
+                       drm_crtc_arm_vblank_event(crtc, event);
+               else
+                       drm_crtc_send_vblank_event(crtc, event);
+               spin_unlock_irq(&crtc->dev->event_lock);
        }
 }
 
@@ -225,6 +225,15 @@ static const struct drm_crtc_helper_funcs hdlcd_crtc_helper_funcs = {
 static int hdlcd_plane_atomic_check(struct drm_plane *plane,
                                    struct drm_plane_state *state)
 {
+       u32 src_w, src_h;
+
+       src_w = state->src_w >> 16;
+       src_h = state->src_h >> 16;
+
+       /* we can't do any scaling of the plane source */
+       if ((src_w != state->crtc_w) || (src_h != state->crtc_h))
+               return -EINVAL;
+
        return 0;
 }
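The src_w/src_h fields of a plane state are 16.16 fixed-point, so the >> 16 in the check above extracts the integer pixel count before comparing it with the integer crtc_w/crtc_h; for example, a 1920-pixel-wide source is stored as 1920 << 16 = 125829120, and 125829120 >> 16 gives back 1920. Any mismatch between source and CRTC size would require scaling, which this hardware cannot do, hence the -EINVAL.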
 
@@ -233,20 +242,31 @@ static void hdlcd_plane_atomic_update(struct drm_plane *plane,
 {
        struct hdlcd_drm_private *hdlcd;
        struct drm_gem_cma_object *gem;
+       unsigned int depth, bpp;
+       u32 src_w, src_h, dest_w, dest_h;
        dma_addr_t scanout_start;
 
-       if (!plane->state->crtc || !plane->state->fb)
+       if (!plane->state->fb)
                return;
 
-       hdlcd = crtc_to_hdlcd_priv(plane->state->crtc);
+       drm_fb_get_bpp_depth(plane->state->fb->pixel_format, &depth, &bpp);
+       src_w = plane->state->src_w >> 16;
+       src_h = plane->state->src_h >> 16;
+       dest_w = plane->state->crtc_w;
+       dest_h = plane->state->crtc_h;
        gem = drm_fb_cma_get_gem_obj(plane->state->fb, 0);
-       scanout_start = gem->paddr;
+       scanout_start = gem->paddr + plane->state->fb->offsets[0] +
+               plane->state->crtc_y * plane->state->fb->pitches[0] +
+               plane->state->crtc_x * bpp / 8;
+
+       hdlcd = plane->dev->dev_private;
+       hdlcd_write(hdlcd, HDLCD_REG_FB_LINE_LENGTH, plane->state->fb->pitches[0]);
+       hdlcd_write(hdlcd, HDLCD_REG_FB_LINE_PITCH, plane->state->fb->pitches[0]);
+       hdlcd_write(hdlcd, HDLCD_REG_FB_LINE_COUNT, dest_h - 1);
        hdlcd_write(hdlcd, HDLCD_REG_FB_BASE, scanout_start);
 }
 
 static const struct drm_plane_helper_funcs hdlcd_plane_helper_funcs = {
-       .prepare_fb = NULL,
-       .cleanup_fb = NULL,
        .atomic_check = hdlcd_plane_atomic_check,
        .atomic_update = hdlcd_plane_atomic_update,
 };
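To make the scanout address arithmetic in hdlcd_plane_atomic_update() above concrete, with illustrative values not taken from the patch: for an XRGB8888 framebuffer (32 bits per pixel) with pitches[0] = 7680, offsets[0] = 0, crtc_x = 16 and crtc_y = 10, the computation gives scanout_start = paddr + 0 + 10 * 7680 + 16 * 32 / 8 = paddr + 76864, i.e. the byte address of the first visible pixel.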
@@ -294,16 +314,6 @@ static struct drm_plane *hdlcd_plane_init(struct drm_device *drm)
        return plane;
 }
 
-void hdlcd_crtc_suspend(struct drm_crtc *crtc)
-{
-       hdlcd_crtc_disable(crtc);
-}
-
-void hdlcd_crtc_resume(struct drm_crtc *crtc)
-{
-       hdlcd_crtc_enable(crtc);
-}
-
 int hdlcd_setup_crtc(struct drm_device *drm)
 {
        struct hdlcd_drm_private *hdlcd = drm->dev_private;
index b987c63..a6ca36f 100644 (file)
@@ -49,8 +49,6 @@ static int hdlcd_load(struct drm_device *drm, unsigned long flags)
        atomic_set(&hdlcd->dma_end_count, 0);
 #endif
 
-       INIT_LIST_HEAD(&hdlcd->event_list);
-
        res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
        hdlcd->mmio = devm_ioremap_resource(drm->dev, res);
        if (IS_ERR(hdlcd->mmio)) {
@@ -84,11 +82,7 @@ static int hdlcd_load(struct drm_device *drm, unsigned long flags)
                goto setup_fail;
        }
 
-       pm_runtime_enable(drm->dev);
-
-       pm_runtime_get_sync(drm->dev);
        ret = drm_irq_install(drm, platform_get_irq(pdev, 0));
-       pm_runtime_put_sync(drm->dev);
        if (ret < 0) {
                DRM_ERROR("failed to install IRQ handler\n");
                goto irq_fail;
@@ -164,24 +158,9 @@ static irqreturn_t hdlcd_irq(int irq, void *arg)
                atomic_inc(&hdlcd->vsync_count);
 
 #endif
-       if (irq_status & HDLCD_INTERRUPT_VSYNC) {
-               bool events_sent = false;
-               unsigned long flags;
-               struct drm_pending_vblank_event *e, *t;
-
+       if (irq_status & HDLCD_INTERRUPT_VSYNC)
                drm_crtc_handle_vblank(&hdlcd->crtc);
 
-               spin_lock_irqsave(&drm->event_lock, flags);
-               list_for_each_entry_safe(e, t, &hdlcd->event_list, base.link) {
-                       list_del(&e->base.link);
-                       drm_crtc_send_vblank_event(&hdlcd->crtc, e);
-                       events_sent = true;
-               }
-               if (events_sent)
-                       drm_crtc_vblank_put(&hdlcd->crtc);
-               spin_unlock_irqrestore(&drm->event_lock, flags);
-       }
-
        /* acknowledge interrupt(s) */
        hdlcd_write(hdlcd, HDLCD_REG_INT_CLEAR, irq_status);
 
@@ -275,6 +254,7 @@ static int hdlcd_show_pxlclock(struct seq_file *m, void *arg)
 static struct drm_info_list hdlcd_debugfs_list[] = {
        { "interrupt_count", hdlcd_show_underrun_count, 0 },
        { "clocks", hdlcd_show_pxlclock, 0 },
+       { "fb", drm_fb_cma_debugfs_show, 0 },
 };
 
 static int hdlcd_debugfs_init(struct drm_minor *minor)
@@ -357,6 +337,8 @@ static int hdlcd_drm_bind(struct device *dev)
                return -ENOMEM;
 
        drm->dev_private = hdlcd;
+       dev_set_drvdata(dev, drm);
+
        hdlcd_setup_mode_config(drm);
        ret = hdlcd_load(drm, 0);
        if (ret)
@@ -366,14 +348,18 @@ static int hdlcd_drm_bind(struct device *dev)
        if (ret)
                goto err_unload;
 
-       dev_set_drvdata(dev, drm);
-
        ret = component_bind_all(dev, drm);
        if (ret) {
                DRM_ERROR("Failed to bind all components\n");
                goto err_unregister;
        }
 
+       ret = pm_runtime_set_active(dev);
+       if (ret)
+               goto err_pm_active;
+
+       pm_runtime_enable(dev);
+
        ret = drm_vblank_init(drm, drm->mode_config.num_crtc);
        if (ret < 0) {
                DRM_ERROR("failed to initialise vblank\n");
@@ -399,16 +385,16 @@ err_fbdev:
        drm_mode_config_cleanup(drm);
        drm_vblank_cleanup(drm);
 err_vblank:
+       pm_runtime_disable(drm->dev);
+err_pm_active:
        component_unbind_all(dev, drm);
 err_unregister:
        drm_dev_unregister(drm);
 err_unload:
-       pm_runtime_get_sync(drm->dev);
        drm_irq_uninstall(drm);
-       pm_runtime_put_sync(drm->dev);
-       pm_runtime_disable(drm->dev);
        of_reserved_mem_device_release(drm->dev);
 err_free:
+       dev_set_drvdata(dev, NULL);
        drm_dev_unref(drm);
 
        return ret;
@@ -495,30 +481,34 @@ MODULE_DEVICE_TABLE(of, hdlcd_of_match);
 static int __maybe_unused hdlcd_pm_suspend(struct device *dev)
 {
        struct drm_device *drm = dev_get_drvdata(dev);
-       struct drm_crtc *crtc;
+       struct hdlcd_drm_private *hdlcd = drm ? drm->dev_private : NULL;
 
-       if (pm_runtime_suspended(dev))
+       if (!hdlcd)
                return 0;
 
-       drm_modeset_lock_all(drm);
-       list_for_each_entry(crtc, &drm->mode_config.crtc_list, head)
-               hdlcd_crtc_suspend(crtc);
-       drm_modeset_unlock_all(drm);
+       drm_kms_helper_poll_disable(drm);
+
+       hdlcd->state = drm_atomic_helper_suspend(drm);
+       if (IS_ERR(hdlcd->state)) {
+               drm_kms_helper_poll_enable(drm);
+               return PTR_ERR(hdlcd->state);
+       }
+
        return 0;
 }
 
 static int __maybe_unused hdlcd_pm_resume(struct device *dev)
 {
        struct drm_device *drm = dev_get_drvdata(dev);
-       struct drm_crtc *crtc;
+       struct hdlcd_drm_private *hdlcd = drm ? drm->dev_private : NULL;
 
-       if (!pm_runtime_suspended(dev))
+       if (!hdlcd)
                return 0;
 
-       drm_modeset_lock_all(drm);
-       list_for_each_entry(crtc, &drm->mode_config.crtc_list, head)
-               hdlcd_crtc_resume(crtc);
-       drm_modeset_unlock_all(drm);
+       drm_atomic_helper_resume(drm, hdlcd->state);
+       drm_kms_helper_poll_enable(drm);
+       pm_runtime_set_active(dev);
+
        return 0;
 }
 
index aa23478..e3950a0 100644 (file)
@@ -9,10 +9,9 @@ struct hdlcd_drm_private {
        void __iomem                    *mmio;
        struct clk                      *clk;
        struct drm_fbdev_cma            *fbdev;
-       struct drm_framebuffer          *fb;
-       struct list_head                event_list;
        struct drm_crtc                 crtc;
        struct drm_plane                *plane;
+       struct drm_atomic_state         *state;
 #ifdef CONFIG_DEBUG_FS
        atomic_t buffer_underrun_count;
        atomic_t bus_error_count;
@@ -36,7 +35,5 @@ static inline u32 hdlcd_read(struct hdlcd_drm_private *hdlcd, unsigned int reg)
 
 int hdlcd_setup_crtc(struct drm_device *dev);
 void hdlcd_set_scanout(struct hdlcd_drm_private *hdlcd);
-void hdlcd_crtc_suspend(struct drm_crtc *crtc);
-void hdlcd_crtc_resume(struct drm_crtc *crtc);
 
 #endif /* __HDLCD_DRV_H__ */
index cf23a75..bd12231 100644 (file)
@@ -391,12 +391,11 @@ void atmel_hlcdc_crtc_reset(struct drm_crtc *crtc)
 {
        struct atmel_hlcdc_crtc_state *state;
 
-       if (crtc->state && crtc->state->mode_blob)
-               drm_property_unreference_blob(crtc->state->mode_blob);
-
        if (crtc->state) {
+               __drm_atomic_helper_crtc_destroy_state(crtc->state);
                state = drm_crtc_state_to_atmel_hlcdc_crtc_state(crtc->state);
                kfree(state);
+               crtc->state = NULL;
        }
 
        state = kzalloc(sizeof(*state), GFP_KERNEL);
@@ -415,8 +414,9 @@ atmel_hlcdc_crtc_duplicate_state(struct drm_crtc *crtc)
                return NULL;
 
        state = kmalloc(sizeof(*state), GFP_KERNEL);
-       if (state)
-               __drm_atomic_helper_crtc_duplicate_state(crtc, &state->base);
+       if (!state)
+               return NULL;
+       __drm_atomic_helper_crtc_duplicate_state(crtc, &state->base);
 
        cur = drm_crtc_state_to_atmel_hlcdc_crtc_state(crtc->state);
        state->output_mode = cur->output_mode;
index 3ff1ed7..c204ef3 100644 (file)
@@ -351,6 +351,8 @@ int drm_atomic_set_mode_prop_for_crtc(struct drm_crtc_state *state,
        drm_property_unreference_blob(state->mode_blob);
        state->mode_blob = NULL;
 
+       memset(&state->mode, 0, sizeof(state->mode));
+
        if (blob) {
                if (blob->length != sizeof(struct drm_mode_modeinfo) ||
                    drm_mode_convert_umode(&state->mode,
@@ -363,7 +365,6 @@ int drm_atomic_set_mode_prop_for_crtc(struct drm_crtc_state *state,
                DRM_DEBUG_ATOMIC("Set [MODE:%s] for CRTC state %p\n",
                                 state->mode.name, state);
        } else {
-               memset(&state->mode, 0, sizeof(state->mode));
                state->enable = false;
                DRM_DEBUG_ATOMIC("Set [NOMODE] for CRTC state %p\n",
                                 state);
index d2a6d95..0e3cc66 100644 (file)
@@ -2821,8 +2821,6 @@ int drm_mode_setcrtc(struct drm_device *dev, void *data,
                        goto out;
                }
 
-               drm_mode_set_crtcinfo(mode, CRTC_INTERLACE_HALVE_V);
-
                /*
                 * Check whether the primary plane supports the fb pixel format.
                 * Drivers not implementing the universal planes API use a
@@ -4841,7 +4839,8 @@ bool drm_property_change_valid_get(struct drm_property *property,
                if (value == 0)
                        return true;
 
-               return _object_find(property->dev, value, property->values[0]) != NULL;
+               *ref = _object_find(property->dev, value, property->values[0]);
+               return *ref != NULL;
        }
 
        for (i = 0; i < property->num_values; i++)
index a6e4243..26feb2f 100644 (file)
@@ -528,11 +528,11 @@ drm_crtc_helper_disable(struct drm_crtc *crtc)
 int drm_crtc_helper_set_config(struct drm_mode_set *set)
 {
        struct drm_device *dev;
-       struct drm_crtc *new_crtc;
-       struct drm_encoder *save_encoders, *new_encoder, *encoder;
+       struct drm_crtc **save_encoder_crtcs, *new_crtc;
+       struct drm_encoder **save_connector_encoders, *new_encoder, *encoder;
        bool mode_changed = false; /* if true do a full mode set */
        bool fb_changed = false; /* if true and !mode_changed just do a flip */
-       struct drm_connector *save_connectors, *connector;
+       struct drm_connector *connector;
        int count = 0, ro, fail = 0;
        const struct drm_crtc_helper_funcs *crtc_funcs;
        struct drm_mode_set save_set;
@@ -574,15 +574,15 @@ int drm_crtc_helper_set_config(struct drm_mode_set *set)
         * Allocate space for the backup of all (non-pointer) encoder and
         * connector data.
         */
-       save_encoders = kzalloc(dev->mode_config.num_encoder *
-                               sizeof(struct drm_encoder), GFP_KERNEL);
-       if (!save_encoders)
+       save_encoder_crtcs = kzalloc(dev->mode_config.num_encoder *
+                               sizeof(struct drm_crtc *), GFP_KERNEL);
+       if (!save_encoder_crtcs)
                return -ENOMEM;
 
-       save_connectors = kzalloc(dev->mode_config.num_connector *
-                               sizeof(struct drm_connector), GFP_KERNEL);
-       if (!save_connectors) {
-               kfree(save_encoders);
+       save_connector_encoders = kzalloc(dev->mode_config.num_connector *
+                               sizeof(struct drm_encoder *), GFP_KERNEL);
+       if (!save_connector_encoders) {
+               kfree(save_encoder_crtcs);
                return -ENOMEM;
        }
 
@@ -593,12 +593,12 @@ int drm_crtc_helper_set_config(struct drm_mode_set *set)
         */
        count = 0;
        drm_for_each_encoder(encoder, dev) {
-               save_encoders[count++] = *encoder;
+               save_encoder_crtcs[count++] = encoder->crtc;
        }
 
        count = 0;
        drm_for_each_connector(connector, dev) {
-               save_connectors[count++] = *connector;
+               save_connector_encoders[count++] = connector->encoder;
        }
 
        save_set.crtc = set->crtc;
@@ -631,8 +631,12 @@ int drm_crtc_helper_set_config(struct drm_mode_set *set)
                mode_changed = true;
        }
 
-       /* take a reference on all connectors in set */
+       /* take a reference on all unbound connectors in set, reuse the
+        * already taken reference for bound connectors
+        */
        for (ro = 0; ro < set->num_connectors; ro++) {
+               if (set->connectors[ro]->encoder)
+                       continue;
                drm_connector_reference(set->connectors[ro]);
        }
 
@@ -754,30 +758,28 @@ int drm_crtc_helper_set_config(struct drm_mode_set *set)
                }
        }
 
-       /* after fail drop reference on all connectors in save set */
-       count = 0;
-       drm_for_each_connector(connector, dev) {
-               drm_connector_unreference(&save_connectors[count++]);
-       }
-
-       kfree(save_connectors);
-       kfree(save_encoders);
+       kfree(save_connector_encoders);
+       kfree(save_encoder_crtcs);
        return 0;
 
 fail:
        /* Restore all previous data. */
        count = 0;
        drm_for_each_encoder(encoder, dev) {
-               *encoder = save_encoders[count++];
+               encoder->crtc = save_encoder_crtcs[count++];
        }
 
        count = 0;
        drm_for_each_connector(connector, dev) {
-               *connector = save_connectors[count++];
+               connector->encoder = save_connector_encoders[count++];
        }
 
-       /* after fail drop reference on all connectors in set */
+       /* after fail drop reference on all unbound connectors in set, let
+        * bound connectors keep their reference
+        */
        for (ro = 0; ro < set->num_connectors; ro++) {
+               if (set->connectors[ro]->encoder)
+                       continue;
                drm_connector_unreference(set->connectors[ro]);
        }
 
@@ -787,8 +789,8 @@ fail:
                                      save_set.y, save_set.fb))
                DRM_ERROR("failed to restore config after modeset failure\n");
 
-       kfree(save_connectors);
-       kfree(save_encoders);
+       kfree(save_connector_encoders);
+       kfree(save_encoder_crtcs);
        return ret;
 }
 EXPORT_SYMBOL(drm_crtc_helper_set_config);
diff --git a/drivers/gpu/drm/drm_dp_dual_mode_helper.c b/drivers/gpu/drm/drm_dp_dual_mode_helper.c
new file mode 100644 (file)
index 0000000..a7b2a75
--- /dev/null
@@ -0,0 +1,366 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/i2c.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <drm/drm_dp_dual_mode_helper.h>
+#include <drm/drmP.h>
+
+/**
+ * DOC: dp dual mode helpers
+ *
+ * Helper functions to deal with DP dual mode (aka. DP++) adaptors.
+ *
+ * Type 1:
+ * Adaptor registers (if any) and the sink DDC bus may be accessed via I2C.
+ *
+ * Type 2:
+ * Adaptor registers and sink DDC bus can be accessed either via I2C or
+ * I2C-over-AUX. Source devices may choose to implement either of these
+ * access methods.
+ */
+
+#define DP_DUAL_MODE_SLAVE_ADDRESS 0x40
+
+/**
+ * drm_dp_dual_mode_read - Read from the DP dual mode adaptor register(s)
+ * @adapter: I2C adapter for the DDC bus
+ * @offset: register offset
+ * @buffer: buffer for return data
+ * @size: size of the buffer
+ *
+ * Reads @size bytes from the DP dual mode adaptor registers
+ * starting at @offset.
+ *
+ * Returns:
+ * 0 on success, negative error code on failure
+ */
+ssize_t drm_dp_dual_mode_read(struct i2c_adapter *adapter,
+                             u8 offset, void *buffer, size_t size)
+{
+       struct i2c_msg msgs[] = {
+               {
+                       .addr = DP_DUAL_MODE_SLAVE_ADDRESS,
+                       .flags = 0,
+                       .len = 1,
+                       .buf = &offset,
+               },
+               {
+                       .addr = DP_DUAL_MODE_SLAVE_ADDRESS,
+                       .flags = I2C_M_RD,
+                       .len = size,
+                       .buf = buffer,
+               },
+       };
+       int ret;
+
+       ret = i2c_transfer(adapter, msgs, ARRAY_SIZE(msgs));
+       if (ret < 0)
+               return ret;
+       if (ret != ARRAY_SIZE(msgs))
+               return -EPROTO;
+
+       return 0;
+}
+EXPORT_SYMBOL(drm_dp_dual_mode_read);
+
+/**
+ * drm_dp_dual_mode_write - Write to the DP dual mode adaptor register(s)
+ * @adapter: I2C adapter for the DDC bus
+ * @offset: register offset
+ * @buffer: buffer for write data
+ * @size: size of the buffer
+ *
+ * Writes @size bytes to the DP dual mode adaptor registers
+ * starting at @offset.
+ *
+ * Returns:
+ * 0 on success, negative error code on failure
+ */
+ssize_t drm_dp_dual_mode_write(struct i2c_adapter *adapter,
+                              u8 offset, const void *buffer, size_t size)
+{
+       struct i2c_msg msg = {
+               .addr = DP_DUAL_MODE_SLAVE_ADDRESS,
+               .flags = 0,
+               .len = 1 + size,
+               .buf = NULL,
+       };
+       void *data;
+       int ret;
+
+       data = kmalloc(msg.len, GFP_TEMPORARY);
+       if (!data)
+               return -ENOMEM;
+
+       msg.buf = data;
+
+       memcpy(data, &offset, 1);
+       memcpy(data + 1, buffer, size);
+
+       ret = i2c_transfer(adapter, &msg, 1);
+
+       kfree(data);
+
+       if (ret < 0)
+               return ret;
+       if (ret != 1)
+               return -EPROTO;
+
+       return 0;
+}
+EXPORT_SYMBOL(drm_dp_dual_mode_write);
+
+static bool is_hdmi_adaptor(const char hdmi_id[DP_DUAL_MODE_HDMI_ID_LEN])
+{
+       static const char dp_dual_mode_hdmi_id[DP_DUAL_MODE_HDMI_ID_LEN] =
+               "DP-HDMI ADAPTOR\x04";
+
+       return memcmp(hdmi_id, dp_dual_mode_hdmi_id,
+                     sizeof(dp_dual_mode_hdmi_id)) == 0;
+}
+
+static bool is_type2_adaptor(uint8_t adaptor_id)
+{
+       return adaptor_id == (DP_DUAL_MODE_TYPE_TYPE2 |
+                             DP_DUAL_MODE_REV_TYPE2);
+}
+
+/**
+ * drm_dp_dual_mode_detect - Identify the DP dual mode adaptor
+ * @adapter: I2C adapter for the DDC bus
+ *
+ * Attempt to identify the type of the DP dual mode adaptor used.
+ *
+ * Note that when the answer is @DRM_DP_DUAL_MODE_UNKNOWN it's not
+ * certain whether we're dealing with a native HDMI port or
+ * a type 1 DVI dual mode adaptor. The driver will have to use
+ * some other hardware/driver specific mechanism to make that
+ * distinction.
+ *
+ * Returns:
+ * The type of the DP dual mode adaptor used
+ */
+enum drm_dp_dual_mode_type drm_dp_dual_mode_detect(struct i2c_adapter *adapter)
+{
+       char hdmi_id[DP_DUAL_MODE_HDMI_ID_LEN] = {};
+       uint8_t adaptor_id = 0x00;
+       ssize_t ret;
+
+       /*
+        * Let's see if the adaptor is there by reading the
+        * HDMI ID registers.
+        *
+        * Note that type 1 DVI adaptors are not required to implement
+        * any registers, and that presents a problem for detection.
+        * If the i2c transfer is nacked, we may or may not be dealing
+        * with a type 1 DVI adaptor. Some other mechanism of detecting
+        * the presence of the adaptor is required. One way would be
+        * to check the state of the CONFIG1 pin. Another method would
+        * simply require the driver to know whether the port is a DP++
+        * port or a native HDMI port. Both of these methods are entirely
+        * hardware/driver specific so we can't deal with them here.
+        */
+       ret = drm_dp_dual_mode_read(adapter, DP_DUAL_MODE_HDMI_ID,
+                                   hdmi_id, sizeof(hdmi_id));
+       if (ret)
+               return DRM_DP_DUAL_MODE_UNKNOWN;
+
+       /*
+        * Sigh. Some (maybe all?) type 1 adaptors are broken and ack
+        * the offset but ignore it, and instead they just always return
+        * data from the start of the HDMI ID buffer. So for a broken
+        * type 1 HDMI adaptor a single byte read will always give us
+        * 0x44, and for a type 1 DVI adaptor it should give 0x00
+        * (assuming it implements any registers). Fortunately neither
+        * of those values will match the type 2 signature of the
+        * DP_DUAL_MODE_ADAPTOR_ID register so we can proceed with
+        * the type 2 adaptor detection safely even in the presence
+        * of broken type 1 adaptors.
+        */
+       ret = drm_dp_dual_mode_read(adapter, DP_DUAL_MODE_ADAPTOR_ID,
+                                   &adaptor_id, sizeof(adaptor_id));
+       if (ret == 0) {
+               if (is_type2_adaptor(adaptor_id)) {
+                       if (is_hdmi_adaptor(hdmi_id))
+                               return DRM_DP_DUAL_MODE_TYPE2_HDMI;
+                       else
+                               return DRM_DP_DUAL_MODE_TYPE2_DVI;
+               }
+       }
+
+       if (is_hdmi_adaptor(hdmi_id))
+               return DRM_DP_DUAL_MODE_TYPE1_HDMI;
+       else
+               return DRM_DP_DUAL_MODE_TYPE1_DVI;
+}
+EXPORT_SYMBOL(drm_dp_dual_mode_detect);
+
+/**
+ * drm_dp_dual_mode_max_tmds_clock - Max TMDS clock for DP dual mode adaptor
+ * @type: DP dual mode adaptor type
+ * @adapter: I2C adapter for the DDC bus
+ *
+ * Determine the max TMDS clock the adaptor supports based on the
+ * type of the dual mode adaptor and the DP_DUAL_MODE_MAX_TMDS_CLOCK
+ * register (on type2 adaptors). As some type 1 adaptors have
+ * problems with registers (see comments in drm_dp_dual_mode_detect())
+ * we don't read the register on those, instead we simply assume
+ * a 165 MHz limit based on the specification.
+ *
+ * Returns:
+ * Maximum supported TMDS clock rate for the DP dual mode adaptor in kHz.
+ */
+int drm_dp_dual_mode_max_tmds_clock(enum drm_dp_dual_mode_type type,
+                                   struct i2c_adapter *adapter)
+{
+       uint8_t max_tmds_clock;
+       ssize_t ret;
+
+       /* native HDMI so no limit */
+       if (type == DRM_DP_DUAL_MODE_NONE)
+               return 0;
+
+       /*
+        * Type 1 adaptors are limited to 165 MHz
+        * Type 2 adaptors can tell us their limit
+        */
+       if (type < DRM_DP_DUAL_MODE_TYPE2_DVI)
+               return 165000;
+
+       ret = drm_dp_dual_mode_read(adapter, DP_DUAL_MODE_MAX_TMDS_CLOCK,
+                                   &max_tmds_clock, sizeof(max_tmds_clock));
+       if (ret || max_tmds_clock == 0x00 || max_tmds_clock == 0xff) {
+               DRM_DEBUG_KMS("Failed to query max TMDS clock\n");
+               return 165000;
+       }
+
+       return max_tmds_clock * 5000 / 2;
+}
+EXPORT_SYMBOL(drm_dp_dual_mode_max_tmds_clock);
+
+/**
+ * drm_dp_dual_mode_get_tmds_output - Get the state of the TMDS output buffers in the DP dual mode adaptor
+ * @type: DP dual mode adaptor type
+ * @adapter: I2C adapter for the DDC bus
+ * @enabled: current state of the TMDS output buffers
+ *
+ * Get the state of the TMDS output buffers in the adaptor. For
+ * type2 adaptors this is queried from the DP_DUAL_MODE_TMDS_OEN
+ * register. As some type 1 adaptors have problems with registers
+ * (see comments in drm_dp_dual_mode_detect()) we don't read the
+ * register on those, instead we simply assume that the buffers
+ * are always enabled.
+ *
+ * Returns:
+ * 0 on success, negative error code on failure
+ */
+int drm_dp_dual_mode_get_tmds_output(enum drm_dp_dual_mode_type type,
+                                    struct i2c_adapter *adapter,
+                                    bool *enabled)
+{
+       uint8_t tmds_oen;
+       ssize_t ret;
+
+       if (type < DRM_DP_DUAL_MODE_TYPE2_DVI) {
+               *enabled = true;
+               return 0;
+       }
+
+       ret = drm_dp_dual_mode_read(adapter, DP_DUAL_MODE_TMDS_OEN,
+                                   &tmds_oen, sizeof(tmds_oen));
+       if (ret) {
+               DRM_DEBUG_KMS("Failed to query state of TMDS output buffers\n");
+               return ret;
+       }
+
+       *enabled = !(tmds_oen & DP_DUAL_MODE_TMDS_DISABLE);
+
+       return 0;
+}
+EXPORT_SYMBOL(drm_dp_dual_mode_get_tmds_output);
+
+/**
+ * drm_dp_dual_mode_set_tmds_output - Enable/disable TMDS output buffers in the DP dual mode adaptor
+ * @type: DP dual mode adaptor type
+ * @adapter: I2C adapter for the DDC bus
+ * @enable: enable (as opposed to disable) the TMDS output buffers
+ *
+ * Set the state of the TMDS output buffers in the adaptor. For
+ * type2 this is set via the DP_DUAL_MODE_TMDS_OEN register. As
+ * some type 1 adaptors have problems with registers (see comments
+ * in drm_dp_dual_mode_detect()) we avoid touching the register,
+ * making this function a no-op on type 1 adaptors.
+ *
+ * Returns:
+ * 0 on success, negative error code on failure
+ */
+int drm_dp_dual_mode_set_tmds_output(enum drm_dp_dual_mode_type type,
+                                    struct i2c_adapter *adapter, bool enable)
+{
+       uint8_t tmds_oen = enable ? 0 : DP_DUAL_MODE_TMDS_DISABLE;
+       ssize_t ret;
+
+       if (type < DRM_DP_DUAL_MODE_TYPE2_DVI)
+               return 0;
+
+       ret = drm_dp_dual_mode_write(adapter, DP_DUAL_MODE_TMDS_OEN,
+                                    &tmds_oen, sizeof(tmds_oen));
+       if (ret) {
+               DRM_DEBUG_KMS("Failed to %s TMDS output buffers\n",
+                             enable ? "enable" : "disable");
+               return ret;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(drm_dp_dual_mode_set_tmds_output);
+
+/**
+ * drm_dp_get_dual_mode_type_name - Get the name of the DP dual mode adaptor type as a string
+ * @type: DP dual mode adaptor type
+ *
+ * Returns:
+ * String representation of the DP dual mode adaptor type
+ */
+const char *drm_dp_get_dual_mode_type_name(enum drm_dp_dual_mode_type type)
+{
+       switch (type) {
+       case DRM_DP_DUAL_MODE_NONE:
+               return "none";
+       case DRM_DP_DUAL_MODE_TYPE1_DVI:
+               return "type 1 DVI";
+       case DRM_DP_DUAL_MODE_TYPE1_HDMI:
+               return "type 1 HDMI";
+       case DRM_DP_DUAL_MODE_TYPE2_DVI:
+               return "type 2 DVI";
+       case DRM_DP_DUAL_MODE_TYPE2_HDMI:
+               return "type 2 HDMI";
+       default:
+               WARN_ON(type != DRM_DP_DUAL_MODE_UNKNOWN);
+               return "unknown";
+       }
+}
+EXPORT_SYMBOL(drm_dp_get_dual_mode_type_name);
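The conversion in drm_dp_dual_mode_max_tmds_clock() above treats the register readback as units of 2.5 MHz, hence value * 5000 / 2 kHz: a hypothetical readback of 66 works out to 66 * 5000 / 2 = 165000 kHz, i.e. the type 1 limit. A minimal, hypothetical sketch of how a driver might use the new helpers together follows; names outside the helper API are placeholders and error handling is trimmed:

	#include <drm/drmP.h>
	#include <drm/drm_dp_dual_mode_helper.h>

	/* Hypothetical probe-time check; 'ddc' is the port's DDC i2c adapter. */
	static void example_probe_dp_dual_mode(struct i2c_adapter *ddc)
	{
		enum drm_dp_dual_mode_type type;
		int max_tmds_clock;
		bool enabled;

		/* Identify what kind of adaptor (if any) answers on the DDC bus. */
		type = drm_dp_dual_mode_detect(ddc);

		/* 0 means native HDMI, i.e. no adaptor-imposed limit. */
		max_tmds_clock = drm_dp_dual_mode_max_tmds_clock(type, ddc);

		/* On type 2 adaptors, make sure the TMDS output buffers are on. */
		if (drm_dp_dual_mode_get_tmds_output(type, ddc, &enabled) == 0 &&
		    !enabled)
			drm_dp_dual_mode_set_tmds_output(type, ddc, true);

		DRM_DEBUG_KMS("%s adaptor, max TMDS clock %d kHz\n",
			      drm_dp_get_dual_mode_type_name(type), max_tmds_clock);
	}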
index a13edf5..6537908 100644 (file)
@@ -2927,11 +2927,9 @@ static void drm_dp_destroy_connector_work(struct work_struct *work)
                drm_dp_port_teardown_pdt(port, port->pdt);
 
                if (!port->input && port->vcpi.vcpi > 0) {
-                       if (mgr->mst_state) {
-                               drm_dp_mst_reset_vcpi_slots(mgr, port);
-                               drm_dp_update_payload_part1(mgr);
-                               drm_dp_mst_put_payload_id(mgr, port->vcpi.vcpi);
-                       }
+                       drm_dp_mst_reset_vcpi_slots(mgr, port);
+                       drm_dp_update_payload_part1(mgr);
+                       drm_dp_mst_put_payload_id(mgr, port->vcpi.vcpi);
                }
 
                kref_put(&port->kref, drm_dp_free_mst_port);
index 172cafe..5075fae 100644 (file)
@@ -445,7 +445,7 @@ err_cma_destroy:
 err_fb_info_destroy:
        drm_fb_helper_release_fbi(helper);
 err_gem_free_object:
-       dev->driver->gem_free_object(&obj->base);
+       drm_gem_object_unreference_unlocked(&obj->base);
        return ret;
 }
 EXPORT_SYMBOL(drm_fbdev_cma_create_with_funcs);
index e1ab008..1d6c335 100644 (file)
@@ -121,7 +121,7 @@ struct drm_gem_cma_object *drm_gem_cma_create(struct drm_device *drm,
        return cma_obj;
 
 error:
-       drm->driver->gem_free_object(&cma_obj->base);
+       drm_gem_object_unreference_unlocked(&cma_obj->base);
        return ERR_PTR(ret);
 }
 EXPORT_SYMBOL_GPL(drm_gem_cma_create);
@@ -162,18 +162,12 @@ drm_gem_cma_create_with_handle(struct drm_file *file_priv,
         * and handle has the id what user can see.
         */
        ret = drm_gem_handle_create(file_priv, gem_obj, handle);
-       if (ret)
-               goto err_handle_create;
-
        /* drop reference from allocate - handle holds it now. */
        drm_gem_object_unreference_unlocked(gem_obj);
+       if (ret)
+               return ERR_PTR(ret);
 
        return cma_obj;
-
-err_handle_create:
-       drm->driver->gem_free_object(gem_obj);
-
-       return ERR_PTR(ret);
 }
 
 /**
index 7def3d5..e5e6f50 100644 (file)
@@ -1518,6 +1518,8 @@ int drm_mode_convert_umode(struct drm_display_mode *out,
        if (out->status != MODE_OK)
                goto out;
 
+       drm_mode_set_crtcinfo(out, CRTC_INTERLACE_HALVE_V);
+
        ret = 0;
 
 out:
index 0ec1ad9..dc723f7 100644 (file)
@@ -42,9 +42,10 @@ static const struct regmap_config fsl_dcu_regmap_config = {
        .reg_bits = 32,
        .reg_stride = 4,
        .val_bits = 32,
-       .cache_type = REGCACHE_RBTREE,
+       .cache_type = REGCACHE_FLAT,
 
        .volatile_reg = fsl_dcu_drm_is_volatile_reg,
+       .max_register = 0x11fc,
 };
 
 static int fsl_dcu_drm_irq_init(struct drm_device *dev)
index 15615fb..b3198fc 100644 (file)
@@ -1183,6 +1183,12 @@ static int i915_driver_init_hw(struct drm_i915_private *dev_priv)
        if (ret)
                return ret;
 
+       ret = i915_ggtt_enable_hw(dev);
+       if (ret) {
+               DRM_ERROR("failed to enable GGTT\n");
+               goto out_ggtt;
+       }
+
        /* WARNING: Apparently we must kick fbdev drivers before vgacon,
         * otherwise the vga fbdev driver falls over. */
        ret = i915_kick_out_firmware_fb(dev_priv);
index d37c0a6..f313b4d 100644 (file)
@@ -734,9 +734,14 @@ int i915_suspend_switcheroo(struct drm_device *dev, pm_message_t state)
 static int i915_drm_resume(struct drm_device *dev)
 {
        struct drm_i915_private *dev_priv = dev->dev_private;
+       int ret;
 
        disable_rpm_wakeref_asserts(dev_priv);
 
+       ret = i915_ggtt_enable_hw(dev);
+       if (ret)
+               DRM_ERROR("failed to re-enable GGTT\n");
+
        intel_csr_ucode_resume(dev_priv);
 
        mutex_lock(&dev->struct_mutex);
index b87ca4f..7c334e9 100644 (file)
@@ -3481,7 +3481,9 @@ int intel_bios_init(struct drm_i915_private *dev_priv);
 bool intel_bios_is_valid_vbt(const void *buf, size_t size);
 bool intel_bios_is_tv_present(struct drm_i915_private *dev_priv);
 bool intel_bios_is_lvds_present(struct drm_i915_private *dev_priv, u8 *i2c_pin);
+bool intel_bios_is_port_present(struct drm_i915_private *dev_priv, enum port port);
 bool intel_bios_is_port_edp(struct drm_i915_private *dev_priv, enum port port);
+bool intel_bios_is_port_dp_dual_mode(struct drm_i915_private *dev_priv, enum port port);
 bool intel_bios_is_dsi_present(struct drm_i915_private *dev_priv, enum port *port);
 bool intel_bios_is_port_hpd_inverted(struct drm_i915_private *dev_priv,
                                     enum port port);
index 9b99490..aad2685 100644 (file)
@@ -1456,7 +1456,10 @@ i915_wait_request(struct drm_i915_gem_request *req)
        if (ret)
                return ret;
 
-       __i915_gem_request_retire__upto(req);
+       /* If the GPU hung, we want to keep the requests to find the guilty. */
+       if (req->reset_counter == i915_reset_counter(&dev_priv->gpu_error))
+               __i915_gem_request_retire__upto(req);
+
        return 0;
 }
 
@@ -1513,7 +1516,8 @@ i915_gem_object_retire_request(struct drm_i915_gem_object *obj,
        else if (obj->last_write_req == req)
                i915_gem_object_retire__write(obj);
 
-       __i915_gem_request_retire__upto(req);
+       if (req->reset_counter == i915_reset_counter(&req->i915->gpu_error))
+               __i915_gem_request_retire__upto(req);
 }
 
 /* A nonblocking variant of the above wait. This is a highly dangerous routine
@@ -4860,9 +4864,6 @@ i915_gem_init_hw(struct drm_device *dev)
        struct intel_engine_cs *engine;
        int ret, j;
 
-       if (INTEL_INFO(dev)->gen < 6 && !intel_enable_gtt())
-               return -EIO;
-
        /* Double layer security blanket, see i915_gem_init() */
        intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
 
index 0d666b3..92acdff 100644 (file)
@@ -3236,6 +3236,14 @@ out_gtt_cleanup:
        return ret;
 }
 
+int i915_ggtt_enable_hw(struct drm_device *dev)
+{
+       if (INTEL_INFO(dev)->gen < 6 && !intel_enable_gtt())
+               return -EIO;
+
+       return 0;
+}
+
 void i915_gem_restore_gtt_mappings(struct drm_device *dev)
 {
        struct drm_i915_private *dev_priv = to_i915(dev);
index d7dd3d8..0008543 100644 (file)
@@ -514,6 +514,7 @@ i915_page_dir_dma_addr(const struct i915_hw_ppgtt *ppgtt, const unsigned n)
 }
 
 int i915_ggtt_init_hw(struct drm_device *dev);
+int i915_ggtt_enable_hw(struct drm_device *dev);
 void i915_gem_init_ggtt(struct drm_device *dev);
 void i915_ggtt_cleanup_hw(struct drm_device *dev);
 
index e72dd9a..b9022fa 100644 (file)
@@ -139,6 +139,11 @@ fill_detail_timing_data(struct drm_display_mode *panel_fixed_mode,
        else
                panel_fixed_mode->flags |= DRM_MODE_FLAG_NVSYNC;
 
+       panel_fixed_mode->width_mm = (dvo_timing->himage_hi << 8) |
+               dvo_timing->himage_lo;
+       panel_fixed_mode->height_mm = (dvo_timing->vimage_hi << 8) |
+               dvo_timing->vimage_lo;
+
        /* Some VBTs have bogus h/vtotal values */
        if (panel_fixed_mode->hsync_end > panel_fixed_mode->htotal)
                panel_fixed_mode->htotal = panel_fixed_mode->hsync_end + 1;
@@ -1187,7 +1192,7 @@ parse_device_mapping(struct drm_i915_private *dev_priv,
        }
        if (bdb->version < 106) {
                expected_size = 22;
-       } else if (bdb->version < 109) {
+       } else if (bdb->version < 111) {
                expected_size = 27;
        } else if (bdb->version < 195) {
                BUILD_BUG_ON(sizeof(struct old_child_dev_config) != 33);
@@ -1545,6 +1550,45 @@ bool intel_bios_is_lvds_present(struct drm_i915_private *dev_priv, u8 *i2c_pin)
        return false;
 }
 
+/**
+ * intel_bios_is_port_present - is the specified digital port present
+ * @dev_priv:  i915 device instance
+ * @port:      port to check
+ *
+ * Return true if the device in %port is present.
+ */
+bool intel_bios_is_port_present(struct drm_i915_private *dev_priv, enum port port)
+{
+       static const struct {
+               u16 dp, hdmi;
+       } port_mapping[] = {
+               [PORT_B] = { DVO_PORT_DPB, DVO_PORT_HDMIB, },
+               [PORT_C] = { DVO_PORT_DPC, DVO_PORT_HDMIC, },
+               [PORT_D] = { DVO_PORT_DPD, DVO_PORT_HDMID, },
+               [PORT_E] = { DVO_PORT_DPE, DVO_PORT_HDMIE, },
+       };
+       int i;
+
+       /* FIXME maybe deal with port A as well? */
+       if (WARN_ON(port == PORT_A) || port >= ARRAY_SIZE(port_mapping))
+               return false;
+
+       if (!dev_priv->vbt.child_dev_num)
+               return false;
+
+       for (i = 0; i < dev_priv->vbt.child_dev_num; i++) {
+               const union child_device_config *p_child =
+                       &dev_priv->vbt.child_dev[i];
+               if ((p_child->common.dvo_port == port_mapping[port].dp ||
+                    p_child->common.dvo_port == port_mapping[port].hdmi) &&
+                   (p_child->common.device_type & (DEVICE_TYPE_TMDS_DVI_SIGNALING |
+                                                   DEVICE_TYPE_DISPLAYPORT_OUTPUT)))
+                       return true;
+       }
+
+       return false;
+}
+
 /**
  * intel_bios_is_port_edp - is the device in given port eDP
  * @dev_priv:  i915 device instance
@@ -1578,6 +1622,42 @@ bool intel_bios_is_port_edp(struct drm_i915_private *dev_priv, enum port port)
        return false;
 }
 
+bool intel_bios_is_port_dp_dual_mode(struct drm_i915_private *dev_priv, enum port port)
+{
+       static const struct {
+               u16 dp, hdmi;
+       } port_mapping[] = {
+               /*
+                * Buggy VBTs may declare DP ports as having
+                * HDMI type dvo_port :( So let's check both.
+                */
+               [PORT_B] = { DVO_PORT_DPB, DVO_PORT_HDMIB, },
+               [PORT_C] = { DVO_PORT_DPC, DVO_PORT_HDMIC, },
+               [PORT_D] = { DVO_PORT_DPD, DVO_PORT_HDMID, },
+               [PORT_E] = { DVO_PORT_DPE, DVO_PORT_HDMIE, },
+       };
+       int i;
+
+       if (port == PORT_A || port >= ARRAY_SIZE(port_mapping))
+               return false;
+
+       if (!dev_priv->vbt.child_dev_num)
+               return false;
+
+       for (i = 0; i < dev_priv->vbt.child_dev_num; i++) {
+               const union child_device_config *p_child =
+                       &dev_priv->vbt.child_dev[i];
+
+               if ((p_child->common.dvo_port == port_mapping[port].dp ||
+                    p_child->common.dvo_port == port_mapping[port].hdmi) &&
+                   (p_child->common.device_type & DEVICE_TYPE_DP_DUAL_MODE_BITS) ==
+                   (DEVICE_TYPE_DP_DUAL_MODE & DEVICE_TYPE_DP_DUAL_MODE_BITS))
+                       return true;
+       }
+
+       return false;
+}
+
 /**
  * intel_bios_is_dsi_present - is DSI present in VBT
  * @dev_priv:  i915 device instance
index 3fac046..01e523d 100644 (file)
@@ -1601,6 +1601,12 @@ static void intel_ddi_pre_enable(struct intel_encoder *intel_encoder)
        enum port port = intel_ddi_get_encoder_port(intel_encoder);
        int type = intel_encoder->type;
 
+       if (type == INTEL_OUTPUT_HDMI) {
+               struct intel_hdmi *intel_hdmi = enc_to_intel_hdmi(encoder);
+
+               intel_dp_dual_mode_set_tmds_output(intel_hdmi, true);
+       }
+
        intel_prepare_ddi_buffer(intel_encoder);
 
        if (type == INTEL_OUTPUT_EDP) {
@@ -1667,6 +1673,12 @@ static void intel_ddi_post_disable(struct intel_encoder *intel_encoder)
                                        DPLL_CTRL2_DDI_CLK_OFF(port)));
        else if (INTEL_INFO(dev)->gen < 9)
                I915_WRITE(PORT_CLK_SEL(port), PORT_CLK_SEL_NONE);
+
+       if (type == INTEL_OUTPUT_HDMI) {
+               struct intel_hdmi *intel_hdmi = enc_to_intel_hdmi(encoder);
+
+               intel_dp_dual_mode_set_tmds_output(intel_hdmi, false);
+       }
 }
 
 static void intel_enable_ddi(struct intel_encoder *intel_encoder)
@@ -2180,8 +2192,10 @@ void intel_ddi_get_config(struct intel_encoder *encoder,
 
                if (intel_hdmi->infoframe_enabled(&encoder->base, pipe_config))
                        pipe_config->has_infoframe = true;
-               break;
+               /* fall through */
        case TRANS_DDI_MODE_SELECT_DVI:
+               pipe_config->lane_count = 4;
+               break;
        case TRANS_DDI_MODE_SELECT_FDI:
                break;
        case TRANS_DDI_MODE_SELECT_DP_SST:
index 46f9be3..56a1637 100644 (file)
@@ -8275,12 +8275,14 @@ static void ironlake_init_pch_refclk(struct drm_device *dev)
 {
        struct drm_i915_private *dev_priv = dev->dev_private;
        struct intel_encoder *encoder;
+       int i;
        u32 val, final;
        bool has_lvds = false;
        bool has_cpu_edp = false;
        bool has_panel = false;
        bool has_ck505 = false;
        bool can_ssc = false;
+       bool using_ssc_source = false;
 
        /* We need to take the global config into account */
        for_each_intel_encoder(dev, encoder) {
@@ -8307,8 +8309,22 @@ static void ironlake_init_pch_refclk(struct drm_device *dev)
                can_ssc = true;
        }
 
-       DRM_DEBUG_KMS("has_panel %d has_lvds %d has_ck505 %d\n",
-                     has_panel, has_lvds, has_ck505);
+       /* Check if any DPLLs are using the SSC source */
+       for (i = 0; i < dev_priv->num_shared_dpll; i++) {
+               u32 temp = I915_READ(PCH_DPLL(i));
+
+               if (!(temp & DPLL_VCO_ENABLE))
+                       continue;
+
+               if ((temp & PLL_REF_INPUT_MASK) ==
+                   PLLB_REF_INPUT_SPREADSPECTRUMIN) {
+                       using_ssc_source = true;
+                       break;
+               }
+       }
+
+       DRM_DEBUG_KMS("has_panel %d has_lvds %d has_ck505 %d using_ssc_source %d\n",
+                     has_panel, has_lvds, has_ck505, using_ssc_source);
 
        /* Ironlake: try to setup display ref clock before DPLL
         * enabling. This is only under driver's control after
@@ -8345,9 +8361,9 @@ static void ironlake_init_pch_refclk(struct drm_device *dev)
                                final |= DREF_CPU_SOURCE_OUTPUT_NONSPREAD;
                } else
                        final |= DREF_CPU_SOURCE_OUTPUT_DISABLE;
-       } else {
-               final |= DREF_SSC_SOURCE_DISABLE;
-               final |= DREF_CPU_SOURCE_OUTPUT_DISABLE;
+       } else if (using_ssc_source) {
+               final |= DREF_SSC_SOURCE_ENABLE;
+               final |= DREF_SSC1_ENABLE;
        }
 
        if (final == val)
@@ -8393,7 +8409,7 @@ static void ironlake_init_pch_refclk(struct drm_device *dev)
                POSTING_READ(PCH_DREF_CONTROL);
                udelay(200);
        } else {
-               DRM_DEBUG_KMS("Disabling SSC entirely\n");
+               DRM_DEBUG_KMS("Disabling CPU source output\n");
 
                val &= ~DREF_CPU_SOURCE_OUTPUT_MASK;
 
@@ -8404,16 +8420,20 @@ static void ironlake_init_pch_refclk(struct drm_device *dev)
                POSTING_READ(PCH_DREF_CONTROL);
                udelay(200);
 
-               /* Turn off the SSC source */
-               val &= ~DREF_SSC_SOURCE_MASK;
-               val |= DREF_SSC_SOURCE_DISABLE;
+               if (!using_ssc_source) {
+                       DRM_DEBUG_KMS("Disabling SSC source\n");
 
-               /* Turn off SSC1 */
-               val &= ~DREF_SSC1_ENABLE;
+                       /* Turn off the SSC source */
+                       val &= ~DREF_SSC_SOURCE_MASK;
+                       val |= DREF_SSC_SOURCE_DISABLE;
 
-               I915_WRITE(PCH_DREF_CONTROL, val);
-               POSTING_READ(PCH_DREF_CONTROL);
-               udelay(200);
+                       /* Turn off SSC1 */
+                       val &= ~DREF_SSC1_ENABLE;
+
+                       I915_WRITE(PCH_DREF_CONTROL, val);
+                       POSTING_READ(PCH_DREF_CONTROL);
+                       udelay(200);
+               }
        }
 
        BUG_ON(val != final);
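
The new using_ssc_source check above exists so the CPU source output can be turned off without also killing an SSC reference that a still-enabled PCH DPLL depends on. A minimal stand-alone sketch of that scan, with register bit values that only approximate the i915 definitions (treat them and the array of "register snapshots" as illustrative, not driver code):

#include <stdbool.h>
#include <stdio.h>

/* Approximate stand-ins for the DPLL register bits used in the hunk above. */
#define DPLL_VCO_ENABLE        (1u << 31)
#define PLL_REF_INPUT_MASK     (3u << 13)
#define PLL_REF_INPUT_SSC      (3u << 13)   /* "spread spectrum in" encoding */

int main(void)
{
	/* pretend snapshots of two shared DPLLs: one off, one on and using SSC */
	unsigned int dpll[2] = { 0, DPLL_VCO_ENABLE | PLL_REF_INPUT_SSC };
	bool using_ssc_source = false;
	int i;

	for (i = 0; i < 2; i++) {
		if (!(dpll[i] & DPLL_VCO_ENABLE))
			continue;              /* PLL off: ignore it */
		if ((dpll[i] & PLL_REF_INPUT_MASK) == PLL_REF_INPUT_SSC) {
			using_ssc_source = true;
			break;
		}
	}
	printf("using_ssc_source = %d\n", using_ssc_source);   /* 1 */
	return 0;
}

With using_ssc_source set, the later branch keeps DREF_SSC_SOURCE_ENABLE/DREF_SSC1_ENABLE instead of disabling the source outright, which is exactly the behavioural change in this hunk.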
@@ -12005,6 +12025,9 @@ static int intel_crtc_atomic_check(struct drm_crtc *crtc,
                        DRM_DEBUG_KMS("No valid intermediate pipe watermarks are possible\n");
                        return ret;
                }
+       } else if (dev_priv->display.compute_intermediate_wm) {
+               if (HAS_PCH_SPLIT(dev_priv) && INTEL_GEN(dev_priv) < 9)
+                       pipe_config->wm.intermediate = pipe_config->wm.optimal.ilk;
        }
 
        if (INTEL_INFO(dev)->gen >= 9) {
@@ -14551,6 +14574,8 @@ static void intel_setup_outputs(struct drm_device *dev)
                if (I915_READ(PCH_DP_D) & DP_DETECTED)
                        intel_dp_init(dev, PCH_DP_D, PORT_D);
        } else if (IS_VALLEYVIEW(dev) || IS_CHERRYVIEW(dev)) {
+               bool has_edp, has_port;
+
                /*
                 * The DP_DETECTED bit is the latched state of the DDC
                 * SDA pin at boot. However since eDP doesn't require DDC
@@ -14559,27 +14584,37 @@ static void intel_setup_outputs(struct drm_device *dev)
                 * Thus we can't rely on the DP_DETECTED bit alone to detect
                 * eDP ports. Consult the VBT as well as DP_DETECTED to
                 * detect eDP ports.
+                *
+                * Sadly the straps seem to be missing sometimes even for HDMI
+                * ports (e.g. on Voyo V3 - CHT x7-Z8700), so check both strap
+                * and VBT for the presence of the port. Additionally we can't
+                * trust the port type the VBT declares, as we've seen at least
+                * HDMI ports that the VBT claims are DP or eDP.
                 */
-               if (I915_READ(VLV_HDMIB) & SDVO_DETECTED &&
-                   !intel_dp_is_edp(dev, PORT_B))
+               has_edp = intel_dp_is_edp(dev, PORT_B);
+               has_port = intel_bios_is_port_present(dev_priv, PORT_B);
+               if (I915_READ(VLV_DP_B) & DP_DETECTED || has_port)
+                       has_edp &= intel_dp_init(dev, VLV_DP_B, PORT_B);
+               if ((I915_READ(VLV_HDMIB) & SDVO_DETECTED || has_port) && !has_edp)
                        intel_hdmi_init(dev, VLV_HDMIB, PORT_B);
-               if (I915_READ(VLV_DP_B) & DP_DETECTED ||
-                   intel_dp_is_edp(dev, PORT_B))
-                       intel_dp_init(dev, VLV_DP_B, PORT_B);
 
-               if (I915_READ(VLV_HDMIC) & SDVO_DETECTED &&
-                   !intel_dp_is_edp(dev, PORT_C))
+               has_edp = intel_dp_is_edp(dev, PORT_C);
+               has_port = intel_bios_is_port_present(dev_priv, PORT_C);
+               if (I915_READ(VLV_DP_C) & DP_DETECTED || has_port)
+                       has_edp &= intel_dp_init(dev, VLV_DP_C, PORT_C);
+               if ((I915_READ(VLV_HDMIC) & SDVO_DETECTED || has_port) && !has_edp)
                        intel_hdmi_init(dev, VLV_HDMIC, PORT_C);
-               if (I915_READ(VLV_DP_C) & DP_DETECTED ||
-                   intel_dp_is_edp(dev, PORT_C))
-                       intel_dp_init(dev, VLV_DP_C, PORT_C);
 
                if (IS_CHERRYVIEW(dev)) {
-                       /* eDP not supported on port D, so don't check VBT */
-                       if (I915_READ(CHV_HDMID) & SDVO_DETECTED)
-                               intel_hdmi_init(dev, CHV_HDMID, PORT_D);
-                       if (I915_READ(CHV_DP_D) & DP_DETECTED)
+                       /*
+                        * eDP not supported on port D,
+                        * so no need to worry about it
+                        */
+                       has_port = intel_bios_is_port_present(dev_priv, PORT_D);
+                       if (I915_READ(CHV_DP_D) & DP_DETECTED || has_port)
                                intel_dp_init(dev, CHV_DP_D, PORT_D);
+                       if (I915_READ(CHV_HDMID) & SDVO_DETECTED || has_port)
+                               intel_hdmi_init(dev, CHV_HDMID, PORT_D);
                }
 
                intel_dsi_init(dev);
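
The gating above is the reason intel_dp_init() is changed to return bool later in this commit: a port is only treated as eDP when DP initialisation actually succeeds, so an HDMI connector can still be registered otherwise. A minimal stand-alone sketch of that logic with stubbed init functions (dp_init/hdmi_init and setup_port are invented names, not driver API):

#include <stdbool.h>
#include <stdio.h>

static bool dp_init_ok;                  /* pretend outcome of DP init       */
static bool dp_init(void)  { puts("DP init");   return dp_init_ok; }
static void hdmi_init(void){ puts("HDMI init"); }

static void setup_port(bool vbt_says_edp, bool dp_strap, bool hdmi_strap,
		       bool vbt_has_port)
{
	bool has_edp = vbt_says_edp;

	if (dp_strap || vbt_has_port)
		has_edp &= dp_init();    /* only keep eDP if init succeeded  */
	if ((hdmi_strap || vbt_has_port) && !has_edp)
		hdmi_init();             /* otherwise still register HDMI    */
}

int main(void)
{
	/* VBT claims eDP but DP init fails: HDMI is still registered */
	dp_init_ok = false;
	setup_port(true, true, true, true);
	return 0;
}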
@@ -15990,6 +16025,9 @@ retry:
 
                state->acquire_ctx = &ctx;
 
+               /* ignore any reset values/BIOS leftovers in the WM registers */
+               to_intel_atomic_state(state)->skip_intermediate_wm = true;
+
                for_each_crtc_in_state(state, crtc, crtc_state, i) {
                        /*
                         * Force recalculation even if we restore
index f192f58..ffe5f84 100644 (file)
@@ -5725,8 +5725,11 @@ static bool intel_edp_init_connector(struct intel_dp *intel_dp,
        if (!fixed_mode && dev_priv->vbt.lfp_lvds_vbt_mode) {
                fixed_mode = drm_mode_duplicate(dev,
                                        dev_priv->vbt.lfp_lvds_vbt_mode);
-               if (fixed_mode)
+               if (fixed_mode) {
                        fixed_mode->type |= DRM_MODE_TYPE_PREFERRED;
+                       connector->display_info.width_mm = fixed_mode->width_mm;
+                       connector->display_info.height_mm = fixed_mode->height_mm;
+               }
        }
        mutex_unlock(&dev->mode_config.mutex);
 
@@ -5923,9 +5926,9 @@ fail:
        return false;
 }
 
-void
-intel_dp_init(struct drm_device *dev,
-             i915_reg_t output_reg, enum port port)
+bool intel_dp_init(struct drm_device *dev,
+                  i915_reg_t output_reg,
+                  enum port port)
 {
        struct drm_i915_private *dev_priv = dev->dev_private;
        struct intel_digital_port *intel_dig_port;
@@ -5935,7 +5938,7 @@ intel_dp_init(struct drm_device *dev,
 
        intel_dig_port = kzalloc(sizeof(*intel_dig_port), GFP_KERNEL);
        if (!intel_dig_port)
-               return;
+               return false;
 
        intel_connector = intel_connector_alloc();
        if (!intel_connector)
@@ -5992,7 +5995,7 @@ intel_dp_init(struct drm_device *dev,
        if (!intel_dp_init_connector(intel_dig_port, intel_connector))
                goto err_init_connector;
 
-       return;
+       return true;
 
 err_init_connector:
        drm_encoder_cleanup(encoder);
@@ -6000,8 +6003,7 @@ err_encoder_init:
        kfree(intel_connector);
 err_connector_alloc:
        kfree(intel_dig_port);
-
-       return;
+       return false;
 }
 
 void intel_dp_mst_suspend(struct drm_device *dev)
index 639bf02..baf6f55 100644 (file)
@@ -366,6 +366,9 @@ ibx_get_dpll(struct intel_crtc *crtc, struct intel_crtc_state *crtc_state,
                                             DPLL_ID_PCH_PLL_B);
        }
 
+       if (!pll)
+               return NULL;
+
        /* reference the pll */
        intel_reference_shared_dpll(pll, crtc_state);
 
@@ -1702,9 +1705,9 @@ static const struct intel_dpll_mgr hsw_pll_mgr = {
 
 static const struct dpll_info skl_plls[] = {
        { "DPLL 0", DPLL_ID_SKL_DPLL0, &skl_ddi_dpll0_funcs, INTEL_DPLL_ALWAYS_ON },
-       { "DPPL 1", DPLL_ID_SKL_DPLL1, &skl_ddi_pll_funcs,   0 },
-       { "DPPL 2", DPLL_ID_SKL_DPLL2, &skl_ddi_pll_funcs,   0 },
-       { "DPPL 3", DPLL_ID_SKL_DPLL3, &skl_ddi_pll_funcs,   0 },
+       { "DPLL 1", DPLL_ID_SKL_DPLL1, &skl_ddi_pll_funcs,   0 },
+       { "DPLL 2", DPLL_ID_SKL_DPLL2, &skl_ddi_pll_funcs,   0 },
+       { "DPLL 3", DPLL_ID_SKL_DPLL3, &skl_ddi_pll_funcs,   0 },
        { NULL, -1, NULL, },
 };
 
index 5da29a0..4a24b00 100644 (file)
@@ -33,6 +33,7 @@
 #include <drm/drm_crtc.h>
 #include <drm/drm_crtc_helper.h>
 #include <drm/drm_fb_helper.h>
+#include <drm/drm_dp_dual_mode_helper.h>
 #include <drm/drm_dp_mst_helper.h>
 #include <drm/drm_rect.h>
 #include <drm/drm_atomic.h>
@@ -753,6 +754,10 @@ struct cxsr_latency {
 struct intel_hdmi {
        i915_reg_t hdmi_reg;
        int ddc_bus;
+       struct {
+               enum drm_dp_dual_mode_type type;
+               int max_tmds_clock;
+       } dp_dual_mode;
        bool limited_color_range;
        bool color_range_auto;
        bool has_hdmi_sink;
@@ -1279,7 +1284,7 @@ void intel_csr_ucode_suspend(struct drm_i915_private *);
 void intel_csr_ucode_resume(struct drm_i915_private *);
 
 /* intel_dp.c */
-void intel_dp_init(struct drm_device *dev, i915_reg_t output_reg, enum port port);
+bool intel_dp_init(struct drm_device *dev, i915_reg_t output_reg, enum port port);
 bool intel_dp_init_connector(struct intel_digital_port *intel_dig_port,
                             struct intel_connector *intel_connector);
 void intel_dp_set_link_params(struct intel_dp *intel_dp,
@@ -1401,6 +1406,7 @@ void intel_hdmi_init_connector(struct intel_digital_port *intel_dig_port,
 struct intel_hdmi *enc_to_intel_hdmi(struct drm_encoder *encoder);
 bool intel_hdmi_compute_config(struct intel_encoder *encoder,
                               struct intel_crtc_state *pipe_config);
+void intel_dp_dual_mode_set_tmds_output(struct intel_hdmi *hdmi, bool enable);
 
 
 /* intel_lvds.c */
index 2b22bb9..4756ef6 100644 (file)
@@ -46,6 +46,22 @@ static const struct {
        },
 };
 
+/* return pixels in terms of txbyteclkhs */
+static u16 txbyteclkhs(u16 pixels, int bpp, int lane_count,
+                      u16 burst_mode_ratio)
+{
+       return DIV_ROUND_UP(DIV_ROUND_UP(pixels * bpp * burst_mode_ratio,
+                                        8 * 100), lane_count);
+}
+
+/* return pixels equivalent to txbyteclkhs */
+static u16 pixels_from_txbyteclkhs(u16 clk_hs, int bpp, int lane_count,
+                       u16 burst_mode_ratio)
+{
+       return DIV_ROUND_UP((clk_hs * lane_count * 8 * 100),
+                                               (bpp * burst_mode_ratio));
+}
+
 enum mipi_dsi_pixel_format pixel_format_from_register_bits(u32 fmt)
 {
        /* It just so happens the VBT matches register contents. */
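
The two helpers added above are inverses only up to DIV_ROUND_UP, which is what forces the read-back comparison later in this file. A stand-alone sketch of the round trip with made-up numbers (DIV_ROUND_UP is redefined locally; this is illustration, not driver code):

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* pixels -> high-speed byte clocks, mirroring txbyteclkhs() above */
static unsigned int to_clkhs(unsigned int pixels, int bpp, int lanes, int ratio)
{
	return DIV_ROUND_UP(DIV_ROUND_UP(pixels * bpp * ratio, 8 * 100), lanes);
}

/* high-speed byte clocks -> pixels, mirroring pixels_from_txbyteclkhs() */
static unsigned int to_pixels(unsigned int clkhs, int bpp, int lanes, int ratio)
{
	return DIV_ROUND_UP(clkhs * lanes * 8 * 100, bpp * ratio);
}

int main(void)
{
	/* hypothetical: 42-pixel front porch, RGB888 (24 bpp), 4 lanes */
	unsigned int hfp = 42, clkhs, back;
	int bpp = 24, lanes = 4, ratio = 100;

	clkhs = to_clkhs(hfp, bpp, lanes, ratio);    /* 42 px -> 32 clkhs  */
	back  = to_pixels(clkhs, bpp, lanes, ratio); /* 32 clkhs -> 43 px  */
	printf("%u px -> %u clkhs -> %u px\n", hfp, clkhs, back);
	return 0;
}

The one-pixel error in this example is exactly the kind of discrepancy the bxt_dsi_get_pipe_config() hunk below cancels out by comparing the read-back value against the software state.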
@@ -780,10 +796,19 @@ static void bxt_dsi_get_pipe_config(struct intel_encoder *encoder,
        struct drm_i915_private *dev_priv = dev->dev_private;
        struct drm_display_mode *adjusted_mode =
                                        &pipe_config->base.adjusted_mode;
+       struct drm_display_mode *adjusted_mode_sw;
+       struct intel_crtc *intel_crtc;
        struct intel_dsi *intel_dsi = enc_to_intel_dsi(&encoder->base);
+       unsigned int lane_count = intel_dsi->lane_count;
        unsigned int bpp, fmt;
        enum port port;
-       u16 vfp, vsync, vbp;
+       u16 hactive, hfp, hsync, hbp, vfp, vsync, vbp;
+       u16 hfp_sw, hsync_sw, hbp_sw;
+       u16 crtc_htotal_sw, crtc_hsync_start_sw, crtc_hsync_end_sw,
+                               crtc_hblank_start_sw, crtc_hblank_end_sw;
+
+       intel_crtc = to_intel_crtc(encoder->base.crtc);
+       adjusted_mode_sw = &intel_crtc->config->base.adjusted_mode;
 
        /*
         * At least one port is active, as encoder->get_config is called only if
@@ -808,26 +833,118 @@ static void bxt_dsi_get_pipe_config(struct intel_encoder *encoder,
        adjusted_mode->crtc_vtotal =
                                I915_READ(BXT_MIPI_TRANS_VTOTAL(port));
 
+       hactive = adjusted_mode->crtc_hdisplay;
+       hfp = I915_READ(MIPI_HFP_COUNT(port));
+
        /*
-        * TODO: Retrieve hfp, hsync and hbp. Adjust them for dual link and
-        * calculate hsync_start, hsync_end, htotal and hblank_end
+        * Meaningful only for video mode with non-burst sync pulses;
+        * can be zero for non-burst sync events and burst modes.
         */
+       hsync = I915_READ(MIPI_HSYNC_PADDING_COUNT(port));
+       hbp = I915_READ(MIPI_HBP_COUNT(port));
+
+       /* horizontal values are in terms of high-speed byte clock */
+       hfp = pixels_from_txbyteclkhs(hfp, bpp, lane_count,
+                                               intel_dsi->burst_mode_ratio);
+       hsync = pixels_from_txbyteclkhs(hsync, bpp, lane_count,
+                                               intel_dsi->burst_mode_ratio);
+       hbp = pixels_from_txbyteclkhs(hbp, bpp, lane_count,
+                                               intel_dsi->burst_mode_ratio);
+
+       if (intel_dsi->dual_link) {
+               hfp *= 2;
+               hsync *= 2;
+               hbp *= 2;
+       }
 
        /* vertical values are in terms of lines */
        vfp = I915_READ(MIPI_VFP_COUNT(port));
        vsync = I915_READ(MIPI_VSYNC_PADDING_COUNT(port));
        vbp = I915_READ(MIPI_VBP_COUNT(port));
 
+       adjusted_mode->crtc_htotal = hactive + hfp + hsync + hbp;
+       adjusted_mode->crtc_hsync_start = hfp + adjusted_mode->crtc_hdisplay;
+       adjusted_mode->crtc_hsync_end = hsync + adjusted_mode->crtc_hsync_start;
        adjusted_mode->crtc_hblank_start = adjusted_mode->crtc_hdisplay;
+       adjusted_mode->crtc_hblank_end = adjusted_mode->crtc_htotal;
 
-       adjusted_mode->crtc_vsync_start =
-                               vfp + adjusted_mode->crtc_vdisplay;
-       adjusted_mode->crtc_vsync_end =
-                               vsync + adjusted_mode->crtc_vsync_start;
+       adjusted_mode->crtc_vsync_start = vfp + adjusted_mode->crtc_vdisplay;
+       adjusted_mode->crtc_vsync_end = vsync + adjusted_mode->crtc_vsync_start;
        adjusted_mode->crtc_vblank_start = adjusted_mode->crtc_vdisplay;
        adjusted_mode->crtc_vblank_end = adjusted_mode->crtc_vtotal;
-}
 
+       /*
+        * On BXT, some horizontal timings are programmed not in pixels but in
+        * txbyteclkhs, so converting them back to pixels introduces round-up
+        * errors. To compensate, take the software-state adjusted_mode,
+        * compute the value that would have been programmed to the port, and
+        * convert it back to pixels: this is the expected read-back value,
+        * including the round-up error. If it matches the value actually
+        * retrieved from the port, correct the hardware-state horizontal
+        * timings to the software-state values, nullifying the error.
+        */
+       /* Calculating the value programmed to the Port register */
+       hfp_sw = adjusted_mode_sw->crtc_hsync_start -
+                                       adjusted_mode_sw->crtc_hdisplay;
+       hsync_sw = adjusted_mode_sw->crtc_hsync_end -
+                                       adjusted_mode_sw->crtc_hsync_start;
+       hbp_sw = adjusted_mode_sw->crtc_htotal -
+                                       adjusted_mode_sw->crtc_hsync_end;
+
+       if (intel_dsi->dual_link) {
+               hfp_sw /= 2;
+               hsync_sw /= 2;
+               hbp_sw /= 2;
+       }
+
+       hfp_sw = txbyteclkhs(hfp_sw, bpp, lane_count,
+                                               intel_dsi->burst_mode_ratio);
+       hsync_sw = txbyteclkhs(hsync_sw, bpp, lane_count,
+                           intel_dsi->burst_mode_ratio);
+       hbp_sw = txbyteclkhs(hbp_sw, bpp, lane_count,
+                                               intel_dsi->burst_mode_ratio);
+
+       /* Reverse-calculate the adjusted mode parameters from port reg vals */
+       hfp_sw = pixels_from_txbyteclkhs(hfp_sw, bpp, lane_count,
+                                               intel_dsi->burst_mode_ratio);
+       hsync_sw = pixels_from_txbyteclkhs(hsync_sw, bpp, lane_count,
+                                               intel_dsi->burst_mode_ratio);
+       hbp_sw = pixels_from_txbyteclkhs(hbp_sw, bpp, lane_count,
+                                               intel_dsi->burst_mode_ratio);
+
+       if (intel_dsi->dual_link) {
+               hfp_sw *= 2;
+               hsync_sw *= 2;
+               hbp_sw *= 2;
+       }
+
+       crtc_htotal_sw = adjusted_mode_sw->crtc_hdisplay + hfp_sw +
+                                                       hsync_sw + hbp_sw;
+       crtc_hsync_start_sw = hfp_sw + adjusted_mode_sw->crtc_hdisplay;
+       crtc_hsync_end_sw = hsync_sw + crtc_hsync_start_sw;
+       crtc_hblank_start_sw = adjusted_mode_sw->crtc_hdisplay;
+       crtc_hblank_end_sw = crtc_htotal_sw;
+
+       if (adjusted_mode->crtc_htotal == crtc_htotal_sw)
+               adjusted_mode->crtc_htotal = adjusted_mode_sw->crtc_htotal;
+
+       if (adjusted_mode->crtc_hsync_start == crtc_hsync_start_sw)
+               adjusted_mode->crtc_hsync_start =
+                                       adjusted_mode_sw->crtc_hsync_start;
+
+       if (adjusted_mode->crtc_hsync_end == crtc_hsync_end_sw)
+               adjusted_mode->crtc_hsync_end =
+                                       adjusted_mode_sw->crtc_hsync_end;
+
+       if (adjusted_mode->crtc_hblank_start == crtc_hblank_start_sw)
+               adjusted_mode->crtc_hblank_start =
+                                       adjusted_mode_sw->crtc_hblank_start;
+
+       if (adjusted_mode->crtc_hblank_end == crtc_hblank_end_sw)
+               adjusted_mode->crtc_hblank_end =
+                                       adjusted_mode_sw->crtc_hblank_end;
+}
 
 static void intel_dsi_get_config(struct intel_encoder *encoder,
                                 struct intel_crtc_state *pipe_config)
@@ -891,14 +1008,6 @@ static u16 txclkesc(u32 divider, unsigned int us)
        }
 }
 
-/* return pixels in terms of txbyteclkhs */
-static u16 txbyteclkhs(u16 pixels, int bpp, int lane_count,
-                      u16 burst_mode_ratio)
-{
-       return DIV_ROUND_UP(DIV_ROUND_UP(pixels * bpp * burst_mode_ratio,
-                                        8 * 100), lane_count);
-}
-
 static void set_dsi_timings(struct drm_encoder *encoder,
                            const struct drm_display_mode *adjusted_mode)
 {
@@ -1436,6 +1545,9 @@ void intel_dsi_init(struct drm_device *dev)
                goto err;
        }
 
+       connector->display_info.width_mm = fixed_mode->width_mm;
+       connector->display_info.height_mm = fixed_mode->height_mm;
+
        intel_panel_init(&intel_connector->panel, fixed_mode, NULL);
 
        intel_dsi_add_properties(intel_connector);
index 2cdab73..a884470 100644 (file)
@@ -836,6 +836,22 @@ static void hsw_set_infoframes(struct drm_encoder *encoder,
        intel_hdmi_set_hdmi_infoframe(encoder, adjusted_mode);
 }
 
+void intel_dp_dual_mode_set_tmds_output(struct intel_hdmi *hdmi, bool enable)
+{
+       struct drm_i915_private *dev_priv = to_i915(intel_hdmi_to_dev(hdmi));
+       struct i2c_adapter *adapter =
+               intel_gmbus_get_adapter(dev_priv, hdmi->ddc_bus);
+
+       if (hdmi->dp_dual_mode.type < DRM_DP_DUAL_MODE_TYPE2_DVI)
+               return;
+
+       DRM_DEBUG_KMS("%s DP dual mode adaptor TMDS output\n",
+                     enable ? "Enabling" : "Disabling");
+
+       drm_dp_dual_mode_set_tmds_output(hdmi->dp_dual_mode.type,
+                                        adapter, enable);
+}
+
 static void intel_hdmi_prepare(struct intel_encoder *encoder)
 {
        struct drm_device *dev = encoder->base.dev;
@@ -845,6 +861,8 @@ static void intel_hdmi_prepare(struct intel_encoder *encoder)
        const struct drm_display_mode *adjusted_mode = &crtc->config->base.adjusted_mode;
        u32 hdmi_val;
 
+       intel_dp_dual_mode_set_tmds_output(intel_hdmi, true);
+
        hdmi_val = SDVO_ENCODING_HDMI;
        if (!HAS_PCH_SPLIT(dev) && crtc->config->limited_color_range)
                hdmi_val |= HDMI_COLOR_RANGE_16_235;
@@ -953,6 +971,8 @@ static void intel_hdmi_get_config(struct intel_encoder *encoder,
                dotclock /= pipe_config->pixel_multiplier;
 
        pipe_config->base.adjusted_mode.crtc_clock = dotclock;
+
+       pipe_config->lane_count = 4;
 }
 
 static void intel_enable_hdmi_audio(struct intel_encoder *encoder)
@@ -1140,6 +1160,8 @@ static void intel_disable_hdmi(struct intel_encoder *encoder)
        }
 
        intel_hdmi->set_infoframes(&encoder->base, false, NULL);
+
+       intel_dp_dual_mode_set_tmds_output(intel_hdmi, false);
 }
 
 static void g4x_disable_hdmi(struct intel_encoder *encoder)
@@ -1165,27 +1187,42 @@ static void pch_post_disable_hdmi(struct intel_encoder *encoder)
        intel_disable_hdmi(encoder);
 }
 
-static int hdmi_port_clock_limit(struct intel_hdmi *hdmi, bool respect_dvi_limit)
+static int intel_hdmi_source_max_tmds_clock(struct drm_i915_private *dev_priv)
 {
-       struct drm_device *dev = intel_hdmi_to_dev(hdmi);
-
-       if ((respect_dvi_limit && !hdmi->has_hdmi_sink) || IS_G4X(dev))
+       if (IS_G4X(dev_priv))
                return 165000;
-       else if (IS_HASWELL(dev) || INTEL_INFO(dev)->gen >= 8)
+       else if (IS_HASWELL(dev_priv) || INTEL_INFO(dev_priv)->gen >= 8)
                return 300000;
        else
                return 225000;
 }
 
+static int hdmi_port_clock_limit(struct intel_hdmi *hdmi,
+                                bool respect_downstream_limits)
+{
+       struct drm_device *dev = intel_hdmi_to_dev(hdmi);
+       int max_tmds_clock = intel_hdmi_source_max_tmds_clock(to_i915(dev));
+
+       if (respect_downstream_limits) {
+               if (hdmi->dp_dual_mode.max_tmds_clock)
+                       max_tmds_clock = min(max_tmds_clock,
+                                            hdmi->dp_dual_mode.max_tmds_clock);
+               if (!hdmi->has_hdmi_sink)
+                       max_tmds_clock = min(max_tmds_clock, 165000);
+       }
+
+       return max_tmds_clock;
+}
+
 static enum drm_mode_status
 hdmi_port_clock_valid(struct intel_hdmi *hdmi,
-                     int clock, bool respect_dvi_limit)
+                     int clock, bool respect_downstream_limits)
 {
        struct drm_device *dev = intel_hdmi_to_dev(hdmi);
 
        if (clock < 25000)
                return MODE_CLOCK_LOW;
-       if (clock > hdmi_port_clock_limit(hdmi, respect_dvi_limit))
+       if (clock > hdmi_port_clock_limit(hdmi, respect_downstream_limits))
                return MODE_CLOCK_HIGH;
 
        /* BXT DPLL can't generate 223-240 MHz */
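
With the dual mode adaptor limit folded in, the port clock limit becomes the minimum of the source limit, the adaptor's reported TMDS limit, and 165 MHz for DVI sinks. A small sketch of that clamping with invented numbers (not driver code):

#include <stdbool.h>
#include <stdio.h>

static int min_int(int a, int b) { return a < b ? a : b; }

int main(void)
{
	int source_max = 300000;     /* kHz, hypothetical HSW/gen8+ source    */
	int adaptor_max = 165000;    /* kHz, reported by a type 2 adaptor     */
	bool has_hdmi_sink = false;  /* DVI sink: 165 MHz cap also applies    */
	int max = source_max;

	/* respect_downstream_limits path of hdmi_port_clock_limit() above */
	if (adaptor_max)
		max = min_int(max, adaptor_max);
	if (!has_hdmi_sink)
		max = min_int(max, 165000);

	printf("max TMDS clock: %d kHz\n", max);   /* 165000 */
	return 0;
}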
@@ -1309,7 +1346,7 @@ bool intel_hdmi_compute_config(struct intel_encoder *encoder,
         * within limits.
         */
        if (pipe_config->pipe_bpp > 8*3 && pipe_config->has_hdmi_sink &&
-           hdmi_port_clock_valid(intel_hdmi, clock_12bpc, false) == MODE_OK &&
+           hdmi_port_clock_valid(intel_hdmi, clock_12bpc, true) == MODE_OK &&
            hdmi_12bpc_possible(pipe_config)) {
                DRM_DEBUG_KMS("picking bpc to 12 for HDMI output\n");
                desired_bpp = 12*3;
@@ -1337,6 +1374,8 @@ bool intel_hdmi_compute_config(struct intel_encoder *encoder,
        /* Set user selected PAR to incoming mode's member */
        adjusted_mode->picture_aspect_ratio = intel_hdmi->aspect_ratio;
 
+       pipe_config->lane_count = 4;
+
        return true;
 }
 
@@ -1349,10 +1388,57 @@ intel_hdmi_unset_edid(struct drm_connector *connector)
        intel_hdmi->has_audio = false;
        intel_hdmi->rgb_quant_range_selectable = false;
 
+       intel_hdmi->dp_dual_mode.type = DRM_DP_DUAL_MODE_NONE;
+       intel_hdmi->dp_dual_mode.max_tmds_clock = 0;
+
        kfree(to_intel_connector(connector)->detect_edid);
        to_intel_connector(connector)->detect_edid = NULL;
 }
 
+static void
+intel_hdmi_dp_dual_mode_detect(struct drm_connector *connector, bool has_edid)
+{
+       struct drm_i915_private *dev_priv = to_i915(connector->dev);
+       struct intel_hdmi *hdmi = intel_attached_hdmi(connector);
+       enum port port = hdmi_to_dig_port(hdmi)->port;
+       struct i2c_adapter *adapter =
+               intel_gmbus_get_adapter(dev_priv, hdmi->ddc_bus);
+       enum drm_dp_dual_mode_type type = drm_dp_dual_mode_detect(adapter);
+
+       /*
+        * Type 1 DVI adaptors are not required to implement any
+        * registers, so we can't always detect their presence.
+        * Ideally we should be able to check the state of the
+        * CONFIG1 pin, but no such luck on our hardware.
+        *
+        * The only method left to us is to check the VBT to see
+        * if the port is a dual mode capable DP port. But let's
+        * only do that when we successfully read the EDID, to avoid
+        * confusing log messages about DP dual mode adaptors when
+        * there's nothing connected to the port.
+        */
+       if (type == DRM_DP_DUAL_MODE_UNKNOWN) {
+               if (has_edid &&
+                   intel_bios_is_port_dp_dual_mode(dev_priv, port)) {
+                       DRM_DEBUG_KMS("Assuming DP dual mode adaptor presence based on VBT\n");
+                       type = DRM_DP_DUAL_MODE_TYPE1_DVI;
+               } else {
+                       type = DRM_DP_DUAL_MODE_NONE;
+               }
+       }
+
+       if (type == DRM_DP_DUAL_MODE_NONE)
+               return;
+
+       hdmi->dp_dual_mode.type = type;
+       hdmi->dp_dual_mode.max_tmds_clock =
+               drm_dp_dual_mode_max_tmds_clock(type, adapter);
+
+       DRM_DEBUG_KMS("DP dual mode adaptor (%s) detected (max TMDS clock: %d kHz)\n",
+                     drm_dp_get_dual_mode_type_name(type),
+                     hdmi->dp_dual_mode.max_tmds_clock);
+}
+
 static bool
 intel_hdmi_set_edid(struct drm_connector *connector, bool force)
 {
@@ -1368,6 +1454,8 @@ intel_hdmi_set_edid(struct drm_connector *connector, bool force)
                                    intel_gmbus_get_adapter(dev_priv,
                                    intel_hdmi->ddc_bus));
 
+               intel_hdmi_dp_dual_mode_detect(connector, edid != NULL);
+
                intel_display_power_put(dev_priv, POWER_DOMAIN_GMBUS);
        }
 
@@ -2054,6 +2142,9 @@ void intel_hdmi_init_connector(struct intel_digital_port *intel_dig_port,
        enum port port = intel_dig_port->port;
        uint8_t alternate_ddc_pin;
 
+       DRM_DEBUG_KMS("Adding HDMI connector on port %c\n",
+                     port_name(port));
+
        if (WARN(intel_dig_port->max_lanes < 4,
                 "Not enough lanes (%d) for HDMI on port %c\n",
                 intel_dig_port->max_lanes, port_name(port)))
index 6179b59..42eac37 100644 (file)
@@ -721,48 +721,6 @@ int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request
        return ret;
 }
 
-static int logical_ring_wait_for_space(struct drm_i915_gem_request *req,
-                                      int bytes)
-{
-       struct intel_ringbuffer *ringbuf = req->ringbuf;
-       struct intel_engine_cs *engine = req->engine;
-       struct drm_i915_gem_request *target;
-       unsigned space;
-       int ret;
-
-       if (intel_ring_space(ringbuf) >= bytes)
-               return 0;
-
-       /* The whole point of reserving space is to not wait! */
-       WARN_ON(ringbuf->reserved_in_use);
-
-       list_for_each_entry(target, &engine->request_list, list) {
-               /*
-                * The request queue is per-engine, so can contain requests
-                * from multiple ringbuffers. Here, we must ignore any that
-                * aren't from the ringbuffer we're considering.
-                */
-               if (target->ringbuf != ringbuf)
-                       continue;
-
-               /* Would completion of this request free enough space? */
-               space = __intel_ring_space(target->postfix, ringbuf->tail,
-                                          ringbuf->size);
-               if (space >= bytes)
-                       break;
-       }
-
-       if (WARN_ON(&target->list == &engine->request_list))
-               return -ENOSPC;
-
-       ret = i915_wait_request(target);
-       if (ret)
-               return ret;
-
-       ringbuf->space = space;
-       return 0;
-}
-
 /*
  * intel_logical_ring_advance_and_submit() - advance the tail and submit the workload
  * @request: Request to advance the logical ringbuffer of.
@@ -814,92 +772,6 @@ intel_logical_ring_advance_and_submit(struct drm_i915_gem_request *request)
        return 0;
 }
 
-static void __wrap_ring_buffer(struct intel_ringbuffer *ringbuf)
-{
-       uint32_t __iomem *virt;
-       int rem = ringbuf->size - ringbuf->tail;
-
-       virt = ringbuf->virtual_start + ringbuf->tail;
-       rem /= 4;
-       while (rem--)
-               iowrite32(MI_NOOP, virt++);
-
-       ringbuf->tail = 0;
-       intel_ring_update_space(ringbuf);
-}
-
-static int logical_ring_prepare(struct drm_i915_gem_request *req, int bytes)
-{
-       struct intel_ringbuffer *ringbuf = req->ringbuf;
-       int remain_usable = ringbuf->effective_size - ringbuf->tail;
-       int remain_actual = ringbuf->size - ringbuf->tail;
-       int ret, total_bytes, wait_bytes = 0;
-       bool need_wrap = false;
-
-       if (ringbuf->reserved_in_use)
-               total_bytes = bytes;
-       else
-               total_bytes = bytes + ringbuf->reserved_size;
-
-       if (unlikely(bytes > remain_usable)) {
-               /*
-                * Not enough space for the basic request. So need to flush
-                * out the remainder and then wait for base + reserved.
-                */
-               wait_bytes = remain_actual + total_bytes;
-               need_wrap = true;
-       } else {
-               if (unlikely(total_bytes > remain_usable)) {
-                       /*
-                        * The base request will fit but the reserved space
-                        * falls off the end. So don't need an immediate wrap
-                        * and only need to effectively wait for the reserved
-                        * size space from the start of ringbuffer.
-                        */
-                       wait_bytes = remain_actual + ringbuf->reserved_size;
-               } else if (total_bytes > ringbuf->space) {
-                       /* No wrapping required, just waiting. */
-                       wait_bytes = total_bytes;
-               }
-       }
-
-       if (wait_bytes) {
-               ret = logical_ring_wait_for_space(req, wait_bytes);
-               if (unlikely(ret))
-                       return ret;
-
-               if (need_wrap)
-                       __wrap_ring_buffer(ringbuf);
-       }
-
-       return 0;
-}
-
-/**
- * intel_logical_ring_begin() - prepare the logical ringbuffer to accept some commands
- *
- * @req: The request to start some new work for
- * @num_dwords: number of DWORDs that we plan to write to the ringbuffer.
- *
- * The ringbuffer might not be ready to accept the commands right away (maybe it needs to
- * be wrapped, or wait a bit for the tail to be updated). This function takes care of that
- * and also preallocates a request (every workload submission is still mediated through
- * requests, same as it did with legacy ringbuffer submission).
- *
- * Return: non-zero if the ringbuffer is not ready to be written to.
- */
-int intel_logical_ring_begin(struct drm_i915_gem_request *req, int num_dwords)
-{
-       int ret;
-
-       ret = logical_ring_prepare(req, num_dwords * sizeof(uint32_t));
-       if (ret)
-               return ret;
-
-       req->ringbuf->space -= num_dwords * sizeof(uint32_t);
-       return 0;
-}
-
 int intel_logical_ring_reserve_space(struct drm_i915_gem_request *request)
 {
        /*
@@ -912,7 +784,7 @@ int intel_logical_ring_reserve_space(struct drm_i915_gem_request *request)
         */
        intel_ring_reserved_space_reserve(request->ringbuf, MIN_SPACE_FOR_ADD_REQUEST);
 
-       return intel_logical_ring_begin(request, 0);
+       return intel_ring_begin(request, 0);
 }
 
 /**
@@ -982,7 +854,7 @@ int intel_execlists_submission(struct i915_execbuffer_params *params,
 
        if (engine == &dev_priv->engine[RCS] &&
            instp_mode != dev_priv->relative_constants_mode) {
-               ret = intel_logical_ring_begin(params->request, 4);
+               ret = intel_ring_begin(params->request, 4);
                if (ret)
                        return ret;
 
@@ -1178,7 +1050,7 @@ static int intel_logical_ring_workarounds_emit(struct drm_i915_gem_request *req)
        if (ret)
                return ret;
 
-       ret = intel_logical_ring_begin(req, w->count * 2 + 2);
+       ret = intel_ring_begin(req, w->count * 2 + 2);
        if (ret)
                return ret;
 
@@ -1669,7 +1541,7 @@ static int intel_logical_ring_emit_pdps(struct drm_i915_gem_request *req)
        const int num_lri_cmds = GEN8_LEGACY_PDPES * 2;
        int i, ret;
 
-       ret = intel_logical_ring_begin(req, num_lri_cmds * 2 + 2);
+       ret = intel_ring_begin(req, num_lri_cmds * 2 + 2);
        if (ret)
                return ret;
 
@@ -1716,7 +1588,7 @@ static int gen8_emit_bb_start(struct drm_i915_gem_request *req,
                req->ctx->ppgtt->pd_dirty_rings &= ~intel_engine_flag(req->engine);
        }
 
-       ret = intel_logical_ring_begin(req, 4);
+       ret = intel_ring_begin(req, 4);
        if (ret)
                return ret;
 
@@ -1778,7 +1650,7 @@ static int gen8_emit_flush(struct drm_i915_gem_request *request,
        uint32_t cmd;
        int ret;
 
-       ret = intel_logical_ring_begin(request, 4);
+       ret = intel_ring_begin(request, 4);
        if (ret)
                return ret;
 
@@ -1846,7 +1718,7 @@ static int gen8_emit_flush_render(struct drm_i915_gem_request *request,
                        vf_flush_wa = true;
        }
 
-       ret = intel_logical_ring_begin(request, vf_flush_wa ? 12 : 6);
+       ret = intel_ring_begin(request, vf_flush_wa ? 12 : 6);
        if (ret)
                return ret;
 
@@ -1920,7 +1792,7 @@ static int gen8_emit_request(struct drm_i915_gem_request *request)
        struct intel_ringbuffer *ringbuf = request->ringbuf;
        int ret;
 
-       ret = intel_logical_ring_begin(request, 6 + WA_TAIL_DWORDS);
+       ret = intel_ring_begin(request, 6 + WA_TAIL_DWORDS);
        if (ret)
                return ret;
 
@@ -1944,7 +1816,7 @@ static int gen8_emit_request_render(struct drm_i915_gem_request *request)
        struct intel_ringbuffer *ringbuf = request->ringbuf;
        int ret;
 
-       ret = intel_logical_ring_begin(request, 8 + WA_TAIL_DWORDS);
+       ret = intel_ring_begin(request, 8 + WA_TAIL_DWORDS);
        if (ret)
                return ret;
 
index 461f1ef..60a7385 100644 (file)
@@ -63,7 +63,6 @@ int intel_logical_ring_reserve_space(struct drm_i915_gem_request *request);
 void intel_logical_ring_stop(struct intel_engine_cs *engine);
 void intel_logical_ring_cleanup(struct intel_engine_cs *engine);
 int intel_logical_rings_init(struct drm_device *dev);
-int intel_logical_ring_begin(struct drm_i915_gem_request *req, int num_dwords);
 
 int logical_ring_flush_all_caches(struct drm_i915_gem_request *req);
 /**
index bc53c0d..96281e6 100644 (file)
@@ -1082,6 +1082,8 @@ void intel_lvds_init(struct drm_device *dev)
                fixed_mode = drm_mode_duplicate(dev, dev_priv->vbt.lfp_lvds_vbt_mode);
                if (fixed_mode) {
                        fixed_mode->type |= DRM_MODE_TYPE_PREFERRED;
+                       connector->display_info.width_mm = fixed_mode->width_mm;
+                       connector->display_info.height_mm = fixed_mode->height_mm;
                        goto out;
                }
        }
index 23b8545..6ba4bf7 100644 (file)
@@ -239,11 +239,9 @@ static int emit_mocs_control_table(struct drm_i915_gem_request *req,
        if (WARN_ON(table->size > GEN9_NUM_MOCS_ENTRIES))
                return -ENODEV;
 
-       ret = intel_logical_ring_begin(req, 2 + 2 * GEN9_NUM_MOCS_ENTRIES);
-       if (ret) {
-               DRM_DEBUG("intel_logical_ring_begin failed %d\n", ret);
+       ret = intel_ring_begin(req, 2 + 2 * GEN9_NUM_MOCS_ENTRIES);
+       if (ret)
                return ret;
-       }
 
        intel_logical_ring_emit(ringbuf,
                                MI_LOAD_REGISTER_IMM(GEN9_NUM_MOCS_ENTRIES));
@@ -305,11 +303,9 @@ static int emit_mocs_l3cc_table(struct drm_i915_gem_request *req,
        if (WARN_ON(table->size > GEN9_NUM_MOCS_ENTRIES))
                return -ENODEV;
 
-       ret = intel_logical_ring_begin(req, 2 + GEN9_NUM_MOCS_ENTRIES);
-       if (ret) {
-               DRM_DEBUG("intel_logical_ring_begin failed %d\n", ret);
+       ret = intel_ring_begin(req, 2 + GEN9_NUM_MOCS_ENTRIES);
+       if (ret)
                return ret;
-       }
 
        intel_logical_ring_emit(ringbuf,
                        MI_LOAD_REGISTER_IMM(GEN9_NUM_MOCS_ENTRIES / 2));
index a078876..8357d57 100644 (file)
@@ -1638,6 +1638,12 @@ static int pwm_setup_backlight(struct intel_connector *connector,
                return -ENODEV;
        }
 
+       /*
+        * FIXME: pwm_apply_args() should be removed when switching to
+        * the atomic PWM API.
+        */
+       pwm_apply_args(panel->backlight.pwm);
+
        retval = pwm_config(panel->backlight.pwm, CRC_PMIC_PWM_PERIOD_NS,
                            CRC_PMIC_PWM_PERIOD_NS);
        if (retval < 0) {
index 4b60005..a7ef45d 100644 (file)
@@ -3904,6 +3904,8 @@ static void ilk_pipe_wm_get_hw_state(struct drm_crtc *crtc)
        if (IS_HASWELL(dev) || IS_BROADWELL(dev))
                hw->wm_linetime[pipe] = I915_READ(PIPE_WM_LINETIME(pipe));
 
+       memset(active, 0, sizeof(*active));
+
        active->pipe_enabled = intel_crtc->active;
 
        if (active->pipe_enabled) {
index c3abae4..a788d1e 100644 (file)
@@ -280,7 +280,10 @@ static void hsw_psr_enable_source(struct intel_dp *intel_dp)
         * with the 5 or 6 idle patterns.
         */
        uint32_t idle_frames = max(6, dev_priv->vbt.psr.idle_frames);
-       uint32_t val = 0x0;
+       uint32_t val = EDP_PSR_ENABLE;
+
+       val |= max_sleep_time << EDP_PSR_MAX_SLEEP_TIME_SHIFT;
+       val |= idle_frames << EDP_PSR_IDLE_FRAME_SHIFT;
 
        if (IS_HASWELL(dev))
                val |= EDP_PSR_MIN_LINK_ENTRY_TIME_8_LINES;
@@ -288,14 +291,50 @@ static void hsw_psr_enable_source(struct intel_dp *intel_dp)
        if (dev_priv->psr.link_standby)
                val |= EDP_PSR_LINK_STANDBY;
 
-       I915_WRITE(EDP_PSR_CTL, val |
-                  max_sleep_time << EDP_PSR_MAX_SLEEP_TIME_SHIFT |
-                  idle_frames << EDP_PSR_IDLE_FRAME_SHIFT |
-                  EDP_PSR_ENABLE);
+       if (dev_priv->vbt.psr.tp1_wakeup_time > 5)
+               val |= EDP_PSR_TP1_TIME_2500us;
+       else if (dev_priv->vbt.psr.tp1_wakeup_time > 1)
+               val |= EDP_PSR_TP1_TIME_500us;
+       else if (dev_priv->vbt.psr.tp1_wakeup_time > 0)
+               val |= EDP_PSR_TP1_TIME_100us;
+       else
+               val |= EDP_PSR_TP1_TIME_0us;
+
+       if (dev_priv->vbt.psr.tp2_tp3_wakeup_time > 5)
+               val |= EDP_PSR_TP2_TP3_TIME_2500us;
+       else if (dev_priv->vbt.psr.tp2_tp3_wakeup_time > 1)
+               val |= EDP_PSR_TP2_TP3_TIME_500us;
+       else if (dev_priv->vbt.psr.tp2_tp3_wakeup_time > 0)
+               val |= EDP_PSR_TP2_TP3_TIME_100us;
+       else
+               val |= EDP_PSR_TP2_TP3_TIME_0us;
+
+       if (intel_dp_source_supports_hbr2(intel_dp) &&
+           drm_dp_tps3_supported(intel_dp->dpcd))
+               val |= EDP_PSR_TP1_TP3_SEL;
+       else
+               val |= EDP_PSR_TP1_TP2_SEL;
+
+       I915_WRITE(EDP_PSR_CTL, val);
+
+       if (!dev_priv->psr.psr2_support)
+               return;
+
+       /* FIXME: selective update is probably totally broken because it doesn't
+        * mesh at all with our frontbuffer tracking. And the hw alone isn't
+        * good enough. */
+       val = EDP_PSR2_ENABLE | EDP_SU_TRACK_ENABLE;
+
+       if (dev_priv->vbt.psr.tp2_tp3_wakeup_time > 5)
+               val |= EDP_PSR2_TP2_TIME_2500;
+       else if (dev_priv->vbt.psr.tp2_tp3_wakeup_time > 1)
+               val |= EDP_PSR2_TP2_TIME_500;
+       else if (dev_priv->vbt.psr.tp2_tp3_wakeup_time > 0)
+               val |= EDP_PSR2_TP2_TIME_100;
+       else
+               val |= EDP_PSR2_TP2_TIME_50;
 
-       if (dev_priv->psr.psr2_support)
-               I915_WRITE(EDP_PSR2_CTL, EDP_PSR2_ENABLE |
-                               EDP_SU_TRACK_ENABLE | EDP_PSR2_TP2_TIME_100);
+       I915_WRITE(EDP_PSR2_CTL, val);
 }
 
 static bool intel_psr_match_conditions(struct intel_dp *intel_dp)
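
The TP1 and TP2/TP3 wakeup-time handling added above is a plain threshold mapping from the VBT value to a register encoding. A hedged stand-alone sketch of the same mapping, with an invented enum standing in for the EDP_PSR_TP1_TIME_* bits:

#include <stdio.h>

/* Invented stand-ins for the EDP_PSR_TP1_TIME_* register encodings. */
enum tp1_time { TP1_0US, TP1_100US, TP1_500US, TP1_2500US };

static enum tp1_time tp1_time_from_vbt(int tp1_wakeup_time)
{
	if (tp1_wakeup_time > 5)
		return TP1_2500US;
	else if (tp1_wakeup_time > 1)
		return TP1_500US;
	else if (tp1_wakeup_time > 0)
		return TP1_100US;
	return TP1_0US;
}

int main(void)
{
	int samples[] = { 0, 1, 3, 9 }, i;

	/* expected: 0 -> 0us, 1 -> 100us, 3 -> 500us, 9 -> 2500us */
	for (i = 0; i < 4; i++)
		printf("vbt=%d -> tp1 code %d\n", samples[i],
		       tp1_time_from_vbt(samples[i]));
	return 0;
}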
index 245386e..04402bb 100644 (file)
@@ -53,12 +53,6 @@ void intel_ring_update_space(struct intel_ringbuffer *ringbuf)
                                            ringbuf->tail, ringbuf->size);
 }
 
-int intel_ring_space(struct intel_ringbuffer *ringbuf)
-{
-       intel_ring_update_space(ringbuf);
-       return ringbuf->space;
-}
-
 bool intel_engine_stopped(struct intel_engine_cs *engine)
 {
        struct drm_i915_private *dev_priv = engine->dev->dev_private;
@@ -1309,7 +1303,7 @@ static int gen8_rcs_signal(struct drm_i915_gem_request *signaller_req,
                intel_ring_emit(signaller, seqno);
                intel_ring_emit(signaller, 0);
                intel_ring_emit(signaller, MI_SEMAPHORE_SIGNAL |
-                                          MI_SEMAPHORE_TARGET(waiter->id));
+                                          MI_SEMAPHORE_TARGET(waiter->hw_id));
                intel_ring_emit(signaller, 0);
        }
 
@@ -1349,7 +1343,7 @@ static int gen8_xcs_signal(struct drm_i915_gem_request *signaller_req,
                intel_ring_emit(signaller, upper_32_bits(gtt_offset));
                intel_ring_emit(signaller, seqno);
                intel_ring_emit(signaller, MI_SEMAPHORE_SIGNAL |
-                                          MI_SEMAPHORE_TARGET(waiter->id));
+                                          MI_SEMAPHORE_TARGET(waiter->hw_id));
                intel_ring_emit(signaller, 0);
        }
 
@@ -1573,6 +1567,8 @@ pc_render_add_request(struct drm_i915_gem_request *req)
 static void
 gen6_seqno_barrier(struct intel_engine_cs *engine)
 {
+       struct drm_i915_private *dev_priv = engine->dev->dev_private;
+
        /* Workaround to force correct ordering between irq and seqno writes on
         * ivb (and maybe also on snb) by reading from a CS register (like
         * ACTHD) before reading the status page.
@@ -1584,9 +1580,13 @@ gen6_seqno_barrier(struct intel_engine_cs *engine)
         * the write time to land, but that would incur a delay after every
         * batch i.e. much more frequent than a delay when waiting for the
         * interrupt (with the same net latency).
+        *
+        * Also note that to prevent whole machine hangs on gen7, we have to
+        * take the spinlock to guard against concurrent cacheline access.
         */
-       struct drm_i915_private *dev_priv = engine->dev->dev_private;
+       spin_lock_irq(&dev_priv->uncore.lock);
        POSTING_READ_FW(RING_ACTHD(engine->mmio_base));
+       spin_unlock_irq(&dev_priv->uncore.lock);
 }
 
 static u32
@@ -2312,51 +2312,6 @@ void intel_cleanup_engine(struct intel_engine_cs *engine)
        engine->dev = NULL;
 }
 
-static int ring_wait_for_space(struct intel_engine_cs *engine, int n)
-{
-       struct intel_ringbuffer *ringbuf = engine->buffer;
-       struct drm_i915_gem_request *request;
-       unsigned space;
-       int ret;
-
-       if (intel_ring_space(ringbuf) >= n)
-               return 0;
-
-       /* The whole point of reserving space is to not wait! */
-       WARN_ON(ringbuf->reserved_in_use);
-
-       list_for_each_entry(request, &engine->request_list, list) {
-               space = __intel_ring_space(request->postfix, ringbuf->tail,
-                                          ringbuf->size);
-               if (space >= n)
-                       break;
-       }
-
-       if (WARN_ON(&request->list == &engine->request_list))
-               return -ENOSPC;
-
-       ret = i915_wait_request(request);
-       if (ret)
-               return ret;
-
-       ringbuf->space = space;
-       return 0;
-}
-
-static void __wrap_ring_buffer(struct intel_ringbuffer *ringbuf)
-{
-       uint32_t __iomem *virt;
-       int rem = ringbuf->size - ringbuf->tail;
-
-       virt = ringbuf->virtual_start + ringbuf->tail;
-       rem /= 4;
-       while (rem--)
-               iowrite32(MI_NOOP, virt++);
-
-       ringbuf->tail = 0;
-       intel_ring_update_space(ringbuf);
-}
-
 int intel_engine_idle(struct intel_engine_cs *engine)
 {
        struct drm_i915_gem_request *req;
@@ -2398,63 +2353,82 @@ int intel_ring_reserve_space(struct drm_i915_gem_request *request)
 
 void intel_ring_reserved_space_reserve(struct intel_ringbuffer *ringbuf, int size)
 {
-       WARN_ON(ringbuf->reserved_size);
-       WARN_ON(ringbuf->reserved_in_use);
-
+       GEM_BUG_ON(ringbuf->reserved_size);
        ringbuf->reserved_size = size;
 }
 
 void intel_ring_reserved_space_cancel(struct intel_ringbuffer *ringbuf)
 {
-       WARN_ON(ringbuf->reserved_in_use);
-
+       GEM_BUG_ON(!ringbuf->reserved_size);
        ringbuf->reserved_size   = 0;
-       ringbuf->reserved_in_use = false;
 }
 
 void intel_ring_reserved_space_use(struct intel_ringbuffer *ringbuf)
 {
-       WARN_ON(ringbuf->reserved_in_use);
-
-       ringbuf->reserved_in_use = true;
-       ringbuf->reserved_tail   = ringbuf->tail;
+       GEM_BUG_ON(!ringbuf->reserved_size);
+       ringbuf->reserved_size   = 0;
 }
 
 void intel_ring_reserved_space_end(struct intel_ringbuffer *ringbuf)
 {
-       WARN_ON(!ringbuf->reserved_in_use);
-       if (ringbuf->tail > ringbuf->reserved_tail) {
-               WARN(ringbuf->tail > ringbuf->reserved_tail + ringbuf->reserved_size,
-                    "request reserved size too small: %d vs %d!\n",
-                    ringbuf->tail - ringbuf->reserved_tail, ringbuf->reserved_size);
-       } else {
+       GEM_BUG_ON(ringbuf->reserved_size);
+}
+
+static int wait_for_space(struct drm_i915_gem_request *req, int bytes)
+{
+       struct intel_ringbuffer *ringbuf = req->ringbuf;
+       struct intel_engine_cs *engine = req->engine;
+       struct drm_i915_gem_request *target;
+
+       intel_ring_update_space(ringbuf);
+       if (ringbuf->space >= bytes)
+               return 0;
+
+       /*
+        * Space is reserved in the ringbuffer for finalising the request,
+        * as that cannot be allowed to fail. During request finalisation,
+        * reserved_size is set to 0 to stop the overallocation and the
+        * assumption is that then we never need to wait (which has the
+        * risk of failing with EINTR).
+        *
+        * See also i915_gem_request_alloc() and i915_add_request().
+        */
+       GEM_BUG_ON(!ringbuf->reserved_size);
+
+       list_for_each_entry(target, &engine->request_list, list) {
+               unsigned space;
+
                /*
-                * The ring was wrapped while the reserved space was in use.
-                * That means that some unknown amount of the ring tail was
-                * no-op filled and skipped. Thus simply adding the ring size
-                * to the tail and doing the above space check will not work.
-                * Rather than attempt to track how much tail was skipped,
-                * it is much simpler to say that also skipping the sanity
-                * check every once in a while is not a big issue.
+                * The request queue is per-engine, so can contain requests
+                * from multiple ringbuffers. Here, we must ignore any that
+                * aren't from the ringbuffer we're considering.
                 */
+               if (target->ringbuf != ringbuf)
+                       continue;
+
+               /* Would completion of this request free enough space? */
+               space = __intel_ring_space(target->postfix, ringbuf->tail,
+                                          ringbuf->size);
+               if (space >= bytes)
+                       break;
        }
 
-       ringbuf->reserved_size   = 0;
-       ringbuf->reserved_in_use = false;
+       if (WARN_ON(&target->list == &engine->request_list))
+               return -ENOSPC;
+
+       return i915_wait_request(target);
 }
 
-static int __intel_ring_prepare(struct intel_engine_cs *engine, int bytes)
+int intel_ring_begin(struct drm_i915_gem_request *req, int num_dwords)
 {
-       struct intel_ringbuffer *ringbuf = engine->buffer;
-       int remain_usable = ringbuf->effective_size - ringbuf->tail;
+       struct intel_ringbuffer *ringbuf = req->ringbuf;
        int remain_actual = ringbuf->size - ringbuf->tail;
-       int ret, total_bytes, wait_bytes = 0;
+       int remain_usable = ringbuf->effective_size - ringbuf->tail;
+       int bytes = num_dwords * sizeof(u32);
+       int total_bytes, wait_bytes;
        bool need_wrap = false;
 
-       if (ringbuf->reserved_in_use)
-               total_bytes = bytes;
-       else
-               total_bytes = bytes + ringbuf->reserved_size;
+       total_bytes = bytes + ringbuf->reserved_size;
 
        if (unlikely(bytes > remain_usable)) {
                /*
@@ -2463,44 +2437,42 @@ static int __intel_ring_prepare(struct intel_engine_cs *engine, int bytes)
                 */
                wait_bytes = remain_actual + total_bytes;
                need_wrap = true;
+       } else if (unlikely(total_bytes > remain_usable)) {
+               /*
+                * The base request will fit but the reserved space
+                * falls off the end. So we don't need an immediate wrap
+                * and only need to effectively wait for the reserved
+                * size space from the start of ringbuffer.
+                */
+               wait_bytes = remain_actual + ringbuf->reserved_size;
        } else {
-               if (unlikely(total_bytes > remain_usable)) {
-                       /*
-                        * The base request will fit but the reserved space
-                        * falls off the end. So don't need an immediate wrap
-                        * and only need to effectively wait for the reserved
-                        * size space from the start of ringbuffer.
-                        */
-                       wait_bytes = remain_actual + ringbuf->reserved_size;
-               } else if (total_bytes > ringbuf->space) {
-                       /* No wrapping required, just waiting. */
-                       wait_bytes = total_bytes;
-               }
+               /* No wrapping required, just waiting. */
+               wait_bytes = total_bytes;
        }
 
-       if (wait_bytes) {
-               ret = ring_wait_for_space(engine, wait_bytes);
+       if (wait_bytes > ringbuf->space) {
+               int ret = wait_for_space(req, wait_bytes);
                if (unlikely(ret))
                        return ret;
 
-               if (need_wrap)
-                       __wrap_ring_buffer(ringbuf);
+               intel_ring_update_space(ringbuf);
+               if (unlikely(ringbuf->space < wait_bytes))
+                       return -EAGAIN;
        }
 
-       return 0;
-}
+       if (unlikely(need_wrap)) {
+               GEM_BUG_ON(remain_actual > ringbuf->space);
+               GEM_BUG_ON(ringbuf->tail + remain_actual > ringbuf->size);
 
-int intel_ring_begin(struct drm_i915_gem_request *req,
-                    int num_dwords)
-{
-       struct intel_engine_cs *engine = req->engine;
-       int ret;
-
-       ret = __intel_ring_prepare(engine, num_dwords * sizeof(uint32_t));
-       if (ret)
-               return ret;
+               /* Fill the tail with MI_NOOP */
+               memset(ringbuf->virtual_start + ringbuf->tail,
+                      0, remain_actual);
+               ringbuf->tail = 0;
+               ringbuf->space -= remain_actual;
+       }
 
-       engine->buffer->space -= num_dwords * sizeof(uint32_t);
+       ringbuf->space -= bytes;
+       GEM_BUG_ON(ringbuf->space < 0);
        return 0;
 }
 
@@ -2772,6 +2744,7 @@ int intel_init_render_ring_buffer(struct drm_device *dev)
        engine->name = "render ring";
        engine->id = RCS;
        engine->exec_id = I915_EXEC_RENDER;
+       engine->hw_id = 0;
        engine->mmio_base = RENDER_RING_BASE;
 
        if (INTEL_INFO(dev)->gen >= 8) {
@@ -2923,6 +2896,7 @@ int intel_init_bsd_ring_buffer(struct drm_device *dev)
        engine->name = "bsd ring";
        engine->id = VCS;
        engine->exec_id = I915_EXEC_BSD;
+       engine->hw_id = 1;
 
        engine->write_tail = ring_write_tail;
        if (INTEL_INFO(dev)->gen >= 6) {
@@ -3001,6 +2975,7 @@ int intel_init_bsd2_ring_buffer(struct drm_device *dev)
        engine->name = "bsd2 ring";
        engine->id = VCS2;
        engine->exec_id = I915_EXEC_BSD;
+       engine->hw_id = 4;
 
        engine->write_tail = ring_write_tail;
        engine->mmio_base = GEN8_BSD2_RING_BASE;
@@ -3033,6 +3008,7 @@ int intel_init_blt_ring_buffer(struct drm_device *dev)
        engine->name = "blitter ring";
        engine->id = BCS;
        engine->exec_id = I915_EXEC_BLT;
+       engine->hw_id = 2;
 
        engine->mmio_base = BLT_RING_BASE;
        engine->write_tail = ring_write_tail;
@@ -3092,6 +3068,7 @@ int intel_init_vebox_ring_buffer(struct drm_device *dev)
        engine->name = "video enhancement ring";
        engine->id = VECS;
        engine->exec_id = I915_EXEC_VEBOX;
+       engine->hw_id = 3;
 
        engine->mmio_base = VEBOX_RING_BASE;
        engine->write_tail = ring_write_tail;
index 2ade194..ff12648 100644 (file)
@@ -108,8 +108,6 @@ struct intel_ringbuffer {
        int size;
        int effective_size;
        int reserved_size;
-       int reserved_tail;
-       bool reserved_in_use;
 
        /** We track the position of the requests in the ring buffer, and
         * when each is retired we increment last_retired_head as the GPU
@@ -156,7 +154,8 @@ struct  intel_engine_cs {
 #define I915_NUM_ENGINES 5
 #define _VCS(n) (VCS + (n))
        unsigned int exec_id;
-       unsigned int guc_id;
+       unsigned int hw_id;
+       unsigned int guc_id; /* XXX same as hw_id? */
        u32             mmio_base;
        struct          drm_device *dev;
        struct intel_ringbuffer *buffer;
@@ -459,7 +458,6 @@ static inline void intel_ring_advance(struct intel_engine_cs *engine)
 }
 int __intel_ring_space(int head, int tail, int size);
 void intel_ring_update_space(struct intel_ringbuffer *ringbuf);
-int intel_ring_space(struct intel_ringbuffer *ringbuf);
 bool intel_engine_stopped(struct intel_engine_cs *engine);
 
 int __must_check intel_engine_idle(struct intel_engine_cs *engine);
index 9ff1e96..44fb0b3 100644 (file)
@@ -403,9 +403,10 @@ struct lvds_dvo_timing {
        u8 vsync_off:4;
        u8 rsvd0:6;
        u8 hsync_off_hi:2;
-       u8 h_image;
-       u8 v_image;
-       u8 max_hv;
+       u8 himage_lo;
+       u8 vimage_lo;
+       u8 vimage_hi:4;
+       u8 himage_hi:4;
        u8 h_border;
        u8 v_border;
        u8 rsvd1:3;
@@ -740,6 +741,7 @@ struct bdb_psr {
 #define         DEVICE_TYPE_INT_TV     0x1009
 #define         DEVICE_TYPE_HDMI       0x60D2
 #define         DEVICE_TYPE_DP         0x68C6
+#define         DEVICE_TYPE_DP_DUAL_MODE       0x60D6
 #define         DEVICE_TYPE_eDP        0x78C6
 
 #define  DEVICE_TYPE_CLASS_EXTENSION   (1 << 15)
@@ -774,6 +776,17 @@ struct bdb_psr {
         DEVICE_TYPE_DISPLAYPORT_OUTPUT | \
         DEVICE_TYPE_ANALOG_OUTPUT)
 
+#define DEVICE_TYPE_DP_DUAL_MODE_BITS \
+       (DEVICE_TYPE_INTERNAL_CONNECTOR | \
+        DEVICE_TYPE_MIPI_OUTPUT | \
+        DEVICE_TYPE_COMPOSITE_OUTPUT | \
+        DEVICE_TYPE_LVDS_SINGALING | \
+        DEVICE_TYPE_TMDS_DVI_SIGNALING | \
+        DEVICE_TYPE_VIDEO_SIGNALING | \
+        DEVICE_TYPE_DISPLAYPORT_OUTPUT | \
+        DEVICE_TYPE_DIGITAL_OUTPUT | \
+        DEVICE_TYPE_ANALOG_OUTPUT)
+
 /* define the DVO port for HDMI output type */
 #define                DVO_B           1
 #define                DVO_C           2
index 1080019..8265665 100644 (file)
@@ -25,6 +25,7 @@
 #include <drm/drm_fb_cma_helper.h>
 #include <drm/drm_plane_helper.h>
 #include <drm/drm_of.h>
+#include <video/imx-ipu-v3.h>
 
 #include "imx-drm.h"
 
@@ -96,8 +97,8 @@ static struct imx_drm_crtc *imx_drm_find_crtc(struct drm_crtc *crtc)
        return NULL;
 }
 
-int imx_drm_set_bus_format_pins(struct drm_encoder *encoder, u32 bus_format,
-               int hsync_pin, int vsync_pin)
+int imx_drm_set_bus_config(struct drm_encoder *encoder, u32 bus_format,
+               int hsync_pin, int vsync_pin, u32 bus_flags)
 {
        struct imx_drm_crtc_helper_funcs *helper;
        struct imx_drm_crtc *imx_crtc;
@@ -109,14 +110,17 @@ int imx_drm_set_bus_format_pins(struct drm_encoder *encoder, u32 bus_format,
        helper = &imx_crtc->imx_drm_helper_funcs;
        if (helper->set_interface_pix_fmt)
                return helper->set_interface_pix_fmt(encoder->crtc,
-                                       bus_format, hsync_pin, vsync_pin);
+                                       bus_format, hsync_pin, vsync_pin,
+                                       bus_flags);
        return 0;
 }
-EXPORT_SYMBOL_GPL(imx_drm_set_bus_format_pins);
+EXPORT_SYMBOL_GPL(imx_drm_set_bus_config);
 
 int imx_drm_set_bus_format(struct drm_encoder *encoder, u32 bus_format)
 {
-       return imx_drm_set_bus_format_pins(encoder, bus_format, 2, 3);
+       return imx_drm_set_bus_config(encoder, bus_format, 2, 3,
+                                     DRM_BUS_FLAG_DE_HIGH |
+                                     DRM_BUS_FLAG_PIXDATA_NEGEDGE);
 }
 EXPORT_SYMBOL_GPL(imx_drm_set_bus_format);
 
@@ -437,6 +441,13 @@ static int compare_of(struct device *dev, void *data)
 {
        struct device_node *np = data;
 
+       /* Special case for DI, dev->of_node may not be set yet */
+       if (strcmp(dev->driver->name, "imx-ipuv3-crtc") == 0) {
+               struct ipu_client_platformdata *pdata = dev->platform_data;
+
+               return pdata->of_node == np;
+       }
+
        /* Special case for LDB, one device for two channels */
        if (of_node_cmp(np->name, "lvds-channel") == 0) {
                np = of_get_parent(np);
index b0241b9..74320a1 100644 (file)
@@ -19,7 +19,8 @@ struct imx_drm_crtc_helper_funcs {
        int (*enable_vblank)(struct drm_crtc *crtc);
        void (*disable_vblank)(struct drm_crtc *crtc);
        int (*set_interface_pix_fmt)(struct drm_crtc *crtc,
-                       u32 bus_format, int hsync_pin, int vsync_pin);
+                       u32 bus_format, int hsync_pin, int vsync_pin,
+                       u32 bus_flags);
        const struct drm_crtc_helper_funcs *crtc_helper_funcs;
        const struct drm_crtc_funcs *crtc_funcs;
 };
@@ -41,8 +42,8 @@ void imx_drm_mode_config_init(struct drm_device *drm);
 
 struct drm_gem_cma_object *imx_drm_fb_get_obj(struct drm_framebuffer *fb);
 
-int imx_drm_set_bus_format_pins(struct drm_encoder *encoder,
-               u32 bus_format, int hsync_pin, int vsync_pin);
+int imx_drm_set_bus_config(struct drm_encoder *encoder, u32 bus_format,
+               int hsync_pin, int vsync_pin, u32 bus_flags);
 int imx_drm_set_bus_format(struct drm_encoder *encoder,
                u32 bus_format);
 
index a58eee5..beff793 100644 (file)
@@ -25,6 +25,7 @@
 #include <linux/mfd/syscon/imx6q-iomuxc-gpr.h>
 #include <linux/of_device.h>
 #include <linux/of_graph.h>
+#include <video/of_display_timing.h>
 #include <video/of_videomode.h>
 #include <linux/regmap.h>
 #include <linux/videodev2.h>
@@ -59,6 +60,7 @@ struct imx_ldb_channel {
        struct drm_encoder encoder;
        struct drm_panel *panel;
        struct device_node *child;
+       struct i2c_adapter *ddc;
        int chno;
        void *edid;
        int edid_len;
@@ -107,6 +109,9 @@ static int imx_ldb_connector_get_modes(struct drm_connector *connector)
                        return num_modes;
        }
 
+       if (!imx_ldb_ch->edid && imx_ldb_ch->ddc)
+               imx_ldb_ch->edid = drm_get_edid(connector, imx_ldb_ch->ddc);
+
        if (imx_ldb_ch->edid) {
                drm_mode_connector_update_edid_property(connector,
                                                        imx_ldb_ch->edid);
@@ -553,7 +558,8 @@ static int imx_ldb_bind(struct device *dev, struct device *master, void *data)
 
        for_each_child_of_node(np, child) {
                struct imx_ldb_channel *channel;
-               struct device_node *port;
+               struct device_node *ddc_node;
+               struct device_node *ep;
 
                ret = of_property_read_u32(child, "reg", &i);
                if (ret || i < 0 || i > 1)
@@ -576,33 +582,54 @@ static int imx_ldb_bind(struct device *dev, struct device *master, void *data)
                 * The output port is port@4 with an external 4-port mux or
                 * port@2 with the internal 2-port mux.
                 */
-               port = of_graph_get_port_by_id(child, imx_ldb->lvds_mux ? 4 : 2);
-               if (port) {
-                       struct device_node *endpoint, *remote;
-
-                       endpoint = of_get_child_by_name(port, "endpoint");
-                       if (endpoint) {
-                               remote = of_graph_get_remote_port_parent(endpoint);
-                               if (remote)
-                                       channel->panel = of_drm_find_panel(remote);
-                               else
-                                       return -EPROBE_DEFER;
-                               if (!channel->panel) {
-                                       dev_err(dev, "panel not found: %s\n",
-                                               remote->full_name);
-                                       return -EPROBE_DEFER;
-                               }
+               ep = of_graph_get_endpoint_by_regs(child,
+                                                  imx_ldb->lvds_mux ? 4 : 2,
+                                                  -1);
+               if (ep) {
+                       struct device_node *remote;
+
+                       remote = of_graph_get_remote_port_parent(ep);
+                       of_node_put(ep);
+                       if (remote)
+                               channel->panel = of_drm_find_panel(remote);
+                       else
+                               return -EPROBE_DEFER;
+                       of_node_put(remote);
+                       if (!channel->panel) {
+                               dev_err(dev, "panel not found: %s\n",
+                                       remote->full_name);
+                               return -EPROBE_DEFER;
                        }
                }
 
-               edidp = of_get_property(child, "edid", &channel->edid_len);
-               if (edidp) {
-                       channel->edid = kmemdup(edidp, channel->edid_len,
-                                               GFP_KERNEL);
-               } else if (!channel->panel) {
-                       ret = of_get_drm_display_mode(child, &channel->mode, 0);
-                       if (!ret)
-                               channel->mode_valid = 1;
+               ddc_node = of_parse_phandle(child, "ddc-i2c-bus", 0);
+               if (ddc_node) {
+                       channel->ddc = of_find_i2c_adapter_by_node(ddc_node);
+                       of_node_put(ddc_node);
+                       if (!channel->ddc) {
+                               dev_warn(dev, "failed to get ddc i2c adapter\n");
+                               return -EPROBE_DEFER;
+                       }
+               }
+
+               if (!channel->ddc) {
+                       /* if no DDC available, fallback to hardcoded EDID */
+                       dev_dbg(dev, "no ddc available\n");
+
+                       edidp = of_get_property(child, "edid",
+                                               &channel->edid_len);
+                       if (edidp) {
+                               channel->edid = kmemdup(edidp,
+                                                       channel->edid_len,
+                                                       GFP_KERNEL);
+                       } else if (!channel->panel) {
+                               /* fallback to display-timings node */
+                               ret = of_get_drm_display_mode(child,
+                                                             &channel->mode,
+                                                             OF_USE_NATIVE_MODE);
+                               if (!ret)
+                                       channel->mode_valid = 1;
+                       }
                }
 
                channel->bus_format = of_get_bus_format(dev, child);
@@ -647,6 +674,7 @@ static void imx_ldb_unbind(struct device *dev, struct device *master,
                channel->encoder.funcs->destroy(&channel->encoder);
 
                kfree(channel->edid);
+               i2c_put_adapter(channel->ddc);
        }
 }
 
index ae7a9fb..baf7881 100644 (file)
@@ -294,8 +294,10 @@ static void imx_tve_encoder_prepare(struct drm_encoder *encoder)
 
        switch (tve->mode) {
        case TVE_MODE_VGA:
-               imx_drm_set_bus_format_pins(encoder, MEDIA_BUS_FMT_GBR888_1X24,
-                                           tve->hsync_pin, tve->vsync_pin);
+               imx_drm_set_bus_config(encoder, MEDIA_BUS_FMT_GBR888_1X24,
+                                      tve->hsync_pin, tve->vsync_pin,
+                                      DRM_BUS_FLAG_DE_HIGH |
+                                      DRM_BUS_FLAG_PIXDATA_NEGEDGE);
                break;
        case TVE_MODE_TVOUT:
                imx_drm_set_bus_format(encoder, MEDIA_BUS_FMT_YUV8_1X24);
index dee8e8b..fc04041 100644 (file)
@@ -66,6 +66,7 @@ struct ipu_crtc {
        struct ipu_flip_work    *flip_work;
        int                     irq;
        u32                     bus_format;
+       u32                     bus_flags;
        int                     di_hsync_pin;
        int                     di_vsync_pin;
 };
@@ -271,8 +272,10 @@ static int ipu_crtc_mode_set(struct drm_crtc *crtc,
        else
                sig_cfg.clkflags = 0;
 
-       sig_cfg.enable_pol = 1;
-       sig_cfg.clk_pol = 0;
+       sig_cfg.enable_pol = !(ipu_crtc->bus_flags & DRM_BUS_FLAG_DE_LOW);
+       /* Default to driving pixel data on negative clock edges */
+       sig_cfg.clk_pol = !!(ipu_crtc->bus_flags &
+                            DRM_BUS_FLAG_PIXDATA_POSEDGE);
        sig_cfg.bus_format = ipu_crtc->bus_format;
        sig_cfg.v_to_h_sync = 0;
        sig_cfg.hsync_pin = ipu_crtc->di_hsync_pin;
@@ -396,11 +399,12 @@ static void ipu_disable_vblank(struct drm_crtc *crtc)
 }
 
 static int ipu_set_interface_pix_fmt(struct drm_crtc *crtc,
-               u32 bus_format, int hsync_pin, int vsync_pin)
+               u32 bus_format, int hsync_pin, int vsync_pin, u32 bus_flags)
 {
        struct ipu_crtc *ipu_crtc = to_ipu_crtc(crtc);
 
        ipu_crtc->bus_format = bus_format;
+       ipu_crtc->bus_flags = bus_flags;
        ipu_crtc->di_hsync_pin = hsync_pin;
        ipu_crtc->di_vsync_pin = vsync_pin;
 
@@ -473,7 +477,7 @@ static int ipu_crtc_init(struct ipu_crtc *ipu_crtc,
 
        ret = imx_drm_add_crtc(drm, &ipu_crtc->base, &ipu_crtc->imx_crtc,
                        &ipu_crtc->plane[0]->base, &ipu_crtc_helper_funcs,
-                       ipu_crtc->dev->of_node);
+                       pdata->of_node);
        if (ret) {
                dev_err(ipu_crtc->dev, "adding crtc failed with %d.\n", ret);
                goto err_put_resources;
index 681ec6e..a4bb441 100644 (file)
@@ -38,6 +38,8 @@ static const uint32_t ipu_plane_formats[] = {
        DRM_FORMAT_RGBX8888,
        DRM_FORMAT_BGRA8888,
        DRM_FORMAT_BGRA8888,
+       DRM_FORMAT_UYVY,
+       DRM_FORMAT_VYUY,
        DRM_FORMAT_YUYV,
        DRM_FORMAT_YVYU,
        DRM_FORMAT_YUV420,
@@ -428,7 +430,6 @@ static int ipu_update_plane(struct drm_plane *plane, struct drm_crtc *crtc,
        if (crtc != plane->crtc)
                dev_dbg(plane->dev->dev, "crtc change: %p -> %p\n",
                                plane->crtc, crtc);
-       plane->crtc = crtc;
 
        if (!ipu_plane->enabled)
                ipu_plane_enable(ipu_plane);
@@ -461,7 +462,7 @@ static void ipu_plane_destroy(struct drm_plane *plane)
        kfree(ipu_plane);
 }
 
-static struct drm_plane_funcs ipu_plane_funcs = {
+static const struct drm_plane_funcs ipu_plane_funcs = {
        .update_plane   = ipu_update_plane,
        .disable_plane  = ipu_disable_plane,
        .destroy        = ipu_plane_destroy,
index 363e2c7..2d1fd02 100644 (file)
@@ -35,7 +35,6 @@ struct imx_parallel_display {
        void *edid;
        int edid_len;
        u32 bus_format;
-       int mode_valid;
        struct drm_display_mode mode;
        struct drm_panel *panel;
 };
@@ -68,17 +67,6 @@ static int imx_pd_connector_get_modes(struct drm_connector *connector)
                num_modes = drm_add_edid_modes(connector, imxpd->edid);
        }
 
-       if (imxpd->mode_valid) {
-               struct drm_display_mode *mode = drm_mode_create(connector->dev);
-
-               if (!mode)
-                       return -EINVAL;
-               drm_mode_copy(mode, &imxpd->mode);
-               mode->type |= DRM_MODE_TYPE_DRIVER | DRM_MODE_TYPE_PREFERRED,
-               drm_mode_probed_add(connector, mode);
-               num_modes++;
-       }
-
        if (np) {
                struct drm_display_mode *mode = drm_mode_create(connector->dev);
 
@@ -115,8 +103,8 @@ static void imx_pd_encoder_dpms(struct drm_encoder *encoder, int mode)
 static void imx_pd_encoder_prepare(struct drm_encoder *encoder)
 {
        struct imx_parallel_display *imxpd = enc_to_imxpd(encoder);
-
-       imx_drm_set_bus_format(encoder, imxpd->bus_format);
+       imx_drm_set_bus_config(encoder, imxpd->bus_format, 2, 3,
+                              imxpd->connector.display_info.bus_flags);
 }
 
 static void imx_pd_encoder_commit(struct drm_encoder *encoder)
@@ -203,7 +191,7 @@ static int imx_pd_bind(struct device *dev, struct device *master, void *data)
 {
        struct drm_device *drm = data;
        struct device_node *np = dev->of_node;
-       struct device_node *port;
+       struct device_node *ep;
        const u8 *edidp;
        struct imx_parallel_display *imxpd;
        int ret;
@@ -230,18 +218,18 @@ static int imx_pd_bind(struct device *dev, struct device *master, void *data)
        }
 
        /* port@1 is the output port */
-       port = of_graph_get_port_by_id(np, 1);
-       if (port) {
-               struct device_node *endpoint, *remote;
-
-               endpoint = of_get_child_by_name(port, "endpoint");
-               if (endpoint) {
-                       remote = of_graph_get_remote_port_parent(endpoint);
-                       if (remote)
-                               imxpd->panel = of_drm_find_panel(remote);
-                       if (!imxpd->panel)
-                               return -EPROBE_DEFER;
+       ep = of_graph_get_endpoint_by_regs(np, 1, -1);
+       if (ep) {
+               struct device_node *remote;
+
+               remote = of_graph_get_remote_port_parent(ep);
+               of_node_put(ep);
+               if (remote) {
+                       imxpd->panel = of_drm_find_panel(remote);
+                       of_node_put(remote);
                }
+               if (!imxpd->panel)
+                       return -EPROBE_DEFER;
        }
 
        imxpd->dev = dev;
index d05ca79..0186e50 100644 (file)
@@ -432,11 +432,6 @@ static int mtk_dpi_set_display_mode(struct mtk_dpi *dpi,
        unsigned long pll_rate;
        unsigned int factor;
 
-       if (!dpi) {
-               dev_err(dpi->dev, "invalid argument\n");
-               return -EINVAL;
-       }
-
        pix_rate = 1000UL * mode->clock;
        if (mode->clock <= 74000)
                factor = 8 * 3;
index 2d808e5..7695591 100644 (file)
@@ -695,10 +695,8 @@ static void mtk_dsi_destroy_conn_enc(struct mtk_dsi *dsi)
 {
        drm_encoder_cleanup(&dsi->encoder);
        /* Skip connector cleanup if creation was delegated to the bridge */
-       if (dsi->conn.dev) {
-               drm_connector_unregister(&dsi->conn);
+       if (dsi->conn.dev)
                drm_connector_cleanup(&dsi->conn);
-       }
 }
 
 static void mtk_dsi_ddp_start(struct mtk_ddp_comp *comp)
index 14e64e0..d347dca 100644 (file)
@@ -182,7 +182,7 @@ static int mga_g200se_set_plls(struct mga_device *mdev, long clock)
                        }
                }
 
-               fvv = pllreffreq * testn / testm;
+               fvv = pllreffreq * (n + 1) / (m + 1);
                fvv = (fvv - 800000) / 50000;
 
                if (fvv > 15)
@@ -202,6 +202,14 @@ static int mga_g200se_set_plls(struct mga_device *mdev, long clock)
        WREG_DAC(MGA1064_PIX_PLLC_M, m);
        WREG_DAC(MGA1064_PIX_PLLC_N, n);
        WREG_DAC(MGA1064_PIX_PLLC_P, p);
+
+       if (mdev->unique_rev_id >= 0x04) {
+               WREG_DAC(0x1a, 0x09);
+               msleep(20);
+               WREG_DAC(0x1a, 0x01);
+
+       }
+
        return 0;
 }
 
index fbe304e..2aec27d 100644 (file)
@@ -408,7 +408,7 @@ int adreno_gpu_init(struct drm_device *drm, struct platform_device *pdev,
        }
 
        adreno_gpu->memptrs = msm_gem_vaddr(adreno_gpu->memptrs_bo);
-       if (!adreno_gpu->memptrs) {
+       if (IS_ERR(adreno_gpu->memptrs)) {
                dev_err(drm->dev, "could not vmap memptrs\n");
                return -ENOMEM;
        }
index d9759bf..c6cf837 100644 (file)
@@ -159,6 +159,10 @@ static int msm_fbdev_create(struct drm_fb_helper *helper,
        dev->mode_config.fb_base = paddr;
 
        fbi->screen_base = msm_gem_vaddr_locked(fbdev->bo);
+       if (IS_ERR(fbi->screen_base)) {
+               ret = PTR_ERR(fbi->screen_base);
+               goto fail_unlock;
+       }
        fbi->screen_size = fbdev->bo->size;
        fbi->fix.smem_start = paddr;
        fbi->fix.smem_len = fbdev->bo->size;
index 7daf405..69836f5 100644 (file)
@@ -398,6 +398,8 @@ void *msm_gem_vaddr_locked(struct drm_gem_object *obj)
                        return ERR_CAST(pages);
                msm_obj->vaddr = vmap(pages, obj->size >> PAGE_SHIFT,
                                VM_MAP, pgprot_writecombine(PAGE_KERNEL));
+               if (msm_obj->vaddr == NULL)
+                       return ERR_PTR(-ENOMEM);
        }
        return msm_obj->vaddr;
 }
index b89ca51..eb4bb8b 100644 (file)
@@ -40,12 +40,14 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev,
 
        submit->dev = dev;
        submit->gpu = gpu;
+       submit->fence = NULL;
        submit->pid = get_pid(task_pid(current));
 
        /* initially, until copy_from_user() and bo lookup succeeds: */
        submit->nr_bos = 0;
        submit->nr_cmds = 0;
 
+       INIT_LIST_HEAD(&submit->node);
        INIT_LIST_HEAD(&submit->bo_list);
        ww_acquire_init(&submit->ticket, &reservation_ww_class);
 
@@ -75,6 +77,11 @@ static int submit_lookup_objects(struct msm_gem_submit *submit,
                void __user *userptr =
                        u64_to_user_ptr(args->bos + (i * sizeof(submit_bo)));
 
+               /* make sure we don't have garbage flags, in case we hit
+                * error path before flags is initialized:
+                */
+               submit->bos[i].flags = 0;
+
                ret = copy_from_user(&submit_bo, userptr, sizeof(submit_bo));
                if (ret) {
                        ret = -EFAULT;
index b48f73a..0857710 100644 (file)
@@ -312,6 +312,9 @@ void msm_rd_dump_submit(struct msm_gem_submit *submit)
                struct msm_gem_object *obj = submit->bos[idx].obj;
                const char *buf = msm_gem_vaddr_locked(&obj->base);
 
+               if (IS_ERR(buf))
+                       continue;
+
                buf += iova - submit->bos[idx].iova;
 
                rd_write_section(rd, RD_GPUADDR,
index 1f14b90..42f5359 100644 (file)
@@ -40,6 +40,10 @@ struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int size)
        }
 
        ring->start = msm_gem_vaddr_locked(ring->bo);
+       if (IS_ERR(ring->start)) {
+               ret = PTR_ERR(ring->start);
+               goto fail;
+       }
        ring->end   = ring->start + (size / 4);
        ring->cur   = ring->start;
 
index 73241c4..336ad4d 100644 (file)
@@ -2,6 +2,7 @@ config DRM_OMAP
        tristate "OMAP DRM"
        depends on DRM
        depends on ARCH_OMAP2PLUS || ARCH_MULTIPLATFORM
+       select OMAP2_DSS
        select DRM_KMS_HELPER
        select DRM_KMS_FB_HELPER
        select FB_SYS_FILLRECT
index 225fd8d..667ca4a 100644 (file)
@@ -9,6 +9,7 @@
  * the Free Software Foundation.
  */
 
+#include <linux/gpio/consumer.h>
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/platform_device.h>
index 8c246c2..9594ff7 100644 (file)
@@ -14,7 +14,7 @@
  * the Free Software Foundation.
  */
 
-#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
 #include <linux/module.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
index 2fd5602..671806c 100644 (file)
@@ -9,7 +9,7 @@
  * the Free Software Foundation.
  */
 
-#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
 #include <linux/module.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
index e780fd4..7c2331b 100644 (file)
@@ -9,7 +9,7 @@
  * the Free Software Foundation.
  */
 
-#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
 #include <linux/module.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
index 36485c2..2b11807 100644 (file)
@@ -14,7 +14,7 @@
 #include <linux/backlight.h>
 #include <linux/delay.h>
 #include <linux/fb.h>
-#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
 #include <linux/interrupt.h>
 #include <linux/jiffies.h>
 #include <linux/module.h>
index 458f77b..ac680e1 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/spi/spi.h>
 #include <linux/mutex.h>
 #include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
 
 #include <video/omapdss.h>
 #include <video/omap-panel-data.h>
index 780cb26..38d2920 100644 (file)
@@ -15,7 +15,7 @@
 #include <linux/delay.h>
 #include <linux/spi/spi.h>
 #include <linux/fb.h>
-#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
 #include <linux/of_gpio.h>
 
 #include <video/omapdss.h>
index 529a017..4363fff 100644 (file)
@@ -10,7 +10,7 @@
  */
 
 #include <linux/delay.h>
-#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
 #include <linux/module.h>
 #include <linux/of.h>
 #include <linux/of_gpio.h>
index 31efcca..deb4167 100644 (file)
@@ -29,7 +29,7 @@
 #include <linux/sched.h>
 #include <linux/backlight.h>
 #include <linux/fb.h>
-#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
 #include <linux/of.h>
 #include <linux/of_gpio.h>
 
index 03e2beb..d93175b 100644 (file)
@@ -14,7 +14,7 @@
 #include <linux/delay.h>
 #include <linux/spi/spi.h>
 #include <linux/regulator/consumer.h>
-#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/of_gpio.h>
index 8730646..56c43f3 100644 (file)
@@ -1167,7 +1167,6 @@ static int dsi_regulator_init(struct platform_device *dsidev)
 {
        struct dsi_data *dsi = dsi_get_dsidrv_data(dsidev);
        struct regulator *vdds_dsi;
-       int r;
 
        if (dsi->vdds_dsi_reg != NULL)
                return 0;
@@ -1180,15 +1179,6 @@ static int dsi_regulator_init(struct platform_device *dsidev)
                return PTR_ERR(vdds_dsi);
        }
 
-       if (regulator_can_change_voltage(vdds_dsi)) {
-               r = regulator_set_voltage(vdds_dsi, 1800000, 1800000);
-               if (r) {
-                       devm_regulator_put(vdds_dsi);
-                       DSSERR("can't set the DSI regulator voltage\n");
-                       return r;
-               }
-       }
-
        dsi->vdds_dsi_reg = vdds_dsi;
 
        return 0;
index f95ff31..3303cfa 100644 (file)
@@ -30,6 +30,7 @@
 #include <linux/delay.h>
 #include <linux/seq_file.h>
 #include <linux/clk.h>
+#include <linux/pinctrl/consumer.h>
 #include <linux/platform_device.h>
 #include <linux/pm_runtime.h>
 #include <linux/gfp.h>
index f892ae1..4d46cdf 100644 (file)
@@ -33,6 +33,7 @@
 #include <linux/gpio.h>
 #include <linux/regulator/consumer.h>
 #include <linux/component.h>
+#include <linux/of.h>
 #include <video/omapdss.h>
 #include <sound/omap-hdmi-audio.h>
 
@@ -100,7 +101,6 @@ static irqreturn_t hdmi_irq_handler(int irq, void *data)
 
 static int hdmi_init_regulator(void)
 {
-       int r;
        struct regulator *reg;
 
        if (hdmi.vdda_reg != NULL)
@@ -114,15 +114,6 @@ static int hdmi_init_regulator(void)
                return PTR_ERR(reg);
        }
 
-       if (regulator_can_change_voltage(reg)) {
-               r = regulator_set_voltage(reg, 1800000, 1800000);
-               if (r) {
-                       devm_regulator_put(reg);
-                       DSSWARN("can't set the regulator voltage\n");
-                       return r;
-               }
-       }
-
        hdmi.vdda_reg = reg;
 
        return 0;
index fa72e73..ef3afe9 100644 (file)
@@ -211,7 +211,7 @@ static void hdmi_core_init(struct hdmi_core_video_config *video_cfg)
 static void hdmi_core_powerdown_disable(struct hdmi_core_data *core)
 {
        DSSDBG("Enter hdmi_core_powerdown_disable\n");
-       REG_FLD_MOD(core->base, HDMI_CORE_SYS_SYS_CTRL1, 0x0, 0, 0);
+       REG_FLD_MOD(core->base, HDMI_CORE_SYS_SYS_CTRL1, 0x1, 0, 0);
 }
 
 static void hdmi_core_swreset_release(struct hdmi_core_data *core)
index a43f7b1..9255c0e 100644 (file)
@@ -38,6 +38,7 @@
 #include <linux/gpio.h>
 #include <linux/regulator/consumer.h>
 #include <linux/component.h>
+#include <linux/of.h>
 #include <video/omapdss.h>
 #include <sound/omap-hdmi-audio.h>
 
@@ -119,7 +120,6 @@ static irqreturn_t hdmi_irq_handler(int irq, void *data)
 
 static int hdmi_init_regulator(void)
 {
-       int r;
        struct regulator *reg;
 
        if (hdmi.vdda_reg != NULL)
@@ -131,15 +131,6 @@ static int hdmi_init_regulator(void)
                return PTR_ERR(reg);
        }
 
-       if (regulator_can_change_voltage(reg)) {
-               r = regulator_set_voltage(reg, 1800000, 1800000);
-               if (r) {
-                       devm_regulator_put(reg);
-                       DSSWARN("can't set the regulator voltage\n");
-                       return r;
-               }
-       }
-
        hdmi.vdda_reg = reg;
 
        return 0;
index 6a39752..8ab2093 100644 (file)
@@ -51,8 +51,8 @@ static void hdmi_core_ddc_init(struct hdmi_core_data *core)
 {
        void __iomem *base = core->base;
        const unsigned long long iclk = 266000000;      /* DSS L3 ICLK */
-       const unsigned ss_scl_high = 4000;              /* ns */
-       const unsigned ss_scl_low = 4700;               /* ns */
+       const unsigned ss_scl_high = 4600;              /* ns */
+       const unsigned ss_scl_low = 5400;               /* ns */
        const unsigned fs_scl_high = 600;               /* ns */
        const unsigned fs_scl_low = 1300;               /* ns */
        const unsigned sda_hold = 1000;                 /* ns */
@@ -458,7 +458,7 @@ static void hdmi_core_write_avi_infoframe(struct hdmi_core_data *core,
 
        c = (ptr[1] >> 6) & 0x3;
        m = (ptr[1] >> 4) & 0x3;
-       r = (ptr[1] >> 0) & 0x3;
+       r = (ptr[1] >> 0) & 0xf;
 
        itc = (ptr[2] >> 7) & 0x1;
        ec = (ptr[2] >> 4) & 0x7;
index 1f5d19c..f98b750 100644 (file)
@@ -13,6 +13,7 @@
 #include <linux/io.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
+#include <linux/seq_file.h>
 #include <video/omapdss.h>
 
 #include "dss.h"
index 06e23a7..f1015e8 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/io.h>
 #include <linux/platform_device.h>
 #include <linux/clk.h>
+#include <linux/seq_file.h>
 
 #include <video/omapdss.h>
 
index 13442b9..055f62f 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/err.h>
 #include <linux/io.h>
 #include <linux/platform_device.h>
+#include <linux/seq_file.h>
 #include <video/omapdss.h>
 
 #include "dss.h"
index 6f5fc14..479bf24 100644 (file)
@@ -17,6 +17,8 @@
  * this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <linux/seq_file.h>
+
 #include <drm/drm_crtc.h>
 #include <drm/drm_fb_helper.h>
 
index de275a5..4ceed7a 100644 (file)
@@ -27,6 +27,7 @@
 #include <linux/module.h>
 #include <linux/platform_device.h> /* platform_device() */
 #include <linux/sched.h>
+#include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/time.h>
 #include <linux/vmalloc.h>
index 94ec06d..f84570d 100644 (file)
@@ -17,6 +17,8 @@
  * this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <linux/seq_file.h>
+
 #include <drm/drm_crtc.h>
 #include <drm/drm_crtc_helper.h>
 
index b97afc2..03698b6 100644 (file)
@@ -17,6 +17,7 @@
  * this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <linux/seq_file.h>
 #include <linux/shmem_fs.h>
 #include <linux/spinlock.h>
 #include <linux/pfn_t.h>
index d024074..a7e9786 100644 (file)
@@ -2164,7 +2164,7 @@ static void kv_apply_state_adjust_rules(struct radeon_device *rdev,
        if (pi->caps_stable_p_state) {
                stable_p_state_sclk = (max_limits->sclk * 75) / 100;
 
-               for (i = table->count - 1; i >= 0; i++) {
+               for (i = table->count - 1; i >= 0; i--) {
                        if (stable_p_state_sclk >= table->entries[i].clk) {
                                stable_p_state_sclk = table->entries[i].clk;
                                break;
index 505620c..e04deed 100644 (file)
@@ -51,15 +51,6 @@ static void sti_crtc_disabling(struct drm_crtc *crtc)
        mixer->status = STI_MIXER_DISABLING;
 }
 
-static bool sti_crtc_mode_fixup(struct drm_crtc *crtc,
-                               const struct drm_display_mode *mode,
-                               struct drm_display_mode *adjusted_mode)
-{
-       /* accept the provided drm_display_mode, do not fix it up */
-       drm_mode_set_crtcinfo(adjusted_mode, CRTC_INTERLACE_HALVE_V);
-       return true;
-}
-
 static int
 sti_crtc_mode_set(struct drm_crtc *crtc, struct drm_display_mode *mode)
 {
@@ -230,7 +221,6 @@ static void sti_crtc_atomic_flush(struct drm_crtc *crtc,
 static const struct drm_crtc_helper_funcs sti_crtc_helper_funcs = {
        .enable = sti_crtc_enable,
        .disable = sti_crtc_disabling,
-       .mode_fixup = sti_crtc_mode_fixup,
        .mode_set = drm_helper_crtc_mode_set,
        .mode_set_nofb = sti_crtc_mode_set_nofb,
        .mode_set_base = drm_helper_crtc_mode_set_base,
index 32c7986..6bf4ce4 100644 (file)
@@ -437,7 +437,7 @@ static int vtg_probe(struct platform_device *pdev)
                        return -EPROBE_DEFER;
        } else {
                vtg->irq = platform_get_irq(pdev, 0);
-               if (IS_ERR_VALUE(vtg->irq)) {
+               if (vtg->irq < 0) {
                        DRM_ERROR("Failed to get VTG interrupt\n");
                        return vtg->irq;
                }
@@ -447,7 +447,7 @@ static int vtg_probe(struct platform_device *pdev)
                ret = devm_request_threaded_irq(dev, vtg->irq, vtg_irq,
                                vtg_irq_thread, IRQF_ONESHOT,
                                dev_name(dev), vtg);
-               if (IS_ERR_VALUE(ret)) {
+               if (ret < 0) {
                        DRM_ERROR("Failed to register VTG interrupt\n");
                        return ret;
                }
index 7716f42..6b8c5b3 100644 (file)
@@ -342,7 +342,7 @@ static int tfp410_probe(struct platform_device *pdev)
 
        tfp410_mod->gpio = of_get_named_gpio_flags(node, "powerdn-gpio",
                        0, NULL);
-       if (IS_ERR_VALUE(tfp410_mod->gpio)) {
+       if (tfp410_mod->gpio < 0) {
                dev_warn(&pdev->dev, "No power down GPIO\n");
        } else {
                ret = gpio_request(tfp410_mod->gpio, "DVI_PDn");
index 904d075..0f18b76 100644 (file)
@@ -456,14 +456,6 @@ static void vc4_crtc_atomic_flush(struct drm_crtc *crtc,
 
        WARN_ON_ONCE(dlist_next - dlist_start != vc4_state->mm.size);
 
-       HVS_WRITE(SCALER_DISPLISTX(vc4_crtc->channel),
-                 vc4_state->mm.start);
-
-       if (debug_dump_regs) {
-               DRM_INFO("CRTC %d HVS after:\n", drm_crtc_index(crtc));
-               vc4_hvs_dump_state(dev);
-       }
-
        if (crtc->state->event) {
                unsigned long flags;
 
@@ -473,8 +465,20 @@ static void vc4_crtc_atomic_flush(struct drm_crtc *crtc,
 
                spin_lock_irqsave(&dev->event_lock, flags);
                vc4_crtc->event = crtc->state->event;
-               spin_unlock_irqrestore(&dev->event_lock, flags);
                crtc->state->event = NULL;
+
+               HVS_WRITE(SCALER_DISPLISTX(vc4_crtc->channel),
+                         vc4_state->mm.start);
+
+               spin_unlock_irqrestore(&dev->event_lock, flags);
+       } else {
+               HVS_WRITE(SCALER_DISPLISTX(vc4_crtc->channel),
+                         vc4_state->mm.start);
+       }
+
+       if (debug_dump_regs) {
+               DRM_INFO("CRTC %d HVS after:\n", drm_crtc_index(crtc));
+               vc4_hvs_dump_state(dev);
        }
 }
 
@@ -500,12 +504,17 @@ static void vc4_crtc_handle_page_flip(struct vc4_crtc *vc4_crtc)
 {
        struct drm_crtc *crtc = &vc4_crtc->base;
        struct drm_device *dev = crtc->dev;
+       struct vc4_dev *vc4 = to_vc4_dev(dev);
+       struct vc4_crtc_state *vc4_state = to_vc4_crtc_state(crtc->state);
+       u32 chan = vc4_crtc->channel;
        unsigned long flags;
 
        spin_lock_irqsave(&dev->event_lock, flags);
-       if (vc4_crtc->event) {
+       if (vc4_crtc->event &&
+           (vc4_state->mm.start == HVS_READ(SCALER_DISPLACTX(chan)))) {
                drm_crtc_send_vblank_event(crtc, vc4_crtc->event);
                vc4_crtc->event = NULL;
+               drm_crtc_vblank_put(crtc);
        }
        spin_unlock_irqrestore(&dev->event_lock, flags);
 }
@@ -556,6 +565,7 @@ vc4_async_page_flip_complete(struct vc4_seqno_cb *cb)
                spin_unlock_irqrestore(&dev->event_lock, flags);
        }
 
+       drm_crtc_vblank_put(crtc);
        drm_framebuffer_unreference(flip_state->fb);
        kfree(flip_state);
 
@@ -598,6 +608,8 @@ static int vc4_async_page_flip(struct drm_crtc *crtc,
                return ret;
        }
 
+       WARN_ON(drm_crtc_vblank_get(crtc) != 0);
+
        /* Immediately update the plane's legacy fb pointer, so that later
         * modeset prep sees the state that will be present when the semaphore
         * is released.
index 3446ece..250ed7e 100644 (file)
@@ -66,12 +66,12 @@ static const struct file_operations vc4_drm_fops = {
 };
 
 static const struct drm_ioctl_desc vc4_drm_ioctls[] = {
-       DRM_IOCTL_DEF_DRV(VC4_SUBMIT_CL, vc4_submit_cl_ioctl, 0),
-       DRM_IOCTL_DEF_DRV(VC4_WAIT_SEQNO, vc4_wait_seqno_ioctl, 0),
-       DRM_IOCTL_DEF_DRV(VC4_WAIT_BO, vc4_wait_bo_ioctl, 0),
-       DRM_IOCTL_DEF_DRV(VC4_CREATE_BO, vc4_create_bo_ioctl, 0),
-       DRM_IOCTL_DEF_DRV(VC4_MMAP_BO, vc4_mmap_bo_ioctl, 0),
-       DRM_IOCTL_DEF_DRV(VC4_CREATE_SHADER_BO, vc4_create_shader_bo_ioctl, 0),
+       DRM_IOCTL_DEF_DRV(VC4_SUBMIT_CL, vc4_submit_cl_ioctl, DRM_RENDER_ALLOW),
+       DRM_IOCTL_DEF_DRV(VC4_WAIT_SEQNO, vc4_wait_seqno_ioctl, DRM_RENDER_ALLOW),
+       DRM_IOCTL_DEF_DRV(VC4_WAIT_BO, vc4_wait_bo_ioctl, DRM_RENDER_ALLOW),
+       DRM_IOCTL_DEF_DRV(VC4_CREATE_BO, vc4_create_bo_ioctl, DRM_RENDER_ALLOW),
+       DRM_IOCTL_DEF_DRV(VC4_MMAP_BO, vc4_mmap_bo_ioctl, DRM_RENDER_ALLOW),
+       DRM_IOCTL_DEF_DRV(VC4_CREATE_SHADER_BO, vc4_create_shader_bo_ioctl, DRM_RENDER_ALLOW),
        DRM_IOCTL_DEF_DRV(VC4_GET_HANG_STATE, vc4_get_hang_state_ioctl,
                          DRM_ROOT_ONLY),
 };
@@ -91,7 +91,7 @@ static struct drm_driver vc4_drm_driver = {
 
        .enable_vblank = vc4_enable_vblank,
        .disable_vblank = vc4_disable_vblank,
-       .get_vblank_counter = drm_vblank_count,
+       .get_vblank_counter = drm_vblank_no_hw_counter,
 
 #if defined(CONFIG_DEBUG_FS)
        .debugfs_init = vc4_debugfs_init,
index cb37751..861a623 100644 (file)
@@ -117,10 +117,18 @@ static int vc4_atomic_commit(struct drm_device *dev,
                return -ENOMEM;
 
        /* Make sure that any outstanding modesets have finished. */
-       ret = down_interruptible(&vc4->async_modeset);
-       if (ret) {
-               kfree(c);
-               return ret;
+       if (nonblock) {
+               ret = down_trylock(&vc4->async_modeset);
+               if (ret) {
+                       kfree(c);
+                       return -EBUSY;
+               }
+       } else {
+               ret = down_interruptible(&vc4->async_modeset);
+               if (ret) {
+                       kfree(c);
+                       return ret;
+               }
        }
 
        ret = drm_atomic_helper_prepare_planes(dev, state);
index 6163b95..f99eece 100644 (file)
 #define SCALER_DISPLACT0                        0x00000030
 #define SCALER_DISPLACT1                        0x00000034
 #define SCALER_DISPLACT2                        0x00000038
+#define SCALER_DISPLACTX(x)                    (SCALER_DISPLACT0 +     \
+                                                (x) * (SCALER_DISPLACT1 - \
+                                                       SCALER_DISPLACT0))
+
 #define SCALER_DISPCTRL0                        0x00000040
 # define SCALER_DISPCTRLX_ENABLE               BIT(31)
 # define SCALER_DISPCTRLX_RESET                        BIT(30)
index 498b37e..e1e31e9 100644 (file)
@@ -85,7 +85,7 @@ static int _host1x_intr_init_host_sync(struct host1x *host, u32 cpm,
        err = devm_request_irq(host->dev, host->intr_syncpt_irq,
                               syncpt_thresh_isr, IRQF_SHARED,
                               "host1x_syncpt", host);
-       if (IS_ERR_VALUE(err)) {
+       if (err < 0) {
                WARN_ON(1);
                return err;
        }
index abb98c7..99dcacf 100644 (file)
@@ -997,7 +997,7 @@ struct ipu_platform_reg {
 };
 
 /* These must be in the order of the corresponding device tree port nodes */
-static const struct ipu_platform_reg client_reg[] = {
+static struct ipu_platform_reg client_reg[] = {
        {
                .pdata = {
                        .csi = 0,
@@ -1048,7 +1048,7 @@ static int ipu_add_client_devices(struct ipu_soc *ipu, unsigned long ipu_base)
        mutex_unlock(&ipu_client_id_mutex);
 
        for (i = 0; i < ARRAY_SIZE(client_reg); i++) {
-               const struct ipu_platform_reg *reg = &client_reg[i];
+               struct ipu_platform_reg *reg = &client_reg[i];
                struct platform_device *pdev;
                struct device_node *of_node;
 
@@ -1070,6 +1070,7 @@ static int ipu_add_client_devices(struct ipu_soc *ipu, unsigned long ipu_base)
 
                pdev->dev.parent = dev;
 
+               reg->pdata.of_node = of_node;
                ret = platform_device_add_data(pdev, &reg->pdata,
                                               sizeof(reg->pdata));
                if (!ret)
index 952fe69..24e395c 100644 (file)
@@ -58,7 +58,7 @@ static const u8 REG_TEMP_MAX[4] = { 0x34, 0x30, 0x31, 0x32 };
  */
 static int apd = -1;
 module_param(apd, bint, 0);
-MODULE_PARM_DESC(init, "Set to zero to disable anti-parallel diode mode");
+MODULE_PARM_DESC(apd, "Set to zero to disable anti-parallel diode mode");
 
 struct temperature {
        s8      degrees;
index 0addc84..69166ab 100644 (file)
@@ -77,7 +77,6 @@ static const u8 LM75_REG_TEMP[3] = {
 struct lm75_data {
        struct i2c_client       *client;
        struct device           *hwmon_dev;
-       struct thermal_zone_device      *tz;
        struct mutex            update_lock;
        u8                      orig_conf;
        u8                      resolution;     /* In bits, between 9 and 12 */
@@ -306,11 +305,9 @@ lm75_probe(struct i2c_client *client, const struct i2c_device_id *id)
        if (IS_ERR(data->hwmon_dev))
                return PTR_ERR(data->hwmon_dev);
 
-       data->tz = thermal_zone_of_sensor_register(data->hwmon_dev, 0,
-                                                  data->hwmon_dev,
-                                                  &lm75_of_thermal_ops);
-       if (IS_ERR(data->tz))
-               data->tz = NULL;
+       devm_thermal_zone_of_sensor_register(data->hwmon_dev, 0,
+                                            data->hwmon_dev,
+                                            &lm75_of_thermal_ops);
 
        dev_info(dev, "%s: sensor '%s'\n",
                 dev_name(data->hwmon_dev), client->name);
@@ -322,7 +319,6 @@ static int lm75_remove(struct i2c_client *client)
 {
        struct lm75_data *data = i2c_get_clientdata(client);
 
-       thermal_zone_of_sensor_unregister(data->hwmon_dev, data->tz);
        hwmon_device_unregister(data->hwmon_dev);
        lm75_write_value(client, LM75_REG_CONF, data->orig_conf);
        return 0;
index faa6e8d..8ef7b71 100644 (file)
@@ -259,7 +259,6 @@ struct ntc_data {
        struct device *dev;
        int n_comp;
        char name[PLATFORM_NAME_SIZE];
-       struct thermal_zone_device *tz;
 };
 
 #if defined(CONFIG_OF) && IS_ENABLED(CONFIG_IIO)
@@ -579,6 +578,7 @@ static const struct thermal_zone_of_device_ops ntc_of_thermal_ops = {
 
 static int ntc_thermistor_probe(struct platform_device *pdev)
 {
+       struct thermal_zone_device *tz;
        const struct of_device_id *of_id =
                        of_match_device(of_match_ptr(ntc_match), &pdev->dev);
        const struct platform_device_id *pdev_id;
@@ -677,12 +677,10 @@ static int ntc_thermistor_probe(struct platform_device *pdev)
        dev_info(&pdev->dev, "Thermistor type: %s successfully probed.\n",
                                                                pdev_id->name);
 
-       data->tz = thermal_zone_of_sensor_register(data->dev, 0, data->dev,
-                                                  &ntc_of_thermal_ops);
-       if (IS_ERR(data->tz)) {
+       tz = devm_thermal_zone_of_sensor_register(data->dev, 0, data->dev,
+                                                 &ntc_of_thermal_ops);
+       if (IS_ERR(tz))
                dev_dbg(&pdev->dev, "Failed to register to thermal fw.\n");
-               data->tz = NULL;
-       }
 
        return 0;
 err_after_sysfs:
@@ -700,8 +698,6 @@ static int ntc_thermistor_remove(struct platform_device *pdev)
        sysfs_remove_group(&data->dev->kobj, &ntc_attr_group);
        ntc_iio_channel_release(pdata);
 
-       thermal_zone_of_sensor_unregister(data->dev, data->tz);
-
        return 0;
 }
 
index 3e23003..f9af393 100644 (file)
@@ -40,15 +40,18 @@ struct pwm_fan_ctx {
 
 static int  __set_pwm(struct pwm_fan_ctx *ctx, unsigned long pwm)
 {
+       struct pwm_args pargs;
        unsigned long duty;
        int ret = 0;
 
+       pwm_get_args(ctx->pwm, &pargs);
+
        mutex_lock(&ctx->lock);
        if (ctx->pwm_value == pwm)
                goto exit_set_pwm_err;
 
-       duty = DIV_ROUND_UP(pwm * (ctx->pwm->period - 1), MAX_PWM);
-       ret = pwm_config(ctx->pwm, duty, ctx->pwm->period);
+       duty = DIV_ROUND_UP(pwm * (pargs.period - 1), MAX_PWM);
+       ret = pwm_config(ctx->pwm, duty, pargs.period);
        if (ret)
                goto exit_set_pwm_err;
 
@@ -215,6 +218,7 @@ static int pwm_fan_probe(struct platform_device *pdev)
 {
        struct thermal_cooling_device *cdev;
        struct pwm_fan_ctx *ctx;
+       struct pwm_args pargs;
        struct device *hwmon;
        int duty_cycle;
        int ret;
@@ -233,11 +237,19 @@ static int pwm_fan_probe(struct platform_device *pdev)
 
        platform_set_drvdata(pdev, ctx);
 
+       /*
+        * FIXME: pwm_apply_args() should be removed when switching to the
+        * atomic PWM API.
+        */
+       pwm_apply_args(ctx->pwm);
+
        /* Set duty cycle to maximum allowed */
-       duty_cycle = ctx->pwm->period - 1;
+       pwm_get_args(ctx->pwm, &pargs);
+
+       duty_cycle = pargs.period - 1;
        ctx->pwm_value = MAX_PWM;
 
-       ret = pwm_config(ctx->pwm, duty_cycle, ctx->pwm->period);
+       ret = pwm_config(ctx->pwm, duty_cycle, pargs.period);
        if (ret) {
                dev_err(&pdev->dev, "Failed to configure PWM\n");
                return ret;
@@ -303,14 +315,16 @@ static int pwm_fan_suspend(struct device *dev)
 static int pwm_fan_resume(struct device *dev)
 {
        struct pwm_fan_ctx *ctx = dev_get_drvdata(dev);
+       struct pwm_args pargs;
        unsigned long duty;
        int ret;
 
        if (ctx->pwm_value == 0)
                return 0;
 
-       duty = DIV_ROUND_UP(ctx->pwm_value * (ctx->pwm->period - 1), MAX_PWM);
-       ret = pwm_config(ctx->pwm, duty, ctx->pwm->period);
+       pwm_get_args(ctx->pwm, &pargs);
+       duty = DIV_ROUND_UP(ctx->pwm_value * (pargs.period - 1), MAX_PWM);
+       ret = pwm_config(ctx->pwm, duty, pargs.period);
        if (ret)
                return ret;
        return pwm_enable(ctx->pwm);
index 912b449..25b44e6 100644 (file)
@@ -31,10 +31,8 @@ struct sensor_data {
 };
 
 struct scpi_thermal_zone {
-       struct list_head list;
        int sensor_id;
        struct scpi_sensors *scpi_sensors;
-       struct thermal_zone_device *tzd;
 };
 
 struct scpi_sensors {
@@ -92,20 +90,6 @@ scpi_show_label(struct device *dev, struct device_attribute *attr, char *buf)
        return sprintf(buf, "%s\n", sensor->info.name);
 }
 
-static void
-unregister_thermal_zones(struct platform_device *pdev,
-                        struct scpi_sensors *scpi_sensors)
-{
-       struct list_head *pos;
-
-       list_for_each(pos, &scpi_sensors->thermal_zones) {
-               struct scpi_thermal_zone *zone;
-
-               zone = list_entry(pos, struct scpi_thermal_zone, list);
-               thermal_zone_of_sensor_unregister(&pdev->dev, zone->tzd);
-       }
-}
-
 static struct thermal_zone_of_device_ops scpi_sensor_ops = {
        .get_temp = scpi_read_temp,
 };
@@ -118,7 +102,7 @@ static int scpi_hwmon_probe(struct platform_device *pdev)
        struct scpi_ops *scpi_ops;
        struct device *hwdev, *dev = &pdev->dev;
        struct scpi_sensors *scpi_sensors;
-       int ret, idx;
+       int idx, ret;
 
        scpi_ops = get_scpi_ops();
        if (!scpi_ops)
@@ -232,47 +216,34 @@ static int scpi_hwmon_probe(struct platform_device *pdev)
        INIT_LIST_HEAD(&scpi_sensors->thermal_zones);
        for (i = 0; i < nr_sensors; i++) {
                struct sensor_data *sensor = &scpi_sensors->data[i];
+               struct thermal_zone_device *z;
                struct scpi_thermal_zone *zone;
 
                if (sensor->info.class != TEMPERATURE)
                        continue;
 
                zone = devm_kzalloc(dev, sizeof(*zone), GFP_KERNEL);
-               if (!zone) {
-                       ret = -ENOMEM;
-                       goto unregister_tzd;
-               }
+               if (!zone)
+                       return -ENOMEM;
 
                zone->sensor_id = i;
                zone->scpi_sensors = scpi_sensors;
-               zone->tzd = thermal_zone_of_sensor_register(dev,
-                               sensor->info.sensor_id, zone, &scpi_sensor_ops);
+               z = devm_thermal_zone_of_sensor_register(dev,
+                                                        sensor->info.sensor_id,
+                                                        zone,
+                                                        &scpi_sensor_ops);
                /*
                 * The call to thermal_zone_of_sensor_register returns
                 * an error for sensors that are not associated with
                 * any thermal zones or if the thermal subsystem is
                 * not configured.
                 */
-               if (IS_ERR(zone->tzd)) {
+               if (IS_ERR(z)) {
                        devm_kfree(dev, zone);
                        continue;
                }
-               list_add(&zone->list, &scpi_sensors->thermal_zones);
        }
 
-       return 0;
-
-unregister_tzd:
-       unregister_thermal_zones(pdev, scpi_sensors);
-       return ret;
-}
-
-static int scpi_hwmon_remove(struct platform_device *pdev)
-{
-       struct scpi_sensors *scpi_sensors = platform_get_drvdata(pdev);
-
-       unregister_thermal_zones(pdev, scpi_sensors);
-
        return 0;
 }
 
@@ -288,7 +259,6 @@ static struct platform_driver scpi_hwmon_platdrv = {
                .of_match_table = scpi_of_match,
        },
        .probe          = scpi_hwmon_probe,
-       .remove         = scpi_hwmon_remove,
 };
 module_platform_driver(scpi_hwmon_platdrv);
 
index 5289aa0..f1e96fd 100644 (file)
@@ -53,7 +53,6 @@
 struct tmp102 {
        struct i2c_client *client;
        struct device *hwmon_dev;
-       struct thermal_zone_device *tz;
        struct mutex lock;
        u16 config_orig;
        unsigned long last_update;
@@ -232,10 +231,8 @@ static int tmp102_probe(struct i2c_client *client,
                goto fail_restore_config;
        }
        tmp102->hwmon_dev = hwmon_dev;
-       tmp102->tz = thermal_zone_of_sensor_register(hwmon_dev, 0, hwmon_dev,
-                                                    &tmp102_of_thermal_ops);
-       if (IS_ERR(tmp102->tz))
-               tmp102->tz = NULL;
+       devm_thermal_zone_of_sensor_register(hwmon_dev, 0, hwmon_dev,
+                                            &tmp102_of_thermal_ops);
 
        dev_info(dev, "initialized\n");
 
@@ -251,7 +248,6 @@ static int tmp102_remove(struct i2c_client *client)
 {
        struct tmp102 *tmp102 = i2c_get_clientdata(client);
 
-       thermal_zone_of_sensor_unregister(tmp102->hwmon_dev, tmp102->tz);
        hwmon_device_unregister(tmp102->hwmon_dev);
 
        /* Stop monitoring if device was stopped originally */
index 2dd40dd..f167021 100644 (file)
@@ -965,7 +965,7 @@ config I2C_XILINX
 
 config I2C_XLR
        tristate "Netlogic XLR and Sigma Designs I2C support"
-       depends on CPU_XLR || ARCH_TANGOX
+       depends on CPU_XLR || ARCH_TANGO
        help
          This driver enables support for the on-chip I2C interface of
          the Netlogic XLR/XLS MIPS processors and Sigma Designs SOCs.
@@ -985,6 +985,7 @@ config I2C_XLP9XX
 
 config I2C_RCAR
        tristate "Renesas R-Car I2C Controller"
+       depends on HAS_DMA
        depends on ARCH_RENESAS || COMPILE_TEST
        select I2C_SLAVE
        help
index 921d32b..f233726 100644 (file)
@@ -1013,7 +1013,7 @@ static int at91_twi_configure_dma(struct at91_twi_dev *dev, u32 phy_addr)
 
 error:
        if (ret != -EPROBE_DEFER)
-               dev_info(dev->dev, "can't use DMA, error %d\n", ret);
+               dev_info(dev->dev, "can't get DMA channel, continue without DMA support\n");
        if (dma->chan_rx)
                dma_release_channel(dma->chan_rx);
        if (dma->chan_tx)
index 9aca1b4..52407f3 100644 (file)
@@ -623,7 +623,7 @@ static struct dma_chan *rcar_i2c_request_dma_chan(struct device *dev,
        char *chan_name = dir == DMA_MEM_TO_DEV ? "tx" : "rx";
        int ret;
 
-       chan = dma_request_slave_channel_reason(dev, chan_name);
+       chan = dma_request_chan(dev, chan_name);
        if (IS_ERR(chan)) {
                ret = PTR_ERR(chan);
                dev_dbg(dev, "request_channel failed for %s (%d)\n",
index 0b1108d..6ecfd76 100644 (file)
@@ -22,6 +22,7 @@
 
 /* The I2C_RDWR ioctl code is written by Kolja Waschk <waschk@telos.de> */
 
+#include <linux/cdev.h>
 #include <linux/device.h>
 #include <linux/fs.h>
 #include <linux/i2c-dev.h>
@@ -47,9 +48,10 @@ struct i2c_dev {
        struct list_head list;
        struct i2c_adapter *adap;
        struct device *dev;
+       struct cdev cdev;
 };
 
-#define I2C_MINORS     256
+#define I2C_MINORS     MINORMASK
 static LIST_HEAD(i2c_dev_list);
 static DEFINE_SPINLOCK(i2c_dev_list_lock);
 
@@ -89,7 +91,7 @@ static struct i2c_dev *get_free_i2c_dev(struct i2c_adapter *adap)
        return i2c_dev;
 }
 
-static void return_i2c_dev(struct i2c_dev *i2c_dev)
+static void put_i2c_dev(struct i2c_dev *i2c_dev)
 {
        spin_lock(&i2c_dev_list_lock);
        list_del(&i2c_dev->list);
@@ -552,6 +554,12 @@ static int i2cdev_attach_adapter(struct device *dev, void *dummy)
        if (IS_ERR(i2c_dev))
                return PTR_ERR(i2c_dev);
 
+       cdev_init(&i2c_dev->cdev, &i2cdev_fops);
+       i2c_dev->cdev.owner = THIS_MODULE;
+       res = cdev_add(&i2c_dev->cdev, MKDEV(I2C_MAJOR, adap->nr), 1);
+       if (res)
+               goto error_cdev;
+
        /* register this i2c device with the driver core */
        i2c_dev->dev = device_create(i2c_dev_class, &adap->dev,
                                     MKDEV(I2C_MAJOR, adap->nr), NULL,
@@ -565,7 +573,9 @@ static int i2cdev_attach_adapter(struct device *dev, void *dummy)
                 adap->name, adap->nr);
        return 0;
 error:
-       return_i2c_dev(i2c_dev);
+       cdev_del(&i2c_dev->cdev);
+error_cdev:
+       put_i2c_dev(i2c_dev);
        return res;
 }
 
@@ -582,7 +592,8 @@ static int i2cdev_detach_adapter(struct device *dev, void *dummy)
        if (!i2c_dev) /* attach_adapter must have failed */
                return 0;
 
-       return_i2c_dev(i2c_dev);
+       cdev_del(&i2c_dev->cdev);
+       put_i2c_dev(i2c_dev);
        device_destroy(i2c_dev_class, MKDEV(I2C_MAJOR, adap->nr));
 
        pr_debug("i2c-dev: adapter [%s] unregistered\n", adap->name);
@@ -620,7 +631,7 @@ static int __init i2c_dev_init(void)
 
        printk(KERN_INFO "i2c /dev entries driver\n");
 
-       res = register_chrdev(I2C_MAJOR, "i2c", &i2cdev_fops);
+       res = register_chrdev_region(MKDEV(I2C_MAJOR, 0), I2C_MINORS, "i2c");
        if (res)
                goto out;
 
@@ -644,7 +655,7 @@ static int __init i2c_dev_init(void)
 out_unreg_class:
        class_destroy(i2c_dev_class);
 out_unreg_chrdev:
-       unregister_chrdev(I2C_MAJOR, "i2c");
+       unregister_chrdev_region(MKDEV(I2C_MAJOR, 0), I2C_MINORS);
 out:
        printk(KERN_ERR "%s: Driver Initialisation failed\n", __FILE__);
        return res;
@@ -655,7 +666,7 @@ static void __exit i2c_dev_exit(void)
        bus_unregister_notifier(&i2c_bus_type, &i2cdev_notifier);
        i2c_for_each_dev(NULL, i2cdev_detach_adapter);
        class_destroy(i2c_dev_class);
-       unregister_chrdev(I2C_MAJOR, "i2c");
+       unregister_chrdev_region(MKDEV(I2C_MAJOR, 0), I2C_MINORS);
 }
 
 MODULE_AUTHOR("Frodo Looijaard <frodol@dds.nl> and "
index 6425c0e..2137adf 100644 (file)
@@ -85,4 +85,6 @@ source "drivers/infiniband/ulp/isert/Kconfig"
 
 source "drivers/infiniband/sw/rdmavt/Kconfig"
 
+source "drivers/infiniband/hw/hfi1/Kconfig"
+
 endif # INFINIBAND
index 26987d9..edaae9f 100644 (file)
@@ -1,8 +1,7 @@
 infiniband-$(CONFIG_INFINIBAND_ADDR_TRANS)     := rdma_cm.o
 user_access-$(CONFIG_INFINIBAND_ADDR_TRANS)    := rdma_ucm.o
 
-obj-$(CONFIG_INFINIBAND) +=            ib_core.o ib_mad.o ib_sa.o \
-                                       ib_cm.o iw_cm.o ib_addr.o \
+obj-$(CONFIG_INFINIBAND) +=            ib_core.o ib_cm.o iw_cm.o \
                                        $(infiniband-y)
 obj-$(CONFIG_INFINIBAND_USER_MAD) +=   ib_umad.o
 obj-$(CONFIG_INFINIBAND_USER_ACCESS) +=        ib_uverbs.o ib_ucm.o \
@@ -10,14 +9,11 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) +=      ib_uverbs.o ib_ucm.o \
 
 ib_core-y :=                   packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
                                device.o fmr_pool.o cache.o netlink.o \
-                               roce_gid_mgmt.o mr_pool.o
+                               roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \
+                               multicast.o mad.o smi.o agent.o mad_rmpp.o
 ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
 ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
 
-ib_mad-y :=                    mad.o smi.o agent.o mad_rmpp.o
-
-ib_sa-y :=                     sa_query.o multicast.o
-
 ib_cm-y :=                     cm.o
 
 iw_cm-y :=                     iwcm.o iwpm_util.o iwpm_msg.o
@@ -28,8 +24,6 @@ rdma_cm-$(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS) += cma_configfs.o
 
 rdma_ucm-y :=                  ucma.o
 
-ib_addr-y :=                   addr.o
-
 ib_umad-y :=                   user_mad.o
 
 ib_ucm-y :=                    ucm.o
index 337353d..1374541 100644 (file)
 #include <net/ip6_route.h>
 #include <rdma/ib_addr.h>
 #include <rdma/ib.h>
+#include <rdma/rdma_netlink.h>
+#include <net/netlink.h>
 
-MODULE_AUTHOR("Sean Hefty");
-MODULE_DESCRIPTION("IB Address Translation");
-MODULE_LICENSE("Dual BSD/GPL");
+#include "core_priv.h"
 
 struct addr_req {
        struct list_head list;
@@ -62,8 +62,11 @@ struct addr_req {
                         struct rdma_dev_addr *addr, void *context);
        unsigned long timeout;
        int status;
+       u32 seq;
 };
 
+static atomic_t ib_nl_addr_request_seq = ATOMIC_INIT(0);
+
 static void process_req(struct work_struct *work);
 
 static DEFINE_MUTEX(lock);
@@ -71,6 +74,126 @@ static LIST_HEAD(req_list);
 static DECLARE_DELAYED_WORK(work, process_req);
 static struct workqueue_struct *addr_wq;
 
+static const struct nla_policy ib_nl_addr_policy[LS_NLA_TYPE_MAX] = {
+       [LS_NLA_TYPE_DGID] = {.type = NLA_BINARY,
+               .len = sizeof(struct rdma_nla_ls_gid)},
+};
+
+static inline bool ib_nl_is_good_ip_resp(const struct nlmsghdr *nlh)
+{
+       struct nlattr *tb[LS_NLA_TYPE_MAX] = {};
+       int ret;
+
+       if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR)
+               return false;
+
+       ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
+                       nlmsg_len(nlh), ib_nl_addr_policy);
+       if (ret)
+               return false;
+
+       return true;
+}
+
+static void ib_nl_process_good_ip_rsep(const struct nlmsghdr *nlh)
+{
+       const struct nlattr *head, *curr;
+       union ib_gid gid;
+       struct addr_req *req;
+       int len, rem;
+       int found = 0;
+
+       head = (const struct nlattr *)nlmsg_data(nlh);
+       len = nlmsg_len(nlh);
+
+       nla_for_each_attr(curr, head, len, rem) {
+               if (curr->nla_type == LS_NLA_TYPE_DGID)
+                       memcpy(&gid, nla_data(curr), nla_len(curr));
+       }
+
+       mutex_lock(&lock);
+       list_for_each_entry(req, &req_list, list) {
+               if (nlh->nlmsg_seq != req->seq)
+                       continue;
+               /* We set the DGID part, the rest was set earlier */
+               rdma_addr_set_dgid(req->addr, &gid);
+               req->status = 0;
+               found = 1;
+               break;
+       }
+       mutex_unlock(&lock);
+
+       if (!found)
+               pr_info("Couldn't find request waiting for DGID: %pI6\n",
+                       &gid);
+}
+
+int ib_nl_handle_ip_res_resp(struct sk_buff *skb,
+                            struct netlink_callback *cb)
+{
+       const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh;
+
+       if ((nlh->nlmsg_flags & NLM_F_REQUEST) ||
+           !(NETLINK_CB(skb).sk) ||
+           !netlink_capable(skb, CAP_NET_ADMIN))
+               return -EPERM;
+
+       if (ib_nl_is_good_ip_resp(nlh))
+               ib_nl_process_good_ip_rsep(nlh);
+
+       return skb->len;
+}
+
+static int ib_nl_ip_send_msg(struct rdma_dev_addr *dev_addr,
+                            const void *daddr,
+                            u32 seq, u16 family)
+{
+       struct sk_buff *skb = NULL;
+       struct nlmsghdr *nlh;
+       struct rdma_ls_ip_resolve_header *header;
+       void *data;
+       size_t size;
+       int attrtype;
+       int len;
+
+       if (family == AF_INET) {
+               size = sizeof(struct in_addr);
+               attrtype = RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_IPV4;
+       } else {
+               size = sizeof(struct in6_addr);
+               attrtype = RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_IPV6;
+       }
+
+       len = nla_total_size(sizeof(size));
+       len += NLMSG_ALIGN(sizeof(*header));
+
+       skb = nlmsg_new(len, GFP_KERNEL);
+       if (!skb)
+               return -ENOMEM;
+
+       data = ibnl_put_msg(skb, &nlh, seq, 0, RDMA_NL_LS,
+                           RDMA_NL_LS_OP_IP_RESOLVE, NLM_F_REQUEST);
+       if (!data) {
+               nlmsg_free(skb);
+               return -ENODATA;
+       }
+
+       /* Construct the family header first */
+       header = (struct rdma_ls_ip_resolve_header *)
+               skb_put(skb, NLMSG_ALIGN(sizeof(*header)));
+       header->ifindex = dev_addr->bound_dev_if;
+       nla_put(skb, attrtype, size, daddr);
+
+       /* Repair the nlmsg header length */
+       nlmsg_end(skb, nlh);
+       ibnl_multicast(skb, nlh, RDMA_NL_GROUP_LS, GFP_KERNEL);
+
+       /* Make the request retry, so that when the response arrives from
+        * userspace there is still a pending request to complete.
+        */
+       return -ENODATA;
+}
+
 int rdma_addr_size(struct sockaddr *addr)
 {
        switch (addr->sa_family) {
@@ -199,6 +322,17 @@ static void queue_req(struct addr_req *req)
        mutex_unlock(&lock);
 }
 
+static int ib_nl_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
+                         const void *daddr, u32 seq, u16 family)
+{
+       if (ibnl_chk_listeners(RDMA_NL_GROUP_LS))
+               return -EADDRNOTAVAIL;
+
+       /* We fill in what we can, the response will fill the rest */
+       rdma_copy_addr(dev_addr, dst->dev, NULL);
+       return ib_nl_ip_send_msg(dev_addr, daddr, seq, family);
+}
+
 static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
                        const void *daddr)
 {
@@ -223,6 +357,39 @@ static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
        return ret;
 }
 
+static bool has_gateway(struct dst_entry *dst, sa_family_t family)
+{
+       struct rtable *rt;
+       struct rt6_info *rt6;
+
+       if (family == AF_INET) {
+               rt = container_of(dst, struct rtable, dst);
+               return rt->rt_uses_gateway;
+       }
+
+       rt6 = container_of(dst, struct rt6_info, dst);
+       return rt6->rt6i_flags & RTF_GATEWAY;
+}
+
+static int fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
+                   const struct sockaddr *dst_in, u32 seq)
+{
+       const struct sockaddr_in *dst_in4 =
+               (const struct sockaddr_in *)dst_in;
+       const struct sockaddr_in6 *dst_in6 =
+               (const struct sockaddr_in6 *)dst_in;
+       const void *daddr = (dst_in->sa_family == AF_INET) ?
+               (const void *)&dst_in4->sin_addr.s_addr :
+               (const void *)&dst_in6->sin6_addr;
+       sa_family_t family = dst_in->sa_family;
+
+       /* Gateway + ARPHRD_INFINIBAND -> IB router */
+       if (has_gateway(dst, family) && dst->dev->type == ARPHRD_INFINIBAND)
+               return ib_nl_fetch_ha(dst, dev_addr, daddr, seq, family);
+       else
+               return dst_fetch_ha(dst, dev_addr, daddr);
+}
+
 static int addr4_resolve(struct sockaddr_in *src_in,
                         const struct sockaddr_in *dst_in,
                         struct rdma_dev_addr *addr,
@@ -246,10 +413,11 @@ static int addr4_resolve(struct sockaddr_in *src_in,
        src_in->sin_family = AF_INET;
        src_in->sin_addr.s_addr = fl4.saddr;
 
-       /* If there's a gateway, we're definitely in RoCE v2 (as RoCE v1 isn't
-        * routable) and we could set the network type accordingly.
+       /* If there's a gateway and the device type is not ARPHRD_INFINIBAND,
+        * we're definitely in RoCE v2 (as RoCE v1 isn't routable), so set the
+        * network type accordingly.
         */
-       if (rt->rt_uses_gateway)
+       if (rt->rt_uses_gateway && rt->dst.dev->type != ARPHRD_INFINIBAND)
                addr->network = RDMA_NETWORK_IPV4;
 
        addr->hoplimit = ip4_dst_hoplimit(&rt->dst);
@@ -291,10 +459,12 @@ static int addr6_resolve(struct sockaddr_in6 *src_in,
                src_in->sin6_addr = fl6.saddr;
        }
 
-       /* If there's a gateway, we're definitely in RoCE v2 (as RoCE v1 isn't
-        * routable) and we could set the network type accordingly.
+       /* If there's a gateway and the device type is not ARPHRD_INFINIBAND,
+        * we're definitely in RoCE v2 (as RoCE v1 isn't routable), so set the
+        * network type accordingly.
         */
-       if (rt->rt6i_flags & RTF_GATEWAY)
+       if (rt->rt6i_flags & RTF_GATEWAY &&
+           ip6_dst_idev(dst)->dev->type != ARPHRD_INFINIBAND)
                addr->network = RDMA_NETWORK_IPV6;
 
        addr->hoplimit = ip6_dst_hoplimit(dst);
@@ -317,7 +487,8 @@ static int addr6_resolve(struct sockaddr_in6 *src_in,
 
 static int addr_resolve_neigh(struct dst_entry *dst,
                              const struct sockaddr *dst_in,
-                             struct rdma_dev_addr *addr)
+                             struct rdma_dev_addr *addr,
+                             u32 seq)
 {
        if (dst->dev->flags & IFF_LOOPBACK) {
                int ret;
@@ -331,17 +502,8 @@ static int addr_resolve_neigh(struct dst_entry *dst,
        }
 
        /* If the device doesn't do ARP internally */
-       if (!(dst->dev->flags & IFF_NOARP)) {
-               const struct sockaddr_in *dst_in4 =
-                       (const struct sockaddr_in *)dst_in;
-               const struct sockaddr_in6 *dst_in6 =
-                       (const struct sockaddr_in6 *)dst_in;
-
-               return dst_fetch_ha(dst, addr,
-                                   dst_in->sa_family == AF_INET ?
-                                   (const void *)&dst_in4->sin_addr.s_addr :
-                                   (const void *)&dst_in6->sin6_addr);
-       }
+       if (!(dst->dev->flags & IFF_NOARP))
+               return fetch_ha(dst, addr, dst_in, seq);
 
        return rdma_copy_addr(addr, dst->dev, NULL);
 }
@@ -349,7 +511,8 @@ static int addr_resolve_neigh(struct dst_entry *dst,
 static int addr_resolve(struct sockaddr *src_in,
                        const struct sockaddr *dst_in,
                        struct rdma_dev_addr *addr,
-                       bool resolve_neigh)
+                       bool resolve_neigh,
+                       u32 seq)
 {
        struct net_device *ndev;
        struct dst_entry *dst;
@@ -366,7 +529,7 @@ static int addr_resolve(struct sockaddr *src_in,
                        return ret;
 
                if (resolve_neigh)
-                       ret = addr_resolve_neigh(&rt->dst, dst_in, addr);
+                       ret = addr_resolve_neigh(&rt->dst, dst_in, addr, seq);
 
                ndev = rt->dst.dev;
                dev_hold(ndev);
@@ -383,7 +546,7 @@ static int addr_resolve(struct sockaddr *src_in,
                        return ret;
 
                if (resolve_neigh)
-                       ret = addr_resolve_neigh(dst, dst_in, addr);
+                       ret = addr_resolve_neigh(dst, dst_in, addr, seq);
 
                ndev = dst->dev;
                dev_hold(ndev);
@@ -412,7 +575,7 @@ static void process_req(struct work_struct *work)
                        src_in = (struct sockaddr *) &req->src_addr;
                        dst_in = (struct sockaddr *) &req->dst_addr;
                        req->status = addr_resolve(src_in, dst_in, req->addr,
-                                                  true);
+                                                  true, req->seq);
                        if (req->status && time_after_eq(jiffies, req->timeout))
                                req->status = -ETIMEDOUT;
                        else if (req->status == -ENODATA)
@@ -471,8 +634,9 @@ int rdma_resolve_ip(struct rdma_addr_client *client,
        req->context = context;
        req->client = client;
        atomic_inc(&client->refcount);
+       req->seq = (u32)atomic_inc_return(&ib_nl_addr_request_seq);
 
-       req->status = addr_resolve(src_in, dst_in, addr, true);
+       req->status = addr_resolve(src_in, dst_in, addr, true, req->seq);
        switch (req->status) {
        case 0:
                req->timeout = jiffies;
@@ -510,7 +674,7 @@ int rdma_resolve_ip_route(struct sockaddr *src_addr,
                src_in->sa_family = dst_addr->sa_family;
        }
 
-       return addr_resolve(src_in, dst_addr, addr, false);
+       return addr_resolve(src_in, dst_addr, addr, false, 0);
 }
 EXPORT_SYMBOL(rdma_resolve_ip_route);
 
@@ -634,7 +798,7 @@ static struct notifier_block nb = {
        .notifier_call = netevent_callback
 };
 
-static int __init addr_init(void)
+int addr_init(void)
 {
        addr_wq = create_singlethread_workqueue("ib_addr");
        if (!addr_wq)
@@ -642,15 +806,13 @@ static int __init addr_init(void)
 
        register_netevent_notifier(&nb);
        rdma_addr_register_client(&self);
+
        return 0;
 }
 
-static void __exit addr_cleanup(void)
+void addr_cleanup(void)
 {
        rdma_addr_unregister_client(&self);
        unregister_netevent_notifier(&nb);
        destroy_workqueue(addr_wq);
 }
-
-module_init(addr_init);
-module_exit(addr_cleanup);
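
The addr.c changes above route destinations that sit behind a gateway on an
ARPHRD_INFINIBAND device through a netlink round trip instead of neighbour
discovery: each request is tagged with a sequence number drawn from an atomic
counter, multicast to the RDMA_NL_GROUP_LS listeners, and left on the retry
list (the resolve path returns -ENODATA) until a response whose nlmsg_seq
matches fills in the DGID.  A rough userspace sketch of that seq-matched
completion pattern follows; struct pending_req, new_request() and
complete_response() are invented names, not kernel APIs.

#include <errno.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct pending_req {
        uint32_t seq;              /* tag that would go into nlmsg_seq */
        unsigned char dgid[16];    /* filled in by the matching response */
        int status;                /* -ENODATA until the response arrives */
        struct pending_req *next;
};

static _Atomic uint32_t seq_counter;
static struct pending_req *req_list;

static uint32_t new_request(struct pending_req *req)
{
        req->seq = atomic_fetch_add(&seq_counter, 1) + 1;
        req->status = -ENODATA;    /* keeps the request on the retry list */
        req->next = req_list;
        req_list = req;
        return req->seq;
}

static int complete_response(uint32_t seq, const unsigned char gid[16])
{
        struct pending_req *req;

        for (req = req_list; req; req = req->next) {
                if (req->seq != seq)
                        continue;
                memcpy(req->dgid, gid, sizeof(req->dgid));
                req->status = 0;   /* like rdma_addr_set_dgid() plus status = 0 */
                return 0;
        }
        return -1;                 /* no request waiting for this seq */
}

int main(void)
{
        struct pending_req r = { 0 };
        unsigned char gid[16] = { 0xfe, 0x80 };
        uint32_t seq = new_request(&r);

        complete_response(seq, gid);
        printf("req %u -> status %d\n", seq, r.status);
        return 0;
}

The same pairing is what lets ib_nl_process_good_ip_rsep() attach an
asynchronous userspace reply to the request that produced it.
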
index eab3221..19d499d 100644 (file)
@@ -137,4 +137,20 @@ static inline bool rdma_is_upper_dev_rcu(struct net_device *dev,
        return _upper == upper;
 }
 
+int addr_init(void);
+void addr_cleanup(void);
+
+int ib_mad_init(void);
+void ib_mad_cleanup(void);
+
+int ib_sa_init(void);
+void ib_sa_cleanup(void);
+
+int ib_nl_handle_resolve_resp(struct sk_buff *skb,
+                             struct netlink_callback *cb);
+int ib_nl_handle_set_timeout(struct sk_buff *skb,
+                            struct netlink_callback *cb);
+int ib_nl_handle_ip_res_resp(struct sk_buff *skb,
+                            struct netlink_callback *cb);
+
 #endif /* _CORE_PRIV_H */
index 1097984..5516fb0 100644 (file)
@@ -955,6 +955,29 @@ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
 }
 EXPORT_SYMBOL(ib_get_net_dev_by_params);
 
+static struct ibnl_client_cbs ibnl_ls_cb_table[] = {
+       [RDMA_NL_LS_OP_RESOLVE] = {
+               .dump = ib_nl_handle_resolve_resp,
+               .module = THIS_MODULE },
+       [RDMA_NL_LS_OP_SET_TIMEOUT] = {
+               .dump = ib_nl_handle_set_timeout,
+               .module = THIS_MODULE },
+       [RDMA_NL_LS_OP_IP_RESOLVE] = {
+               .dump = ib_nl_handle_ip_res_resp,
+               .module = THIS_MODULE },
+};
+
+static int ib_add_ibnl_clients(void)
+{
+       return ibnl_add_client(RDMA_NL_LS, ARRAY_SIZE(ibnl_ls_cb_table),
+                              ibnl_ls_cb_table);
+}
+
+static void ib_remove_ibnl_clients(void)
+{
+       ibnl_remove_client(RDMA_NL_LS);
+}
+
 static int __init ib_core_init(void)
 {
        int ret;
@@ -983,10 +1006,41 @@ static int __init ib_core_init(void)
                goto err_sysfs;
        }
 
+       ret = addr_init();
+       if (ret) {
+               pr_warn("Couldn't init IB address resolution\n");
+               goto err_ibnl;
+       }
+
+       ret = ib_mad_init();
+       if (ret) {
+               pr_warn("Couldn't init IB MAD\n");
+               goto err_addr;
+       }
+
+       ret = ib_sa_init();
+       if (ret) {
+               pr_warn("Couldn't init SA\n");
+               goto err_mad;
+       }
+
+       if (ib_add_ibnl_clients()) {
+               pr_warn("Couldn't register ibnl clients\n");
+               goto err_sa;
+       }
+
        ib_cache_setup();
 
        return 0;
 
+err_sa:
+       ib_sa_cleanup();
+err_mad:
+       ib_mad_cleanup();
+err_addr:
+       addr_cleanup();
+err_ibnl:
+       ibnl_cleanup();
 err_sysfs:
        class_unregister(&ib_class);
 err_comp:
@@ -999,6 +1053,10 @@ err:
 static void __exit ib_core_cleanup(void)
 {
        ib_cache_cleanup();
+       ib_remove_ibnl_clients();
+       ib_sa_cleanup();
+       ib_mad_cleanup();
+       addr_cleanup();
        ibnl_cleanup();
        class_unregister(&ib_class);
        destroy_workqueue(ib_comp_wq);
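
With ib_addr, ib_mad and ib_sa folded into ib_core, device.c becomes the one
place that registers the RDMA_NL_LS netlink client: each RDMA_NL_LS_OP_*
opcode is dispatched through ibnl_ls_cb_table to a handler now exported via
core_priv.h, and ib_core_init() grows a goto-unwind ladder so a failure in any
of the new init steps tears down exactly what succeeded before it.  A minimal
sketch of the op-indexed callback table, with hypothetical names and none of
the netlink plumbing:

#include <stdio.h>

enum { OP_RESOLVE, OP_SET_TIMEOUT, OP_IP_RESOLVE, OP_MAX };

struct client_cb {
        int (*dump)(int arg);
};

static int handle_resolve(int arg)     { return printf("resolve %d\n", arg); }
static int handle_set_timeout(int arg) { return printf("timeout %d\n", arg); }
static int handle_ip_resolve(int arg)  { return printf("ip resolve %d\n", arg); }

/* Designated initializers keep the table aligned with the opcode values */
static const struct client_cb cb_table[OP_MAX] = {
        [OP_RESOLVE]     = { .dump = handle_resolve },
        [OP_SET_TIMEOUT] = { .dump = handle_set_timeout },
        [OP_IP_RESOLVE]  = { .dump = handle_ip_resolve },
};

static int dispatch(unsigned int op, int arg)
{
        if (op >= OP_MAX || !cb_table[op].dump)
                return -1;              /* unknown or unhandled opcode */
        return cb_table[op].dump(arg);
}

int main(void)
{
        dispatch(OP_IP_RESOLVE, 42);
        return 0;
}

Adding RDMA_NL_LS_OP_IP_RESOLVE is therefore a one-entry change to the table
rather than a second netlink client registration.
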
index 9fa5bf3..82fb511 100644 (file)
 #include "smi.h"
 #include "opa_smi.h"
 #include "agent.h"
-
-MODULE_LICENSE("Dual BSD/GPL");
-MODULE_DESCRIPTION("kernel IB MAD API");
-MODULE_AUTHOR("Hal Rosenstock");
-MODULE_AUTHOR("Sean Hefty");
+#include "core_priv.h"
 
 static int mad_sendq_size = IB_MAD_QP_SEND_SIZE;
 static int mad_recvq_size = IB_MAD_QP_RECV_SIZE;
@@ -3316,7 +3312,7 @@ static struct ib_client mad_client = {
        .remove = ib_mad_remove_device
 };
 
-static int __init ib_mad_init_module(void)
+int ib_mad_init(void)
 {
        mad_recvq_size = min(mad_recvq_size, IB_MAD_QP_MAX_SIZE);
        mad_recvq_size = max(mad_recvq_size, IB_MAD_QP_MIN_SIZE);
@@ -3334,10 +3330,7 @@ static int __init ib_mad_init_module(void)
        return 0;
 }
 
-static void __exit ib_mad_cleanup_module(void)
+void ib_mad_cleanup(void)
 {
        ib_unregister_client(&mad_client);
 }
-
-module_init(ib_mad_init_module);
-module_exit(ib_mad_cleanup_module);
index 250937c..a83ec28 100644 (file)
@@ -93,6 +93,18 @@ enum {
 
 struct mcast_member;
 
+/*
+ * There are 4 types of join states:
+ * FullMember, NonMember, SendOnlyNonMember, SendOnlyFullMember.
+ */
+enum {
+       FULLMEMBER_JOIN,
+       NONMEMBER_JOIN,
+       SENDONLY_NONMEMBER_JOIN,
+       SENDONLY_FULLMEMBER_JOIN,
+       NUM_JOIN_MEMBERSHIP_TYPES,
+};
+
 struct mcast_group {
        struct ib_sa_mcmember_rec rec;
        struct rb_node          node;
@@ -102,7 +114,7 @@ struct mcast_group {
        struct list_head        pending_list;
        struct list_head        active_list;
        struct mcast_member     *last_join;
-       int                     members[3];
+       int                     members[NUM_JOIN_MEMBERSHIP_TYPES];
        atomic_t                refcount;
        enum mcast_group_state  state;
        struct ib_sa_query      *query;
@@ -220,8 +232,9 @@ static void queue_join(struct mcast_member *member)
 }
 
 /*
- * A multicast group has three types of members: full member, non member, and
- * send only member.  We need to keep track of the number of members of each
+ * A multicast group has four types of members: full member, non member,
+ * sendonly non member and sendonly full member.
+ * We need to keep track of the number of members of each
  * type based on their join state.  Adjust the number of members that belong to
  * the specified join states.
  */
@@ -229,7 +242,7 @@ static void adjust_membership(struct mcast_group *group, u8 join_state, int inc)
 {
        int i;
 
-       for (i = 0; i < 3; i++, join_state >>= 1)
+       for (i = 0; i < NUM_JOIN_MEMBERSHIP_TYPES; i++, join_state >>= 1)
                if (join_state & 0x1)
                        group->members[i] += inc;
 }
@@ -245,7 +258,7 @@ static u8 get_leave_state(struct mcast_group *group)
        u8 leave_state = 0;
        int i;
 
-       for (i = 0; i < 3; i++)
+       for (i = 0; i < NUM_JOIN_MEMBERSHIP_TYPES; i++)
                if (!group->members[i])
                        leave_state |= (0x1 << i);
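
The multicast change widens the per-group accounting from three join types to
four: join_state is a bitmask with one bit per membership type,
adjust_membership() walks those bits, and get_leave_state() reports which
types have dropped to zero members.  A standalone example of that bit-per-type
bookkeeping, with invented names and plain userspace C:

#include <stdio.h>

#define NUM_JOIN_TYPES 4   /* full, non, send-only non, send-only full member */

static void adjust(int members[NUM_JOIN_TYPES], unsigned join_state, int inc)
{
        int i;

        for (i = 0; i < NUM_JOIN_TYPES; i++, join_state >>= 1)
                if (join_state & 0x1)
                        members[i] += inc;
}

static unsigned leave_state(const int members[NUM_JOIN_TYPES])
{
        unsigned state = 0;
        int i;

        for (i = 0; i < NUM_JOIN_TYPES; i++)
                if (!members[i])
                        state |= 1u << i;
        return state;
}

int main(void)
{
        int members[NUM_JOIN_TYPES] = { 0 };

        adjust(members, 0x9, 1);   /* one FullMember + SendOnlyFullMember join */
        adjust(members, 0x1, 1);   /* a second FullMember join */
        adjust(members, 0x1, -1);  /* one FullMember leaves */

        /* Types 1 and 2 (NonMember, SendOnlyNonMember) are empty -> 0x6 */
        printf("leave_state = 0x%x\n", leave_state(members));
        return 0;
}
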
 
index 3ebd108..e955386 100644 (file)
 #include "sa.h"
 #include "core_priv.h"
 
-MODULE_AUTHOR("Roland Dreier");
-MODULE_DESCRIPTION("InfiniBand subnet administration query support");
-MODULE_LICENSE("Dual BSD/GPL");
-
 #define IB_SA_LOCAL_SVC_TIMEOUT_MIN            100
 #define IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT                2000
 #define IB_SA_LOCAL_SVC_TIMEOUT_MAX            200000
@@ -119,6 +115,12 @@ struct ib_sa_guidinfo_query {
        struct ib_sa_query sa_query;
 };
 
+struct ib_sa_classport_info_query {
+       void (*callback)(int, struct ib_class_port_info *, void *);
+       void *context;
+       struct ib_sa_query sa_query;
+};
+
 struct ib_sa_mcmember_query {
        void (*callback)(int, struct ib_sa_mcmember_rec *, void *);
        void *context;
@@ -392,6 +394,82 @@ static const struct ib_field service_rec_table[] = {
          .size_bits    = 2*64 },
 };
 
+#define CLASSPORTINFO_REC_FIELD(field) \
+       .struct_offset_bytes = offsetof(struct ib_class_port_info, field),      \
+       .struct_size_bytes   = sizeof((struct ib_class_port_info *)0)->field,   \
+       .field_name          = "ib_class_port_info:" #field
+
+static const struct ib_field classport_info_rec_table[] = {
+       { CLASSPORTINFO_REC_FIELD(base_version),
+         .offset_words = 0,
+         .offset_bits  = 0,
+         .size_bits    = 8 },
+       { CLASSPORTINFO_REC_FIELD(class_version),
+         .offset_words = 0,
+         .offset_bits  = 8,
+         .size_bits    = 8 },
+       { CLASSPORTINFO_REC_FIELD(capability_mask),
+         .offset_words = 0,
+         .offset_bits  = 16,
+         .size_bits    = 16 },
+       { CLASSPORTINFO_REC_FIELD(cap_mask2_resp_time),
+         .offset_words = 1,
+         .offset_bits  = 0,
+         .size_bits    = 32 },
+       { CLASSPORTINFO_REC_FIELD(redirect_gid),
+         .offset_words = 2,
+         .offset_bits  = 0,
+         .size_bits    = 128 },
+       { CLASSPORTINFO_REC_FIELD(redirect_tcslfl),
+         .offset_words = 6,
+         .offset_bits  = 0,
+         .size_bits    = 32 },
+       { CLASSPORTINFO_REC_FIELD(redirect_lid),
+         .offset_words = 7,
+         .offset_bits  = 0,
+         .size_bits    = 16 },
+       { CLASSPORTINFO_REC_FIELD(redirect_pkey),
+         .offset_words = 7,
+         .offset_bits  = 16,
+         .size_bits    = 16 },
+
+       { CLASSPORTINFO_REC_FIELD(redirect_qp),
+         .offset_words = 8,
+         .offset_bits  = 0,
+         .size_bits    = 32 },
+       { CLASSPORTINFO_REC_FIELD(redirect_qkey),
+         .offset_words = 9,
+         .offset_bits  = 0,
+         .size_bits    = 32 },
+
+       { CLASSPORTINFO_REC_FIELD(trap_gid),
+         .offset_words = 10,
+         .offset_bits  = 0,
+         .size_bits    = 128 },
+       { CLASSPORTINFO_REC_FIELD(trap_tcslfl),
+         .offset_words = 14,
+         .offset_bits  = 0,
+         .size_bits    = 32 },
+
+       { CLASSPORTINFO_REC_FIELD(trap_lid),
+         .offset_words = 15,
+         .offset_bits  = 0,
+         .size_bits    = 16 },
+       { CLASSPORTINFO_REC_FIELD(trap_pkey),
+         .offset_words = 15,
+         .offset_bits  = 16,
+         .size_bits    = 16 },
+
+       { CLASSPORTINFO_REC_FIELD(trap_hlqp),
+         .offset_words = 16,
+         .offset_bits  = 0,
+         .size_bits    = 32 },
+       { CLASSPORTINFO_REC_FIELD(trap_qkey),
+         .offset_words = 17,
+         .offset_bits  = 0,
+         .size_bits    = 32 },
+};
+
 #define GUIDINFO_REC_FIELD(field) \
        .struct_offset_bytes = offsetof(struct ib_sa_guidinfo_rec, field),      \
        .struct_size_bytes   = sizeof((struct ib_sa_guidinfo_rec *) 0)->field,  \
@@ -705,8 +783,8 @@ static void ib_nl_request_timeout(struct work_struct *work)
        spin_unlock_irqrestore(&ib_nl_request_lock, flags);
 }
 
-static int ib_nl_handle_set_timeout(struct sk_buff *skb,
-                                   struct netlink_callback *cb)
+int ib_nl_handle_set_timeout(struct sk_buff *skb,
+                            struct netlink_callback *cb)
 {
        const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh;
        int timeout, delta, abs_delta;
@@ -782,8 +860,8 @@ static inline int ib_nl_is_good_resolve_resp(const struct nlmsghdr *nlh)
        return 1;
 }
 
-static int ib_nl_handle_resolve_resp(struct sk_buff *skb,
-                                    struct netlink_callback *cb)
+int ib_nl_handle_resolve_resp(struct sk_buff *skb,
+                             struct netlink_callback *cb)
 {
        const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh;
        unsigned long flags;
@@ -838,15 +916,6 @@ resp_out:
        return skb->len;
 }
 
-static struct ibnl_client_cbs ib_sa_cb_table[] = {
-       [RDMA_NL_LS_OP_RESOLVE] = {
-               .dump = ib_nl_handle_resolve_resp,
-               .module = THIS_MODULE },
-       [RDMA_NL_LS_OP_SET_TIMEOUT] = {
-               .dump = ib_nl_handle_set_timeout,
-               .module = THIS_MODULE },
-};
-
 static void free_sm_ah(struct kref *kref)
 {
        struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref);
@@ -1645,6 +1714,97 @@ err1:
 }
 EXPORT_SYMBOL(ib_sa_guid_info_rec_query);
 
+/* Support for querying the SA ClassPortInfo */
+static void ib_sa_classport_info_rec_callback(struct ib_sa_query *sa_query,
+                                             int status,
+                                             struct ib_sa_mad *mad)
+{
+       struct ib_sa_classport_info_query *query =
+               container_of(sa_query, struct ib_sa_classport_info_query, sa_query);
+
+       if (mad) {
+               struct ib_class_port_info rec;
+
+               ib_unpack(classport_info_rec_table,
+                         ARRAY_SIZE(classport_info_rec_table),
+                         mad->data, &rec);
+               query->callback(status, &rec, query->context);
+       } else {
+               query->callback(status, NULL, query->context);
+       }
+}
+
+static void ib_sa_portclass_info_rec_release(struct ib_sa_query *sa_query)
+{
+       kfree(container_of(sa_query, struct ib_sa_classport_info_query,
+                          sa_query));
+}
+
+int ib_sa_classport_info_rec_query(struct ib_sa_client *client,
+                                  struct ib_device *device, u8 port_num,
+                                  int timeout_ms, gfp_t gfp_mask,
+                                  void (*callback)(int status,
+                                                   struct ib_class_port_info *resp,
+                                                   void *context),
+                                  void *context,
+                                  struct ib_sa_query **sa_query)
+{
+       struct ib_sa_classport_info_query *query;
+       struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+       struct ib_sa_port *port;
+       struct ib_mad_agent *agent;
+       struct ib_sa_mad *mad;
+       int ret;
+
+       if (!sa_dev)
+               return -ENODEV;
+
+       port  = &sa_dev->port[port_num - sa_dev->start_port];
+       agent = port->agent;
+
+       query = kzalloc(sizeof(*query), gfp_mask);
+       if (!query)
+               return -ENOMEM;
+
+       query->sa_query.port = port;
+       ret = alloc_mad(&query->sa_query, gfp_mask);
+       if (ret)
+               goto err1;
+
+       ib_sa_client_get(client);
+       query->sa_query.client = client;
+       query->callback        = callback;
+       query->context         = context;
+
+       mad = query->sa_query.mad_buf->mad;
+       init_mad(mad, agent);
+
+       query->sa_query.callback = callback ? ib_sa_classport_info_rec_callback : NULL;
+
+       query->sa_query.release  = ib_sa_portclass_info_rec_release;
+       /* support GET only */
+       mad->mad_hdr.method      = IB_MGMT_METHOD_GET;
+       mad->mad_hdr.attr_id     = cpu_to_be16(IB_SA_ATTR_CLASS_PORTINFO);
+       mad->sa_hdr.comp_mask    = 0;
+       *sa_query = &query->sa_query;
+
+       ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+       if (ret < 0)
+               goto err2;
+
+       return ret;
+
+err2:
+       *sa_query = NULL;
+       ib_sa_client_put(query->sa_query.client);
+       free_mad(&query->sa_query);
+
+err1:
+       kfree(query);
+       return ret;
+}
+EXPORT_SYMBOL(ib_sa_classport_info_rec_query);
+
 static void send_handler(struct ib_mad_agent *agent,
                         struct ib_mad_send_wc *mad_send_wc)
 {
@@ -1794,7 +1954,7 @@ static void ib_sa_remove_one(struct ib_device *device, void *client_data)
        kfree(sa_dev);
 }
 
-static int __init ib_sa_init(void)
+int ib_sa_init(void)
 {
        int ret;
 
@@ -1820,17 +1980,10 @@ static int __init ib_sa_init(void)
                goto err3;
        }
 
-       if (ibnl_add_client(RDMA_NL_LS, ARRAY_SIZE(ib_sa_cb_table),
-                           ib_sa_cb_table)) {
-               pr_err("Failed to add netlink callback\n");
-               ret = -EINVAL;
-               goto err4;
-       }
        INIT_DELAYED_WORK(&ib_nl_timed_work, ib_nl_request_timeout);
 
        return 0;
-err4:
-       destroy_workqueue(ib_nl_wq);
+
 err3:
        mcast_cleanup();
 err2:
@@ -1839,9 +1992,8 @@ err1:
        return ret;
 }
 
-static void __exit ib_sa_cleanup(void)
+void ib_sa_cleanup(void)
 {
-       ibnl_remove_client(RDMA_NL_LS);
        cancel_delayed_work(&ib_nl_timed_work);
        flush_workqueue(ib_nl_wq);
        destroy_workqueue(ib_nl_wq);
@@ -1849,6 +2001,3 @@ static void __exit ib_sa_cleanup(void)
        ib_unregister_client(&sa_client);
        idr_destroy(&query_idr);
 }
-
-module_init(ib_sa_init);
-module_exit(ib_sa_cleanup);
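
The sa_query.c additions expose an SA ClassPortInfo query:
classport_info_rec_table records, for every field of struct
ib_class_port_info, the 32-bit word offset, bit offset and width inside the
MAD data, and ib_unpack() walks that table to decode the response before the
callback runs.  The sketch below reduces the idea to byte-aligned fields up to
32 bits wide; struct field and extract() are invented names, not the ib_field
machinery itself.

#include <stdint.h>
#include <stdio.h>

struct field {
        const char *name;
        unsigned offset_words;   /* 32-bit words from the start of the MAD data */
        unsigned offset_bits;    /* bit offset within that word (byte-aligned here) */
        unsigned size_bits;      /* 8, 16 or 32 in this sketch */
};

static uint32_t extract(const uint8_t *buf, const struct field *f)
{
        unsigned byte = f->offset_words * 4 + f->offset_bits / 8;
        uint32_t val = 0;
        unsigned i;

        for (i = 0; i < f->size_bits / 8; i++)    /* big-endian wire order */
                val = (val << 8) | buf[byte + i];
        return val;
}

int main(void)
{
        /* First word of a ClassPortInfo-like record: version 1.3, capmask 0x0042 */
        const uint8_t data[4] = { 0x01, 0x03, 0x00, 0x42 };
        const struct field base_ver  = { "base_version",    0,  0,  8 };
        const struct field class_ver = { "class_version",   0,  8,  8 };
        const struct field capmask   = { "capability_mask", 0, 16, 16 };

        printf("%s=%u %s=%u %s=0x%x\n",
               base_ver.name,  extract(data, &base_ver),
               class_ver.name, extract(data, &class_ver),
               capmask.name,   extract(data, &capmask));
        return 0;
}
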
index 14606af..5e573bb 100644 (file)
@@ -56,8 +56,10 @@ struct ib_port {
        struct gid_attr_group *gid_attr_group;
        struct attribute_group gid_group;
        struct attribute_group pkey_group;
-       u8                     port_num;
        struct attribute_group *pma_table;
+       struct attribute_group *hw_stats_ag;
+       struct rdma_hw_stats   *hw_stats;
+       u8                     port_num;
 };
 
 struct port_attribute {
@@ -80,6 +82,18 @@ struct port_table_attribute {
        __be16                  attr_id;
 };
 
+struct hw_stats_attribute {
+       struct attribute        attr;
+       ssize_t                 (*show)(struct kobject *kobj,
+                                       struct attribute *attr, char *buf);
+       ssize_t                 (*store)(struct kobject *kobj,
+                                        struct attribute *attr,
+                                        const char *buf,
+                                        size_t count);
+       int                     index;
+       u8                      port_num;
+};
+
 static ssize_t port_attr_show(struct kobject *kobj,
                              struct attribute *attr, char *buf)
 {
@@ -733,6 +747,212 @@ static struct attribute_group *get_counter_table(struct ib_device *dev,
        return &pma_group;
 }
 
+static int update_hw_stats(struct ib_device *dev, struct rdma_hw_stats *stats,
+                          u8 port_num, int index)
+{
+       int ret;
+
+       if (time_is_after_eq_jiffies(stats->timestamp + stats->lifespan))
+               return 0;
+       ret = dev->get_hw_stats(dev, stats, port_num, index);
+       if (ret < 0)
+               return ret;
+       if (ret == stats->num_counters)
+               stats->timestamp = jiffies;
+
+       return 0;
+}
+
+static ssize_t print_hw_stat(struct rdma_hw_stats *stats, int index, char *buf)
+{
+       return sprintf(buf, "%llu\n", stats->value[index]);
+}
+
+static ssize_t show_hw_stats(struct kobject *kobj, struct attribute *attr,
+                            char *buf)
+{
+       struct ib_device *dev;
+       struct ib_port *port;
+       struct hw_stats_attribute *hsa;
+       struct rdma_hw_stats *stats;
+       int ret;
+
+       hsa = container_of(attr, struct hw_stats_attribute, attr);
+       if (!hsa->port_num) {
+               dev = container_of((struct device *)kobj,
+                                  struct ib_device, dev);
+               stats = dev->hw_stats;
+       } else {
+               port = container_of(kobj, struct ib_port, kobj);
+               dev = port->ibdev;
+               stats = port->hw_stats;
+       }
+       ret = update_hw_stats(dev, stats, hsa->port_num, hsa->index);
+       if (ret)
+               return ret;
+       return print_hw_stat(stats, hsa->index, buf);
+}
+
+static ssize_t show_stats_lifespan(struct kobject *kobj,
+                                  struct attribute *attr,
+                                  char *buf)
+{
+       struct hw_stats_attribute *hsa;
+       int msecs;
+
+       hsa = container_of(attr, struct hw_stats_attribute, attr);
+       if (!hsa->port_num) {
+               struct ib_device *dev = container_of((struct device *)kobj,
+                                                    struct ib_device, dev);
+               msecs = jiffies_to_msecs(dev->hw_stats->lifespan);
+       } else {
+               struct ib_port *p = container_of(kobj, struct ib_port, kobj);
+               msecs = jiffies_to_msecs(p->hw_stats->lifespan);
+       }
+       return sprintf(buf, "%d\n", msecs);
+}
+
+static ssize_t set_stats_lifespan(struct kobject *kobj,
+                                 struct attribute *attr,
+                                 const char *buf, size_t count)
+{
+       struct hw_stats_attribute *hsa;
+       int msecs;
+       int jiffies;
+       int ret;
+
+       ret = kstrtoint(buf, 10, &msecs);
+       if (ret)
+               return ret;
+       if (msecs < 0 || msecs > 10000)
+               return -EINVAL;
+       jiffies = msecs_to_jiffies(msecs);
+       hsa = container_of(attr, struct hw_stats_attribute, attr);
+       if (!hsa->port_num) {
+               struct ib_device *dev = container_of((struct device *)kobj,
+                                                    struct ib_device, dev);
+               dev->hw_stats->lifespan = jiffies;
+       } else {
+               struct ib_port *p = container_of(kobj, struct ib_port, kobj);
+               p->hw_stats->lifespan = jiffies;
+       }
+       return count;
+}
+
+static void free_hsag(struct kobject *kobj, struct attribute_group *attr_group)
+{
+       struct attribute **attr;
+
+       sysfs_remove_group(kobj, attr_group);
+
+       for (attr = attr_group->attrs; *attr; attr++)
+               kfree(*attr);
+       kfree(attr_group);
+}
+
+static struct attribute *alloc_hsa(int index, u8 port_num, const char *name)
+{
+       struct hw_stats_attribute *hsa;
+
+       hsa = kmalloc(sizeof(*hsa), GFP_KERNEL);
+       if (!hsa)
+               return NULL;
+
+       hsa->attr.name = (char *)name;
+       hsa->attr.mode = S_IRUGO;
+       hsa->show = show_hw_stats;
+       hsa->store = NULL;
+       hsa->index = index;
+       hsa->port_num = port_num;
+
+       return &hsa->attr;
+}
+
+static struct attribute *alloc_hsa_lifespan(char *name, u8 port_num)
+{
+       struct hw_stats_attribute *hsa;
+
+       hsa = kmalloc(sizeof(*hsa), GFP_KERNEL);
+       if (!hsa)
+               return NULL;
+
+       hsa->attr.name = name;
+       hsa->attr.mode = S_IWUSR | S_IRUGO;
+       hsa->show = show_stats_lifespan;
+       hsa->store = set_stats_lifespan;
+       hsa->index = 0;
+       hsa->port_num = port_num;
+
+       return &hsa->attr;
+}
+
+static void setup_hw_stats(struct ib_device *device, struct ib_port *port,
+                          u8 port_num)
+{
+       struct attribute_group *hsag = NULL;
+       struct rdma_hw_stats *stats;
+       int i = 0, ret;
+
+       stats = device->alloc_hw_stats(device, port_num);
+
+       if (!stats)
+               return;
+
+       if (!stats->names || stats->num_counters <= 0)
+               goto err;
+
+       hsag = kzalloc(sizeof(*hsag) +
+                      /* 1 extra for the lifespan config entry */
+                      sizeof(void *) * (stats->num_counters + 1),
+                      GFP_KERNEL);
+       if (!hsag)
+               return;
+
+       ret = device->get_hw_stats(device, stats, port_num,
+                                  stats->num_counters);
+       if (ret != stats->num_counters)
+               goto err;
+
+       stats->timestamp = jiffies;
+
+       hsag->name = "hw_counters";
+       hsag->attrs = (void *)hsag + sizeof(*hsag);
+
+       for (i = 0; i < stats->num_counters; i++) {
+               hsag->attrs[i] = alloc_hsa(i, port_num, stats->names[i]);
+               if (!hsag->attrs[i])
+                       goto err;
+       }
+
+       /* treat an error here as non-fatal */
+       hsag->attrs[i] = alloc_hsa_lifespan("lifespan", port_num);
+
+       if (port) {
+               struct kobject *kobj = &port->kobj;
+               ret = sysfs_create_group(kobj, hsag);
+               if (ret)
+                       goto err;
+               port->hw_stats_ag = hsag;
+               port->hw_stats = stats;
+       } else {
+               struct kobject *kobj = &device->dev.kobj;
+               ret = sysfs_create_group(kobj, hsag);
+               if (ret)
+                       goto err;
+               device->hw_stats_ag = hsag;
+               device->hw_stats = stats;
+       }
+
+       return;
+
+err:
+       kfree(stats);
+       for (; i >= 0; i--)
+       /* hsag is still NULL if we bailed out before allocating it */
+       for (; hsag && i >= 0; i--)
+               kfree(hsag->attrs[i]);
+       return;
+}
+
 static int add_port(struct ib_device *device, int port_num,
                    int (*port_callback)(struct ib_device *,
                                         u8, struct kobject *))
@@ -835,6 +1055,14 @@ static int add_port(struct ib_device *device, int port_num,
                        goto err_remove_pkey;
        }
 
+       /*
+        * If port == 0, it means we have only one port and the parent
+        * device, not this port device, should be the holder of the
+        * hw_counters
+        */
+       if (device->alloc_hw_stats && port_num)
+               setup_hw_stats(device, p, port_num);
+
        list_add_tail(&p->kobj.entry, &device->port_list);
 
        kobject_uevent(&p->kobj, KOBJ_ADD);
@@ -972,120 +1200,6 @@ static struct device_attribute *ib_class_attributes[] = {
        &dev_attr_node_desc
 };
 
-/* Show a given an attribute in the statistics group */
-static ssize_t show_protocol_stat(const struct device *device,
-                           struct device_attribute *attr, char *buf,
-                           unsigned offset)
-{
-       struct ib_device *dev = container_of(device, struct ib_device, dev);
-       union rdma_protocol_stats stats;
-       ssize_t ret;
-
-       ret = dev->get_protocol_stats(dev, &stats);
-       if (ret)
-               return ret;
-
-       return sprintf(buf, "%llu\n",
-                      (unsigned long long) ((u64 *) &stats)[offset]);
-}
-
-/* generate a read-only iwarp statistics attribute */
-#define IW_STATS_ENTRY(name)                                           \
-static ssize_t show_##name(struct device *device,                      \
-                          struct device_attribute *attr, char *buf)    \
-{                                                                      \
-       return show_protocol_stat(device, attr, buf,                    \
-                                 offsetof(struct iw_protocol_stats, name) / \
-                                 sizeof (u64));                        \
-}                                                                      \
-static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
-
-IW_STATS_ENTRY(ipInReceives);
-IW_STATS_ENTRY(ipInHdrErrors);
-IW_STATS_ENTRY(ipInTooBigErrors);
-IW_STATS_ENTRY(ipInNoRoutes);
-IW_STATS_ENTRY(ipInAddrErrors);
-IW_STATS_ENTRY(ipInUnknownProtos);
-IW_STATS_ENTRY(ipInTruncatedPkts);
-IW_STATS_ENTRY(ipInDiscards);
-IW_STATS_ENTRY(ipInDelivers);
-IW_STATS_ENTRY(ipOutForwDatagrams);
-IW_STATS_ENTRY(ipOutRequests);
-IW_STATS_ENTRY(ipOutDiscards);
-IW_STATS_ENTRY(ipOutNoRoutes);
-IW_STATS_ENTRY(ipReasmTimeout);
-IW_STATS_ENTRY(ipReasmReqds);
-IW_STATS_ENTRY(ipReasmOKs);
-IW_STATS_ENTRY(ipReasmFails);
-IW_STATS_ENTRY(ipFragOKs);
-IW_STATS_ENTRY(ipFragFails);
-IW_STATS_ENTRY(ipFragCreates);
-IW_STATS_ENTRY(ipInMcastPkts);
-IW_STATS_ENTRY(ipOutMcastPkts);
-IW_STATS_ENTRY(ipInBcastPkts);
-IW_STATS_ENTRY(ipOutBcastPkts);
-IW_STATS_ENTRY(tcpRtoAlgorithm);
-IW_STATS_ENTRY(tcpRtoMin);
-IW_STATS_ENTRY(tcpRtoMax);
-IW_STATS_ENTRY(tcpMaxConn);
-IW_STATS_ENTRY(tcpActiveOpens);
-IW_STATS_ENTRY(tcpPassiveOpens);
-IW_STATS_ENTRY(tcpAttemptFails);
-IW_STATS_ENTRY(tcpEstabResets);
-IW_STATS_ENTRY(tcpCurrEstab);
-IW_STATS_ENTRY(tcpInSegs);
-IW_STATS_ENTRY(tcpOutSegs);
-IW_STATS_ENTRY(tcpRetransSegs);
-IW_STATS_ENTRY(tcpInErrs);
-IW_STATS_ENTRY(tcpOutRsts);
-
-static struct attribute *iw_proto_stats_attrs[] = {
-       &dev_attr_ipInReceives.attr,
-       &dev_attr_ipInHdrErrors.attr,
-       &dev_attr_ipInTooBigErrors.attr,
-       &dev_attr_ipInNoRoutes.attr,
-       &dev_attr_ipInAddrErrors.attr,
-       &dev_attr_ipInUnknownProtos.attr,
-       &dev_attr_ipInTruncatedPkts.attr,
-       &dev_attr_ipInDiscards.attr,
-       &dev_attr_ipInDelivers.attr,
-       &dev_attr_ipOutForwDatagrams.attr,
-       &dev_attr_ipOutRequests.attr,
-       &dev_attr_ipOutDiscards.attr,
-       &dev_attr_ipOutNoRoutes.attr,
-       &dev_attr_ipReasmTimeout.attr,
-       &dev_attr_ipReasmReqds.attr,
-       &dev_attr_ipReasmOKs.attr,
-       &dev_attr_ipReasmFails.attr,
-       &dev_attr_ipFragOKs.attr,
-       &dev_attr_ipFragFails.attr,
-       &dev_attr_ipFragCreates.attr,
-       &dev_attr_ipInMcastPkts.attr,
-       &dev_attr_ipOutMcastPkts.attr,
-       &dev_attr_ipInBcastPkts.attr,
-       &dev_attr_ipOutBcastPkts.attr,
-       &dev_attr_tcpRtoAlgorithm.attr,
-       &dev_attr_tcpRtoMin.attr,
-       &dev_attr_tcpRtoMax.attr,
-       &dev_attr_tcpMaxConn.attr,
-       &dev_attr_tcpActiveOpens.attr,
-       &dev_attr_tcpPassiveOpens.attr,
-       &dev_attr_tcpAttemptFails.attr,
-       &dev_attr_tcpEstabResets.attr,
-       &dev_attr_tcpCurrEstab.attr,
-       &dev_attr_tcpInSegs.attr,
-       &dev_attr_tcpOutSegs.attr,
-       &dev_attr_tcpRetransSegs.attr,
-       &dev_attr_tcpInErrs.attr,
-       &dev_attr_tcpOutRsts.attr,
-       NULL
-};
-
-static struct attribute_group iw_stats_group = {
-       .name   = "proto_stats",
-       .attrs  = iw_proto_stats_attrs,
-};
-
 static void free_port_list_attributes(struct ib_device *device)
 {
        struct kobject *p, *t;
@@ -1093,6 +1207,10 @@ static void free_port_list_attributes(struct ib_device *device)
        list_for_each_entry_safe(p, t, &device->port_list, entry) {
                struct ib_port *port = container_of(p, struct ib_port, kobj);
                list_del(&p->entry);
+               if (port->hw_stats) {
+                       kfree(port->hw_stats);
+                       free_hsag(&port->kobj, port->hw_stats_ag);
+               }
                sysfs_remove_group(p, port->pma_table);
                sysfs_remove_group(p, &port->pkey_group);
                sysfs_remove_group(p, &port->gid_group);
@@ -1149,11 +1267,8 @@ int ib_device_register_sysfs(struct ib_device *device,
                }
        }
 
-       if (device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats) {
-               ret = sysfs_create_group(&class_dev->kobj, &iw_stats_group);
-               if (ret)
-                       goto err_put;
-       }
+       if (device->alloc_hw_stats)
+               setup_hw_stats(device, NULL, 0);
 
        return 0;
 
@@ -1169,15 +1284,18 @@ err:
 
 void ib_device_unregister_sysfs(struct ib_device *device)
 {
-       /* Hold kobject until ib_dealloc_device() */
-       struct kobject *kobj_dev = kobject_get(&device->dev.kobj);
        int i;
 
-       if (device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats)
-               sysfs_remove_group(kobj_dev, &iw_stats_group);
+       /* Hold kobject until ib_dealloc_device() */
+       kobject_get(&device->dev.kobj);
 
        free_port_list_attributes(device);
 
+       if (device->hw_stats) {
+               kfree(device->hw_stats);
+               free_hsag(&device->dev.kobj, device->hw_stats_ag);
+       }
+
        for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i)
                device_remove_file(&device->dev, ib_class_attributes[i]);
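
sysfs.c replaces the iWARP-only proto_stats group with a generic hw_counters
group built from whatever names the driver's alloc_hw_stats() callback
returns, plus a writable lifespan attribute (0-10000 ms) that rate-limits how
often a read actually calls get_hw_stats().  A rough userspace rendering of
that lifespan-gated refresh, using CLOCK_MONOTONIC in place of jiffies and
invented names throughout:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct cached_stats {
        uint64_t value;
        struct timespec timestamp;
        long lifespan_ms;           /* like the sysfs "lifespan" file */
};

static long ms_since(const struct timespec *t)
{
        struct timespec now;

        clock_gettime(CLOCK_MONOTONIC, &now);
        return (now.tv_sec - t->tv_sec) * 1000 +
               (now.tv_nsec - t->tv_nsec) / 1000000;
}

static uint64_t read_stat(struct cached_stats *s, uint64_t (*hw_read)(void))
{
        /* Only hit the hardware when the cached value has expired */
        if (ms_since(&s->timestamp) >= s->lifespan_ms) {
                s->value = hw_read();
                clock_gettime(CLOCK_MONOTONIC, &s->timestamp);
        }
        return s->value;
}

static uint64_t fake_hw_read(void)
{
        static uint64_t counter;

        return ++counter;           /* stands in for dev->get_hw_stats() */
}

int main(void)
{
        struct cached_stats s = { .lifespan_ms = 10 };

        printf("%llu\n", (unsigned long long)read_stat(&s, fake_hw_read));
        printf("%llu\n", (unsigned long long)read_stat(&s, fake_hw_read));
        return 0;
}

Two reads inside the lifespan window return the same cached value, which is
exactly what update_hw_stats() achieves with stats->timestamp above.
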
 
index c7ad0a4..c0c7cf8 100644 (file)
@@ -8,3 +8,4 @@ obj-$(CONFIG_MLX5_INFINIBAND)           += mlx5/
 obj-$(CONFIG_INFINIBAND_NES)           += nes/
 obj-$(CONFIG_INFINIBAND_OCRDMA)                += ocrdma/
 obj-$(CONFIG_INFINIBAND_USNIC)         += usnic/
+obj-$(CONFIG_INFINIBAND_HFI1)          += hfi1/
index de1c61b..ada2e50 100644 (file)
@@ -327,7 +327,7 @@ int cxio_destroy_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq)
        kfree(cq->sw_queue);
        dma_free_coherent(&(rdev_p->rnic_info.pdev->dev),
                          (1UL << (cq->size_log2))
-                         * sizeof(struct t3_cqe), cq->queue,
+                         * sizeof(struct t3_cqe) + 1, cq->queue,
                          dma_unmap_addr(cq, mapping));
        cxio_hal_put_cqid(rdev_p->rscp, cq->cqid);
        return err;
index 47cb927..bb1a839 100644 (file)
@@ -1218,59 +1218,119 @@ static ssize_t show_board(struct device *dev, struct device_attribute *attr,
                       iwch_dev->rdev.rnic_info.pdev->device);
 }
 
-static int iwch_get_mib(struct ib_device *ibdev,
-                       union rdma_protocol_stats *stats)
+enum counters {
+       IPINRECEIVES,
+       IPINHDRERRORS,
+       IPINADDRERRORS,
+       IPINUNKNOWNPROTOS,
+       IPINDISCARDS,
+       IPINDELIVERS,
+       IPOUTREQUESTS,
+       IPOUTDISCARDS,
+       IPOUTNOROUTES,
+       IPREASMTIMEOUT,
+       IPREASMREQDS,
+       IPREASMOKS,
+       IPREASMFAILS,
+       TCPACTIVEOPENS,
+       TCPPASSIVEOPENS,
+       TCPATTEMPTFAILS,
+       TCPESTABRESETS,
+       TCPCURRESTAB,
+       TCPINSEGS,
+       TCPOUTSEGS,
+       TCPRETRANSSEGS,
+       TCPINERRS,
+       TCPOUTRSTS,
+       TCPRTOMIN,
+       TCPRTOMAX,
+       NR_COUNTERS
+};
+
+static const char * const names[] = {
+       [IPINRECEIVES] = "ipInReceives",
+       [IPINHDRERRORS] = "ipInHdrErrors",
+       [IPINADDRERRORS] = "ipInAddrErrors",
+       [IPINUNKNOWNPROTOS] = "ipInUnknownProtos",
+       [IPINDISCARDS] = "ipInDiscards",
+       [IPINDELIVERS] = "ipInDelivers",
+       [IPOUTREQUESTS] = "ipOutRequests",
+       [IPOUTDISCARDS] = "ipOutDiscards",
+       [IPOUTNOROUTES] = "ipOutNoRoutes",
+       [IPREASMTIMEOUT] = "ipReasmTimeout",
+       [IPREASMREQDS] = "ipReasmReqds",
+       [IPREASMOKS] = "ipReasmOKs",
+       [IPREASMFAILS] = "ipReasmFails",
+       [TCPACTIVEOPENS] = "tcpActiveOpens",
+       [TCPPASSIVEOPENS] = "tcpPassiveOpens",
+       [TCPATTEMPTFAILS] = "tcpAttemptFails",
+       [TCPESTABRESETS] = "tcpEstabResets",
+       [TCPCURRESTAB] = "tcpCurrEstab",
+       [TCPINSEGS] = "tcpInSegs",
+       [TCPOUTSEGS] = "tcpOutSegs",
+       [TCPRETRANSSEGS] = "tcpRetransSegs",
+       [TCPINERRS] = "tcpInErrs",
+       [TCPOUTRSTS] = "tcpOutRsts",
+       [TCPRTOMIN] = "tcpRtoMin",
+       [TCPRTOMAX] = "tcpRtoMax",
+};
+
+static struct rdma_hw_stats *iwch_alloc_stats(struct ib_device *ibdev,
+                                             u8 port_num)
+{
+       BUILD_BUG_ON(ARRAY_SIZE(names) != NR_COUNTERS);
+
+       /* Our driver only supports device level stats */
+       if (port_num != 0)
+               return NULL;
+
+       return rdma_alloc_hw_stats_struct(names, NR_COUNTERS,
+                                         RDMA_HW_STATS_DEFAULT_LIFESPAN);
+}
+
+static int iwch_get_mib(struct ib_device *ibdev, struct rdma_hw_stats *stats,
+                       u8 port, int index)
 {
        struct iwch_dev *dev;
        struct tp_mib_stats m;
        int ret;
 
+       if (port != 0 || !stats)
+               return -ENOSYS;
+
        PDBG("%s ibdev %p\n", __func__, ibdev);
        dev = to_iwch_dev(ibdev);
        ret = dev->rdev.t3cdev_p->ctl(dev->rdev.t3cdev_p, RDMA_GET_MIB, &m);
        if (ret)
                return -ENOSYS;
 
-       memset(stats, 0, sizeof *stats);
-       stats->iw.ipInReceives = ((u64) m.ipInReceive_hi << 32) +
-                               m.ipInReceive_lo;
-       stats->iw.ipInHdrErrors = ((u64) m.ipInHdrErrors_hi << 32) +
-                                 m.ipInHdrErrors_lo;
-       stats->iw.ipInAddrErrors = ((u64) m.ipInAddrErrors_hi << 32) +
-                                  m.ipInAddrErrors_lo;
-       stats->iw.ipInUnknownProtos = ((u64) m.ipInUnknownProtos_hi << 32) +
-                                     m.ipInUnknownProtos_lo;
-       stats->iw.ipInDiscards = ((u64) m.ipInDiscards_hi << 32) +
-                                m.ipInDiscards_lo;
-       stats->iw.ipInDelivers = ((u64) m.ipInDelivers_hi << 32) +
-                                m.ipInDelivers_lo;
-       stats->iw.ipOutRequests = ((u64) m.ipOutRequests_hi << 32) +
-                                 m.ipOutRequests_lo;
-       stats->iw.ipOutDiscards = ((u64) m.ipOutDiscards_hi << 32) +
-                                 m.ipOutDiscards_lo;
-       stats->iw.ipOutNoRoutes = ((u64) m.ipOutNoRoutes_hi << 32) +
-                                 m.ipOutNoRoutes_lo;
-       stats->iw.ipReasmTimeout = (u64) m.ipReasmTimeout;
-       stats->iw.ipReasmReqds = (u64) m.ipReasmReqds;
-       stats->iw.ipReasmOKs = (u64) m.ipReasmOKs;
-       stats->iw.ipReasmFails = (u64) m.ipReasmFails;
-       stats->iw.tcpActiveOpens = (u64) m.tcpActiveOpens;
-       stats->iw.tcpPassiveOpens = (u64) m.tcpPassiveOpens;
-       stats->iw.tcpAttemptFails = (u64) m.tcpAttemptFails;
-       stats->iw.tcpEstabResets = (u64) m.tcpEstabResets;
-       stats->iw.tcpOutRsts = (u64) m.tcpOutRsts;
-       stats->iw.tcpCurrEstab = (u64) m.tcpCurrEstab;
-       stats->iw.tcpInSegs = ((u64) m.tcpInSegs_hi << 32) +
-                             m.tcpInSegs_lo;
-       stats->iw.tcpOutSegs = ((u64) m.tcpOutSegs_hi << 32) +
-                              m.tcpOutSegs_lo;
-       stats->iw.tcpRetransSegs = ((u64) m.tcpRetransSeg_hi << 32) +
-                                 m.tcpRetransSeg_lo;
-       stats->iw.tcpInErrs = ((u64) m.tcpInErrs_hi << 32) +
-                             m.tcpInErrs_lo;
-       stats->iw.tcpRtoMin = (u64) m.tcpRtoMin;
-       stats->iw.tcpRtoMax = (u64) m.tcpRtoMax;
-       return 0;
+       stats->value[IPINRECEIVES] = ((u64)m.ipInReceive_hi << 32) +    m.ipInReceive_lo;
+       stats->value[IPINHDRERRORS] = ((u64)m.ipInHdrErrors_hi << 32) + m.ipInHdrErrors_lo;
+       stats->value[IPINADDRERRORS] = ((u64)m.ipInAddrErrors_hi << 32) + m.ipInAddrErrors_lo;
+       stats->value[IPINUNKNOWNPROTOS] = ((u64)m.ipInUnknownProtos_hi << 32) + m.ipInUnknownProtos_lo;
+       stats->value[IPINDISCARDS] = ((u64)m.ipInDiscards_hi << 32) + m.ipInDiscards_lo;
+       stats->value[IPINDELIVERS] = ((u64)m.ipInDelivers_hi << 32) + m.ipInDelivers_lo;
+       stats->value[IPOUTREQUESTS] = ((u64)m.ipOutRequests_hi << 32) + m.ipOutRequests_lo;
+       stats->value[IPOUTDISCARDS] = ((u64)m.ipOutDiscards_hi << 32) + m.ipOutDiscards_lo;
+       stats->value[IPOUTNOROUTES] = ((u64)m.ipOutNoRoutes_hi << 32) + m.ipOutNoRoutes_lo;
+       stats->value[IPREASMTIMEOUT] =  m.ipReasmTimeout;
+       stats->value[IPREASMREQDS] = m.ipReasmReqds;
+       stats->value[IPREASMOKS] = m.ipReasmOKs;
+       stats->value[IPREASMFAILS] = m.ipReasmFails;
+       stats->value[TCPACTIVEOPENS] =  m.tcpActiveOpens;
+       stats->value[TCPPASSIVEOPENS] = m.tcpPassiveOpens;
+       stats->value[TCPATTEMPTFAILS] = m.tcpAttemptFails;
+       stats->value[TCPESTABRESETS] = m.tcpEstabResets;
+       stats->value[TCPCURRESTAB] = m.tcpCurrEstab;
+       stats->value[TCPINSEGS] = ((u64)m.tcpInSegs_hi << 32) + m.tcpInSegs_lo;
+       stats->value[TCPOUTSEGS] = ((u64)m.tcpOutSegs_hi << 32) + m.tcpOutSegs_lo;
+       stats->value[TCPRETRANSSEGS] = ((u64)m.tcpRetransSeg_hi << 32) + m.tcpRetransSeg_lo;
+       stats->value[TCPINERRS] = ((u64)m.tcpInErrs_hi << 32) + m.tcpInErrs_lo;
+       stats->value[TCPOUTRSTS] = m.tcpOutRsts;
+       stats->value[TCPRTOMIN] = m.tcpRtoMin;
+       stats->value[TCPRTOMAX] = m.tcpRtoMax;
+
+       return stats->num_counters;
 }
 
 static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
@@ -1373,7 +1433,8 @@ int iwch_register_device(struct iwch_dev *dev)
        dev->ibdev.req_notify_cq = iwch_arm_cq;
        dev->ibdev.post_send = iwch_post_send;
        dev->ibdev.post_recv = iwch_post_receive;
-       dev->ibdev.get_protocol_stats = iwch_get_mib;
+       dev->ibdev.alloc_hw_stats = iwch_alloc_stats;
+       dev->ibdev.get_hw_stats = iwch_get_mib;
        dev->ibdev.uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION;
        dev->ibdev.get_port_immutable = iwch_port_immutable;
 
index 7574f39..dd8a86b 100644 (file)
@@ -446,20 +446,59 @@ static ssize_t show_board(struct device *dev, struct device_attribute *attr,
                       c4iw_dev->rdev.lldi.pdev->device);
 }
 
+enum counters {
+       IP4INSEGS,
+       IP4OUTSEGS,
+       IP4RETRANSSEGS,
+       IP4OUTRSTS,
+       IP6INSEGS,
+       IP6OUTSEGS,
+       IP6RETRANSSEGS,
+       IP6OUTRSTS,
+       NR_COUNTERS
+};
+
+static const char * const names[] = {
+       [IP4INSEGS] = "ip4InSegs",
+       [IP4OUTSEGS] = "ip4OutSegs",
+       [IP4RETRANSSEGS] = "ip4RetransSegs",
+       [IP4OUTRSTS] = "ip4OutRsts",
+       [IP6INSEGS] = "ip6InSegs",
+       [IP6OUTSEGS] = "ip6OutSegs",
+       [IP6RETRANSSEGS] = "ip6RetransSegs",
+       [IP6OUTRSTS] = "ip6OutRsts"
+};
+
+static struct rdma_hw_stats *c4iw_alloc_stats(struct ib_device *ibdev,
+                                             u8 port_num)
+{
+       BUILD_BUG_ON(ARRAY_SIZE(names) != NR_COUNTERS);
+
+       if (port_num != 0)
+               return NULL;
+
+       return rdma_alloc_hw_stats_struct(names, NR_COUNTERS,
+                                         RDMA_HW_STATS_DEFAULT_LIFESPAN);
+}
+
 static int c4iw_get_mib(struct ib_device *ibdev,
-                       union rdma_protocol_stats *stats)
+                       struct rdma_hw_stats *stats,
+                       u8 port, int index)
 {
        struct tp_tcp_stats v4, v6;
        struct c4iw_dev *c4iw_dev = to_c4iw_dev(ibdev);
 
        cxgb4_get_tcp_stats(c4iw_dev->rdev.lldi.pdev, &v4, &v6);
-       memset(stats, 0, sizeof *stats);
-       stats->iw.tcpInSegs = v4.tcp_in_segs + v6.tcp_in_segs;
-       stats->iw.tcpOutSegs = v4.tcp_out_segs + v6.tcp_out_segs;
-       stats->iw.tcpRetransSegs = v4.tcp_retrans_segs + v6.tcp_retrans_segs;
-       stats->iw.tcpOutRsts = v4.tcp_out_rsts + v6.tcp_out_rsts;
-
-       return 0;
+       stats->value[IP4INSEGS] = v4.tcp_in_segs;
+       stats->value[IP4OUTSEGS] = v4.tcp_out_segs;
+       stats->value[IP4RETRANSSEGS] = v4.tcp_retrans_segs;
+       stats->value[IP4OUTRSTS] = v4.tcp_out_rsts;
+       stats->value[IP6INSEGS] = v6.tcp_in_segs;
+       stats->value[IP6OUTSEGS] = v6.tcp_out_segs;
+       stats->value[IP6RETRANSSEGS] = v6.tcp_retrans_segs;
+       stats->value[IP6OUTRSTS] = v6.tcp_out_rsts;
+
+       return stats->num_counters;
 }
 
 static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
@@ -562,7 +601,8 @@ int c4iw_register_device(struct c4iw_dev *dev)
        dev->ibdev.req_notify_cq = c4iw_arm_cq;
        dev->ibdev.post_send = c4iw_post_send;
        dev->ibdev.post_recv = c4iw_post_receive;
-       dev->ibdev.get_protocol_stats = c4iw_get_mib;
+       dev->ibdev.alloc_hw_stats = c4iw_alloc_stats;
+       dev->ibdev.get_hw_stats = c4iw_get_mib;
        dev->ibdev.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION;
        dev->ibdev.get_port_immutable = c4iw_port_immutable;
        dev->ibdev.drain_sq = c4iw_drain_sq;
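
Both the cxgb3 and cxgb4 providers drop the fixed union rdma_protocol_stats in
favour of the new scheme: an enum of counter indices, a names[] array keyed by
that enum, a BUILD_BUG_ON() tying the two together, and a get_hw_stats()
callback that fills stats->value[] and returns num_counters.  The T3 MIB
additionally reports each 64-bit counter as separate hi/lo halves.  A compact
userspace sketch of both pieces, with invented names and static_assert
standing in for BUILD_BUG_ON():

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

enum { IPINRECEIVES, TCPINSEGS, NR_COUNTERS };

static const char * const names[] = {
        [IPINRECEIVES] = "ipInReceives",
        [TCPINSEGS]    = "tcpInSegs",
};

/* Same idea as the BUILD_BUG_ON() in iwch_alloc_stats()/c4iw_alloc_stats() */
static_assert(sizeof(names) / sizeof(names[0]) == NR_COUNTERS,
              "names[] and the counter enum are out of sync");

static uint64_t combine(uint32_t hi, uint32_t lo)
{
        return ((uint64_t)hi << 32) + lo;   /* 64-bit counter from two MIB halves */
}

int main(void)
{
        uint64_t value[NR_COUNTERS];

        value[IPINRECEIVES] = combine(0x1, 0x2);   /* 0x100000002 */
        value[TCPINSEGS]    = combine(0, 1234);
        printf("%s = %llu, %s = %llu\n",
               names[IPINRECEIVES], (unsigned long long)value[IPINRECEIVES],
               names[TCPINSEGS], (unsigned long long)value[TCPINSEGS]);
        return 0;
}
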
diff --git a/drivers/infiniband/hw/hfi1/Kconfig b/drivers/infiniband/hw/hfi1/Kconfig
new file mode 100644 (file)
index 0000000..a925fb0
--- /dev/null
@@ -0,0 +1,29 @@
+config INFINIBAND_HFI1
+       tristate "Intel OPA Gen1 support"
+       depends on X86_64 && INFINIBAND_RDMAVT
+       select MMU_NOTIFIER
+       select CRC32
+       default m
+       ---help---
+       This is a low-level driver for the Intel OPA Gen1 adapter.
+config HFI1_DEBUG_SDMA_ORDER
+       bool "HFI1 SDMA Order debug"
+       depends on INFINIBAND_HFI1
+       default n
+       ---help---
+       This is a debug flag to test for out-of-order
+       SDMA completions during unit testing.
+config HFI1_VERBS_31BIT_PSN
+       bool "HFI1 enable 31 bit PSN"
+       depends on INFINIBAND_HFI1
+       default y
+       ---help---
+       Setting this enables 31-bit PSNs
+       for verbs RC/UC.
+config SDMA_VERBOSITY
+       bool "Config SDMA Verbosity"
+       depends on INFINIBAND_HFI1
+       default n
+       ---help---
+       This is a configuration flag to enable verbose
+       SDMA debug
diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile
new file mode 100644 (file)
index 0000000..9b5382c
--- /dev/null
@@ -0,0 +1,21 @@
+#
+# HFI driver
+#
+#
+#
+# Called from the kernel module build system.
+#
+obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o
+
+hfi1-y := affinity.o chip.o device.o driver.o efivar.o \
+       eprom.o file_ops.o firmware.o \
+       init.o intr.o mad.o mmu_rb.o pcie.o pio.o pio_copy.o platform.o \
+       qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o twsi.o \
+       uc.o ud.o user_exp_rcv.o user_pages.o user_sdma.o verbs.o \
+       verbs_txreq.o
+hfi1-$(CONFIG_DEBUG_FS) += debugfs.o
+
+CFLAGS_trace.o = -I$(src)
+ifdef MVERSION
+CFLAGS_driver.o = -DHFI_DRIVER_VERSION_BASE=\"$(MVERSION)\"
+endif
diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c
new file mode 100644 (file)
index 0000000..6e7050a
--- /dev/null
@@ -0,0 +1,431 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/topology.h>
+#include <linux/cpumask.h>
+#include <linux/module.h>
+
+#include "hfi.h"
+#include "affinity.h"
+#include "sdma.h"
+#include "trace.h"
+
+/* Name of IRQ types, indexed by enum irq_type */
+static const char * const irq_type_names[] = {
+       "SDMA",
+       "RCVCTXT",
+       "GENERAL",
+       "OTHER",
+};
+
+static inline void init_cpu_mask_set(struct cpu_mask_set *set)
+{
+       cpumask_clear(&set->mask);
+       cpumask_clear(&set->used);
+       set->gen = 0;
+}
+
+/* Initialize non-HT cpu cores mask */
+int init_real_cpu_mask(struct hfi1_devdata *dd)
+{
+       struct hfi1_affinity *info;
+       int possible, curr_cpu, i, ht;
+
+       info = kzalloc(sizeof(*info), GFP_KERNEL);
+       if (!info)
+               return -ENOMEM;
+
+       cpumask_clear(&info->real_cpu_mask);
+
+       /* Start with cpu online mask as the real cpu mask */
+       cpumask_copy(&info->real_cpu_mask, cpu_online_mask);
+
+       /*
+        * Remove HT cores from the real cpu mask.  Do this in two steps below.
+        */
+       possible = cpumask_weight(&info->real_cpu_mask);
+       ht = cpumask_weight(topology_sibling_cpumask(
+                                       cpumask_first(&info->real_cpu_mask)));
+       /*
+        * Step 1.  Skip over the first N HT siblings and use them as the
+        * "real" cores.  Assumes that HT cores are not enumerated in
+        * succession (except in the single core case).
+        */
+       curr_cpu = cpumask_first(&info->real_cpu_mask);
+       for (i = 0; i < possible / ht; i++)
+               curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask);
+       /*
+        * Step 2.  Remove the remaining HT siblings.  Use cpumask_next() to
+        * skip any gaps.
+        */
+       for (; i < possible; i++) {
+               cpumask_clear_cpu(curr_cpu, &info->real_cpu_mask);
+               curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask);
+       }
+
+       dd->affinity = info;
+       return 0;
+}
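The two steps above amount to a single walk over the online mask: the first possible/ht CPUs are kept as the "real" cores and the remaining CPUs (their HT siblings) are cleared. A stand-alone, user-space sketch of that walk, assuming a hypothetical 8-CPU box with 2-way SMT enumerated 0-7 (this models the logic only; it is not driver code):

#include <stdio.h>

int main(void)
{
        unsigned long real_mask = 0xffUL;       /* CPUs 0-7 online */
        int possible = 8, ht = 2, i, cpu = 0;   /* hypothetical topology */

        /* Step 1: skip over the first possible/ht CPUs, keeping them. */
        for (i = 0; i < possible / ht; i++)
                cpu++;
        /* Step 2: clear the remaining CPUs (the HT siblings). */
        for (; i < possible; i++, cpu++)
                real_mask &= ~(1UL << cpu);

        printf("real cpu mask: 0x%lx\n", real_mask);    /* prints 0xf */
        return 0;
}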
+
+/*
+ * Interrupt affinity.
+ *
+ * Non-receive interrupts get a default mask that starts as the
+ * node-local "real" (non-HT) CPUs, with the CPU taken by each
+ * kernel receive context then removed from it.
+ *
+ * Receive interrupts get the CPUs that were removed from the
+ * default mask, one per kernel receive context, starting after
+ * the CPU reserved for the control context.
+ */
+void hfi1_dev_affinity_init(struct hfi1_devdata *dd)
+{
+       int node = pcibus_to_node(dd->pcidev->bus);
+       struct hfi1_affinity *info = dd->affinity;
+       const struct cpumask *local_mask;
+       int curr_cpu, possible, i;
+
+       if (node < 0)
+               node = numa_node_id();
+       dd->node = node;
+
+       spin_lock_init(&info->lock);
+
+       init_cpu_mask_set(&info->def_intr);
+       init_cpu_mask_set(&info->rcv_intr);
+       init_cpu_mask_set(&info->proc);
+
+       local_mask = cpumask_of_node(dd->node);
+       if (cpumask_first(local_mask) >= nr_cpu_ids)
+               local_mask = topology_core_cpumask(0);
+       /* Use the "real" cpu mask of this node as the default */
+       cpumask_and(&info->def_intr.mask, &info->real_cpu_mask, local_mask);
+
+       /*  fill in the receive list */
+       possible = cpumask_weight(&info->def_intr.mask);
+       curr_cpu = cpumask_first(&info->def_intr.mask);
+       if (possible == 1) {
+               /*  only one CPU, everyone will use it */
+               cpumask_set_cpu(curr_cpu, &info->rcv_intr.mask);
+       } else {
+               /*
+                * Retain the first CPU in the default list for the control
+                * context.
+                */
+               curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask);
+               /*
+                * Remove the remaining kernel receive queues from
+                * the default list and add them to the receive list.
+                */
+               for (i = 0; i < dd->n_krcv_queues - 1; i++) {
+                       cpumask_clear_cpu(curr_cpu, &info->def_intr.mask);
+                       cpumask_set_cpu(curr_cpu, &info->rcv_intr.mask);
+                       curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask);
+                       if (curr_cpu >= nr_cpu_ids)
+                               break;
+               }
+       }
+
+       cpumask_copy(&info->proc.mask, cpu_online_mask);
+}
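To make the carve-out above concrete, here is a stand-alone model with bitmasks standing in for cpumasks and made-up values: the node-local real CPUs seed the default interrupt list, CPU 0 is retained for the control context, and one CPU per remaining kernel receive context is moved to the receive list.

#include <stdio.h>

int main(void)
{
        unsigned long def_intr = 0x0f;  /* node-local real CPUs 0-3 */
        unsigned long rcv_intr = 0;
        int n_krcv_queues = 3;          /* control ctxt + 2 kernel rcv ctxts */
        int cpu = 1, i;                 /* CPU 0 stays with the control ctxt */

        for (i = 0; i < n_krcv_queues - 1 && cpu < 4; i++, cpu++) {
                def_intr &= ~(1UL << cpu);      /* remove from default list */
                rcv_intr |= 1UL << cpu;         /* add to receive list */
        }
        printf("def_intr 0x%lx, rcv_intr 0x%lx\n", def_intr, rcv_intr);
        return 0;                       /* def_intr 0x9, rcv_intr 0x6 */
}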
+
+void hfi1_dev_affinity_free(struct hfi1_devdata *dd)
+{
+       kfree(dd->affinity);
+}
+
+int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
+{
+       int ret;
+       cpumask_var_t diff;
+       struct cpu_mask_set *set;
+       struct sdma_engine *sde = NULL;
+       struct hfi1_ctxtdata *rcd = NULL;
+       char extra[64];
+       int cpu = -1;
+
+       extra[0] = '\0';
+       cpumask_clear(&msix->mask);
+
+       ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
+       if (!ret)
+               return -ENOMEM;
+
+       switch (msix->type) {
+       case IRQ_SDMA:
+               sde = (struct sdma_engine *)msix->arg;
+               scnprintf(extra, 64, "engine %u", sde->this_idx);
+               /* fall through */
+       case IRQ_GENERAL:
+               set = &dd->affinity->def_intr;
+               break;
+       case IRQ_RCVCTXT:
+               rcd = (struct hfi1_ctxtdata *)msix->arg;
+               if (rcd->ctxt == HFI1_CTRL_CTXT) {
+                       set = &dd->affinity->def_intr;
+                       cpu = cpumask_first(&set->mask);
+               } else {
+                       set = &dd->affinity->rcv_intr;
+               }
+               scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
+               break;
+       default:
+               dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type);
+               free_cpumask_var(diff);
+               return -EINVAL;
+       }
+
+       /*
+        * The control receive context is placed on a particular CPU, which
+        * is set above.  Skip accounting for it.  Everything else finds its
+        * CPU here.
+        */
+       if (cpu == -1) {
+               spin_lock(&dd->affinity->lock);
+               if (cpumask_equal(&set->mask, &set->used)) {
+                       /*
+                        * We've used up all the CPUs, bump up the generation
+                        * and reset the 'used' map
+                        */
+                       set->gen++;
+                       cpumask_clear(&set->used);
+               }
+               cpumask_andnot(diff, &set->mask, &set->used);
+               cpu = cpumask_first(diff);
+               cpumask_set_cpu(cpu, &set->used);
+               spin_unlock(&dd->affinity->lock);
+       }
+
+       switch (msix->type) {
+       case IRQ_SDMA:
+               sde->cpu = cpu;
+               break;
+       case IRQ_GENERAL:
+       case IRQ_RCVCTXT:
+       case IRQ_OTHER:
+               break;
+       }
+
+       cpumask_set_cpu(cpu, &msix->mask);
+       dd_dev_info(dd, "IRQ vector: %u, type %s %s -> cpu: %d\n",
+                   msix->msix.vector, irq_type_names[msix->type],
+                   extra, cpu);
+       irq_set_affinity_hint(msix->msix.vector, &msix->mask);
+
+       free_cpumask_var(diff);
+       return 0;
+}
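The mask/used/gen bookkeeping above is a round-robin allocator: each IRQ takes the first CPU of the set not yet handed out in the current generation, and once the whole set is consumed the generation is bumped and the walk starts over. A stand-alone sketch of that policy over a hypothetical 4-CPU set (illustration only, not driver code):

#include <stdio.h>

struct cpu_set_model {
        unsigned long mask;     /* CPUs available to this set */
        unsigned long used;     /* CPUs handed out this generation */
        unsigned int gen;
};

static int pick_cpu(struct cpu_set_model *set)
{
        int cpu;

        if (set->used == set->mask) {   /* whole set consumed: start over */
                set->gen++;
                set->used = 0;
        }
        for (cpu = 0; cpu < 64; cpu++)
                if ((set->mask & ~set->used) & (1UL << cpu))
                        break;
        set->used |= 1UL << cpu;
        return cpu;
}

int main(void)
{
        struct cpu_set_model set = { .mask = 0xf };     /* CPUs 0-3 */
        int i;

        for (i = 0; i < 6; i++) {       /* six IRQs share four CPUs */
                int cpu = pick_cpu(&set);

                printf("irq %d -> cpu %d (gen %u)\n", i, cpu, set.gen);
        }
        return 0;
}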
+
+void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
+                          struct hfi1_msix_entry *msix)
+{
+       struct cpu_mask_set *set = NULL;
+       struct hfi1_ctxtdata *rcd;
+
+       switch (msix->type) {
+       case IRQ_SDMA:
+       case IRQ_GENERAL:
+               set = &dd->affinity->def_intr;
+               break;
+       case IRQ_RCVCTXT:
+               rcd = (struct hfi1_ctxtdata *)msix->arg;
+       /* only do accounting for non-control contexts */
+               if (rcd->ctxt != HFI1_CTRL_CTXT)
+                       set = &dd->affinity->rcv_intr;
+               break;
+       default:
+               return;
+       }
+
+       if (set) {
+               spin_lock(&dd->affinity->lock);
+               cpumask_andnot(&set->used, &set->used, &msix->mask);
+               if (cpumask_empty(&set->used) && set->gen) {
+                       set->gen--;
+                       cpumask_copy(&set->used, &set->mask);
+               }
+               spin_unlock(&dd->affinity->lock);
+       }
+
+       irq_set_affinity_hint(msix->msix.vector, NULL);
+       cpumask_clear(&msix->mask);
+}
+
+int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node)
+{
+       int cpu = -1, ret;
+       cpumask_var_t diff, mask, intrs;
+       const struct cpumask *node_mask,
+               *proc_mask = tsk_cpus_allowed(current);
+       struct cpu_mask_set *set = &dd->affinity->proc;
+       char buf[1024];
+
+       /*
+        * check whether process/context affinity has already
+        * been set
+        */
+       if (cpumask_weight(proc_mask) == 1) {
+               scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(proc_mask));
+               hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %s",
+                         current->pid, current->comm, buf);
+               /*
+                * Mark the pre-set CPU as used. This is atomic so we don't
+                * need the lock
+                */
+               cpu = cpumask_first(proc_mask);
+               cpumask_set_cpu(cpu, &set->used);
+               goto done;
+       } else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) {
+               scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(proc_mask));
+               hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %s",
+                         current->pid, current->comm, buf);
+               goto done;
+       }
+
+       /*
+        * The process does not have a preset CPU affinity so find one to
+        * recommend. We prefer CPUs on the same NUMA node as the device.
+        */
+
+       ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
+       if (!ret)
+               goto done;
+       ret = zalloc_cpumask_var(&mask, GFP_KERNEL);
+       if (!ret)
+               goto free_diff;
+       ret = zalloc_cpumask_var(&intrs, GFP_KERNEL);
+       if (!ret)
+               goto free_mask;
+
+       spin_lock(&dd->affinity->lock);
+       /*
+        * If we've used all available CPUs, clear the mask and start
+        * overloading.
+        */
+       if (cpumask_equal(&set->mask, &set->used)) {
+               set->gen++;
+               cpumask_clear(&set->used);
+       }
+
+       /* CPUs used by interrupt handlers */
+       cpumask_copy(intrs, (dd->affinity->def_intr.gen ?
+                            &dd->affinity->def_intr.mask :
+                            &dd->affinity->def_intr.used));
+       cpumask_or(intrs, intrs, (dd->affinity->rcv_intr.gen ?
+                                 &dd->affinity->rcv_intr.mask :
+                                 &dd->affinity->rcv_intr.used));
+       scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(intrs));
+       hfi1_cdbg(PROC, "CPUs used by interrupts: %s", buf);
+
+       /*
+        * If we don't have a NUMA node requested, preference is towards
+        * device NUMA node
+        */
+       if (node == -1)
+               node = dd->node;
+       node_mask = cpumask_of_node(node);
+       scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(node_mask));
+       hfi1_cdbg(PROC, "device on NUMA %u, CPUs %s", node, buf);
+
+       /* diff will hold all unused cpus */
+       cpumask_andnot(diff, &set->mask, &set->used);
+       scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(diff));
+       hfi1_cdbg(PROC, "unused CPUs (all) %s", buf);
+
+       /* get cpumask of available CPUs on preferred NUMA */
+       cpumask_and(mask, diff, node_mask);
+       scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(mask));
+       hfi1_cdbg(PROC, "available cpus on NUMA %s", buf);
+
+       /*
+        * At first, we don't want to place processes on the same
+        * CPUs as interrupt handlers.
+        */
+       cpumask_andnot(diff, mask, intrs);
+       if (!cpumask_empty(diff))
+               cpumask_copy(mask, diff);
+
+       /*
+        * if we don't have a cpu on the preferred NUMA, get
+        * the list of the remaining available CPUs
+        */
+       if (cpumask_empty(mask)) {
+               cpumask_andnot(diff, &set->mask, &set->used);
+               cpumask_andnot(mask, diff, node_mask);
+       }
+       scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(mask));
+       hfi1_cdbg(PROC, "possible CPUs for process %s", buf);
+
+       cpu = cpumask_first(mask);
+       if (cpu >= nr_cpu_ids) /* empty */
+               cpu = -1;
+       else
+               cpumask_set_cpu(cpu, &set->used);
+       spin_unlock(&dd->affinity->lock);
+
+       free_cpumask_var(intrs);
+free_mask:
+       free_cpumask_var(mask);
+free_diff:
+       free_cpumask_var(diff);
+done:
+       return cpu;
+}
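The recommendation above is set algebra on cpumasks: take the unused CPUs of the proc set, intersect with the preferred NUMA node, avoid interrupt-handler CPUs when possible, and fall back to off-node CPUs only when the node is exhausted. A compressed stand-alone sketch with bitmasks standing in for the cpumasks (hypothetical values, not driver code):

#include <stdio.h>

static int first_cpu(unsigned long m)
{
        int cpu;

        for (cpu = 0; cpu < 64; cpu++)
                if (m & (1UL << cpu))
                        return cpu;
        return -1;                              /* empty mask */
}

int main(void)
{
        unsigned long mask = 0xff, used = 0x03; /* proc set and its used CPUs */
        unsigned long node_mask = 0x0f;         /* CPUs on the device's node */
        unsigned long intrs = 0x0c;             /* CPUs taken by IRQ handlers */
        unsigned long avail, candidates;

        avail = mask & ~used;                   /* all unused CPUs */
        candidates = avail & node_mask;         /* unused CPUs on the node */
        if (candidates & ~intrs)                /* avoid IRQ CPUs if possible */
                candidates &= ~intrs;
        if (!candidates)                        /* node exhausted: go off-node */
                candidates = avail & ~node_mask;

        printf("recommended cpu: %d\n", first_cpu(candidates)); /* prints 2 */
        return 0;
}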
+
+void hfi1_put_proc_affinity(struct hfi1_devdata *dd, int cpu)
+{
+       struct cpu_mask_set *set = &dd->affinity->proc;
+
+       if (cpu < 0)
+               return;
+       spin_lock(&dd->affinity->lock);
+       cpumask_clear_cpu(cpu, &set->used);
+       if (cpumask_empty(&set->used) && set->gen) {
+               set->gen--;
+               cpumask_copy(&set->used, &set->mask);
+       }
+       spin_unlock(&dd->affinity->lock);
+}
+
diff --git a/drivers/infiniband/hw/hfi1/affinity.h b/drivers/infiniband/hw/hfi1/affinity.h
new file mode 100644 (file)
index 0000000..20f52fe
--- /dev/null
@@ -0,0 +1,108 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef _HFI1_AFFINITY_H
+#define _HFI1_AFFINITY_H
+
+#include "hfi.h"
+
+enum irq_type {
+       IRQ_SDMA,
+       IRQ_RCVCTXT,
+       IRQ_GENERAL,
+       IRQ_OTHER
+};
+
+/* Can be used for both memory and cpu */
+enum affinity_flags {
+       AFF_AUTO,
+       AFF_NUMA_LOCAL,
+       AFF_DEV_LOCAL,
+       AFF_IRQ_LOCAL
+};
+
+struct cpu_mask_set {
+       struct cpumask mask;
+       struct cpumask used;
+       uint gen;
+};
+
+struct hfi1_affinity {
+       struct cpu_mask_set def_intr;
+       struct cpu_mask_set rcv_intr;
+       struct cpu_mask_set proc;
+       struct cpumask real_cpu_mask;
+       /* spin lock to protect affinity struct */
+       spinlock_t lock;
+};
+
+struct hfi1_msix_entry;
+
+/* Initialize non-HT cpu cores mask */
+int init_real_cpu_mask(struct hfi1_devdata *);
+/* Initialize driver affinity data */
+void hfi1_dev_affinity_init(struct hfi1_devdata *);
+/* Free driver affinity data */
+void hfi1_dev_affinity_free(struct hfi1_devdata *);
+/*
+ * Set IRQ affinity to a CPU. The function will determine the
+ * CPU and set the affinity to it.
+ */
+int hfi1_get_irq_affinity(struct hfi1_devdata *, struct hfi1_msix_entry *);
+/*
+ * Remove the IRQ's CPU affinity. This function also updates
+ * any internal CPU tracking data
+ */
+void hfi1_put_irq_affinity(struct hfi1_devdata *, struct hfi1_msix_entry *);
+/*
+ * Determine a CPU affinity for a user process, if the process does not
+ * have an affinity set yet.
+ */
+int hfi1_get_proc_affinity(struct hfi1_devdata *, int);
+/* Release a CPU used by a user process. */
+void hfi1_put_proc_affinity(struct hfi1_devdata *, int);
+
+#endif /* _HFI1_AFFINITY_H */
diff --git a/drivers/infiniband/hw/hfi1/aspm.h b/drivers/infiniband/hw/hfi1/aspm.h
new file mode 100644 (file)
index 0000000..0d58fe3
--- /dev/null
@@ -0,0 +1,309 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef _ASPM_H
+#define _ASPM_H
+
+#include "hfi.h"
+
+extern uint aspm_mode;
+
+enum aspm_mode {
+       ASPM_MODE_DISABLED = 0, /* ASPM always disabled, performance mode */
+       ASPM_MODE_ENABLED = 1,  /* ASPM always enabled, power saving mode */
+       ASPM_MODE_DYNAMIC = 2,  /* ASPM enabled/disabled dynamically */
+};
+
+/* Time after which the timer interrupt will re-enable ASPM */
+#define ASPM_TIMER_MS 1000
+/* Time for which interrupts are ignored after a timer has been scheduled */
+#define ASPM_RESCHED_TIMER_MS (ASPM_TIMER_MS / 2)
+/* Two interrupts within this time trigger ASPM disable */
+#define ASPM_TRIGGER_MS 1
+#define ASPM_TRIGGER_NS (ASPM_TRIGGER_MS * 1000 * 1000ull)
+#define ASPM_L1_SUPPORTED(reg) \
+       (((reg & PCI_EXP_LNKCAP_ASPMS) >> 10) & 0x2)
+
+static inline bool aspm_hw_l1_supported(struct hfi1_devdata *dd)
+{
+       struct pci_dev *parent = dd->pcidev->bus->self;
+       u32 up, dn;
+
+       /*
+        * If the driver does not have access to the upstream component,
+        * it cannot support ASPM L1 at all.
+        */
+       if (!parent)
+               return false;
+
+       pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &dn);
+       dn = ASPM_L1_SUPPORTED(dn);
+
+       pcie_capability_read_dword(parent, PCI_EXP_LNKCAP, &up);
+       up = ASPM_L1_SUPPORTED(up);
+
+       /* ASPM works on A-step but is reported as not supported */
+       return (!!dn || is_ax(dd)) && !!up;
+}
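ASPM_L1_SUPPORTED() isolates the two-bit ASPM-support field of the PCIe Link Capabilities register (PCI_EXP_LNKCAP_ASPMS covers bits 11:10) and tests its L1 bit. A quick user-space check of the bit arithmetic with sample register values:

#include <stdio.h>

#define PCI_EXP_LNKCAP_ASPMS    0x00000c00      /* ASPM support, bits 11:10 */
#define ASPM_L1_SUPPORTED(reg) \
        (((reg & PCI_EXP_LNKCAP_ASPMS) >> 10) & 0x2)

int main(void)
{
        unsigned int lnkcap_l0s_l1 = 0x00000c00;        /* L0s and L1 */
        unsigned int lnkcap_l0s    = 0x00000400;        /* L0s only */

        printf("L0s+L1 -> %d\n", ASPM_L1_SUPPORTED(lnkcap_l0s_l1) ? 1 : 0);
        printf("L0s    -> %d\n", ASPM_L1_SUPPORTED(lnkcap_l0s) ? 1 : 0);
        return 0;                       /* prints 1, then 0 */
}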
+
+/* Set L1 entrance latency for slower entry to L1 */
+static inline void aspm_hw_set_l1_ent_latency(struct hfi1_devdata *dd)
+{
+       u32 l1_ent_lat = 0x4u;
+       u32 reg32;
+
+       pci_read_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, &reg32);
+       reg32 &= ~PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SMASK;
+       reg32 |= l1_ent_lat << PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SHIFT;
+       pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, reg32);
+}
+
+static inline void aspm_hw_enable_l1(struct hfi1_devdata *dd)
+{
+       struct pci_dev *parent = dd->pcidev->bus->self;
+
+       /*
+        * If the driver does not have access to the upstream component,
+        * it cannot support ASPM L1 at all.
+        */
+       if (!parent)
+               return;
+
+       /* Enable ASPM L1 first in upstream component and then downstream */
+       pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL,
+                                          PCI_EXP_LNKCTL_ASPMC,
+                                          PCI_EXP_LNKCTL_ASPM_L1);
+       pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL,
+                                          PCI_EXP_LNKCTL_ASPMC,
+                                          PCI_EXP_LNKCTL_ASPM_L1);
+}
+
+static inline void aspm_hw_disable_l1(struct hfi1_devdata *dd)
+{
+       struct pci_dev *parent = dd->pcidev->bus->self;
+
+       /* Disable ASPM L1 first in downstream component and then upstream */
+       pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL,
+                                          PCI_EXP_LNKCTL_ASPMC, 0x0);
+       if (parent)
+               pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL,
+                                                  PCI_EXP_LNKCTL_ASPMC, 0x0);
+}
+
+static inline void aspm_enable(struct hfi1_devdata *dd)
+{
+       if (dd->aspm_enabled || aspm_mode == ASPM_MODE_DISABLED ||
+           !dd->aspm_supported)
+               return;
+
+       aspm_hw_enable_l1(dd);
+       dd->aspm_enabled = true;
+}
+
+static inline void aspm_disable(struct hfi1_devdata *dd)
+{
+       if (!dd->aspm_enabled || aspm_mode == ASPM_MODE_ENABLED)
+               return;
+
+       aspm_hw_disable_l1(dd);
+       dd->aspm_enabled = false;
+}
+
+static inline void aspm_disable_inc(struct hfi1_devdata *dd)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&dd->aspm_lock, flags);
+       aspm_disable(dd);
+       atomic_inc(&dd->aspm_disabled_cnt);
+       spin_unlock_irqrestore(&dd->aspm_lock, flags);
+}
+
+static inline void aspm_enable_dec(struct hfi1_devdata *dd)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&dd->aspm_lock, flags);
+       if (atomic_dec_and_test(&dd->aspm_disabled_cnt))
+               aspm_enable(dd);
+       spin_unlock_irqrestore(&dd->aspm_lock, flags);
+}
+
+/* ASPM processing for each receive context interrupt */
+static inline void aspm_ctx_disable(struct hfi1_ctxtdata *rcd)
+{
+       bool restart_timer;
+       bool close_interrupts;
+       unsigned long flags;
+       ktime_t now, prev;
+
+       /* Quickest exit for minimum impact */
+       if (!rcd->aspm_intr_supported)
+               return;
+
+       spin_lock_irqsave(&rcd->aspm_lock, flags);
+       /* PSM contexts are open */
+       if (!rcd->aspm_intr_enable)
+               goto unlock;
+
+       prev = rcd->aspm_ts_last_intr;
+       now = ktime_get();
+       rcd->aspm_ts_last_intr = now;
+
+       /* A pair of interrupts close together in time */
+       close_interrupts = ktime_to_ns(ktime_sub(now, prev)) < ASPM_TRIGGER_NS;
+
+       /* Don't push out our timer till this much time has elapsed */
+       restart_timer = ktime_to_ns(ktime_sub(now, rcd->aspm_ts_timer_sched)) >
+                                   ASPM_RESCHED_TIMER_MS * NSEC_PER_MSEC;
+       restart_timer = restart_timer && close_interrupts;
+
+       /* Disable ASPM and schedule timer */
+       if (rcd->aspm_enabled && close_interrupts) {
+               aspm_disable_inc(rcd->dd);
+               rcd->aspm_enabled = false;
+               restart_timer = true;
+       }
+
+       if (restart_timer) {
+               mod_timer(&rcd->aspm_timer,
+                         jiffies + msecs_to_jiffies(ASPM_TIMER_MS));
+               rcd->aspm_ts_timer_sched = now;
+       }
+unlock:
+       spin_unlock_irqrestore(&rcd->aspm_lock, flags);
+}
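Two receive interrupts closer together than ASPM_TRIGGER_NS are taken as a sign of traffic and drop the link out of L1; the re-enable timer is only pushed out again once ASPM_RESCHED_TIMER_MS has passed since it was last armed. A stand-alone model of the two timing tests with made-up nanosecond timestamps:

#include <stdio.h>
#include <stdbool.h>

#define ASPM_TIMER_MS           1000
#define ASPM_RESCHED_TIMER_MS   (ASPM_TIMER_MS / 2)
#define ASPM_TRIGGER_NS         (1ull * 1000 * 1000)    /* 1 ms */
#define NSEC_PER_MSEC           1000000ull

int main(void)
{
        unsigned long long timer_sched = 0;             /* timer last armed */
        unsigned long long last_intr = 599600000;       /* previous IRQ, ns */
        unsigned long long now = 600000000;             /* this IRQ, ns */
        bool close_interrupts, restart_timer;

        close_interrupts = (now - last_intr) < ASPM_TRIGGER_NS;
        restart_timer = (now - timer_sched) >
                        ASPM_RESCHED_TIMER_MS * NSEC_PER_MSEC;
        restart_timer = restart_timer && close_interrupts;

        printf("close=%d restart=%d\n", close_interrupts, restart_timer);
        return 0;                               /* close=1 restart=1 */
}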
+
+/* Timer function for re-enabling ASPM in the absence of interrupt activity */
+static inline void aspm_ctx_timer_function(unsigned long data)
+{
+       struct hfi1_ctxtdata *rcd = (struct hfi1_ctxtdata *)data;
+       unsigned long flags;
+
+       spin_lock_irqsave(&rcd->aspm_lock, flags);
+       aspm_enable_dec(rcd->dd);
+       rcd->aspm_enabled = true;
+       spin_unlock_irqrestore(&rcd->aspm_lock, flags);
+}
+
+/* Disable interrupt processing for verbs contexts when PSM contexts are open */
+static inline void aspm_disable_all(struct hfi1_devdata *dd)
+{
+       struct hfi1_ctxtdata *rcd;
+       unsigned long flags;
+       unsigned i;
+
+       for (i = 0; i < dd->first_user_ctxt; i++) {
+               rcd = dd->rcd[i];
+               del_timer_sync(&rcd->aspm_timer);
+               spin_lock_irqsave(&rcd->aspm_lock, flags);
+               rcd->aspm_intr_enable = false;
+               spin_unlock_irqrestore(&rcd->aspm_lock, flags);
+       }
+
+       aspm_disable(dd);
+       atomic_set(&dd->aspm_disabled_cnt, 0);
+}
+
+/* Re-enable interrupt processing for verbs contexts */
+static inline void aspm_enable_all(struct hfi1_devdata *dd)
+{
+       struct hfi1_ctxtdata *rcd;
+       unsigned long flags;
+       unsigned i;
+
+       aspm_enable(dd);
+
+       if (aspm_mode != ASPM_MODE_DYNAMIC)
+               return;
+
+       for (i = 0; i < dd->first_user_ctxt; i++) {
+               rcd = dd->rcd[i];
+               spin_lock_irqsave(&rcd->aspm_lock, flags);
+               rcd->aspm_intr_enable = true;
+               rcd->aspm_enabled = true;
+               spin_unlock_irqrestore(&rcd->aspm_lock, flags);
+       }
+}
+
+static inline void aspm_ctx_init(struct hfi1_ctxtdata *rcd)
+{
+       spin_lock_init(&rcd->aspm_lock);
+       setup_timer(&rcd->aspm_timer, aspm_ctx_timer_function,
+                   (unsigned long)rcd);
+       rcd->aspm_intr_supported = rcd->dd->aspm_supported &&
+               aspm_mode == ASPM_MODE_DYNAMIC &&
+               rcd->ctxt < rcd->dd->first_user_ctxt;
+}
+
+static inline void aspm_init(struct hfi1_devdata *dd)
+{
+       unsigned i;
+
+       spin_lock_init(&dd->aspm_lock);
+       dd->aspm_supported = aspm_hw_l1_supported(dd);
+
+       for (i = 0; i < dd->first_user_ctxt; i++)
+               aspm_ctx_init(dd->rcd[i]);
+
+       /* Start with ASPM disabled */
+       aspm_hw_set_l1_ent_latency(dd);
+       dd->aspm_enabled = false;
+       aspm_hw_disable_l1(dd);
+
+       /* Now turn on ASPM if configured */
+       aspm_enable_all(dd);
+}
+
+static inline void aspm_exit(struct hfi1_devdata *dd)
+{
+       aspm_disable_all(dd);
+
+       /* Turn on ASPM on exit to conserve power */
+       aspm_enable(dd);
+}
+
+#endif /* _ASPM_H */
diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c
new file mode 100644 (file)
index 0000000..3b876da
--- /dev/null
@@ -0,0 +1,14712 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * This file contains all of the code that is specific to the HFI chip
+ */
+
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+
+#include "hfi.h"
+#include "trace.h"
+#include "mad.h"
+#include "pio.h"
+#include "sdma.h"
+#include "eprom.h"
+#include "efivar.h"
+#include "platform.h"
+#include "aspm.h"
+
+#define NUM_IB_PORTS 1
+
+uint kdeth_qp;
+module_param_named(kdeth_qp, kdeth_qp, uint, S_IRUGO);
+MODULE_PARM_DESC(kdeth_qp, "Set the KDETH queue pair prefix");
+
+uint num_vls = HFI1_MAX_VLS_SUPPORTED;
+module_param(num_vls, uint, S_IRUGO);
+MODULE_PARM_DESC(num_vls, "Set number of Virtual Lanes to use (1-8)");
+
+/*
+ * Default time to aggregate two 10K packets from the idle state
+ * (timer not running). The timer starts at the end of the first packet,
+ * so only the time for one 10K packet and header plus a bit extra is needed.
+ * 10 * 1024 + 64 header bytes = 10304 bytes
+ * 10304 bytes / 12.5 GB/s = 824.32 ns
+ */
+uint rcv_intr_timeout = (824 + 16); /* 16 is for coalescing interrupt */
+module_param(rcv_intr_timeout, uint, S_IRUGO);
+MODULE_PARM_DESC(rcv_intr_timeout, "Receive interrupt mitigation timeout in ns");
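The default above is just the wire time of one 10K packet plus a 64-byte header at 12.5 GB/s, rounded up, plus 16 for the coalescing interrupt. As a quick check of the arithmetic:

#include <stdio.h>

int main(void)
{
        double bytes = 10 * 1024 + 64;  /* one 10K packet plus header */
        double rate = 12.5;             /* 12.5 GB/s == 12.5 bytes per ns */

        printf("%.2f ns\n", bytes / rate);      /* 824.32 ns */
        return 0;
}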
+
+uint rcv_intr_count = 16; /* same as qib */
+module_param(rcv_intr_count, uint, S_IRUGO);
+MODULE_PARM_DESC(rcv_intr_count, "Receive interrupt mitigation count");
+
+ushort link_crc_mask = SUPPORTED_CRCS;
+module_param(link_crc_mask, ushort, S_IRUGO);
+MODULE_PARM_DESC(link_crc_mask, "CRCs to use on the link");
+
+uint loopback;
+module_param_named(loopback, loopback, uint, S_IRUGO);
+MODULE_PARM_DESC(loopback, "Put into loopback mode (1 = serdes, 3 = external cable)");
+
+/* Other driver tunables */
+uint rcv_intr_dynamic = 1; /* enable dynamic mode for rcv int mitigation */
+static ushort crc_14b_sideband = 1;
+static uint use_flr = 1;
+uint quick_linkup; /* skip LNI */
+
+struct flag_table {
+       u64 flag;       /* the flag */
+       char *str;      /* description string */
+       u16 extra;      /* extra information */
+       u16 unused0;
+       u32 unused1;
+};
+
+/* str must be a string constant */
+#define FLAG_ENTRY(str, extra, flag) {flag, str, extra}
+#define FLAG_ENTRY0(str, flag) {flag, str, 0}
+
+/* Send Error Consequences */
+#define SEC_WRITE_DROPPED      0x1
+#define SEC_PACKET_DROPPED     0x2
+#define SEC_SC_HALTED          0x4     /* per-context only */
+#define SEC_SPC_FREEZE         0x8     /* per-HFI only */
+
+#define MIN_KERNEL_KCTXTS         2
+#define FIRST_KERNEL_KCTXT        1
+/* sizes for both the QP and RSM map tables */
+#define NUM_MAP_ENTRIES                256
+#define NUM_MAP_REGS             32
+
+/* Bit offset into the GUID which carries HFI id information */
+#define GUID_HFI_INDEX_SHIFT     39
+
+/* extract the emulation revision */
+#define emulator_rev(dd) ((dd)->irev >> 8)
+/* parallel and serial emulation versions are 3 and 4 respectively */
+#define is_emulator_p(dd) ((((dd)->irev) & 0xf) == 3)
+#define is_emulator_s(dd) ((((dd)->irev) & 0xf) == 4)
+
+/* RSM fields */
+
+/* packet type */
+#define IB_PACKET_TYPE         2ull
+#define QW_SHIFT               6ull
+/* QPN[7..1] */
+#define QPN_WIDTH              7ull
+
+/* LRH.BTH: QW 0, OFFSET 48 - for match */
+#define LRH_BTH_QW             0ull
+#define LRH_BTH_BIT_OFFSET     48ull
+#define LRH_BTH_OFFSET(off)    ((LRH_BTH_QW << QW_SHIFT) | (off))
+#define LRH_BTH_MATCH_OFFSET   LRH_BTH_OFFSET(LRH_BTH_BIT_OFFSET)
+#define LRH_BTH_SELECT
+#define LRH_BTH_MASK           3ull
+#define LRH_BTH_VALUE          2ull
+
+/* LRH.SC[3..0] QW 0, OFFSET 56 - for match */
+#define LRH_SC_QW              0ull
+#define LRH_SC_BIT_OFFSET      56ull
+#define LRH_SC_OFFSET(off)     ((LRH_SC_QW << QW_SHIFT) | (off))
+#define LRH_SC_MATCH_OFFSET    LRH_SC_OFFSET(LRH_SC_BIT_OFFSET)
+#define LRH_SC_MASK            128ull
+#define LRH_SC_VALUE           0ull
+
+/* SC[n..0] QW 0, OFFSET 60 - for select */
+#define LRH_SC_SELECT_OFFSET  ((LRH_SC_QW << QW_SHIFT) | (60ull))
+
+/* QPN[m+n:1] QW 1, OFFSET 1 */
+#define QPN_SELECT_OFFSET      ((1ull << QW_SHIFT) | (1ull))
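The RSM match/select offsets above pack a quad-word index and a bit offset into one value: the QW number sits above QW_SHIFT (6) and the bit offset occupies the low 6 bits. A quick expansion of the four offsets defined here; RSM_OFFSET() is a local helper for illustration only, not a driver macro:

#include <stdio.h>

#define QW_SHIFT                6ull
#define RSM_OFFSET(qw, bit)     (((qw) << QW_SHIFT) | (bit))

int main(void)
{
        printf("LRH.BTH match:  %llu\n", RSM_OFFSET(0ull, 48ull)); /* 48 */
        printf("LRH.SC  match:  %llu\n", RSM_OFFSET(0ull, 56ull)); /* 56 */
        printf("LRH.SC  select: %llu\n", RSM_OFFSET(0ull, 60ull)); /* 60 */
        printf("QPN     select: %llu\n", RSM_OFFSET(1ull, 1ull));  /* 65 */
        return 0;
}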
+
+/* defines to build power on SC2VL table */
+#define SC2VL_VAL( \
+       num, \
+       sc0, sc0val, \
+       sc1, sc1val, \
+       sc2, sc2val, \
+       sc3, sc3val, \
+       sc4, sc4val, \
+       sc5, sc5val, \
+       sc6, sc6val, \
+       sc7, sc7val) \
+( \
+       ((u64)(sc0val) << SEND_SC2VLT##num##_SC##sc0##_SHIFT) | \
+       ((u64)(sc1val) << SEND_SC2VLT##num##_SC##sc1##_SHIFT) | \
+       ((u64)(sc2val) << SEND_SC2VLT##num##_SC##sc2##_SHIFT) | \
+       ((u64)(sc3val) << SEND_SC2VLT##num##_SC##sc3##_SHIFT) | \
+       ((u64)(sc4val) << SEND_SC2VLT##num##_SC##sc4##_SHIFT) | \
+       ((u64)(sc5val) << SEND_SC2VLT##num##_SC##sc5##_SHIFT) | \
+       ((u64)(sc6val) << SEND_SC2VLT##num##_SC##sc6##_SHIFT) | \
+       ((u64)(sc7val) << SEND_SC2VLT##num##_SC##sc7##_SHIFT)   \
+)
+
+#define DC_SC_VL_VAL( \
+       range, \
+       e0, e0val, \
+       e1, e1val, \
+       e2, e2val, \
+       e3, e3val, \
+       e4, e4val, \
+       e5, e5val, \
+       e6, e6val, \
+       e7, e7val, \
+       e8, e8val, \
+       e9, e9val, \
+       e10, e10val, \
+       e11, e11val, \
+       e12, e12val, \
+       e13, e13val, \
+       e14, e14val, \
+       e15, e15val) \
+( \
+       ((u64)(e0val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e0##_SHIFT) | \
+       ((u64)(e1val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e1##_SHIFT) | \
+       ((u64)(e2val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e2##_SHIFT) | \
+       ((u64)(e3val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e3##_SHIFT) | \
+       ((u64)(e4val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e4##_SHIFT) | \
+       ((u64)(e5val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e5##_SHIFT) | \
+       ((u64)(e6val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e6##_SHIFT) | \
+       ((u64)(e7val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e7##_SHIFT) | \
+       ((u64)(e8val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e8##_SHIFT) | \
+       ((u64)(e9val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e9##_SHIFT) | \
+       ((u64)(e10val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e10##_SHIFT) | \
+       ((u64)(e11val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e11##_SHIFT) | \
+       ((u64)(e12val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e12##_SHIFT) | \
+       ((u64)(e13val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e13##_SHIFT) | \
+       ((u64)(e14val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e14##_SHIFT) | \
+       ((u64)(e15val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e15##_SHIFT) \
+)
+
+/* all CceStatus sub-block freeze bits */
+#define ALL_FROZE (CCE_STATUS_SDMA_FROZE_SMASK \
+                       | CCE_STATUS_RXE_FROZE_SMASK \
+                       | CCE_STATUS_TXE_FROZE_SMASK \
+                       | CCE_STATUS_TXE_PIO_FROZE_SMASK)
+/* all CceStatus sub-block TXE pause bits */
+#define ALL_TXE_PAUSE (CCE_STATUS_TXE_PIO_PAUSED_SMASK \
+                       | CCE_STATUS_TXE_PAUSED_SMASK \
+                       | CCE_STATUS_SDMA_PAUSED_SMASK)
+/* all CceStatus sub-block RXE pause bits */
+#define ALL_RXE_PAUSE CCE_STATUS_RXE_PAUSED_SMASK
+
+/*
+ * CCE Error flags.
+ */
+static struct flag_table cce_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("CceCsrParityErr",
+               CCE_ERR_STATUS_CCE_CSR_PARITY_ERR_SMASK),
+/* 1*/ FLAG_ENTRY0("CceCsrReadBadAddrErr",
+               CCE_ERR_STATUS_CCE_CSR_READ_BAD_ADDR_ERR_SMASK),
+/* 2*/ FLAG_ENTRY0("CceCsrWriteBadAddrErr",
+               CCE_ERR_STATUS_CCE_CSR_WRITE_BAD_ADDR_ERR_SMASK),
+/* 3*/ FLAG_ENTRY0("CceTrgtAsyncFifoParityErr",
+               CCE_ERR_STATUS_CCE_TRGT_ASYNC_FIFO_PARITY_ERR_SMASK),
+/* 4*/ FLAG_ENTRY0("CceTrgtAccessErr",
+               CCE_ERR_STATUS_CCE_TRGT_ACCESS_ERR_SMASK),
+/* 5*/ FLAG_ENTRY0("CceRspdDataParityErr",
+               CCE_ERR_STATUS_CCE_RSPD_DATA_PARITY_ERR_SMASK),
+/* 6*/ FLAG_ENTRY0("CceCli0AsyncFifoParityErr",
+               CCE_ERR_STATUS_CCE_CLI0_ASYNC_FIFO_PARITY_ERR_SMASK),
+/* 7*/ FLAG_ENTRY0("CceCsrCfgBusParityErr",
+               CCE_ERR_STATUS_CCE_CSR_CFG_BUS_PARITY_ERR_SMASK),
+/* 8*/ FLAG_ENTRY0("CceCli2AsyncFifoParityErr",
+               CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK),
+/* 9*/ FLAG_ENTRY0("CceCli1AsyncFifoPioCrdtParityErr",
+           CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR_SMASK),
+/*10*/ FLAG_ENTRY0("CceCli1AsyncFifoSdmaHdParityErr",
+           CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR_SMASK),
+/*11*/ FLAG_ENTRY0("CceCli1AsyncFifoRxdmaParityError",
+           CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERROR_SMASK),
+/*12*/ FLAG_ENTRY0("CceCli1AsyncFifoDbgParityError",
+               CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERROR_SMASK),
+/*13*/ FLAG_ENTRY0("PcicRetryMemCorErr",
+               CCE_ERR_STATUS_PCIC_RETRY_MEM_COR_ERR_SMASK),
+/*14*/ FLAG_ENTRY0("PcicRetrySotMemCorErr",
+               CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_COR_ERR_SMASK),
+/*15*/ FLAG_ENTRY0("PcicPostHdQCorErr",
+               CCE_ERR_STATUS_PCIC_POST_HD_QCOR_ERR_SMASK),
+/*16*/ FLAG_ENTRY0("PcicPostDatQCorErr",
+               CCE_ERR_STATUS_PCIC_POST_DAT_QCOR_ERR_SMASK),
+/*17*/ FLAG_ENTRY0("PcicCplHdQCorErr",
+               CCE_ERR_STATUS_PCIC_CPL_HD_QCOR_ERR_SMASK),
+/*18*/ FLAG_ENTRY0("PcicCplDatQCorErr",
+               CCE_ERR_STATUS_PCIC_CPL_DAT_QCOR_ERR_SMASK),
+/*19*/ FLAG_ENTRY0("PcicNPostHQParityErr",
+               CCE_ERR_STATUS_PCIC_NPOST_HQ_PARITY_ERR_SMASK),
+/*20*/ FLAG_ENTRY0("PcicNPostDatQParityErr",
+               CCE_ERR_STATUS_PCIC_NPOST_DAT_QPARITY_ERR_SMASK),
+/*21*/ FLAG_ENTRY0("PcicRetryMemUncErr",
+               CCE_ERR_STATUS_PCIC_RETRY_MEM_UNC_ERR_SMASK),
+/*22*/ FLAG_ENTRY0("PcicRetrySotMemUncErr",
+               CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_UNC_ERR_SMASK),
+/*23*/ FLAG_ENTRY0("PcicPostHdQUncErr",
+               CCE_ERR_STATUS_PCIC_POST_HD_QUNC_ERR_SMASK),
+/*24*/ FLAG_ENTRY0("PcicPostDatQUncErr",
+               CCE_ERR_STATUS_PCIC_POST_DAT_QUNC_ERR_SMASK),
+/*25*/ FLAG_ENTRY0("PcicCplHdQUncErr",
+               CCE_ERR_STATUS_PCIC_CPL_HD_QUNC_ERR_SMASK),
+/*26*/ FLAG_ENTRY0("PcicCplDatQUncErr",
+               CCE_ERR_STATUS_PCIC_CPL_DAT_QUNC_ERR_SMASK),
+/*27*/ FLAG_ENTRY0("PcicTransmitFrontParityErr",
+               CCE_ERR_STATUS_PCIC_TRANSMIT_FRONT_PARITY_ERR_SMASK),
+/*28*/ FLAG_ENTRY0("PcicTransmitBackParityErr",
+               CCE_ERR_STATUS_PCIC_TRANSMIT_BACK_PARITY_ERR_SMASK),
+/*29*/ FLAG_ENTRY0("PcicReceiveParityErr",
+               CCE_ERR_STATUS_PCIC_RECEIVE_PARITY_ERR_SMASK),
+/*30*/ FLAG_ENTRY0("CceTrgtCplTimeoutErr",
+               CCE_ERR_STATUS_CCE_TRGT_CPL_TIMEOUT_ERR_SMASK),
+/*31*/ FLAG_ENTRY0("LATriggered",
+               CCE_ERR_STATUS_LA_TRIGGERED_SMASK),
+/*32*/ FLAG_ENTRY0("CceSegReadBadAddrErr",
+               CCE_ERR_STATUS_CCE_SEG_READ_BAD_ADDR_ERR_SMASK),
+/*33*/ FLAG_ENTRY0("CceSegWriteBadAddrErr",
+               CCE_ERR_STATUS_CCE_SEG_WRITE_BAD_ADDR_ERR_SMASK),
+/*34*/ FLAG_ENTRY0("CceRcplAsyncFifoParityErr",
+               CCE_ERR_STATUS_CCE_RCPL_ASYNC_FIFO_PARITY_ERR_SMASK),
+/*35*/ FLAG_ENTRY0("CceRxdmaConvFifoParityErr",
+               CCE_ERR_STATUS_CCE_RXDMA_CONV_FIFO_PARITY_ERR_SMASK),
+/*36*/ FLAG_ENTRY0("CceMsixTableCorErr",
+               CCE_ERR_STATUS_CCE_MSIX_TABLE_COR_ERR_SMASK),
+/*37*/ FLAG_ENTRY0("CceMsixTableUncErr",
+               CCE_ERR_STATUS_CCE_MSIX_TABLE_UNC_ERR_SMASK),
+/*38*/ FLAG_ENTRY0("CceIntMapCorErr",
+               CCE_ERR_STATUS_CCE_INT_MAP_COR_ERR_SMASK),
+/*39*/ FLAG_ENTRY0("CceIntMapUncErr",
+               CCE_ERR_STATUS_CCE_INT_MAP_UNC_ERR_SMASK),
+/*40*/ FLAG_ENTRY0("CceMsixCsrParityErr",
+               CCE_ERR_STATUS_CCE_MSIX_CSR_PARITY_ERR_SMASK),
+/*41-63 reserved*/
+};
+
+/*
+ * Misc Error flags
+ */
+#define MES(text) MISC_ERR_STATUS_MISC_##text##_ERR_SMASK
+static struct flag_table misc_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("CSR_PARITY", MES(CSR_PARITY)),
+/* 1*/ FLAG_ENTRY0("CSR_READ_BAD_ADDR", MES(CSR_READ_BAD_ADDR)),
+/* 2*/ FLAG_ENTRY0("CSR_WRITE_BAD_ADDR", MES(CSR_WRITE_BAD_ADDR)),
+/* 3*/ FLAG_ENTRY0("SBUS_WRITE_FAILED", MES(SBUS_WRITE_FAILED)),
+/* 4*/ FLAG_ENTRY0("KEY_MISMATCH", MES(KEY_MISMATCH)),
+/* 5*/ FLAG_ENTRY0("FW_AUTH_FAILED", MES(FW_AUTH_FAILED)),
+/* 6*/ FLAG_ENTRY0("EFUSE_CSR_PARITY", MES(EFUSE_CSR_PARITY)),
+/* 7*/ FLAG_ENTRY0("EFUSE_READ_BAD_ADDR", MES(EFUSE_READ_BAD_ADDR)),
+/* 8*/ FLAG_ENTRY0("EFUSE_WRITE", MES(EFUSE_WRITE)),
+/* 9*/ FLAG_ENTRY0("EFUSE_DONE_PARITY", MES(EFUSE_DONE_PARITY)),
+/*10*/ FLAG_ENTRY0("INVALID_EEP_CMD", MES(INVALID_EEP_CMD)),
+/*11*/ FLAG_ENTRY0("MBIST_FAIL", MES(MBIST_FAIL)),
+/*12*/ FLAG_ENTRY0("PLL_LOCK_FAIL", MES(PLL_LOCK_FAIL))
+};
+
+/*
+ * TXE PIO Error flags and consequences
+ */
+static struct flag_table pio_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY("PioWriteBadCtxt",
+       SEC_WRITE_DROPPED,
+       SEND_PIO_ERR_STATUS_PIO_WRITE_BAD_CTXT_ERR_SMASK),
+/* 1*/ FLAG_ENTRY("PioWriteAddrParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK),
+/* 2*/ FLAG_ENTRY("PioCsrParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK),
+/* 3*/ FLAG_ENTRY("PioSbMemFifo0",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK),
+/* 4*/ FLAG_ENTRY("PioSbMemFifo1",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK),
+/* 5*/ FLAG_ENTRY("PioPccFifoParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK),
+/* 6*/ FLAG_ENTRY("PioPecFifoParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK),
+/* 7*/ FLAG_ENTRY("PioSbrdctlCrrelParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK),
+/* 8*/ FLAG_ENTRY("PioSbrdctrlCrrelFifoParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK),
+/* 9*/ FLAG_ENTRY("PioPktEvictFifoParityErr",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK),
+/*10*/ FLAG_ENTRY("PioSmPktResetParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK),
+/*11*/ FLAG_ENTRY("PioVlLenMemBank0Unc",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK),
+/*12*/ FLAG_ENTRY("PioVlLenMemBank1Unc",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK),
+/*13*/ FLAG_ENTRY("PioVlLenMemBank0Cor",
+       0,
+       SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_COR_ERR_SMASK),
+/*14*/ FLAG_ENTRY("PioVlLenMemBank1Cor",
+       0,
+       SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_COR_ERR_SMASK),
+/*15*/ FLAG_ENTRY("PioCreditRetFifoParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK),
+/*16*/ FLAG_ENTRY("PioPpmcPblFifo",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK),
+/*17*/ FLAG_ENTRY("PioInitSmIn",
+       0,
+       SEND_PIO_ERR_STATUS_PIO_INIT_SM_IN_ERR_SMASK),
+/*18*/ FLAG_ENTRY("PioPktEvictSmOrArbSm",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK),
+/*19*/ FLAG_ENTRY("PioHostAddrMemUnc",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK),
+/*20*/ FLAG_ENTRY("PioHostAddrMemCor",
+       0,
+       SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_COR_ERR_SMASK),
+/*21*/ FLAG_ENTRY("PioWriteDataParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK),
+/*22*/ FLAG_ENTRY("PioStateMachine",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK),
+/*23*/ FLAG_ENTRY("PioWriteQwValidParity",
+       SEC_WRITE_DROPPED | SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK),
+/*24*/ FLAG_ENTRY("PioBlockQwCountParity",
+       SEC_WRITE_DROPPED | SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK),
+/*25*/ FLAG_ENTRY("PioVlfVlLenParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK),
+/*26*/ FLAG_ENTRY("PioVlfSopParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK),
+/*27*/ FLAG_ENTRY("PioVlFifoParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK),
+/*28*/ FLAG_ENTRY("PioPpmcBqcMemParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK),
+/*29*/ FLAG_ENTRY("PioPpmcSopLen",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK),
+/*30-31 reserved*/
+/*32*/ FLAG_ENTRY("PioCurrentFreeCntParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK),
+/*33*/ FLAG_ENTRY("PioLastReturnedCntParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK),
+/*34*/ FLAG_ENTRY("PioPccSopHeadParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK),
+/*35*/ FLAG_ENTRY("PioPecSopHeadParityErr",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK),
+/*36-63 reserved*/
+};
+
+/* TXE PIO errors that cause an SPC freeze */
+#define ALL_PIO_FREEZE_ERR \
+       (SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK)
+
+/*
+ * TXE SDMA Error flags
+ */
+static struct flag_table sdma_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("SDmaRpyTagErr",
+               SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK),
+/* 1*/ FLAG_ENTRY0("SDmaCsrParityErr",
+               SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK),
+/* 2*/ FLAG_ENTRY0("SDmaPcieReqTrackingUncErr",
+               SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK),
+/* 3*/ FLAG_ENTRY0("SDmaPcieReqTrackingCorErr",
+               SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_COR_ERR_SMASK),
+/*04-63 reserved*/
+};
+
+/* TXE SDMA errors that cause an SPC freeze */
+#define ALL_SDMA_FREEZE_ERR  \
+               (SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK \
+               | SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK \
+               | SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK)
+
+/* SendEgressErrInfo bits that correspond to a PortXmitDiscard counter */
+#define PORT_DISCARD_EGRESS_ERRS \
+       (SEND_EGRESS_ERR_INFO_TOO_LONG_IB_PACKET_ERR_SMASK \
+       | SEND_EGRESS_ERR_INFO_VL_MAPPING_ERR_SMASK \
+       | SEND_EGRESS_ERR_INFO_VL_ERR_SMASK)
+
+/*
+ * TXE Egress Error flags
+ */
+#define SEES(text) SEND_EGRESS_ERR_STATUS_##text##_ERR_SMASK
+static struct flag_table egress_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("TxPktIntegrityMemCorErr", SEES(TX_PKT_INTEGRITY_MEM_COR)),
+/* 1*/ FLAG_ENTRY0("TxPktIntegrityMemUncErr", SEES(TX_PKT_INTEGRITY_MEM_UNC)),
+/* 2 reserved */
+/* 3*/ FLAG_ENTRY0("TxEgressFifoUnderrunOrParityErr",
+               SEES(TX_EGRESS_FIFO_UNDERRUN_OR_PARITY)),
+/* 4*/ FLAG_ENTRY0("TxLinkdownErr", SEES(TX_LINKDOWN)),
+/* 5*/ FLAG_ENTRY0("TxIncorrectLinkStateErr", SEES(TX_INCORRECT_LINK_STATE)),
+/* 6 reserved */
+/* 7*/ FLAG_ENTRY0("TxPioLaunchIntfParityErr",
+               SEES(TX_PIO_LAUNCH_INTF_PARITY)),
+/* 8*/ FLAG_ENTRY0("TxSdmaLaunchIntfParityErr",
+               SEES(TX_SDMA_LAUNCH_INTF_PARITY)),
+/* 9-10 reserved */
+/*11*/ FLAG_ENTRY0("TxSbrdCtlStateMachineParityErr",
+               SEES(TX_SBRD_CTL_STATE_MACHINE_PARITY)),
+/*12*/ FLAG_ENTRY0("TxIllegalVLErr", SEES(TX_ILLEGAL_VL)),
+/*13*/ FLAG_ENTRY0("TxLaunchCsrParityErr", SEES(TX_LAUNCH_CSR_PARITY)),
+/*14*/ FLAG_ENTRY0("TxSbrdCtlCsrParityErr", SEES(TX_SBRD_CTL_CSR_PARITY)),
+/*15*/ FLAG_ENTRY0("TxConfigParityErr", SEES(TX_CONFIG_PARITY)),
+/*16*/ FLAG_ENTRY0("TxSdma0DisallowedPacketErr",
+               SEES(TX_SDMA0_DISALLOWED_PACKET)),
+/*17*/ FLAG_ENTRY0("TxSdma1DisallowedPacketErr",
+               SEES(TX_SDMA1_DISALLOWED_PACKET)),
+/*18*/ FLAG_ENTRY0("TxSdma2DisallowedPacketErr",
+               SEES(TX_SDMA2_DISALLOWED_PACKET)),
+/*19*/ FLAG_ENTRY0("TxSdma3DisallowedPacketErr",
+               SEES(TX_SDMA3_DISALLOWED_PACKET)),
+/*20*/ FLAG_ENTRY0("TxSdma4DisallowedPacketErr",
+               SEES(TX_SDMA4_DISALLOWED_PACKET)),
+/*21*/ FLAG_ENTRY0("TxSdma5DisallowedPacketErr",
+               SEES(TX_SDMA5_DISALLOWED_PACKET)),
+/*22*/ FLAG_ENTRY0("TxSdma6DisallowedPacketErr",
+               SEES(TX_SDMA6_DISALLOWED_PACKET)),
+/*23*/ FLAG_ENTRY0("TxSdma7DisallowedPacketErr",
+               SEES(TX_SDMA7_DISALLOWED_PACKET)),
+/*24*/ FLAG_ENTRY0("TxSdma8DisallowedPacketErr",
+               SEES(TX_SDMA8_DISALLOWED_PACKET)),
+/*25*/ FLAG_ENTRY0("TxSdma9DisallowedPacketErr",
+               SEES(TX_SDMA9_DISALLOWED_PACKET)),
+/*26*/ FLAG_ENTRY0("TxSdma10DisallowedPacketErr",
+               SEES(TX_SDMA10_DISALLOWED_PACKET)),
+/*27*/ FLAG_ENTRY0("TxSdma11DisallowedPacketErr",
+               SEES(TX_SDMA11_DISALLOWED_PACKET)),
+/*28*/ FLAG_ENTRY0("TxSdma12DisallowedPacketErr",
+               SEES(TX_SDMA12_DISALLOWED_PACKET)),
+/*29*/ FLAG_ENTRY0("TxSdma13DisallowedPacketErr",
+               SEES(TX_SDMA13_DISALLOWED_PACKET)),
+/*30*/ FLAG_ENTRY0("TxSdma14DisallowedPacketErr",
+               SEES(TX_SDMA14_DISALLOWED_PACKET)),
+/*31*/ FLAG_ENTRY0("TxSdma15DisallowedPacketErr",
+               SEES(TX_SDMA15_DISALLOWED_PACKET)),
+/*32*/ FLAG_ENTRY0("TxLaunchFifo0UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO0_UNC_OR_PARITY)),
+/*33*/ FLAG_ENTRY0("TxLaunchFifo1UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO1_UNC_OR_PARITY)),
+/*34*/ FLAG_ENTRY0("TxLaunchFifo2UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO2_UNC_OR_PARITY)),
+/*35*/ FLAG_ENTRY0("TxLaunchFifo3UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO3_UNC_OR_PARITY)),
+/*36*/ FLAG_ENTRY0("TxLaunchFifo4UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO4_UNC_OR_PARITY)),
+/*37*/ FLAG_ENTRY0("TxLaunchFifo5UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO5_UNC_OR_PARITY)),
+/*38*/ FLAG_ENTRY0("TxLaunchFifo6UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO6_UNC_OR_PARITY)),
+/*39*/ FLAG_ENTRY0("TxLaunchFifo7UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO7_UNC_OR_PARITY)),
+/*40*/ FLAG_ENTRY0("TxLaunchFifo8UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO8_UNC_OR_PARITY)),
+/*41*/ FLAG_ENTRY0("TxCreditReturnParityErr", SEES(TX_CREDIT_RETURN_PARITY)),
+/*42*/ FLAG_ENTRY0("TxSbHdrUncErr", SEES(TX_SB_HDR_UNC)),
+/*43*/ FLAG_ENTRY0("TxReadSdmaMemoryUncErr", SEES(TX_READ_SDMA_MEMORY_UNC)),
+/*44*/ FLAG_ENTRY0("TxReadPioMemoryUncErr", SEES(TX_READ_PIO_MEMORY_UNC)),
+/*45*/ FLAG_ENTRY0("TxEgressFifoUncErr", SEES(TX_EGRESS_FIFO_UNC)),
+/*46*/ FLAG_ENTRY0("TxHcrcInsertionErr", SEES(TX_HCRC_INSERTION)),
+/*47*/ FLAG_ENTRY0("TxCreditReturnVLErr", SEES(TX_CREDIT_RETURN_VL)),
+/*48*/ FLAG_ENTRY0("TxLaunchFifo0CorErr", SEES(TX_LAUNCH_FIFO0_COR)),
+/*49*/ FLAG_ENTRY0("TxLaunchFifo1CorErr", SEES(TX_LAUNCH_FIFO1_COR)),
+/*50*/ FLAG_ENTRY0("TxLaunchFifo2CorErr", SEES(TX_LAUNCH_FIFO2_COR)),
+/*51*/ FLAG_ENTRY0("TxLaunchFifo3CorErr", SEES(TX_LAUNCH_FIFO3_COR)),
+/*52*/ FLAG_ENTRY0("TxLaunchFifo4CorErr", SEES(TX_LAUNCH_FIFO4_COR)),
+/*53*/ FLAG_ENTRY0("TxLaunchFifo5CorErr", SEES(TX_LAUNCH_FIFO5_COR)),
+/*54*/ FLAG_ENTRY0("TxLaunchFifo6CorErr", SEES(TX_LAUNCH_FIFO6_COR)),
+/*55*/ FLAG_ENTRY0("TxLaunchFifo7CorErr", SEES(TX_LAUNCH_FIFO7_COR)),
+/*56*/ FLAG_ENTRY0("TxLaunchFifo8CorErr", SEES(TX_LAUNCH_FIFO8_COR)),
+/*57*/ FLAG_ENTRY0("TxCreditOverrunErr", SEES(TX_CREDIT_OVERRUN)),
+/*58*/ FLAG_ENTRY0("TxSbHdrCorErr", SEES(TX_SB_HDR_COR)),
+/*59*/ FLAG_ENTRY0("TxReadSdmaMemoryCorErr", SEES(TX_READ_SDMA_MEMORY_COR)),
+/*60*/ FLAG_ENTRY0("TxReadPioMemoryCorErr", SEES(TX_READ_PIO_MEMORY_COR)),
+/*61*/ FLAG_ENTRY0("TxEgressFifoCorErr", SEES(TX_EGRESS_FIFO_COR)),
+/*62*/ FLAG_ENTRY0("TxReadSdmaMemoryCsrUncErr",
+               SEES(TX_READ_SDMA_MEMORY_CSR_UNC)),
+/*63*/ FLAG_ENTRY0("TxReadPioMemoryCsrUncErr",
+               SEES(TX_READ_PIO_MEMORY_CSR_UNC)),
+};
+
+/*
+ * TXE Egress Error Info flags
+ */
+#define SEEI(text) SEND_EGRESS_ERR_INFO_##text##_ERR_SMASK
+static struct flag_table egress_err_info_flags[] = {
+/* 0*/ FLAG_ENTRY0("Reserved", 0ull),
+/* 1*/ FLAG_ENTRY0("VLErr", SEEI(VL)),
+/* 2*/ FLAG_ENTRY0("JobKeyErr", SEEI(JOB_KEY)),
+/* 3*/ FLAG_ENTRY0("JobKeyErr", SEEI(JOB_KEY)),
+/* 4*/ FLAG_ENTRY0("PartitionKeyErr", SEEI(PARTITION_KEY)),
+/* 5*/ FLAG_ENTRY0("SLIDErr", SEEI(SLID)),
+/* 6*/ FLAG_ENTRY0("OpcodeErr", SEEI(OPCODE)),
+/* 7*/ FLAG_ENTRY0("VLMappingErr", SEEI(VL_MAPPING)),
+/* 8*/ FLAG_ENTRY0("RawErr", SEEI(RAW)),
+/* 9*/ FLAG_ENTRY0("RawIPv6Err", SEEI(RAW_IPV6)),
+/*10*/ FLAG_ENTRY0("GRHErr", SEEI(GRH)),
+/*11*/ FLAG_ENTRY0("BypassErr", SEEI(BYPASS)),
+/*12*/ FLAG_ENTRY0("KDETHPacketsErr", SEEI(KDETH_PACKETS)),
+/*13*/ FLAG_ENTRY0("NonKDETHPacketsErr", SEEI(NON_KDETH_PACKETS)),
+/*14*/ FLAG_ENTRY0("TooSmallIBPacketsErr", SEEI(TOO_SMALL_IB_PACKETS)),
+/*15*/ FLAG_ENTRY0("TooSmallBypassPacketsErr", SEEI(TOO_SMALL_BYPASS_PACKETS)),
+/*16*/ FLAG_ENTRY0("PbcTestErr", SEEI(PBC_TEST)),
+/*17*/ FLAG_ENTRY0("BadPktLenErr", SEEI(BAD_PKT_LEN)),
+/*18*/ FLAG_ENTRY0("TooLongIBPacketErr", SEEI(TOO_LONG_IB_PACKET)),
+/*19*/ FLAG_ENTRY0("TooLongBypassPacketsErr", SEEI(TOO_LONG_BYPASS_PACKETS)),
+/*20*/ FLAG_ENTRY0("PbcStaticRateControlErr", SEEI(PBC_STATIC_RATE_CONTROL)),
+/*21*/ FLAG_ENTRY0("BypassBadPktLenErr", SEEI(BAD_PKT_LEN)),
+};
+
+/* TXE Egress errors that cause an SPC freeze */
+#define ALL_TXE_EGRESS_FREEZE_ERR \
+       (SEES(TX_EGRESS_FIFO_UNDERRUN_OR_PARITY) \
+       | SEES(TX_PIO_LAUNCH_INTF_PARITY) \
+       | SEES(TX_SDMA_LAUNCH_INTF_PARITY) \
+       | SEES(TX_SBRD_CTL_STATE_MACHINE_PARITY) \
+       | SEES(TX_LAUNCH_CSR_PARITY) \
+       | SEES(TX_SBRD_CTL_CSR_PARITY) \
+       | SEES(TX_CONFIG_PARITY) \
+       | SEES(TX_LAUNCH_FIFO0_UNC_OR_PARITY) \
+       | SEES(TX_LAUNCH_FIFO1_UNC_OR_PARITY) \
+       | SEES(TX_LAUNCH_FIFO2_UNC_OR_PARITY) \
+       | SEES(TX_LAUNCH_FIFO3_UNC_OR_PARITY) \
+       | SEES(TX_LAUNCH_FIFO4_UNC_OR_PARITY) \
+       | SEES(TX_LAUNCH_FIFO5_UNC_OR_PARITY) \
+       | SEES(TX_LAUNCH_FIFO6_UNC_OR_PARITY) \
+       | SEES(TX_LAUNCH_FIFO7_UNC_OR_PARITY) \
+       | SEES(TX_LAUNCH_FIFO8_UNC_OR_PARITY) \
+       | SEES(TX_CREDIT_RETURN_PARITY))
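A minimal sketch of how a freeze mask like this is typically consumed once the egress error status has been read into a u64; the helper name below is hypothetical, not taken from this driver:

/* hypothetical helper: any bit in the freeze mask forces an SPC freeze */
static bool egress_err_requires_freeze(u64 err_status)
{
        return (err_status & ALL_TXE_EGRESS_FREEZE_ERR) != 0;
}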
+
+/*
+ * TXE Send error flags
+ */
+#define SES(name) SEND_ERR_STATUS_SEND_##name##_ERR_SMASK
+static struct flag_table send_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("SendCsrParityErr", SES(CSR_PARITY)),
+/* 1*/ FLAG_ENTRY0("SendCsrReadBadAddrErr", SES(CSR_READ_BAD_ADDR)),
+/* 2*/ FLAG_ENTRY0("SendCsrWriteBadAddrErr", SES(CSR_WRITE_BAD_ADDR))
+};
+
+/*
+ * TXE Send Context Error flags and consequences
+ */
+static struct flag_table sc_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY("InconsistentSop",
+               SEC_PACKET_DROPPED | SEC_SC_HALTED,
+               SEND_CTXT_ERR_STATUS_PIO_INCONSISTENT_SOP_ERR_SMASK),
+/* 1*/ FLAG_ENTRY("DisallowedPacket",
+               SEC_PACKET_DROPPED | SEC_SC_HALTED,
+               SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK),
+/* 2*/ FLAG_ENTRY("WriteCrossesBoundary",
+               SEC_WRITE_DROPPED | SEC_SC_HALTED,
+               SEND_CTXT_ERR_STATUS_PIO_WRITE_CROSSES_BOUNDARY_ERR_SMASK),
+/* 3*/ FLAG_ENTRY("WriteOverflow",
+               SEC_WRITE_DROPPED | SEC_SC_HALTED,
+               SEND_CTXT_ERR_STATUS_PIO_WRITE_OVERFLOW_ERR_SMASK),
+/* 4*/ FLAG_ENTRY("WriteOutOfBounds",
+               SEC_WRITE_DROPPED | SEC_SC_HALTED,
+               SEND_CTXT_ERR_STATUS_PIO_WRITE_OUT_OF_BOUNDS_ERR_SMASK),
+/* 5-63 reserved */
+};
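For illustration, the first two entries above are equivalent to the following open-coded test that accumulates the SEC_* consequence bits for a send context error status word (sketch only; every name used here is already defined in this file):

static unsigned int sketch_sc_err_consequences(u64 status)
{
        unsigned int sec = 0;

        /* entry 0: inconsistent SOP drops the packet and halts the context */
        if (status & SEND_CTXT_ERR_STATUS_PIO_INCONSISTENT_SOP_ERR_SMASK)
                sec |= SEC_PACKET_DROPPED | SEC_SC_HALTED;
        /* entry 1: a disallowed packet has the same consequences */
        if (status & SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK)
                sec |= SEC_PACKET_DROPPED | SEC_SC_HALTED;
        return sec;
}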
+
+/*
+ * RXE Receive Error flags
+ */
+#define RXES(name) RCV_ERR_STATUS_RX_##name##_ERR_SMASK
+static struct flag_table rxe_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("RxDmaCsrCorErr", RXES(DMA_CSR_COR)),
+/* 1*/ FLAG_ENTRY0("RxDcIntfParityErr", RXES(DC_INTF_PARITY)),
+/* 2*/ FLAG_ENTRY0("RxRcvHdrUncErr", RXES(RCV_HDR_UNC)),
+/* 3*/ FLAG_ENTRY0("RxRcvHdrCorErr", RXES(RCV_HDR_COR)),
+/* 4*/ FLAG_ENTRY0("RxRcvDataUncErr", RXES(RCV_DATA_UNC)),
+/* 5*/ FLAG_ENTRY0("RxRcvDataCorErr", RXES(RCV_DATA_COR)),
+/* 6*/ FLAG_ENTRY0("RxRcvQpMapTableUncErr", RXES(RCV_QP_MAP_TABLE_UNC)),
+/* 7*/ FLAG_ENTRY0("RxRcvQpMapTableCorErr", RXES(RCV_QP_MAP_TABLE_COR)),
+/* 8*/ FLAG_ENTRY0("RxRcvCsrParityErr", RXES(RCV_CSR_PARITY)),
+/* 9*/ FLAG_ENTRY0("RxDcSopEopParityErr", RXES(DC_SOP_EOP_PARITY)),
+/*10*/ FLAG_ENTRY0("RxDmaFlagUncErr", RXES(DMA_FLAG_UNC)),
+/*11*/ FLAG_ENTRY0("RxDmaFlagCorErr", RXES(DMA_FLAG_COR)),
+/*12*/ FLAG_ENTRY0("RxRcvFsmEncodingErr", RXES(RCV_FSM_ENCODING)),
+/*13*/ FLAG_ENTRY0("RxRbufFreeListUncErr", RXES(RBUF_FREE_LIST_UNC)),
+/*14*/ FLAG_ENTRY0("RxRbufFreeListCorErr", RXES(RBUF_FREE_LIST_COR)),
+/*15*/ FLAG_ENTRY0("RxRbufLookupDesRegUncErr", RXES(RBUF_LOOKUP_DES_REG_UNC)),
+/*16*/ FLAG_ENTRY0("RxRbufLookupDesRegUncCorErr",
+               RXES(RBUF_LOOKUP_DES_REG_UNC_COR)),
+/*17*/ FLAG_ENTRY0("RxRbufLookupDesUncErr", RXES(RBUF_LOOKUP_DES_UNC)),
+/*18*/ FLAG_ENTRY0("RxRbufLookupDesCorErr", RXES(RBUF_LOOKUP_DES_COR)),
+/*19*/ FLAG_ENTRY0("RxRbufBlockListReadUncErr",
+               RXES(RBUF_BLOCK_LIST_READ_UNC)),
+/*20*/ FLAG_ENTRY0("RxRbufBlockListReadCorErr",
+               RXES(RBUF_BLOCK_LIST_READ_COR)),
+/*21*/ FLAG_ENTRY0("RxRbufCsrQHeadBufNumParityErr",
+               RXES(RBUF_CSR_QHEAD_BUF_NUM_PARITY)),
+/*22*/ FLAG_ENTRY0("RxRbufCsrQEntCntParityErr",
+               RXES(RBUF_CSR_QENT_CNT_PARITY)),
+/*23*/ FLAG_ENTRY0("RxRbufCsrQNextBufParityErr",
+               RXES(RBUF_CSR_QNEXT_BUF_PARITY)),
+/*24*/ FLAG_ENTRY0("RxRbufCsrQVldBitParityErr",
+               RXES(RBUF_CSR_QVLD_BIT_PARITY)),
+/*25*/ FLAG_ENTRY0("RxRbufCsrQHdPtrParityErr", RXES(RBUF_CSR_QHD_PTR_PARITY)),
+/*26*/ FLAG_ENTRY0("RxRbufCsrQTlPtrParityErr", RXES(RBUF_CSR_QTL_PTR_PARITY)),
+/*27*/ FLAG_ENTRY0("RxRbufCsrQNumOfPktParityErr",
+               RXES(RBUF_CSR_QNUM_OF_PKT_PARITY)),
+/*28*/ FLAG_ENTRY0("RxRbufCsrQEOPDWParityErr", RXES(RBUF_CSR_QEOPDW_PARITY)),
+/*29*/ FLAG_ENTRY0("RxRbufCtxIdParityErr", RXES(RBUF_CTX_ID_PARITY)),
+/*30*/ FLAG_ENTRY0("RxRBufBadLookupErr", RXES(RBUF_BAD_LOOKUP)),
+/*31*/ FLAG_ENTRY0("RxRbufFullErr", RXES(RBUF_FULL)),
+/*32*/ FLAG_ENTRY0("RxRbufEmptyErr", RXES(RBUF_EMPTY)),
+/*33*/ FLAG_ENTRY0("RxRbufFlRdAddrParityErr", RXES(RBUF_FL_RD_ADDR_PARITY)),
+/*34*/ FLAG_ENTRY0("RxRbufFlWrAddrParityErr", RXES(RBUF_FL_WR_ADDR_PARITY)),
+/*35*/ FLAG_ENTRY0("RxRbufFlInitdoneParityErr",
+               RXES(RBUF_FL_INITDONE_PARITY)),
+/*36*/ FLAG_ENTRY0("RxRbufFlInitWrAddrParityErr",
+               RXES(RBUF_FL_INIT_WR_ADDR_PARITY)),
+/*37*/ FLAG_ENTRY0("RxRbufNextFreeBufUncErr", RXES(RBUF_NEXT_FREE_BUF_UNC)),
+/*38*/ FLAG_ENTRY0("RxRbufNextFreeBufCorErr", RXES(RBUF_NEXT_FREE_BUF_COR)),
+/*39*/ FLAG_ENTRY0("RxLookupDesPart1UncErr", RXES(LOOKUP_DES_PART1_UNC)),
+/*40*/ FLAG_ENTRY0("RxLookupDesPart1UncCorErr",
+               RXES(LOOKUP_DES_PART1_UNC_COR)),
+/*41*/ FLAG_ENTRY0("RxLookupDesPart2ParityErr",
+               RXES(LOOKUP_DES_PART2_PARITY)),
+/*42*/ FLAG_ENTRY0("RxLookupRcvArrayUncErr", RXES(LOOKUP_RCV_ARRAY_UNC)),
+/*43*/ FLAG_ENTRY0("RxLookupRcvArrayCorErr", RXES(LOOKUP_RCV_ARRAY_COR)),
+/*44*/ FLAG_ENTRY0("RxLookupCsrParityErr", RXES(LOOKUP_CSR_PARITY)),
+/*45*/ FLAG_ENTRY0("RxHqIntrCsrParityErr", RXES(HQ_INTR_CSR_PARITY)),
+/*46*/ FLAG_ENTRY0("RxHqIntrFsmErr", RXES(HQ_INTR_FSM)),
+/*47*/ FLAG_ENTRY0("RxRbufDescPart1UncErr", RXES(RBUF_DESC_PART1_UNC)),
+/*48*/ FLAG_ENTRY0("RxRbufDescPart1CorErr", RXES(RBUF_DESC_PART1_COR)),
+/*49*/ FLAG_ENTRY0("RxRbufDescPart2UncErr", RXES(RBUF_DESC_PART2_UNC)),
+/*50*/ FLAG_ENTRY0("RxRbufDescPart2CorErr", RXES(RBUF_DESC_PART2_COR)),
+/*51*/ FLAG_ENTRY0("RxDmaHdrFifoRdUncErr", RXES(DMA_HDR_FIFO_RD_UNC)),
+/*52*/ FLAG_ENTRY0("RxDmaHdrFifoRdCorErr", RXES(DMA_HDR_FIFO_RD_COR)),
+/*53*/ FLAG_ENTRY0("RxDmaDataFifoRdUncErr", RXES(DMA_DATA_FIFO_RD_UNC)),
+/*54*/ FLAG_ENTRY0("RxDmaDataFifoRdCorErr", RXES(DMA_DATA_FIFO_RD_COR)),
+/*55*/ FLAG_ENTRY0("RxRbufDataUncErr", RXES(RBUF_DATA_UNC)),
+/*56*/ FLAG_ENTRY0("RxRbufDataCorErr", RXES(RBUF_DATA_COR)),
+/*57*/ FLAG_ENTRY0("RxDmaCsrParityErr", RXES(DMA_CSR_PARITY)),
+/*58*/ FLAG_ENTRY0("RxDmaEqFsmEncodingErr", RXES(DMA_EQ_FSM_ENCODING)),
+/*59*/ FLAG_ENTRY0("RxDmaDqFsmEncodingErr", RXES(DMA_DQ_FSM_ENCODING)),
+/*60*/ FLAG_ENTRY0("RxDmaCsrUncErr", RXES(DMA_CSR_UNC)),
+/*61*/ FLAG_ENTRY0("RxCsrReadBadAddrErr", RXES(CSR_READ_BAD_ADDR)),
+/*62*/ FLAG_ENTRY0("RxCsrWriteBadAddrErr", RXES(CSR_WRITE_BAD_ADDR)),
+/*63*/ FLAG_ENTRY0("RxCsrParityErr", RXES(CSR_PARITY))
+};
+
+/* RXE errors that will trigger an SPC freeze */
+#define ALL_RXE_FREEZE_ERR  \
+       (RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RCV_CSR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_DMA_FLAG_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RCV_FSM_ENCODING_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_FREE_LIST_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CSR_QHEAD_BUF_NUM_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CSR_QENT_CNT_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CSR_QNEXT_BUF_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CSR_QVLD_BIT_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CSR_QHD_PTR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CSR_QTL_PTR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CSR_QNUM_OF_PKT_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CSR_QEOPDW_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CTX_ID_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_BAD_LOOKUP_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_FULL_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_EMPTY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_FL_RD_ADDR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_FL_WR_ADDR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_FL_INITDONE_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_COR_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_LOOKUP_DES_PART2_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_LOOKUP_CSR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_HQ_INTR_CSR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_HQ_INTR_FSM_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_DESC_PART1_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_DESC_PART1_COR_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_DESC_PART2_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_DATA_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_DMA_CSR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_DMA_EQ_FSM_ENCODING_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_DMA_DQ_FSM_ENCODING_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_CSR_PARITY_ERR_SMASK)
+
+#define RXE_FREEZE_ABORT_MASK \
+       (RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK | \
+       RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK | \
+       RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK)
+
+/*
+ * DCC Error Flags
+ */
+#define DCCE(name) DCC_ERR_FLG_##name##_SMASK
+static struct flag_table dcc_err_flags[] = {
+       FLAG_ENTRY0("bad_l2_err", DCCE(BAD_L2_ERR)),
+       FLAG_ENTRY0("bad_sc_err", DCCE(BAD_SC_ERR)),
+       FLAG_ENTRY0("bad_mid_tail_err", DCCE(BAD_MID_TAIL_ERR)),
+       FLAG_ENTRY0("bad_preemption_err", DCCE(BAD_PREEMPTION_ERR)),
+       FLAG_ENTRY0("preemption_err", DCCE(PREEMPTION_ERR)),
+       FLAG_ENTRY0("preemptionvl15_err", DCCE(PREEMPTIONVL15_ERR)),
+       FLAG_ENTRY0("bad_vl_marker_err", DCCE(BAD_VL_MARKER_ERR)),
+       FLAG_ENTRY0("bad_dlid_target_err", DCCE(BAD_DLID_TARGET_ERR)),
+       FLAG_ENTRY0("bad_lver_err", DCCE(BAD_LVER_ERR)),
+       FLAG_ENTRY0("uncorrectable_err", DCCE(UNCORRECTABLE_ERR)),
+       FLAG_ENTRY0("bad_crdt_ack_err", DCCE(BAD_CRDT_ACK_ERR)),
+       FLAG_ENTRY0("unsup_pkt_type", DCCE(UNSUP_PKT_TYPE)),
+       FLAG_ENTRY0("bad_ctrl_flit_err", DCCE(BAD_CTRL_FLIT_ERR)),
+       FLAG_ENTRY0("event_cntr_parity_err", DCCE(EVENT_CNTR_PARITY_ERR)),
+       FLAG_ENTRY0("event_cntr_rollover_err", DCCE(EVENT_CNTR_ROLLOVER_ERR)),
+       FLAG_ENTRY0("link_err", DCCE(LINK_ERR)),
+       FLAG_ENTRY0("misc_cntr_rollover_err", DCCE(MISC_CNTR_ROLLOVER_ERR)),
+       FLAG_ENTRY0("bad_ctrl_dist_err", DCCE(BAD_CTRL_DIST_ERR)),
+       FLAG_ENTRY0("bad_tail_dist_err", DCCE(BAD_TAIL_DIST_ERR)),
+       FLAG_ENTRY0("bad_head_dist_err", DCCE(BAD_HEAD_DIST_ERR)),
+       FLAG_ENTRY0("nonvl15_state_err", DCCE(NONVL15_STATE_ERR)),
+       FLAG_ENTRY0("vl15_multi_err", DCCE(VL15_MULTI_ERR)),
+       FLAG_ENTRY0("bad_pkt_length_err", DCCE(BAD_PKT_LENGTH_ERR)),
+       FLAG_ENTRY0("unsup_vl_err", DCCE(UNSUP_VL_ERR)),
+       FLAG_ENTRY0("perm_nvl15_err", DCCE(PERM_NVL15_ERR)),
+       FLAG_ENTRY0("slid_zero_err", DCCE(SLID_ZERO_ERR)),
+       FLAG_ENTRY0("dlid_zero_err", DCCE(DLID_ZERO_ERR)),
+       FLAG_ENTRY0("length_mtu_err", DCCE(LENGTH_MTU_ERR)),
+       FLAG_ENTRY0("rx_early_drop_err", DCCE(RX_EARLY_DROP_ERR)),
+       FLAG_ENTRY0("late_short_err", DCCE(LATE_SHORT_ERR)),
+       FLAG_ENTRY0("late_long_err", DCCE(LATE_LONG_ERR)),
+       FLAG_ENTRY0("late_ebp_err", DCCE(LATE_EBP_ERR)),
+       FLAG_ENTRY0("fpe_tx_fifo_ovflw_err", DCCE(FPE_TX_FIFO_OVFLW_ERR)),
+       FLAG_ENTRY0("fpe_tx_fifo_unflw_err", DCCE(FPE_TX_FIFO_UNFLW_ERR)),
+       FLAG_ENTRY0("csr_access_blocked_host", DCCE(CSR_ACCESS_BLOCKED_HOST)),
+       FLAG_ENTRY0("csr_access_blocked_uc", DCCE(CSR_ACCESS_BLOCKED_UC)),
+       FLAG_ENTRY0("tx_ctrl_parity_err", DCCE(TX_CTRL_PARITY_ERR)),
+       FLAG_ENTRY0("tx_ctrl_parity_mbe_err", DCCE(TX_CTRL_PARITY_MBE_ERR)),
+       FLAG_ENTRY0("tx_sc_parity_err", DCCE(TX_SC_PARITY_ERR)),
+       FLAG_ENTRY0("rx_ctrl_parity_mbe_err", DCCE(RX_CTRL_PARITY_MBE_ERR)),
+       FLAG_ENTRY0("csr_parity_err", DCCE(CSR_PARITY_ERR)),
+       FLAG_ENTRY0("csr_inval_addr", DCCE(CSR_INVAL_ADDR)),
+       FLAG_ENTRY0("tx_byte_shft_parity_err", DCCE(TX_BYTE_SHFT_PARITY_ERR)),
+       FLAG_ENTRY0("rx_byte_shft_parity_err", DCCE(RX_BYTE_SHFT_PARITY_ERR)),
+       FLAG_ENTRY0("fmconfig_err", DCCE(FMCONFIG_ERR)),
+       FLAG_ENTRY0("rcvport_err", DCCE(RCVPORT_ERR)),
+};
+
+/*
+ * LCB error flags
+ */
+#define LCBE(name) DC_LCB_ERR_FLG_##name##_SMASK
+static struct flag_table lcb_err_flags[] = {
+/* 0*/ FLAG_ENTRY0("CSR_PARITY_ERR", LCBE(CSR_PARITY_ERR)),
+/* 1*/ FLAG_ENTRY0("INVALID_CSR_ADDR", LCBE(INVALID_CSR_ADDR)),
+/* 2*/ FLAG_ENTRY0("RST_FOR_FAILED_DESKEW", LCBE(RST_FOR_FAILED_DESKEW)),
+/* 3*/ FLAG_ENTRY0("ALL_LNS_FAILED_REINIT_TEST",
+               LCBE(ALL_LNS_FAILED_REINIT_TEST)),
+/* 4*/ FLAG_ENTRY0("LOST_REINIT_STALL_OR_TOS", LCBE(LOST_REINIT_STALL_OR_TOS)),
+/* 5*/ FLAG_ENTRY0("TX_LESS_THAN_FOUR_LNS", LCBE(TX_LESS_THAN_FOUR_LNS)),
+/* 6*/ FLAG_ENTRY0("RX_LESS_THAN_FOUR_LNS", LCBE(RX_LESS_THAN_FOUR_LNS)),
+/* 7*/ FLAG_ENTRY0("SEQ_CRC_ERR", LCBE(SEQ_CRC_ERR)),
+/* 8*/ FLAG_ENTRY0("REINIT_FROM_PEER", LCBE(REINIT_FROM_PEER)),
+/* 9*/ FLAG_ENTRY0("REINIT_FOR_LN_DEGRADE", LCBE(REINIT_FOR_LN_DEGRADE)),
+/*10*/ FLAG_ENTRY0("CRC_ERR_CNT_HIT_LIMIT", LCBE(CRC_ERR_CNT_HIT_LIMIT)),
+/*11*/ FLAG_ENTRY0("RCLK_STOPPED", LCBE(RCLK_STOPPED)),
+/*12*/ FLAG_ENTRY0("UNEXPECTED_REPLAY_MARKER", LCBE(UNEXPECTED_REPLAY_MARKER)),
+/*13*/ FLAG_ENTRY0("UNEXPECTED_ROUND_TRIP_MARKER",
+               LCBE(UNEXPECTED_ROUND_TRIP_MARKER)),
+/*14*/ FLAG_ENTRY0("ILLEGAL_NULL_LTP", LCBE(ILLEGAL_NULL_LTP)),
+/*15*/ FLAG_ENTRY0("ILLEGAL_FLIT_ENCODING", LCBE(ILLEGAL_FLIT_ENCODING)),
+/*16*/ FLAG_ENTRY0("FLIT_INPUT_BUF_OFLW", LCBE(FLIT_INPUT_BUF_OFLW)),
+/*17*/ FLAG_ENTRY0("VL_ACK_INPUT_BUF_OFLW", LCBE(VL_ACK_INPUT_BUF_OFLW)),
+/*18*/ FLAG_ENTRY0("VL_ACK_INPUT_PARITY_ERR", LCBE(VL_ACK_INPUT_PARITY_ERR)),
+/*19*/ FLAG_ENTRY0("VL_ACK_INPUT_WRONG_CRC_MODE",
+               LCBE(VL_ACK_INPUT_WRONG_CRC_MODE)),
+/*20*/ FLAG_ENTRY0("FLIT_INPUT_BUF_MBE", LCBE(FLIT_INPUT_BUF_MBE)),
+/*21*/ FLAG_ENTRY0("FLIT_INPUT_BUF_SBE", LCBE(FLIT_INPUT_BUF_SBE)),
+/*22*/ FLAG_ENTRY0("REPLAY_BUF_MBE", LCBE(REPLAY_BUF_MBE)),
+/*23*/ FLAG_ENTRY0("REPLAY_BUF_SBE", LCBE(REPLAY_BUF_SBE)),
+/*24*/ FLAG_ENTRY0("CREDIT_RETURN_FLIT_MBE", LCBE(CREDIT_RETURN_FLIT_MBE)),
+/*25*/ FLAG_ENTRY0("RST_FOR_LINK_TIMEOUT", LCBE(RST_FOR_LINK_TIMEOUT)),
+/*26*/ FLAG_ENTRY0("RST_FOR_INCOMPLT_RND_TRIP",
+               LCBE(RST_FOR_INCOMPLT_RND_TRIP)),
+/*27*/ FLAG_ENTRY0("HOLD_REINIT", LCBE(HOLD_REINIT)),
+/*28*/ FLAG_ENTRY0("NEG_EDGE_LINK_TRANSFER_ACTIVE",
+               LCBE(NEG_EDGE_LINK_TRANSFER_ACTIVE)),
+/*29*/ FLAG_ENTRY0("REDUNDANT_FLIT_PARITY_ERR",
+               LCBE(REDUNDANT_FLIT_PARITY_ERR))
+};
+
+/*
+ * DC8051 Error Flags
+ */
+#define D8E(name) DC_DC8051_ERR_FLG_##name##_SMASK
+static struct flag_table dc8051_err_flags[] = {
+       FLAG_ENTRY0("SET_BY_8051", D8E(SET_BY_8051)),
+       FLAG_ENTRY0("LOST_8051_HEART_BEAT", D8E(LOST_8051_HEART_BEAT)),
+       FLAG_ENTRY0("CRAM_MBE", D8E(CRAM_MBE)),
+       FLAG_ENTRY0("CRAM_SBE", D8E(CRAM_SBE)),
+       FLAG_ENTRY0("DRAM_MBE", D8E(DRAM_MBE)),
+       FLAG_ENTRY0("DRAM_SBE", D8E(DRAM_SBE)),
+       FLAG_ENTRY0("IRAM_MBE", D8E(IRAM_MBE)),
+       FLAG_ENTRY0("IRAM_SBE", D8E(IRAM_SBE)),
+       FLAG_ENTRY0("UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES",
+                   D8E(UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES)),
+       FLAG_ENTRY0("INVALID_CSR_ADDR", D8E(INVALID_CSR_ADDR)),
+};
+
+/*
+ * DC8051 Information Error flags
+ *
+ * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.ERROR field.
+ */
+static struct flag_table dc8051_info_err_flags[] = {
+       FLAG_ENTRY0("Spico ROM check failed",  SPICO_ROM_FAILED),
+       FLAG_ENTRY0("Unknown frame received",  UNKNOWN_FRAME),
+       FLAG_ENTRY0("Target BER not met",      TARGET_BER_NOT_MET),
+       FLAG_ENTRY0("Serdes internal loopback failure",
+                   FAILED_SERDES_INTERNAL_LOOPBACK),
+       FLAG_ENTRY0("Failed SerDes init",      FAILED_SERDES_INIT),
+       FLAG_ENTRY0("Failed LNI(Polling)",     FAILED_LNI_POLLING),
+       FLAG_ENTRY0("Failed LNI(Debounce)",    FAILED_LNI_DEBOUNCE),
+       FLAG_ENTRY0("Failed LNI(EstbComm)",    FAILED_LNI_ESTBCOMM),
+       FLAG_ENTRY0("Failed LNI(OptEq)",       FAILED_LNI_OPTEQ),
+       FLAG_ENTRY0("Failed LNI(VerifyCap_1)", FAILED_LNI_VERIFY_CAP1),
+       FLAG_ENTRY0("Failed LNI(VerifyCap_2)", FAILED_LNI_VERIFY_CAP2),
+       FLAG_ENTRY0("Failed LNI(ConfigLT)",    FAILED_LNI_CONFIGLT),
+       FLAG_ENTRY0("Host Handshake Timeout",  HOST_HANDSHAKE_TIMEOUT)
+};
+
+/*
+ * DC8051 Information Host Message flags
+ *
+ * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.HOST_MSG field.
+ */
+static struct flag_table dc8051_info_host_msg_flags[] = {
+       FLAG_ENTRY0("Host request done", 0x0001),
+       FLAG_ENTRY0("BC SMA message", 0x0002),
+       FLAG_ENTRY0("BC PWR_MGM message", 0x0004),
+       FLAG_ENTRY0("BC Unknown message (BCC)", 0x0008),
+       FLAG_ENTRY0("BC Unknown message (LCB)", 0x0010),
+       FLAG_ENTRY0("External device config request", 0x0020),
+       FLAG_ENTRY0("VerifyCap all frames received", 0x0040),
+       FLAG_ENTRY0("LinkUp achieved", 0x0080),
+       FLAG_ENTRY0("Link going down", 0x0100),
+};
+
+static u32 encoded_size(u32 size);
+static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate);
+static int set_physical_link_state(struct hfi1_devdata *dd, u64 state);
+static void read_vc_remote_phy(struct hfi1_devdata *dd, u8 *power_management,
+                              u8 *continuous);
+static void read_vc_remote_fabric(struct hfi1_devdata *dd, u8 *vau, u8 *z,
+                                 u8 *vcu, u16 *vl15buf, u8 *crc_sizes);
+static void read_vc_remote_link_width(struct hfi1_devdata *dd,
+                                     u8 *remote_tx_rate, u16 *link_widths);
+static void read_vc_local_link_width(struct hfi1_devdata *dd, u8 *misc_bits,
+                                    u8 *flag_bits, u16 *link_widths);
+static void read_remote_device_id(struct hfi1_devdata *dd, u16 *device_id,
+                                 u8 *device_rev);
+static void read_mgmt_allowed(struct hfi1_devdata *dd, u8 *mgmt_allowed);
+static void read_local_lni(struct hfi1_devdata *dd, u8 *enable_lane_rx);
+static int read_tx_settings(struct hfi1_devdata *dd, u8 *enable_lane_tx,
+                           u8 *tx_polarity_inversion,
+                           u8 *rx_polarity_inversion, u8 *max_rate);
+static void handle_sdma_eng_err(struct hfi1_devdata *dd,
+                               unsigned int context, u64 err_status);
+static void handle_qsfp_int(struct hfi1_devdata *dd, u32 source, u64 reg);
+static void handle_dcc_err(struct hfi1_devdata *dd,
+                          unsigned int context, u64 err_status);
+static void handle_lcb_err(struct hfi1_devdata *dd,
+                          unsigned int context, u64 err_status);
+static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_cce_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_rxe_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_misc_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_pio_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_sdma_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_egress_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_txe_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void set_partition_keys(struct hfi1_pportdata *);
+static const char *link_state_name(u32 state);
+static const char *link_state_reason_name(struct hfi1_pportdata *ppd,
+                                         u32 state);
+static int do_8051_command(struct hfi1_devdata *dd, u32 type, u64 in_data,
+                          u64 *out_data);
+static int read_idle_sma(struct hfi1_devdata *dd, u64 *data);
+static int thermal_init(struct hfi1_devdata *dd);
+
+static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state,
+                                 int msecs);
+static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc);
+static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr);
+static void handle_temp_err(struct hfi1_devdata *);
+static void dc_shutdown(struct hfi1_devdata *);
+static void dc_start(struct hfi1_devdata *);
+static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp,
+                          unsigned int *np);
+static void remove_full_mgmt_pkey(struct hfi1_pportdata *ppd);
+
+/*
+ * Error interrupt table entry.  This is used as input to the interrupt
+ * "clear down" routine used for all second tier error interrupt register.
+ * Second tier interrupt registers have a single bit representing them
+ * in the top-level CceIntStatus.
+ */
+struct err_reg_info {
+       u32 status;             /* status CSR offset */
+       u32 clear;              /* clear CSR offset */
+       u32 mask;               /* mask CSR offset */
+       void (*handler)(struct hfi1_devdata *dd, u32 source, u64 reg);
+       const char *desc;
+};
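A minimal sketch of the "clear down" flow this structure is meant to drive: read the second-tier status register, acknowledge exactly the bits that were seen, then dispatch to the per-register handler. The routine below is illustrative (the driver's real clear-down logic may add retry limits or masking); read_csr() and write_csr() are the CSR accessors defined later in this file.

static void sketch_clear_down(struct hfi1_devdata *dd,
                              const struct err_reg_info *eri, u32 source)
{
        u64 reg = read_csr(dd, eri->status);    /* which error bits fired */

        write_csr(dd, eri->clear, reg);         /* acknowledge those bits */
        if (eri->handler)
                eri->handler(dd, source, reg);  /* decode and report */
}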
+
+#define NUM_MISC_ERRS (IS_GENERAL_ERR_END - IS_GENERAL_ERR_START)
+#define NUM_DC_ERRS (IS_DC_END - IS_DC_START)
+#define NUM_VARIOUS (IS_VARIOUS_END - IS_VARIOUS_START)
+
+/*
+ * Helpers for building HFI and DC error interrupt table entries.  Different
+ * helpers are needed because of inconsistent register names.
+ */
+#define EE(reg, handler, desc) \
+       { reg##_STATUS, reg##_CLEAR, reg##_MASK, \
+               handler, desc }
+#define DC_EE1(reg, handler, desc) \
+       { reg##_FLG, reg##_FLG_CLR, reg##_FLG_EN, handler, desc }
+#define DC_EE2(reg, handler, desc) \
+       { reg##_FLG, reg##_CLR, reg##_EN, handler, desc }
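As an example of the expansion (purely illustrative), the first misc_errs entry and the first dc_errs entry below expand to initializers built from the two register naming schemes:

/* EE(CCE_ERR, handle_cce_err, "CceErr") expands to */
{ CCE_ERR_STATUS, CCE_ERR_CLEAR, CCE_ERR_MASK, handle_cce_err, "CceErr" }

/* DC_EE1(DCC_ERR, handle_dcc_err, "DCC Err") expands to */
{ DCC_ERR_FLG, DCC_ERR_FLG_CLR, DCC_ERR_FLG_EN, handle_dcc_err, "DCC Err" }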
+
+/*
+ * Table of the "misc" grouping of error interrupts.  Each entry refers to
+ * another register containing more information.
+ */
+static const struct err_reg_info misc_errs[NUM_MISC_ERRS] = {
+/* 0*/ EE(CCE_ERR,             handle_cce_err,    "CceErr"),
+/* 1*/ EE(RCV_ERR,             handle_rxe_err,    "RxeErr"),
+/* 2*/ EE(MISC_ERR,    handle_misc_err,   "MiscErr"),
+/* 3*/ { 0, 0, 0, NULL }, /* reserved */
+/* 4*/ EE(SEND_PIO_ERR,    handle_pio_err,    "PioErr"),
+/* 5*/ EE(SEND_DMA_ERR,    handle_sdma_err,   "SDmaErr"),
+/* 6*/ EE(SEND_EGRESS_ERR, handle_egress_err, "EgressErr"),
+/* 7*/ EE(SEND_ERR,    handle_txe_err,    "TxeErr")
+       /* the rest are reserved */
+};
+
+/*
+ * Index into the Various section of the interrupt sources
+ * corresponding to the Critical Temperature interrupt.
+ */
+#define TCRIT_INT_SOURCE 4
+
+/*
+ * SDMA error interrupt entry - refers to another register containing more
+ * information.
+ */
+static const struct err_reg_info sdma_eng_err =
+       EE(SEND_DMA_ENG_ERR, handle_sdma_eng_err, "SDmaEngErr");
+
+static const struct err_reg_info various_err[NUM_VARIOUS] = {
+/* 0*/ { 0, 0, 0, NULL }, /* PbcInt */
+/* 1*/ { 0, 0, 0, NULL }, /* GpioAssertInt */
+/* 2*/ EE(ASIC_QSFP1,  handle_qsfp_int,        "QSFP1"),
+/* 3*/ EE(ASIC_QSFP2,  handle_qsfp_int,        "QSFP2"),
+/* 4*/ { 0, 0, 0, NULL }, /* TCritInt */
+       /* rest are reserved */
+};
+
+/*
+ * The DC encoding of mtu_cap for 10K MTU in the DCC_CFG_PORT_CONFIG
+ * register cannot be derived from the MTU value because 10K is not
+ * a power of 2. Therefore, we need a constant. Everything else can
+ * be calculated.
+ */
+#define DCC_CFG_PORT_MTU_CAP_10240 7
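A sketch of the calculation alluded to above, assuming the cap field follows the OPA/IB MTU enumeration for the power-of-two sizes (256 -> 1 through 8192 -> 6). The helper and the ilog2()-based formula are illustrative, not taken from this driver; note that without the special case, ilog2(10240) - 7 would also yield 6 and collide with the 8192 encoding.

/* hypothetical helper: map an MTU in bytes to the DCC mtu_cap encoding */
static u8 sketch_mtu_to_cap(u32 mtu)
{
        if (mtu == 10240)       /* not a power of 2, needs the constant */
                return DCC_CFG_PORT_MTU_CAP_10240;
        return ilog2(mtu) - 7;  /* 256 -> 1, 512 -> 2, ..., 8192 -> 6 */
}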
+
+/*
+ * Table of the DC grouping of error interrupts.  Each entry refers to
+ * another register containing more information.
+ */
+static const struct err_reg_info dc_errs[NUM_DC_ERRS] = {
+/* 0*/ DC_EE1(DCC_ERR,         handle_dcc_err,        "DCC Err"),
+/* 1*/ DC_EE2(DC_LCB_ERR,      handle_lcb_err,        "LCB Err"),
+/* 2*/ DC_EE2(DC_DC8051_ERR,   handle_8051_interrupt, "DC8051 Interrupt"),
+/* 3*/ /* dc_lbm_int - special, see is_dc_int() */
+       /* the rest are reserved */
+};
+
+struct cntr_entry {
+       /*
+        * counter name
+        */
+       char *name;
+
+       /*
+        * csr to read for name (if applicable)
+        */
+       u64 csr;
+
+       /*
+        * offset into dd or ppd to store the counter's value
+        */
+       int offset;
+
+       /*
+        * flags
+        */
+       u8 flags;
+
+       /*
+        * accessor for stat element, context either dd or ppd
+        */
+       u64 (*rw_cntr)(const struct cntr_entry *, void *context, int vl,
+                      int mode, u64 data);
+};
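A minimal sketch of how an entry is consumed: counters that are not per-VL are read by calling the accessor with vl == CNTR_INVALID_VL and mode == CNTR_MODE_R (both defined elsewhere in this driver); the wrapper name here is hypothetical.

static u64 sketch_read_dev_cntr(const struct cntr_entry *entry,
                                struct hfi1_devdata *dd)
{
        /* context is dd for device counters, ppd for port counters */
        return entry->rw_cntr(entry, dd, CNTR_INVALID_VL, CNTR_MODE_R, 0);
}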
+
+#define C_RCV_HDR_OVF_FIRST C_RCV_HDR_OVF_0
+#define C_RCV_HDR_OVF_LAST C_RCV_HDR_OVF_159
+
+#define CNTR_ELEM(name, csr, offset, flags, accessor) \
+{ \
+       name, \
+       csr, \
+       offset, \
+       flags, \
+       accessor \
+}
+
+/* 32bit RXE */
+#define RXE32_PORT_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         (counter * 8 + RCV_COUNTER_ARRAY32), \
+         0, flags | CNTR_32BIT, \
+         port_access_u32_csr)
+
+#define RXE32_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         (counter * 8 + RCV_COUNTER_ARRAY32), \
+         0, flags | CNTR_32BIT, \
+         dev_access_u32_csr)
+
+/* 64bit RXE */
+#define RXE64_PORT_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         (counter * 8 + RCV_COUNTER_ARRAY64), \
+         0, flags, \
+         port_access_u64_csr)
+
+#define RXE64_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         (counter * 8 + RCV_COUNTER_ARRAY64), \
+         0, flags, \
+         dev_access_u64_csr)
+
+#define OVR_LBL(ctx) C_RCV_HDR_OVF_ ## ctx
+#define OVR_ELM(ctx) \
+CNTR_ELEM("RcvHdrOvr" #ctx, \
+         (RCV_HDR_OVFL_CNT + ctx * 0x100), \
+         0, CNTR_NORMAL, port_access_u64_csr)
+
+/* 32bit TXE */
+#define TXE32_PORT_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         (counter * 8 + SEND_COUNTER_ARRAY32), \
+         0, flags | CNTR_32BIT, \
+         port_access_u32_csr)
+
+/* 64bit TXE */
+#define TXE64_PORT_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         (counter * 8 + SEND_COUNTER_ARRAY64), \
+         0, flags, \
+         port_access_u64_csr)
+
+#define TX64_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         (counter * 8 + SEND_COUNTER_ARRAY64), \
+         0, flags, \
+         dev_access_u64_csr)
+
+/* CCE */
+#define CCE_PERF_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         (counter * 8 + CCE_COUNTER_ARRAY32), \
+         0, flags | CNTR_32BIT, \
+         dev_access_u32_csr)
+
+#define CCE_INT_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         (counter * 8 + CCE_INT_COUNTER_ARRAY32), \
+         0, flags | CNTR_32BIT, \
+         dev_access_u32_csr)
+
+/* DC */
+#define DC_PERF_CNTR(name, counter, flags) \
+CNTR_ELEM(#name, \
+         counter, \
+         0, \
+         flags, \
+         dev_access_u64_csr)
+
+#define DC_PERF_CNTR_LCB(name, counter, flags) \
+CNTR_ELEM(#name, \
+         counter, \
+         0, \
+         flags, \
+         dc_access_lcb_cntr)
+
+/* ibp counters */
+#define SW_IBP_CNTR(name, cntr) \
+CNTR_ELEM(#name, \
+         0, \
+         0, \
+         CNTR_SYNTH, \
+         access_ibp_##cntr)
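To make the layout concrete, a purely illustrative expansion of one of the 32-bit RXE helpers ("Foo" and n are placeholders): RXE32_DEV_CNTR_ELEM(Foo, n, flags) becomes

CNTR_ELEM("Foo",
          (n * 8 + RCV_COUNTER_ARRAY32),        /* 8-byte slot per counter */
          0, flags | CNTR_32BIT,
          dev_access_u32_csr)

i.e. each counter occupies an 8-byte slot in its CSR array and is read through the matching dev/port accessor defined below.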
+
+u64 read_csr(const struct hfi1_devdata *dd, u32 offset)
+{
+       if (dd->flags & HFI1_PRESENT)
+               return readq((void __iomem *)dd->kregbase + offset);
+       return -1;
+}
+
+void write_csr(const struct hfi1_devdata *dd, u32 offset, u64 value)
+{
+       if (dd->flags & HFI1_PRESENT)
+               writeq(value, (void __iomem *)dd->kregbase + offset);
+}
+
+void __iomem *get_csr_addr(struct hfi1_devdata *dd, u32 offset)
+{
+       return (void __iomem *)dd->kregbase + offset;
+}
+
+static inline u64 read_write_csr(const struct hfi1_devdata *dd, u32 csr,
+                                int mode, u64 value)
+{
+       u64 ret;
+
+       if (mode == CNTR_MODE_R) {
+               ret = read_csr(dd, csr);
+       } else if (mode == CNTR_MODE_W) {
+               write_csr(dd, csr, value);
+               ret = value;
+       } else {
+               dd_dev_err(dd, "Invalid cntr register access mode");
+               return 0;
+       }
+
+       hfi1_cdbg(CNTR, "csr 0x%x val 0x%llx mode %d", csr, ret, mode);
+       return ret;
+}
+
+/* Dev Access */
+static u64 dev_access_u32_csr(const struct cntr_entry *entry,
+                             void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = context;
+       u64 csr = entry->csr;
+
+       if (entry->flags & CNTR_SDMA) {
+               if (vl == CNTR_INVALID_VL)
+                       return 0;
+               csr += 0x100 * vl;
+       } else {
+               if (vl != CNTR_INVALID_VL)
+                       return 0;
+       }
+       return read_write_csr(dd, csr, mode, data);
+}
+
+static u64 access_sde_err_cnt(const struct cntr_entry *entry,
+                             void *context, int idx, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       if (dd->per_sdma && idx < dd->num_sdma)
+               return dd->per_sdma[idx].err_cnt;
+       return 0;
+}
+
+static u64 access_sde_int_cnt(const struct cntr_entry *entry,
+                             void *context, int idx, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       if (dd->per_sdma && idx < dd->num_sdma)
+               return dd->per_sdma[idx].sdma_int_cnt;
+       return 0;
+}
+
+static u64 access_sde_idle_int_cnt(const struct cntr_entry *entry,
+                                  void *context, int idx, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       if (dd->per_sdma && idx < dd->num_sdma)
+               return dd->per_sdma[idx].idle_int_cnt;
+       return 0;
+}
+
+static u64 access_sde_progress_int_cnt(const struct cntr_entry *entry,
+                                      void *context, int idx, int mode,
+                                      u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       if (dd->per_sdma && idx < dd->num_sdma)
+               return dd->per_sdma[idx].progress_int_cnt;
+       return 0;
+}
+
+static u64 dev_access_u64_csr(const struct cntr_entry *entry, void *context,
+                             int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = context;
+
+       u64 val = 0;
+       u64 csr = entry->csr;
+
+       if (entry->flags & CNTR_VL) {
+               if (vl == CNTR_INVALID_VL)
+                       return 0;
+               csr += 8 * vl;
+       } else {
+               if (vl != CNTR_INVALID_VL)
+                       return 0;
+       }
+
+       val = read_write_csr(dd, csr, mode, data);
+       return val;
+}
+
+static u64 dc_access_lcb_cntr(const struct cntr_entry *entry, void *context,
+                             int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = context;
+       u32 csr = entry->csr;
+       int ret = 0;
+
+       if (vl != CNTR_INVALID_VL)
+               return 0;
+       if (mode == CNTR_MODE_R)
+               ret = read_lcb_csr(dd, csr, &data);
+       else if (mode == CNTR_MODE_W)
+               ret = write_lcb_csr(dd, csr, data);
+
+       if (ret) {
+               dd_dev_err(dd, "Could not acquire LCB for counter 0x%x", csr);
+               return 0;
+       }
+
+       hfi1_cdbg(CNTR, "csr 0x%x val 0x%llx mode %d", csr, data, mode);
+       return data;
+}
+
+/* Port Access */
+static u64 port_access_u32_csr(const struct cntr_entry *entry, void *context,
+                              int vl, int mode, u64 data)
+{
+       struct hfi1_pportdata *ppd = context;
+
+       if (vl != CNTR_INVALID_VL)
+               return 0;
+       return read_write_csr(ppd->dd, entry->csr, mode, data);
+}
+
+static u64 port_access_u64_csr(const struct cntr_entry *entry,
+                              void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_pportdata *ppd = context;
+       u64 val;
+       u64 csr = entry->csr;
+
+       if (entry->flags & CNTR_VL) {
+               if (vl == CNTR_INVALID_VL)
+                       return 0;
+               csr += 8 * vl;
+       } else {
+               if (vl != CNTR_INVALID_VL)
+                       return 0;
+       }
+       val = read_write_csr(ppd->dd, csr, mode, data);
+       return val;
+}
+
+/* Software defined */
+static inline u64 read_write_sw(struct hfi1_devdata *dd, u64 *cntr, int mode,
+                               u64 data)
+{
+       u64 ret;
+
+       if (mode == CNTR_MODE_R) {
+               ret = *cntr;
+       } else if (mode == CNTR_MODE_W) {
+               *cntr = data;
+               ret = data;
+       } else {
+               dd_dev_err(dd, "Invalid cntr sw access mode");
+               return 0;
+       }
+
+       hfi1_cdbg(CNTR, "val 0x%llx mode %d", ret, mode);
+
+       return ret;
+}
+
+static u64 access_sw_link_dn_cnt(const struct cntr_entry *entry, void *context,
+                                int vl, int mode, u64 data)
+{
+       struct hfi1_pportdata *ppd = context;
+
+       if (vl != CNTR_INVALID_VL)
+               return 0;
+       return read_write_sw(ppd->dd, &ppd->link_downed, mode, data);
+}
+
+static u64 access_sw_link_up_cnt(const struct cntr_entry *entry, void *context,
+                                int vl, int mode, u64 data)
+{
+       struct hfi1_pportdata *ppd = context;
+
+       if (vl != CNTR_INVALID_VL)
+               return 0;
+       return read_write_sw(ppd->dd, &ppd->link_up, mode, data);
+}
+
+static u64 access_sw_unknown_frame_cnt(const struct cntr_entry *entry,
+                                      void *context, int vl, int mode,
+                                      u64 data)
+{
+       struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
+
+       if (vl != CNTR_INVALID_VL)
+               return 0;
+       return read_write_sw(ppd->dd, &ppd->unknown_frame_count, mode, data);
+}
+
+static u64 access_sw_xmit_discards(const struct cntr_entry *entry,
+                                  void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
+       u64 zero = 0;
+       u64 *counter;
+
+       if (vl == CNTR_INVALID_VL)
+               counter = &ppd->port_xmit_discards;
+       else if (vl >= 0 && vl < C_VL_COUNT)
+               counter = &ppd->port_xmit_discards_vl[vl];
+       else
+               counter = &zero;
+
+       return read_write_sw(ppd->dd, counter, mode, data);
+}
+
+static u64 access_xmit_constraint_errs(const struct cntr_entry *entry,
+                                      void *context, int vl, int mode,
+                                      u64 data)
+{
+       struct hfi1_pportdata *ppd = context;
+
+       if (vl != CNTR_INVALID_VL)
+               return 0;
+
+       return read_write_sw(ppd->dd, &ppd->port_xmit_constraint_errors,
+                            mode, data);
+}
+
+static u64 access_rcv_constraint_errs(const struct cntr_entry *entry,
+                                     void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_pportdata *ppd = context;
+
+       if (vl != CNTR_INVALID_VL)
+               return 0;
+
+       return read_write_sw(ppd->dd, &ppd->port_rcv_constraint_errors,
+                            mode, data);
+}
+
+u64 get_all_cpu_total(u64 __percpu *cntr)
+{
+       int cpu;
+       u64 counter = 0;
+
+       for_each_possible_cpu(cpu)
+               counter += *per_cpu_ptr(cntr, cpu);
+       return counter;
+}
+
+static u64 read_write_cpu(struct hfi1_devdata *dd, u64 *z_val,
+                         u64 __percpu *cntr,
+                         int vl, int mode, u64 data)
+{
+       u64 ret = 0;
+
+       if (vl != CNTR_INVALID_VL)
+               return 0;
+
+       if (mode == CNTR_MODE_R) {
+               ret = get_all_cpu_total(cntr) - *z_val;
+       } else if (mode == CNTR_MODE_W) {
+               /* A write can only zero the counter */
+               if (data == 0)
+                       *z_val = get_all_cpu_total(cntr);
+               else
+                       dd_dev_err(dd, "Per CPU cntrs can only be zeroed");
+       } else {
+               dd_dev_err(dd, "Invalid cntr sw cpu access mode");
+               return 0;
+       }
+
+       return ret;
+}
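The zeroing scheme above never modifies the per-CPU counters themselves; only the baseline snapshot *z_val moves. A worked example:

/*
 * total over all CPUs = 100, *z_val = 0    -> read returns 100
 * write 0 (request to zero the counter)    -> *z_val becomes 100
 * total later grows to 130                 -> read returns 30
 */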
+
+static u64 access_sw_cpu_intr(const struct cntr_entry *entry,
+                             void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = context;
+
+       return read_write_cpu(dd, &dd->z_int_counter, dd->int_counter, vl,
+                             mode, data);
+}
+
+static u64 access_sw_cpu_rcv_limit(const struct cntr_entry *entry,
+                                  void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = context;
+
+       return read_write_cpu(dd, &dd->z_rcv_limit, dd->rcv_limit, vl,
+                             mode, data);
+}
+
+static u64 access_sw_pio_wait(const struct cntr_entry *entry,
+                             void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = context;
+
+       return dd->verbs_dev.n_piowait;
+}
+
+static u64 access_sw_pio_drain(const struct cntr_entry *entry,
+                              void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->verbs_dev.n_piodrain;
+}
+
+static u64 access_sw_vtx_wait(const struct cntr_entry *entry,
+                             void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = context;
+
+       return dd->verbs_dev.n_txwait;
+}
+
+static u64 access_sw_kmem_wait(const struct cntr_entry *entry,
+                              void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = context;
+
+       return dd->verbs_dev.n_kmem_wait;
+}
+
+static u64 access_sw_send_schedule(const struct cntr_entry *entry,
+                                  void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return read_write_cpu(dd, &dd->z_send_schedule, dd->send_schedule, vl,
+                             mode, data);
+}
+
+/* Software counters for the error status bits within MISC_ERR_STATUS */
+static u64 access_misc_pll_lock_fail_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->misc_err_status_cnt[12];
+}
+
+static u64 access_misc_mbist_fail_err_cnt(const struct cntr_entry *entry,
+                                         void *context, int vl, int mode,
+                                         u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->misc_err_status_cnt[11];
+}
+
+static u64 access_misc_invalid_eep_cmd_err_cnt(const struct cntr_entry *entry,
+                                              void *context, int vl, int mode,
+                                              u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->misc_err_status_cnt[10];
+}
+
+static u64 access_misc_efuse_done_parity_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->misc_err_status_cnt[9];
+}
+
+static u64 access_misc_efuse_write_err_cnt(const struct cntr_entry *entry,
+                                          void *context, int vl, int mode,
+                                          u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->misc_err_status_cnt[8];
+}
+
+static u64 access_misc_efuse_read_bad_addr_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->misc_err_status_cnt[7];
+}
+
+static u64 access_misc_efuse_csr_parity_err_cnt(const struct cntr_entry *entry,
+                                               void *context, int vl,
+                                               int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->misc_err_status_cnt[6];
+}
+
+static u64 access_misc_fw_auth_failed_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->misc_err_status_cnt[5];
+}
+
+static u64 access_misc_key_mismatch_err_cnt(const struct cntr_entry *entry,
+                                           void *context, int vl, int mode,
+                                           u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->misc_err_status_cnt[4];
+}
+
+static u64 access_misc_sbus_write_failed_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->misc_err_status_cnt[3];
+}
+
+static u64 access_misc_csr_write_bad_addr_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->misc_err_status_cnt[2];
+}
+
+static u64 access_misc_csr_read_bad_addr_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->misc_err_status_cnt[1];
+}
+
+static u64 access_misc_csr_parity_err_cnt(const struct cntr_entry *entry,
+                                         void *context, int vl, int mode,
+                                         u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->misc_err_status_cnt[0];
+}
+
+/*
+ * Software counter for the aggregate of
+ * individual CceErrStatus counters
+ */
+static u64 access_sw_cce_err_status_aggregated_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->sw_cce_err_status_aggregate;
+}
+
+/*
+ * Software counters corresponding to each of the
+ * error status bits within CceErrStatus
+ */
+static u64 access_cce_msix_csr_parity_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[40];
+}
+
+static u64 access_cce_int_map_unc_err_cnt(const struct cntr_entry *entry,
+                                         void *context, int vl, int mode,
+                                         u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[39];
+}
+
+static u64 access_cce_int_map_cor_err_cnt(const struct cntr_entry *entry,
+                                         void *context, int vl, int mode,
+                                         u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[38];
+}
+
+static u64 access_cce_msix_table_unc_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[37];
+}
+
+static u64 access_cce_msix_table_cor_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[36];
+}
+
+static u64 access_cce_rxdma_conv_fifo_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[35];
+}
+
+static u64 access_cce_rcpl_async_fifo_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[34];
+}
+
+static u64 access_cce_seg_write_bad_addr_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[33];
+}
+
+static u64 access_cce_seg_read_bad_addr_err_cnt(const struct cntr_entry *entry,
+                                               void *context, int vl, int mode,
+                                               u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[32];
+}
+
+static u64 access_la_triggered_cnt(const struct cntr_entry *entry,
+                                  void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[31];
+}
+
+static u64 access_cce_trgt_cpl_timeout_err_cnt(const struct cntr_entry *entry,
+                                              void *context, int vl, int mode,
+                                              u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[30];
+}
+
+static u64 access_pcic_receive_parity_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[29];
+}
+
+static u64 access_pcic_transmit_back_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[28];
+}
+
+static u64 access_pcic_transmit_front_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[27];
+}
+
+static u64 access_pcic_cpl_dat_q_unc_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[26];
+}
+
+static u64 access_pcic_cpl_hd_q_unc_err_cnt(const struct cntr_entry *entry,
+                                           void *context, int vl, int mode,
+                                           u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[25];
+}
+
+static u64 access_pcic_post_dat_q_unc_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[24];
+}
+
+static u64 access_pcic_post_hd_q_unc_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[23];
+}
+
+static u64 access_pcic_retry_sot_mem_unc_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[22];
+}
+
+static u64 access_pcic_retry_mem_unc_err(const struct cntr_entry *entry,
+                                        void *context, int vl, int mode,
+                                        u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[21];
+}
+
+static u64 access_pcic_n_post_dat_q_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[20];
+}
+
+static u64 access_pcic_n_post_h_q_parity_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[19];
+}
+
+static u64 access_pcic_cpl_dat_q_cor_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[18];
+}
+
+static u64 access_pcic_cpl_hd_q_cor_err_cnt(const struct cntr_entry *entry,
+                                           void *context, int vl, int mode,
+                                           u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[17];
+}
+
+static u64 access_pcic_post_dat_q_cor_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[16];
+}
+
+static u64 access_pcic_post_hd_q_cor_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[15];
+}
+
+static u64 access_pcic_retry_sot_mem_cor_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[14];
+}
+
+static u64 access_pcic_retry_mem_cor_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[13];
+}
+
+static u64 access_cce_cli1_async_fifo_dbg_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[12];
+}
+
+static u64 access_cce_cli1_async_fifo_rxdma_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[11];
+}
+
+static u64 access_cce_cli1_async_fifo_sdma_hd_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[10];
+}
+
+static u64 access_cce_cl1_async_fifo_pio_crdt_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[9];
+}
+
+static u64 access_cce_cli2_async_fifo_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[8];
+}
+
+static u64 access_cce_csr_cfg_bus_parity_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[7];
+}
+
+static u64 access_cce_cli0_async_fifo_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[6];
+}
+
+static u64 access_cce_rspd_data_parity_err_cnt(const struct cntr_entry *entry,
+                                              void *context, int vl, int mode,
+                                              u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[5];
+}
+
+static u64 access_cce_trgt_access_err_cnt(const struct cntr_entry *entry,
+                                         void *context, int vl, int mode,
+                                         u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[4];
+}
+
+static u64 access_cce_trgt_async_fifo_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[3];
+}
+
+static u64 access_cce_csr_write_bad_addr_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[2];
+}
+
+static u64 access_cce_csr_read_bad_addr_err_cnt(const struct cntr_entry *entry,
+                                               void *context, int vl,
+                                               int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[1];
+}
+
+static u64 access_ccs_csr_parity_err_cnt(const struct cntr_entry *entry,
+                                        void *context, int vl, int mode,
+                                        u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->cce_err_status_cnt[0];
+}
+
+/*
+ * Software counters corresponding to each of the
+ * error status bits within RcvErrStatus
+ */
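+/*
+ * Each accessor below returns rcv_err_status_cnt[N], where N is the bit
+ * position of the corresponding error in RcvErrStatus (bits 63 down to
+ * 0).  All of these helpers share the cntr_entry access-callback
+ * prototype, but only the context argument (the hfi1_devdata) is used.
+ */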
+static u64 access_rx_csr_parity_err_cnt(const struct cntr_entry *entry,
+                                       void *context, int vl, int mode,
+                                       u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[63];
+}
+
+static u64 access_rx_csr_write_bad_addr_err_cnt(const struct cntr_entry *entry,
+                                               void *context, int vl,
+                                               int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[62];
+}
+
+static u64 access_rx_csr_read_bad_addr_err_cnt(const struct cntr_entry *entry,
+                                              void *context, int vl, int mode,
+                                              u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[61];
+}
+
+static u64 access_rx_dma_csr_unc_err_cnt(const struct cntr_entry *entry,
+                                        void *context, int vl, int mode,
+                                        u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[60];
+}
+
+static u64 access_rx_dma_dq_fsm_encoding_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[59];
+}
+
+static u64 access_rx_dma_eq_fsm_encoding_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[58];
+}
+
+static u64 access_rx_dma_csr_parity_err_cnt(const struct cntr_entry *entry,
+                                           void *context, int vl, int mode,
+                                           u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[57];
+}
+
+static u64 access_rx_rbuf_data_cor_err_cnt(const struct cntr_entry *entry,
+                                          void *context, int vl, int mode,
+                                          u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[56];
+}
+
+static u64 access_rx_rbuf_data_unc_err_cnt(const struct cntr_entry *entry,
+                                          void *context, int vl, int mode,
+                                          u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[55];
+}
+
+static u64 access_rx_dma_data_fifo_rd_cor_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[54];
+}
+
+static u64 access_rx_dma_data_fifo_rd_unc_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[53];
+}
+
+static u64 access_rx_dma_hdr_fifo_rd_cor_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[52];
+}
+
+static u64 access_rx_dma_hdr_fifo_rd_unc_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[51];
+}
+
+static u64 access_rx_rbuf_desc_part2_cor_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[50];
+}
+
+static u64 access_rx_rbuf_desc_part2_unc_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[49];
+}
+
+static u64 access_rx_rbuf_desc_part1_cor_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[48];
+}
+
+static u64 access_rx_rbuf_desc_part1_unc_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[47];
+}
+
+static u64 access_rx_hq_intr_fsm_err_cnt(const struct cntr_entry *entry,
+                                        void *context, int vl, int mode,
+                                        u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[46];
+}
+
+static u64 access_rx_hq_intr_csr_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[45];
+}
+
+static u64 access_rx_lookup_csr_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[44];
+}
+
+static u64 access_rx_lookup_rcv_array_cor_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[43];
+}
+
+static u64 access_rx_lookup_rcv_array_unc_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[42];
+}
+
+static u64 access_rx_lookup_des_part2_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[41];
+}
+
+static u64 access_rx_lookup_des_part1_unc_cor_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[40];
+}
+
+static u64 access_rx_lookup_des_part1_unc_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[39];
+}
+
+static u64 access_rx_rbuf_next_free_buf_cor_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[38];
+}
+
+static u64 access_rx_rbuf_next_free_buf_unc_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[37];
+}
+
+static u64 access_rbuf_fl_init_wr_addr_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[36];
+}
+
+static u64 access_rx_rbuf_fl_initdone_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[35];
+}
+
+static u64 access_rx_rbuf_fl_write_addr_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[34];
+}
+
+static u64 access_rx_rbuf_fl_rd_addr_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[33];
+}
+
+static u64 access_rx_rbuf_empty_err_cnt(const struct cntr_entry *entry,
+                                       void *context, int vl, int mode,
+                                       u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[32];
+}
+
+static u64 access_rx_rbuf_full_err_cnt(const struct cntr_entry *entry,
+                                      void *context, int vl, int mode,
+                                      u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[31];
+}
+
+static u64 access_rbuf_bad_lookup_err_cnt(const struct cntr_entry *entry,
+                                         void *context, int vl, int mode,
+                                         u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[30];
+}
+
+static u64 access_rbuf_ctx_id_parity_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[29];
+}
+
+static u64 access_rbuf_csr_qeopdw_parity_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[28];
+}
+
+static u64 access_rx_rbuf_csr_q_num_of_pkt_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[27];
+}
+
+static u64 access_rx_rbuf_csr_q_t1_ptr_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[26];
+}
+
+static u64 access_rx_rbuf_csr_q_hd_ptr_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[25];
+}
+
+static u64 access_rx_rbuf_csr_q_vld_bit_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[24];
+}
+
+static u64 access_rx_rbuf_csr_q_next_buf_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[23];
+}
+
+static u64 access_rx_rbuf_csr_q_ent_cnt_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[22];
+}
+
+static u64 access_rx_rbuf_csr_q_head_buf_num_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[21];
+}
+
+static u64 access_rx_rbuf_block_list_read_cor_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[20];
+}
+
+static u64 access_rx_rbuf_block_list_read_unc_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[19];
+}
+
+static u64 access_rx_rbuf_lookup_des_cor_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[18];
+}
+
+static u64 access_rx_rbuf_lookup_des_unc_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[17];
+}
+
+static u64 access_rx_rbuf_lookup_des_reg_unc_cor_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[16];
+}
+
+static u64 access_rx_rbuf_lookup_des_reg_unc_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[15];
+}
+
+static u64 access_rx_rbuf_free_list_cor_err_cnt(const struct cntr_entry *entry,
+                                               void *context, int vl,
+                                               int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[14];
+}
+
+static u64 access_rx_rbuf_free_list_unc_err_cnt(const struct cntr_entry *entry,
+                                               void *context, int vl,
+                                               int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[13];
+}
+
+static u64 access_rx_rcv_fsm_encoding_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[12];
+}
+
+static u64 access_rx_dma_flag_cor_err_cnt(const struct cntr_entry *entry,
+                                         void *context, int vl, int mode,
+                                         u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[11];
+}
+
+static u64 access_rx_dma_flag_unc_err_cnt(const struct cntr_entry *entry,
+                                         void *context, int vl, int mode,
+                                         u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[10];
+}
+
+static u64 access_rx_dc_sop_eop_parity_err_cnt(const struct cntr_entry *entry,
+                                              void *context, int vl, int mode,
+                                              u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[9];
+}
+
+static u64 access_rx_rcv_csr_parity_err_cnt(const struct cntr_entry *entry,
+                                           void *context, int vl, int mode,
+                                           u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[8];
+}
+
+static u64 access_rx_rcv_qp_map_table_cor_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[7];
+}
+
+static u64 access_rx_rcv_qp_map_table_unc_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[6];
+}
+
+static u64 access_rx_rcv_data_cor_err_cnt(const struct cntr_entry *entry,
+                                         void *context, int vl, int mode,
+                                         u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[5];
+}
+
+static u64 access_rx_rcv_data_unc_err_cnt(const struct cntr_entry *entry,
+                                         void *context, int vl, int mode,
+                                         u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[4];
+}
+
+static u64 access_rx_rcv_hdr_cor_err_cnt(const struct cntr_entry *entry,
+                                        void *context, int vl, int mode,
+                                        u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[3];
+}
+
+static u64 access_rx_rcv_hdr_unc_err_cnt(const struct cntr_entry *entry,
+                                        void *context, int vl, int mode,
+                                        u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[2];
+}
+
+static u64 access_rx_dc_intf_parity_err_cnt(const struct cntr_entry *entry,
+                                           void *context, int vl, int mode,
+                                           u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[1];
+}
+
+static u64 access_rx_dma_csr_cor_err_cnt(const struct cntr_entry *entry,
+                                        void *context, int vl, int mode,
+                                        u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->rcv_err_status_cnt[0];
+}
+
+/*
+ * Software counters corresponding to each of the
+ * error status bits within SendPioErrStatus
+ */
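+/*
+ * These return send_pio_err_status_cnt[N] for SendPioErrStatus bits 35
+ * down to 0; reserved bits 31 and 30 get accessors as well, keeping the
+ * bit-to-index mapping contiguous.
+ */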
+static u64 access_pio_pec_sop_head_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[35];
+}
+
+static u64 access_pio_pcc_sop_head_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[34];
+}
+
+static u64 access_pio_last_returned_cnt_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[33];
+}
+
+static u64 access_pio_current_free_cnt_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[32];
+}
+
+static u64 access_pio_reserved_31_err_cnt(const struct cntr_entry *entry,
+                                         void *context, int vl, int mode,
+                                         u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[31];
+}
+
+static u64 access_pio_reserved_30_err_cnt(const struct cntr_entry *entry,
+                                         void *context, int vl, int mode,
+                                         u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[30];
+}
+
+static u64 access_pio_ppmc_sop_len_err_cnt(const struct cntr_entry *entry,
+                                          void *context, int vl, int mode,
+                                          u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[29];
+}
+
+static u64 access_pio_ppmc_bqc_mem_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[28];
+}
+
+static u64 access_pio_vl_fifo_parity_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[27];
+}
+
+static u64 access_pio_vlf_sop_parity_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[26];
+}
+
+static u64 access_pio_vlf_v1_len_parity_err_cnt(const struct cntr_entry *entry,
+                                               void *context, int vl,
+                                               int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[25];
+}
+
+static u64 access_pio_block_qw_count_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[24];
+}
+
+static u64 access_pio_write_qw_valid_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[23];
+}
+
+static u64 access_pio_state_machine_err_cnt(const struct cntr_entry *entry,
+                                           void *context, int vl, int mode,
+                                           u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[22];
+}
+
+static u64 access_pio_write_data_parity_err_cnt(const struct cntr_entry *entry,
+                                               void *context, int vl,
+                                               int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[21];
+}
+
+static u64 access_pio_host_addr_mem_cor_err_cnt(const struct cntr_entry *entry,
+                                               void *context, int vl,
+                                               int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[20];
+}
+
+static u64 access_pio_host_addr_mem_unc_err_cnt(const struct cntr_entry *entry,
+                                               void *context, int vl,
+                                               int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[19];
+}
+
+static u64 access_pio_pkt_evict_sm_or_arb_sm_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[18];
+}
+
+static u64 access_pio_init_sm_in_err_cnt(const struct cntr_entry *entry,
+                                        void *context, int vl, int mode,
+                                        u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[17];
+}
+
+static u64 access_pio_ppmc_pbl_fifo_err_cnt(const struct cntr_entry *entry,
+                                           void *context, int vl, int mode,
+                                           u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[16];
+}
+
+static u64 access_pio_credit_ret_fifo_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[15];
+}
+
+static u64 access_pio_v1_len_mem_bank1_cor_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[14];
+}
+
+static u64 access_pio_v1_len_mem_bank0_cor_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[13];
+}
+
+static u64 access_pio_v1_len_mem_bank1_unc_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[12];
+}
+
+static u64 access_pio_v1_len_mem_bank0_unc_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[11];
+}
+
+static u64 access_pio_sm_pkt_reset_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[10];
+}
+
+static u64 access_pio_pkt_evict_fifo_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[9];
+}
+
+static u64 access_pio_sbrdctrl_crrel_fifo_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[8];
+}
+
+static u64 access_pio_sbrdctl_crrel_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[7];
+}
+
+static u64 access_pio_pec_fifo_parity_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[6];
+}
+
+static u64 access_pio_pcc_fifo_parity_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[5];
+}
+
+static u64 access_pio_sb_mem_fifo1_err_cnt(const struct cntr_entry *entry,
+                                          void *context, int vl, int mode,
+                                          u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[4];
+}
+
+static u64 access_pio_sb_mem_fifo0_err_cnt(const struct cntr_entry *entry,
+                                          void *context, int vl, int mode,
+                                          u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[3];
+}
+
+static u64 access_pio_csr_parity_err_cnt(const struct cntr_entry *entry,
+                                        void *context, int vl, int mode,
+                                        u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[2];
+}
+
+static u64 access_pio_write_addr_parity_err_cnt(const struct cntr_entry *entry,
+                                               void *context, int vl,
+                                               int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[1];
+}
+
+static u64 access_pio_write_bad_ctxt_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_pio_err_status_cnt[0];
+}
+
+/*
+ * Software counters corresponding to each of the
+ * error status bits within SendDmaErrStatus
+ */
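+/*
+ * Four counters here, one per SendDmaErrStatus bit:
+ * send_dma_err_status_cnt[3..0].
+ */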
+static u64 access_sdma_pcie_req_tracking_cor_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_dma_err_status_cnt[3];
+}
+
+static u64 access_sdma_pcie_req_tracking_unc_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_dma_err_status_cnt[2];
+}
+
+static u64 access_sdma_csr_parity_err_cnt(const struct cntr_entry *entry,
+                                         void *context, int vl, int mode,
+                                         u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_dma_err_status_cnt[1];
+}
+
+static u64 access_sdma_rpy_tag_err_cnt(const struct cntr_entry *entry,
+                                      void *context, int vl, int mode,
+                                      u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_dma_err_status_cnt[0];
+}
+
+/*
+ * Software counters corresponding to each of the
+ * error status bits within SendEgressErrStatus
+ */
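+/*
+ * These cover SendEgressErrStatus bits 63 down to 0 via
+ * send_egress_err_status_cnt[]; the reserved bits (10, 9, 6 and 2) are
+ * counted too, so the bit-to-index mapping stays one-to-one.
+ */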
+static u64 access_tx_read_pio_memory_csr_unc_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[63];
+}
+
+static u64 access_tx_read_sdma_memory_csr_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[62];
+}
+
+static u64 access_tx_egress_fifo_cor_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[61];
+}
+
+static u64 access_tx_read_pio_memory_cor_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[60];
+}
+
+static u64 access_tx_read_sdma_memory_cor_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[59];
+}
+
+static u64 access_tx_sb_hdr_cor_err_cnt(const struct cntr_entry *entry,
+                                       void *context, int vl, int mode,
+                                       u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[58];
+}
+
+static u64 access_tx_credit_overrun_err_cnt(const struct cntr_entry *entry,
+                                           void *context, int vl, int mode,
+                                           u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[57];
+}
+
+static u64 access_tx_launch_fifo8_cor_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[56];
+}
+
+static u64 access_tx_launch_fifo7_cor_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[55];
+}
+
+static u64 access_tx_launch_fifo6_cor_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[54];
+}
+
+static u64 access_tx_launch_fifo5_cor_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[53];
+}
+
+static u64 access_tx_launch_fifo4_cor_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[52];
+}
+
+static u64 access_tx_launch_fifo3_cor_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[51];
+}
+
+static u64 access_tx_launch_fifo2_cor_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[50];
+}
+
+static u64 access_tx_launch_fifo1_cor_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[49];
+}
+
+static u64 access_tx_launch_fifo0_cor_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[48];
+}
+
+static u64 access_tx_credit_return_vl_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[47];
+}
+
+static u64 access_tx_hcrc_insertion_err_cnt(const struct cntr_entry *entry,
+                                           void *context, int vl, int mode,
+                                           u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[46];
+}
+
+static u64 access_tx_egress_fifo_unc_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[45];
+}
+
+static u64 access_tx_read_pio_memory_unc_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[44];
+}
+
+static u64 access_tx_read_sdma_memory_unc_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[43];
+}
+
+static u64 access_tx_sb_hdr_unc_err_cnt(const struct cntr_entry *entry,
+                                       void *context, int vl, int mode,
+                                       u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[42];
+}
+
+static u64 access_tx_credit_return_partiy_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[41];
+}
+
+static u64 access_tx_launch_fifo8_unc_or_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[40];
+}
+
+static u64 access_tx_launch_fifo7_unc_or_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[39];
+}
+
+static u64 access_tx_launch_fifo6_unc_or_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[38];
+}
+
+static u64 access_tx_launch_fifo5_unc_or_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[37];
+}
+
+static u64 access_tx_launch_fifo4_unc_or_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[36];
+}
+
+static u64 access_tx_launch_fifo3_unc_or_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[35];
+}
+
+static u64 access_tx_launch_fifo2_unc_or_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[34];
+}
+
+static u64 access_tx_launch_fifo1_unc_or_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[33];
+}
+
+static u64 access_tx_launch_fifo0_unc_or_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[32];
+}
+
+static u64 access_tx_sdma15_disallowed_packet_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[31];
+}
+
+static u64 access_tx_sdma14_disallowed_packet_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[30];
+}
+
+static u64 access_tx_sdma13_disallowed_packet_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[29];
+}
+
+static u64 access_tx_sdma12_disallowed_packet_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[28];
+}
+
+static u64 access_tx_sdma11_disallowed_packet_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[27];
+}
+
+static u64 access_tx_sdma10_disallowed_packet_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[26];
+}
+
+static u64 access_tx_sdma9_disallowed_packet_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[25];
+}
+
+static u64 access_tx_sdma8_disallowed_packet_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[24];
+}
+
+static u64 access_tx_sdma7_disallowed_packet_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[23];
+}
+
+static u64 access_tx_sdma6_disallowed_packet_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[22];
+}
+
+static u64 access_tx_sdma5_disallowed_packet_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[21];
+}
+
+static u64 access_tx_sdma4_disallowed_packet_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[20];
+}
+
+static u64 access_tx_sdma3_disallowed_packet_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[19];
+}
+
+static u64 access_tx_sdma2_disallowed_packet_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[18];
+}
+
+static u64 access_tx_sdma1_disallowed_packet_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[17];
+}
+
+static u64 access_tx_sdma0_disallowed_packet_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[16];
+}
+
+static u64 access_tx_config_parity_err_cnt(const struct cntr_entry *entry,
+                                          void *context, int vl, int mode,
+                                          u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[15];
+}
+
+static u64 access_tx_sbrd_ctl_csr_parity_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[14];
+}
+
+static u64 access_tx_launch_csr_parity_err_cnt(const struct cntr_entry *entry,
+                                              void *context, int vl, int mode,
+                                              u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[13];
+}
+
+static u64 access_tx_illegal_vl_err_cnt(const struct cntr_entry *entry,
+                                       void *context, int vl, int mode,
+                                       u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[12];
+}
+
+static u64 access_tx_sbrd_ctl_state_machine_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[11];
+}
+
+static u64 access_egress_reserved_10_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[10];
+}
+
+static u64 access_egress_reserved_9_err_cnt(const struct cntr_entry *entry,
+                                           void *context, int vl, int mode,
+                                           u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[9];
+}
+
+static u64 access_tx_sdma_launch_intf_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[8];
+}
+
+static u64 access_tx_pio_launch_intf_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[7];
+}
+
+static u64 access_egress_reserved_6_err_cnt(const struct cntr_entry *entry,
+                                           void *context, int vl, int mode,
+                                           u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[6];
+}
+
+static u64 access_tx_incorrect_link_state_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[5];
+}
+
+static u64 access_tx_linkdown_err_cnt(const struct cntr_entry *entry,
+                                     void *context, int vl, int mode,
+                                     u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[4];
+}
+
+static u64 access_tx_egress_fifi_underrun_or_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[3];
+}
+
+static u64 access_egress_reserved_2_err_cnt(const struct cntr_entry *entry,
+                                           void *context, int vl, int mode,
+                                           u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[2];
+}
+
+static u64 access_tx_pkt_integrity_mem_unc_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[1];
+}
+
+static u64 access_tx_pkt_integrity_mem_cor_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_egress_err_status_cnt[0];
+}
+
+/*
+ * Software counters corresponding to each of the
+ * error status bits within SendErrStatus
+ */
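+/* Three counters, one per SendErrStatus bit: send_err_status_cnt[2..0]. */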
+static u64 access_send_csr_write_bad_addr_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_err_status_cnt[2];
+}
+
+static u64 access_send_csr_read_bad_addr_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_err_status_cnt[1];
+}
+
+static u64 access_send_csr_parity_cnt(const struct cntr_entry *entry,
+                                     void *context, int vl, int mode,
+                                     u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->send_err_status_cnt[0];
+}
+
+/*
+ * Software counters corresponding to each of the
+ * error status bits within SendCtxtErrStatus
+ */
+static u64 access_pio_write_out_of_bounds_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->sw_ctxt_err_status_cnt[4];
+}
+
+static u64 access_pio_write_overflow_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->sw_ctxt_err_status_cnt[3];
+}
+
+static u64 access_pio_write_crosses_boundary_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->sw_ctxt_err_status_cnt[2];
+}
+
+static u64 access_pio_disallowed_packet_err_cnt(const struct cntr_entry *entry,
+                                               void *context, int vl,
+                                               int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->sw_ctxt_err_status_cnt[1];
+}
+
+static u64 access_pio_inconsistent_sop_err_cnt(const struct cntr_entry *entry,
+                                              void *context, int vl, int mode,
+                                              u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->sw_ctxt_err_status_cnt[0];
+}
+
+/*
+ * Software counters corresponding to each of the
+ * error status bits within SendDmaEngErrStatus
+ */
+static u64 access_sdma_header_request_fifo_cor_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->sw_send_dma_eng_err_status_cnt[23];
+}
+
+static u64 access_sdma_header_storage_cor_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->sw_send_dma_eng_err_status_cnt[22];
+}
+
+static u64 access_sdma_packet_tracking_cor_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->sw_send_dma_eng_err_status_cnt[21];
+}
+
+static u64 access_sdma_assembly_cor_err_cnt(const struct cntr_entry *entry,
+                                           void *context, int vl, int mode,
+                                           u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->sw_send_dma_eng_err_status_cnt[20];
+}
+
+static u64 access_sdma_desc_table_cor_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->sw_send_dma_eng_err_status_cnt[19];
+}
+
+static u64 access_sdma_header_request_fifo_unc_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->sw_send_dma_eng_err_status_cnt[18];
+}
+
+static u64 access_sdma_header_storage_unc_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->sw_send_dma_eng_err_status_cnt[17];
+}
+
+static u64 access_sdma_packet_tracking_unc_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->sw_send_dma_eng_err_status_cnt[16];
+}
+
+static u64 access_sdma_assembly_unc_err_cnt(const struct cntr_entry *entry,
+                                           void *context, int vl, int mode,
+                                           u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->sw_send_dma_eng_err_status_cnt[15];
+}
+
+static u64 access_sdma_desc_table_unc_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->sw_send_dma_eng_err_status_cnt[14];
+}
+
+static u64 access_sdma_timeout_err_cnt(const struct cntr_entry *entry,
+                                      void *context, int vl, int mode,
+                                      u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->sw_send_dma_eng_err_status_cnt[13];
+}
+
+static u64 access_sdma_header_length_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->sw_send_dma_eng_err_status_cnt[12];
+}
+
+static u64 access_sdma_header_address_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->sw_send_dma_eng_err_status_cnt[11];
+}
+
+static u64 access_sdma_header_select_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->sw_send_dma_eng_err_status_cnt[10];
+}
+
+static u64 access_sdma_reserved_9_err_cnt(const struct cntr_entry *entry,
+                                         void *context, int vl, int mode,
+                                         u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->sw_send_dma_eng_err_status_cnt[9];
+}
+
+static u64 access_sdma_packet_desc_overflow_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->sw_send_dma_eng_err_status_cnt[8];
+}
+
+static u64 access_sdma_length_mismatch_err_cnt(const struct cntr_entry *entry,
+                                              void *context, int vl,
+                                              int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->sw_send_dma_eng_err_status_cnt[7];
+}
+
+static u64 access_sdma_halt_err_cnt(const struct cntr_entry *entry,
+                                   void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->sw_send_dma_eng_err_status_cnt[6];
+}
+
+static u64 access_sdma_mem_read_err_cnt(const struct cntr_entry *entry,
+                                       void *context, int vl, int mode,
+                                       u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->sw_send_dma_eng_err_status_cnt[5];
+}
+
+static u64 access_sdma_first_desc_err_cnt(const struct cntr_entry *entry,
+                                         void *context, int vl, int mode,
+                                         u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->sw_send_dma_eng_err_status_cnt[4];
+}
+
+static u64 access_sdma_tail_out_of_bounds_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->sw_send_dma_eng_err_status_cnt[3];
+}
+
+static u64 access_sdma_too_long_err_cnt(const struct cntr_entry *entry,
+                                       void *context, int vl, int mode,
+                                       u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->sw_send_dma_eng_err_status_cnt[2];
+}
+
+static u64 access_sdma_gen_mismatch_err_cnt(const struct cntr_entry *entry,
+                                           void *context, int vl, int mode,
+                                           u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->sw_send_dma_eng_err_status_cnt[1];
+}
+
+static u64 access_sdma_wrong_dw_err_cnt(const struct cntr_entry *entry,
+                                       void *context, int vl, int mode,
+                                       u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->sw_send_dma_eng_err_status_cnt[0];
+}
+
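Note: every accessor above follows one shape: the vl/mode/data arguments are ignored, the opaque context pointer is cast back to the per-device struct hfi1_devdata, and a fixed slot of a software counter array (incremented by the error handlers elsewhere in the driver) is returned. The following is a minimal standalone sketch of that table-driven, fixed-slot pattern; fake_devdata, fake_entry and read_err_slot2 are made-up illustrative names, not the driver's types or functions.

        /* Illustrative sketch only -- not hfi1 code. */
        #include <stdint.h>
        #include <stdio.h>

        struct fake_devdata {
                uint64_t err_status_cnt[4];     /* bumped by an IRQ handler elsewhere */
        };

        struct fake_entry {
                const char *name;
                /* opaque context lets one table serve per-device or per-port data */
                uint64_t (*read)(const struct fake_entry *e, void *context);
        };

        static uint64_t read_err_slot2(const struct fake_entry *e, void *context)
        {
                struct fake_devdata *dd = context;

                return dd->err_status_cnt[2];   /* fixed slot, mirroring the accessors above */
        }

        static const struct fake_entry table[] = {
                { .name = "ErrSlot2", .read = read_err_slot2 },
        };

        int main(void)
        {
                struct fake_devdata dd = { .err_status_cnt = { 0, 0, 7, 0 } };
                size_t i;

                for (i = 0; i < sizeof(table) / sizeof(table[0]); i++)
                        printf("%s = %llu\n", table[i].name,
                               (unsigned long long)table[i].read(&table[i], &dd));
                return 0;
        }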
+#define def_access_sw_cpu(cntr) \
+static u64 access_sw_cpu_##cntr(const struct cntr_entry *entry,                      \
+                             void *context, int vl, int mode, u64 data)      \
+{                                                                            \
+       struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;        \
+       return read_write_cpu(ppd->dd, &ppd->ibport_data.rvp.z_ ##cntr,       \
+                             ppd->ibport_data.rvp.cntr, vl,                  \
+                             mode, data);                                    \
+}
+
+def_access_sw_cpu(rc_acks);
+def_access_sw_cpu(rc_qacks);
+def_access_sw_cpu(rc_delayed_comp);
+
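Note: def_access_sw_cpu() (and def_access_ibp_counter() below) stamp out near-identical accessors with the ## token-pasting operator, so each generated function forwards the named per-port field to the driver's read_write_cpu()/read_write_sw() helpers. A minimal standalone sketch of the same token-pasting idiom follows; DEFINE_READER, port_stats and the field names are invented for illustration and, unlike the real macros, return the field directly instead of calling a helper.

        /* Illustrative sketch only -- not hfi1 code. */
        #include <stdint.h>
        #include <stdio.h>

        struct port_stats {
                uint64_t rc_acks;
                uint64_t rc_qacks;
        };

        /* One macro invocation defines one read_<field>() accessor. */
        #define DEFINE_READER(field)                                    \
        static uint64_t read_##field(const struct port_stats *p)        \
        {                                                               \
                return p->field;                                        \
        }

        DEFINE_READER(rc_acks)
        DEFINE_READER(rc_qacks)

        int main(void)
        {
                struct port_stats p = { .rc_acks = 5, .rc_qacks = 2 };

                printf("rc_acks=%llu rc_qacks=%llu\n",
                       (unsigned long long)read_rc_acks(&p),
                       (unsigned long long)read_rc_qacks(&p));
                return 0;
        }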
+#define def_access_ibp_counter(cntr) \
+static u64 access_ibp_##cntr(const struct cntr_entry *entry,                 \
+                               void *context, int vl, int mode, u64 data)    \
+{                                                                            \
+       struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;        \
+                                                                             \
+       if (vl != CNTR_INVALID_VL)                                            \
+               return 0;                                                     \
+                                                                             \
+       return read_write_sw(ppd->dd, &ppd->ibport_data.rvp.n_ ##cntr,        \
+                            mode, data);                                     \
+}
+
+def_access_ibp_counter(loop_pkts);
+def_access_ibp_counter(rc_resends);
+def_access_ibp_counter(rnr_naks);
+def_access_ibp_counter(other_naks);
+def_access_ibp_counter(rc_timeouts);
+def_access_ibp_counter(pkt_drops);
+def_access_ibp_counter(dmawait);
+def_access_ibp_counter(rc_seqnak);
+def_access_ibp_counter(rc_dupreq);
+def_access_ibp_counter(rdma_seq);
+def_access_ibp_counter(unaligned);
+def_access_ibp_counter(seq_naks);
+
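Note: the dev_cntrs[] table that follows uses C99 designated array initializers, so each counter lands at its enum index (C_RCV_OVF, C_RX_TID_FULL, ...) regardless of the order the entries are written in, and any slot not listed is zero-initialized. A tiny standalone example of that initializer style follows; the enum, struct entry and strings are illustrative only, not the driver's definitions.

        /* Illustrative sketch only -- not hfi1 code. */
        #include <stdio.h>

        enum { C_FIRST, C_SECOND, C_THIRD, C_LAST };

        struct entry {
                const char *name;
                int flags;
        };

        /* Entries may appear in any order; unlisted slots stay zeroed. */
        static const struct entry table[C_LAST] = {
                [C_THIRD] = { .name = "Third", .flags = 1 },
                [C_FIRST] = { .name = "First", .flags = 0 },
        };

        int main(void)
        {
                int i;

                for (i = 0; i < C_LAST; i++)
                        printf("%d: %s\n", i, table[i].name ? table[i].name : "(unset)");
                return 0;
        }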
+static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = {
+[C_RCV_OVF] = RXE32_DEV_CNTR_ELEM(RcvOverflow, RCV_BUF_OVFL_CNT, CNTR_SYNTH),
+[C_RX_TID_FULL] = RXE32_DEV_CNTR_ELEM(RxTIDFullEr, RCV_TID_FULL_ERR_CNT,
+                       CNTR_NORMAL),
+[C_RX_TID_INVALID] = RXE32_DEV_CNTR_ELEM(RxTIDInvalid, RCV_TID_VALID_ERR_CNT,
+                       CNTR_NORMAL),
+[C_RX_TID_FLGMS] = RXE32_DEV_CNTR_ELEM(RxTidFLGMs,
+                       RCV_TID_FLOW_GEN_MISMATCH_CNT,
+                       CNTR_NORMAL),
+[C_RX_CTX_EGRS] = RXE32_DEV_CNTR_ELEM(RxCtxEgrS, RCV_CONTEXT_EGR_STALL,
+                       CNTR_NORMAL),
+[C_RCV_TID_FLSMS] = RXE32_DEV_CNTR_ELEM(RxTidFLSMs,
+                       RCV_TID_FLOW_SEQ_MISMATCH_CNT, CNTR_NORMAL),
+[C_CCE_PCI_CR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePciCrSt,
+                       CCE_PCIE_POSTED_CRDT_STALL_CNT, CNTR_NORMAL),
+[C_CCE_PCI_TR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePciTrSt, CCE_PCIE_TRGT_STALL_CNT,
+                       CNTR_NORMAL),
+[C_CCE_PIO_WR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePioWrSt, CCE_PIO_WR_STALL_CNT,
+                       CNTR_NORMAL),
+[C_CCE_ERR_INT] = CCE_INT_DEV_CNTR_ELEM(CceErrInt, CCE_ERR_INT_CNT,
+                       CNTR_NORMAL),
+[C_CCE_SDMA_INT] = CCE_INT_DEV_CNTR_ELEM(CceSdmaInt, CCE_SDMA_INT_CNT,
+                       CNTR_NORMAL),
+[C_CCE_MISC_INT] = CCE_INT_DEV_CNTR_ELEM(CceMiscInt, CCE_MISC_INT_CNT,
+                       CNTR_NORMAL),
+[C_CCE_RCV_AV_INT] = CCE_INT_DEV_CNTR_ELEM(CceRcvAvInt, CCE_RCV_AVAIL_INT_CNT,
+                       CNTR_NORMAL),
+[C_CCE_RCV_URG_INT] = CCE_INT_DEV_CNTR_ELEM(CceRcvUrgInt,
+                       CCE_RCV_URGENT_INT_CNT, CNTR_NORMAL),
+[C_CCE_SEND_CR_INT] = CCE_INT_DEV_CNTR_ELEM(CceSndCrInt,
+                       CCE_SEND_CREDIT_INT_CNT, CNTR_NORMAL),
+[C_DC_UNC_ERR] = DC_PERF_CNTR(DcUnctblErr, DCC_ERR_UNCORRECTABLE_CNT,
+                             CNTR_SYNTH),
+[C_DC_RCV_ERR] = DC_PERF_CNTR(DcRecvErr, DCC_ERR_PORTRCV_ERR_CNT, CNTR_SYNTH),
+[C_DC_FM_CFG_ERR] = DC_PERF_CNTR(DcFmCfgErr, DCC_ERR_FMCONFIG_ERR_CNT,
+                                CNTR_SYNTH),
+[C_DC_RMT_PHY_ERR] = DC_PERF_CNTR(DcRmtPhyErr, DCC_ERR_RCVREMOTE_PHY_ERR_CNT,
+                                 CNTR_SYNTH),
+[C_DC_DROPPED_PKT] = DC_PERF_CNTR(DcDroppedPkt, DCC_ERR_DROPPED_PKT_CNT,
+                                 CNTR_SYNTH),
+[C_DC_MC_XMIT_PKTS] = DC_PERF_CNTR(DcMcXmitPkts,
+                                  DCC_PRF_PORT_XMIT_MULTICAST_CNT, CNTR_SYNTH),
+[C_DC_MC_RCV_PKTS] = DC_PERF_CNTR(DcMcRcvPkts,
+                                 DCC_PRF_PORT_RCV_MULTICAST_PKT_CNT,
+                                 CNTR_SYNTH),
+[C_DC_XMIT_CERR] = DC_PERF_CNTR(DcXmitCorr,
+                               DCC_PRF_PORT_XMIT_CORRECTABLE_CNT, CNTR_SYNTH),
+[C_DC_RCV_CERR] = DC_PERF_CNTR(DcRcvCorrCnt, DCC_PRF_PORT_RCV_CORRECTABLE_CNT,
+                              CNTR_SYNTH),
+[C_DC_RCV_FCC] = DC_PERF_CNTR(DcRxFCntl, DCC_PRF_RX_FLOW_CRTL_CNT,
+                             CNTR_SYNTH),
+[C_DC_XMIT_FCC] = DC_PERF_CNTR(DcXmitFCntl, DCC_PRF_TX_FLOW_CRTL_CNT,
+                              CNTR_SYNTH),
+[C_DC_XMIT_FLITS] = DC_PERF_CNTR(DcXmitFlits, DCC_PRF_PORT_XMIT_DATA_CNT,
+                                CNTR_SYNTH),
+[C_DC_RCV_FLITS] = DC_PERF_CNTR(DcRcvFlits, DCC_PRF_PORT_RCV_DATA_CNT,
+                               CNTR_SYNTH),
+[C_DC_XMIT_PKTS] = DC_PERF_CNTR(DcXmitPkts, DCC_PRF_PORT_XMIT_PKTS_CNT,
+                               CNTR_SYNTH),
+[C_DC_RCV_PKTS] = DC_PERF_CNTR(DcRcvPkts, DCC_PRF_PORT_RCV_PKTS_CNT,
+                              CNTR_SYNTH),
+[C_DC_RX_FLIT_VL] = DC_PERF_CNTR(DcRxFlitVl, DCC_PRF_PORT_VL_RCV_DATA_CNT,
+                                CNTR_SYNTH | CNTR_VL),
+[C_DC_RX_PKT_VL] = DC_PERF_CNTR(DcRxPktVl, DCC_PRF_PORT_VL_RCV_PKTS_CNT,
+                               CNTR_SYNTH | CNTR_VL),
+[C_DC_RCV_FCN] = DC_PERF_CNTR(DcRcvFcn, DCC_PRF_PORT_RCV_FECN_CNT, CNTR_SYNTH),
+[C_DC_RCV_FCN_VL] = DC_PERF_CNTR(DcRcvFcnVl, DCC_PRF_PORT_VL_RCV_FECN_CNT,
+                                CNTR_SYNTH | CNTR_VL),
+[C_DC_RCV_BCN] = DC_PERF_CNTR(DcRcvBcn, DCC_PRF_PORT_RCV_BECN_CNT, CNTR_SYNTH),
+[C_DC_RCV_BCN_VL] = DC_PERF_CNTR(DcRcvBcnVl, DCC_PRF_PORT_VL_RCV_BECN_CNT,
+                                CNTR_SYNTH | CNTR_VL),
+[C_DC_RCV_BBL] = DC_PERF_CNTR(DcRcvBbl, DCC_PRF_PORT_RCV_BUBBLE_CNT,
+                             CNTR_SYNTH),
+[C_DC_RCV_BBL_VL] = DC_PERF_CNTR(DcRcvBblVl, DCC_PRF_PORT_VL_RCV_BUBBLE_CNT,
+                                CNTR_SYNTH | CNTR_VL),
+[C_DC_MARK_FECN] = DC_PERF_CNTR(DcMarkFcn, DCC_PRF_PORT_MARK_FECN_CNT,
+                               CNTR_SYNTH),
+[C_DC_MARK_FECN_VL] = DC_PERF_CNTR(DcMarkFcnVl, DCC_PRF_PORT_VL_MARK_FECN_CNT,
+                                  CNTR_SYNTH | CNTR_VL),
+[C_DC_TOTAL_CRC] =
+       DC_PERF_CNTR_LCB(DcTotCrc, DC_LCB_ERR_INFO_TOTAL_CRC_ERR,
+                        CNTR_SYNTH),
+[C_DC_CRC_LN0] = DC_PERF_CNTR_LCB(DcCrcLn0, DC_LCB_ERR_INFO_CRC_ERR_LN0,
+                                 CNTR_SYNTH),
+[C_DC_CRC_LN1] = DC_PERF_CNTR_LCB(DcCrcLn1, DC_LCB_ERR_INFO_CRC_ERR_LN1,
+                                 CNTR_SYNTH),
+[C_DC_CRC_LN2] = DC_PERF_CNTR_LCB(DcCrcLn2, DC_LCB_ERR_INFO_CRC_ERR_LN2,
+                                 CNTR_SYNTH),
+[C_DC_CRC_LN3] = DC_PERF_CNTR_LCB(DcCrcLn3, DC_LCB_ERR_INFO_CRC_ERR_LN3,
+                                 CNTR_SYNTH),
+[C_DC_CRC_MULT_LN] =
+       DC_PERF_CNTR_LCB(DcMultLn, DC_LCB_ERR_INFO_CRC_ERR_MULTI_LN,
+                        CNTR_SYNTH),
+[C_DC_TX_REPLAY] = DC_PERF_CNTR_LCB(DcTxReplay, DC_LCB_ERR_INFO_TX_REPLAY_CNT,
+                                   CNTR_SYNTH),
+[C_DC_RX_REPLAY] = DC_PERF_CNTR_LCB(DcRxReplay, DC_LCB_ERR_INFO_RX_REPLAY_CNT,
+                                   CNTR_SYNTH),
+[C_DC_SEQ_CRC_CNT] =
+       DC_PERF_CNTR_LCB(DcLinkSeqCrc, DC_LCB_ERR_INFO_SEQ_CRC_CNT,
+                        CNTR_SYNTH),
+[C_DC_ESC0_ONLY_CNT] =
+       DC_PERF_CNTR_LCB(DcEsc0, DC_LCB_ERR_INFO_ESCAPE_0_ONLY_CNT,
+                        CNTR_SYNTH),
+[C_DC_ESC0_PLUS1_CNT] =
+       DC_PERF_CNTR_LCB(DcEsc1, DC_LCB_ERR_INFO_ESCAPE_0_PLUS1_CNT,
+                        CNTR_SYNTH),
+[C_DC_ESC0_PLUS2_CNT] =
+       DC_PERF_CNTR_LCB(DcEsc0Plus2, DC_LCB_ERR_INFO_ESCAPE_0_PLUS2_CNT,
+                        CNTR_SYNTH),
+[C_DC_REINIT_FROM_PEER_CNT] =
+       DC_PERF_CNTR_LCB(DcReinitPeer, DC_LCB_ERR_INFO_REINIT_FROM_PEER_CNT,
+                        CNTR_SYNTH),
+[C_DC_SBE_CNT] = DC_PERF_CNTR_LCB(DcSbe, DC_LCB_ERR_INFO_SBE_CNT,
+                                 CNTR_SYNTH),
+[C_DC_MISC_FLG_CNT] =
+       DC_PERF_CNTR_LCB(DcMiscFlg, DC_LCB_ERR_INFO_MISC_FLG_CNT,
+                        CNTR_SYNTH),
+[C_DC_PRF_GOOD_LTP_CNT] =
+       DC_PERF_CNTR_LCB(DcGoodLTP, DC_LCB_PRF_GOOD_LTP_CNT, CNTR_SYNTH),
+[C_DC_PRF_ACCEPTED_LTP_CNT] =
+       DC_PERF_CNTR_LCB(DcAccLTP, DC_LCB_PRF_ACCEPTED_LTP_CNT,
+                        CNTR_SYNTH),
+[C_DC_PRF_RX_FLIT_CNT] =
+       DC_PERF_CNTR_LCB(DcPrfRxFlit, DC_LCB_PRF_RX_FLIT_CNT, CNTR_SYNTH),
+[C_DC_PRF_TX_FLIT_CNT] =
+       DC_PERF_CNTR_LCB(DcPrfTxFlit, DC_LCB_PRF_TX_FLIT_CNT, CNTR_SYNTH),
+[C_DC_PRF_CLK_CNTR] =
+       DC_PERF_CNTR_LCB(DcPrfClk, DC_LCB_PRF_CLK_CNTR, CNTR_SYNTH),
+[C_DC_PG_DBG_FLIT_CRDTS_CNT] =
+       DC_PERF_CNTR_LCB(DcFltCrdts, DC_LCB_PG_DBG_FLIT_CRDTS_CNT, CNTR_SYNTH),
+[C_DC_PG_STS_PAUSE_COMPLETE_CNT] =
+       DC_PERF_CNTR_LCB(DcPauseComp, DC_LCB_PG_STS_PAUSE_COMPLETE_CNT,
+                        CNTR_SYNTH),
+[C_DC_PG_STS_TX_SBE_CNT] =
+       DC_PERF_CNTR_LCB(DcStsTxSbe, DC_LCB_PG_STS_TX_SBE_CNT, CNTR_SYNTH),
+[C_DC_PG_STS_TX_MBE_CNT] =
+       DC_PERF_CNTR_LCB(DcStsTxMbe, DC_LCB_PG_STS_TX_MBE_CNT,
+                        CNTR_SYNTH),
+[C_SW_CPU_INTR] = CNTR_ELEM("Intr", 0, 0, CNTR_NORMAL,
+                           access_sw_cpu_intr),
+[C_SW_CPU_RCV_LIM] = CNTR_ELEM("RcvLimit", 0, 0, CNTR_NORMAL,
+                           access_sw_cpu_rcv_limit),
+[C_SW_VTX_WAIT] = CNTR_ELEM("vTxWait", 0, 0, CNTR_NORMAL,
+                           access_sw_vtx_wait),
+[C_SW_PIO_WAIT] = CNTR_ELEM("PioWait", 0, 0, CNTR_NORMAL,
+                           access_sw_pio_wait),
+[C_SW_PIO_DRAIN] = CNTR_ELEM("PioDrain", 0, 0, CNTR_NORMAL,
+                           access_sw_pio_drain),
+[C_SW_KMEM_WAIT] = CNTR_ELEM("KmemWait", 0, 0, CNTR_NORMAL,
+                           access_sw_kmem_wait),
+[C_SW_SEND_SCHED] = CNTR_ELEM("SendSched", 0, 0, CNTR_NORMAL,
+                           access_sw_send_schedule),
+[C_SDMA_DESC_FETCHED_CNT] = CNTR_ELEM("SDEDscFdCn",
+                                     SEND_DMA_DESC_FETCHED_CNT, 0,
+                                     CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA,
+                                     dev_access_u32_csr),
+[C_SDMA_INT_CNT] = CNTR_ELEM("SDMAInt", 0, 0,
+                            CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA,
+                            access_sde_int_cnt),
+[C_SDMA_ERR_CNT] = CNTR_ELEM("SDMAErrCt", 0, 0,
+                            CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA,
+                            access_sde_err_cnt),
+[C_SDMA_IDLE_INT_CNT] = CNTR_ELEM("SDMAIdInt", 0, 0,
+                                 CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA,
+                                 access_sde_idle_int_cnt),
+[C_SDMA_PROGRESS_INT_CNT] = CNTR_ELEM("SDMAPrIntCn", 0, 0,
+                                     CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA,
+                                     access_sde_progress_int_cnt),
+/* MISC_ERR_STATUS */
+[C_MISC_PLL_LOCK_FAIL_ERR] = CNTR_ELEM("MISC_PLL_LOCK_FAIL_ERR", 0, 0,
+                               CNTR_NORMAL,
+                               access_misc_pll_lock_fail_err_cnt),
+[C_MISC_MBIST_FAIL_ERR] = CNTR_ELEM("MISC_MBIST_FAIL_ERR", 0, 0,
+                               CNTR_NORMAL,
+                               access_misc_mbist_fail_err_cnt),
+[C_MISC_INVALID_EEP_CMD_ERR] = CNTR_ELEM("MISC_INVALID_EEP_CMD_ERR", 0, 0,
+                               CNTR_NORMAL,
+                               access_misc_invalid_eep_cmd_err_cnt),
+[C_MISC_EFUSE_DONE_PARITY_ERR] = CNTR_ELEM("MISC_EFUSE_DONE_PARITY_ERR", 0, 0,
+                               CNTR_NORMAL,
+                               access_misc_efuse_done_parity_err_cnt),
+[C_MISC_EFUSE_WRITE_ERR] = CNTR_ELEM("MISC_EFUSE_WRITE_ERR", 0, 0,
+                               CNTR_NORMAL,
+                               access_misc_efuse_write_err_cnt),
+[C_MISC_EFUSE_READ_BAD_ADDR_ERR] = CNTR_ELEM("MISC_EFUSE_READ_BAD_ADDR_ERR", 0,
+                               0, CNTR_NORMAL,
+                               access_misc_efuse_read_bad_addr_err_cnt),
+[C_MISC_EFUSE_CSR_PARITY_ERR] = CNTR_ELEM("MISC_EFUSE_CSR_PARITY_ERR", 0, 0,
+                               CNTR_NORMAL,
+                               access_misc_efuse_csr_parity_err_cnt),
+[C_MISC_FW_AUTH_FAILED_ERR] = CNTR_ELEM("MISC_FW_AUTH_FAILED_ERR", 0, 0,
+                               CNTR_NORMAL,
+                               access_misc_fw_auth_failed_err_cnt),
+[C_MISC_KEY_MISMATCH_ERR] = CNTR_ELEM("MISC_KEY_MISMATCH_ERR", 0, 0,
+                               CNTR_NORMAL,
+                               access_misc_key_mismatch_err_cnt),
+[C_MISC_SBUS_WRITE_FAILED_ERR] = CNTR_ELEM("MISC_SBUS_WRITE_FAILED_ERR", 0, 0,
+                               CNTR_NORMAL,
+                               access_misc_sbus_write_failed_err_cnt),
+[C_MISC_CSR_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("MISC_CSR_WRITE_BAD_ADDR_ERR", 0, 0,
+                               CNTR_NORMAL,
+                               access_misc_csr_write_bad_addr_err_cnt),
+[C_MISC_CSR_READ_BAD_ADDR_ERR] = CNTR_ELEM("MISC_CSR_READ_BAD_ADDR_ERR", 0, 0,
+                               CNTR_NORMAL,
+                               access_misc_csr_read_bad_addr_err_cnt),
+[C_MISC_CSR_PARITY_ERR] = CNTR_ELEM("MISC_CSR_PARITY_ERR", 0, 0,
+                               CNTR_NORMAL,
+                               access_misc_csr_parity_err_cnt),
+/* CceErrStatus */
+[C_CCE_ERR_STATUS_AGGREGATED_CNT] = CNTR_ELEM("CceErrStatusAggregatedCnt", 0, 0,
+                               CNTR_NORMAL,
+                               access_sw_cce_err_status_aggregated_cnt),
+[C_CCE_MSIX_CSR_PARITY_ERR] = CNTR_ELEM("CceMsixCsrParityErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_cce_msix_csr_parity_err_cnt),
+[C_CCE_INT_MAP_UNC_ERR] = CNTR_ELEM("CceIntMapUncErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_cce_int_map_unc_err_cnt),
+[C_CCE_INT_MAP_COR_ERR] = CNTR_ELEM("CceIntMapCorErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_cce_int_map_cor_err_cnt),
+[C_CCE_MSIX_TABLE_UNC_ERR] = CNTR_ELEM("CceMsixTableUncErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_cce_msix_table_unc_err_cnt),
+[C_CCE_MSIX_TABLE_COR_ERR] = CNTR_ELEM("CceMsixTableCorErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_cce_msix_table_cor_err_cnt),
+[C_CCE_RXDMA_CONV_FIFO_PARITY_ERR] = CNTR_ELEM("CceRxdmaConvFifoParityErr", 0,
+                               0, CNTR_NORMAL,
+                               access_cce_rxdma_conv_fifo_parity_err_cnt),
+[C_CCE_RCPL_ASYNC_FIFO_PARITY_ERR] = CNTR_ELEM("CceRcplAsyncFifoParityErr", 0,
+                               0, CNTR_NORMAL,
+                               access_cce_rcpl_async_fifo_parity_err_cnt),
+[C_CCE_SEG_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("CceSegWriteBadAddrErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_cce_seg_write_bad_addr_err_cnt),
+[C_CCE_SEG_READ_BAD_ADDR_ERR] = CNTR_ELEM("CceSegReadBadAddrErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_cce_seg_read_bad_addr_err_cnt),
+[C_LA_TRIGGERED] = CNTR_ELEM("Cce LATriggered", 0, 0,
+                               CNTR_NORMAL,
+                               access_la_triggered_cnt),
+[C_CCE_TRGT_CPL_TIMEOUT_ERR] = CNTR_ELEM("CceTrgtCplTimeoutErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_cce_trgt_cpl_timeout_err_cnt),
+[C_PCIC_RECEIVE_PARITY_ERR] = CNTR_ELEM("PcicReceiveParityErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_pcic_receive_parity_err_cnt),
+[C_PCIC_TRANSMIT_BACK_PARITY_ERR] = CNTR_ELEM("PcicTransmitBackParityErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_pcic_transmit_back_parity_err_cnt),
+[C_PCIC_TRANSMIT_FRONT_PARITY_ERR] = CNTR_ELEM("PcicTransmitFrontParityErr", 0,
+                               0, CNTR_NORMAL,
+                               access_pcic_transmit_front_parity_err_cnt),
+[C_PCIC_CPL_DAT_Q_UNC_ERR] = CNTR_ELEM("PcicCplDatQUncErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_pcic_cpl_dat_q_unc_err_cnt),
+[C_PCIC_CPL_HD_Q_UNC_ERR] = CNTR_ELEM("PcicCplHdQUncErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_pcic_cpl_hd_q_unc_err_cnt),
+[C_PCIC_POST_DAT_Q_UNC_ERR] = CNTR_ELEM("PcicPostDatQUncErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_pcic_post_dat_q_unc_err_cnt),
+[C_PCIC_POST_HD_Q_UNC_ERR] = CNTR_ELEM("PcicPostHdQUncErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_pcic_post_hd_q_unc_err_cnt),
+[C_PCIC_RETRY_SOT_MEM_UNC_ERR] = CNTR_ELEM("PcicRetrySotMemUncErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_pcic_retry_sot_mem_unc_err_cnt),
+[C_PCIC_RETRY_MEM_UNC_ERR] = CNTR_ELEM("PcicRetryMemUncErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_pcic_retry_mem_unc_err),
+[C_PCIC_N_POST_DAT_Q_PARITY_ERR] = CNTR_ELEM("PcicNPostDatQParityErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_pcic_n_post_dat_q_parity_err_cnt),
+[C_PCIC_N_POST_H_Q_PARITY_ERR] = CNTR_ELEM("PcicNPostHQParityErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_pcic_n_post_h_q_parity_err_cnt),
+[C_PCIC_CPL_DAT_Q_COR_ERR] = CNTR_ELEM("PcicCplDatQCorErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_pcic_cpl_dat_q_cor_err_cnt),
+[C_PCIC_CPL_HD_Q_COR_ERR] = CNTR_ELEM("PcicCplHdQCorErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_pcic_cpl_hd_q_cor_err_cnt),
+[C_PCIC_POST_DAT_Q_COR_ERR] = CNTR_ELEM("PcicPostDatQCorErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_pcic_post_dat_q_cor_err_cnt),
+[C_PCIC_POST_HD_Q_COR_ERR] = CNTR_ELEM("PcicPostHdQCorErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_pcic_post_hd_q_cor_err_cnt),
+[C_PCIC_RETRY_SOT_MEM_COR_ERR] = CNTR_ELEM("PcicRetrySotMemCorErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_pcic_retry_sot_mem_cor_err_cnt),
+[C_PCIC_RETRY_MEM_COR_ERR] = CNTR_ELEM("PcicRetryMemCorErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_pcic_retry_mem_cor_err_cnt),
+[C_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERR] = CNTR_ELEM(
+                               "CceCli1AsyncFifoDbgParityError", 0, 0,
+                               CNTR_NORMAL,
+                               access_cce_cli1_async_fifo_dbg_parity_err_cnt),
+[C_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERR] = CNTR_ELEM(
+                               "CceCli1AsyncFifoRxdmaParityError", 0, 0,
+                               CNTR_NORMAL,
+                               access_cce_cli1_async_fifo_rxdma_parity_err_cnt
+                               ),
+[C_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR] = CNTR_ELEM(
+                       "CceCli1AsyncFifoSdmaHdParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_cce_cli1_async_fifo_sdma_hd_parity_err_cnt),
+[C_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR] = CNTR_ELEM(
+                       "CceCli1AsyncFifoPioCrdtParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_cce_cl1_async_fifo_pio_crdt_parity_err_cnt),
+[C_CCE_CLI2_ASYNC_FIFO_PARITY_ERR] = CNTR_ELEM("CceCli2AsyncFifoParityErr", 0,
+                       0, CNTR_NORMAL,
+                       access_cce_cli2_async_fifo_parity_err_cnt),
+[C_CCE_CSR_CFG_BUS_PARITY_ERR] = CNTR_ELEM("CceCsrCfgBusParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_cce_csr_cfg_bus_parity_err_cnt),
+[C_CCE_CLI0_ASYNC_FIFO_PARTIY_ERR] = CNTR_ELEM("CceCli0AsyncFifoParityErr", 0,
+                       0, CNTR_NORMAL,
+                       access_cce_cli0_async_fifo_parity_err_cnt),
+[C_CCE_RSPD_DATA_PARITY_ERR] = CNTR_ELEM("CceRspdDataParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_cce_rspd_data_parity_err_cnt),
+[C_CCE_TRGT_ACCESS_ERR] = CNTR_ELEM("CceTrgtAccessErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_cce_trgt_access_err_cnt),
+[C_CCE_TRGT_ASYNC_FIFO_PARITY_ERR] = CNTR_ELEM("CceTrgtAsyncFifoParityErr", 0,
+                       0, CNTR_NORMAL,
+                       access_cce_trgt_async_fifo_parity_err_cnt),
+[C_CCE_CSR_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("CceCsrWriteBadAddrErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_cce_csr_write_bad_addr_err_cnt),
+[C_CCE_CSR_READ_BAD_ADDR_ERR] = CNTR_ELEM("CceCsrReadBadAddrErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_cce_csr_read_bad_addr_err_cnt),
+[C_CCE_CSR_PARITY_ERR] = CNTR_ELEM("CceCsrParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_ccs_csr_parity_err_cnt),
+
+/* RcvErrStatus */
+[C_RX_CSR_PARITY_ERR] = CNTR_ELEM("RxCsrParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_csr_parity_err_cnt),
+[C_RX_CSR_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("RxCsrWriteBadAddrErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_csr_write_bad_addr_err_cnt),
+[C_RX_CSR_READ_BAD_ADDR_ERR] = CNTR_ELEM("RxCsrReadBadAddrErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_csr_read_bad_addr_err_cnt),
+[C_RX_DMA_CSR_UNC_ERR] = CNTR_ELEM("RxDmaCsrUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_dma_csr_unc_err_cnt),
+[C_RX_DMA_DQ_FSM_ENCODING_ERR] = CNTR_ELEM("RxDmaDqFsmEncodingErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_dma_dq_fsm_encoding_err_cnt),
+[C_RX_DMA_EQ_FSM_ENCODING_ERR] = CNTR_ELEM("RxDmaEqFsmEncodingErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_dma_eq_fsm_encoding_err_cnt),
+[C_RX_DMA_CSR_PARITY_ERR] = CNTR_ELEM("RxDmaCsrParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_dma_csr_parity_err_cnt),
+[C_RX_RBUF_DATA_COR_ERR] = CNTR_ELEM("RxRbufDataCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_data_cor_err_cnt),
+[C_RX_RBUF_DATA_UNC_ERR] = CNTR_ELEM("RxRbufDataUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_data_unc_err_cnt),
+[C_RX_DMA_DATA_FIFO_RD_COR_ERR] = CNTR_ELEM("RxDmaDataFifoRdCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_dma_data_fifo_rd_cor_err_cnt),
+[C_RX_DMA_DATA_FIFO_RD_UNC_ERR] = CNTR_ELEM("RxDmaDataFifoRdUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_dma_data_fifo_rd_unc_err_cnt),
+[C_RX_DMA_HDR_FIFO_RD_COR_ERR] = CNTR_ELEM("RxDmaHdrFifoRdCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_dma_hdr_fifo_rd_cor_err_cnt),
+[C_RX_DMA_HDR_FIFO_RD_UNC_ERR] = CNTR_ELEM("RxDmaHdrFifoRdUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_dma_hdr_fifo_rd_unc_err_cnt),
+[C_RX_RBUF_DESC_PART2_COR_ERR] = CNTR_ELEM("RxRbufDescPart2CorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_desc_part2_cor_err_cnt),
+[C_RX_RBUF_DESC_PART2_UNC_ERR] = CNTR_ELEM("RxRbufDescPart2UncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_desc_part2_unc_err_cnt),
+[C_RX_RBUF_DESC_PART1_COR_ERR] = CNTR_ELEM("RxRbufDescPart1CorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_desc_part1_cor_err_cnt),
+[C_RX_RBUF_DESC_PART1_UNC_ERR] = CNTR_ELEM("RxRbufDescPart1UncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_desc_part1_unc_err_cnt),
+[C_RX_HQ_INTR_FSM_ERR] = CNTR_ELEM("RxHqIntrFsmErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_hq_intr_fsm_err_cnt),
+[C_RX_HQ_INTR_CSR_PARITY_ERR] = CNTR_ELEM("RxHqIntrCsrParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_hq_intr_csr_parity_err_cnt),
+[C_RX_LOOKUP_CSR_PARITY_ERR] = CNTR_ELEM("RxLookupCsrParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_lookup_csr_parity_err_cnt),
+[C_RX_LOOKUP_RCV_ARRAY_COR_ERR] = CNTR_ELEM("RxLookupRcvArrayCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_lookup_rcv_array_cor_err_cnt),
+[C_RX_LOOKUP_RCV_ARRAY_UNC_ERR] = CNTR_ELEM("RxLookupRcvArrayUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_lookup_rcv_array_unc_err_cnt),
+[C_RX_LOOKUP_DES_PART2_PARITY_ERR] = CNTR_ELEM("RxLookupDesPart2ParityErr", 0,
+                       0, CNTR_NORMAL,
+                       access_rx_lookup_des_part2_parity_err_cnt),
+[C_RX_LOOKUP_DES_PART1_UNC_COR_ERR] = CNTR_ELEM("RxLookupDesPart1UncCorErr", 0,
+                       0, CNTR_NORMAL,
+                       access_rx_lookup_des_part1_unc_cor_err_cnt),
+[C_RX_LOOKUP_DES_PART1_UNC_ERR] = CNTR_ELEM("RxLookupDesPart1UncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_lookup_des_part1_unc_err_cnt),
+[C_RX_RBUF_NEXT_FREE_BUF_COR_ERR] = CNTR_ELEM("RxRbufNextFreeBufCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_next_free_buf_cor_err_cnt),
+[C_RX_RBUF_NEXT_FREE_BUF_UNC_ERR] = CNTR_ELEM("RxRbufNextFreeBufUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_next_free_buf_unc_err_cnt),
+[C_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR] = CNTR_ELEM(
+                       "RxRbufFlInitWrAddrParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rbuf_fl_init_wr_addr_parity_err_cnt),
+[C_RX_RBUF_FL_INITDONE_PARITY_ERR] = CNTR_ELEM("RxRbufFlInitdoneParityErr", 0,
+                       0, CNTR_NORMAL,
+                       access_rx_rbuf_fl_initdone_parity_err_cnt),
+[C_RX_RBUF_FL_WRITE_ADDR_PARITY_ERR] = CNTR_ELEM("RxRbufFlWrAddrParityErr", 0,
+                       0, CNTR_NORMAL,
+                       access_rx_rbuf_fl_write_addr_parity_err_cnt),
+[C_RX_RBUF_FL_RD_ADDR_PARITY_ERR] = CNTR_ELEM("RxRbufFlRdAddrParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_fl_rd_addr_parity_err_cnt),
+[C_RX_RBUF_EMPTY_ERR] = CNTR_ELEM("RxRbufEmptyErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_empty_err_cnt),
+[C_RX_RBUF_FULL_ERR] = CNTR_ELEM("RxRbufFullErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_full_err_cnt),
+[C_RX_RBUF_BAD_LOOKUP_ERR] = CNTR_ELEM("RxRBufBadLookupErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rbuf_bad_lookup_err_cnt),
+[C_RX_RBUF_CTX_ID_PARITY_ERR] = CNTR_ELEM("RxRbufCtxIdParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rbuf_ctx_id_parity_err_cnt),
+[C_RX_RBUF_CSR_QEOPDW_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQEOPDWParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rbuf_csr_qeopdw_parity_err_cnt),
+[C_RX_RBUF_CSR_Q_NUM_OF_PKT_PARITY_ERR] = CNTR_ELEM(
+                       "RxRbufCsrQNumOfPktParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_csr_q_num_of_pkt_parity_err_cnt),
+[C_RX_RBUF_CSR_Q_T1_PTR_PARITY_ERR] = CNTR_ELEM(
+                       "RxRbufCsrQTlPtrParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_csr_q_t1_ptr_parity_err_cnt),
+[C_RX_RBUF_CSR_Q_HD_PTR_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQHdPtrParityErr", 0,
+                       0, CNTR_NORMAL,
+                       access_rx_rbuf_csr_q_hd_ptr_parity_err_cnt),
+[C_RX_RBUF_CSR_Q_VLD_BIT_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQVldBitParityErr", 0,
+                       0, CNTR_NORMAL,
+                       access_rx_rbuf_csr_q_vld_bit_parity_err_cnt),
+[C_RX_RBUF_CSR_Q_NEXT_BUF_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQNextBufParityErr",
+                       0, 0, CNTR_NORMAL,
+                       access_rx_rbuf_csr_q_next_buf_parity_err_cnt),
+[C_RX_RBUF_CSR_Q_ENT_CNT_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQEntCntParityErr", 0,
+                       0, CNTR_NORMAL,
+                       access_rx_rbuf_csr_q_ent_cnt_parity_err_cnt),
+[C_RX_RBUF_CSR_Q_HEAD_BUF_NUM_PARITY_ERR] = CNTR_ELEM(
+                       "RxRbufCsrQHeadBufNumParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_csr_q_head_buf_num_parity_err_cnt),
+[C_RX_RBUF_BLOCK_LIST_READ_COR_ERR] = CNTR_ELEM("RxRbufBlockListReadCorErr", 0,
+                       0, CNTR_NORMAL,
+                       access_rx_rbuf_block_list_read_cor_err_cnt),
+[C_RX_RBUF_BLOCK_LIST_READ_UNC_ERR] = CNTR_ELEM("RxRbufBlockListReadUncErr", 0,
+                       0, CNTR_NORMAL,
+                       access_rx_rbuf_block_list_read_unc_err_cnt),
+[C_RX_RBUF_LOOKUP_DES_COR_ERR] = CNTR_ELEM("RxRbufLookupDesCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_lookup_des_cor_err_cnt),
+[C_RX_RBUF_LOOKUP_DES_UNC_ERR] = CNTR_ELEM("RxRbufLookupDesUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_lookup_des_unc_err_cnt),
+[C_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR] = CNTR_ELEM(
+                       "RxRbufLookupDesRegUncCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_lookup_des_reg_unc_cor_err_cnt),
+[C_RX_RBUF_LOOKUP_DES_REG_UNC_ERR] = CNTR_ELEM("RxRbufLookupDesRegUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_lookup_des_reg_unc_err_cnt),
+[C_RX_RBUF_FREE_LIST_COR_ERR] = CNTR_ELEM("RxRbufFreeListCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_free_list_cor_err_cnt),
+[C_RX_RBUF_FREE_LIST_UNC_ERR] = CNTR_ELEM("RxRbufFreeListUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_free_list_unc_err_cnt),
+[C_RX_RCV_FSM_ENCODING_ERR] = CNTR_ELEM("RxRcvFsmEncodingErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rcv_fsm_encoding_err_cnt),
+[C_RX_DMA_FLAG_COR_ERR] = CNTR_ELEM("RxDmaFlagCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_dma_flag_cor_err_cnt),
+[C_RX_DMA_FLAG_UNC_ERR] = CNTR_ELEM("RxDmaFlagUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_dma_flag_unc_err_cnt),
+[C_RX_DC_SOP_EOP_PARITY_ERR] = CNTR_ELEM("RxDcSopEopParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_dc_sop_eop_parity_err_cnt),
+[C_RX_RCV_CSR_PARITY_ERR] = CNTR_ELEM("RxRcvCsrParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rcv_csr_parity_err_cnt),
+[C_RX_RCV_QP_MAP_TABLE_COR_ERR] = CNTR_ELEM("RxRcvQpMapTableCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rcv_qp_map_table_cor_err_cnt),
+[C_RX_RCV_QP_MAP_TABLE_UNC_ERR] = CNTR_ELEM("RxRcvQpMapTableUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rcv_qp_map_table_unc_err_cnt),
+[C_RX_RCV_DATA_COR_ERR] = CNTR_ELEM("RxRcvDataCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rcv_data_cor_err_cnt),
+[C_RX_RCV_DATA_UNC_ERR] = CNTR_ELEM("RxRcvDataUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rcv_data_unc_err_cnt),
+[C_RX_RCV_HDR_COR_ERR] = CNTR_ELEM("RxRcvHdrCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rcv_hdr_cor_err_cnt),
+[C_RX_RCV_HDR_UNC_ERR] = CNTR_ELEM("RxRcvHdrUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rcv_hdr_unc_err_cnt),
+[C_RX_DC_INTF_PARITY_ERR] = CNTR_ELEM("RxDcIntfParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_dc_intf_parity_err_cnt),
+[C_RX_DMA_CSR_COR_ERR] = CNTR_ELEM("RxDmaCsrCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_dma_csr_cor_err_cnt),
+/* SendPioErrStatus */
+[C_PIO_PEC_SOP_HEAD_PARITY_ERR] = CNTR_ELEM("PioPecSopHeadParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_pec_sop_head_parity_err_cnt),
+[C_PIO_PCC_SOP_HEAD_PARITY_ERR] = CNTR_ELEM("PioPccSopHeadParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_pcc_sop_head_parity_err_cnt),
+[C_PIO_LAST_RETURNED_CNT_PARITY_ERR] = CNTR_ELEM("PioLastReturnedCntParityErr",
+                       0, 0, CNTR_NORMAL,
+                       access_pio_last_returned_cnt_parity_err_cnt),
+[C_PIO_CURRENT_FREE_CNT_PARITY_ERR] = CNTR_ELEM("PioCurrentFreeCntParityErr", 0,
+                       0, CNTR_NORMAL,
+                       access_pio_current_free_cnt_parity_err_cnt),
+[C_PIO_RSVD_31_ERR] = CNTR_ELEM("Pio Reserved 31", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_reserved_31_err_cnt),
+[C_PIO_RSVD_30_ERR] = CNTR_ELEM("Pio Reserved 30", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_reserved_30_err_cnt),
+[C_PIO_PPMC_SOP_LEN_ERR] = CNTR_ELEM("PioPpmcSopLenErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_ppmc_sop_len_err_cnt),
+[C_PIO_PPMC_BQC_MEM_PARITY_ERR] = CNTR_ELEM("PioPpmcBqcMemParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_ppmc_bqc_mem_parity_err_cnt),
+[C_PIO_VL_FIFO_PARITY_ERR] = CNTR_ELEM("PioVlFifoParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_vl_fifo_parity_err_cnt),
+[C_PIO_VLF_SOP_PARITY_ERR] = CNTR_ELEM("PioVlfSopParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_vlf_sop_parity_err_cnt),
+[C_PIO_VLF_V1_LEN_PARITY_ERR] = CNTR_ELEM("PioVlfVlLenParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_vlf_v1_len_parity_err_cnt),
+[C_PIO_BLOCK_QW_COUNT_PARITY_ERR] = CNTR_ELEM("PioBlockQwCountParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_block_qw_count_parity_err_cnt),
+[C_PIO_WRITE_QW_VALID_PARITY_ERR] = CNTR_ELEM("PioWriteQwValidParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_write_qw_valid_parity_err_cnt),
+[C_PIO_STATE_MACHINE_ERR] = CNTR_ELEM("PioStateMachineErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_state_machine_err_cnt),
+[C_PIO_WRITE_DATA_PARITY_ERR] = CNTR_ELEM("PioWriteDataParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_write_data_parity_err_cnt),
+[C_PIO_HOST_ADDR_MEM_COR_ERR] = CNTR_ELEM("PioHostAddrMemCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_host_addr_mem_cor_err_cnt),
+[C_PIO_HOST_ADDR_MEM_UNC_ERR] = CNTR_ELEM("PioHostAddrMemUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_host_addr_mem_unc_err_cnt),
+[C_PIO_PKT_EVICT_SM_OR_ARM_SM_ERR] = CNTR_ELEM("PioPktEvictSmOrArbSmErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_pkt_evict_sm_or_arb_sm_err_cnt),
+[C_PIO_INIT_SM_IN_ERR] = CNTR_ELEM("PioInitSmInErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_init_sm_in_err_cnt),
+[C_PIO_PPMC_PBL_FIFO_ERR] = CNTR_ELEM("PioPpmcPblFifoErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_ppmc_pbl_fifo_err_cnt),
+[C_PIO_CREDIT_RET_FIFO_PARITY_ERR] = CNTR_ELEM("PioCreditRetFifoParityErr", 0,
+                       0, CNTR_NORMAL,
+                       access_pio_credit_ret_fifo_parity_err_cnt),
+[C_PIO_V1_LEN_MEM_BANK1_COR_ERR] = CNTR_ELEM("PioVlLenMemBank1CorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_v1_len_mem_bank1_cor_err_cnt),
+[C_PIO_V1_LEN_MEM_BANK0_COR_ERR] = CNTR_ELEM("PioVlLenMemBank0CorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_v1_len_mem_bank0_cor_err_cnt),
+[C_PIO_V1_LEN_MEM_BANK1_UNC_ERR] = CNTR_ELEM("PioVlLenMemBank1UncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_v1_len_mem_bank1_unc_err_cnt),
+[C_PIO_V1_LEN_MEM_BANK0_UNC_ERR] = CNTR_ELEM("PioVlLenMemBank0UncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_v1_len_mem_bank0_unc_err_cnt),
+[C_PIO_SM_PKT_RESET_PARITY_ERR] = CNTR_ELEM("PioSmPktResetParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_sm_pkt_reset_parity_err_cnt),
+[C_PIO_PKT_EVICT_FIFO_PARITY_ERR] = CNTR_ELEM("PioPktEvictFifoParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_pkt_evict_fifo_parity_err_cnt),
+[C_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR] = CNTR_ELEM(
+                       "PioSbrdctrlCrrelFifoParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_sbrdctrl_crrel_fifo_parity_err_cnt),
+[C_PIO_SBRDCTL_CRREL_PARITY_ERR] = CNTR_ELEM("PioSbrdctlCrrelParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_sbrdctl_crrel_parity_err_cnt),
+[C_PIO_PEC_FIFO_PARITY_ERR] = CNTR_ELEM("PioPecFifoParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_pec_fifo_parity_err_cnt),
+[C_PIO_PCC_FIFO_PARITY_ERR] = CNTR_ELEM("PioPccFifoParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_pcc_fifo_parity_err_cnt),
+[C_PIO_SB_MEM_FIFO1_ERR] = CNTR_ELEM("PioSbMemFifo1Err", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_sb_mem_fifo1_err_cnt),
+[C_PIO_SB_MEM_FIFO0_ERR] = CNTR_ELEM("PioSbMemFifo0Err", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_sb_mem_fifo0_err_cnt),
+[C_PIO_CSR_PARITY_ERR] = CNTR_ELEM("PioCsrParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_csr_parity_err_cnt),
+[C_PIO_WRITE_ADDR_PARITY_ERR] = CNTR_ELEM("PioWriteAddrParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_write_addr_parity_err_cnt),
+[C_PIO_WRITE_BAD_CTXT_ERR] = CNTR_ELEM("PioWriteBadCtxtErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_write_bad_ctxt_err_cnt),
+/* SendDmaErrStatus */
+[C_SDMA_PCIE_REQ_TRACKING_COR_ERR] = CNTR_ELEM("SDmaPcieReqTrackingCorErr", 0,
+                       0, CNTR_NORMAL,
+                       access_sdma_pcie_req_tracking_cor_err_cnt),
+[C_SDMA_PCIE_REQ_TRACKING_UNC_ERR] = CNTR_ELEM("SDmaPcieReqTrackingUncErr", 0,
+                       0, CNTR_NORMAL,
+                       access_sdma_pcie_req_tracking_unc_err_cnt),
+[C_SDMA_CSR_PARITY_ERR] = CNTR_ELEM("SDmaCsrParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_csr_parity_err_cnt),
+[C_SDMA_RPY_TAG_ERR] = CNTR_ELEM("SDmaRpyTagErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_rpy_tag_err_cnt),
+/* SendEgressErrStatus */
+[C_TX_READ_PIO_MEMORY_CSR_UNC_ERR] = CNTR_ELEM("TxReadPioMemoryCsrUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_read_pio_memory_csr_unc_err_cnt),
+[C_TX_READ_SDMA_MEMORY_CSR_UNC_ERR] = CNTR_ELEM("TxReadSdmaMemoryCsrUncErr", 0,
+                       0, CNTR_NORMAL,
+                       access_tx_read_sdma_memory_csr_err_cnt),
+[C_TX_EGRESS_FIFO_COR_ERR] = CNTR_ELEM("TxEgressFifoCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_egress_fifo_cor_err_cnt),
+[C_TX_READ_PIO_MEMORY_COR_ERR] = CNTR_ELEM("TxReadPioMemoryCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_read_pio_memory_cor_err_cnt),
+[C_TX_READ_SDMA_MEMORY_COR_ERR] = CNTR_ELEM("TxReadSdmaMemoryCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_read_sdma_memory_cor_err_cnt),
+[C_TX_SB_HDR_COR_ERR] = CNTR_ELEM("TxSbHdrCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_sb_hdr_cor_err_cnt),
+[C_TX_CREDIT_OVERRUN_ERR] = CNTR_ELEM("TxCreditOverrunErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_credit_overrun_err_cnt),
+[C_TX_LAUNCH_FIFO8_COR_ERR] = CNTR_ELEM("TxLaunchFifo8CorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_launch_fifo8_cor_err_cnt),
+[C_TX_LAUNCH_FIFO7_COR_ERR] = CNTR_ELEM("TxLaunchFifo7CorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_launch_fifo7_cor_err_cnt),
+[C_TX_LAUNCH_FIFO6_COR_ERR] = CNTR_ELEM("TxLaunchFifo6CorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_launch_fifo6_cor_err_cnt),
+[C_TX_LAUNCH_FIFO5_COR_ERR] = CNTR_ELEM("TxLaunchFifo5CorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_launch_fifo5_cor_err_cnt),
+[C_TX_LAUNCH_FIFO4_COR_ERR] = CNTR_ELEM("TxLaunchFifo4CorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_launch_fifo4_cor_err_cnt),
+[C_TX_LAUNCH_FIFO3_COR_ERR] = CNTR_ELEM("TxLaunchFifo3CorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_launch_fifo3_cor_err_cnt),
+[C_TX_LAUNCH_FIFO2_COR_ERR] = CNTR_ELEM("TxLaunchFifo2CorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_launch_fifo2_cor_err_cnt),
+[C_TX_LAUNCH_FIFO1_COR_ERR] = CNTR_ELEM("TxLaunchFifo1CorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_launch_fifo1_cor_err_cnt),
+[C_TX_LAUNCH_FIFO0_COR_ERR] = CNTR_ELEM("TxLaunchFifo0CorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_launch_fifo0_cor_err_cnt),
+[C_TX_CREDIT_RETURN_VL_ERR] = CNTR_ELEM("TxCreditReturnVLErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_credit_return_vl_err_cnt),
+[C_TX_HCRC_INSERTION_ERR] = CNTR_ELEM("TxHcrcInsertionErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_hcrc_insertion_err_cnt),
+[C_TX_EGRESS_FIFI_UNC_ERR] = CNTR_ELEM("TxEgressFifoUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_egress_fifo_unc_err_cnt),
+[C_TX_READ_PIO_MEMORY_UNC_ERR] = CNTR_ELEM("TxReadPioMemoryUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_read_pio_memory_unc_err_cnt),
+[C_TX_READ_SDMA_MEMORY_UNC_ERR] = CNTR_ELEM("TxReadSdmaMemoryUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_read_sdma_memory_unc_err_cnt),
+[C_TX_SB_HDR_UNC_ERR] = CNTR_ELEM("TxSbHdrUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_sb_hdr_unc_err_cnt),
+[C_TX_CREDIT_RETURN_PARITY_ERR] = CNTR_ELEM("TxCreditReturnParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_credit_return_partiy_err_cnt),
+[C_TX_LAUNCH_FIFO8_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo8UncOrParityErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_launch_fifo8_unc_or_parity_err_cnt),
+[C_TX_LAUNCH_FIFO7_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo7UncOrParityErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_launch_fifo7_unc_or_parity_err_cnt),
+[C_TX_LAUNCH_FIFO6_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo6UncOrParityErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_launch_fifo6_unc_or_parity_err_cnt),
+[C_TX_LAUNCH_FIFO5_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo5UncOrParityErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_launch_fifo5_unc_or_parity_err_cnt),
+[C_TX_LAUNCH_FIFO4_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo4UncOrParityErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_launch_fifo4_unc_or_parity_err_cnt),
+[C_TX_LAUNCH_FIFO3_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo3UncOrParityErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_launch_fifo3_unc_or_parity_err_cnt),
+[C_TX_LAUNCH_FIFO2_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo2UncOrParityErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_launch_fifo2_unc_or_parity_err_cnt),
+[C_TX_LAUNCH_FIFO1_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo1UncOrParityErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_launch_fifo1_unc_or_parity_err_cnt),
+[C_TX_LAUNCH_FIFO0_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo0UncOrParityErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_launch_fifo0_unc_or_parity_err_cnt),
+[C_TX_SDMA15_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma15DisallowedPacketErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma15_disallowed_packet_err_cnt),
+[C_TX_SDMA14_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma14DisallowedPacketErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma14_disallowed_packet_err_cnt),
+[C_TX_SDMA13_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma13DisallowedPacketErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma13_disallowed_packet_err_cnt),
+[C_TX_SDMA12_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma12DisallowedPacketErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma12_disallowed_packet_err_cnt),
+[C_TX_SDMA11_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma11DisallowedPacketErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma11_disallowed_packet_err_cnt),
+[C_TX_SDMA10_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma10DisallowedPacketErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma10_disallowed_packet_err_cnt),
+[C_TX_SDMA9_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma9DisallowedPacketErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma9_disallowed_packet_err_cnt),
+[C_TX_SDMA8_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma8DisallowedPacketErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma8_disallowed_packet_err_cnt),
+[C_TX_SDMA7_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma7DisallowedPacketErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma7_disallowed_packet_err_cnt),
+[C_TX_SDMA6_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma6DisallowedPacketErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma6_disallowed_packet_err_cnt),
+[C_TX_SDMA5_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma5DisallowedPacketErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma5_disallowed_packet_err_cnt),
+[C_TX_SDMA4_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma4DisallowedPacketErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma4_disallowed_packet_err_cnt),
+[C_TX_SDMA3_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma3DisallowedPacketErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma3_disallowed_packet_err_cnt),
+[C_TX_SDMA2_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma2DisallowedPacketErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma2_disallowed_packet_err_cnt),
+[C_TX_SDMA1_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma1DisallowedPacketErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma1_disallowed_packet_err_cnt),
+[C_TX_SDMA0_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma0DisallowedPacketErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma0_disallowed_packet_err_cnt),
+[C_TX_CONFIG_PARITY_ERR] = CNTR_ELEM("TxConfigParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_config_parity_err_cnt),
+[C_TX_SBRD_CTL_CSR_PARITY_ERR] = CNTR_ELEM("TxSbrdCtlCsrParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_sbrd_ctl_csr_parity_err_cnt),
+[C_TX_LAUNCH_CSR_PARITY_ERR] = CNTR_ELEM("TxLaunchCsrParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_launch_csr_parity_err_cnt),
+[C_TX_ILLEGAL_CL_ERR] = CNTR_ELEM("TxIllegalVLErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_illegal_vl_err_cnt),
+[C_TX_SBRD_CTL_STATE_MACHINE_PARITY_ERR] = CNTR_ELEM(
+                       "TxSbrdCtlStateMachineParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_sbrd_ctl_state_machine_parity_err_cnt),
+[C_TX_RESERVED_10] = CNTR_ELEM("Tx Egress Reserved 10", 0, 0,
+                       CNTR_NORMAL,
+                       access_egress_reserved_10_err_cnt),
+[C_TX_RESERVED_9] = CNTR_ELEM("Tx Egress Reserved 9", 0, 0,
+                       CNTR_NORMAL,
+                       access_egress_reserved_9_err_cnt),
+[C_TX_SDMA_LAUNCH_INTF_PARITY_ERR] = CNTR_ELEM("TxSdmaLaunchIntfParityErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma_launch_intf_parity_err_cnt),
+[C_TX_PIO_LAUNCH_INTF_PARITY_ERR] = CNTR_ELEM("TxPioLaunchIntfParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_pio_launch_intf_parity_err_cnt),
+[C_TX_RESERVED_6] = CNTR_ELEM("Tx Egress Reserved 6", 0, 0,
+                       CNTR_NORMAL,
+                       access_egress_reserved_6_err_cnt),
+[C_TX_INCORRECT_LINK_STATE_ERR] = CNTR_ELEM("TxIncorrectLinkStateErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_incorrect_link_state_err_cnt),
+[C_TX_LINK_DOWN_ERR] = CNTR_ELEM("TxLinkdownErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_linkdown_err_cnt),
+[C_TX_EGRESS_FIFO_UNDERRUN_OR_PARITY_ERR] = CNTR_ELEM(
+                       "EgressFifoUnderrunOrParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_egress_fifi_underrun_or_parity_err_cnt),
+[C_TX_RESERVED_2] = CNTR_ELEM("Tx Egress Reserved 2", 0, 0,
+                       CNTR_NORMAL,
+                       access_egress_reserved_2_err_cnt),
+[C_TX_PKT_INTEGRITY_MEM_UNC_ERR] = CNTR_ELEM("TxPktIntegrityMemUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_pkt_integrity_mem_unc_err_cnt),
+[C_TX_PKT_INTEGRITY_MEM_COR_ERR] = CNTR_ELEM("TxPktIntegrityMemCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_pkt_integrity_mem_cor_err_cnt),
+/* SendErrStatus */
+[C_SEND_CSR_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("SendCsrWriteBadAddrErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_send_csr_write_bad_addr_err_cnt),
+[C_SEND_CSR_READ_BAD_ADD_ERR] = CNTR_ELEM("SendCsrReadBadAddrErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_send_csr_read_bad_addr_err_cnt),
+[C_SEND_CSR_PARITY_ERR] = CNTR_ELEM("SendCsrParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_send_csr_parity_cnt),
+/* SendCtxtErrStatus */
+[C_PIO_WRITE_OUT_OF_BOUNDS_ERR] = CNTR_ELEM("PioWriteOutOfBoundsErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_write_out_of_bounds_err_cnt),
+[C_PIO_WRITE_OVERFLOW_ERR] = CNTR_ELEM("PioWriteOverflowErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_write_overflow_err_cnt),
+[C_PIO_WRITE_CROSSES_BOUNDARY_ERR] = CNTR_ELEM("PioWriteCrossesBoundaryErr",
+                       0, 0, CNTR_NORMAL,
+                       access_pio_write_crosses_boundary_err_cnt),
+[C_PIO_DISALLOWED_PACKET_ERR] = CNTR_ELEM("PioDisallowedPacketErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_disallowed_packet_err_cnt),
+[C_PIO_INCONSISTENT_SOP_ERR] = CNTR_ELEM("PioInconsistentSopErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_inconsistent_sop_err_cnt),
+/* SendDmaEngErrStatus */
+[C_SDMA_HEADER_REQUEST_FIFO_COR_ERR] = CNTR_ELEM("SDmaHeaderRequestFifoCorErr",
+                       0, 0, CNTR_NORMAL,
+                       access_sdma_header_request_fifo_cor_err_cnt),
+[C_SDMA_HEADER_STORAGE_COR_ERR] = CNTR_ELEM("SDmaHeaderStorageCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_header_storage_cor_err_cnt),
+[C_SDMA_PACKET_TRACKING_COR_ERR] = CNTR_ELEM("SDmaPacketTrackingCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_packet_tracking_cor_err_cnt),
+[C_SDMA_ASSEMBLY_COR_ERR] = CNTR_ELEM("SDmaAssemblyCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_assembly_cor_err_cnt),
+[C_SDMA_DESC_TABLE_COR_ERR] = CNTR_ELEM("SDmaDescTableCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_desc_table_cor_err_cnt),
+[C_SDMA_HEADER_REQUEST_FIFO_UNC_ERR] = CNTR_ELEM("SDmaHeaderRequestFifoUncErr",
+                       0, 0, CNTR_NORMAL,
+                       access_sdma_header_request_fifo_unc_err_cnt),
+[C_SDMA_HEADER_STORAGE_UNC_ERR] = CNTR_ELEM("SDmaHeaderStorageUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_header_storage_unc_err_cnt),
+[C_SDMA_PACKET_TRACKING_UNC_ERR] = CNTR_ELEM("SDmaPacketTrackingUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_packet_tracking_unc_err_cnt),
+[C_SDMA_ASSEMBLY_UNC_ERR] = CNTR_ELEM("SDmaAssemblyUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_assembly_unc_err_cnt),
+[C_SDMA_DESC_TABLE_UNC_ERR] = CNTR_ELEM("SDmaDescTableUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_desc_table_unc_err_cnt),
+[C_SDMA_TIMEOUT_ERR] = CNTR_ELEM("SDmaTimeoutErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_timeout_err_cnt),
+[C_SDMA_HEADER_LENGTH_ERR] = CNTR_ELEM("SDmaHeaderLengthErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_header_length_err_cnt),
+[C_SDMA_HEADER_ADDRESS_ERR] = CNTR_ELEM("SDmaHeaderAddressErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_header_address_err_cnt),
+[C_SDMA_HEADER_SELECT_ERR] = CNTR_ELEM("SDmaHeaderSelectErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_header_select_err_cnt),
+[C_SMDA_RESERVED_9] = CNTR_ELEM("SDma Reserved 9", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_reserved_9_err_cnt),
+[C_SDMA_PACKET_DESC_OVERFLOW_ERR] = CNTR_ELEM("SDmaPacketDescOverflowErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_packet_desc_overflow_err_cnt),
+[C_SDMA_LENGTH_MISMATCH_ERR] = CNTR_ELEM("SDmaLengthMismatchErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_length_mismatch_err_cnt),
+[C_SDMA_HALT_ERR] = CNTR_ELEM("SDmaHaltErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_halt_err_cnt),
+[C_SDMA_MEM_READ_ERR] = CNTR_ELEM("SDmaMemReadErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_mem_read_err_cnt),
+[C_SDMA_FIRST_DESC_ERR] = CNTR_ELEM("SDmaFirstDescErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_first_desc_err_cnt),
+[C_SDMA_TAIL_OUT_OF_BOUNDS_ERR] = CNTR_ELEM("SDmaTailOutOfBoundsErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_tail_out_of_bounds_err_cnt),
+[C_SDMA_TOO_LONG_ERR] = CNTR_ELEM("SDmaTooLongErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_too_long_err_cnt),
+[C_SDMA_GEN_MISMATCH_ERR] = CNTR_ELEM("SDmaGenMismatchErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_gen_mismatch_err_cnt),
+[C_SDMA_WRONG_DW_ERR] = CNTR_ELEM("SDmaWrongDwErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_wrong_dw_err_cnt),
+};
+
+static struct cntr_entry port_cntrs[PORT_CNTR_LAST] = {
+[C_TX_UNSUP_VL] = TXE32_PORT_CNTR_ELEM(TxUnVLErr, SEND_UNSUP_VL_ERR_CNT,
+                       CNTR_NORMAL),
+[C_TX_INVAL_LEN] = TXE32_PORT_CNTR_ELEM(TxInvalLen, SEND_LEN_ERR_CNT,
+                       CNTR_NORMAL),
+[C_TX_MM_LEN_ERR] = TXE32_PORT_CNTR_ELEM(TxMMLenErr, SEND_MAX_MIN_LEN_ERR_CNT,
+                       CNTR_NORMAL),
+[C_TX_UNDERRUN] = TXE32_PORT_CNTR_ELEM(TxUnderrun, SEND_UNDERRUN_CNT,
+                       CNTR_NORMAL),
+[C_TX_FLOW_STALL] = TXE32_PORT_CNTR_ELEM(TxFlowStall, SEND_FLOW_STALL_CNT,
+                       CNTR_NORMAL),
+[C_TX_DROPPED] = TXE32_PORT_CNTR_ELEM(TxDropped, SEND_DROPPED_PKT_CNT,
+                       CNTR_NORMAL),
+[C_TX_HDR_ERR] = TXE32_PORT_CNTR_ELEM(TxHdrErr, SEND_HEADERS_ERR_CNT,
+                       CNTR_NORMAL),
+[C_TX_PKT] = TXE64_PORT_CNTR_ELEM(TxPkt, SEND_DATA_PKT_CNT, CNTR_NORMAL),
+[C_TX_WORDS] = TXE64_PORT_CNTR_ELEM(TxWords, SEND_DWORD_CNT, CNTR_NORMAL),
+[C_TX_WAIT] = TXE64_PORT_CNTR_ELEM(TxWait, SEND_WAIT_CNT, CNTR_SYNTH),
+[C_TX_FLIT_VL] = TXE64_PORT_CNTR_ELEM(TxFlitVL, SEND_DATA_VL0_CNT,
+                                     CNTR_SYNTH | CNTR_VL),
+[C_TX_PKT_VL] = TXE64_PORT_CNTR_ELEM(TxPktVL, SEND_DATA_PKT_VL0_CNT,
+                                    CNTR_SYNTH | CNTR_VL),
+[C_TX_WAIT_VL] = TXE64_PORT_CNTR_ELEM(TxWaitVL, SEND_WAIT_VL0_CNT,
+                                     CNTR_SYNTH | CNTR_VL),
+[C_RX_PKT] = RXE64_PORT_CNTR_ELEM(RxPkt, RCV_DATA_PKT_CNT, CNTR_NORMAL),
+[C_RX_WORDS] = RXE64_PORT_CNTR_ELEM(RxWords, RCV_DWORD_CNT, CNTR_NORMAL),
+[C_SW_LINK_DOWN] = CNTR_ELEM("SwLinkDown", 0, 0, CNTR_SYNTH | CNTR_32BIT,
+                            access_sw_link_dn_cnt),
+[C_SW_LINK_UP] = CNTR_ELEM("SwLinkUp", 0, 0, CNTR_SYNTH | CNTR_32BIT,
+                          access_sw_link_up_cnt),
+[C_SW_UNKNOWN_FRAME] = CNTR_ELEM("UnknownFrame", 0, 0, CNTR_NORMAL,
+                                access_sw_unknown_frame_cnt),
+[C_SW_XMIT_DSCD] = CNTR_ELEM("XmitDscd", 0, 0, CNTR_SYNTH | CNTR_32BIT,
+                            access_sw_xmit_discards),
+[C_SW_XMIT_DSCD_VL] = CNTR_ELEM("XmitDscdVl", 0, 0,
+                               CNTR_SYNTH | CNTR_32BIT | CNTR_VL,
+                               access_sw_xmit_discards),
+[C_SW_XMIT_CSTR_ERR] = CNTR_ELEM("XmitCstrErr", 0, 0, CNTR_SYNTH,
+                                access_xmit_constraint_errs),
+[C_SW_RCV_CSTR_ERR] = CNTR_ELEM("RcvCstrErr", 0, 0, CNTR_SYNTH,
+                               access_rcv_constraint_errs),
+[C_SW_IBP_LOOP_PKTS] = SW_IBP_CNTR(LoopPkts, loop_pkts),
+[C_SW_IBP_RC_RESENDS] = SW_IBP_CNTR(RcResend, rc_resends),
+[C_SW_IBP_RNR_NAKS] = SW_IBP_CNTR(RnrNak, rnr_naks),
+[C_SW_IBP_OTHER_NAKS] = SW_IBP_CNTR(OtherNak, other_naks),
+[C_SW_IBP_RC_TIMEOUTS] = SW_IBP_CNTR(RcTimeOut, rc_timeouts),
+[C_SW_IBP_PKT_DROPS] = SW_IBP_CNTR(PktDrop, pkt_drops),
+[C_SW_IBP_DMA_WAIT] = SW_IBP_CNTR(DmaWait, dmawait),
+[C_SW_IBP_RC_SEQNAK] = SW_IBP_CNTR(RcSeqNak, rc_seqnak),
+[C_SW_IBP_RC_DUPREQ] = SW_IBP_CNTR(RcDupRew, rc_dupreq),
+[C_SW_IBP_RDMA_SEQ] = SW_IBP_CNTR(RdmaSeq, rdma_seq),
+[C_SW_IBP_UNALIGNED] = SW_IBP_CNTR(Unaligned, unaligned),
+[C_SW_IBP_SEQ_NAK] = SW_IBP_CNTR(SeqNak, seq_naks),
+[C_SW_CPU_RC_ACKS] = CNTR_ELEM("RcAcks", 0, 0, CNTR_NORMAL,
+                              access_sw_cpu_rc_acks),
+[C_SW_CPU_RC_QACKS] = CNTR_ELEM("RcQacks", 0, 0, CNTR_NORMAL,
+                               access_sw_cpu_rc_qacks),
+[C_SW_CPU_RC_DELAYED_COMP] = CNTR_ELEM("RcDelayComp", 0, 0, CNTR_NORMAL,
+                                      access_sw_cpu_rc_delayed_comp),
+[OVR_LBL(0)] = OVR_ELM(0), [OVR_LBL(1)] = OVR_ELM(1),
+[OVR_LBL(2)] = OVR_ELM(2), [OVR_LBL(3)] = OVR_ELM(3),
+[OVR_LBL(4)] = OVR_ELM(4), [OVR_LBL(5)] = OVR_ELM(5),
+[OVR_LBL(6)] = OVR_ELM(6), [OVR_LBL(7)] = OVR_ELM(7),
+[OVR_LBL(8)] = OVR_ELM(8), [OVR_LBL(9)] = OVR_ELM(9),
+[OVR_LBL(10)] = OVR_ELM(10), [OVR_LBL(11)] = OVR_ELM(11),
+[OVR_LBL(12)] = OVR_ELM(12), [OVR_LBL(13)] = OVR_ELM(13),
+[OVR_LBL(14)] = OVR_ELM(14), [OVR_LBL(15)] = OVR_ELM(15),
+[OVR_LBL(16)] = OVR_ELM(16), [OVR_LBL(17)] = OVR_ELM(17),
+[OVR_LBL(18)] = OVR_ELM(18), [OVR_LBL(19)] = OVR_ELM(19),
+[OVR_LBL(20)] = OVR_ELM(20), [OVR_LBL(21)] = OVR_ELM(21),
+[OVR_LBL(22)] = OVR_ELM(22), [OVR_LBL(23)] = OVR_ELM(23),
+[OVR_LBL(24)] = OVR_ELM(24), [OVR_LBL(25)] = OVR_ELM(25),
+[OVR_LBL(26)] = OVR_ELM(26), [OVR_LBL(27)] = OVR_ELM(27),
+[OVR_LBL(28)] = OVR_ELM(28), [OVR_LBL(29)] = OVR_ELM(29),
+[OVR_LBL(30)] = OVR_ELM(30), [OVR_LBL(31)] = OVR_ELM(31),
+[OVR_LBL(32)] = OVR_ELM(32), [OVR_LBL(33)] = OVR_ELM(33),
+[OVR_LBL(34)] = OVR_ELM(34), [OVR_LBL(35)] = OVR_ELM(35),
+[OVR_LBL(36)] = OVR_ELM(36), [OVR_LBL(37)] = OVR_ELM(37),
+[OVR_LBL(38)] = OVR_ELM(38), [OVR_LBL(39)] = OVR_ELM(39),
+[OVR_LBL(40)] = OVR_ELM(40), [OVR_LBL(41)] = OVR_ELM(41),
+[OVR_LBL(42)] = OVR_ELM(42), [OVR_LBL(43)] = OVR_ELM(43),
+[OVR_LBL(44)] = OVR_ELM(44), [OVR_LBL(45)] = OVR_ELM(45),
+[OVR_LBL(46)] = OVR_ELM(46), [OVR_LBL(47)] = OVR_ELM(47),
+[OVR_LBL(48)] = OVR_ELM(48), [OVR_LBL(49)] = OVR_ELM(49),
+[OVR_LBL(50)] = OVR_ELM(50), [OVR_LBL(51)] = OVR_ELM(51),
+[OVR_LBL(52)] = OVR_ELM(52), [OVR_LBL(53)] = OVR_ELM(53),
+[OVR_LBL(54)] = OVR_ELM(54), [OVR_LBL(55)] = OVR_ELM(55),
+[OVR_LBL(56)] = OVR_ELM(56), [OVR_LBL(57)] = OVR_ELM(57),
+[OVR_LBL(58)] = OVR_ELM(58), [OVR_LBL(59)] = OVR_ELM(59),
+[OVR_LBL(60)] = OVR_ELM(60), [OVR_LBL(61)] = OVR_ELM(61),
+[OVR_LBL(62)] = OVR_ELM(62), [OVR_LBL(63)] = OVR_ELM(63),
+[OVR_LBL(64)] = OVR_ELM(64), [OVR_LBL(65)] = OVR_ELM(65),
+[OVR_LBL(66)] = OVR_ELM(66), [OVR_LBL(67)] = OVR_ELM(67),
+[OVR_LBL(68)] = OVR_ELM(68), [OVR_LBL(69)] = OVR_ELM(69),
+[OVR_LBL(70)] = OVR_ELM(70), [OVR_LBL(71)] = OVR_ELM(71),
+[OVR_LBL(72)] = OVR_ELM(72), [OVR_LBL(73)] = OVR_ELM(73),
+[OVR_LBL(74)] = OVR_ELM(74), [OVR_LBL(75)] = OVR_ELM(75),
+[OVR_LBL(76)] = OVR_ELM(76), [OVR_LBL(77)] = OVR_ELM(77),
+[OVR_LBL(78)] = OVR_ELM(78), [OVR_LBL(79)] = OVR_ELM(79),
+[OVR_LBL(80)] = OVR_ELM(80), [OVR_LBL(81)] = OVR_ELM(81),
+[OVR_LBL(82)] = OVR_ELM(82), [OVR_LBL(83)] = OVR_ELM(83),
+[OVR_LBL(84)] = OVR_ELM(84), [OVR_LBL(85)] = OVR_ELM(85),
+[OVR_LBL(86)] = OVR_ELM(86), [OVR_LBL(87)] = OVR_ELM(87),
+[OVR_LBL(88)] = OVR_ELM(88), [OVR_LBL(89)] = OVR_ELM(89),
+[OVR_LBL(90)] = OVR_ELM(90), [OVR_LBL(91)] = OVR_ELM(91),
+[OVR_LBL(92)] = OVR_ELM(92), [OVR_LBL(93)] = OVR_ELM(93),
+[OVR_LBL(94)] = OVR_ELM(94), [OVR_LBL(95)] = OVR_ELM(95),
+[OVR_LBL(96)] = OVR_ELM(96), [OVR_LBL(97)] = OVR_ELM(97),
+[OVR_LBL(98)] = OVR_ELM(98), [OVR_LBL(99)] = OVR_ELM(99),
+[OVR_LBL(100)] = OVR_ELM(100), [OVR_LBL(101)] = OVR_ELM(101),
+[OVR_LBL(102)] = OVR_ELM(102), [OVR_LBL(103)] = OVR_ELM(103),
+[OVR_LBL(104)] = OVR_ELM(104), [OVR_LBL(105)] = OVR_ELM(105),
+[OVR_LBL(106)] = OVR_ELM(106), [OVR_LBL(107)] = OVR_ELM(107),
+[OVR_LBL(108)] = OVR_ELM(108), [OVR_LBL(109)] = OVR_ELM(109),
+[OVR_LBL(110)] = OVR_ELM(110), [OVR_LBL(111)] = OVR_ELM(111),
+[OVR_LBL(112)] = OVR_ELM(112), [OVR_LBL(113)] = OVR_ELM(113),
+[OVR_LBL(114)] = OVR_ELM(114), [OVR_LBL(115)] = OVR_ELM(115),
+[OVR_LBL(116)] = OVR_ELM(116), [OVR_LBL(117)] = OVR_ELM(117),
+[OVR_LBL(118)] = OVR_ELM(118), [OVR_LBL(119)] = OVR_ELM(119),
+[OVR_LBL(120)] = OVR_ELM(120), [OVR_LBL(121)] = OVR_ELM(121),
+[OVR_LBL(122)] = OVR_ELM(122), [OVR_LBL(123)] = OVR_ELM(123),
+[OVR_LBL(124)] = OVR_ELM(124), [OVR_LBL(125)] = OVR_ELM(125),
+[OVR_LBL(126)] = OVR_ELM(126), [OVR_LBL(127)] = OVR_ELM(127),
+[OVR_LBL(128)] = OVR_ELM(128), [OVR_LBL(129)] = OVR_ELM(129),
+[OVR_LBL(130)] = OVR_ELM(130), [OVR_LBL(131)] = OVR_ELM(131),
+[OVR_LBL(132)] = OVR_ELM(132), [OVR_LBL(133)] = OVR_ELM(133),
+[OVR_LBL(134)] = OVR_ELM(134), [OVR_LBL(135)] = OVR_ELM(135),
+[OVR_LBL(136)] = OVR_ELM(136), [OVR_LBL(137)] = OVR_ELM(137),
+[OVR_LBL(138)] = OVR_ELM(138), [OVR_LBL(139)] = OVR_ELM(139),
+[OVR_LBL(140)] = OVR_ELM(140), [OVR_LBL(141)] = OVR_ELM(141),
+[OVR_LBL(142)] = OVR_ELM(142), [OVR_LBL(143)] = OVR_ELM(143),
+[OVR_LBL(144)] = OVR_ELM(144), [OVR_LBL(145)] = OVR_ELM(145),
+[OVR_LBL(146)] = OVR_ELM(146), [OVR_LBL(147)] = OVR_ELM(147),
+[OVR_LBL(148)] = OVR_ELM(148), [OVR_LBL(149)] = OVR_ELM(149),
+[OVR_LBL(150)] = OVR_ELM(150), [OVR_LBL(151)] = OVR_ELM(151),
+[OVR_LBL(152)] = OVR_ELM(152), [OVR_LBL(153)] = OVR_ELM(153),
+[OVR_LBL(154)] = OVR_ELM(154), [OVR_LBL(155)] = OVR_ELM(155),
+[OVR_LBL(156)] = OVR_ELM(156), [OVR_LBL(157)] = OVR_ELM(157),
+[OVR_LBL(158)] = OVR_ELM(158), [OVR_LBL(159)] = OVR_ELM(159),
+};
+
+/* ======================================================================== */
+
+/* return true if this is chip revision A */
+int is_ax(struct hfi1_devdata *dd)
+{
+       u8 chip_rev_minor =
+               dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT
+                       & CCE_REVISION_CHIP_REV_MINOR_MASK;
+       return (chip_rev_minor & 0xf0) == 0;
+}
+
+/* return true if this is chip revision B */
+int is_bx(struct hfi1_devdata *dd)
+{
+       u8 chip_rev_minor =
+               dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT
+                       & CCE_REVISION_CHIP_REV_MINOR_MASK;
+       return (chip_rev_minor & 0xF0) == 0x10;
+}
+
+/*
+ * Append string s to buffer buf.  Arguments curp and len are the current
+ * position and remaining length, respectively.
+ *
+ * return 0 on success, 1 on out of room
+ */
+static int append_str(char *buf, char **curp, int *lenp, const char *s)
+{
+       char *p = *curp;
+       int len = *lenp;
+       int result = 0; /* success */
+       char c;
+
+       /* add a comma if this is not the first string in the buffer */
+       if (p != buf) {
+               if (len == 0) {
+                       result = 1; /* out of room */
+                       goto done;
+               }
+               *p++ = ',';
+               len--;
+       }
+
+       /* copy the string */
+       while ((c = *s++) != 0) {
+               if (len == 0) {
+                       result = 1; /* out of room */
+                       goto done;
+               }
+               *p++ = c;
+               len--;
+       }
+
+done:
+       /* write return values */
+       *curp = p;
+       *lenp = len;
+
+       return result;
+}
+
+/*
+ * Using the given flag table, print a comma separated string into
+ * the buffer.  End in '*' if the buffer is too short.
+ */
+static char *flag_string(char *buf, int buf_len, u64 flags,
+                        struct flag_table *table, int table_size)
+{
+       char extra[32];
+       char *p = buf;
+       int len = buf_len;
+       int no_room = 0;
+       int i;
+
+       /* make sure there are at least 2 so we can form "*" and the nul */
+       if (len < 2)
+               return "";
+
+       len--;  /* leave room for a nul */
+       for (i = 0; i < table_size; i++) {
+               if (flags & table[i].flag) {
+                       no_room = append_str(buf, &p, &len, table[i].str);
+                       if (no_room)
+                               break;
+                       flags &= ~table[i].flag;
+               }
+       }
+
+       /* any undocumented bits left? */
+       if (!no_room && flags) {
+               snprintf(extra, sizeof(extra), "bits 0x%llx", flags);
+               no_room = append_str(buf, &p, &len, extra);
+       }
+
+       /* add * if ran out of room */
+       if (no_room) {
+               /* may need to back up to add space for a '*' */
+               if (len == 0)
+                       --p;
+               *p++ = '*';
+       }
+
+       /* add final nul - space already allocated above */
+       *p = 0;
+       return buf;
+}
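+
+/*
+ * Illustrative example (not tied to real hardware bits): with a table
+ * mapping bit 0 to "A" and bit 2 to "C", flags 0x7 would be rendered as
+ * "A,C,bits 0x2"; if the buffer runs out of room the string ends in '*'.
+ */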
+
+/* first 8 CCE error interrupt source names */
+static const char * const cce_misc_names[] = {
+       "CceErrInt",            /* 0 */
+       "RxeErrInt",            /* 1 */
+       "MiscErrInt",           /* 2 */
+       "Reserved3",            /* 3 */
+       "PioErrInt",            /* 4 */
+       "SDmaErrInt",           /* 5 */
+       "EgressErrInt",         /* 6 */
+       "TxeErrInt"             /* 7 */
+};
+
+/*
+ * Return the miscellaneous error interrupt name.
+ */
+static char *is_misc_err_name(char *buf, size_t bsize, unsigned int source)
+{
+       if (source < ARRAY_SIZE(cce_misc_names))
+               strncpy(buf, cce_misc_names[source], bsize);
+       else
+               snprintf(buf, bsize, "Reserved%u",
+                        source + IS_GENERAL_ERR_START);
+
+       return buf;
+}
+
+/*
+ * Return the SDMA engine error interrupt name.
+ */
+static char *is_sdma_eng_err_name(char *buf, size_t bsize, unsigned int source)
+{
+       snprintf(buf, bsize, "SDmaEngErrInt%u", source);
+       return buf;
+}
+
+/*
+ * Return the send context error interrupt name.
+ */
+static char *is_sendctxt_err_name(char *buf, size_t bsize, unsigned int source)
+{
+       snprintf(buf, bsize, "SendCtxtErrInt%u", source);
+       return buf;
+}
+
+static const char * const various_names[] = {
+       "PbcInt",
+       "GpioAssertInt",
+       "Qsfp1Int",
+       "Qsfp2Int",
+       "TCritInt"
+};
+
+/*
+ * Return the various interrupt name.
+ */
+static char *is_various_name(char *buf, size_t bsize, unsigned int source)
+{
+       if (source < ARRAY_SIZE(various_names))
+               strncpy(buf, various_names[source], bsize);
+       else
+               snprintf(buf, bsize, "Reserved%u", source + IS_VARIOUS_START);
+       return buf;
+}
+
+/*
+ * Return the DC interrupt name.
+ */
+static char *is_dc_name(char *buf, size_t bsize, unsigned int source)
+{
+       static const char * const dc_int_names[] = {
+               "common",
+               "lcb",
+               "8051",
+               "lbm"   /* local block merge */
+       };
+
+       if (source < ARRAY_SIZE(dc_int_names))
+               snprintf(buf, bsize, "dc_%s_int", dc_int_names[source]);
+       else
+               snprintf(buf, bsize, "DCInt%u", source);
+       return buf;
+}
+
+static const char * const sdma_int_names[] = {
+       "SDmaInt",
+       "SdmaIdleInt",
+       "SdmaProgressInt",
+};
+
+/*
+ * Return the SDMA engine interrupt name.
+ */
+static char *is_sdma_eng_name(char *buf, size_t bsize, unsigned int source)
+{
+       /* what interrupt */
+       unsigned int what  = source / TXE_NUM_SDMA_ENGINES;
+       /* which engine */
+       unsigned int which = source % TXE_NUM_SDMA_ENGINES;
+
+       if (likely(what < 3))
+               snprintf(buf, bsize, "%s%u", sdma_int_names[what], which);
+       else
+               snprintf(buf, bsize, "Invalid SDMA interrupt %u", source);
+       return buf;
+}
+
+/*
+ * Return the receive available interrupt name.
+ */
+static char *is_rcv_avail_name(char *buf, size_t bsize, unsigned int source)
+{
+       snprintf(buf, bsize, "RcvAvailInt%u", source);
+       return buf;
+}
+
+/*
+ * Return the receive urgent interrupt name.
+ */
+static char *is_rcv_urgent_name(char *buf, size_t bsize, unsigned int source)
+{
+       snprintf(buf, bsize, "RcvUrgentInt%u", source);
+       return buf;
+}
+
+/*
+ * Return the send credit interrupt name.
+ */
+static char *is_send_credit_name(char *buf, size_t bsize, unsigned int source)
+{
+       snprintf(buf, bsize, "SendCreditInt%u", source);
+       return buf;
+}
+
+/*
+ * Return the reserved interrupt name.
+ */
+static char *is_reserved_name(char *buf, size_t bsize, unsigned int source)
+{
+       snprintf(buf, bsize, "Reserved%u", source + IS_RESERVED_START);
+       return buf;
+}
+
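+/*
+ * Per-block error status decoders: each one simply wraps flag_string()
+ * with the flag table for its block.
+ */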
+static char *cce_err_status_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags,
+                          cce_err_status_flags,
+                          ARRAY_SIZE(cce_err_status_flags));
+}
+
+static char *rxe_err_status_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags,
+                          rxe_err_status_flags,
+                          ARRAY_SIZE(rxe_err_status_flags));
+}
+
+static char *misc_err_status_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags, misc_err_status_flags,
+                          ARRAY_SIZE(misc_err_status_flags));
+}
+
+static char *pio_err_status_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags,
+                          pio_err_status_flags,
+                          ARRAY_SIZE(pio_err_status_flags));
+}
+
+static char *sdma_err_status_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags,
+                          sdma_err_status_flags,
+                          ARRAY_SIZE(sdma_err_status_flags));
+}
+
+static char *egress_err_status_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags,
+                          egress_err_status_flags,
+                          ARRAY_SIZE(egress_err_status_flags));
+}
+
+static char *egress_err_info_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags,
+                          egress_err_info_flags,
+                          ARRAY_SIZE(egress_err_info_flags));
+}
+
+static char *send_err_status_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags,
+                          send_err_status_flags,
+                          ARRAY_SIZE(send_err_status_flags));
+}
+
+static void handle_cce_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       char buf[96];
+       int i = 0;
+
+       /*
+        * For most these errors, there is nothing that can be done except
+        * report or record it.
+        */
+       dd_dev_info(dd, "CCE Error: %s\n",
+                   cce_err_status_string(buf, sizeof(buf), reg));
+
+       if ((reg & CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK) &&
+           is_ax(dd) && (dd->icode != ICODE_FUNCTIONAL_SIMULATOR)) {
+               /* this error requires a manual drop into SPC freeze mode */
+               /* then a fix up */
+               start_freeze_handling(dd->pport, FREEZE_SELF);
+       }
+
+       for (i = 0; i < NUM_CCE_ERR_STATUS_COUNTERS; i++) {
+               if (reg & (1ull << i)) {
+                       incr_cntr64(&dd->cce_err_status_cnt[i]);
+                       /* maintain a counter over all cce_err_status errors */
+                       incr_cntr64(&dd->sw_cce_err_status_aggregate);
+               }
+       }
+}
+
+/*
+ * Check counters for receive errors that do not have an interrupt
+ * associated with them.
+ */
+#define RCVERR_CHECK_TIME 10
+static void update_rcverr_timer(unsigned long opaque)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)opaque;
+       struct hfi1_pportdata *ppd = dd->pport;
+       u32 cur_ovfl_cnt = read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL);
+
+       if (dd->rcv_ovfl_cnt < cur_ovfl_cnt &&
+           ppd->port_error_action & OPA_PI_MASK_EX_BUFFER_OVERRUN) {
+               dd_dev_info(dd, "%s: PortErrorAction bounce\n", __func__);
+               set_link_down_reason(
+               ppd, OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN, 0,
+               OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN);
+               queue_work(ppd->hfi1_wq, &ppd->link_bounce_work);
+       }
+       dd->rcv_ovfl_cnt = (u32)cur_ovfl_cnt;
+
+       mod_timer(&dd->rcverr_timer, jiffies + HZ * RCVERR_CHECK_TIME);
+}
+
+static int init_rcverr(struct hfi1_devdata *dd)
+{
+       setup_timer(&dd->rcverr_timer, update_rcverr_timer, (unsigned long)dd);
+       /* Assume the hardware counter has been reset */
+       dd->rcv_ovfl_cnt = 0;
+       return mod_timer(&dd->rcverr_timer, jiffies + HZ * RCVERR_CHECK_TIME);
+}
+
+static void free_rcverr(struct hfi1_devdata *dd)
+{
+       if (dd->rcverr_timer.data)
+               del_timer_sync(&dd->rcverr_timer);
+       dd->rcverr_timer.data = 0;
+}
+
+static void handle_rxe_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       char buf[96];
+       int i = 0;
+
+       dd_dev_info(dd, "Receive Error: %s\n",
+                   rxe_err_status_string(buf, sizeof(buf), reg));
+
+       if (reg & ALL_RXE_FREEZE_ERR) {
+               int flags = 0;
+
+               /*
+                * Freeze mode recovery is disabled for the errors
+                * in RXE_FREEZE_ABORT_MASK
+                */
+               if (is_ax(dd) && (reg & RXE_FREEZE_ABORT_MASK))
+                       flags = FREEZE_ABORT;
+
+               start_freeze_handling(dd->pport, flags);
+       }
+
+       for (i = 0; i < NUM_RCV_ERR_STATUS_COUNTERS; i++) {
+               if (reg & (1ull << i))
+                       incr_cntr64(&dd->rcv_err_status_cnt[i]);
+       }
+}
+
+static void handle_misc_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       char buf[96];
+       int i = 0;
+
+       dd_dev_info(dd, "Misc Error: %s",
+                   misc_err_status_string(buf, sizeof(buf), reg));
+       for (i = 0; i < NUM_MISC_ERR_STATUS_COUNTERS; i++) {
+               if (reg & (1ull << i))
+                       incr_cntr64(&dd->misc_err_status_cnt[i]);
+       }
+}
+
+static void handle_pio_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       char buf[96];
+       int i = 0;
+
+       dd_dev_info(dd, "PIO Error: %s\n",
+                   pio_err_status_string(buf, sizeof(buf), reg));
+
+       if (reg & ALL_PIO_FREEZE_ERR)
+               start_freeze_handling(dd->pport, 0);
+
+       for (i = 0; i < NUM_SEND_PIO_ERR_STATUS_COUNTERS; i++) {
+               if (reg & (1ull << i))
+                       incr_cntr64(&dd->send_pio_err_status_cnt[i]);
+       }
+}
+
+static void handle_sdma_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       char buf[96];
+       int i = 0;
+
+       dd_dev_info(dd, "SDMA Error: %s\n",
+                   sdma_err_status_string(buf, sizeof(buf), reg));
+
+       if (reg & ALL_SDMA_FREEZE_ERR)
+               start_freeze_handling(dd->pport, 0);
+
+       for (i = 0; i < NUM_SEND_DMA_ERR_STATUS_COUNTERS; i++) {
+               if (reg & (1ull << i))
+                       incr_cntr64(&dd->send_dma_err_status_cnt[i]);
+       }
+}
+
+static inline void __count_port_discards(struct hfi1_pportdata *ppd)
+{
+       incr_cntr64(&ppd->port_xmit_discards);
+}
+
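+/* A "port inactive" egress error is accounted as a port transmit discard. */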
+static void count_port_inactive(struct hfi1_devdata *dd)
+{
+       __count_port_discards(dd->pport);
+}
+
+/*
+ * We have had a "disallowed packet" error during egress. Determine the
+ * integrity check which failed and update the relevant error counters.
+ *
+ * Note that the SEND_EGRESS_ERR_INFO register has only a single
+ * bit of state per integrity check, and so we can miss the reason for an
+ * egress error if more than one packet fails the same integrity check
+ * since we cleared the corresponding bit in SEND_EGRESS_ERR_INFO.
+ */
+static void handle_send_egress_err_info(struct hfi1_devdata *dd,
+                                       int vl)
+{
+       struct hfi1_pportdata *ppd = dd->pport;
+       u64 src = read_csr(dd, SEND_EGRESS_ERR_SOURCE); /* read first */
+       u64 info = read_csr(dd, SEND_EGRESS_ERR_INFO);
+       char buf[96];
+
+       /* clear down all observed info as quickly as possible after read */
+       write_csr(dd, SEND_EGRESS_ERR_INFO, info);
+
+       dd_dev_info(dd,
+                   "Egress Error Info: 0x%llx, %s Egress Error Src 0x%llx\n",
+                   info, egress_err_info_string(buf, sizeof(buf), info), src);
+
+       /* Eventually add other counters for each bit */
+       if (info & PORT_DISCARD_EGRESS_ERRS) {
+               int weight, i;
+
+               /*
+                * Count all applicable bits as individual errors and
+                * attribute them to the packet that triggered this handler.
+                * This may not be completely accurate due to limitations
+                * on the available hardware error information.  There is
+                * a single information register and any number of error
+                * packets may have occurred and contributed to it before
+                * this routine is called.  This means that:
+                * a) If multiple packets with the same error occur before
+                *    this routine is called, earlier packets are missed.
+                *    There is only a single bit for each error type.
+                * b) Errors may not be attributed to the correct VL.
+                *    The driver is attributing all bits in the info register
+                *    to the packet that triggered this call, but bits
+                *    could be an accumulation of different packets with
+                *    different VLs.
+                * c) A single error packet may have multiple counts attached
+                *    to it.  There is no way for the driver to know if
+                *    multiple bits set in the info register are due to a
+                *    single packet or multiple packets.  The driver assumes
+                *    multiple packets.
+                */
+               weight = hweight64(info & PORT_DISCARD_EGRESS_ERRS);
+               for (i = 0; i < weight; i++) {
+                       __count_port_discards(ppd);
+                       if (vl >= 0 && vl < TXE_NUM_DATA_VL)
+                               incr_cntr64(&ppd->port_xmit_discards_vl[vl]);
+                       else if (vl == 15)
+                               incr_cntr64(&ppd->port_xmit_discards_vl
+                                           [C_VL_15]);
+               }
+       }
+}
+
+/*
+ * Input value is a bit position within the SEND_EGRESS_ERR_STATUS
+ * register. Does it represent a 'port inactive' error?
+ */
+static inline int port_inactive_err(u64 posn)
+{
+       return (posn >= SEES(TX_LINKDOWN) &&
+               posn <= SEES(TX_INCORRECT_LINK_STATE));
+}
+
+/*
+ * Input value is a bit position within the SEND_EGRESS_ERR_STATUS
+ * register. Does it represent a 'disallowed packet' error?
+ */
+static inline int disallowed_pkt_err(int posn)
+{
+       return (posn >= SEES(TX_SDMA0_DISALLOWED_PACKET) &&
+               posn <= SEES(TX_SDMA15_DISALLOWED_PACKET));
+}
+
+/*
+ * Input value is a bit position of one of the SDMA engine disallowed
+ * packet errors.  Return which engine.  Use of this must be guarded by
+ * disallowed_pkt_err().
+ */
+static inline int disallowed_pkt_engine(int posn)
+{
+       return posn - SEES(TX_SDMA0_DISALLOWED_PACKET);
+}
+
+/*
+ * Translate an SDMA engine to a VL.  Return -1 if the translation cannot
+ * be done.
+ */
+static int engine_to_vl(struct hfi1_devdata *dd, int engine)
+{
+       struct sdma_vl_map *m;
+       int vl;
+
+       /* range check */
+       if (engine < 0 || engine >= TXE_NUM_SDMA_ENGINES)
+               return -1;
+
+       rcu_read_lock();
+       m = rcu_dereference(dd->sdma_map);
+       vl = m->engine_to_vl[engine];
+       rcu_read_unlock();
+
+       return vl;
+}
+
+/*
+ * Translate the send context (software index) into a VL.  Return -1 if the
+ * translation cannot be done.
+ */
+static int sc_to_vl(struct hfi1_devdata *dd, int sw_index)
+{
+       struct send_context_info *sci;
+       struct send_context *sc;
+       int i;
+
+       sci = &dd->send_contexts[sw_index];
+
+       /* there is no information for user (PSM) and ack contexts */
+       if ((sci->type != SC_KERNEL) && (sci->type != SC_VL15))
+               return -1;
+
+       sc = sci->sc;
+       if (!sc)
+               return -1;
+       if (dd->vld[15].sc == sc)
+               return 15;
+       for (i = 0; i < num_vls; i++)
+               if (dd->vld[i].sc == sc)
+                       return i;
+
+       return -1;
+}
+
+static void handle_egress_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       u64 reg_copy = reg, handled = 0;
+       char buf[96];
+       int i = 0;
+
+       if (reg & ALL_TXE_EGRESS_FREEZE_ERR)
+               start_freeze_handling(dd->pport, 0);
+       else if (is_ax(dd) &&
+                (reg & SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_VL_ERR_SMASK) &&
+                (dd->icode != ICODE_FUNCTIONAL_SIMULATOR))
+               start_freeze_handling(dd->pport, 0);
+
+       while (reg_copy) {
+               int posn = fls64(reg_copy);
+               /* fls64() returns a 1-based offset, we want it zero based */
+               int shift = posn - 1;
+               u64 mask = 1ULL << shift;
+
+               if (port_inactive_err(shift)) {
+                       count_port_inactive(dd);
+                       handled |= mask;
+               } else if (disallowed_pkt_err(shift)) {
+                       int vl = engine_to_vl(dd, disallowed_pkt_engine(shift));
+
+                       handle_send_egress_err_info(dd, vl);
+                       handled |= mask;
+               }
+               reg_copy &= ~mask;
+       }
+
+       reg &= ~handled;
+
+       if (reg)
+               dd_dev_info(dd, "Egress Error: %s\n",
+                           egress_err_status_string(buf, sizeof(buf), reg));
+
+       for (i = 0; i < NUM_SEND_EGRESS_ERR_STATUS_COUNTERS; i++) {
+               if (reg & (1ull << i))
+                       incr_cntr64(&dd->send_egress_err_status_cnt[i]);
+       }
+}
+
+static void handle_txe_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       char buf[96];
+       int i = 0;
+
+       dd_dev_info(dd, "Send Error: %s\n",
+                   send_err_status_string(buf, sizeof(buf), reg));
+
+       for (i = 0; i < NUM_SEND_ERR_STATUS_COUNTERS; i++) {
+               if (reg & (1ull << i))
+                       incr_cntr64(&dd->send_err_status_cnt[i]);
+       }
+}
+
+/*
+ * The maximum number of times the error clear down will loop before
+ * blocking a repeating error.  This value is arbitrary.
+ */
+#define MAX_CLEAR_COUNT 20
+
+/*
+ * Clear and handle an error register.  All error interrupts are funneled
+ * through here to have a central location to correctly handle single-
+ * or multi-shot errors.
+ *
+ * For non per-context registers, call this routine with a context value
+ * of 0 so the per-context offset is zero.
+ *
+ * If the handler loops too many times, assume that something is wrong
+ * and can't be fixed, so mask the error bits.
+ */
+static void interrupt_clear_down(struct hfi1_devdata *dd,
+                                u32 context,
+                                const struct err_reg_info *eri)
+{
+       u64 reg;
+       u32 count;
+
+       /* read in a loop until no more errors are seen */
+       count = 0;
+       while (1) {
+               reg = read_kctxt_csr(dd, context, eri->status);
+               if (reg == 0)
+                       break;
+               write_kctxt_csr(dd, context, eri->clear, reg);
+               if (likely(eri->handler))
+                       eri->handler(dd, context, reg);
+               count++;
+               if (count > MAX_CLEAR_COUNT) {
+                       u64 mask;
+
+                       dd_dev_err(dd, "Repeating %s bits 0x%llx - masking\n",
+                                  eri->desc, reg);
+                       /*
+                        * Read-modify-write so any other masked bits
+                        * remain masked.
+                        */
+                       mask = read_kctxt_csr(dd, context, eri->mask);
+                       mask &= ~reg;
+                       write_kctxt_csr(dd, context, eri->mask, mask);
+                       break;
+               }
+       }
+}
+
+/*
+ * CCE block "misc" interrupt.  Source is < 16.
+ */
+static void is_misc_err_int(struct hfi1_devdata *dd, unsigned int source)
+{
+       const struct err_reg_info *eri = &misc_errs[source];
+
+       if (eri->handler) {
+               interrupt_clear_down(dd, 0, eri);
+       } else {
+               dd_dev_err(dd, "Unexpected misc interrupt (%u) - reserved\n",
+                          source);
+       }
+}
+
+static char *send_context_err_status_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags,
+                          sc_err_status_flags,
+                          ARRAY_SIZE(sc_err_status_flags));
+}
+
+/*
+ * Send context error interrupt.  Source (hw_context) is < 160.
+ *
+ * All send context errors cause the send context to halt.  The normal
+ * clear-down mechanism cannot be used because we cannot clear the
+ * error bits until several other long-running items are done first.
+ * This is OK because with the context halted, nothing else is going
+ * to happen on it anyway.
+ */
+static void is_sendctxt_err_int(struct hfi1_devdata *dd,
+                               unsigned int hw_context)
+{
+       struct send_context_info *sci;
+       struct send_context *sc;
+       char flags[96];
+       u64 status;
+       u32 sw_index;
+       int i = 0;
+
+       sw_index = dd->hw_to_sw[hw_context];
+       if (sw_index >= dd->num_send_contexts) {
+               dd_dev_err(dd,
+                          "out of range sw index %u for send context %u\n",
+                          sw_index, hw_context);
+               return;
+       }
+       sci = &dd->send_contexts[sw_index];
+       sc = sci->sc;
+       if (!sc) {
+               dd_dev_err(dd, "%s: context %u(%u): no sc?\n", __func__,
+                          sw_index, hw_context);
+               return;
+       }
+
+       /* tell the software that a halt has begun */
+       sc_stop(sc, SCF_HALTED);
+
+       status = read_kctxt_csr(dd, hw_context, SEND_CTXT_ERR_STATUS);
+
+       dd_dev_info(dd, "Send Context %u(%u) Error: %s\n", sw_index, hw_context,
+                   send_context_err_status_string(flags, sizeof(flags),
+                                                  status));
+
+       if (status & SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK)
+               handle_send_egress_err_info(dd, sc_to_vl(dd, sw_index));
+
+       /*
+        * Automatically restart halted kernel contexts out of interrupt
+        * context.  User contexts must ask the driver to restart the context.
+        */
+       if (sc->type != SC_USER)
+               queue_work(dd->pport->hfi1_wq, &sc->halt_work);
+
+       /*
+        * Update the counters for the corresponding status bits.
+        * Note that these particular counters are aggregated over all
+        * 160 contexts.
+        */
+       for (i = 0; i < NUM_SEND_CTXT_ERR_STATUS_COUNTERS; i++) {
+               if (status & (1ull << i))
+                       incr_cntr64(&dd->sw_ctxt_err_status_cnt[i]);
+       }
+}
+
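+/*
+ * Per-engine SDMA error: log it (when verbose), bump the engine's error
+ * count, and hand the status to the SDMA layer for recovery.
+ */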
+static void handle_sdma_eng_err(struct hfi1_devdata *dd,
+                               unsigned int source, u64 status)
+{
+       struct sdma_engine *sde;
+       int i = 0;
+
+       sde = &dd->per_sdma[source];
+#ifdef CONFIG_SDMA_VERBOSITY
+       dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
+                  slashstrip(__FILE__), __LINE__, __func__);
+       dd_dev_err(sde->dd, "CONFIG SDMA(%u) source: %u status 0x%llx\n",
+                  sde->this_idx, source, (unsigned long long)status);
+#endif
+       sde->err_cnt++;
+       sdma_engine_error(sde, status);
+
+       /*
+       * Update the counters for the corresponding status bits.
+       * Note that these particular counters are aggregated over
+       * all 16 DMA engines.
+       */
+       for (i = 0; i < NUM_SEND_DMA_ENG_ERR_STATUS_COUNTERS; i++) {
+               if (status & (1ull << i))
+                       incr_cntr64(&dd->sw_send_dma_eng_err_status_cnt[i]);
+       }
+}
+
+/*
+ * CCE block SDMA error interrupt.  Source is < 16.
+ */
+static void is_sdma_eng_err_int(struct hfi1_devdata *dd, unsigned int source)
+{
+#ifdef CONFIG_SDMA_VERBOSITY
+       struct sdma_engine *sde = &dd->per_sdma[source];
+
+       dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
+                  slashstrip(__FILE__), __LINE__, __func__);
+       dd_dev_err(dd, "CONFIG SDMA(%u) source: %u\n", sde->this_idx,
+                  source);
+       sdma_dumpstate(sde);
+#endif
+       interrupt_clear_down(dd, source, &sdma_eng_err);
+}
+
+/*
+ * CCE block "various" interrupt.  Source is < 8.
+ */
+static void is_various_int(struct hfi1_devdata *dd, unsigned int source)
+{
+       const struct err_reg_info *eri = &various_err[source];
+
+       /*
+        * TCritInt cannot go through interrupt_clear_down()
+        * because it is not a second tier interrupt. The handler
+        * should be called directly.
+        */
+       if (source == TCRIT_INT_SOURCE)
+               handle_temp_err(dd);
+       else if (eri->handler)
+               interrupt_clear_down(dd, 0, eri);
+       else
+               dd_dev_info(dd,
+                           "%s: Unimplemented/reserved interrupt %d\n",
+                           __func__, source);
+}
+
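+/*
+ * QSFP interrupt handler: reacts to module presence changes (insertion or
+ * removal) and to interrupts asserted by the module itself.
+ */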
+static void handle_qsfp_int(struct hfi1_devdata *dd, u32 src_ctx, u64 reg)
+{
+       /* src_ctx is always zero */
+       struct hfi1_pportdata *ppd = dd->pport;
+       unsigned long flags;
+       u64 qsfp_int_mgmt = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N);
+
+       if (reg & QSFP_HFI0_MODPRST_N) {
+               if (!qsfp_mod_present(ppd)) {
+                       dd_dev_info(dd, "%s: QSFP module removed\n",
+                                   __func__);
+
+                       ppd->driver_link_ready = 0;
+                       /*
+                        * Cable removed, reset all our information about the
+                        * cache and cable capabilities
+                        */
+
+                       spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+                       /*
+                        * We don't set cache_refresh_required here as we expect
+                        * an interrupt when a cable is inserted
+                        */
+                       ppd->qsfp_info.cache_valid = 0;
+                       ppd->qsfp_info.reset_needed = 0;
+                       ppd->qsfp_info.limiting_active = 0;
+                       spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
+                                              flags);
+                       /* Invert the ModPresent pin now to detect plug-in */
+                       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_INVERT :
+                                 ASIC_QSFP1_INVERT, qsfp_int_mgmt);
+
+                       if ((ppd->offline_disabled_reason >
+                         HFI1_ODR_MASK(
+                         OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED)) ||
+                         (ppd->offline_disabled_reason ==
+                         HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE)))
+                               ppd->offline_disabled_reason =
+                               HFI1_ODR_MASK(
+                               OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED);
+
+                       if (ppd->host_link_state == HLS_DN_POLL) {
+                               /*
+                                * The link is still in POLL. This means
+                                * that the normal link down processing
+                                * will not happen. We have to do it here
+                                * before turning the DC off.
+                                */
+                               queue_work(ppd->hfi1_wq, &ppd->link_down_work);
+                       }
+               } else {
+                       dd_dev_info(dd, "%s: QSFP module inserted\n",
+                                   __func__);
+
+                       spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+                       ppd->qsfp_info.cache_valid = 0;
+                       ppd->qsfp_info.cache_refresh_required = 1;
+                       spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
+                                              flags);
+
+                       /*
+                        * Stop inversion of ModPresent pin to detect
+                        * removal of the cable
+                        */
+                       qsfp_int_mgmt &= ~(u64)QSFP_HFI0_MODPRST_N;
+                       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_INVERT :
+                                 ASIC_QSFP1_INVERT, qsfp_int_mgmt);
+
+                       ppd->offline_disabled_reason =
+                               HFI1_ODR_MASK(OPA_LINKDOWN_REASON_TRANSIENT);
+               }
+       }
+
+       if (reg & QSFP_HFI0_INT_N) {
+               dd_dev_info(dd, "%s: Interrupt received from QSFP module\n",
+                           __func__);
+               spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+               ppd->qsfp_info.check_interrupt_flags = 1;
+               spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags);
+       }
+
+       /* Schedule the QSFP work only if there is a cable attached. */
+       if (qsfp_mod_present(ppd))
+               queue_work(ppd->hfi1_wq, &ppd->qsfp_info.qsfp_work);
+}
+
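+/* Ask the 8051 firmware to hand LCB CSR access over to the host. */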
+static int request_host_lcb_access(struct hfi1_devdata *dd)
+{
+       int ret;
+
+       ret = do_8051_command(dd, HCMD_MISC,
+                             (u64)HCMD_MISC_REQUEST_LCB_ACCESS <<
+                             LOAD_DATA_FIELD_ID_SHIFT, NULL);
+       if (ret != HCMD_SUCCESS) {
+               dd_dev_err(dd, "%s: command failed with error %d\n",
+                          __func__, ret);
+       }
+       return ret == HCMD_SUCCESS ? 0 : -EBUSY;
+}
+
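+/* Tell the 8051 firmware it may take LCB CSR access back from the host. */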
+static int request_8051_lcb_access(struct hfi1_devdata *dd)
+{
+       int ret;
+
+       ret = do_8051_command(dd, HCMD_MISC,
+                             (u64)HCMD_MISC_GRANT_LCB_ACCESS <<
+                             LOAD_DATA_FIELD_ID_SHIFT, NULL);
+       if (ret != HCMD_SUCCESS) {
+               dd_dev_err(dd, "%s: command failed with error %d\n",
+                          __func__, ret);
+       }
+       return ret == HCMD_SUCCESS ? 0 : -EBUSY;
+}
+
+/*
+ * Set the LCB selector - allow host access.  The DCC selector always
+ * points to the host.
+ */
+static inline void set_host_lcb_access(struct hfi1_devdata *dd)
+{
+       write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL,
+                 DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK |
+                 DC_DC8051_CFG_CSR_ACCESS_SEL_LCB_SMASK);
+}
+
+/*
+ * Clear the LCB selector - allow 8051 access.  The DCC selector always
+ * points to the host.
+ */
+static inline void set_8051_lcb_access(struct hfi1_devdata *dd)
+{
+       write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL,
+                 DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK);
+}
+
+/*
+ * Acquire LCB access from the 8051.  If the host already has access,
+ * just increment a counter.  Otherwise, inform the 8051 that the
+ * host is taking access.
+ *
+ * Returns:
+ *     0 on success
+ *     -EBUSY if the 8051 has control and cannot be disturbed
+ *     -errno if unable to acquire access from the 8051
+ */
+int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok)
+{
+       struct hfi1_pportdata *ppd = dd->pport;
+       int ret = 0;
+
+       /*
+        * Use the host link state lock so the operation of this routine
+        * { link state check, selector change, count increment } can occur
+        * as a unit against a link state change.  Otherwise there is a
+        * race between the state change and the count increment.
+        */
+       if (sleep_ok) {
+               mutex_lock(&ppd->hls_lock);
+       } else {
+               while (!mutex_trylock(&ppd->hls_lock))
+                       udelay(1);
+       }
+
+       /* this access is valid only when the link is up */
+       if (ppd->host_link_state & HLS_DOWN) {
+               dd_dev_info(dd, "%s: link state %s not up\n",
+                           __func__, link_state_name(ppd->host_link_state));
+               ret = -EBUSY;
+               goto done;
+       }
+
+       if (dd->lcb_access_count == 0) {
+               ret = request_host_lcb_access(dd);
+               if (ret) {
+                       dd_dev_err(dd,
+                                  "%s: unable to acquire LCB access, err %d\n",
+                                  __func__, ret);
+                       goto done;
+               }
+               set_host_lcb_access(dd);
+       }
+       dd->lcb_access_count++;
+done:
+       mutex_unlock(&ppd->hls_lock);
+       return ret;
+}
+
+/*
+ * Release LCB access by decrementing the use count.  If the count is moving
+ * from 1 to 0, inform 8051 that it has control back.
+ *
+ * Returns:
+ *     0 on success
+ *     -errno if unable to release access to the 8051
+ */
+int release_lcb_access(struct hfi1_devdata *dd, int sleep_ok)
+{
+       int ret = 0;
+
+       /*
+        * Use the host link state lock because the acquire needed it.
+        * Here, we only need to keep { selector change, count decrement }
+        * as a unit.
+        */
+       if (sleep_ok) {
+               mutex_lock(&dd->pport->hls_lock);
+       } else {
+               while (!mutex_trylock(&dd->pport->hls_lock))
+                       udelay(1);
+       }
+
+       if (dd->lcb_access_count == 0) {
+               dd_dev_err(dd, "%s: LCB access count is zero.  Skipping.\n",
+                          __func__);
+               goto done;
+       }
+
+       if (dd->lcb_access_count == 1) {
+               set_8051_lcb_access(dd);
+               ret = request_8051_lcb_access(dd);
+               if (ret) {
+                       dd_dev_err(dd,
+                                  "%s: unable to release LCB access, err %d\n",
+                                  __func__, ret);
+                       /* restore host access if the grant didn't work */
+                       set_host_lcb_access(dd);
+                       goto done;
+               }
+       }
+       dd->lcb_access_count--;
+done:
+       mutex_unlock(&dd->pport->hls_lock);
+       return ret;
+}
+
+/*
+ * Initialize LCB access variables and state.  Called during driver load,
+ * after most of the initialization is finished.
+ *
+ * The DC default is LCB access on for the host.  The driver defaults to
+ * leaving access to the 8051.  Assign access now - this constrains the call
+ * to this routine to be after all LCB set-up is done.  In particular, after
+ * hfi1_init_dd() -> set_up_interrupts() -> clear_all_interrupts()
+ */
+static void init_lcb_access(struct hfi1_devdata *dd)
+{
+       dd->lcb_access_count = 0;
+}
+
+/*
+ * Write a response back to a 8051 request.
+ */
+static void hreq_response(struct hfi1_devdata *dd, u8 return_code, u16 rsp_data)
+{
+       write_csr(dd, DC_DC8051_CFG_EXT_DEV_0,
+                 DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK |
+                 (u64)return_code <<
+                 DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT |
+                 (u64)rsp_data << DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT);
+}
+
+/*
+ * Handle host requests from the 8051.
+ */
+static void handle_8051_request(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 reg;
+       u16 data = 0;
+       u8 type;
+
+       reg = read_csr(dd, DC_DC8051_CFG_EXT_DEV_1);
+       if ((reg & DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK) == 0)
+               return; /* no request */
+
+       /* zero out COMPLETED so the response is seen */
+       write_csr(dd, DC_DC8051_CFG_EXT_DEV_0, 0);
+
+       /* extract request details */
+       type = (reg >> DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_SHIFT)
+                       & DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_MASK;
+       data = (reg >> DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT)
+                       & DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_MASK;
+
+       switch (type) {
+       case HREQ_LOAD_CONFIG:
+       case HREQ_SAVE_CONFIG:
+       case HREQ_READ_CONFIG:
+       case HREQ_SET_TX_EQ_ABS:
+       case HREQ_SET_TX_EQ_REL:
+       case HREQ_ENABLE:
+               dd_dev_info(dd, "8051 request: request 0x%x not supported\n",
+                           type);
+               hreq_response(dd, HREQ_NOT_SUPPORTED, 0);
+               break;
+       case HREQ_CONFIG_DONE:
+               hreq_response(dd, HREQ_SUCCESS, 0);
+               break;
+
+       case HREQ_INTERFACE_TEST:
+               hreq_response(dd, HREQ_SUCCESS, data);
+               break;
+       default:
+               dd_dev_err(dd, "8051 request: unknown request 0x%x\n", type);
+               hreq_response(dd, HREQ_NOT_SUPPORTED, 0);
+               break;
+       }
+}
+
+static void write_global_credit(struct hfi1_devdata *dd,
+                               u8 vau, u16 total, u16 shared)
+{
+       write_csr(dd, SEND_CM_GLOBAL_CREDIT,
+                 ((u64)total <<
+                  SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT) |
+                 ((u64)shared <<
+                  SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT) |
+                 ((u64)vau << SEND_CM_GLOBAL_CREDIT_AU_SHIFT));
+}
+
+/*
+ * Set up initial VL15 credits of the remote.  Assumes the rest of
+ * the CM credit registers are zero from a previous global or credit reset.
+ */
+void set_up_vl15(struct hfi1_devdata *dd, u8 vau, u16 vl15buf)
+{
+       /* leave shared count at zero for both global and VL15 */
+       write_global_credit(dd, vau, vl15buf, 0);
+
+       /* We may need some credits for another VL when sending packets
+        * with the snoop interface. Dividing it down the middle for VL15
+        * and VL0 should suffice.
+        */
+       if (unlikely(dd->hfi1_snoop.mode_flag == HFI1_PORT_SNOOP_MODE)) {
+               write_csr(dd, SEND_CM_CREDIT_VL15, (u64)(vl15buf >> 1)
+                   << SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT);
+               write_csr(dd, SEND_CM_CREDIT_VL, (u64)(vl15buf >> 1)
+                   << SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT);
+       } else {
+               write_csr(dd, SEND_CM_CREDIT_VL15, (u64)vl15buf
+                       << SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT);
+       }
+}
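+
+/*
+ * Worked example: set_up_vl15(dd, 1, 0x100) programs a total credit limit
+ * of 0x100 with a shared limit of 0 and a vAU of 1.  Outside of snoop mode
+ * the full 0x100 is dedicated to VL15; in snoop mode it is split evenly,
+ * 0x80 to VL15 and 0x80 to VL0.
+ */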
+
+/*
+ * Zero all credit details from the previous connection and
+ * reset the CM manager's internal counters.
+ */
+void reset_link_credits(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /* remove all previous VL credit limits */
+       for (i = 0; i < TXE_NUM_DATA_VL; i++)
+               write_csr(dd, SEND_CM_CREDIT_VL + (8 * i), 0);
+       write_csr(dd, SEND_CM_CREDIT_VL15, 0);
+       write_global_credit(dd, 0, 0, 0);
+       /* reset the CM block */
+       pio_send_control(dd, PSC_CM_RESET);
+}
+
+/* convert a vCU to a CU */
+static u32 vcu_to_cu(u8 vcu)
+{
+       return 1 << vcu;
+}
+
+/* convert a CU to a vCU */
+static u8 cu_to_vcu(u32 cu)
+{
+       return ilog2(cu);
+}
+
+/* convert a vAU to an AU */
+static u32 vau_to_au(u8 vau)
+{
+       return 8 * (1 << vau);
+}
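+
+/*
+ * Worked example: a peer advertising vAU = 2 uses allocation units of
+ * vau_to_au(2) = 8 * (1 << 2) = 32 bytes; a CU of 8 maps to
+ * cu_to_vcu(8) = ilog2(8) = 3, and back again via vcu_to_cu(3) = 1 << 3 = 8.
+ */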
+
+static void set_linkup_defaults(struct hfi1_pportdata *ppd)
+{
+       ppd->sm_trap_qp = 0x0;
+       ppd->sa_qp = 0x1;
+}
+
+/*
+ * Graceful LCB shutdown.  This leaves the LCB FIFOs in reset.
+ */
+static void lcb_shutdown(struct hfi1_devdata *dd, int abort)
+{
+       u64 reg;
+
+       /* clear lcb run: LCB_CFG_RUN.EN = 0 */
+       write_csr(dd, DC_LCB_CFG_RUN, 0);
+       /* set tx fifo reset: LCB_CFG_TX_FIFOS_RESET.VAL = 1 */
+       write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET,
+                 1ull << DC_LCB_CFG_TX_FIFOS_RESET_VAL_SHIFT);
+       /* set dcc reset csr: DCC_CFG_RESET.{reset_lcb,reset_rx_fpe} = 1 */
+       dd->lcb_err_en = read_csr(dd, DC_LCB_ERR_EN);
+       reg = read_csr(dd, DCC_CFG_RESET);
+       write_csr(dd, DCC_CFG_RESET, reg |
+                 (1ull << DCC_CFG_RESET_RESET_LCB_SHIFT) |
+                 (1ull << DCC_CFG_RESET_RESET_RX_FPE_SHIFT));
+       (void)read_csr(dd, DCC_CFG_RESET); /* make sure the write completed */
+       if (!abort) {
+               udelay(1);    /* must hold for the longer of 16cclks or 20ns */
+               write_csr(dd, DCC_CFG_RESET, reg);
+               write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en);
+       }
+}
+
+/*
+ * This routine should be called after the link has been transitioned to
+ * OFFLINE (OFFLINE state has the side effect of putting the SerDes into
+ * reset).
+ *
+ * The expectation is that the caller of this routine would have taken
+ * care of properly transitioning the link into the correct state.
+ */
+static void dc_shutdown(struct hfi1_devdata *dd)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&dd->dc8051_lock, flags);
+       if (dd->dc_shutdown) {
+               spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+               return;
+       }
+       dd->dc_shutdown = 1;
+       spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+       /* Shutdown the LCB */
+       lcb_shutdown(dd, 1);
+       /*
+        * Going to OFFLINE would have caused the 8051 to put the
+        * SerDes into reset already. Just need to shut down the 8051
+        * itself.
+        */
+       write_csr(dd, DC_DC8051_CFG_RST, 0x1);
+}
+
+/*
+ * Calling this after the DC has been brought out of reset should not
+ * do any damage.
+ */
+static void dc_start(struct hfi1_devdata *dd)
+{
+       unsigned long flags;
+       int ret;
+
+       spin_lock_irqsave(&dd->dc8051_lock, flags);
+       if (!dd->dc_shutdown)
+               goto done;
+       spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+       /* Take the 8051 out of reset */
+       write_csr(dd, DC_DC8051_CFG_RST, 0ull);
+       /* Wait until 8051 is ready */
+       ret = wait_fm_ready(dd, TIMEOUT_8051_START);
+       if (ret) {
+               dd_dev_err(dd, "%s: timeout starting 8051 firmware\n",
+                          __func__);
+       }
+       /* Take away reset for LCB and RX FPE (set in lcb_shutdown). */
+       write_csr(dd, DCC_CFG_RESET, 0x10);
+       /* lcb_shutdown() with abort=1 does not restore these */
+       write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en);
+       spin_lock_irqsave(&dd->dc8051_lock, flags);
+       dd->dc_shutdown = 0;
+done:
+       spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+}
+
+/*
+ * These LCB adjustments are for the Aurora SerDes core in the FPGA.
+ */
+static void adjust_lcb_for_fpga_serdes(struct hfi1_devdata *dd)
+{
+       u64 rx_radr, tx_radr;
+       u32 version;
+
+       if (dd->icode != ICODE_FPGA_EMULATION)
+               return;
+
+       /*
+        * These LCB defaults on emulator _s are good, nothing to do here:
+        *      LCB_CFG_TX_FIFOS_RADR
+        *      LCB_CFG_RX_FIFOS_RADR
+        *      LCB_CFG_LN_DCLK
+        *      LCB_CFG_IGNORE_LOST_RCLK
+        */
+       if (is_emulator_s(dd))
+               return;
+       /* else this is _p */
+
+       version = emulator_rev(dd);
+       if (!is_ax(dd))
+               version = 0x2d; /* all B0 use 0x2d or higher settings */
+
+       if (version <= 0x12) {
+               /* release 0x12 and below */
+
+               /*
+                * LCB_CFG_RX_FIFOS_RADR.RST_VAL = 0x9
+                * LCB_CFG_RX_FIFOS_RADR.OK_TO_JUMP_VAL = 0x9
+                * LCB_CFG_RX_FIFOS_RADR.DO_NOT_JUMP_VAL = 0xa
+                */
+               rx_radr =
+                     0xaull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+                   | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+                   | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+               /*
+                * LCB_CFG_TX_FIFOS_RADR.ON_REINIT = 0 (default)
+                * LCB_CFG_TX_FIFOS_RADR.RST_VAL = 6
+                */
+               tx_radr = 6ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+       } else if (version <= 0x18) {
+               /* release 0x13 up to 0x18 */
+               /* LCB_CFG_RX_FIFOS_RADR = 0x988 */
+               rx_radr =
+                     0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+                   | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+                   | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+               tx_radr = 7ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+       } else if (version == 0x19) {
+               /* release 0x19 */
+               /* LCB_CFG_RX_FIFOS_RADR = 0xa99 */
+               rx_radr =
+                     0xAull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+                   | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+                   | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+               tx_radr = 3ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+       } else if (version == 0x1a) {
+               /* release 0x1a */
+               /* LCB_CFG_RX_FIFOS_RADR = 0x988 */
+               rx_radr =
+                     0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+                   | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+                   | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+               tx_radr = 7ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+               write_csr(dd, DC_LCB_CFG_LN_DCLK, 1ull);
+       } else {
+               /* release 0x1b and higher */
+               /* LCB_CFG_RX_FIFOS_RADR = 0x877 */
+               rx_radr =
+                     0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+                   | 0x7ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+                   | 0x7ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+               tx_radr = 3ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+       }
+
+       write_csr(dd, DC_LCB_CFG_RX_FIFOS_RADR, rx_radr);
+       /* LCB_CFG_IGNORE_LOST_RCLK.EN = 1 */
+       write_csr(dd, DC_LCB_CFG_IGNORE_LOST_RCLK,
+                 DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK);
+       write_csr(dd, DC_LCB_CFG_TX_FIFOS_RADR, tx_radr);
+}
+
+/*
+ * Handle an SMA idle message
+ *
+ * This is a work-queue function outside of the interrupt.
+ */
+void handle_sma_message(struct work_struct *work)
+{
+       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+                                                       sma_message_work);
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 msg;
+       int ret;
+
+       /*
+        * msg is bytes 1-4 of the 40-bit idle message - the command code
+        * is stripped off
+        */
+       ret = read_idle_sma(dd, &msg);
+       if (ret)
+               return;
+       dd_dev_info(dd, "%s: SMA message 0x%llx\n", __func__, msg);
+       /*
+        * React to the SMA message.  Byte[1] (0 for us) is the command.
+        */
+       switch (msg & 0xff) {
+       case SMA_IDLE_ARM:
+               /*
+                * See OPAv1 table 9-14 - HFI and External Switch Ports Key
+                * State Transitions
+                *
+                * Only expected in INIT or ARMED, discard otherwise.
+                */
+               if (ppd->host_link_state & (HLS_UP_INIT | HLS_UP_ARMED))
+                       ppd->neighbor_normal = 1;
+               break;
+       case SMA_IDLE_ACTIVE:
+               /*
+                * See OPAv1 table 9-14 - HFI and External Switch Ports Key
+                * State Transitions
+                *
+                * Can activate the node.  Discard otherwise.
+                */
+               if (ppd->host_link_state == HLS_UP_ARMED &&
+                   ppd->is_active_optimize_enabled) {
+                       ppd->neighbor_normal = 1;
+                       ret = set_link_state(ppd, HLS_UP_ACTIVE);
+                       if (ret)
+                               dd_dev_err(
+                                       dd,
+                                       "%s: received Active SMA idle message, couldn't set link to Active\n",
+                                       __func__);
+               }
+               break;
+       default:
+               dd_dev_err(dd,
+                          "%s: received unexpected SMA idle message 0x%llx\n",
+                          __func__, msg);
+               break;
+       }
+}
+
+static void adjust_rcvctrl(struct hfi1_devdata *dd, u64 add, u64 clear)
+{
+       u64 rcvctrl;
+       unsigned long flags;
+
+       spin_lock_irqsave(&dd->rcvctrl_lock, flags);
+       rcvctrl = read_csr(dd, RCV_CTRL);
+       rcvctrl |= add;
+       rcvctrl &= ~clear;
+       write_csr(dd, RCV_CTRL, rcvctrl);
+       spin_unlock_irqrestore(&dd->rcvctrl_lock, flags);
+}
+
+static inline void add_rcvctrl(struct hfi1_devdata *dd, u64 add)
+{
+       adjust_rcvctrl(dd, add, 0);
+}
+
+static inline void clear_rcvctrl(struct hfi1_devdata *dd, u64 clear)
+{
+       adjust_rcvctrl(dd, 0, clear);
+}
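+
+/*
+ * Example: add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK) sets the port
+ * enable bit with a read-modify-write of RCV_CTRL under rcvctrl_lock, so
+ * concurrent adjustments do not lose bits.
+ */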
+
+/*
+ * Called from all interrupt handlers to start handling an SPC freeze.
+ */
+void start_freeze_handling(struct hfi1_pportdata *ppd, int flags)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       struct send_context *sc;
+       int i;
+
+       if (flags & FREEZE_SELF)
+               write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK);
+
+       /* enter frozen mode */
+       dd->flags |= HFI1_FROZEN;
+
+       /* notify all SDMA engines that they are going into a freeze */
+       sdma_freeze_notify(dd, !!(flags & FREEZE_LINK_DOWN));
+
+       /* do halt pre-handling on all enabled send contexts */
+       for (i = 0; i < dd->num_send_contexts; i++) {
+               sc = dd->send_contexts[i].sc;
+               if (sc && (sc->flags & SCF_ENABLED))
+                       sc_stop(sc, SCF_FROZEN | SCF_HALTED);
+       }
+
+       /* Send contexts are frozen. Notify user space */
+       hfi1_set_uevent_bits(ppd, _HFI1_EVENT_FROZEN_BIT);
+
+       if (flags & FREEZE_ABORT) {
+               dd_dev_err(dd,
+                          "Aborted freeze recovery. Please REBOOT system\n");
+               return;
+       }
+       /* queue non-interrupt handler */
+       queue_work(ppd->hfi1_wq, &ppd->freeze_work);
+}
+
+/*
+ * Wait until all 4 sub-blocks indicate that they have frozen or unfrozen,
+ * depending on the "freeze" parameter.
+ *
+ * No need to return an error if it times out, our only option
+ * is to proceed anyway.
+ */
+static void wait_for_freeze_status(struct hfi1_devdata *dd, int freeze)
+{
+       unsigned long timeout;
+       u64 reg;
+
+       timeout = jiffies + msecs_to_jiffies(FREEZE_STATUS_TIMEOUT);
+       while (1) {
+               reg = read_csr(dd, CCE_STATUS);
+               if (freeze) {
+                       /* waiting until all indicators are set */
+                       if ((reg & ALL_FROZE) == ALL_FROZE)
+                               return; /* all done */
+               } else {
+                       /* waiting until all indicators are clear */
+                       if ((reg & ALL_FROZE) == 0)
+                               return; /* all done */
+               }
+
+               if (time_after(jiffies, timeout)) {
+                       dd_dev_err(dd,
+                                  "Timed out waiting for SPC %sfreeze, bits 0x%llx, expecting 0x%llx, continuing",
+                                  freeze ? "" : "un", reg & ALL_FROZE,
+                                  freeze ? ALL_FROZE : 0ull);
+                       return;
+               }
+               usleep_range(80, 120);
+       }
+}
+
+/*
+ * Do all freeze handling for the RXE block.
+ */
+static void rxe_freeze(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /* disable port */
+       clear_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+
+       /* disable all receive contexts */
+       for (i = 0; i < dd->num_rcv_contexts; i++)
+               hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS, i);
+}
+
+/*
+ * Unfreeze handling for the RXE block - kernel contexts only.
+ * This will also enable the port.  User contexts will do unfreeze
+ * handling on a per-context basis as they call into the driver.
+ */
+static void rxe_kernel_unfreeze(struct hfi1_devdata *dd)
+{
+       u32 rcvmask;
+       int i;
+
+       /* enable all kernel contexts */
+       for (i = 0; i < dd->n_krcv_queues; i++) {
+               rcvmask = HFI1_RCVCTRL_CTXT_ENB;
+               /* HFI1_RCVCTRL_TAILUPD_[ENB|DIS] needs to be set explicitly */
+               rcvmask |= HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, DMA_RTAIL) ?
+                       HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS;
+               hfi1_rcvctrl(dd, rcvmask, i);
+       }
+
+       /* enable port */
+       add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+}
+
+/*
+ * Non-interrupt SPC freeze handling.
+ *
+ * This is a work-queue function outside of the triggering interrupt.
+ */
+void handle_freeze(struct work_struct *work)
+{
+       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+                                                               freeze_work);
+       struct hfi1_devdata *dd = ppd->dd;
+
+       /* wait for freeze indicators on all affected blocks */
+       wait_for_freeze_status(dd, 1);
+
+       /* SPC is now frozen */
+
+       /* do send PIO freeze steps */
+       pio_freeze(dd);
+
+       /* do send DMA freeze steps */
+       sdma_freeze(dd);
+
+       /* do send egress freeze steps - nothing to do */
+
+       /* do receive freeze steps */
+       rxe_freeze(dd);
+
+       /*
+        * Unfreeze the hardware - clear the freeze, wait for each
+        * block's frozen bit to clear, then clear the frozen flag.
+        */
+       write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_UNFREEZE_SMASK);
+       wait_for_freeze_status(dd, 0);
+
+       if (is_ax(dd)) {
+               write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK);
+               wait_for_freeze_status(dd, 1);
+               write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_UNFREEZE_SMASK);
+               wait_for_freeze_status(dd, 0);
+       }
+
+       /* do send PIO unfreeze steps for kernel contexts */
+       pio_kernel_unfreeze(dd);
+
+       /* do send DMA unfreeze steps */
+       sdma_unfreeze(dd);
+
+       /* do send egress unfreeze steps - nothing to do */
+
+       /* do receive unfreeze steps for kernel contexts */
+       rxe_kernel_unfreeze(dd);
+
+       /*
+        * The unfreeze procedure touches global device registers when
+        * it disables and re-enables RXE. Mark the device unfrozen
+        * after all that is done so other parts of the driver waiting
+        * for the device to unfreeze don't do things out of order.
+        *
+        * The above implies that the meaning of HFI1_FROZEN flag is
+        * "Device has gone into freeze mode and freeze mode handling
+        * is still in progress."
+        *
+        * The flag will be removed when freeze mode processing has
+        * completed.
+        */
+       dd->flags &= ~HFI1_FROZEN;
+       wake_up(&dd->event_queue);
+
+       /* no longer frozen */
+}
+
+/*
+ * Handle a link up interrupt from the 8051.
+ *
+ * This is a work-queue function outside of the interrupt.
+ */
+void handle_link_up(struct work_struct *work)
+{
+       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+                                                 link_up_work);
+       set_link_state(ppd, HLS_UP_INIT);
+
+       /* cache the read of DC_LCB_STS_ROUND_TRIP_LTP_CNT */
+       read_ltp_rtt(ppd->dd);
+       /*
+        * OPA specifies that certain counters are cleared on a transition
+        * to link up, so do that.
+        */
+       clear_linkup_counters(ppd->dd);
+       /*
+        * And (re)set link up default values.
+        */
+       set_linkup_defaults(ppd);
+
+       /* enforce link speed enabled */
+       if ((ppd->link_speed_active & ppd->link_speed_enabled) == 0) {
+               /* oops - current speed is not enabled, bounce */
+               dd_dev_err(ppd->dd,
+                          "Link speed active 0x%x is outside enabled 0x%x, downing link\n",
+                          ppd->link_speed_active, ppd->link_speed_enabled);
+               set_link_down_reason(ppd, OPA_LINKDOWN_REASON_SPEED_POLICY, 0,
+                                    OPA_LINKDOWN_REASON_SPEED_POLICY);
+               set_link_state(ppd, HLS_DN_OFFLINE);
+               tune_serdes(ppd);
+               start_link(ppd);
+       }
+}
+
+/*
+ * Several pieces of LNI information were cached for SMA in ppd.
+ * Reset these on link down
+ */
+static void reset_neighbor_info(struct hfi1_pportdata *ppd)
+{
+       ppd->neighbor_guid = 0;
+       ppd->neighbor_port_number = 0;
+       ppd->neighbor_type = 0;
+       ppd->neighbor_fm_security = 0;
+}
+
+static const char * const link_down_reason_strs[] = {
+       [OPA_LINKDOWN_REASON_NONE] = "None",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_0] = "Receive error 0",
+       [OPA_LINKDOWN_REASON_BAD_PKT_LEN] = "Bad packet length",
+       [OPA_LINKDOWN_REASON_PKT_TOO_LONG] = "Packet too long",
+       [OPA_LINKDOWN_REASON_PKT_TOO_SHORT] = "Packet too short",
+       [OPA_LINKDOWN_REASON_BAD_SLID] = "Bad SLID",
+       [OPA_LINKDOWN_REASON_BAD_DLID] = "Bad DLID",
+       [OPA_LINKDOWN_REASON_BAD_L2] = "Bad L2",
+       [OPA_LINKDOWN_REASON_BAD_SC] = "Bad SC",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_8] = "Receive error 8",
+       [OPA_LINKDOWN_REASON_BAD_MID_TAIL] = "Bad mid tail",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_10] = "Receive error 10",
+       [OPA_LINKDOWN_REASON_PREEMPT_ERROR] = "Preempt error",
+       [OPA_LINKDOWN_REASON_PREEMPT_VL15] = "Preempt vl15",
+       [OPA_LINKDOWN_REASON_BAD_VL_MARKER] = "Bad VL marker",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_14] = "Receive error 14",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_15] = "Receive error 15",
+       [OPA_LINKDOWN_REASON_BAD_HEAD_DIST] = "Bad head distance",
+       [OPA_LINKDOWN_REASON_BAD_TAIL_DIST] = "Bad tail distance",
+       [OPA_LINKDOWN_REASON_BAD_CTRL_DIST] = "Bad control distance",
+       [OPA_LINKDOWN_REASON_BAD_CREDIT_ACK] = "Bad credit ack",
+       [OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER] = "Unsupported VL marker",
+       [OPA_LINKDOWN_REASON_BAD_PREEMPT] = "Bad preempt",
+       [OPA_LINKDOWN_REASON_BAD_CONTROL_FLIT] = "Bad control flit",
+       [OPA_LINKDOWN_REASON_EXCEED_MULTICAST_LIMIT] = "Exceed multicast limit",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_24] = "Receive error 24",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_25] = "Receive error 25",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_26] = "Receive error 26",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_27] = "Receive error 27",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_28] = "Receive error 28",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_29] = "Receive error 29",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_30] = "Receive error 30",
+       [OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN] =
+                                       "Excessive buffer overrun",
+       [OPA_LINKDOWN_REASON_UNKNOWN] = "Unknown",
+       [OPA_LINKDOWN_REASON_REBOOT] = "Reboot",
+       [OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN] = "Neighbor unknown",
+       [OPA_LINKDOWN_REASON_FM_BOUNCE] = "FM bounce",
+       [OPA_LINKDOWN_REASON_SPEED_POLICY] = "Speed policy",
+       [OPA_LINKDOWN_REASON_WIDTH_POLICY] = "Width policy",
+       [OPA_LINKDOWN_REASON_DISCONNECTED] = "Disconnected",
+       [OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED] =
+                                       "Local media not installed",
+       [OPA_LINKDOWN_REASON_NOT_INSTALLED] = "Not installed",
+       [OPA_LINKDOWN_REASON_CHASSIS_CONFIG] = "Chassis config",
+       [OPA_LINKDOWN_REASON_END_TO_END_NOT_INSTALLED] =
+                                       "End to end not installed",
+       [OPA_LINKDOWN_REASON_POWER_POLICY] = "Power policy",
+       [OPA_LINKDOWN_REASON_LINKSPEED_POLICY] = "Link speed policy",
+       [OPA_LINKDOWN_REASON_LINKWIDTH_POLICY] = "Link width policy",
+       [OPA_LINKDOWN_REASON_SWITCH_MGMT] = "Switch management",
+       [OPA_LINKDOWN_REASON_SMA_DISABLED] = "SMA disabled",
+       [OPA_LINKDOWN_REASON_TRANSIENT] = "Transient"
+};
+
+/* return the neighbor link down reason string */
+static const char *link_down_reason_str(u8 reason)
+{
+       const char *str = NULL;
+
+       if (reason < ARRAY_SIZE(link_down_reason_strs))
+               str = link_down_reason_strs[reason];
+       if (!str)
+               str = "(invalid)";
+
+       return str;
+}
+
+/*
+ * Handle a link down interrupt from the 8051.
+ *
+ * This is a work-queue function outside of the interrupt.
+ */
+void handle_link_down(struct work_struct *work)
+{
+       u8 lcl_reason, neigh_reason = 0;
+       u8 link_down_reason;
+       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+                                                 link_down_work);
+       int was_up;
+       static const char ldr_str[] = "Link down reason: ";
+
+       if ((ppd->host_link_state &
+            (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) &&
+            ppd->port_type == PORT_TYPE_FIXED)
+               ppd->offline_disabled_reason =
+                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NOT_INSTALLED);
+
+       /* Go offline first, then deal with reading/writing through 8051 */
+       was_up = !!(ppd->host_link_state & HLS_UP);
+       set_link_state(ppd, HLS_DN_OFFLINE);
+
+       if (was_up) {
+               lcl_reason = 0;
+               /* link down reason is only valid if the link was up */
+               read_link_down_reason(ppd->dd, &link_down_reason);
+               switch (link_down_reason) {
+               case LDR_LINK_TRANSFER_ACTIVE_LOW:
+                       /* the link went down, no idle message reason */
+                       dd_dev_info(ppd->dd, "%sUnexpected link down\n",
+                                   ldr_str);
+                       break;
+               case LDR_RECEIVED_LINKDOWN_IDLE_MSG:
+                       /*
+                        * The neighbor reason is only valid if an idle message
+                        * was received for it.
+                        */
+                       read_planned_down_reason_code(ppd->dd, &neigh_reason);
+                       dd_dev_info(ppd->dd,
+                                   "%sNeighbor link down message %d, %s\n",
+                                   ldr_str, neigh_reason,
+                                   link_down_reason_str(neigh_reason));
+                       break;
+               case LDR_RECEIVED_HOST_OFFLINE_REQ:
+                       dd_dev_info(ppd->dd,
+                                   "%sHost requested link to go offline\n",
+                                   ldr_str);
+                       break;
+               default:
+                       dd_dev_info(ppd->dd, "%sUnknown reason 0x%x\n",
+                                   ldr_str, link_down_reason);
+                       break;
+               }
+
+               /*
+                * If no reason, assume peer-initiated but missed
+                * LinkGoingDown idle flits.
+                */
+               if (neigh_reason == 0)
+                       lcl_reason = OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN;
+       } else {
+               /* went down while polling or going up */
+               lcl_reason = OPA_LINKDOWN_REASON_TRANSIENT;
+       }
+
+       set_link_down_reason(ppd, lcl_reason, neigh_reason, 0);
+
+       /* inform the SMA when the link transitions from up to down */
+       if (was_up && ppd->local_link_down_reason.sma == 0 &&
+           ppd->neigh_link_down_reason.sma == 0) {
+               ppd->local_link_down_reason.sma =
+                                       ppd->local_link_down_reason.latest;
+               ppd->neigh_link_down_reason.sma =
+                                       ppd->neigh_link_down_reason.latest;
+       }
+
+       reset_neighbor_info(ppd);
+       if (ppd->mgmt_allowed)
+               remove_full_mgmt_pkey(ppd);
+
+       /* disable the port */
+       clear_rcvctrl(ppd->dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+
+       /*
+        * If there is no cable attached, turn the DC off. Otherwise,
+        * start the link bring up.
+        */
+       if (ppd->port_type == PORT_TYPE_QSFP && !qsfp_mod_present(ppd)) {
+               dc_shutdown(ppd->dd);
+       } else {
+               tune_serdes(ppd);
+               start_link(ppd);
+       }
+}
+
+void handle_link_bounce(struct work_struct *work)
+{
+       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+                                                       link_bounce_work);
+
+       /*
+        * Only do something if the link is currently up.
+        */
+       if (ppd->host_link_state & HLS_UP) {
+               set_link_state(ppd, HLS_DN_OFFLINE);
+               tune_serdes(ppd);
+               start_link(ppd);
+       } else {
+               dd_dev_info(ppd->dd, "%s: link not up (%s), nothing to do\n",
+                           __func__, link_state_name(ppd->host_link_state));
+       }
+}
+
+/*
+ * Mask conversion: Capability exchange to Port LTP.  The capability
+ * exchange has an implicit 16b CRC that is mandatory.
+ */
+static int cap_to_port_ltp(int cap)
+{
+       int port_ltp = PORT_LTP_CRC_MODE_16; /* this mode is mandatory */
+
+       if (cap & CAP_CRC_14B)
+               port_ltp |= PORT_LTP_CRC_MODE_14;
+       if (cap & CAP_CRC_48B)
+               port_ltp |= PORT_LTP_CRC_MODE_48;
+       if (cap & CAP_CRC_12B_16B_PER_LANE)
+               port_ltp |= PORT_LTP_CRC_MODE_PER_LANE;
+
+       return port_ltp;
+}
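+
+/*
+ * Worked example: cap_to_port_ltp(CAP_CRC_14B | CAP_CRC_48B) yields
+ * PORT_LTP_CRC_MODE_16 | PORT_LTP_CRC_MODE_14 | PORT_LTP_CRC_MODE_48,
+ * since the mandatory 16b mode is always included.
+ */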
+
+/*
+ * Convert an OPA Port LTP mask to capability mask
+ */
+int port_ltp_to_cap(int port_ltp)
+{
+       int cap_mask = 0;
+
+       if (port_ltp & PORT_LTP_CRC_MODE_14)
+               cap_mask |= CAP_CRC_14B;
+       if (port_ltp & PORT_LTP_CRC_MODE_48)
+               cap_mask |= CAP_CRC_48B;
+       if (port_ltp & PORT_LTP_CRC_MODE_PER_LANE)
+               cap_mask |= CAP_CRC_12B_16B_PER_LANE;
+
+       return cap_mask;
+}
+
+/*
+ * Convert a single DC LCB CRC mode to an OPA Port LTP mask.
+ */
+static int lcb_to_port_ltp(int lcb_crc)
+{
+       int port_ltp = 0;
+
+       if (lcb_crc == LCB_CRC_12B_16B_PER_LANE)
+               port_ltp = PORT_LTP_CRC_MODE_PER_LANE;
+       else if (lcb_crc == LCB_CRC_48B)
+               port_ltp = PORT_LTP_CRC_MODE_48;
+       else if (lcb_crc == LCB_CRC_14B)
+               port_ltp = PORT_LTP_CRC_MODE_14;
+       else
+               port_ltp = PORT_LTP_CRC_MODE_16;
+
+       return port_ltp;
+}
+
+/*
+ * Our neighbor has indicated that we are allowed to act as a fabric
+ * manager, so place the full management partition key in pkey array
+ * index 2 (see OPAv1, section 20.2.2.6.8). Note
+ * that we should already have the limited management partition key in
+ * array element 1, and also that the port is not yet up when
+ * add_full_mgmt_pkey() is invoked.
+ */
+static void add_full_mgmt_pkey(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+
+       /* Sanity check - ppd->pkeys[2] should be 0, or already initialized */
+       if (!((ppd->pkeys[2] == 0) || (ppd->pkeys[2] == FULL_MGMT_P_KEY)))
+               dd_dev_warn(dd, "%s pkey[2] already set to 0x%x, resetting it to 0x%x\n",
+                           __func__, ppd->pkeys[2], FULL_MGMT_P_KEY);
+       ppd->pkeys[2] = FULL_MGMT_P_KEY;
+       (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0);
+}
+
+static void remove_full_mgmt_pkey(struct hfi1_pportdata *ppd)
+{
+       ppd->pkeys[2] = 0;
+       (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0);
+}
+
+/*
+ * Convert the given link width to the OPA link width bitmask.
+ */
+static u16 link_width_to_bits(struct hfi1_devdata *dd, u16 width)
+{
+       switch (width) {
+       case 0:
+               /*
+                * Simulator and quick linkup do not set the width.
+                * Just set it to 4x without complaint.
+                */
+               if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR || quick_linkup)
+                       return OPA_LINK_WIDTH_4X;
+               return 0; /* no lanes up */
+       case 1: return OPA_LINK_WIDTH_1X;
+       case 2: return OPA_LINK_WIDTH_2X;
+       case 3: return OPA_LINK_WIDTH_3X;
+       default:
+               dd_dev_info(dd, "%s: invalid width %d, using 4\n",
+                           __func__, width);
+               /* fall through */
+       case 4: return OPA_LINK_WIDTH_4X;
+       }
+}
+
+/*
+ * Do a population count on the bottom nibble.
+ */
+static const u8 bit_counts[16] = {
+       0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4
+};
+
+static inline u8 nibble_to_count(u8 nibble)
+{
+       return bit_counts[nibble & 0xf];
+}
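+
+/*
+ * Worked example: nibble_to_count(0xb) = bit_counts[0xb] = 3, i.e. three
+ * lanes are enabled in the 0b1011 pattern.
+ */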
+
+/*
+ * Read the active lane information from the 8051 registers and return
+ * their widths.
+ *
+ * Active lane information is found in these 8051 registers:
+ *     enable_lane_tx
+ *     enable_lane_rx
+ */
+static void get_link_widths(struct hfi1_devdata *dd, u16 *tx_width,
+                           u16 *rx_width)
+{
+       u16 tx, rx;
+       u8 enable_lane_rx;
+       u8 enable_lane_tx;
+       u8 tx_polarity_inversion;
+       u8 rx_polarity_inversion;
+       u8 max_rate;
+
+       /* read the active lanes */
+       read_tx_settings(dd, &enable_lane_tx, &tx_polarity_inversion,
+                        &rx_polarity_inversion, &max_rate);
+       read_local_lni(dd, &enable_lane_rx);
+
+       /* convert to counts */
+       tx = nibble_to_count(enable_lane_tx);
+       rx = nibble_to_count(enable_lane_rx);
+
+       /*
+        * Set link_speed_active here, overriding what was set in
+        * handle_verify_cap().  The ASIC 8051 firmware does not correctly
+        * set the max_rate field in handle_verify_cap until v0.19.
+        */
+       if ((dd->icode == ICODE_RTL_SILICON) &&
+           (dd->dc8051_ver < dc8051_ver(0, 19))) {
+               /* max_rate: 0 = 12.5G, 1 = 25G */
+               switch (max_rate) {
+               case 0:
+                       dd->pport[0].link_speed_active = OPA_LINK_SPEED_12_5G;
+                       break;
+               default:
+                       dd_dev_err(dd,
+                                  "%s: unexpected max rate %d, using 25Gb\n",
+                                  __func__, (int)max_rate);
+                       /* fall through */
+               case 1:
+                       dd->pport[0].link_speed_active = OPA_LINK_SPEED_25G;
+                       break;
+               }
+       }
+
+       dd_dev_info(dd,
+                   "Fabric active lanes (width): tx 0x%x (%d), rx 0x%x (%d)\n",
+                   enable_lane_tx, tx, enable_lane_rx, rx);
+       *tx_width = link_width_to_bits(dd, tx);
+       *rx_width = link_width_to_bits(dd, rx);
+}
+
+/*
+ * Read verify_cap_local_fm_link_width[1] to obtain the link widths.
+ * Valid after the end of VerifyCap and during LinkUp.  Does not change
+ * after link up.  I.e. look elsewhere for downgrade information.
+ *
+ * Bits are:
+ *     + bits [7:4] contain the number of active transmitters
+ *     + bits [3:0] contain the number of active receivers
+ * These are numbers 1 through 4 and can be different values if the
+ * link is asymmetric.
+ *
+ * verify_cap_local_fm_link_width[0] retains its original value.
+ */
+static void get_linkup_widths(struct hfi1_devdata *dd, u16 *tx_width,
+                             u16 *rx_width)
+{
+       u16 widths, tx, rx;
+       u8 misc_bits, local_flags;
+       u16 active_tx, active_rx;
+
+       read_vc_local_link_width(dd, &misc_bits, &local_flags, &widths);
+       tx = widths >> 12;
+       rx = (widths >> 8) & 0xf;
+
+       *tx_width = link_width_to_bits(dd, tx);
+       *rx_width = link_width_to_bits(dd, rx);
+
+       /* print the active widths */
+       get_link_widths(dd, &active_tx, &active_rx);
+}
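+
+/*
+ * Worked example: a widths value of 0x4300 decodes as 4 active transmitters
+ * (bits [15:12]) and 3 active receivers (bits [11:8]), which
+ * link_width_to_bits() reports as OPA_LINK_WIDTH_4X and OPA_LINK_WIDTH_3X.
+ */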
+
+/*
+ * Set ppd->link_width_active and ppd->link_width_downgrade_active using
+ * hardware information when the link first comes up.
+ *
+ * The link width is not available until after VerifyCap.AllFramesReceived
+ * (the trigger for handle_verify_cap), so this is outside that routine
+ * and should be called when the 8051 signals linkup.
+ */
+void get_linkup_link_widths(struct hfi1_pportdata *ppd)
+{
+       u16 tx_width, rx_width;
+
+       /* get end-of-LNI link widths */
+       get_linkup_widths(ppd->dd, &tx_width, &rx_width);
+
+       /* use tx_width as the link is supposed to be symmetric on link up */
+       ppd->link_width_active = tx_width;
+       /* link width downgrade active (LWD.A) starts out matching LW.A */
+       ppd->link_width_downgrade_tx_active = ppd->link_width_active;
+       ppd->link_width_downgrade_rx_active = ppd->link_width_active;
+       /* per OPA spec, on link up LWD.E resets to LWD.S */
+       ppd->link_width_downgrade_enabled = ppd->link_width_downgrade_supported;
+       /* cache the active egress rate (units: 10^6 bits/sec) */
+       ppd->current_egress_rate = active_egress_rate(ppd);
+}
+
+/*
+ * Handle a verify capabilities interrupt from the 8051.
+ *
+ * This is a work-queue function outside of the interrupt.
+ */
+void handle_verify_cap(struct work_struct *work)
+{
+       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+                                                               link_vc_work);
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 reg;
+       u8 power_management;
+       u8 continuous;
+       u8 vcu;
+       u8 vau;
+       u8 z;
+       u16 vl15buf;
+       u16 link_widths;
+       u16 crc_mask;
+       u16 crc_val;
+       u16 device_id;
+       u16 active_tx, active_rx;
+       u8 partner_supported_crc;
+       u8 remote_tx_rate;
+       u8 device_rev;
+
+       set_link_state(ppd, HLS_VERIFY_CAP);
+
+       lcb_shutdown(dd, 0);
+       adjust_lcb_for_fpga_serdes(dd);
+
+       /*
+        * These are now valid:
+        *      remote VerifyCap fields in the general LNI config
+        *      CSR DC8051_STS_REMOTE_GUID
+        *      CSR DC8051_STS_REMOTE_NODE_TYPE
+        *      CSR DC8051_STS_REMOTE_FM_SECURITY
+        *      CSR DC8051_STS_REMOTE_PORT_NO
+        */
+
+       read_vc_remote_phy(dd, &power_management, &continuous);
+       read_vc_remote_fabric(dd, &vau, &z, &vcu, &vl15buf,
+                             &partner_supported_crc);
+       read_vc_remote_link_width(dd, &remote_tx_rate, &link_widths);
+       read_remote_device_id(dd, &device_id, &device_rev);
+       /*
+        * And the 'MgmtAllowed' information, which is exchanged during
+        * LNI, is also available at this point.
+        */
+       read_mgmt_allowed(dd, &ppd->mgmt_allowed);
+       /* print the active widths */
+       get_link_widths(dd, &active_tx, &active_rx);
+       dd_dev_info(dd,
+                   "Peer PHY: power management 0x%x, continuous updates 0x%x\n",
+                   (int)power_management, (int)continuous);
+       dd_dev_info(dd,
+                   "Peer Fabric: vAU %d, Z %d, vCU %d, vl15 credits 0x%x, CRC sizes 0x%x\n",
+                   (int)vau, (int)z, (int)vcu, (int)vl15buf,
+                   (int)partner_supported_crc);
+       dd_dev_info(dd, "Peer Link Width: tx rate 0x%x, widths 0x%x\n",
+                   (u32)remote_tx_rate, (u32)link_widths);
+       dd_dev_info(dd, "Peer Device ID: 0x%04x, Revision 0x%02x\n",
+                   (u32)device_id, (u32)device_rev);
+       /*
+        * The peer vAU value just read is the peer receiver value.  HFI does
+        * not support a transmit vAU of 0 (AU == 8).  We advertised that
+        * with Z=1 in the fabric capabilities sent to the peer.  The peer
+        * will see our Z=1, and, if it advertised a vAU of 0, will move its
+        * receive to vAU of 1 (AU == 16).  Do the same here.  We do not care
+        * about the peer Z value - our sent vAU is 3 (hardwired) and is not
+        * subject to the Z value exception.
+        */
+       if (vau == 0)
+               vau = 1;
+       set_up_vl15(dd, vau, vl15buf);
+
+       /* set up the LCB CRC mode */
+       crc_mask = ppd->port_crc_mode_enabled & partner_supported_crc;
+
+       /* order is important: use the lowest bit in common */
+       if (crc_mask & CAP_CRC_14B)
+               crc_val = LCB_CRC_14B;
+       else if (crc_mask & CAP_CRC_48B)
+               crc_val = LCB_CRC_48B;
+       else if (crc_mask & CAP_CRC_12B_16B_PER_LANE)
+               crc_val = LCB_CRC_12B_16B_PER_LANE;
+       else
+               crc_val = LCB_CRC_16B;
+
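+       /*
+        * Example: with local enabled modes 14B|48B and a partner that
+        * supports 48B|per-lane, crc_mask is 48B and crc_val becomes
+        * LCB_CRC_48B; with no overlap, the mandatory 16b mode is used.
+        */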
+       dd_dev_info(dd, "Final LCB CRC mode: %d\n", (int)crc_val);
+       write_csr(dd, DC_LCB_CFG_CRC_MODE,
+                 (u64)crc_val << DC_LCB_CFG_CRC_MODE_TX_VAL_SHIFT);
+
+       /* set (14b only) or clear sideband credit */
+       reg = read_csr(dd, SEND_CM_CTRL);
+       if (crc_val == LCB_CRC_14B && crc_14b_sideband) {
+               write_csr(dd, SEND_CM_CTRL,
+                         reg | SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK);
+       } else {
+               write_csr(dd, SEND_CM_CTRL,
+                         reg & ~SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK);
+       }
+
+       ppd->link_speed_active = 0;     /* invalid value */
+       if (dd->dc8051_ver < dc8051_ver(0, 20)) {
+               /* remote_tx_rate: 0 = 12.5G, 1 = 25G */
+               switch (remote_tx_rate) {
+               case 0:
+                       ppd->link_speed_active = OPA_LINK_SPEED_12_5G;
+                       break;
+               case 1:
+                       ppd->link_speed_active = OPA_LINK_SPEED_25G;
+                       break;
+               }
+       } else {
+               /* actual rate is highest bit of the ANDed rates */
+               u8 rate = remote_tx_rate & ppd->local_tx_rate;
+
+               if (rate & 2)
+                       ppd->link_speed_active = OPA_LINK_SPEED_25G;
+               else if (rate & 1)
+                       ppd->link_speed_active = OPA_LINK_SPEED_12_5G;
+       }
+       if (ppd->link_speed_active == 0) {
+               dd_dev_err(dd, "%s: unexpected remote tx rate %d, using 25Gb\n",
+                          __func__, (int)remote_tx_rate);
+               ppd->link_speed_active = OPA_LINK_SPEED_25G;
+       }
+
+       /*
+        * Cache the values of the supported, enabled, and active
+        * LTP CRC modes to return in 'portinfo' queries. But the bit
+        * flags that are returned in the portinfo query differ from
+        * what's in the link_crc_mask, crc_sizes, and crc_val
+        * variables. Convert these here.
+        */
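+       /*
+        * The packed result is three nibbles: supported LTP CRC modes in
+        * bits [11:8], enabled modes in bits [7:4], and the single active
+        * mode in bits [3:0].
+        */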
+       ppd->port_ltp_crc_mode = cap_to_port_ltp(link_crc_mask) << 8;
+               /* supported crc modes */
+       ppd->port_ltp_crc_mode |=
+               cap_to_port_ltp(ppd->port_crc_mode_enabled) << 4;
+               /* enabled crc modes */
+       ppd->port_ltp_crc_mode |= lcb_to_port_ltp(crc_val);
+               /* active crc mode */
+
+       /* set up the remote credit return table */
+       assign_remote_cm_au_table(dd, vcu);
+
+       /*
+        * The LCB is reset on entry to handle_verify_cap(), so this must
+        * be applied on every link up.
+        *
+        * Adjust LCB error kill enable to kill the link if
+        * these RBUF errors are seen:
+        *      REPLAY_BUF_MBE_SMASK
+        *      FLIT_INPUT_BUF_MBE_SMASK
+        */
+       if (is_ax(dd)) {                        /* fixed in B0 */
+               reg = read_csr(dd, DC_LCB_CFG_LINK_KILL_EN);
+               reg |= DC_LCB_CFG_LINK_KILL_EN_REPLAY_BUF_MBE_SMASK
+                       | DC_LCB_CFG_LINK_KILL_EN_FLIT_INPUT_BUF_MBE_SMASK;
+               write_csr(dd, DC_LCB_CFG_LINK_KILL_EN, reg);
+       }
+
+       /* pull LCB fifos out of reset - all fifo clocks must be stable */
+       write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0);
+
+       /* give 8051 access to the LCB CSRs */
+       write_csr(dd, DC_LCB_ERR_EN, 0); /* mask LCB errors */
+       set_8051_lcb_access(dd);
+
+       ppd->neighbor_guid =
+               read_csr(dd, DC_DC8051_STS_REMOTE_GUID);
+       ppd->neighbor_port_number = read_csr(dd, DC_DC8051_STS_REMOTE_PORT_NO) &
+                                       DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK;
+       ppd->neighbor_type =
+               read_csr(dd, DC_DC8051_STS_REMOTE_NODE_TYPE) &
+               DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK;
+       ppd->neighbor_fm_security =
+               read_csr(dd, DC_DC8051_STS_REMOTE_FM_SECURITY) &
+               DC_DC8051_STS_LOCAL_FM_SECURITY_DISABLED_MASK;
+       dd_dev_info(dd,
+                   "Neighbor Guid: %llx Neighbor type %d MgmtAllowed %d FM security bypass %d\n",
+                   ppd->neighbor_guid, ppd->neighbor_type,
+                   ppd->mgmt_allowed, ppd->neighbor_fm_security);
+       if (ppd->mgmt_allowed)
+               add_full_mgmt_pkey(ppd);
+
+       /* tell the 8051 to go to LinkUp */
+       set_link_state(ppd, HLS_GOING_UP);
+}
+
+/*
+ * Apply the link width downgrade enabled policy against the current active
+ * link widths.
+ *
+ * Called when the enabled policy changes or the active link widths change.
+ */
+void apply_link_downgrade_policy(struct hfi1_pportdata *ppd, int refresh_widths)
+{
+       int do_bounce = 0;
+       int tries;
+       u16 lwde;
+       u16 tx, rx;
+
+       /* use the hls lock to avoid a race with actual link up */
+       tries = 0;
+retry:
+       mutex_lock(&ppd->hls_lock);
+       /* only apply if the link is up */
+       if (ppd->host_link_state & HLS_DOWN) {
+               /* still going up; wait and retry */
+               if (ppd->host_link_state & HLS_GOING_UP) {
+                       if (++tries < 1000) {
+                               mutex_unlock(&ppd->hls_lock);
+                               usleep_range(100, 120); /* arbitrary */
+                               goto retry;
+                       }
+                       dd_dev_err(ppd->dd,
+                                  "%s: giving up waiting for link state change\n",
+                                  __func__);
+               }
+               goto done;
+       }
+
+       lwde = ppd->link_width_downgrade_enabled;
+
+       if (refresh_widths) {
+               get_link_widths(ppd->dd, &tx, &rx);
+               ppd->link_width_downgrade_tx_active = tx;
+               ppd->link_width_downgrade_rx_active = rx;
+       }
+
+       if (ppd->link_width_downgrade_tx_active == 0 ||
+           ppd->link_width_downgrade_rx_active == 0) {
+               /* the 8051 reported a dead link as a downgrade */
+               dd_dev_err(ppd->dd, "Link downgrade is really a link down, ignoring\n");
+       } else if (lwde == 0) {
+               /* downgrade is disabled */
+
+               /* bounce if not at starting active width */
+               if ((ppd->link_width_active !=
+                    ppd->link_width_downgrade_tx_active) ||
+                   (ppd->link_width_active !=
+                    ppd->link_width_downgrade_rx_active)) {
+                       dd_dev_err(ppd->dd,
+                                  "Link downgrade is disabled and link has downgraded, downing link\n");
+                       dd_dev_err(ppd->dd,
+                                  "  original 0x%x, tx active 0x%x, rx active 0x%x\n",
+                                  ppd->link_width_active,
+                                  ppd->link_width_downgrade_tx_active,
+                                  ppd->link_width_downgrade_rx_active);
+                       do_bounce = 1;
+               }
+       } else if ((lwde & ppd->link_width_downgrade_tx_active) == 0 ||
+                  (lwde & ppd->link_width_downgrade_rx_active) == 0) {
+               /* Tx or Rx is outside the enabled policy */
+               dd_dev_err(ppd->dd,
+                          "Link is outside of downgrade allowed, downing link\n");
+               dd_dev_err(ppd->dd,
+                          "  enabled 0x%x, tx active 0x%x, rx active 0x%x\n",
+                          lwde, ppd->link_width_downgrade_tx_active,
+                          ppd->link_width_downgrade_rx_active);
+               do_bounce = 1;
+       }
+
+done:
+       mutex_unlock(&ppd->hls_lock);
+
+       if (do_bounce) {
+               set_link_down_reason(ppd, OPA_LINKDOWN_REASON_WIDTH_POLICY, 0,
+                                    OPA_LINKDOWN_REASON_WIDTH_POLICY);
+               set_link_state(ppd, HLS_DN_OFFLINE);
+               tune_serdes(ppd);
+               start_link(ppd);
+       }
+}
+
+/*
+ * Handle a link downgrade interrupt from the 8051.
+ *
+ * This is a work-queue function outside of the interrupt.
+ */
+void handle_link_downgrade(struct work_struct *work)
+{
+       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+                                                       link_downgrade_work);
+
+       dd_dev_info(ppd->dd, "8051: Link width downgrade\n");
+       apply_link_downgrade_policy(ppd, 1);
+}
+
+static char *dcc_err_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags, dcc_err_flags,
+               ARRAY_SIZE(dcc_err_flags));
+}
+
+static char *lcb_err_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags, lcb_err_flags,
+               ARRAY_SIZE(lcb_err_flags));
+}
+
+static char *dc8051_err_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags, dc8051_err_flags,
+               ARRAY_SIZE(dc8051_err_flags));
+}
+
+static char *dc8051_info_err_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags, dc8051_info_err_flags,
+               ARRAY_SIZE(dc8051_info_err_flags));
+}
+
+static char *dc8051_info_host_msg_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags, dc8051_info_host_msg_flags,
+               ARRAY_SIZE(dc8051_info_host_msg_flags));
+}
+
+static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       struct hfi1_pportdata *ppd = dd->pport;
+       u64 info, err, host_msg;
+       int queue_link_down = 0;
+       char buf[96];
+
+       /* look at the flags */
+       if (reg & DC_DC8051_ERR_FLG_SET_BY_8051_SMASK) {
+               /* 8051 information set by firmware */
+               /* read DC8051_DBG_ERR_INFO_SET_BY_8051 for details */
+               info = read_csr(dd, DC_DC8051_DBG_ERR_INFO_SET_BY_8051);
+               err = (info >> DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_SHIFT)
+                       & DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_MASK;
+               host_msg = (info >>
+                       DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_SHIFT)
+                       & DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_MASK;
+
+               /*
+                * Handle error flags.
+                */
+               if (err & FAILED_LNI) {
+                       /*
+                        * LNI error indications are cleared by the 8051
+                        * only when starting polling.  Only pay attention
+                        * to them when in the states that occur during
+                        * LNI.
+                        */
+                       if (ppd->host_link_state
+                           & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) {
+                               queue_link_down = 1;
+                               dd_dev_info(dd, "Link error: %s\n",
+                                           dc8051_info_err_string(buf,
+                                                                  sizeof(buf),
+                                                                  err &
+                                                                  FAILED_LNI));
+                       }
+                       err &= ~(u64)FAILED_LNI;
+               }
+               /* unknown frames can happen during LNI, just count */
+               if (err & UNKNOWN_FRAME) {
+                       ppd->unknown_frame_count++;
+                       err &= ~(u64)UNKNOWN_FRAME;
+               }
+               if (err) {
+                       /* report remaining errors, but do not do anything */
+                       dd_dev_err(dd, "8051 info error: %s\n",
+                                  dc8051_info_err_string(buf, sizeof(buf),
+                                                         err));
+               }
+
+               /*
+                * Handle host message flags.
+                */
+               if (host_msg & HOST_REQ_DONE) {
+                       /*
+                        * Presently, the driver does a busy wait for
+                        * host requests to complete.  This is only an
+                        * informational message.
+                        * NOTE: The 8051 clears the host message
+                        * information *on the next 8051 command*.
+                        * Therefore, when linkup is achieved,
+                        * this flag will still be set.
+                        */
+                       host_msg &= ~(u64)HOST_REQ_DONE;
+               }
+               if (host_msg & BC_SMA_MSG) {
+                       queue_work(ppd->hfi1_wq, &ppd->sma_message_work);
+                       host_msg &= ~(u64)BC_SMA_MSG;
+               }
+               if (host_msg & LINKUP_ACHIEVED) {
+                       dd_dev_info(dd, "8051: Link up\n");
+                       queue_work(ppd->hfi1_wq, &ppd->link_up_work);
+                       host_msg &= ~(u64)LINKUP_ACHIEVED;
+               }
+               if (host_msg & EXT_DEVICE_CFG_REQ) {
+                       handle_8051_request(ppd);
+                       host_msg &= ~(u64)EXT_DEVICE_CFG_REQ;
+               }
+               if (host_msg & VERIFY_CAP_FRAME) {
+                       queue_work(ppd->hfi1_wq, &ppd->link_vc_work);
+                       host_msg &= ~(u64)VERIFY_CAP_FRAME;
+               }
+               if (host_msg & LINK_GOING_DOWN) {
+                       const char *extra = "";
+                       /* no downgrade action needed if going down */
+                       if (host_msg & LINK_WIDTH_DOWNGRADED) {
+                               host_msg &= ~(u64)LINK_WIDTH_DOWNGRADED;
+                               extra = " (ignoring downgrade)";
+                       }
+                       dd_dev_info(dd, "8051: Link down%s\n", extra);
+                       queue_link_down = 1;
+                       host_msg &= ~(u64)LINK_GOING_DOWN;
+               }
+               if (host_msg & LINK_WIDTH_DOWNGRADED) {
+                       queue_work(ppd->hfi1_wq, &ppd->link_downgrade_work);
+                       host_msg &= ~(u64)LINK_WIDTH_DOWNGRADED;
+               }
+               if (host_msg) {
+                       /* report remaining messages, but do not do anything */
+                       dd_dev_info(dd, "8051 info host message: %s\n",
+                                   dc8051_info_host_msg_string(buf,
+                                                               sizeof(buf),
+                                                               host_msg));
+               }
+
+               reg &= ~DC_DC8051_ERR_FLG_SET_BY_8051_SMASK;
+       }
+       if (reg & DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK) {
+               /*
+                * Lost the 8051 heartbeat.  If this happens, we
+                * receive constant interrupts about it.  Disable
+                * the interrupt after the first.
+                */
+               dd_dev_err(dd, "Lost 8051 heartbeat\n");
+               write_csr(dd, DC_DC8051_ERR_EN,
+                         read_csr(dd, DC_DC8051_ERR_EN) &
+                         ~DC_DC8051_ERR_EN_LOST_8051_HEART_BEAT_SMASK);
+
+               reg &= ~DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK;
+       }
+       if (reg) {
+               /* report the error, but do not do anything */
+               dd_dev_err(dd, "8051 error: %s\n",
+                          dc8051_err_string(buf, sizeof(buf), reg));
+       }
+
+       if (queue_link_down) {
+               /*
+                * if the link is already going down or disabled, do not
+                * queue another
+                */
+               if ((ppd->host_link_state &
+                   (HLS_GOING_OFFLINE | HLS_LINK_COOLDOWN)) ||
+                   ppd->link_enabled == 0) {
+                       dd_dev_info(dd, "%s: not queuing link down\n",
+                                   __func__);
+               } else {
+                       queue_work(ppd->hfi1_wq, &ppd->link_down_work);
+               }
+       }
+}
+
+static const char * const fm_config_txt[] = {
+[0] =
+       "BadHeadDist: Distance violation between two head flits",
+[1] =
+       "BadTailDist: Distance violation between two tail flits",
+[2] =
+       "BadCtrlDist: Distance violation between two credit control flits",
+[3] =
+       "BadCrdAck: Credits return for unsupported VL",
+[4] =
+       "UnsupportedVLMarker: Received VL Marker",
+[5] =
+       "BadPreempt: Exceeded the preemption nesting level",
+[6] =
+       "BadControlFlit: Received unsupported control flit",
+/* no 7 */
+[8] =
+       "UnsupportedVLMarker: Received VL Marker for unconfigured or disabled VL",
+};
+
+static const char * const port_rcv_txt[] = {
+[1] =
+       "BadPktLen: Illegal PktLen",
+[2] =
+       "PktLenTooLong: Packet longer than PktLen",
+[3] =
+       "PktLenTooShort: Packet shorter than PktLen",
+[4] =
+       "BadSLID: Illegal SLID (0, using multicast as SLID, does not include security validation of SLID)",
+[5] =
+       "BadDLID: Illegal DLID (0, doesn't match HFI)",
+[6] =
+       "BadL2: Illegal L2 opcode",
+[7] =
+       "BadSC: Unsupported SC",
+[9] =
+       "BadRC: Illegal RC",
+[11] =
+       "PreemptError: Preempting with same VL",
+[12] =
+       "PreemptVL15: Preempting a VL15 packet",
+};
+
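+/*
+ * Bit offsets of the FMConfig and PortRcv error groups within
+ * ppd->port_error_action, used for the do_bounce checks below.
+ */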
+#define OPA_LDR_FMCONFIG_OFFSET 16
+#define OPA_LDR_PORTRCV_OFFSET 0
+static void handle_dcc_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       u64 info, hdr0, hdr1;
+       const char *extra;
+       char buf[96];
+       struct hfi1_pportdata *ppd = dd->pport;
+       u8 lcl_reason = 0;
+       int do_bounce = 0;
+
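+       /*
+        * Each error check below may set do_bounce and lcl_reason; any
+        * resulting link bounce is queued once at the end of this routine.
+        */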
+       if (reg & DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK) {
+               if (!(dd->err_info_uncorrectable & OPA_EI_STATUS_SMASK)) {
+                       info = read_csr(dd, DCC_ERR_INFO_UNCORRECTABLE);
+                       dd->err_info_uncorrectable = info & OPA_EI_CODE_SMASK;
+                       /* set status bit */
+                       dd->err_info_uncorrectable |= OPA_EI_STATUS_SMASK;
+               }
+               reg &= ~DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK;
+       }
+
+       if (reg & DCC_ERR_FLG_LINK_ERR_SMASK) {
+               struct hfi1_pportdata *ppd = dd->pport;
+               /* this counter saturates at (2^32) - 1 */
+               if (ppd->link_downed < (u32)UINT_MAX)
+                       ppd->link_downed++;
+               reg &= ~DCC_ERR_FLG_LINK_ERR_SMASK;
+       }
+
+       if (reg & DCC_ERR_FLG_FMCONFIG_ERR_SMASK) {
+               u8 reason_valid = 1;
+
+               info = read_csr(dd, DCC_ERR_INFO_FMCONFIG);
+               if (!(dd->err_info_fmconfig & OPA_EI_STATUS_SMASK)) {
+                       dd->err_info_fmconfig = info & OPA_EI_CODE_SMASK;
+                       /* set status bit */
+                       dd->err_info_fmconfig |= OPA_EI_STATUS_SMASK;
+               }
+               switch (info) {
+               case 0:
+               case 1:
+               case 2:
+               case 3:
+               case 4:
+               case 5:
+               case 6:
+                       extra = fm_config_txt[info];
+                       break;
+               case 8:
+                       extra = fm_config_txt[info];
+                       if (ppd->port_error_action &
+                           OPA_PI_MASK_FM_CFG_UNSUPPORTED_VL_MARKER) {
+                               do_bounce = 1;
+                               /*
+                                * lcl_reason cannot be derived from info
+                                * for this error
+                                */
+                               lcl_reason =
+                                 OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER;
+                       }
+                       break;
+               default:
+                       reason_valid = 0;
+                       snprintf(buf, sizeof(buf), "reserved%lld", info);
+                       extra = buf;
+                       break;
+               }
+
+               if (reason_valid && !do_bounce) {
+                       do_bounce = ppd->port_error_action &
+                                       (1 << (OPA_LDR_FMCONFIG_OFFSET + info));
+                       lcl_reason = info + OPA_LINKDOWN_REASON_BAD_HEAD_DIST;
+               }
+
+               /* just report this */
+               dd_dev_info(dd, "DCC Error: fmconfig error: %s\n", extra);
+               reg &= ~DCC_ERR_FLG_FMCONFIG_ERR_SMASK;
+       }
+
+       if (reg & DCC_ERR_FLG_RCVPORT_ERR_SMASK) {
+               u8 reason_valid = 1;
+
+               info = read_csr(dd, DCC_ERR_INFO_PORTRCV);
+               hdr0 = read_csr(dd, DCC_ERR_INFO_PORTRCV_HDR0);
+               hdr1 = read_csr(dd, DCC_ERR_INFO_PORTRCV_HDR1);
+               if (!(dd->err_info_rcvport.status_and_code &
+                     OPA_EI_STATUS_SMASK)) {
+                       dd->err_info_rcvport.status_and_code =
+                               info & OPA_EI_CODE_SMASK;
+                       /* set status bit */
+                       dd->err_info_rcvport.status_and_code |=
+                               OPA_EI_STATUS_SMASK;
+                       /*
+                        * save first 2 flits in the packet that caused
+                        * the error
+                        */
+                       dd->err_info_rcvport.packet_flit1 = hdr0;
+                       dd->err_info_rcvport.packet_flit2 = hdr1;
+               }
+               switch (info) {
+               case 1:
+               case 2:
+               case 3:
+               case 4:
+               case 5:
+               case 6:
+               case 7:
+               case 9:
+               case 11:
+               case 12:
+                       extra = port_rcv_txt[info];
+                       break;
+               default:
+                       reason_valid = 0;
+                       snprintf(buf, sizeof(buf), "reserved%lld", info);
+                       extra = buf;
+                       break;
+               }
+
+               if (reason_valid && !do_bounce) {
+                       do_bounce = ppd->port_error_action &
+                                       (1 << (OPA_LDR_PORTRCV_OFFSET + info));
+                       lcl_reason = info + OPA_LINKDOWN_REASON_RCV_ERROR_0;
+               }
+
+               /* just report this */
+               dd_dev_info(dd, "DCC Error: PortRcv error: %s\n", extra);
+               dd_dev_info(dd, "           hdr0 0x%llx, hdr1 0x%llx\n",
+                           hdr0, hdr1);
+
+               reg &= ~DCC_ERR_FLG_RCVPORT_ERR_SMASK;
+       }
+
+       if (reg & DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK) {
+               /* informative only */
+               dd_dev_info(dd, "8051 access to LCB blocked\n");
+               reg &= ~DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK;
+       }
+       if (reg & DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK) {
+               /* informative only */
+               dd_dev_info(dd, "host access to LCB blocked\n");
+               reg &= ~DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK;
+       }
+
+       /* report any remaining errors */
+       if (reg)
+               dd_dev_info(dd, "DCC Error: %s\n",
+                           dcc_err_string(buf, sizeof(buf), reg));
+
+       if (lcl_reason == 0)
+               lcl_reason = OPA_LINKDOWN_REASON_UNKNOWN;
+
+       if (do_bounce) {
+               dd_dev_info(dd, "%s: PortErrorAction bounce\n", __func__);
+               set_link_down_reason(ppd, lcl_reason, 0, lcl_reason);
+               queue_work(ppd->hfi1_wq, &ppd->link_bounce_work);
+       }
+}
+
+static void handle_lcb_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       char buf[96];
+
+       dd_dev_info(dd, "LCB Error: %s\n",
+                   lcb_err_string(buf, sizeof(buf), reg));
+}
+
+/*
+ * CCE block DC interrupt.  Source is < 8.
+ */
+static void is_dc_int(struct hfi1_devdata *dd, unsigned int source)
+{
+       const struct err_reg_info *eri = &dc_errs[source];
+
+       if (eri->handler) {
+               interrupt_clear_down(dd, 0, eri);
+       } else if (source == 3 /* dc_lbm_int */) {
+               /*
+                * This indicates that a parity error has occurred on the
+                * address/control lines presented to the LBM.  The error
+                * is a single pulse, there is no associated error flag,
+                * and it is non-maskable.  This is because if a parity
+                * error occurs on the request, the request is dropped.
+                * This should never occur, but it is nice to know if it
+                * ever does.
+                */
+               dd_dev_err(dd, "Parity error in DC LBM block\n");
+       } else {
+               dd_dev_err(dd, "Invalid DC interrupt %u\n", source);
+       }
+}
+
+/*
+ * TX block send credit interrupt.  Source is < 160.
+ */
+static void is_send_credit_int(struct hfi1_devdata *dd, unsigned int source)
+{
+       sc_group_release_update(dd, source);
+}
+
+/*
+ * TX block SDMA interrupt.  Source is < 48.
+ *
+ * SDMA interrupts are grouped by type:
+ *
+ *      0 -  N-1 = SDma
+ *      N - 2N-1 = SDmaProgress
+ *     2N - 3N-1 = SDmaIdle
+ */
+static void is_sdma_eng_int(struct hfi1_devdata *dd, unsigned int source)
+{
+       /* what interrupt */
+       unsigned int what  = source / TXE_NUM_SDMA_ENGINES;
+       /* which engine */
+       unsigned int which = source % TXE_NUM_SDMA_ENGINES;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+       dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", which,
+                  slashstrip(__FILE__), __LINE__, __func__);
+       sdma_dumpstate(&dd->per_sdma[which]);
+#endif
+
+       if (likely(what < 3 && which < dd->num_sdma)) {
+               sdma_engine_interrupt(&dd->per_sdma[which], 1ull << source);
+       } else {
+               /* should not happen */
+               dd_dev_err(dd, "Invalid SDMA interrupt 0x%x\n", source);
+       }
+}
+
+/*
+ * RX block receive available interrupt.  Source is < 160.
+ */
+static void is_rcv_avail_int(struct hfi1_devdata *dd, unsigned int source)
+{
+       struct hfi1_ctxtdata *rcd;
+       char *err_detail;
+
+       if (likely(source < dd->num_rcv_contexts)) {
+               rcd = dd->rcd[source];
+               if (rcd) {
+                       if (source < dd->first_user_ctxt)
+                               rcd->do_interrupt(rcd, 0);
+                       else
+                               handle_user_interrupt(rcd);
+                       return; /* OK */
+               }
+               /* received an interrupt, but no rcd */
+               err_detail = "dataless";
+       } else {
+               /* received an interrupt, but we are not using that context */
+               err_detail = "out of range";
+       }
+       dd_dev_err(dd, "unexpected %s receive available context interrupt %u\n",
+                  err_detail, source);
+}
+
+/*
+ * RX block receive urgent interrupt.  Source is < 160.
+ */
+static void is_rcv_urgent_int(struct hfi1_devdata *dd, unsigned int source)
+{
+       struct hfi1_ctxtdata *rcd;
+       char *err_detail;
+
+       if (likely(source < dd->num_rcv_contexts)) {
+               rcd = dd->rcd[source];
+               if (rcd) {
+                       /* only pay attention to user urgent interrupts */
+                       if (source >= dd->first_user_ctxt)
+                               handle_user_interrupt(rcd);
+                       return; /* OK */
+               }
+               /* received an interrupt, but no rcd */
+               err_detail = "dataless";
+       } else {
+               /* received an interrupt, but we are not using that context */
+               err_detail = "out of range";
+       }
+       dd_dev_err(dd, "unexpected %s receive urgent context interrupt %u\n",
+                  err_detail, source);
+}
+
+/*
+ * Reserved range interrupt.  Should not be called in normal operation.
+ */
+static void is_reserved_int(struct hfi1_devdata *dd, unsigned int source)
+{
+       char name[64];
+
+       dd_dev_err(dd, "unexpected %s interrupt\n",
+                  is_reserved_name(name, sizeof(name), source));
+}
+
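+/* ordered by range; is_interrupt() relies on increasing 'end' values */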
+static const struct is_table is_table[] = {
+/*
+ * start                end
+ *                             name func               interrupt func
+ */
+{ IS_GENERAL_ERR_START,  IS_GENERAL_ERR_END,
+                               is_misc_err_name,       is_misc_err_int },
+{ IS_SDMAENG_ERR_START,  IS_SDMAENG_ERR_END,
+                               is_sdma_eng_err_name,   is_sdma_eng_err_int },
+{ IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END,
+                               is_sendctxt_err_name,   is_sendctxt_err_int },
+{ IS_SDMA_START,            IS_SDMA_END,
+                               is_sdma_eng_name,       is_sdma_eng_int },
+{ IS_VARIOUS_START,         IS_VARIOUS_END,
+                               is_various_name,        is_various_int },
+{ IS_DC_START,      IS_DC_END,
+                               is_dc_name,             is_dc_int },
+{ IS_RCVAVAIL_START,     IS_RCVAVAIL_END,
+                               is_rcv_avail_name,      is_rcv_avail_int },
+{ IS_RCVURGENT_START,    IS_RCVURGENT_END,
+                               is_rcv_urgent_name,     is_rcv_urgent_int },
+{ IS_SENDCREDIT_START,   IS_SENDCREDIT_END,
+                               is_send_credit_name,    is_send_credit_int},
+{ IS_RESERVED_START,     IS_RESERVED_END,
+                               is_reserved_name,       is_reserved_int},
+};
+
+/*
+ * Interrupt source interrupt - called when the given source has an interrupt.
+ * Source is a bit index into an array of 64-bit integers.
+ */
+static void is_interrupt(struct hfi1_devdata *dd, unsigned int source)
+{
+       const struct is_table *entry;
+
+       /* avoids a double compare by walking the table in-order */
+       for (entry = &is_table[0]; entry->is_name; entry++) {
+               if (source < entry->end) {
+                       trace_hfi1_interrupt(dd, entry, source);
+                       entry->is_int(dd, source - entry->start);
+                       return;
+               }
+       }
+       /* fell off the end */
+       dd_dev_err(dd, "invalid interrupt source %u\n", source);
+}
+
+/*
+ * General interrupt handler.  This is able to correctly handle
+ * all interrupts in case INTx is used.
+ */
+static irqreturn_t general_interrupt(int irq, void *data)
+{
+       struct hfi1_devdata *dd = data;
+       u64 regs[CCE_NUM_INT_CSRS];
+       u32 bit;
+       int i;
+
+       this_cpu_inc(*dd->int_counter);
+
+       /* phase 1: scan and clear all handled interrupts */
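+       /*
+        * gi_mask[] is assumed to contain only the sources owned by this
+        * general handler; sources routed to dedicated MSI-X handlers are
+        * presumably masked out of it during interrupt setup.
+        */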
+       for (i = 0; i < CCE_NUM_INT_CSRS; i++) {
+               if (dd->gi_mask[i] == 0) {
+                       regs[i] = 0;    /* used later */
+                       continue;
+               }
+               regs[i] = read_csr(dd, CCE_INT_STATUS + (8 * i)) &
+                               dd->gi_mask[i];
+               /* only clear if anything is set */
+               if (regs[i])
+                       write_csr(dd, CCE_INT_CLEAR + (8 * i), regs[i]);
+       }
+
+       /* phase 2: call the appropriate handler */
+       for_each_set_bit(bit, (unsigned long *)&regs[0],
+                        CCE_NUM_INT_CSRS * 64) {
+               is_interrupt(dd, bit);
+       }
+
+       return IRQ_HANDLED;
+}
+
+static irqreturn_t sdma_interrupt(int irq, void *data)
+{
+       struct sdma_engine *sde = data;
+       struct hfi1_devdata *dd = sde->dd;
+       u64 status;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+       dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
+                  slashstrip(__FILE__), __LINE__, __func__);
+       sdma_dumpstate(sde);
+#endif
+
+       this_cpu_inc(*dd->int_counter);
+
+       /* This read_csr is really bad in the hot path */
+       status = read_csr(dd,
+                         CCE_INT_STATUS + (8 * (IS_SDMA_START / 64)))
+                         & sde->imask;
+       if (likely(status)) {
+               /* clear the interrupt(s) */
+               write_csr(dd,
+                         CCE_INT_CLEAR + (8 * (IS_SDMA_START / 64)),
+                         status);
+
+               /* handle the interrupt(s) */
+               sdma_engine_interrupt(sde, status);
+       } else {
+               dd_dev_err(dd, "SDMA engine %u interrupt, but no status bits set\n",
+                          sde->this_idx);
+       }
+
+       return IRQ_HANDLED;
+}
+
+/*
+ * Clear the receive interrupt.  Use a read of the interrupt clear CSR
+ * to ensure that the write completed.  This does NOT guarantee that
+ * queued DMA writes to memory from the chip are pushed.
+ */
+static inline void clear_recv_intr(struct hfi1_ctxtdata *rcd)
+{
+       struct hfi1_devdata *dd = rcd->dd;
+       u32 addr = CCE_INT_CLEAR + (8 * rcd->ireg);
+
+       mmiowb();       /* make sure everything before is written */
+       write_csr(dd, addr, rcd->imask);
+       /* force the above write on the chip and get a value back */
+       (void)read_csr(dd, addr);
+}
+
+/* force the receive interrupt */
+void force_recv_intr(struct hfi1_ctxtdata *rcd)
+{
+       write_csr(rcd->dd, CCE_INT_FORCE + (8 * rcd->ireg), rcd->imask);
+}
+
+/*
+ * Return non-zero if a packet is present.
+ *
+ * This routine is called when rechecking for packets after the RcvAvail
+ * interrupt has been cleared down.  First, do a quick check of memory for
+ * a packet present.  If not found, use an expensive CSR read of the context
+ * tail to determine the actual tail.  The CSR read is necessary because there
+ * is no method to push pending DMAs to memory other than an interrupt and we
+ * are trying to determine if we need to force an interrupt.
+ */
+static inline int check_packet_present(struct hfi1_ctxtdata *rcd)
+{
+       u32 tail;
+       int present;
+
+       if (!HFI1_CAP_IS_KSET(DMA_RTAIL))
+               present = (rcd->seq_cnt ==
+                               rhf_rcv_seq(rhf_to_cpu(get_rhf_addr(rcd))));
+       else /* is RDMA rtail */
+               present = (rcd->head != get_rcvhdrtail(rcd));
+
+       if (present)
+               return 1;
+
+       /* fall back to a CSR read, correct independent of DMA_RTAIL */
+       tail = (u32)read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_TAIL);
+       return rcd->head != tail;
+}
+
+/*
+ * Receive packet IRQ handler.  This routine expects to be on its own IRQ.
+ * This routine will try to handle packets immediately (latency), but if
+ * it finds too many, it will invoke the thread handler (bandwidth).  The
+ * chip receive interrupt is *not* cleared down until this or the thread (if
+ * invoked) is finished.  The intent is to avoid extra interrupts while we
+ * are processing packets anyway.
+ */
+static irqreturn_t receive_context_interrupt(int irq, void *data)
+{
+       struct hfi1_ctxtdata *rcd = data;
+       struct hfi1_devdata *dd = rcd->dd;
+       int disposition;
+       int present;
+
+       trace_hfi1_receive_interrupt(dd, rcd->ctxt);
+       this_cpu_inc(*dd->int_counter);
+       aspm_ctx_disable(rcd);
+
+       /* receive interrupt remains blocked while processing packets */
+       disposition = rcd->do_interrupt(rcd, 0);
+
+       /*
+        * Too many packets were seen while processing packets in this
+        * IRQ handler.  Invoke the handler thread.  The receive interrupt
+        * remains blocked.
+        */
+       if (disposition == RCV_PKT_LIMIT)
+               return IRQ_WAKE_THREAD;
+
+       /*
+        * The packet processor detected no more packets.  Clear the receive
+        * interrupt and recheck for a packet that may have arrived
+        * after the previous check and interrupt clear.  If a packet arrived,
+        * force another interrupt.
+        */
+       clear_recv_intr(rcd);
+       present = check_packet_present(rcd);
+       if (present)
+               force_recv_intr(rcd);
+
+       return IRQ_HANDLED;
+}
+
+/*
+ * Receive packet thread handler.  This expects to be invoked with the
+ * receive interrupt still blocked.
+ */
+static irqreturn_t receive_context_thread(int irq, void *data)
+{
+       struct hfi1_ctxtdata *rcd = data;
+       int present;
+
+       /* receive interrupt is still blocked from the IRQ handler */
+       (void)rcd->do_interrupt(rcd, 1);
+
+       /*
+        * The packet processor will only return if it detected no more
+        * packets.  Hold IRQs here so we can safely clear the interrupt and
+        * recheck for a packet that may have arrived after the previous
+        * check and the interrupt clear.  If a packet arrived, force another
+        * interrupt.
+        */
+       local_irq_disable();
+       clear_recv_intr(rcd);
+       present = check_packet_present(rcd);
+       if (present)
+               force_recv_intr(rcd);
+       local_irq_enable();
+
+       return IRQ_HANDLED;
+}
+
+/* ========================================================================= */
+
+u32 read_physical_state(struct hfi1_devdata *dd)
+{
+       u64 reg;
+
+       reg = read_csr(dd, DC_DC8051_STS_CUR_STATE);
+       return (reg >> DC_DC8051_STS_CUR_STATE_PORT_SHIFT)
+                               & DC_DC8051_STS_CUR_STATE_PORT_MASK;
+}
+
+u32 read_logical_state(struct hfi1_devdata *dd)
+{
+       u64 reg;
+
+       reg = read_csr(dd, DCC_CFG_PORT_CONFIG);
+       return (reg >> DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT)
+                               & DCC_CFG_PORT_CONFIG_LINK_STATE_MASK;
+}
+
+static void set_logical_state(struct hfi1_devdata *dd, u32 chip_lstate)
+{
+       u64 reg;
+
+       reg = read_csr(dd, DCC_CFG_PORT_CONFIG);
+       /* clear current state, set new state */
+       reg &= ~DCC_CFG_PORT_CONFIG_LINK_STATE_SMASK;
+       reg |= (u64)chip_lstate << DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT;
+       write_csr(dd, DCC_CFG_PORT_CONFIG, reg);
+}
+
+/*
+ * Use the 8051 to read an LCB CSR.
+ */
+static int read_lcb_via_8051(struct hfi1_devdata *dd, u32 addr, u64 *data)
+{
+       u32 regno;
+       int ret;
+
+       if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
+               if (acquire_lcb_access(dd, 0) == 0) {
+                       *data = read_csr(dd, addr);
+                       release_lcb_access(dd, 0);
+                       return 0;
+               }
+               return -EBUSY;
+       }
+
+       /* register is an index of LCB registers: (offset - base) / 8 */
+       regno = (addr - DC_LCB_CFG_RUN) >> 3;
+       ret = do_8051_command(dd, HCMD_READ_LCB_CSR, regno, data);
+       if (ret != HCMD_SUCCESS)
+               return -EBUSY;
+       return 0;
+}
+
+/*
+ * Read an LCB CSR.  Access may not be in host control, so check.
+ * Return 0 on success, -EBUSY on failure.
+ */
+int read_lcb_csr(struct hfi1_devdata *dd, u32 addr, u64 *data)
+{
+       struct hfi1_pportdata *ppd = dd->pport;
+
+       /* if up, go through the 8051 for the value */
+       if (ppd->host_link_state & HLS_UP)
+               return read_lcb_via_8051(dd, addr, data);
+       /* if going up or down, no access */
+       if (ppd->host_link_state & (HLS_GOING_UP | HLS_GOING_OFFLINE))
+               return -EBUSY;
+       /* otherwise, host has access */
+       *data = read_csr(dd, addr);
+       return 0;
+}
+
+/*
+ * Use the 8051 to write an LCB CSR.
+ */
+static int write_lcb_via_8051(struct hfi1_devdata *dd, u32 addr, u64 data)
+{
+       u32 regno;
+       int ret;
+
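+       /*
+        * Assumption: firmware older than 0.20 does not implement the
+        * WRITE_LCB_CSR host command, so fall back to direct LCB access
+        * (as is also done for the functional simulator).
+        */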
+       if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR ||
+           (dd->dc8051_ver < dc8051_ver(0, 20))) {
+               if (acquire_lcb_access(dd, 0) == 0) {
+                       write_csr(dd, addr, data);
+                       release_lcb_access(dd, 0);
+                       return 0;
+               }
+               return -EBUSY;
+       }
+
+       /* register is an index of LCB registers: (offset - base) / 8 */
+       regno = (addr - DC_LCB_CFG_RUN) >> 3;
+       ret = do_8051_command(dd, HCMD_WRITE_LCB_CSR, regno, &data);
+       if (ret != HCMD_SUCCESS)
+               return -EBUSY;
+       return 0;
+}
+
+/*
+ * Write an LCB CSR.  Access may not be in host control, so check.
+ * Return 0 on success, -EBUSY on failure.
+ */
+int write_lcb_csr(struct hfi1_devdata *dd, u32 addr, u64 data)
+{
+       struct hfi1_pportdata *ppd = dd->pport;
+
+       /* if up, go through the 8051 for the value */
+       if (ppd->host_link_state & HLS_UP)
+               return write_lcb_via_8051(dd, addr, data);
+       /* if going up or down, no access */
+       if (ppd->host_link_state & (HLS_GOING_UP | HLS_GOING_OFFLINE))
+               return -EBUSY;
+       /* otherwise, host has access */
+       write_csr(dd, addr, data);
+       return 0;
+}
+
+/*
+ * Returns:
+ *     < 0 = Linux error, not able to get access
+ *     > 0 = 8051 command RETURN_CODE
+ */
+static int do_8051_command(
+       struct hfi1_devdata *dd,
+       u32 type,
+       u64 in_data,
+       u64 *out_data)
+{
+       u64 reg, completed;
+       int return_code;
+       unsigned long flags;
+       unsigned long timeout;
+
+       hfi1_cdbg(DC8051, "type %d, data 0x%012llx", type, in_data);
+
+       /*
+        * Alternative to holding the lock for a long time:
+        * - keep busy wait - have other users bounce off
+        */
+       spin_lock_irqsave(&dd->dc8051_lock, flags);
+
+       /* We can't send any commands to the 8051 if it's in reset */
+       if (dd->dc_shutdown) {
+               return_code = -ENODEV;
+               goto fail;
+       }
+
+       /*
+        * If an 8051 host command timed out previously, then the 8051 is
+        * stuck.
+        *
+        * On first timeout, attempt to reset and restart the entire DC
+        * block (including 8051). (Is this too big of a hammer?)
+        *
+        * If the 8051 times out a second time, the reset did not bring it
+        * back to healthy life. In that case, fail any subsequent commands.
+        */
+       if (dd->dc8051_timed_out) {
+               if (dd->dc8051_timed_out > 1) {
+                       dd_dev_err(dd,
+                                  "Previous 8051 host command timed out, skipping command %u\n",
+                                  type);
+                       return_code = -ENXIO;
+                       goto fail;
+               }
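+               /*
+                * Drop the command lock around the DC restart; dc_shutdown()
+                * and dc_start() may sleep and/or take this lock themselves.
+                */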
+               spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+               dc_shutdown(dd);
+               dc_start(dd);
+               spin_lock_irqsave(&dd->dc8051_lock, flags);
+       }
+
+       /*
+        * If there is no timeout, then the 8051 command interface is
+        * waiting for a command.
+        */
+
+       /*
+        * When writing an LCB CSR, out_data contains the full value to
+        * be written, while in_data contains the relative LCB
+        * address in 7:0.  Do the work here, rather than in the caller,
+        * of distributing the write data to where it needs to go:
+        *
+        * Write data
+        *   39:00 -> in_data[47:8]
+        *   47:40 -> DC8051_CFG_EXT_DEV_0.RETURN_CODE
+        *   63:48 -> DC8051_CFG_EXT_DEV_0.RSP_DATA
+        */
+       if (type == HCMD_WRITE_LCB_CSR) {
+               in_data |= ((*out_data) & 0xffffffffffull) << 8;
+               reg = ((((*out_data) >> 40) & 0xff) <<
+                               DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT)
+                     | ((((*out_data) >> 48) & 0xffff) <<
+                               DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT);
+               write_csr(dd, DC_DC8051_CFG_EXT_DEV_0, reg);
+       }
+
+       /*
+        * Do two writes: the first to stabilize the type and req_data, the
+        * second to activate.
+        */
+       reg = ((u64)type & DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_MASK)
+                       << DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_SHIFT
+               | (in_data & DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_MASK)
+                       << DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_SHIFT;
+       write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, reg);
+       reg |= DC_DC8051_CFG_HOST_CMD_0_REQ_NEW_SMASK;
+       write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, reg);
+
+       /* wait for completion, alternate: interrupt */
+       timeout = jiffies + msecs_to_jiffies(DC8051_COMMAND_TIMEOUT);
+       while (1) {
+               reg = read_csr(dd, DC_DC8051_CFG_HOST_CMD_1);
+               completed = reg & DC_DC8051_CFG_HOST_CMD_1_COMPLETED_SMASK;
+               if (completed)
+                       break;
+               if (time_after(jiffies, timeout)) {
+                       dd->dc8051_timed_out++;
+                       dd_dev_err(dd, "8051 host command %u timeout\n", type);
+                       if (out_data)
+                               *out_data = 0;
+                       return_code = -ETIMEDOUT;
+                       goto fail;
+               }
+               udelay(2);
+       }
+
+       if (out_data) {
+               *out_data = (reg >> DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_SHIFT)
+                               & DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_MASK;
+               if (type == HCMD_READ_LCB_CSR) {
+                       /* top 16 bits are in a different register */
+                       *out_data |= (read_csr(dd, DC_DC8051_CFG_EXT_DEV_1)
+                               & DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SMASK)
+                               << (48
+                                   - DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT);
+               }
+       }
+       return_code = (reg >> DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_SHIFT)
+                               & DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_MASK;
+       dd->dc8051_timed_out = 0;
+       /*
+        * Clear command for next user.
+        */
+       write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, 0);
+
+fail:
+       spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+
+       return return_code;
+}
+
+static int set_physical_link_state(struct hfi1_devdata *dd, u64 state)
+{
+       return do_8051_command(dd, HCMD_CHANGE_PHY_STATE, state, NULL);
+}
+
+int load_8051_config(struct hfi1_devdata *dd, u8 field_id,
+                    u8 lane_id, u32 config_data)
+{
+       u64 data;
+       int ret;
+
+       data = (u64)field_id << LOAD_DATA_FIELD_ID_SHIFT
+               | (u64)lane_id << LOAD_DATA_LANE_ID_SHIFT
+               | (u64)config_data << LOAD_DATA_DATA_SHIFT;
+       ret = do_8051_command(dd, HCMD_LOAD_CONFIG_DATA, data, NULL);
+       if (ret != HCMD_SUCCESS) {
+               dd_dev_err(dd,
+                          "load 8051 config: field id %d, lane %d, err %d\n",
+                          (int)field_id, (int)lane_id, ret);
+       }
+       return ret;
+}
+
+/*
+ * Read the 8051 firmware "registers".  Use the RAM directly.  Always
+ * set the result, even on error.
+ * Return 0 on success, -errno on failure
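+ * The register RAM is assumed to be laid out as NUM_GENERAL_FIELDS general
+ * 32-bit fields followed by NUM_LANE_FIELDS fields per lane; a lane_id
+ * outside 0-3 (e.g. GENERAL_CONFIG) selects the general area at offset 0.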
+ */
+int read_8051_config(struct hfi1_devdata *dd, u8 field_id, u8 lane_id,
+                    u32 *result)
+{
+       u64 big_data;
+       u32 addr;
+       int ret;
+
+       /* address start depends on the lane_id */
+       if (lane_id < 4)
+               addr = (4 * NUM_GENERAL_FIELDS)
+                       + (lane_id * 4 * NUM_LANE_FIELDS);
+       else
+               addr = 0;
+       addr += field_id * 4;
+
+       /* read is in 8-byte chunks, hardware will truncate the address down */
+       ret = read_8051_data(dd, addr, 8, &big_data);
+
+       if (ret == 0) {
+               /* extract the 4 bytes we want */
+               if (addr & 0x4)
+                       *result = (u32)(big_data >> 32);
+               else
+                       *result = (u32)big_data;
+       } else {
+               *result = 0;
+               dd_dev_err(dd, "%s: direct read failed, lane %d, field %d!\n",
+                          __func__, lane_id, field_id);
+       }
+
+       return ret;
+}
+
+static int write_vc_local_phy(struct hfi1_devdata *dd, u8 power_management,
+                             u8 continuous)
+{
+       u32 frame;
+
+       frame = continuous << CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT
+               | power_management << POWER_MANAGEMENT_SHIFT;
+       return load_8051_config(dd, VERIFY_CAP_LOCAL_PHY,
+                               GENERAL_CONFIG, frame);
+}
+
+static int write_vc_local_fabric(struct hfi1_devdata *dd, u8 vau, u8 z, u8 vcu,
+                                u16 vl15buf, u8 crc_sizes)
+{
+       u32 frame;
+
+       frame = (u32)vau << VAU_SHIFT
+               | (u32)z << Z_SHIFT
+               | (u32)vcu << VCU_SHIFT
+               | (u32)vl15buf << VL15BUF_SHIFT
+               | (u32)crc_sizes << CRC_SIZES_SHIFT;
+       return load_8051_config(dd, VERIFY_CAP_LOCAL_FABRIC,
+                               GENERAL_CONFIG, frame);
+}
+
+static void read_vc_local_link_width(struct hfi1_devdata *dd, u8 *misc_bits,
+                                    u8 *flag_bits, u16 *link_widths)
+{
+       u32 frame;
+
+       read_8051_config(dd, VERIFY_CAP_LOCAL_LINK_WIDTH, GENERAL_CONFIG,
+                        &frame);
+       *misc_bits = (frame >> MISC_CONFIG_BITS_SHIFT) & MISC_CONFIG_BITS_MASK;
+       *flag_bits = (frame >> LOCAL_FLAG_BITS_SHIFT) & LOCAL_FLAG_BITS_MASK;
+       *link_widths = (frame >> LINK_WIDTH_SHIFT) & LINK_WIDTH_MASK;
+}
+
+static int write_vc_local_link_width(struct hfi1_devdata *dd,
+                                    u8 misc_bits,
+                                    u8 flag_bits,
+                                    u16 link_widths)
+{
+       u32 frame;
+
+       frame = (u32)misc_bits << MISC_CONFIG_BITS_SHIFT
+               | (u32)flag_bits << LOCAL_FLAG_BITS_SHIFT
+               | (u32)link_widths << LINK_WIDTH_SHIFT;
+       return load_8051_config(dd, VERIFY_CAP_LOCAL_LINK_WIDTH, GENERAL_CONFIG,
+                    frame);
+}
+
+static int write_local_device_id(struct hfi1_devdata *dd, u16 device_id,
+                                u8 device_rev)
+{
+       u32 frame;
+
+       frame = ((u32)device_id << LOCAL_DEVICE_ID_SHIFT)
+               | ((u32)device_rev << LOCAL_DEVICE_REV_SHIFT);
+       return load_8051_config(dd, LOCAL_DEVICE_ID, GENERAL_CONFIG, frame);
+}
+
+static void read_remote_device_id(struct hfi1_devdata *dd, u16 *device_id,
+                                 u8 *device_rev)
+{
+       u32 frame;
+
+       read_8051_config(dd, REMOTE_DEVICE_ID, GENERAL_CONFIG, &frame);
+       *device_id = (frame >> REMOTE_DEVICE_ID_SHIFT) & REMOTE_DEVICE_ID_MASK;
+       *device_rev = (frame >> REMOTE_DEVICE_REV_SHIFT)
+                       & REMOTE_DEVICE_REV_MASK;
+}
+
+void read_misc_status(struct hfi1_devdata *dd, u8 *ver_a, u8 *ver_b)
+{
+       u32 frame;
+
+       read_8051_config(dd, MISC_STATUS, GENERAL_CONFIG, &frame);
+       *ver_a = (frame >> STS_FM_VERSION_A_SHIFT) & STS_FM_VERSION_A_MASK;
+       *ver_b = (frame >> STS_FM_VERSION_B_SHIFT) & STS_FM_VERSION_B_MASK;
+}
+
+static void read_vc_remote_phy(struct hfi1_devdata *dd, u8 *power_management,
+                              u8 *continuous)
+{
+       u32 frame;
+
+       read_8051_config(dd, VERIFY_CAP_REMOTE_PHY, GENERAL_CONFIG, &frame);
+       *power_management = (frame >> POWER_MANAGEMENT_SHIFT)
+                                       & POWER_MANAGEMENT_MASK;
+       *continuous = (frame >> CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT)
+                                       & CONTINIOUS_REMOTE_UPDATE_SUPPORT_MASK;
+}
+
+static void read_vc_remote_fabric(struct hfi1_devdata *dd, u8 *vau, u8 *z,
+                                 u8 *vcu, u16 *vl15buf, u8 *crc_sizes)
+{
+       u32 frame;
+
+       read_8051_config(dd, VERIFY_CAP_REMOTE_FABRIC, GENERAL_CONFIG, &frame);
+       *vau = (frame >> VAU_SHIFT) & VAU_MASK;
+       *z = (frame >> Z_SHIFT) & Z_MASK;
+       *vcu = (frame >> VCU_SHIFT) & VCU_MASK;
+       *vl15buf = (frame >> VL15BUF_SHIFT) & VL15BUF_MASK;
+       *crc_sizes = (frame >> CRC_SIZES_SHIFT) & CRC_SIZES_MASK;
+}
+
+static void read_vc_remote_link_width(struct hfi1_devdata *dd,
+                                     u8 *remote_tx_rate,
+                                     u16 *link_widths)
+{
+       u32 frame;
+
+       read_8051_config(dd, VERIFY_CAP_REMOTE_LINK_WIDTH, GENERAL_CONFIG,
+                        &frame);
+       *remote_tx_rate = (frame >> REMOTE_TX_RATE_SHIFT)
+                               & REMOTE_TX_RATE_MASK;
+       *link_widths = (frame >> LINK_WIDTH_SHIFT) & LINK_WIDTH_MASK;
+}
+
+static void read_local_lni(struct hfi1_devdata *dd, u8 *enable_lane_rx)
+{
+       u32 frame;
+
+       read_8051_config(dd, LOCAL_LNI_INFO, GENERAL_CONFIG, &frame);
+       *enable_lane_rx = (frame >> ENABLE_LANE_RX_SHIFT) & ENABLE_LANE_RX_MASK;
+}
+
+static void read_mgmt_allowed(struct hfi1_devdata *dd, u8 *mgmt_allowed)
+{
+       u32 frame;
+
+       read_8051_config(dd, REMOTE_LNI_INFO, GENERAL_CONFIG, &frame);
+       *mgmt_allowed = (frame >> MGMT_ALLOWED_SHIFT) & MGMT_ALLOWED_MASK;
+}
+
+static void read_last_local_state(struct hfi1_devdata *dd, u32 *lls)
+{
+       read_8051_config(dd, LAST_LOCAL_STATE_COMPLETE, GENERAL_CONFIG, lls);
+}
+
+static void read_last_remote_state(struct hfi1_devdata *dd, u32 *lrs)
+{
+       read_8051_config(dd, LAST_REMOTE_STATE_COMPLETE, GENERAL_CONFIG, lrs);
+}
+
+void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality)
+{
+       u32 frame;
+       int ret;
+
+       *link_quality = 0;
+       if (dd->pport->host_link_state & HLS_UP) {
+               ret = read_8051_config(dd, LINK_QUALITY_INFO, GENERAL_CONFIG,
+                                      &frame);
+               if (ret == 0)
+                       *link_quality = (frame >> LINK_QUALITY_SHIFT)
+                                               & LINK_QUALITY_MASK;
+       }
+}
+
+static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc)
+{
+       u32 frame;
+
+       read_8051_config(dd, LINK_QUALITY_INFO, GENERAL_CONFIG, &frame);
+       *pdrrc = (frame >> DOWN_REMOTE_REASON_SHIFT) & DOWN_REMOTE_REASON_MASK;
+}
+
+static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr)
+{
+       u32 frame;
+
+       read_8051_config(dd, LINK_DOWN_REASON, GENERAL_CONFIG, &frame);
+       *ldr = (frame & 0xff);
+}
+
+static int read_tx_settings(struct hfi1_devdata *dd,
+                           u8 *enable_lane_tx,
+                           u8 *tx_polarity_inversion,
+                           u8 *rx_polarity_inversion,
+                           u8 *max_rate)
+{
+       u32 frame;
+       int ret;
+
+       ret = read_8051_config(dd, TX_SETTINGS, GENERAL_CONFIG, &frame);
+       *enable_lane_tx = (frame >> ENABLE_LANE_TX_SHIFT)
+                               & ENABLE_LANE_TX_MASK;
+       *tx_polarity_inversion = (frame >> TX_POLARITY_INVERSION_SHIFT)
+                               & TX_POLARITY_INVERSION_MASK;
+       *rx_polarity_inversion = (frame >> RX_POLARITY_INVERSION_SHIFT)
+                               & RX_POLARITY_INVERSION_MASK;
+       *max_rate = (frame >> MAX_RATE_SHIFT) & MAX_RATE_MASK;
+       return ret;
+}
+
+static int write_tx_settings(struct hfi1_devdata *dd,
+                            u8 enable_lane_tx,
+                            u8 tx_polarity_inversion,
+                            u8 rx_polarity_inversion,
+                            u8 max_rate)
+{
+       u32 frame;
+
+       /* no need to mask, all variable sizes match field widths */
+       frame = enable_lane_tx << ENABLE_LANE_TX_SHIFT
+               | tx_polarity_inversion << TX_POLARITY_INVERSION_SHIFT
+               | rx_polarity_inversion << RX_POLARITY_INVERSION_SHIFT
+               | max_rate << MAX_RATE_SHIFT;
+       return load_8051_config(dd, TX_SETTINGS, GENERAL_CONFIG, frame);
+}
+
+static void check_fabric_firmware_versions(struct hfi1_devdata *dd)
+{
+       u32 frame, version, prod_id;
+       int ret, lane;
+
+       /* 4 lanes */
+       for (lane = 0; lane < 4; lane++) {
+               ret = read_8051_config(dd, SPICO_FW_VERSION, lane, &frame);
+               if (ret) {
+                       dd_dev_err(dd,
+                                  "Unable to read lane %d firmware details\n",
+                                  lane);
+                       continue;
+               }
+               version = (frame >> SPICO_ROM_VERSION_SHIFT)
+                                       & SPICO_ROM_VERSION_MASK;
+               prod_id = (frame >> SPICO_ROM_PROD_ID_SHIFT)
+                                       & SPICO_ROM_PROD_ID_MASK;
+               dd_dev_info(dd,
+                           "Lane %d firmware: version 0x%04x, prod_id 0x%04x\n",
+                           lane, version, prod_id);
+       }
+}
+
+/*
+ * Read an idle LCB message.
+ *
+ * Returns 0 on success, -EINVAL on error
+ */
+static int read_idle_message(struct hfi1_devdata *dd, u64 type, u64 *data_out)
+{
+       int ret;
+
+       ret = do_8051_command(dd, HCMD_READ_LCB_IDLE_MSG, type, data_out);
+       if (ret != HCMD_SUCCESS) {
+               dd_dev_err(dd, "read idle message: type %d, err %d\n",
+                          (u32)type, ret);
+               return -EINVAL;
+       }
+       dd_dev_info(dd, "%s: read idle message 0x%llx\n", __func__, *data_out);
+       /* return only the payload as we already know the type */
+       *data_out >>= IDLE_PAYLOAD_SHIFT;
+       return 0;
+}
+
+/*
+ * Read an idle SMA message.  To be done in response to a notification from
+ * the 8051.
+ *
+ * Returns 0 on success, -EINVAL on error
+ */
+static int read_idle_sma(struct hfi1_devdata *dd, u64 *data)
+{
+       return read_idle_message(dd, (u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT,
+                                data);
+}
+
+/*
+ * Send an idle LCB message.
+ *
+ * Returns 0 on success, -EINVAL on error
+ */
+static int send_idle_message(struct hfi1_devdata *dd, u64 data)
+{
+       int ret;
+
+       dd_dev_info(dd, "%s: sending idle message 0x%llx\n", __func__, data);
+       ret = do_8051_command(dd, HCMD_SEND_LCB_IDLE_MSG, data, NULL);
+       if (ret != HCMD_SUCCESS) {
+               dd_dev_err(dd, "send idle message: data 0x%llx, err %d\n",
+                          data, ret);
+               return -EINVAL;
+       }
+       return 0;
+}
+
+/*
+ * Send an idle SMA message.
+ *
+ * Returns 0 on success, -EINVAL on error
+ */
+int send_idle_sma(struct hfi1_devdata *dd, u64 message)
+{
+       u64 data;
+
+       data = ((message & IDLE_PAYLOAD_MASK) << IDLE_PAYLOAD_SHIFT) |
+               ((u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT);
+       return send_idle_message(dd, data);
+}
+
+/*
+ * Initialize the LCB then do a quick link up.  This may or may not be
+ * in loopback.
+ *
+ * return 0 on success, -errno on error
+ */
+static int do_quick_linkup(struct hfi1_devdata *dd)
+{
+       u64 reg;
+       unsigned long timeout;
+       int ret;
+
+       lcb_shutdown(dd, 0);
+
+       if (loopback) {
+               /* LCB_CFG_LOOPBACK.VAL = 2 */
+               /* LCB_CFG_LANE_WIDTH.VAL = 0 */
+               write_csr(dd, DC_LCB_CFG_LOOPBACK,
+                         IB_PACKET_TYPE << DC_LCB_CFG_LOOPBACK_VAL_SHIFT);
+               write_csr(dd, DC_LCB_CFG_LANE_WIDTH, 0);
+       }
+
+       /* start the LCBs */
+       /* LCB_CFG_TX_FIFOS_RESET.VAL = 0 */
+       write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0);
+
+       /* simulator only loopback steps */
+       if (loopback && dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
+               /* LCB_CFG_RUN.EN = 1 */
+               write_csr(dd, DC_LCB_CFG_RUN,
+                         1ull << DC_LCB_CFG_RUN_EN_SHIFT);
+
+               /* watch LCB_STS_LINK_TRANSFER_ACTIVE */
+               timeout = jiffies + msecs_to_jiffies(10);
+               while (1) {
+                       reg = read_csr(dd, DC_LCB_STS_LINK_TRANSFER_ACTIVE);
+                       if (reg)
+                               break;
+                       if (time_after(jiffies, timeout)) {
+                               dd_dev_err(dd,
+                                          "timeout waiting for LINK_TRANSFER_ACTIVE\n");
+                               return -ETIMEDOUT;
+                       }
+                       udelay(2);
+               }
+
+               write_csr(dd, DC_LCB_CFG_ALLOW_LINK_UP,
+                         1ull << DC_LCB_CFG_ALLOW_LINK_UP_VAL_SHIFT);
+       }
+
+       if (!loopback) {
+               /*
+                * When doing quick linkup and not in loopback, both
+                * sides must be done with LCB set-up before either
+                * starts the quick linkup.  Put a delay here so that
+                * both sides can be started and have a chance to be
+                * done with LCB set up before resuming.
+                */
+               dd_dev_err(dd,
+                          "Pausing for peer to be finished with LCB set up\n");
+               msleep(5000);
+               dd_dev_err(dd, "Continuing with quick linkup\n");
+       }
+
+       write_csr(dd, DC_LCB_ERR_EN, 0); /* mask LCB errors */
+       set_8051_lcb_access(dd);
+
+       /*
+        * State "quick" LinkUp request sets the physical link state to
+        * LinkUp without a verify capability sequence.
+        * This state is in simulator v37 and later.
+        */
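+       /* 'what' 0/1/2 = SDma, SDmaProgress, SDmaIdle (see grouping above) */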
+       ret = set_physical_link_state(dd, PLS_QUICK_LINKUP);
+       if (ret != HCMD_SUCCESS) {
+               dd_dev_err(dd,
+                          "%s: set physical link state to quick LinkUp failed with return %d\n",
+                          __func__, ret);
+
+               set_host_lcb_access(dd);
+               write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */
+
+               if (ret >= 0)
+                       ret = -EINVAL;
+               return ret;
+       }
+
+       return 0; /* success */
+}
+
+/*
+ * Set the SerDes to internal loopback mode.
+ * Returns 0 on success, -errno on error.
+ */
+static int set_serdes_loopback_mode(struct hfi1_devdata *dd)
+{
+       int ret;
+
+       ret = set_physical_link_state(dd, PLS_INTERNAL_SERDES_LOOPBACK);
+       if (ret == HCMD_SUCCESS)
+               return 0;
+       dd_dev_err(dd,
+                  "Set physical link state to SerDes Loopback failed with return %d\n",
+                  ret);
+       if (ret >= 0)
+               ret = -EINVAL;
+       return ret;
+}
+
+/*
+ * Do all special steps to set up loopback.
+ */
+static int init_loopback(struct hfi1_devdata *dd)
+{
+       dd_dev_info(dd, "Entering loopback mode\n");
+
+       /* all loopbacks should disable self GUID check */
+       write_csr(dd, DC_DC8051_CFG_MODE,
+                 (read_csr(dd, DC_DC8051_CFG_MODE) | DISABLE_SELF_GUID_CHECK));
+
+       /*
+        * The simulator has only one loopback option - LCB.  Switch
+        * to that option, which includes quick link up.
+        *
+        * Accept all valid loopback values.
+        */
+       if ((dd->icode == ICODE_FUNCTIONAL_SIMULATOR) &&
+           (loopback == LOOPBACK_SERDES || loopback == LOOPBACK_LCB ||
+            loopback == LOOPBACK_CABLE)) {
+               loopback = LOOPBACK_LCB;
+               quick_linkup = 1;
+               return 0;
+       }
+
+       /* handle serdes loopback */
+       if (loopback == LOOPBACK_SERDES) {
+               /* internal serdes loopback needs quick linkup on RTL */
+               if (dd->icode == ICODE_RTL_SILICON)
+                       quick_linkup = 1;
+               return set_serdes_loopback_mode(dd);
+       }
+
+       /* LCB loopback - handled at poll time */
+       if (loopback == LOOPBACK_LCB) {
+               quick_linkup = 1; /* LCB is always quick linkup */
+
+               /* not supported in emulation due to emulation RTL changes */
+               if (dd->icode == ICODE_FPGA_EMULATION) {
+                       dd_dev_err(dd,
+                                  "LCB loopback not supported in emulation\n");
+                       return -EINVAL;
+               }
+               return 0;
+       }
+
+       /* external cable loopback requires no extra steps */
+       if (loopback == LOOPBACK_CABLE)
+               return 0;
+
+       dd_dev_err(dd, "Invalid loopback mode %d\n", loopback);
+       return -EINVAL;
+}
+
+/*
+ * Translate from the OPA_LINK_WIDTH handed to us by the FM to bits
+ * used in the Verify Capability link width attribute.
+ */
+static u16 opa_to_vc_link_widths(u16 opa_widths)
+{
+       int i;
+       u16 result = 0;
+
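+       /* VC link width attribute: bit (N-1) set means N-lane support */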
+       static const struct link_bits {
+               u16 from;
+               u16 to;
+       } opa_link_xlate[] = {
+               { OPA_LINK_WIDTH_1X, 1 << (1 - 1)  },
+               { OPA_LINK_WIDTH_2X, 1 << (2 - 1)  },
+               { OPA_LINK_WIDTH_3X, 1 << (3 - 1)  },
+               { OPA_LINK_WIDTH_4X, 1 << (4 - 1)  },
+       };
+
+       for (i = 0; i < ARRAY_SIZE(opa_link_xlate); i++) {
+               if (opa_widths & opa_link_xlate[i].from)
+                       result |= opa_link_xlate[i].to;
+       }
+       return result;
+}
+
+/*
+ * Set link attributes before moving to polling.
+ */
+static int set_local_link_attributes(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u8 enable_lane_tx;
+       u8 tx_polarity_inversion;
+       u8 rx_polarity_inversion;
+       int ret;
+
+       /* reset our fabric serdes to clear any lingering problems */
+       fabric_serdes_reset(dd);
+
+       /* set the local tx rate - need to read-modify-write */
+       ret = read_tx_settings(dd, &enable_lane_tx, &tx_polarity_inversion,
+                              &rx_polarity_inversion, &ppd->local_tx_rate);
+       if (ret)
+               goto set_local_link_attributes_fail;
+
+       if (dd->dc8051_ver < dc8051_ver(0, 20)) {
+               /* set the tx rate to the fastest enabled */
+               if (ppd->link_speed_enabled & OPA_LINK_SPEED_25G)
+                       ppd->local_tx_rate = 1;
+               else
+                       ppd->local_tx_rate = 0;
+       } else {
+               /* set the tx rate to all enabled */
+               ppd->local_tx_rate = 0;
+               if (ppd->link_speed_enabled & OPA_LINK_SPEED_25G)
+                       ppd->local_tx_rate |= 2;
+               if (ppd->link_speed_enabled & OPA_LINK_SPEED_12_5G)
+                       ppd->local_tx_rate |= 1;
+       }
+
+       enable_lane_tx = 0xF; /* enable all four lanes */
+       ret = write_tx_settings(dd, enable_lane_tx, tx_polarity_inversion,
+                               rx_polarity_inversion, ppd->local_tx_rate);
+       if (ret != HCMD_SUCCESS)
+               goto set_local_link_attributes_fail;
+
+       /*
+        * DC supports continuous updates.
+        */
+       ret = write_vc_local_phy(dd,
+                                0 /* no power management */,
+                                1 /* continuous updates */);
+       if (ret != HCMD_SUCCESS)
+               goto set_local_link_attributes_fail;
+
+       /* z=1 in the next call: AU of 0 is not supported by the hardware */
+       ret = write_vc_local_fabric(dd, dd->vau, 1, dd->vcu, dd->vl15_init,
+                                   ppd->port_crc_mode_enabled);
+       if (ret != HCMD_SUCCESS)
+               goto set_local_link_attributes_fail;
+
+       ret = write_vc_local_link_width(dd, 0, 0,
+                                       opa_to_vc_link_widths(
+                                               ppd->link_width_enabled));
+       if (ret != HCMD_SUCCESS)
+               goto set_local_link_attributes_fail;
+
+       /* let peer know who we are */
+       ret = write_local_device_id(dd, dd->pcidev->device, dd->minrev);
+       if (ret == HCMD_SUCCESS)
+               return 0;
+
+set_local_link_attributes_fail:
+       dd_dev_err(dd,
+                  "Failed to set local link attributes, return 0x%x\n",
+                  ret);
+       return ret;
+}
+
+/*
+ * Call this to start the link.
+ * Do not do anything if the link is disabled.
+ * Returns 0 if link is disabled, moved to polling, or the driver is not ready.
+ */
+int start_link(struct hfi1_pportdata *ppd)
+{
+       if (!ppd->link_enabled) {
+               dd_dev_info(ppd->dd,
+                           "%s: stopping link start because link is disabled\n",
+                           __func__);
+               return 0;
+       }
+       if (!ppd->driver_link_ready) {
+               dd_dev_info(ppd->dd,
+                           "%s: stopping link start because driver is not ready\n",
+                           __func__);
+               return 0;
+       }
+
+       return set_link_state(ppd, HLS_DN_POLL);
+}
+
+static void wait_for_qsfp_init(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 mask;
+       unsigned long timeout;
+
+       /*
+        * Check for QSFP interrupt for t_init (SFF 8679)
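+        * (the 2000 ms timeout below presumably corresponds to the
+        * specification's maximum t_init)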
+        */
+       timeout = jiffies + msecs_to_jiffies(2000);
+       while (1) {
+               mask = read_csr(dd, dd->hfi1_id ?
+                               ASIC_QSFP2_IN : ASIC_QSFP1_IN);
+               if (!(mask & QSFP_HFI0_INT_N)) {
+                       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_CLEAR :
+                                 ASIC_QSFP1_CLEAR, QSFP_HFI0_INT_N);
+                       break;
+               }
+               if (time_after(jiffies, timeout)) {
+                       dd_dev_info(dd, "%s: No IntN detected, reset complete\n",
+                                   __func__);
+                       break;
+               }
+               udelay(2);
+       }
+}
+
+static void set_qsfp_int_n(struct hfi1_pportdata *ppd, u8 enable)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 mask;
+
+       mask = read_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK);
+       if (enable)
+               mask |= (u64)QSFP_HFI0_INT_N;
+       else
+               mask &= ~(u64)QSFP_HFI0_INT_N;
+       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK, mask);
+}
+
+void reset_qsfp(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 mask, qsfp_mask;
+
+       /* Disable INT_N from triggering QSFP interrupts */
+       set_qsfp_int_n(ppd, 0);
+
+       /* Reset the QSFP */
+       mask = (u64)QSFP_HFI0_RESET_N;
+
+       qsfp_mask = read_csr(dd,
+                            dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT);
+       qsfp_mask &= ~mask;
+       write_csr(dd,
+                 dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT, qsfp_mask);
+
+       udelay(10);
+
+       qsfp_mask |= mask;
+       write_csr(dd,
+                 dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT, qsfp_mask);
+
+       wait_for_qsfp_init(ppd);
+
+       /*
+        * Allow INT_N to trigger the QSFP interrupt to watch
+        * for alarms and warnings
+        */
+       set_qsfp_int_n(ppd, 1);
+}
+
+static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd,
+                                       u8 *qsfp_interrupt_status)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+
+       if ((qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_ALARM) ||
+           (qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_WARNING))
+               dd_dev_info(dd, "%s: QSFP cable temperature too high\n",
+                           __func__);
+
+       if ((qsfp_interrupt_status[0] & QSFP_LOW_TEMP_ALARM) ||
+           (qsfp_interrupt_status[0] & QSFP_LOW_TEMP_WARNING))
+               dd_dev_info(dd, "%s: QSFP cable temperature too low\n",
+                           __func__);
+
+       /*
+        * The remaining alarms/warnings don't matter if the link is down.
+        */
+       if (ppd->host_link_state & HLS_DOWN)
+               return 0;
+
+       if ((qsfp_interrupt_status[1] & QSFP_HIGH_VCC_ALARM) ||
+           (qsfp_interrupt_status[1] & QSFP_HIGH_VCC_WARNING))
+               dd_dev_info(dd, "%s: QSFP supply voltage too high\n",
+                           __func__);
+
+       if ((qsfp_interrupt_status[1] & QSFP_LOW_VCC_ALARM) ||
+           (qsfp_interrupt_status[1] & QSFP_LOW_VCC_WARNING))
+               dd_dev_info(dd, "%s: QSFP supply voltage too low\n",
+                           __func__);
+
+       /* Byte 2 is vendor specific */
+
+       if ((qsfp_interrupt_status[3] & QSFP_HIGH_POWER_ALARM) ||
+           (qsfp_interrupt_status[3] & QSFP_HIGH_POWER_WARNING))
+               dd_dev_info(dd, "%s: Cable RX channel 1/2 power too high\n",
+                           __func__);
+
+       if ((qsfp_interrupt_status[3] & QSFP_LOW_POWER_ALARM) ||
+           (qsfp_interrupt_status[3] & QSFP_LOW_POWER_WARNING))
+               dd_dev_info(dd, "%s: Cable RX channel 1/2 power too low\n",
+                           __func__);
+
+       if ((qsfp_interrupt_status[4] & QSFP_HIGH_POWER_ALARM) ||
+           (qsfp_interrupt_status[4] & QSFP_HIGH_POWER_WARNING))
+               dd_dev_info(dd, "%s: Cable RX channel 3/4 power too high\n",
+                           __func__);
+
+       if ((qsfp_interrupt_status[4] & QSFP_LOW_POWER_ALARM) ||
+           (qsfp_interrupt_status[4] & QSFP_LOW_POWER_WARNING))
+               dd_dev_info(dd, "%s: Cable RX channel 3/4 power too low\n",
+                           __func__);
+
+       if ((qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_ALARM) ||
+           (qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_WARNING))
+               dd_dev_info(dd, "%s: Cable TX channel 1/2 bias too high\n",
+                           __func__);
+
+       if ((qsfp_interrupt_status[5] & QSFP_LOW_BIAS_ALARM) ||
+           (qsfp_interrupt_status[5] & QSFP_LOW_BIAS_WARNING))
+               dd_dev_info(dd, "%s: Cable TX channel 1/2 bias too low\n",
+                           __func__);
+
+       if ((qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_ALARM) ||
+           (qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_WARNING))
+               dd_dev_info(dd, "%s: Cable TX channel 3/4 bias too high\n",
+                           __func__);
+
+       if ((qsfp_interrupt_status[6] & QSFP_LOW_BIAS_ALARM) ||
+           (qsfp_interrupt_status[6] & QSFP_LOW_BIAS_WARNING))
+               dd_dev_info(dd, "%s: Cable TX channel 3/4 bias too low\n",
+                           __func__);
+
+       if ((qsfp_interrupt_status[7] & QSFP_HIGH_POWER_ALARM) ||
+           (qsfp_interrupt_status[7] & QSFP_HIGH_POWER_WARNING))
+               dd_dev_info(dd, "%s: Cable TX channel 1/2 power too high\n",
+                           __func__);
+
+       if ((qsfp_interrupt_status[7] & QSFP_LOW_POWER_ALARM) ||
+           (qsfp_interrupt_status[7] & QSFP_LOW_POWER_WARNING))
+               dd_dev_info(dd, "%s: Cable TX channel 1/2 power too low\n",
+                           __func__);
+
+       if ((qsfp_interrupt_status[8] & QSFP_HIGH_POWER_ALARM) ||
+           (qsfp_interrupt_status[8] & QSFP_HIGH_POWER_WARNING))
+               dd_dev_info(dd, "%s: Cable TX channel 3/4 power too high\n",
+                           __func__);
+
+       if ((qsfp_interrupt_status[8] & QSFP_LOW_POWER_ALARM) ||
+           (qsfp_interrupt_status[8] & QSFP_LOW_POWER_WARNING))
+               dd_dev_info(dd, "%s: Cable TX channel 3/4 power too low\n",
+                           __func__);
+
+       /* Bytes 9-10 and 11-12 are reserved */
+       /* Bytes 13-15 are vendor specific */
+
+       return 0;
+}
+
+/* This routine will only be scheduled if the QSFP module presence is asserted */
+void qsfp_event(struct work_struct *work)
+{
+       struct qsfp_data *qd;
+       struct hfi1_pportdata *ppd;
+       struct hfi1_devdata *dd;
+
+       qd = container_of(work, struct qsfp_data, qsfp_work);
+       ppd = qd->ppd;
+       dd = ppd->dd;
+
+       /* Sanity check */
+       if (!qsfp_mod_present(ppd))
+               return;
+
+       /*
+        * Turn DC back on after cable has been re-inserted. Up until
+        * now, the DC has been in reset to save power.
+        */
+       dc_start(dd);
+
+       if (qd->cache_refresh_required) {
+               set_qsfp_int_n(ppd, 0);
+
+               wait_for_qsfp_init(ppd);
+
+               /*
+                * Allow INT_N to trigger the QSFP interrupt to watch
+                * for alarms and warnings
+                */
+               set_qsfp_int_n(ppd, 1);
+
+               tune_serdes(ppd);
+
+               start_link(ppd);
+       }
+
+       if (qd->check_interrupt_flags) {
+               u8 qsfp_interrupt_status[16] = {0,};
+
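+               /*
+                * Read the 16 interrupt/status flag bytes starting at
+                * offset 6 of the QSFP memory map.
+                */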
+               if (one_qsfp_read(ppd, dd->hfi1_id, 6,
+                                 &qsfp_interrupt_status[0], 16) != 16) {
+                       dd_dev_info(dd,
+                                   "%s: Failed to read status of QSFP module\n",
+                                   __func__);
+               } else {
+                       unsigned long flags;
+
+                       handle_qsfp_error_conditions(
+                                       ppd, qsfp_interrupt_status);
+                       spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+                       ppd->qsfp_info.check_interrupt_flags = 0;
+                       spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
+                                              flags);
+               }
+       }
+}
+
+static void init_qsfp_int(struct hfi1_devdata *dd)
+{
+       struct hfi1_pportdata *ppd = dd->pport;
+       u64 qsfp_mask, cce_int_mask;
+       const int qsfp1_int_smask = QSFP1_INT % 64;
+       const int qsfp2_int_smask = QSFP2_INT % 64;
+
+       /*
+        * disable QSFP1 interrupts for HFI1, QSFP2 interrupts for HFI0
+        * Qsfp1Int and Qsfp2Int are adjacent bits in the same CSR,
+        * therefore just one of QSFP1_INT/QSFP2_INT can be used to find
+        * the index of the appropriate CSR in the CCEIntMask CSR array
+        */
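+       /* each CCEIntMask CSR is 8 bytes and covers 64 interrupt sources */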
+       cce_int_mask = read_csr(dd, CCE_INT_MASK +
+                               (8 * (QSFP1_INT / 64)));
+       if (dd->hfi1_id) {
+               cce_int_mask &= ~((u64)1 << qsfp1_int_smask);
+               write_csr(dd, CCE_INT_MASK + (8 * (QSFP1_INT / 64)),
+                         cce_int_mask);
+       } else {
+               cce_int_mask &= ~((u64)1 << qsfp2_int_smask);
+               write_csr(dd, CCE_INT_MASK + (8 * (QSFP2_INT / 64)),
+                         cce_int_mask);
+       }
+
+       qsfp_mask = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N);
+       /* Clear current status to avoid spurious interrupts */
+       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_CLEAR : ASIC_QSFP1_CLEAR,
+                 qsfp_mask);
+       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK,
+                 qsfp_mask);
+
+       set_qsfp_int_n(ppd, 0);
+
+       /* Handle active low nature of INT_N and MODPRST_N pins */
+       if (qsfp_mod_present(ppd))
+               qsfp_mask &= ~(u64)QSFP_HFI0_MODPRST_N;
+       write_csr(dd,
+                 dd->hfi1_id ? ASIC_QSFP2_INVERT : ASIC_QSFP1_INVERT,
+                 qsfp_mask);
+}
+
+/*
+ * Do a one-time initialize of the LCB block.
+ */
+static void init_lcb(struct hfi1_devdata *dd)
+{
+       /* simulator does not correctly handle LCB cclk loopback, skip */
+       if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
+               return;
+
+       /* the DC has been reset earlier in the driver load */
+
+       /* set LCB for cclk loopback on the port */
+       write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0x01);
+       write_csr(dd, DC_LCB_CFG_LANE_WIDTH, 0x00);
+       write_csr(dd, DC_LCB_CFG_REINIT_AS_SLAVE, 0x00);
+       write_csr(dd, DC_LCB_CFG_CNT_FOR_SKIP_STALL, 0x110);
+       write_csr(dd, DC_LCB_CFG_CLK_CNTR, 0x08);
+       write_csr(dd, DC_LCB_CFG_LOOPBACK, 0x02);
+       write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0x00);
+}
+
+int bringup_serdes(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 guid;
+       int ret;
+
+       if (HFI1_CAP_IS_KSET(EXTENDED_PSN))
+               add_rcvctrl(dd, RCV_CTRL_RCV_EXTENDED_PSN_ENABLE_SMASK);
+
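+       /* derive a port GUID from the device base GUID if none is set */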
+       guid = ppd->guid;
+       if (!guid) {
+               if (dd->base_guid)
+                       guid = dd->base_guid + ppd->port - 1;
+               ppd->guid = guid;
+       }
+
+       /* Set linkinit_reason on power up per OPA spec */
+       ppd->linkinit_reason = OPA_LINKINIT_REASON_LINKUP;
+
+       /* one-time init of the LCB */
+       init_lcb(dd);
+
+       if (loopback) {
+               ret = init_loopback(dd);
+               if (ret < 0)
+                       return ret;
+       }
+
+       get_port_type(ppd);
+       if (ppd->port_type == PORT_TYPE_QSFP) {
+               set_qsfp_int_n(ppd, 0);
+               wait_for_qsfp_init(ppd);
+               set_qsfp_int_n(ppd, 1);
+       }
+
+       /*
+        * Tune the SerDes to a ballpark setting for
+        * optimal signal and bit error rate.
+        * Needs to be done before starting the link.
+        */
+       tune_serdes(ppd);
+
+       return start_link(ppd);
+}
+
+void hfi1_quiet_serdes(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+
+       /*
+        * Shut down the link and keep it down.  First clear the flag that
+        * indicates the driver wants the link to be up (driver_link_ready).
+        * Then make sure the link is not automatically restarted
+        * (link_enabled).  Cancel any pending restart.  And finally
+        * go offline.
+        */
+       ppd->driver_link_ready = 0;
+       ppd->link_enabled = 0;
+
+       ppd->offline_disabled_reason =
+                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED);
+       set_link_down_reason(ppd, OPA_LINKDOWN_REASON_SMA_DISABLED, 0,
+                            OPA_LINKDOWN_REASON_SMA_DISABLED);
+       set_link_state(ppd, HLS_DN_OFFLINE);
+
+       /* disable the port */
+       clear_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+}
+
+static inline int init_cpu_counters(struct hfi1_devdata *dd)
+{
+       struct hfi1_pportdata *ppd;
+       int i;
+
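+       /* per-port structures are laid out right after the devdata */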
+       ppd = (struct hfi1_pportdata *)(dd + 1);
+       for (i = 0; i < dd->num_pports; i++, ppd++) {
+               ppd->ibport_data.rvp.rc_acks = NULL;
+               ppd->ibport_data.rvp.rc_qacks = NULL;
+               ppd->ibport_data.rvp.rc_acks = alloc_percpu(u64);
+               ppd->ibport_data.rvp.rc_qacks = alloc_percpu(u64);
+               ppd->ibport_data.rvp.rc_delayed_comp = alloc_percpu(u64);
+               if (!ppd->ibport_data.rvp.rc_acks ||
+                   !ppd->ibport_data.rvp.rc_delayed_comp ||
+                   !ppd->ibport_data.rvp.rc_qacks)
+                       return -ENOMEM;
+       }
+
+       return 0;
+}
+
+static const char * const pt_names[] = {
+       "expected",
+       "eager",
+       "invalid"
+};
+
+static const char *pt_name(u32 type)
+{
+       return type >= ARRAY_SIZE(pt_names) ? "unknown" : pt_names[type];
+}
+
+/*
+ * index is the index into the receive array
+ */
+void hfi1_put_tid(struct hfi1_devdata *dd, u32 index,
+                 u32 type, unsigned long pa, u16 order)
+{
+       u64 reg;
+       void __iomem *base = (dd->rcvarray_wc ? dd->rcvarray_wc :
+                             (dd->kregbase + RCV_ARRAY));
+
+       if (!(dd->flags & HFI1_PRESENT))
+               goto done;
+
+       if (type == PT_INVALID) {
+               pa = 0;
+       } else if (type > PT_INVALID) {
+               dd_dev_err(dd,
+                          "unexpected receive array type %u for index %u, not handled\n",
+                          type, index);
+               goto done;
+       }
+
+       hfi1_cdbg(TID, "type %s, index 0x%x, pa 0x%lx, bsize 0x%lx",
+                 pt_name(type), index, pa, (unsigned long)order);
+
+#define RT_ADDR_SHIFT 12       /* 4KB kernel address boundary */
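+       /*
+        * Build the RcvArray entry: write-enable bit, buffer size (order),
+        * and the physical address in 4KB units.
+        */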
+       reg = RCV_ARRAY_RT_WRITE_ENABLE_SMASK
+               | (u64)order << RCV_ARRAY_RT_BUF_SIZE_SHIFT
+               | ((pa >> RT_ADDR_SHIFT) & RCV_ARRAY_RT_ADDR_MASK)
+                                       << RCV_ARRAY_RT_ADDR_SHIFT;
+       writeq(reg, base + (index * 8));
+
+       if (type == PT_EAGER)
+               /*
+                * Eager entries are written one-by-one so we have to push them
+                * after we write the entry.
+                */
+               flush_wc();
+done:
+       return;
+}
+
+void hfi1_clear_tids(struct hfi1_ctxtdata *rcd)
+{
+       struct hfi1_devdata *dd = rcd->dd;
+       u32 i;
+
+       /* this could be optimized */
+       for (i = rcd->eager_base; i < rcd->eager_base +
+                    rcd->egrbufs.alloced; i++)
+               hfi1_put_tid(dd, i, PT_INVALID, 0, 0);
+
+       for (i = rcd->expected_base;
+                       i < rcd->expected_base + rcd->expected_count; i++)
+               hfi1_put_tid(dd, i, PT_INVALID, 0, 0);
+}
+
+int hfi1_get_base_kinfo(struct hfi1_ctxtdata *rcd,
+                       struct hfi1_ctxt_info *kinfo)
+{
+       kinfo->runtime_flags = (HFI1_MISC_GET() << HFI1_CAP_USER_SHIFT) |
+               HFI1_CAP_UGET(MASK) | HFI1_CAP_KGET(K2U);
+       return 0;
+}
+
+struct hfi1_message_header *hfi1_get_msgheader(
+                               struct hfi1_devdata *dd, __le32 *rhf_addr)
+{
+       u32 offset = rhf_hdrq_offset(rhf_to_cpu(rhf_addr));
+
+       return (struct hfi1_message_header *)
+               (rhf_addr - dd->rhf_offset + offset);
+}
+
+static const char * const ib_cfg_name_strings[] = {
+       "HFI1_IB_CFG_LIDLMC",
+       "HFI1_IB_CFG_LWID_DG_ENB",
+       "HFI1_IB_CFG_LWID_ENB",
+       "HFI1_IB_CFG_LWID",
+       "HFI1_IB_CFG_SPD_ENB",
+       "HFI1_IB_CFG_SPD",
+       "HFI1_IB_CFG_RXPOL_ENB",
+       "HFI1_IB_CFG_LREV_ENB",
+       "HFI1_IB_CFG_LINKLATENCY",
+       "HFI1_IB_CFG_HRTBT",
+       "HFI1_IB_CFG_OP_VLS",
+       "HFI1_IB_CFG_VL_HIGH_CAP",
+       "HFI1_IB_CFG_VL_LOW_CAP",
+       "HFI1_IB_CFG_OVERRUN_THRESH",
+       "HFI1_IB_CFG_PHYERR_THRESH",
+       "HFI1_IB_CFG_LINKDEFAULT",
+       "HFI1_IB_CFG_PKEYS",
+       "HFI1_IB_CFG_MTU",
+       "HFI1_IB_CFG_LSTATE",
+       "HFI1_IB_CFG_VL_HIGH_LIMIT",
+       "HFI1_IB_CFG_PMA_TICKS",
+       "HFI1_IB_CFG_PORT"
+};
+
+static const char *ib_cfg_name(int which)
+{
+       if (which < 0 || which >= ARRAY_SIZE(ib_cfg_name_strings))
+               return "invalid";
+       return ib_cfg_name_strings[which];
+}
+
+int hfi1_get_ib_cfg(struct hfi1_pportdata *ppd, int which)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       int val = 0;
+
+       switch (which) {
+       case HFI1_IB_CFG_LWID_ENB: /* allowed Link-width */
+               val = ppd->link_width_enabled;
+               break;
+       case HFI1_IB_CFG_LWID: /* currently active Link-width */
+               val = ppd->link_width_active;
+               break;
+       case HFI1_IB_CFG_SPD_ENB: /* allowed Link speeds */
+               val = ppd->link_speed_enabled;
+               break;
+       case HFI1_IB_CFG_SPD: /* current Link speed */
+               val = ppd->link_speed_active;
+               break;
+
+       case HFI1_IB_CFG_RXPOL_ENB: /* Auto-RX-polarity enable */
+       case HFI1_IB_CFG_LREV_ENB: /* Auto-Lane-reversal enable */
+       case HFI1_IB_CFG_LINKLATENCY:
+               goto unimplemented;
+
+       case HFI1_IB_CFG_OP_VLS:
+               val = ppd->vls_operational;
+               break;
+       case HFI1_IB_CFG_VL_HIGH_CAP: /* VL arb high priority table size */
+               val = VL_ARB_HIGH_PRIO_TABLE_SIZE;
+               break;
+       case HFI1_IB_CFG_VL_LOW_CAP: /* VL arb low priority table size */
+               val = VL_ARB_LOW_PRIO_TABLE_SIZE;
+               break;
+       case HFI1_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */
+               val = ppd->overrun_threshold;
+               break;
+       case HFI1_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */
+               val = ppd->phy_error_threshold;
+               break;
+       case HFI1_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */
+               val = dd->link_default;
+               break;
+
+       case HFI1_IB_CFG_HRTBT: /* Heartbeat off/enable/auto */
+       case HFI1_IB_CFG_PMA_TICKS:
+       default:
+unimplemented:
+               if (HFI1_CAP_IS_KSET(PRINT_UNIMPL))
+                       dd_dev_info(
+                               dd,
+                               "%s: which %s: not implemented\n",
+                               __func__,
+                               ib_cfg_name(which));
+               break;
+       }
+
+       return val;
+}
+
+/*
+ * The largest MAD packet size.
+ */
+#define MAX_MAD_PACKET 2048
+
+/*
+ * Return the maximum header bytes that can go on the _wire_
+ * for this device. This count includes the ICRC which is
+ * not part of the packet held in memory but it is appended
+ * by the HW.
+ * This is dependent on the device's receive header entry size.
+ * HFI allows this to be set per-receive context, but the
+ * driver presently enforces a global value.
+ */
+u32 lrh_max_header_bytes(struct hfi1_devdata *dd)
+{
+       /*
+        * The maximum non-payload (MTU) bytes in LRH.PktLen are
+        * the Receive Header Entry Size minus the PBC (or RHF) size
+        * plus one DW for the ICRC appended by HW.
+        *
+        * dd->rcd[0].rcvhdrqentsize is in DW.
+        * We use rcd[0] as all contexts will have the same value. Also,
+        * the first kernel context would have been allocated by now so
+        * we are guaranteed a valid value.
+        */
+       return (dd->rcd[0]->rcvhdrqentsize - 2/*PBC/RHF*/ + 1/*ICRC*/) << 2;
+}
+
+/*
+ * Set Send Length
+ * @ppd - per port data
+ *
+ * Set the MTU by limiting how many DWs may be sent.  The SendLenCheck*
+ * registers compare against LRH.PktLen, so use the max bytes included
+ * in the LRH.
+ *
+ * This routine changes all VL values except VL15, which it maintains at
+ * the same value.
+ */
+static void set_send_length(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u32 max_hb = lrh_max_header_bytes(dd), dcmtu;
+       u32 maxvlmtu = dd->vld[15].mtu;
+       u64 len1 = 0, len2 = (((dd->vld[15].mtu + max_hb) >> 2)
+                             & SEND_LEN_CHECK1_LEN_VL15_MASK) <<
+               SEND_LEN_CHECK1_LEN_VL15_SHIFT;
+       int i;
+       u32 thres;
+
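+       /*
+        * Pack the per-VL length check values (MTU + max header, in DWs)
+        * into the two CSRs: VL0-3 into SEND_LEN_CHECK0, VL4-7 into
+        * SEND_LEN_CHECK1 (which already holds VL15 from above).
+        */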
+       for (i = 0; i < ppd->vls_supported; i++) {
+               if (dd->vld[i].mtu > maxvlmtu)
+                       maxvlmtu = dd->vld[i].mtu;
+               if (i <= 3)
+                       len1 |= (((dd->vld[i].mtu + max_hb) >> 2)
+                                & SEND_LEN_CHECK0_LEN_VL0_MASK) <<
+                               ((i % 4) * SEND_LEN_CHECK0_LEN_VL1_SHIFT);
+               else
+                       len2 |= (((dd->vld[i].mtu + max_hb) >> 2)
+                                & SEND_LEN_CHECK1_LEN_VL4_MASK) <<
+                               ((i % 4) * SEND_LEN_CHECK1_LEN_VL5_SHIFT);
+       }
+       write_csr(dd, SEND_LEN_CHECK0, len1);
+       write_csr(dd, SEND_LEN_CHECK1, len2);
+       /* adjust kernel credit return thresholds based on new MTUs */
+       /* all kernel receive contexts have the same hdrqentsize */
+       for (i = 0; i < ppd->vls_supported; i++) {
+               thres = min(sc_percent_to_threshold(dd->vld[i].sc, 50),
+                           sc_mtu_to_threshold(dd->vld[i].sc,
+                                               dd->vld[i].mtu,
+                                               dd->rcd[0]->rcvhdrqentsize));
+               sc_set_cr_threshold(dd->vld[i].sc, thres);
+       }
+       thres = min(sc_percent_to_threshold(dd->vld[15].sc, 50),
+                   sc_mtu_to_threshold(dd->vld[15].sc,
+                                       dd->vld[15].mtu,
+                                       dd->rcd[0]->rcvhdrqentsize));
+       sc_set_cr_threshold(dd->vld[15].sc, thres);
+
+       /* Adjust maximum MTU for the port in DC */
+       dcmtu = maxvlmtu == 10240 ? DCC_CFG_PORT_MTU_CAP_10240 :
+               (ilog2(maxvlmtu >> 8) + 1);
+       len1 = read_csr(ppd->dd, DCC_CFG_PORT_CONFIG);
+       len1 &= ~DCC_CFG_PORT_CONFIG_MTU_CAP_SMASK;
+       len1 |= ((u64)dcmtu & DCC_CFG_PORT_CONFIG_MTU_CAP_MASK) <<
+               DCC_CFG_PORT_CONFIG_MTU_CAP_SHIFT;
+       write_csr(ppd->dd, DCC_CFG_PORT_CONFIG, len1);
+}
+
+static void set_lidlmc(struct hfi1_pportdata *ppd)
+{
+       int i;
+       u64 sreg = 0;
+       struct hfi1_devdata *dd = ppd->dd;
+       u32 mask = ~((1U << ppd->lmc) - 1); /* LID bits above the LMC bits */
+       u64 c1 = read_csr(ppd->dd, DCC_CFG_PORT_CONFIG1);
+
+       if (dd->hfi1_snoop.mode_flag)
+               dd_dev_info(dd, "Set lid/lmc while snooping");
+
+       c1 &= ~(DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK
+               | DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK);
+       c1 |= ((ppd->lid & DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK)
+                       << DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT) |
+             ((mask & DCC_CFG_PORT_CONFIG1_DLID_MASK_MASK)
+                       << DCC_CFG_PORT_CONFIG1_DLID_MASK_SHIFT);
+       write_csr(ppd->dd, DCC_CFG_PORT_CONFIG1, c1);
+
+       /*
+        * Iterate over all the send contexts and set their SLID check
+        */
+       sreg = ((mask & SEND_CTXT_CHECK_SLID_MASK_MASK) <<
+                       SEND_CTXT_CHECK_SLID_MASK_SHIFT) |
+              (((ppd->lid & mask) & SEND_CTXT_CHECK_SLID_VALUE_MASK) <<
+                       SEND_CTXT_CHECK_SLID_VALUE_SHIFT);
+
+       for (i = 0; i < dd->chip_send_contexts; i++) {
+               hfi1_cdbg(LINKVERB, "SendContext[%d].SLID_CHECK = 0x%x",
+                         i, (u32)sreg);
+               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_SLID, sreg);
+       }
+
+       /* Now we have to do the same thing for the sdma engines */
+       sdma_update_lmc(dd, mask, ppd->lid);
+}
+
+static int wait_phy_linkstate(struct hfi1_devdata *dd, u32 state, u32 msecs)
+{
+       unsigned long timeout;
+       u32 curr_state;
+
+       timeout = jiffies + msecs_to_jiffies(msecs);
+       while (1) {
+               curr_state = read_physical_state(dd);
+               if (curr_state == state)
+                       break;
+               if (time_after(jiffies, timeout)) {
+                       dd_dev_err(dd,
+                                  "timeout waiting for phy link state 0x%x, current state is 0x%x\n",
+                                  state, curr_state);
+                       return -ETIMEDOUT;
+               }
+               usleep_range(1950, 2050); /* sleep 2ms-ish */
+       }
+
+       return 0;
+}
+
+/*
+ * Helper for set_link_state().  Do not call except from that routine.
+ * Expects ppd->hls_lock to be held.
+ *
+ * @rem_reason value to be sent to the neighbor
+ *
+ * LinkDownReasons only set if transition succeeds.
+ */
+static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u32 pstate, previous_state;
+       u32 last_local_state;
+       u32 last_remote_state;
+       int ret;
+       int do_transition;
+       int do_wait;
+
+       previous_state = ppd->host_link_state;
+       ppd->host_link_state = HLS_GOING_OFFLINE;
+       pstate = read_physical_state(dd);
+       if (pstate == PLS_OFFLINE) {
+               do_transition = 0;      /* in right state */
+               do_wait = 0;            /* ...no need to wait */
+       } else if ((pstate & 0xff) == PLS_OFFLINE) {
+               do_transition = 0;      /* in an offline transient state */
+               do_wait = 1;            /* ...wait for it to settle */
+       } else {
+               do_transition = 1;      /* need to move to offline */
+               do_wait = 1;            /* ...will need to wait */
+       }
+
+       if (do_transition) {
+               ret = set_physical_link_state(dd,
+                                             (rem_reason << 8) | PLS_OFFLINE);
+
+               if (ret != HCMD_SUCCESS) {
+                       dd_dev_err(dd,
+                                  "Failed to transition to Offline link state, return %d\n",
+                                  ret);
+                       return -EINVAL;
+               }
+               if (ppd->offline_disabled_reason ==
+                               HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE))
+                       ppd->offline_disabled_reason =
+                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_TRANSIENT);
+       }
+
+       if (do_wait) {
+               /* it can take a while for the link to go down */
+               ret = wait_phy_linkstate(dd, PLS_OFFLINE, 10000);
+               if (ret < 0)
+                       return ret;
+       }
+
+       /* make sure the logical state is also down */
+       wait_logical_linkstate(ppd, IB_PORT_DOWN, 1000);
+
+       /*
+        * Now in charge of LCB - must be after the physical state is
+        * offline.quiet and before host_link_state is changed.
+        */
+       set_host_lcb_access(dd);
+       write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */
+       ppd->host_link_state = HLS_LINK_COOLDOWN; /* LCB access allowed */
+
+       if (ppd->port_type == PORT_TYPE_QSFP &&
+           ppd->qsfp_info.limiting_active &&
+           qsfp_mod_present(ppd)) {
+               int ret;
+
+               ret = acquire_chip_resource(dd, qsfp_resource(dd), QSFP_WAIT);
+               if (ret == 0) {
+                       set_qsfp_tx(ppd, 0);
+                       release_chip_resource(dd, qsfp_resource(dd));
+               } else {
+                       /* not fatal, but should warn */
+                       dd_dev_err(dd,
+                                  "Unable to acquire lock to turn off QSFP TX\n");
+               }
+       }
+
+       /*
+        * The LNI has a mandatory wait time after the physical state
+        * moves to Offline.Quiet.  The wait time may be different
+        * depending on how the link went down.  The 8051 firmware
+        * will observe the needed wait time and only move to ready
+        * when that is completed.  The largest of the quiet timeouts
+        * is 6s, so wait that long and then at least 0.5s more for
+        * other transitions, and another 0.5s for a buffer.
+        */
+       ret = wait_fm_ready(dd, 7000);
+       if (ret) {
+               dd_dev_err(dd,
+                          "After going offline, timed out waiting for the 8051 to become ready to accept host requests\n");
+               /* state is really offline, so make it so */
+               ppd->host_link_state = HLS_DN_OFFLINE;
+               return ret;
+       }
+
+       /*
+        * The state is now offline and the 8051 is ready to accept host
+        * requests.
+        *      - change our state
+        *      - notify others if we were previously in a linkup state
+        */
+       ppd->host_link_state = HLS_DN_OFFLINE;
+       if (previous_state & HLS_UP) {
+               /* went down while link was up */
+               handle_linkup_change(dd, 0);
+       } else if (previous_state
+                       & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) {
+               /* went down while attempting link up */
+               /* byte 1 of last_*_state is the failure reason */
+               read_last_local_state(dd, &last_local_state);
+               read_last_remote_state(dd, &last_remote_state);
+               dd_dev_err(dd,
+                          "LNI failure last states: local 0x%08x, remote 0x%08x\n",
+                          last_local_state, last_remote_state);
+       }
+
+       /* the active link width (downgrade) is 0 on link down */
+       ppd->link_width_active = 0;
+       ppd->link_width_downgrade_tx_active = 0;
+       ppd->link_width_downgrade_rx_active = 0;
+       ppd->current_egress_rate = 0;
+       return 0;
+}
+
+/* return the link state name */
+static const char *link_state_name(u32 state)
+{
+       const char *name;
+       int n = ilog2(state);
+       static const char * const names[] = {
+               [__HLS_UP_INIT_BP]       = "INIT",
+               [__HLS_UP_ARMED_BP]      = "ARMED",
+               [__HLS_UP_ACTIVE_BP]     = "ACTIVE",
+               [__HLS_DN_DOWNDEF_BP]    = "DOWNDEF",
+               [__HLS_DN_POLL_BP]       = "POLL",
+               [__HLS_DN_DISABLE_BP]    = "DISABLE",
+               [__HLS_DN_OFFLINE_BP]    = "OFFLINE",
+               [__HLS_VERIFY_CAP_BP]    = "VERIFY_CAP",
+               [__HLS_GOING_UP_BP]      = "GOING_UP",
+               [__HLS_GOING_OFFLINE_BP] = "GOING_OFFLINE",
+               [__HLS_LINK_COOLDOWN_BP] = "LINK_COOLDOWN"
+       };
+
+       name = n < ARRAY_SIZE(names) ? names[n] : NULL;
+       return name ? name : "unknown";
+}
+
+/* return the link state reason name */
+static const char *link_state_reason_name(struct hfi1_pportdata *ppd, u32 state)
+{
+       if (state == HLS_UP_INIT) {
+               switch (ppd->linkinit_reason) {
+               case OPA_LINKINIT_REASON_LINKUP:
+                       return "(LINKUP)";
+               case OPA_LINKINIT_REASON_FLAPPING:
+                       return "(FLAPPING)";
+               case OPA_LINKINIT_OUTSIDE_POLICY:
+                       return "(OUTSIDE_POLICY)";
+               case OPA_LINKINIT_QUARANTINED:
+                       return "(QUARANTINED)";
+               case OPA_LINKINIT_INSUFIC_CAPABILITY:
+                       return "(INSUFIC_CAPABILITY)";
+               default:
+                       break;
+               }
+       }
+       return "";
+}
+
+/*
+ * driver_physical_state - convert the driver's notion of a port's
+ * state (an HLS_*) into a physical state (a {IB,OPA}_PORTPHYSSTATE_*).
+ * Return -1 (converted to a u32) to indicate error.
+ */
+u32 driver_physical_state(struct hfi1_pportdata *ppd)
+{
+       switch (ppd->host_link_state) {
+       case HLS_UP_INIT:
+       case HLS_UP_ARMED:
+       case HLS_UP_ACTIVE:
+               return IB_PORTPHYSSTATE_LINKUP;
+       case HLS_DN_POLL:
+               return IB_PORTPHYSSTATE_POLLING;
+       case HLS_DN_DISABLE:
+               return IB_PORTPHYSSTATE_DISABLED;
+       case HLS_DN_OFFLINE:
+               return OPA_PORTPHYSSTATE_OFFLINE;
+       case HLS_VERIFY_CAP:
+               return IB_PORTPHYSSTATE_POLLING;
+       case HLS_GOING_UP:
+               return IB_PORTPHYSSTATE_POLLING;
+       case HLS_GOING_OFFLINE:
+               return OPA_PORTPHYSSTATE_OFFLINE;
+       case HLS_LINK_COOLDOWN:
+               return OPA_PORTPHYSSTATE_OFFLINE;
+       case HLS_DN_DOWNDEF:
+       default:
+               dd_dev_err(ppd->dd, "invalid host_link_state 0x%x\n",
+                          ppd->host_link_state);
+               return -1;
+       }
+}
+
+/*
+ * driver_logical_state - convert the driver's notion of a port's
+ * state (an HLS_*) into a logical state (a IB_PORT_*). Return -1
+ * (converted to a u32) to indicate error.
+ */
+u32 driver_logical_state(struct hfi1_pportdata *ppd)
+{
+       if (ppd->host_link_state && (ppd->host_link_state & HLS_DOWN))
+               return IB_PORT_DOWN;
+
+       switch (ppd->host_link_state & HLS_UP) {
+       case HLS_UP_INIT:
+               return IB_PORT_INIT;
+       case HLS_UP_ARMED:
+               return IB_PORT_ARMED;
+       case HLS_UP_ACTIVE:
+               return IB_PORT_ACTIVE;
+       default:
+               dd_dev_err(ppd->dd, "invalid host_link_state 0x%x\n",
+                          ppd->host_link_state);
+               return -1;
+       }
+}
+
+void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason,
+                         u8 neigh_reason, u8 rem_reason)
+{
+       if (ppd->local_link_down_reason.latest == 0 &&
+           ppd->neigh_link_down_reason.latest == 0) {
+               ppd->local_link_down_reason.latest = lcl_reason;
+               ppd->neigh_link_down_reason.latest = neigh_reason;
+               ppd->remote_link_down_reason = rem_reason;
+       }
+}
+
+/*
+ * Change the physical and/or logical link state.
+ *
+ * Do not call this routine while inside an interrupt.  It contains
+ * calls to routines that can take multiple seconds to finish.
+ *
+ * Returns 0 on success, -errno on failure.
+ */
+int set_link_state(struct hfi1_pportdata *ppd, u32 state)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       struct ib_event event = {.device = NULL};
+       int ret1, ret = 0;
+       int orig_new_state, poll_bounce;
+
+       mutex_lock(&ppd->hls_lock);
+
+       orig_new_state = state;
+       if (state == HLS_DN_DOWNDEF)
+               state = dd->link_default;
+
+       /* interpret poll -> poll as a link bounce */
+       poll_bounce = ppd->host_link_state == HLS_DN_POLL &&
+                     state == HLS_DN_POLL;
+
+       dd_dev_info(dd, "%s: current %s, new %s %s%s\n", __func__,
+                   link_state_name(ppd->host_link_state),
+                   link_state_name(orig_new_state),
+                   poll_bounce ? "(bounce) " : "",
+                   link_state_reason_name(ppd, state));
+
+       /*
+        * If we're going to a (HLS_*) link state that implies the logical
+        * link state is neither of (IB_PORT_ARMED, IB_PORT_ACTIVE), then
+        * reset is_sm_config_started to 0.
+        */
+       if (!(state & (HLS_UP_ARMED | HLS_UP_ACTIVE)))
+               ppd->is_sm_config_started = 0;
+
+       /*
+        * Do nothing if the states match.  Let a poll-to-poll link bounce
+        * go through.
+        */
+       if (ppd->host_link_state == state && !poll_bounce)
+               goto done;
+
+       switch (state) {
+       case HLS_UP_INIT:
+               if (ppd->host_link_state == HLS_DN_POLL &&
+                   (quick_linkup || dd->icode == ICODE_FUNCTIONAL_SIMULATOR)) {
+                       /*
+                        * Quick link up jumps from polling to here.
+                        *
+                        * Whether in normal or loopback mode, the
+                        * simulator jumps from polling to link up.
+                        * Accept that here.
+                        */
+                       /* OK */
+               } else if (ppd->host_link_state != HLS_GOING_UP) {
+                       goto unexpected;
+               }
+
+               ppd->host_link_state = HLS_UP_INIT;
+               ret = wait_logical_linkstate(ppd, IB_PORT_INIT, 1000);
+               if (ret) {
+                       /* logical state didn't change, stay at going_up */
+                       ppd->host_link_state = HLS_GOING_UP;
+                       dd_dev_err(dd,
+                                  "%s: logical state did not change to INIT\n",
+                                  __func__);
+               } else {
+                       /* clear old transient LINKINIT_REASON code */
+                       if (ppd->linkinit_reason >= OPA_LINKINIT_REASON_CLEAR)
+                               ppd->linkinit_reason =
+                                       OPA_LINKINIT_REASON_LINKUP;
+
+                       /* enable the port */
+                       add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+
+                       handle_linkup_change(dd, 1);
+               }
+               break;
+       case HLS_UP_ARMED:
+               if (ppd->host_link_state != HLS_UP_INIT)
+                       goto unexpected;
+
+               ppd->host_link_state = HLS_UP_ARMED;
+               set_logical_state(dd, LSTATE_ARMED);
+               ret = wait_logical_linkstate(ppd, IB_PORT_ARMED, 1000);
+               if (ret) {
+                       /* logical state didn't change, stay at init */
+                       ppd->host_link_state = HLS_UP_INIT;
+                       dd_dev_err(dd,
+                                  "%s: logical state did not change to ARMED\n",
+                                  __func__);
+               }
+               /*
+                * The simulator does not currently implement SMA messages,
+                * so neighbor_normal is not set.  Set it here when we first
+                * move to Armed.
+                */
+               if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
+                       ppd->neighbor_normal = 1;
+               break;
+       case HLS_UP_ACTIVE:
+               if (ppd->host_link_state != HLS_UP_ARMED)
+                       goto unexpected;
+
+               ppd->host_link_state = HLS_UP_ACTIVE;
+               set_logical_state(dd, LSTATE_ACTIVE);
+               ret = wait_logical_linkstate(ppd, IB_PORT_ACTIVE, 1000);
+               if (ret) {
+                       /* logical state didn't change, stay at armed */
+                       ppd->host_link_state = HLS_UP_ARMED;
+                       dd_dev_err(dd,
+                                  "%s: logical state did not change to ACTIVE\n",
+                                  __func__);
+               } else {
+                       /* tell all engines to go running */
+                       sdma_all_running(dd);
+
+                       /* Signal the IB layer that the port has gone active */
+                       event.device = &dd->verbs_dev.rdi.ibdev;
+                       event.element.port_num = ppd->port;
+                       event.event = IB_EVENT_PORT_ACTIVE;
+               }
+               break;
+       case HLS_DN_POLL:
+               if ((ppd->host_link_state == HLS_DN_DISABLE ||
+                    ppd->host_link_state == HLS_DN_OFFLINE) &&
+                   dd->dc_shutdown)
+                       dc_start(dd);
+               /* Hand LED control to the DC */
+               write_csr(dd, DCC_CFG_LED_CNTRL, 0);
+
+               if (ppd->host_link_state != HLS_DN_OFFLINE) {
+                       u8 tmp = ppd->link_enabled;
+
+                       ret = goto_offline(ppd, ppd->remote_link_down_reason);
+                       if (ret) {
+                               ppd->link_enabled = tmp;
+                               break;
+                       }
+                       ppd->remote_link_down_reason = 0;
+
+                       if (ppd->driver_link_ready)
+                               ppd->link_enabled = 1;
+               }
+
+               set_all_slowpath(ppd->dd);
+               ret = set_local_link_attributes(ppd);
+               if (ret)
+                       break;
+
+               ppd->port_error_action = 0;
+               ppd->host_link_state = HLS_DN_POLL;
+
+               if (quick_linkup) {
+                       /* quick linkup does not go into polling */
+                       ret = do_quick_linkup(dd);
+               } else {
+                       ret1 = set_physical_link_state(dd, PLS_POLLING);
+                       if (ret1 != HCMD_SUCCESS) {
+                               dd_dev_err(dd,
+                                          "Failed to transition to Polling link state, return 0x%x\n",
+                                          ret1);
+                               ret = -EINVAL;
+                       }
+               }
+               ppd->offline_disabled_reason =
+                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE);
+               /*
+                * If an error occurred above, go back to offline.  The
+                * caller may reschedule another attempt.
+                */
+               if (ret)
+                       goto_offline(ppd, 0);
+               break;
+       case HLS_DN_DISABLE:
+               /* link is disabled */
+               ppd->link_enabled = 0;
+
+               /* allow any state to transition to disabled */
+
+               /* must transition to offline first */
+               if (ppd->host_link_state != HLS_DN_OFFLINE) {
+                       ret = goto_offline(ppd, ppd->remote_link_down_reason);
+                       if (ret)
+                               break;
+                       ppd->remote_link_down_reason = 0;
+               }
+
+               ret1 = set_physical_link_state(dd, PLS_DISABLED);
+               if (ret1 != HCMD_SUCCESS) {
+                       dd_dev_err(dd,
+                                  "Failed to transition to Disabled link state, return 0x%x\n",
+                                  ret1);
+                       ret = -EINVAL;
+                       break;
+               }
+               ppd->host_link_state = HLS_DN_DISABLE;
+               dc_shutdown(dd);
+               break;
+       case HLS_DN_OFFLINE:
+               if (ppd->host_link_state == HLS_DN_DISABLE)
+                       dc_start(dd);
+
+               /* allow any state to transition to offline */
+               ret = goto_offline(ppd, ppd->remote_link_down_reason);
+               if (!ret)
+                       ppd->remote_link_down_reason = 0;
+               break;
+       case HLS_VERIFY_CAP:
+               if (ppd->host_link_state != HLS_DN_POLL)
+                       goto unexpected;
+               ppd->host_link_state = HLS_VERIFY_CAP;
+               break;
+       case HLS_GOING_UP:
+               if (ppd->host_link_state != HLS_VERIFY_CAP)
+                       goto unexpected;
+
+               ret1 = set_physical_link_state(dd, PLS_LINKUP);
+               if (ret1 != HCMD_SUCCESS) {
+                       dd_dev_err(dd,
+                                  "Failed to transition to link up state, return 0x%x\n",
+                                  ret1);
+                       ret = -EINVAL;
+                       break;
+               }
+               ppd->host_link_state = HLS_GOING_UP;
+               break;
+
+       case HLS_GOING_OFFLINE:         /* transient within goto_offline() */
+       case HLS_LINK_COOLDOWN:         /* transient within goto_offline() */
+       default:
+               dd_dev_info(dd, "%s: state 0x%x: not supported\n",
+                           __func__, state);
+               ret = -EINVAL;
+               break;
+       }
+
+       goto done;
+
+unexpected:
+       dd_dev_err(dd, "%s: unexpected state transition from %s to %s\n",
+                  __func__, link_state_name(ppd->host_link_state),
+                  link_state_name(state));
+       ret = -EINVAL;
+
+done:
+       mutex_unlock(&ppd->hls_lock);
+
+       if (event.device)
+               ib_dispatch_event(&event);
+
+       return ret;
+}
+
+int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val)
+{
+       u64 reg;
+       int ret = 0;
+
+       switch (which) {
+       case HFI1_IB_CFG_LIDLMC:
+               set_lidlmc(ppd);
+               break;
+       case HFI1_IB_CFG_VL_HIGH_LIMIT:
+               /*
+                * The VL Arbitrator high limit is sent in units of 4k
+                * bytes, while HFI stores it in units of 64 bytes.
+                */
+               val *= 4096 / 64;
+               reg = ((u64)val & SEND_HIGH_PRIORITY_LIMIT_LIMIT_MASK)
+                       << SEND_HIGH_PRIORITY_LIMIT_LIMIT_SHIFT;
+               write_csr(ppd->dd, SEND_HIGH_PRIORITY_LIMIT, reg);
+               break;
+       case HFI1_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */
+               /* HFI only supports POLL as the default link down state */
+               if (val != HLS_DN_POLL)
+                       ret = -EINVAL;
+               break;
+       case HFI1_IB_CFG_OP_VLS:
+               if (ppd->vls_operational != val) {
+                       ppd->vls_operational = val;
+                       if (!ppd->port)
+                               ret = -EINVAL;
+               }
+               break;
+       /*
+        * For link width, link width downgrade, and speed enable, always AND
+        * the setting with what is actually supported.  This has two benefits.
+        * First, enabled can't have unsupported values, no matter what the
+        * SM or FM might want.  Second, the ALL_SUPPORTED wildcards that mean
+        * "fill in with your supported value" have all the bits in the
+        * field set, so simply ANDing with supported has the desired result.
+        */
+       case HFI1_IB_CFG_LWID_ENB: /* set allowed Link-width */
+               ppd->link_width_enabled = val & ppd->link_width_supported;
+               break;
+       case HFI1_IB_CFG_LWID_DG_ENB: /* set allowed link width downgrade */
+               ppd->link_width_downgrade_enabled =
+                               val & ppd->link_width_downgrade_supported;
+               break;
+       case HFI1_IB_CFG_SPD_ENB: /* allowed Link speeds */
+               ppd->link_speed_enabled = val & ppd->link_speed_supported;
+               break;
+       case HFI1_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */
+               /*
+                * HFI does not follow IB specs; save this value
+                * so we can report it if asked.
+                */
+               ppd->overrun_threshold = val;
+               break;
+       case HFI1_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */
+               /*
+                * HFI does not follow IB specs; save this value
+                * so we can report it if asked.
+                */
+               ppd->phy_error_threshold = val;
+               break;
+
+       case HFI1_IB_CFG_MTU:
+               set_send_length(ppd);
+               break;
+
+       case HFI1_IB_CFG_PKEYS:
+               if (HFI1_CAP_IS_KSET(PKEY_CHECK))
+                       set_partition_keys(ppd);
+               break;
+
+       default:
+               if (HFI1_CAP_IS_KSET(PRINT_UNIMPL))
+                       dd_dev_info(ppd->dd,
+                                   "%s: which %s, val 0x%x: not implemented\n",
+                                   __func__, ib_cfg_name(which), val);
+               break;
+       }
+       return ret;
+}
+
+/* begin functions related to vl arbitration table caching */
+static void init_vl_arb_caches(struct hfi1_pportdata *ppd)
+{
+       int i;
+
+       BUILD_BUG_ON(VL_ARB_TABLE_SIZE !=
+                       VL_ARB_LOW_PRIO_TABLE_SIZE);
+       BUILD_BUG_ON(VL_ARB_TABLE_SIZE !=
+                       VL_ARB_HIGH_PRIO_TABLE_SIZE);
+
+       /*
+        * Note that we always return values directly from the
+        * 'vl_arb_cache' (and do no CSR reads) in response to a
+        * 'Get(VLArbTable)'. This is obviously correct after a
+        * 'Set(VLArbTable)', since the cache will then be up to
+        * date. But it's also correct prior to any 'Set(VLArbTable)'
+        * since then both the cache, and the relevant h/w registers
+        * will be zeroed.
+        */
+
+       for (i = 0; i < MAX_PRIO_TABLE; i++)
+               spin_lock_init(&ppd->vl_arb_cache[i].lock);
+}
+
+/*
+ * vl_arb_lock_cache
+ *
+ * All other vl_arb_* functions should be called only after locking
+ * the cache.
+ */
+static inline struct vl_arb_cache *
+vl_arb_lock_cache(struct hfi1_pportdata *ppd, int idx)
+{
+       if (idx != LO_PRIO_TABLE && idx != HI_PRIO_TABLE)
+               return NULL;
+       spin_lock(&ppd->vl_arb_cache[idx].lock);
+       return &ppd->vl_arb_cache[idx];
+}
+
+static inline void vl_arb_unlock_cache(struct hfi1_pportdata *ppd, int idx)
+{
+       spin_unlock(&ppd->vl_arb_cache[idx].lock);
+}
+
+static void vl_arb_get_cache(struct vl_arb_cache *cache,
+                            struct ib_vl_weight_elem *vl)
+{
+       memcpy(vl, cache->table, VL_ARB_TABLE_SIZE * sizeof(*vl));
+}
+
+static void vl_arb_set_cache(struct vl_arb_cache *cache,
+                            struct ib_vl_weight_elem *vl)
+{
+       memcpy(cache->table, vl, VL_ARB_TABLE_SIZE * sizeof(*vl));
+}
+
+static int vl_arb_match_cache(struct vl_arb_cache *cache,
+                             struct ib_vl_weight_elem *vl)
+{
+       return !memcmp(cache->table, vl, VL_ARB_TABLE_SIZE * sizeof(*vl));
+}
+
+/* end functions related to vl arbitration table caching */
+
+static int set_vl_weights(struct hfi1_pportdata *ppd, u32 target,
+                         u32 size, struct ib_vl_weight_elem *vl)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 reg;
+       unsigned int i, is_up = 0;
+       int drain, ret = 0;
+
+       mutex_lock(&ppd->hls_lock);
+
+       if (ppd->host_link_state & HLS_UP)
+               is_up = 1;
+
+       drain = !is_ax(dd) && is_up;
+
+       if (drain)
+               /*
+                * Before adjusting VL arbitration weights, empty per-VL
+                * FIFOs, otherwise a packet whose VL weight is being
+                * set to 0 could get stuck in a FIFO with no chance to
+                * egress.
+                */
+               ret = stop_drain_data_vls(dd);
+
+       if (ret) {
+               dd_dev_err(
+                       dd,
+                       "%s: cannot stop/drain VLs - refusing to change VL arbitration weights\n",
+                       __func__);
+               goto err;
+       }
+
+       for (i = 0; i < size; i++, vl++) {
+               /*
+                * NOTE: The low priority shift and mask are used here, but
+                * they are the same for both the low and high registers.
+                */
+               reg = (((u64)vl->vl & SEND_LOW_PRIORITY_LIST_VL_MASK)
+                               << SEND_LOW_PRIORITY_LIST_VL_SHIFT)
+                     | (((u64)vl->weight
+                               & SEND_LOW_PRIORITY_LIST_WEIGHT_MASK)
+                               << SEND_LOW_PRIORITY_LIST_WEIGHT_SHIFT);
+               write_csr(dd, target + (i * 8), reg);
+       }
+       pio_send_control(dd, PSC_GLOBAL_VLARB_ENABLE);
+
+       if (drain)
+               open_fill_data_vls(dd); /* reopen all VLs */
+
+err:
+       mutex_unlock(&ppd->hls_lock);
+
+       return ret;
+}
+
+/*
+ * Read one credit merge VL register.
+ */
+static void read_one_cm_vl(struct hfi1_devdata *dd, u32 csr,
+                          struct vl_limit *vll)
+{
+       u64 reg = read_csr(dd, csr);
+
+       vll->dedicated = cpu_to_be16(
+               (reg >> SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT)
+               & SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_MASK);
+       vll->shared = cpu_to_be16(
+               (reg >> SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT)
+               & SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_MASK);
+}
+
+/*
+ * Read the current credit merge limits.
+ */
+static int get_buffer_control(struct hfi1_devdata *dd,
+                             struct buffer_control *bc, u16 *overall_limit)
+{
+       u64 reg;
+       int i;
+
+       /* not all entries are filled in */
+       memset(bc, 0, sizeof(*bc));
+
+       /* OPA and HFI have a 1-1 mapping */
+       for (i = 0; i < TXE_NUM_DATA_VL; i++)
+               read_one_cm_vl(dd, SEND_CM_CREDIT_VL + (8 * i), &bc->vl[i]);
+
+       /* NOTE: assumes that VL* and VL15 CSRs are bit-wise identical */
+       read_one_cm_vl(dd, SEND_CM_CREDIT_VL15, &bc->vl[15]);
+
+       reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT);
+       bc->overall_shared_limit = cpu_to_be16(
+               (reg >> SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT)
+               & SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_MASK);
+       if (overall_limit)
+               *overall_limit = (reg
+                       >> SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT)
+                       & SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_MASK;
+       return sizeof(struct buffer_control);
+}
+
+static int get_sc2vlnt(struct hfi1_devdata *dd, struct sc2vlnt *dp)
+{
+       u64 reg;
+       int i;
+
+       /* each register contains 16 SC->VLnt mappings, 4 bits each */
+       reg = read_csr(dd, DCC_CFG_SC_VL_TABLE_15_0);
+       for (i = 0; i < sizeof(u64); i++) {
+               u8 byte = *(((u8 *)&reg) + i);
+
+               dp->vlnt[2 * i] = byte & 0xf;
+               dp->vlnt[(2 * i) + 1] = (byte & 0xf0) >> 4;
+       }
+
+       reg = read_csr(dd, DCC_CFG_SC_VL_TABLE_31_16);
+       for (i = 0; i < sizeof(u64); i++) {
+               u8 byte = *(((u8 *)&reg) + i);
+
+               dp->vlnt[16 + (2 * i)] = byte & 0xf;
+               dp->vlnt[16 + (2 * i) + 1] = (byte & 0xf0) >> 4;
+       }
+       return sizeof(struct sc2vlnt);
+}
+
+static void get_vlarb_preempt(struct hfi1_devdata *dd, u32 nelems,
+                             struct ib_vl_weight_elem *vl)
+{
+       unsigned int i;
+
+       for (i = 0; i < nelems; i++, vl++) {
+               vl->vl = 0xf;
+               vl->weight = 0;
+       }
+}
+
+static void set_sc2vlnt(struct hfi1_devdata *dd, struct sc2vlnt *dp)
+{
+       write_csr(dd, DCC_CFG_SC_VL_TABLE_15_0,
+                 DC_SC_VL_VAL(15_0,
+                              0, dp->vlnt[0] & 0xf,
+                              1, dp->vlnt[1] & 0xf,
+                              2, dp->vlnt[2] & 0xf,
+                              3, dp->vlnt[3] & 0xf,
+                              4, dp->vlnt[4] & 0xf,
+                              5, dp->vlnt[5] & 0xf,
+                              6, dp->vlnt[6] & 0xf,
+                              7, dp->vlnt[7] & 0xf,
+                              8, dp->vlnt[8] & 0xf,
+                              9, dp->vlnt[9] & 0xf,
+                              10, dp->vlnt[10] & 0xf,
+                              11, dp->vlnt[11] & 0xf,
+                              12, dp->vlnt[12] & 0xf,
+                              13, dp->vlnt[13] & 0xf,
+                              14, dp->vlnt[14] & 0xf,
+                              15, dp->vlnt[15] & 0xf));
+       write_csr(dd, DCC_CFG_SC_VL_TABLE_31_16,
+                 DC_SC_VL_VAL(31_16,
+                              16, dp->vlnt[16] & 0xf,
+                              17, dp->vlnt[17] & 0xf,
+                              18, dp->vlnt[18] & 0xf,
+                              19, dp->vlnt[19] & 0xf,
+                              20, dp->vlnt[20] & 0xf,
+                              21, dp->vlnt[21] & 0xf,
+                              22, dp->vlnt[22] & 0xf,
+                              23, dp->vlnt[23] & 0xf,
+                              24, dp->vlnt[24] & 0xf,
+                              25, dp->vlnt[25] & 0xf,
+                              26, dp->vlnt[26] & 0xf,
+                              27, dp->vlnt[27] & 0xf,
+                              28, dp->vlnt[28] & 0xf,
+                              29, dp->vlnt[29] & 0xf,
+                              30, dp->vlnt[30] & 0xf,
+                              31, dp->vlnt[31] & 0xf));
+}
+
+static void nonzero_msg(struct hfi1_devdata *dd, int idx, const char *what,
+                       u16 limit)
+{
+       if (limit != 0)
+               dd_dev_info(dd, "Invalid %s limit %d on VL %d, ignoring\n",
+                           what, (int)limit, idx);
+}
+
+/* change only the shared limit portion of SendCmGlobalCredit */
+static void set_global_shared(struct hfi1_devdata *dd, u16 limit)
+{
+       u64 reg;
+
+       reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT);
+       reg &= ~SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SMASK;
+       reg |= (u64)limit << SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT;
+       write_csr(dd, SEND_CM_GLOBAL_CREDIT, reg);
+}
+
+/* change only the total credit limit portion of SendCmGlobalCredit */
+static void set_global_limit(struct hfi1_devdata *dd, u16 limit)
+{
+       u64 reg;
+
+       reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT);
+       reg &= ~SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SMASK;
+       reg |= (u64)limit << SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT;
+       write_csr(dd, SEND_CM_GLOBAL_CREDIT, reg);
+}
+
+/* set the given per-VL shared limit */
+static void set_vl_shared(struct hfi1_devdata *dd, int vl, u16 limit)
+{
+       u64 reg;
+       u32 addr;
+
+       if (vl < TXE_NUM_DATA_VL)
+               addr = SEND_CM_CREDIT_VL + (8 * vl);
+       else
+               addr = SEND_CM_CREDIT_VL15;
+
+       reg = read_csr(dd, addr);
+       reg &= ~SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SMASK;
+       reg |= (u64)limit << SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT;
+       write_csr(dd, addr, reg);
+}
+
+/* set the given per-VL dedicated limit */
+static void set_vl_dedicated(struct hfi1_devdata *dd, int vl, u16 limit)
+{
+       u64 reg;
+       u32 addr;
+
+       if (vl < TXE_NUM_DATA_VL)
+               addr = SEND_CM_CREDIT_VL + (8 * vl);
+       else
+               addr = SEND_CM_CREDIT_VL15;
+
+       reg = read_csr(dd, addr);
+       reg &= ~SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SMASK;
+       reg |= (u64)limit << SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT;
+       write_csr(dd, addr, reg);
+}
+
+/* spin until the given per-VL status mask bits clear */
+static void wait_for_vl_status_clear(struct hfi1_devdata *dd, u64 mask,
+                                    const char *which)
+{
+       unsigned long timeout;
+       u64 reg;
+
+       timeout = jiffies + msecs_to_jiffies(VL_STATUS_CLEAR_TIMEOUT);
+       while (1) {
+               reg = read_csr(dd, SEND_CM_CREDIT_USED_STATUS) & mask;
+
+               if (reg == 0)
+                       return; /* success */
+               if (time_after(jiffies, timeout))
+                       break;          /* timed out */
+               udelay(1);
+       }
+
+       dd_dev_err(dd,
+                  "%s credit change status not clearing after %dms, mask 0x%llx, not clear 0x%llx\n",
+                  which, VL_STATUS_CLEAR_TIMEOUT, mask, reg);
+       /*
+        * If this occurs, it is likely there was a credit loss on the link.
+        * The only recovery from that is a link bounce.
+        */
+       dd_dev_err(dd,
+                  "Continuing anyway.  A credit loss may occur.  Suggest a link bounce\n");
+}
+
+/*
+ * The number of credits on the VLs may be changed while everything
+ * is "live", but the following algorithm must be followed due to
+ * how the hardware is actually implemented.  In particular,
+ * Return_Credit_Status[] is the only correct status check.
+ *
+ * if (reducing Global_Shared_Credit_Limit or any shared limit changing)
+ *     set Global_Shared_Credit_Limit = 0
+ *     use_all_vl = 1
+ * mask0 = all VLs that are changing either dedicated or shared limits
+ * set Shared_Limit[mask0] = 0
+ * spin until Return_Credit_Status[use_all_vl ? all VL : mask0] == 0
+ * if (changing any dedicated limit)
+ *     mask1 = all VLs that are lowering dedicated limits
+ *     lower Dedicated_Limit[mask1]
+ *     spin until Return_Credit_Status[mask1] == 0
+ *     raise Dedicated_Limits
+ * raise Shared_Limits
+ * raise Global_Shared_Credit_Limit
+ *
+ * lower = if the new limit is lower, set the limit to the new value
+ * raise = if the new limit is higher than the current value (may be changed
+ *     earlier in the algorithm), set the new limit to the new value
+ */
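+/*
+ * Illustrative walk-through (not from the original source): suppose VL0's
+ * dedicated limit is being raised and VL1's is being lowered, with no
+ * shared limits changing.  mask0 = {VL0, VL1}; both shared limits are
+ * zeroed and we spin on Return_Credit_Status for those VLs.  mask1 = {VL1};
+ * VL1's dedicated limit is lowered and we spin on its status bit.  Finally
+ * VL0's dedicated limit is raised, both shared limits are restored, and
+ * the global shared limit is raised if needed.
+ */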
+int set_buffer_control(struct hfi1_pportdata *ppd,
+                      struct buffer_control *new_bc)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 changing_mask, ld_mask, stat_mask;
+       int change_count;
+       int i, use_all_mask;
+       int this_shared_changing;
+       int vl_count = 0, ret;
+       /*
+        * A0: add the variable any_shared_limit_changing below and in the
+        * algorithm above.  If removing A0 support, it can be removed.
+        */
+       int any_shared_limit_changing;
+       struct buffer_control cur_bc;
+       u8 changing[OPA_MAX_VLS];
+       u8 lowering_dedicated[OPA_MAX_VLS];
+       u16 cur_total;
+       u32 new_total = 0;
+       const u64 all_mask =
+       SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK
+        | SEND_CM_CREDIT_USED_STATUS_VL1_RETURN_CREDIT_STATUS_SMASK
+        | SEND_CM_CREDIT_USED_STATUS_VL2_RETURN_CREDIT_STATUS_SMASK
+        | SEND_CM_CREDIT_USED_STATUS_VL3_RETURN_CREDIT_STATUS_SMASK
+        | SEND_CM_CREDIT_USED_STATUS_VL4_RETURN_CREDIT_STATUS_SMASK
+        | SEND_CM_CREDIT_USED_STATUS_VL5_RETURN_CREDIT_STATUS_SMASK
+        | SEND_CM_CREDIT_USED_STATUS_VL6_RETURN_CREDIT_STATUS_SMASK
+        | SEND_CM_CREDIT_USED_STATUS_VL7_RETURN_CREDIT_STATUS_SMASK
+        | SEND_CM_CREDIT_USED_STATUS_VL15_RETURN_CREDIT_STATUS_SMASK;
+
+#define valid_vl(idx) ((idx) < TXE_NUM_DATA_VL || (idx) == 15)
+#define NUM_USABLE_VLS 16      /* look at VL15 and less */
+
+       /* find the new total credits, do sanity check on unused VLs */
+       for (i = 0; i < OPA_MAX_VLS; i++) {
+               if (valid_vl(i)) {
+                       new_total += be16_to_cpu(new_bc->vl[i].dedicated);
+                       continue;
+               }
+               nonzero_msg(dd, i, "dedicated",
+                           be16_to_cpu(new_bc->vl[i].dedicated));
+               nonzero_msg(dd, i, "shared",
+                           be16_to_cpu(new_bc->vl[i].shared));
+               new_bc->vl[i].dedicated = 0;
+               new_bc->vl[i].shared = 0;
+       }
+       new_total += be16_to_cpu(new_bc->overall_shared_limit);
+
+       /* fetch the current values */
+       get_buffer_control(dd, &cur_bc, &cur_total);
+
+       /*
+        * Create the masks we will use.
+        */
+       memset(changing, 0, sizeof(changing));
+       memset(lowering_dedicated, 0, sizeof(lowering_dedicated));
+       /*
+        * NOTE: Assumes that the individual VL bits are adjacent and in
+        * increasing order
+        */
+       stat_mask =
+               SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK;
+       changing_mask = 0;
+       ld_mask = 0;
+       change_count = 0;
+       any_shared_limit_changing = 0;
+       for (i = 0; i < NUM_USABLE_VLS; i++, stat_mask <<= 1) {
+               if (!valid_vl(i))
+                       continue;
+               this_shared_changing = new_bc->vl[i].shared
+                                               != cur_bc.vl[i].shared;
+               if (this_shared_changing)
+                       any_shared_limit_changing = 1;
+               if (new_bc->vl[i].dedicated != cur_bc.vl[i].dedicated ||
+                   this_shared_changing) {
+                       changing[i] = 1;
+                       changing_mask |= stat_mask;
+                       change_count++;
+               }
+               if (be16_to_cpu(new_bc->vl[i].dedicated) <
+                                       be16_to_cpu(cur_bc.vl[i].dedicated)) {
+                       lowering_dedicated[i] = 1;
+                       ld_mask |= stat_mask;
+               }
+       }
+
+       /* bracket the credit change with a total adjustment */
+       if (new_total > cur_total)
+               set_global_limit(dd, new_total);
+
+       /*
+        * Start the credit change algorithm.
+        */
+       use_all_mask = 0;
+       if ((be16_to_cpu(new_bc->overall_shared_limit) <
+            be16_to_cpu(cur_bc.overall_shared_limit)) ||
+           (is_ax(dd) && any_shared_limit_changing)) {
+               set_global_shared(dd, 0);
+               cur_bc.overall_shared_limit = 0;
+               use_all_mask = 1;
+       }
+
+       for (i = 0; i < NUM_USABLE_VLS; i++) {
+               if (!valid_vl(i))
+                       continue;
+
+               if (changing[i]) {
+                       set_vl_shared(dd, i, 0);
+                       cur_bc.vl[i].shared = 0;
+               }
+       }
+
+       wait_for_vl_status_clear(dd, use_all_mask ? all_mask : changing_mask,
+                                "shared");
+
+       if (change_count > 0) {
+               for (i = 0; i < NUM_USABLE_VLS; i++) {
+                       if (!valid_vl(i))
+                               continue;
+
+                       if (lowering_dedicated[i]) {
+                               set_vl_dedicated(dd, i,
+                                                be16_to_cpu(new_bc->
+                                                            vl[i].dedicated));
+                               cur_bc.vl[i].dedicated =
+                                               new_bc->vl[i].dedicated;
+                       }
+               }
+
+               wait_for_vl_status_clear(dd, ld_mask, "dedicated");
+
+               /* now raise all dedicated that are going up */
+               for (i = 0; i < NUM_USABLE_VLS; i++) {
+                       if (!valid_vl(i))
+                               continue;
+
+                       if (be16_to_cpu(new_bc->vl[i].dedicated) >
+                                       be16_to_cpu(cur_bc.vl[i].dedicated))
+                               set_vl_dedicated(dd, i,
+                                                be16_to_cpu(new_bc->
+                                                            vl[i].dedicated));
+               }
+       }
+
+       /* next raise all shared that are going up */
+       for (i = 0; i < NUM_USABLE_VLS; i++) {
+               if (!valid_vl(i))
+                       continue;
+
+               if (be16_to_cpu(new_bc->vl[i].shared) >
+                               be16_to_cpu(cur_bc.vl[i].shared))
+                       set_vl_shared(dd, i, be16_to_cpu(new_bc->vl[i].shared));
+       }
+
+       /* finally raise the global shared */
+       if (be16_to_cpu(new_bc->overall_shared_limit) >
+           be16_to_cpu(cur_bc.overall_shared_limit))
+               set_global_shared(dd,
+                                 be16_to_cpu(new_bc->overall_shared_limit));
+
+       /* bracket the credit change with a total adjustment */
+       if (new_total < cur_total)
+               set_global_limit(dd, new_total);
+
+       /*
+        * Determine the actual number of operational VLS using the number of
+        * dedicated and shared credits for each VL.
+        */
+       if (change_count > 0) {
+               for (i = 0; i < TXE_NUM_DATA_VL; i++)
+                       if (be16_to_cpu(new_bc->vl[i].dedicated) > 0 ||
+                           be16_to_cpu(new_bc->vl[i].shared) > 0)
+                               vl_count++;
+               ppd->actual_vls_operational = vl_count;
+               ret = sdma_map_init(dd, ppd->port - 1, vl_count ?
+                                   ppd->actual_vls_operational :
+                                   ppd->vls_operational,
+                                   NULL);
+               if (ret == 0)
+                       ret = pio_map_init(dd, ppd->port - 1, vl_count ?
+                                          ppd->actual_vls_operational :
+                                          ppd->vls_operational, NULL);
+               if (ret)
+                       return ret;
+       }
+       return 0;
+}
+
+/*
+ * Read the given fabric manager table. Return the size of the
+ * table (in bytes) on success, and a negative error code on
+ * failure.
+ */
+int fm_get_table(struct hfi1_pportdata *ppd, int which, void *t)
+
+{
+       int size;
+       struct vl_arb_cache *vlc;
+
+       switch (which) {
+       case FM_TBL_VL_HIGH_ARB:
+               size = 256;
+               /*
+                * OPA specifies 128 elements (of 2 bytes each), though
+                * HFI supports only 16 elements in h/w.
+                */
+               vlc = vl_arb_lock_cache(ppd, HI_PRIO_TABLE);
+               vl_arb_get_cache(vlc, t);
+               vl_arb_unlock_cache(ppd, HI_PRIO_TABLE);
+               break;
+       case FM_TBL_VL_LOW_ARB:
+               size = 256;
+               /*
+                * OPA specifies 128 elements (of 2 bytes each), though
+                * HFI supports only 16 elements in h/w.
+                */
+               vlc = vl_arb_lock_cache(ppd, LO_PRIO_TABLE);
+               vl_arb_get_cache(vlc, t);
+               vl_arb_unlock_cache(ppd, LO_PRIO_TABLE);
+               break;
+       case FM_TBL_BUFFER_CONTROL:
+               size = get_buffer_control(ppd->dd, t, NULL);
+               break;
+       case FM_TBL_SC2VLNT:
+               size = get_sc2vlnt(ppd->dd, t);
+               break;
+       case FM_TBL_VL_PREEMPT_ELEMS:
+               size = 256;
+               /* OPA specifies 128 elements, of 2 bytes each */
+               get_vlarb_preempt(ppd->dd, OPA_MAX_VLS, t);
+               break;
+       case FM_TBL_VL_PREEMPT_MATRIX:
+               size = 256;
+               /*
+                * OPA specifies that this is the same size as the VL
+                * arbitration tables (i.e., 256 bytes).
+                */
+               break;
+       default:
+               return -EINVAL;
+       }
+       return size;
+}
+
+/*
+ * Write the given fabric manager table.
+ */
+int fm_set_table(struct hfi1_pportdata *ppd, int which, void *t)
+{
+       int ret = 0;
+       struct vl_arb_cache *vlc;
+
+       switch (which) {
+       case FM_TBL_VL_HIGH_ARB:
+               vlc = vl_arb_lock_cache(ppd, HI_PRIO_TABLE);
+               if (vl_arb_match_cache(vlc, t)) {
+                       vl_arb_unlock_cache(ppd, HI_PRIO_TABLE);
+                       break;
+               }
+               vl_arb_set_cache(vlc, t);
+               vl_arb_unlock_cache(ppd, HI_PRIO_TABLE);
+               ret = set_vl_weights(ppd, SEND_HIGH_PRIORITY_LIST,
+                                    VL_ARB_HIGH_PRIO_TABLE_SIZE, t);
+               break;
+       case FM_TBL_VL_LOW_ARB:
+               vlc = vl_arb_lock_cache(ppd, LO_PRIO_TABLE);
+               if (vl_arb_match_cache(vlc, t)) {
+                       vl_arb_unlock_cache(ppd, LO_PRIO_TABLE);
+                       break;
+               }
+               vl_arb_set_cache(vlc, t);
+               vl_arb_unlock_cache(ppd, LO_PRIO_TABLE);
+               ret = set_vl_weights(ppd, SEND_LOW_PRIORITY_LIST,
+                                    VL_ARB_LOW_PRIO_TABLE_SIZE, t);
+               break;
+       case FM_TBL_BUFFER_CONTROL:
+               ret = set_buffer_control(ppd, t);
+               break;
+       case FM_TBL_SC2VLNT:
+               set_sc2vlnt(ppd->dd, t);
+               break;
+       default:
+               ret = -EINVAL;
+       }
+       return ret;
+}
+
+/*
+ * Disable all data VLs.
+ *
+ * Return 0 if disabled, non-zero if the VLs cannot be disabled.
+ */
+static int disable_data_vls(struct hfi1_devdata *dd)
+{
+       if (is_ax(dd))
+               return 1;
+
+       pio_send_control(dd, PSC_DATA_VL_DISABLE);
+
+       return 0;
+}
+
+/*
+ * open_fill_data_vls() - the counterpart to stop_drain_data_vls().
+ * Just re-enables all data VLs (the "fill" part happens
+ * automatically - the name was chosen for symmetry with
+ * stop_drain_data_vls()).
+ *
+ * Return 0 if successful, non-zero if the VLs cannot be enabled.
+ */
+int open_fill_data_vls(struct hfi1_devdata *dd)
+{
+       if (is_ax(dd))
+               return 1;
+
+       pio_send_control(dd, PSC_DATA_VL_ENABLE);
+
+       return 0;
+}
+
+/*
+ * drain_data_vls() - assumes that disable_data_vls() has been called;
+ * waits for the occupancy of the per-VL FIFOs, for all contexts and SDMA
+ * engines, to drop to 0.
+ */
+static void drain_data_vls(struct hfi1_devdata *dd)
+{
+       sc_wait(dd);
+       sdma_wait(dd);
+       pause_for_credit_return(dd);
+}
+
+/*
+ * stop_drain_data_vls() - disable, then drain all per-VL fifos.
+ *
+ * Use open_fill_data_vls() to resume using data VLs.  This pair is
+ * meant to be used like this:
+ *
+ * stop_drain_data_vls(dd);
+ * // do things with per-VL resources
+ * open_fill_data_vls(dd);
+ */
+int stop_drain_data_vls(struct hfi1_devdata *dd)
+{
+       int ret;
+
+       ret = disable_data_vls(dd);
+       if (ret == 0)
+               drain_data_vls(dd);
+
+       return ret;
+}
+
+/*
+ * Convert a nanosecond time to a cclock count.  No matter how slow
+ * the cclock, a non-zero ns will always have a non-zero result.
+ */
+u32 ns_to_cclock(struct hfi1_devdata *dd, u32 ns)
+{
+       u32 cclocks;
+
+       if (dd->icode == ICODE_FPGA_EMULATION)
+               cclocks = (ns * 1000) / FPGA_CCLOCK_PS;
+       else  /* simulation pretends to be ASIC */
+               cclocks = (ns * 1000) / ASIC_CCLOCK_PS;
+       if (ns && !cclocks)     /* if ns nonzero, must be at least 1 */
+               cclocks = 1;
+       return cclocks;
+}
+
+/*
+ * Convert a cclock count to nanoseconds.  No matter how slow
+ * the cclock, a non-zero cclocks will always have a non-zero result.
+ */
+u32 cclock_to_ns(struct hfi1_devdata *dd, u32 cclocks)
+{
+       u32 ns;
+
+       if (dd->icode == ICODE_FPGA_EMULATION)
+               ns = (cclocks * FPGA_CCLOCK_PS) / 1000;
+       else  /* simulation pretends to be ASIC */
+               ns = (cclocks * ASIC_CCLOCK_PS) / 1000;
+       if (cclocks && !ns)
+               ns = 1;
+       return ns;
+}
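+
+/*
+ * Illustrative only: if the cclock period were 2000 ps (a made-up value,
+ * not the real FPGA_CCLOCK_PS or ASIC_CCLOCK_PS), then
+ * ns_to_cclock(dd, 5) = (5 * 1000) / 2000 = 2 and
+ * cclock_to_ns(dd, 2) = (2 * 2000) / 1000 = 4; both round down and are
+ * clamped to at least 1 for a non-zero input.
+ */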
+
+/*
+ * Dynamically adjust the receive interrupt timeout for a context based on
+ * incoming packet rate.
+ *
+ * NOTE: Dynamic adjustment does not allow rcv_intr_count to be zero.
+ */
+static void adjust_rcv_timeout(struct hfi1_ctxtdata *rcd, u32 npkts)
+{
+       struct hfi1_devdata *dd = rcd->dd;
+       u32 timeout = rcd->rcvavail_timeout;
+
+       /*
+        * This algorithm doubles or halves the timeout depending on whether
+        * the number of packets received in this interrupt was less than or
+        * greater than or equal to the interrupt count.
+        *
+        * The calculations below do not allow a steady state to be achieved.
+        * Only at the endpoints is it possible to have an unchanging
+        * timeout.
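+        *
+        * For example, a timeout of 8 halves to 4 when fewer than
+        * rcv_intr_count packets arrived, and doubles to 16 (capped at
+        * rcv_intr_timeout_csr) when at least that many did.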
+        */
+       if (npkts < rcv_intr_count) {
+               /*
+                * Not enough packets arrived before the timeout, adjust
+                * timeout downward.
+                */
+               if (timeout < 2) /* already at minimum? */
+                       return;
+               timeout >>= 1;
+       } else {
+               /*
+                * More than enough packets arrived before the timeout, adjust
+                * timeout upward.
+                */
+               if (timeout >= dd->rcv_intr_timeout_csr) /* already at max? */
+                       return;
+               timeout = min(timeout << 1, dd->rcv_intr_timeout_csr);
+       }
+
+       rcd->rcvavail_timeout = timeout;
+       /*
+        * timeout cannot be larger than rcv_intr_timeout_csr which has already
+        * been verified to be in range
+        */
+       write_kctxt_csr(dd, rcd->ctxt, RCV_AVAIL_TIME_OUT,
+                       (u64)timeout <<
+                       RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT);
+}
+
+void update_usrhead(struct hfi1_ctxtdata *rcd, u32 hd, u32 updegr, u32 egrhd,
+                   u32 intr_adjust, u32 npkts)
+{
+       struct hfi1_devdata *dd = rcd->dd;
+       u64 reg;
+       u32 ctxt = rcd->ctxt;
+
+       /*
+        * Need to write timeout register before updating RcvHdrHead to ensure
+        * that a new value is used when the HW decides to restart counting.
+        */
+       if (intr_adjust)
+               adjust_rcv_timeout(rcd, npkts);
+       if (updegr) {
+               reg = (egrhd & RCV_EGR_INDEX_HEAD_HEAD_MASK)
+                       << RCV_EGR_INDEX_HEAD_HEAD_SHIFT;
+               write_uctxt_csr(dd, ctxt, RCV_EGR_INDEX_HEAD, reg);
+       }
+       mmiowb();
+       reg = ((u64)rcv_intr_count << RCV_HDR_HEAD_COUNTER_SHIFT) |
+               (((u64)hd & RCV_HDR_HEAD_HEAD_MASK)
+                       << RCV_HDR_HEAD_HEAD_SHIFT);
+       write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, reg);
+       mmiowb();
+}
+
+u32 hdrqempty(struct hfi1_ctxtdata *rcd)
+{
+       u32 head, tail;
+
+       head = (read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_HEAD)
+               & RCV_HDR_HEAD_HEAD_SMASK) >> RCV_HDR_HEAD_HEAD_SHIFT;
+
+       if (rcd->rcvhdrtail_kvaddr)
+               tail = get_rcvhdrtail(rcd);
+       else
+               tail = read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_TAIL);
+
+       return head == tail;
+}
+
+/*
+ * Context Control and Receive Array encoding for buffer size:
+ *     0x0 invalid
+ *     0x1   4 KB
+ *     0x2   8 KB
+ *     0x3  16 KB
+ *     0x4  32 KB
+ *     0x5  64 KB
+ *     0x6 128 KB
+ *     0x7 256 KB
+ *     0x8 512 KB (Receive Array only)
+ *     0x9   1 MB (Receive Array only)
+ *     0xa   2 MB (Receive Array only)
+ *
+ *     0xB-0xF - reserved (Receive Array only)
+ *
+ *
+ * This routine assumes that the value has already been sanity checked.
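+ *
+ * For example, a 64 KB eager buffer encodes as 0x5; a size not in the
+ * table falls back to the minimum encoding, 0x1.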
+ */
+static u32 encoded_size(u32 size)
+{
+       switch (size) {
+       case   4 * 1024: return 0x1;
+       case   8 * 1024: return 0x2;
+       case  16 * 1024: return 0x3;
+       case  32 * 1024: return 0x4;
+       case  64 * 1024: return 0x5;
+       case 128 * 1024: return 0x6;
+       case 256 * 1024: return 0x7;
+       case 512 * 1024: return 0x8;
+       case   1 * 1024 * 1024: return 0x9;
+       case   2 * 1024 * 1024: return 0xa;
+       }
+       return 0x1;     /* if invalid, go with the minimum size */
+}
+
+void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt)
+{
+       struct hfi1_ctxtdata *rcd;
+       u64 rcvctrl, reg;
+       int did_enable = 0;
+
+       rcd = dd->rcd[ctxt];
+       if (!rcd)
+               return;
+
+       hfi1_cdbg(RCVCTRL, "ctxt %d op 0x%x", ctxt, op);
+
+       rcvctrl = read_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL);
+       /* if the context is already enabled, don't do the extra steps */
+       if ((op & HFI1_RCVCTRL_CTXT_ENB) &&
+           !(rcvctrl & RCV_CTXT_CTRL_ENABLE_SMASK)) {
+               /* reset the tail and hdr addresses, and sequence count */
+               write_kctxt_csr(dd, ctxt, RCV_HDR_ADDR,
+                               rcd->rcvhdrq_phys);
+               if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL))
+                       write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR,
+                                       rcd->rcvhdrqtailaddr_phys);
+               rcd->seq_cnt = 1;
+
+               /* reset the cached receive header queue head value */
+               rcd->head = 0;
+
+               /*
+                * Zero the receive header queue so we don't get false
+                * positives when checking the sequence number.  The
+                * sequence numbers could land exactly on the same spot,
+                * e.g. an rcd restart before the receive header queue wrapped.
+                */
+               memset(rcd->rcvhdrq, 0, rcd->rcvhdrq_size);
+
+               /* starting timeout */
+               rcd->rcvavail_timeout = dd->rcv_intr_timeout_csr;
+
+               /* enable the context */
+               rcvctrl |= RCV_CTXT_CTRL_ENABLE_SMASK;
+
+               /* clean the egr buffer size first */
+               rcvctrl &= ~RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK;
+               rcvctrl |= ((u64)encoded_size(rcd->egrbufs.rcvtid_size)
+                               & RCV_CTXT_CTRL_EGR_BUF_SIZE_MASK)
+                                       << RCV_CTXT_CTRL_EGR_BUF_SIZE_SHIFT;
+
+               /* zero RcvHdrHead - set RcvHdrHead.Counter after enable */
+               write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0);
+               did_enable = 1;
+
+               /* zero RcvEgrIndexHead */
+               write_uctxt_csr(dd, ctxt, RCV_EGR_INDEX_HEAD, 0);
+
+               /* set eager count and base index */
+               reg = (((u64)(rcd->egrbufs.alloced >> RCV_SHIFT)
+                       & RCV_EGR_CTRL_EGR_CNT_MASK)
+                      << RCV_EGR_CTRL_EGR_CNT_SHIFT) |
+                       (((rcd->eager_base >> RCV_SHIFT)
+                         & RCV_EGR_CTRL_EGR_BASE_INDEX_MASK)
+                        << RCV_EGR_CTRL_EGR_BASE_INDEX_SHIFT);
+               write_kctxt_csr(dd, ctxt, RCV_EGR_CTRL, reg);
+
+               /*
+                * Set TID (expected) count and base index.
+                * rcd->expected_count is set to individual RcvArray entries,
+                * not pairs, and the CSR takes a pair-count in groups of
+                * four, so divide by 8.
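+                * For example, 2048 individual entries = 1024 pairs =
+                * 256 groups of four pairs, i.e. 2048 / 8.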
+                */
+               reg = (((rcd->expected_count >> RCV_SHIFT)
+                                       & RCV_TID_CTRL_TID_PAIR_CNT_MASK)
+                               << RCV_TID_CTRL_TID_PAIR_CNT_SHIFT) |
+                     (((rcd->expected_base >> RCV_SHIFT)
+                                       & RCV_TID_CTRL_TID_BASE_INDEX_MASK)
+                               << RCV_TID_CTRL_TID_BASE_INDEX_SHIFT);
+               write_kctxt_csr(dd, ctxt, RCV_TID_CTRL, reg);
+               if (ctxt == HFI1_CTRL_CTXT)
+                       write_csr(dd, RCV_VL15, HFI1_CTRL_CTXT);
+       }
+       if (op & HFI1_RCVCTRL_CTXT_DIS) {
+               write_csr(dd, RCV_VL15, 0);
+               /*
+                * When a receive context is being disabled, turn on tail
+                * update with a dummy tail address and then disable the
+                * receive context.
+                */
+               if (dd->rcvhdrtail_dummy_physaddr) {
+                       write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR,
+                                       dd->rcvhdrtail_dummy_physaddr);
+                       /* Enabling RcvCtxtCtrl.TailUpd is intentional. */
+                       rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK;
+               }
+
+               rcvctrl &= ~RCV_CTXT_CTRL_ENABLE_SMASK;
+       }
+       if (op & HFI1_RCVCTRL_INTRAVAIL_ENB)
+               rcvctrl |= RCV_CTXT_CTRL_INTR_AVAIL_SMASK;
+       if (op & HFI1_RCVCTRL_INTRAVAIL_DIS)
+               rcvctrl &= ~RCV_CTXT_CTRL_INTR_AVAIL_SMASK;
+       if (op & HFI1_RCVCTRL_TAILUPD_ENB && rcd->rcvhdrqtailaddr_phys)
+               rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK;
+       if (op & HFI1_RCVCTRL_TAILUPD_DIS) {
+               /* See comment on RcvCtxtCtrl.TailUpd above */
+               if (!(op & HFI1_RCVCTRL_CTXT_DIS))
+                       rcvctrl &= ~RCV_CTXT_CTRL_TAIL_UPD_SMASK;
+       }
+       if (op & HFI1_RCVCTRL_TIDFLOW_ENB)
+               rcvctrl |= RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK;
+       if (op & HFI1_RCVCTRL_TIDFLOW_DIS)
+               rcvctrl &= ~RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK;
+       if (op & HFI1_RCVCTRL_ONE_PKT_EGR_ENB) {
+               /*
+                * In one-packet-per-eager mode, the size comes from
+                * the RcvArray entry.
+                */
+               rcvctrl &= ~RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK;
+               rcvctrl |= RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK;
+       }
+       if (op & HFI1_RCVCTRL_ONE_PKT_EGR_DIS)
+               rcvctrl &= ~RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK;
+       if (op & HFI1_RCVCTRL_NO_RHQ_DROP_ENB)
+               rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK;
+       if (op & HFI1_RCVCTRL_NO_RHQ_DROP_DIS)
+               rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK;
+       if (op & HFI1_RCVCTRL_NO_EGR_DROP_ENB)
+               rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK;
+       if (op & HFI1_RCVCTRL_NO_EGR_DROP_DIS)
+               rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK;
+       rcd->rcvctrl = rcvctrl;
+       hfi1_cdbg(RCVCTRL, "ctxt %d rcvctrl 0x%llx\n", ctxt, rcvctrl);
+       write_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL, rcd->rcvctrl);
+
+       /* work around sticky RcvCtxtStatus.BlockedRHQFull */
+       if (did_enable &&
+           (rcvctrl & RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK)) {
+               reg = read_kctxt_csr(dd, ctxt, RCV_CTXT_STATUS);
+               if (reg != 0) {
+                       dd_dev_info(dd, "ctxt %d status %lld (blocked)\n",
+                                   ctxt, reg);
+                       read_uctxt_csr(dd, ctxt, RCV_HDR_HEAD);
+                       write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0x10);
+                       write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0x00);
+                       read_uctxt_csr(dd, ctxt, RCV_HDR_HEAD);
+                       reg = read_kctxt_csr(dd, ctxt, RCV_CTXT_STATUS);
+                       dd_dev_info(dd, "ctxt %d status %lld (%s blocked)\n",
+                                   ctxt, reg, reg == 0 ? "not" : "still");
+               }
+       }
+
+       if (did_enable) {
+               /*
+                * The interrupt timeout and count must be set after
+                * the context is enabled to take effect.
+                */
+               /* set interrupt timeout */
+               write_kctxt_csr(dd, ctxt, RCV_AVAIL_TIME_OUT,
+                               (u64)rcd->rcvavail_timeout <<
+                               RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT);
+
+               /* set RcvHdrHead.Counter, zero RcvHdrHead.Head (again) */
+               reg = (u64)rcv_intr_count << RCV_HDR_HEAD_COUNTER_SHIFT;
+               write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, reg);
+       }
+
+       if (op & (HFI1_RCVCTRL_TAILUPD_DIS | HFI1_RCVCTRL_CTXT_DIS))
+               /*
+                * If the context has been disabled and the Tail Update has
+                * been cleared, set the RCV_HDR_TAIL_ADDR CSR to the dummy
+                * address so it doesn't contain an invalid address.
+                */
+               write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR,
+                               dd->rcvhdrtail_dummy_physaddr);
+}
+
+u32 hfi1_read_cntrs(struct hfi1_devdata *dd, char **namep, u64 **cntrp)
+{
+       int ret;
+       u64 val = 0;
+
+       if (namep) {
+               ret = dd->cntrnameslen;
+               *namep = dd->cntrnames;
+       } else {
+               const struct cntr_entry *entry;
+               int i, j;
+
+               ret = (dd->ndevcntrs) * sizeof(u64);
+
+               /* Get the start of the block of counters */
+               *cntrp = dd->cntrs;
+
+               /*
+                * Now go and fill in each counter in the block.
+                */
+               for (i = 0; i < DEV_CNTR_LAST; i++) {
+                       entry = &dev_cntrs[i];
+                       hfi1_cdbg(CNTR, "reading %s", entry->name);
+                       if (entry->flags & CNTR_DISABLED) {
+                               /* Nothing */
+                               hfi1_cdbg(CNTR, "\tDisabled\n");
+                       } else {
+                               if (entry->flags & CNTR_VL) {
+                                       hfi1_cdbg(CNTR, "\tPer VL\n");
+                                       for (j = 0; j < C_VL_COUNT; j++) {
+                                               val = entry->rw_cntr(entry,
+                                                                 dd, j,
+                                                                 CNTR_MODE_R,
+                                                                 0);
+                                               hfi1_cdbg(
+                                                  CNTR,
+                                                  "\t\tRead 0x%llx for %d\n",
+                                                  val, j);
+                                               dd->cntrs[entry->offset + j] =
+                                                                           val;
+                                       }
+                               } else if (entry->flags & CNTR_SDMA) {
+                                       hfi1_cdbg(CNTR,
+                                                 "\t Per SDMA Engine\n");
+                                       for (j = 0; j < dd->chip_sdma_engines;
+                                            j++) {
+                                               val =
+                                               entry->rw_cntr(entry, dd, j,
+                                                              CNTR_MODE_R, 0);
+                                               hfi1_cdbg(CNTR,
+                                                         "\t\tRead 0x%llx for %d\n",
+                                                         val, j);
+                                               dd->cntrs[entry->offset + j] =
+                                                                       val;
+                                       }
+                               } else {
+                                       val = entry->rw_cntr(entry, dd,
+                                                       CNTR_INVALID_VL,
+                                                       CNTR_MODE_R, 0);
+                                       dd->cntrs[entry->offset] = val;
+                                       hfi1_cdbg(CNTR, "\tRead 0x%llx", val);
+                               }
+                       }
+               }
+       }
+       return ret;
+}
+
+/*
+ * Used by sysfs to create files for hfi stats to read
+ */
+u32 hfi1_read_portcntrs(struct hfi1_pportdata *ppd, char **namep, u64 **cntrp)
+{
+       int ret;
+       u64 val = 0;
+
+       if (namep) {
+               ret = ppd->dd->portcntrnameslen;
+               *namep = ppd->dd->portcntrnames;
+       } else {
+               const struct cntr_entry *entry;
+               int i, j;
+
+               ret = ppd->dd->nportcntrs * sizeof(u64);
+               *cntrp = ppd->cntrs;
+
+               for (i = 0; i < PORT_CNTR_LAST; i++) {
+                       entry = &port_cntrs[i];
+                       hfi1_cdbg(CNTR, "reading %s", entry->name);
+                       if (entry->flags & CNTR_DISABLED) {
+                               /* Nothing */
+                               hfi1_cdbg(CNTR, "\tDisabled\n");
+                               continue;
+                       }
+
+                       if (entry->flags & CNTR_VL) {
+                               hfi1_cdbg(CNTR, "\tPer VL");
+                               for (j = 0; j < C_VL_COUNT; j++) {
+                                       val = entry->rw_cntr(entry, ppd, j,
+                                                              CNTR_MODE_R,
+                                                              0);
+                                       hfi1_cdbg(
+                                          CNTR,
+                                          "\t\tRead 0x%llx for %d",
+                                          val, j);
+                                       ppd->cntrs[entry->offset + j] = val;
+                               }
+                       } else {
+                               val = entry->rw_cntr(entry, ppd,
+                                                      CNTR_INVALID_VL,
+                                                      CNTR_MODE_R,
+                                                      0);
+                               ppd->cntrs[entry->offset] = val;
+                               hfi1_cdbg(CNTR, "\tRead 0x%llx", val);
+                       }
+               }
+       }
+       return ret;
+}
+
+static void free_cntrs(struct hfi1_devdata *dd)
+{
+       struct hfi1_pportdata *ppd;
+       int i;
+
+       if (dd->synth_stats_timer.data)
+               del_timer_sync(&dd->synth_stats_timer);
+       dd->synth_stats_timer.data = 0;
+       ppd = (struct hfi1_pportdata *)(dd + 1);
+       for (i = 0; i < dd->num_pports; i++, ppd++) {
+               kfree(ppd->cntrs);
+               kfree(ppd->scntrs);
+               free_percpu(ppd->ibport_data.rvp.rc_acks);
+               free_percpu(ppd->ibport_data.rvp.rc_qacks);
+               free_percpu(ppd->ibport_data.rvp.rc_delayed_comp);
+               ppd->cntrs = NULL;
+               ppd->scntrs = NULL;
+               ppd->ibport_data.rvp.rc_acks = NULL;
+               ppd->ibport_data.rvp.rc_qacks = NULL;
+               ppd->ibport_data.rvp.rc_delayed_comp = NULL;
+       }
+       kfree(dd->portcntrnames);
+       dd->portcntrnames = NULL;
+       kfree(dd->cntrs);
+       dd->cntrs = NULL;
+       kfree(dd->scntrs);
+       dd->scntrs = NULL;
+       kfree(dd->cntrnames);
+       dd->cntrnames = NULL;
+}
+
+#define CNTR_MAX 0xFFFFFFFFFFFFFFFFULL
+#define CNTR_32BIT_MAX 0x00000000FFFFFFFF
+
+static u64 read_dev_port_cntr(struct hfi1_devdata *dd, struct cntr_entry *entry,
+                             u64 *psval, void *context, int vl)
+{
+       u64 val;
+       u64 sval = *psval;
+
+       if (entry->flags & CNTR_DISABLED) {
+               dd_dev_err(dd, "Counter %s not enabled", entry->name);
+               return 0;
+       }
+
+       hfi1_cdbg(CNTR, "cntr: %s vl %d psval 0x%llx", entry->name, vl, *psval);
+
+       val = entry->rw_cntr(entry, context, vl, CNTR_MODE_R, 0);
+
+       /* If it's a synthetic counter, there is more work we need to do */
+       if (entry->flags & CNTR_SYNTH) {
+               if (sval == CNTR_MAX) {
+                       /* No need to read already saturated */
+                       return CNTR_MAX;
+               }
+
+               if (entry->flags & CNTR_32BIT) {
+                       /* 32bit counters can wrap multiple times */
+                       u64 upper = sval >> 32;
+                       u64 lower = (sval << 32) >> 32;
+
+                       if (lower > val) { /* hw wrapped */
+                               if (upper == CNTR_32BIT_MAX)
+                                       val = CNTR_MAX;
+                               else
+                                       upper++;
+                       }
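+                       /*
+                        * Example: sval = 0x1FFFFFFF0 gives upper = 1 and
+                        * lower = 0xFFFFFFF0; a hardware read of 0x10 is
+                        * below lower, so the 32-bit counter wrapped and
+                        * the result becomes (2 << 32) | 0x10.
+                        */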
+
+                       if (val != CNTR_MAX)
+                               val = (upper << 32) | val;
+
+               } else {
+                       /* If we rolled we are saturated */
+                       if ((val < sval) || (val > CNTR_MAX))
+                               val = CNTR_MAX;
+               }
+       }
+
+       *psval = val;
+
+       hfi1_cdbg(CNTR, "\tNew val=0x%llx", val);
+
+       return val;
+}
+
+static u64 write_dev_port_cntr(struct hfi1_devdata *dd,
+                              struct cntr_entry *entry,
+                              u64 *psval, void *context, int vl, u64 data)
+{
+       u64 val;
+
+       if (entry->flags & CNTR_DISABLED) {
+               dd_dev_err(dd, "Counter %s not enabled", entry->name);
+               return 0;
+       }
+
+       hfi1_cdbg(CNTR, "cntr: %s vl %d psval 0x%llx", entry->name, vl, *psval);
+
+       if (entry->flags & CNTR_SYNTH) {
+               *psval = data;
+               if (entry->flags & CNTR_32BIT) {
+                       val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W,
+                                            (data << 32) >> 32);
+                       val = data; /* return the full 64bit value */
+               } else {
+                       val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W,
+                                            data);
+               }
+       } else {
+               val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W, data);
+       }
+
+       *psval = val;
+
+       hfi1_cdbg(CNTR, "\tNew val=0x%llx", val);
+
+       return val;
+}
+
+u64 read_dev_cntr(struct hfi1_devdata *dd, int index, int vl)
+{
+       struct cntr_entry *entry;
+       u64 *sval;
+
+       entry = &dev_cntrs[index];
+       sval = dd->scntrs + entry->offset;
+
+       if (vl != CNTR_INVALID_VL)
+               sval += vl;
+
+       return read_dev_port_cntr(dd, entry, sval, dd, vl);
+}
+
+u64 write_dev_cntr(struct hfi1_devdata *dd, int index, int vl, u64 data)
+{
+       struct cntr_entry *entry;
+       u64 *sval;
+
+       entry = &dev_cntrs[index];
+       sval = dd->scntrs + entry->offset;
+
+       if (vl != CNTR_INVALID_VL)
+               sval += vl;
+
+       return write_dev_port_cntr(dd, entry, sval, dd, vl, data);
+}
+
+u64 read_port_cntr(struct hfi1_pportdata *ppd, int index, int vl)
+{
+       struct cntr_entry *entry;
+       u64 *sval;
+
+       entry = &port_cntrs[index];
+       sval = ppd->scntrs + entry->offset;
+
+       if (vl != CNTR_INVALID_VL)
+               sval += vl;
+
+       if ((index >= C_RCV_HDR_OVF_FIRST + ppd->dd->num_rcv_contexts) &&
+           (index <= C_RCV_HDR_OVF_LAST)) {
+               /* We do not want to bother for disabled contexts */
+               return 0;
+       }
+
+       return read_dev_port_cntr(ppd->dd, entry, sval, ppd, vl);
+}
+
+u64 write_port_cntr(struct hfi1_pportdata *ppd, int index, int vl, u64 data)
+{
+       struct cntr_entry *entry;
+       u64 *sval;
+
+       entry = &port_cntrs[index];
+       sval = ppd->scntrs + entry->offset;
+
+       if (vl != CNTR_INVALID_VL)
+               sval += vl;
+
+       if ((index >= C_RCV_HDR_OVF_FIRST + ppd->dd->num_rcv_contexts) &&
+           (index <= C_RCV_HDR_OVF_LAST)) {
+               /* We do not want to bother for disabled contexts */
+               return 0;
+       }
+
+       return write_dev_port_cntr(ppd->dd, entry, sval, ppd, vl, data);
+}
+
+static void update_synth_timer(unsigned long opaque)
+{
+       u64 cur_tx;
+       u64 cur_rx;
+       u64 total_flits;
+       u8 update = 0;
+       int i, j, vl;
+       struct hfi1_pportdata *ppd;
+       struct cntr_entry *entry;
+
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)opaque;
+
+       /*
+        * Rather than keep beating on the CSRs, pick a minimal set that we can
+        * check to watch for potential rollover.  We can do this by looking at
+        * the number of flits sent/received.  If the total flits exceeds 32 bits
+        * then we have to iterate all the counters and update.
+        */
+       entry = &dev_cntrs[C_DC_RCV_FLITS];
+       cur_rx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, CNTR_MODE_R, 0);
+
+       entry = &dev_cntrs[C_DC_XMIT_FLITS];
+       cur_tx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, CNTR_MODE_R, 0);
+
+       hfi1_cdbg(
+           CNTR,
+           "[%d] curr tx=0x%llx rx=0x%llx :: last tx=0x%llx rx=0x%llx\n",
+           dd->unit, cur_tx, cur_rx, dd->last_tx, dd->last_rx);
+
+       if ((cur_tx < dd->last_tx) || (cur_rx < dd->last_rx)) {
+               /*
+                * May not be strictly necessary to update but it won't hurt and
+                * simplifies the logic here.
+                */
+               update = 1;
+               hfi1_cdbg(CNTR, "[%d] Tripwire counter rolled, updating",
+                         dd->unit);
+       } else {
+               total_flits = (cur_tx - dd->last_tx) + (cur_rx - dd->last_rx);
+               hfi1_cdbg(CNTR,
+                         "[%d] total flits 0x%llx limit 0x%llx\n", dd->unit,
+                         total_flits, (u64)CNTR_32BIT_MAX);
+               if (total_flits >= CNTR_32BIT_MAX) {
+                       hfi1_cdbg(CNTR, "[%d] 32bit limit hit, updating",
+                                 dd->unit);
+                       update = 1;
+               }
+       }
+
+       if (update) {
+               hfi1_cdbg(CNTR, "[%d] Updating dd and ppd counters", dd->unit);
+               for (i = 0; i < DEV_CNTR_LAST; i++) {
+                       entry = &dev_cntrs[i];
+                       if (entry->flags & CNTR_VL) {
+                               for (vl = 0; vl < C_VL_COUNT; vl++)
+                                       read_dev_cntr(dd, i, vl);
+                       } else {
+                               read_dev_cntr(dd, i, CNTR_INVALID_VL);
+                       }
+               }
+               ppd = (struct hfi1_pportdata *)(dd + 1);
+               for (i = 0; i < dd->num_pports; i++, ppd++) {
+                       for (j = 0; j < PORT_CNTR_LAST; j++) {
+                               entry = &port_cntrs[j];
+                               if (entry->flags & CNTR_VL) {
+                                       for (vl = 0; vl < C_VL_COUNT; vl++)
+                                               read_port_cntr(ppd, j, vl);
+                               } else {
+                                       read_port_cntr(ppd, j, CNTR_INVALID_VL);
+                               }
+                       }
+               }
+
+               /*
+                * We want the value in the register. The goal is to keep track
+                * of the number of "ticks" not the counter value. In other
+                * words if the register rolls we want to notice it and go ahead
+                * and force an update.
+                */
+               entry = &dev_cntrs[C_DC_XMIT_FLITS];
+               dd->last_tx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL,
+                                               CNTR_MODE_R, 0);
+
+               entry = &dev_cntrs[C_DC_RCV_FLITS];
+               dd->last_rx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL,
+                                               CNTR_MODE_R, 0);
+
+               hfi1_cdbg(CNTR, "[%d] setting last tx/rx to 0x%llx 0x%llx",
+                         dd->unit, dd->last_tx, dd->last_rx);
+
+       } else {
+               hfi1_cdbg(CNTR, "[%d] No update necessary", dd->unit);
+       }
+
+       mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME);
+}
+
+#define C_MAX_NAME 13 /* 12 chars + one for \0 */
+static int init_cntrs(struct hfi1_devdata *dd)
+{
+       int i, rcv_ctxts, j;
+       size_t sz;
+       char *p;
+       char name[C_MAX_NAME];
+       struct hfi1_pportdata *ppd;
+       const char *bit_type_32 = ",32";
+       const int bit_type_32_sz = strlen(bit_type_32);
+
+       /* set up the stats timer; the add_timer is done at the end */
+       setup_timer(&dd->synth_stats_timer, update_synth_timer,
+                   (unsigned long)dd);
+
+       /***********************/
+       /* per device counters */
+       /***********************/
+
+       /* size names and determine how many we have */
+       dd->ndevcntrs = 0;
+       sz = 0;
+
+       for (i = 0; i < DEV_CNTR_LAST; i++) {
+               if (dev_cntrs[i].flags & CNTR_DISABLED) {
+                       hfi1_dbg_early("\tSkipping %s\n", dev_cntrs[i].name);
+                       continue;
+               }
+
+               if (dev_cntrs[i].flags & CNTR_VL) {
+                       dev_cntrs[i].offset = dd->ndevcntrs;
+                       for (j = 0; j < C_VL_COUNT; j++) {
+                               snprintf(name, C_MAX_NAME, "%s%d",
+                                        dev_cntrs[i].name, vl_from_idx(j));
+                               sz += strlen(name);
+                               /* Add ",32" for 32-bit counters */
+                               if (dev_cntrs[i].flags & CNTR_32BIT)
+                                       sz += bit_type_32_sz;
+                               sz++;
+                               dd->ndevcntrs++;
+                       }
+               } else if (dev_cntrs[i].flags & CNTR_SDMA) {
+                       dev_cntrs[i].offset = dd->ndevcntrs;
+                       for (j = 0; j < dd->chip_sdma_engines; j++) {
+                               snprintf(name, C_MAX_NAME, "%s%d",
+                                        dev_cntrs[i].name, j);
+                               sz += strlen(name);
+                               /* Add ",32" for 32-bit counters */
+                               if (dev_cntrs[i].flags & CNTR_32BIT)
+                                       sz += bit_type_32_sz;
+                               sz++;
+                               dd->ndevcntrs++;
+                       }
+               } else {
+                       /* +1 for newline. */
+                       sz += strlen(dev_cntrs[i].name) + 1;
+                       /* Add ",32" for 32-bit counters */
+                       if (dev_cntrs[i].flags & CNTR_32BIT)
+                               sz += bit_type_32_sz;
+                       dev_cntrs[i].offset = dd->ndevcntrs;
+                       dd->ndevcntrs++;
+               }
+       }
+
+       /* allocate space for the counter values */
+       dd->cntrs = kcalloc(dd->ndevcntrs, sizeof(u64), GFP_KERNEL);
+       if (!dd->cntrs)
+               goto bail;
+
+       dd->scntrs = kcalloc(dd->ndevcntrs, sizeof(u64), GFP_KERNEL);
+       if (!dd->scntrs)
+               goto bail;
+
+       /* allocate space for the counter names */
+       dd->cntrnameslen = sz;
+       dd->cntrnames = kmalloc(sz, GFP_KERNEL);
+       if (!dd->cntrnames)
+               goto bail;
+
+       /* fill in the names */
+       for (p = dd->cntrnames, i = 0; i < DEV_CNTR_LAST; i++) {
+               if (dev_cntrs[i].flags & CNTR_DISABLED) {
+                       /* Nothing */
+               } else if (dev_cntrs[i].flags & CNTR_VL) {
+                       for (j = 0; j < C_VL_COUNT; j++) {
+                               snprintf(name, C_MAX_NAME, "%s%d",
+                                        dev_cntrs[i].name,
+                                        vl_from_idx(j));
+                               memcpy(p, name, strlen(name));
+                               p += strlen(name);
+
+                               /* Counter is 32 bits */
+                               if (dev_cntrs[i].flags & CNTR_32BIT) {
+                                       memcpy(p, bit_type_32, bit_type_32_sz);
+                                       p += bit_type_32_sz;
+                               }
+
+                               *p++ = '\n';
+                       }
+               } else if (dev_cntrs[i].flags & CNTR_SDMA) {
+                       for (j = 0; j < dd->chip_sdma_engines; j++) {
+                               snprintf(name, C_MAX_NAME, "%s%d",
+                                        dev_cntrs[i].name, j);
+                               memcpy(p, name, strlen(name));
+                               p += strlen(name);
+
+                               /* Counter is 32 bits */
+                               if (dev_cntrs[i].flags & CNTR_32BIT) {
+                                       memcpy(p, bit_type_32, bit_type_32_sz);
+                                       p += bit_type_32_sz;
+                               }
+
+                               *p++ = '\n';
+                       }
+               } else {
+                       memcpy(p, dev_cntrs[i].name, strlen(dev_cntrs[i].name));
+                       p += strlen(dev_cntrs[i].name);
+
+                       /* Counter is 32 bits */
+                       if (dev_cntrs[i].flags & CNTR_32BIT) {
+                               memcpy(p, bit_type_32, bit_type_32_sz);
+                               p += bit_type_32_sz;
+                       }
+
+                       *p++ = '\n';
+               }
+       }
+
+       /*********************/
+       /* per port counters */
+       /*********************/
+
+       /*
+        * Go through the counters for the overflows and disable the ones we
+        * don't need. This varies based on platform so we need to do it
+        * dynamically here.
+        */
+       rcv_ctxts = dd->num_rcv_contexts;
+       for (i = C_RCV_HDR_OVF_FIRST + rcv_ctxts;
+            i <= C_RCV_HDR_OVF_LAST; i++) {
+               port_cntrs[i].flags |= CNTR_DISABLED;
+       }
+
+       /* size port counter names and determine how many we have */
+       sz = 0;
+       dd->nportcntrs = 0;
+       for (i = 0; i < PORT_CNTR_LAST; i++) {
+               if (port_cntrs[i].flags & CNTR_DISABLED) {
+                       hfi1_dbg_early("\tSkipping %s\n", port_cntrs[i].name);
+                       continue;
+               }
+
+               if (port_cntrs[i].flags & CNTR_VL) {
+                       port_cntrs[i].offset = dd->nportcntrs;
+                       for (j = 0; j < C_VL_COUNT; j++) {
+                               snprintf(name, C_MAX_NAME, "%s%d",
+                                        port_cntrs[i].name, vl_from_idx(j));
+                               sz += strlen(name);
+                               /* Add ",32" for 32-bit counters */
+                               if (port_cntrs[i].flags & CNTR_32BIT)
+                                       sz += bit_type_32_sz;
+                               sz++;
+                               dd->nportcntrs++;
+                       }
+               } else {
+                       /* +1 for newline */
+                       sz += strlen(port_cntrs[i].name) + 1;
+                       /* Add ",32" for 32-bit counters */
+                       if (port_cntrs[i].flags & CNTR_32BIT)
+                               sz += bit_type_32_sz;
+                       port_cntrs[i].offset = dd->nportcntrs;
+                       dd->nportcntrs++;
+               }
+       }
+
+       /* allocate space for the counter names */
+       dd->portcntrnameslen = sz;
+       dd->portcntrnames = kmalloc(sz, GFP_KERNEL);
+       if (!dd->portcntrnames)
+               goto bail;
+
+       /* fill in port cntr names */
+       for (p = dd->portcntrnames, i = 0; i < PORT_CNTR_LAST; i++) {
+               if (port_cntrs[i].flags & CNTR_DISABLED)
+                       continue;
+
+               if (port_cntrs[i].flags & CNTR_VL) {
+                       for (j = 0; j < C_VL_COUNT; j++) {
+                               snprintf(name, C_MAX_NAME, "%s%d",
+                                        port_cntrs[i].name, vl_from_idx(j));
+                               memcpy(p, name, strlen(name));
+                               p += strlen(name);
+
+                               /* Counter is 32 bits */
+                               if (port_cntrs[i].flags & CNTR_32BIT) {
+                                       memcpy(p, bit_type_32, bit_type_32_sz);
+                                       p += bit_type_32_sz;
+                               }
+
+                               *p++ = '\n';
+                       }
+               } else {
+                       memcpy(p, port_cntrs[i].name,
+                              strlen(port_cntrs[i].name));
+                       p += strlen(port_cntrs[i].name);
+
+                       /* Counter is 32 bits */
+                       if (port_cntrs[i].flags & CNTR_32BIT) {
+                               memcpy(p, bit_type_32, bit_type_32_sz);
+                               p += bit_type_32_sz;
+                       }
+
+                       *p++ = '\n';
+               }
+       }
+
+       /* allocate per port storage for counter values */
+       ppd = (struct hfi1_pportdata *)(dd + 1);
+       for (i = 0; i < dd->num_pports; i++, ppd++) {
+               ppd->cntrs = kcalloc(dd->nportcntrs, sizeof(u64), GFP_KERNEL);
+               if (!ppd->cntrs)
+                       goto bail;
+
+               ppd->scntrs = kcalloc(dd->nportcntrs, sizeof(u64), GFP_KERNEL);
+               if (!ppd->scntrs)
+                       goto bail;
+       }
+
+       /* CPU counters need to be allocated and zeroed */
+       if (init_cpu_counters(dd))
+               goto bail;
+
+       mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME);
+       return 0;
+bail:
+       free_cntrs(dd);
+       return -ENOMEM;
+}
+
+static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate)
+{
+       switch (chip_lstate) {
+       default:
+               dd_dev_err(dd,
+                          "Unknown logical state 0x%x, reporting IB_PORT_DOWN\n",
+                          chip_lstate);
+               /* fall through */
+       case LSTATE_DOWN:
+               return IB_PORT_DOWN;
+       case LSTATE_INIT:
+               return IB_PORT_INIT;
+       case LSTATE_ARMED:
+               return IB_PORT_ARMED;
+       case LSTATE_ACTIVE:
+               return IB_PORT_ACTIVE;
+       }
+}
+
+u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate)
+{
+       /* look at the HFI meta-states only */
+       switch (chip_pstate & 0xf0) {
+       default:
+               dd_dev_err(dd, "Unexpected chip physical state of 0x%x\n",
+                          chip_pstate);
+               /* fall through */
+       case PLS_DISABLED:
+               return IB_PORTPHYSSTATE_DISABLED;
+       case PLS_OFFLINE:
+               return OPA_PORTPHYSSTATE_OFFLINE;
+       case PLS_POLLING:
+               return IB_PORTPHYSSTATE_POLLING;
+       case PLS_CONFIGPHY:
+               return IB_PORTPHYSSTATE_TRAINING;
+       case PLS_LINKUP:
+               return IB_PORTPHYSSTATE_LINKUP;
+       case PLS_PHYTEST:
+               return IB_PORTPHYSSTATE_PHY_TEST;
+       }
+}
+
+/* return the OPA port logical state name */
+const char *opa_lstate_name(u32 lstate)
+{
+       static const char * const port_logical_names[] = {
+               "PORT_NOP",
+               "PORT_DOWN",
+               "PORT_INIT",
+               "PORT_ARMED",
+               "PORT_ACTIVE",
+               "PORT_ACTIVE_DEFER",
+       };
+       if (lstate < ARRAY_SIZE(port_logical_names))
+               return port_logical_names[lstate];
+       return "unknown";
+}
+
+/* return the OPA port physical state name */
+const char *opa_pstate_name(u32 pstate)
+{
+       static const char * const port_physical_names[] = {
+               "PHYS_NOP",
+               "reserved1",
+               "PHYS_POLL",
+               "PHYS_DISABLED",
+               "PHYS_TRAINING",
+               "PHYS_LINKUP",
+               "PHYS_LINK_ERR_RECOVER",
+               "PHYS_PHY_TEST",
+               "reserved8",
+               "PHYS_OFFLINE",
+               "PHYS_GANGED",
+               "PHYS_TEST",
+       };
+       if (pstate < ARRAY_SIZE(port_physical_names))
+               return port_physical_names[pstate];
+       return "unknown";
+}
+
+/*
+ * Read the hardware link state and set the driver's cached value of it.
+ * Return the (new) current value.
+ */
+u32 get_logical_state(struct hfi1_pportdata *ppd)
+{
+       u32 new_state;
+
+       new_state = chip_to_opa_lstate(ppd->dd, read_logical_state(ppd->dd));
+       if (new_state != ppd->lstate) {
+               dd_dev_info(ppd->dd, "logical state changed to %s (0x%x)\n",
+                           opa_lstate_name(new_state), new_state);
+               ppd->lstate = new_state;
+       }
+       /*
+        * Set port status flags in the page mapped into userspace
+        * memory. Do it here to ensure a reliable state - this is
+        * the only function called by all state handling code.
+        * Always set the flags because the cached value
+        * might have been changed explicitly outside of this
+        * function.
+        */
+       if (ppd->statusp) {
+               switch (ppd->lstate) {
+               case IB_PORT_DOWN:
+               case IB_PORT_INIT:
+                       *ppd->statusp &= ~(HFI1_STATUS_IB_CONF |
+                                          HFI1_STATUS_IB_READY);
+                       break;
+               case IB_PORT_ARMED:
+                       *ppd->statusp |= HFI1_STATUS_IB_CONF;
+                       break;
+               case IB_PORT_ACTIVE:
+                       *ppd->statusp |= HFI1_STATUS_IB_READY;
+                       break;
+               }
+       }
+       return ppd->lstate;
+}
+
+/**
+ * wait_logical_linkstate - wait for an IB link state change to occur
+ * @ppd: port device
+ * @state: the state to wait for
+ * @msecs: the number of milliseconds to wait
+ *
+ * Wait up to msecs milliseconds for IB link state change to occur.
+ * For now, take the easy polling route.
+ * Returns 0 if state reached, otherwise -ETIMEDOUT.
+ */
+static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state,
+                                 int msecs)
+{
+       unsigned long timeout;
+
+       timeout = jiffies + msecs_to_jiffies(msecs);
+       while (1) {
+               if (get_logical_state(ppd) == state)
+                       return 0;
+               if (time_after(jiffies, timeout))
+                       break;
+               msleep(20);
+       }
+       dd_dev_err(ppd->dd, "timeout waiting for link state 0x%x\n", state);
+
+       return -ETIMEDOUT;
+}
+
+u8 hfi1_ibphys_portstate(struct hfi1_pportdata *ppd)
+{
+       u32 pstate;
+       u32 ib_pstate;
+
+       pstate = read_physical_state(ppd->dd);
+       ib_pstate = chip_to_opa_pstate(ppd->dd, pstate);
+       if (ppd->last_pstate != ib_pstate) {
+               dd_dev_info(ppd->dd,
+                           "%s: physical state changed to %s (0x%x), phy 0x%x\n",
+                           __func__, opa_pstate_name(ib_pstate), ib_pstate,
+                           pstate);
+               ppd->last_pstate = ib_pstate;
+       }
+       return ib_pstate;
+}
+
+/*
+ * Read/modify/write ASIC_QSFP register bits as selected by mask
+ * data: 0 or 1 in the positions depending on what needs to be written
+ * dir: 0 for read, 1 for write
+ * mask: select by setting
+ *      I2CCLK  (bit 0)
+ *      I2CDATA (bit 1)
+ */
+u64 hfi1_gpio_mod(struct hfi1_devdata *dd, u32 target, u32 data, u32 dir,
+                 u32 mask)
+{
+       u64 qsfp_oe, target_oe;
+
+       target_oe = target ? ASIC_QSFP2_OE : ASIC_QSFP1_OE;
+       if (mask) {
+               /* We are writing register bits, so lock access */
+               dir &= mask;
+               data &= mask;
+
+               qsfp_oe = read_csr(dd, target_oe);
+               qsfp_oe = (qsfp_oe & ~(u64)mask) | (u64)dir;
+               write_csr(dd, target_oe, qsfp_oe);
+       }
+       /* We are exclusively reading bits here, but it is unlikely
+        * we'll get valid data when we set the direction of the pin
+        * in the same call, so callers reading data should call this
+        * function again to get valid data.
+        */
+       return read_csr(dd, target ? ASIC_QSFP2_IN : ASIC_QSFP1_IN);
+}
+
+#define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \
+(r &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
+
+#define SET_STATIC_RATE_CONTROL_SMASK(r) \
+(r |= SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
+
+int hfi1_init_ctxt(struct send_context *sc)
+{
+       if (sc) {
+               struct hfi1_devdata *dd = sc->dd;
+               u64 reg;
+               u8 set = (sc->type == SC_USER ?
+                         HFI1_CAP_IS_USET(STATIC_RATE_CTRL) :
+                         HFI1_CAP_IS_KSET(STATIC_RATE_CTRL));
+               reg = read_kctxt_csr(dd, sc->hw_context,
+                                    SEND_CTXT_CHECK_ENABLE);
+               if (set)
+                       CLEAR_STATIC_RATE_CONTROL_SMASK(reg);
+               else
+                       SET_STATIC_RATE_CONTROL_SMASK(reg);
+               write_kctxt_csr(dd, sc->hw_context,
+                               SEND_CTXT_CHECK_ENABLE, reg);
+       }
+       return 0;
+}
+
+int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp)
+{
+       int ret = 0;
+       u64 reg;
+
+       if (dd->icode != ICODE_RTL_SILICON) {
+               if (HFI1_CAP_IS_KSET(PRINT_UNIMPL))
+                       dd_dev_info(dd, "%s: tempsense not supported by HW\n",
+                                   __func__);
+               return -EINVAL;
+       }
+       reg = read_csr(dd, ASIC_STS_THERM);
+       temp->curr = ((reg >> ASIC_STS_THERM_CURR_TEMP_SHIFT) &
+                     ASIC_STS_THERM_CURR_TEMP_MASK);
+       temp->lo_lim = ((reg >> ASIC_STS_THERM_LO_TEMP_SHIFT) &
+                       ASIC_STS_THERM_LO_TEMP_MASK);
+       temp->hi_lim = ((reg >> ASIC_STS_THERM_HI_TEMP_SHIFT) &
+                       ASIC_STS_THERM_HI_TEMP_MASK);
+       temp->crit_lim = ((reg >> ASIC_STS_THERM_CRIT_TEMP_SHIFT) &
+                         ASIC_STS_THERM_CRIT_TEMP_MASK);
+       /* triggers is a 3-bit value - 1 bit per trigger. */
+       temp->triggers = (u8)((reg >> ASIC_STS_THERM_LOW_SHIFT) & 0x7);
+
+       return ret;
+}
+
+/* ========================================================================= */
+
+/*
+ * Enable/disable chip from delivering interrupts.
+ */
+void set_intr_state(struct hfi1_devdata *dd, u32 enable)
+{
+       int i;
+
+       /*
+        * In HFI, the mask needs to be 1 to allow interrupts.
+        */
+       if (enable) {
+               /* enable all interrupts */
+               for (i = 0; i < CCE_NUM_INT_CSRS; i++)
+                       write_csr(dd, CCE_INT_MASK + (8 * i), ~(u64)0);
+
+               init_qsfp_int(dd);
+       } else {
+               for (i = 0; i < CCE_NUM_INT_CSRS; i++)
+                       write_csr(dd, CCE_INT_MASK + (8 * i), 0ull);
+       }
+}
+
+/*
+ * Clear all interrupt sources on the chip.
+ */
+static void clear_all_interrupts(struct hfi1_devdata *dd)
+{
+       int i;
+
+       for (i = 0; i < CCE_NUM_INT_CSRS; i++)
+               write_csr(dd, CCE_INT_CLEAR + (8 * i), ~(u64)0);
+
+       write_csr(dd, CCE_ERR_CLEAR, ~(u64)0);
+       write_csr(dd, MISC_ERR_CLEAR, ~(u64)0);
+       write_csr(dd, RCV_ERR_CLEAR, ~(u64)0);
+       write_csr(dd, SEND_ERR_CLEAR, ~(u64)0);
+       write_csr(dd, SEND_PIO_ERR_CLEAR, ~(u64)0);
+       write_csr(dd, SEND_DMA_ERR_CLEAR, ~(u64)0);
+       write_csr(dd, SEND_EGRESS_ERR_CLEAR, ~(u64)0);
+       for (i = 0; i < dd->chip_send_contexts; i++)
+               write_kctxt_csr(dd, i, SEND_CTXT_ERR_CLEAR, ~(u64)0);
+       for (i = 0; i < dd->chip_sdma_engines; i++)
+               write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_CLEAR, ~(u64)0);
+
+       write_csr(dd, DCC_ERR_FLG_CLR, ~(u64)0);
+       write_csr(dd, DC_LCB_ERR_CLR, ~(u64)0);
+       write_csr(dd, DC_DC8051_ERR_CLR, ~(u64)0);
+}
+
+/* Move to pcie.c? */
+static void disable_intx(struct pci_dev *pdev)
+{
+       pci_intx(pdev, 0);
+}
+
+static void clean_up_interrupts(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /* remove irqs - must happen before disabling/turning off */
+       if (dd->num_msix_entries) {
+               /* MSI-X */
+               struct hfi1_msix_entry *me = dd->msix_entries;
+
+               for (i = 0; i < dd->num_msix_entries; i++, me++) {
+                       if (!me->arg) /* => no irq, no affinity */
+                               continue;
+                       hfi1_put_irq_affinity(dd, &dd->msix_entries[i]);
+                       free_irq(me->msix.vector, me->arg);
+               }
+       } else {
+               /* INTx */
+               if (dd->requested_intx_irq) {
+                       free_irq(dd->pcidev->irq, dd);
+                       dd->requested_intx_irq = 0;
+               }
+       }
+
+       /* turn off interrupts */
+       if (dd->num_msix_entries) {
+               /* MSI-X */
+               pci_disable_msix(dd->pcidev);
+       } else {
+               /* INTx */
+               disable_intx(dd->pcidev);
+       }
+
+       /* clean structures */
+       kfree(dd->msix_entries);
+       dd->msix_entries = NULL;
+       dd->num_msix_entries = 0;
+}
+
+/*
+ * Remap the interrupt source from the general handler to the given MSI-X
+ * interrupt.
+ */
+static void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr)
+{
+       u64 reg;
+       int m, n;
+
+       /* clear from the handled mask of the general interrupt */
+       m = isrc / 64;
+       n = isrc % 64;
+       dd->gi_mask[m] &= ~((u64)1 << n);
+
+       /* direct the chip source to the given MSI-X interrupt */
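+       /* each CCE_INT_MAP CSR packs 8 one-byte entries, one per source */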
+       m = isrc / 8;
+       n = isrc % 8;
+       reg = read_csr(dd, CCE_INT_MAP + (8 * m));
+       reg &= ~((u64)0xff << (8 * n));
+       reg |= ((u64)msix_intr & 0xff) << (8 * n);
+       write_csr(dd, CCE_INT_MAP + (8 * m), reg);
+}
+
+static void remap_sdma_interrupts(struct hfi1_devdata *dd,
+                                 int engine, int msix_intr)
+{
+       /*
+        * SDMA engine interrupt sources are grouped by type, rather
+        * than by engine.  Per-engine interrupts are as follows:
+        *      SDMA
+        *      SDMAProgress
+        *      SDMAIdle
+        */
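+       /* chip source = IS_SDMA_START + type * TXE_NUM_SDMA_ENGINES + engine */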
+       remap_intr(dd, IS_SDMA_START + 0 * TXE_NUM_SDMA_ENGINES + engine,
+                  msix_intr);
+       remap_intr(dd, IS_SDMA_START + 1 * TXE_NUM_SDMA_ENGINES + engine,
+                  msix_intr);
+       remap_intr(dd, IS_SDMA_START + 2 * TXE_NUM_SDMA_ENGINES + engine,
+                  msix_intr);
+}
+
+static int request_intx_irq(struct hfi1_devdata *dd)
+{
+       int ret;
+
+       snprintf(dd->intx_name, sizeof(dd->intx_name), DRIVER_NAME "_%d",
+                dd->unit);
+       ret = request_irq(dd->pcidev->irq, general_interrupt,
+                         IRQF_SHARED, dd->intx_name, dd);
+       if (ret)
+               dd_dev_err(dd, "unable to request INTx interrupt, err %d\n",
+                          ret);
+       else
+               dd->requested_intx_irq = 1;
+       return ret;
+}
+
+static int request_msix_irqs(struct hfi1_devdata *dd)
+{
+       int first_general, last_general;
+       int first_sdma, last_sdma;
+       int first_rx, last_rx;
+       int i, ret = 0;
+
+       /* calculate the ranges we are going to use */
+       first_general = 0;
+       last_general = first_general + 1;
+       first_sdma = last_general;
+       last_sdma = first_sdma + dd->num_sdma;
+       first_rx = last_sdma;
+       last_rx = first_rx + dd->n_krcv_queues;
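+       /* vector layout: [general][SDMA engines][kernel receive contexts] */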
+
+       /*
+        * Sanity check - the code expects all SDMA chip source
+        * interrupts to be in the same CSR, starting at bit 0.  Verify
+        * that this is true by checking the bit location of the start.
+        */
+       BUILD_BUG_ON(IS_SDMA_START % 64);
+
+       for (i = 0; i < dd->num_msix_entries; i++) {
+               struct hfi1_msix_entry *me = &dd->msix_entries[i];
+               const char *err_info;
+               irq_handler_t handler;
+               irq_handler_t thread = NULL;
+               void *arg;
+               int idx;
+               struct hfi1_ctxtdata *rcd = NULL;
+               struct sdma_engine *sde = NULL;
+
+               /* obtain the arguments to request_irq */
+               if (first_general <= i && i < last_general) {
+                       idx = i - first_general;
+                       handler = general_interrupt;
+                       arg = dd;
+                       snprintf(me->name, sizeof(me->name),
+                                DRIVER_NAME "_%d", dd->unit);
+                       err_info = "general";
+                       me->type = IRQ_GENERAL;
+               } else if (first_sdma <= i && i < last_sdma) {
+                       idx = i - first_sdma;
+                       sde = &dd->per_sdma[idx];
+                       handler = sdma_interrupt;
+                       arg = sde;
+                       snprintf(me->name, sizeof(me->name),
+                                DRIVER_NAME "_%d sdma%d", dd->unit, idx);
+                       err_info = "sdma";
+                       remap_sdma_interrupts(dd, idx, i);
+                       me->type = IRQ_SDMA;
+               } else if (first_rx <= i && i < last_rx) {
+                       idx = i - first_rx;
+                       rcd = dd->rcd[idx];
+                       /* no interrupt if no rcd */
+                       if (!rcd)
+                               continue;
+                       /*
+                        * Set the interrupt register and mask for this
+                        * context's interrupt.
+                        */
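+                       /* ireg: which 64-source CSR, imask: the bit within it */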
+                       rcd->ireg = (IS_RCVAVAIL_START + idx) / 64;
+                       rcd->imask = ((u64)1) <<
+                                       ((IS_RCVAVAIL_START + idx) % 64);
+                       handler = receive_context_interrupt;
+                       thread = receive_context_thread;
+                       arg = rcd;
+                       snprintf(me->name, sizeof(me->name),
+                                DRIVER_NAME "_%d kctxt%d", dd->unit, idx);
+                       err_info = "receive context";
+                       remap_intr(dd, IS_RCVAVAIL_START + idx, i);
+                       me->type = IRQ_RCVCTXT;
+               } else {
+                       /* not in our expected range - complain, then
+                        * ignore it
+                        */
+                       dd_dev_err(dd,
+                                  "Unexpected extra MSI-X interrupt %d\n", i);
+                       continue;
+               }
+               /* no argument, no interrupt */
+               if (!arg)
+                       continue;
+               /* make sure the name is terminated */
+               me->name[sizeof(me->name) - 1] = 0;
+
+               ret = request_threaded_irq(me->msix.vector, handler, thread, 0,
+                                          me->name, arg);
+               if (ret) {
+                       dd_dev_err(dd,
+                                  "unable to allocate %s interrupt, vector %d, index %d, err %d\n",
+                                  err_info, me->msix.vector, idx, ret);
+                       return ret;
+               }
+               /*
+                * assign arg after request_irq call, so it will be
+                * cleaned up
+                */
+               me->arg = arg;
+
+               ret = hfi1_get_irq_affinity(dd, me);
+               if (ret)
+                       dd_dev_err(dd,
+                                  "unable to pin IRQ %d\n", ret);
+       }
+
+       return ret;
+}
+
+/*
+ * Set the general handler to accept all interrupts, remap all
+ * chip interrupts back to MSI-X 0.
+ */
+static void reset_interrupts(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /* all interrupts handled by the general handler */
+       for (i = 0; i < CCE_NUM_INT_CSRS; i++)
+               dd->gi_mask[i] = ~(u64)0;
+
+       /* all chip interrupts map to MSI-X 0 */
+       for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++)
+               write_csr(dd, CCE_INT_MAP + (8 * i), 0);
+}
+
+static int set_up_interrupts(struct hfi1_devdata *dd)
+{
+       struct hfi1_msix_entry *entries;
+       u32 total, request;
+       int i, ret;
+       int single_interrupt = 0; /* we expect to have all the interrupts */
+
+       /*
+        * Interrupt count:
+        *      1 general, "slow path" interrupt (includes the SDMA engines
+        *              slow source, SDMACleanupDone)
+        *      N interrupts - one per used SDMA engine
+        *      M interrupts - one per kernel receive context
+        */
+       total = 1 + dd->num_sdma + dd->n_krcv_queues;
+
+       entries = kcalloc(total, sizeof(*entries), GFP_KERNEL);
+       if (!entries) {
+               ret = -ENOMEM;
+               goto fail;
+       }
+       /* 1-1 MSI-X entry assignment */
+       for (i = 0; i < total; i++)
+               entries[i].msix.entry = i;
+
+       /* ask for MSI-X interrupts */
+       request = total;
+       request_msix(dd, &request, entries);
+
+       if (request == 0) {
+               /* using INTx */
+               /* dd->num_msix_entries already zero */
+               kfree(entries);
+               single_interrupt = 1;
+               dd_dev_err(dd, "MSI-X failed, using INTx interrupts\n");
+       } else {
+               /* using MSI-X */
+               dd->num_msix_entries = request;
+               dd->msix_entries = entries;
+
+               if (request != total) {
+                       /* using MSI-X, with reduced interrupts */
+                       dd_dev_err(
+                               dd,
+                               "cannot handle reduced interrupt case, want %u, got %u\n",
+                               total, request);
+                       ret = -EINVAL;
+                       goto fail;
+               }
+               dd_dev_info(dd, "%u MSI-X interrupts allocated\n", total);
+       }
+
+       /* mask all interrupts */
+       set_intr_state(dd, 0);
+       /* clear all pending interrupts */
+       clear_all_interrupts(dd);
+
+       /* reset general handler mask, chip MSI-X mappings */
+       reset_interrupts(dd);
+
+       if (single_interrupt)
+               ret = request_intx_irq(dd);
+       else
+               ret = request_msix_irqs(dd);
+       if (ret)
+               goto fail;
+
+       return 0;
+
+fail:
+       clean_up_interrupts(dd);
+       return ret;
+}
+
+/*
+ * Set up context values in dd.  Sets:
+ *
+ *     num_rcv_contexts - number of contexts being used
+ *     n_krcv_queues - number of kernel contexts
+ *     first_user_ctxt - first non-kernel context in array of contexts
+ *     freectxts  - number of free user contexts
+ *     num_send_contexts - number of PIO send contexts being used
+ */
+static int set_up_context_variables(struct hfi1_devdata *dd)
+{
+       int num_kernel_contexts;
+       int total_contexts;
+       int ret;
+       unsigned ngroups;
+       int qos_rmt_count;
+       int user_rmt_reduced;
+
+       /*
+        * Kernel receive contexts:
+        * - min of 2 or 1 context/numa (excluding control context)
+        * - Context 0 - control context (VL15/multicast/error)
+        * - Context 1 - first kernel context
+        * - Context 2 - second kernel context
+        * ...
+        */
+       if (n_krcvqs)
+               /*
+                * n_krcvqs is the sum of module parameter kernel receive
+                * contexts, krcvqs[].  It does not include the control
+                * context, so add that.
+                */
+               num_kernel_contexts = n_krcvqs + 1;
+       else
+               num_kernel_contexts = num_online_nodes() + 1;
+       num_kernel_contexts =
+               max_t(int, MIN_KERNEL_KCTXTS, num_kernel_contexts);
+       /*
+        * Every kernel receive context needs an ACK send context.
+        * One send context is allocated for each VL{0-7} and VL15.
+        */
+       if (num_kernel_contexts > (dd->chip_send_contexts - num_vls - 1)) {
+               dd_dev_err(dd,
+                          "Reducing # kernel rcv contexts to: %d, from %d\n",
+                          (int)(dd->chip_send_contexts - num_vls - 1),
+                          (int)num_kernel_contexts);
+               num_kernel_contexts = dd->chip_send_contexts - num_vls - 1;
+       }
+       /*
+        * User contexts:
+        *      - default to 1 user context per real (non-HT) CPU core if
+        *        num_user_contexts is negative
+        */
+       if (num_user_contexts < 0)
+               num_user_contexts =
+                       cpumask_weight(&dd->affinity->real_cpu_mask);
+
+       total_contexts = num_kernel_contexts + num_user_contexts;
+
+       /*
+        * Adjust the counts given a global max.
+        */
+       if (total_contexts > dd->chip_rcv_contexts) {
+               dd_dev_err(dd,
+                          "Reducing # user receive contexts to: %d, from %d\n",
+                          (int)(dd->chip_rcv_contexts - num_kernel_contexts),
+                          (int)num_user_contexts);
+               num_user_contexts = dd->chip_rcv_contexts - num_kernel_contexts;
+               /* recalculate */
+               total_contexts = num_kernel_contexts + num_user_contexts;
+       }
+
+       /* each user context requires an entry in the RMT */
+       qos_rmt_count = qos_rmt_entries(dd, NULL, NULL);
+       if (qos_rmt_count + num_user_contexts > NUM_MAP_ENTRIES) {
+               user_rmt_reduced = NUM_MAP_ENTRIES - qos_rmt_count;
+               dd_dev_err(dd,
+                          "RMT size is reducing the number of user receive contexts from %d to %d\n",
+                          (int)num_user_contexts,
+                          user_rmt_reduced);
+               /* recalculate */
+               num_user_contexts = user_rmt_reduced;
+               total_contexts = num_kernel_contexts + num_user_contexts;
+       }
+
+       /* the first N are kernel contexts, the rest are user contexts */
+       dd->num_rcv_contexts = total_contexts;
+       dd->n_krcv_queues = num_kernel_contexts;
+       dd->first_user_ctxt = num_kernel_contexts;
+       dd->num_user_contexts = num_user_contexts;
+       dd->freectxts = num_user_contexts;
+       dd_dev_info(dd,
+                   "rcv contexts: chip %d, used %d (kernel %d, user %d)\n",
+                   (int)dd->chip_rcv_contexts,
+                   (int)dd->num_rcv_contexts,
+                   (int)dd->n_krcv_queues,
+                   (int)dd->num_rcv_contexts - dd->n_krcv_queues);
+
+       /*
+        * Receive array allocation:
+        *   All RcvArray entries are divided into groups of 8. This
+        *   is required by the hardware and will speed up writes to
+        *   consecutive entries by using write-combining of the entire
+        *   cacheline.
+        *
+        *   The number of groups is evenly divided among all contexts.
+        *   Any leftover groups are given to the first N user
+        *   contexts.
+        */
+       dd->rcv_entries.group_size = RCV_INCREMENT;
+       ngroups = dd->chip_rcv_array_count / dd->rcv_entries.group_size;
+       dd->rcv_entries.ngroups = ngroups / dd->num_rcv_contexts;
+       dd->rcv_entries.nctxt_extra = ngroups -
+               (dd->num_rcv_contexts * dd->rcv_entries.ngroups);
+       dd_dev_info(dd, "RcvArray groups %u, ctxts extra %u\n",
+                   dd->rcv_entries.ngroups,
+                   dd->rcv_entries.nctxt_extra);
+       if (dd->rcv_entries.ngroups * dd->rcv_entries.group_size >
+           MAX_EAGER_ENTRIES * 2) {
+               dd->rcv_entries.ngroups = (MAX_EAGER_ENTRIES * 2) /
+                       dd->rcv_entries.group_size;
+               dd_dev_info(dd,
+                           "RcvArray group count too high, change to %u\n",
+                           dd->rcv_entries.ngroups);
+               dd->rcv_entries.nctxt_extra = 0;
+       }
+       /*
+        * PIO send contexts
+        */
+       ret = init_sc_pools_and_sizes(dd);
+       if (ret >= 0) { /* success */
+               dd->num_send_contexts = ret;
+               dd_dev_info(
+                       dd,
+                       "send contexts: chip %d, used %d (kernel %d, ack %d, user %d, vl15 %d)\n",
+                       dd->chip_send_contexts,
+                       dd->num_send_contexts,
+                       dd->sc_sizes[SC_KERNEL].count,
+                       dd->sc_sizes[SC_ACK].count,
+                       dd->sc_sizes[SC_USER].count,
+                       dd->sc_sizes[SC_VL15].count);
+               ret = 0;        /* success */
+       }
+
+       return ret;
+}
+
+/*
+ * Set the device/port partition key table. The MAD code
+ * will ensure that, at least, the partial management
+ * partition key is present in the table.
+ */
+static void set_partition_keys(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 reg = 0;
+       int i;
+
+       dd_dev_info(dd, "Setting partition keys\n");
+       for (i = 0; i < hfi1_get_npkeys(dd); i++) {
+               reg |= (ppd->pkeys[i] &
+                       RCV_PARTITION_KEY_PARTITION_KEY_A_MASK) <<
+                       ((i % 4) *
+                        RCV_PARTITION_KEY_PARTITION_KEY_B_SHIFT);
+               /* Each register holds 4 PKey values. */
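+               /* (i - 3) * 2 == (i / 4) * 8, the byte offset of this CSR */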
+               if ((i % 4) == 3) {
+                       write_csr(dd, RCV_PARTITION_KEY +
+                                 ((i - 3) * 2), reg);
+                       reg = 0;
+               }
+       }
+
+       /* Always enable HW pkeys check when pkeys table is set */
+       add_rcvctrl(dd, RCV_CTRL_RCV_PARTITION_KEY_ENABLE_SMASK);
+}
+
+/*
+ * These CSRs and memories are uninitialized on reset and must be
+ * written before reading to set the ECC/parity bits.
+ *
+ * NOTE: All user context CSRs that are not mmapped write-only
+ * (e.g. the TID flows) must be initialized even if the driver never
+ * reads them.
+ */
+static void write_uninitialized_csrs_and_memories(struct hfi1_devdata *dd)
+{
+       int i, j;
+
+       /* CceIntMap */
+       for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++)
+               write_csr(dd, CCE_INT_MAP + (8 * i), 0);
+
+       /* SendCtxtCreditReturnAddr */
+       for (i = 0; i < dd->chip_send_contexts; i++)
+               write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_RETURN_ADDR, 0);
+
+       /* PIO Send buffers */
+       /* SDMA Send buffers */
+       /*
+        * These are not normally read, and (presently) have no method
+        * to be read, so are not pre-initialized
+        */
+
+       /* RcvHdrAddr */
+       /* RcvHdrTailAddr */
+       /* RcvTidFlowTable */
+       for (i = 0; i < dd->chip_rcv_contexts; i++) {
+               write_kctxt_csr(dd, i, RCV_HDR_ADDR, 0);
+               write_kctxt_csr(dd, i, RCV_HDR_TAIL_ADDR, 0);
+               for (j = 0; j < RXE_NUM_TID_FLOWS; j++)
+                       write_uctxt_csr(dd, i, RCV_TID_FLOW_TABLE + (8 * j), 0);
+       }
+
+       /* RcvArray */
+       for (i = 0; i < dd->chip_rcv_array_count; i++)
+               write_csr(dd, RCV_ARRAY + (8 * i),
+                         RCV_ARRAY_RT_WRITE_ENABLE_SMASK);
+
+       /* RcvQPMapTable */
+       for (i = 0; i < 32; i++)
+               write_csr(dd, RCV_QP_MAP_TABLE + (8 * i), 0);
+}
+
+/*
+ * Use the ctrl_bits in CceCtrl to clear the status_bits in CceStatus.
+ */
+static void clear_cce_status(struct hfi1_devdata *dd, u64 status_bits,
+                            u64 ctrl_bits)
+{
+       unsigned long timeout;
+       u64 reg;
+
+       /* is the condition present? */
+       reg = read_csr(dd, CCE_STATUS);
+       if ((reg & status_bits) == 0)
+               return;
+
+       /* clear the condition */
+       write_csr(dd, CCE_CTRL, ctrl_bits);
+
+       /* wait for the condition to clear */
+       timeout = jiffies + msecs_to_jiffies(CCE_STATUS_TIMEOUT);
+       while (1) {
+               reg = read_csr(dd, CCE_STATUS);
+               if ((reg & status_bits) == 0)
+                       return;
+               if (time_after(jiffies, timeout)) {
+                       dd_dev_err(dd,
+                                  "Timeout waiting for CceStatus to clear bits 0x%llx, remaining 0x%llx\n",
+                                  status_bits, reg & status_bits);
+                       return;
+               }
+               udelay(1);
+       }
+}
+
+/* set CCE CSRs to chip reset defaults */
+static void reset_cce_csrs(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /* CCE_REVISION read-only */
+       /* CCE_REVISION2 read-only */
+       /* CCE_CTRL - bits clear automatically */
+       /* CCE_STATUS read-only, use CceCtrl to clear */
+       clear_cce_status(dd, ALL_FROZE, CCE_CTRL_SPC_UNFREEZE_SMASK);
+       clear_cce_status(dd, ALL_TXE_PAUSE, CCE_CTRL_TXE_RESUME_SMASK);
+       clear_cce_status(dd, ALL_RXE_PAUSE, CCE_CTRL_RXE_RESUME_SMASK);
+       for (i = 0; i < CCE_NUM_SCRATCH; i++)
+               write_csr(dd, CCE_SCRATCH + (8 * i), 0);
+       /* CCE_ERR_STATUS read-only */
+       write_csr(dd, CCE_ERR_MASK, 0);
+       write_csr(dd, CCE_ERR_CLEAR, ~0ull);
+       /* CCE_ERR_FORCE leave alone */
+       for (i = 0; i < CCE_NUM_32_BIT_COUNTERS; i++)
+               write_csr(dd, CCE_COUNTER_ARRAY32 + (8 * i), 0);
+       write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_RESETCSR);
+       /* CCE_PCIE_CTRL leave alone */
+       for (i = 0; i < CCE_NUM_MSIX_VECTORS; i++) {
+               write_csr(dd, CCE_MSIX_TABLE_LOWER + (8 * i), 0);
+               write_csr(dd, CCE_MSIX_TABLE_UPPER + (8 * i),
+                         CCE_MSIX_TABLE_UPPER_RESETCSR);
+       }
+       for (i = 0; i < CCE_NUM_MSIX_PBAS; i++) {
+               /* CCE_MSIX_PBA read-only */
+               write_csr(dd, CCE_MSIX_INT_GRANTED, ~0ull);
+               write_csr(dd, CCE_MSIX_VEC_CLR_WITHOUT_INT, ~0ull);
+       }
+       for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++)
+               write_csr(dd, CCE_INT_MAP + (8 * i), 0);
+       for (i = 0; i < CCE_NUM_INT_CSRS; i++) {
+               /* CCE_INT_STATUS read-only */
+               write_csr(dd, CCE_INT_MASK + (8 * i), 0);
+               write_csr(dd, CCE_INT_CLEAR + (8 * i), ~0ull);
+               /* CCE_INT_FORCE leave alone */
+               /* CCE_INT_BLOCKED read-only */
+       }
+       for (i = 0; i < CCE_NUM_32_BIT_INT_COUNTERS; i++)
+               write_csr(dd, CCE_INT_COUNTER_ARRAY32 + (8 * i), 0);
+}
+
+/* set MISC CSRs to chip reset defaults */
+static void reset_misc_csrs(struct hfi1_devdata *dd)
+{
+       int i;
+
+       for (i = 0; i < 32; i++) {
+               write_csr(dd, MISC_CFG_RSA_R2 + (8 * i), 0);
+               write_csr(dd, MISC_CFG_RSA_SIGNATURE + (8 * i), 0);
+               write_csr(dd, MISC_CFG_RSA_MODULUS + (8 * i), 0);
+       }
+       /*
+        * MISC_CFG_SHA_PRELOAD leave alone - always reads 0 and can
+        * only be written in 128-byte chunks
+        */
+       /* init RSA engine to clear lingering errors */
+       write_csr(dd, MISC_CFG_RSA_CMD, 1);
+       write_csr(dd, MISC_CFG_RSA_MU, 0);
+       write_csr(dd, MISC_CFG_FW_CTRL, 0);
+       /* MISC_STS_8051_DIGEST read-only */
+       /* MISC_STS_SBM_DIGEST read-only */
+       /* MISC_STS_PCIE_DIGEST read-only */
+       /* MISC_STS_FAB_DIGEST read-only */
+       /* MISC_ERR_STATUS read-only */
+       write_csr(dd, MISC_ERR_MASK, 0);
+       write_csr(dd, MISC_ERR_CLEAR, ~0ull);
+       /* MISC_ERR_FORCE leave alone */
+}
+
+/* set TXE CSRs to chip reset defaults */
+static void reset_txe_csrs(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /*
+        * TXE Kernel CSRs
+        */
+       write_csr(dd, SEND_CTRL, 0);
+       __cm_reset(dd, 0);      /* reset CM internal state */
+       /* SEND_CONTEXTS read-only */
+       /* SEND_DMA_ENGINES read-only */
+       /* SEND_PIO_MEM_SIZE read-only */
+       /* SEND_DMA_MEM_SIZE read-only */
+       write_csr(dd, SEND_HIGH_PRIORITY_LIMIT, 0);
+       pio_reset_all(dd);      /* SEND_PIO_INIT_CTXT */
+       /* SEND_PIO_ERR_STATUS read-only */
+       write_csr(dd, SEND_PIO_ERR_MASK, 0);
+       write_csr(dd, SEND_PIO_ERR_CLEAR, ~0ull);
+       /* SEND_PIO_ERR_FORCE leave alone */
+       /* SEND_DMA_ERR_STATUS read-only */
+       write_csr(dd, SEND_DMA_ERR_MASK, 0);
+       write_csr(dd, SEND_DMA_ERR_CLEAR, ~0ull);
+       /* SEND_DMA_ERR_FORCE leave alone */
+       /* SEND_EGRESS_ERR_STATUS read-only */
+       write_csr(dd, SEND_EGRESS_ERR_MASK, 0);
+       write_csr(dd, SEND_EGRESS_ERR_CLEAR, ~0ull);
+       /* SEND_EGRESS_ERR_FORCE leave alone */
+       write_csr(dd, SEND_BTH_QP, 0);
+       write_csr(dd, SEND_STATIC_RATE_CONTROL, 0);
+       write_csr(dd, SEND_SC2VLT0, 0);
+       write_csr(dd, SEND_SC2VLT1, 0);
+       write_csr(dd, SEND_SC2VLT2, 0);
+       write_csr(dd, SEND_SC2VLT3, 0);
+       write_csr(dd, SEND_LEN_CHECK0, 0);
+       write_csr(dd, SEND_LEN_CHECK1, 0);
+       /* SEND_ERR_STATUS read-only */
+       write_csr(dd, SEND_ERR_MASK, 0);
+       write_csr(dd, SEND_ERR_CLEAR, ~0ull);
+       /* SEND_ERR_FORCE read-only */
+       for (i = 0; i < VL_ARB_LOW_PRIO_TABLE_SIZE; i++)
+               write_csr(dd, SEND_LOW_PRIORITY_LIST + (8 * i), 0);
+       for (i = 0; i < VL_ARB_HIGH_PRIO_TABLE_SIZE; i++)
+               write_csr(dd, SEND_HIGH_PRIORITY_LIST + (8 * i), 0);
+       for (i = 0; i < dd->chip_send_contexts / NUM_CONTEXTS_PER_SET; i++)
+               write_csr(dd, SEND_CONTEXT_SET_CTRL + (8 * i), 0);
+       for (i = 0; i < TXE_NUM_32_BIT_COUNTER; i++)
+               write_csr(dd, SEND_COUNTER_ARRAY32 + (8 * i), 0);
+       for (i = 0; i < TXE_NUM_64_BIT_COUNTER; i++)
+               write_csr(dd, SEND_COUNTER_ARRAY64 + (8 * i), 0);
+       write_csr(dd, SEND_CM_CTRL, SEND_CM_CTRL_RESETCSR);
+       write_csr(dd, SEND_CM_GLOBAL_CREDIT, SEND_CM_GLOBAL_CREDIT_RESETCSR);
+       /* SEND_CM_CREDIT_USED_STATUS read-only */
+       write_csr(dd, SEND_CM_TIMER_CTRL, 0);
+       write_csr(dd, SEND_CM_LOCAL_AU_TABLE0_TO3, 0);
+       write_csr(dd, SEND_CM_LOCAL_AU_TABLE4_TO7, 0);
+       write_csr(dd, SEND_CM_REMOTE_AU_TABLE0_TO3, 0);
+       write_csr(dd, SEND_CM_REMOTE_AU_TABLE4_TO7, 0);
+       for (i = 0; i < TXE_NUM_DATA_VL; i++)
+               write_csr(dd, SEND_CM_CREDIT_VL + (8 * i), 0);
+       write_csr(dd, SEND_CM_CREDIT_VL15, 0);
+       /* SEND_CM_CREDIT_USED_VL read-only */
+       /* SEND_CM_CREDIT_USED_VL15 read-only */
+       /* SEND_EGRESS_CTXT_STATUS read-only */
+       /* SEND_EGRESS_SEND_DMA_STATUS read-only */
+       write_csr(dd, SEND_EGRESS_ERR_INFO, ~0ull);
+       /* SEND_EGRESS_ERR_INFO read-only */
+       /* SEND_EGRESS_ERR_SOURCE read-only */
+
+       /*
+        * TXE Per-Context CSRs
+        */
+       for (i = 0; i < dd->chip_send_contexts; i++) {
+               write_kctxt_csr(dd, i, SEND_CTXT_CTRL, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_CTRL, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_RETURN_ADDR, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_FORCE, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_ERR_MASK, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_ERR_CLEAR, ~0ull);
+               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_ENABLE, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_VL, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_JOB_KEY, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_PARTITION_KEY, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_SLID, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_OPCODE, 0);
+       }
+
+       /*
+        * TXE Per-SDMA CSRs
+        */
+       for (i = 0; i < dd->chip_sdma_engines; i++) {
+               write_kctxt_csr(dd, i, SEND_DMA_CTRL, 0);
+               /* SEND_DMA_STATUS read-only */
+               write_kctxt_csr(dd, i, SEND_DMA_BASE_ADDR, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_LEN_GEN, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_TAIL, 0);
+               /* SEND_DMA_HEAD read-only */
+               write_kctxt_csr(dd, i, SEND_DMA_HEAD_ADDR, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_PRIORITY_THLD, 0);
+               /* SEND_DMA_IDLE_CNT read-only */
+               write_kctxt_csr(dd, i, SEND_DMA_RELOAD_CNT, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_DESC_CNT, 0);
+               /* SEND_DMA_DESC_FETCHED_CNT read-only */
+               /* SEND_DMA_ENG_ERR_STATUS read-only */
+               write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_MASK, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_CLEAR, ~0ull);
+               /* SEND_DMA_ENG_ERR_FORCE leave alone */
+               write_kctxt_csr(dd, i, SEND_DMA_CHECK_ENABLE, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_CHECK_VL, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_CHECK_JOB_KEY, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_CHECK_PARTITION_KEY, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_CHECK_SLID, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_CHECK_OPCODE, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_MEMORY, 0);
+       }
+}
+
+/*
+ * Expect on entry:
+ * o Packet ingress is disabled, i.e. RcvCtrl.RcvPortEnable == 0
+ */
+static void init_rbufs(struct hfi1_devdata *dd)
+{
+       u64 reg;
+       int count;
+
+       /*
+        * Wait for DMA to stop: RxRbufPktPending and RxPktInProgress are
+        * clear.
+        */
+       count = 0;
+       while (1) {
+               reg = read_csr(dd, RCV_STATUS);
+               if ((reg & (RCV_STATUS_RX_RBUF_PKT_PENDING_SMASK
+                           | RCV_STATUS_RX_PKT_IN_PROGRESS_SMASK)) == 0)
+                       break;
+               /*
+                * Give up after 1ms - maximum wait time.
+                *
+                * RBuf size is 148KiB.  Slowest possible is PCIe Gen1 x1 at
+                * 250MB/s bandwidth.  Lower rate to 66% for overhead to get:
+                *      148 KB / (66% * 250MB/s) = 920us
+                */
+               if (count++ > 500) {
+                       dd_dev_err(dd,
+                                  "%s: in-progress DMA not clearing: RcvStatus 0x%llx, continuing\n",
+                                  __func__, reg);
+                       break;
+               }
+               udelay(2); /* do not busy-wait the CSR */
+       }
+
+       /* start the init - expect RcvCtrl to be 0 */
+       write_csr(dd, RCV_CTRL, RCV_CTRL_RX_RBUF_INIT_SMASK);
+
+       /*
+        * Read to force the write of RcvCtrl.RxRbufInit.  There is a brief
+        * period after the write before RcvStatus.RxRbufInitDone is valid.
+        * The delay in the first run through the loop below is sufficient and
+        * required before the first read of RcvStatus.RxRbufInitDone.
+        */
+       read_csr(dd, RCV_CTRL);
+
+       /* wait for the init to finish */
+       count = 0;
+       while (1) {
+               /* delay is required first time through - see above */
+               udelay(2); /* do not busy-wait the CSR */
+               reg = read_csr(dd, RCV_STATUS);
+               if (reg & (RCV_STATUS_RX_RBUF_INIT_DONE_SMASK))
+                       break;
+
+               /* give up after 100us - slowest possible at 33MHz is 73us */
+               if (count++ > 50) {
+                       dd_dev_err(dd,
+                                  "%s: RcvStatus.RxRbufInit not set, continuing\n",
+                                  __func__);
+                       break;
+               }
+       }
+}
+
+/* set RXE CSRs to chip reset defaults */
+static void reset_rxe_csrs(struct hfi1_devdata *dd)
+{
+       int i, j;
+
+       /*
+        * RXE Kernel CSRs
+        */
+       write_csr(dd, RCV_CTRL, 0);
+       init_rbufs(dd);
+       /* RCV_STATUS read-only */
+       /* RCV_CONTEXTS read-only */
+       /* RCV_ARRAY_CNT read-only */
+       /* RCV_BUF_SIZE read-only */
+       write_csr(dd, RCV_BTH_QP, 0);
+       write_csr(dd, RCV_MULTICAST, 0);
+       write_csr(dd, RCV_BYPASS, 0);
+       write_csr(dd, RCV_VL15, 0);
+       /* this is a clear-down */
+       write_csr(dd, RCV_ERR_INFO,
+                 RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK);
+       /* RCV_ERR_STATUS read-only */
+       write_csr(dd, RCV_ERR_MASK, 0);
+       write_csr(dd, RCV_ERR_CLEAR, ~0ull);
+       /* RCV_ERR_FORCE leave alone */
+       for (i = 0; i < 32; i++)
+               write_csr(dd, RCV_QP_MAP_TABLE + (8 * i), 0);
+       for (i = 0; i < 4; i++)
+               write_csr(dd, RCV_PARTITION_KEY + (8 * i), 0);
+       for (i = 0; i < RXE_NUM_32_BIT_COUNTERS; i++)
+               write_csr(dd, RCV_COUNTER_ARRAY32 + (8 * i), 0);
+       for (i = 0; i < RXE_NUM_64_BIT_COUNTERS; i++)
+               write_csr(dd, RCV_COUNTER_ARRAY64 + (8 * i), 0);
+       for (i = 0; i < RXE_NUM_RSM_INSTANCES; i++) {
+               write_csr(dd, RCV_RSM_CFG + (8 * i), 0);
+               write_csr(dd, RCV_RSM_SELECT + (8 * i), 0);
+               write_csr(dd, RCV_RSM_MATCH + (8 * i), 0);
+       }
+       for (i = 0; i < 32; i++)
+               write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), 0);
+
+       /*
+        * RXE Kernel and User Per-Context CSRs
+        */
+       for (i = 0; i < dd->chip_rcv_contexts; i++) {
+               /* kernel */
+               write_kctxt_csr(dd, i, RCV_CTXT_CTRL, 0);
+               /* RCV_CTXT_STATUS read-only */
+               write_kctxt_csr(dd, i, RCV_EGR_CTRL, 0);
+               write_kctxt_csr(dd, i, RCV_TID_CTRL, 0);
+               write_kctxt_csr(dd, i, RCV_KEY_CTRL, 0);
+               write_kctxt_csr(dd, i, RCV_HDR_ADDR, 0);
+               write_kctxt_csr(dd, i, RCV_HDR_CNT, 0);
+               write_kctxt_csr(dd, i, RCV_HDR_ENT_SIZE, 0);
+               write_kctxt_csr(dd, i, RCV_HDR_SIZE, 0);
+               write_kctxt_csr(dd, i, RCV_HDR_TAIL_ADDR, 0);
+               write_kctxt_csr(dd, i, RCV_AVAIL_TIME_OUT, 0);
+               write_kctxt_csr(dd, i, RCV_HDR_OVFL_CNT, 0);
+
+               /* user */
+               /* RCV_HDR_TAIL read-only */
+               write_uctxt_csr(dd, i, RCV_HDR_HEAD, 0);
+               /* RCV_EGR_INDEX_TAIL read-only */
+               write_uctxt_csr(dd, i, RCV_EGR_INDEX_HEAD, 0);
+               /* RCV_EGR_OFFSET_TAIL read-only */
+               for (j = 0; j < RXE_NUM_TID_FLOWS; j++) {
+                       write_uctxt_csr(dd, i,
+                                       RCV_TID_FLOW_TABLE + (8 * j), 0);
+               }
+       }
+}
+
+/*
+ * Set sc2vl tables.
+ *
+ * They power on to zeros, so to avoid send context errors
+ * they need to be set:
+ *
+ * SC 0-7 -> VL 0-7 (respectively)
+ * SC 15  -> VL 15
+ * otherwise
+ *        -> VL 0
+ */
+static void init_sc2vl_tables(struct hfi1_devdata *dd)
+{
+       int i;
+       /* init per architecture spec, constrained by hardware capability */
+
+       /* HFI maps sent packets */
+       write_csr(dd, SEND_SC2VLT0, SC2VL_VAL(
+               0,
+               0, 0, 1, 1,
+               2, 2, 3, 3,
+               4, 4, 5, 5,
+               6, 6, 7, 7));
+       write_csr(dd, SEND_SC2VLT1, SC2VL_VAL(
+               1,
+               8, 0, 9, 0,
+               10, 0, 11, 0,
+               12, 0, 13, 0,
+               14, 0, 15, 15));
+       write_csr(dd, SEND_SC2VLT2, SC2VL_VAL(
+               2,
+               16, 0, 17, 0,
+               18, 0, 19, 0,
+               20, 0, 21, 0,
+               22, 0, 23, 0));
+       write_csr(dd, SEND_SC2VLT3, SC2VL_VAL(
+               3,
+               24, 0, 25, 0,
+               26, 0, 27, 0,
+               28, 0, 29, 0,
+               30, 0, 31, 0));
+
+       /* DC maps received packets */
+       write_csr(dd, DCC_CFG_SC_VL_TABLE_15_0, DC_SC_VL_VAL(
+               15_0,
+               0, 0, 1, 1,  2, 2,  3, 3,  4, 4,  5, 5,  6, 6,  7,  7,
+               8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 15));
+       write_csr(dd, DCC_CFG_SC_VL_TABLE_31_16, DC_SC_VL_VAL(
+               31_16,
+               16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23, 0,
+               24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31, 0));
+
+       /* initialize the cached sc2vl values consistently with h/w */
+       for (i = 0; i < 32; i++) {
+               if (i < 8 || i == 15)
+                       *((u8 *)(dd->sc2vl) + i) = (u8)i;
+               else
+                       *((u8 *)(dd->sc2vl) + i) = 0;
+       }
+}
+
+/*
+ * Read chip sizes and then reset parts to sane, disabled, values.  We cannot
+ * depend on the chip going through a power-on reset - a driver may be loaded
+ * and unloaded many times.
+ *
+ * Do not write any CSR values to the chip in this routine - there may be
+ * a reset following the (possible) FLR in this routine.
+ *
+ */
+static void init_chip(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /*
+        * Put the HFI CSRs in a known state.
+        * Combine this with a DC reset.
+        *
+        * Stop the device from doing anything while we do a
+        * reset.  We know there are no other active users of
+        * the device since we are now in charge.  Turn off
+        * all outbound and inbound traffic and make sure
+        * the device does not generate any interrupts.
+        */
+
+       /* disable send contexts and SDMA engines */
+       write_csr(dd, SEND_CTRL, 0);
+       for (i = 0; i < dd->chip_send_contexts; i++)
+               write_kctxt_csr(dd, i, SEND_CTXT_CTRL, 0);
+       for (i = 0; i < dd->chip_sdma_engines; i++)
+               write_kctxt_csr(dd, i, SEND_DMA_CTRL, 0);
+       /* disable port (turn off RXE inbound traffic) and contexts */
+       write_csr(dd, RCV_CTRL, 0);
+       for (i = 0; i < dd->chip_rcv_contexts; i++)
+               write_kctxt_csr(dd, i, RCV_CTXT_CTRL, 0);
+       /* mask all interrupt sources */
+       for (i = 0; i < CCE_NUM_INT_CSRS; i++)
+               write_csr(dd, CCE_INT_MASK + (8 * i), 0ull);
+
+       /*
+        * DC Reset: do a full DC reset before the register clear.
+        * A recommended length of time to hold is one CSR read,
+        * so reread the CceDcCtrl.  Then, hold the DC in reset
+        * across the clear.
+        */
+       write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_DC_RESET_SMASK);
+       (void)read_csr(dd, CCE_DC_CTRL);
+
+       if (use_flr) {
+               /*
+                * A FLR will reset the SPC core and part of the PCIe.
+                * The parts that need to be restored have already been
+                * saved.
+                */
+               dd_dev_info(dd, "Resetting CSRs with FLR\n");
+
+               /* do the FLR, the DC reset will remain */
+               hfi1_pcie_flr(dd);
+
+               /* restore command and BARs */
+               restore_pci_variables(dd);
+
+               if (is_ax(dd)) {
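+                       /* on Ax silicon, repeat the FLR and PCI restore */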
+                       dd_dev_info(dd, "Resetting CSRs with FLR\n");
+                       hfi1_pcie_flr(dd);
+                       restore_pci_variables(dd);
+               }
+       } else {
+               dd_dev_info(dd, "Resetting CSRs with writes\n");
+               reset_cce_csrs(dd);
+               reset_txe_csrs(dd);
+               reset_rxe_csrs(dd);
+               reset_misc_csrs(dd);
+       }
+       /* clear the DC reset */
+       write_csr(dd, CCE_DC_CTRL, 0);
+
+       /* Set the LED off */
+       setextled(dd, 0);
+
+       /*
+        * Clear the QSFP reset.
+        * An FLR enforces a 0 on all out pins. The driver does not touch
+        * ASIC_QSFPn_OUT otherwise.  This leaves RESET_N low and
+        * anything plugged constantly in reset, if it pays attention
+        * to RESET_N.
+        * Prime examples of this are optical cables. Set all pins high.
+        * I2CCLK and I2CDAT will change per direction, and INT_N and
+        * MODPRS_N are input only and their value is ignored.
+        */
+       write_csr(dd, ASIC_QSFP1_OUT, 0x1f);
+       write_csr(dd, ASIC_QSFP2_OUT, 0x1f);
+       init_chip_resources(dd);
+}
+
+static void init_early_variables(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /* assign link credit variables */
+       dd->vau = CM_VAU;
+       dd->link_credits = CM_GLOBAL_CREDITS;
+       if (is_ax(dd))
+               dd->link_credits--;
+       dd->vcu = cu_to_vcu(hfi1_cu);
+       /* enough room for 8 MAD packets plus header - 17K */
+       dd->vl15_init = (8 * (2048 + 128)) / vau_to_au(dd->vau);
+       if (dd->vl15_init > dd->link_credits)
+               dd->vl15_init = dd->link_credits;
+
+       write_uninitialized_csrs_and_memories(dd);
+
+       if (HFI1_CAP_IS_KSET(PKEY_CHECK))
+               for (i = 0; i < dd->num_pports; i++) {
+                       struct hfi1_pportdata *ppd = &dd->pport[i];
+
+                       set_partition_keys(ppd);
+               }
+       init_sc2vl_tables(dd);
+}
+
+static void init_kdeth_qp(struct hfi1_devdata *dd)
+{
+       /* user changed the KDETH_QP */
+       if (kdeth_qp != 0 && kdeth_qp >= 0xff) {
+               /* out of range or illegal value */
+               dd_dev_err(dd, "Invalid KDETH queue pair prefix, ignoring");
+               kdeth_qp = 0;
+       }
+       if (kdeth_qp == 0)      /* not set, or failed range check */
+               kdeth_qp = DEFAULT_KDETH_QP;
+
+       write_csr(dd, SEND_BTH_QP,
+                 (kdeth_qp & SEND_BTH_QP_KDETH_QP_MASK) <<
+                 SEND_BTH_QP_KDETH_QP_SHIFT);
+
+       write_csr(dd, RCV_BTH_QP,
+                 (kdeth_qp & RCV_BTH_QP_KDETH_QP_MASK) <<
+                 RCV_BTH_QP_KDETH_QP_SHIFT);
+}
+
+/**
+ * init_qpmap_table
+ * @dd - device data
+ * @first_ctxt - first context
+ * @last_ctxt - last context
+ *
+ * This routine sets the qpn mapping table that
+ * is indexed by qpn[8:1].
+ *
+ * The routine will round robin the 256 settings
+ * from first_ctxt to last_ctxt.
+ *
+ * The first/last looks ahead to having specialized
+ * receive contexts for mgmt and bypass.  Normal
+ * verbs traffic is assumed to be on a range
+ * of receive contexts.
+ */
+static void init_qpmap_table(struct hfi1_devdata *dd,
+                            u32 first_ctxt,
+                            u32 last_ctxt)
+{
+       u64 reg = 0;
+       u64 regno = RCV_QP_MAP_TABLE;
+       int i;
+       u64 ctxt = first_ctxt;
+
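+       /* each 64-bit map CSR packs 8 one-byte entries: 256 entries, 32 CSRs */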
+       for (i = 0; i < 256; i++) {
+               reg |= ctxt << (8 * (i % 8));
+               ctxt++;
+               if (ctxt > last_ctxt)
+                       ctxt = first_ctxt;
+               if (i % 8 == 7) {
+                       write_csr(dd, regno, reg);
+                       reg = 0;
+                       regno += 8;
+               }
+       }
+
+       add_rcvctrl(dd, RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK
+                       | RCV_CTRL_RCV_BYPASS_ENABLE_SMASK);
+}
+
+struct rsm_map_table {
+       u64 map[NUM_MAP_REGS];
+       unsigned int used;
+};
+
+struct rsm_rule_data {
+       u8 offset;
+       u8 pkt_type;
+       u32 field1_off;
+       u32 field2_off;
+       u32 index1_off;
+       u32 index1_width;
+       u32 index2_off;
+       u32 index2_width;
+       u32 mask1;
+       u32 value1;
+       u32 mask2;
+       u32 value2;
+};
+
+/*
+ * Return an initialized RMT map table for users to fill in.  OK if it
+ * returns NULL, indicating no table.
+ */
+static struct rsm_map_table *alloc_rsm_map_table(struct hfi1_devdata *dd)
+{
+       struct rsm_map_table *rmt;
+       u8 rxcontext = is_ax(dd) ? 0 : 0xff;  /* 0 is default if a0 ver. */
+
+       rmt = kmalloc(sizeof(*rmt), GFP_KERNEL);
+       if (rmt) {
+               memset(rmt->map, rxcontext, sizeof(rmt->map));
+               rmt->used = 0;
+       }
+
+       return rmt;
+}
+
+/*
+ * Write the final RMT map table to the chip and free the table.  OK if
+ * table is NULL.
+ */
+static void complete_rsm_map_table(struct hfi1_devdata *dd,
+                                  struct rsm_map_table *rmt)
+{
+       int i;
+
+       if (rmt) {
+               /* write table to chip */
+               for (i = 0; i < NUM_MAP_REGS; i++)
+                       write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), rmt->map[i]);
+
+               /* enable RSM */
+               add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK);
+       }
+}
+
+/*
+ * Add a receive side mapping rule.
+ */
+static void add_rsm_rule(struct hfi1_devdata *dd, u8 rule_index,
+                        struct rsm_rule_data *rrd)
+{
+       write_csr(dd, RCV_RSM_CFG + (8 * rule_index),
+                 (u64)rrd->offset << RCV_RSM_CFG_OFFSET_SHIFT |
+                 1ull << rule_index | /* enable bit */
+                 (u64)rrd->pkt_type << RCV_RSM_CFG_PACKET_TYPE_SHIFT);
+       write_csr(dd, RCV_RSM_SELECT + (8 * rule_index),
+                 (u64)rrd->field1_off << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT |
+                 (u64)rrd->field2_off << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT |
+                 (u64)rrd->index1_off << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT |
+                 (u64)rrd->index1_width << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT |
+                 (u64)rrd->index2_off << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT |
+                 (u64)rrd->index2_width << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT);
+       write_csr(dd, RCV_RSM_MATCH + (8 * rule_index),
+                 (u64)rrd->mask1 << RCV_RSM_MATCH_MASK1_SHIFT |
+                 (u64)rrd->value1 << RCV_RSM_MATCH_VALUE1_SHIFT |
+                 (u64)rrd->mask2 << RCV_RSM_MATCH_MASK2_SHIFT |
+                 (u64)rrd->value2 << RCV_RSM_MATCH_VALUE2_SHIFT);
+}
+
+/* return the number of RSM map table entries that will be used for QOS */
+static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp,
+                          unsigned int *np)
+{
+       int i;
+       unsigned int m, n;
+       u8 max_by_vl = 0;
+
+       /* is QOS active at all? */
+       if (dd->n_krcv_queues <= MIN_KERNEL_KCTXTS ||
+           num_vls == 1 ||
+           krcvqsset <= 1)
+               goto no_qos;
+
+       /* determine bits for qpn */
+       for (i = 0; i < min_t(unsigned int, num_vls, krcvqsset); i++)
+               if (krcvqs[i] > max_by_vl)
+                       max_by_vl = krcvqs[i];
+       if (max_by_vl > 32)
+               goto no_qos;
+       m = ilog2(__roundup_pow_of_two(max_by_vl));
+
+       /* determine bits for vl */
+       n = ilog2(__roundup_pow_of_two(num_vls));
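+       /* e.g. max_by_vl = 4, num_vls = 8 -> m = 2, n = 3, 32 RMT entries */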
+
+       /* reject if too much is used */
+       if ((m + n) > 7)
+               goto no_qos;
+
+       if (mp)
+               *mp = m;
+       if (np)
+               *np = n;
+
+       return 1 << (m + n);
+
+no_qos:
+       if (mp)
+               *mp = 0;
+       if (np)
+               *np = 0;
+       return 0;
+}
+
+/**
+ * init_qos - init RX qos
+ * @dd - device data
+ * @rmt - RSM map table
+ *
+ * This routine initializes Rule 0 and the RSM map table to implement
+ * quality of service (qos).
+ *
+ * If all of the limit tests succeed, qos is applied based on the array
+ * interpretation of krcvqs where entry 0 is VL0.
+ *
+ * The number of vl bits (n) and the number of qpn bits (m) are computed to
+ * feed both the RSM map table and the single rule.
+ */
+static void init_qos(struct hfi1_devdata *dd, struct rsm_map_table *rmt)
+{
+       struct rsm_rule_data rrd;
+       unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m;
+       unsigned int rmt_entries;
+       u64 reg;
+
+       if (!rmt)
+               goto bail;
+       rmt_entries = qos_rmt_entries(dd, &m, &n);
+       if (rmt_entries == 0)
+               goto bail;
+       qpns_per_vl = 1 << m;
+
+       /* enough room in the map table? */
+       rmt_entries = 1 << (m + n);
+       if (rmt->used + rmt_entries >= NUM_MAP_ENTRIES)
+               goto bail;
+
+       /* add qos entries to the RSM map table */
+       for (i = 0, ctxt = FIRST_KERNEL_KCTXT; i < num_vls; i++) {
+               unsigned tctxt;
+
+               for (qpn = 0, tctxt = ctxt;
+                    krcvqs[i] && qpn < qpns_per_vl; qpn++) {
+                       unsigned idx, regoff, regidx;
+
+                       /* generate the index the hardware will produce */
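+                       /* qpn sits above the n vl bits; XOR with i merges them */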
+                       idx = rmt->used + ((qpn << n) ^ i);
+                       regoff = (idx % 8) * 8;
+                       regidx = idx / 8;
+                       /* replace default with context number */
+                       reg = rmt->map[regidx];
+                       reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK
+                               << regoff);
+                       reg |= (u64)(tctxt++) << regoff;
+                       rmt->map[regidx] = reg;
+                       if (tctxt == ctxt + krcvqs[i])
+                               tctxt = ctxt;
+               }
+               ctxt += krcvqs[i];
+       }
+
+       rrd.offset = rmt->used;
+       rrd.pkt_type = 2;
+       rrd.field1_off = LRH_BTH_MATCH_OFFSET;
+       rrd.field2_off = LRH_SC_MATCH_OFFSET;
+       rrd.index1_off = LRH_SC_SELECT_OFFSET;
+       rrd.index1_width = n;
+       rrd.index2_off = QPN_SELECT_OFFSET;
+       rrd.index2_width = m + n;
+       rrd.mask1 = LRH_BTH_MASK;
+       rrd.value1 = LRH_BTH_VALUE;
+       rrd.mask2 = LRH_SC_MASK;
+       rrd.value2 = LRH_SC_VALUE;
+
+       /* add rule 0 */
+       add_rsm_rule(dd, 0, &rrd);
+
+       /* mark RSM map entries as used */
+       rmt->used += rmt_entries;
+       /* map everything else to the mcast/err/vl15 context */
+       init_qpmap_table(dd, HFI1_CTRL_CTXT, HFI1_CTRL_CTXT);
+       dd->qos_shift = n + 1;
+       return;
+bail:
+       dd->qos_shift = 1;
+       init_qpmap_table(dd, FIRST_KERNEL_KCTXT, dd->n_krcv_queues - 1);
+}
+
+static void init_user_fecn_handling(struct hfi1_devdata *dd,
+                                   struct rsm_map_table *rmt)
+{
+       struct rsm_rule_data rrd;
+       u64 reg;
+       int i, idx, regoff, regidx;
+       u8 offset;
+
+       /* there needs to be enough room in the map table */
+       if (rmt->used + dd->num_user_contexts >= NUM_MAP_ENTRIES) {
+               dd_dev_err(dd, "User FECN handling disabled - too many user contexts allocated\n");
+               return;
+       }
+
+       /*
+        * RSM will extract the destination context as an index into the
+        * map table.  The destination contexts are a sequential block
+        * in the range first_user_ctxt...num_rcv_contexts-1 (inclusive).
+        * Map entries are accessed as offset + extracted value.  Adjust
+        * the added offset so this sequence can be placed anywhere in
+        * the table - as long as the entries themselves do not wrap.
+        * There are only enough bits in offset for the table size, so
+        * start with that to allow for a "negative" offset.
+        */
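+       /* wraps the possibly negative rmt->used - first_user_ctxt into a u8 */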
+       offset = (u8)(NUM_MAP_ENTRIES + (int)rmt->used -
+                                               (int)dd->first_user_ctxt);
+
+       for (i = dd->first_user_ctxt, idx = rmt->used;
+                               i < dd->num_rcv_contexts; i++, idx++) {
+               /* replace with identity mapping */
+               regoff = (idx % 8) * 8;
+               regidx = idx / 8;
+               reg = rmt->map[regidx];
+               reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK << regoff);
+               reg |= (u64)i << regoff;
+               rmt->map[regidx] = reg;
+       }
+
+       /*
+        * For RSM intercept of Expected FECN packets:
+        * o packet type 0 - expected
+        * o match on F (bit 95), using select/match 1, and
+        * o match on SH (bit 133), using select/match 2.
+        *
+        * Use index 1 to extract the 8-bit receive context from DestQP
+        * (start at bit 64).  Use that as the RSM map table index.
+        */
+       rrd.offset = offset;
+       rrd.pkt_type = 0;
+       rrd.field1_off = 95;
+       rrd.field2_off = 133;
+       rrd.index1_off = 64;
+       rrd.index1_width = 8;
+       rrd.index2_off = 0;
+       rrd.index2_width = 0;
+       rrd.mask1 = 1;
+       rrd.value1 = 1;
+       rrd.mask2 = 1;
+       rrd.value2 = 1;
+
+       /* add rule 1 */
+       add_rsm_rule(dd, 1, &rrd);
+
+       rmt->used += dd->num_user_contexts;
+}
+
+static void init_rxe(struct hfi1_devdata *dd)
+{
+       struct rsm_map_table *rmt;
+
+       /* enable all receive errors */
+       write_csr(dd, RCV_ERR_MASK, ~0ull);
+
+       rmt = alloc_rsm_map_table(dd);
+       /* set up QOS, including the QPN map table */
+       init_qos(dd, rmt);
+       init_user_fecn_handling(dd, rmt);
+       complete_rsm_map_table(dd, rmt);
+       kfree(rmt);
+
+       /*
+        * make sure RcvCtrl.RcvWcb <= PCIe Device Control
+        * Register Max_Payload_Size (PCI_EXP_DEVCTL in Linux PCIe config
+        * space, PciCfgCap2.MaxPayloadSize in HFI).  There is only one
+        * invalid configuration: RcvCtrl.RcvWcb set to its max of 256 and
+        * Max_PayLoad_Size set to its minimum of 128.
+        *
+        * Presently, RcvCtrl.RcvWcb is not modified from its default of 0
+        * (64 bytes).  Max_Payload_Size is possibly modified upward in
+        * tune_pcie_caps() which is called after this routine.
+        */
+}
+
+static void init_other(struct hfi1_devdata *dd)
+{
+       /* enable all CCE errors */
+       write_csr(dd, CCE_ERR_MASK, ~0ull);
+       /* enable *some* Misc errors */
+       write_csr(dd, MISC_ERR_MASK, DRIVER_MISC_MASK);
+       /* enable all DC errors, except LCB */
+       write_csr(dd, DCC_ERR_FLG_EN, ~0ull);
+       write_csr(dd, DC_DC8051_ERR_EN, ~0ull);
+}
+
+/*
+ * Fill out the given AU table using the given CU.  A CU is defined in terms
+ * of AUs.  The table is an encoding: given the index, how many AUs does that
+ * represent?
+ *
+ * NOTE: Assumes that the register layout is the same for the
+ * local and remote tables.
+ */
+static void assign_cm_au_table(struct hfi1_devdata *dd, u32 cu,
+                              u32 csr0to3, u32 csr4to7)
+{
+       write_csr(dd, csr0to3,
+                 0ull << SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE0_SHIFT |
+                 1ull << SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE1_SHIFT |
+                 2ull * cu <<
+                 SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE2_SHIFT |
+                 4ull * cu <<
+                 SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE3_SHIFT);
+       write_csr(dd, csr4to7,
+                 8ull * cu <<
+                 SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE4_SHIFT |
+                 16ull * cu <<
+                 SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE5_SHIFT |
+                 32ull * cu <<
+                 SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE6_SHIFT |
+                 64ull * cu <<
+                 SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE7_SHIFT);
+}
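+
+/*
+ * For illustration only (cu value chosen arbitrarily, not from the original
+ * source): with cu = 8, the table filled in above encodes
+ * {0, 1, 16, 32, 64, 128, 256, 512} AUs for indices 0 through 7.
+ */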
+
+static void assign_local_cm_au_table(struct hfi1_devdata *dd, u8 vcu)
+{
+       assign_cm_au_table(dd, vcu_to_cu(vcu), SEND_CM_LOCAL_AU_TABLE0_TO3,
+                          SEND_CM_LOCAL_AU_TABLE4_TO7);
+}
+
+void assign_remote_cm_au_table(struct hfi1_devdata *dd, u8 vcu)
+{
+       assign_cm_au_table(dd, vcu_to_cu(vcu), SEND_CM_REMOTE_AU_TABLE0_TO3,
+                          SEND_CM_REMOTE_AU_TABLE4_TO7);
+}
+
+static void init_txe(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /* enable all PIO, SDMA, general, and Egress errors */
+       write_csr(dd, SEND_PIO_ERR_MASK, ~0ull);
+       write_csr(dd, SEND_DMA_ERR_MASK, ~0ull);
+       write_csr(dd, SEND_ERR_MASK, ~0ull);
+       write_csr(dd, SEND_EGRESS_ERR_MASK, ~0ull);
+
+       /* enable all per-context and per-SDMA engine errors */
+       for (i = 0; i < dd->chip_send_contexts; i++)
+               write_kctxt_csr(dd, i, SEND_CTXT_ERR_MASK, ~0ull);
+       for (i = 0; i < dd->chip_sdma_engines; i++)
+               write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_MASK, ~0ull);
+
+       /* set the local CU to AU mapping */
+       assign_local_cm_au_table(dd, dd->vcu);
+
+       /*
+        * Set reasonable default for Credit Return Timer
+        * Don't set on Simulator - causes it to choke.
+        */
+       if (dd->icode != ICODE_FUNCTIONAL_SIMULATOR)
+               write_csr(dd, SEND_CM_TIMER_CTRL, HFI1_CREDIT_RETURN_RATE);
+}
+
+int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt, u16 jkey)
+{
+       struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
+       unsigned sctxt;
+       int ret = 0;
+       u64 reg;
+
+       if (!rcd || !rcd->sc) {
+               ret = -EINVAL;
+               goto done;
+       }
+       sctxt = rcd->sc->hw_context;
+       reg = SEND_CTXT_CHECK_JOB_KEY_MASK_SMASK | /* mask is always 1's */
+               ((jkey & SEND_CTXT_CHECK_JOB_KEY_VALUE_MASK) <<
+                SEND_CTXT_CHECK_JOB_KEY_VALUE_SHIFT);
+       /* JOB_KEY_ALLOW_PERMISSIVE is not allowed by default */
+       if (HFI1_CAP_KGET_MASK(rcd->flags, ALLOW_PERM_JKEY))
+               reg |= SEND_CTXT_CHECK_JOB_KEY_ALLOW_PERMISSIVE_SMASK;
+       write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_JOB_KEY, reg);
+       /*
+        * Enable send-side J_KEY integrity check, unless this is A0 h/w
+        */
+       if (!is_ax(dd)) {
+               reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
+               reg |= SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
+               write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
+       }
+
+       /* Enable J_KEY check on receive context. */
+       reg = RCV_KEY_CTRL_JOB_KEY_ENABLE_SMASK |
+               ((jkey & RCV_KEY_CTRL_JOB_KEY_VALUE_MASK) <<
+                RCV_KEY_CTRL_JOB_KEY_VALUE_SHIFT);
+       write_kctxt_csr(dd, ctxt, RCV_KEY_CTRL, reg);
+done:
+       return ret;
+}
+
+int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt)
+{
+       struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
+       unsigned sctxt;
+       int ret = 0;
+       u64 reg;
+
+       if (!rcd || !rcd->sc) {
+               ret = -EINVAL;
+               goto done;
+       }
+       sctxt = rcd->sc->hw_context;
+       write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_JOB_KEY, 0);
+       /*
+        * Disable send-side J_KEY integrity check, unless this is A0 h/w.
+        * This check would not have been enabled for A0 h/w, see
+        * set_ctxt_jkey().
+        */
+       if (!is_ax(dd)) {
+               reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
+               reg &= ~SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
+               write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
+       }
+       /* Turn off the J_KEY on the receive side */
+       write_kctxt_csr(dd, ctxt, RCV_KEY_CTRL, 0);
+done:
+       return ret;
+}
+
+int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey)
+{
+       struct hfi1_ctxtdata *rcd;
+       unsigned sctxt;
+       int ret = 0;
+       u64 reg;
+
+       if (ctxt < dd->num_rcv_contexts) {
+               rcd = dd->rcd[ctxt];
+       } else {
+               ret = -EINVAL;
+               goto done;
+       }
+       if (!rcd || !rcd->sc) {
+               ret = -EINVAL;
+               goto done;
+       }
+       sctxt = rcd->sc->hw_context;
+       reg = ((u64)pkey & SEND_CTXT_CHECK_PARTITION_KEY_VALUE_MASK) <<
+               SEND_CTXT_CHECK_PARTITION_KEY_VALUE_SHIFT;
+       write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_PARTITION_KEY, reg);
+       reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
+       reg |= SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK;
+       reg &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK;
+       write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
+done:
+       return ret;
+}
+
+int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt)
+{
+       struct hfi1_ctxtdata *rcd;
+       unsigned sctxt;
+       int ret = 0;
+       u64 reg;
+
+       if (ctxt < dd->num_rcv_contexts) {
+               rcd = dd->rcd[ctxt];
+       } else {
+               ret = -EINVAL;
+               goto done;
+       }
+       if (!rcd || !rcd->sc) {
+               ret = -EINVAL;
+               goto done;
+       }
+       sctxt = rcd->sc->hw_context;
+       reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
+       reg &= ~SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK;
+       write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
+       write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_PARTITION_KEY, 0);
+done:
+       return ret;
+}
+
+/*
+ * Start doing the clean up of the chip. Our clean up happens in multiple
+ * stages and this is just the first.
+ */
+void hfi1_start_cleanup(struct hfi1_devdata *dd)
+{
+       aspm_exit(dd);
+       free_cntrs(dd);
+       free_rcverr(dd);
+       clean_up_interrupts(dd);
+       finish_chip_resources(dd);
+}
+
+#define HFI_BASE_GUID(dev) \
+       ((dev)->base_guid & ~(1ULL << GUID_HFI_INDEX_SHIFT))
+
+/*
+ * Information can be shared between the two HFIs on the same ASIC
+ * in the same OS.  This function finds the peer device and sets
+ * up a shared structure.
+ */
+static int init_asic_data(struct hfi1_devdata *dd)
+{
+       unsigned long flags;
+       struct hfi1_devdata *tmp, *peer = NULL;
+       int ret = 0;
+
+       spin_lock_irqsave(&hfi1_devs_lock, flags);
+       /* Find our peer device */
+       list_for_each_entry(tmp, &hfi1_dev_list, list) {
+               if ((HFI_BASE_GUID(dd) == HFI_BASE_GUID(tmp)) &&
+                   dd->unit != tmp->unit) {
+                       peer = tmp;
+                       break;
+               }
+       }
+
+       if (peer) {
+               dd->asic_data = peer->asic_data;
+       } else {
+               dd->asic_data = kzalloc(sizeof(*dd->asic_data), GFP_KERNEL);
+               if (!dd->asic_data) {
+                       ret = -ENOMEM;
+                       goto done;
+               }
+               mutex_init(&dd->asic_data->asic_resource_mutex);
+       }
+       dd->asic_data->dds[dd->hfi1_id] = dd; /* self back-pointer */
+
+done:
+       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+       return ret;
+}
+
+/*
+ * Set dd->boardname.  Use a generic name if a name is not returned from
+ * EFI variable space.
+ *
+ * Return 0 on success, -ENOMEM if space could not be allocated.
+ */
+static int obtain_boardname(struct hfi1_devdata *dd)
+{
+       /* generic board description */
+       const char generic[] =
+               "Intel Omni-Path Host Fabric Interface Adapter 100 Series";
+       unsigned long size;
+       int ret;
+
+       ret = read_hfi1_efi_var(dd, "description", &size,
+                               (void **)&dd->boardname);
+       if (ret) {
+               dd_dev_info(dd, "Board description not found\n");
+               /* use generic description */
+               dd->boardname = kstrdup(generic, GFP_KERNEL);
+               if (!dd->boardname)
+                       return -ENOMEM;
+       }
+       return 0;
+}
+
+/*
+ * Check the interrupt registers to make sure that they are mapped correctly.
+ * It is intended to help the user identify any mismapping by the VMM when
+ * the driver is running in a VM. This function should only be called before
+ * interrupts are set up.
+ *
+ * Return 0 on success, -EINVAL on failure.
+ */
+static int check_int_registers(struct hfi1_devdata *dd)
+{
+       u64 reg;
+       u64 all_bits = ~(u64)0;
+       u64 mask;
+
+       /* Clear CceIntMask[0] to avoid raising any interrupts */
+       mask = read_csr(dd, CCE_INT_MASK);
+       write_csr(dd, CCE_INT_MASK, 0ull);
+       reg = read_csr(dd, CCE_INT_MASK);
+       if (reg)
+               goto err_exit;
+
+       /* Clear all interrupt status bits */
+       write_csr(dd, CCE_INT_CLEAR, all_bits);
+       reg = read_csr(dd, CCE_INT_STATUS);
+       if (reg)
+               goto err_exit;
+
+       /* Set all interrupt status bits */
+       write_csr(dd, CCE_INT_FORCE, all_bits);
+       reg = read_csr(dd, CCE_INT_STATUS);
+       if (reg != all_bits)
+               goto err_exit;
+
+       /* Restore the interrupt mask */
+       write_csr(dd, CCE_INT_CLEAR, all_bits);
+       write_csr(dd, CCE_INT_MASK, mask);
+
+       return 0;
+err_exit:
+       write_csr(dd, CCE_INT_MASK, mask);
+       dd_dev_err(dd, "Interrupt registers not properly mapped by VMM\n");
+       return -EINVAL;
+}
+
+/**
+ * Allocate and initialize the device structure for the hfi.
+ * @pdev: the pci_dev for hfi1_ib device
+ * @ent: pci_device_id struct for this dev
+ *
+ * Also allocates, initializes, and returns the devdata struct for this
+ * device instance
+ *
+ * This is global, and is called directly at init to set up the
+ * chip-specific function pointers for later use.
+ */
+struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
+                                 const struct pci_device_id *ent)
+{
+       struct hfi1_devdata *dd;
+       struct hfi1_pportdata *ppd;
+       u64 reg;
+       int i, ret;
+       static const char * const inames[] = { /* implementation names */
+               "RTL silicon",
+               "RTL VCS simulation",
+               "RTL FPGA emulation",
+               "Functional simulator"
+       };
+       struct pci_dev *parent = pdev->bus->self;
+
+       dd = hfi1_alloc_devdata(pdev, NUM_IB_PORTS *
+                               sizeof(struct hfi1_pportdata));
+       if (IS_ERR(dd))
+               goto bail;
+       ppd = dd->pport;
+       for (i = 0; i < dd->num_pports; i++, ppd++) {
+               int vl;
+               /* init common fields */
+               hfi1_init_pportdata(pdev, ppd, dd, 0, 1);
+               /* DC supports 4 link widths */
+               ppd->link_width_supported =
+                       OPA_LINK_WIDTH_1X | OPA_LINK_WIDTH_2X |
+                       OPA_LINK_WIDTH_3X | OPA_LINK_WIDTH_4X;
+               ppd->link_width_downgrade_supported =
+                       ppd->link_width_supported;
+               /* start out enabling only 4X */
+               ppd->link_width_enabled = OPA_LINK_WIDTH_4X;
+               ppd->link_width_downgrade_enabled =
+                                       ppd->link_width_downgrade_supported;
+               /* link width active is 0 when link is down */
+               /* link width downgrade active is 0 when link is down */
+
+               if (num_vls < HFI1_MIN_VLS_SUPPORTED ||
+                   num_vls > HFI1_MAX_VLS_SUPPORTED) {
+                       hfi1_early_err(&pdev->dev,
+                                      "Invalid num_vls %u, using %u VLs\n",
+                                   num_vls, HFI1_MAX_VLS_SUPPORTED);
+                       num_vls = HFI1_MAX_VLS_SUPPORTED;
+               }
+               ppd->vls_supported = num_vls;
+               ppd->vls_operational = ppd->vls_supported;
+               ppd->actual_vls_operational = ppd->vls_supported;
+               /* Set the default MTU. */
+               for (vl = 0; vl < num_vls; vl++)
+                       dd->vld[vl].mtu = hfi1_max_mtu;
+               dd->vld[15].mtu = MAX_MAD_PACKET;
+               /*
+                * Set the initial values to reasonable default, will be set
+                * for real when link is up.
+                */
+               ppd->lstate = IB_PORT_DOWN;
+               ppd->overrun_threshold = 0x4;
+               ppd->phy_error_threshold = 0xf;
+               ppd->port_crc_mode_enabled = link_crc_mask;
+               /* initialize supported LTP CRC mode */
+               ppd->port_ltp_crc_mode = cap_to_port_ltp(link_crc_mask) << 8;
+               /* initialize enabled LTP CRC mode */
+               ppd->port_ltp_crc_mode |= cap_to_port_ltp(link_crc_mask) << 4;
+               /* start in offline */
+               ppd->host_link_state = HLS_DN_OFFLINE;
+               init_vl_arb_caches(ppd);
+               ppd->last_pstate = 0xff; /* invalid value */
+       }
+
+       dd->link_default = HLS_DN_POLL;
+
+       /*
+        * Do remaining PCIe setup and save PCIe values in dd.
+        * Any error printing is already done by the init code.
+        * On return, we have the chip mapped.
+        */
+       ret = hfi1_pcie_ddinit(dd, pdev, ent);
+       if (ret < 0)
+               goto bail_free;
+
+       /* verify that reads actually work, save revision for reset check */
+       dd->revision = read_csr(dd, CCE_REVISION);
+       if (dd->revision == ~(u64)0) {
+               dd_dev_err(dd, "cannot read chip CSRs\n");
+               ret = -EINVAL;
+               goto bail_cleanup;
+       }
+       dd->majrev = (dd->revision >> CCE_REVISION_CHIP_REV_MAJOR_SHIFT)
+                       & CCE_REVISION_CHIP_REV_MAJOR_MASK;
+       dd->minrev = (dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT)
+                       & CCE_REVISION_CHIP_REV_MINOR_MASK;
+
+       /*
+        * Check interrupt registers mapping if the driver has no access to
+        * the upstream component. In this case, it is likely that the driver
+        * is running in a VM.
+        */
+       if (!parent) {
+               ret = check_int_registers(dd);
+               if (ret)
+                       goto bail_cleanup;
+       }
+
+       /*
+        * obtain the hardware ID - NOT related to unit, which is a
+        * software enumeration
+        */
+       reg = read_csr(dd, CCE_REVISION2);
+       dd->hfi1_id = (reg >> CCE_REVISION2_HFI_ID_SHIFT)
+                                       & CCE_REVISION2_HFI_ID_MASK;
+       /* the variable size will remove unwanted bits */
+       dd->icode = reg >> CCE_REVISION2_IMPL_CODE_SHIFT;
+       dd->irev = reg >> CCE_REVISION2_IMPL_REVISION_SHIFT;
+       dd_dev_info(dd, "Implementation: %s, revision 0x%x\n",
+                   dd->icode < ARRAY_SIZE(inames) ?
+                   inames[dd->icode] : "unknown", (int)dd->irev);
+
+       /* speeds the hardware can support */
+       dd->pport->link_speed_supported = OPA_LINK_SPEED_25G;
+       /* speeds allowed to run at */
+       dd->pport->link_speed_enabled = dd->pport->link_speed_supported;
+       /* give a reasonable active value, will be set on link up */
+       dd->pport->link_speed_active = OPA_LINK_SPEED_25G;
+
+       dd->chip_rcv_contexts = read_csr(dd, RCV_CONTEXTS);
+       dd->chip_send_contexts = read_csr(dd, SEND_CONTEXTS);
+       dd->chip_sdma_engines = read_csr(dd, SEND_DMA_ENGINES);
+       dd->chip_pio_mem_size = read_csr(dd, SEND_PIO_MEM_SIZE);
+       dd->chip_sdma_mem_size = read_csr(dd, SEND_DMA_MEM_SIZE);
+       /* fix up link widths for emulation _p */
+       ppd = dd->pport;
+       if (dd->icode == ICODE_FPGA_EMULATION && is_emulator_p(dd)) {
+               ppd->link_width_supported =
+                       ppd->link_width_enabled =
+                       ppd->link_width_downgrade_supported =
+                       ppd->link_width_downgrade_enabled =
+                               OPA_LINK_WIDTH_1X;
+       }
+       /* ensure num_vls isn't larger than the number of sdma engines */
+       if (HFI1_CAP_IS_KSET(SDMA) && num_vls > dd->chip_sdma_engines) {
+               dd_dev_err(dd, "num_vls %u too large, using %u VLs\n",
+                          num_vls, dd->chip_sdma_engines);
+               num_vls = dd->chip_sdma_engines;
+               ppd->vls_supported = dd->chip_sdma_engines;
+               ppd->vls_operational = ppd->vls_supported;
+       }
+
+       /*
+        * Convert the ns parameter to the 64 * cclocks used in the CSR.
+        * Limit the max if larger than the field holds.  If timeout is
+        * non-zero, then the calculated field will be at least 1.
+        *
+        * Must be after icode is set up - the cclock rate depends
+        * on knowing the hardware being used.
+        */
+       dd->rcv_intr_timeout_csr = ns_to_cclock(dd, rcv_intr_timeout) / 64;
+       if (dd->rcv_intr_timeout_csr >
+                       RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK)
+               dd->rcv_intr_timeout_csr =
+                       RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK;
+       else if (dd->rcv_intr_timeout_csr == 0 && rcv_intr_timeout)
+               dd->rcv_intr_timeout_csr = 1;
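+       /*
+        * Illustrative arithmetic (values are assumptions, not from the
+        * original source): on RTL silicon with a ~1242 ps cclock, a
+        * rcv_intr_timeout of 840 ns is roughly 676 cclocks, so the
+        * reload field would be programmed to about 676 / 64 = 10.
+        */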
+
+       /* needs to be done before we look for the peer device */
+       read_guid(dd);
+
+       /* set up shared ASIC data with peer device */
+       ret = init_asic_data(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       /* obtain chip sizes, reset chip CSRs */
+       init_chip(dd);
+
+       /* read in the PCIe link speed information */
+       ret = pcie_speeds(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       /* Needs to be called before hfi1_firmware_init */
+       get_platform_config(dd);
+
+       /* read in firmware */
+       ret = hfi1_firmware_init(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       /*
+        * In general, the PCIe Gen3 transition must occur after the
+        * chip has been idled (so it won't initiate any PCIe transactions
+        * e.g. an interrupt) and before the driver changes any registers
+        * (the transition will reset the registers).
+        *
+        * In particular, place this call after:
+        * - init_chip()     - the chip will not initiate any PCIe transactions
+        * - pcie_speeds()   - reads the current link speed
+        * - hfi1_firmware_init() - the needed firmware is ready to be
+        *                          downloaded
+        */
+       ret = do_pcie_gen3_transition(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       /* start setting dd values and adjusting CSRs */
+       init_early_variables(dd);
+
+       parse_platform_config(dd);
+
+       ret = obtain_boardname(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       snprintf(dd->boardversion, BOARD_VERS_MAX,
+                "ChipABI %u.%u, ChipRev %u.%u, SW Compat %llu\n",
+                HFI1_CHIP_VERS_MAJ, HFI1_CHIP_VERS_MIN,
+                (u32)dd->majrev,
+                (u32)dd->minrev,
+                (dd->revision >> CCE_REVISION_SW_SHIFT)
+                   & CCE_REVISION_SW_MASK);
+
+       /*
+        * The real cpu mask is part of the affinity struct but has to be
+        * initialized earlier than the rest of the affinity struct because it
+        * is needed to calculate the number of user contexts in
+        * set_up_context_variables(). However, hfi1_dev_affinity_init(),
+        * which initializes the rest of the affinity struct members,
+        * depends on set_up_context_variables() for the number of kernel
+        * contexts, so it cannot be called before set_up_context_variables().
+        */
+       ret = init_real_cpu_mask(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       ret = set_up_context_variables(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       /* set initial RXE CSRs */
+       init_rxe(dd);
+       /* set initial TXE CSRs */
+       init_txe(dd);
+       /* set initial non-RXE, non-TXE CSRs */
+       init_other(dd);
+       /* set up KDETH QP prefix in both RX and TX CSRs */
+       init_kdeth_qp(dd);
+
+       hfi1_dev_affinity_init(dd);
+
+       /* send contexts must be set up before receive contexts */
+       ret = init_send_contexts(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       ret = hfi1_create_ctxts(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       dd->rcvhdrsize = DEFAULT_RCVHDRSIZE;
+       /*
+        * rcd[0] is guaranteed to be valid by this point. Also, all
+        * contexts are using the same value, as per the module parameter.
+        */
+       dd->rhf_offset = dd->rcd[0]->rcvhdrqentsize - sizeof(u64) / sizeof(u32);
+
+       ret = init_pervl_scs(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       /* sdma init */
+       for (i = 0; i < dd->num_pports; ++i) {
+               ret = sdma_init(dd, i);
+               if (ret)
+                       goto bail_cleanup;
+       }
+
+       /* use contexts created by hfi1_create_ctxts */
+       ret = set_up_interrupts(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       /* set up LCB access - must be after set_up_interrupts() */
+       init_lcb_access(dd);
+
+       snprintf(dd->serial, SERIAL_MAX, "0x%08llx\n",
+                dd->base_guid & 0xFFFFFF);
+
+       dd->oui1 = dd->base_guid >> 56 & 0xFF;
+       dd->oui2 = dd->base_guid >> 48 & 0xFF;
+       dd->oui3 = dd->base_guid >> 40 & 0xFF;
+
+       ret = load_firmware(dd); /* asymmetric with dispose_firmware() */
+       if (ret)
+               goto bail_clear_intr;
+       check_fabric_firmware_versions(dd);
+
+       thermal_init(dd);
+
+       ret = init_cntrs(dd);
+       if (ret)
+               goto bail_clear_intr;
+
+       ret = init_rcverr(dd);
+       if (ret)
+               goto bail_free_cntrs;
+
+       ret = eprom_init(dd);
+       if (ret)
+               goto bail_free_rcverr;
+
+       goto bail;
+
+bail_free_rcverr:
+       free_rcverr(dd);
+bail_free_cntrs:
+       free_cntrs(dd);
+bail_clear_intr:
+       clean_up_interrupts(dd);
+bail_cleanup:
+       hfi1_pcie_ddcleanup(dd);
+bail_free:
+       hfi1_free_devdata(dd);
+       dd = ERR_PTR(ret);
+bail:
+       return dd;
+}
+
+static u16 delay_cycles(struct hfi1_pportdata *ppd, u32 desired_egress_rate,
+                       u32 dw_len)
+{
+       u32 delta_cycles;
+       u32 current_egress_rate = ppd->current_egress_rate;
+       /* rates here are in units of 10^6 bits/sec */
+
+       if (desired_egress_rate == -1)
+               return 0; /* shouldn't happen */
+
+       if (desired_egress_rate >= current_egress_rate)
+               return 0; /* we can't help it go faster, only slower */
+
+       delta_cycles = egress_cycles(dw_len * 4, desired_egress_rate) -
+                       egress_cycles(dw_len * 4, current_egress_rate);
+
+       return (u16)delta_cycles;
+}
+
+/**
+ * create_pbc - build a pbc for transmission
+ * @flags: special case flags or-ed in built pbc
+ * @srate: static rate
+ * @vl: vl
+ * @dwlen: dword length (header words + data words + pbc words)
+ *
+ * Create a PBC with the given flags, rate, VL, and length.
+ *
+ * NOTE: The PBC created will not insert any HCRC - all callers but one are
+ * for verbs, which does not use this PSM feature.  The lone other caller
+ * is for the diagnostic interface which calls this if the user does not
+ * supply their own PBC.
+ */
+u64 create_pbc(struct hfi1_pportdata *ppd, u64 flags, int srate_mbs, u32 vl,
+              u32 dw_len)
+{
+       u64 pbc, delay = 0;
+
+       if (unlikely(srate_mbs))
+               delay = delay_cycles(ppd, srate_mbs, dw_len);
+
+       pbc = flags
+               | (delay << PBC_STATIC_RATE_CONTROL_COUNT_SHIFT)
+               | ((u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT)
+               | (vl & PBC_VL_MASK) << PBC_VL_SHIFT
+               | (dw_len & PBC_LENGTH_DWS_MASK)
+                       << PBC_LENGTH_DWS_SHIFT;
+
+       return pbc;
+}
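+
+/*
+ * Example usage (illustrative only, not from the original source): a verbs
+ * send with no static rate limiting might build its PBC as
+ *     pbc = create_pbc(ppd, 0, 0, vl, hdr_dws + data_dws + 2);
+ * where the trailing "+ 2" accounts for the PBC dwords included in the
+ * length.
+ */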
+
+#define SBUS_THERMAL    0x4f
+#define SBUS_THERM_MONITOR_MODE 0x1
+
+#define THERM_FAILURE(dev, ret, reason) \
+       dd_dev_err((dd),                                                \
+                  "Thermal sensor initialization failed: %s (%d)\n",   \
+                  (reason), (ret))
+
+/*
+ * Initialize the thermal sensor.
+ *
+ * After initialization, enable polling of the thermal sensor through the
+ * SBus interface. For this to work, the SBus Master firmware has to be
+ * loaded, because the HW polling logic uses SBus interrupts, which are not
+ * supported by the default firmware. Otherwise, no data will be returned
+ * through the ASIC_STS_THERM CSR.
+ */
+static int thermal_init(struct hfi1_devdata *dd)
+{
+       int ret = 0;
+
+       if (dd->icode != ICODE_RTL_SILICON ||
+           check_chip_resource(dd, CR_THERM_INIT, NULL))
+               return ret;
+
+       ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT);
+       if (ret) {
+               THERM_FAILURE(dd, ret, "Acquire SBus");
+               return ret;
+       }
+
+       dd_dev_info(dd, "Initializing thermal sensor\n");
+       /* Disable polling of thermal readings */
+       write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x0);
+       msleep(100);
+       /* Thermal Sensor Initialization */
+       /*    Step 1: Reset the Thermal SBus Receiver */
+       ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0,
+                               RESET_SBUS_RECEIVER, 0);
+       if (ret) {
+               THERM_FAILURE(dd, ret, "Bus Reset");
+               goto done;
+       }
+       /*    Step 2: Set Reset bit in Thermal block */
+       ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0,
+                               WRITE_SBUS_RECEIVER, 0x1);
+       if (ret) {
+               THERM_FAILURE(dd, ret, "Therm Block Reset");
+               goto done;
+       }
+       /*    Step 3: Write clock divider value (100MHz -> 2MHz) */
+       ret = sbus_request_slow(dd, SBUS_THERMAL, 0x1,
+                               WRITE_SBUS_RECEIVER, 0x32);
+       if (ret) {
+               THERM_FAILURE(dd, ret, "Write Clock Div");
+               goto done;
+       }
+       /*    Step 4: Select temperature mode */
+       ret = sbus_request_slow(dd, SBUS_THERMAL, 0x3,
+                               WRITE_SBUS_RECEIVER,
+                               SBUS_THERM_MONITOR_MODE);
+       if (ret) {
+               THERM_FAILURE(dd, ret, "Write Mode Sel");
+               goto done;
+       }
+       /*    Step 5: De-assert block reset and start conversion */
+       ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0,
+                               WRITE_SBUS_RECEIVER, 0x2);
+       if (ret) {
+               THERM_FAILURE(dd, ret, "Write Reset Deassert");
+               goto done;
+       }
+       /*    Step 5.1: Wait for first conversion (21.5ms per spec) */
+       msleep(22);
+
+       /* Enable polling of thermal readings */
+       write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x1);
+
+       /* Set initialized flag */
+       ret = acquire_chip_resource(dd, CR_THERM_INIT, 0);
+       if (ret)
+               THERM_FAILURE(dd, ret, "Unable to set thermal init flag");
+
+done:
+       release_chip_resource(dd, CR_SBUS);
+       return ret;
+}
+
+static void handle_temp_err(struct hfi1_devdata *dd)
+{
+       struct hfi1_pportdata *ppd = &dd->pport[0];
+       /*
+        * Thermal Critical Interrupt
+        * Put the device into forced freeze mode, take link down to
+        * offline, and put DC into reset.
+        */
+       dd_dev_emerg(dd,
+                    "Critical temperature reached! Forcing device into freeze mode!\n");
+       dd->flags |= HFI1_FORCED_FREEZE;
+       start_freeze_handling(ppd, FREEZE_SELF | FREEZE_ABORT);
+       /*
+        * Shut DC down as much and as quickly as possible.
+        *
+        * Step 1: Take the link down to OFFLINE. This will cause the
+        *         8051 to put the Serdes in reset. However, we don't want to
+        *         go through the entire link state machine since we want to
+        *         shutdown ASAP. Furthermore, this is not a graceful shutdown
+        *         but rather an attempt to save the chip.
+        *         Code below is almost the same as quiet_serdes() but avoids
+        *         all the extra work and the sleeps.
+        */
+       ppd->driver_link_ready = 0;
+       ppd->link_enabled = 0;
+       set_physical_link_state(dd, (OPA_LINKDOWN_REASON_SMA_DISABLED << 8) |
+                               PLS_OFFLINE);
+       /*
+        * Step 2: Shutdown LCB and 8051
+        *         After shutdown, do not restore DC_CFG_RESET value.
+        */
+       dc_shutdown(dd);
+}
diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h
new file mode 100644 (file)
index 0000000..66a3279
--- /dev/null
@@ -0,0 +1,1374 @@
+#ifndef _CHIP_H
+#define _CHIP_H
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * This file contains all of the defines that are specific to the HFI chip
+ */
+
+/* sizes */
+#define CCE_NUM_MSIX_VECTORS 256
+#define CCE_NUM_INT_CSRS 12
+#define CCE_NUM_INT_MAP_CSRS 96
+#define NUM_INTERRUPT_SOURCES 768
+#define RXE_NUM_CONTEXTS 160
+#define RXE_PER_CONTEXT_SIZE 0x1000    /* 4k */
+#define RXE_NUM_TID_FLOWS 32
+#define RXE_NUM_DATA_VL 8
+#define TXE_NUM_CONTEXTS 160
+#define TXE_NUM_SDMA_ENGINES 16
+#define NUM_CONTEXTS_PER_SET 8
+#define VL_ARB_HIGH_PRIO_TABLE_SIZE 16
+#define VL_ARB_LOW_PRIO_TABLE_SIZE 16
+#define VL_ARB_TABLE_SIZE 16
+#define TXE_NUM_32_BIT_COUNTER 7
+#define TXE_NUM_64_BIT_COUNTER 30
+#define TXE_NUM_DATA_VL 8
+#define TXE_PIO_SIZE (32 * 0x100000)   /* 32 MB */
+#define PIO_BLOCK_SIZE 64                      /* bytes */
+#define SDMA_BLOCK_SIZE 64                     /* bytes */
+#define RCV_BUF_BLOCK_SIZE 64               /* bytes */
+#define PIO_CMASK 0x7ff        /* counter mask for free and fill counters */
+#define MAX_EAGER_ENTRIES    2048      /* max receive eager entries */
+#define MAX_TID_PAIR_ENTRIES 1024      /* max receive expected pairs */
+/*
+ * Virtual? Allocation Unit, defined as AU = 8*2^vAU, 64 bytes, AU is fixed
+ * at 64 bytes for all generation one devices
+ */
+#define CM_VAU 3
+/* HFI link credit count, AKA receive buffer depth (RBUF_DEPTH) */
+#define CM_GLOBAL_CREDITS 0x940
+/* Number of PKey entries in the HW */
+#define MAX_PKEY_VALUES 16
+
+#include "chip_registers.h"
+
+#define RXE_PER_CONTEXT_USER   (RXE + RXE_PER_CONTEXT_OFFSET)
+#define TXE_PIO_SEND (TXE + TXE_PIO_SEND_OFFSET)
+
+/* PBC flags */
+#define PBC_INTR               BIT_ULL(31)
+#define PBC_DC_INFO_SHIFT      (30)
+#define PBC_DC_INFO            BIT_ULL(PBC_DC_INFO_SHIFT)
+#define PBC_TEST_EBP           BIT_ULL(29)
+#define PBC_PACKET_BYPASS      BIT_ULL(28)
+#define PBC_CREDIT_RETURN      BIT_ULL(25)
+#define PBC_INSERT_BYPASS_ICRC BIT_ULL(24)
+#define PBC_TEST_BAD_ICRC      BIT_ULL(23)
+#define PBC_FECN               BIT_ULL(22)
+
+/* PbcInsertHcrc field settings */
+#define PBC_IHCRC_LKDETH 0x0   /* insert @ local KDETH offset */
+#define PBC_IHCRC_GKDETH 0x1   /* insert @ global KDETH offset */
+#define PBC_IHCRC_NONE   0x2   /* no HCRC inserted */
+
+/* PBC fields */
+#define PBC_STATIC_RATE_CONTROL_COUNT_SHIFT 32
+#define PBC_STATIC_RATE_CONTROL_COUNT_MASK 0xffffull
+#define PBC_STATIC_RATE_CONTROL_COUNT_SMASK \
+       (PBC_STATIC_RATE_CONTROL_COUNT_MASK << \
+       PBC_STATIC_RATE_CONTROL_COUNT_SHIFT)
+
+#define PBC_INSERT_HCRC_SHIFT 26
+#define PBC_INSERT_HCRC_MASK 0x3ull
+#define PBC_INSERT_HCRC_SMASK \
+       (PBC_INSERT_HCRC_MASK << PBC_INSERT_HCRC_SHIFT)
+
+#define PBC_VL_SHIFT 12
+#define PBC_VL_MASK 0xfull
+#define PBC_VL_SMASK (PBC_VL_MASK << PBC_VL_SHIFT)
+
+#define PBC_LENGTH_DWS_SHIFT 0
+#define PBC_LENGTH_DWS_MASK 0xfffull
+#define PBC_LENGTH_DWS_SMASK \
+       (PBC_LENGTH_DWS_MASK << PBC_LENGTH_DWS_SHIFT)
+
+/* Credit Return Fields */
+#define CR_COUNTER_SHIFT 0
+#define CR_COUNTER_MASK 0x7ffull
+#define CR_COUNTER_SMASK (CR_COUNTER_MASK << CR_COUNTER_SHIFT)
+
+#define CR_STATUS_SHIFT 11
+#define CR_STATUS_MASK 0x1ull
+#define CR_STATUS_SMASK (CR_STATUS_MASK << CR_STATUS_SHIFT)
+
+#define CR_CREDIT_RETURN_DUE_TO_PBC_SHIFT 12
+#define CR_CREDIT_RETURN_DUE_TO_PBC_MASK 0x1ull
+#define CR_CREDIT_RETURN_DUE_TO_PBC_SMASK \
+       (CR_CREDIT_RETURN_DUE_TO_PBC_MASK << \
+       CR_CREDIT_RETURN_DUE_TO_PBC_SHIFT)
+
+#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SHIFT 13
+#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_MASK 0x1ull
+#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SMASK \
+       (CR_CREDIT_RETURN_DUE_TO_THRESHOLD_MASK << \
+       CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SHIFT)
+
+#define CR_CREDIT_RETURN_DUE_TO_ERR_SHIFT 14
+#define CR_CREDIT_RETURN_DUE_TO_ERR_MASK 0x1ull
+#define CR_CREDIT_RETURN_DUE_TO_ERR_SMASK \
+       (CR_CREDIT_RETURN_DUE_TO_ERR_MASK << \
+       CR_CREDIT_RETURN_DUE_TO_ERR_SHIFT)
+
+#define CR_CREDIT_RETURN_DUE_TO_FORCE_SHIFT 15
+#define CR_CREDIT_RETURN_DUE_TO_FORCE_MASK 0x1ull
+#define CR_CREDIT_RETURN_DUE_TO_FORCE_SMASK \
+       (CR_CREDIT_RETURN_DUE_TO_FORCE_MASK << \
+       CR_CREDIT_RETURN_DUE_TO_FORCE_SHIFT)
+
+/* interrupt source numbers */
+#define IS_GENERAL_ERR_START     0
+#define IS_SDMAENG_ERR_START    16
+#define IS_SENDCTXT_ERR_START   32
+#define IS_SDMA_START          192 /* includes SDmaProgress,SDmaIdle */
+#define IS_VARIOUS_START               240
+#define IS_DC_START                    248
+#define IS_RCVAVAIL_START              256
+#define IS_RCVURGENT_START             416
+#define IS_SENDCREDIT_START            576
+#define IS_RESERVED_START              736
+#define IS_MAX_SOURCES         768
+
+/* derived interrupt source values */
+#define IS_GENERAL_ERR_END             IS_SDMAENG_ERR_START
+#define IS_SDMAENG_ERR_END             IS_SENDCTXT_ERR_START
+#define IS_SENDCTXT_ERR_END            IS_SDMA_START
+#define IS_SDMA_END                    IS_VARIOUS_START
+#define IS_VARIOUS_END         IS_DC_START
+#define IS_DC_END                      IS_RCVAVAIL_START
+#define IS_RCVAVAIL_END                IS_RCVURGENT_START
+#define IS_RCVURGENT_END               IS_SENDCREDIT_START
+#define IS_SENDCREDIT_END              IS_RESERVED_START
+#define IS_RESERVED_END                IS_MAX_SOURCES
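+
+/*
+ * Illustrative reading (an assumption, not stated above): the per-context
+ * sources appear to be contiguous, so e.g. the RcvAvail source for receive
+ * context 3 would be IS_RCVAVAIL_START + 3 = 259.
+ */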
+
+/* absolute interrupt numbers for QSFP1Int and QSFP2Int */
+#define QSFP1_INT              242
+#define QSFP2_INT              243
+
+/* DCC_CFG_PORT_CONFIG logical link states */
+#define LSTATE_DOWN    0x1
+#define LSTATE_INIT    0x2
+#define LSTATE_ARMED   0x3
+#define LSTATE_ACTIVE  0x4
+
+/* DC8051_STS_CUR_STATE port values (physical link states) */
+#define PLS_DISABLED                      0x30
+#define PLS_OFFLINE                               0x90
+#define PLS_OFFLINE_QUIET                         0x90
+#define PLS_OFFLINE_PLANNED_DOWN_INFORM           0x91
+#define PLS_OFFLINE_READY_TO_QUIET_LT     0x92
+#define PLS_OFFLINE_REPORT_FAILURE                0x93
+#define PLS_OFFLINE_READY_TO_QUIET_BCC    0x94
+#define PLS_POLLING                               0x20
+#define PLS_POLLING_QUIET                         0x20
+#define PLS_POLLING_ACTIVE                        0x21
+#define PLS_CONFIGPHY                     0x40
+#define PLS_CONFIGPHY_DEBOUCE             0x40
+#define PLS_CONFIGPHY_ESTCOMM             0x41
+#define PLS_CONFIGPHY_ESTCOMM_TXRX_HUNT           0x42
+#define PLS_CONFIGPHY_ESTCOMM_LOCAL_COMPLETE   0x43
+#define PLS_CONFIGPHY_OPTEQ                       0x44
+#define PLS_CONFIGPHY_OPTEQ_OPTIMIZING    0x44
+#define PLS_CONFIGPHY_OPTEQ_LOCAL_COMPLETE        0x45
+#define PLS_CONFIGPHY_VERIFYCAP                   0x46
+#define PLS_CONFIGPHY_VERIFYCAP_EXCHANGE          0x46
+#define PLS_CONFIGPHY_VERIFYCAP_LOCAL_COMPLETE 0x47
+#define PLS_CONFIGLT                      0x48
+#define PLS_CONFIGLT_CONFIGURE            0x48
+#define PLS_CONFIGLT_LINK_TRANSFER_ACTIVE         0x49
+#define PLS_LINKUP                                0x50
+#define PLS_PHYTEST                               0xB0
+#define PLS_INTERNAL_SERDES_LOOPBACK      0xe1
+#define PLS_QUICK_LINKUP                          0xe2
+
+/* DC_DC8051_CFG_HOST_CMD_0.REQ_TYPE - 8051 host commands */
+#define HCMD_LOAD_CONFIG_DATA  0x01
+#define HCMD_READ_CONFIG_DATA  0x02
+#define HCMD_CHANGE_PHY_STATE  0x03
+#define HCMD_SEND_LCB_IDLE_MSG 0x04
+#define HCMD_MISC                 0x05
+#define HCMD_READ_LCB_IDLE_MSG 0x06
+#define HCMD_READ_LCB_CSR      0x07
+#define HCMD_WRITE_LCB_CSR     0x08
+#define HCMD_INTERFACE_TEST       0xff
+
+/* DC_DC8051_CFG_HOST_CMD_1.RETURN_CODE - 8051 host command return */
+#define HCMD_SUCCESS 2
+
+/* DC_DC8051_DBG_ERR_INFO_SET_BY_8051.ERROR - error flags */
+#define SPICO_ROM_FAILED               BIT(0)
+#define UNKNOWN_FRAME                  BIT(1)
+#define TARGET_BER_NOT_MET             BIT(2)
+#define FAILED_SERDES_INTERNAL_LOOPBACK        BIT(3)
+#define FAILED_SERDES_INIT             BIT(4)
+#define FAILED_LNI_POLLING             BIT(5)
+#define FAILED_LNI_DEBOUNCE            BIT(6)
+#define FAILED_LNI_ESTBCOMM            BIT(7)
+#define FAILED_LNI_OPTEQ               BIT(8)
+#define FAILED_LNI_VERIFY_CAP1         BIT(9)
+#define FAILED_LNI_VERIFY_CAP2         BIT(10)
+#define FAILED_LNI_CONFIGLT            BIT(11)
+#define HOST_HANDSHAKE_TIMEOUT         BIT(12)
+
+#define FAILED_LNI (FAILED_LNI_POLLING | FAILED_LNI_DEBOUNCE \
+                       | FAILED_LNI_ESTBCOMM | FAILED_LNI_OPTEQ \
+                       | FAILED_LNI_VERIFY_CAP1 \
+                       | FAILED_LNI_VERIFY_CAP2 \
+                       | FAILED_LNI_CONFIGLT | HOST_HANDSHAKE_TIMEOUT)
+
+/* DC_DC8051_DBG_ERR_INFO_SET_BY_8051.HOST_MSG - host message flags */
+#define HOST_REQ_DONE          BIT(0)
+#define BC_PWR_MGM_MSG         BIT(1)
+#define BC_SMA_MSG             BIT(2)
+#define BC_BCC_UNKNOWN_MSG     BIT(3)
+#define BC_IDLE_UNKNOWN_MSG    BIT(4)
+#define EXT_DEVICE_CFG_REQ     BIT(5)
+#define VERIFY_CAP_FRAME       BIT(6)
+#define LINKUP_ACHIEVED                BIT(7)
+#define LINK_GOING_DOWN                BIT(8)
+#define LINK_WIDTH_DOWNGRADED  BIT(9)
+
+/* DC_DC8051_CFG_EXT_DEV_1.REQ_TYPE - 8051 host requests */
+#define HREQ_LOAD_CONFIG       0x01
+#define HREQ_SAVE_CONFIG       0x02
+#define HREQ_READ_CONFIG       0x03
+#define HREQ_SET_TX_EQ_ABS     0x04
+#define HREQ_SET_TX_EQ_REL     0x05
+#define HREQ_ENABLE            0x06
+#define HREQ_CONFIG_DONE       0xfe
+#define HREQ_INTERFACE_TEST    0xff
+
+/* DC_DC8051_CFG_EXT_DEV_0.RETURN_CODE - 8051 host request return codes */
+#define HREQ_INVALID           0x01
+#define HREQ_SUCCESS           0x02
+#define HREQ_NOT_SUPPORTED             0x03
+#define HREQ_FEATURE_NOT_SUPPORTED     0x04 /* request specific feature */
+#define HREQ_REQUEST_REJECTED  0xfe
+#define HREQ_EXECUTION_ONGOING 0xff
+
+/* MISC host command functions */
+#define HCMD_MISC_REQUEST_LCB_ACCESS 0x1
+#define HCMD_MISC_GRANT_LCB_ACCESS   0x2
+
+/* idle flit message types */
+#define IDLE_PHYSICAL_LINK_MGMT 0x1
+#define IDLE_CRU                   0x2
+#define IDLE_SMA                   0x3
+#define IDLE_POWER_MGMT            0x4
+
+/* idle flit message send fields (both send and read) */
+#define IDLE_PAYLOAD_MASK 0xffffffffffull /* 40 bits */
+#define IDLE_PAYLOAD_SHIFT 8
+#define IDLE_MSG_TYPE_MASK 0xf
+#define IDLE_MSG_TYPE_SHIFT 0
+
+/* idle flit message read fields */
+#define READ_IDLE_MSG_TYPE_MASK 0xf
+#define READ_IDLE_MSG_TYPE_SHIFT 0
+
+/* SMA idle flit payload commands */
+#define SMA_IDLE_ARM   1
+#define SMA_IDLE_ACTIVE 2
+
+/* DC_DC8051_CFG_MODE.GENERAL bits */
+#define DISABLE_SELF_GUID_CHECK 0x2
+
+/*
+ * Eager buffer minimum and maximum sizes supported by the hardware.
+ * All power-of-two sizes in between are supported as well.
+ * MAX_EAGER_BUFFER_TOTAL is the maximum size of memory
+ * allocatable for Eager buffers by a single context. All others
+ * are limits for the RcvArray entries.
+ */
+#define MIN_EAGER_BUFFER       (4 * 1024)
+#define MAX_EAGER_BUFFER       (256 * 1024)
+#define MAX_EAGER_BUFFER_TOTAL (64 * (1 << 20)) /* max per ctxt 64MB */
+#define MAX_EXPECTED_BUFFER    (2048 * 1024)
+
+/*
+ * Receive expected base and count and eager base and count increment -
+ * the CSR fields hold multiples of this value.
+ */
+#define RCV_SHIFT 3
+#define RCV_INCREMENT BIT(RCV_SHIFT)
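+/*
+ * Illustrative reading (an assumption based on the description above): a
+ * count of 2048 eager entries would be programmed as
+ * 2048 >> RCV_SHIFT = 256 in the corresponding CSR field.
+ */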
+
+/*
+ * Receive header queue entry increment - the CSR holds multiples of
+ * this value.
+ */
+#define HDRQ_SIZE_SHIFT 5
+#define HDRQ_INCREMENT BIT(HDRQ_SIZE_SHIFT)
+
+/*
+ * Freeze handling flags
+ */
+#define FREEZE_ABORT     0x01  /* do not do recovery */
+#define FREEZE_SELF         0x02       /* initiate the freeze */
+#define FREEZE_LINK_DOWN 0x04  /* link is down */
+
+/*
+ * Chip implementation codes.
+ */
+#define ICODE_RTL_SILICON              0x00
+#define ICODE_RTL_VCS_SIMULATION       0x01
+#define ICODE_FPGA_EMULATION   0x02
+#define ICODE_FUNCTIONAL_SIMULATOR     0x03
+
+/*
+ * 8051 data memory size.
+ */
+#define DC8051_DATA_MEM_SIZE 0x1000
+
+/*
+ * 8051 firmware registers
+ */
+#define NUM_GENERAL_FIELDS 0x17
+#define NUM_LANE_FIELDS    0x8
+
+/* 8051 general register Field IDs */
+#define LINK_OPTIMIZATION_SETTINGS   0x00
+#define LINK_TUNING_PARAMETERS      0x02
+#define DC_HOST_COMM_SETTINGS       0x03
+#define TX_SETTINGS                 0x06
+#define VERIFY_CAP_LOCAL_PHY        0x07
+#define VERIFY_CAP_LOCAL_FABRIC             0x08
+#define VERIFY_CAP_LOCAL_LINK_WIDTH  0x09
+#define LOCAL_DEVICE_ID                     0x0a
+#define LOCAL_LNI_INFO              0x0c
+#define REMOTE_LNI_INFO              0x0d
+#define MISC_STATUS                 0x0e
+#define VERIFY_CAP_REMOTE_PHY       0x0f
+#define VERIFY_CAP_REMOTE_FABRIC     0x10
+#define VERIFY_CAP_REMOTE_LINK_WIDTH 0x11
+#define LAST_LOCAL_STATE_COMPLETE    0x12
+#define LAST_REMOTE_STATE_COMPLETE   0x13
+#define LINK_QUALITY_INFO            0x14
+#define REMOTE_DEVICE_ID            0x15
+#define LINK_DOWN_REASON            0x16
+
+/* 8051 lane specific register field IDs */
+#define TX_EQ_SETTINGS         0x00
+#define CHANNEL_LOSS_SETTINGS  0x05
+
+/* Lane ID for general configuration registers */
+#define GENERAL_CONFIG 4
+
+/* LINK_TUNING_PARAMETERS fields */
+#define TUNING_METHOD_SHIFT 24
+
+/* LINK_OPTIMIZATION_SETTINGS fields */
+#define ENABLE_EXT_DEV_CONFIG_SHIFT 24
+
+/* LOAD_DATA 8051 command shifts and fields */
+#define LOAD_DATA_FIELD_ID_SHIFT 40
+#define LOAD_DATA_FIELD_ID_MASK 0xfull
+#define LOAD_DATA_LANE_ID_SHIFT 32
+#define LOAD_DATA_LANE_ID_MASK 0xfull
+#define LOAD_DATA_DATA_SHIFT   0x0
+#define LOAD_DATA_DATA_MASK   0xffffffffull
+
+/* READ_DATA 8051 command shifts and fields */
+#define READ_DATA_FIELD_ID_SHIFT 40
+#define READ_DATA_FIELD_ID_MASK 0xffull
+#define READ_DATA_LANE_ID_SHIFT 32
+#define READ_DATA_LANE_ID_MASK 0xffull
+#define READ_DATA_DATA_SHIFT   0x0
+#define READ_DATA_DATA_MASK   0xffffffffull
+
+/* TX settings fields */
+#define ENABLE_LANE_TX_SHIFT           0
+#define ENABLE_LANE_TX_MASK            0xff
+#define TX_POLARITY_INVERSION_SHIFT    8
+#define TX_POLARITY_INVERSION_MASK     0xff
+#define RX_POLARITY_INVERSION_SHIFT    16
+#define RX_POLARITY_INVERSION_MASK     0xff
+#define MAX_RATE_SHIFT                 24
+#define MAX_RATE_MASK                  0xff
+
+/* verify capability PHY fields */
+#define CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT 0x4
+#define CONTINIOUS_REMOTE_UPDATE_SUPPORT_MASK  0x1
+#define POWER_MANAGEMENT_SHIFT                 0x0
+#define POWER_MANAGEMENT_MASK                  0xf
+
+/* 8051 lane register Field IDs */
+#define SPICO_FW_VERSION 0x7   /* SPICO firmware version */
+
+/* SPICO firmware version fields */
+#define SPICO_ROM_VERSION_SHIFT 0
+#define SPICO_ROM_VERSION_MASK 0xffff
+#define SPICO_ROM_PROD_ID_SHIFT 16
+#define SPICO_ROM_PROD_ID_MASK 0xffff
+
+/* verify capability fabric fields */
+#define VAU_SHIFT      0
+#define VAU_MASK       0x0007
+#define Z_SHIFT                3
+#define Z_MASK         0x0001
+#define VCU_SHIFT      4
+#define VCU_MASK       0x0007
+#define VL15BUF_SHIFT  8
+#define VL15BUF_MASK   0x0fff
+#define CRC_SIZES_SHIFT 20
+#define CRC_SIZES_MASK 0x7
+
+/* verify capability local link width fields */
+#define LINK_WIDTH_SHIFT 0             /* also for remote link width */
+#define LINK_WIDTH_MASK 0xffff         /* also for remote link width */
+#define LOCAL_FLAG_BITS_SHIFT 16
+#define LOCAL_FLAG_BITS_MASK 0xff
+#define MISC_CONFIG_BITS_SHIFT 24
+#define MISC_CONFIG_BITS_MASK 0xff
+
+/* verify capability remote link width fields */
+#define REMOTE_TX_RATE_SHIFT 16
+#define REMOTE_TX_RATE_MASK 0xff
+
+/* LOCAL_DEVICE_ID fields */
+#define LOCAL_DEVICE_REV_SHIFT 0
+#define LOCAL_DEVICE_REV_MASK 0xff
+#define LOCAL_DEVICE_ID_SHIFT 8
+#define LOCAL_DEVICE_ID_MASK 0xffff
+
+/* REMOTE_DEVICE_ID fields */
+#define REMOTE_DEVICE_REV_SHIFT 0
+#define REMOTE_DEVICE_REV_MASK 0xff
+#define REMOTE_DEVICE_ID_SHIFT 8
+#define REMOTE_DEVICE_ID_MASK 0xffff
+
+/* local LNI link width fields */
+#define ENABLE_LANE_RX_SHIFT 16
+#define ENABLE_LANE_RX_MASK  0xff
+
+/* mask, shift for reading 'mgmt_enabled' value from REMOTE_LNI_INFO field */
+#define MGMT_ALLOWED_SHIFT 23
+#define MGMT_ALLOWED_MASK 0x1
+
+/* mask, shift for 'link_quality' within LINK_QUALITY_INFO field */
+#define LINK_QUALITY_SHIFT 24
+#define LINK_QUALITY_MASK  0x7
+
+/*
+ * mask, shift for reading 'planned_down_remote_reason_code'
+ * from LINK_QUALITY_INFO field
+ */
+#define DOWN_REMOTE_REASON_SHIFT 16
+#define DOWN_REMOTE_REASON_MASK  0xff
+
+/* verify capability PHY power management bits */
+#define PWRM_BER_CONTROL       0x1
+#define PWRM_BANDWIDTH_CONTROL 0x2
+
+/* 8051 link down reasons */
+#define LDR_LINK_TRANSFER_ACTIVE_LOW   0xa
+#define LDR_RECEIVED_LINKDOWN_IDLE_MSG 0xb
+#define LDR_RECEIVED_HOST_OFFLINE_REQ  0xc
+
+/* verify capability fabric CRC size bits */
+enum {
+       CAP_CRC_14B = (1 << 0), /* 14b CRC */
+       CAP_CRC_48B = (1 << 1), /* 48b CRC */
+       CAP_CRC_12B_16B_PER_LANE = (1 << 2) /* 12b-16b per lane CRC */
+};
+
+#define SUPPORTED_CRCS (CAP_CRC_14B | CAP_CRC_48B)
+
+/* misc status version fields */
+#define STS_FM_VERSION_A_SHIFT 16
+#define STS_FM_VERSION_A_MASK  0xff
+#define STS_FM_VERSION_B_SHIFT 24
+#define STS_FM_VERSION_B_MASK  0xff
+
+/* LCB_CFG_CRC_MODE TX_VAL and RX_VAL CRC mode values */
+#define LCB_CRC_16B                    0x0     /* 16b CRC */
+#define LCB_CRC_14B                    0x1     /* 14b CRC */
+#define LCB_CRC_48B                    0x2     /* 48b CRC */
+#define LCB_CRC_12B_16B_PER_LANE       0x3     /* 12b-16b per lane CRC */
+
+/*
+ * the following enum is (almost) a copy/paste of the definition
+ * in the OPA spec, section 20.2.2.6.8 (PortInfo)
+ */
+enum {
+       PORT_LTP_CRC_MODE_NONE = 0,
+       PORT_LTP_CRC_MODE_14 = 1, /* 14-bit LTP CRC mode (optional) */
+       PORT_LTP_CRC_MODE_16 = 2, /* 16-bit LTP CRC mode */
+       PORT_LTP_CRC_MODE_48 = 4,
+               /* 48-bit overlapping LTP CRC mode (optional) */
+       PORT_LTP_CRC_MODE_PER_LANE = 8
+               /* 12 to 16 bit per lane LTP CRC mode (optional) */
+};
+
+/* timeouts */
+#define LINK_RESTART_DELAY 1000                /* link restart delay, in ms */
+#define TIMEOUT_8051_START 5000         /* 8051 start timeout, in ms */
+#define DC8051_COMMAND_TIMEOUT 20000   /* DC8051 command timeout, in ms */
+#define FREEZE_STATUS_TIMEOUT 20       /* wait for freeze indicators, in ms */
+#define VL_STATUS_CLEAR_TIMEOUT 5000   /* per-VL status clear, in ms */
+#define CCE_STATUS_TIMEOUT 10          /* time to clear CCE Status, in ms */
+
+/* cclock tick time, in picoseconds per tick: 1/speed * 10^12  */
+#define ASIC_CCLOCK_PS  1242   /* 805 MHz */
+#define FPGA_CCLOCK_PS 30300   /*  33 MHz */
+
+/*
+ * Mask of enabled MISC errors.  Do not enable the two RSA engine errors -
+ * see firmware.c:run_rsa() for details.
+ */
+#define DRIVER_MISC_MASK \
+       (~(MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK \
+               | MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK))
+
+/* valid values for the loopback module parameter */
+#define LOOPBACK_NONE  0       /* no loopback - default */
+#define LOOPBACK_SERDES 1
+#define LOOPBACK_LCB   2
+#define LOOPBACK_CABLE 3       /* external cable */
+
+/* read and write hardware registers */
+u64 read_csr(const struct hfi1_devdata *dd, u32 offset);
+void write_csr(const struct hfi1_devdata *dd, u32 offset, u64 value);
+
+/*
+ * The *_kctxt_* flavor of the CSR read/write functions are for
+ * per-context or per-SDMA CSRs that are not mappable to user-space.
+ * Their spacing is not a PAGE_SIZE multiple.
+ */
+static inline u64 read_kctxt_csr(const struct hfi1_devdata *dd, int ctxt,
+                                u32 offset0)
+{
+       /* kernel per-context CSRs are separated by 0x100 */
+       return read_csr(dd, offset0 + (0x100 * ctxt));
+}
+
+static inline void write_kctxt_csr(struct hfi1_devdata *dd, int ctxt,
+                                  u32 offset0, u64 value)
+{
+       /* kernel per-context CSRs are separated by 0x100 */
+       write_csr(dd, offset0 + (0x100 * ctxt), value);
+}
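+
+/* e.g. write_kctxt_csr(dd, 3, offset0, val) writes the CSR at offset0 + 0x300 */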
+
+int read_lcb_csr(struct hfi1_devdata *dd, u32 offset, u64 *data);
+int write_lcb_csr(struct hfi1_devdata *dd, u32 offset, u64 data);
+
+void __iomem *get_csr_addr(
+       struct hfi1_devdata *dd,
+       u32 offset);
+
+static inline void __iomem *get_kctxt_csr_addr(
+       struct hfi1_devdata *dd,
+       int ctxt,
+       u32 offset0)
+{
+       return get_csr_addr(dd, offset0 + (0x100 * ctxt));
+}
+
+/*
+ * The *_uctxt_* flavor of the CSR read/write functions are for
+ * per-context CSRs that are mappable to user space. All these CSRs
+ * are spaced by a PAGE_SIZE multiple in order to be mappable to
+ * different processes without exposing other contexts' CSRs
+ */
+static inline u64 read_uctxt_csr(const struct hfi1_devdata *dd, int ctxt,
+                                u32 offset0)
+{
+       /* user per-context CSRs are separated by 0x1000 */
+       return read_csr(dd, offset0 + (0x1000 * ctxt));
+}
+
+static inline void write_uctxt_csr(struct hfi1_devdata *dd, int ctxt,
+                                  u32 offset0, u64 value)
+{
+       /* user per-context CSRs are separated by 0x1000 */
+       write_csr(dd, offset0 + (0x1000 * ctxt), value);
+}
+
+u64 create_pbc(struct hfi1_pportdata *ppd, u64, int, u32, u32);
+
+/* firmware.c */
+#define SBUS_MASTER_BROADCAST 0xfd
+#define NUM_PCIE_SERDES 16     /* number of PCIe serdes on the SBus */
+extern const u8 pcie_serdes_broadcast[];
+extern const u8 pcie_pcs_addrs[2][NUM_PCIE_SERDES];
+extern uint platform_config_load;
+
+/* SBus commands */
+#define RESET_SBUS_RECEIVER 0x20
+#define WRITE_SBUS_RECEIVER 0x21
+void sbus_request(struct hfi1_devdata *dd,
+                 u8 receiver_addr, u8 data_addr, u8 command, u32 data_in);
+int sbus_request_slow(struct hfi1_devdata *dd,
+                     u8 receiver_addr, u8 data_addr, u8 command, u32 data_in);
+void set_sbus_fast_mode(struct hfi1_devdata *dd);
+void clear_sbus_fast_mode(struct hfi1_devdata *dd);
+int hfi1_firmware_init(struct hfi1_devdata *dd);
+int load_pcie_firmware(struct hfi1_devdata *dd);
+int load_firmware(struct hfi1_devdata *dd);
+void dispose_firmware(void);
+int acquire_hw_mutex(struct hfi1_devdata *dd);
+void release_hw_mutex(struct hfi1_devdata *dd);
+
+/*
+ * Bitmask of dynamic access for ASIC block chip resources.  Each HFI has its
+ * own range of bits for the resource so it can clear its own bits on
+ * starting and exiting.  If either HFI has the resource bit set, the
+ * resource is in use.  The separate bit ranges are:
+ *     HFI0 bits  7:0
+ *     HFI1 bits 15:8
+ */
+#define CR_SBUS  0x01  /* SBUS, THERM, and PCIE registers */
+#define CR_EPROM 0x02  /* EEP, GPIO registers */
+#define CR_I2C1  0x04  /* QSFP1_OE register */
+#define CR_I2C2  0x08  /* QSFP2_OE register */
+#define CR_DYN_SHIFT 8 /* dynamic flag shift */
+#define CR_DYN_MASK  ((1ull << CR_DYN_SHIFT) - 1)
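+
+/*
+ * Illustrative sketch of the per-HFI ranges described above (follows
+ * directly from the documented bit layout, not additional hardware state):
+ * HFI0 records SBus ownership in bit 0 (CR_SBUS) while HFI1 records it in
+ * bit 8 (CR_SBUS << CR_DYN_SHIFT), so each HFI can test and clear only
+ * its own byte of the dynamic mask.
+ */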
+
+/*
+ * Bitmask of static ASIC states.  These are outside of the dynamic ASIC
+ * block chip resources above and are to be set once and never cleared.
+ * The SBus dynamic flag must be held when setting.
+ */
+#define CR_THERM_INIT  0x010000
+
+int acquire_chip_resource(struct hfi1_devdata *dd, u32 resource, u32 mswait);
+void release_chip_resource(struct hfi1_devdata *dd, u32 resource);
+bool check_chip_resource(struct hfi1_devdata *dd, u32 resource,
+                        const char *func);
+void init_chip_resources(struct hfi1_devdata *dd);
+void finish_chip_resources(struct hfi1_devdata *dd);
+
+/* ms wait time for access to an SBus resource */
+#define SBUS_TIMEOUT 4000 /* long enough for a FW download and SBR */
+
+/* ms wait time for a qsfp (i2c) chain to become available */
+#define QSFP_WAIT 20000 /* long enough for FW update to the F4 uc */
+
+void fabric_serdes_reset(struct hfi1_devdata *dd);
+int read_8051_data(struct hfi1_devdata *dd, u32 addr, u32 len, u64 *result);
+
+/* chip.c */
+void read_misc_status(struct hfi1_devdata *dd, u8 *ver_a, u8 *ver_b);
+void read_guid(struct hfi1_devdata *dd);
+int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout);
+void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason,
+                         u8 neigh_reason, u8 rem_reason);
+int set_link_state(struct hfi1_pportdata *, u32 state);
+int port_ltp_to_cap(int port_ltp);
+void handle_verify_cap(struct work_struct *work);
+void handle_freeze(struct work_struct *work);
+void handle_link_up(struct work_struct *work);
+void handle_link_down(struct work_struct *work);
+void handle_link_downgrade(struct work_struct *work);
+void handle_link_bounce(struct work_struct *work);
+void handle_sma_message(struct work_struct *work);
+void reset_qsfp(struct hfi1_pportdata *ppd);
+void qsfp_event(struct work_struct *work);
+void start_freeze_handling(struct hfi1_pportdata *ppd, int flags);
+int send_idle_sma(struct hfi1_devdata *dd, u64 message);
+int load_8051_config(struct hfi1_devdata *, u8, u8, u32);
+int read_8051_config(struct hfi1_devdata *, u8, u8, u32 *);
+int start_link(struct hfi1_pportdata *ppd);
+int bringup_serdes(struct hfi1_pportdata *ppd);
+void set_intr_state(struct hfi1_devdata *dd, u32 enable);
+void apply_link_downgrade_policy(struct hfi1_pportdata *ppd,
+                                int refresh_widths);
+void update_usrhead(struct hfi1_ctxtdata *, u32, u32, u32, u32, u32);
+int stop_drain_data_vls(struct hfi1_devdata *dd);
+int open_fill_data_vls(struct hfi1_devdata *dd);
+u32 ns_to_cclock(struct hfi1_devdata *dd, u32 ns);
+u32 cclock_to_ns(struct hfi1_devdata *dd, u32 cclock);
+void get_linkup_link_widths(struct hfi1_pportdata *ppd);
+void read_ltp_rtt(struct hfi1_devdata *dd);
+void clear_linkup_counters(struct hfi1_devdata *dd);
+u32 hdrqempty(struct hfi1_ctxtdata *rcd);
+int is_ax(struct hfi1_devdata *dd);
+int is_bx(struct hfi1_devdata *dd);
+u32 read_physical_state(struct hfi1_devdata *dd);
+u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate);
+u32 get_logical_state(struct hfi1_pportdata *ppd);
+const char *opa_lstate_name(u32 lstate);
+const char *opa_pstate_name(u32 pstate);
+u32 driver_physical_state(struct hfi1_pportdata *ppd);
+u32 driver_logical_state(struct hfi1_pportdata *ppd);
+
+int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok);
+int release_lcb_access(struct hfi1_devdata *dd, int sleep_ok);
+#define LCB_START DC_LCB_CSRS
+#define LCB_END   DC_8051_CSRS /* next block is 8051 */
+static inline int is_lcb_offset(u32 offset)
+{
+       return (offset >= LCB_START && offset < LCB_END);
+}
+
+extern uint num_vls;
+
+extern uint disable_integrity;
+u64 read_dev_cntr(struct hfi1_devdata *dd, int index, int vl);
+u64 write_dev_cntr(struct hfi1_devdata *dd, int index, int vl, u64 data);
+u64 read_port_cntr(struct hfi1_pportdata *ppd, int index, int vl);
+u64 write_port_cntr(struct hfi1_pportdata *ppd, int index, int vl, u64 data);
+u32 read_logical_state(struct hfi1_devdata *dd);
+void force_recv_intr(struct hfi1_ctxtdata *rcd);
+
+/* Per VL indexes */
+enum {
+       C_VL_0 = 0,
+       C_VL_1,
+       C_VL_2,
+       C_VL_3,
+       C_VL_4,
+       C_VL_5,
+       C_VL_6,
+       C_VL_7,
+       C_VL_15,
+       C_VL_COUNT
+};
+
+static inline int vl_from_idx(int idx)
+{
+       return (idx == C_VL_15 ? 15 : idx);
+}
+
+static inline int idx_from_vl(int vl)
+{
+       return (vl == 15 ? C_VL_15 : vl);
+}
+
+/* Per device counter indexes */
+enum {
+       C_RCV_OVF = 0,
+       C_RX_TID_FULL,
+       C_RX_TID_INVALID,
+       C_RX_TID_FLGMS,
+       C_RX_CTX_EGRS,
+       C_RCV_TID_FLSMS,
+       C_CCE_PCI_CR_ST,
+       C_CCE_PCI_TR_ST,
+       C_CCE_PIO_WR_ST,
+       C_CCE_ERR_INT,
+       C_CCE_SDMA_INT,
+       C_CCE_MISC_INT,
+       C_CCE_RCV_AV_INT,
+       C_CCE_RCV_URG_INT,
+       C_CCE_SEND_CR_INT,
+       C_DC_UNC_ERR,
+       C_DC_RCV_ERR,
+       C_DC_FM_CFG_ERR,
+       C_DC_RMT_PHY_ERR,
+       C_DC_DROPPED_PKT,
+       C_DC_MC_XMIT_PKTS,
+       C_DC_MC_RCV_PKTS,
+       C_DC_XMIT_CERR,
+       C_DC_RCV_CERR,
+       C_DC_RCV_FCC,
+       C_DC_XMIT_FCC,
+       C_DC_XMIT_FLITS,
+       C_DC_RCV_FLITS,
+       C_DC_XMIT_PKTS,
+       C_DC_RCV_PKTS,
+       C_DC_RX_FLIT_VL,
+       C_DC_RX_PKT_VL,
+       C_DC_RCV_FCN,
+       C_DC_RCV_FCN_VL,
+       C_DC_RCV_BCN,
+       C_DC_RCV_BCN_VL,
+       C_DC_RCV_BBL,
+       C_DC_RCV_BBL_VL,
+       C_DC_MARK_FECN,
+       C_DC_MARK_FECN_VL,
+       C_DC_TOTAL_CRC,
+       C_DC_CRC_LN0,
+       C_DC_CRC_LN1,
+       C_DC_CRC_LN2,
+       C_DC_CRC_LN3,
+       C_DC_CRC_MULT_LN,
+       C_DC_TX_REPLAY,
+       C_DC_RX_REPLAY,
+       C_DC_SEQ_CRC_CNT,
+       C_DC_ESC0_ONLY_CNT,
+       C_DC_ESC0_PLUS1_CNT,
+       C_DC_ESC0_PLUS2_CNT,
+       C_DC_REINIT_FROM_PEER_CNT,
+       C_DC_SBE_CNT,
+       C_DC_MISC_FLG_CNT,
+       C_DC_PRF_GOOD_LTP_CNT,
+       C_DC_PRF_ACCEPTED_LTP_CNT,
+       C_DC_PRF_RX_FLIT_CNT,
+       C_DC_PRF_TX_FLIT_CNT,
+       C_DC_PRF_CLK_CNTR,
+       C_DC_PG_DBG_FLIT_CRDTS_CNT,
+       C_DC_PG_STS_PAUSE_COMPLETE_CNT,
+       C_DC_PG_STS_TX_SBE_CNT,
+       C_DC_PG_STS_TX_MBE_CNT,
+       C_SW_CPU_INTR,
+       C_SW_CPU_RCV_LIM,
+       C_SW_VTX_WAIT,
+       C_SW_PIO_WAIT,
+       C_SW_PIO_DRAIN,
+       C_SW_KMEM_WAIT,
+       C_SW_SEND_SCHED,
+       C_SDMA_DESC_FETCHED_CNT,
+       C_SDMA_INT_CNT,
+       C_SDMA_ERR_CNT,
+       C_SDMA_IDLE_INT_CNT,
+       C_SDMA_PROGRESS_INT_CNT,
+/* MISC_ERR_STATUS */
+       C_MISC_PLL_LOCK_FAIL_ERR,
+       C_MISC_MBIST_FAIL_ERR,
+       C_MISC_INVALID_EEP_CMD_ERR,
+       C_MISC_EFUSE_DONE_PARITY_ERR,
+       C_MISC_EFUSE_WRITE_ERR,
+       C_MISC_EFUSE_READ_BAD_ADDR_ERR,
+       C_MISC_EFUSE_CSR_PARITY_ERR,
+       C_MISC_FW_AUTH_FAILED_ERR,
+       C_MISC_KEY_MISMATCH_ERR,
+       C_MISC_SBUS_WRITE_FAILED_ERR,
+       C_MISC_CSR_WRITE_BAD_ADDR_ERR,
+       C_MISC_CSR_READ_BAD_ADDR_ERR,
+       C_MISC_CSR_PARITY_ERR,
+/* CceErrStatus */
+       /*
+        * A special counter that is the aggregate count
+        * of all the cce_err_status errors.  The remainder
+        * are actual bits in the CceErrStatus register.
+        */
+       C_CCE_ERR_STATUS_AGGREGATED_CNT,
+       C_CCE_MSIX_CSR_PARITY_ERR,
+       C_CCE_INT_MAP_UNC_ERR,
+       C_CCE_INT_MAP_COR_ERR,
+       C_CCE_MSIX_TABLE_UNC_ERR,
+       C_CCE_MSIX_TABLE_COR_ERR,
+       C_CCE_RXDMA_CONV_FIFO_PARITY_ERR,
+       C_CCE_RCPL_ASYNC_FIFO_PARITY_ERR,
+       C_CCE_SEG_WRITE_BAD_ADDR_ERR,
+       C_CCE_SEG_READ_BAD_ADDR_ERR,
+       C_LA_TRIGGERED,
+       C_CCE_TRGT_CPL_TIMEOUT_ERR,
+       C_PCIC_RECEIVE_PARITY_ERR,
+       C_PCIC_TRANSMIT_BACK_PARITY_ERR,
+       C_PCIC_TRANSMIT_FRONT_PARITY_ERR,
+       C_PCIC_CPL_DAT_Q_UNC_ERR,
+       C_PCIC_CPL_HD_Q_UNC_ERR,
+       C_PCIC_POST_DAT_Q_UNC_ERR,
+       C_PCIC_POST_HD_Q_UNC_ERR,
+       C_PCIC_RETRY_SOT_MEM_UNC_ERR,
+       C_PCIC_RETRY_MEM_UNC_ERR,
+       C_PCIC_N_POST_DAT_Q_PARITY_ERR,
+       C_PCIC_N_POST_H_Q_PARITY_ERR,
+       C_PCIC_CPL_DAT_Q_COR_ERR,
+       C_PCIC_CPL_HD_Q_COR_ERR,
+       C_PCIC_POST_DAT_Q_COR_ERR,
+       C_PCIC_POST_HD_Q_COR_ERR,
+       C_PCIC_RETRY_SOT_MEM_COR_ERR,
+       C_PCIC_RETRY_MEM_COR_ERR,
+       C_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERR,
+       C_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERR,
+       C_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR,
+       C_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR,
+       C_CCE_CLI2_ASYNC_FIFO_PARITY_ERR,
+       C_CCE_CSR_CFG_BUS_PARITY_ERR,
+       C_CCE_CLI0_ASYNC_FIFO_PARTIY_ERR,
+       C_CCE_RSPD_DATA_PARITY_ERR,
+       C_CCE_TRGT_ACCESS_ERR,
+       C_CCE_TRGT_ASYNC_FIFO_PARITY_ERR,
+       C_CCE_CSR_WRITE_BAD_ADDR_ERR,
+       C_CCE_CSR_READ_BAD_ADDR_ERR,
+       C_CCE_CSR_PARITY_ERR,
+/* RcvErrStatus */
+       C_RX_CSR_PARITY_ERR,
+       C_RX_CSR_WRITE_BAD_ADDR_ERR,
+       C_RX_CSR_READ_BAD_ADDR_ERR,
+       C_RX_DMA_CSR_UNC_ERR,
+       C_RX_DMA_DQ_FSM_ENCODING_ERR,
+       C_RX_DMA_EQ_FSM_ENCODING_ERR,
+       C_RX_DMA_CSR_PARITY_ERR,
+       C_RX_RBUF_DATA_COR_ERR,
+       C_RX_RBUF_DATA_UNC_ERR,
+       C_RX_DMA_DATA_FIFO_RD_COR_ERR,
+       C_RX_DMA_DATA_FIFO_RD_UNC_ERR,
+       C_RX_DMA_HDR_FIFO_RD_COR_ERR,
+       C_RX_DMA_HDR_FIFO_RD_UNC_ERR,
+       C_RX_RBUF_DESC_PART2_COR_ERR,
+       C_RX_RBUF_DESC_PART2_UNC_ERR,
+       C_RX_RBUF_DESC_PART1_COR_ERR,
+       C_RX_RBUF_DESC_PART1_UNC_ERR,
+       C_RX_HQ_INTR_FSM_ERR,
+       C_RX_HQ_INTR_CSR_PARITY_ERR,
+       C_RX_LOOKUP_CSR_PARITY_ERR,
+       C_RX_LOOKUP_RCV_ARRAY_COR_ERR,
+       C_RX_LOOKUP_RCV_ARRAY_UNC_ERR,
+       C_RX_LOOKUP_DES_PART2_PARITY_ERR,
+       C_RX_LOOKUP_DES_PART1_UNC_COR_ERR,
+       C_RX_LOOKUP_DES_PART1_UNC_ERR,
+       C_RX_RBUF_NEXT_FREE_BUF_COR_ERR,
+       C_RX_RBUF_NEXT_FREE_BUF_UNC_ERR,
+       C_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR,
+       C_RX_RBUF_FL_INITDONE_PARITY_ERR,
+       C_RX_RBUF_FL_WRITE_ADDR_PARITY_ERR,
+       C_RX_RBUF_FL_RD_ADDR_PARITY_ERR,
+       C_RX_RBUF_EMPTY_ERR,
+       C_RX_RBUF_FULL_ERR,
+       C_RX_RBUF_BAD_LOOKUP_ERR,
+       C_RX_RBUF_CTX_ID_PARITY_ERR,
+       C_RX_RBUF_CSR_QEOPDW_PARITY_ERR,
+       C_RX_RBUF_CSR_Q_NUM_OF_PKT_PARITY_ERR,
+       C_RX_RBUF_CSR_Q_T1_PTR_PARITY_ERR,
+       C_RX_RBUF_CSR_Q_HD_PTR_PARITY_ERR,
+       C_RX_RBUF_CSR_Q_VLD_BIT_PARITY_ERR,
+       C_RX_RBUF_CSR_Q_NEXT_BUF_PARITY_ERR,
+       C_RX_RBUF_CSR_Q_ENT_CNT_PARITY_ERR,
+       C_RX_RBUF_CSR_Q_HEAD_BUF_NUM_PARITY_ERR,
+       C_RX_RBUF_BLOCK_LIST_READ_COR_ERR,
+       C_RX_RBUF_BLOCK_LIST_READ_UNC_ERR,
+       C_RX_RBUF_LOOKUP_DES_COR_ERR,
+       C_RX_RBUF_LOOKUP_DES_UNC_ERR,
+       C_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR,
+       C_RX_RBUF_LOOKUP_DES_REG_UNC_ERR,
+       C_RX_RBUF_FREE_LIST_COR_ERR,
+       C_RX_RBUF_FREE_LIST_UNC_ERR,
+       C_RX_RCV_FSM_ENCODING_ERR,
+       C_RX_DMA_FLAG_COR_ERR,
+       C_RX_DMA_FLAG_UNC_ERR,
+       C_RX_DC_SOP_EOP_PARITY_ERR,
+       C_RX_RCV_CSR_PARITY_ERR,
+       C_RX_RCV_QP_MAP_TABLE_COR_ERR,
+       C_RX_RCV_QP_MAP_TABLE_UNC_ERR,
+       C_RX_RCV_DATA_COR_ERR,
+       C_RX_RCV_DATA_UNC_ERR,
+       C_RX_RCV_HDR_COR_ERR,
+       C_RX_RCV_HDR_UNC_ERR,
+       C_RX_DC_INTF_PARITY_ERR,
+       C_RX_DMA_CSR_COR_ERR,
+/* SendPioErrStatus */
+       C_PIO_PEC_SOP_HEAD_PARITY_ERR,
+       C_PIO_PCC_SOP_HEAD_PARITY_ERR,
+       C_PIO_LAST_RETURNED_CNT_PARITY_ERR,
+       C_PIO_CURRENT_FREE_CNT_PARITY_ERR,
+       C_PIO_RSVD_31_ERR,
+       C_PIO_RSVD_30_ERR,
+       C_PIO_PPMC_SOP_LEN_ERR,
+       C_PIO_PPMC_BQC_MEM_PARITY_ERR,
+       C_PIO_VL_FIFO_PARITY_ERR,
+       C_PIO_VLF_SOP_PARITY_ERR,
+       C_PIO_VLF_V1_LEN_PARITY_ERR,
+       C_PIO_BLOCK_QW_COUNT_PARITY_ERR,
+       C_PIO_WRITE_QW_VALID_PARITY_ERR,
+       C_PIO_STATE_MACHINE_ERR,
+       C_PIO_WRITE_DATA_PARITY_ERR,
+       C_PIO_HOST_ADDR_MEM_COR_ERR,
+       C_PIO_HOST_ADDR_MEM_UNC_ERR,
+       C_PIO_PKT_EVICT_SM_OR_ARM_SM_ERR,
+       C_PIO_INIT_SM_IN_ERR,
+       C_PIO_PPMC_PBL_FIFO_ERR,
+       C_PIO_CREDIT_RET_FIFO_PARITY_ERR,
+       C_PIO_V1_LEN_MEM_BANK1_COR_ERR,
+       C_PIO_V1_LEN_MEM_BANK0_COR_ERR,
+       C_PIO_V1_LEN_MEM_BANK1_UNC_ERR,
+       C_PIO_V1_LEN_MEM_BANK0_UNC_ERR,
+       C_PIO_SM_PKT_RESET_PARITY_ERR,
+       C_PIO_PKT_EVICT_FIFO_PARITY_ERR,
+       C_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR,
+       C_PIO_SBRDCTL_CRREL_PARITY_ERR,
+       C_PIO_PEC_FIFO_PARITY_ERR,
+       C_PIO_PCC_FIFO_PARITY_ERR,
+       C_PIO_SB_MEM_FIFO1_ERR,
+       C_PIO_SB_MEM_FIFO0_ERR,
+       C_PIO_CSR_PARITY_ERR,
+       C_PIO_WRITE_ADDR_PARITY_ERR,
+       C_PIO_WRITE_BAD_CTXT_ERR,
+/* SendDmaErrStatus */
+       C_SDMA_PCIE_REQ_TRACKING_COR_ERR,
+       C_SDMA_PCIE_REQ_TRACKING_UNC_ERR,
+       C_SDMA_CSR_PARITY_ERR,
+       C_SDMA_RPY_TAG_ERR,
+/* SendEgressErrStatus */
+       C_TX_READ_PIO_MEMORY_CSR_UNC_ERR,
+       C_TX_READ_SDMA_MEMORY_CSR_UNC_ERR,
+       C_TX_EGRESS_FIFO_COR_ERR,
+       C_TX_READ_PIO_MEMORY_COR_ERR,
+       C_TX_READ_SDMA_MEMORY_COR_ERR,
+       C_TX_SB_HDR_COR_ERR,
+       C_TX_CREDIT_OVERRUN_ERR,
+       C_TX_LAUNCH_FIFO8_COR_ERR,
+       C_TX_LAUNCH_FIFO7_COR_ERR,
+       C_TX_LAUNCH_FIFO6_COR_ERR,
+       C_TX_LAUNCH_FIFO5_COR_ERR,
+       C_TX_LAUNCH_FIFO4_COR_ERR,
+       C_TX_LAUNCH_FIFO3_COR_ERR,
+       C_TX_LAUNCH_FIFO2_COR_ERR,
+       C_TX_LAUNCH_FIFO1_COR_ERR,
+       C_TX_LAUNCH_FIFO0_COR_ERR,
+       C_TX_CREDIT_RETURN_VL_ERR,
+       C_TX_HCRC_INSERTION_ERR,
+       C_TX_EGRESS_FIFI_UNC_ERR,
+       C_TX_READ_PIO_MEMORY_UNC_ERR,
+       C_TX_READ_SDMA_MEMORY_UNC_ERR,
+       C_TX_SB_HDR_UNC_ERR,
+       C_TX_CREDIT_RETURN_PARITY_ERR,
+       C_TX_LAUNCH_FIFO8_UNC_OR_PARITY_ERR,
+       C_TX_LAUNCH_FIFO7_UNC_OR_PARITY_ERR,
+       C_TX_LAUNCH_FIFO6_UNC_OR_PARITY_ERR,
+       C_TX_LAUNCH_FIFO5_UNC_OR_PARITY_ERR,
+       C_TX_LAUNCH_FIFO4_UNC_OR_PARITY_ERR,
+       C_TX_LAUNCH_FIFO3_UNC_OR_PARITY_ERR,
+       C_TX_LAUNCH_FIFO2_UNC_OR_PARITY_ERR,
+       C_TX_LAUNCH_FIFO1_UNC_OR_PARITY_ERR,
+       C_TX_LAUNCH_FIFO0_UNC_OR_PARITY_ERR,
+       C_TX_SDMA15_DISALLOWED_PACKET_ERR,
+       C_TX_SDMA14_DISALLOWED_PACKET_ERR,
+       C_TX_SDMA13_DISALLOWED_PACKET_ERR,
+       C_TX_SDMA12_DISALLOWED_PACKET_ERR,
+       C_TX_SDMA11_DISALLOWED_PACKET_ERR,
+       C_TX_SDMA10_DISALLOWED_PACKET_ERR,
+       C_TX_SDMA9_DISALLOWED_PACKET_ERR,
+       C_TX_SDMA8_DISALLOWED_PACKET_ERR,
+       C_TX_SDMA7_DISALLOWED_PACKET_ERR,
+       C_TX_SDMA6_DISALLOWED_PACKET_ERR,
+       C_TX_SDMA5_DISALLOWED_PACKET_ERR,
+       C_TX_SDMA4_DISALLOWED_PACKET_ERR,
+       C_TX_SDMA3_DISALLOWED_PACKET_ERR,
+       C_TX_SDMA2_DISALLOWED_PACKET_ERR,
+       C_TX_SDMA1_DISALLOWED_PACKET_ERR,
+       C_TX_SDMA0_DISALLOWED_PACKET_ERR,
+       C_TX_CONFIG_PARITY_ERR,
+       C_TX_SBRD_CTL_CSR_PARITY_ERR,
+       C_TX_LAUNCH_CSR_PARITY_ERR,
+       C_TX_ILLEGAL_CL_ERR,
+       C_TX_SBRD_CTL_STATE_MACHINE_PARITY_ERR,
+       C_TX_RESERVED_10,
+       C_TX_RESERVED_9,
+       C_TX_SDMA_LAUNCH_INTF_PARITY_ERR,
+       C_TX_PIO_LAUNCH_INTF_PARITY_ERR,
+       C_TX_RESERVED_6,
+       C_TX_INCORRECT_LINK_STATE_ERR,
+       C_TX_LINK_DOWN_ERR,
+       C_TX_EGRESS_FIFO_UNDERRUN_OR_PARITY_ERR,
+       C_TX_RESERVED_2,
+       C_TX_PKT_INTEGRITY_MEM_UNC_ERR,
+       C_TX_PKT_INTEGRITY_MEM_COR_ERR,
+/* SendErrStatus */
+       C_SEND_CSR_WRITE_BAD_ADDR_ERR,
+       C_SEND_CSR_READ_BAD_ADD_ERR,
+       C_SEND_CSR_PARITY_ERR,
+/* SendCtxtErrStatus */
+       C_PIO_WRITE_OUT_OF_BOUNDS_ERR,
+       C_PIO_WRITE_OVERFLOW_ERR,
+       C_PIO_WRITE_CROSSES_BOUNDARY_ERR,
+       C_PIO_DISALLOWED_PACKET_ERR,
+       C_PIO_INCONSISTENT_SOP_ERR,
+/* SendDmaEngErrStatus */
+       C_SDMA_HEADER_REQUEST_FIFO_COR_ERR,
+       C_SDMA_HEADER_STORAGE_COR_ERR,
+       C_SDMA_PACKET_TRACKING_COR_ERR,
+       C_SDMA_ASSEMBLY_COR_ERR,
+       C_SDMA_DESC_TABLE_COR_ERR,
+       C_SDMA_HEADER_REQUEST_FIFO_UNC_ERR,
+       C_SDMA_HEADER_STORAGE_UNC_ERR,
+       C_SDMA_PACKET_TRACKING_UNC_ERR,
+       C_SDMA_ASSEMBLY_UNC_ERR,
+       C_SDMA_DESC_TABLE_UNC_ERR,
+       C_SDMA_TIMEOUT_ERR,
+       C_SDMA_HEADER_LENGTH_ERR,
+       C_SDMA_HEADER_ADDRESS_ERR,
+       C_SDMA_HEADER_SELECT_ERR,
+       C_SMDA_RESERVED_9,
+       C_SDMA_PACKET_DESC_OVERFLOW_ERR,
+       C_SDMA_LENGTH_MISMATCH_ERR,
+       C_SDMA_HALT_ERR,
+       C_SDMA_MEM_READ_ERR,
+       C_SDMA_FIRST_DESC_ERR,
+       C_SDMA_TAIL_OUT_OF_BOUNDS_ERR,
+       C_SDMA_TOO_LONG_ERR,
+       C_SDMA_GEN_MISMATCH_ERR,
+       C_SDMA_WRONG_DW_ERR,
+       DEV_CNTR_LAST  /* Must be kept last */
+};
+
+/* Per port counter indexes */
+enum {
+       C_TX_UNSUP_VL = 0,
+       C_TX_INVAL_LEN,
+       C_TX_MM_LEN_ERR,
+       C_TX_UNDERRUN,
+       C_TX_FLOW_STALL,
+       C_TX_DROPPED,
+       C_TX_HDR_ERR,
+       C_TX_PKT,
+       C_TX_WORDS,
+       C_TX_WAIT,
+       C_TX_FLIT_VL,
+       C_TX_PKT_VL,
+       C_TX_WAIT_VL,
+       C_RX_PKT,
+       C_RX_WORDS,
+       C_SW_LINK_DOWN,
+       C_SW_LINK_UP,
+       C_SW_UNKNOWN_FRAME,
+       C_SW_XMIT_DSCD,
+       C_SW_XMIT_DSCD_VL,
+       C_SW_XMIT_CSTR_ERR,
+       C_SW_RCV_CSTR_ERR,
+       C_SW_IBP_LOOP_PKTS,
+       C_SW_IBP_RC_RESENDS,
+       C_SW_IBP_RNR_NAKS,
+       C_SW_IBP_OTHER_NAKS,
+       C_SW_IBP_RC_TIMEOUTS,
+       C_SW_IBP_PKT_DROPS,
+       C_SW_IBP_DMA_WAIT,
+       C_SW_IBP_RC_SEQNAK,
+       C_SW_IBP_RC_DUPREQ,
+       C_SW_IBP_RDMA_SEQ,
+       C_SW_IBP_UNALIGNED,
+       C_SW_IBP_SEQ_NAK,
+       C_SW_CPU_RC_ACKS,
+       C_SW_CPU_RC_QACKS,
+       C_SW_CPU_RC_DELAYED_COMP,
+       C_RCV_HDR_OVF_0,
+       C_RCV_HDR_OVF_1,
+       C_RCV_HDR_OVF_2,
+       C_RCV_HDR_OVF_3,
+       C_RCV_HDR_OVF_4,
+       C_RCV_HDR_OVF_5,
+       C_RCV_HDR_OVF_6,
+       C_RCV_HDR_OVF_7,
+       C_RCV_HDR_OVF_8,
+       C_RCV_HDR_OVF_9,
+       C_RCV_HDR_OVF_10,
+       C_RCV_HDR_OVF_11,
+       C_RCV_HDR_OVF_12,
+       C_RCV_HDR_OVF_13,
+       C_RCV_HDR_OVF_14,
+       C_RCV_HDR_OVF_15,
+       C_RCV_HDR_OVF_16,
+       C_RCV_HDR_OVF_17,
+       C_RCV_HDR_OVF_18,
+       C_RCV_HDR_OVF_19,
+       C_RCV_HDR_OVF_20,
+       C_RCV_HDR_OVF_21,
+       C_RCV_HDR_OVF_22,
+       C_RCV_HDR_OVF_23,
+       C_RCV_HDR_OVF_24,
+       C_RCV_HDR_OVF_25,
+       C_RCV_HDR_OVF_26,
+       C_RCV_HDR_OVF_27,
+       C_RCV_HDR_OVF_28,
+       C_RCV_HDR_OVF_29,
+       C_RCV_HDR_OVF_30,
+       C_RCV_HDR_OVF_31,
+       C_RCV_HDR_OVF_32,
+       C_RCV_HDR_OVF_33,
+       C_RCV_HDR_OVF_34,
+       C_RCV_HDR_OVF_35,
+       C_RCV_HDR_OVF_36,
+       C_RCV_HDR_OVF_37,
+       C_RCV_HDR_OVF_38,
+       C_RCV_HDR_OVF_39,
+       C_RCV_HDR_OVF_40,
+       C_RCV_HDR_OVF_41,
+       C_RCV_HDR_OVF_42,
+       C_RCV_HDR_OVF_43,
+       C_RCV_HDR_OVF_44,
+       C_RCV_HDR_OVF_45,
+       C_RCV_HDR_OVF_46,
+       C_RCV_HDR_OVF_47,
+       C_RCV_HDR_OVF_48,
+       C_RCV_HDR_OVF_49,
+       C_RCV_HDR_OVF_50,
+       C_RCV_HDR_OVF_51,
+       C_RCV_HDR_OVF_52,
+       C_RCV_HDR_OVF_53,
+       C_RCV_HDR_OVF_54,
+       C_RCV_HDR_OVF_55,
+       C_RCV_HDR_OVF_56,
+       C_RCV_HDR_OVF_57,
+       C_RCV_HDR_OVF_58,
+       C_RCV_HDR_OVF_59,
+       C_RCV_HDR_OVF_60,
+       C_RCV_HDR_OVF_61,
+       C_RCV_HDR_OVF_62,
+       C_RCV_HDR_OVF_63,
+       C_RCV_HDR_OVF_64,
+       C_RCV_HDR_OVF_65,
+       C_RCV_HDR_OVF_66,
+       C_RCV_HDR_OVF_67,
+       C_RCV_HDR_OVF_68,
+       C_RCV_HDR_OVF_69,
+       C_RCV_HDR_OVF_70,
+       C_RCV_HDR_OVF_71,
+       C_RCV_HDR_OVF_72,
+       C_RCV_HDR_OVF_73,
+       C_RCV_HDR_OVF_74,
+       C_RCV_HDR_OVF_75,
+       C_RCV_HDR_OVF_76,
+       C_RCV_HDR_OVF_77,
+       C_RCV_HDR_OVF_78,
+       C_RCV_HDR_OVF_79,
+       C_RCV_HDR_OVF_80,
+       C_RCV_HDR_OVF_81,
+       C_RCV_HDR_OVF_82,
+       C_RCV_HDR_OVF_83,
+       C_RCV_HDR_OVF_84,
+       C_RCV_HDR_OVF_85,
+       C_RCV_HDR_OVF_86,
+       C_RCV_HDR_OVF_87,
+       C_RCV_HDR_OVF_88,
+       C_RCV_HDR_OVF_89,
+       C_RCV_HDR_OVF_90,
+       C_RCV_HDR_OVF_91,
+       C_RCV_HDR_OVF_92,
+       C_RCV_HDR_OVF_93,
+       C_RCV_HDR_OVF_94,
+       C_RCV_HDR_OVF_95,
+       C_RCV_HDR_OVF_96,
+       C_RCV_HDR_OVF_97,
+       C_RCV_HDR_OVF_98,
+       C_RCV_HDR_OVF_99,
+       C_RCV_HDR_OVF_100,
+       C_RCV_HDR_OVF_101,
+       C_RCV_HDR_OVF_102,
+       C_RCV_HDR_OVF_103,
+       C_RCV_HDR_OVF_104,
+       C_RCV_HDR_OVF_105,
+       C_RCV_HDR_OVF_106,
+       C_RCV_HDR_OVF_107,
+       C_RCV_HDR_OVF_108,
+       C_RCV_HDR_OVF_109,
+       C_RCV_HDR_OVF_110,
+       C_RCV_HDR_OVF_111,
+       C_RCV_HDR_OVF_112,
+       C_RCV_HDR_OVF_113,
+       C_RCV_HDR_OVF_114,
+       C_RCV_HDR_OVF_115,
+       C_RCV_HDR_OVF_116,
+       C_RCV_HDR_OVF_117,
+       C_RCV_HDR_OVF_118,
+       C_RCV_HDR_OVF_119,
+       C_RCV_HDR_OVF_120,
+       C_RCV_HDR_OVF_121,
+       C_RCV_HDR_OVF_122,
+       C_RCV_HDR_OVF_123,
+       C_RCV_HDR_OVF_124,
+       C_RCV_HDR_OVF_125,
+       C_RCV_HDR_OVF_126,
+       C_RCV_HDR_OVF_127,
+       C_RCV_HDR_OVF_128,
+       C_RCV_HDR_OVF_129,
+       C_RCV_HDR_OVF_130,
+       C_RCV_HDR_OVF_131,
+       C_RCV_HDR_OVF_132,
+       C_RCV_HDR_OVF_133,
+       C_RCV_HDR_OVF_134,
+       C_RCV_HDR_OVF_135,
+       C_RCV_HDR_OVF_136,
+       C_RCV_HDR_OVF_137,
+       C_RCV_HDR_OVF_138,
+       C_RCV_HDR_OVF_139,
+       C_RCV_HDR_OVF_140,
+       C_RCV_HDR_OVF_141,
+       C_RCV_HDR_OVF_142,
+       C_RCV_HDR_OVF_143,
+       C_RCV_HDR_OVF_144,
+       C_RCV_HDR_OVF_145,
+       C_RCV_HDR_OVF_146,
+       C_RCV_HDR_OVF_147,
+       C_RCV_HDR_OVF_148,
+       C_RCV_HDR_OVF_149,
+       C_RCV_HDR_OVF_150,
+       C_RCV_HDR_OVF_151,
+       C_RCV_HDR_OVF_152,
+       C_RCV_HDR_OVF_153,
+       C_RCV_HDR_OVF_154,
+       C_RCV_HDR_OVF_155,
+       C_RCV_HDR_OVF_156,
+       C_RCV_HDR_OVF_157,
+       C_RCV_HDR_OVF_158,
+       C_RCV_HDR_OVF_159,
+       PORT_CNTR_LAST /* Must be kept last */
+};
+
+u64 get_all_cpu_total(u64 __percpu *cntr);
+void hfi1_start_cleanup(struct hfi1_devdata *dd);
+void hfi1_clear_tids(struct hfi1_ctxtdata *rcd);
+struct hfi1_message_header *hfi1_get_msgheader(
+                               struct hfi1_devdata *dd, __le32 *rhf_addr);
+int hfi1_get_base_kinfo(struct hfi1_ctxtdata *rcd,
+                       struct hfi1_ctxt_info *kinfo);
+u64 hfi1_gpio_mod(struct hfi1_devdata *dd, u32 target, u32 data, u32 dir,
+                 u32 mask);
+int hfi1_init_ctxt(struct send_context *sc);
+void hfi1_put_tid(struct hfi1_devdata *dd, u32 index,
+                 u32 type, unsigned long pa, u16 order);
+void hfi1_quiet_serdes(struct hfi1_pportdata *ppd);
+void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt);
+u32 hfi1_read_cntrs(struct hfi1_devdata *dd, char **namep, u64 **cntrp);
+u32 hfi1_read_portcntrs(struct hfi1_pportdata *ppd, char **namep, u64 **cntrp);
+u8 hfi1_ibphys_portstate(struct hfi1_pportdata *ppd);
+int hfi1_get_ib_cfg(struct hfi1_pportdata *ppd, int which);
+int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val);
+int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt, u16 jkey);
+int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt);
+int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey);
+int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt);
+void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality);
+
+/*
+ * Interrupt source table.
+ *
+ * Each entry is an interrupt source "type".  It is ordered by increasing
+ * number.
+ */
+struct is_table {
+       int start;       /* interrupt source type start */
+       int end;         /* interrupt source type end */
+       /* routine that returns the name of the interrupt source */
+       char *(*is_name)(char *name, size_t size, unsigned int source);
+       /* routine to call when receiving an interrupt */
+       void (*is_int)(struct hfi1_devdata *dd, unsigned int source);
+};
+
+#endif /* _CHIP_H */
diff --git a/drivers/infiniband/hw/hfi1/chip_registers.h b/drivers/infiniband/hw/hfi1/chip_registers.h
new file mode 100644 (file)
index 0000000..8744de6
--- /dev/null
@@ -0,0 +1,1307 @@
+#ifndef DEF_CHIP_REG
+#define DEF_CHIP_REG
+
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define CORE           0x000000000000
+#define CCE                    (CORE + 0x000000000000)
+#define ASIC           (CORE + 0x000000400000)
+#define MISC           (CORE + 0x000000500000)
+#define DC_TOP_CSRS            (CORE + 0x000000600000)
+#define CHIP_DEBUG             (CORE + 0x000000700000)
+#define RXE                    (CORE + 0x000001000000)
+#define TXE                    (CORE + 0x000001800000)
+#define DCC_CSRS               (DC_TOP_CSRS + 0x000000000000)
+#define DC_LCB_CSRS            (DC_TOP_CSRS + 0x000000001000)
+#define DC_8051_CSRS           (DC_TOP_CSRS + 0x000000002000)
+#define PCIE           0
+
+#define ASIC_NUM_SCRATCH 4
+#define CCE_ERR_INT_CNT 0
+#define CCE_MISC_INT_CNT 2
+#define CCE_NUM_32_BIT_COUNTERS 3
+#define CCE_NUM_32_BIT_INT_COUNTERS 6
+#define CCE_NUM_INT_CSRS 12
+#define CCE_NUM_INT_MAP_CSRS 96
+#define CCE_NUM_MSIX_PBAS 4
+#define CCE_NUM_MSIX_VECTORS 256
+#define CCE_NUM_SCRATCH 4
+#define CCE_PCIE_POSTED_CRDT_STALL_CNT 2
+#define CCE_PCIE_TRGT_STALL_CNT 0
+#define CCE_PIO_WR_STALL_CNT 1
+#define CCE_RCV_AVAIL_INT_CNT 3
+#define CCE_RCV_URGENT_INT_CNT 4
+#define CCE_SDMA_INT_CNT 1
+#define CCE_SEND_CREDIT_INT_CNT 5
+#define DCC_CFG_LED_CNTRL (DCC_CSRS + 0x000000000040)
+#define DCC_CFG_LED_CNTRL_LED_CNTRL_SMASK 0x10ull
+#define DCC_CFG_LED_CNTRL_LED_SW_BLINK_RATE_SHIFT 0
+#define DCC_CFG_LED_CNTRL_LED_SW_BLINK_RATE_SMASK 0xFull
+#define DCC_CFG_PORT_CONFIG (DCC_CSRS + 0x000000000008)
+#define DCC_CFG_PORT_CONFIG1 (DCC_CSRS + 0x000000000010)
+#define DCC_CFG_PORT_CONFIG1_DLID_MASK_MASK 0xFFFFull
+#define DCC_CFG_PORT_CONFIG1_DLID_MASK_SHIFT 16
+#define DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK 0xFFFF0000ull
+#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK 0xFFFFull
+#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT 0
+#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK 0xFFFFull
+#define DCC_CFG_PORT_CONFIG_LINK_STATE_MASK 0x7ull
+#define DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT 48
+#define DCC_CFG_PORT_CONFIG_LINK_STATE_SMASK 0x7000000000000ull
+#define DCC_CFG_PORT_CONFIG_MTU_CAP_MASK 0x7ull
+#define DCC_CFG_PORT_CONFIG_MTU_CAP_SHIFT 32
+#define DCC_CFG_PORT_CONFIG_MTU_CAP_SMASK 0x700000000ull
+#define DCC_CFG_RESET (DCC_CSRS + 0x000000000000)
+#define DCC_CFG_RESET_RESET_LCB_SHIFT 0
+#define DCC_CFG_RESET_RESET_RX_FPE_SHIFT 2
+#define DCC_CFG_SC_VL_TABLE_15_0 (DCC_CSRS + 0x000000000028)
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY0_SHIFT 0
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY10_SHIFT 40
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY11_SHIFT 44
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY12_SHIFT 48
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY13_SHIFT 52
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY14_SHIFT 56
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY15_SHIFT 60
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY1_SHIFT 4
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY2_SHIFT 8
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY3_SHIFT 12
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY4_SHIFT 16
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY5_SHIFT 20
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY6_SHIFT 24
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY7_SHIFT 28
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY8_SHIFT 32
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY9_SHIFT 36
+#define DCC_CFG_SC_VL_TABLE_31_16 (DCC_CSRS + 0x000000000030)
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY16_SHIFT 0
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY17_SHIFT 4
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY18_SHIFT 8
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY19_SHIFT 12
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY20_SHIFT 16
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY21_SHIFT 20
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY22_SHIFT 24
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY23_SHIFT 28
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY24_SHIFT 32
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY25_SHIFT 36
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY26_SHIFT 40
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY27_SHIFT 44
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY28_SHIFT 48
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY29_SHIFT 52
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY30_SHIFT 56
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY31_SHIFT 60
+#define DCC_ERR_DROPPED_PKT_CNT (DCC_CSRS + 0x000000000120)
+#define DCC_ERR_FLG (DCC_CSRS + 0x000000000050)
+#define DCC_ERR_FLG_BAD_CRDT_ACK_ERR_SMASK 0x4000ull
+#define DCC_ERR_FLG_BAD_CTRL_DIST_ERR_SMASK 0x200000ull
+#define DCC_ERR_FLG_BAD_CTRL_FLIT_ERR_SMASK 0x10000ull
+#define DCC_ERR_FLG_BAD_DLID_TARGET_ERR_SMASK 0x200ull
+#define DCC_ERR_FLG_BAD_HEAD_DIST_ERR_SMASK 0x800000ull
+#define DCC_ERR_FLG_BAD_L2_ERR_SMASK 0x2ull
+#define DCC_ERR_FLG_BAD_LVER_ERR_SMASK 0x400ull
+#define DCC_ERR_FLG_BAD_MID_TAIL_ERR_SMASK 0x8ull
+#define DCC_ERR_FLG_BAD_PKT_LENGTH_ERR_SMASK 0x4000000ull
+#define DCC_ERR_FLG_BAD_PREEMPTION_ERR_SMASK 0x10ull
+#define DCC_ERR_FLG_BAD_SC_ERR_SMASK 0x4ull
+#define DCC_ERR_FLG_BAD_TAIL_DIST_ERR_SMASK 0x400000ull
+#define DCC_ERR_FLG_BAD_VL_MARKER_ERR_SMASK 0x80ull
+#define DCC_ERR_FLG_CLR (DCC_CSRS + 0x000000000060)
+#define DCC_ERR_FLG_CSR_ACCESS_BLOCKED_HOST_SMASK 0x8000000000ull
+#define DCC_ERR_FLG_CSR_ACCESS_BLOCKED_UC_SMASK 0x10000000000ull
+#define DCC_ERR_FLG_CSR_INVAL_ADDR_SMASK 0x400000000000ull
+#define DCC_ERR_FLG_CSR_PARITY_ERR_SMASK 0x200000000000ull
+#define DCC_ERR_FLG_DLID_ZERO_ERR_SMASK 0x40000000ull
+#define DCC_ERR_FLG_EN (DCC_CSRS + 0x000000000058)
+#define DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK 0x8000000000ull
+#define DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK 0x10000000000ull
+#define DCC_ERR_FLG_EVENT_CNTR_PARITY_ERR_SMASK 0x20000ull
+#define DCC_ERR_FLG_EVENT_CNTR_ROLLOVER_ERR_SMASK 0x40000ull
+#define DCC_ERR_FLG_FMCONFIG_ERR_SMASK 0x40000000000000ull
+#define DCC_ERR_FLG_FPE_TX_FIFO_OVFLW_ERR_SMASK 0x2000000000ull
+#define DCC_ERR_FLG_FPE_TX_FIFO_UNFLW_ERR_SMASK 0x4000000000ull
+#define DCC_ERR_FLG_LATE_EBP_ERR_SMASK 0x1000000000ull
+#define DCC_ERR_FLG_LATE_LONG_ERR_SMASK 0x800000000ull
+#define DCC_ERR_FLG_LATE_SHORT_ERR_SMASK 0x400000000ull
+#define DCC_ERR_FLG_LENGTH_MTU_ERR_SMASK 0x80000000ull
+#define DCC_ERR_FLG_LINK_ERR_SMASK 0x80000ull
+#define DCC_ERR_FLG_MISC_CNTR_ROLLOVER_ERR_SMASK 0x100000ull
+#define DCC_ERR_FLG_NONVL15_STATE_ERR_SMASK 0x1000000ull
+#define DCC_ERR_FLG_PERM_NVL15_ERR_SMASK 0x10000000ull
+#define DCC_ERR_FLG_PREEMPTION_ERR_SMASK 0x20ull
+#define DCC_ERR_FLG_PREEMPTIONVL15_ERR_SMASK 0x40ull
+#define DCC_ERR_FLG_RCVPORT_ERR_SMASK 0x80000000000000ull
+#define DCC_ERR_FLG_RX_BYTE_SHFT_PARITY_ERR_SMASK 0x1000000000000ull
+#define DCC_ERR_FLG_RX_CTRL_PARITY_MBE_ERR_SMASK 0x100000000000ull
+#define DCC_ERR_FLG_RX_EARLY_DROP_ERR_SMASK 0x200000000ull
+#define DCC_ERR_FLG_SLID_ZERO_ERR_SMASK 0x20000000ull
+#define DCC_ERR_FLG_TX_BYTE_SHFT_PARITY_ERR_SMASK 0x800000000000ull
+#define DCC_ERR_FLG_TX_CTRL_PARITY_ERR_SMASK 0x20000000000ull
+#define DCC_ERR_FLG_TX_CTRL_PARITY_MBE_ERR_SMASK 0x40000000000ull
+#define DCC_ERR_FLG_TX_SC_PARITY_ERR_SMASK 0x80000000000ull
+#define DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK 0x2000ull
+#define DCC_ERR_FLG_UNSUP_PKT_TYPE_SMASK 0x8000ull
+#define DCC_ERR_FLG_UNSUP_VL_ERR_SMASK 0x8000000ull
+#define DCC_ERR_FLG_VL15_MULTI_ERR_SMASK 0x2000000ull
+#define DCC_ERR_FMCONFIG_ERR_CNT (DCC_CSRS + 0x000000000110)
+#define DCC_ERR_INFO_FMCONFIG (DCC_CSRS + 0x000000000090)
+#define DCC_ERR_INFO_PORTRCV (DCC_CSRS + 0x000000000078)
+#define DCC_ERR_INFO_PORTRCV_HDR0 (DCC_CSRS + 0x000000000080)
+#define DCC_ERR_INFO_PORTRCV_HDR1 (DCC_CSRS + 0x000000000088)
+#define DCC_ERR_INFO_UNCORRECTABLE (DCC_CSRS + 0x000000000098)
+#define DCC_ERR_PORTRCV_ERR_CNT (DCC_CSRS + 0x000000000108)
+#define DCC_ERR_RCVREMOTE_PHY_ERR_CNT (DCC_CSRS + 0x000000000118)
+#define DCC_ERR_UNCORRECTABLE_CNT (DCC_CSRS + 0x000000000100)
+#define DCC_PRF_PORT_MARK_FECN_CNT (DCC_CSRS + 0x000000000330)
+#define DCC_PRF_PORT_RCV_BECN_CNT (DCC_CSRS + 0x000000000290)
+#define DCC_PRF_PORT_RCV_BUBBLE_CNT (DCC_CSRS + 0x0000000002E0)
+#define DCC_PRF_PORT_RCV_CORRECTABLE_CNT (DCC_CSRS + 0x000000000140)
+#define DCC_PRF_PORT_RCV_DATA_CNT (DCC_CSRS + 0x000000000198)
+#define DCC_PRF_PORT_RCV_FECN_CNT (DCC_CSRS + 0x000000000240)
+#define DCC_PRF_PORT_RCV_MULTICAST_PKT_CNT (DCC_CSRS + 0x000000000130)
+#define DCC_PRF_PORT_RCV_PKTS_CNT (DCC_CSRS + 0x0000000001A8)
+#define DCC_PRF_PORT_VL_MARK_FECN_CNT (DCC_CSRS + 0x000000000338)
+#define DCC_PRF_PORT_VL_RCV_BECN_CNT (DCC_CSRS + 0x000000000298)
+#define DCC_PRF_PORT_VL_RCV_BUBBLE_CNT (DCC_CSRS + 0x0000000002E8)
+#define DCC_PRF_PORT_VL_RCV_DATA_CNT (DCC_CSRS + 0x0000000001B0)
+#define DCC_PRF_PORT_VL_RCV_FECN_CNT (DCC_CSRS + 0x000000000248)
+#define DCC_PRF_PORT_VL_RCV_PKTS_CNT (DCC_CSRS + 0x0000000001F8)
+#define DCC_PRF_PORT_XMIT_CORRECTABLE_CNT (DCC_CSRS + 0x000000000138)
+#define DCC_PRF_PORT_XMIT_DATA_CNT (DCC_CSRS + 0x000000000190)
+#define DCC_PRF_PORT_XMIT_MULTICAST_CNT (DCC_CSRS + 0x000000000128)
+#define DCC_PRF_PORT_XMIT_PKTS_CNT (DCC_CSRS + 0x0000000001A0)
+#define DCC_PRF_RX_FLOW_CRTL_CNT (DCC_CSRS + 0x000000000180)
+#define DCC_PRF_TX_FLOW_CRTL_CNT (DCC_CSRS + 0x000000000188)
+#define DC_DC8051_CFG_CSR_ACCESS_SEL (DC_8051_CSRS + 0x000000000110)
+#define DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK 0x2ull
+#define DC_DC8051_CFG_CSR_ACCESS_SEL_LCB_SMASK 0x1ull
+#define DC_DC8051_CFG_EXT_DEV_0 (DC_8051_CSRS + 0x000000000118)
+#define DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK 0x1ull
+#define DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT 8
+#define DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT 16
+#define DC_DC8051_CFG_EXT_DEV_1 (DC_8051_CSRS + 0x000000000120)
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_MASK 0xFFFFull
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT 16
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SMASK 0xFFFF0000ull
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK 0x1ull
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_MASK 0xFFull
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_SHIFT 8
+#define DC_DC8051_CFG_HOST_CMD_0 (DC_8051_CSRS + 0x000000000028)
+#define DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_MASK 0xFFFFFFFFFFFFull
+#define DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_SHIFT 16
+#define DC_DC8051_CFG_HOST_CMD_0_REQ_NEW_SMASK 0x1ull
+#define DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_MASK 0xFFull
+#define DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_SHIFT 8
+#define DC_DC8051_CFG_HOST_CMD_1 (DC_8051_CSRS + 0x000000000030)
+#define DC_DC8051_CFG_HOST_CMD_1_COMPLETED_SMASK 0x1ull
+#define DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_MASK 0xFFull
+#define DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_SHIFT 8
+#define DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_MASK 0xFFFFFFFFFFFFull
+#define DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_SHIFT 16
+#define DC_DC8051_CFG_LOCAL_GUID (DC_8051_CSRS + 0x000000000038)
+#define DC_DC8051_CFG_MODE (DC_8051_CSRS + 0x000000000070)
+#define DC_DC8051_CFG_RAM_ACCESS_CTRL (DC_8051_CSRS + 0x000000000008)
+#define DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK 0x7FFFull
+#define DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT 0
+#define DC_DC8051_CFG_RAM_ACCESS_CTRL_WRITE_ENA_SMASK 0x1000000ull
+#define DC_DC8051_CFG_RAM_ACCESS_CTRL_READ_ENA_SMASK 0x10000ull
+#define DC_DC8051_CFG_RAM_ACCESS_SETUP (DC_8051_CSRS + 0x000000000000)
+#define DC_DC8051_CFG_RAM_ACCESS_SETUP_AUTO_INCR_ADDR_SMASK 0x100ull
+#define DC_DC8051_CFG_RAM_ACCESS_SETUP_RAM_SEL_SMASK 0x1ull
+#define DC_DC8051_CFG_RAM_ACCESS_STATUS (DC_8051_CSRS + 0x000000000018)
+#define DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK 0x10000ull
+#define DC_DC8051_CFG_RAM_ACCESS_WR_DATA (DC_8051_CSRS + 0x000000000010)
+#define DC_DC8051_CFG_RAM_ACCESS_RD_DATA (DC_8051_CSRS + 0x000000000020)
+#define DC_DC8051_CFG_RST (DC_8051_CSRS + 0x000000000068)
+#define DC_DC8051_CFG_RST_CRAM_SMASK 0x2ull
+#define DC_DC8051_CFG_RST_DRAM_SMASK 0x4ull
+#define DC_DC8051_CFG_RST_IRAM_SMASK 0x8ull
+#define DC_DC8051_CFG_RST_M8051W_SMASK 0x1ull
+#define DC_DC8051_CFG_RST_SFR_SMASK 0x10ull
+#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051 (DC_8051_CSRS + 0x0000000000D8)
+#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_MASK 0xFFFFFFFFull
+#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_SHIFT 16
+#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_MASK 0xFFFFull
+#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_SHIFT 0
+#define DC_DC8051_ERR_CLR (DC_8051_CSRS + 0x0000000000E8)
+#define DC_DC8051_ERR_EN (DC_8051_CSRS + 0x0000000000F0)
+#define DC_DC8051_ERR_EN_LOST_8051_HEART_BEAT_SMASK 0x2ull
+#define DC_DC8051_ERR_FLG (DC_8051_CSRS + 0x0000000000E0)
+#define DC_DC8051_ERR_FLG_CRAM_MBE_SMASK 0x4ull
+#define DC_DC8051_ERR_FLG_CRAM_SBE_SMASK 0x8ull
+#define DC_DC8051_ERR_FLG_DRAM_MBE_SMASK 0x10ull
+#define DC_DC8051_ERR_FLG_DRAM_SBE_SMASK 0x20ull
+#define DC_DC8051_ERR_FLG_INVALID_CSR_ADDR_SMASK 0x400ull
+#define DC_DC8051_ERR_FLG_IRAM_MBE_SMASK 0x40ull
+#define DC_DC8051_ERR_FLG_IRAM_SBE_SMASK 0x80ull
+#define DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK 0x2ull
+#define DC_DC8051_ERR_FLG_SET_BY_8051_SMASK 0x1ull
+#define DC_DC8051_ERR_FLG_UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES_SMASK 0x100ull
+#define DC_DC8051_STS_CUR_STATE (DC_8051_CSRS + 0x000000000060)
+#define DC_DC8051_STS_CUR_STATE_FIRMWARE_MASK 0xFFull
+#define DC_DC8051_STS_CUR_STATE_FIRMWARE_SHIFT 16
+#define DC_DC8051_STS_CUR_STATE_PORT_MASK 0xFFull
+#define DC_DC8051_STS_CUR_STATE_PORT_SHIFT 0
+#define DC_DC8051_STS_LOCAL_FM_SECURITY (DC_8051_CSRS + 0x000000000050)
+#define DC_DC8051_STS_LOCAL_FM_SECURITY_DISABLED_MASK 0x1ull
+#define DC_DC8051_STS_REMOTE_FM_SECURITY (DC_8051_CSRS + 0x000000000058)
+#define DC_DC8051_STS_REMOTE_GUID (DC_8051_CSRS + 0x000000000040)
+#define DC_DC8051_STS_REMOTE_NODE_TYPE (DC_8051_CSRS + 0x000000000048)
+#define DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK 0x3ull
+#define DC_DC8051_STS_REMOTE_PORT_NO (DC_8051_CSRS + 0x000000000130)
+#define DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK 0xFFull
+#define DC_LCB_CFG_ALLOW_LINK_UP (DC_LCB_CSRS + 0x000000000128)
+#define DC_LCB_CFG_ALLOW_LINK_UP_VAL_SHIFT 0
+#define DC_LCB_CFG_CRC_MODE (DC_LCB_CSRS + 0x000000000058)
+#define DC_LCB_CFG_CRC_MODE_TX_VAL_SHIFT 0
+#define DC_LCB_CFG_IGNORE_LOST_RCLK (DC_LCB_CSRS + 0x000000000020)
+#define DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK 0x1ull
+#define DC_LCB_CFG_LANE_WIDTH (DC_LCB_CSRS + 0x000000000100)
+#define DC_LCB_CFG_LINK_KILL_EN (DC_LCB_CSRS + 0x000000000120)
+#define DC_LCB_CFG_LINK_KILL_EN_FLIT_INPUT_BUF_MBE_SMASK 0x100000ull
+#define DC_LCB_CFG_LINK_KILL_EN_REPLAY_BUF_MBE_SMASK 0x400000ull
+#define DC_LCB_CFG_LN_DCLK (DC_LCB_CSRS + 0x000000000060)
+#define DC_LCB_CFG_LOOPBACK (DC_LCB_CSRS + 0x0000000000F8)
+#define DC_LCB_CFG_LOOPBACK_VAL_SHIFT 0
+#define DC_LCB_CFG_RUN (DC_LCB_CSRS + 0x000000000000)
+#define DC_LCB_CFG_RUN_EN_SHIFT 0
+#define DC_LCB_CFG_RX_FIFOS_RADR (DC_LCB_CSRS + 0x000000000018)
+#define DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT 8
+#define DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT 4
+#define DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT 0
+#define DC_LCB_CFG_TX_FIFOS_RADR (DC_LCB_CSRS + 0x000000000010)
+#define DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT 0
+#define DC_LCB_CFG_TX_FIFOS_RESET (DC_LCB_CSRS + 0x000000000008)
+#define DC_LCB_CFG_TX_FIFOS_RESET_VAL_SHIFT 0
+#define DC_LCB_CFG_REINIT_AS_SLAVE (DC_LCB_CSRS + 0x000000000030)
+#define DC_LCB_CFG_CNT_FOR_SKIP_STALL (DC_LCB_CSRS + 0x000000000040)
+#define DC_LCB_CFG_CLK_CNTR (DC_LCB_CSRS + 0x000000000110)
+#define DC_LCB_ERR_CLR (DC_LCB_CSRS + 0x000000000308)
+#define DC_LCB_ERR_EN (DC_LCB_CSRS + 0x000000000310)
+#define DC_LCB_ERR_FLG (DC_LCB_CSRS + 0x000000000300)
+#define DC_LCB_ERR_FLG_REDUNDANT_FLIT_PARITY_ERR_SMASK 0x20000000ull
+#define DC_LCB_ERR_FLG_NEG_EDGE_LINK_TRANSFER_ACTIVE_SMASK 0x10000000ull
+#define DC_LCB_ERR_FLG_HOLD_REINIT_SMASK 0x8000000ull
+#define DC_LCB_ERR_FLG_RST_FOR_INCOMPLT_RND_TRIP_SMASK 0x4000000ull
+#define DC_LCB_ERR_FLG_RST_FOR_LINK_TIMEOUT_SMASK 0x2000000ull
+#define DC_LCB_ERR_FLG_CREDIT_RETURN_FLIT_MBE_SMASK 0x1000000ull
+#define DC_LCB_ERR_FLG_REPLAY_BUF_SBE_SMASK 0x800000ull
+#define DC_LCB_ERR_FLG_REPLAY_BUF_MBE_SMASK 0x400000ull
+#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_SBE_SMASK 0x200000ull
+#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_MBE_SMASK 0x100000ull
+#define DC_LCB_ERR_FLG_VL_ACK_INPUT_WRONG_CRC_MODE_SMASK 0x80000ull
+#define DC_LCB_ERR_FLG_VL_ACK_INPUT_PARITY_ERR_SMASK 0x40000ull
+#define DC_LCB_ERR_FLG_VL_ACK_INPUT_BUF_OFLW_SMASK 0x20000ull
+#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_OFLW_SMASK 0x10000ull
+#define DC_LCB_ERR_FLG_ILLEGAL_FLIT_ENCODING_SMASK 0x8000ull
+#define DC_LCB_ERR_FLG_ILLEGAL_NULL_LTP_SMASK 0x4000ull
+#define DC_LCB_ERR_FLG_UNEXPECTED_ROUND_TRIP_MARKER_SMASK 0x2000ull
+#define DC_LCB_ERR_FLG_UNEXPECTED_REPLAY_MARKER_SMASK 0x1000ull
+#define DC_LCB_ERR_FLG_RCLK_STOPPED_SMASK 0x800ull
+#define DC_LCB_ERR_FLG_CRC_ERR_CNT_HIT_LIMIT_SMASK 0x400ull
+#define DC_LCB_ERR_FLG_REINIT_FOR_LN_DEGRADE_SMASK 0x200ull
+#define DC_LCB_ERR_FLG_REINIT_FROM_PEER_SMASK 0x100ull
+#define DC_LCB_ERR_FLG_SEQ_CRC_ERR_SMASK 0x80ull
+#define DC_LCB_ERR_FLG_RX_LESS_THAN_FOUR_LNS_SMASK 0x40ull
+#define DC_LCB_ERR_FLG_TX_LESS_THAN_FOUR_LNS_SMASK 0x20ull
+#define DC_LCB_ERR_FLG_LOST_REINIT_STALL_OR_TOS_SMASK 0x10ull
+#define DC_LCB_ERR_FLG_ALL_LNS_FAILED_REINIT_TEST_SMASK 0x8ull
+#define DC_LCB_ERR_FLG_RST_FOR_FAILED_DESKEW_SMASK 0x4ull
+#define DC_LCB_ERR_FLG_INVALID_CSR_ADDR_SMASK 0x2ull
+#define DC_LCB_ERR_FLG_CSR_PARITY_ERR_SMASK 0x1ull
+#define DC_LCB_ERR_INFO_CRC_ERR_LN0 (DC_LCB_CSRS + 0x000000000328)
+#define DC_LCB_ERR_INFO_CRC_ERR_LN1 (DC_LCB_CSRS + 0x000000000330)
+#define DC_LCB_ERR_INFO_CRC_ERR_LN2 (DC_LCB_CSRS + 0x000000000338)
+#define DC_LCB_ERR_INFO_CRC_ERR_LN3 (DC_LCB_CSRS + 0x000000000340)
+#define DC_LCB_ERR_INFO_CRC_ERR_MULTI_LN (DC_LCB_CSRS + 0x000000000348)
+#define DC_LCB_ERR_INFO_ESCAPE_0_ONLY_CNT (DC_LCB_CSRS + 0x000000000368)
+#define DC_LCB_ERR_INFO_ESCAPE_0_PLUS1_CNT (DC_LCB_CSRS + 0x000000000370)
+#define DC_LCB_ERR_INFO_ESCAPE_0_PLUS2_CNT (DC_LCB_CSRS + 0x000000000378)
+#define DC_LCB_ERR_INFO_MISC_FLG_CNT (DC_LCB_CSRS + 0x000000000390)
+#define DC_LCB_ERR_INFO_REINIT_FROM_PEER_CNT (DC_LCB_CSRS + 0x000000000380)
+#define DC_LCB_ERR_INFO_RX_REPLAY_CNT (DC_LCB_CSRS + 0x000000000358)
+#define DC_LCB_ERR_INFO_SBE_CNT (DC_LCB_CSRS + 0x000000000388)
+#define DC_LCB_ERR_INFO_SEQ_CRC_CNT (DC_LCB_CSRS + 0x000000000360)
+#define DC_LCB_ERR_INFO_TOTAL_CRC_ERR (DC_LCB_CSRS + 0x000000000320)
+#define DC_LCB_ERR_INFO_TX_REPLAY_CNT (DC_LCB_CSRS + 0x000000000350)
+#define DC_LCB_PG_DBG_FLIT_CRDTS_CNT (DC_LCB_CSRS + 0x000000000580)
+#define DC_LCB_PG_STS_PAUSE_COMPLETE_CNT (DC_LCB_CSRS + 0x0000000005F8)
+#define DC_LCB_PG_STS_TX_MBE_CNT (DC_LCB_CSRS + 0x000000000608)
+#define DC_LCB_PG_STS_TX_SBE_CNT (DC_LCB_CSRS + 0x000000000600)
+#define DC_LCB_PRF_ACCEPTED_LTP_CNT (DC_LCB_CSRS + 0x000000000408)
+#define DC_LCB_PRF_CLK_CNTR (DC_LCB_CSRS + 0x000000000420)
+#define DC_LCB_PRF_GOOD_LTP_CNT (DC_LCB_CSRS + 0x000000000400)
+#define DC_LCB_PRF_RX_FLIT_CNT (DC_LCB_CSRS + 0x000000000410)
+#define DC_LCB_PRF_TX_FLIT_CNT (DC_LCB_CSRS + 0x000000000418)
+#define DC_LCB_STS_LINK_TRANSFER_ACTIVE (DC_LCB_CSRS + 0x000000000468)
+#define DC_LCB_STS_ROUND_TRIP_LTP_CNT (DC_LCB_CSRS + 0x0000000004B0)
+#define RCV_BUF_OVFL_CNT 10
+#define RCV_CONTEXT_EGR_STALL 22
+#define RCV_DATA_PKT_CNT 0
+#define RCV_DWORD_CNT 1
+#define RCV_TID_FLOW_GEN_MISMATCH_CNT 20
+#define RCV_TID_FLOW_SEQ_MISMATCH_CNT 23
+#define RCV_TID_FULL_ERR_CNT 18
+#define RCV_TID_VALID_ERR_CNT 19
+#define RXE_NUM_32_BIT_COUNTERS 24
+#define RXE_NUM_64_BIT_COUNTERS 2
+#define RXE_NUM_RSM_INSTANCES 4
+#define RXE_NUM_TID_FLOWS 32
+#define RXE_PER_CONTEXT_OFFSET 0x0300000
+#define SEND_DATA_PKT_CNT 0
+#define SEND_DATA_PKT_VL0_CNT 12
+#define SEND_DATA_VL0_CNT 3
+#define SEND_DROPPED_PKT_CNT 5
+#define SEND_DWORD_CNT 1
+#define SEND_FLOW_STALL_CNT 4
+#define SEND_HEADERS_ERR_CNT 6
+#define SEND_LEN_ERR_CNT 1
+#define SEND_MAX_MIN_LEN_ERR_CNT 2
+#define SEND_UNDERRUN_CNT 3
+#define SEND_UNSUP_VL_ERR_CNT 0
+#define SEND_WAIT_CNT 2
+#define SEND_WAIT_VL0_CNT 21
+#define TXE_PIO_SEND_OFFSET 0x0800000
+#define ASIC_CFG_DRV_STR (ASIC + 0x000000000048)
+#define ASIC_CFG_MUTEX (ASIC + 0x000000000040)
+#define ASIC_CFG_SBUS_EXECUTE (ASIC + 0x000000000008)
+#define ASIC_CFG_SBUS_EXECUTE_EXECUTE_SMASK 0x1ull
+#define ASIC_CFG_SBUS_EXECUTE_FAST_MODE_SMASK 0x2ull
+#define ASIC_CFG_SBUS_REQUEST (ASIC + 0x000000000000)
+#define ASIC_CFG_SBUS_REQUEST_COMMAND_SHIFT 16
+#define ASIC_CFG_SBUS_REQUEST_DATA_ADDR_SHIFT 8
+#define ASIC_CFG_SBUS_REQUEST_DATA_IN_SHIFT 32
+#define ASIC_CFG_SBUS_REQUEST_RECEIVER_ADDR_SHIFT 0
+#define ASIC_CFG_SCRATCH (ASIC + 0x000000000020)
+#define ASIC_CFG_THERM_POLL_EN (ASIC + 0x000000000050)
+#define ASIC_EEP_ADDR_CMD (ASIC + 0x000000000308)
+#define ASIC_EEP_ADDR_CMD_EP_ADDR_MASK 0xFFFFFFull
+#define ASIC_EEP_CTL_STAT (ASIC + 0x000000000300)
+#define ASIC_EEP_CTL_STAT_EP_RESET_SMASK 0x4ull
+#define ASIC_EEP_CTL_STAT_RATE_SPI_SHIFT 8
+#define ASIC_EEP_CTL_STAT_RESETCSR 0x0000000083818000ull
+#define ASIC_EEP_DATA (ASIC + 0x000000000310)
+#define ASIC_GPIO_CLEAR (ASIC + 0x000000000230)
+#define ASIC_GPIO_FORCE (ASIC + 0x000000000238)
+#define ASIC_GPIO_IN (ASIC + 0x000000000200)
+#define ASIC_GPIO_INVERT (ASIC + 0x000000000210)
+#define ASIC_GPIO_MASK (ASIC + 0x000000000220)
+#define ASIC_GPIO_OE (ASIC + 0x000000000208)
+#define ASIC_GPIO_OUT (ASIC + 0x000000000218)
+#define ASIC_PCIE_SD_HOST_CMD (ASIC + 0x000000000100)
+#define ASIC_PCIE_SD_HOST_CMD_INTRPT_CMD_SHIFT 0
+#define ASIC_PCIE_SD_HOST_CMD_SBR_MODE_SMASK 0x400ull
+#define ASIC_PCIE_SD_HOST_CMD_SBUS_RCVR_ADDR_SHIFT 2
+#define ASIC_PCIE_SD_HOST_CMD_TIMER_MASK 0xFFFFFull
+#define ASIC_PCIE_SD_HOST_CMD_TIMER_SHIFT 12
+#define ASIC_PCIE_SD_HOST_STATUS (ASIC + 0x000000000108)
+#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_MASK 0x7ull
+#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_SHIFT 2
+#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_MASK 0x3ull
+#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_SHIFT 0
+#define ASIC_PCIE_SD_INTRPT_DATA_CODE (ASIC + 0x000000000110)
+#define ASIC_PCIE_SD_INTRPT_ENABLE (ASIC + 0x000000000118)
+#define ASIC_PCIE_SD_INTRPT_LIST (ASIC + 0x000000000180)
+#define ASIC_PCIE_SD_INTRPT_LIST_INTRPT_CODE_SHIFT 16
+#define ASIC_PCIE_SD_INTRPT_LIST_INTRPT_DATA_SHIFT 0
+#define ASIC_PCIE_SD_INTRPT_STATUS (ASIC + 0x000000000128)
+#define ASIC_QSFP1_CLEAR (ASIC + 0x000000000270)
+#define ASIC_QSFP1_FORCE (ASIC + 0x000000000278)
+#define ASIC_QSFP1_IN (ASIC + 0x000000000240)
+#define ASIC_QSFP1_INVERT (ASIC + 0x000000000250)
+#define ASIC_QSFP1_MASK (ASIC + 0x000000000260)
+#define ASIC_QSFP1_OE (ASIC + 0x000000000248)
+#define ASIC_QSFP1_OUT (ASIC + 0x000000000258)
+#define ASIC_QSFP1_STATUS (ASIC + 0x000000000268)
+#define ASIC_QSFP2_CLEAR (ASIC + 0x0000000002B0)
+#define ASIC_QSFP2_FORCE (ASIC + 0x0000000002B8)
+#define ASIC_QSFP2_IN (ASIC + 0x000000000280)
+#define ASIC_QSFP2_INVERT (ASIC + 0x000000000290)
+#define ASIC_QSFP2_MASK (ASIC + 0x0000000002A0)
+#define ASIC_QSFP2_OE (ASIC + 0x000000000288)
+#define ASIC_QSFP2_OUT (ASIC + 0x000000000298)
+#define ASIC_QSFP2_STATUS (ASIC + 0x0000000002A8)
+#define ASIC_STS_SBUS_COUNTERS (ASIC + 0x000000000018)
+#define ASIC_STS_SBUS_COUNTERS_EXECUTE_CNT_MASK 0xFFFFull
+#define ASIC_STS_SBUS_COUNTERS_EXECUTE_CNT_SHIFT 0
+#define ASIC_STS_SBUS_COUNTERS_RCV_DATA_VALID_CNT_MASK 0xFFFFull
+#define ASIC_STS_SBUS_COUNTERS_RCV_DATA_VALID_CNT_SHIFT 16
+#define ASIC_STS_SBUS_RESULT (ASIC + 0x000000000010)
+#define ASIC_STS_SBUS_RESULT_DONE_SMASK 0x1ull
+#define ASIC_STS_SBUS_RESULT_RCV_DATA_VALID_SMASK 0x2ull
+#define ASIC_STS_THERM (ASIC + 0x000000000058)
+#define ASIC_STS_THERM_CRIT_TEMP_MASK 0x7FFull
+#define ASIC_STS_THERM_CRIT_TEMP_SHIFT 18
+#define ASIC_STS_THERM_CURR_TEMP_MASK 0x7FFull
+#define ASIC_STS_THERM_CURR_TEMP_SHIFT 2
+#define ASIC_STS_THERM_HI_TEMP_MASK 0x7FFull
+#define ASIC_STS_THERM_HI_TEMP_SHIFT 50
+#define ASIC_STS_THERM_LO_TEMP_MASK 0x7FFull
+#define ASIC_STS_THERM_LO_TEMP_SHIFT 34
+#define ASIC_STS_THERM_LOW_SHIFT 13
+#define CCE_COUNTER_ARRAY32 (CCE + 0x000000000060)
+#define CCE_CTRL (CCE + 0x000000000010)
+#define CCE_CTRL_RXE_RESUME_SMASK 0x800ull
+#define CCE_CTRL_SPC_FREEZE_SMASK 0x100ull
+#define CCE_CTRL_SPC_UNFREEZE_SMASK 0x200ull
+#define CCE_CTRL_TXE_RESUME_SMASK 0x2000ull
+#define CCE_DC_CTRL (CCE + 0x0000000000B8)
+#define CCE_DC_CTRL_DC_RESET_SMASK 0x1ull
+#define CCE_DC_CTRL_RESETCSR 0x0000000000000001ull
+#define CCE_ERR_CLEAR (CCE + 0x000000000050)
+#define CCE_ERR_MASK (CCE + 0x000000000048)
+#define CCE_ERR_STATUS (CCE + 0x000000000040)
+#define CCE_ERR_STATUS_CCE_CLI0_ASYNC_FIFO_PARITY_ERR_SMASK 0x40ull
+#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERROR_SMASK 0x1000ull
+#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR_SMASK \
+               0x200ull
+#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERROR_SMASK \
+               0x800ull
+#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR_SMASK \
+               0x400ull
+#define CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK 0x100ull
+#define CCE_ERR_STATUS_CCE_CSR_CFG_BUS_PARITY_ERR_SMASK 0x80ull
+#define CCE_ERR_STATUS_CCE_CSR_PARITY_ERR_SMASK 0x1ull
+#define CCE_ERR_STATUS_CCE_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull
+#define CCE_ERR_STATUS_CCE_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull
+#define CCE_ERR_STATUS_CCE_INT_MAP_COR_ERR_SMASK 0x4000000000ull
+#define CCE_ERR_STATUS_CCE_INT_MAP_UNC_ERR_SMASK 0x8000000000ull
+#define CCE_ERR_STATUS_CCE_MSIX_CSR_PARITY_ERR_SMASK 0x10000000000ull
+#define CCE_ERR_STATUS_CCE_MSIX_TABLE_COR_ERR_SMASK 0x1000000000ull
+#define CCE_ERR_STATUS_CCE_MSIX_TABLE_UNC_ERR_SMASK 0x2000000000ull
+#define CCE_ERR_STATUS_CCE_RCPL_ASYNC_FIFO_PARITY_ERR_SMASK 0x400000000ull
+#define CCE_ERR_STATUS_CCE_RSPD_DATA_PARITY_ERR_SMASK 0x20ull
+#define CCE_ERR_STATUS_CCE_RXDMA_CONV_FIFO_PARITY_ERR_SMASK 0x800000000ull
+#define CCE_ERR_STATUS_CCE_SEG_READ_BAD_ADDR_ERR_SMASK 0x100000000ull
+#define CCE_ERR_STATUS_CCE_SEG_WRITE_BAD_ADDR_ERR_SMASK 0x200000000ull
+#define CCE_ERR_STATUS_CCE_TRGT_ACCESS_ERR_SMASK 0x10ull
+#define CCE_ERR_STATUS_CCE_TRGT_ASYNC_FIFO_PARITY_ERR_SMASK 0x8ull
+#define CCE_ERR_STATUS_CCE_TRGT_CPL_TIMEOUT_ERR_SMASK 0x40000000ull
+#define CCE_ERR_STATUS_LA_TRIGGERED_SMASK 0x80000000ull
+#define CCE_ERR_STATUS_PCIC_CPL_DAT_QCOR_ERR_SMASK 0x40000ull
+#define CCE_ERR_STATUS_PCIC_CPL_DAT_QUNC_ERR_SMASK 0x4000000ull
+#define CCE_ERR_STATUS_PCIC_CPL_HD_QCOR_ERR_SMASK 0x20000ull
+#define CCE_ERR_STATUS_PCIC_CPL_HD_QUNC_ERR_SMASK 0x2000000ull
+#define CCE_ERR_STATUS_PCIC_NPOST_DAT_QPARITY_ERR_SMASK 0x100000ull
+#define CCE_ERR_STATUS_PCIC_NPOST_HQ_PARITY_ERR_SMASK 0x80000ull
+#define CCE_ERR_STATUS_PCIC_POST_DAT_QCOR_ERR_SMASK 0x10000ull
+#define CCE_ERR_STATUS_PCIC_POST_DAT_QUNC_ERR_SMASK 0x1000000ull
+#define CCE_ERR_STATUS_PCIC_POST_HD_QCOR_ERR_SMASK 0x8000ull
+#define CCE_ERR_STATUS_PCIC_POST_HD_QUNC_ERR_SMASK 0x800000ull
+#define CCE_ERR_STATUS_PCIC_RECEIVE_PARITY_ERR_SMASK 0x20000000ull
+#define CCE_ERR_STATUS_PCIC_RETRY_MEM_COR_ERR_SMASK 0x2000ull
+#define CCE_ERR_STATUS_PCIC_RETRY_MEM_UNC_ERR_SMASK 0x200000ull
+#define CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_COR_ERR_SMASK 0x4000ull
+#define CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_UNC_ERR_SMASK 0x400000ull
+#define CCE_ERR_STATUS_PCIC_TRANSMIT_BACK_PARITY_ERR_SMASK 0x10000000ull
+#define CCE_ERR_STATUS_PCIC_TRANSMIT_FRONT_PARITY_ERR_SMASK 0x8000000ull
+#define CCE_INT_CLEAR (CCE + 0x000000110A00)
+#define CCE_INT_COUNTER_ARRAY32 (CCE + 0x000000110D00)
+#define CCE_INT_FORCE (CCE + 0x000000110B00)
+#define CCE_INT_MAP (CCE + 0x000000110500)
+#define CCE_INT_MASK (CCE + 0x000000110900)
+#define CCE_INT_STATUS (CCE + 0x000000110800)
+#define CCE_MSIX_INT_GRANTED (CCE + 0x000000110200)
+#define CCE_MSIX_TABLE_LOWER (CCE + 0x000000100000)
+#define CCE_MSIX_TABLE_UPPER (CCE + 0x000000100008)
+#define CCE_MSIX_TABLE_UPPER_RESETCSR 0x0000000100000000ull
+#define CCE_MSIX_VEC_CLR_WITHOUT_INT (CCE + 0x000000110400)
+#define CCE_PCIE_CTRL (CCE + 0x0000000000C0)
+#define CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_MASK 0x3ull
+#define CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_SHIFT 0
+#define CCE_PCIE_CTRL_PCIE_LANE_DELAY_MASK 0xFull
+#define CCE_PCIE_CTRL_PCIE_LANE_DELAY_SHIFT 2
+#define CCE_PCIE_CTRL_XMT_MARGIN_OVERWRITE_ENABLE_SHIFT 8
+#define CCE_PCIE_CTRL_XMT_MARGIN_SHIFT 9
+#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_MASK 0x1ull
+#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_SHIFT 12
+#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_MASK 0x7ull
+#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_SHIFT 13
+#define CCE_REVISION (CCE + 0x000000000000)
+#define CCE_REVISION2 (CCE + 0x000000000008)
+#define CCE_REVISION2_HFI_ID_MASK 0x1ull
+#define CCE_REVISION2_HFI_ID_SHIFT 0
+#define CCE_REVISION2_IMPL_CODE_SHIFT 8
+#define CCE_REVISION2_IMPL_REVISION_SHIFT 16
+#define CCE_REVISION_BOARD_ID_LOWER_NIBBLE_MASK 0xFull
+#define CCE_REVISION_BOARD_ID_LOWER_NIBBLE_SHIFT 32
+#define CCE_REVISION_CHIP_REV_MAJOR_MASK 0xFFull
+#define CCE_REVISION_CHIP_REV_MAJOR_SHIFT 8
+#define CCE_REVISION_CHIP_REV_MINOR_MASK 0xFFull
+#define CCE_REVISION_CHIP_REV_MINOR_SHIFT 0
+#define CCE_REVISION_SW_MASK 0xFFull
+#define CCE_REVISION_SW_SHIFT 24
+#define CCE_SCRATCH (CCE + 0x000000000020)
+#define CCE_STATUS (CCE + 0x000000000018)
+#define CCE_STATUS_RXE_FROZE_SMASK 0x2ull
+#define CCE_STATUS_RXE_PAUSED_SMASK 0x20ull
+#define CCE_STATUS_SDMA_FROZE_SMASK 0x1ull
+#define CCE_STATUS_SDMA_PAUSED_SMASK 0x10ull
+#define CCE_STATUS_TXE_FROZE_SMASK 0x4ull
+#define CCE_STATUS_TXE_PAUSED_SMASK 0x40ull
+#define CCE_STATUS_TXE_PIO_FROZE_SMASK 0x8ull
+#define CCE_STATUS_TXE_PIO_PAUSED_SMASK 0x80ull
+#define MISC_CFG_FW_CTRL (MISC + 0x000000001000)
+#define MISC_CFG_FW_CTRL_FW_8051_LOADED_SMASK 0x2ull
+#define MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT 2
+#define MISC_CFG_FW_CTRL_RSA_STATUS_SMASK 0xCull
+#define MISC_CFG_RSA_CMD (MISC + 0x000000000A08)
+#define MISC_CFG_RSA_MODULUS (MISC + 0x000000000400)
+#define MISC_CFG_RSA_MU (MISC + 0x000000000A10)
+#define MISC_CFG_RSA_R2 (MISC + 0x000000000000)
+#define MISC_CFG_RSA_SIGNATURE (MISC + 0x000000000200)
+#define MISC_CFG_SHA_PRELOAD (MISC + 0x000000000A00)
+#define MISC_ERR_CLEAR (MISC + 0x000000002010)
+#define MISC_ERR_MASK (MISC + 0x000000002008)
+#define MISC_ERR_STATUS (MISC + 0x000000002000)
+#define MISC_ERR_STATUS_MISC_PLL_LOCK_FAIL_ERR_SMASK 0x1000ull
+#define MISC_ERR_STATUS_MISC_MBIST_FAIL_ERR_SMASK 0x800ull
+#define MISC_ERR_STATUS_MISC_INVALID_EEP_CMD_ERR_SMASK 0x400ull
+#define MISC_ERR_STATUS_MISC_EFUSE_DONE_PARITY_ERR_SMASK 0x200ull
+#define MISC_ERR_STATUS_MISC_EFUSE_WRITE_ERR_SMASK 0x100ull
+#define MISC_ERR_STATUS_MISC_EFUSE_READ_BAD_ADDR_ERR_SMASK 0x80ull
+#define MISC_ERR_STATUS_MISC_EFUSE_CSR_PARITY_ERR_SMASK 0x40ull
+#define MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK 0x20ull
+#define MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK 0x10ull
+#define MISC_ERR_STATUS_MISC_SBUS_WRITE_FAILED_ERR_SMASK 0x8ull
+#define MISC_ERR_STATUS_MISC_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull
+#define MISC_ERR_STATUS_MISC_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull
+#define MISC_ERR_STATUS_MISC_CSR_PARITY_ERR_SMASK 0x1ull
+#define PCI_CFG_MSIX0 (PCIE + 0x0000000000B0)
+#define PCI_CFG_REG1 (PCIE + 0x000000000004)
+#define PCI_CFG_REG11 (PCIE + 0x00000000002C)
+#define PCIE_CFG_SPCIE1 (PCIE + 0x00000000014C)
+#define PCIE_CFG_SPCIE2 (PCIE + 0x000000000150)
+#define PCIE_CFG_TPH2 (PCIE + 0x000000000180)
+#define RCV_ARRAY (RXE + 0x000000200000)
+#define RCV_ARRAY_CNT (RXE + 0x000000000018)
+#define RCV_ARRAY_RT_ADDR_MASK 0xFFFFFFFFFull
+#define RCV_ARRAY_RT_ADDR_SHIFT 0
+#define RCV_ARRAY_RT_BUF_SIZE_SHIFT 36
+#define RCV_ARRAY_RT_WRITE_ENABLE_SMASK 0x8000000000000000ull
+#define RCV_AVAIL_TIME_OUT (RXE + 0x000000100050)
+#define RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK 0xFFull
+#define RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT 0
+#define RCV_BTH_QP (RXE + 0x000000000028)
+#define RCV_BTH_QP_KDETH_QP_MASK 0xFFull
+#define RCV_BTH_QP_KDETH_QP_SHIFT 16
+#define RCV_BYPASS (RXE + 0x000000000038)
+#define RCV_CONTEXTS (RXE + 0x000000000010)
+#define RCV_COUNTER_ARRAY32 (RXE + 0x000000000400)
+#define RCV_COUNTER_ARRAY64 (RXE + 0x000000000500)
+#define RCV_CTRL (RXE + 0x000000000000)
+#define RCV_CTRL_RCV_BYPASS_ENABLE_SMASK 0x10ull
+#define RCV_CTRL_RCV_EXTENDED_PSN_ENABLE_SMASK 0x40ull
+#define RCV_CTRL_RCV_PARTITION_KEY_ENABLE_SMASK 0x4ull
+#define RCV_CTRL_RCV_PORT_ENABLE_SMASK 0x1ull
+#define RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK 0x2ull
+#define RCV_CTRL_RCV_RSM_ENABLE_SMASK 0x20ull
+#define RCV_CTRL_RX_RBUF_INIT_SMASK 0x200ull
+#define RCV_CTXT_CTRL (RXE + 0x000000100000)
+#define RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK 0x4ull
+#define RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK 0x8ull
+#define RCV_CTXT_CTRL_EGR_BUF_SIZE_MASK 0x7ull
+#define RCV_CTXT_CTRL_EGR_BUF_SIZE_SHIFT 8
+#define RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK 0x700ull
+#define RCV_CTXT_CTRL_ENABLE_SMASK 0x1ull
+#define RCV_CTXT_CTRL_INTR_AVAIL_SMASK 0x20ull
+#define RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK 0x2ull
+#define RCV_CTXT_CTRL_TAIL_UPD_SMASK 0x40ull
+#define RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK 0x10ull
+#define RCV_CTXT_STATUS (RXE + 0x000000100008)
+#define RCV_EGR_CTRL (RXE + 0x000000100010)
+#define RCV_EGR_CTRL_EGR_BASE_INDEX_MASK 0x1FFFull
+#define RCV_EGR_CTRL_EGR_BASE_INDEX_SHIFT 0
+#define RCV_EGR_CTRL_EGR_CNT_MASK 0x1FFull
+#define RCV_EGR_CTRL_EGR_CNT_SHIFT 32
+#define RCV_EGR_INDEX_HEAD (RXE + 0x000000300018)
+#define RCV_EGR_INDEX_HEAD_HEAD_MASK 0x7FFull
+#define RCV_EGR_INDEX_HEAD_HEAD_SHIFT 0
+#define RCV_ERR_CLEAR (RXE + 0x000000000070)
+#define RCV_ERR_INFO (RXE + 0x000000000050)
+#define RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SC_SMASK 0x1Full
+#define RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK 0x20ull
+#define RCV_ERR_MASK (RXE + 0x000000000068)
+#define RCV_ERR_STATUS (RXE + 0x000000000060)
+#define RCV_ERR_STATUS_RX_CSR_PARITY_ERR_SMASK 0x8000000000000000ull
+#define RCV_ERR_STATUS_RX_CSR_READ_BAD_ADDR_ERR_SMASK 0x2000000000000000ull
+#define RCV_ERR_STATUS_RX_CSR_WRITE_BAD_ADDR_ERR_SMASK \
+               0x4000000000000000ull
+#define RCV_ERR_STATUS_RX_DC_INTF_PARITY_ERR_SMASK 0x2ull
+#define RCV_ERR_STATUS_RX_DC_SOP_EOP_PARITY_ERR_SMASK 0x200ull
+#define RCV_ERR_STATUS_RX_DMA_CSR_COR_ERR_SMASK 0x1ull
+#define RCV_ERR_STATUS_RX_DMA_CSR_PARITY_ERR_SMASK 0x200000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK 0x1000000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_COR_ERR_SMASK \
+               0x40000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK \
+               0x20000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_DQ_FSM_ENCODING_ERR_SMASK \
+               0x800000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_EQ_FSM_ENCODING_ERR_SMASK \
+               0x400000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_FLAG_COR_ERR_SMASK 0x800ull
+#define RCV_ERR_STATUS_RX_DMA_FLAG_UNC_ERR_SMASK 0x400ull
+#define RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_COR_ERR_SMASK 0x10000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK 0x8000000000000ull
+#define RCV_ERR_STATUS_RX_HQ_INTR_CSR_PARITY_ERR_SMASK 0x200000000000ull
+#define RCV_ERR_STATUS_RX_HQ_INTR_FSM_ERR_SMASK 0x400000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_CSR_PARITY_ERR_SMASK 0x100000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_COR_ERR_SMASK \
+               0x10000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_ERR_SMASK 0x8000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART2_PARITY_ERR_SMASK \
+               0x20000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_COR_ERR_SMASK 0x80000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_UNC_ERR_SMASK 0x40000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_BAD_LOOKUP_ERR_SMASK 0x40000000ull
+#define RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_COR_ERR_SMASK 0x100000ull
+#define RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_UNC_ERR_SMASK 0x80000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QENT_CNT_PARITY_ERR_SMASK 0x400000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QEOPDW_PARITY_ERR_SMASK 0x10000000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QHD_PTR_PARITY_ERR_SMASK 0x2000000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QHEAD_BUF_NUM_PARITY_ERR_SMASK \
+               0x200000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QNEXT_BUF_PARITY_ERR_SMASK 0x800000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QNUM_OF_PKT_PARITY_ERR_SMASK \
+               0x8000000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QTL_PTR_PARITY_ERR_SMASK 0x4000000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QVLD_BIT_PARITY_ERR_SMASK 0x1000000ull
+#define RCV_ERR_STATUS_RX_RBUF_CTX_ID_PARITY_ERR_SMASK 0x20000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DATA_COR_ERR_SMASK 0x100000000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DATA_UNC_ERR_SMASK 0x80000000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DESC_PART1_COR_ERR_SMASK 0x1000000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DESC_PART1_UNC_ERR_SMASK 0x800000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DESC_PART2_COR_ERR_SMASK 0x4000000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DESC_PART2_UNC_ERR_SMASK 0x2000000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_EMPTY_ERR_SMASK 0x100000000ull
+#define RCV_ERR_STATUS_RX_RBUF_FL_INITDONE_PARITY_ERR_SMASK 0x800000000ull
+#define RCV_ERR_STATUS_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR_SMASK \
+               0x1000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_FL_RD_ADDR_PARITY_ERR_SMASK 0x200000000ull
+#define RCV_ERR_STATUS_RX_RBUF_FL_WR_ADDR_PARITY_ERR_SMASK 0x400000000ull
+#define RCV_ERR_STATUS_RX_RBUF_FREE_LIST_COR_ERR_SMASK 0x4000ull
+#define RCV_ERR_STATUS_RX_RBUF_FREE_LIST_UNC_ERR_SMASK 0x2000ull
+#define RCV_ERR_STATUS_RX_RBUF_FULL_ERR_SMASK 0x80000000ull
+#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_COR_ERR_SMASK 0x40000ull
+#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR_SMASK 0x10000ull
+#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_ERR_SMASK 0x8000ull
+#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_UNC_ERR_SMASK 0x20000ull
+#define RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_COR_ERR_SMASK 0x4000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_UNC_ERR_SMASK 0x2000000000ull
+#define RCV_ERR_STATUS_RX_RCV_CSR_PARITY_ERR_SMASK 0x100ull
+#define RCV_ERR_STATUS_RX_RCV_DATA_COR_ERR_SMASK 0x20ull
+#define RCV_ERR_STATUS_RX_RCV_DATA_UNC_ERR_SMASK 0x10ull
+#define RCV_ERR_STATUS_RX_RCV_FSM_ENCODING_ERR_SMASK 0x1000ull
+#define RCV_ERR_STATUS_RX_RCV_HDR_COR_ERR_SMASK 0x8ull
+#define RCV_ERR_STATUS_RX_RCV_HDR_UNC_ERR_SMASK 0x4ull
+#define RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_COR_ERR_SMASK 0x80ull
+#define RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_UNC_ERR_SMASK 0x40ull
+#define RCV_HDR_ADDR (RXE + 0x000000100028)
+#define RCV_HDR_CNT (RXE + 0x000000100030)
+#define RCV_HDR_CNT_CNT_MASK 0x1FFull
+#define RCV_HDR_CNT_CNT_SHIFT 0
+#define RCV_HDR_ENT_SIZE (RXE + 0x000000100038)
+#define RCV_HDR_ENT_SIZE_ENT_SIZE_MASK 0x7ull
+#define RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT 0
+#define RCV_HDR_HEAD (RXE + 0x000000300008)
+#define RCV_HDR_HEAD_COUNTER_MASK 0xFFull
+#define RCV_HDR_HEAD_COUNTER_SHIFT 32
+#define RCV_HDR_HEAD_HEAD_MASK 0x7FFFFull
+#define RCV_HDR_HEAD_HEAD_SHIFT 0
+#define RCV_HDR_HEAD_HEAD_SMASK 0x7FFFFull
+#define RCV_HDR_OVFL_CNT (RXE + 0x000000100058)
+#define RCV_HDR_SIZE (RXE + 0x000000100040)
+#define RCV_HDR_SIZE_HDR_SIZE_MASK 0x1Full
+#define RCV_HDR_SIZE_HDR_SIZE_SHIFT 0
+#define RCV_HDR_TAIL (RXE + 0x000000300000)
+#define RCV_HDR_TAIL_ADDR (RXE + 0x000000100048)
+#define RCV_KEY_CTRL (RXE + 0x000000100020)
+#define RCV_KEY_CTRL_JOB_KEY_ENABLE_SMASK 0x200000000ull
+#define RCV_KEY_CTRL_JOB_KEY_VALUE_MASK 0xFFFFull
+#define RCV_KEY_CTRL_JOB_KEY_VALUE_SHIFT 0
+#define RCV_MULTICAST (RXE + 0x000000000030)
+#define RCV_PARTITION_KEY (RXE + 0x000000000200)
+#define RCV_PARTITION_KEY_PARTITION_KEY_A_MASK 0xFFFFull
+#define RCV_PARTITION_KEY_PARTITION_KEY_B_SHIFT 16
+#define RCV_QP_MAP_TABLE (RXE + 0x000000000100)
+#define RCV_RSM_CFG (RXE + 0x000000000600)
+#define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK 0x1ull
+#define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT 0
+#define RCV_RSM_CFG_PACKET_TYPE_SHIFT 60
+#define RCV_RSM_CFG_OFFSET_SHIFT 32
+#define RCV_RSM_MAP_TABLE (RXE + 0x000000000900)
+#define RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK 0xFFull
+#define RCV_RSM_MATCH (RXE + 0x000000000800)
+#define RCV_RSM_MATCH_MASK1_SHIFT 0
+#define RCV_RSM_MATCH_MASK2_SHIFT 16
+#define RCV_RSM_MATCH_VALUE1_SHIFT 8
+#define RCV_RSM_MATCH_VALUE2_SHIFT 24
+#define RCV_RSM_SELECT (RXE + 0x000000000700)
+#define RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT 0
+#define RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT 16
+#define RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT 32
+#define RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT 44
+#define RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT 48
+#define RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT 60
+#define RCV_STATUS (RXE + 0x000000000008)
+#define RCV_STATUS_RX_PKT_IN_PROGRESS_SMASK 0x1ull
+#define RCV_STATUS_RX_RBUF_INIT_DONE_SMASK 0x200ull
+#define RCV_STATUS_RX_RBUF_PKT_PENDING_SMASK 0x40ull
+#define RCV_TID_CTRL (RXE + 0x000000100018)
+#define RCV_TID_CTRL_TID_BASE_INDEX_MASK 0x1FFFull
+#define RCV_TID_CTRL_TID_BASE_INDEX_SHIFT 0
+#define RCV_TID_CTRL_TID_PAIR_CNT_MASK 0x1FFull
+#define RCV_TID_CTRL_TID_PAIR_CNT_SHIFT 32
+#define RCV_TID_FLOW_TABLE (RXE + 0x000000300800)
+#define RCV_VL15 (RXE + 0x000000000048)
+#define SEND_BTH_QP (TXE + 0x0000000000A0)
+#define SEND_BTH_QP_KDETH_QP_MASK 0xFFull
+#define SEND_BTH_QP_KDETH_QP_SHIFT 16
+#define SEND_CM_CREDIT_USED_STATUS (TXE + 0x000000000510)
+#define SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK \
+               0x1000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL15_RETURN_CREDIT_STATUS_SMASK \
+               0x8000000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL1_RETURN_CREDIT_STATUS_SMASK \
+               0x2000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL2_RETURN_CREDIT_STATUS_SMASK \
+               0x4000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL3_RETURN_CREDIT_STATUS_SMASK \
+               0x8000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL4_RETURN_CREDIT_STATUS_SMASK \
+               0x10000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL5_RETURN_CREDIT_STATUS_SMASK \
+               0x20000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL6_RETURN_CREDIT_STATUS_SMASK \
+               0x40000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL7_RETURN_CREDIT_STATUS_SMASK \
+               0x80000000000000ull
+#define SEND_CM_CREDIT_VL (TXE + 0x000000000600)
+#define SEND_CM_CREDIT_VL15 (TXE + 0x000000000678)
+#define SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT 0
+#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_MASK 0xFFFFull
+#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT 0
+#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SMASK 0xFFFFull
+#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_MASK 0xFFFFull
+#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT 16
+#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SMASK 0xFFFF0000ull
+#define SEND_CM_CTRL (TXE + 0x000000000500)
+#define SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK 0x8ull
+#define SEND_CM_CTRL_RESETCSR 0x0000000000000020ull
+#define SEND_CM_GLOBAL_CREDIT (TXE + 0x000000000508)
+#define SEND_CM_GLOBAL_CREDIT_AU_SHIFT 16
+#define SEND_CM_GLOBAL_CREDIT_RESETCSR 0x0000094000030000ull
+#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_MASK 0xFFFFull
+#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT 0
+#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SMASK 0xFFFFull
+#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_MASK 0xFFFFull
+#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT 32
+#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SMASK 0xFFFF00000000ull
+#define SEND_CM_LOCAL_AU_TABLE0_TO3 (TXE + 0x000000000520)
+#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE0_SHIFT 0
+#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE1_SHIFT 16
+#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE2_SHIFT 32
+#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE3_SHIFT 48
+#define SEND_CM_LOCAL_AU_TABLE4_TO7 (TXE + 0x000000000528)
+#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE4_SHIFT 0
+#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE5_SHIFT 16
+#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE6_SHIFT 32
+#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE7_SHIFT 48
+#define SEND_CM_REMOTE_AU_TABLE0_TO3 (TXE + 0x000000000530)
+#define SEND_CM_REMOTE_AU_TABLE4_TO7 (TXE + 0x000000000538)
+#define SEND_CM_TIMER_CTRL (TXE + 0x000000000518)
+#define SEND_CONTEXTS (TXE + 0x000000000010)
+#define SEND_CONTEXT_SET_CTRL (TXE + 0x000000000200)
+#define SEND_COUNTER_ARRAY32 (TXE + 0x000000000300)
+#define SEND_COUNTER_ARRAY64 (TXE + 0x000000000400)
+#define SEND_CTRL (TXE + 0x000000000000)
+#define SEND_CTRL_CM_RESET_SMASK 0x4ull
+#define SEND_CTRL_SEND_ENABLE_SMASK 0x1ull
+#define SEND_CTRL_VL_ARBITER_ENABLE_SMASK 0x2ull
+#define SEND_CTXT_CHECK_ENABLE (TXE + 0x000000100080)
+#define SEND_CTXT_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK 0x80ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_ENABLE_SMASK 0x1ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK 0x4ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_OPCODE_SMASK 0x20ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK 0x8ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_SLID_SMASK 0x10ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK 0x40ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_VL_SMASK 0x2ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK 0x20000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK \
+               0x200000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_SMASK 0x800ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_GRH_SMASK 0x400ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK 0x1000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_NON_KDETH_PACKETS_SMASK 0x2000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK \
+               0x100000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK 0x10000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK 0x200ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_SMASK 0x100ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK \
+               0x80000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK \
+               0x40000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK \
+               0x8000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK \
+               0x4000ull
+#define SEND_CTXT_CHECK_JOB_KEY (TXE + 0x000000100090)
+#define SEND_CTXT_CHECK_JOB_KEY_ALLOW_PERMISSIVE_SMASK 0x100000000ull
+#define SEND_CTXT_CHECK_JOB_KEY_MASK_SMASK 0xFFFF0000ull
+#define SEND_CTXT_CHECK_JOB_KEY_VALUE_MASK 0xFFFFull
+#define SEND_CTXT_CHECK_JOB_KEY_VALUE_SHIFT 0
+#define SEND_CTXT_CHECK_OPCODE (TXE + 0x0000001000A8)
+#define SEND_CTXT_CHECK_OPCODE_MASK_SHIFT 8
+#define SEND_CTXT_CHECK_OPCODE_VALUE_SHIFT 0
+#define SEND_CTXT_CHECK_PARTITION_KEY (TXE + 0x000000100098)
+#define SEND_CTXT_CHECK_PARTITION_KEY_VALUE_MASK 0xFFFFull
+#define SEND_CTXT_CHECK_PARTITION_KEY_VALUE_SHIFT 0
+#define SEND_CTXT_CHECK_SLID (TXE + 0x0000001000A0)
+#define SEND_CTXT_CHECK_SLID_MASK_MASK 0xFFFFull
+#define SEND_CTXT_CHECK_SLID_MASK_SHIFT 16
+#define SEND_CTXT_CHECK_SLID_VALUE_MASK 0xFFFFull
+#define SEND_CTXT_CHECK_SLID_VALUE_SHIFT 0
+#define SEND_CTXT_CHECK_VL (TXE + 0x000000100088)
+#define SEND_CTXT_CREDIT_CTRL (TXE + 0x000000100010)
+#define SEND_CTXT_CREDIT_CTRL_CREDIT_INTR_SMASK 0x20000ull
+#define SEND_CTXT_CREDIT_CTRL_EARLY_RETURN_SMASK 0x10000ull
+#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_MASK 0x7FFull
+#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_SHIFT 0
+#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_SMASK 0x7FFull
+#define SEND_CTXT_CREDIT_FORCE (TXE + 0x000000100028)
+#define SEND_CTXT_CREDIT_FORCE_FORCE_RETURN_SMASK 0x1ull
+#define SEND_CTXT_CREDIT_RETURN_ADDR (TXE + 0x000000100020)
+#define SEND_CTXT_CREDIT_RETURN_ADDR_ADDRESS_SMASK 0xFFFFFFFFFFC0ull
+#define SEND_CTXT_CTRL (TXE + 0x000000100000)
+#define SEND_CTXT_CTRL_CTXT_BASE_MASK 0x3FFFull
+#define SEND_CTXT_CTRL_CTXT_BASE_SHIFT 32
+#define SEND_CTXT_CTRL_CTXT_DEPTH_MASK 0x7FFull
+#define SEND_CTXT_CTRL_CTXT_DEPTH_SHIFT 48
+#define SEND_CTXT_CTRL_CTXT_ENABLE_SMASK 0x1ull
+#define SEND_CTXT_ERR_CLEAR (TXE + 0x000000100050)
+#define SEND_CTXT_ERR_MASK (TXE + 0x000000100048)
+#define SEND_CTXT_ERR_STATUS (TXE + 0x000000100040)
+#define SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK 0x2ull
+#define SEND_CTXT_ERR_STATUS_PIO_INCONSISTENT_SOP_ERR_SMASK 0x1ull
+#define SEND_CTXT_ERR_STATUS_PIO_WRITE_CROSSES_BOUNDARY_ERR_SMASK 0x4ull
+#define SEND_CTXT_ERR_STATUS_PIO_WRITE_OUT_OF_BOUNDS_ERR_SMASK 0x10ull
+#define SEND_CTXT_ERR_STATUS_PIO_WRITE_OVERFLOW_ERR_SMASK 0x8ull
+#define SEND_CTXT_STATUS (TXE + 0x000000100008)
+#define SEND_CTXT_STATUS_CTXT_HALTED_SMASK 0x1ull
+#define SEND_DMA_BASE_ADDR (TXE + 0x000000200010)
+#define SEND_DMA_CHECK_ENABLE (TXE + 0x000000200080)
+#define SEND_DMA_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK 0x80ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_ENABLE_SMASK 0x1ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK 0x4ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_OPCODE_SMASK 0x20ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK 0x8ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_SLID_SMASK 0x10ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK 0x40ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_VL_SMASK 0x2ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK 0x20000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK 0x200000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK \
+               0x100000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK 0x200ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_SMASK 0x100ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK \
+               0x80000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK 0x40000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK \
+               0x8000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK 0x4000ull
+#define SEND_DMA_CHECK_JOB_KEY (TXE + 0x000000200090)
+#define SEND_DMA_CHECK_OPCODE (TXE + 0x0000002000A8)
+#define SEND_DMA_CHECK_PARTITION_KEY (TXE + 0x000000200098)
+#define SEND_DMA_CHECK_SLID (TXE + 0x0000002000A0)
+#define SEND_DMA_CHECK_SLID_MASK_MASK 0xFFFFull
+#define SEND_DMA_CHECK_SLID_MASK_SHIFT 16
+#define SEND_DMA_CHECK_SLID_VALUE_MASK 0xFFFFull
+#define SEND_DMA_CHECK_SLID_VALUE_SHIFT 0
+#define SEND_DMA_CHECK_VL (TXE + 0x000000200088)
+#define SEND_DMA_CTRL (TXE + 0x000000200000)
+#define SEND_DMA_CTRL_SDMA_CLEANUP_SMASK 0x4ull
+#define SEND_DMA_CTRL_SDMA_ENABLE_SMASK 0x1ull
+#define SEND_DMA_CTRL_SDMA_HALT_SMASK 0x2ull
+#define SEND_DMA_CTRL_SDMA_INT_ENABLE_SMASK 0x8ull
+#define SEND_DMA_DESC_CNT (TXE + 0x000000200050)
+#define SEND_DMA_DESC_CNT_CNT_MASK 0xFFFFull
+#define SEND_DMA_DESC_CNT_CNT_SHIFT 0
+#define SEND_DMA_ENG_ERR_CLEAR (TXE + 0x000000200070)
+#define SEND_DMA_ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_MASK 0x1ull
+#define SEND_DMA_ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SHIFT 18
+#define SEND_DMA_ENG_ERR_MASK (TXE + 0x000000200068)
+#define SEND_DMA_ENG_ERR_STATUS (TXE + 0x000000200060)
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_ASSEMBLY_UNC_ERR_SMASK 0x8000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_DESC_TABLE_UNC_ERR_SMASK 0x4000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_FIRST_DESC_ERR_SMASK 0x10ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_GEN_MISMATCH_ERR_SMASK 0x2ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK 0x40ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_ADDRESS_ERR_SMASK 0x800ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_LENGTH_ERR_SMASK 0x1000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SMASK \
+               0x40000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_SELECT_ERR_SMASK 0x400ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_STORAGE_UNC_ERR_SMASK \
+               0x20000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_LENGTH_MISMATCH_ERR_SMASK 0x80ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_MEM_READ_ERR_SMASK 0x20ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_PACKET_DESC_OVERFLOW_ERR_SMASK \
+               0x100ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_PACKET_TRACKING_UNC_ERR_SMASK \
+               0x10000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_TAIL_OUT_OF_BOUNDS_ERR_SMASK 0x8ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_TIMEOUT_ERR_SMASK 0x2000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_TOO_LONG_ERR_SMASK 0x4ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_WRONG_DW_ERR_SMASK 0x1ull
+#define SEND_DMA_ENGINES (TXE + 0x000000000018)
+#define SEND_DMA_ERR_CLEAR (TXE + 0x000000000070)
+#define SEND_DMA_ERR_MASK (TXE + 0x000000000068)
+#define SEND_DMA_ERR_STATUS (TXE + 0x000000000060)
+#define SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK 0x2ull
+#define SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_COR_ERR_SMASK 0x8ull
+#define SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK 0x4ull
+#define SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK 0x1ull
+#define SEND_DMA_HEAD (TXE + 0x000000200028)
+#define SEND_DMA_HEAD_ADDR (TXE + 0x000000200030)
+#define SEND_DMA_LEN_GEN (TXE + 0x000000200018)
+#define SEND_DMA_LEN_GEN_GENERATION_SHIFT 16
+#define SEND_DMA_LEN_GEN_LENGTH_SHIFT 6
+#define SEND_DMA_MEMORY (TXE + 0x0000002000B0)
+#define SEND_DMA_MEMORY_SDMA_MEMORY_CNT_SHIFT 16
+#define SEND_DMA_MEMORY_SDMA_MEMORY_INDEX_SHIFT 0
+#define SEND_DMA_MEM_SIZE (TXE + 0x000000000028)
+#define SEND_DMA_PRIORITY_THLD (TXE + 0x000000200038)
+#define SEND_DMA_RELOAD_CNT (TXE + 0x000000200048)
+#define SEND_DMA_STATUS (TXE + 0x000000200008)
+#define SEND_DMA_STATUS_ENG_CLEANED_UP_SMASK 0x200000000000000ull
+#define SEND_DMA_STATUS_ENG_HALTED_SMASK 0x100000000000000ull
+#define SEND_DMA_TAIL (TXE + 0x000000200020)
+#define SEND_EGRESS_CTXT_STATUS (TXE + 0x000000000800)
+#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_HALT_STATUS_SMASK 0x10000ull
+#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SHIFT 0
+#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SMASK \
+               0x3FFFull
+#define SEND_EGRESS_ERR_CLEAR (TXE + 0x000000000090)
+#define SEND_EGRESS_ERR_INFO (TXE + 0x000000000F00)
+#define SEND_EGRESS_ERR_INFO_BAD_PKT_LEN_ERR_SMASK 0x20000ull
+#define SEND_EGRESS_ERR_INFO_BYPASS_ERR_SMASK 0x800ull
+#define SEND_EGRESS_ERR_INFO_GRH_ERR_SMASK 0x400ull
+#define SEND_EGRESS_ERR_INFO_JOB_KEY_ERR_SMASK 0x4ull
+#define SEND_EGRESS_ERR_INFO_KDETH_PACKETS_ERR_SMASK 0x1000ull
+#define SEND_EGRESS_ERR_INFO_NON_KDETH_PACKETS_ERR_SMASK 0x2000ull
+#define SEND_EGRESS_ERR_INFO_OPCODE_ERR_SMASK 0x20ull
+#define SEND_EGRESS_ERR_INFO_PARTITION_KEY_ERR_SMASK 0x8ull
+#define SEND_EGRESS_ERR_INFO_PBC_STATIC_RATE_CONTROL_ERR_SMASK 0x100000ull
+#define SEND_EGRESS_ERR_INFO_PBC_TEST_ERR_SMASK 0x10000ull
+#define SEND_EGRESS_ERR_INFO_RAW_ERR_SMASK 0x100ull
+#define SEND_EGRESS_ERR_INFO_RAW_IPV6_ERR_SMASK 0x200ull
+#define SEND_EGRESS_ERR_INFO_SLID_ERR_SMASK 0x10ull
+#define SEND_EGRESS_ERR_INFO_TOO_LONG_BYPASS_PACKETS_ERR_SMASK 0x80000ull
+#define SEND_EGRESS_ERR_INFO_TOO_LONG_IB_PACKET_ERR_SMASK 0x40000ull
+#define SEND_EGRESS_ERR_INFO_TOO_SMALL_BYPASS_PACKETS_ERR_SMASK 0x8000ull
+#define SEND_EGRESS_ERR_INFO_TOO_SMALL_IB_PACKETS_ERR_SMASK 0x4000ull
+#define SEND_EGRESS_ERR_INFO_VL_ERR_SMASK 0x2ull
+#define SEND_EGRESS_ERR_INFO_VL_MAPPING_ERR_SMASK 0x40ull
+#define SEND_EGRESS_ERR_MASK (TXE + 0x000000000088)
+#define SEND_EGRESS_ERR_SOURCE (TXE + 0x000000000F08)
+#define SEND_EGRESS_ERR_STATUS (TXE + 0x000000000080)
+#define SEND_EGRESS_ERR_STATUS_TX_CONFIG_PARITY_ERR_SMASK 0x8000ull
+#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_OVERRUN_ERR_SMASK \
+               0x200000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_PARITY_ERR_SMASK \
+               0x20000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_VL_ERR_SMASK \
+               0x800000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_COR_ERR_SMASK \
+               0x2000000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_UNC_ERR_SMASK \
+               0x200000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_UNDERRUN_OR_PARITY_ERR_SMASK \
+               0x8ull
+#define SEND_EGRESS_ERR_STATUS_TX_HCRC_INSERTION_ERR_SMASK \
+               0x400000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_ILLEGAL_VL_ERR_SMASK 0x1000ull
+#define SEND_EGRESS_ERR_STATUS_TX_INCORRECT_LINK_STATE_ERR_SMASK 0x20ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_CSR_PARITY_ERR_SMASK 0x2000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO0_COR_ERR_SMASK \
+               0x1000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO0_UNC_OR_PARITY_ERR_SMASK \
+               0x100000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO1_COR_ERR_SMASK \
+               0x2000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO1_UNC_OR_PARITY_ERR_SMASK \
+               0x200000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO2_COR_ERR_SMASK \
+               0x4000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO2_UNC_OR_PARITY_ERR_SMASK \
+               0x400000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO3_COR_ERR_SMASK \
+               0x8000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO3_UNC_OR_PARITY_ERR_SMASK \
+               0x800000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO4_COR_ERR_SMASK \
+               0x10000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO4_UNC_OR_PARITY_ERR_SMASK \
+               0x1000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO5_COR_ERR_SMASK \
+               0x20000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO5_UNC_OR_PARITY_ERR_SMASK \
+               0x2000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO6_COR_ERR_SMASK \
+               0x40000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO6_UNC_OR_PARITY_ERR_SMASK \
+               0x4000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO7_COR_ERR_SMASK \
+               0x80000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO7_UNC_OR_PARITY_ERR_SMASK \
+               0x8000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO8_COR_ERR_SMASK \
+               0x100000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO8_UNC_OR_PARITY_ERR_SMASK \
+               0x10000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LINKDOWN_ERR_SMASK 0x10ull
+#define SEND_EGRESS_ERR_STATUS_TX_PIO_LAUNCH_INTF_PARITY_ERR_SMASK 0x80ull
+#define SEND_EGRESS_ERR_STATUS_TX_PKT_INTEGRITY_MEM_COR_ERR_SMASK 0x1ull
+#define SEND_EGRESS_ERR_STATUS_TX_PKT_INTEGRITY_MEM_UNC_ERR_SMASK 0x2ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_COR_ERR_SMASK \
+               0x1000000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_CSR_UNC_ERR_SMASK \
+               0x8000000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_UNC_ERR_SMASK \
+               0x100000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_COR_ERR_SMASK \
+               0x800000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_CSR_UNC_ERR_SMASK \
+               0x4000000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_UNC_ERR_SMASK \
+               0x80000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SB_HDR_COR_ERR_SMASK 0x400000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SB_HDR_UNC_ERR_SMASK 0x40000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SBRD_CTL_CSR_PARITY_ERR_SMASK 0x4000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SBRD_CTL_STATE_MACHINE_PARITY_ERR_SMASK \
+               0x800ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA0_DISALLOWED_PACKET_ERR_SMASK \
+               0x10000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA10_DISALLOWED_PACKET_ERR_SMASK \
+               0x4000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA11_DISALLOWED_PACKET_ERR_SMASK \
+               0x8000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA12_DISALLOWED_PACKET_ERR_SMASK \
+               0x10000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA13_DISALLOWED_PACKET_ERR_SMASK \
+               0x20000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA14_DISALLOWED_PACKET_ERR_SMASK \
+               0x40000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA15_DISALLOWED_PACKET_ERR_SMASK \
+               0x80000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA1_DISALLOWED_PACKET_ERR_SMASK \
+               0x20000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA2_DISALLOWED_PACKET_ERR_SMASK \
+               0x40000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA3_DISALLOWED_PACKET_ERR_SMASK \
+               0x80000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA4_DISALLOWED_PACKET_ERR_SMASK \
+               0x100000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA5_DISALLOWED_PACKET_ERR_SMASK \
+               0x200000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA6_DISALLOWED_PACKET_ERR_SMASK \
+               0x400000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA7_DISALLOWED_PACKET_ERR_SMASK \
+               0x800000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA8_DISALLOWED_PACKET_ERR_SMASK \
+               0x1000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA9_DISALLOWED_PACKET_ERR_SMASK \
+               0x2000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA_LAUNCH_INTF_PARITY_ERR_SMASK \
+               0x100ull
+#define SEND_EGRESS_SEND_DMA_STATUS (TXE + 0x000000000E00)
+#define SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT 0
+#define SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SMASK \
+               0x3FFFull
+#define SEND_ERR_CLEAR (TXE + 0x0000000000F0)
+#define SEND_ERR_MASK (TXE + 0x0000000000E8)
+#define SEND_ERR_STATUS (TXE + 0x0000000000E0)
+#define SEND_ERR_STATUS_SEND_CSR_PARITY_ERR_SMASK 0x1ull
+#define SEND_ERR_STATUS_SEND_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull
+#define SEND_ERR_STATUS_SEND_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull
+#define SEND_HIGH_PRIORITY_LIMIT (TXE + 0x000000000030)
+#define SEND_HIGH_PRIORITY_LIMIT_LIMIT_MASK 0x3FFFull
+#define SEND_HIGH_PRIORITY_LIMIT_LIMIT_SHIFT 0
+#define SEND_HIGH_PRIORITY_LIST (TXE + 0x000000000180)
+#define SEND_LEN_CHECK0 (TXE + 0x0000000000D0)
+#define SEND_LEN_CHECK0_LEN_VL0_MASK 0xFFFull
+#define SEND_LEN_CHECK0_LEN_VL1_SHIFT 12
+#define SEND_LEN_CHECK1 (TXE + 0x0000000000D8)
+#define SEND_LEN_CHECK1_LEN_VL15_MASK 0xFFFull
+#define SEND_LEN_CHECK1_LEN_VL15_SHIFT 48
+#define SEND_LEN_CHECK1_LEN_VL4_MASK 0xFFFull
+#define SEND_LEN_CHECK1_LEN_VL5_SHIFT 12
+#define SEND_LOW_PRIORITY_LIST (TXE + 0x000000000100)
+#define SEND_LOW_PRIORITY_LIST_VL_MASK 0x7ull
+#define SEND_LOW_PRIORITY_LIST_VL_SHIFT 16
+#define SEND_LOW_PRIORITY_LIST_WEIGHT_MASK 0xFFull
+#define SEND_LOW_PRIORITY_LIST_WEIGHT_SHIFT 0
+#define SEND_PIO_ERR_CLEAR (TXE + 0x000000000050)
+#define SEND_PIO_ERR_CLEAR_PIO_INIT_SM_IN_ERR_SMASK 0x20000ull
+#define SEND_PIO_ERR_MASK (TXE + 0x000000000048)
+#define SEND_PIO_ERR_STATUS (TXE + 0x000000000040)
+#define SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK \
+               0x1000000ull
+#define SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK 0x8000ull
+#define SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK 0x4ull
+#define SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK \
+               0x100000000ull
+#define SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_COR_ERR_SMASK 0x100000ull
+#define SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK 0x80000ull
+#define SEND_PIO_ERR_STATUS_PIO_INIT_SM_IN_ERR_SMASK 0x20000ull
+#define SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK \
+               0x200000000ull
+#define SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK 0x20ull
+#define SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK \
+               0x400000000ull
+#define SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK 0x40ull
+#define SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK \
+               0x800000000ull
+#define SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK 0x200ull
+#define SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK 0x40000ull
+#define SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK 0x10000000ull
+#define SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK 0x10000ull
+#define SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK 0x20000000ull
+#define SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK 0x8ull
+#define SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK 0x10ull
+#define SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK 0x80ull
+#define SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK \
+               0x100ull
+#define SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK 0x400ull
+#define SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK 0x400000ull
+#define SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK 0x8000000ull
+#define SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK 0x4000000ull
+#define SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK 0x2000000ull
+#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_COR_ERR_SMASK 0x2000ull
+#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK 0x800ull
+#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_COR_ERR_SMASK 0x4000ull
+#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK 0x1000ull
+#define SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK 0x2ull
+#define SEND_PIO_ERR_STATUS_PIO_WRITE_BAD_CTXT_ERR_SMASK 0x1ull
+#define SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK 0x200000ull
+#define SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK 0x800000ull
+#define SEND_PIO_INIT_CTXT (TXE + 0x000000000038)
+#define SEND_PIO_INIT_CTXT_PIO_ALL_CTXT_INIT_SMASK 0x1ull
+#define SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_MASK 0xFFull
+#define SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_SHIFT 8
+#define SEND_PIO_INIT_CTXT_PIO_INIT_ERR_SMASK 0x8ull
+#define SEND_PIO_INIT_CTXT_PIO_INIT_IN_PROGRESS_SMASK 0x4ull
+#define SEND_PIO_INIT_CTXT_PIO_SINGLE_CTXT_INIT_SMASK 0x2ull
+#define SEND_PIO_MEM_SIZE (TXE + 0x000000000020)
+#define SEND_SC2VLT0 (TXE + 0x0000000000B0)
+#define SEND_SC2VLT0_SC0_SHIFT 0
+#define SEND_SC2VLT0_SC1_SHIFT 8
+#define SEND_SC2VLT0_SC2_SHIFT 16
+#define SEND_SC2VLT0_SC3_SHIFT 24
+#define SEND_SC2VLT0_SC4_SHIFT 32
+#define SEND_SC2VLT0_SC5_SHIFT 40
+#define SEND_SC2VLT0_SC6_SHIFT 48
+#define SEND_SC2VLT0_SC7_SHIFT 56
+#define SEND_SC2VLT1 (TXE + 0x0000000000B8)
+#define SEND_SC2VLT1_SC10_SHIFT 16
+#define SEND_SC2VLT1_SC11_SHIFT 24
+#define SEND_SC2VLT1_SC12_SHIFT 32
+#define SEND_SC2VLT1_SC13_SHIFT 40
+#define SEND_SC2VLT1_SC14_SHIFT 48
+#define SEND_SC2VLT1_SC15_SHIFT 56
+#define SEND_SC2VLT1_SC8_SHIFT 0
+#define SEND_SC2VLT1_SC9_SHIFT 8
+#define SEND_SC2VLT2 (TXE + 0x0000000000C0)
+#define SEND_SC2VLT2_SC16_SHIFT 0
+#define SEND_SC2VLT2_SC17_SHIFT 8
+#define SEND_SC2VLT2_SC18_SHIFT 16
+#define SEND_SC2VLT2_SC19_SHIFT 24
+#define SEND_SC2VLT2_SC20_SHIFT 32
+#define SEND_SC2VLT2_SC21_SHIFT 40
+#define SEND_SC2VLT2_SC22_SHIFT 48
+#define SEND_SC2VLT2_SC23_SHIFT 56
+#define SEND_SC2VLT3 (TXE + 0x0000000000C8)
+#define SEND_SC2VLT3_SC24_SHIFT 0
+#define SEND_SC2VLT3_SC25_SHIFT 8
+#define SEND_SC2VLT3_SC26_SHIFT 16
+#define SEND_SC2VLT3_SC27_SHIFT 24
+#define SEND_SC2VLT3_SC28_SHIFT 32
+#define SEND_SC2VLT3_SC29_SHIFT 40
+#define SEND_SC2VLT3_SC30_SHIFT 48
+#define SEND_SC2VLT3_SC31_SHIFT 56
+#define SEND_STATIC_RATE_CONTROL (TXE + 0x0000000000A8)
+#define SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT 0
+#define SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK 0xFFFFull
+#define PCIE_CFG_REG_PL2 (PCIE + 0x000000000708)
+#define PCIE_CFG_REG_PL3 (PCIE + 0x00000000070C)
+#define PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SHIFT 27
+#define PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SMASK 0x38000000
+#define PCIE_CFG_REG_PL102 (PCIE + 0x000000000898)
+#define PCIE_CFG_REG_PL102_GEN3_EQ_POST_CURSOR_PSET_SHIFT 12
+#define PCIE_CFG_REG_PL102_GEN3_EQ_CURSOR_PSET_SHIFT 6
+#define PCIE_CFG_REG_PL102_GEN3_EQ_PRE_CURSOR_PSET_SHIFT 0
+#define PCIE_CFG_REG_PL103 (PCIE + 0x00000000089C)
+#define PCIE_CFG_REG_PL105 (PCIE + 0x0000000008A4)
+#define PCIE_CFG_REG_PL105_GEN3_EQ_VIOLATE_COEF_RULES_SMASK 0x1ull
+#define PCIE_CFG_REG_PL2_LOW_PWR_ENT_CNT_SHIFT 24
+#define PCIE_CFG_REG_PL100 (PCIE + 0x000000000890)
+#define PCIE_CFG_REG_PL100_EQ_EIEOS_CNT_SMASK 0x400ull
+#define PCIE_CFG_REG_PL101 (PCIE + 0x000000000894)
+#define PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_FS_SHIFT 6
+#define PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_LF_SHIFT 0
+#define PCIE_CFG_REG_PL106 (PCIE + 0x0000000008A8)
+#define PCIE_CFG_REG_PL106_GEN3_EQ_PSET_REQ_VEC_SHIFT 8
+#define PCIE_CFG_REG_PL106_GEN3_EQ_EVAL2MS_DISABLE_SMASK 0x20ull
+#define PCIE_CFG_REG_PL106_GEN3_EQ_PHASE23_EXIT_MODE_SMASK 0x10ull
+#define CCE_INT_BLOCKED (CCE + 0x000000110C00)
+#define SEND_DMA_IDLE_CNT (TXE + 0x000000200040)
+#define SEND_DMA_DESC_FETCHED_CNT (TXE + 0x000000200058)
+#define CCE_MSIX_PBA_OFFSET 0X0110000
+
+#endif          /* DEF_CHIP_REG */
diff --git a/drivers/infiniband/hw/hfi1/common.h b/drivers/infiniband/hw/hfi1/common.h
new file mode 100644 (file)
index 0000000..fcc9c21
--- /dev/null
@@ -0,0 +1,411 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef _COMMON_H
+#define _COMMON_H
+
+#include <rdma/hfi/hfi1_user.h>
+
+/*
+ * This file contains defines, structures, etc. that are used
+ * to communicate between kernel and user code.
+ */
+
+/* Version of the protocol header (also known to the chip). In the long
+ * run, we should be able to generate and accept a range of version numbers;
+ * for now we only accept one, and it's compiled in.
+ */
+#define IPS_PROTO_VERSION 2
+
+/*
+ * These are compile-time constants that you may want to enable or disable
+ * when debugging code or performance problems:
+ * HFI1_VERBOSE_TRACING - define as 1 for additional tracing in the
+ * fast path code
+ * HFI1_TRACE_REGWRITES - define as 1 to trace register writes in the
+ * fast path code
+ * _HFI1_TRACING - define as 0 to remove all tracing from a
+ * compilation unit
+ */
+
+/*
+ * If a packet's QP[23:16] bits match this value, then it is
+ * a PSM packet and the hardware will expect a KDETH header
+ * following the BTH.
+ */
+#define DEFAULT_KDETH_QP 0x80
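+
+/*
+ * Illustrative sketch only (the helper name is hypothetical, not part of
+ * the driver interface): given the 24-bit destination QP from a packet's
+ * BTH, a consumer could classify the packet as PSM/KDETH by comparing
+ * bits QP[23:16] against DEFAULT_KDETH_QP, as described above.
+ */
+static inline int qp_indicates_kdeth(__u32 bth_qpn)
+{
+       /* keep only bits 23:16 of the QP number, then compare */
+       return ((bth_qpn >> 16) & 0xff) == DEFAULT_KDETH_QP;
+}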
+
+/* driver/hw feature set bitmask */
+#define HFI1_CAP_USER_SHIFT      24
+#define HFI1_CAP_MASK            ((1UL << HFI1_CAP_USER_SHIFT) - 1)
+/* locked flag - if set, only HFI1_CAP_WRITABLE_MASK bits can be set */
+#define HFI1_CAP_LOCKED_SHIFT    63
+#define HFI1_CAP_LOCKED_MASK     0x1ULL
+#define HFI1_CAP_LOCKED_SMASK    (HFI1_CAP_LOCKED_MASK << HFI1_CAP_LOCKED_SHIFT)
+/* extra bits used between kernel and user processes */
+#define HFI1_CAP_MISC_SHIFT      (HFI1_CAP_USER_SHIFT * 2)
+#define HFI1_CAP_MISC_MASK       ((1ULL << (HFI1_CAP_LOCKED_SHIFT - \
+                                          HFI1_CAP_MISC_SHIFT)) - 1)
+
+#define HFI1_CAP_KSET(cap) ({ hfi1_cap_mask |= HFI1_CAP_##cap; hfi1_cap_mask; })
+#define HFI1_CAP_KCLEAR(cap)                                           \
+       ({                                                              \
+               hfi1_cap_mask &= ~HFI1_CAP_##cap;                       \
+               hfi1_cap_mask;                                          \
+       })
+#define HFI1_CAP_USET(cap)                                             \
+       ({                                                              \
+               hfi1_cap_mask |= (HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT); \
+               hfi1_cap_mask;                                          \
+               })
+#define HFI1_CAP_UCLEAR(cap)                                           \
+       ({                                                              \
+               hfi1_cap_mask &= ~(HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT); \
+               hfi1_cap_mask;                                          \
+       })
+#define HFI1_CAP_SET(cap)                                              \
+       ({                                                              \
+               hfi1_cap_mask |= (HFI1_CAP_##cap | (HFI1_CAP_##cap <<   \
+                                                 HFI1_CAP_USER_SHIFT)); \
+               hfi1_cap_mask;                                          \
+       })
+#define HFI1_CAP_CLEAR(cap)                                            \
+       ({                                                              \
+               hfi1_cap_mask &= ~(HFI1_CAP_##cap |                     \
+                                 (HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT)); \
+               hfi1_cap_mask;                                          \
+       })
+#define HFI1_CAP_LOCK()                                                        \
+       ({ hfi1_cap_mask |= HFI1_CAP_LOCKED_SMASK; hfi1_cap_mask; })
+#define HFI1_CAP_LOCKED() (!!(hfi1_cap_mask & HFI1_CAP_LOCKED_SMASK))
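+
+/*
+ * Illustrative sketch only: one possible use of the helpers above.  It
+ * assumes hfi1_cap_mask is the driver-wide mask these macros operate on
+ * (declared elsewhere in the driver; its type is assumed here) and that
+ * HFI1_CAP_SDMA comes from the user header included above.
+ */
+extern unsigned long hfi1_cap_mask;    /* assumed declaration */
+
+static inline void hfi1_cap_usage_sketch(void)
+{
+       HFI1_CAP_KSET(SDMA);            /* enable SDMA for kernel contexts */
+       HFI1_CAP_USET(SDMA);            /* advertise SDMA to user contexts */
+       if (!HFI1_CAP_LOCKED())
+               HFI1_CAP_UCLEAR(SDMA);  /* user bits may still be cleared */
+}
+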
+/*
+ * The set of capability bits that can be changed after initial load.
+ * This set is the same for kernel and user contexts. However, for
+ * user contexts, the set can be further filtered by using the
+ * HFI1_CAP_RESERVED_MASK bits.
+ */
+#define HFI1_CAP_WRITABLE_MASK   (HFI1_CAP_SDMA_AHG |                  \
+                                 HFI1_CAP_HDRSUPP |                    \
+                                 HFI1_CAP_MULTI_PKT_EGR |              \
+                                 HFI1_CAP_NODROP_RHQ_FULL |            \
+                                 HFI1_CAP_NODROP_EGR_FULL |            \
+                                 HFI1_CAP_ALLOW_PERM_JKEY |            \
+                                 HFI1_CAP_STATIC_RATE_CTRL |           \
+                                 HFI1_CAP_PRINT_UNIMPL |               \
+                                 HFI1_CAP_TID_UNMAP)
+/*
+ * A set of capability bits that are "global" and are not allowed to be
+ * set in the user bitmask.
+ */
+#define HFI1_CAP_RESERVED_MASK   ((HFI1_CAP_SDMA |                     \
+                                 HFI1_CAP_USE_SDMA_HEAD |              \
+                                 HFI1_CAP_EXTENDED_PSN |               \
+                                 HFI1_CAP_PRINT_UNIMPL |               \
+                                 HFI1_CAP_NO_INTEGRITY |               \
+                                 HFI1_CAP_PKEY_CHECK) <<               \
+                                HFI1_CAP_USER_SHIFT)
+/*
+ * Set of capabilities that need to be enabled for the kernel context
+ * in order to be allowed for user contexts as well.
+ */
+#define HFI1_CAP_MUST_HAVE_KERN (HFI1_CAP_STATIC_RATE_CTRL)
+/* Default enabled capabilities (both kernel and user) */
+#define HFI1_CAP_MASK_DEFAULT    (HFI1_CAP_HDRSUPP |                   \
+                                HFI1_CAP_NODROP_RHQ_FULL |             \
+                                HFI1_CAP_NODROP_EGR_FULL |             \
+                                HFI1_CAP_SDMA |                        \
+                                HFI1_CAP_PRINT_UNIMPL |                \
+                                HFI1_CAP_STATIC_RATE_CTRL |            \
+                                HFI1_CAP_PKEY_CHECK |                  \
+                                HFI1_CAP_MULTI_PKT_EGR |               \
+                                HFI1_CAP_EXTENDED_PSN |                \
+                                ((HFI1_CAP_HDRSUPP |                   \
+                                  HFI1_CAP_MULTI_PKT_EGR |             \
+                                  HFI1_CAP_STATIC_RATE_CTRL |          \
+                                  HFI1_CAP_PKEY_CHECK |                \
+                                  HFI1_CAP_EARLY_CREDIT_RETURN) <<     \
+                                 HFI1_CAP_USER_SHIFT))
+/*
+ * A bitmask of kernel/global capabilities that should be communicated
+ * to user level processes.
+ */
+#define HFI1_CAP_K2U (HFI1_CAP_SDMA |                  \
+                    HFI1_CAP_EXTENDED_PSN |            \
+                    HFI1_CAP_PKEY_CHECK |              \
+                    HFI1_CAP_NO_INTEGRITY)
+
+#define HFI1_USER_SWVERSION ((HFI1_USER_SWMAJOR << HFI1_SWMAJOR_SHIFT) | \
+                            HFI1_USER_SWMINOR)
+
+#ifndef HFI1_KERN_TYPE
+#define HFI1_KERN_TYPE 0
+#endif
+
+/*
+ * Similarly, this is the kernel version going back to the user.  It's
+ * slightly different, in that we want to tell if the driver was built as
+ * part of an Intel release, or is the driver from openfabrics.org,
+ * kernel.org, or a standard distribution, for support reasons.
+ * The high bit is 0 for non-Intel and 1 for Intel-built/supplied.
+ *
+ * It's returned by the driver to the user code during initialization in the
+ * spi_sw_version field of hfi1_base_info, so the user code can in turn
+ * check for compatibility with the kernel.
+ */
+#define HFI1_KERN_SWVERSION ((HFI1_KERN_TYPE << 31) | HFI1_USER_SWVERSION)
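+
+/*
+ * Illustrative sketch only (hypothetical helper name): a user-side
+ * compatibility check along the lines described above, assuming
+ * spi_sw_version was read from hfi1_base_info during initialization.
+ */
+static inline int hfi1_sw_major_matches(__u32 spi_sw_version)
+{
+       /* ignore the Intel-built flag in bit 31, then compare majors */
+       __u32 kver = spi_sw_version & ~(1U << 31);
+
+       return (kver >> HFI1_SWMAJOR_SHIFT) ==
+              (HFI1_USER_SWVERSION >> HFI1_SWMAJOR_SHIFT);
+}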
+
+/*
+ * Define the driver version number.  This is something that refers only
+ * to the driver itself, not the software interfaces it supports.
+ */
+#ifndef HFI1_DRIVER_VERSION_BASE
+#define HFI1_DRIVER_VERSION_BASE "0.9-294"
+#endif
+
+/* create the final driver version string */
+#ifdef HFI1_IDSTR
+#define HFI1_DRIVER_VERSION HFI1_DRIVER_VERSION_BASE " " HFI1_IDSTR
+#else
+#define HFI1_DRIVER_VERSION HFI1_DRIVER_VERSION_BASE
+#endif
+
+/*
+ * Diagnostics can send a packet by writing the following
+ * struct to the diag packet special file.
+ *
+ * This allows a custom PBC qword, so that special modes and deliberate
+ * changes to CRCs can be used.
+ */
+#define _DIAG_PKT_VERS 1
+struct diag_pkt {
+       __u16 version;          /* structure version */
+       __u16 unit;             /* which device */
+       __u16 sw_index;         /* send sw index to use */
+       __u16 len;              /* data length, in bytes */
+       __u16 port;             /* port number */
+       __u16 unused;
+       __u32 flags;            /* call flags */
+       __u64 data;             /* user data pointer */
+       __u64 pbc;              /* PBC for the packet */
+};
+
+/* diag_pkt flags */
+#define F_DIAGPKT_WAIT 0x1     /* wait until packet is sent */
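+
+/*
+ * Illustrative sketch only (user-space pseudocode; 'payload', 'diag_fd'
+ * and the device path are placeholders, not defined by this header):
+ * a diagnostic tool could fill in struct diag_pkt and write() it to the
+ * diag packet special file mentioned above.
+ *
+ *     struct diag_pkt dp = {
+ *             .version  = _DIAG_PKT_VERS,
+ *             .unit     = 0,
+ *             .sw_index = 0,
+ *             .len      = sizeof(payload),
+ *             .port     = 1,
+ *             .flags    = F_DIAGPKT_WAIT,     // block until the packet is sent
+ *             .data     = (__u64)(uintptr_t)payload,
+ *             .pbc      = 0,                  // 0, or a custom PBC qword
+ *     };
+ *
+ *     write(diag_fd, &dp, sizeof(dp));        // diag_fd: the open()ed special file
+ */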
+
+/*
+ * The next set of defines is for packet headers and chip register
+ * and memory bits that are visible to and/or used by user-mode software.
+ */
+
+/*
+ * Receive Header Flags
+ */
+#define RHF_PKT_LEN_SHIFT      0
+#define RHF_PKT_LEN_MASK       0xfffull
+#define RHF_PKT_LEN_SMASK (RHF_PKT_LEN_MASK << RHF_PKT_LEN_SHIFT)
+
+#define RHF_RCV_TYPE_SHIFT     12
+#define RHF_RCV_TYPE_MASK      0x7ull
+#define RHF_RCV_TYPE_SMASK (RHF_RCV_TYPE_MASK << RHF_RCV_TYPE_SHIFT)
+
+#define RHF_USE_EGR_BFR_SHIFT  15
+#define RHF_USE_EGR_BFR_MASK   0x1ull
+#define RHF_USE_EGR_BFR_SMASK (RHF_USE_EGR_BFR_MASK << RHF_USE_EGR_BFR_SHIFT)
+
+#define RHF_EGR_INDEX_SHIFT    16
+#define RHF_EGR_INDEX_MASK     0x7ffull
+#define RHF_EGR_INDEX_SMASK (RHF_EGR_INDEX_MASK << RHF_EGR_INDEX_SHIFT)
+
+#define RHF_DC_INFO_SHIFT      27
+#define RHF_DC_INFO_MASK       0x1ull
+#define RHF_DC_INFO_SMASK (RHF_DC_INFO_MASK << RHF_DC_INFO_SHIFT)
+
+#define RHF_RCV_SEQ_SHIFT      28
+#define RHF_RCV_SEQ_MASK       0xfull
+#define RHF_RCV_SEQ_SMASK (RHF_RCV_SEQ_MASK << RHF_RCV_SEQ_SHIFT)
+
+#define RHF_EGR_OFFSET_SHIFT   32
+#define RHF_EGR_OFFSET_MASK    0xfffull
+#define RHF_EGR_OFFSET_SMASK (RHF_EGR_OFFSET_MASK << RHF_EGR_OFFSET_SHIFT)
+#define RHF_HDRQ_OFFSET_SHIFT  44
+#define RHF_HDRQ_OFFSET_MASK   0x1ffull
+#define RHF_HDRQ_OFFSET_SMASK (RHF_HDRQ_OFFSET_MASK << RHF_HDRQ_OFFSET_SHIFT)
+#define RHF_K_HDR_LEN_ERR      (0x1ull << 53)
+#define RHF_DC_UNC_ERR         (0x1ull << 54)
+#define RHF_DC_ERR             (0x1ull << 55)
+#define RHF_RCV_TYPE_ERR_SHIFT 56
+#define RHF_RCV_TYPE_ERR_MASK  0x7ul
+#define RHF_RCV_TYPE_ERR_SMASK (RHF_RCV_TYPE_ERR_MASK << RHF_RCV_TYPE_ERR_SHIFT)
+#define RHF_TID_ERR            (0x1ull << 59)
+#define RHF_LEN_ERR            (0x1ull << 60)
+#define RHF_ECC_ERR            (0x1ull << 61)
+#define RHF_VCRC_ERR           (0x1ull << 62)
+#define RHF_ICRC_ERR           (0x1ull << 63)
+
+#define RHF_ERROR_SMASK 0xffe0000000000000ull          /* bits 63:53 */
+
+/* RHF receive types */
+#define RHF_RCV_TYPE_EXPECTED 0
+#define RHF_RCV_TYPE_EAGER    1
+#define RHF_RCV_TYPE_IB       2 /* normal IB, IB Raw, or IPv6 */
+#define RHF_RCV_TYPE_ERROR    3
+#define RHF_RCV_TYPE_BYPASS   4
+#define RHF_RCV_TYPE_INVALID5 5
+#define RHF_RCV_TYPE_INVALID6 6
+#define RHF_RCV_TYPE_INVALID7 7
+
+/* RHF receive type error - expected packet errors */
+#define RHF_RTE_EXPECTED_FLOW_SEQ_ERR  0x2
+#define RHF_RTE_EXPECTED_FLOW_GEN_ERR  0x4
+
+/* RHF receive type error - eager packet errors */
+#define RHF_RTE_EAGER_NO_ERR           0x0
+
+/* RHF receive type error - IB packet errors */
+#define RHF_RTE_IB_NO_ERR              0x0
+
+/* RHF receive type error - error packet errors */
+#define RHF_RTE_ERROR_NO_ERR           0x0
+#define RHF_RTE_ERROR_OP_CODE_ERR      0x1
+#define RHF_RTE_ERROR_KHDR_MIN_LEN_ERR 0x2
+#define RHF_RTE_ERROR_KHDR_HCRC_ERR    0x3
+#define RHF_RTE_ERROR_KHDR_KVER_ERR    0x4
+#define RHF_RTE_ERROR_CONTEXT_ERR      0x5
+#define RHF_RTE_ERROR_KHDR_TID_ERR     0x6
+
+/* RHF receive type error - bypass packet errors */
+#define RHF_RTE_BYPASS_NO_ERR          0x0
+
+/*
+ * This structure contains the first field common to all protocols
+ * that employ this chip.
+ */
+struct hfi1_message_header {
+       __be16 lrh[4];
+};
+
+/* IB - LRH header constants */
+#define HFI1_LRH_GRH 0x0003      /* 1st word of IB LRH - next header: GRH */
+#define HFI1_LRH_BTH 0x0002      /* 1st word of IB LRH - next header: BTH */
+
+/* misc. */
+#define SIZE_OF_CRC 1
+
+#define LIM_MGMT_P_KEY       0x7FFF
+#define FULL_MGMT_P_KEY      0xFFFF
+
+#define DEFAULT_P_KEY LIM_MGMT_P_KEY
+#define HFI1_AETH_CREDIT_SHIFT 24
+#define HFI1_AETH_CREDIT_MASK 0x1F
+#define HFI1_AETH_CREDIT_INVAL 0x1F
+#define HFI1_MSN_MASK 0xFFFFFF
+#define HFI1_FECN_SHIFT 31
+#define HFI1_FECN_MASK 1
+#define HFI1_FECN_SMASK BIT(HFI1_FECN_SHIFT)
+#define HFI1_BECN_SHIFT 30
+#define HFI1_BECN_MASK 1
+#define HFI1_BECN_SMASK BIT(HFI1_BECN_SHIFT)
+
+#define HFI1_PSM_IOC_BASE_SEQ 0x0
+
+static inline __u64 rhf_to_cpu(const __le32 *rbuf)
+{
+       return __le64_to_cpu(*((__le64 *)rbuf));
+}
+
+static inline u64 rhf_err_flags(u64 rhf)
+{
+       return rhf & RHF_ERROR_SMASK;
+}
+
+static inline u32 rhf_rcv_type(u64 rhf)
+{
+       return (rhf >> RHF_RCV_TYPE_SHIFT) & RHF_RCV_TYPE_MASK;
+}
+
+static inline u32 rhf_rcv_type_err(u64 rhf)
+{
+       return (rhf >> RHF_RCV_TYPE_ERR_SHIFT) & RHF_RCV_TYPE_ERR_MASK;
+}
+
+/* return size is in bytes, not DWORDs */
+static inline u32 rhf_pkt_len(u64 rhf)
+{
+       return ((rhf & RHF_PKT_LEN_SMASK) >> RHF_PKT_LEN_SHIFT) << 2;
+}
+
+static inline u32 rhf_egr_index(u64 rhf)
+{
+       return (rhf >> RHF_EGR_INDEX_SHIFT) & RHF_EGR_INDEX_MASK;
+}
+
+static inline u32 rhf_rcv_seq(u64 rhf)
+{
+       return (rhf >> RHF_RCV_SEQ_SHIFT) & RHF_RCV_SEQ_MASK;
+}
+
+/* returned offset is in DWORDS */
+static inline u32 rhf_hdrq_offset(u64 rhf)
+{
+       return (rhf >> RHF_HDRQ_OFFSET_SHIFT) & RHF_HDRQ_OFFSET_MASK;
+}
+
+static inline u64 rhf_use_egr_bfr(u64 rhf)
+{
+       return rhf & RHF_USE_EGR_BFR_SMASK;
+}
+
+static inline u64 rhf_dc_info(u64 rhf)
+{
+       return rhf & RHF_DC_INFO_SMASK;
+}
+
+static inline u32 rhf_egr_buf_offset(u64 rhf)
+{
+       return (rhf >> RHF_EGR_OFFSET_SHIFT) & RHF_EGR_OFFSET_MASK;
+}
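+
+/*
+ * Illustrative sketch only (hypothetical function name): how a receive-path
+ * consumer might combine the accessors above.  'rhf_addr' is assumed to
+ * point at the RHF within a header queue entry.
+ */
+static inline int rhf_entry_usable(const __le32 *rhf_addr, u32 expected_seq)
+{
+       u64 rhf = rhf_to_cpu(rhf_addr);
+
+       if (rhf_err_flags(rhf))                 /* any of error bits 63:53 set */
+               return 0;
+       if (rhf_rcv_type(rhf) == RHF_RCV_TYPE_ERROR)
+               return 0;
+       return rhf_rcv_seq(rhf) == expected_seq;        /* sequence number check */
+}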
+#endif /* _COMMON_H */
diff --git a/drivers/infiniband/hw/hfi1/debugfs.c b/drivers/infiniband/hw/hfi1/debugfs.c
new file mode 100644 (file)
index 0000000..dbab9d9
--- /dev/null
@@ -0,0 +1,1145 @@
+#ifdef CONFIG_DEBUG_FS
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/module.h>
+
+#include "hfi.h"
+#include "debugfs.h"
+#include "device.h"
+#include "qp.h"
+#include "sdma.h"
+
+static struct dentry *hfi1_dbg_root;
+
+#define private2dd(file) (file_inode(file)->i_private)
+#define private2ppd(file) (file_inode(file)->i_private)
+
+#define DEBUGFS_SEQ_FILE_OPS(name) \
+static const struct seq_operations _##name##_seq_ops = { \
+       .start = _##name##_seq_start, \
+       .next  = _##name##_seq_next, \
+       .stop  = _##name##_seq_stop, \
+       .show  = _##name##_seq_show \
+}
+
+#define DEBUGFS_SEQ_FILE_OPEN(name) \
+static int _##name##_open(struct inode *inode, struct file *s) \
+{ \
+       struct seq_file *seq; \
+       int ret; \
+       ret =  seq_open(s, &_##name##_seq_ops); \
+       if (ret) \
+               return ret; \
+       seq = s->private_data; \
+       seq->private = inode->i_private; \
+       return 0; \
+}
+
+#define DEBUGFS_FILE_OPS(name) \
+static const struct file_operations _##name##_file_ops = { \
+       .owner   = THIS_MODULE, \
+       .open    = _##name##_open, \
+       .read    = seq_read, \
+       .llseek  = seq_lseek, \
+       .release = seq_release \
+}
+
+#define DEBUGFS_FILE_CREATE(name, parent, data, ops, mode)     \
+do { \
+       struct dentry *ent; \
+       ent = debugfs_create_file(name, mode, parent, \
+               data, ops); \
+       if (!ent) \
+               pr_warn("create of %s failed\n", name); \
+} while (0)
+
+#define DEBUGFS_SEQ_FILE_CREATE(name, parent, data) \
+       DEBUGFS_FILE_CREATE(#name, parent, data, &_##name##_file_ops, S_IRUGO)
+
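+/*
+ * The three macros above are used together: for a seq file "foo", define
+ * _foo_seq_start/_next/_stop/_show, then instantiate
+ *
+ *     DEBUGFS_SEQ_FILE_OPS(foo);
+ *     DEBUGFS_SEQ_FILE_OPEN(foo)
+ *     DEBUGFS_FILE_OPS(foo);
+ *
+ * and create the entry with DEBUGFS_SEQ_FILE_CREATE(foo, parent, data),
+ * as done for opcode_stats and the other seq files below.
+ */
+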
+static void *_opcode_stats_seq_start(struct seq_file *s, loff_t *pos)
+__acquires(RCU)
+{
+       struct hfi1_opcode_stats_perctx *opstats;
+
+       rcu_read_lock();
+       if (*pos >= ARRAY_SIZE(opstats->stats))
+               return NULL;
+       return pos;
+}
+
+static void *_opcode_stats_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+       struct hfi1_opcode_stats_perctx *opstats;
+
+       ++*pos;
+       if (*pos >= ARRAY_SIZE(opstats->stats))
+               return NULL;
+       return pos;
+}
+
+static void _opcode_stats_seq_stop(struct seq_file *s, void *v)
+__releases(RCU)
+{
+       rcu_read_unlock();
+}
+
+static int _opcode_stats_seq_show(struct seq_file *s, void *v)
+{
+       loff_t *spos = v;
+       loff_t i = *spos, j;
+       u64 n_packets = 0, n_bytes = 0;
+       struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+       struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+       for (j = 0; j < dd->first_user_ctxt; j++) {
+               if (!dd->rcd[j])
+                       continue;
+               n_packets += dd->rcd[j]->opstats->stats[i].n_packets;
+               n_bytes += dd->rcd[j]->opstats->stats[i].n_bytes;
+       }
+       if (!n_packets && !n_bytes)
+               return SEQ_SKIP;
+       seq_printf(s, "%02llx %llu/%llu\n", i,
+                  (unsigned long long)n_packets,
+                  (unsigned long long)n_bytes);
+
+       return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(opcode_stats);
+DEBUGFS_SEQ_FILE_OPEN(opcode_stats)
+DEBUGFS_FILE_OPS(opcode_stats);
+
+static void *_ctx_stats_seq_start(struct seq_file *s, loff_t *pos)
+{
+       struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+       struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+       if (!*pos)
+               return SEQ_START_TOKEN;
+       if (*pos >= dd->first_user_ctxt)
+               return NULL;
+       return pos;
+}
+
+static void *_ctx_stats_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+       struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+       struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+       if (v == SEQ_START_TOKEN)
+               return pos;
+
+       ++*pos;
+       if (*pos >= dd->first_user_ctxt)
+               return NULL;
+       return pos;
+}
+
+static void _ctx_stats_seq_stop(struct seq_file *s, void *v)
+{
+       /* nothing allocated */
+}
+
+static int _ctx_stats_seq_show(struct seq_file *s, void *v)
+{
+       loff_t *spos;
+       loff_t i, j;
+       u64 n_packets = 0;
+       struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+       struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+       if (v == SEQ_START_TOKEN) {
+               seq_puts(s, "Ctx:npkts\n");
+               return 0;
+       }
+
+       spos = v;
+       i = *spos;
+
+       if (!dd->rcd[i])
+               return SEQ_SKIP;
+
+       for (j = 0; j < ARRAY_SIZE(dd->rcd[i]->opstats->stats); j++)
+               n_packets += dd->rcd[i]->opstats->stats[j].n_packets;
+
+       if (!n_packets)
+               return SEQ_SKIP;
+
+       seq_printf(s, "  %llu:%llu\n", i, n_packets);
+       return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(ctx_stats);
+DEBUGFS_SEQ_FILE_OPEN(ctx_stats)
+DEBUGFS_FILE_OPS(ctx_stats);
+
+static void *_qp_stats_seq_start(struct seq_file *s, loff_t *pos)
+__acquires(RCU)
+{
+       struct qp_iter *iter;
+       loff_t n = *pos;
+
+       rcu_read_lock();
+       iter = qp_iter_init(s->private);
+       if (!iter)
+               return NULL;
+
+       while (n--) {
+               if (qp_iter_next(iter)) {
+                       kfree(iter);
+                       return NULL;
+               }
+       }
+
+       return iter;
+}
+
+static void *_qp_stats_seq_next(struct seq_file *s, void *iter_ptr,
+                               loff_t *pos)
+{
+       struct qp_iter *iter = iter_ptr;
+
+       (*pos)++;
+
+       if (qp_iter_next(iter)) {
+               kfree(iter);
+               return NULL;
+       }
+
+       return iter;
+}
+
+static void _qp_stats_seq_stop(struct seq_file *s, void *iter_ptr)
+__releases(RCU)
+{
+       rcu_read_unlock();
+}
+
+static int _qp_stats_seq_show(struct seq_file *s, void *iter_ptr)
+{
+       struct qp_iter *iter = iter_ptr;
+
+       if (!iter)
+               return 0;
+
+       qp_iter_print(s, iter);
+
+       return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(qp_stats);
+DEBUGFS_SEQ_FILE_OPEN(qp_stats)
+DEBUGFS_FILE_OPS(qp_stats);
+
+static void *_sdes_seq_start(struct seq_file *s, loff_t *pos)
+__acquires(RCU)
+{
+       struct hfi1_ibdev *ibd;
+       struct hfi1_devdata *dd;
+
+       rcu_read_lock();
+       ibd = (struct hfi1_ibdev *)s->private;
+       dd = dd_from_dev(ibd);
+       if (!dd->per_sdma || *pos >= dd->num_sdma)
+               return NULL;
+       return pos;
+}
+
+static void *_sdes_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+       struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+       struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+       ++*pos;
+       if (!dd->per_sdma || *pos >= dd->num_sdma)
+               return NULL;
+       return pos;
+}
+
+static void _sdes_seq_stop(struct seq_file *s, void *v)
+__releases(RCU)
+{
+       rcu_read_unlock();
+}
+
+static int _sdes_seq_show(struct seq_file *s, void *v)
+{
+       struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+       struct hfi1_devdata *dd = dd_from_dev(ibd);
+       loff_t *spos = v;
+       loff_t i = *spos;
+
+       sdma_seqfile_dump_sde(s, &dd->per_sdma[i]);
+       return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(sdes);
+DEBUGFS_SEQ_FILE_OPEN(sdes)
+DEBUGFS_FILE_OPS(sdes);
+
+/* read the per-device counters */
+static ssize_t dev_counters_read(struct file *file, char __user *buf,
+                                size_t count, loff_t *ppos)
+{
+       u64 *counters;
+       size_t avail;
+       struct hfi1_devdata *dd;
+       ssize_t rval;
+
+       rcu_read_lock();
+       dd = private2dd(file);
+       avail = hfi1_read_cntrs(dd, NULL, &counters);
+       rval =  simple_read_from_buffer(buf, count, ppos, counters, avail);
+       rcu_read_unlock();
+       return rval;
+}
+
+/* read the per-device counter names */
+static ssize_t dev_names_read(struct file *file, char __user *buf,
+                             size_t count, loff_t *ppos)
+{
+       char *names;
+       size_t avail;
+       struct hfi1_devdata *dd;
+       ssize_t rval;
+
+       rcu_read_lock();
+       dd = private2dd(file);
+       avail = hfi1_read_cntrs(dd, &names, NULL);
+       rval =  simple_read_from_buffer(buf, count, ppos, names, avail);
+       rcu_read_unlock();
+       return rval;
+}
+
+struct counter_info {
+       char *name;
+       const struct file_operations ops;
+};
+
+/*
+ * Could use file_inode(file)->i_ino to figure out which file,
+ * instead of a separate routine for each, but for now this works...
+ */
+
+/* read the per-port names (same for each port) */
+static ssize_t portnames_read(struct file *file, char __user *buf,
+                             size_t count, loff_t *ppos)
+{
+       char *names;
+       size_t avail;
+       struct hfi1_devdata *dd;
+       ssize_t rval;
+
+       rcu_read_lock();
+       dd = private2dd(file);
+       avail = hfi1_read_portcntrs(dd->pport, &names, NULL);
+       rval = simple_read_from_buffer(buf, count, ppos, names, avail);
+       rcu_read_unlock();
+       return rval;
+}
+
+/* read the per-port counters */
+static ssize_t portcntrs_debugfs_read(struct file *file, char __user *buf,
+                                     size_t count, loff_t *ppos)
+{
+       u64 *counters;
+       size_t avail;
+       struct hfi1_pportdata *ppd;
+       ssize_t rval;
+
+       rcu_read_lock();
+       ppd = private2ppd(file);
+       avail = hfi1_read_portcntrs(ppd, NULL, &counters);
+       rval = simple_read_from_buffer(buf, count, ppos, counters, avail);
+       rcu_read_unlock();
+       return rval;
+}
+
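+/*
+ * Dynamic resource flags are stored twice in the scratch register: HFI 0
+ * uses a flag's base bit position and HFI 1 uses the same flag shifted
+ * left by CR_DYN_SHIFT, so callers check each flag once per HFI.
+ */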
+static void check_dyn_flag(u64 scratch0, char *p, int size, int *used,
+                          int this_hfi, int hfi, u32 flag, const char *what)
+{
+       u32 mask;
+
+       mask = flag << (hfi ? CR_DYN_SHIFT : 0);
+       if (scratch0 & mask) {
+               *used += scnprintf(p + *used, size - *used,
+                                  "  0x%08x - HFI%d %s in use, %s device\n",
+                                  mask, hfi, what,
+                                  this_hfi == hfi ? "this" : "other");
+       }
+}
+
+static ssize_t asic_flags_read(struct file *file, char __user *buf,
+                              size_t count, loff_t *ppos)
+{
+       struct hfi1_pportdata *ppd;
+       struct hfi1_devdata *dd;
+       u64 scratch0;
+       char *tmp;
+       int ret = 0;
+       int size;
+       int used;
+       int i;
+
+       rcu_read_lock();
+       ppd = private2ppd(file);
+       dd = ppd->dd;
+       size = PAGE_SIZE;
+       used = 0;
+       tmp = kmalloc(size, GFP_KERNEL);
+       if (!tmp) {
+               rcu_read_unlock();
+               return -ENOMEM;
+       }
+
+       scratch0 = read_csr(dd, ASIC_CFG_SCRATCH);
+       used += scnprintf(tmp + used, size - used,
+                         "Resource flags: 0x%016llx\n", scratch0);
+
+       /* check permanent flag */
+       if (scratch0 & CR_THERM_INIT) {
+               used += scnprintf(tmp + used, size - used,
+                                 "  0x%08x - thermal monitoring initialized\n",
+                                 (u32)CR_THERM_INIT);
+       }
+
+       /* check each dynamic flag on each HFI */
+       for (i = 0; i < 2; i++) {
+               check_dyn_flag(scratch0, tmp, size, &used, dd->hfi1_id, i,
+                              CR_SBUS, "SBus");
+               check_dyn_flag(scratch0, tmp, size, &used, dd->hfi1_id, i,
+                              CR_EPROM, "EPROM");
+               check_dyn_flag(scratch0, tmp, size, &used, dd->hfi1_id, i,
+                              CR_I2C1, "i2c chain 1");
+               check_dyn_flag(scratch0, tmp, size, &used, dd->hfi1_id, i,
+                              CR_I2C2, "i2c chain 2");
+       }
+       used += scnprintf(tmp + used, size - used, "Write bits to clear\n");
+
+       ret = simple_read_from_buffer(buf, count, ppos, tmp, used);
+       rcu_read_unlock();
+       kfree(tmp);
+       return ret;
+}
+
+static ssize_t asic_flags_write(struct file *file, const char __user *buf,
+                               size_t count, loff_t *ppos)
+{
+       struct hfi1_pportdata *ppd;
+       struct hfi1_devdata *dd;
+       char *buff;
+       int ret;
+       unsigned long long value;
+       u64 scratch0;
+       u64 clear;
+
+       rcu_read_lock();
+       ppd = private2ppd(file);
+       dd = ppd->dd;
+
+       buff = kmalloc(count + 1, GFP_KERNEL);
+       if (!buff) {
+               ret = -ENOMEM;
+               goto do_return;
+       }
+
+       ret = copy_from_user(buff, buf, count);
+       if (ret > 0) {
+               ret = -EFAULT;
+               goto do_free;
+       }
+
+       /* zero terminate and read the expected integer */
+       buff[count] = 0;
+       ret = kstrtoull(buff, 0, &value);
+       if (ret)
+               goto do_free;
+       clear = value;
+
+       /* obtain exclusive access */
+       mutex_lock(&dd->asic_data->asic_resource_mutex);
+       acquire_hw_mutex(dd);
+
+       scratch0 = read_csr(dd, ASIC_CFG_SCRATCH);
+       scratch0 &= ~clear;
+       write_csr(dd, ASIC_CFG_SCRATCH, scratch0);
+       /* force write to be visible to other HFI on another OS */
+       (void)read_csr(dd, ASIC_CFG_SCRATCH);
+
+       release_hw_mutex(dd);
+       mutex_unlock(&dd->asic_data->asic_resource_mutex);
+
+       /* return the number of bytes written */
+       ret = count;
+
+ do_free:
+       kfree(buff);
+ do_return:
+       rcu_read_unlock();
+       return ret;
+}
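+
+/*
+ * Usage sketch: the value written to asic_flags is parsed by kstrtoull()
+ * with base 0 (hex, octal, or decimal), and every bit set in that value
+ * is cleared from ASIC_CFG_SCRATCH.  For example (path illustrative,
+ * assuming debugfs is mounted at /sys/kernel/debug and this is unit 0):
+ *
+ *     echo 0x2 > /sys/kernel/debug/hfi1/hfi1_0/asic_flags
+ *
+ * clears flag bit 1 while leaving all other bits untouched.
+ */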
+
+/*
+ * read the per-port QSFP data for ppd
+ */
+static ssize_t qsfp_debugfs_dump(struct file *file, char __user *buf,
+                                size_t count, loff_t *ppos)
+{
+       struct hfi1_pportdata *ppd;
+       char *tmp;
+       int ret;
+
+       rcu_read_lock();
+       ppd = private2ppd(file);
+       tmp = kmalloc(PAGE_SIZE, GFP_KERNEL);
+       if (!tmp) {
+               rcu_read_unlock();
+               return -ENOMEM;
+       }
+
+       ret = qsfp_dump(ppd, tmp, PAGE_SIZE);
+       if (ret > 0)
+               ret = simple_read_from_buffer(buf, count, ppos, tmp, ret);
+       rcu_read_unlock();
+       kfree(tmp);
+       return ret;
+}
+
+/* Do an i2c write operation on the chain for the given HFI. */
+static ssize_t __i2c_debugfs_write(struct file *file, const char __user *buf,
+                                  size_t count, loff_t *ppos, u32 target)
+{
+       struct hfi1_pportdata *ppd;
+       char *buff;
+       int ret;
+       int i2c_addr;
+       int offset;
+       int total_written;
+
+       rcu_read_lock();
+       ppd = private2ppd(file);
+
+       /* byte offset format: [offsetSize][i2cAddr][offsetHigh][offsetLow] */
+       i2c_addr = (*ppos >> 16) & 0xffff;
+       offset = *ppos & 0xffff;
+
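+       /*
+        * For example, a file position of 0x00501000 (offsetSize field of
+        * zero) selects i2c address 0x50 and target offset 0x1000.
+        */
+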
+       /* explicitly reject invalid address 0 to catch cp and cat */
+       if (i2c_addr == 0) {
+               ret = -EINVAL;
+               goto _return;
+       }
+
+       buff = kmalloc(count, GFP_KERNEL);
+       if (!buff) {
+               ret = -ENOMEM;
+               goto _return;
+       }
+
+       ret = copy_from_user(buff, buf, count);
+       if (ret > 0) {
+               ret = -EFAULT;
+               goto _free;
+       }
+
+       total_written = i2c_write(ppd, target, i2c_addr, offset, buff, count);
+       if (total_written < 0) {
+               ret = total_written;
+               goto _free;
+       }
+
+       *ppos += total_written;
+
+       ret = total_written;
+
+ _free:
+       kfree(buff);
+ _return:
+       rcu_read_unlock();
+       return ret;
+}
+
+/* Do an i2c write operation on chain for HFI 0. */
+static ssize_t i2c1_debugfs_write(struct file *file, const char __user *buf,
+                                 size_t count, loff_t *ppos)
+{
+       return __i2c_debugfs_write(file, buf, count, ppos, 0);
+}
+
+/* Do an i2c write operation on chain for HFI 1. */
+static ssize_t i2c2_debugfs_write(struct file *file, const char __user *buf,
+                                 size_t count, loff_t *ppos)
+{
+       return __i2c_debugfs_write(file, buf, count, ppos, 1);
+}
+
+/* Do an i2c read operation on the chain for the given HFI. */
+static ssize_t __i2c_debugfs_read(struct file *file, char __user *buf,
+                                 size_t count, loff_t *ppos, u32 target)
+{
+       struct hfi1_pportdata *ppd;
+       char *buff;
+       int ret;
+       int i2c_addr;
+       int offset;
+       int total_read;
+
+       rcu_read_lock();
+       ppd = private2ppd(file);
+
+       /* byte offset format: [offsetSize][i2cAddr][offsetHigh][offsetLow] */
+       i2c_addr = (*ppos >> 16) & 0xffff;
+       offset = *ppos & 0xffff;
+
+       /* explicitly reject invalid address 0 to catch cp and cat */
+       if (i2c_addr == 0) {
+               ret = -EINVAL;
+               goto _return;
+       }
+
+       buff = kmalloc(count, GFP_KERNEL);
+       if (!buff) {
+               ret = -ENOMEM;
+               goto _return;
+       }
+
+       total_read = i2c_read(ppd, target, i2c_addr, offset, buff, count);
+       if (total_read < 0) {
+               ret = total_read;
+               goto _free;
+       }
+
+       *ppos += total_read;
+
+       ret = copy_to_user(buf, buff, total_read);
+       if (ret > 0) {
+               ret = -EFAULT;
+               goto _free;
+       }
+
+       ret = total_read;
+
+ _free:
+       kfree(buff);
+ _return:
+       rcu_read_unlock();
+       return ret;
+}
+
+/* Do an i2c read operation on chain for HFI 0. */
+static ssize_t i2c1_debugfs_read(struct file *file, char __user *buf,
+                                size_t count, loff_t *ppos)
+{
+       return __i2c_debugfs_read(file, buf, count, ppos, 0);
+}
+
+/* Do an i2c read operation on chain for HFI 1. */
+static ssize_t i2c2_debugfs_read(struct file *file, char __user *buf,
+                                size_t count, loff_t *ppos)
+{
+       return __i2c_debugfs_read(file, buf, count, ppos, 1);
+}
+
+/* Do a QSFP write operation on the i2c chain for the given HFI. */
+static ssize_t __qsfp_debugfs_write(struct file *file, const char __user *buf,
+                                   size_t count, loff_t *ppos, u32 target)
+{
+       struct hfi1_pportdata *ppd;
+       char *buff;
+       int ret;
+       int total_written;
+
+       rcu_read_lock();
+       if (*ppos + count > QSFP_PAGESIZE * 4) { /* base page + page00-page03 */
+               ret = -EINVAL;
+               goto _return;
+       }
+
+       ppd = private2ppd(file);
+
+       buff = kmalloc(count, GFP_KERNEL);
+       if (!buff) {
+               ret = -ENOMEM;
+               goto _return;
+       }
+
+       ret = copy_from_user(buff, buf, count);
+       if (ret > 0) {
+               ret = -EFAULT;
+               goto _free;
+       }
+
+       total_written = qsfp_write(ppd, target, *ppos, buff, count);
+       if (total_written < 0) {
+               ret = total_written;
+               goto _free;
+       }
+
+       *ppos += total_written;
+
+       ret = total_written;
+
+ _free:
+       kfree(buff);
+ _return:
+       rcu_read_unlock();
+       return ret;
+}
+
+/* Do a QSFP write operation on i2c chain for HFI 0. */
+static ssize_t qsfp1_debugfs_write(struct file *file, const char __user *buf,
+                                  size_t count, loff_t *ppos)
+{
+       return __qsfp_debugfs_write(file, buf, count, ppos, 0);
+}
+
+/* Do a QSFP write operation on i2c chain for HFI 1. */
+static ssize_t qsfp2_debugfs_write(struct file *file, const char __user *buf,
+                                  size_t count, loff_t *ppos)
+{
+       return __qsfp_debugfs_write(file, buf, count, ppos, 1);
+}
+
+/* Do a QSFP read operation on the i2c chain for the given HFI. */
+static ssize_t __qsfp_debugfs_read(struct file *file, char __user *buf,
+                                  size_t count, loff_t *ppos, u32 target)
+{
+       struct hfi1_pportdata *ppd;
+       char *buff;
+       int ret;
+       int total_read;
+
+       rcu_read_lock();
+       if (*ppos + count > QSFP_PAGESIZE * 4) { /* base page + page00-page03 */
+               ret = -EINVAL;
+               goto _return;
+       }
+
+       ppd = private2ppd(file);
+
+       buff = kmalloc(count, GFP_KERNEL);
+       if (!buff) {
+               ret = -ENOMEM;
+               goto _return;
+       }
+
+       total_read = qsfp_read(ppd, target, *ppos, buff, count);
+       if (total_read < 0) {
+               ret = total_read;
+               goto _free;
+       }
+
+       *ppos += total_read;
+
+       ret = copy_to_user(buf, buff, total_read);
+       if (ret > 0) {
+               ret = -EFAULT;
+               goto _free;
+       }
+
+       ret = total_read;
+
+ _free:
+       kfree(buff);
+ _return:
+       rcu_read_unlock();
+       return ret;
+}
+
+/* Do a QSFP read operation on i2c chain for HFI 0. */
+static ssize_t qsfp1_debugfs_read(struct file *file, char __user *buf,
+                                 size_t count, loff_t *ppos)
+{
+       return __qsfp_debugfs_read(file, buf, count, ppos, 0);
+}
+
+/* Do a QSFP read operation on i2c chain for HFI 1. */
+static ssize_t qsfp2_debugfs_read(struct file *file, char __user *buf,
+                                 size_t count, loff_t *ppos)
+{
+       return __qsfp_debugfs_read(file, buf, count, ppos, 1);
+}
+
+static int __i2c_debugfs_open(struct inode *in, struct file *fp, u32 target)
+{
+       struct hfi1_pportdata *ppd;
+       int ret;
+
+       if (!try_module_get(THIS_MODULE))
+               return -ENODEV;
+
+       ppd = private2ppd(fp);
+
+       ret = acquire_chip_resource(ppd->dd, i2c_target(target), 0);
+       if (ret) /* failed - release the module */
+               module_put(THIS_MODULE);
+
+       return ret;
+}
+
+static int i2c1_debugfs_open(struct inode *in, struct file *fp)
+{
+       return __i2c_debugfs_open(in, fp, 0);
+}
+
+static int i2c2_debugfs_open(struct inode *in, struct file *fp)
+{
+       return __i2c_debugfs_open(in, fp, 1);
+}
+
+static int __i2c_debugfs_release(struct inode *in, struct file *fp, u32 target)
+{
+       struct hfi1_pportdata *ppd;
+
+       ppd = private2ppd(fp);
+
+       release_chip_resource(ppd->dd, i2c_target(target));
+       module_put(THIS_MODULE);
+
+       return 0;
+}
+
+static int i2c1_debugfs_release(struct inode *in, struct file *fp)
+{
+       return __i2c_debugfs_release(in, fp, 0);
+}
+
+static int i2c2_debugfs_release(struct inode *in, struct file *fp)
+{
+       return __i2c_debugfs_release(in, fp, 1);
+}
+
+static int __qsfp_debugfs_open(struct inode *in, struct file *fp, u32 target)
+{
+       struct hfi1_pportdata *ppd;
+       int ret;
+
+       if (!try_module_get(THIS_MODULE))
+               return -ENODEV;
+
+       ppd = private2ppd(fp);
+
+       ret = acquire_chip_resource(ppd->dd, i2c_target(target), 0);
+       if (ret) /* failed - release the module */
+               module_put(THIS_MODULE);
+
+       return ret;
+}
+
+static int qsfp1_debugfs_open(struct inode *in, struct file *fp)
+{
+       return __qsfp_debugfs_open(in, fp, 0);
+}
+
+static int qsfp2_debugfs_open(struct inode *in, struct file *fp)
+{
+       return __qsfp_debugfs_open(in, fp, 1);
+}
+
+static int __qsfp_debugfs_release(struct inode *in, struct file *fp, u32 target)
+{
+       struct hfi1_pportdata *ppd;
+
+       ppd = private2ppd(fp);
+
+       release_chip_resource(ppd->dd, i2c_target(target));
+       module_put(THIS_MODULE);
+
+       return 0;
+}
+
+static int qsfp1_debugfs_release(struct inode *in, struct file *fp)
+{
+       return __qsfp_debugfs_release(in, fp, 0);
+}
+
+static int qsfp2_debugfs_release(struct inode *in, struct file *fp)
+{
+       return __qsfp_debugfs_release(in, fp, 1);
+}
+
+#define DEBUGFS_OPS(nm, readroutine, writeroutine)     \
+{ \
+       .name = nm, \
+       .ops = { \
+               .read = readroutine, \
+               .write = writeroutine, \
+               .llseek = generic_file_llseek, \
+       }, \
+}
+
+#define DEBUGFS_XOPS(nm, readf, writef, openf, releasef) \
+{ \
+       .name = nm, \
+       .ops = { \
+               .read = readf, \
+               .write = writef, \
+               .llseek = generic_file_llseek, \
+               .open = openf, \
+               .release = releasef \
+       }, \
+}
+
+static const struct counter_info cntr_ops[] = {
+       DEBUGFS_OPS("counter_names", dev_names_read, NULL),
+       DEBUGFS_OPS("counters", dev_counters_read, NULL),
+       DEBUGFS_OPS("portcounter_names", portnames_read, NULL),
+};
+
+static const struct counter_info port_cntr_ops[] = {
+       DEBUGFS_OPS("port%dcounters", portcntrs_debugfs_read, NULL),
+       DEBUGFS_XOPS("i2c1", i2c1_debugfs_read, i2c1_debugfs_write,
+                    i2c1_debugfs_open, i2c1_debugfs_release),
+       DEBUGFS_XOPS("i2c2", i2c2_debugfs_read, i2c2_debugfs_write,
+                    i2c2_debugfs_open, i2c2_debugfs_release),
+       DEBUGFS_OPS("qsfp_dump%d", qsfp_debugfs_dump, NULL),
+       DEBUGFS_XOPS("qsfp1", qsfp1_debugfs_read, qsfp1_debugfs_write,
+                    qsfp1_debugfs_open, qsfp1_debugfs_release),
+       DEBUGFS_XOPS("qsfp2", qsfp2_debugfs_read, qsfp2_debugfs_write,
+                    qsfp2_debugfs_open, qsfp2_debugfs_release),
+       DEBUGFS_OPS("asic_flags", asic_flags_read, asic_flags_write),
+};
+
+void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd)
+{
+       char name[sizeof("port0counters") + 1];
+       char link[10];
+       struct hfi1_devdata *dd = dd_from_dev(ibd);
+       struct hfi1_pportdata *ppd;
+       int unit = dd->unit;
+       int i, j;
+
+       if (!hfi1_dbg_root)
+               return;
+       snprintf(name, sizeof(name), "%s_%d", class_name(), unit);
+       snprintf(link, sizeof(link), "%d", unit);
+       ibd->hfi1_ibdev_dbg = debugfs_create_dir(name, hfi1_dbg_root);
+       if (!ibd->hfi1_ibdev_dbg) {
+               pr_warn("create of %s failed\n", name);
+               return;
+       }
+       ibd->hfi1_ibdev_link =
+               debugfs_create_symlink(link, hfi1_dbg_root, name);
+       if (!ibd->hfi1_ibdev_link) {
+               pr_warn("create of %s symlink failed\n", name);
+               return;
+       }
+       DEBUGFS_SEQ_FILE_CREATE(opcode_stats, ibd->hfi1_ibdev_dbg, ibd);
+       DEBUGFS_SEQ_FILE_CREATE(ctx_stats, ibd->hfi1_ibdev_dbg, ibd);
+       DEBUGFS_SEQ_FILE_CREATE(qp_stats, ibd->hfi1_ibdev_dbg, ibd);
+       DEBUGFS_SEQ_FILE_CREATE(sdes, ibd->hfi1_ibdev_dbg, ibd);
+       /* dev counter files */
+       for (i = 0; i < ARRAY_SIZE(cntr_ops); i++)
+               DEBUGFS_FILE_CREATE(cntr_ops[i].name,
+                                   ibd->hfi1_ibdev_dbg,
+                                   dd,
+                                   &cntr_ops[i].ops, S_IRUGO);
+       /* per port files */
+       for (ppd = dd->pport, j = 0; j < dd->num_pports; j++, ppd++)
+               for (i = 0; i < ARRAY_SIZE(port_cntr_ops); i++) {
+                       snprintf(name,
+                                sizeof(name),
+                                port_cntr_ops[i].name,
+                                j + 1);
+                       DEBUGFS_FILE_CREATE(name,
+                                           ibd->hfi1_ibdev_dbg,
+                                           ppd,
+                                           &port_cntr_ops[i].ops,
+                                           !port_cntr_ops[i].ops.write ?
+                                           S_IRUGO : S_IRUGO | S_IWUSR);
+               }
+}
+
+void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd)
+{
+       if (!hfi1_dbg_root)
+               goto out;
+       debugfs_remove(ibd->hfi1_ibdev_link);
+       debugfs_remove_recursive(ibd->hfi1_ibdev_dbg);
+out:
+       ibd->hfi1_ibdev_dbg = NULL;
+       synchronize_rcu();
+}
+
+/*
+ * Driver stats field names, one line per stat, single string.  Used by
+ * programs like hfistats to print the stats in a way that works across
+ * different driver versions without changing the program source.
+ * If hfi1_ib_stats changes, this needs to change.  Names must be
+ * 12 chars or less (w/o newline) for proper display by the hfistats utility.
+ */
+static const char * const hfi1_statnames[] = {
+       /* must be element 0*/
+       "KernIntr",
+       "ErrorIntr",
+       "Tx_Errs",
+       "Rcv_Errs",
+       "H/W_Errs",
+       "NoPIOBufs",
+       "CtxtsOpen",
+       "RcvLen_Errs",
+       "EgrBufFull",
+       "EgrHdrFull"
+};
+
+static void *_driver_stats_names_seq_start(struct seq_file *s, loff_t *pos)
+__acquires(RCU)
+{
+       rcu_read_lock();
+       if (*pos >= ARRAY_SIZE(hfi1_statnames))
+               return NULL;
+       return pos;
+}
+
+static void *_driver_stats_names_seq_next(
+       struct seq_file *s,
+       void *v,
+       loff_t *pos)
+{
+       ++*pos;
+       if (*pos >= ARRAY_SIZE(hfi1_statnames))
+               return NULL;
+       return pos;
+}
+
+static void _driver_stats_names_seq_stop(struct seq_file *s, void *v)
+__releases(RCU)
+{
+       rcu_read_unlock();
+}
+
+static int _driver_stats_names_seq_show(struct seq_file *s, void *v)
+{
+       loff_t *spos = v;
+
+       seq_printf(s, "%s\n", hfi1_statnames[*spos]);
+       return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(driver_stats_names);
+DEBUGFS_SEQ_FILE_OPEN(driver_stats_names)
+DEBUGFS_FILE_OPS(driver_stats_names);
+
+static void *_driver_stats_seq_start(struct seq_file *s, loff_t *pos)
+__acquires(RCU)
+{
+       rcu_read_lock();
+       if (*pos >= ARRAY_SIZE(hfi1_statnames))
+               return NULL;
+       return pos;
+}
+
+static void *_driver_stats_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+       ++*pos;
+       if (*pos >= ARRAY_SIZE(hfi1_statnames))
+               return NULL;
+       return pos;
+}
+
+static void _driver_stats_seq_stop(struct seq_file *s, void *v)
+__releases(RCU)
+{
+       rcu_read_unlock();
+}
+
+static u64 hfi1_sps_ints(void)
+{
+       unsigned long flags;
+       struct hfi1_devdata *dd;
+       u64 sps_ints = 0;
+
+       spin_lock_irqsave(&hfi1_devs_lock, flags);
+       list_for_each_entry(dd, &hfi1_dev_list, list) {
+               sps_ints += get_all_cpu_total(dd->int_counter);
+       }
+       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+       return sps_ints;
+}
+
+static int _driver_stats_seq_show(struct seq_file *s, void *v)
+{
+       loff_t *spos = v;
+       char *buffer;
+       u64 *stats = (u64 *)&hfi1_stats;
+       size_t sz = seq_get_buf(s, &buffer);
+
+       if (sz < sizeof(u64))
+               return SEQ_SKIP;
+       /* special case for interrupts */
+       if (*spos == 0)
+               *(u64 *)buffer = hfi1_sps_ints();
+       else
+               *(u64 *)buffer = stats[*spos];
+       seq_commit(s,  sizeof(u64));
+       return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(driver_stats);
+DEBUGFS_SEQ_FILE_OPEN(driver_stats)
+DEBUGFS_FILE_OPS(driver_stats);
+
+void hfi1_dbg_init(void)
+{
+       hfi1_dbg_root  = debugfs_create_dir(DRIVER_NAME, NULL);
+       if (!hfi1_dbg_root)
+               pr_warn("init of debugfs failed\n");
+       DEBUGFS_SEQ_FILE_CREATE(driver_stats_names, hfi1_dbg_root, NULL);
+       DEBUGFS_SEQ_FILE_CREATE(driver_stats, hfi1_dbg_root, NULL);
+}
+
+void hfi1_dbg_exit(void)
+{
+       debugfs_remove_recursive(hfi1_dbg_root);
+       hfi1_dbg_root = NULL;
+}
+
+#endif
diff --git a/drivers/infiniband/hw/hfi1/debugfs.h b/drivers/infiniband/hw/hfi1/debugfs.h
new file mode 100644 (file)
index 0000000..b6fb681
--- /dev/null
@@ -0,0 +1,75 @@
+#ifndef _HFI1_DEBUGFS_H
+#define _HFI1_DEBUGFS_H
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+struct hfi1_ibdev;
+#ifdef CONFIG_DEBUG_FS
+void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd);
+void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd);
+void hfi1_dbg_init(void);
+void hfi1_dbg_exit(void);
+#else
+static inline void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd)
+{
+}
+
+static inline void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd)
+{
+}
+
+static inline void hfi1_dbg_init(void)
+{
+}
+
+static inline void hfi1_dbg_exit(void)
+{
+}
+
+#endif
+
+#endif                          /* _HFI1_DEBUGFS_H */
diff --git a/drivers/infiniband/hw/hfi1/device.c b/drivers/infiniband/hw/hfi1/device.c
new file mode 100644 (file)
index 0000000..bf64b5a
--- /dev/null
@@ -0,0 +1,183 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/cdev.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+
+#include "hfi.h"
+#include "device.h"
+
+static struct class *class;
+static struct class *user_class;
+static dev_t hfi1_dev;
+
+int hfi1_cdev_init(int minor, const char *name,
+                  const struct file_operations *fops,
+                  struct cdev *cdev, struct device **devp,
+                  bool user_accessible,
+                  struct kobject *parent)
+{
+       const dev_t dev = MKDEV(MAJOR(hfi1_dev), minor);
+       struct device *device = NULL;
+       int ret;
+
+       cdev_init(cdev, fops);
+       cdev->owner = THIS_MODULE;
+       cdev->kobj.parent = parent;
+       kobject_set_name(&cdev->kobj, name);
+
+       ret = cdev_add(cdev, dev, 1);
+       if (ret < 0) {
+               pr_err("Could not add cdev for minor %d, %s (err %d)\n",
+                      minor, name, -ret);
+               goto done;
+       }
+
+       if (user_accessible)
+               device = device_create(user_class, NULL, dev, NULL, "%s", name);
+       else
+               device = device_create(class, NULL, dev, NULL, "%s", name);
+
+       if (IS_ERR(device)) {
+               ret = PTR_ERR(device);
+               device = NULL;
+               pr_err("Could not create device for minor %d, %s (err %d)\n",
+                       minor, name, -ret);
+               cdev_del(cdev);
+       }
+done:
+       *devp = device;
+       return ret;
+}
+
+void hfi1_cdev_cleanup(struct cdev *cdev, struct device **devp)
+{
+       struct device *device = *devp;
+
+       if (device) {
+               device_unregister(device);
+               *devp = NULL;
+
+               cdev_del(cdev);
+       }
+}
+
+static const char *hfi1_class_name = "hfi1";
+
+const char *class_name(void)
+{
+       return hfi1_class_name;
+}
+
+static char *hfi1_devnode(struct device *dev, umode_t *mode)
+{
+       if (mode)
+               *mode = 0600;
+       return kasprintf(GFP_KERNEL, "%s", dev_name(dev));
+}
+
+static const char *hfi1_class_name_user = "hfi1_user";
+static const char *class_name_user(void)
+{
+       return hfi1_class_name_user;
+}
+
+static char *hfi1_user_devnode(struct device *dev, umode_t *mode)
+{
+       if (mode)
+               *mode = 0666;
+       return kasprintf(GFP_KERNEL, "%s", dev_name(dev));
+}
+
+int __init dev_init(void)
+{
+       int ret;
+
+       ret = alloc_chrdev_region(&hfi1_dev, 0, HFI1_NMINORS, DRIVER_NAME);
+       if (ret < 0) {
+               pr_err("Could not allocate chrdev region (err %d)\n", -ret);
+               goto done;
+       }
+
+       class = class_create(THIS_MODULE, class_name());
+       if (IS_ERR(class)) {
+               ret = PTR_ERR(class);
+               pr_err("Could not create device class (err %d)\n", -ret);
+               unregister_chrdev_region(hfi1_dev, HFI1_NMINORS);
+               goto done;
+       }
+       class->devnode = hfi1_devnode;
+
+       user_class = class_create(THIS_MODULE, class_name_user());
+       if (IS_ERR(user_class)) {
+               ret = PTR_ERR(user_class);
+               pr_err("Could not create device class for user accessible files (err %d)\n",
+                      -ret);
+               class_destroy(class);
+               class = NULL;
+               user_class = NULL;
+               unregister_chrdev_region(hfi1_dev, HFI1_NMINORS);
+               goto done;
+       }
+       user_class->devnode = hfi1_user_devnode;
+
+done:
+       return ret;
+}
+
+void dev_cleanup(void)
+{
+       class_destroy(class);
+       class = NULL;
+
+       class_destroy(user_class);
+       user_class = NULL;
+
+       unregister_chrdev_region(hfi1_dev, HFI1_NMINORS);
+}
diff --git a/drivers/infiniband/hw/hfi1/device.h b/drivers/infiniband/hw/hfi1/device.h
new file mode 100644 (file)
index 0000000..c3ec19c
--- /dev/null
@@ -0,0 +1,60 @@
+#ifndef _HFI1_DEVICE_H
+#define _HFI1_DEVICE_H
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+int hfi1_cdev_init(int minor, const char *name,
+                  const struct file_operations *fops,
+                  struct cdev *cdev, struct device **devp,
+                  bool user_accessible,
+                  struct kobject *parent);
+void hfi1_cdev_cleanup(struct cdev *cdev, struct device **devp);
+const char *class_name(void);
+int __init dev_init(void);
+void dev_cleanup(void);
+
+#endif                          /* _HFI1_DEVICE_H */
diff --git a/drivers/infiniband/hw/hfi1/dma.c b/drivers/infiniband/hw/hfi1/dma.c
new file mode 100644 (file)
index 0000000..7e8dab8
--- /dev/null
@@ -0,0 +1,183 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/types.h>
+#include <linux/scatterlist.h>
+
+#include "verbs.h"
+
+#define BAD_DMA_ADDRESS ((u64)0)
+
+/*
+ * The following functions implement driver specific replacements
+ * for the ib_dma_*() functions.
+ *
+ * These functions return kernel virtual addresses instead of
+ * device bus addresses since the driver uses the CPU to copy
+ * data instead of using hardware DMA.
+ */
+
+static int hfi1_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+       return dma_addr == BAD_DMA_ADDRESS;
+}
+
+static u64 hfi1_dma_map_single(struct ib_device *dev, void *cpu_addr,
+                              size_t size, enum dma_data_direction direction)
+{
+       if (WARN_ON(!valid_dma_direction(direction)))
+               return BAD_DMA_ADDRESS;
+
+       return (u64)cpu_addr;
+}
+
+static void hfi1_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size,
+                                 enum dma_data_direction direction)
+{
+       /* This is a stub, nothing to be done here */
+}
+
+static u64 hfi1_dma_map_page(struct ib_device *dev, struct page *page,
+                            unsigned long offset, size_t size,
+                           enum dma_data_direction direction)
+{
+       u64 addr;
+
+       if (WARN_ON(!valid_dma_direction(direction)))
+               return BAD_DMA_ADDRESS;
+
+       if (offset + size > PAGE_SIZE)
+               return BAD_DMA_ADDRESS;
+
+       addr = (u64)page_address(page);
+       if (addr)
+               addr += offset;
+
+       return addr;
+}
+
+static void hfi1_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size,
+                               enum dma_data_direction direction)
+{
+       /* This is a stub, nothing to be done here */
+}
+
+static int hfi1_map_sg(struct ib_device *dev, struct scatterlist *sgl,
+                      int nents, enum dma_data_direction direction)
+{
+       struct scatterlist *sg;
+       u64 addr;
+       int i;
+       int ret = nents;
+
+       if (WARN_ON(!valid_dma_direction(direction)))
+               return BAD_DMA_ADDRESS;
+
+       for_each_sg(sgl, sg, nents, i) {
+               addr = (u64)page_address(sg_page(sg));
+               if (!addr) {
+                       ret = 0;
+                       break;
+               }
+               sg->dma_address = addr + sg->offset;
+#ifdef CONFIG_NEED_SG_DMA_LENGTH
+               sg->dma_length = sg->length;
+#endif
+       }
+       return ret;
+}
+
+static void hfi1_unmap_sg(struct ib_device *dev,
+                         struct scatterlist *sg, int nents,
+                        enum dma_data_direction direction)
+{
+       /* This is a stub, nothing to be done here */
+}
+
+static void hfi1_sync_single_for_cpu(struct ib_device *dev, u64 addr,
+                                    size_t size, enum dma_data_direction dir)
+{
+}
+
+static void hfi1_sync_single_for_device(struct ib_device *dev, u64 addr,
+                                       size_t size,
+                                       enum dma_data_direction dir)
+{
+}
+
+static void *hfi1_dma_alloc_coherent(struct ib_device *dev, size_t size,
+                                    u64 *dma_handle, gfp_t flag)
+{
+       struct page *p;
+       void *addr = NULL;
+
+       p = alloc_pages(flag, get_order(size));
+       if (p)
+               addr = page_address(p);
+       if (dma_handle)
+               *dma_handle = (u64)addr;
+       return addr;
+}
+
+static void hfi1_dma_free_coherent(struct ib_device *dev, size_t size,
+                                  void *cpu_addr, u64 dma_handle)
+{
+       free_pages((unsigned long)cpu_addr, get_order(size));
+}
+
+struct ib_dma_mapping_ops hfi1_dma_mapping_ops = {
+       .mapping_error = hfi1_mapping_error,
+       .map_single = hfi1_dma_map_single,
+       .unmap_single = hfi1_dma_unmap_single,
+       .map_page = hfi1_dma_map_page,
+       .unmap_page = hfi1_dma_unmap_page,
+       .map_sg = hfi1_map_sg,
+       .unmap_sg = hfi1_unmap_sg,
+       .sync_single_for_cpu = hfi1_sync_single_for_cpu,
+       .sync_single_for_device = hfi1_sync_single_for_device,
+       .alloc_coherent = hfi1_dma_alloc_coherent,
+       .free_coherent = hfi1_dma_free_coherent
+};
diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c
new file mode 100644 (file)
index 0000000..c75b0ae
--- /dev/null
@@ -0,0 +1,1404 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+#include <linux/netdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/prefetch.h>
+#include <rdma/ib_verbs.h>
+
+#include "hfi.h"
+#include "trace.h"
+#include "qp.h"
+#include "sdma.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) DRIVER_NAME ": " fmt
+
+/*
+ * The size has to be longer than this string, so we can append
+ * board/chip information to it in the initialization code.
+ */
+const char ib_hfi1_version[] = HFI1_DRIVER_VERSION "\n";
+
+DEFINE_SPINLOCK(hfi1_devs_lock);
+LIST_HEAD(hfi1_dev_list);
+DEFINE_MUTEX(hfi1_mutex);      /* general driver use */
+
+unsigned int hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
+module_param_named(max_mtu, hfi1_max_mtu, uint, S_IRUGO);
+MODULE_PARM_DESC(max_mtu, "Set max MTU bytes, default is " __stringify(
+                HFI1_DEFAULT_MAX_MTU));
+
+unsigned int hfi1_cu = 1;
+module_param_named(cu, hfi1_cu, uint, S_IRUGO);
+MODULE_PARM_DESC(cu, "Credit return units");
+
+unsigned long hfi1_cap_mask = HFI1_CAP_MASK_DEFAULT;
+static int hfi1_caps_set(const char *, const struct kernel_param *);
+static int hfi1_caps_get(char *, const struct kernel_param *);
+static const struct kernel_param_ops cap_ops = {
+       .set = hfi1_caps_set,
+       .get = hfi1_caps_get
+};
+module_param_cb(cap_mask, &cap_ops, &hfi1_cap_mask, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(cap_mask, "Bit mask of enabled/disabled HW features");
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("Intel Omni-Path Architecture driver");
+MODULE_VERSION(HFI1_DRIVER_VERSION);
+
+/*
+ * MAX_PKT_RECV is the max # of packets processed per receive interrupt.
+ */
+#define MAX_PKT_RECV 64
+#define EGR_HEAD_UPDATE_THRESHOLD 16
+
+struct hfi1_ib_stats hfi1_stats;
+
+static int hfi1_caps_set(const char *val, const struct kernel_param *kp)
+{
+       int ret = 0;
+       unsigned long *cap_mask_ptr = (unsigned long *)kp->arg,
+               cap_mask = *cap_mask_ptr, value, diff,
+               write_mask = ((HFI1_CAP_WRITABLE_MASK << HFI1_CAP_USER_SHIFT) |
+                             HFI1_CAP_WRITABLE_MASK);
+
+       ret = kstrtoul(val, 0, &value);
+       if (ret) {
+               pr_warn("Invalid module parameter value for 'cap_mask'\n");
+               goto done;
+       }
+       /* Get the changed bits (except the locked bit) */
+       diff = value ^ (cap_mask & ~HFI1_CAP_LOCKED_SMASK);
+
+       /* Remove any bits that are not allowed to change after driver load */
+       if (HFI1_CAP_LOCKED() && (diff & ~write_mask)) {
+               pr_warn("Ignoring non-writable capability bits %#lx\n",
+                       diff & ~write_mask);
+               diff &= write_mask;
+       }
+
+       /* Mask off any reserved bits */
+       diff &= ~HFI1_CAP_RESERVED_MASK;
+       /* Clear any previously set and changing bits */
+       cap_mask &= ~diff;
+       /* Update the bits with the new capability */
+       cap_mask |= (value & diff);
+       /* Check for any kernel/user restrictions */
+       diff = (cap_mask & (HFI1_CAP_MUST_HAVE_KERN << HFI1_CAP_USER_SHIFT)) ^
+               ((cap_mask & HFI1_CAP_MUST_HAVE_KERN) << HFI1_CAP_USER_SHIFT);
+       cap_mask &= ~diff;
+       /* Set the bitmask to the final set */
+       *cap_mask_ptr = cap_mask;
+done:
+       return ret;
+}
+
+static int hfi1_caps_get(char *buffer, const struct kernel_param *kp)
+{
+       unsigned long cap_mask = *(unsigned long *)kp->arg;
+
+       cap_mask &= ~HFI1_CAP_LOCKED_SMASK;
+       cap_mask |= ((cap_mask & HFI1_CAP_K2U) << HFI1_CAP_USER_SHIFT);
+
+       return scnprintf(buffer, PAGE_SIZE, "0x%lx", cap_mask);
+}
+
+const char *get_unit_name(int unit)
+{
+       static char iname[16];
+
+       snprintf(iname, sizeof(iname), DRIVER_NAME "_%u", unit);
+       return iname;
+}
+
+const char *get_card_name(struct rvt_dev_info *rdi)
+{
+       struct hfi1_ibdev *ibdev = container_of(rdi, struct hfi1_ibdev, rdi);
+       struct hfi1_devdata *dd = container_of(ibdev,
+                                              struct hfi1_devdata, verbs_dev);
+       return get_unit_name(dd->unit);
+}
+
+struct pci_dev *get_pci_dev(struct rvt_dev_info *rdi)
+{
+       struct hfi1_ibdev *ibdev = container_of(rdi, struct hfi1_ibdev, rdi);
+       struct hfi1_devdata *dd = container_of(ibdev,
+                                              struct hfi1_devdata, verbs_dev);
+       return dd->pcidev;
+}
+
+/*
+ * Return count of units with at least one port ACTIVE.
+ */
+int hfi1_count_active_units(void)
+{
+       struct hfi1_devdata *dd;
+       struct hfi1_pportdata *ppd;
+       unsigned long flags;
+       int pidx, nunits_active = 0;
+
+       spin_lock_irqsave(&hfi1_devs_lock, flags);
+       list_for_each_entry(dd, &hfi1_dev_list, list) {
+               if (!(dd->flags & HFI1_PRESENT) || !dd->kregbase)
+                       continue;
+               for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+                       ppd = dd->pport + pidx;
+                       if (ppd->lid && ppd->linkup) {
+                               nunits_active++;
+                               break;
+                       }
+               }
+       }
+       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+       return nunits_active;
+}
+
+/*
+ * Return the count of all units; optionally return, via the arguments,
+ * the number of usable (present) units and the number of
+ * ports that are up.
+ */
+int hfi1_count_units(int *npresentp, int *nupp)
+{
+       int nunits = 0, npresent = 0, nup = 0;
+       struct hfi1_devdata *dd;
+       unsigned long flags;
+       int pidx;
+       struct hfi1_pportdata *ppd;
+
+       spin_lock_irqsave(&hfi1_devs_lock, flags);
+
+       list_for_each_entry(dd, &hfi1_dev_list, list) {
+               nunits++;
+               if ((dd->flags & HFI1_PRESENT) && dd->kregbase)
+                       npresent++;
+               for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+                       ppd = dd->pport + pidx;
+                       if (ppd->lid && ppd->linkup)
+                               nup++;
+               }
+       }
+
+       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+
+       if (npresentp)
+               *npresentp = npresent;
+       if (nupp)
+               *nupp = nup;
+
+       return nunits;
+}
+
+/*
+ * Get address of eager buffer from its index (allocated in chunks, not
+ * contiguous).
+ */
+static inline void *get_egrbuf(const struct hfi1_ctxtdata *rcd, u64 rhf,
+                              u8 *update)
+{
+       u32 idx = rhf_egr_index(rhf), offset = rhf_egr_buf_offset(rhf);
+
+       *update |= !(idx & (rcd->egrbufs.threshold - 1)) && !offset;
+       return (void *)(((u64)(rcd->egrbufs.rcvtids[idx].addr)) +
+                       (offset * RCV_BUF_BLOCK_SIZE));
+}
+
+/*
+ * Validate and encode a given RcvArray buffer size.
+ * The function will check whether the given size falls within
+ * allowed size ranges for the respective type and, optionally,
+ * return the proper encoding.
+ */
+inline int hfi1_rcvbuf_validate(u32 size, u8 type, u16 *encoded)
+{
+       if (unlikely(!PAGE_ALIGNED(size)))
+               return 0;
+       if (unlikely(size < MIN_EAGER_BUFFER))
+               return 0;
+       if (size >
+           (type == PT_EAGER ? MAX_EAGER_BUFFER : MAX_EXPECTED_BUFFER))
+               return 0;
+       if (encoded)
+               *encoded = ilog2(size / PAGE_SIZE) + 1;
+       return 1;
+}
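
(Editorial note, illustrative only and not part of the patch: a worked example of the encoding formula above, assuming PAGE_SIZE == 4096.)

    /*
     * Illustrative sketch only, assuming PAGE_SIZE == 4096 and looking at the
     * encoding formula ilog2(size / PAGE_SIZE) + 1 by itself:
     *   size =  4 KiB -> ilog2(1)  + 1 = 1
     *   size =  8 KiB -> ilog2(2)  + 1 = 2
     *   size = 64 KiB -> ilog2(16) + 1 = 5
     * Sizes that are not page aligned, below MIN_EAGER_BUFFER, or above the
     * per-type maximum make the function return 0 (invalid).
     */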
+
+static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd,
+                      struct hfi1_packet *packet)
+{
+       struct hfi1_message_header *rhdr = packet->hdr;
+       u32 rte = rhf_rcv_type_err(packet->rhf);
+       int lnh = be16_to_cpu(rhdr->lrh[0]) & 3;
+       struct hfi1_ibport *ibp = &ppd->ibport_data;
+       struct hfi1_devdata *dd = ppd->dd;
+       struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
+
+       if (packet->rhf & (RHF_VCRC_ERR | RHF_ICRC_ERR))
+               return;
+
+       if (packet->rhf & RHF_TID_ERR) {
+               /* For TIDERR and RC QPs preemptively schedule a NAK */
+               struct hfi1_ib_header *hdr = (struct hfi1_ib_header *)rhdr;
+               struct hfi1_other_headers *ohdr = NULL;
+               u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */
+               u16 lid  = be16_to_cpu(hdr->lrh[1]);
+               u32 qp_num;
+               u32 rcv_flags = 0;
+
+               /* Sanity check packet */
+               if (tlen < 24)
+                       goto drop;
+
+               /* Check for GRH */
+               if (lnh == HFI1_LRH_BTH) {
+                       ohdr = &hdr->u.oth;
+               } else if (lnh == HFI1_LRH_GRH) {
+                       u32 vtf;
+
+                       ohdr = &hdr->u.l.oth;
+                       if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR)
+                               goto drop;
+                       vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow);
+                       if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION)
+                               goto drop;
+                       rcv_flags |= HFI1_HAS_GRH;
+               } else {
+                       goto drop;
+               }
+               /* Get the destination QP number. */
+               qp_num = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
+               if (lid < be16_to_cpu(IB_MULTICAST_LID_BASE)) {
+                       struct rvt_qp *qp;
+                       unsigned long flags;
+
+                       rcu_read_lock();
+                       qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
+                       if (!qp) {
+                               rcu_read_unlock();
+                               goto drop;
+                       }
+
+                       /*
+                        * Handle only RC QPs - for other QP types drop error
+                        * packet.
+                        */
+                       spin_lock_irqsave(&qp->r_lock, flags);
+
+                       /* Check for valid receive state. */
+                       if (!(ib_rvt_state_ops[qp->state] &
+                             RVT_PROCESS_RECV_OK)) {
+                               ibp->rvp.n_pkt_drops++;
+                       }
+
+                       switch (qp->ibqp.qp_type) {
+                       case IB_QPT_RC:
+                               hfi1_rc_hdrerr(
+                                       rcd,
+                                       hdr,
+                                       rcv_flags,
+                                       qp);
+                               break;
+                       default:
+                               /* For now don't handle any other QP types */
+                               break;
+                       }
+
+                       spin_unlock_irqrestore(&qp->r_lock, flags);
+                       rcu_read_unlock();
+               } /* Unicast QP */
+       } /* Valid packet with TIDErr */
+
+       /* handle "RcvTypeErr" flags */
+       switch (rte) {
+       case RHF_RTE_ERROR_OP_CODE_ERR:
+       {
+               u32 opcode;
+               void *ebuf = NULL;
+               __be32 *bth = NULL;
+
+               if (rhf_use_egr_bfr(packet->rhf))
+                       ebuf = packet->ebuf;
+
+               if (!ebuf)
+                       goto drop; /* this should never happen */
+
+               if (lnh == HFI1_LRH_BTH)
+                       bth = (__be32 *)ebuf;
+               else if (lnh == HFI1_LRH_GRH)
+                       bth = (__be32 *)((char *)ebuf + sizeof(struct ib_grh));
+               else
+                       goto drop;
+
+               opcode = be32_to_cpu(bth[0]) >> 24;
+               opcode &= 0xff;
+
+               if (opcode == IB_OPCODE_CNP) {
+                       /*
+                        * Only in pre-B0 h/w is the CNP_OPCODE handled
+                        * via this code path.
+                        */
+                       struct rvt_qp *qp = NULL;
+                       u32 lqpn, rqpn;
+                       u16 rlid;
+                       u8 svc_type, sl, sc5;
+
+                       sc5  = (be16_to_cpu(rhdr->lrh[0]) >> 12) & 0xf;
+                       if (rhf_dc_info(packet->rhf))
+                               sc5 |= 0x10;
+                       sl = ibp->sc_to_sl[sc5];
+
+                       lqpn = be32_to_cpu(bth[1]) & RVT_QPN_MASK;
+                       rcu_read_lock();
+                       qp = rvt_lookup_qpn(rdi, &ibp->rvp, lqpn);
+                       if (!qp) {
+                               rcu_read_unlock();
+                               goto drop;
+                       }
+
+                       switch (qp->ibqp.qp_type) {
+                       case IB_QPT_UD:
+                               rlid = 0;
+                               rqpn = 0;
+                               svc_type = IB_CC_SVCTYPE_UD;
+                               break;
+                       case IB_QPT_UC:
+                               rlid = be16_to_cpu(rhdr->lrh[3]);
+                               rqpn = qp->remote_qpn;
+                               svc_type = IB_CC_SVCTYPE_UC;
+                               break;
+                       default:
+                               goto drop;
+                       }
+
+                       process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type);
+                       rcu_read_unlock();
+               }
+
+               packet->rhf &= ~RHF_RCV_TYPE_ERR_SMASK;
+               break;
+       }
+       default:
+               break;
+       }
+
+drop:
+       return;
+}
+
+static inline void init_packet(struct hfi1_ctxtdata *rcd,
+                              struct hfi1_packet *packet)
+{
+       packet->rsize = rcd->rcvhdrqentsize; /* words */
+       packet->maxcnt = rcd->rcvhdrq_cnt * packet->rsize; /* words */
+       packet->rcd = rcd;
+       packet->updegr = 0;
+       packet->etail = -1;
+       packet->rhf_addr = get_rhf_addr(rcd);
+       packet->rhf = rhf_to_cpu(packet->rhf_addr);
+       packet->rhqoff = rcd->head;
+       packet->numpkt = 0;
+       packet->rcv_flags = 0;
+}
+
+static void process_ecn(struct rvt_qp *qp, struct hfi1_ib_header *hdr,
+                       struct hfi1_other_headers *ohdr,
+                       u64 rhf, u32 bth1, struct ib_grh *grh)
+{
+       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+       u32 rqpn = 0;
+       u16 rlid;
+       u8 sc5, svc_type;
+
+       switch (qp->ibqp.qp_type) {
+       case IB_QPT_SMI:
+       case IB_QPT_GSI:
+       case IB_QPT_UD:
+               rlid = be16_to_cpu(hdr->lrh[3]);
+               rqpn = be32_to_cpu(ohdr->u.ud.deth[1]) & RVT_QPN_MASK;
+               svc_type = IB_CC_SVCTYPE_UD;
+               break;
+       case IB_QPT_UC:
+               rlid = qp->remote_ah_attr.dlid;
+               rqpn = qp->remote_qpn;
+               svc_type = IB_CC_SVCTYPE_UC;
+               break;
+       case IB_QPT_RC:
+               rlid = qp->remote_ah_attr.dlid;
+               rqpn = qp->remote_qpn;
+               svc_type = IB_CC_SVCTYPE_RC;
+               break;
+       default:
+               return;
+       }
+
+       sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
+       if (rhf_dc_info(rhf))
+               sc5 |= 0x10;
+
+       if (bth1 & HFI1_FECN_SMASK) {
+               u16 pkey = (u16)be32_to_cpu(ohdr->bth[0]);
+               u16 dlid = be16_to_cpu(hdr->lrh[1]);
+
+               return_cnp(ibp, qp, rqpn, pkey, dlid, rlid, sc5, grh);
+       }
+
+       if (bth1 & HFI1_BECN_SMASK) {
+               struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+               u32 lqpn = bth1 & RVT_QPN_MASK;
+               u8 sl = ibp->sc_to_sl[sc5];
+
+               process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type);
+       }
+}
+
+struct ps_mdata {
+       struct hfi1_ctxtdata *rcd;
+       u32 rsize;
+       u32 maxcnt;
+       u32 ps_head;
+       u32 ps_tail;
+       u32 ps_seq;
+};
+
+static inline void init_ps_mdata(struct ps_mdata *mdata,
+                                struct hfi1_packet *packet)
+{
+       struct hfi1_ctxtdata *rcd = packet->rcd;
+
+       mdata->rcd = rcd;
+       mdata->rsize = packet->rsize;
+       mdata->maxcnt = packet->maxcnt;
+       mdata->ps_head = packet->rhqoff;
+
+       if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) {
+               mdata->ps_tail = get_rcvhdrtail(rcd);
+               if (rcd->ctxt == HFI1_CTRL_CTXT)
+                       mdata->ps_seq = rcd->seq_cnt;
+               else
+                       mdata->ps_seq = 0; /* not used with DMA_RTAIL */
+       } else {
+               mdata->ps_tail = 0; /* used only with DMA_RTAIL*/
+               mdata->ps_seq = rcd->seq_cnt;
+       }
+}
+
+static inline int ps_done(struct ps_mdata *mdata, u64 rhf,
+                         struct hfi1_ctxtdata *rcd)
+{
+       if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL))
+               return mdata->ps_head == mdata->ps_tail;
+       return mdata->ps_seq != rhf_rcv_seq(rhf);
+}
+
+static inline int ps_skip(struct ps_mdata *mdata, u64 rhf,
+                         struct hfi1_ctxtdata *rcd)
+{
+       /*
+        * Control context can potentially receive an invalid rhf.
+        * Drop such packets.
+        */
+       if ((rcd->ctxt == HFI1_CTRL_CTXT) && (mdata->ps_head != mdata->ps_tail))
+               return mdata->ps_seq != rhf_rcv_seq(rhf);
+
+       return 0;
+}
+
+static inline void update_ps_mdata(struct ps_mdata *mdata,
+                                  struct hfi1_ctxtdata *rcd)
+{
+       mdata->ps_head += mdata->rsize;
+       if (mdata->ps_head >= mdata->maxcnt)
+               mdata->ps_head = 0;
+
+       /* Control context must do seq counting */
+       if (!HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ||
+           (rcd->ctxt == HFI1_CTRL_CTXT)) {
+               if (++mdata->ps_seq > 13)
+                       mdata->ps_seq = 1;
+       }
+}
+
+/*
+ * prescan_rxq - search through the receive queue looking for packets
+ * containing Explicit Congestion Notifications (FECNs or BECNs).
+ * When an ECN is found, process the Congestion Notification, and toggle
+ * it off.
+ * This is declared as a macro to allow quick checking of the port to avoid
+ * the overhead of a function call if not enabled.
+ */
+#define prescan_rxq(rcd, packet) \
+       do { \
+               if (rcd->ppd->cc_prescan) \
+                       __prescan_rxq(packet); \
+       } while (0)
+static void __prescan_rxq(struct hfi1_packet *packet)
+{
+       struct hfi1_ctxtdata *rcd = packet->rcd;
+       struct ps_mdata mdata;
+
+       init_ps_mdata(&mdata, packet);
+
+       while (1) {
+               struct hfi1_devdata *dd = rcd->dd;
+               struct hfi1_ibport *ibp = &rcd->ppd->ibport_data;
+               __le32 *rhf_addr = (__le32 *)rcd->rcvhdrq + mdata.ps_head +
+                                        dd->rhf_offset;
+               struct rvt_qp *qp;
+               struct hfi1_ib_header *hdr;
+               struct hfi1_other_headers *ohdr;
+               struct ib_grh *grh = NULL;
+               struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
+               u64 rhf = rhf_to_cpu(rhf_addr);
+               u32 etype = rhf_rcv_type(rhf), qpn, bth1;
+               int is_ecn = 0;
+               u8 lnh;
+
+               if (ps_done(&mdata, rhf, rcd))
+                       break;
+
+               if (ps_skip(&mdata, rhf, rcd))
+                       goto next;
+
+               if (etype != RHF_RCV_TYPE_IB)
+                       goto next;
+
+               hdr = (struct hfi1_ib_header *)
+                       hfi1_get_msgheader(dd, rhf_addr);
+               lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+
+               if (lnh == HFI1_LRH_BTH) {
+                       ohdr = &hdr->u.oth;
+               } else if (lnh == HFI1_LRH_GRH) {
+                       ohdr = &hdr->u.l.oth;
+                       grh = &hdr->u.l.grh;
+               } else {
+                       goto next; /* just in case */
+               }
+               bth1 = be32_to_cpu(ohdr->bth[1]);
+               is_ecn = !!(bth1 & (HFI1_FECN_SMASK | HFI1_BECN_SMASK));
+
+               if (!is_ecn)
+                       goto next;
+
+               qpn = bth1 & RVT_QPN_MASK;
+               rcu_read_lock();
+               qp = rvt_lookup_qpn(rdi, &ibp->rvp, qpn);
+
+               if (!qp) {
+                       rcu_read_unlock();
+                       goto next;
+               }
+
+               process_ecn(qp, hdr, ohdr, rhf, bth1, grh);
+               rcu_read_unlock();
+
+               /* turn off BECN, FECN */
+               bth1 &= ~(HFI1_FECN_SMASK | HFI1_BECN_SMASK);
+               ohdr->bth[1] = cpu_to_be32(bth1);
+next:
+               update_ps_mdata(&mdata, rcd);
+       }
+}
+
+static inline int skip_rcv_packet(struct hfi1_packet *packet, int thread)
+{
+       int ret = RCV_PKT_OK;
+
+       /* Set up for the next packet */
+       packet->rhqoff += packet->rsize;
+       if (packet->rhqoff >= packet->maxcnt)
+               packet->rhqoff = 0;
+
+       packet->numpkt++;
+       if (unlikely((packet->numpkt & (MAX_PKT_RECV - 1)) == 0)) {
+               if (thread) {
+                       cond_resched();
+               } else {
+                       ret = RCV_PKT_LIMIT;
+                       this_cpu_inc(*packet->rcd->dd->rcv_limit);
+               }
+       }
+
+       packet->rhf_addr = (__le32 *)packet->rcd->rcvhdrq + packet->rhqoff +
+                                    packet->rcd->dd->rhf_offset;
+       packet->rhf = rhf_to_cpu(packet->rhf_addr);
+
+       return ret;
+}
+
+static inline int process_rcv_packet(struct hfi1_packet *packet, int thread)
+{
+       int ret = RCV_PKT_OK;
+
+       packet->hdr = hfi1_get_msgheader(packet->rcd->dd,
+                                        packet->rhf_addr);
+       packet->hlen = (u8 *)packet->rhf_addr - (u8 *)packet->hdr;
+       packet->etype = rhf_rcv_type(packet->rhf);
+       /* total length */
+       packet->tlen = rhf_pkt_len(packet->rhf); /* in bytes */
+       /* retrieve eager buffer details */
+       packet->ebuf = NULL;
+       if (rhf_use_egr_bfr(packet->rhf)) {
+               packet->etail = rhf_egr_index(packet->rhf);
+               packet->ebuf = get_egrbuf(packet->rcd, packet->rhf,
+                                &packet->updegr);
+               /*
+                * Prefetch the contents of the eager buffer.  It is
+                * OK to send a negative length to prefetch_range().
+                * The +2 is the size of the RHF.
+                */
+               prefetch_range(packet->ebuf,
+                              packet->tlen - ((packet->rcd->rcvhdrqentsize -
+                                              (rhf_hdrq_offset(packet->rhf)
+                                               + 2)) * 4));
+       }
+
+       /*
+        * Call a type specific handler for the packet. We
+        * should be able to trust that etype won't be beyond
+        * the range of valid indexes. If it is, something is really
+        * wrong and we can probably just let things come
+        * crashing down. There is no need to eat another
+        * comparison in this performance critical code.
+        */
+       packet->rcd->dd->rhf_rcv_function_map[packet->etype](packet);
+       packet->numpkt++;
+
+       /* Set up for the next packet */
+       packet->rhqoff += packet->rsize;
+       if (packet->rhqoff >= packet->maxcnt)
+               packet->rhqoff = 0;
+
+       if (unlikely((packet->numpkt & (MAX_PKT_RECV - 1)) == 0)) {
+               if (thread) {
+                       cond_resched();
+               } else {
+                       ret = RCV_PKT_LIMIT;
+                       this_cpu_inc(*packet->rcd->dd->rcv_limit);
+               }
+       }
+
+       packet->rhf_addr = (__le32 *)packet->rcd->rcvhdrq + packet->rhqoff +
+                                     packet->rcd->dd->rhf_offset;
+       packet->rhf = rhf_to_cpu(packet->rhf_addr);
+
+       return ret;
+}
+
+static inline void process_rcv_update(int last, struct hfi1_packet *packet)
+{
+       /*
+        * Update head regs etc. every 16 packets, if this is not the last
+        * packet, to help prevent rcvhdrq overflows when many packets
+        * are processed and the queue is nearly full.
+        * Don't request an interrupt for intermediate updates.
+        */
+       if (!last && !(packet->numpkt & 0xf)) {
+               update_usrhead(packet->rcd, packet->rhqoff, packet->updegr,
+                              packet->etail, 0, 0);
+               packet->updegr = 0;
+       }
+       packet->rcv_flags = 0;
+}
+
+static inline void finish_packet(struct hfi1_packet *packet)
+{
+       /*
+        * Nothing we need to free for the packet.
+        *
+        * The only thing we need to do is a final update and call for an
+        * interrupt
+        */
+       update_usrhead(packet->rcd, packet->rcd->head, packet->updegr,
+                      packet->etail, rcv_intr_dynamic, packet->numpkt);
+}
+
+static inline void process_rcv_qp_work(struct hfi1_packet *packet)
+{
+       struct hfi1_ctxtdata *rcd;
+       struct rvt_qp *qp, *nqp;
+
+       rcd = packet->rcd;
+       rcd->head = packet->rhqoff;
+
+       /*
+        * Iterate over all QPs waiting to respond.
+        * The list won't change since the IRQ is only run on one CPU.
+        */
+       list_for_each_entry_safe(qp, nqp, &rcd->qp_wait_list, rspwait) {
+               list_del_init(&qp->rspwait);
+               if (qp->r_flags & RVT_R_RSP_NAK) {
+                       qp->r_flags &= ~RVT_R_RSP_NAK;
+                       hfi1_send_rc_ack(rcd, qp, 0);
+               }
+               if (qp->r_flags & RVT_R_RSP_SEND) {
+                       unsigned long flags;
+
+                       qp->r_flags &= ~RVT_R_RSP_SEND;
+                       spin_lock_irqsave(&qp->s_lock, flags);
+                       if (ib_rvt_state_ops[qp->state] &
+                                       RVT_PROCESS_OR_FLUSH_SEND)
+                               hfi1_schedule_send(qp);
+                       spin_unlock_irqrestore(&qp->s_lock, flags);
+               }
+               if (atomic_dec_and_test(&qp->refcount))
+                       wake_up(&qp->wait);
+       }
+}
+
+/*
+ * Handle receive interrupts when using the no dma rtail option.
+ */
+int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd, int thread)
+{
+       u32 seq;
+       int last = RCV_PKT_OK;
+       struct hfi1_packet packet;
+
+       init_packet(rcd, &packet);
+       seq = rhf_rcv_seq(packet.rhf);
+       if (seq != rcd->seq_cnt) {
+               last = RCV_PKT_DONE;
+               goto bail;
+       }
+
+       prescan_rxq(rcd, &packet);
+
+       while (last == RCV_PKT_OK) {
+               last = process_rcv_packet(&packet, thread);
+               seq = rhf_rcv_seq(packet.rhf);
+               if (++rcd->seq_cnt > 13)
+                       rcd->seq_cnt = 1;
+               if (seq != rcd->seq_cnt)
+                       last = RCV_PKT_DONE;
+               process_rcv_update(last, &packet);
+       }
+       process_rcv_qp_work(&packet);
+bail:
+       finish_packet(&packet);
+       return last;
+}
+
+int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd, int thread)
+{
+       u32 hdrqtail;
+       int last = RCV_PKT_OK;
+       struct hfi1_packet packet;
+
+       init_packet(rcd, &packet);
+       hdrqtail = get_rcvhdrtail(rcd);
+       if (packet.rhqoff == hdrqtail) {
+               last = RCV_PKT_DONE;
+               goto bail;
+       }
+       smp_rmb();  /* prevent speculative reads of dma'ed hdrq */
+
+       prescan_rxq(rcd, &packet);
+
+       while (last == RCV_PKT_OK) {
+               last = process_rcv_packet(&packet, thread);
+               if (packet.rhqoff == hdrqtail)
+                       last = RCV_PKT_DONE;
+               process_rcv_update(last, &packet);
+       }
+       process_rcv_qp_work(&packet);
+bail:
+       finish_packet(&packet);
+       return last;
+}
+
+static inline void set_all_nodma_rtail(struct hfi1_devdata *dd)
+{
+       int i;
+
+       for (i = HFI1_CTRL_CTXT + 1; i < dd->first_user_ctxt; i++)
+               dd->rcd[i]->do_interrupt =
+                       &handle_receive_interrupt_nodma_rtail;
+}
+
+static inline void set_all_dma_rtail(struct hfi1_devdata *dd)
+{
+       int i;
+
+       for (i = HFI1_CTRL_CTXT + 1; i < dd->first_user_ctxt; i++)
+               dd->rcd[i]->do_interrupt =
+                       &handle_receive_interrupt_dma_rtail;
+}
+
+void set_all_slowpath(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /* HFI1_CTRL_CTXT must always use the slow path interrupt handler */
+       for (i = HFI1_CTRL_CTXT + 1; i < dd->first_user_ctxt; i++)
+               dd->rcd[i]->do_interrupt = &handle_receive_interrupt;
+}
+
+static inline int set_armed_to_active(struct hfi1_ctxtdata *rcd,
+                                     struct hfi1_packet packet,
+                                     struct hfi1_devdata *dd)
+{
+       struct work_struct *lsaw = &rcd->ppd->linkstate_active_work;
+       struct hfi1_message_header *hdr = hfi1_get_msgheader(packet.rcd->dd,
+                                                            packet.rhf_addr);
+
+       if (hdr2sc(hdr, packet.rhf) != 0xf) {
+               int hwstate = read_logical_state(dd);
+
+               if (hwstate != LSTATE_ACTIVE) {
+                       dd_dev_info(dd, "Unexpected link state %d\n", hwstate);
+                       return 0;
+               }
+
+               queue_work(rcd->ppd->hfi1_wq, lsaw);
+               return 1;
+       }
+       return 0;
+}
+
+/*
+ * handle_receive_interrupt - receive a packet
+ * @rcd: the context
+ *
+ * Called from interrupt handler for errors or receive interrupt.
+ * This is the slow path interrupt handler.
+ */
+int handle_receive_interrupt(struct hfi1_ctxtdata *rcd, int thread)
+{
+       struct hfi1_devdata *dd = rcd->dd;
+       u32 hdrqtail;
+       int needset, last = RCV_PKT_OK;
+       struct hfi1_packet packet;
+       int skip_pkt = 0;
+
+       /* Control context will always use the slow path interrupt handler */
+       needset = (rcd->ctxt == HFI1_CTRL_CTXT) ? 0 : 1;
+
+       init_packet(rcd, &packet);
+
+       if (!HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) {
+               u32 seq = rhf_rcv_seq(packet.rhf);
+
+               if (seq != rcd->seq_cnt) {
+                       last = RCV_PKT_DONE;
+                       goto bail;
+               }
+               hdrqtail = 0;
+       } else {
+               hdrqtail = get_rcvhdrtail(rcd);
+               if (packet.rhqoff == hdrqtail) {
+                       last = RCV_PKT_DONE;
+                       goto bail;
+               }
+               smp_rmb();  /* prevent speculative reads of dma'ed hdrq */
+
+               /*
+                * Control context can potentially receive an invalid
+                * rhf. Drop such packets.
+                */
+               if (rcd->ctxt == HFI1_CTRL_CTXT) {
+                       u32 seq = rhf_rcv_seq(packet.rhf);
+
+                       if (seq != rcd->seq_cnt)
+                               skip_pkt = 1;
+               }
+       }
+
+       prescan_rxq(rcd, &packet);
+
+       while (last == RCV_PKT_OK) {
+               if (unlikely(dd->do_drop &&
+                            atomic_xchg(&dd->drop_packet, DROP_PACKET_OFF) ==
+                            DROP_PACKET_ON)) {
+                       dd->do_drop = 0;
+
+                       /* On to the next packet */
+                       packet.rhqoff += packet.rsize;
+                       packet.rhf_addr = (__le32 *)rcd->rcvhdrq +
+                                         packet.rhqoff +
+                                         dd->rhf_offset;
+                       packet.rhf = rhf_to_cpu(packet.rhf_addr);
+
+               } else if (skip_pkt) {
+                       last = skip_rcv_packet(&packet, thread);
+                       skip_pkt = 0;
+               } else {
+                       /* Auto activate link on non-SC15 packet receive */
+                       if (unlikely(rcd->ppd->host_link_state ==
+                                    HLS_UP_ARMED) &&
+                           set_armed_to_active(rcd, packet, dd))
+                               goto bail;
+                       last = process_rcv_packet(&packet, thread);
+               }
+
+               if (!HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) {
+                       u32 seq = rhf_rcv_seq(packet.rhf);
+
+                       if (++rcd->seq_cnt > 13)
+                               rcd->seq_cnt = 1;
+                       if (seq != rcd->seq_cnt)
+                               last = RCV_PKT_DONE;
+                       if (needset) {
+                               dd_dev_info(dd, "Switching to NO_DMA_RTAIL\n");
+                               set_all_nodma_rtail(dd);
+                               needset = 0;
+                       }
+               } else {
+                       if (packet.rhqoff == hdrqtail)
+                               last = RCV_PKT_DONE;
+                       /*
+                        * Control context can potentially receive an invalid
+                        * rhf. Drop such packets.
+                        */
+                       if (rcd->ctxt == HFI1_CTRL_CTXT) {
+                               u32 seq = rhf_rcv_seq(packet.rhf);
+
+                               if (++rcd->seq_cnt > 13)
+                                       rcd->seq_cnt = 1;
+                               if (!last && (seq != rcd->seq_cnt))
+                                       skip_pkt = 1;
+                       }
+
+                       if (needset) {
+                               dd_dev_info(dd,
+                                           "Switching to DMA_RTAIL\n");
+                               set_all_dma_rtail(dd);
+                               needset = 0;
+                       }
+               }
+
+               process_rcv_update(last, &packet);
+       }
+
+       process_rcv_qp_work(&packet);
+
+bail:
+       /*
+        * Always write head at end, and setup rcv interrupt, even
+        * if no packets were processed.
+        */
+       finish_packet(&packet);
+       return last;
+}
+
+/*
+ * We may discover in the interrupt that the hardware link state has
+ * changed from ARMED to ACTIVE (due to the arrival of a non-SC15 packet),
+ * and we need to update the driver's notion of the link state.  We cannot
+ * run set_link_state from interrupt context, so we queue this function on
+ * a workqueue.
+ *
+ * We delay the regular interrupt processing until after the state changes
+ * so that the link will be in the correct state by the time any application
+ * we wake up attempts to send a reply to any message it received.
+ * (Subsequent receive interrupts may possibly force the wakeup before we
+ * update the link state.)
+ *
+ * The rcd is freed in hfi1_free_ctxtdata after hfi1_postinit_cleanup invokes
+ * dd->f_cleanup(dd) to disable the interrupt handler and flush workqueues,
+ * so we're safe from use-after-free of the rcd.
+ */
+void receive_interrupt_work(struct work_struct *work)
+{
+       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+                                                 linkstate_active_work);
+       struct hfi1_devdata *dd = ppd->dd;
+       int i;
+
+       /* Received non-SC15 packet implies neighbor_normal */
+       ppd->neighbor_normal = 1;
+       set_link_state(ppd, HLS_UP_ACTIVE);
+
+       /*
+        * Interrupt all kernel contexts that could have had an
+        * interrupt during auto activation.
+        */
+       for (i = HFI1_CTRL_CTXT; i < dd->first_user_ctxt; i++)
+               force_recv_intr(dd->rcd[i]);
+}
+
+/*
+ * Convert a given MTU size to the on-wire MAD packet enumeration.
+ * Return default_if_bad if the size is invalid.
+ */
+int mtu_to_enum(u32 mtu, int default_if_bad)
+{
+       switch (mtu) {
+       case     0: return OPA_MTU_0;
+       case   256: return OPA_MTU_256;
+       case   512: return OPA_MTU_512;
+       case  1024: return OPA_MTU_1024;
+       case  2048: return OPA_MTU_2048;
+       case  4096: return OPA_MTU_4096;
+       case  8192: return OPA_MTU_8192;
+       case 10240: return OPA_MTU_10240;
+       }
+       return default_if_bad;
+}
+
+u16 enum_to_mtu(int mtu)
+{
+       switch (mtu) {
+       case OPA_MTU_0:     return 0;
+       case OPA_MTU_256:   return 256;
+       case OPA_MTU_512:   return 512;
+       case OPA_MTU_1024:  return 1024;
+       case OPA_MTU_2048:  return 2048;
+       case OPA_MTU_4096:  return 4096;
+       case OPA_MTU_8192:  return 8192;
+       case OPA_MTU_10240: return 10240;
+       default: return 0xffff;
+       }
+}
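
(Editorial note, illustrative only and not part of the patch: mtu_to_enum() and enum_to_mtu() are inverses for every MTU size listed above.)

    /*
     * Illustrative sketch only:
     *   enum_to_mtu(mtu_to_enum(2048, OPA_MTU_0)) == 2048
     *   mtu_to_enum(3000, OPA_MTU_2048) == OPA_MTU_2048, i.e. an unsupported
     *   size falls back to the caller-supplied default.
     */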
+
+/*
+ * set_mtu - set the MTU
+ * @ppd: the per port data
+ *
+ * We can handle "any" incoming size; the issue here is whether we
+ * need to restrict our outgoing size.  We do not deal with what happens
+ * to programs that are already running when the size changes.
+ */
+int set_mtu(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       int i, drain, ret = 0, is_up = 0;
+
+       ppd->ibmtu = 0;
+       for (i = 0; i < ppd->vls_supported; i++)
+               if (ppd->ibmtu < dd->vld[i].mtu)
+                       ppd->ibmtu = dd->vld[i].mtu;
+       ppd->ibmaxlen = ppd->ibmtu + lrh_max_header_bytes(ppd->dd);
+
+       mutex_lock(&ppd->hls_lock);
+       if (ppd->host_link_state == HLS_UP_INIT ||
+           ppd->host_link_state == HLS_UP_ARMED ||
+           ppd->host_link_state == HLS_UP_ACTIVE)
+               is_up = 1;
+
+       drain = !is_ax(dd) && is_up;
+
+       if (drain)
+               /*
+                * MTU is specified per-VL. To ensure that no packet gets
+                * stuck (due, e.g., to the MTU for the packet's VL being
+                * reduced), empty the per-VL FIFOs before adjusting MTU.
+                */
+               ret = stop_drain_data_vls(dd);
+
+       if (ret) {
+               dd_dev_err(dd, "%s: cannot stop/drain VLs - refusing to change per-VL MTUs\n",
+                          __func__);
+               goto err;
+       }
+
+       hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_MTU, 0);
+
+       if (drain)
+               open_fill_data_vls(dd); /* reopen all VLs */
+
+err:
+       mutex_unlock(&ppd->hls_lock);
+
+       return ret;
+}
+
+int hfi1_set_lid(struct hfi1_pportdata *ppd, u32 lid, u8 lmc)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+
+       ppd->lid = lid;
+       ppd->lmc = lmc;
+       hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LIDLMC, 0);
+
+       dd_dev_info(dd, "port %u: got a lid: 0x%x\n", ppd->port, lid);
+
+       return 0;
+}
+
+void shutdown_led_override(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+
+       /*
+        * This pairs with the memory barrier in hfi1_start_led_override to
+        * ensure that we read the correct state of LED beaconing represented
+        * by led_override_timer_active
+        */
+       smp_rmb();
+       if (atomic_read(&ppd->led_override_timer_active)) {
+               del_timer_sync(&ppd->led_override_timer);
+               atomic_set(&ppd->led_override_timer_active, 0);
+               /* Ensure the atomic_set is visible to all CPUs */
+               smp_wmb();
+       }
+
+       /* Hand control of the LED to the DC for normal operation */
+       write_csr(dd, DCC_CFG_LED_CNTRL, 0);
+}
+
+static void run_led_override(unsigned long opaque)
+{
+       struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)opaque;
+       struct hfi1_devdata *dd = ppd->dd;
+       unsigned long timeout;
+       int phase_idx;
+
+       if (!(dd->flags & HFI1_INITTED))
+               return;
+
+       phase_idx = ppd->led_override_phase & 1;
+
+       setextled(dd, phase_idx);
+
+       timeout = ppd->led_override_vals[phase_idx];
+
+       /* Set up for next phase */
+       ppd->led_override_phase = !ppd->led_override_phase;
+
+       mod_timer(&ppd->led_override_timer, jiffies + timeout);
+}
+
+/*
+ * To have the LED blink in a particular pattern, provide timeon and timeoff
+ * in milliseconds.
+ * To turn off custom blinking and return to normal operation, use
+ * shutdown_led_override()
+ */
+void hfi1_start_led_override(struct hfi1_pportdata *ppd, unsigned int timeon,
+                            unsigned int timeoff)
+{
+       if (!(ppd->dd->flags & HFI1_INITTED))
+               return;
+
+       /* Convert to jiffies for direct use in timer */
+       ppd->led_override_vals[0] = msecs_to_jiffies(timeoff);
+       ppd->led_override_vals[1] = msecs_to_jiffies(timeon);
+
+       /* Arbitrarily start from LED on phase */
+       ppd->led_override_phase = 1;
+
+       /*
+        * If the timer has not already been started, do so. Use a "quick"
+        * timeout so the handler will be called soon to look at our request.
+        */
+       if (!timer_pending(&ppd->led_override_timer)) {
+               setup_timer(&ppd->led_override_timer, run_led_override,
+                           (unsigned long)ppd);
+               ppd->led_override_timer.expires = jiffies + 1;
+               add_timer(&ppd->led_override_timer);
+               atomic_set(&ppd->led_override_timer_active, 1);
+               /* Ensure the atomic_set is visible to all CPUs */
+               smp_wmb();
+       }
+}
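
(Editorial note, illustrative only and not part of the patch: an assumed caller that wants a symmetric 1 Hz blink, based on the timeon/timeoff semantics documented above.)

    /*
     * Illustrative sketch only:
     *   hfi1_start_led_override(ppd, 500, 500);   500 ms on, 500 ms off
     *   ...
     *   shutdown_led_override(ppd);               return LED control to the DC
     */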
+
+/**
+ * hfi1_reset_device - reset the chip if possible
+ * @unit: the device to reset
+ *
+ * Whether or not reset is successful, we attempt to re-initialize the chip
+ * (that is, much like a driver unload/reload).  We clear the INITTED flag
+ * so that the various entry points will fail until we reinitialize.  For
+ * now, we only allow this if no user contexts are open that use chip resources
+ */
+int hfi1_reset_device(int unit)
+{
+       int ret, i;
+       struct hfi1_devdata *dd = hfi1_lookup(unit);
+       struct hfi1_pportdata *ppd;
+       unsigned long flags;
+       int pidx;
+
+       if (!dd) {
+               ret = -ENODEV;
+               goto bail;
+       }
+
+       dd_dev_info(dd, "Reset on unit %u requested\n", unit);
+
+       if (!dd->kregbase || !(dd->flags & HFI1_PRESENT)) {
+               dd_dev_info(dd,
+                           "Invalid unit number %u or not initialized or not present\n",
+                           unit);
+               ret = -ENXIO;
+               goto bail;
+       }
+
+       spin_lock_irqsave(&dd->uctxt_lock, flags);
+       if (dd->rcd)
+               for (i = dd->first_user_ctxt; i < dd->num_rcv_contexts; i++) {
+                       if (!dd->rcd[i] || !dd->rcd[i]->cnt)
+                               continue;
+                       spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+                       ret = -EBUSY;
+                       goto bail;
+               }
+       spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+
+       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+               ppd = dd->pport + pidx;
+
+               shutdown_led_override(ppd);
+       }
+       if (dd->flags & HFI1_HAS_SEND_DMA)
+               sdma_exit(dd);
+
+       hfi1_reset_cpu_counters(dd);
+
+       ret = hfi1_init(dd, 1);
+
+       if (ret)
+               dd_dev_err(dd,
+                          "Reinitialize unit %u after reset failed with %d\n",
+                          unit, ret);
+       else
+               dd_dev_info(dd, "Reinitialized unit %u after resetting\n",
+                           unit);
+
+bail:
+       return ret;
+}
+
+void handle_eflags(struct hfi1_packet *packet)
+{
+       struct hfi1_ctxtdata *rcd = packet->rcd;
+       u32 rte = rhf_rcv_type_err(packet->rhf);
+
+       rcv_hdrerr(rcd, rcd->ppd, packet);
+       if (rhf_err_flags(packet->rhf))
+               dd_dev_err(rcd->dd,
+                          "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s%s] rte 0x%x\n",
+                          rcd->ctxt, packet->rhf,
+                          packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "",
+                          packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "",
+                          packet->rhf & RHF_DC_ERR ? "dc " : "",
+                          packet->rhf & RHF_TID_ERR ? "tid " : "",
+                          packet->rhf & RHF_LEN_ERR ? "len " : "",
+                          packet->rhf & RHF_ECC_ERR ? "ecc " : "",
+                          packet->rhf & RHF_VCRC_ERR ? "vcrc " : "",
+                          packet->rhf & RHF_ICRC_ERR ? "icrc " : "",
+                          rte);
+}
+
+/*
+ * The following functions are called by the interrupt handler. They are type
+ * specific handlers for each packet type.
+ */
+int process_receive_ib(struct hfi1_packet *packet)
+{
+       trace_hfi1_rcvhdr(packet->rcd->ppd->dd,
+                         packet->rcd->ctxt,
+                         rhf_err_flags(packet->rhf),
+                         RHF_RCV_TYPE_IB,
+                         packet->hlen,
+                         packet->tlen,
+                         packet->updegr,
+                         rhf_egr_index(packet->rhf));
+
+       if (unlikely(rhf_err_flags(packet->rhf))) {
+               handle_eflags(packet);
+               return RHF_RCV_CONTINUE;
+       }
+
+       hfi1_ib_rcv(packet);
+       return RHF_RCV_CONTINUE;
+}
+
+int process_receive_bypass(struct hfi1_packet *packet)
+{
+       if (unlikely(rhf_err_flags(packet->rhf)))
+               handle_eflags(packet);
+
+       dd_dev_err(packet->rcd->dd,
+                  "Bypass packets are not supported in normal operation. Dropping\n");
+       return RHF_RCV_CONTINUE;
+}
+
+int process_receive_error(struct hfi1_packet *packet)
+{
+       handle_eflags(packet);
+
+       if (unlikely(rhf_err_flags(packet->rhf)))
+               dd_dev_err(packet->rcd->dd,
+                          "Unhandled error packet received. Dropping.\n");
+
+       return RHF_RCV_CONTINUE;
+}
+
+int kdeth_process_expected(struct hfi1_packet *packet)
+{
+       if (unlikely(rhf_err_flags(packet->rhf)))
+               handle_eflags(packet);
+
+       dd_dev_err(packet->rcd->dd,
+                  "Unhandled expected packet received. Dropping.\n");
+       return RHF_RCV_CONTINUE;
+}
+
+int kdeth_process_eager(struct hfi1_packet *packet)
+{
+       if (unlikely(rhf_err_flags(packet->rhf)))
+               handle_eflags(packet);
+
+       dd_dev_err(packet->rcd->dd,
+                  "Unhandled eager packet received. Dropping.\n");
+       return RHF_RCV_CONTINUE;
+}
+
+int process_receive_invalid(struct hfi1_packet *packet)
+{
+       dd_dev_err(packet->rcd->dd, "Invalid packet type %d. Dropping\n",
+                  rhf_rcv_type(packet->rhf));
+       return RHF_RCV_CONTINUE;
+}
diff --git a/drivers/infiniband/hw/hfi1/efivar.c b/drivers/infiniband/hw/hfi1/efivar.c
new file mode 100644
index 0000000..106349f
--- /dev/null
@@ -0,0 +1,164 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "efivar.h"
+
+/* GUID for HFI1 variables in EFI */
+#define HFI1_EFIVAR_GUID EFI_GUID(0xc50a953e, 0xa8b2, 0x42a6, \
+               0xbf, 0x89, 0xd3, 0x33, 0xa6, 0xe9, 0xe6, 0xd4)
+/* largest EFI data size we expect */
+#define EFI_DATA_SIZE 4096
+
+/*
+ * Read the named EFI variable.  Return the size of the actual data in *size
+ * and a kmalloc'ed buffer in *return_data.  The caller must free the
+ * data.  It is guaranteed that *return_data will be NULL and *size = 0
+ * if this routine fails.
+ *
+ * Return 0 on success, -errno on failure.
+ */
+static int read_efi_var(const char *name, unsigned long *size,
+                       void **return_data)
+{
+       efi_status_t status;
+       efi_char16_t *uni_name;
+       efi_guid_t guid;
+       unsigned long temp_size;
+       void *temp_buffer;
+       void *data;
+       int i;
+       int ret;
+
+       /* set failure return values */
+       *size = 0;
+       *return_data = NULL;
+
+       if (!efi_enabled(EFI_RUNTIME_SERVICES))
+               return -EOPNOTSUPP;
+
+       uni_name = kcalloc(strlen(name) + 1, sizeof(efi_char16_t), GFP_KERNEL);
+       temp_buffer = kzalloc(EFI_DATA_SIZE, GFP_KERNEL);
+
+       if (!uni_name || !temp_buffer) {
+               ret = -ENOMEM;
+               goto fail;
+       }
+
+       /* input: the size of the buffer */
+       temp_size = EFI_DATA_SIZE;
+
+       /* convert ASCII to unicode - it is a 1:1 mapping */
+       for (i = 0; name[i]; i++)
+               uni_name[i] = name[i];
+
+       /* need a variable for our GUID */
+       guid = HFI1_EFIVAR_GUID;
+
+       /* call into EFI runtime services */
+       status = efi.get_variable(
+                       uni_name,
+                       &guid,
+                       NULL,
+                       &temp_size,
+                       temp_buffer);
+
+       /*
+        * It would be nice to call efi_status_to_err() here, but that
+        * is in the EFIVAR_FS code and may not be compiled in.
+        * However, even that is insufficient since it does not cover
+        * EFI_BUFFER_TOO_SMALL which could be an important return.
+        * For now, just split out success or not found.
+        */
+       ret = status == EFI_SUCCESS   ? 0 :
+             status == EFI_NOT_FOUND ? -ENOENT :
+                                       -EINVAL;
+       if (ret)
+               goto fail;
+
+       /*
+        * We have successfully read the EFI variable into our
+        * temporary buffer.  Now allocate a correctly sized
+        * buffer.
+        */
+       data = kmemdup(temp_buffer, temp_size, GFP_KERNEL);
+       if (!data) {
+               ret = -ENOMEM;
+               goto fail;
+       }
+
+       *size = temp_size;
+       *return_data = data;
+
+fail:
+       kfree(uni_name);
+       kfree(temp_buffer);
+
+       return ret;
+}
+
+/*
+ * Read an HFI1 EFI variable of the form:
+ *     <PCIe address>-<kind>
+ * Return a kmalloc'ed array and the size of the data.
+ *
+ * Returns 0 on success, -errno on failure.
+ */
+int read_hfi1_efi_var(struct hfi1_devdata *dd, const char *kind,
+                     unsigned long *size, void **return_data)
+{
+       char name[64];
+
+       /* create a common prefix */
+       snprintf(name, sizeof(name), "%04x:%02x:%02x.%x-%s",
+                pci_domain_nr(dd->pcidev->bus),
+                dd->pcidev->bus->number,
+                PCI_SLOT(dd->pcidev->devfn),
+                PCI_FUNC(dd->pcidev->devfn),
+                kind);
+
+       return read_efi_var(name, size, return_data);
+}
diff --git a/drivers/infiniband/hw/hfi1/efivar.h b/drivers/infiniband/hw/hfi1/efivar.h
new file mode 100644
index 0000000..94e9e70
--- /dev/null
@@ -0,0 +1,57 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef _HFI1_EFIVAR_H
+#define _HFI1_EFIVAR_H
+
+#include <linux/efi.h>
+
+#include "hfi.h"
+
+int read_hfi1_efi_var(struct hfi1_devdata *dd, const char *kind,
+                     unsigned long *size, void **return_data);
+
+#endif /* _HFI1_EFIVAR_H */
diff --git a/drivers/infiniband/hw/hfi1/eprom.c b/drivers/infiniband/hw/hfi1/eprom.c
new file mode 100644
index 0000000..36b7794
--- /dev/null
@@ -0,0 +1,102 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/delay.h>
+#include "hfi.h"
+#include "common.h"
+#include "eprom.h"
+
+#define CMD_SHIFT 24
+#define CMD_RELEASE_POWERDOWN_NOID  ((0xab << CMD_SHIFT))
+
+/* controller interface speeds */
+#define EP_SPEED_FULL 0x2      /* full speed */
+
+/*
+ * How long to wait for the EPROM to become available, in ms.
+ * Per the spec, a 32 Mb EPROM takes around 40s to erase and then write.
+ * Double it for safety.
+ */
+#define EPROM_TIMEOUT 80000 /* ms */
+/*
+ * Initialize the EPROM handler.
+ */
+int eprom_init(struct hfi1_devdata *dd)
+{
+       int ret = 0;
+
+       /* only the discrete chip has an EPROM */
+       if (dd->pcidev->device != PCI_DEVICE_ID_INTEL0)
+               return 0;
+
+       /*
+        * It is OK if both HFIs reset the EPROM as long as they don't
+        * do it at the same time.
+        */
+       ret = acquire_chip_resource(dd, CR_EPROM, EPROM_TIMEOUT);
+       if (ret) {
+               dd_dev_err(dd,
+                          "%s: unable to acquire EPROM resource, no EPROM support\n",
+                          __func__);
+               goto done_asic;
+       }
+
+       /* reset EPROM to be sure it is in a good state */
+
+       /* set reset */
+       write_csr(dd, ASIC_EEP_CTL_STAT, ASIC_EEP_CTL_STAT_EP_RESET_SMASK);
+       /* clear reset, set speed */
+       write_csr(dd, ASIC_EEP_CTL_STAT,
+                 EP_SPEED_FULL << ASIC_EEP_CTL_STAT_RATE_SPI_SHIFT);
+
+       /* wake the device with command "release powerdown NoID" */
+       write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_RELEASE_POWERDOWN_NOID);
+
+       dd->eprom_available = true;
+       release_chip_resource(dd, CR_EPROM);
+done_asic:
+       return ret;
+}
diff --git a/drivers/infiniband/hw/hfi1/eprom.h b/drivers/infiniband/hw/hfi1/eprom.h
new file mode 100644
index 0000000..d41f0b1
--- /dev/null
@@ -0,0 +1,52 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+struct hfi1_cmd;
+struct hfi1_devdata;
+
+int eprom_init(struct hfi1_devdata *dd);
+int handle_eprom_command(struct file *fp, const struct hfi1_cmd *cmd);
diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c
new file mode 100644 (file)
index 0000000..7a5b0e6
--- /dev/null
@@ -0,0 +1,1498 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/poll.h>
+#include <linux/cdev.h>
+#include <linux/vmalloc.h>
+#include <linux/io.h>
+
+#include <rdma/ib.h>
+
+#include "hfi.h"
+#include "pio.h"
+#include "device.h"
+#include "common.h"
+#include "trace.h"
+#include "user_sdma.h"
+#include "user_exp_rcv.h"
+#include "eprom.h"
+#include "aspm.h"
+#include "mmu_rb.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) DRIVER_NAME ": " fmt
+
+#define SEND_CTXT_HALT_TIMEOUT 1000 /* msecs */
+
+/*
+ * File operation functions
+ */
+static int hfi1_file_open(struct inode *, struct file *);
+static int hfi1_file_close(struct inode *, struct file *);
+static ssize_t hfi1_write_iter(struct kiocb *, struct iov_iter *);
+static unsigned int hfi1_poll(struct file *, struct poll_table_struct *);
+static int hfi1_file_mmap(struct file *, struct vm_area_struct *);
+
+static u64 kvirt_to_phys(void *);
+static int assign_ctxt(struct file *, struct hfi1_user_info *);
+static int init_subctxts(struct hfi1_ctxtdata *, const struct hfi1_user_info *);
+static int user_init(struct file *);
+static int get_ctxt_info(struct file *, void __user *, __u32);
+static int get_base_info(struct file *, void __user *, __u32);
+static int setup_ctxt(struct file *);
+static int setup_subctxt(struct hfi1_ctxtdata *);
+static int get_user_context(struct file *, struct hfi1_user_info *, int);
+static int find_shared_ctxt(struct file *, const struct hfi1_user_info *);
+static int allocate_ctxt(struct file *, struct hfi1_devdata *,
+                        struct hfi1_user_info *);
+static unsigned int poll_urgent(struct file *, struct poll_table_struct *);
+static unsigned int poll_next(struct file *, struct poll_table_struct *);
+static int user_event_ack(struct hfi1_ctxtdata *, int, unsigned long);
+static int set_ctxt_pkey(struct hfi1_ctxtdata *, unsigned, u16);
+static int manage_rcvq(struct hfi1_ctxtdata *, unsigned, int);
+static int vma_fault(struct vm_area_struct *, struct vm_fault *);
+static long hfi1_file_ioctl(struct file *fp, unsigned int cmd,
+                           unsigned long arg);
+
+static const struct file_operations hfi1_file_ops = {
+       .owner = THIS_MODULE,
+       .write_iter = hfi1_write_iter,
+       .open = hfi1_file_open,
+       .release = hfi1_file_close,
+       .unlocked_ioctl = hfi1_file_ioctl,
+       .poll = hfi1_poll,
+       .mmap = hfi1_file_mmap,
+       .llseek = noop_llseek,
+};
+
+static const struct vm_operations_struct vm_ops = {
+       .fault = vma_fault,
+};
+
+/*
+ * Types of memory mapped into user processes' space
+ */
+enum mmap_types {
+       PIO_BUFS = 1,
+       PIO_BUFS_SOP,
+       PIO_CRED,
+       RCV_HDRQ,
+       RCV_EGRBUF,
+       UREGS,
+       EVENTS,
+       STATUS,
+       RTAIL,
+       SUBCTXT_UREGS,
+       SUBCTXT_RCV_HDRQ,
+       SUBCTXT_EGRBUF,
+       SDMA_COMP
+};
+
+/*
+ * Masks and offsets defining the mmap tokens
+ */
+#define HFI1_MMAP_OFFSET_MASK   0xfffULL
+#define HFI1_MMAP_OFFSET_SHIFT  0
+#define HFI1_MMAP_SUBCTXT_MASK  0xfULL
+#define HFI1_MMAP_SUBCTXT_SHIFT 12
+#define HFI1_MMAP_CTXT_MASK     0xffULL
+#define HFI1_MMAP_CTXT_SHIFT    16
+#define HFI1_MMAP_TYPE_MASK     0xfULL
+#define HFI1_MMAP_TYPE_SHIFT    24
+#define HFI1_MMAP_MAGIC_MASK    0xffffffffULL
+#define HFI1_MMAP_MAGIC_SHIFT   32
+
+#define HFI1_MMAP_MAGIC         0xdabbad00
+
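+/*
+ * mmap token layout (64 bits), derived from the masks/shifts above:
+ *   [63:32] magic    [27:24] type    [23:16] ctxt
+ *   [15:12] subctxt  [11:0]  page offset of the mapped address
+ */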
+#define HFI1_MMAP_TOKEN_SET(field, val)        \
+       (((val) & HFI1_MMAP_##field##_MASK) << HFI1_MMAP_##field##_SHIFT)
+#define HFI1_MMAP_TOKEN_GET(field, token) \
+       (((token) >> HFI1_MMAP_##field##_SHIFT) & HFI1_MMAP_##field##_MASK)
+#define HFI1_MMAP_TOKEN(type, ctxt, subctxt, addr)   \
+       (HFI1_MMAP_TOKEN_SET(MAGIC, HFI1_MMAP_MAGIC) | \
+       HFI1_MMAP_TOKEN_SET(TYPE, type) | \
+       HFI1_MMAP_TOKEN_SET(CTXT, ctxt) | \
+       HFI1_MMAP_TOKEN_SET(SUBCTXT, subctxt) | \
+       HFI1_MMAP_TOKEN_SET(OFFSET, (offset_in_page(addr))))
+
+#define dbg(fmt, ...)                          \
+       pr_info(fmt, ##__VA_ARGS__)
+
+static inline int is_valid_mmap(u64 token)
+{
+       return (HFI1_MMAP_TOKEN_GET(MAGIC, token) == HFI1_MMAP_MAGIC);
+}
+
+static int hfi1_file_open(struct inode *inode, struct file *fp)
+{
+       struct hfi1_devdata *dd = container_of(inode->i_cdev,
+                                              struct hfi1_devdata,
+                                              user_cdev);
+
+       /* Just take a ref now. Not all opens result in a context assign */
+       kobject_get(&dd->kobj);
+
+       /* The real work is performed later in assign_ctxt() */
+       fp->private_data = kzalloc(sizeof(struct hfi1_filedata), GFP_KERNEL);
+       if (fp->private_data) /* no cpu affinity by default */
+               ((struct hfi1_filedata *)fp->private_data)->rec_cpu_num = -1;
+       return fp->private_data ? 0 : -ENOMEM;
+}
+
+static long hfi1_file_ioctl(struct file *fp, unsigned int cmd,
+                           unsigned long arg)
+{
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       struct hfi1_user_info uinfo;
+       struct hfi1_tid_info tinfo;
+       int ret = 0;
+       unsigned long addr;
+       int uval = 0;
+       unsigned long ul_uval = 0;
+       u16 uval16 = 0;
+
+       hfi1_cdbg(IOCTL, "IOCTL recv: 0x%x", cmd);
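+       /*
+        * Only ASSIGN_CTXT and GET_VERS are valid before a receive
+        * context has been assigned to this file.
+        */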
+       if (cmd != HFI1_IOCTL_ASSIGN_CTXT &&
+           cmd != HFI1_IOCTL_GET_VERS &&
+           !uctxt)
+               return -EINVAL;
+
+       switch (cmd) {
+       case HFI1_IOCTL_ASSIGN_CTXT:
+               if (copy_from_user(&uinfo,
+                                  (struct hfi1_user_info __user *)arg,
+                                  sizeof(uinfo)))
+                       return -EFAULT;
+
+               ret = assign_ctxt(fp, &uinfo);
+               if (ret < 0)
+                       return ret;
+               ret = setup_ctxt(fp);
+               if (ret)
+                       return ret;
+               ret = user_init(fp);
+               break;
+       case HFI1_IOCTL_CTXT_INFO:
+               ret = get_ctxt_info(fp, (void __user *)(unsigned long)arg,
+                                   sizeof(struct hfi1_ctxt_info));
+               break;
+       case HFI1_IOCTL_USER_INFO:
+               ret = get_base_info(fp, (void __user *)(unsigned long)arg,
+                                   sizeof(struct hfi1_base_info));
+               break;
+       case HFI1_IOCTL_CREDIT_UPD:
+               if (uctxt && uctxt->sc)
+                       sc_return_credits(uctxt->sc);
+               break;
+
+       case HFI1_IOCTL_TID_UPDATE:
+               if (copy_from_user(&tinfo,
+                                  (struct hfi1_tid_info __user *)arg,
+                                  sizeof(tinfo)))
+                       return -EFAULT;
+
+               ret = hfi1_user_exp_rcv_setup(fp, &tinfo);
+               if (!ret) {
+                       /*
+                        * Copy the number of tidlist entries we used
+                        * and the length of the buffer we registered.
+                        * These fields are adjacent in the structure so
+                        * we can copy them at the same time.
+                        */
+                       addr = arg + offsetof(struct hfi1_tid_info, tidcnt);
+                       if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
+                                        sizeof(tinfo.tidcnt) +
+                                        sizeof(tinfo.length)))
+                               ret = -EFAULT;
+               }
+               break;
+
+       case HFI1_IOCTL_TID_FREE:
+               if (copy_from_user(&tinfo,
+                                  (struct hfi1_tid_info __user *)arg,
+                                  sizeof(tinfo)))
+                       return -EFAULT;
+
+               ret = hfi1_user_exp_rcv_clear(fp, &tinfo);
+               if (ret)
+                       break;
+               addr = arg + offsetof(struct hfi1_tid_info, tidcnt);
+               if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
+                                sizeof(tinfo.tidcnt)))
+                       ret = -EFAULT;
+               break;
+
+       case HFI1_IOCTL_TID_INVAL_READ:
+               if (copy_from_user(&tinfo,
+                                  (struct hfi1_tid_info __user *)arg,
+                                  sizeof(tinfo)))
+                       return -EFAULT;
+
+               ret = hfi1_user_exp_rcv_invalid(fp, &tinfo);
+               if (ret)
+                       break;
+               addr = arg + offsetof(struct hfi1_tid_info, tidcnt);
+               if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
+                                sizeof(tinfo.tidcnt)))
+                       ret = -EFAULT;
+               break;
+
+       case HFI1_IOCTL_RECV_CTRL:
+               ret = get_user(uval, (int __user *)arg);
+               if (ret != 0)
+                       return -EFAULT;
+               ret = manage_rcvq(uctxt, fd->subctxt, uval);
+               break;
+
+       case HFI1_IOCTL_POLL_TYPE:
+               ret = get_user(uval, (int __user *)arg);
+               if (ret != 0)
+                       return -EFAULT;
+               uctxt->poll_type = (typeof(uctxt->poll_type))uval;
+               break;
+
+       case HFI1_IOCTL_ACK_EVENT:
+               ret = get_user(ul_uval, (unsigned long __user *)arg);
+               if (ret != 0)
+                       return -EFAULT;
+               ret = user_event_ack(uctxt, fd->subctxt, ul_uval);
+               break;
+
+       case HFI1_IOCTL_SET_PKEY:
+               ret = get_user(uval16, (u16 __user *)arg);
+               if (ret != 0)
+                       return -EFAULT;
+               if (HFI1_CAP_IS_USET(PKEY_CHECK))
+                       ret = set_ctxt_pkey(uctxt, fd->subctxt, uval16);
+               else
+                       return -EPERM;
+               break;
+
+       case HFI1_IOCTL_CTXT_RESET: {
+               struct send_context *sc;
+               struct hfi1_devdata *dd;
+
+               if (!uctxt || !uctxt->dd || !uctxt->sc)
+                       return -EINVAL;
+
+               /*
+                * There is no protection here. User level has to
+                * guarantee that no one will be writing to the send
+                * context while it is being re-initialized.
+                * If user level breaks that guarantee, it will break
+                * its own context and no one else's.
+                */
+               dd = uctxt->dd;
+               sc = uctxt->sc;
+               /*
+                * Wait until the interrupt handler has marked the
+                * context as halted or frozen. Report error if we time
+                * out.
+                */
+               wait_event_interruptible_timeout(
+                       sc->halt_wait, (sc->flags & SCF_HALTED),
+                       msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT));
+               if (!(sc->flags & SCF_HALTED))
+                       return -ENOLCK;
+
+               /*
+                * If the send context was halted due to a Freeze,
+                * wait until the device has been "unfrozen" before
+                * resetting the context.
+                */
+               if (sc->flags & SCF_FROZEN) {
+                       wait_event_interruptible_timeout(
+                               dd->event_queue,
+                               !(ACCESS_ONCE(dd->flags) & HFI1_FROZEN),
+                               msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT));
+                       if (dd->flags & HFI1_FROZEN)
+                               return -ENOLCK;
+
+                       if (dd->flags & HFI1_FORCED_FREEZE)
+                               /*
+                                * Don't allow a context reset while we
+                                * are in a forced freeze.
+                                */
+                               return -ENODEV;
+
+                       sc_disable(sc);
+                       ret = sc_enable(sc);
+                       hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB,
+                                    uctxt->ctxt);
+               } else {
+                       ret = sc_restart(sc);
+               }
+               if (!ret)
+                       sc_return_credits(sc);
+               break;
+       }
+
+       case HFI1_IOCTL_GET_VERS:
+               uval = HFI1_USER_SWVERSION;
+               if (put_user(uval, (int __user *)arg))
+                       return -EFAULT;
+               break;
+
+       default:
+               return -EINVAL;
+       }
+
+       return ret;
+}
+
+static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from)
+{
+       struct hfi1_filedata *fd = kiocb->ki_filp->private_data;
+       struct hfi1_user_sdma_pkt_q *pq = fd->pq;
+       struct hfi1_user_sdma_comp_q *cq = fd->cq;
+       int ret = 0, done = 0, reqs = 0;
+       unsigned long dim = from->nr_segs;
+
+       if (!cq || !pq) {
+               ret = -EIO;
+               goto done;
+       }
+
+       if (!iter_is_iovec(from) || !dim) {
+               ret = -EINVAL;
+               goto done;
+       }
+
+       hfi1_cdbg(SDMA, "SDMA request from %u:%u (%lu)",
+                 fd->uctxt->ctxt, fd->subctxt, dim);
+
+       if (atomic_read(&pq->n_reqs) == pq->n_max_reqs) {
+               ret = -ENOSPC;
+               goto done;
+       }
+
+       while (dim) {
+               unsigned long count = 0;
+
+               ret = hfi1_user_sdma_process_request(
+                       kiocb->ki_filp, (struct iovec *)(from->iov + done),
+                       dim, &count);
+               if (ret)
+                       goto done;
+               dim -= count;
+               done += count;
+               reqs++;
+       }
+done:
+       return ret ? ret : reqs;
+}
+
+static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
+{
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       struct hfi1_devdata *dd;
+       unsigned long flags, pfn;
+       u64 token = vma->vm_pgoff << PAGE_SHIFT,
+               memaddr = 0;
+       u8 subctxt, mapio = 0, vmf = 0, type;
+       ssize_t memlen = 0;
+       int ret = 0;
+       u16 ctxt;
+
+       if (!is_valid_mmap(token) || !uctxt ||
+           !(vma->vm_flags & VM_SHARED)) {
+               ret = -EINVAL;
+               goto done;
+       }
+       dd = uctxt->dd;
+       ctxt = HFI1_MMAP_TOKEN_GET(CTXT, token);
+       subctxt = HFI1_MMAP_TOKEN_GET(SUBCTXT, token);
+       type = HFI1_MMAP_TOKEN_GET(TYPE, token);
+       if (ctxt != uctxt->ctxt || subctxt != fd->subctxt) {
+               ret = -EINVAL;
+               goto done;
+       }
+
+       flags = vma->vm_flags;
+
+       switch (type) {
+       case PIO_BUFS:
+       case PIO_BUFS_SOP:
+               memaddr = ((dd->physaddr + TXE_PIO_SEND) +
+                               /* chip pio base */
+                          (uctxt->sc->hw_context * BIT(16))) +
+                               /* 64K PIO space / ctxt */
+                       (type == PIO_BUFS_SOP ?
+                               (TXE_PIO_SIZE / 2) : 0); /* sop? */
+               /*
+                * Map only the amount allocated to the context, not the
+                * entire available context's PIO space.
+                */
+               memlen = PAGE_ALIGN(uctxt->sc->credits * PIO_BLOCK_SIZE);
+               flags &= ~VM_MAYREAD;
+               flags |= VM_DONTCOPY | VM_DONTEXPAND;
+               vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
+               mapio = 1;
+               break;
+       case PIO_CRED:
+               if (flags & VM_WRITE) {
+                       ret = -EPERM;
+                       goto done;
+               }
+               /*
+                * The credit return location for this context could be on the
+                * second or third page allocated for credit returns (if the
+                * number of enabled contexts is > 64 or > 128, respectively).
+                */
+               memaddr = dd->cr_base[uctxt->numa_id].pa +
+                       (((u64)uctxt->sc->hw_free -
+                         (u64)dd->cr_base[uctxt->numa_id].va) & PAGE_MASK);
+               memlen = PAGE_SIZE;
+               flags &= ~VM_MAYWRITE;
+               flags |= VM_DONTCOPY | VM_DONTEXPAND;
+               /*
+                * The driver has already allocated memory for credit
+                * returns and programmed it into the chip. Has that
+                * memory been flagged as non-cached?
+                */
+               /* vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); */
+               mapio = 1;
+               break;
+       case RCV_HDRQ:
+               memaddr = uctxt->rcvhdrq_phys;
+               memlen = uctxt->rcvhdrq_size;
+               break;
+       case RCV_EGRBUF: {
+               unsigned long addr;
+               int i;
+               /*
+                * The RcvEgr buffers need to be handled differently
+                * as multiple non-contiguous pages need to be mapped
+                * into the user process.
+                */
+               memlen = uctxt->egrbufs.size;
+               if ((vma->vm_end - vma->vm_start) != memlen) {
+                       dd_dev_err(dd, "Eager buffer map size invalid (%lu != %lu)\n",
+                                  (vma->vm_end - vma->vm_start), memlen);
+                       ret = -EINVAL;
+                       goto done;
+               }
+               if (vma->vm_flags & VM_WRITE) {
+                       ret = -EPERM;
+                       goto done;
+               }
+               vma->vm_flags &= ~VM_MAYWRITE;
+               addr = vma->vm_start;
+               for (i = 0 ; i < uctxt->egrbufs.numbufs; i++) {
+                       ret = remap_pfn_range(
+                               vma, addr,
+                               uctxt->egrbufs.buffers[i].phys >> PAGE_SHIFT,
+                               uctxt->egrbufs.buffers[i].len,
+                               vma->vm_page_prot);
+                       if (ret < 0)
+                               goto done;
+                       addr += uctxt->egrbufs.buffers[i].len;
+               }
+               ret = 0;
+               goto done;
+       }
+       case UREGS:
+               /*
+                * Map only the page that contains this context's user
+                * registers.
+                */
+               memaddr = (unsigned long)
+                       (dd->physaddr + RXE_PER_CONTEXT_USER)
+                       + (uctxt->ctxt * RXE_PER_CONTEXT_SIZE);
+               /*
+                * TidFlow table is on the same page as the rest of the
+                * user registers.
+                */
+               memlen = PAGE_SIZE;
+               flags |= VM_DONTCOPY | VM_DONTEXPAND;
+               vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+               mapio = 1;
+               break;
+       case EVENTS:
+               /*
+                * Use the page where this context's flags are. User level
+                * knows where its own bitmap is within the page.
+                */
+               memaddr = (unsigned long)(dd->events +
+                                         ((uctxt->ctxt - dd->first_user_ctxt) *
+                                          HFI1_MAX_SHARED_CTXTS)) & PAGE_MASK;
+               memlen = PAGE_SIZE;
+               /*
+                * v3.7 removes VM_RESERVED but the effect is kept by
+                * using VM_IO.
+                */
+               flags |= VM_IO | VM_DONTEXPAND;
+               vmf = 1;
+               break;
+       case STATUS:
+               memaddr = kvirt_to_phys((void *)dd->status);
+               memlen = PAGE_SIZE;
+               flags |= VM_IO | VM_DONTEXPAND;
+               break;
+       case RTAIL:
+               if (!HFI1_CAP_IS_USET(DMA_RTAIL)) {
+                       /*
+                        * If the memory allocation failed, the context alloc
+                        * also would have failed, so we would never get here
+                        */
+                       ret = -EINVAL;
+                       goto done;
+               }
+               if (flags & VM_WRITE) {
+                       ret = -EPERM;
+                       goto done;
+               }
+               memaddr = uctxt->rcvhdrqtailaddr_phys;
+               memlen = PAGE_SIZE;
+               flags &= ~VM_MAYWRITE;
+               break;
+       case SUBCTXT_UREGS:
+               memaddr = (u64)uctxt->subctxt_uregbase;
+               memlen = PAGE_SIZE;
+               flags |= VM_IO | VM_DONTEXPAND;
+               vmf = 1;
+               break;
+       case SUBCTXT_RCV_HDRQ:
+               memaddr = (u64)uctxt->subctxt_rcvhdr_base;
+               memlen = uctxt->rcvhdrq_size * uctxt->subctxt_cnt;
+               flags |= VM_IO | VM_DONTEXPAND;
+               vmf = 1;
+               break;
+       case SUBCTXT_EGRBUF:
+               memaddr = (u64)uctxt->subctxt_rcvegrbuf;
+               memlen = uctxt->egrbufs.size * uctxt->subctxt_cnt;
+               flags |= VM_IO | VM_DONTEXPAND;
+               flags &= ~VM_MAYWRITE;
+               vmf = 1;
+               break;
+       case SDMA_COMP: {
+               struct hfi1_user_sdma_comp_q *cq = fd->cq;
+
+               if (!cq) {
+                       ret = -EFAULT;
+                       goto done;
+               }
+               memaddr = (u64)cq->comps;
+               memlen = PAGE_ALIGN(sizeof(*cq->comps) * cq->nentries);
+               flags |= VM_IO | VM_DONTEXPAND;
+               vmf = 1;
+               break;
+       }
+       default:
+               ret = -EINVAL;
+               break;
+       }
+
+       if ((vma->vm_end - vma->vm_start) != memlen) {
+               hfi1_cdbg(PROC, "%u:%u Memory size mismatch %lu:%lu",
+                         uctxt->ctxt, fd->subctxt,
+                         (vma->vm_end - vma->vm_start), memlen);
+               ret = -EINVAL;
+               goto done;
+       }
+
+       vma->vm_flags = flags;
+       hfi1_cdbg(PROC,
+                 "%u:%u type:%u io/vf:%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx\n",
+                   ctxt, subctxt, type, mapio, vmf, memaddr, memlen,
+                   vma->vm_end - vma->vm_start, vma->vm_flags);
+       pfn = (unsigned long)(memaddr >> PAGE_SHIFT);
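+       /*
+        * vmalloc'ed shared buffers (vmf) are mapped on demand through
+        * vma_fault(); chip/IO space (mapio) is remapped directly; anything
+        * else is remapped by physical pfn.
+        */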
+       if (vmf) {
+               vma->vm_pgoff = pfn;
+               vma->vm_ops = &vm_ops;
+               ret = 0;
+       } else if (mapio) {
+               ret = io_remap_pfn_range(vma, vma->vm_start, pfn, memlen,
+                                        vma->vm_page_prot);
+       } else {
+               ret = remap_pfn_range(vma, vma->vm_start, pfn, memlen,
+                                     vma->vm_page_prot);
+       }
+done:
+       return ret;
+}
+
+/*
+ * Local (non-chip) user memory is not mapped right away but as it is
+ * accessed by the user-level code.
+ */
+static int vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       struct page *page;
+
+       page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT));
+       if (!page)
+               return VM_FAULT_SIGBUS;
+
+       get_page(page);
+       vmf->page = page;
+
+       return 0;
+}
+
+static unsigned int hfi1_poll(struct file *fp, struct poll_table_struct *pt)
+{
+       struct hfi1_ctxtdata *uctxt;
+       unsigned pollflag;
+
+       uctxt = ((struct hfi1_filedata *)fp->private_data)->uctxt;
+       if (!uctxt)
+               pollflag = POLLERR;
+       else if (uctxt->poll_type == HFI1_POLL_TYPE_URGENT)
+               pollflag = poll_urgent(fp, pt);
+       else  if (uctxt->poll_type == HFI1_POLL_TYPE_ANYRCV)
+               pollflag = poll_next(fp, pt);
+       else /* invalid */
+               pollflag = POLLERR;
+
+       return pollflag;
+}
+
+static int hfi1_file_close(struct inode *inode, struct file *fp)
+{
+       struct hfi1_filedata *fdata = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fdata->uctxt;
+       struct hfi1_devdata *dd = container_of(inode->i_cdev,
+                                              struct hfi1_devdata,
+                                              user_cdev);
+       unsigned long flags, *ev;
+
+       fp->private_data = NULL;
+
+       if (!uctxt)
+               goto done;
+
+       hfi1_cdbg(PROC, "freeing ctxt %u:%u", uctxt->ctxt, fdata->subctxt);
+       mutex_lock(&hfi1_mutex);
+
+       flush_wc();
+       /* drain user sdma queue */
+       hfi1_user_sdma_free_queues(fdata);
+
+       /* release the cpu */
+       hfi1_put_proc_affinity(dd, fdata->rec_cpu_num);
+
+       /*
+        * Clear any left over, unhandled events so the next process that
+        * gets this context doesn't get confused.
+        */
+       ev = dd->events + ((uctxt->ctxt - dd->first_user_ctxt) *
+                          HFI1_MAX_SHARED_CTXTS) + fdata->subctxt;
+       *ev = 0;
+
+       if (--uctxt->cnt) {
+               uctxt->active_slaves &= ~(1 << fdata->subctxt);
+               uctxt->subpid[fdata->subctxt] = 0;
+               mutex_unlock(&hfi1_mutex);
+               goto done;
+       }
+
+       spin_lock_irqsave(&dd->uctxt_lock, flags);
+       /*
+        * Disable receive context and interrupt available, reset all
+        * RcvCtxtCtrl bits to default values.
+        */
+       hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
+                    HFI1_RCVCTRL_TIDFLOW_DIS |
+                    HFI1_RCVCTRL_INTRAVAIL_DIS |
+                    HFI1_RCVCTRL_TAILUPD_DIS |
+                    HFI1_RCVCTRL_ONE_PKT_EGR_DIS |
+                    HFI1_RCVCTRL_NO_RHQ_DROP_DIS |
+                    HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt->ctxt);
+       /* Clear the context's J_KEY */
+       hfi1_clear_ctxt_jkey(dd, uctxt->ctxt);
+       /*
+        * Reset context integrity checks to default.
+        * (writes to CSRs probably belong in chip.c)
+        */
+       write_kctxt_csr(dd, uctxt->sc->hw_context, SEND_CTXT_CHECK_ENABLE,
+                       hfi1_pkt_default_send_ctxt_mask(dd, uctxt->sc->type));
+       sc_disable(uctxt->sc);
+       uctxt->pid = 0;
+       spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+
+       dd->rcd[uctxt->ctxt] = NULL;
+
+       hfi1_user_exp_rcv_free(fdata);
+       hfi1_clear_ctxt_pkey(dd, uctxt->ctxt);
+
+       uctxt->rcvwait_to = 0;
+       uctxt->piowait_to = 0;
+       uctxt->rcvnowait = 0;
+       uctxt->pionowait = 0;
+       uctxt->event_flags = 0;
+
+       hfi1_stats.sps_ctxts--;
+       if (++dd->freectxts == dd->num_user_contexts)
+               aspm_enable_all(dd);
+       mutex_unlock(&hfi1_mutex);
+       hfi1_free_ctxtdata(dd, uctxt);
+done:
+       kobject_put(&dd->kobj);
+       kfree(fdata);
+       return 0;
+}
+
+/*
+ * Convert kernel *virtual* addresses to physical addresses.
+ * This is used for vmalloc'ed addresses.
+ */
+static u64 kvirt_to_phys(void *addr)
+{
+       struct page *page;
+       u64 paddr = 0;
+
+       page = vmalloc_to_page(addr);
+       if (page)
+               paddr = page_to_pfn(page) << PAGE_SHIFT;
+
+       return paddr;
+}
+
+static int assign_ctxt(struct file *fp, struct hfi1_user_info *uinfo)
+{
+       int i_minor, ret = 0;
+       unsigned int swmajor, swminor;
+
+       swmajor = uinfo->userversion >> 16;
+       if (swmajor != HFI1_USER_SWMAJOR) {
+               ret = -ENODEV;
+               goto done;
+       }
+
+       swminor = uinfo->userversion & 0xffff;
+
+       mutex_lock(&hfi1_mutex);
+       /* First, let's check if we need to set up a shared context */
+       if (uinfo->subctxt_cnt) {
+               struct hfi1_filedata *fd = fp->private_data;
+
+               ret = find_shared_ctxt(fp, uinfo);
+               if (ret < 0)
+                       goto done_unlock;
+               if (ret)
+                       fd->rec_cpu_num = hfi1_get_proc_affinity(
+                               fd->uctxt->dd, fd->uctxt->numa_id);
+       }
+
+       /*
+        * We execute the following block if we couldn't find a
+        * shared context or if context sharing is not required.
+        */
+       if (!ret) {
+               i_minor = iminor(file_inode(fp)) - HFI1_USER_MINOR_BASE;
+               ret = get_user_context(fp, uinfo, i_minor);
+       }
+done_unlock:
+       mutex_unlock(&hfi1_mutex);
+done:
+       return ret;
+}
+
+static int get_user_context(struct file *fp, struct hfi1_user_info *uinfo,
+                           int devno)
+{
+       struct hfi1_devdata *dd = NULL;
+       int devmax, npresent, nup;
+
+       devmax = hfi1_count_units(&npresent, &nup);
+       if (!npresent)
+               return -ENXIO;
+
+       if (!nup)
+               return -ENETDOWN;
+
+       dd = hfi1_lookup(devno);
+       if (!dd)
+               return -ENODEV;
+       else if (!dd->freectxts)
+               return -EBUSY;
+
+       return allocate_ctxt(fp, dd, uinfo);
+}
+
+static int find_shared_ctxt(struct file *fp,
+                           const struct hfi1_user_info *uinfo)
+{
+       int devmax, ndev, i;
+       int ret = 0;
+       struct hfi1_filedata *fd = fp->private_data;
+
+       devmax = hfi1_count_units(NULL, NULL);
+
+       for (ndev = 0; ndev < devmax; ndev++) {
+               struct hfi1_devdata *dd = hfi1_lookup(ndev);
+
+               if (!(dd && (dd->flags & HFI1_PRESENT) && dd->kregbase))
+                       continue;
+               for (i = dd->first_user_ctxt; i < dd->num_rcv_contexts; i++) {
+                       struct hfi1_ctxtdata *uctxt = dd->rcd[i];
+
+                       /* Skip ctxts which are not yet open */
+                       if (!uctxt || !uctxt->cnt)
+                               continue;
+                       /* Skip ctxt if it doesn't match the requested one */
+                       if (memcmp(uctxt->uuid, uinfo->uuid,
+                                  sizeof(uctxt->uuid)) ||
+                           uctxt->jkey != generate_jkey(current_uid()) ||
+                           uctxt->subctxt_id != uinfo->subctxt_id ||
+                           uctxt->subctxt_cnt != uinfo->subctxt_cnt)
+                               continue;
+
+                       /* Verify the sharing process matches the master */
+                       if (uctxt->userversion != uinfo->userversion ||
+                           uctxt->cnt >= uctxt->subctxt_cnt) {
+                               ret = -EINVAL;
+                               goto done;
+                       }
+                       fd->uctxt = uctxt;
+                       fd->subctxt  = uctxt->cnt++;
+                       uctxt->subpid[fd->subctxt] = current->pid;
+                       uctxt->active_slaves |= 1 << fd->subctxt;
+                       ret = 1;
+                       goto done;
+               }
+       }
+
+done:
+       return ret;
+}
+
+static int allocate_ctxt(struct file *fp, struct hfi1_devdata *dd,
+                        struct hfi1_user_info *uinfo)
+{
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt;
+       unsigned ctxt;
+       int ret, numa;
+
+       if (dd->flags & HFI1_FROZEN) {
+               /*
+                * Pick an error that is unique from all other errors
+                * that are returned so the user process knows that
+                * it tried to allocate while the SPC was frozen. It
+                * should be able to retry with success in a short
+                * while.
+                */
+               return -EIO;
+       }
+
+       for (ctxt = dd->first_user_ctxt; ctxt < dd->num_rcv_contexts; ctxt++)
+               if (!dd->rcd[ctxt])
+                       break;
+
+       if (ctxt == dd->num_rcv_contexts)
+               return -EBUSY;
+
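+       /*
+        * Pick a CPU for receive processing; if none is assigned, fall
+        * back to the current NUMA node for the context allocation.
+        */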
+       fd->rec_cpu_num = hfi1_get_proc_affinity(dd, -1);
+       if (fd->rec_cpu_num != -1)
+               numa = cpu_to_node(fd->rec_cpu_num);
+       else
+               numa = numa_node_id();
+       uctxt = hfi1_create_ctxtdata(dd->pport, ctxt, numa);
+       if (!uctxt) {
+               dd_dev_err(dd,
+                          "Unable to allocate ctxtdata memory, failing open\n");
+               return -ENOMEM;
+       }
+       hfi1_cdbg(PROC, "[%u:%u] pid %u assigned to CPU %d (NUMA %u)",
+                 uctxt->ctxt, fd->subctxt, current->pid, fd->rec_cpu_num,
+                 uctxt->numa_id);
+
+       /*
+        * Allocate and enable a PIO send context.
+        */
+       uctxt->sc = sc_alloc(dd, SC_USER, uctxt->rcvhdrqentsize,
+                            uctxt->dd->node);
+       if (!uctxt->sc)
+               return -ENOMEM;
+
+       hfi1_cdbg(PROC, "allocated send context %u(%u)\n", uctxt->sc->sw_index,
+                 uctxt->sc->hw_context);
+       ret = sc_enable(uctxt->sc);
+       if (ret)
+               return ret;
+       /*
+        * Setup shared context resources if the user-level has requested
+        * shared contexts and this is the 'master' process.
+        * This has to be done here so the rest of the sub-contexts find the
+        * proper master.
+        */
+       if (uinfo->subctxt_cnt && !fd->subctxt) {
+               ret = init_subctxts(uctxt, uinfo);
+               /*
+                * On error, we don't need to disable and de-allocate the
+                * send context because it will be done during file close
+                */
+               if (ret)
+                       return ret;
+       }
+       uctxt->userversion = uinfo->userversion;
+       uctxt->pid = current->pid;
+       uctxt->flags = HFI1_CAP_UGET(MASK);
+       init_waitqueue_head(&uctxt->wait);
+       strlcpy(uctxt->comm, current->comm, sizeof(uctxt->comm));
+       memcpy(uctxt->uuid, uinfo->uuid, sizeof(uctxt->uuid));
+       uctxt->jkey = generate_jkey(current_uid());
+       INIT_LIST_HEAD(&uctxt->sdma_queues);
+       spin_lock_init(&uctxt->sdma_qlock);
+       hfi1_stats.sps_ctxts++;
+       /*
+        * Disable ASPM when there are open user/PSM contexts to avoid
+        * issues with ASPM L1 exit latency
+        */
+       if (dd->freectxts-- == dd->num_user_contexts)
+               aspm_disable_all(dd);
+       fd->uctxt = uctxt;
+
+       return 0;
+}
+
+static int init_subctxts(struct hfi1_ctxtdata *uctxt,
+                        const struct hfi1_user_info *uinfo)
+{
+       unsigned num_subctxts;
+
+       num_subctxts = uinfo->subctxt_cnt;
+       if (num_subctxts > HFI1_MAX_SHARED_CTXTS)
+               return -EINVAL;
+
+       uctxt->subctxt_cnt = uinfo->subctxt_cnt;
+       uctxt->subctxt_id = uinfo->subctxt_id;
+       uctxt->active_slaves = 1;
+       uctxt->redirect_seq_cnt = 1;
+       set_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags);
+
+       return 0;
+}
+
+static int setup_subctxt(struct hfi1_ctxtdata *uctxt)
+{
+       int ret = 0;
+       unsigned num_subctxts = uctxt->subctxt_cnt;
+
+       uctxt->subctxt_uregbase = vmalloc_user(PAGE_SIZE);
+       if (!uctxt->subctxt_uregbase) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+       /* We can take the size of the RcvHdr Queue from the master */
+       uctxt->subctxt_rcvhdr_base = vmalloc_user(uctxt->rcvhdrq_size *
+                                                 num_subctxts);
+       if (!uctxt->subctxt_rcvhdr_base) {
+               ret = -ENOMEM;
+               goto bail_ureg;
+       }
+
+       uctxt->subctxt_rcvegrbuf = vmalloc_user(uctxt->egrbufs.size *
+                                               num_subctxts);
+       if (!uctxt->subctxt_rcvegrbuf) {
+               ret = -ENOMEM;
+               goto bail_rhdr;
+       }
+       goto bail;
+bail_rhdr:
+       vfree(uctxt->subctxt_rcvhdr_base);
+bail_ureg:
+       vfree(uctxt->subctxt_uregbase);
+       uctxt->subctxt_uregbase = NULL;
+bail:
+       return ret;
+}
+
+static int user_init(struct file *fp)
+{
+       unsigned int rcvctrl_ops = 0;
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+
+       /* make sure that the context has already been setup */
+       if (!test_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags))
+               return -EFAULT;
+
+       /* initialize poll variables... */
+       uctxt->urgent = 0;
+       uctxt->urgent_poll = 0;
+
+       /*
+        * Now enable the ctxt for receive.
+        * For chips that are set to DMA the tail register to memory
+        * when it changes (and when the update bit transitions from
+        * 0 to 1), we turn it off and then back on.
+        * This will (very briefly) affect any other open ctxts, but the
+        * duration is very short, and therefore isn't an issue.  We
+        * explicitly set the in-memory tail copy to 0 beforehand, so we
+        * don't have to wait to be sure the DMA update has happened
+        * (chip resets head/tail to 0 on transition to enable).
+        */
+       if (uctxt->rcvhdrtail_kvaddr)
+               clear_rcvhdrtail(uctxt);
+
+       /* Setup J_KEY before enabling the context */
+       hfi1_set_ctxt_jkey(uctxt->dd, uctxt->ctxt, uctxt->jkey);
+
+       rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB;
+       if (HFI1_CAP_KGET_MASK(uctxt->flags, HDRSUPP))
+               rcvctrl_ops |= HFI1_RCVCTRL_TIDFLOW_ENB;
+       /*
+        * Ignore the bit in the flags for now until proper
+        * support for multiple packet per rcv array entry is
+        * added.
+        */
+       if (!HFI1_CAP_KGET_MASK(uctxt->flags, MULTI_PKT_EGR))
+               rcvctrl_ops |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
+       if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_EGR_FULL))
+               rcvctrl_ops |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
+       if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_RHQ_FULL))
+               rcvctrl_ops |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
+       /*
+        * The RcvCtxtCtrl.TailUpd bit has to be explicitly written.
+        * We can't rely on the correct value to be set from prior
+        * uses of the chip or ctxt. Therefore, add the rcvctrl op
+        * for both cases.
+        */
+       if (HFI1_CAP_KGET_MASK(uctxt->flags, DMA_RTAIL))
+               rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB;
+       else
+               rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_DIS;
+       hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt->ctxt);
+
+       /* Notify any waiting slaves */
+       if (uctxt->subctxt_cnt) {
+               clear_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags);
+               wake_up(&uctxt->wait);
+       }
+
+       return 0;
+}
+
+static int get_ctxt_info(struct file *fp, void __user *ubase, __u32 len)
+{
+       struct hfi1_ctxt_info cinfo;
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       int ret = 0;
+
+       memset(&cinfo, 0, sizeof(cinfo));
+       ret = hfi1_get_base_kinfo(uctxt, &cinfo);
+       if (ret < 0)
+               goto done;
+       cinfo.num_active = hfi1_count_active_units();
+       cinfo.unit = uctxt->dd->unit;
+       cinfo.ctxt = uctxt->ctxt;
+       cinfo.subctxt = fd->subctxt;
+       cinfo.rcvtids = roundup(uctxt->egrbufs.alloced,
+                               uctxt->dd->rcv_entries.group_size) +
+               uctxt->expected_count;
+       cinfo.credits = uctxt->sc->credits;
+       cinfo.numa_node = uctxt->numa_id;
+       cinfo.rec_cpu = fd->rec_cpu_num;
+       cinfo.send_ctxt = uctxt->sc->hw_context;
+
+       cinfo.egrtids = uctxt->egrbufs.alloced;
+       cinfo.rcvhdrq_cnt = uctxt->rcvhdrq_cnt;
+       cinfo.rcvhdrq_entsize = uctxt->rcvhdrqentsize << 2;
+       cinfo.sdma_ring_size = fd->cq->nentries;
+       cinfo.rcvegr_size = uctxt->egrbufs.rcvtid_size;
+
+       trace_hfi1_ctxt_info(uctxt->dd, uctxt->ctxt, fd->subctxt, cinfo);
+       if (copy_to_user(ubase, &cinfo, sizeof(cinfo)))
+               ret = -EFAULT;
+done:
+       return ret;
+}
+
+static int setup_ctxt(struct file *fp)
+{
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       struct hfi1_devdata *dd = uctxt->dd;
+       int ret = 0;
+
+       /*
+        * Context should be set up only once, including allocation and
+        * programming of eager buffers. This is done if context sharing
+        * is not requested, or by the master process when it is.
+        */
+       if (!uctxt->subctxt_cnt || !fd->subctxt) {
+               ret = hfi1_init_ctxt(uctxt->sc);
+               if (ret)
+                       goto done;
+
+               /* Now allocate the RcvHdr queue and eager buffers. */
+               ret = hfi1_create_rcvhdrq(dd, uctxt);
+               if (ret)
+                       goto done;
+               ret = hfi1_setup_eagerbufs(uctxt);
+               if (ret)
+                       goto done;
+               if (uctxt->subctxt_cnt && !fd->subctxt) {
+                       ret = setup_subctxt(uctxt);
+                       if (ret)
+                               goto done;
+               }
+       } else {
+               ret = wait_event_interruptible(uctxt->wait, !test_bit(
+                                              HFI1_CTXT_MASTER_UNINIT,
+                                              &uctxt->event_flags));
+               if (ret)
+                       goto done;
+       }
+
+       ret = hfi1_user_sdma_alloc_queues(uctxt, fp);
+       if (ret)
+               goto done;
+       /*
+        * Expected receive has to be setup for all processes (including
+        * shared contexts). However, it has to be done after the master
+        * context has been fully configured as it depends on the
+        * eager/expected split of the RcvArray entries.
+        * Setting it up here ensures that the subcontexts will be waiting
+        * (due to the above wait_event_interruptible()) until the master
+        * is set up.
+        */
+       ret = hfi1_user_exp_rcv_init(fp);
+       if (ret)
+               goto done;
+
+       set_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags);
+done:
+       return ret;
+}
+
+static int get_base_info(struct file *fp, void __user *ubase, __u32 len)
+{
+       struct hfi1_base_info binfo;
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       struct hfi1_devdata *dd = uctxt->dd;
+       ssize_t sz;
+       unsigned offset;
+       int ret = 0;
+
+       trace_hfi1_uctxtdata(uctxt->dd, uctxt);
+
+       memset(&binfo, 0, sizeof(binfo));
+       binfo.hw_version = dd->revision;
+       binfo.sw_version = HFI1_KERN_SWVERSION;
+       binfo.bthqp = kdeth_qp;
+       binfo.jkey = uctxt->jkey;
+       /*
+        * If more than 64 contexts are enabled the allocated credit
+        * return will span two or three contiguous pages. Since we only
+        * map the page containing the context's credit return address,
+        * we need to calculate the offset in the proper page.
+        */
+       offset = ((u64)uctxt->sc->hw_free -
+                 (u64)dd->cr_base[uctxt->numa_id].va) % PAGE_SIZE;
+       binfo.sc_credits_addr = HFI1_MMAP_TOKEN(PIO_CRED, uctxt->ctxt,
+                                               fd->subctxt, offset);
+       binfo.pio_bufbase = HFI1_MMAP_TOKEN(PIO_BUFS, uctxt->ctxt,
+                                           fd->subctxt,
+                                           uctxt->sc->base_addr);
+       binfo.pio_bufbase_sop = HFI1_MMAP_TOKEN(PIO_BUFS_SOP,
+                                               uctxt->ctxt,
+                                               fd->subctxt,
+                                               uctxt->sc->base_addr);
+       binfo.rcvhdr_bufbase = HFI1_MMAP_TOKEN(RCV_HDRQ, uctxt->ctxt,
+                                              fd->subctxt,
+                                              uctxt->rcvhdrq);
+       binfo.rcvegr_bufbase = HFI1_MMAP_TOKEN(RCV_EGRBUF, uctxt->ctxt,
+                                              fd->subctxt,
+                                              uctxt->egrbufs.rcvtids[0].phys);
+       binfo.sdma_comp_bufbase = HFI1_MMAP_TOKEN(SDMA_COMP, uctxt->ctxt,
+                                                fd->subctxt, 0);
+       /*
+        * user regs are at
+        * (RXE_PER_CONTEXT_USER + (ctxt * RXE_PER_CONTEXT_SIZE))
+        */
+       binfo.user_regbase = HFI1_MMAP_TOKEN(UREGS, uctxt->ctxt,
+                                           fd->subctxt, 0);
+       offset = offset_in_page((((uctxt->ctxt - dd->first_user_ctxt) *
+                   HFI1_MAX_SHARED_CTXTS) + fd->subctxt) *
+                 sizeof(*dd->events));
+       binfo.events_bufbase = HFI1_MMAP_TOKEN(EVENTS, uctxt->ctxt,
+                                             fd->subctxt,
+                                             offset);
+       binfo.status_bufbase = HFI1_MMAP_TOKEN(STATUS, uctxt->ctxt,
+                                             fd->subctxt,
+                                             dd->status);
+       if (HFI1_CAP_IS_USET(DMA_RTAIL))
+               binfo.rcvhdrtail_base = HFI1_MMAP_TOKEN(RTAIL, uctxt->ctxt,
+                                                      fd->subctxt, 0);
+       if (uctxt->subctxt_cnt) {
+               binfo.subctxt_uregbase = HFI1_MMAP_TOKEN(SUBCTXT_UREGS,
+                                                       uctxt->ctxt,
+                                                       fd->subctxt, 0);
+               binfo.subctxt_rcvhdrbuf = HFI1_MMAP_TOKEN(SUBCTXT_RCV_HDRQ,
+                                                        uctxt->ctxt,
+                                                        fd->subctxt, 0);
+               binfo.subctxt_rcvegrbuf = HFI1_MMAP_TOKEN(SUBCTXT_EGRBUF,
+                                                        uctxt->ctxt,
+                                                        fd->subctxt, 0);
+       }
+       sz = (len < sizeof(binfo)) ? len : sizeof(binfo);
+       if (copy_to_user(ubase, &binfo, sz))
+               ret = -EFAULT;
+       return ret;
+}
+
+static unsigned int poll_urgent(struct file *fp,
+                               struct poll_table_struct *pt)
+{
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       struct hfi1_devdata *dd = uctxt->dd;
+       unsigned pollflag;
+
+       poll_wait(fp, &uctxt->wait, pt);
+
+       spin_lock_irq(&dd->uctxt_lock);
+       if (uctxt->urgent != uctxt->urgent_poll) {
+               pollflag = POLLIN | POLLRDNORM;
+               uctxt->urgent_poll = uctxt->urgent;
+       } else {
+               pollflag = 0;
+               set_bit(HFI1_CTXT_WAITING_URG, &uctxt->event_flags);
+       }
+       spin_unlock_irq(&dd->uctxt_lock);
+
+       return pollflag;
+}
+
+static unsigned int poll_next(struct file *fp,
+                             struct poll_table_struct *pt)
+{
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       struct hfi1_devdata *dd = uctxt->dd;
+       unsigned pollflag;
+
+       poll_wait(fp, &uctxt->wait, pt);
+
+       spin_lock_irq(&dd->uctxt_lock);
+       if (hdrqempty(uctxt)) {
+               set_bit(HFI1_CTXT_WAITING_RCV, &uctxt->event_flags);
+               hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_ENB, uctxt->ctxt);
+               pollflag = 0;
+       } else {
+               pollflag = POLLIN | POLLRDNORM;
+       }
+       spin_unlock_irq(&dd->uctxt_lock);
+
+       return pollflag;
+}
+
+/*
+ * Find all user contexts in use, and set the specified bit in their
+ * event mask.
+ * See also find_ctxt() for a similar use, that is specific to send buffers.
+ */
+int hfi1_set_uevent_bits(struct hfi1_pportdata *ppd, const int evtbit)
+{
+       struct hfi1_ctxtdata *uctxt;
+       struct hfi1_devdata *dd = ppd->dd;
+       unsigned ctxt;
+       int ret = 0;
+       unsigned long flags;
+
+       if (!dd->events) {
+               ret = -EINVAL;
+               goto done;
+       }
+
+       spin_lock_irqsave(&dd->uctxt_lock, flags);
+       for (ctxt = dd->first_user_ctxt; ctxt < dd->num_rcv_contexts;
+            ctxt++) {
+               uctxt = dd->rcd[ctxt];
+               if (uctxt) {
+                       unsigned long *evs = dd->events +
+                               (uctxt->ctxt - dd->first_user_ctxt) *
+                               HFI1_MAX_SHARED_CTXTS;
+                       int i;
+                       /*
+                        * subctxt_cnt is 0 if the context is not shared,
+                        * so set the base bit first, then any remaining
+                        * subctxt bits.
+                        */
+                       set_bit(evtbit, evs);
+                       for (i = 1; i < uctxt->subctxt_cnt; i++)
+                               set_bit(evtbit, evs + i);
+               }
+       }
+       spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+done:
+       return ret;
+}
+
+/**
+ * manage_rcvq - manage a context's receive queue
+ * @uctxt: the context
+ * @subctxt: the sub-context
+ * @start_stop: action to carry out
+ *
+ * start_stop == 0 disables receive on the context, for use in queue
+ * overflow conditions.  start_stop==1 re-enables, to be used to
+ * re-init the software copy of the head register
+ */
+static int manage_rcvq(struct hfi1_ctxtdata *uctxt, unsigned subctxt,
+                      int start_stop)
+{
+       struct hfi1_devdata *dd = uctxt->dd;
+       unsigned int rcvctrl_op;
+
+       if (subctxt)
+               goto bail;
+       /* atomically set or clear the context's receive enable */
+       if (start_stop) {
+               /*
+                * On enable, force in-memory copy of the tail register to
+                * 0, so that protocol code doesn't have to worry about
+                * whether or not the chip has yet updated the in-memory
+                * copy or not on return from the system call. The chip
+                * always resets its tail register back to 0 on a
+                * transition from disabled to enabled.
+                */
+               if (uctxt->rcvhdrtail_kvaddr)
+                       clear_rcvhdrtail(uctxt);
+               rcvctrl_op = HFI1_RCVCTRL_CTXT_ENB;
+       } else {
+               rcvctrl_op = HFI1_RCVCTRL_CTXT_DIS;
+       }
+       hfi1_rcvctrl(dd, rcvctrl_op, uctxt->ctxt);
+       /* always; new head should be equal to new tail; see above */
+bail:
+       return 0;
+}
+
+/*
+ * clear the event notifier events for this context.
+ * The user process then performs whatever actions are appropriate to the
+ * bits having been set, if desired, and checks again in the future.
+ */
+static int user_event_ack(struct hfi1_ctxtdata *uctxt, int subctxt,
+                         unsigned long events)
+{
+       int i;
+       struct hfi1_devdata *dd = uctxt->dd;
+       unsigned long *evs;
+
+       if (!dd->events)
+               return 0;
+
+       evs = dd->events + ((uctxt->ctxt - dd->first_user_ctxt) *
+                           HFI1_MAX_SHARED_CTXTS) + subctxt;
+
+       for (i = 0; i <= _HFI1_MAX_EVENT_BIT; i++) {
+               if (!test_bit(i, &events))
+                       continue;
+               clear_bit(i, evs);
+       }
+       return 0;
+}
+
+static int set_ctxt_pkey(struct hfi1_ctxtdata *uctxt, unsigned subctxt,
+                        u16 pkey)
+{
+       int ret = -ENOENT, i, intable = 0;
+       struct hfi1_pportdata *ppd = uctxt->ppd;
+       struct hfi1_devdata *dd = uctxt->dd;
+
+       if (pkey == LIM_MGMT_P_KEY || pkey == FULL_MGMT_P_KEY) {
+               ret = -EINVAL;
+               goto done;
+       }
+
+       for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++)
+               if (pkey == ppd->pkeys[i]) {
+                       intable = 1;
+                       break;
+               }
+
+       if (intable)
+               ret = hfi1_set_ctxt_pkey(dd, uctxt->ctxt, pkey);
+done:
+       return ret;
+}
+
+static void user_remove(struct hfi1_devdata *dd)
+{
+       hfi1_cdev_cleanup(&dd->user_cdev, &dd->user_device);
+}
+
+static int user_add(struct hfi1_devdata *dd)
+{
+       char name[10];
+       int ret;
+
+       snprintf(name, sizeof(name), "%s_%d", class_name(), dd->unit);
+       ret = hfi1_cdev_init(dd->unit, name, &hfi1_file_ops,
+                            &dd->user_cdev, &dd->user_device,
+                            true, &dd->kobj);
+       if (ret)
+               user_remove(dd);
+
+       return ret;
+}
+
+/*
+ * Create per-unit files in /dev
+ */
+int hfi1_device_create(struct hfi1_devdata *dd)
+{
+       return user_add(dd);
+}
+
+/*
+ * Remove per-unit files in /dev
+ * void, core kernel returns no errors for this stuff
+ */
+void hfi1_device_remove(struct hfi1_devdata *dd)
+{
+       user_remove(dd);
+}
diff --git a/drivers/infiniband/hw/hfi1/firmware.c b/drivers/infiniband/hw/hfi1/firmware.c
new file mode 100644 (file)
index 0000000..ed680fd
--- /dev/null
@@ -0,0 +1,2056 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/firmware.h>
+#include <linux/mutex.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/crc32.h>
+
+#include "hfi.h"
+#include "trace.h"
+
+/*
+ * Make it easy to toggle the firmware file names, and whether they get
+ * loaded, by editing the following. This may be something we do while in
+ * development, but not necessarily something a user would ever need to use.
+ */
+#define DEFAULT_FW_8051_NAME_FPGA "hfi_dc8051.bin"
+#define DEFAULT_FW_8051_NAME_ASIC "hfi1_dc8051.fw"
+#define DEFAULT_FW_FABRIC_NAME "hfi1_fabric.fw"
+#define DEFAULT_FW_SBUS_NAME "hfi1_sbus.fw"
+#define DEFAULT_FW_PCIE_NAME "hfi1_pcie.fw"
+#define DEFAULT_PLATFORM_CONFIG_NAME "hfi1_platform.dat"
+#define ALT_FW_8051_NAME_ASIC "hfi1_dc8051_d.fw"
+#define ALT_FW_FABRIC_NAME "hfi1_fabric_d.fw"
+#define ALT_FW_SBUS_NAME "hfi1_sbus_d.fw"
+#define ALT_FW_PCIE_NAME "hfi1_pcie_d.fw"
+
+static uint fw_8051_load = 1;
+static uint fw_fabric_serdes_load = 1;
+static uint fw_pcie_serdes_load = 1;
+static uint fw_sbus_load = 1;
+
+/*
+ * Access required in platform.c
+ * Maintains state of whether the platform config was fetched via the
+ * fallback option
+ */
+uint platform_config_load;
+
+/* Firmware file names get set in hfi1_firmware_init() based on the above */
+static char *fw_8051_name;
+static char *fw_fabric_serdes_name;
+static char *fw_sbus_name;
+static char *fw_pcie_serdes_name;
+static char *platform_config_name;
+
+#define SBUS_MAX_POLL_COUNT 100
+#define SBUS_COUNTER(reg, name) \
+       (((reg) >> ASIC_STS_SBUS_COUNTERS_##name##_CNT_SHIFT) & \
+        ASIC_STS_SBUS_COUNTERS_##name##_CNT_MASK)
+
+/*
+ * Firmware security header.
+ */
+struct css_header {
+       u32 module_type;
+       u32 header_len;
+       u32 header_version;
+       u32 module_id;
+       u32 module_vendor;
+       u32 date;               /* BCD yyyymmdd */
+       u32 size;               /* in DWORDs */
+       u32 key_size;           /* in DWORDs */
+       u32 modulus_size;       /* in DWORDs */
+       u32 exponent_size;      /* in DWORDs */
+       u32 reserved[22];
+};
+
+/* expected field values */
+#define CSS_MODULE_TYPE           0x00000006
+#define CSS_HEADER_LEN    0x000000a1
+#define CSS_HEADER_VERSION 0x00010000
+#define CSS_MODULE_VENDOR  0x00008086
+
+#define KEY_SIZE      256
+#define MU_SIZE                8
+#define EXPONENT_SIZE  4
+
+/* the file itself */
+struct firmware_file {
+       struct css_header css_header;
+       u8 modulus[KEY_SIZE];
+       u8 exponent[EXPONENT_SIZE];
+       u8 signature[KEY_SIZE];
+       u8 firmware[];
+};
+
+struct augmented_firmware_file {
+       struct css_header css_header;
+       u8 modulus[KEY_SIZE];
+       u8 exponent[EXPONENT_SIZE];
+       u8 signature[KEY_SIZE];
+       u8 r2[KEY_SIZE];
+       u8 mu[MU_SIZE];
+       u8 firmware[];
+};
+
+/* augmented file size difference */
+#define AUGMENT_SIZE (sizeof(struct augmented_firmware_file) - \
+                                               sizeof(struct firmware_file))
+
+struct firmware_details {
+       /* Linux core piece */
+       const struct firmware *fw;
+
+       struct css_header *css_header;
+       u8 *firmware_ptr;               /* pointer to binary data */
+       u32 firmware_len;               /* length in bytes */
+       u8 *modulus;                    /* pointer to the modulus */
+       u8 *exponent;                   /* pointer to the exponent */
+       u8 *signature;                  /* pointer to the signature */
+       u8 *r2;                         /* pointer to r2 */
+       u8 *mu;                         /* pointer to mu */
+       struct augmented_firmware_file dummy_header;
+};
+
+/*
+ * The mutex protects fw_state, fw_err, and all of the firmware_details
+ * variables.
+ */
+static DEFINE_MUTEX(fw_mutex);
+enum fw_state {
+       FW_EMPTY,
+       FW_TRY,
+       FW_FINAL,
+       FW_ERR
+};
+
+static enum fw_state fw_state = FW_EMPTY;
+static int fw_err;
+static struct firmware_details fw_8051;
+static struct firmware_details fw_fabric;
+static struct firmware_details fw_pcie;
+static struct firmware_details fw_sbus;
+static const struct firmware *platform_config;
+
+/* flags for turn_off_spicos() */
+#define SPICO_SBUS   0x1
+#define SPICO_FABRIC 0x2
+#define ENABLE_SPICO_SMASK 0x1
+
+/* security block commands */
+#define RSA_CMD_INIT  0x1
+#define RSA_CMD_START 0x2
+
+/* security block status */
+#define RSA_STATUS_IDLE   0x0
+#define RSA_STATUS_ACTIVE 0x1
+#define RSA_STATUS_DONE   0x2
+#define RSA_STATUS_FAILED 0x3
+
+/* RSA engine timeout, in ms */
+#define RSA_ENGINE_TIMEOUT 100 /* ms */
+
+/* hardware mutex timeout, in ms */
+#define HM_TIMEOUT 10 /* ms */
+
+/* 8051 memory access timeout, in us */
+#define DC8051_ACCESS_TIMEOUT 100 /* us */
+
+/* the number of fabric SerDes on the SBus */
+#define NUM_FABRIC_SERDES 4
+
+/* SBus fabric SerDes addresses, one set per HFI */
+static const u8 fabric_serdes_addrs[2][NUM_FABRIC_SERDES] = {
+       { 0x01, 0x02, 0x03, 0x04 },
+       { 0x28, 0x29, 0x2a, 0x2b }
+};
+
+/* SBus PCIe SerDes addresses, one set per HFI */
+static const u8 pcie_serdes_addrs[2][NUM_PCIE_SERDES] = {
+       { 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16,
+         0x18, 0x1a, 0x1c, 0x1e, 0x20, 0x22, 0x24, 0x26 },
+       { 0x2f, 0x31, 0x33, 0x35, 0x37, 0x39, 0x3b, 0x3d,
+         0x3f, 0x41, 0x43, 0x45, 0x47, 0x49, 0x4b, 0x4d }
+};
+
+/* SBus PCIe PCS addresses, one set per HFI */
+const u8 pcie_pcs_addrs[2][NUM_PCIE_SERDES] = {
+       { 0x09, 0x0b, 0x0d, 0x0f, 0x11, 0x13, 0x15, 0x17,
+         0x19, 0x1b, 0x1d, 0x1f, 0x21, 0x23, 0x25, 0x27 },
+       { 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e,
+         0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e }
+};
+
+/* SBus fabric SerDes broadcast addresses, one per HFI */
+static const u8 fabric_serdes_broadcast[2] = { 0xe4, 0xe5 };
+static const u8 all_fabric_serdes_broadcast = 0xe1;
+
+/* SBus PCIe SerDes broadcast addresses, one per HFI */
+const u8 pcie_serdes_broadcast[2] = { 0xe2, 0xe3 };
+static const u8 all_pcie_serdes_broadcast = 0xe0;
+
+/* forwards */
+static void dispose_one_firmware(struct firmware_details *fdet);
+static int load_fabric_serdes_firmware(struct hfi1_devdata *dd,
+                                      struct firmware_details *fdet);
+
+/*
+ * Read a single 64-bit value from 8051 data memory.
+ *
+ * Expects:
+ * o caller to have already set up data read, no auto increment
+ * o caller to turn off read enable when finished
+ *
+ * The address argument is a byte offset.  Bits 0:2 in the address are
+ * ignored - i.e. the hardware will always do aligned 8-byte reads as if
+ * the lower bits are zero.
+ *
+ * Return 0 on success, -ENXIO on a read error (timeout).
+ */
+static int __read_8051_data(struct hfi1_devdata *dd, u32 addr, u64 *result)
+{
+       u64 reg;
+       int count;
+
+       /* start the read at the given address */
+       reg = ((addr & DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK)
+                       << DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT)
+               | DC_DC8051_CFG_RAM_ACCESS_CTRL_READ_ENA_SMASK;
+       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, reg);
+
+       /* wait until ACCESS_COMPLETED is set */
+       count = 0;
+       while ((read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_STATUS)
+                   & DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK)
+                   == 0) {
+               count++;
+               if (count > DC8051_ACCESS_TIMEOUT) {
+                       dd_dev_err(dd, "timeout reading 8051 data\n");
+                       return -ENXIO;
+               }
+               ndelay(10);
+       }
+
+       /* gather the data */
+       *result = read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_RD_DATA);
+
+       return 0;
+}
+
+/*
+ * Read 8051 data starting at addr, for len bytes.  Will read in 8-byte chunks.
+ * Return 0 on success, -errno on error.
+ */
+int read_8051_data(struct hfi1_devdata *dd, u32 addr, u32 len, u64 *result)
+{
+       unsigned long flags;
+       u32 done;
+       int ret = 0;
+
+       spin_lock_irqsave(&dd->dc8051_memlock, flags);
+
+       /* data read set-up, no auto-increment */
+       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, 0);
+
+       for (done = 0; done < len; addr += 8, done += 8, result++) {
+               ret = __read_8051_data(dd, addr, result);
+               if (ret)
+                       break;
+       }
+
+       /* turn off read enable */
+       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, 0);
+
+       spin_unlock_irqrestore(&dd->dc8051_memlock, flags);
+
+       return ret;
+}
+
+/*
+ * Write data or code to the 8051 code or data RAM.
+ */
+static int write_8051(struct hfi1_devdata *dd, int code, u32 start,
+                     const u8 *data, u32 len)
+{
+       u64 reg;
+       u32 offset;
+       int aligned, count;
+
+       /* check alignment */
+       aligned = ((unsigned long)data & 0x7) == 0;
+
+       /* write set-up */
+       reg = (code ? DC_DC8051_CFG_RAM_ACCESS_SETUP_RAM_SEL_SMASK : 0ull)
+               | DC_DC8051_CFG_RAM_ACCESS_SETUP_AUTO_INCR_ADDR_SMASK;
+       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, reg);
+
+       reg = ((start & DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK)
+                       << DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT)
+               | DC_DC8051_CFG_RAM_ACCESS_CTRL_WRITE_ENA_SMASK;
+       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, reg);
+
+       /* write */
+       for (offset = 0; offset < len; offset += 8) {
+               int bytes = len - offset;
+
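+               /*
+                * A short final chunk is zero-padded into reg; aligned data
+                * is loaded directly; otherwise copy to a local to avoid an
+                * unaligned 64-bit access.
+                */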
+               if (bytes < 8) {
+                       reg = 0;
+                       memcpy(&reg, &data[offset], bytes);
+               } else if (aligned) {
+                       reg = *(u64 *)&data[offset];
+               } else {
+                       memcpy(&reg, &data[offset], 8);
+               }
+               write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_WR_DATA, reg);
+
+               /* wait until ACCESS_COMPLETED is set */
+               count = 0;
+               while ((read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_STATUS)
+                   & DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK)
+                   == 0) {
+                       count++;
+                       if (count > DC8051_ACCESS_TIMEOUT) {
+                               dd_dev_err(dd, "timeout writing 8051 data\n");
+                               return -ENXIO;
+                       }
+                       udelay(1);
+               }
+       }
+
+       /* turn off write access, auto increment (also sets to data access) */
+       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, 0);
+       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, 0);
+
+       return 0;
+}
+
+/* return 0 if values match, non-zero and complain otherwise */
+static int invalid_header(struct hfi1_devdata *dd, const char *what,
+                         u32 actual, u32 expected)
+{
+       if (actual == expected)
+               return 0;
+
+       dd_dev_err(dd,
+                  "invalid firmware header field %s: expected 0x%x, actual 0x%x\n",
+                  what, expected, actual);
+       return 1;
+}
+
+/*
+ * Verify that the static fields in the CSS header match.
+ */
+static int verify_css_header(struct hfi1_devdata *dd, struct css_header *css)
+{
+       /* verify CSS header fields (most sizes are in DW, so add /4) */
+       if (invalid_header(dd, "module_type", css->module_type,
+                          CSS_MODULE_TYPE) ||
+           invalid_header(dd, "header_len", css->header_len,
+                          (sizeof(struct firmware_file) / 4)) ||
+           invalid_header(dd, "header_version", css->header_version,
+                          CSS_HEADER_VERSION) ||
+           invalid_header(dd, "module_vendor", css->module_vendor,
+                          CSS_MODULE_VENDOR) ||
+           invalid_header(dd, "key_size", css->key_size, KEY_SIZE / 4) ||
+           invalid_header(dd, "modulus_size", css->modulus_size,
+                          KEY_SIZE / 4) ||
+           invalid_header(dd, "exponent_size", css->exponent_size,
+                          EXPONENT_SIZE / 4)) {
+               return -EINVAL;
+       }
+       return 0;
+}
+
+/*
+ * Make sure there are at least some bytes after the prefix.
+ */
+static int payload_check(struct hfi1_devdata *dd, const char *name,
+                        long file_size, long prefix_size)
+{
+       /* make sure we have some payload */
+       if (prefix_size >= file_size) {
+               dd_dev_err(dd,
+                          "firmware \"%s\", size %ld, must be larger than %ld bytes\n",
+                          name, file_size, prefix_size);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+/*
+ * Request the firmware from the system.  Extract the pieces and fill in
+ * fdet.  If successful, the caller will need to call dispose_one_firmware().
+ * Returns 0 on success, -ERRNO on error.
+ */
+static int obtain_one_firmware(struct hfi1_devdata *dd, const char *name,
+                              struct firmware_details *fdet)
+{
+       struct css_header *css;
+       int ret;
+
+       memset(fdet, 0, sizeof(*fdet));
+
+       ret = request_firmware(&fdet->fw, name, &dd->pcidev->dev);
+       if (ret) {
+               dd_dev_warn(dd, "cannot find firmware \"%s\", err %d\n",
+                           name, ret);
+               return ret;
+       }
+
+       /* verify the firmware */
+       if (fdet->fw->size < sizeof(struct css_header)) {
+               dd_dev_err(dd, "firmware \"%s\" is too small\n", name);
+               ret = -EINVAL;
+               goto done;
+       }
+       css = (struct css_header *)fdet->fw->data;
+
+       hfi1_cdbg(FIRMWARE, "Firmware %s details:", name);
+       hfi1_cdbg(FIRMWARE, "file size: 0x%lx bytes", fdet->fw->size);
+       hfi1_cdbg(FIRMWARE, "CSS structure:");
+       hfi1_cdbg(FIRMWARE, "  module_type    0x%x", css->module_type);
+       hfi1_cdbg(FIRMWARE, "  header_len     0x%03x (0x%03x bytes)",
+                 css->header_len, 4 * css->header_len);
+       hfi1_cdbg(FIRMWARE, "  header_version 0x%x", css->header_version);
+       hfi1_cdbg(FIRMWARE, "  module_id      0x%x", css->module_id);
+       hfi1_cdbg(FIRMWARE, "  module_vendor  0x%x", css->module_vendor);
+       hfi1_cdbg(FIRMWARE, "  date           0x%x", css->date);
+       hfi1_cdbg(FIRMWARE, "  size           0x%03x (0x%03x bytes)",
+                 css->size, 4 * css->size);
+       hfi1_cdbg(FIRMWARE, "  key_size       0x%03x (0x%03x bytes)",
+                 css->key_size, 4 * css->key_size);
+       hfi1_cdbg(FIRMWARE, "  modulus_size   0x%03x (0x%03x bytes)",
+                 css->modulus_size, 4 * css->modulus_size);
+       hfi1_cdbg(FIRMWARE, "  exponent_size  0x%03x (0x%03x bytes)",
+                 css->exponent_size, 4 * css->exponent_size);
+       hfi1_cdbg(FIRMWARE, "firmware size: 0x%lx bytes",
+                 fdet->fw->size - sizeof(struct firmware_file));
+
+       /*
+        * If the file does not have a valid CSS header, fail.
+        * Otherwise, check the CSS size field for an expected size.
+        * The augmented file has r2 and mu inserted after the header
+        * was generated, so there will be a known difference between
+        * the CSS header size and the actual file size.  Use this
+        * difference to identify an augmented file.
+        *
+        * Note: css->size is in DWORDs, multiply by 4 to get bytes.
+        */
+       ret = verify_css_header(dd, css);
+       if (ret) {
+               dd_dev_info(dd, "Invalid CSS header for \"%s\"\n", name);
+       } else if ((css->size * 4) == fdet->fw->size) {
+               /* non-augmented firmware file */
+               struct firmware_file *ff = (struct firmware_file *)
+                                                       fdet->fw->data;
+
+               /* make sure there are bytes in the payload */
+               ret = payload_check(dd, name, fdet->fw->size,
+                                   sizeof(struct firmware_file));
+               if (ret == 0) {
+                       fdet->css_header = css;
+                       fdet->modulus = ff->modulus;
+                       fdet->exponent = ff->exponent;
+                       fdet->signature = ff->signature;
+                       fdet->r2 = fdet->dummy_header.r2; /* use dummy space */
+                       fdet->mu = fdet->dummy_header.mu; /* use dummy space */
+                       fdet->firmware_ptr = ff->firmware;
+                       fdet->firmware_len = fdet->fw->size -
+                                               sizeof(struct firmware_file);
+                       /*
+                        * Header does not include r2 and mu - generate here.
+                        * For now, fail.
+                        */
+                       dd_dev_err(dd, "driver is unable to validate firmware without r2 and mu (not in firmware file)\n");
+                       ret = -EINVAL;
+               }
+       } else if ((css->size * 4) + AUGMENT_SIZE == fdet->fw->size) {
+               /* augmented firmware file */
+               struct augmented_firmware_file *aff =
+                       (struct augmented_firmware_file *)fdet->fw->data;
+
+               /* make sure there are bytes in the payload */
+               ret = payload_check(dd, name, fdet->fw->size,
+                                   sizeof(struct augmented_firmware_file));
+               if (ret == 0) {
+                       fdet->css_header = css;
+                       fdet->modulus = aff->modulus;
+                       fdet->exponent = aff->exponent;
+                       fdet->signature = aff->signature;
+                       fdet->r2 = aff->r2;
+                       fdet->mu = aff->mu;
+                       fdet->firmware_ptr = aff->firmware;
+                       fdet->firmware_len = fdet->fw->size -
+                                       sizeof(struct augmented_firmware_file);
+               }
+       } else {
+               /* css->size check failed */
+               dd_dev_err(dd,
+                          "invalid firmware header field size: expected 0x%lx or 0x%lx, actual 0x%x\n",
+                          fdet->fw->size / 4,
+                          (fdet->fw->size - AUGMENT_SIZE) / 4,
+                          css->size);
+
+               ret = -EINVAL;
+       }
+
+done:
+       /* if returning an error, clean up after ourselves */
+       if (ret)
+               dispose_one_firmware(fdet);
+       return ret;
+}
+
+static void dispose_one_firmware(struct firmware_details *fdet)
+{
+       release_firmware(fdet->fw);
+       /* erase all previous information */
+       memset(fdet, 0, sizeof(*fdet));
+}
+
+/*
+ * Obtain the 4 firmwares from the OS.  All must be obtained at once or not
+ * at all.  If called with the firmware state in FW_TRY, use alternate names.
+ * On exit, this routine will have set the firmware state to one of FW_TRY,
+ * FW_FINAL, or FW_ERR.
+ *
+ * Must be holding fw_mutex.
+ */
+static void __obtain_firmware(struct hfi1_devdata *dd)
+{
+       int err = 0;
+
+       if (fw_state == FW_FINAL)       /* nothing more to obtain */
+               return;
+       if (fw_state == FW_ERR)         /* already in error */
+               return;
+
+       /* fw_state is FW_EMPTY or FW_TRY */
+retry:
+       if (fw_state == FW_TRY) {
+               /*
+                * We tried the original and it failed.  Move to the
+                * alternate.
+                */
+               dd_dev_warn(dd, "using alternate firmware names\n");
+               /*
+                * Let others run.  Some systems, when missing firmware, do
+                * something that holds for 30 seconds.  If we do that twice
+                * in a row it triggers a task blocked warning.
+                */
+               cond_resched();
+               if (fw_8051_load)
+                       dispose_one_firmware(&fw_8051);
+               if (fw_fabric_serdes_load)
+                       dispose_one_firmware(&fw_fabric);
+               if (fw_sbus_load)
+                       dispose_one_firmware(&fw_sbus);
+               if (fw_pcie_serdes_load)
+                       dispose_one_firmware(&fw_pcie);
+               fw_8051_name = ALT_FW_8051_NAME_ASIC;
+               fw_fabric_serdes_name = ALT_FW_FABRIC_NAME;
+               fw_sbus_name = ALT_FW_SBUS_NAME;
+               fw_pcie_serdes_name = ALT_FW_PCIE_NAME;
+       }
+
+       if (fw_sbus_load) {
+               err = obtain_one_firmware(dd, fw_sbus_name, &fw_sbus);
+               if (err)
+                       goto done;
+       }
+
+       if (fw_pcie_serdes_load) {
+               err = obtain_one_firmware(dd, fw_pcie_serdes_name, &fw_pcie);
+               if (err)
+                       goto done;
+       }
+
+       if (fw_fabric_serdes_load) {
+               err = obtain_one_firmware(dd, fw_fabric_serdes_name,
+                                         &fw_fabric);
+               if (err)
+                       goto done;
+       }
+
+       if (fw_8051_load) {
+               err = obtain_one_firmware(dd, fw_8051_name, &fw_8051);
+               if (err)
+                       goto done;
+       }
+
+done:
+       if (err) {
+               /* oops, had problems obtaining a firmware */
+               if (fw_state == FW_EMPTY && dd->icode == ICODE_RTL_SILICON) {
+                       /* retry with alternate (RTL only) */
+                       fw_state = FW_TRY;
+                       goto retry;
+               }
+               dd_dev_err(dd, "unable to obtain working firmware\n");
+               fw_state = FW_ERR;
+               fw_err = -ENOENT;
+       } else {
+               /* success */
+               if (fw_state == FW_EMPTY &&
+                   dd->icode != ICODE_FUNCTIONAL_SIMULATOR)
+                       fw_state = FW_TRY;      /* may retry later */
+               else
+                       fw_state = FW_FINAL;    /* cannot try again */
+       }
+}
+
+/*
+ * Called by all HFIs when loading their firmware - i.e. device probe time.
+ * The first one will do the actual firmware load.  Use a mutex to resolve
+ * any possible race condition.
+ *
+ * The call to this routine cannot be moved to driver load because the kernel
+ * call request_firmware() requires a device which is only available after
+ * the first device probe.
+ */
+static int obtain_firmware(struct hfi1_devdata *dd)
+{
+       unsigned long timeout;
+       int err = 0;
+
+       mutex_lock(&fw_mutex);
+
+       /* 40s delay due to long delay on missing firmware on some systems */
+       timeout = jiffies + msecs_to_jiffies(40000);
+       while (fw_state == FW_TRY) {
+               /*
+                * Another device is trying the firmware.  Wait until it
+                * decides what works (or not).
+                */
+               if (time_after(jiffies, timeout)) {
+                       /* waited too long */
+                       dd_dev_err(dd, "Timeout waiting for firmware try");
+                       fw_state = FW_ERR;
+                       fw_err = -ETIMEDOUT;
+                       break;
+               }
+               mutex_unlock(&fw_mutex);
+               msleep(20);     /* arbitrary delay */
+               mutex_lock(&fw_mutex);
+       }
+       /* not in FW_TRY state */
+
+       if (fw_state == FW_FINAL) {
+               if (platform_config) {
+                       dd->platform_config.data = platform_config->data;
+                       dd->platform_config.size = platform_config->size;
+               }
+               goto done;      /* already acquired */
+       } else if (fw_state == FW_ERR) {
+               goto done;      /* already tried and failed */
+       }
+       /* fw_state is FW_EMPTY */
+
+       /* set fw_state to FW_TRY, FW_FINAL, or FW_ERR, and fw_err */
+       __obtain_firmware(dd);
+
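+       /*
+        * The platform configuration is requested separately; a failure
+        * here does not affect fw_err, which is what is returned below.
+        */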
+       if (platform_config_load) {
+               platform_config = NULL;
+               err = request_firmware(&platform_config, platform_config_name,
+                                      &dd->pcidev->dev);
+               if (err) {
+                       platform_config = NULL;
+                       goto done;
+               }
+               dd->platform_config.data = platform_config->data;
+               dd->platform_config.size = platform_config->size;
+       }
+
+done:
+       mutex_unlock(&fw_mutex);
+
+       return fw_err;
+}
+
+/*
+ * Called when the driver unloads.  The timing is asymmetric with its
+ * counterpart, obtain_firmware().  If called at device remove time,
+ * then it is conceivable that another device could probe while the
+ * firmware is being disposed.  The mutexes can be moved to do that
+ * safely, but then the firmware would be requested from the OS multiple
+ * times.
+ *
+ * No mutex is needed as the driver is unloading and there cannot be any
+ * other callers.
+ */
+void dispose_firmware(void)
+{
+       dispose_one_firmware(&fw_8051);
+       dispose_one_firmware(&fw_fabric);
+       dispose_one_firmware(&fw_pcie);
+       dispose_one_firmware(&fw_sbus);
+
+       release_firmware(platform_config);
+       platform_config = NULL;
+
+       /* retain the error state, otherwise revert to empty */
+       if (fw_state != FW_ERR)
+               fw_state = FW_EMPTY;
+}
+
+/*
+ * Called with the result of a firmware download.
+ *
+ * Return 1 to retry loading the firmware, 0 to stop.
+ */
+static int retry_firmware(struct hfi1_devdata *dd, int load_result)
+{
+       int retry;
+
+       mutex_lock(&fw_mutex);
+
+       if (load_result == 0) {
+               /*
+                * The load succeeded, so expect all others to do the same.
+                * Do not retry again.
+                */
+               if (fw_state == FW_TRY)
+                       fw_state = FW_FINAL;
+               retry = 0;      /* do NOT retry */
+       } else if (fw_state == FW_TRY) {
+               /* load failed, obtain alternate firmware */
+               __obtain_firmware(dd);
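+               /* a successful alternate fetch leaves fw_state at FW_FINAL */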
+               retry = (fw_state == FW_FINAL);
+       } else {
+               /* else in FW_FINAL or FW_ERR, no retry in either case */
+               retry = 0;
+       }
+
+       mutex_unlock(&fw_mutex);
+       return retry;
+}
+
+/*
+ * Write a block of data to a given array CSR.  All calls will be in
+ * multiples of 8 bytes.
+ */
+static void write_rsa_data(struct hfi1_devdata *dd, int what,
+                          const u8 *data, int nbytes)
+{
+       int qw_size = nbytes / 8;
+       int i;
+
+       if (((unsigned long)data & 0x7) == 0) {
+               /* aligned */
+               u64 *ptr = (u64 *)data;
+
+               for (i = 0; i < qw_size; i++, ptr++)
+                       write_csr(dd, what + (8 * i), *ptr);
+       } else {
+               /* not aligned */
+               for (i = 0; i < qw_size; i++, data += 8) {
+                       u64 value;
+
+                       memcpy(&value, data, 8);
+                       write_csr(dd, what + (8 * i), value);
+               }
+       }
+}
+
+/*
+ * Write a block of data to a given CSR as a stream of writes.  All calls will
+ * be in multiples of 8 bytes.
+ */
+static void write_streamed_rsa_data(struct hfi1_devdata *dd, int what,
+                                   const u8 *data, int nbytes)
+{
+       u64 *ptr = (u64 *)data;
+       int qw_size = nbytes / 8;
+
+       for (; qw_size > 0; qw_size--, ptr++)
+               write_csr(dd, what, *ptr);
+}
+
+/*
+ * Download the signature and start the RSA mechanism.  Wait for
+ * RSA_ENGINE_TIMEOUT before giving up.
+ */
+static int run_rsa(struct hfi1_devdata *dd, const char *who,
+                  const u8 *signature)
+{
+       unsigned long timeout;
+       u64 reg;
+       u32 status;
+       int ret = 0;
+
+       /* write the signature */
+       write_rsa_data(dd, MISC_CFG_RSA_SIGNATURE, signature, KEY_SIZE);
+
+       /* initialize RSA */
+       write_csr(dd, MISC_CFG_RSA_CMD, RSA_CMD_INIT);
+
+       /*
+        * Make sure the engine is idle and insert a delay between the two
+        * writes to MISC_CFG_RSA_CMD.
+        */
+       status = (read_csr(dd, MISC_CFG_FW_CTRL)
+                          & MISC_CFG_FW_CTRL_RSA_STATUS_SMASK)
+                            >> MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT;
+       if (status != RSA_STATUS_IDLE) {
+               dd_dev_err(dd, "%s security engine not idle - giving up\n",
+                          who);
+               return -EBUSY;
+       }
+
+       /* start RSA */
+       write_csr(dd, MISC_CFG_RSA_CMD, RSA_CMD_START);
+
+       /*
+        * Look for the result.
+        *
+        * The RSA engine is hooked up to two MISC errors.  The driver
+        * masks these errors as they do not respond to the standard
+        * error "clear down" mechanism.  Look for these errors here and
+        * clear them when possible.  This routine will exit with the
+        * errors of the current run still set.
+        *
+        * MISC_FW_AUTH_FAILED_ERR
+        *      Firmware authorization failed.  This can be cleared by
+        *      re-initializing the RSA engine, then clearing the status bit.
+        *      Do not re-init the RSA engine immediately after a successful
+        *      run - this will reset the current authorization.
+        *
+        * MISC_KEY_MISMATCH_ERR
+        *      Key does not match.  The only way to clear this is to load
+        *      a matching key then clear the status bit.  If this error
+        *      is raised, it will persist outside of this routine until a
+        *      matching key is loaded.
+        */
+       timeout = msecs_to_jiffies(RSA_ENGINE_TIMEOUT) + jiffies;
+       while (1) {
+               status = (read_csr(dd, MISC_CFG_FW_CTRL)
+                          & MISC_CFG_FW_CTRL_RSA_STATUS_SMASK)
+                            >> MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT;
+
+               if (status == RSA_STATUS_IDLE) {
+                       /* should not happen */
+                       dd_dev_err(dd, "%s firmware security bad idle state\n",
+                                  who);
+                       ret = -EINVAL;
+                       break;
+               } else if (status == RSA_STATUS_DONE) {
+                       /* finished successfully */
+                       break;
+               } else if (status == RSA_STATUS_FAILED) {
+                       /* finished unsuccessfully */
+                       ret = -EINVAL;
+                       break;
+               }
+               /* else still active */
+
+               if (time_after(jiffies, timeout)) {
+                       /*
+                        * Timed out while active.  We can't reset the engine
+                        * if it is stuck active, but run through the
+                        * error code to see what error bits are set.
+                        */
+                       dd_dev_err(dd, "%s firmware security time out\n", who);
+                       ret = -ETIMEDOUT;
+                       break;
+               }
+
+               msleep(20);
+       }
+
+       /*
+        * Arrive here on success or failure.  Clear all RSA engine
+        * errors.  All current errors will stick - the RSA logic is keeping
+        * the error high.  All previous errors will clear - the RSA logic
+        * is no longer keeping the error high.
+        */
+       write_csr(dd, MISC_ERR_CLEAR,
+                 MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK |
+                 MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK);
+       /*
+        * All that is left are the current errors.  Print warnings on
+        * authorization failure details, if any.  Firmware authorization
+        * can be retried, so these are only warnings.
+        */
+       reg = read_csr(dd, MISC_ERR_STATUS);
+       if (ret) {
+               if (reg & MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK)
+                       dd_dev_warn(dd, "%s firmware authorization failed\n",
+                                   who);
+               if (reg & MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK)
+                       dd_dev_warn(dd, "%s firmware key mismatch\n", who);
+       }
+
+       return ret;
+}
+
+static void load_security_variables(struct hfi1_devdata *dd,
+                                   struct firmware_details *fdet)
+{
+       /* Security variables a.  Write the modulus */
+       write_rsa_data(dd, MISC_CFG_RSA_MODULUS, fdet->modulus, KEY_SIZE);
+       /* Security variables b.  Write the r2 */
+       write_rsa_data(dd, MISC_CFG_RSA_R2, fdet->r2, KEY_SIZE);
+       /* Security variables c.  Write the mu */
+       write_rsa_data(dd, MISC_CFG_RSA_MU, fdet->mu, MU_SIZE);
+       /* Security variables d.  Write the header */
+       write_streamed_rsa_data(dd, MISC_CFG_SHA_PRELOAD,
+                               (u8 *)fdet->css_header,
+                               sizeof(struct css_header));
+}
+
+/* return the 8051 firmware state */
+static inline u32 get_firmware_state(struct hfi1_devdata *dd)
+{
+       u64 reg = read_csr(dd, DC_DC8051_STS_CUR_STATE);
+
+       return (reg >> DC_DC8051_STS_CUR_STATE_FIRMWARE_SHIFT)
+                               & DC_DC8051_STS_CUR_STATE_FIRMWARE_MASK;
+}
+
+/*
+ * Wait until the firmware is up and ready to take host requests.
+ * Return 0 on success, -ETIMEDOUT on timeout.
+ */
+int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout)
+{
+       unsigned long timeout;
+
+       /* in the simulator, the fake 8051 is always ready */
+       if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
+               return 0;
+
+       timeout = msecs_to_jiffies(mstimeout) + jiffies;
+       while (1) {
+               if (get_firmware_state(dd) == 0xa0)     /* ready */
+                       return 0;
+               if (time_after(jiffies, timeout))       /* timed out */
+                       return -ETIMEDOUT;
+               usleep_range(1950, 2050); /* sleep 2ms-ish */
+       }
+}
+
+/*
+ * Load the 8051 firmware.
+ */
+static int load_8051_firmware(struct hfi1_devdata *dd,
+                             struct firmware_details *fdet)
+{
+       u64 reg;
+       int ret;
+       u8 ver_a, ver_b;
+
+       /*
+        * DC Reset sequence
+        * Load DC 8051 firmware
+        */
+       /*
+        * DC reset step 1: Reset DC8051
+        */
+       reg = DC_DC8051_CFG_RST_M8051W_SMASK
+               | DC_DC8051_CFG_RST_CRAM_SMASK
+               | DC_DC8051_CFG_RST_DRAM_SMASK
+               | DC_DC8051_CFG_RST_IRAM_SMASK
+               | DC_DC8051_CFG_RST_SFR_SMASK;
+       write_csr(dd, DC_DC8051_CFG_RST, reg);
+
+       /*
+        * DC reset step 2 (optional): Load 8051 data memory with link
+        * configuration
+        */
+
+       /*
+        * DC reset step 3: Load DC8051 firmware
+        */
+       /* release all but the core reset */
+       reg = DC_DC8051_CFG_RST_M8051W_SMASK;
+       write_csr(dd, DC_DC8051_CFG_RST, reg);
+
+       /* Firmware load step 1 */
+       load_security_variables(dd, fdet);
+
+       /*
+        * Firmware load step 2.  Clear MISC_CFG_FW_CTRL.FW_8051_LOADED
+        */
+       write_csr(dd, MISC_CFG_FW_CTRL, 0);
+
+       /* Firmware load steps 3-5 */
+       ret = write_8051(dd, 1/*code*/, 0, fdet->firmware_ptr,
+                        fdet->firmware_len);
+       if (ret)
+               return ret;
+
+       /*
+        * DC reset step 4. Host starts the DC8051 firmware
+        */
+       /*
+        * Firmware load step 6.  Set MISC_CFG_FW_CTRL.FW_8051_LOADED
+        */
+       write_csr(dd, MISC_CFG_FW_CTRL, MISC_CFG_FW_CTRL_FW_8051_LOADED_SMASK);
+
+       /* Firmware load steps 7-10 */
+       ret = run_rsa(dd, "8051", fdet->signature);
+       if (ret)
+               return ret;
+
+       /* clear all reset bits, releasing the 8051 */
+       write_csr(dd, DC_DC8051_CFG_RST, 0ull);
+
+       /*
+        * DC reset step 5. Wait for firmware to be ready to accept host
+        * requests.
+        */
+       ret = wait_fm_ready(dd, TIMEOUT_8051_START);
+       if (ret) { /* timed out */
+               dd_dev_err(dd, "8051 start timeout, current state 0x%x\n",
+                          get_firmware_state(dd));
+               return -ETIMEDOUT;
+       }
+
+       read_misc_status(dd, &ver_a, &ver_b);
+       dd_dev_info(dd, "8051 firmware version %d.%d\n",
+                   (int)ver_b, (int)ver_a);
+       dd->dc8051_ver = dc8051_ver(ver_b, ver_a);
+
+       return 0;
+}
+
+/*
+ * Write the SBus request register
+ *
+ * No need for masking - the arguments are sized exactly.
+ */
+void sbus_request(struct hfi1_devdata *dd,
+                 u8 receiver_addr, u8 data_addr, u8 command, u32 data_in)
+{
+       write_csr(dd, ASIC_CFG_SBUS_REQUEST,
+                 ((u64)data_in << ASIC_CFG_SBUS_REQUEST_DATA_IN_SHIFT) |
+                 ((u64)command << ASIC_CFG_SBUS_REQUEST_COMMAND_SHIFT) |
+                 ((u64)data_addr << ASIC_CFG_SBUS_REQUEST_DATA_ADDR_SHIFT) |
+                 ((u64)receiver_addr <<
+                  ASIC_CFG_SBUS_REQUEST_RECEIVER_ADDR_SHIFT));
+}
+
+/*
+ * Turn off the SBus and fabric serdes spicos.
+ *
+ * + Must be called with Sbus fast mode turned on.
+ * + Must be called after fabric serdes broadcast is set up.
+ * + Must be called before the 8051 is loaded - assumes 8051 is not loaded
+ *   when using MISC_CFG_FW_CTRL.
+ */
+static void turn_off_spicos(struct hfi1_devdata *dd, int flags)
+{
+       /* only needed on A0 */
+       if (!is_ax(dd))
+               return;
+
+       dd_dev_info(dd, "Turning off spicos:%s%s\n",
+                   flags & SPICO_SBUS ? " SBus" : "",
+                   flags & SPICO_FABRIC ? " fabric" : "");
+
+       write_csr(dd, MISC_CFG_FW_CTRL, ENABLE_SPICO_SMASK);
+       /* disable SBus spico */
+       if (flags & SPICO_SBUS)
+               sbus_request(dd, SBUS_MASTER_BROADCAST, 0x01,
+                            WRITE_SBUS_RECEIVER, 0x00000040);
+
+       /* disable the fabric serdes spicos */
+       if (flags & SPICO_FABRIC)
+               sbus_request(dd, fabric_serdes_broadcast[dd->hfi1_id],
+                            0x07, WRITE_SBUS_RECEIVER, 0x00000000);
+       write_csr(dd, MISC_CFG_FW_CTRL, 0);
+}
+
+/*
+ * Reset all of the fabric serdes for this HFI in preparation to take the
+ * link to Polling.
+ *
+ * To do a reset, we need to write to the serdes registers.  Unfortunately,
+ * the fabric serdes download to the other HFI on the ASIC will have turned
+ * off the firmware validation on this HFI.  This means we can't write to the
+ * registers to reset the serdes.  Work around this by performing a complete
+ * re-download and validation of the fabric serdes firmware.  This, as a
+ * by-product, will reset the serdes.  NOTE: the re-download requires that
+ * the 8051 be in the Offline state.  I.e. not actively trying to use the
+ * serdes.  This routine is called at the point where the link is Offline and
+ * is getting ready to go to Polling.
+ */
+void fabric_serdes_reset(struct hfi1_devdata *dd)
+{
+       int ret;
+
+       if (!fw_fabric_serdes_load)
+               return;
+
+       ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT);
+       if (ret) {
+               dd_dev_err(dd,
+                          "Cannot acquire SBus resource to reset fabric SerDes - perhaps you should reboot\n");
+               return;
+       }
+       set_sbus_fast_mode(dd);
+
+       if (is_ax(dd)) {
+               /* A0 serdes do not work with a re-download */
+               u8 ra = fabric_serdes_broadcast[dd->hfi1_id];
+
+               /* place SerDes in reset and disable SPICO */
+               sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000011);
+               /* wait 100 refclk cycles @ 156.25MHz => 640ns */
+               udelay(1);
+               /* remove SerDes reset */
+               sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000010);
+               /* turn SPICO enable on */
+               sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000002);
+       } else {
+               turn_off_spicos(dd, SPICO_FABRIC);
+               /*
+                * No need for firmware retry - what to download has already
+                * been decided.
+                * No need to pay attention to the load return - the only
+                * failure is a validation failure, which has already been
+                * checked by the initial download.
+                */
+               (void)load_fabric_serdes_firmware(dd, &fw_fabric);
+       }
+
+       clear_sbus_fast_mode(dd);
+       release_chip_resource(dd, CR_SBUS);
+}
+
+/* Access to the SBus in this routine should probably be serialized */
+int sbus_request_slow(struct hfi1_devdata *dd,
+                     u8 receiver_addr, u8 data_addr, u8 command, u32 data_in)
+{
+       u64 reg, count = 0;
+
+       /* make sure fast mode is clear */
+       clear_sbus_fast_mode(dd);
+
+       sbus_request(dd, receiver_addr, data_addr, command, data_in);
+       write_csr(dd, ASIC_CFG_SBUS_EXECUTE,
+                 ASIC_CFG_SBUS_EXECUTE_EXECUTE_SMASK);
+       /* Wait for both DONE and RCV_DATA_VALID to go high */
+       reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
+       while (!((reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) &&
+                (reg & ASIC_STS_SBUS_RESULT_RCV_DATA_VALID_SMASK))) {
+               if (count++ >= SBUS_MAX_POLL_COUNT) {
+                       u64 counts = read_csr(dd, ASIC_STS_SBUS_COUNTERS);
+                       /*
+                        * If the loop has timed out, we are OK if DONE bit
+                        * is set and RCV_DATA_VALID and EXECUTE counters
+                        * are the same. If not, we cannot proceed.
+                        */
+                       if ((reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) &&
+                           (SBUS_COUNTER(counts, RCV_DATA_VALID) ==
+                            SBUS_COUNTER(counts, EXECUTE)))
+                               break;
+                       return -ETIMEDOUT;
+               }
+               udelay(1);
+               reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
+       }
+       count = 0;
+       write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0);
+       /* Wait for DONE to clear after EXECUTE is cleared */
+       reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
+       while (reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) {
+               if (count++ >= SBUS_MAX_POLL_COUNT)
+                       return -ETIME;
+               udelay(1);
+               reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
+       }
+       return 0;
+}
+
+static int load_fabric_serdes_firmware(struct hfi1_devdata *dd,
+                                      struct firmware_details *fdet)
+{
+       int i, err;
+       const u8 ra = fabric_serdes_broadcast[dd->hfi1_id]; /* receiver addr */
+
+       dd_dev_info(dd, "Downloading fabric firmware\n");
+
+       /* step 1: load security variables */
+       load_security_variables(dd, fdet);
+       /* step 2: place SerDes in reset and disable SPICO */
+       sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000011);
+       /* wait 100 refclk cycles @ 156.25MHz => 640ns */
+       udelay(1);
+       /* step 3:  remove SerDes reset */
+       sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000010);
+       /* step 4: assert IMEM override */
+       sbus_request(dd, ra, 0x00, WRITE_SBUS_RECEIVER, 0x40000000);
+       /* step 5: download SerDes machine code */
+       for (i = 0; i < fdet->firmware_len; i += 4) {
+               sbus_request(dd, ra, 0x0a, WRITE_SBUS_RECEIVER,
+                            *(u32 *)&fdet->firmware_ptr[i]);
+       }
+       /* step 6: IMEM override off */
+       sbus_request(dd, ra, 0x00, WRITE_SBUS_RECEIVER, 0x00000000);
+       /* step 7: turn ECC on */
+       sbus_request(dd, ra, 0x0b, WRITE_SBUS_RECEIVER, 0x000c0000);
+
+       /* steps 8-11: run the RSA engine */
+       err = run_rsa(dd, "fabric serdes", fdet->signature);
+       if (err)
+               return err;
+
+       /* step 12: turn SPICO enable on */
+       sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000002);
+       /* step 13: enable core hardware interrupts */
+       sbus_request(dd, ra, 0x08, WRITE_SBUS_RECEIVER, 0x00000000);
+
+       return 0;
+}
+
+static int load_sbus_firmware(struct hfi1_devdata *dd,
+                             struct firmware_details *fdet)
+{
+       int i, err;
+       const u8 ra = SBUS_MASTER_BROADCAST; /* receiver address */
+
+       dd_dev_info(dd, "Downloading SBus firmware\n");
+
+       /* step 1: load security variables */
+       load_security_variables(dd, fdet);
+       /* step 2: place SPICO into reset and enable off */
+       sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x000000c0);
+       /* step 3: remove reset, enable off, IMEM_CNTRL_EN on */
+       sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000240);
+       /* step 4: set starting IMEM address for burst download */
+       sbus_request(dd, ra, 0x03, WRITE_SBUS_RECEIVER, 0x80000000);
+       /* step 5: download the SBus Master machine code */
+       for (i = 0; i < fdet->firmware_len; i += 4) {
+               sbus_request(dd, ra, 0x14, WRITE_SBUS_RECEIVER,
+                            *(u32 *)&fdet->firmware_ptr[i]);
+       }
+       /* step 6: set IMEM_CNTL_EN off */
+       sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000040);
+       /* step 7: turn ECC on */
+       sbus_request(dd, ra, 0x16, WRITE_SBUS_RECEIVER, 0x000c0000);
+
+       /* steps 8-11: run the RSA engine */
+       err = run_rsa(dd, "SBus", fdet->signature);
+       if (err)
+               return err;
+
+       /* step 12: set SPICO_ENABLE on */
+       sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000140);
+
+       return 0;
+}
+
+static int load_pcie_serdes_firmware(struct hfi1_devdata *dd,
+                                    struct firmware_details *fdet)
+{
+       int i;
+       const u8 ra = SBUS_MASTER_BROADCAST; /* receiver address */
+
+       dd_dev_info(dd, "Downloading PCIe firmware\n");
+
+       /* step 1: load security variables */
+       load_security_variables(dd, fdet);
+       /* step 2: assert single step (halts the SBus Master spico) */
+       sbus_request(dd, ra, 0x05, WRITE_SBUS_RECEIVER, 0x00000001);
+       /* step 3: enable XDMEM access */
+       sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000d40);
+       /* step 4: load firmware into SBus Master XDMEM */
+       /*
+        * NOTE: the dmem address, write_en, and wdata are all pre-packed;
+        * we only need to pick up the bytes and write them.
+        */
+       for (i = 0; i < fdet->firmware_len; i += 4) {
+               sbus_request(dd, ra, 0x04, WRITE_SBUS_RECEIVER,
+                            *(u32 *)&fdet->firmware_ptr[i]);
+       }
+       /* step 5: disable XDMEM access */
+       sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000140);
+       /* step 6: allow SBus Spico to run */
+       sbus_request(dd, ra, 0x05, WRITE_SBUS_RECEIVER, 0x00000000);
+
+       /*
+        * steps 7-11: run RSA, if it succeeds, firmware is available to
+        * be swapped
+        */
+       return run_rsa(dd, "PCIe serdes", fdet->signature);
+}
+
+/*
+ * Set the given broadcast values on the given list of devices.
+ */
+static void set_serdes_broadcast(struct hfi1_devdata *dd, u8 bg1, u8 bg2,
+                                const u8 *addrs, int count)
+{
+       while (--count >= 0) {
+               /*
+                * Set BROADCAST_GROUP_1 and BROADCAST_GROUP_2, leave
+                * defaults for everything else.  Do not read-modify-write,
+                * per instruction from the manufacturer.
+                *
+                * Register 0xfd:
+                *      bits    what
+                *      -----   ---------------------------------
+                *        0     IGNORE_BROADCAST  (default 0)
+                *      11:4    BROADCAST_GROUP_1 (default 0xff)
+                *      23:16   BROADCAST_GROUP_2 (default 0xff)
+                */
+               sbus_request(dd, addrs[count], 0xfd, WRITE_SBUS_RECEIVER,
+                            (u32)bg1 << 4 | (u32)bg2 << 16);
+       }
+}
+
+int acquire_hw_mutex(struct hfi1_devdata *dd)
+{
+       unsigned long timeout;
+       int try = 0;
+       u8 mask = 1 << dd->hfi1_id;
+       u8 user;
+
+retry:
+       timeout = msecs_to_jiffies(HM_TIMEOUT) + jiffies;
+       while (1) {
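+               /*
+                * Try to take the mutex: write our mask, then check that
+                * it reads back as ours.
+                */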
+               write_csr(dd, ASIC_CFG_MUTEX, mask);
+               user = (u8)read_csr(dd, ASIC_CFG_MUTEX);
+               if (user == mask)
+                       return 0; /* success */
+               if (time_after(jiffies, timeout))
+                       break; /* timed out */
+               msleep(20);
+       }
+
+       /* timed out */
+       dd_dev_err(dd,
+                  "Unable to acquire hardware mutex, mutex mask %u, my mask %u (%s)\n",
+                  (u32)user, (u32)mask, (try == 0) ? "retrying" : "giving up");
+
+       if (try == 0) {
+               /* break mutex and retry */
+               write_csr(dd, ASIC_CFG_MUTEX, 0);
+               try++;
+               goto retry;
+       }
+
+       return -EBUSY;
+}
+
+void release_hw_mutex(struct hfi1_devdata *dd)
+{
+       write_csr(dd, ASIC_CFG_MUTEX, 0);
+}
+
+/* return the given resource bit(s) as a mask for the given HFI */
+static inline u64 resource_mask(u32 hfi1_id, u32 resource)
+{
+       return ((u64)resource) << (hfi1_id ? CR_DYN_SHIFT : 0);
+}
+
+static void fail_mutex_acquire_message(struct hfi1_devdata *dd,
+                                      const char *func)
+{
+       dd_dev_err(dd,
+                  "%s: hardware mutex stuck - suggest rebooting the machine\n",
+                  func);
+}
+
+/*
+ * Acquire access to a chip resource.
+ *
+ * Return 0 on success, -EBUSY if resource busy, -EIO if mutex acquire failed.
+ */
+static int __acquire_chip_resource(struct hfi1_devdata *dd, u32 resource)
+{
+       u64 scratch0, all_bits, my_bit;
+       int ret;
+
+       if (resource & CR_DYN_MASK) {
+               /* a dynamic resource is in use if either HFI has set the bit */
+               if (dd->pcidev->device == PCI_DEVICE_ID_INTEL0 &&
+                   (resource & (CR_I2C1 | CR_I2C2))) {
+                       /* discrete devices must serialize across both chains */
+                       all_bits = resource_mask(0, CR_I2C1 | CR_I2C2) |
+                                       resource_mask(1, CR_I2C1 | CR_I2C2);
+               } else {
+                       all_bits = resource_mask(0, resource) |
+                                               resource_mask(1, resource);
+               }
+               my_bit = resource_mask(dd->hfi1_id, resource);
+       } else {
+               /* non-dynamic resources are not split between HFIs */
+               all_bits = resource;
+               my_bit = resource;
+       }
+
+       /* lock against other callers within the driver wanting a resource */
+       mutex_lock(&dd->asic_data->asic_resource_mutex);
+
+       ret = acquire_hw_mutex(dd);
+       if (ret) {
+               fail_mutex_acquire_message(dd, __func__);
+               ret = -EIO;
+               goto done;
+       }
+
+       scratch0 = read_csr(dd, ASIC_CFG_SCRATCH);
+       if (scratch0 & all_bits) {
+               ret = -EBUSY;
+       } else {
+               write_csr(dd, ASIC_CFG_SCRATCH, scratch0 | my_bit);
+               /* force write to be visible to other HFI on another OS */
+               (void)read_csr(dd, ASIC_CFG_SCRATCH);
+       }
+
+       release_hw_mutex(dd);
+
+done:
+       mutex_unlock(&dd->asic_data->asic_resource_mutex);
+       return ret;
+}
+
+/*
+ * Acquire access to a chip resource, wait up to mswait milliseconds for
+ * the resource to become available.
+ *
+ * Return 0 on success, -EBUSY if busy (even after wait), -EIO if mutex
+ * acquire failed.
+ */
+int acquire_chip_resource(struct hfi1_devdata *dd, u32 resource, u32 mswait)
+{
+       unsigned long timeout;
+       int ret;
+
+       timeout = jiffies + msecs_to_jiffies(mswait);
+       while (1) {
+               ret = __acquire_chip_resource(dd, resource);
+               if (ret != -EBUSY)
+                       return ret;
+               /* resource is busy, check our timeout */
+               if (time_after_eq(jiffies, timeout))
+                       return -EBUSY;
+               usleep_range(80, 120);  /* arbitrary delay */
+       }
+}
+
+/*
+ * Release access to a chip resource
+ */
+void release_chip_resource(struct hfi1_devdata *dd, u32 resource)
+{
+       u64 scratch0, bit;
+
+       /* only dynamic resources should ever be cleared */
+       if (!(resource & CR_DYN_MASK)) {
+               dd_dev_err(dd, "%s: invalid resource 0x%x\n", __func__,
+                          resource);
+               return;
+       }
+       bit = resource_mask(dd->hfi1_id, resource);
+
+       /* lock against other callers within the driver wanting a resource */
+       mutex_lock(&dd->asic_data->asic_resource_mutex);
+
+       if (acquire_hw_mutex(dd)) {
+               fail_mutex_acquire_message(dd, __func__);
+               goto done;
+       }
+
+       scratch0 = read_csr(dd, ASIC_CFG_SCRATCH);
+       if ((scratch0 & bit) != 0) {
+               scratch0 &= ~bit;
+               write_csr(dd, ASIC_CFG_SCRATCH, scratch0);
+               /* force write to be visible to other HFI on another OS */
+               (void)read_csr(dd, ASIC_CFG_SCRATCH);
+       } else {
+               dd_dev_warn(dd, "%s: id %d, resource 0x%x: bit not set\n",
+                           __func__, dd->hfi1_id, resource);
+       }
+
+       release_hw_mutex(dd);
+
+done:
+       mutex_unlock(&dd->asic_data->asic_resource_mutex);
+}
+
+/*
+ * Return true if resource is set, false otherwise.  Print a warning
+ * if not set and a function is supplied.
+ */
+bool check_chip_resource(struct hfi1_devdata *dd, u32 resource,
+                        const char *func)
+{
+       u64 scratch0, bit;
+
+       if (resource & CR_DYN_MASK)
+               bit = resource_mask(dd->hfi1_id, resource);
+       else
+               bit = resource;
+
+       scratch0 = read_csr(dd, ASIC_CFG_SCRATCH);
+       if ((scratch0 & bit) == 0) {
+               if (func)
+                       dd_dev_warn(dd,
+                                   "%s: id %d, resource 0x%x, not acquired!\n",
+                                   func, dd->hfi1_id, resource);
+               return false;
+       }
+       return true;
+}
+
+static void clear_chip_resources(struct hfi1_devdata *dd, const char *func)
+{
+       u64 scratch0;
+
+       /* lock against other callers within the driver wanting a resource */
+       mutex_lock(&dd->asic_data->asic_resource_mutex);
+
+       if (acquire_hw_mutex(dd)) {
+               fail_mutex_acquire_message(dd, func);
+               goto done;
+       }
+
+       /* clear all dynamic access bits for this HFI */
+       scratch0 = read_csr(dd, ASIC_CFG_SCRATCH);
+       scratch0 &= ~resource_mask(dd->hfi1_id, CR_DYN_MASK);
+       write_csr(dd, ASIC_CFG_SCRATCH, scratch0);
+       /* force write to be visible to other HFI on another OS */
+       (void)read_csr(dd, ASIC_CFG_SCRATCH);
+
+       release_hw_mutex(dd);
+
+done:
+       mutex_unlock(&dd->asic_data->asic_resource_mutex);
+}
+
+void init_chip_resources(struct hfi1_devdata *dd)
+{
+       /* clear any holds left by us */
+       clear_chip_resources(dd, __func__);
+}
+
+void finish_chip_resources(struct hfi1_devdata *dd)
+{
+       /* clear any holds left by us */
+       clear_chip_resources(dd, __func__);
+}
+
+void set_sbus_fast_mode(struct hfi1_devdata *dd)
+{
+       write_csr(dd, ASIC_CFG_SBUS_EXECUTE,
+                 ASIC_CFG_SBUS_EXECUTE_FAST_MODE_SMASK);
+}
+
+void clear_sbus_fast_mode(struct hfi1_devdata *dd)
+{
+       u64 reg, count = 0;
+
+       reg = read_csr(dd, ASIC_STS_SBUS_COUNTERS);
+       while (SBUS_COUNTER(reg, EXECUTE) !=
+              SBUS_COUNTER(reg, RCV_DATA_VALID)) {
+               if (count++ >= SBUS_MAX_POLL_COUNT)
+                       break;
+               udelay(1);
+               reg = read_csr(dd, ASIC_STS_SBUS_COUNTERS);
+       }
+       write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0);
+}
+
+int load_firmware(struct hfi1_devdata *dd)
+{
+       int ret;
+
+       if (fw_fabric_serdes_load) {
+               ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT);
+               if (ret)
+                       return ret;
+
+               set_sbus_fast_mode(dd);
+
+               set_serdes_broadcast(dd, all_fabric_serdes_broadcast,
+                                    fabric_serdes_broadcast[dd->hfi1_id],
+                                    fabric_serdes_addrs[dd->hfi1_id],
+                                    NUM_FABRIC_SERDES);
+               turn_off_spicos(dd, SPICO_FABRIC);
+               do {
+                       ret = load_fabric_serdes_firmware(dd, &fw_fabric);
+               } while (retry_firmware(dd, ret));
+
+               clear_sbus_fast_mode(dd);
+               release_chip_resource(dd, CR_SBUS);
+               if (ret)
+                       return ret;
+       }
+
+       if (fw_8051_load) {
+               do {
+                       ret = load_8051_firmware(dd, &fw_8051);
+               } while (retry_firmware(dd, ret));
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+int hfi1_firmware_init(struct hfi1_devdata *dd)
+{
+       /* only RTL can use these */
+       if (dd->icode != ICODE_RTL_SILICON) {
+               fw_fabric_serdes_load = 0;
+               fw_pcie_serdes_load = 0;
+               fw_sbus_load = 0;
+       }
+
+       /* no 8051 or QSFP on simulator */
+       if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
+               fw_8051_load = 0;
+               platform_config_load = 0;
+       }
+
+       if (!fw_8051_name) {
+               if (dd->icode == ICODE_RTL_SILICON)
+                       fw_8051_name = DEFAULT_FW_8051_NAME_ASIC;
+               else
+                       fw_8051_name = DEFAULT_FW_8051_NAME_FPGA;
+       }
+       if (!fw_fabric_serdes_name)
+               fw_fabric_serdes_name = DEFAULT_FW_FABRIC_NAME;
+       if (!fw_sbus_name)
+               fw_sbus_name = DEFAULT_FW_SBUS_NAME;
+       if (!fw_pcie_serdes_name)
+               fw_pcie_serdes_name = DEFAULT_FW_PCIE_NAME;
+       if (!platform_config_name)
+               platform_config_name = DEFAULT_PLATFORM_CONFIG_NAME;
+
+       return obtain_firmware(dd);
+}
+
+/*
+ * This function is a helper function for parse_platform_config(...) and
+ * does not check for validity of the platform configuration cache
+ * (because we know it is invalid as we are building up the cache).
+ * As such, this should not be called from anywhere other than
+ * parse_platform_config
+ */
+static int check_meta_version(struct hfi1_devdata *dd, u32 *system_table)
+{
+       u32 meta_ver, meta_ver_meta, ver_start, ver_len, mask;
+       struct platform_config_cache *pcfgcache = &dd->pcfg_cache;
+
+       if (!system_table)
+               return -EINVAL;
+
+       meta_ver_meta =
+       *(pcfgcache->config_tables[PLATFORM_CONFIG_SYSTEM_TABLE].table_metadata
+       + SYSTEM_TABLE_META_VERSION);
+
+       mask = ((1 << METADATA_TABLE_FIELD_START_LEN_BITS) - 1);
+       ver_start = meta_ver_meta & mask;
+
+       meta_ver_meta >>= METADATA_TABLE_FIELD_LEN_SHIFT;
+
+       mask = ((1 << METADATA_TABLE_FIELD_LEN_LEN_BITS) - 1);
+       ver_len = meta_ver_meta & mask;
+
+       ver_start /= 8;
+       meta_ver = *((u8 *)system_table + ver_start) & ((1 << ver_len) - 1);
+
+       if (meta_ver < 5) {
+               dd_dev_info(
+                       dd, "%s:Please update platform config\n", __func__);
+               return -EINVAL;
+       }
+       return 0;
+}
+
+int parse_platform_config(struct hfi1_devdata *dd)
+{
+       struct platform_config_cache *pcfgcache = &dd->pcfg_cache;
+       u32 *ptr = NULL;
+       u32 header1 = 0, header2 = 0, magic_num = 0, crc = 0, file_length = 0;
+       u32 record_idx = 0, table_type = 0, table_length_dwords = 0;
+       int ret = -EINVAL; /* assume failure */
+
+       if (!dd->platform_config.data) {
+               dd_dev_info(dd, "%s: Missing config file\n", __func__);
+               goto bail;
+       }
+       ptr = (u32 *)dd->platform_config.data;
+
+       magic_num = *ptr;
+       ptr++;
+       if (magic_num != PLATFORM_CONFIG_MAGIC_NUM) {
+               dd_dev_info(dd, "%s: Bad config file\n", __func__);
+               goto bail;
+       }
+
+       /* Field is file size in DWORDs */
+       file_length = (*ptr) * 4;
+       ptr++;
+
+       if (file_length > dd->platform_config.size) {
+               dd_dev_info(dd, "%s:File claims to be larger than read size\n",
+                           __func__);
+               goto bail;
+       } else if (file_length < dd->platform_config.size) {
+               dd_dev_info(dd,
+                           "%s:File claims to be smaller than read size, continuing\n",
+                           __func__);
+       }
+       /* exactly equal, perfection */
+
+       /*
+        * In both cases where we proceed, using the self-reported file length
+        * is the safer option
+        */
+       while (ptr < (u32 *)(dd->platform_config.data + file_length)) {
+               header1 = *ptr;
+               header2 = *(ptr + 1);
+               if (header1 != ~header2) {
+                       dd_dev_info(dd, "%s: Failed validation at offset %ld\n",
+                                   __func__, (ptr - (u32 *)
+                                              dd->platform_config.data));
+                       goto bail;
+               }
+
+               record_idx = *ptr &
+                       ((1 << PLATFORM_CONFIG_HEADER_RECORD_IDX_LEN_BITS) - 1);
+
+               table_length_dwords = (*ptr >>
+                               PLATFORM_CONFIG_HEADER_TABLE_LENGTH_SHIFT) &
+                     ((1 << PLATFORM_CONFIG_HEADER_TABLE_LENGTH_LEN_BITS) - 1);
+
+               table_type = (*ptr >> PLATFORM_CONFIG_HEADER_TABLE_TYPE_SHIFT) &
+                       ((1 << PLATFORM_CONFIG_HEADER_TABLE_TYPE_LEN_BITS) - 1);
+
+               /* Done with this set of headers */
+               ptr += 2;
+
+               if (record_idx) {
+                       /* data table */
+                       switch (table_type) {
+                       case PLATFORM_CONFIG_SYSTEM_TABLE:
+                               pcfgcache->config_tables[table_type].num_table =
+                                                                       1;
+                               ret = check_meta_version(dd, ptr);
+                               if (ret)
+                                       goto bail;
+                               break;
+                       case PLATFORM_CONFIG_PORT_TABLE:
+                               pcfgcache->config_tables[table_type].num_table =
+                                                                       2;
+                               break;
+                       case PLATFORM_CONFIG_RX_PRESET_TABLE:
+                               /* fall through */
+                       case PLATFORM_CONFIG_TX_PRESET_TABLE:
+                               /* fall through */
+                       case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
+                               /* fall through */
+                       case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
+                               pcfgcache->config_tables[table_type].num_table =
+                                                       table_length_dwords;
+                               break;
+                       default:
+                               dd_dev_info(dd,
+                                           "%s: Unknown data table %d, offset %ld\n",
+                                           __func__, table_type,
+                                           (ptr - (u32 *)
+                                            dd->platform_config.data));
+                               goto bail; /* We don't trust this file now */
+                       }
+                       pcfgcache->config_tables[table_type].table = ptr;
+               } else {
+                       /* metadata table */
+                       switch (table_type) {
+                       case PLATFORM_CONFIG_SYSTEM_TABLE:
+                               /* fall through */
+                       case PLATFORM_CONFIG_PORT_TABLE:
+                               /* fall through */
+                       case PLATFORM_CONFIG_RX_PRESET_TABLE:
+                               /* fall through */
+                       case PLATFORM_CONFIG_TX_PRESET_TABLE:
+                               /* fall through */
+                       case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
+                               /* fall through */
+                       case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
+                               break;
+                       default:
+                               dd_dev_info(dd,
+                                           "%s: Unknown meta table %d, offset %ld\n",
+                                           __func__, table_type,
+                                           (ptr -
+                                            (u32 *)dd->platform_config.data));
+                               goto bail; /* We don't trust this file now */
+                       }
+                       pcfgcache->config_tables[table_type].table_metadata =
+                                                                       ptr;
+               }
+
+               /* Calculate and check table crc */
+               crc = crc32_le(~(u32)0, (unsigned char const *)ptr,
+                              (table_length_dwords * 4));
+               crc ^= ~(u32)0;
+
+               /* Jump the table */
+               ptr += table_length_dwords;
+               if (crc != *ptr) {
+                       dd_dev_info(dd, "%s: Failed CRC check at offset %ld\n",
+                                   __func__, (ptr -
+                                              (u32 *)
+                                              dd->platform_config.data));
+                       goto bail;
+               }
+               /* Jump the CRC DWORD */
+               ptr++;
+       }
+
+       pcfgcache->cache_valid = 1;
+       return 0;
+bail:
+       memset(pcfgcache, 0, sizeof(struct platform_config_cache));
+       return ret;
+}
+
+static int get_platform_fw_field_metadata(struct hfi1_devdata *dd, int table,
+                                         int field, u32 *field_len_bits,
+                                         u32 *field_start_bits)
+{
+       struct platform_config_cache *pcfgcache = &dd->pcfg_cache;
+       u32 *src_ptr = NULL;
+
+       if (!pcfgcache->cache_valid)
+               return -EINVAL;
+
+       switch (table) {
+       case PLATFORM_CONFIG_SYSTEM_TABLE:
+               /* fall through */
+       case PLATFORM_CONFIG_PORT_TABLE:
+               /* fall through */
+       case PLATFORM_CONFIG_RX_PRESET_TABLE:
+               /* fall through */
+       case PLATFORM_CONFIG_TX_PRESET_TABLE:
+               /* fall through */
+       case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
+               /* fall through */
+       case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
+               if (field && field < platform_config_table_limits[table])
+                       src_ptr =
+                       pcfgcache->config_tables[table].table_metadata + field;
+               break;
+       default:
+               dd_dev_info(dd, "%s: Unknown table\n", __func__);
+               break;
+       }
+
+       if (!src_ptr)
+               return -EINVAL;
+
+       if (field_start_bits)
+               *field_start_bits = *src_ptr &
+                     ((1 << METADATA_TABLE_FIELD_START_LEN_BITS) - 1);
+
+       if (field_len_bits)
+               *field_len_bits = (*src_ptr >> METADATA_TABLE_FIELD_LEN_SHIFT)
+                      & ((1 << METADATA_TABLE_FIELD_LEN_LEN_BITS) - 1);
+
+       return 0;
+}
+
+/* This is the central interface for getting data out of the platform config
+ * file. It depends on parse_platform_config() having populated the
+ * platform_config_cache in hfi1_devdata, and checks the cache_valid member to
+ * validate the sanity of the cache.
+ *
+ * The non-obvious parameters:
+ * @table_index: Acts as a lookup key selecting which instance of the tables
+ * the relevant field is fetched from.
+ *
+ * This applies to the data tables that have multiple instances. The port table
+ * is an exception to this rule as each HFI only has one port and thus the
+ * relevant table can be distinguished by hfi_id.
+ *
+ * @data: pointer to memory that will be populated with the field requested.
+ * @len: length of memory pointed by @data in bytes.
+ */
+int get_platform_config_field(struct hfi1_devdata *dd,
+                             enum platform_config_table_type_encoding
+                             table_type, int table_index, int field_index,
+                             u32 *data, u32 len)
+{
+       int ret = 0, wlen = 0, seek = 0;
+       u32 field_len_bits = 0, field_start_bits = 0, *src_ptr = NULL;
+       struct platform_config_cache *pcfgcache = &dd->pcfg_cache;
+
+       if (data)
+               memset(data, 0, len);
+       else
+               return -EINVAL;
+
+       ret = get_platform_fw_field_metadata(dd, table_type, field_index,
+                                            &field_len_bits,
+                                            &field_start_bits);
+       if (ret)
+               return -EINVAL;
+
+       /* Convert length to bits */
+       len *= 8;
+
+       /* Our metadata function checked cache_valid and field_index for us */
+       switch (table_type) {
+       case PLATFORM_CONFIG_SYSTEM_TABLE:
+               src_ptr = pcfgcache->config_tables[table_type].table;
+
+               if (field_index != SYSTEM_TABLE_QSFP_POWER_CLASS_MAX) {
+                       if (len < field_len_bits)
+                               return -EINVAL;
+
+                       seek = field_start_bits / 8;
+                       wlen = field_len_bits / 8;
+
+                       src_ptr = (u32 *)((u8 *)src_ptr + seek);
+
+                       /*
+                        * If we get here, the field is expected to be byte
+                        * aligned and a whole number of bytes long
+                        */
+                       memcpy(data, src_ptr, wlen);
+                       return 0;
+               }
+               break;
+       case PLATFORM_CONFIG_PORT_TABLE:
+               /* Port table is 4 DWORDS */
+               src_ptr = dd->hfi1_id ?
+                       pcfgcache->config_tables[table_type].table + 4 :
+                       pcfgcache->config_tables[table_type].table;
+               break;
+       case PLATFORM_CONFIG_RX_PRESET_TABLE:
+               /* fall through */
+       case PLATFORM_CONFIG_TX_PRESET_TABLE:
+               /* fall through */
+       case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
+               /* fall through */
+       case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
+               src_ptr = pcfgcache->config_tables[table_type].table;
+
+               if (table_index <
+                       pcfgcache->config_tables[table_type].num_table)
+                       src_ptr += table_index;
+               else
+                       src_ptr = NULL;
+               break;
+       default:
+               dd_dev_info(dd, "%s: Unknown table\n", __func__);
+               break;
+       }
+
+       if (!src_ptr || len < field_len_bits)
+               return -EINVAL;
+
+       src_ptr += (field_start_bits / 32);
+       *data = (*src_ptr >> (field_start_bits % 32)) &
+                       ((1 << field_len_bits) - 1);
+
+       return 0;
+}
+
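+/*
+ * Hypothetical caller sketch (illustration only; the table and field
+ * constants are the ones referenced above): read the maximum supported
+ * QSFP power class from the system table.
+ *
+ *    u32 power_class_max;
+ *
+ *    if (!get_platform_config_field(dd, PLATFORM_CONFIG_SYSTEM_TABLE, 0,
+ *                                   SYSTEM_TABLE_QSFP_POWER_CLASS_MAX,
+ *                                   &power_class_max,
+ *                                   sizeof(power_class_max)))
+ *        ...use power_class_max...
+ */
+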
+/*
+ * Download the firmware needed for the Gen3 PCIe SerDes.  An update
+ * to the SBus firmware is needed before updating the PCIe firmware.
+ *
+ * Note: caller must be holding the SBus resource.
+ */
+int load_pcie_firmware(struct hfi1_devdata *dd)
+{
+       int ret = 0;
+
+       /* both firmware loads below use the SBus */
+       set_sbus_fast_mode(dd);
+
+       if (fw_sbus_load) {
+               turn_off_spicos(dd, SPICO_SBUS);
+               do {
+                       ret = load_sbus_firmware(dd, &fw_sbus);
+               } while (retry_firmware(dd, ret));
+               if (ret)
+                       goto done;
+       }
+
+       if (fw_pcie_serdes_load) {
+               dd_dev_info(dd, "Setting PCIe SerDes broadcast\n");
+               set_serdes_broadcast(dd, all_pcie_serdes_broadcast,
+                                    pcie_serdes_broadcast[dd->hfi1_id],
+                                    pcie_serdes_addrs[dd->hfi1_id],
+                                    NUM_PCIE_SERDES);
+               do {
+                       ret = load_pcie_serdes_firmware(dd, &fw_pcie);
+               } while (retry_firmware(dd, ret));
+               if (ret)
+                       goto done;
+       }
+
+done:
+       clear_sbus_fast_mode(dd);
+
+       return ret;
+}
+
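+/*
+ * Illustrative sketch of the note above: a caller would bracket the call
+ * with acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT) and
+ * release_chip_resource(dd, CR_SBUS) so the SBus is owned for the duration
+ * of load_pcie_firmware().
+ */
+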
+/*
+ * Read the GUID from the hardware, store it in dd.
+ */
+void read_guid(struct hfi1_devdata *dd)
+{
+       /* Take the DC out of reset to get a valid GUID value */
+       write_csr(dd, CCE_DC_CTRL, 0);
+       (void)read_csr(dd, CCE_DC_CTRL);
+
+       dd->base_guid = read_csr(dd, DC_DC8051_CFG_LOCAL_GUID);
+       dd_dev_info(dd, "GUID %llx",
+                   (unsigned long long)dd->base_guid);
+}
diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h
new file mode 100644 (file)
index 0000000..4417a0f
--- /dev/null
@@ -0,0 +1,1950 @@
+#ifndef _HFI1_KERNEL_H
+#define _HFI1_KERNEL_H
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
+#include <linux/io.h>
+#include <linux/fs.h>
+#include <linux/completion.h>
+#include <linux/kref.h>
+#include <linux/sched.h>
+#include <linux/cdev.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <rdma/rdma_vt.h>
+
+#include "chip_registers.h"
+#include "common.h"
+#include "verbs.h"
+#include "pio.h"
+#include "chip.h"
+#include "mad.h"
+#include "qsfp.h"
+#include "platform.h"
+#include "affinity.h"
+
+/* bumped 1 from s/w major version of TrueScale */
+#define HFI1_CHIP_VERS_MAJ 3U
+
+/* don't care about this except printing */
+#define HFI1_CHIP_VERS_MIN 0U
+
+/* The Organizationally Unique Identifier (Mfg code), and its position in GUID */
+#define HFI1_OUI 0x001175
+#define HFI1_OUI_LSB 40
+
+#define DROP_PACKET_OFF                0
+#define DROP_PACKET_ON         1
+
+extern unsigned long hfi1_cap_mask;
+#define HFI1_CAP_KGET_MASK(mask, cap) ((mask) & HFI1_CAP_##cap)
+#define HFI1_CAP_UGET_MASK(mask, cap) \
+       (((mask) >> HFI1_CAP_USER_SHIFT) & HFI1_CAP_##cap)
+#define HFI1_CAP_KGET(cap) (HFI1_CAP_KGET_MASK(hfi1_cap_mask, cap))
+#define HFI1_CAP_UGET(cap) (HFI1_CAP_UGET_MASK(hfi1_cap_mask, cap))
+#define HFI1_CAP_IS_KSET(cap) (!!HFI1_CAP_KGET(cap))
+#define HFI1_CAP_IS_USET(cap) (!!HFI1_CAP_UGET(cap))
+#define HFI1_MISC_GET() ((hfi1_cap_mask >> HFI1_CAP_MISC_SHIFT) & \
+                       HFI1_CAP_MISC_MASK)
+/* Offline Disabled Reason is 4-bits */
+#define HFI1_ODR_MASK(rsn) ((rsn) & OPA_PI_MASK_OFFLINE_REASON)
+
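+/*
+ * Illustrative use of the capability helpers above (assuming a capability
+ * bit named HFI1_CAP_SDMA exists): "if (HFI1_CAP_IS_KSET(SDMA))" tests the
+ * kernel half of hfi1_cap_mask for that capability.
+ */
+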
+/*
+ * Control context is always 0 and handles the error packets.
+ * It also handles the VL15 and multicast packets.
+ */
+#define HFI1_CTRL_CTXT    0
+
+/*
+ * Driver context will store software counters for each of the events
+ * associated with these status registers
+ */
+#define NUM_CCE_ERR_STATUS_COUNTERS 41
+#define NUM_RCV_ERR_STATUS_COUNTERS 64
+#define NUM_MISC_ERR_STATUS_COUNTERS 13
+#define NUM_SEND_PIO_ERR_STATUS_COUNTERS 36
+#define NUM_SEND_DMA_ERR_STATUS_COUNTERS 4
+#define NUM_SEND_EGRESS_ERR_STATUS_COUNTERS 64
+#define NUM_SEND_ERR_STATUS_COUNTERS 3
+#define NUM_SEND_CTXT_ERR_STATUS_COUNTERS 5
+#define NUM_SEND_DMA_ENG_ERR_STATUS_COUNTERS 24
+
+/*
+ * per-driver stats: either not specific to any one device or port, or
+ * summed over all of the devices and ports.
+ * They are described by name via ipathfs filesystem, so layout
+ * and number of elements can change without breaking compatibility.
+ * If members are added or deleted hfi1_statnames[] in debugfs.c must
+ * change to match.
+ */
+struct hfi1_ib_stats {
+       __u64 sps_ints; /* number of interrupts handled */
+       __u64 sps_errints; /* number of error interrupts */
+       __u64 sps_txerrs; /* tx-related packet errors */
+       __u64 sps_rcverrs; /* non-crc rcv packet errors */
+       __u64 sps_hwerrs; /* hardware errors reported (parity, etc.) */
+       __u64 sps_nopiobufs; /* no pio bufs avail from kernel */
+       __u64 sps_ctxts; /* number of contexts currently open */
+       __u64 sps_lenerrs; /* number of kernel packets where RHF != LRH len */
+       __u64 sps_buffull;
+       __u64 sps_hdrfull;
+};
+
+extern struct hfi1_ib_stats hfi1_stats;
+extern const struct pci_error_handlers hfi1_pci_err_handler;
+
+/*
+ * First-cut criterion for "device is active" is
+ * two thousand dwords combined Tx, Rx traffic per
+ * 5-second interval. SMA packets are 64 dwords,
+ * and occur "a few per second", presumably each way.
+ */
+#define HFI1_TRAFFIC_ACTIVE_THRESHOLD (2000)
+
+/*
+ * Below contains all data related to a single context (formerly called port).
+ */
+
+#ifdef CONFIG_DEBUG_FS
+struct hfi1_opcode_stats_perctx;
+#endif
+
+struct ctxt_eager_bufs {
+       ssize_t size;            /* total size of eager buffers */
+       u32 count;               /* size of buffers array */
+       u32 numbufs;             /* number of buffers allocated */
+       u32 alloced;             /* number of rcvarray entries used */
+       u32 rcvtid_size;         /* size of each eager rcv tid */
+       u32 threshold;           /* head update threshold */
+       struct eager_buffer {
+               void *addr;
+               dma_addr_t phys;
+               ssize_t len;
+       } *buffers;
+       struct {
+               void *addr;
+               dma_addr_t phys;
+       } *rcvtids;
+};
+
+struct exp_tid_set {
+       struct list_head list;
+       u32 count;
+};
+
+struct hfi1_ctxtdata {
+       /* shadow the ctxt's RcvCtrl register */
+       u64 rcvctrl;
+       /* rcvhdrq base, needs mmap before useful */
+       void *rcvhdrq;
+       /* kernel virtual address where hdrqtail is updated */
+       volatile __le64 *rcvhdrtail_kvaddr;
+       /*
+        * Shared page for kernel to signal user processes that send buffers
+        * need disarming.  The process should call HFI1_CMD_DISARM_BUFS
+        * or HFI1_CMD_ACK_EVENT with IPATH_EVENT_DISARM_BUFS set.
+        */
+       unsigned long *user_event_mask;
+       /* when waiting for rcv or pioavail */
+       wait_queue_head_t wait;
+       /* rcvhdrq size (for freeing) */
+       size_t rcvhdrq_size;
+       /* number of rcvhdrq entries */
+       u16 rcvhdrq_cnt;
+       /* size of each of the rcvhdrq entries */
+       u16 rcvhdrqentsize;
+       /* mmap of hdrq, must fit in 44 bits */
+       dma_addr_t rcvhdrq_phys;
+       dma_addr_t rcvhdrqtailaddr_phys;
+       struct ctxt_eager_bufs egrbufs;
+       /* this receive context's assigned PIO ACK send context */
+       struct send_context *sc;
+
+       /* dynamic receive available interrupt timeout */
+       u32 rcvavail_timeout;
+       /*
+        * number of opens (including slave sub-contexts) on this instance
+        * (ignoring forks, dup, etc. for now)
+        */
+       int cnt;
+       /*
+        * how much space to leave at start of eager TID entries for
+        * protocol use, on each TID
+        */
+       /* instead of calculating it */
+       unsigned ctxt;
+       /* non-zero if ctxt is being shared. */
+       u16 subctxt_cnt;
+       /* sub-context id, if ctxt is being shared */
+       u16 subctxt_id;
+       u8 uuid[16];
+       /* job key */
+       u16 jkey;
+       /* number of RcvArray groups for this context. */
+       u32 rcv_array_groups;
+       /* index of first eager TID entry. */
+       u32 eager_base;
+       /* number of expected TID entries */
+       u32 expected_count;
+       /* index of first expected TID entry. */
+       u32 expected_base;
+
+       struct exp_tid_set tid_group_list;
+       struct exp_tid_set tid_used_list;
+       struct exp_tid_set tid_full_list;
+
+       /* lock protecting all Expected TID data */
+       struct mutex exp_lock;
+       /* number of pio bufs for this ctxt (all procs, if shared) */
+       u32 piocnt;
+       /* first pio buffer for this ctxt */
+       u32 pio_base;
+       /* chip offset of PIO buffers for this ctxt */
+       u32 piobufs;
+       /* per-context configuration flags */
+       u32 flags;
+       /* per-context event flags for fileops/intr communication */
+       unsigned long event_flags;
+       /* WAIT_RCV that timed out, no interrupt */
+       u32 rcvwait_to;
+       /* WAIT_PIO that timed out, no interrupt */
+       u32 piowait_to;
+       /* WAIT_RCV already happened, no wait */
+       u32 rcvnowait;
+       /* WAIT_PIO already happened, no wait */
+       u32 pionowait;
+       /* total number of polled urgent packets */
+       u32 urgent;
+       /* saved total number of polled urgent packets for poll edge trigger */
+       u32 urgent_poll;
+       /* pid of process using this ctxt */
+       pid_t pid;
+       pid_t subpid[HFI1_MAX_SHARED_CTXTS];
+       /* same size as task_struct .comm[], command that opened context */
+       char comm[TASK_COMM_LEN];
+       /* so file ops can get at unit */
+       struct hfi1_devdata *dd;
+       /* so functions that need physical port can get it easily */
+       struct hfi1_pportdata *ppd;
+       /* A page of memory for rcvhdrhead, rcvegrhead, rcvegrtail * N */
+       void *subctxt_uregbase;
+       /* An array of pages for the eager receive buffers * N */
+       void *subctxt_rcvegrbuf;
+       /* An array of pages for the eager header queue entries * N */
+       void *subctxt_rcvhdr_base;
+       /* The version of the library which opened this ctxt */
+       u32 userversion;
+       /* Bitmask of active slaves */
+       u32 active_slaves;
+       /* Type of packets or conditions we want to poll for */
+       u16 poll_type;
+       /* receive packet sequence counter */
+       u8 seq_cnt;
+       u8 redirect_seq_cnt;
+       /* ctxt rcvhdrq head offset */
+       u32 head;
+       u32 pkt_count;
+       /* QPs waiting for context processing */
+       struct list_head qp_wait_list;
+       /* interrupt handling */
+       u64 imask;      /* clear interrupt mask */
+       int ireg;       /* clear interrupt register */
+       unsigned numa_id; /* numa node of this context */
+       /* verbs stats per CTX */
+       struct hfi1_opcode_stats_perctx *opstats;
+       /*
+        * This is the kernel thread that will keep making
+        * progress on the user sdma requests behind the scenes.
+        * There is one per context (shared contexts use the master's).
+        */
+       struct task_struct *progress;
+       struct list_head sdma_queues;
+       /* protect sdma queues */
+       spinlock_t sdma_qlock;
+
+       /* Is ASPM interrupt supported for this context */
+       bool aspm_intr_supported;
+       /* ASPM state (enabled/disabled) for this context */
+       bool aspm_enabled;
+       /* Timer for re-enabling ASPM if interrupt activity quietens down */
+       struct timer_list aspm_timer;
+       /* Lock to serialize between intr, timer intr and user threads */
+       spinlock_t aspm_lock;
+       /* Is ASPM processing enabled for this context (in intr context) */
+       bool aspm_intr_enable;
+       /* Last interrupt timestamp */
+       ktime_t aspm_ts_last_intr;
+       /* Last timestamp at which we scheduled a timer for this context */
+       ktime_t aspm_ts_timer_sched;
+
+       /*
+        * The interrupt handler for a particular receive context can vary
+        * throughout its lifetime. This is not a lock-protected data member, so
+        * it must be updated atomically and the prev and new value must always
+        * be valid. Worst case is we process an extra interrupt and up to 64
+        * packets with the wrong interrupt handler.
+        */
+       int (*do_interrupt)(struct hfi1_ctxtdata *rcd, int threaded);
+};
+
+/*
+ * Represents a single packet at a high level. Put commonly computed things in
+ * here so we do not have to keep doing them over and over. The rule of thumb is
+ * if something is used one time to derive some value, store that something in
+ * here. If it is used multiple times, then store the result of that derivation
+ * in here.
+ */
+struct hfi1_packet {
+       void *ebuf;
+       void *hdr;
+       struct hfi1_ctxtdata *rcd;
+       __le32 *rhf_addr;
+       struct rvt_qp *qp;
+       struct hfi1_other_headers *ohdr;
+       u64 rhf;
+       u32 maxcnt;
+       u32 rhqoff;
+       u32 hdrqtail;
+       int numpkt;
+       u16 tlen;
+       u16 hlen;
+       s16 etail;
+       u16 rsize;
+       u8 updegr;
+       u8 rcv_flags;
+       u8 etype;
+};
+
+static inline bool has_sc4_bit(struct hfi1_packet *p)
+{
+       return !!rhf_dc_info(p->rhf);
+}
+
+/*
+ * Private data for snoop/capture support.
+ */
+struct hfi1_snoop_data {
+       int mode_flag;
+       struct cdev cdev;
+       struct device *class_dev;
+       /* protect snoop data */
+       spinlock_t snoop_lock;
+       struct list_head queue;
+       wait_queue_head_t waitq;
+       void *filter_value;
+       int (*filter_callback)(void *hdr, void *data, void *value);
+       u64 dcc_cfg; /* saved value of DCC Cfg register */
+};
+
+/* snoop mode_flag values */
+#define HFI1_PORT_SNOOP_MODE     1U
+#define HFI1_PORT_CAPTURE_MODE   2U
+
+struct rvt_sge_state;
+
+/*
+ * Get/Set IB link-level config parameters for f_get/set_ib_cfg()
+ * Mostly for MADs that set or query link parameters, also ipath
+ * config interfaces
+ */
+#define HFI1_IB_CFG_LIDLMC 0 /* LID (LS16b) and Mask (MS16b) */
+#define HFI1_IB_CFG_LWID_DG_ENB 1 /* allowed Link-width downgrade */
+#define HFI1_IB_CFG_LWID_ENB 2 /* allowed Link-width */
+#define HFI1_IB_CFG_LWID 3 /* currently active Link-width */
+#define HFI1_IB_CFG_SPD_ENB 4 /* allowed Link speeds */
+#define HFI1_IB_CFG_SPD 5 /* current Link spd */
+#define HFI1_IB_CFG_RXPOL_ENB 6 /* Auto-RX-polarity enable */
+#define HFI1_IB_CFG_LREV_ENB 7 /* Auto-Lane-reversal enable */
+#define HFI1_IB_CFG_LINKLATENCY 8 /* Link Latency (IB1.2 only) */
+#define HFI1_IB_CFG_HRTBT 9 /* IB heartbeat off/enable/auto; DDR/QDR only */
+#define HFI1_IB_CFG_OP_VLS 10 /* operational VLs */
+#define HFI1_IB_CFG_VL_HIGH_CAP 11 /* num of VL high priority weights */
+#define HFI1_IB_CFG_VL_LOW_CAP 12 /* num of VL low priority weights */
+#define HFI1_IB_CFG_OVERRUN_THRESH 13 /* IB overrun threshold */
+#define HFI1_IB_CFG_PHYERR_THRESH 14 /* IB PHY error threshold */
+#define HFI1_IB_CFG_LINKDEFAULT 15 /* IB link default (sleep/poll) */
+#define HFI1_IB_CFG_PKEYS 16 /* update partition keys */
+#define HFI1_IB_CFG_MTU 17 /* update MTU in IBC */
+#define HFI1_IB_CFG_VL_HIGH_LIMIT 19
+#define HFI1_IB_CFG_PMA_TICKS 20 /* PMA sample tick resolution */
+#define HFI1_IB_CFG_PORT 21 /* switch port we are connected to */
+
+/*
+ * HFI or Host Link States
+ *
+ * These describe the states the driver thinks the logical and physical
+ * states are in.  Used as an argument to set_link_state().  Implemented
+ * as bits for easy multi-state checking.  The actual state can only be
+ * one.
+ */
+#define __HLS_UP_INIT_BP       0
+#define __HLS_UP_ARMED_BP      1
+#define __HLS_UP_ACTIVE_BP     2
+#define __HLS_DN_DOWNDEF_BP    3       /* link down default */
+#define __HLS_DN_POLL_BP       4
+#define __HLS_DN_DISABLE_BP    5
+#define __HLS_DN_OFFLINE_BP    6
+#define __HLS_VERIFY_CAP_BP    7
+#define __HLS_GOING_UP_BP      8
+#define __HLS_GOING_OFFLINE_BP  9
+#define __HLS_LINK_COOLDOWN_BP 10
+
+#define HLS_UP_INIT      BIT(__HLS_UP_INIT_BP)
+#define HLS_UP_ARMED     BIT(__HLS_UP_ARMED_BP)
+#define HLS_UP_ACTIVE    BIT(__HLS_UP_ACTIVE_BP)
+#define HLS_DN_DOWNDEF   BIT(__HLS_DN_DOWNDEF_BP) /* link down default */
+#define HLS_DN_POLL      BIT(__HLS_DN_POLL_BP)
+#define HLS_DN_DISABLE   BIT(__HLS_DN_DISABLE_BP)
+#define HLS_DN_OFFLINE   BIT(__HLS_DN_OFFLINE_BP)
+#define HLS_VERIFY_CAP   BIT(__HLS_VERIFY_CAP_BP)
+#define HLS_GOING_UP     BIT(__HLS_GOING_UP_BP)
+#define HLS_GOING_OFFLINE BIT(__HLS_GOING_OFFLINE_BP)
+#define HLS_LINK_COOLDOWN BIT(__HLS_LINK_COOLDOWN_BP)
+
+#define HLS_UP (HLS_UP_INIT | HLS_UP_ARMED | HLS_UP_ACTIVE)
+#define HLS_DOWN ~(HLS_UP)
+
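+/*
+ * Illustrative sketch (ppd being a struct hfi1_pportdata *):
+ * "if (ppd->host_link_state & HLS_UP)" asks whether the link is in any of
+ * the Init/Armed/Active states.
+ */
+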
+/* use this MTU size if none other is given */
+#define HFI1_DEFAULT_ACTIVE_MTU 10240
+/* use this MTU size as the default maximum */
+#define HFI1_DEFAULT_MAX_MTU 10240
+/* default partition key */
+#define DEFAULT_PKEY 0xffff
+
+/*
+ * Possible fabric manager config parameters for fm_{get,set}_table()
+ */
+#define FM_TBL_VL_HIGH_ARB             1 /* Get/set VL high prio weights */
+#define FM_TBL_VL_LOW_ARB              2 /* Get/set VL low prio weights */
+#define FM_TBL_BUFFER_CONTROL          3 /* Get/set Buffer Control */
+#define FM_TBL_SC2VLNT                 4 /* Get/set SC->VLnt */
+#define FM_TBL_VL_PREEMPT_ELEMS                5 /* Get (no set) VL preempt elems */
+#define FM_TBL_VL_PREEMPT_MATRIX       6 /* Get (no set) VL preempt matrix */
+
+/*
+ * Possible "operations" for f_rcvctrl(ppd, op, ctxt)
+ * these are bits so they can be combined, e.g.
+ * HFI1_RCVCTRL_INTRAVAIL_ENB | HFI1_RCVCTRL_CTXT_ENB
+ */
+#define HFI1_RCVCTRL_TAILUPD_ENB 0x01
+#define HFI1_RCVCTRL_TAILUPD_DIS 0x02
+#define HFI1_RCVCTRL_CTXT_ENB 0x04
+#define HFI1_RCVCTRL_CTXT_DIS 0x08
+#define HFI1_RCVCTRL_INTRAVAIL_ENB 0x10
+#define HFI1_RCVCTRL_INTRAVAIL_DIS 0x20
+#define HFI1_RCVCTRL_PKEY_ENB 0x40  /* Note, default is enabled */
+#define HFI1_RCVCTRL_PKEY_DIS 0x80
+#define HFI1_RCVCTRL_TIDFLOW_ENB 0x0400
+#define HFI1_RCVCTRL_TIDFLOW_DIS 0x0800
+#define HFI1_RCVCTRL_ONE_PKT_EGR_ENB 0x1000
+#define HFI1_RCVCTRL_ONE_PKT_EGR_DIS 0x2000
+#define HFI1_RCVCTRL_NO_RHQ_DROP_ENB 0x4000
+#define HFI1_RCVCTRL_NO_RHQ_DROP_DIS 0x8000
+#define HFI1_RCVCTRL_NO_EGR_DROP_ENB 0x10000
+#define HFI1_RCVCTRL_NO_EGR_DROP_DIS 0x20000
+
+/* partition enforcement flags */
+#define HFI1_PART_ENFORCE_IN   0x1
+#define HFI1_PART_ENFORCE_OUT  0x2
+
+/* how often we check for synthetic counter wrap around */
+#define SYNTH_CNT_TIME 2
+
+/* Counter flags */
+#define CNTR_NORMAL            0x0 /* Normal counters, just read register */
+#define CNTR_SYNTH             0x1 /* Synthetic counters, saturate at all 1s */
+#define CNTR_DISABLED          0x2 /* Disable this counter */
+#define CNTR_32BIT             0x4 /* Simulate 64 bits for this counter */
+#define CNTR_VL                        0x8 /* Per VL counter */
+#define CNTR_SDMA              0x10
+#define CNTR_INVALID_VL                -1  /* Specifies invalid VL */
+#define CNTR_MODE_W            0x0
+#define CNTR_MODE_R            0x1
+
+/* VLs Supported/Operational */
+#define HFI1_MIN_VLS_SUPPORTED 1
+#define HFI1_MAX_VLS_SUPPORTED 8
+
+static inline void incr_cntr64(u64 *cntr)
+{
+       if (*cntr < (u64)-1LL)
+               (*cntr)++;
+}
+
+static inline void incr_cntr32(u32 *cntr)
+{
+       if (*cntr < (u32)-1LL)
+               (*cntr)++;
+}
+
+#define MAX_NAME_SIZE 64
+struct hfi1_msix_entry {
+       enum irq_type type;
+       struct msix_entry msix;
+       void *arg;
+       char name[MAX_NAME_SIZE];
+       cpumask_t mask;
+};
+
+/* per-SL CCA information */
+struct cca_timer {
+       struct hrtimer hrtimer;
+       struct hfi1_pportdata *ppd; /* read-only */
+       int sl; /* read-only */
+       u16 ccti; /* read/write - current value of CCTI */
+};
+
+struct link_down_reason {
+       /*
+        * SMA-facing value.  Should be set from .latest when
+        * HLS_UP_* -> HLS_DN_* transition actually occurs.
+        */
+       u8 sma;
+       u8 latest;
+};
+
+enum {
+       LO_PRIO_TABLE,
+       HI_PRIO_TABLE,
+       MAX_PRIO_TABLE
+};
+
+struct vl_arb_cache {
+       /* protect vl arb cache */
+       spinlock_t lock;
+       struct ib_vl_weight_elem table[VL_ARB_TABLE_SIZE];
+};
+
+/*
+ * The structure below encapsulates data relevant to a physical IB Port.
+ * Current chips support only one such port, but the separation
+ * clarifies things a bit. Note that to conform to IB conventions,
+ * port-numbers are one-based. The first or only port is port1.
+ */
+struct hfi1_pportdata {
+       struct hfi1_ibport ibport_data;
+
+       struct hfi1_devdata *dd;
+       struct kobject pport_cc_kobj;
+       struct kobject sc2vl_kobj;
+       struct kobject sl2sc_kobj;
+       struct kobject vl2mtu_kobj;
+
+       /* PHY support */
+       u32 port_type;
+       struct qsfp_data qsfp_info;
+
+       /* GUID for this interface, in host order */
+       u64 guid;
+       /* GUID for peer interface, in host order */
+       u64 neighbor_guid;
+
+       /* up or down physical link state */
+       u32 linkup;
+
+       /*
+        * this address is mapped read-only into user processes so they can
+        * get status cheaply, whenever they want.  One qword of status per port
+        */
+       u64 *statusp;
+
+       /* SendDMA related entries */
+
+       struct workqueue_struct *hfi1_wq;
+
+       /* move out of interrupt context */
+       struct work_struct link_vc_work;
+       struct work_struct link_up_work;
+       struct work_struct link_down_work;
+       struct work_struct sma_message_work;
+       struct work_struct freeze_work;
+       struct work_struct link_downgrade_work;
+       struct work_struct link_bounce_work;
+       /* host link state variables */
+       struct mutex hls_lock;
+       u32 host_link_state;
+
+       spinlock_t            sdma_alllock ____cacheline_aligned_in_smp;
+
+       u32 lstate;     /* logical link state */
+
+       /* these are the "32 bit" regs */
+
+       u32 ibmtu; /* The MTU programmed for this unit */
+       /*
+        * Current max size IB packet (in bytes) including IB headers, that
+        * we can send. Changes when ibmtu changes.
+        */
+       u32 ibmaxlen;
+       u32 current_egress_rate; /* units [10^6 bits/sec] */
+       /* LID programmed for this instance */
+       u16 lid;
+       /* list of pkeys programmed; 0 if not set */
+       u16 pkeys[MAX_PKEY_VALUES];
+       u16 link_width_supported;
+       u16 link_width_downgrade_supported;
+       u16 link_speed_supported;
+       u16 link_width_enabled;
+       u16 link_width_downgrade_enabled;
+       u16 link_speed_enabled;
+       u16 link_width_active;
+       u16 link_width_downgrade_tx_active;
+       u16 link_width_downgrade_rx_active;
+       u16 link_speed_active;
+       u8 vls_supported;
+       u8 vls_operational;
+       u8 actual_vls_operational;
+       /* LID mask control */
+       u8 lmc;
+       /* Rx Polarity inversion (compensate for ~tx on partner) */
+       u8 rx_pol_inv;
+
+       u8 hw_pidx;     /* physical port index */
+       u8 port;        /* IB port number and index into dd->pports - 1 */
+       /* type of neighbor node */
+       u8 neighbor_type;
+       u8 neighbor_normal;
+       u8 neighbor_fm_security; /* 1 if firmware checking is disabled */
+       u8 neighbor_port_number;
+       u8 is_sm_config_started;
+       u8 offline_disabled_reason;
+       u8 is_active_optimize_enabled;
+       u8 driver_link_ready;   /* driver ready for active link */
+       u8 link_enabled;        /* link enabled? */
+       u8 linkinit_reason;
+       u8 local_tx_rate;       /* rate given to 8051 firmware */
+       u8 last_pstate;         /* info only */
+
+       /* placeholders for IB MAD packet settings */
+       u8 overrun_threshold;
+       u8 phy_error_threshold;
+
+       /* Used to override LED behavior for things like maintenance beaconing */
+       /*
+        * Alternates per phase of blink
+        * [0] holds LED off duration, [1] holds LED on duration
+        */
+       unsigned long led_override_vals[2];
+       u8 led_override_phase; /* LSB picks from vals[] */
+       atomic_t led_override_timer_active;
+       /* Used to flash LEDs in override mode */
+       struct timer_list led_override_timer;
+
+       u32 sm_trap_qp;
+       u32 sa_qp;
+
+       /*
+        * cca_timer_lock protects access to the per-SL cca_timer
+        * structures (specifically the ccti member).
+        */
+       spinlock_t cca_timer_lock ____cacheline_aligned_in_smp;
+       struct cca_timer cca_timer[OPA_MAX_SLS];
+
+       /* List of congestion control table entries */
+       struct ib_cc_table_entry_shadow ccti_entries[CC_TABLE_SHADOW_MAX];
+
+       /* congestion entries, each entry corresponding to a SL */
+       struct opa_congestion_setting_entry_shadow
+               congestion_entries[OPA_MAX_SLS];
+
+       /*
+        * cc_state_lock protects (write) access to the per-port
+        * struct cc_state.
+        */
+       spinlock_t cc_state_lock ____cacheline_aligned_in_smp;
+
+       struct cc_state __rcu *cc_state;
+
+       /* Total number of congestion control table entries */
+       u16 total_cct_entry;
+
+       /* Bit map identifying service level */
+       u32 cc_sl_control_map;
+
+       /* CA's max number of 64 entry units in the congestion control table */
+       u8 cc_max_table_entries;
+
+       /*
+        * begin congestion log related entries
+        * cc_log_lock protects all congestion log related data
+        */
+       spinlock_t cc_log_lock ____cacheline_aligned_in_smp;
+       u8 threshold_cong_event_map[OPA_MAX_SLS / 8];
+       u16 threshold_event_counter;
+       struct opa_hfi1_cong_log_event_internal cc_events[OPA_CONG_LOG_ELEMS];
+       int cc_log_idx; /* index for logging events */
+       int cc_mad_idx; /* index for reporting events */
+       /* end congestion log related entries */
+
+       struct vl_arb_cache vl_arb_cache[MAX_PRIO_TABLE];
+
+       /* port relative counter buffer */
+       u64 *cntrs;
+       /* port relative synthetic counter buffer */
+       u64 *scntrs;
+       /* port_xmit_discards are synthesized from different egress errors */
+       u64 port_xmit_discards;
+       u64 port_xmit_discards_vl[C_VL_COUNT];
+       u64 port_xmit_constraint_errors;
+       u64 port_rcv_constraint_errors;
+       /* count of 'link_err' interrupts from DC */
+       u64 link_downed;
+       /* number of times link retrained successfully */
+       u64 link_up;
+       /* number of times a link unknown frame was reported */
+       u64 unknown_frame_count;
+       /* port_ltp_crc_mode is returned in 'portinfo' MADs */
+       u16 port_ltp_crc_mode;
+       /* port_crc_mode_enabled is the crc we support */
+       u8 port_crc_mode_enabled;
+       /* mgmt_allowed is also returned in 'portinfo' MADs */
+       u8 mgmt_allowed;
+       u8 part_enforce; /* partition enforcement flags */
+       struct link_down_reason local_link_down_reason;
+       struct link_down_reason neigh_link_down_reason;
+       /* Value to be sent to link peer on LinkDown. */
+       u8 remote_link_down_reason;
+       /* Error events that will cause a port bounce. */
+       u32 port_error_action;
+       struct work_struct linkstate_active_work;
+       /* Does this port need to prescan for FECNs */
+       bool cc_prescan;
+};
+
+typedef int (*rhf_rcv_function_ptr)(struct hfi1_packet *packet);
+
+typedef void (*opcode_handler)(struct hfi1_packet *packet);
+
+/* return values for the RHF receive functions */
+#define RHF_RCV_CONTINUE  0    /* keep going */
+#define RHF_RCV_DONE     1     /* stop, this packet processed */
+#define RHF_RCV_REPROCESS 2    /* stop. retain this packet */
+
+struct rcv_array_data {
+       u8 group_size;
+       u16 ngroups;
+       u16 nctxt_extra;
+};
+
+struct per_vl_data {
+       u16 mtu;
+       struct send_context *sc;
+};
+
+/* 16 to directly index */
+#define PER_VL_SEND_CONTEXTS 16
+
+struct err_info_rcvport {
+       u8 status_and_code;
+       u64 packet_flit1;
+       u64 packet_flit2;
+};
+
+struct err_info_constraint {
+       u8 status;
+       u16 pkey;
+       u32 slid;
+};
+
+struct hfi1_temp {
+       unsigned int curr;       /* current temperature */
+       unsigned int lo_lim;     /* low temperature limit */
+       unsigned int hi_lim;     /* high temperature limit */
+       unsigned int crit_lim;   /* critical temperature limit */
+       u8 triggers;      /* temperature triggers */
+};
+
+/* common data between shared ASIC HFIs */
+struct hfi1_asic_data {
+       struct hfi1_devdata *dds[2];    /* back pointers */
+       struct mutex asic_resource_mutex;
+};
+
+/* device data struct now contains only "general per-device" info.
+ * fields related to a physical IB port are in a hfi1_pportdata struct.
+ */
+struct sdma_engine;
+struct sdma_vl_map;
+
+#define BOARD_VERS_MAX 96 /* how long the version string can be */
+#define SERIAL_MAX 16 /* length of the serial number */
+
+typedef int (*send_routine)(struct rvt_qp *, struct hfi1_pkt_state *, u64);
+struct hfi1_devdata {
+       struct hfi1_ibdev verbs_dev;     /* must be first */
+       struct list_head list;
+       /* pointers to related structs for this device */
+       /* pci access data structure */
+       struct pci_dev *pcidev;
+       struct cdev user_cdev;
+       struct cdev diag_cdev;
+       struct cdev ui_cdev;
+       struct device *user_device;
+       struct device *diag_device;
+       struct device *ui_device;
+
+       /* mem-mapped pointer to base of chip regs */
+       u8 __iomem *kregbase;
+       /* end of mem-mapped chip space excluding sendbuf and user regs */
+       u8 __iomem *kregend;
+       /* physical address of chip for io_remap, etc. */
+       resource_size_t physaddr;
+       /* receive context data */
+       struct hfi1_ctxtdata **rcd;
+       /* send context data */
+       struct send_context_info *send_contexts;
+       /* map hardware send contexts to software index */
+       u8 *hw_to_sw;
+       /* spinlock for allocating and releasing send context resources */
+       spinlock_t sc_lock;
+       /* Per VL data. Enough for all VLs but not all elements are set/used. */
+       struct per_vl_data vld[PER_VL_SEND_CONTEXTS];
+       /* lock for pio_map */
+       spinlock_t pio_map_lock;
+       /* array of kernel send contexts */
+       struct send_context **kernel_send_context;
+       /* array of vl maps */
+       struct pio_vl_map __rcu *pio_map;
+       /* seqlock for sc2vl */
+       seqlock_t sc2vl_lock;
+       u64 sc2vl[4];
+       /* Send Context initialization lock. */
+       spinlock_t sc_init_lock;
+
+       /* fields common to all SDMA engines */
+
+       /* default flags to last descriptor */
+       u64 default_desc1;
+       volatile __le64                    *sdma_heads_dma; /* DMA'ed by chip */
+       dma_addr_t                          sdma_heads_phys;
+       void                               *sdma_pad_dma; /* DMA'ed by chip */
+       dma_addr_t                          sdma_pad_phys;
+       /* for deallocation */
+       size_t                              sdma_heads_size;
+       /* number from the chip */
+       u32                                 chip_sdma_engines;
+       /* num used */
+       u32                                 num_sdma;
+       /* lock for sdma_map */
+       spinlock_t                          sde_map_lock;
+       /* array of engines sized by num_sdma */
+       struct sdma_engine                 *per_sdma;
+       /* array of vl maps */
+       struct sdma_vl_map __rcu           *sdma_map;
+       /* SPC freeze waitqueue and variable */
+       wait_queue_head_t                 sdma_unfreeze_wq;
+       atomic_t                          sdma_unfreeze_count;
+
+       /* common data between shared ASIC HFIs in this OS */
+       struct hfi1_asic_data *asic_data;
+
+       /* hfi1_pportdata, points to array of (physical) port-specific
+        * data structs, indexed by pidx (0..n-1)
+        */
+       struct hfi1_pportdata *pport;
+
+       /* mem-mapped pointer to base of PIO buffers */
+       void __iomem *piobase;
+       /*
+        * write-combining mem-mapped pointer to base of RcvArray
+        * memory.
+        */
+       void __iomem *rcvarray_wc;
+       /*
+        * credit return base - a per-NUMA range of DMA address that
+        * the chip will use to update the per-context free counter
+        */
+       struct credit_return_base *cr_base;
+
+       /* send context numbers and sizes for each type */
+       struct sc_config_sizes sc_sizes[SC_MAX];
+
+       u32 lcb_access_count;           /* count of LCB users */
+
+       char *boardname; /* human readable board info */
+
+       /* device (not port) flags, basically device capabilities */
+       u32 flags;
+
+       /* reset value */
+       u64 z_int_counter;
+       u64 z_rcv_limit;
+       u64 z_send_schedule;
+       /* percpu int_counter */
+       u64 __percpu *int_counter;
+       u64 __percpu *rcv_limit;
+       u64 __percpu *send_schedule;
+       /* number of receive contexts in use by the driver */
+       u32 num_rcv_contexts;
+       /* number of pio send contexts in use by the driver */
+       u32 num_send_contexts;
+       /*
+        * number of ctxts available for PSM open
+        */
+       u32 freectxts;
+       /* total number of available user/PSM contexts */
+       u32 num_user_contexts;
+       /* base receive interrupt timeout, in CSR units */
+       u32 rcv_intr_timeout_csr;
+
+       u64 __iomem *egrtidbase;
+       spinlock_t sendctrl_lock; /* protect changes to SendCtrl */
+       spinlock_t rcvctrl_lock; /* protect changes to RcvCtrl */
+       /* around rcd and (user ctxts) ctxt_cnt use (intr vs free) */
+       spinlock_t uctxt_lock; /* rcd and user context changes */
+       /* exclusive access to 8051 */
+       spinlock_t dc8051_lock;
+       /* exclusive access to 8051 memory */
+       spinlock_t dc8051_memlock;
+       int dc8051_timed_out;   /* remember if the 8051 timed out */
+       /*
+        * A page that will hold event notification bitmaps for all
+        * contexts. This page will be mapped into all processes.
+        */
+       unsigned long *events;
+       /*
+        * per unit status, see also portdata statusp
+        * mapped read-only into user processes so they can get unit and
+        * IB link status cheaply
+        */
+       struct hfi1_status *status;
+       u32 freezelen; /* max length of freezemsg */
+
+       /* revision register shadow */
+       u64 revision;
+       /* Base GUID for device (network order) */
+       u64 base_guid;
+
+       /* these are the "32 bit" regs */
+
+       /* value we put in kr_rcvhdrsize */
+       u32 rcvhdrsize;
+       /* number of receive contexts the chip supports */
+       u32 chip_rcv_contexts;
+       /* number of receive array entries */
+       u32 chip_rcv_array_count;
+       /* number of PIO send contexts the chip supports */
+       u32 chip_send_contexts;
+       /* number of bytes in the PIO memory buffer */
+       u32 chip_pio_mem_size;
+       /* number of bytes in the SDMA memory buffer */
+       u32 chip_sdma_mem_size;
+
+       /* size of each rcvegrbuffer */
+       u32 rcvegrbufsize;
+       /* log2 of above */
+       u16 rcvegrbufsize_shift;
+       /* both sides of the PCIe link are gen3 capable */
+       u8 link_gen3_capable;
+       /* localbus width (1, 2, 4, 8, 16, 32) from config space */
+       u32 lbus_width;
+       /* localbus speed in MHz */
+       u32 lbus_speed;
+       int unit; /* unit # of this chip */
+       int node; /* home node of this chip */
+
+       /* save these PCI fields to restore after a reset */
+       u32 pcibar0;
+       u32 pcibar1;
+       u32 pci_rom;
+       u16 pci_command;
+       u16 pcie_devctl;
+       u16 pcie_lnkctl;
+       u16 pcie_devctl2;
+       u32 pci_msix0;
+       u32 pci_lnkctl3;
+       u32 pci_tph2;
+
+       /*
+        * ASCII serial number, from flash, large enough for original
+        * all digit strings, and longer serial number format
+        */
+       u8 serial[SERIAL_MAX];
+       /* human readable board version */
+       u8 boardversion[BOARD_VERS_MAX];
+       u8 lbus_info[32]; /* human readable localbus info */
+       /* chip major rev, from CceRevision */
+       u8 majrev;
+       /* chip minor rev, from CceRevision */
+       u8 minrev;
+       /* hardware ID */
+       u8 hfi1_id;
+       /* implementation code */
+       u8 icode;
+       /* default link down value (poll/sleep) */
+       u8 link_default;
+       /* vAU of this device */
+       u8 vau;
+       /* vCU of this device */
+       u8 vcu;
+       /* link credits of this device */
+       u16 link_credits;
+       /* initial vl15 credits to use */
+       u16 vl15_init;
+
+       /* Misc small ints */
+       /* Number of physical ports available */
+       u8 num_pports;
+       /* Lowest context number which can be used by user processes */
+       u8 first_user_ctxt;
+       u8 n_krcv_queues;
+       u8 qos_shift;
+       u8 qpn_mask;
+
+       u16 rhf_offset; /* offset of RHF within receive header entry */
+       u16 irev;       /* implementation revision */
+       u16 dc8051_ver; /* 8051 firmware version */
+
+       struct platform_config platform_config;
+       struct platform_config_cache pcfg_cache;
+
+       struct diag_client *diag_client;
+       spinlock_t hfi1_diag_trans_lock; /* protect diag observer ops */
+
+       u8 psxmitwait_supported;
+       /* cycle length of PS* counters in HW (in picoseconds) */
+       u16 psxmitwait_check_rate;
+       /* high volume overflow errors deferred to tasklet */
+       struct tasklet_struct error_tasklet;
+
+       /* MSI-X information */
+       struct hfi1_msix_entry *msix_entries;
+       u32 num_msix_entries;
+
+       /* INTx information */
+       u32 requested_intx_irq;         /* did we request one? */
+       char intx_name[MAX_NAME_SIZE];  /* INTx name */
+
+       /* general interrupt: mask of handled interrupts */
+       u64 gi_mask[CCE_NUM_INT_CSRS];
+
+       struct rcv_array_data rcv_entries;
+
+       /*
+        * 64 bit synthetic counters
+        */
+       struct timer_list synth_stats_timer;
+
+       /*
+        * device counters
+        */
+       char *cntrnames;
+       size_t cntrnameslen;
+       size_t ndevcntrs;
+       u64 *cntrs;
+       u64 *scntrs;
+
+       /*
+        * remembered values for synthetic counters
+        */
+       u64 last_tx;
+       u64 last_rx;
+
+       /*
+        * per-port counters
+        */
+       size_t nportcntrs;
+       char *portcntrnames;
+       size_t portcntrnameslen;
+
+       struct hfi1_snoop_data hfi1_snoop;
+
+       struct err_info_rcvport err_info_rcvport;
+       struct err_info_constraint err_info_rcv_constraint;
+       struct err_info_constraint err_info_xmit_constraint;
+       u8 err_info_uncorrectable;
+       u8 err_info_fmconfig;
+
+       atomic_t drop_packet;
+       u8 do_drop;
+
+       /*
+        * Software counters for the status bits defined by the
+        * associated error status registers
+        */
+       u64 cce_err_status_cnt[NUM_CCE_ERR_STATUS_COUNTERS];
+       u64 rcv_err_status_cnt[NUM_RCV_ERR_STATUS_COUNTERS];
+       u64 misc_err_status_cnt[NUM_MISC_ERR_STATUS_COUNTERS];
+       u64 send_pio_err_status_cnt[NUM_SEND_PIO_ERR_STATUS_COUNTERS];
+       u64 send_dma_err_status_cnt[NUM_SEND_DMA_ERR_STATUS_COUNTERS];
+       u64 send_egress_err_status_cnt[NUM_SEND_EGRESS_ERR_STATUS_COUNTERS];
+       u64 send_err_status_cnt[NUM_SEND_ERR_STATUS_COUNTERS];
+
+       /* Software counter that spans all contexts */
+       u64 sw_ctxt_err_status_cnt[NUM_SEND_CTXT_ERR_STATUS_COUNTERS];
+       /* Software counter that spans all DMA engines */
+       u64 sw_send_dma_eng_err_status_cnt[
+               NUM_SEND_DMA_ENG_ERR_STATUS_COUNTERS];
+       /* Software counter that aggregates all cce_err_status errors */
+       u64 sw_cce_err_status_aggregate;
+
+       /* receive interrupt functions */
+       rhf_rcv_function_ptr *rhf_rcv_function_map;
+       rhf_rcv_function_ptr normal_rhf_rcv_functions[8];
+
+       /*
+        * Handlers for outgoing data so that snoop/capture does not
+        * have to have its hooks in the send path
+        */
+       send_routine process_pio_send;
+       send_routine process_dma_send;
+       void (*pio_inline_send)(struct hfi1_devdata *dd, struct pio_buf *pbuf,
+                               u64 pbc, const void *from, size_t count);
+
+       /* OUI comes from the HW. Used everywhere as 3 separate bytes. */
+       u8 oui1;
+       u8 oui2;
+       u8 oui3;
+       /* Timer and counter used to detect RcvBufOvflCnt changes */
+       struct timer_list rcverr_timer;
+       u32 rcv_ovfl_cnt;
+
+       wait_queue_head_t event_queue;
+
+       /* Save the enabled LCB error bits */
+       u64 lcb_err_en;
+       u8 dc_shutdown;
+
+       /* receive context tail dummy address */
+       __le64 *rcvhdrtail_dummy_kvaddr;
+       dma_addr_t rcvhdrtail_dummy_physaddr;
+
+       bool eprom_available;   /* true if EPROM is available for this device */
+       bool aspm_supported;    /* Does HW support ASPM */
+       bool aspm_enabled;      /* ASPM state: enabled/disabled */
+       /* Serialize ASPM enable/disable between multiple verbs contexts */
+       spinlock_t aspm_lock;
+       /* Number of verbs contexts which have disabled ASPM */
+       atomic_t aspm_disabled_cnt;
+
+       struct hfi1_affinity *affinity;
+       struct kobject kobj;
+};
+
+/* 8051 firmware version helper */
+#define dc8051_ver(a, b) ((a) << 8 | (b))
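A quick illustration of the encoding, using a hypothetical firmware version: dc8051_ver(1, 27) evaluates to (1 << 8) | 27 = 0x011b, and the major/minor halves can be recovered again with (ver >> 8) and (ver & 0xff).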
+
+/* f_put_tid types */
+#define PT_EXPECTED 0
+#define PT_EAGER    1
+#define PT_INVALID  2
+
+struct tid_rb_node;
+struct mmu_rb_node;
+
+/* Private data for file operations */
+struct hfi1_filedata {
+       struct hfi1_ctxtdata *uctxt;
+       unsigned subctxt;
+       struct hfi1_user_sdma_comp_q *cq;
+       struct hfi1_user_sdma_pkt_q *pq;
+       /* for cpu affinity; -1 if none */
+       int rec_cpu_num;
+       u32 tid_n_pinned;
+       struct rb_root tid_rb_root;
+       struct tid_rb_node **entry_to_rb;
+       spinlock_t tid_lock; /* protect tid_[limit,used] counters */
+       u32 tid_limit;
+       u32 tid_used;
+       u32 *invalid_tids;
+       u32 invalid_tid_idx;
+       /* protect invalid_tids array and invalid_tid_idx */
+       spinlock_t invalid_lock;
+};
+
+extern struct list_head hfi1_dev_list;
+extern spinlock_t hfi1_devs_lock;
+struct hfi1_devdata *hfi1_lookup(int unit);
+extern u32 hfi1_cpulist_count;
+extern unsigned long *hfi1_cpulist;
+
+extern unsigned int snoop_drop_send;
+extern unsigned int snoop_force_capture;
+int hfi1_init(struct hfi1_devdata *, int);
+int hfi1_count_units(int *npresentp, int *nupp);
+int hfi1_count_active_units(void);
+
+int hfi1_diag_add(struct hfi1_devdata *);
+void hfi1_diag_remove(struct hfi1_devdata *);
+void handle_linkup_change(struct hfi1_devdata *dd, u32 linkup);
+
+void handle_user_interrupt(struct hfi1_ctxtdata *rcd);
+
+int hfi1_create_rcvhdrq(struct hfi1_devdata *, struct hfi1_ctxtdata *);
+int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *);
+int hfi1_create_ctxts(struct hfi1_devdata *dd);
+struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *, u32, int);
+void hfi1_init_pportdata(struct pci_dev *, struct hfi1_pportdata *,
+                        struct hfi1_devdata *, u8, u8);
+void hfi1_free_ctxtdata(struct hfi1_devdata *, struct hfi1_ctxtdata *);
+
+int handle_receive_interrupt(struct hfi1_ctxtdata *, int);
+int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *, int);
+int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *, int);
+void set_all_slowpath(struct hfi1_devdata *dd);
+
+/* receive packet handler dispositions */
+#define RCV_PKT_OK      0x0 /* keep going */
+#define RCV_PKT_LIMIT   0x1 /* stop, hit limit, start thread */
+#define RCV_PKT_DONE    0x2 /* stop, no more packets detected */
+
+/* calculate the current RHF address */
+static inline __le32 *get_rhf_addr(struct hfi1_ctxtdata *rcd)
+{
+       return (__le32 *)rcd->rcvhdrq + rcd->head + rcd->dd->rhf_offset;
+}
+
+int hfi1_reset_device(int);
+
+/* return the driver's idea of the logical OPA port state */
+static inline u32 driver_lstate(struct hfi1_pportdata *ppd)
+{
+       return ppd->lstate; /* use the cached value */
+}
+
+void receive_interrupt_work(struct work_struct *work);
+
+/* extract service channel from header and rhf */
+static inline int hdr2sc(struct hfi1_message_header *hdr, u64 rhf)
+{
+       return ((be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf) |
+              ((!!(rhf & RHF_DC_INFO_SMASK)) << 4);
+}
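A worked example with made-up values: if the first LRH word is 0x7123, bits 15:12 supply SC[3:0] = 0x7; with the RHF DC_INFO bit set, the extra bit supplies SC[4], so hdr2sc() returns 0x7 | 0x10 = 0x17 (SC 23), and with DC_INFO clear it returns SC 7.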
+
+static inline u16 generate_jkey(kuid_t uid)
+{
+       return from_kuid(current_user_ns(), uid) & 0xffff;
+}
+
+/*
+ * active_egress_rate
+ *
+ * returns the active egress rate in units of [10^6 bits/sec]
+ */
+static inline u32 active_egress_rate(struct hfi1_pportdata *ppd)
+{
+       u16 link_speed = ppd->link_speed_active;
+       u16 link_width = ppd->link_width_active;
+       u32 egress_rate;
+
+       if (link_speed == OPA_LINK_SPEED_25G)
+               egress_rate = 25000;
+       else /* assume OPA_LINK_SPEED_12_5G */
+               egress_rate = 12500;
+
+       switch (link_width) {
+       case OPA_LINK_WIDTH_4X:
+               egress_rate *= 4;
+               break;
+       case OPA_LINK_WIDTH_3X:
+               egress_rate *= 3;
+               break;
+       case OPA_LINK_WIDTH_2X:
+               egress_rate *= 2;
+               break;
+       default:
+               /* assume IB_WIDTH_1X */
+               break;
+       }
+
+       return egress_rate;
+}
+
+/*
+ * egress_cycles
+ *
+ * Returns the number of 'fabric clock cycles' to egress a packet
+ * of length 'len' bytes, at 'rate' Mbit/s. Since the fabric clock
+ * rate is (approximately) 805 MHz, the units of the returned value
+ * are (1/805 MHz).
+ */
+static inline u32 egress_cycles(u32 len, u32 rate)
+{
+       u32 cycles;
+
+       /*
+        * cycles is:
+        *
+        *          (length) [bits] / (rate) [bits/sec]
+        *  ---------------------------------------------------
+        *  fabric_clock_period == 1 /(805 * 10^6) [cycles/sec]
+        */
+
+       cycles = len * 8; /* bits */
+       cycles *= 805;
+       cycles /= rate;
+
+       return cycles;
+}
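A rough worked example, assuming a hypothetical 25G x 4X link:

	/* Illustration (hypothetical values):
	 *   active_egress_rate()            -> 25000 * 4 = 100000 Mbit/s
	 *   egress_cycles(2048, 100000):
	 *     2048 * 8          = 16384 bits
	 *     16384 * 805       = 13189120
	 *     13189120 / 100000 = 131 cycles  (~163 ns at 805 MHz,
	 *                                      matching 16384 bits / 100 Gbit/s)
	 */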
+
+void set_link_ipg(struct hfi1_pportdata *ppd);
+void process_becn(struct hfi1_pportdata *ppd, u8 sl,  u16 rlid, u32 lqpn,
+                 u32 rqpn, u8 svc_type);
+void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn,
+               u32 pkey, u32 slid, u32 dlid, u8 sc5,
+               const struct ib_grh *old_grh);
+#define PKEY_CHECK_INVALID -1
+int egress_pkey_check(struct hfi1_pportdata *ppd, __be16 *lrh, __be32 *bth,
+                     u8 sc5, int8_t s_pkey_index);
+
+#define PACKET_EGRESS_TIMEOUT 350
+static inline void pause_for_credit_return(struct hfi1_devdata *dd)
+{
+       /* Pause at least 1us, to ensure chip returns all credits */
+       u32 usec = cclock_to_ns(dd, PACKET_EGRESS_TIMEOUT) / 1000;
+
+       udelay(usec ? usec : 1);
+}
+
+/**
+ * sc_to_vlt() - reverse lookup sc to vl
+ * @dd: devdata
+ * @sc5: 5 bit sc
+ */
+static inline u8 sc_to_vlt(struct hfi1_devdata *dd, u8 sc5)
+{
+       unsigned seq;
+       u8 rval;
+
+       if (sc5 >= OPA_MAX_SCS)
+               return (u8)(0xff);
+
+       do {
+               seq = read_seqbegin(&dd->sc2vl_lock);
+               rval = *(((u8 *)dd->sc2vl) + sc5);
+       } while (read_seqretry(&dd->sc2vl_lock, seq));
+
+       return rval;
+}
+
+#define PKEY_MEMBER_MASK 0x8000
+#define PKEY_LOW_15_MASK 0x7fff
+
+/*
+ * ingress_pkey_matches_entry - return 1 if the pkey matches ent (ent
+ * being an entry from the ingress partition key table), return 0
+ * otherwise. Use the matching criteria for ingress partition keys
+ * specified in the OPAv1 spec., section 9.10.14.
+ */
+static inline int ingress_pkey_matches_entry(u16 pkey, u16 ent)
+{
+       u16 mkey = pkey & PKEY_LOW_15_MASK;
+       u16 ment = ent & PKEY_LOW_15_MASK;
+
+       if (mkey == ment) {
+               /*
+                * If pkey[15] is clear (limited partition member),
+                * is bit 15 in the corresponding table element
+                * clear (limited member)?
+                */
+               if (!(pkey & PKEY_MEMBER_MASK))
+                       return !!(ent & PKEY_MEMBER_MASK);
+               return 1;
+       }
+       return 0;
+}
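The membership-bit rule above can be summarized with a few hypothetical pkey/entry pairs:

	/* Illustration (hypothetical values):
	 *   pkey 0x8001 (full member)    vs ent 0x8001 -> 1
	 *   pkey 0x8001 (full member)    vs ent 0x0001 -> 1
	 *   pkey 0x0001 (limited member) vs ent 0x8001 -> 1
	 *   pkey 0x0001 (limited member) vs ent 0x0001 -> 0  (two limited members)
	 *   pkey 0x8001                  vs ent 0x8002 -> 0  (low 15 bits differ)
	 */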
+
+/*
+ * ingress_pkey_table_search - search the entire pkey table for
+ * an entry which matches 'pkey'. return 0 if a match is found,
+ * and 1 otherwise.
+ */
+static int ingress_pkey_table_search(struct hfi1_pportdata *ppd, u16 pkey)
+{
+       int i;
+
+       for (i = 0; i < MAX_PKEY_VALUES; i++) {
+               if (ingress_pkey_matches_entry(pkey, ppd->pkeys[i]))
+                       return 0;
+       }
+       return 1;
+}
+
+/*
+ * ingress_pkey_table_fail - record a failure of ingress pkey validation,
+ * i.e., increment port_rcv_constraint_errors for the port, and record
+ * the 'error info' for this failure.
+ */
+static void ingress_pkey_table_fail(struct hfi1_pportdata *ppd, u16 pkey,
+                                   u16 slid)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+
+       incr_cntr64(&ppd->port_rcv_constraint_errors);
+       if (!(dd->err_info_rcv_constraint.status & OPA_EI_STATUS_SMASK)) {
+               dd->err_info_rcv_constraint.status |= OPA_EI_STATUS_SMASK;
+               dd->err_info_rcv_constraint.slid = slid;
+               dd->err_info_rcv_constraint.pkey = pkey;
+       }
+}
+
+/*
+ * ingress_pkey_check - Return 0 if the ingress pkey is valid, return 1
+ * otherwise. Use the criteria in the OPAv1 spec, section 9.10.14. idx
+ * is a hint as to the best place in the partition key table to begin
+ * searching. This function should not be called on the data path, for
+ * performance reasons. On the data path the pkey check is expected to be
+ * done by HW and rcv_pkey_check() should be called instead.
+ */
+static inline int ingress_pkey_check(struct hfi1_pportdata *ppd, u16 pkey,
+                                    u8 sc5, u8 idx, u16 slid)
+{
+       if (!(ppd->part_enforce & HFI1_PART_ENFORCE_IN))
+               return 0;
+
+       /* If SC15, pkey[0:14] must be 0x7fff */
+       if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
+               goto bad;
+
+       /* Is the pkey = 0x0, or 0x8000? */
+       if ((pkey & PKEY_LOW_15_MASK) == 0)
+               goto bad;
+
+       /* The most likely matching pkey has index 'idx' */
+       if (ingress_pkey_matches_entry(pkey, ppd->pkeys[idx]))
+               return 0;
+
+       /* no match - try the whole table */
+       if (!ingress_pkey_table_search(ppd, pkey))
+               return 0;
+
+bad:
+       ingress_pkey_table_fail(ppd, pkey, slid);
+       return 1;
+}
+
+/*
+ * rcv_pkey_check - Return 0 if the ingress pkey is valid, return 1
+ * otherwise. It only ensures the pkey is valid for QP0. This function
+ * should be called on the data path instead of ingress_pkey_check
+ * as on data path, pkey check is done by HW (except for QP0).
+ */
+static inline int rcv_pkey_check(struct hfi1_pportdata *ppd, u16 pkey,
+                                u8 sc5, u16 slid)
+{
+       if (!(ppd->part_enforce & HFI1_PART_ENFORCE_IN))
+               return 0;
+
+       /* If SC15, pkey[0:14] must be 0x7fff */
+       if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
+               goto bad;
+
+       return 0;
+bad:
+       ingress_pkey_table_fail(ppd, pkey, slid);
+       return 1;
+}
+
+/* MTU handling */
+
+/* MTU enumeration, 256-4k match IB */
+#define OPA_MTU_0     0
+#define OPA_MTU_256   1
+#define OPA_MTU_512   2
+#define OPA_MTU_1024  3
+#define OPA_MTU_2048  4
+#define OPA_MTU_4096  5
+
+u32 lrh_max_header_bytes(struct hfi1_devdata *dd);
+int mtu_to_enum(u32 mtu, int default_if_bad);
+u16 enum_to_mtu(int);
+static inline int valid_ib_mtu(unsigned int mtu)
+{
+       return mtu == 256 || mtu == 512 ||
+               mtu == 1024 || mtu == 2048 ||
+               mtu == 4096;
+}
+
+static inline int valid_opa_max_mtu(unsigned int mtu)
+{
+       return mtu >= 2048 &&
+               (valid_ib_mtu(mtu) || mtu == 8192 || mtu == 10240);
+}
+
+int set_mtu(struct hfi1_pportdata *);
+
+int hfi1_set_lid(struct hfi1_pportdata *, u32, u8);
+void hfi1_disable_after_error(struct hfi1_devdata *);
+int hfi1_set_uevent_bits(struct hfi1_pportdata *, const int);
+int hfi1_rcvbuf_validate(u32, u8, u16 *);
+
+int fm_get_table(struct hfi1_pportdata *, int, void *);
+int fm_set_table(struct hfi1_pportdata *, int, void *);
+
+void set_up_vl15(struct hfi1_devdata *dd, u8 vau, u16 vl15buf);
+void reset_link_credits(struct hfi1_devdata *dd);
+void assign_remote_cm_au_table(struct hfi1_devdata *dd, u8 vcu);
+
+int snoop_recv_handler(struct hfi1_packet *packet);
+int snoop_send_dma_handler(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+                          u64 pbc);
+int snoop_send_pio_handler(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+                          u64 pbc);
+void snoop_inline_pio_send(struct hfi1_devdata *dd, struct pio_buf *pbuf,
+                          u64 pbc, const void *from, size_t count);
+int set_buffer_control(struct hfi1_pportdata *ppd, struct buffer_control *bc);
+
+static inline struct hfi1_devdata *dd_from_ppd(struct hfi1_pportdata *ppd)
+{
+       return ppd->dd;
+}
+
+static inline struct hfi1_devdata *dd_from_dev(struct hfi1_ibdev *dev)
+{
+       return container_of(dev, struct hfi1_devdata, verbs_dev);
+}
+
+static inline struct hfi1_devdata *dd_from_ibdev(struct ib_device *ibdev)
+{
+       return dd_from_dev(to_idev(ibdev));
+}
+
+static inline struct hfi1_pportdata *ppd_from_ibp(struct hfi1_ibport *ibp)
+{
+       return container_of(ibp, struct hfi1_pportdata, ibport_data);
+}
+
+static inline struct hfi1_ibdev *dev_from_rdi(struct rvt_dev_info *rdi)
+{
+       return container_of(rdi, struct hfi1_ibdev, rdi);
+}
+
+static inline struct hfi1_ibport *to_iport(struct ib_device *ibdev, u8 port)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       unsigned pidx = port - 1; /* IB number port from 1, hdw from 0 */
+
+       WARN_ON(pidx >= dd->num_pports);
+       return &dd->pport[pidx].ibport_data;
+}
+
+/*
+ * Return the indexed PKEY from the port PKEY table.
+ */
+static inline u16 hfi1_get_pkey(struct hfi1_ibport *ibp, unsigned index)
+{
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       u16 ret;
+
+       if (index >= ARRAY_SIZE(ppd->pkeys))
+               ret = 0;
+       else
+               ret = ppd->pkeys[index];
+
+       return ret;
+}
+
+/*
+ * Readers of cc_state must call get_cc_state() under rcu_read_lock().
+ * Writers of cc_state must call get_cc_state() under cc_state_lock.
+ */
+static inline struct cc_state *get_cc_state(struct hfi1_pportdata *ppd)
+{
+       return rcu_dereference(ppd->cc_state);
+}
+
+/*
+ * values for dd->flags (_device_ related flags)
+ */
+#define HFI1_INITTED           0x1    /* chip and driver up and initted */
+#define HFI1_PRESENT           0x2    /* chip accesses can be done */
+#define HFI1_FROZEN            0x4    /* chip in SPC freeze */
+#define HFI1_HAS_SDMA_TIMEOUT  0x8
+#define HFI1_HAS_SEND_DMA      0x10   /* Supports Send DMA */
+#define HFI1_FORCED_FREEZE     0x80   /* driver forced freeze mode */
+
+/* IB dword length mask in PBC (lower 11 bits); same for all chips */
+#define HFI1_PBC_LENGTH_MASK                     ((1 << 11) - 1)
+
+/* ctxt_flag bit offsets */
+               /* context has been setup */
+#define HFI1_CTXT_SETUP_DONE 1
+               /* waiting for a packet to arrive */
+#define HFI1_CTXT_WAITING_RCV   2
+               /* master has not finished initializing */
+#define HFI1_CTXT_MASTER_UNINIT 4
+               /* waiting for an urgent packet to arrive */
+#define HFI1_CTXT_WAITING_URG 5
+
+/* free up any allocated data at close */
+struct hfi1_devdata *hfi1_init_dd(struct pci_dev *,
+                                 const struct pci_device_id *);
+void hfi1_free_devdata(struct hfi1_devdata *);
+void cc_state_reclaim(struct rcu_head *rcu);
+struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra);
+
+/* LED beaconing functions */
+void hfi1_start_led_override(struct hfi1_pportdata *ppd, unsigned int timeon,
+                            unsigned int timeoff);
+void shutdown_led_override(struct hfi1_pportdata *ppd);
+
+#define HFI1_CREDIT_RETURN_RATE (100)
+
+/*
+ * The number of words for the KDETH protocol field.  If this is
+ * larger than the actual field used, then part of the payload
+ * will be in the header.
+ *
+ * Optimally, we want this sized so that a typical case will
+ * use full cache lines.  The typical local KDETH header would
+ * be:
+ *
+ *     Bytes   Field
+ *       8     LRH
+ *      12     BTH
+ *      ??     KDETH
+ *       8     RHF
+ *     ---
+ *      28 + KDETH
+ *
+ * For a 64-byte cache line, KDETH would need to be 36 bytes or 9 DWORDS
+ */
+#define DEFAULT_RCVHDRSIZE 9
+
+/*
+ * Maximal header byte count:
+ *
+ *     Bytes   Field
+ *       8     LRH
+ *      40     GRH (optional)
+ *      12     BTH
+ *      ??     KDETH
+ *       8     RHF
+ *     ---
+ *      68 + KDETH
+ *
+ * We also want to maintain a cache line alignment to assist DMA'ing
+ * of the header bytes.  Round up to a good size.
+ */
+#define DEFAULT_RCVHDR_ENTSIZE 32
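As a sanity check on the two sizing comments above: the local case is 8 (LRH) + 12 (BTH) + 8 (RHF) = 28 bytes, so a 64-byte cache line leaves 36 bytes = 9 DWORDs for KDETH, which is where DEFAULT_RCVHDRSIZE = 9 comes from. The maximal case is 68 bytes + KDETH, and rounding up to DEFAULT_RCVHDR_ENTSIZE = 32 DWORDs (128 bytes, two cache lines) leaves up to 60 bytes for KDETH.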
+
+bool hfi1_can_pin_pages(struct hfi1_devdata *, u32, u32);
+int hfi1_acquire_user_pages(unsigned long, size_t, bool, struct page **);
+void hfi1_release_user_pages(struct mm_struct *, struct page **, size_t, bool);
+
+static inline void clear_rcvhdrtail(const struct hfi1_ctxtdata *rcd)
+{
+       *((u64 *)rcd->rcvhdrtail_kvaddr) = 0ULL;
+}
+
+static inline u32 get_rcvhdrtail(const struct hfi1_ctxtdata *rcd)
+{
+       /*
+        * volatile because it's a DMA target from the chip, routine is
+        * inlined, and we don't want register caching or reordering.
+        */
+       return (u32)le64_to_cpu(*rcd->rcvhdrtail_kvaddr);
+}
+
+/*
+ * sysfs interface.
+ */
+
+extern const char ib_hfi1_version[];
+
+int hfi1_device_create(struct hfi1_devdata *);
+void hfi1_device_remove(struct hfi1_devdata *);
+
+int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num,
+                          struct kobject *kobj);
+int hfi1_verbs_register_sysfs(struct hfi1_devdata *);
+void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *);
+/* Hook for sysfs read of QSFP */
+int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len);
+
+int hfi1_pcie_init(struct pci_dev *, const struct pci_device_id *);
+void hfi1_pcie_cleanup(struct pci_dev *);
+int hfi1_pcie_ddinit(struct hfi1_devdata *, struct pci_dev *,
+                    const struct pci_device_id *);
+void hfi1_pcie_ddcleanup(struct hfi1_devdata *);
+void hfi1_pcie_flr(struct hfi1_devdata *);
+int pcie_speeds(struct hfi1_devdata *);
+void request_msix(struct hfi1_devdata *, u32 *, struct hfi1_msix_entry *);
+void hfi1_enable_intx(struct pci_dev *);
+void restore_pci_variables(struct hfi1_devdata *dd);
+int do_pcie_gen3_transition(struct hfi1_devdata *dd);
+int parse_platform_config(struct hfi1_devdata *dd);
+int get_platform_config_field(struct hfi1_devdata *dd,
+                             enum platform_config_table_type_encoding
+                             table_type, int table_index, int field_index,
+                             u32 *data, u32 len);
+
+const char *get_unit_name(int unit);
+const char *get_card_name(struct rvt_dev_info *rdi);
+struct pci_dev *get_pci_dev(struct rvt_dev_info *rdi);
+
+/*
+ * Flush write combining store buffers (if present) and perform a write
+ * barrier.
+ */
+static inline void flush_wc(void)
+{
+       asm volatile("sfence" : : : "memory");
+}
+
+void handle_eflags(struct hfi1_packet *packet);
+int process_receive_ib(struct hfi1_packet *packet);
+int process_receive_bypass(struct hfi1_packet *packet);
+int process_receive_error(struct hfi1_packet *packet);
+int kdeth_process_expected(struct hfi1_packet *packet);
+int kdeth_process_eager(struct hfi1_packet *packet);
+int process_receive_invalid(struct hfi1_packet *packet);
+
+extern rhf_rcv_function_ptr snoop_rhf_rcv_functions[8];
+
+void update_sge(struct rvt_sge_state *ss, u32 length);
+
+/* global module parameter variables */
+extern unsigned int hfi1_max_mtu;
+extern unsigned int hfi1_cu;
+extern unsigned int user_credit_return_threshold;
+extern int num_user_contexts;
+extern unsigned n_krcvqs;
+extern uint krcvqs[];
+extern int krcvqsset;
+extern uint kdeth_qp;
+extern uint loopback;
+extern uint quick_linkup;
+extern uint rcv_intr_timeout;
+extern uint rcv_intr_count;
+extern uint rcv_intr_dynamic;
+extern ushort link_crc_mask;
+
+extern struct mutex hfi1_mutex;
+
+/* Number of seconds before our card status check...  */
+#define STATUS_TIMEOUT 60
+
+#define DRIVER_NAME            "hfi1"
+#define HFI1_USER_MINOR_BASE     0
+#define HFI1_TRACE_MINOR         127
+#define HFI1_DIAGPKT_MINOR       128
+#define HFI1_DIAG_MINOR_BASE     129
+#define HFI1_SNOOP_CAPTURE_BASE  200
+#define HFI1_NMINORS             255
+
+#define PCI_VENDOR_ID_INTEL 0x8086
+#define PCI_DEVICE_ID_INTEL0 0x24f0
+#define PCI_DEVICE_ID_INTEL1 0x24f1
+
+#define HFI1_PKT_USER_SC_INTEGRITY                                         \
+       (SEND_CTXT_CHECK_ENABLE_DISALLOW_NON_KDETH_PACKETS_SMASK            \
+       | SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK           \
+       | SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_SMASK              \
+       | SEND_CTXT_CHECK_ENABLE_DISALLOW_GRH_SMASK)
+
+#define HFI1_PKT_KERNEL_SC_INTEGRITY                                       \
+       (SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK)
+
+static inline u64 hfi1_pkt_default_send_ctxt_mask(struct hfi1_devdata *dd,
+                                                 u16 ctxt_type)
+{
+       u64 base_sc_integrity =
+       SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK
+       | SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK
+       | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK
+       | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK
+       | SEND_CTXT_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK
+       | SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK
+       | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK
+       | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK
+       | SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK
+       | SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_SMASK
+       | SEND_CTXT_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK
+       | SEND_CTXT_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK
+       | SEND_CTXT_CHECK_ENABLE_CHECK_OPCODE_SMASK
+       | SEND_CTXT_CHECK_ENABLE_CHECK_SLID_SMASK
+       | SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK
+       | SEND_CTXT_CHECK_ENABLE_CHECK_VL_SMASK
+       | SEND_CTXT_CHECK_ENABLE_CHECK_ENABLE_SMASK;
+
+       if (ctxt_type == SC_USER)
+               base_sc_integrity |= HFI1_PKT_USER_SC_INTEGRITY;
+       else
+               base_sc_integrity |= HFI1_PKT_KERNEL_SC_INTEGRITY;
+
+       if (is_ax(dd))
+               /* turn off send-side job key checks - A0 */
+               return base_sc_integrity &
+                      ~SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
+       return base_sc_integrity;
+}
+
+static inline u64 hfi1_pkt_base_sdma_integrity(struct hfi1_devdata *dd)
+{
+       u64 base_sdma_integrity =
+       SEND_DMA_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK
+       | SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK
+       | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK
+       | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK
+       | SEND_DMA_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK
+       | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK
+       | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK
+       | SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK
+       | SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_SMASK
+       | SEND_DMA_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK
+       | SEND_DMA_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK
+       | SEND_DMA_CHECK_ENABLE_CHECK_OPCODE_SMASK
+       | SEND_DMA_CHECK_ENABLE_CHECK_SLID_SMASK
+       | SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK
+       | SEND_DMA_CHECK_ENABLE_CHECK_VL_SMASK
+       | SEND_DMA_CHECK_ENABLE_CHECK_ENABLE_SMASK;
+
+       if (is_ax(dd))
+               /* turn off send-side job key checks - A0 */
+               return base_sdma_integrity &
+                      ~SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
+       return base_sdma_integrity;
+}
+
+/*
+ * hfi1_early_err is used (only!) to print early errors before devdata is
+ * allocated, or when dd->pcidev may not be valid, and at the tail end of
+ * cleanup when devdata may have been freed, etc.  hfi1_dev_porterr is
+ * the same as dd_dev_err, but is used when the message really needs
+ * the IB port# to be definitive as to what's happening.
+ */
+#define hfi1_early_err(dev, fmt, ...) \
+       dev_err(dev, fmt, ##__VA_ARGS__)
+
+#define hfi1_early_info(dev, fmt, ...) \
+       dev_info(dev, fmt, ##__VA_ARGS__)
+
+#define dd_dev_emerg(dd, fmt, ...) \
+       dev_emerg(&(dd)->pcidev->dev, "%s: " fmt, \
+                 get_unit_name((dd)->unit), ##__VA_ARGS__)
+#define dd_dev_err(dd, fmt, ...) \
+       dev_err(&(dd)->pcidev->dev, "%s: " fmt, \
+                       get_unit_name((dd)->unit), ##__VA_ARGS__)
+#define dd_dev_warn(dd, fmt, ...) \
+       dev_warn(&(dd)->pcidev->dev, "%s: " fmt, \
+                       get_unit_name((dd)->unit), ##__VA_ARGS__)
+
+#define dd_dev_warn_ratelimited(dd, fmt, ...) \
+       dev_warn_ratelimited(&(dd)->pcidev->dev, "%s: " fmt, \
+                       get_unit_name((dd)->unit), ##__VA_ARGS__)
+
+#define dd_dev_info(dd, fmt, ...) \
+       dev_info(&(dd)->pcidev->dev, "%s: " fmt, \
+                       get_unit_name((dd)->unit), ##__VA_ARGS__)
+
+#define dd_dev_dbg(dd, fmt, ...) \
+       dev_dbg(&(dd)->pcidev->dev, "%s: " fmt, \
+               get_unit_name((dd)->unit), ##__VA_ARGS__)
+
+#define hfi1_dev_porterr(dd, port, fmt, ...) \
+       dev_err(&(dd)->pcidev->dev, "%s: port %u: " fmt, \
+                       get_unit_name((dd)->unit), (port), ##__VA_ARGS__)
+
+/*
+ * this is used for formatting hw error messages...
+ */
+struct hfi1_hwerror_msgs {
+       u64 mask;
+       const char *msg;
+       size_t sz;
+};
+
+/* in intr.c... */
+void hfi1_format_hwerrors(u64 hwerrs,
+                         const struct hfi1_hwerror_msgs *hwerrmsgs,
+                         size_t nhwerrmsgs, char *msg, size_t lmsg);
+
+#define USER_OPCODE_CHECK_VAL 0xC0
+#define USER_OPCODE_CHECK_MASK 0xC0
+#define OPCODE_CHECK_VAL_DISABLED 0x0
+#define OPCODE_CHECK_MASK_DISABLED 0x0
+
+static inline void hfi1_reset_cpu_counters(struct hfi1_devdata *dd)
+{
+       struct hfi1_pportdata *ppd;
+       int i;
+
+       dd->z_int_counter = get_all_cpu_total(dd->int_counter);
+       dd->z_rcv_limit = get_all_cpu_total(dd->rcv_limit);
+       dd->z_send_schedule = get_all_cpu_total(dd->send_schedule);
+
+       ppd = (struct hfi1_pportdata *)(dd + 1);
+       for (i = 0; i < dd->num_pports; i++, ppd++) {
+               ppd->ibport_data.rvp.z_rc_acks =
+                       get_all_cpu_total(ppd->ibport_data.rvp.rc_acks);
+               ppd->ibport_data.rvp.z_rc_qacks =
+                       get_all_cpu_total(ppd->ibport_data.rvp.rc_qacks);
+       }
+}
+
+/* Control LED state */
+static inline void setextled(struct hfi1_devdata *dd, u32 on)
+{
+       if (on)
+               write_csr(dd, DCC_CFG_LED_CNTRL, 0x1F);
+       else
+               write_csr(dd, DCC_CFG_LED_CNTRL, 0x10);
+}
+
+/* return the i2c resource given the target */
+static inline u32 i2c_target(u32 target)
+{
+       return target ? CR_I2C2 : CR_I2C1;
+}
+
+/* return the i2c chain chip resource that this HFI uses for QSFP */
+static inline u32 qsfp_resource(struct hfi1_devdata *dd)
+{
+       return i2c_target(dd->hfi1_id);
+}
+
+int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp);
+
+#endif                          /* _HFI1_KERNEL_H */
diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c
new file mode 100644 (file)
index 0000000..5cc492e
--- /dev/null
@@ -0,0 +1,1818 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/delay.h>
+#include <linux/idr.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/hrtimer.h>
+#include <rdma/rdma_vt.h>
+
+#include "hfi.h"
+#include "device.h"
+#include "common.h"
+#include "trace.h"
+#include "mad.h"
+#include "sdma.h"
+#include "debugfs.h"
+#include "verbs.h"
+#include "aspm.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) DRIVER_NAME ": " fmt
+
+/*
+ * min buffers we want to have per context, after driver
+ */
+#define HFI1_MIN_USER_CTXT_BUFCNT 7
+
+#define HFI1_MIN_HDRQ_EGRBUF_CNT 2
+#define HFI1_MAX_HDRQ_EGRBUF_CNT 16352
+#define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */
+#define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */
+
+/*
+ * Number of user receive contexts we are configured to use (to allow for more
+ * pio buffers per ctxt, etc.)  Zero means use one user context per CPU.
+ */
+int num_user_contexts = -1;
+module_param_named(num_user_contexts, num_user_contexts, uint, S_IRUGO);
+MODULE_PARM_DESC(
+       num_user_contexts, "Set max number of user contexts to use");
+
+uint krcvqs[RXE_NUM_DATA_VL];
+int krcvqsset;
+module_param_array(krcvqs, uint, &krcvqsset, S_IRUGO);
+MODULE_PARM_DESC(krcvqs, "Array of the number of non-control kernel receive queues by VL");
+
+/* computed based on above array */
+unsigned n_krcvqs;
+
+static unsigned hfi1_rcvarr_split = 25;
+module_param_named(rcvarr_split, hfi1_rcvarr_split, uint, S_IRUGO);
+MODULE_PARM_DESC(rcvarr_split, "Percent of context's RcvArray entries used for Eager buffers");
+
+static uint eager_buffer_size = (2 << 20); /* 2MB */
+module_param(eager_buffer_size, uint, S_IRUGO);
+MODULE_PARM_DESC(eager_buffer_size, "Size of the eager buffers, default: 2MB");
+
+static uint rcvhdrcnt = 2048; /* 2x the max eager buffer count */
+module_param_named(rcvhdrcnt, rcvhdrcnt, uint, S_IRUGO);
+MODULE_PARM_DESC(rcvhdrcnt, "Receive header queue count (default 2048)");
+
+static uint hfi1_hdrq_entsize = 32;
+module_param_named(hdrq_entsize, hfi1_hdrq_entsize, uint, S_IRUGO);
+MODULE_PARM_DESC(hdrq_entsize, "Size of header queue entries: 2 - 8B, 16 - 64B, 32 - 128B (default)");
+
+unsigned int user_credit_return_threshold = 33;        /* default is 33% */
+module_param(user_credit_return_threshold, uint, S_IRUGO);
+MODULE_PARM_DESC(user_credit_return_threshold, "Credit return threshold for user send contexts, return when unreturned credits passes this many blocks (in percent of allocated blocks, 0 is off)");
+
+static inline u64 encode_rcv_header_entry_size(u16);
+
+static struct idr hfi1_unit_table;
+u32 hfi1_cpulist_count;
+unsigned long *hfi1_cpulist;
+
+/*
+ * Common code for creating the receive context array.
+ */
+int hfi1_create_ctxts(struct hfi1_devdata *dd)
+{
+       unsigned i;
+       int ret;
+
+       /* Control context has to be always 0 */
+       BUILD_BUG_ON(HFI1_CTRL_CTXT != 0);
+
+       dd->rcd = kzalloc_node(dd->num_rcv_contexts * sizeof(*dd->rcd),
+                              GFP_KERNEL, dd->node);
+       if (!dd->rcd)
+               goto nomem;
+
+       /* create one or more kernel contexts */
+       for (i = 0; i < dd->first_user_ctxt; ++i) {
+               struct hfi1_pportdata *ppd;
+               struct hfi1_ctxtdata *rcd;
+
+               ppd = dd->pport + (i % dd->num_pports);
+               rcd = hfi1_create_ctxtdata(ppd, i, dd->node);
+               if (!rcd) {
+                       dd_dev_err(dd,
+                                  "Unable to allocate kernel receive context, failing\n");
+                       goto nomem;
+               }
+               /*
+                * Set up the kernel context flags here and now because they
+                * use default values for all receive side memories.  User
+                * contexts will be handled as they are created.
+                */
+               rcd->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) |
+                       HFI1_CAP_KGET(NODROP_RHQ_FULL) |
+                       HFI1_CAP_KGET(NODROP_EGR_FULL) |
+                       HFI1_CAP_KGET(DMA_RTAIL);
+
+               /* Control context must use DMA_RTAIL */
+               if (rcd->ctxt == HFI1_CTRL_CTXT)
+                       rcd->flags |= HFI1_CAP_DMA_RTAIL;
+               rcd->seq_cnt = 1;
+
+               rcd->sc = sc_alloc(dd, SC_ACK, rcd->rcvhdrqentsize, dd->node);
+               if (!rcd->sc) {
+                       dd_dev_err(dd,
+                                  "Unable to allocate kernel send context, failing\n");
+                       dd->rcd[rcd->ctxt] = NULL;
+                       hfi1_free_ctxtdata(dd, rcd);
+                       goto nomem;
+               }
+
+               ret = hfi1_init_ctxt(rcd->sc);
+               if (ret < 0) {
+                       dd_dev_err(dd,
+                                  "Failed to setup kernel receive context, failing\n");
+                       sc_free(rcd->sc);
+                       dd->rcd[rcd->ctxt] = NULL;
+                       hfi1_free_ctxtdata(dd, rcd);
+                       ret = -EFAULT;
+                       goto bail;
+               }
+       }
+
+       /*
+        * Initialize aspm, to be done after gen3 transition and setting up
+        * contexts and before enabling interrupts
+        */
+       aspm_init(dd);
+
+       return 0;
+nomem:
+       ret = -ENOMEM;
+bail:
+       kfree(dd->rcd);
+       dd->rcd = NULL;
+       return ret;
+}
+
+/*
+ * Common code for user and kernel context setup.
+ */
+struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt,
+                                          int numa)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       struct hfi1_ctxtdata *rcd;
+       unsigned kctxt_ngroups = 0;
+       u32 base;
+
+       if (dd->rcv_entries.nctxt_extra >
+           dd->num_rcv_contexts - dd->first_user_ctxt)
+               kctxt_ngroups = (dd->rcv_entries.nctxt_extra -
+                                (dd->num_rcv_contexts - dd->first_user_ctxt));
+       rcd = kzalloc(sizeof(*rcd), GFP_KERNEL);
+       if (rcd) {
+               u32 rcvtids, max_entries;
+
+               hfi1_cdbg(PROC, "setting up context %u\n", ctxt);
+
+               INIT_LIST_HEAD(&rcd->qp_wait_list);
+               rcd->ppd = ppd;
+               rcd->dd = dd;
+               rcd->cnt = 1;
+               rcd->ctxt = ctxt;
+               dd->rcd[ctxt] = rcd;
+               rcd->numa_id = numa;
+               rcd->rcv_array_groups = dd->rcv_entries.ngroups;
+
+               mutex_init(&rcd->exp_lock);
+
+               /*
+                * Calculate the context's RcvArray entry starting point.
+                * We do this here because we have to take into account all
+                * the RcvArray entries that previous contexts would have
+                * taken, and we have to account for any extra groups
+                * assigned to the kernel or user contexts.
+                */
+               if (ctxt < dd->first_user_ctxt) {
+                       if (ctxt < kctxt_ngroups) {
+                               base = ctxt * (dd->rcv_entries.ngroups + 1);
+                               rcd->rcv_array_groups++;
+                       } else
+                               base = kctxt_ngroups +
+                                       (ctxt * dd->rcv_entries.ngroups);
+               } else {
+                       u16 ct = ctxt - dd->first_user_ctxt;
+
+                       base = ((dd->n_krcv_queues * dd->rcv_entries.ngroups) +
+                               kctxt_ngroups);
+                       if (ct < dd->rcv_entries.nctxt_extra) {
+                               base += ct * (dd->rcv_entries.ngroups + 1);
+                               rcd->rcv_array_groups++;
+                       } else
+                               base += dd->rcv_entries.nctxt_extra +
+                                       (ct * dd->rcv_entries.ngroups);
+               }
+               rcd->eager_base = base * dd->rcv_entries.group_size;
+
+               /* Validate and initialize Rcv Hdr Q variables */
+               if (rcvhdrcnt % HDRQ_INCREMENT) {
+                       dd_dev_err(dd,
+                                  "ctxt%u: header queue count %d must be divisible by %lu\n",
+                                  rcd->ctxt, rcvhdrcnt, HDRQ_INCREMENT);
+                       goto bail;
+               }
+               rcd->rcvhdrq_cnt = rcvhdrcnt;
+               rcd->rcvhdrqentsize = hfi1_hdrq_entsize;
+               /*
+                * Simple Eager buffer allocation: we have already pre-allocated
+                * the number of RcvArray entry groups. Each ctxtdata structure
+                * holds the number of groups for that context.
+                *
+                * To follow CSR requirements and maintain cacheline alignment,
+                * make sure all sizes and bases are multiples of group_size.
+                *
+                * The expected entry count is what is left after assigning
+                * eager.
+                */
+               max_entries = rcd->rcv_array_groups *
+                       dd->rcv_entries.group_size;
+               rcvtids = ((max_entries * hfi1_rcvarr_split) / 100);
+               rcd->egrbufs.count = round_down(rcvtids,
+                                               dd->rcv_entries.group_size);
+               if (rcd->egrbufs.count > MAX_EAGER_ENTRIES) {
+                       dd_dev_err(dd, "ctxt%u: requested too many RcvArray entries.\n",
+                                  rcd->ctxt);
+                       rcd->egrbufs.count = MAX_EAGER_ENTRIES;
+               }
+               hfi1_cdbg(PROC,
+                         "ctxt%u: max Eager buffer RcvArray entries: %u\n",
+                         rcd->ctxt, rcd->egrbufs.count);
+
+               /*
+                * Allocate array that will hold the eager buffer accounting
+                * data.
+                * This will allocate the maximum possible buffer count based
+                * on the value of the RcvArray split parameter.
+                * The resulting value will be rounded down to the closest
+                * multiple of dd->rcv_entries.group_size.
+                */
+               rcd->egrbufs.buffers = kcalloc(rcd->egrbufs.count,
+                                              sizeof(*rcd->egrbufs.buffers),
+                                              GFP_KERNEL);
+               if (!rcd->egrbufs.buffers)
+                       goto bail;
+               rcd->egrbufs.rcvtids = kcalloc(rcd->egrbufs.count,
+                                              sizeof(*rcd->egrbufs.rcvtids),
+                                              GFP_KERNEL);
+               if (!rcd->egrbufs.rcvtids)
+                       goto bail;
+               rcd->egrbufs.size = eager_buffer_size;
+               /*
+                * The size of the buffers programmed into the RcvArray
+                * entries needs to be big enough to handle the highest
+                * MTU supported.
+                */
+               if (rcd->egrbufs.size < hfi1_max_mtu) {
+                       rcd->egrbufs.size = __roundup_pow_of_two(hfi1_max_mtu);
+                       hfi1_cdbg(PROC,
+                                 "ctxt%u: eager bufs size too small. Adjusting to %zu\n",
+                                   rcd->ctxt, rcd->egrbufs.size);
+               }
+               rcd->egrbufs.rcvtid_size = HFI1_MAX_EAGER_BUFFER_SIZE;
+
+               if (ctxt < dd->first_user_ctxt) { /* N/A for PSM contexts */
+                       rcd->opstats = kzalloc(sizeof(*rcd->opstats),
+                               GFP_KERNEL);
+                       if (!rcd->opstats)
+                               goto bail;
+               }
+       }
+       return rcd;
+bail:
+       kfree(rcd->egrbufs.rcvtids);
+       kfree(rcd->egrbufs.buffers);
+       kfree(rcd);
+       return NULL;
+}
+
+/*
+ * Convert a receive header entry size to the encoding used in the CSR.
+ *
+ * Return a zero if the given size is invalid.
+ */
+static inline u64 encode_rcv_header_entry_size(u16 size)
+{
+       /* there are only 3 valid receive header entry sizes */
+       if (size == 2)
+               return 1;
+       if (size == 16)
+               return 2;
+       if (size == 32)
+               return 4;
+       return 0; /* invalid */
+}
+
+/*
+ * Select the largest ccti value over all SLs to determine the intra-
+ * packet gap for the link.
+ *
+ * called with cca_timer_lock held (to protect access to cca_timer
+ * array), and rcu_read_lock() (to protect access to cc_state).
+ */
+void set_link_ipg(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       struct cc_state *cc_state;
+       int i;
+       u16 cce, ccti_limit, max_ccti = 0;
+       u16 shift, mult;
+       u64 src;
+       u32 current_egress_rate; /* Mbits /sec */
+       u32 max_pkt_time;
+       /*
+        * max_pkt_time is the maximum packet egress time in units
+        * of the fabric clock period 1/(805 MHz).
+        */
+
+       cc_state = get_cc_state(ppd);
+
+       if (!cc_state)
+               /*
+                * This should _never_ happen - rcu_read_lock() is held,
+                * and set_link_ipg() should not be called if cc_state
+                * is NULL.
+                */
+               return;
+
+       for (i = 0; i < OPA_MAX_SLS; i++) {
+               u16 ccti = ppd->cca_timer[i].ccti;
+
+               if (ccti > max_ccti)
+                       max_ccti = ccti;
+       }
+
+       ccti_limit = cc_state->cct.ccti_limit;
+       if (max_ccti > ccti_limit)
+               max_ccti = ccti_limit;
+
+       cce = cc_state->cct.entries[max_ccti].entry;
+       shift = (cce & 0xc000) >> 14;
+       mult = (cce & 0x3fff);
+
+       current_egress_rate = active_egress_rate(ppd);
+
+       max_pkt_time = egress_cycles(ppd->ibmaxlen, current_egress_rate);
+
+       src = (max_pkt_time >> shift) * mult;
+
+       src &= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK;
+       src <<= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT;
+
+       write_csr(dd, SEND_STATIC_RATE_CONTROL, src);
+}
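A worked decode of a hypothetical CCT entry: cce = 0x4008 gives shift = (0x4008 & 0xc000) >> 14 = 1 and mult = 0x4008 & 0x3fff = 8, so a max_pkt_time of 1000 fabric-clock cycles yields src = (1000 >> 1) * 8 = 4000 before it is masked and shifted into the SEND_STATIC_RATE_CONTROL CSR.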
+
+static enum hrtimer_restart cca_timer_fn(struct hrtimer *t)
+{
+       struct cca_timer *cca_timer;
+       struct hfi1_pportdata *ppd;
+       int sl;
+       u16 ccti_timer, ccti_min;
+       struct cc_state *cc_state;
+       unsigned long flags;
+       enum hrtimer_restart ret = HRTIMER_NORESTART;
+
+       cca_timer = container_of(t, struct cca_timer, hrtimer);
+       ppd = cca_timer->ppd;
+       sl = cca_timer->sl;
+
+       rcu_read_lock();
+
+       cc_state = get_cc_state(ppd);
+
+       if (!cc_state) {
+               rcu_read_unlock();
+               return HRTIMER_NORESTART;
+       }
+
+       /*
+        * 1) decrement ccti for SL
+        * 2) calculate IPG for link (set_link_ipg())
+        * 3) restart timer, unless ccti is at min value
+        */
+
+       ccti_min = cc_state->cong_setting.entries[sl].ccti_min;
+       ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer;
+
+       spin_lock_irqsave(&ppd->cca_timer_lock, flags);
+
+       if (cca_timer->ccti > ccti_min) {
+               cca_timer->ccti--;
+               set_link_ipg(ppd);
+       }
+
+       if (cca_timer->ccti > ccti_min) {
+               unsigned long nsec = 1024 * ccti_timer;
+               /* ccti_timer is in units of 1.024 usec */
+               hrtimer_forward_now(t, ns_to_ktime(nsec));
+               ret = HRTIMER_RESTART;
+       }
+
+       spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
+       rcu_read_unlock();
+       return ret;
+}
+
+/*
+ * Common code for initializing the physical port structure.
+ */
+void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd,
+                        struct hfi1_devdata *dd, u8 hw_pidx, u8 port)
+{
+       int i, size;
+       uint default_pkey_idx;
+
+       ppd->dd = dd;
+       ppd->hw_pidx = hw_pidx;
+       ppd->port = port; /* IB port number, not index */
+
+       default_pkey_idx = 1;
+
+       ppd->pkeys[default_pkey_idx] = DEFAULT_P_KEY;
+       if (loopback) {
+               hfi1_early_err(&pdev->dev,
+                              "Faking data partition 0x8001 in idx %u\n",
+                              !default_pkey_idx);
+               ppd->pkeys[!default_pkey_idx] = 0x8001;
+       }
+
+       INIT_WORK(&ppd->link_vc_work, handle_verify_cap);
+       INIT_WORK(&ppd->link_up_work, handle_link_up);
+       INIT_WORK(&ppd->link_down_work, handle_link_down);
+       INIT_WORK(&ppd->freeze_work, handle_freeze);
+       INIT_WORK(&ppd->link_downgrade_work, handle_link_downgrade);
+       INIT_WORK(&ppd->sma_message_work, handle_sma_message);
+       INIT_WORK(&ppd->link_bounce_work, handle_link_bounce);
+       INIT_WORK(&ppd->linkstate_active_work, receive_interrupt_work);
+       INIT_WORK(&ppd->qsfp_info.qsfp_work, qsfp_event);
+
+       mutex_init(&ppd->hls_lock);
+       spin_lock_init(&ppd->sdma_alllock);
+       spin_lock_init(&ppd->qsfp_info.qsfp_lock);
+
+       ppd->qsfp_info.ppd = ppd;
+       ppd->sm_trap_qp = 0x0;
+       ppd->sa_qp = 0x1;
+
+       ppd->hfi1_wq = NULL;
+
+       spin_lock_init(&ppd->cca_timer_lock);
+
+       for (i = 0; i < OPA_MAX_SLS; i++) {
+               hrtimer_init(&ppd->cca_timer[i].hrtimer, CLOCK_MONOTONIC,
+                            HRTIMER_MODE_REL);
+               ppd->cca_timer[i].ppd = ppd;
+               ppd->cca_timer[i].sl = i;
+               ppd->cca_timer[i].ccti = 0;
+               ppd->cca_timer[i].hrtimer.function = cca_timer_fn;
+       }
+
+       ppd->cc_max_table_entries = IB_CC_TABLE_CAP_DEFAULT;
+
+       spin_lock_init(&ppd->cc_state_lock);
+       spin_lock_init(&ppd->cc_log_lock);
+       size = sizeof(struct cc_state);
+       RCU_INIT_POINTER(ppd->cc_state, kzalloc(size, GFP_KERNEL));
+       if (!rcu_dereference(ppd->cc_state))
+               goto bail;
+       return;
+
+bail:
+
+       hfi1_early_err(&pdev->dev,
+                      "Congestion Control Agent disabled for port %d\n", port);
+}
+
+/*
+ * Do initialization for device that is only needed on
+ * first detect, not on resets.
+ */
+static int loadtime_init(struct hfi1_devdata *dd)
+{
+       return 0;
+}
+
+/**
+ * init_after_reset - re-initialize after a reset
+ * @dd: the hfi1_ib device
+ *
+ * Sanity check at least some of the values after reset, and
+ * ensure no receive or transmit (explicitly, in case reset
+ * failed).
+ */
+static int init_after_reset(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /*
+        * Ensure chip does no sends or receives, tail updates, or
+        * pioavail updates while we re-initialize.  This is mostly
+        * for the driver data structures, not chip registers.
+        */
+       for (i = 0; i < dd->num_rcv_contexts; i++)
+               hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
+                                 HFI1_RCVCTRL_INTRAVAIL_DIS |
+                                 HFI1_RCVCTRL_TAILUPD_DIS, i);
+       pio_send_control(dd, PSC_GLOBAL_DISABLE);
+       for (i = 0; i < dd->num_send_contexts; i++)
+               sc_disable(dd->send_contexts[i].sc);
+
+       return 0;
+}
+
+static void enable_chip(struct hfi1_devdata *dd)
+{
+       u32 rcvmask;
+       u32 i;
+
+       /* enable PIO send */
+       pio_send_control(dd, PSC_GLOBAL_ENABLE);
+
+       /*
+        * Enable kernel ctxts' receive and receive interrupt.
+        * Other ctxts done as user opens and initializes them.
+        */
+       for (i = 0; i < dd->first_user_ctxt; ++i) {
+               rcvmask = HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB;
+               rcvmask |= HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, DMA_RTAIL) ?
+                       HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS;
+               if (!HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, MULTI_PKT_EGR))
+                       rcvmask |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
+               if (HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, NODROP_RHQ_FULL))
+                       rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
+               if (HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, NODROP_EGR_FULL))
+                       rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
+               hfi1_rcvctrl(dd, rcvmask, i);
+               sc_enable(dd->rcd[i]->sc);
+       }
+}
+
+/**
+ * create_workqueues - create per port workqueues
+ * @dd: the hfi1_ib device
+ */
+static int create_workqueues(struct hfi1_devdata *dd)
+{
+       int pidx;
+       struct hfi1_pportdata *ppd;
+
+       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+               ppd = dd->pport + pidx;
+               if (!ppd->hfi1_wq) {
+                       ppd->hfi1_wq =
+                               alloc_workqueue(
+                                   "hfi%d_%d",
+                                   WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE,
+                                   dd->num_sdma,
+                                   dd->unit, pidx);
+                       if (!ppd->hfi1_wq)
+                               goto wq_error;
+               }
+       }
+       return 0;
+wq_error:
+       pr_err("alloc_workqueue failed for port %d\n", pidx + 1);
+       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+               ppd = dd->pport + pidx;
+               if (ppd->hfi1_wq) {
+                       destroy_workqueue(ppd->hfi1_wq);
+                       ppd->hfi1_wq = NULL;
+               }
+       }
+       return -ENOMEM;
+}
+
+/**
+ * hfi1_init - do the actual initialization sequence on the chip
+ * @dd: the hfi1_ib device
+ * @reinit: re-initializing, so don't allocate new memory
+ *
+ * Do the actual initialization sequence on the chip.  This is done
+ * both from the init routine called from the PCI infrastructure, and
+ * when we reset the chip, or detect that it was reset internally,
+ * or it's administratively re-enabled.
+ *
+ * Memory allocation here and in called routines is only done in
+ * the first case (reinit == 0).  We have to be careful, because even
+ * without memory allocation, we need to re-write all the chip registers
+ * TIDs, etc. after the reset or enable has completed.
+ */
+int hfi1_init(struct hfi1_devdata *dd, int reinit)
+{
+       int ret = 0, pidx, lastfail = 0;
+       unsigned i, len;
+       struct hfi1_ctxtdata *rcd;
+       struct hfi1_pportdata *ppd;
+
+       /* Set up recv low level handlers */
+       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_EXPECTED] =
+                                               kdeth_process_expected;
+       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_EAGER] =
+                                               kdeth_process_eager;
+       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_IB] = process_receive_ib;
+       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_ERROR] =
+                                               process_receive_error;
+       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_BYPASS] =
+                                               process_receive_bypass;
+       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID5] =
+                                               process_receive_invalid;
+       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID6] =
+                                               process_receive_invalid;
+       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID7] =
+                                               process_receive_invalid;
+       dd->rhf_rcv_function_map = dd->normal_rhf_rcv_functions;
+
+       /* Set up send low level handlers */
+       dd->process_pio_send = hfi1_verbs_send_pio;
+       dd->process_dma_send = hfi1_verbs_send_dma;
+       dd->pio_inline_send = pio_copy;
+
+       if (is_ax(dd)) {
+               atomic_set(&dd->drop_packet, DROP_PACKET_ON);
+               dd->do_drop = 1;
+       } else {
+               atomic_set(&dd->drop_packet, DROP_PACKET_OFF);
+               dd->do_drop = 0;
+       }
+
+       /* make sure the link is not "up" */
+       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+               ppd = dd->pport + pidx;
+               ppd->linkup = 0;
+       }
+
+       if (reinit)
+               ret = init_after_reset(dd);
+       else
+               ret = loadtime_init(dd);
+       if (ret)
+               goto done;
+
+       /* allocate dummy tail memory for all receive contexts */
+       dd->rcvhdrtail_dummy_kvaddr = dma_zalloc_coherent(
+               &dd->pcidev->dev, sizeof(u64),
+               &dd->rcvhdrtail_dummy_physaddr,
+               GFP_KERNEL);
+
+       if (!dd->rcvhdrtail_dummy_kvaddr) {
+               dd_dev_err(dd, "cannot allocate dummy tail memory\n");
+               ret = -ENOMEM;
+               goto done;
+       }
+
+       /* dd->rcd can be NULL if early initialization failed */
+       for (i = 0; dd->rcd && i < dd->first_user_ctxt; ++i) {
+               /*
+                * Set up the (kernel) rcvhdr queue and egr TIDs.  If doing
+                * re-init, the simplest way to handle this is to free the
+                * existing buffers and re-allocate.
+                * The rest of ctxt 0's ctxtdata needs to be re-created as well.
+                */
+               rcd = dd->rcd[i];
+               if (!rcd)
+                       continue;
+
+               rcd->do_interrupt = &handle_receive_interrupt;
+
+               lastfail = hfi1_create_rcvhdrq(dd, rcd);
+               if (!lastfail)
+                       lastfail = hfi1_setup_eagerbufs(rcd);
+               if (lastfail) {
+                       dd_dev_err(dd,
+                                  "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n");
+                       ret = lastfail;
+               }
+       }
+
+       /* Allocate enough memory for user event notification. */
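+       /* one event slot for every possible (context, sub-context) pair */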
+       len = PAGE_ALIGN(dd->chip_rcv_contexts * HFI1_MAX_SHARED_CTXTS *
+                        sizeof(*dd->events));
+       dd->events = vmalloc_user(len);
+       if (!dd->events)
+               dd_dev_err(dd, "Failed to allocate user events page\n");
+       /*
+        * Allocate a page for device and port status.
+        * Page will be shared amongst all user processes.
+        */
+       dd->status = vmalloc_user(PAGE_SIZE);
+       if (!dd->status)
+               dd_dev_err(dd, "Failed to allocate dev status page\n");
+       else
+               dd->freezelen = PAGE_SIZE - (sizeof(*dd->status) -
+                                            sizeof(dd->status->freezemsg));
+       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+               ppd = dd->pport + pidx;
+               if (dd->status)
+                       /* Currently, we only have one port */
+                       ppd->statusp = &dd->status->port;
+
+               set_mtu(ppd);
+       }
+
+       /* enable chip even if we have an error, so we can debug the cause */
+       enable_chip(dd);
+
+done:
+       /*
+        * Set status even if port serdes is not initialized
+        * so that diags will work.
+        */
+       if (dd->status)
+               dd->status->dev |= HFI1_STATUS_CHIP_PRESENT |
+                       HFI1_STATUS_INITTED;
+       if (!ret) {
+               /* enable all interrupts from the chip */
+               set_intr_state(dd, 1);
+
+               /* chip is OK for user apps; mark it as initialized */
+               for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+                       ppd = dd->pport + pidx;
+
+                       /*
+                        * start the serdes - must be after interrupts are
+                        * enabled so we are notified when the link goes up
+                        */
+                       lastfail = bringup_serdes(ppd);
+                       if (lastfail)
+                               dd_dev_info(dd,
+                                           "Failed to bring up port %u\n",
+                                           ppd->port);
+
+                       /*
+                        * Set status even if port serdes is not initialized
+                        * so that diags will work.
+                        */
+                       if (ppd->statusp)
+                               *ppd->statusp |= HFI1_STATUS_CHIP_PRESENT |
+                                                       HFI1_STATUS_INITTED;
+                       if (!ppd->link_speed_enabled)
+                               continue;
+               }
+       }
+
+       /* if ret is non-zero, we probably should do some cleanup here... */
+       return ret;
+}
+
+static inline struct hfi1_devdata *__hfi1_lookup(int unit)
+{
+       return idr_find(&hfi1_unit_table, unit);
+}
+
+struct hfi1_devdata *hfi1_lookup(int unit)
+{
+       struct hfi1_devdata *dd;
+       unsigned long flags;
+
+       spin_lock_irqsave(&hfi1_devs_lock, flags);
+       dd = __hfi1_lookup(unit);
+       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+
+       return dd;
+}
+
+/*
+ * Stop the timers during unit shutdown, or after an error late
+ * in initialization.
+ */
+static void stop_timers(struct hfi1_devdata *dd)
+{
+       struct hfi1_pportdata *ppd;
+       int pidx;
+
+       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+               ppd = dd->pport + pidx;
+               if (ppd->led_override_timer.data) {
+                       del_timer_sync(&ppd->led_override_timer);
+                       atomic_set(&ppd->led_override_timer_active, 0);
+               }
+       }
+}
+
+/**
+ * shutdown_device - shut down a device
+ * @dd: the hfi1_ib device
+ *
+ * This is called to make the device quiet when we are about to
+ * unload the driver, and also when the device is administratively
+ * disabled.  It does not free any data structures.
+ * Everything it does has to be set up again by hfi1_init(dd, 1).
+ */
+static void shutdown_device(struct hfi1_devdata *dd)
+{
+       struct hfi1_pportdata *ppd;
+       unsigned pidx;
+       int i;
+
+       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+               ppd = dd->pport + pidx;
+
+               ppd->linkup = 0;
+               if (ppd->statusp)
+                       *ppd->statusp &= ~(HFI1_STATUS_IB_CONF |
+                                          HFI1_STATUS_IB_READY);
+       }
+       dd->flags &= ~HFI1_INITTED;
+
+       /* mask interrupts, but not errors */
+       set_intr_state(dd, 0);
+
+       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+               ppd = dd->pport + pidx;
+               for (i = 0; i < dd->num_rcv_contexts; i++)
+                       hfi1_rcvctrl(dd, HFI1_RCVCTRL_TAILUPD_DIS |
+                                         HFI1_RCVCTRL_CTXT_DIS |
+                                         HFI1_RCVCTRL_INTRAVAIL_DIS |
+                                         HFI1_RCVCTRL_PKEY_DIS |
+                                         HFI1_RCVCTRL_ONE_PKT_EGR_DIS, i);
+               /*
+                * Gracefully stop all sends allowing any in progress to
+                * trickle out first.
+                */
+               for (i = 0; i < dd->num_send_contexts; i++)
+                       sc_flush(dd->send_contexts[i].sc);
+       }
+
+       /*
+        * Enough for anything that's going to trickle out to have actually
+        * done so.
+        */
+       udelay(20);
+
+       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+               ppd = dd->pport + pidx;
+
+               /* disable all contexts */
+               for (i = 0; i < dd->num_send_contexts; i++)
+                       sc_disable(dd->send_contexts[i].sc);
+               /* disable the send device */
+               pio_send_control(dd, PSC_GLOBAL_DISABLE);
+
+               shutdown_led_override(ppd);
+
+               /*
+                * Clear SerdesEnable.
+                * We can't count on interrupts since we are stopping.
+                */
+               hfi1_quiet_serdes(ppd);
+
+               if (ppd->hfi1_wq) {
+                       destroy_workqueue(ppd->hfi1_wq);
+                       ppd->hfi1_wq = NULL;
+               }
+       }
+       sdma_exit(dd);
+}
+
+/**
+ * hfi1_free_ctxtdata - free a context's allocated data
+ * @dd: the hfi1_ib device
+ * @rcd: the ctxtdata structure
+ *
+ * Free up any allocated data for a context.
+ * This should not touch anything that would affect a simultaneous
+ * re-allocation of context data, because it is called after hfi1_mutex
+ * is released (and can be called from reinit as well).
+ * It should never change any chip state, or global driver state.
+ */
+void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
+{
+       unsigned e;
+
+       if (!rcd)
+               return;
+
+       if (rcd->rcvhdrq) {
+               dma_free_coherent(&dd->pcidev->dev, rcd->rcvhdrq_size,
+                                 rcd->rcvhdrq, rcd->rcvhdrq_phys);
+               rcd->rcvhdrq = NULL;
+               if (rcd->rcvhdrtail_kvaddr) {
+                       dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
+                                         (void *)rcd->rcvhdrtail_kvaddr,
+                                         rcd->rcvhdrqtailaddr_phys);
+                       rcd->rcvhdrtail_kvaddr = NULL;
+               }
+       }
+
+       /* all the RcvArray entries should have been cleared by now */
+       kfree(rcd->egrbufs.rcvtids);
+
+       for (e = 0; e < rcd->egrbufs.alloced; e++) {
+               if (rcd->egrbufs.buffers[e].phys)
+                       dma_free_coherent(&dd->pcidev->dev,
+                                         rcd->egrbufs.buffers[e].len,
+                                         rcd->egrbufs.buffers[e].addr,
+                                         rcd->egrbufs.buffers[e].phys);
+       }
+       kfree(rcd->egrbufs.buffers);
+
+       sc_free(rcd->sc);
+       vfree(rcd->user_event_mask);
+       vfree(rcd->subctxt_uregbase);
+       vfree(rcd->subctxt_rcvegrbuf);
+       vfree(rcd->subctxt_rcvhdr_base);
+       kfree(rcd->opstats);
+       kfree(rcd);
+}
+
+/*
+ * Release our hold on the shared asic data.  If we are the last one,
+ * free the structure.  Must be holding hfi1_devs_lock.
+ */
+static void release_asic_data(struct hfi1_devdata *dd)
+{
+       int other;
+
+       if (!dd->asic_data)
+               return;
+       dd->asic_data->dds[dd->hfi1_id] = NULL;
+       other = dd->hfi1_id ? 0 : 1;
+       if (!dd->asic_data->dds[other]) {
+               /* we are the last holder, free it */
+               kfree(dd->asic_data);
+       }
+       dd->asic_data = NULL;
+}
+
+static void __hfi1_free_devdata(struct kobject *kobj)
+{
+       struct hfi1_devdata *dd =
+               container_of(kobj, struct hfi1_devdata, kobj);
+       unsigned long flags;
+
+       spin_lock_irqsave(&hfi1_devs_lock, flags);
+       idr_remove(&hfi1_unit_table, dd->unit);
+       list_del(&dd->list);
+       release_asic_data(dd);
+       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+       free_platform_config(dd);
+       rcu_barrier(); /* wait for rcu callbacks to complete */
+       free_percpu(dd->int_counter);
+       free_percpu(dd->rcv_limit);
+       hfi1_dev_affinity_free(dd);
+       free_percpu(dd->send_schedule);
+       rvt_dealloc_device(&dd->verbs_dev.rdi);
+}
+
+static struct kobj_type hfi1_devdata_type = {
+       .release = __hfi1_free_devdata,
+};
+
+void hfi1_free_devdata(struct hfi1_devdata *dd)
+{
+       kobject_put(&dd->kobj);
+}
+
+/*
+ * Allocate our primary per-unit data structure.  Must be done via verbs
+ * allocator, because the verbs cleanup process both does cleanup and
+ * free of the data structure.
+ * "extra" is for chip-specific data.
+ *
+ * Use the idr mechanism to get a unit number for this unit.
+ */
+struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra)
+{
+       unsigned long flags;
+       struct hfi1_devdata *dd;
+       int ret, nports;
+
+       /* "extra" is sizeof(struct hfi1_pportdata) * number of ports */
+       nports = extra / sizeof(struct hfi1_pportdata);
+
+       dd = (struct hfi1_devdata *)rvt_alloc_device(sizeof(*dd) + extra,
+                                                    nports);
+       if (!dd)
+               return ERR_PTR(-ENOMEM);
+       dd->num_pports = nports;
+       dd->pport = (struct hfi1_pportdata *)(dd + 1);
+
+       INIT_LIST_HEAD(&dd->list);
+       idr_preload(GFP_KERNEL);
+       spin_lock_irqsave(&hfi1_devs_lock, flags);
+
+       ret = idr_alloc(&hfi1_unit_table, dd, 0, 0, GFP_NOWAIT);
+       if (ret >= 0) {
+               dd->unit = ret;
+               list_add(&dd->list, &hfi1_dev_list);
+       }
+
+       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+       idr_preload_end();
+
+       if (ret < 0) {
+               hfi1_early_err(&pdev->dev,
+                              "Could not allocate unit ID: error %d\n", -ret);
+               goto bail;
+       }
+       /*
+        * Initialize all locks for the device. This needs to be as early as
+        * possible so locks are usable.
+        */
+       spin_lock_init(&dd->sc_lock);
+       spin_lock_init(&dd->sendctrl_lock);
+       spin_lock_init(&dd->rcvctrl_lock);
+       spin_lock_init(&dd->uctxt_lock);
+       spin_lock_init(&dd->hfi1_diag_trans_lock);
+       spin_lock_init(&dd->sc_init_lock);
+       spin_lock_init(&dd->dc8051_lock);
+       spin_lock_init(&dd->dc8051_memlock);
+       seqlock_init(&dd->sc2vl_lock);
+       spin_lock_init(&dd->sde_map_lock);
+       spin_lock_init(&dd->pio_map_lock);
+       init_waitqueue_head(&dd->event_queue);
+
+       dd->int_counter = alloc_percpu(u64);
+       if (!dd->int_counter) {
+               ret = -ENOMEM;
+               hfi1_early_err(&pdev->dev,
+                              "Could not allocate per-cpu int_counter\n");
+               goto bail;
+       }
+
+       dd->rcv_limit = alloc_percpu(u64);
+       if (!dd->rcv_limit) {
+               ret = -ENOMEM;
+               hfi1_early_err(&pdev->dev,
+                              "Could not allocate per-cpu rcv_limit\n");
+               goto bail;
+       }
+
+       dd->send_schedule = alloc_percpu(u64);
+       if (!dd->send_schedule) {
+               ret = -ENOMEM;
+               hfi1_early_err(&pdev->dev,
+                              "Could not allocate per-cpu send_schedule\n");
+               goto bail;
+       }
+
+       if (!hfi1_cpulist_count) {
+               u32 count = num_online_cpus();
+
+               hfi1_cpulist = kcalloc(BITS_TO_LONGS(count), sizeof(long),
+                                      GFP_KERNEL);
+               if (hfi1_cpulist)
+                       hfi1_cpulist_count = count;
+               else
+                       hfi1_early_err(
+                       &pdev->dev,
+                       "Could not alloc cpulist info, cpu affinity might be wrong\n");
+       }
+       kobject_init(&dd->kobj, &hfi1_devdata_type);
+       return dd;
+
+bail:
+       if (!list_empty(&dd->list))
+               list_del_init(&dd->list);
+       rvt_dealloc_device(&dd->verbs_dev.rdi);
+       return ERR_PTR(ret);
+}
+
+/*
+ * Called from freeze mode handlers, and from PCI error
+ * reporting code.  Should be paranoid about state of
+ * system and data structures.
+ */
+void hfi1_disable_after_error(struct hfi1_devdata *dd)
+{
+       if (dd->flags & HFI1_INITTED) {
+               u32 pidx;
+
+               dd->flags &= ~HFI1_INITTED;
+               if (dd->pport)
+                       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+                               struct hfi1_pportdata *ppd;
+
+                               ppd = dd->pport + pidx;
+                               if (dd->flags & HFI1_PRESENT)
+                                       set_link_state(ppd, HLS_DN_DISABLE);
+
+                               if (ppd->statusp)
+                                       *ppd->statusp &= ~HFI1_STATUS_IB_READY;
+                       }
+       }
+
+       /*
+        * Mark as having had an error for driver, and also
+        * for /sys and status word mapped to user programs.
+        * This marks unit as not usable, until reset.
+        */
+       if (dd->status)
+               dd->status->dev |= HFI1_STATUS_HWERROR;
+}
+
+static void remove_one(struct pci_dev *);
+static int init_one(struct pci_dev *, const struct pci_device_id *);
+
+#define DRIVER_LOAD_MSG "Intel " DRIVER_NAME " loaded: "
+#define PFX DRIVER_NAME ": "
+
+static const struct pci_device_id hfi1_pci_tbl[] = {
+       { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL0) },
+       { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL1) },
+       { 0, }
+};
+
+MODULE_DEVICE_TABLE(pci, hfi1_pci_tbl);
+
+static struct pci_driver hfi1_pci_driver = {
+       .name = DRIVER_NAME,
+       .probe = init_one,
+       .remove = remove_one,
+       .id_table = hfi1_pci_tbl,
+       .err_handler = &hfi1_pci_err_handler,
+};
+
+static void __init compute_krcvqs(void)
+{
+       int i;
+
+       for (i = 0; i < krcvqsset; i++)
+               n_krcvqs += krcvqs[i];
+}
+
+/*
+ * Do all the generic driver unit- and chip-independent memory
+ * allocation and initialization.
+ */
+static int __init hfi1_mod_init(void)
+{
+       int ret;
+
+       ret = dev_init();
+       if (ret)
+               goto bail;
+
+       /* validate max MTU before any devices start */
+       if (!valid_opa_max_mtu(hfi1_max_mtu)) {
+               pr_err("Invalid max_mtu 0x%x, using 0x%x instead\n",
+                      hfi1_max_mtu, HFI1_DEFAULT_MAX_MTU);
+               hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
+       }
+       /* valid CUs run from 1-128 in powers of 2 */
+       if (hfi1_cu > 128 || !is_power_of_2(hfi1_cu))
+               hfi1_cu = 1;
+       /* valid credit return threshold is 0-100, variable is unsigned */
+       if (user_credit_return_threshold > 100)
+               user_credit_return_threshold = 100;
+
+       compute_krcvqs();
+       /*
+        * sanitize receive interrupt count; the timeout must wait until
+        * after the hardware type is known
+        */
+       if (rcv_intr_count > RCV_HDR_HEAD_COUNTER_MASK)
+               rcv_intr_count = RCV_HDR_HEAD_COUNTER_MASK;
+       /* reject invalid combinations */
+       if (rcv_intr_count == 0 && rcv_intr_timeout == 0) {
+               pr_err("Invalid mode: both receive interrupt count and available timeout are zero - setting interrupt count to 1\n");
+               rcv_intr_count = 1;
+       }
+       if (rcv_intr_count > 1 && rcv_intr_timeout == 0) {
+               /*
+                * Avoid indefinite packet delivery by requiring a timeout
+                * if count is > 1.
+                */
+               pr_err("Invalid mode: receive interrupt count greater than 1 and available timeout is zero - setting available timeout to 1\n");
+               rcv_intr_timeout = 1;
+       }
+       if (rcv_intr_dynamic && !(rcv_intr_count > 1 && rcv_intr_timeout > 0)) {
+               /*
+                * The dynamic algorithm expects a non-zero timeout
+                * and a count > 1.
+                */
+               pr_err("Invalid mode: dynamic receive interrupt mitigation with invalid count and timeout - turning dynamic off\n");
+               rcv_intr_dynamic = 0;
+       }
+
+       /* sanitize link CRC options */
+       link_crc_mask &= SUPPORTED_CRCS;
+
+       /*
+        * These must be called before the driver is registered with
+        * the PCI subsystem.
+        */
+       idr_init(&hfi1_unit_table);
+
+       hfi1_dbg_init();
+       ret = hfi1_wss_init();
+       if (ret < 0)
+               goto bail_wss;
+       ret = pci_register_driver(&hfi1_pci_driver);
+       if (ret < 0) {
+               pr_err("Unable to register driver: error %d\n", -ret);
+               goto bail_dev;
+       }
+       goto bail; /* all OK */
+
+bail_dev:
+       hfi1_wss_exit();
+bail_wss:
+       hfi1_dbg_exit();
+       idr_destroy(&hfi1_unit_table);
+       dev_cleanup();
+bail:
+       return ret;
+}
+
+module_init(hfi1_mod_init);
+
+/*
+ * Do the non-unit driver cleanup, memory free, etc. at unload.
+ */
+static void __exit hfi1_mod_cleanup(void)
+{
+       pci_unregister_driver(&hfi1_pci_driver);
+       hfi1_wss_exit();
+       hfi1_dbg_exit();
+       hfi1_cpulist_count = 0;
+       kfree(hfi1_cpulist);
+
+       idr_destroy(&hfi1_unit_table);
+       dispose_firmware();     /* asymmetric with obtain_firmware() */
+       dev_cleanup();
+}
+
+module_exit(hfi1_mod_cleanup);
+
+/* this can only be called after a successful initialization */
+static void cleanup_device_data(struct hfi1_devdata *dd)
+{
+       int ctxt;
+       int pidx;
+       struct hfi1_ctxtdata **tmp;
+       unsigned long flags;
+
+       /* users can't do anything more with chip */
+       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+               struct hfi1_pportdata *ppd = &dd->pport[pidx];
+               struct cc_state *cc_state;
+               int i;
+
+               if (ppd->statusp)
+                       *ppd->statusp &= ~HFI1_STATUS_CHIP_PRESENT;
+
+               for (i = 0; i < OPA_MAX_SLS; i++)
+                       hrtimer_cancel(&ppd->cca_timer[i].hrtimer);
+
+               spin_lock(&ppd->cc_state_lock);
+               cc_state = get_cc_state(ppd);
+               RCU_INIT_POINTER(ppd->cc_state, NULL);
+               spin_unlock(&ppd->cc_state_lock);
+
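+               /*
+                * The old cc_state may still be referenced by RCU readers;
+                * defer freeing it until a grace period has elapsed.
+                */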
+               if (cc_state)
+                       call_rcu(&cc_state->rcu, cc_state_reclaim);
+       }
+
+       free_credit_return(dd);
+
+       /*
+        * Free any resources still in use (usually just kernel contexts)
+        * at unload; we walk every allocated context, because that's what
+        * we allocated.  We acquire the lock to be really paranoid that rcd isn't being
+        * accessed from some interrupt-related code (that should not happen,
+        * but best to be sure).
+        */
+       spin_lock_irqsave(&dd->uctxt_lock, flags);
+       tmp = dd->rcd;
+       dd->rcd = NULL;
+       spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+
+       if (dd->rcvhdrtail_dummy_kvaddr) {
+               dma_free_coherent(&dd->pcidev->dev, sizeof(u64),
+                                 (void *)dd->rcvhdrtail_dummy_kvaddr,
+                                 dd->rcvhdrtail_dummy_physaddr);
+               dd->rcvhdrtail_dummy_kvaddr = NULL;
+       }
+
+       for (ctxt = 0; tmp && ctxt < dd->num_rcv_contexts; ctxt++) {
+               struct hfi1_ctxtdata *rcd = tmp[ctxt];
+
+               tmp[ctxt] = NULL; /* debugging paranoia */
+               if (rcd) {
+                       hfi1_clear_tids(rcd);
+                       hfi1_free_ctxtdata(dd, rcd);
+               }
+       }
+       kfree(tmp);
+       free_pio_map(dd);
+       /* must follow rcv context free - need to remove rcv's hooks */
+       for (ctxt = 0; ctxt < dd->num_send_contexts; ctxt++)
+               sc_free(dd->send_contexts[ctxt].sc);
+       dd->num_send_contexts = 0;
+       kfree(dd->send_contexts);
+       dd->send_contexts = NULL;
+       kfree(dd->hw_to_sw);
+       dd->hw_to_sw = NULL;
+       kfree(dd->boardname);
+       vfree(dd->events);
+       vfree(dd->status);
+}
+
+/*
+ * Clean up on unit shutdown, or error during unit load after
+ * successful initialization.
+ */
+static void postinit_cleanup(struct hfi1_devdata *dd)
+{
+       hfi1_start_cleanup(dd);
+
+       hfi1_pcie_ddcleanup(dd);
+       hfi1_pcie_cleanup(dd->pcidev);
+
+       cleanup_device_data(dd);
+
+       hfi1_free_devdata(dd);
+}
+
+static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+       int ret = 0, j, pidx, initfail;
+       struct hfi1_devdata *dd = NULL;
+       struct hfi1_pportdata *ppd;
+
+       /* First, lock the non-writable module parameters */
+       HFI1_CAP_LOCK();
+
+       /* Validate some global module parameters */
+       if (rcvhdrcnt <= HFI1_MIN_HDRQ_EGRBUF_CNT) {
+               hfi1_early_err(&pdev->dev, "Header queue count too small\n");
+               ret = -EINVAL;
+               goto bail;
+       }
+       if (rcvhdrcnt > HFI1_MAX_HDRQ_EGRBUF_CNT) {
+               hfi1_early_err(&pdev->dev,
+                              "Receive header queue count cannot be greater than %u\n",
+                              HFI1_MAX_HDRQ_EGRBUF_CNT);
+               ret = -EINVAL;
+               goto bail;
+       }
+       /* use the encoding function as a sanitization check */
+       if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) {
+               hfi1_early_err(&pdev->dev, "Invalid HdrQ Entry size %u\n",
+                              hfi1_hdrq_entsize);
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       /* The receive eager buffer size must be set before the receive
+        * contexts are created.
+        *
+        * Set the eager buffer size.  Validate that it falls in a range
+        * allowed by the hardware - all powers of 2 between the min and
+        * max.  The maximum valid MTU is within the eager buffer range
+        * so we do not need to cap the max_mtu by an eager buffer size
+        * setting.
+        */
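+       /*
+        * For example, a requested eager_buffer_size of 3 MB would be
+        * rounded up to 4 MB and then clamped into the
+        * [MIN_EAGER_BUFFER * 8, MAX_EAGER_BUFFER_TOTAL] range.
+        */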
+       if (eager_buffer_size) {
+               if (!is_power_of_2(eager_buffer_size))
+                       eager_buffer_size =
+                               roundup_pow_of_two(eager_buffer_size);
+               eager_buffer_size =
+                       clamp_val(eager_buffer_size,
+                                 MIN_EAGER_BUFFER * 8,
+                                 MAX_EAGER_BUFFER_TOTAL);
+               hfi1_early_info(&pdev->dev, "Eager buffer size %u\n",
+                               eager_buffer_size);
+       } else {
+               hfi1_early_err(&pdev->dev, "Invalid Eager buffer size of 0\n");
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       /* restrict value of hfi1_rcvarr_split */
+       hfi1_rcvarr_split = clamp_val(hfi1_rcvarr_split, 0, 100);
+
+       ret = hfi1_pcie_init(pdev, ent);
+       if (ret)
+               goto bail;
+
+       /*
+        * Do device-specific initialization, function table setup, dd
+        * allocation, etc.
+        */
+       switch (ent->device) {
+       case PCI_DEVICE_ID_INTEL0:
+       case PCI_DEVICE_ID_INTEL1:
+               dd = hfi1_init_dd(pdev, ent);
+               break;
+       default:
+               hfi1_early_err(&pdev->dev,
+                              "Failing on unknown Intel deviceid 0x%x\n",
+                              ent->device);
+               ret = -ENODEV;
+       }
+
+       if (IS_ERR(dd))
+               ret = PTR_ERR(dd);
+       if (ret)
+               goto clean_bail; /* error already printed */
+
+       ret = create_workqueues(dd);
+       if (ret)
+               goto clean_bail;
+
+       /* do the generic initialization */
+       initfail = hfi1_init(dd, 0);
+
+       ret = hfi1_register_ib_device(dd);
+
+       /*
+        * Now ready for use.  This should be cleared whenever we
+        * detect a reset, or initiate one.  If an earlier failure
+        * occurred, we still create devices, so diags, etc. can be
+        * used to determine the cause of the problem.
+        */
+       if (!initfail && !ret) {
+               dd->flags |= HFI1_INITTED;
+               /* create debugfs files after init and ib register */
+               hfi1_dbg_ibdev_init(&dd->verbs_dev);
+       }
+
+       j = hfi1_device_create(dd);
+       if (j)
+               dd_dev_err(dd, "Failed to create /dev devices: %d\n", -j);
+
+       if (initfail || ret) {
+               stop_timers(dd);
+               flush_workqueue(ib_wq);
+               for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+                       hfi1_quiet_serdes(dd->pport + pidx);
+                       ppd = dd->pport + pidx;
+                       if (ppd->hfi1_wq) {
+                               destroy_workqueue(ppd->hfi1_wq);
+                               ppd->hfi1_wq = NULL;
+                       }
+               }
+               if (!j)
+                       hfi1_device_remove(dd);
+               if (!ret)
+                       hfi1_unregister_ib_device(dd);
+               postinit_cleanup(dd);
+               if (initfail)
+                       ret = initfail;
+               goto bail;      /* everything already cleaned */
+       }
+
+       sdma_start(dd);
+
+       return 0;
+
+clean_bail:
+       hfi1_pcie_cleanup(pdev);
+bail:
+       return ret;
+}
+
+static void remove_one(struct pci_dev *pdev)
+{
+       struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+
+       /* close debugfs files before ib unregister */
+       hfi1_dbg_ibdev_exit(&dd->verbs_dev);
+       /* unregister from IB core */
+       hfi1_unregister_ib_device(dd);
+
+       /*
+        * Disable the IB link, disable interrupts on the device,
+        * clear dma engines, etc.
+        */
+       shutdown_device(dd);
+
+       stop_timers(dd);
+
+       /* wait until all of our (qsfp) queue_work() calls complete */
+       flush_workqueue(ib_wq);
+
+       hfi1_device_remove(dd);
+
+       postinit_cleanup(dd);
+}
+
+/**
+ * hfi1_create_rcvhdrq - create a receive header queue
+ * @dd: the hfi1_ib device
+ * @rcd: the context data
+ *
+ * This must be contiguous memory (from an i/o perspective), and must be
+ * DMA'able (which means for some systems, it will go through an IOMMU,
+ * or be forced into a low address range).
+ */
+int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
+{
+       unsigned amt;
+       u64 reg;
+
+       if (!rcd->rcvhdrq) {
+               dma_addr_t phys_hdrqtail;
+               gfp_t gfp_flags;
+
+               /*
+                * rcvhdrqentsize is in DWs, so we have to convert to bytes
+                * (* sizeof(u32)).
+                */
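+               /*
+                * e.g. 2048 entries of 32 DWs each come to
+                * 2048 * 32 * 4 = 256 KiB, which is already page aligned.
+                */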
+               amt = PAGE_ALIGN(rcd->rcvhdrq_cnt * rcd->rcvhdrqentsize *
+                                sizeof(u32));
+
+               gfp_flags = (rcd->ctxt >= dd->first_user_ctxt) ?
+                       GFP_USER : GFP_KERNEL;
+               rcd->rcvhdrq = dma_zalloc_coherent(
+                       &dd->pcidev->dev, amt, &rcd->rcvhdrq_phys,
+                       gfp_flags | __GFP_COMP);
+
+               if (!rcd->rcvhdrq) {
+                       dd_dev_err(dd,
+                                  "attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n",
+                                  amt, rcd->ctxt);
+                       goto bail;
+               }
+
+               if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) {
+                       rcd->rcvhdrtail_kvaddr = dma_zalloc_coherent(
+                               &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail,
+                               gfp_flags);
+                       if (!rcd->rcvhdrtail_kvaddr)
+                               goto bail_free;
+                       rcd->rcvhdrqtailaddr_phys = phys_hdrqtail;
+               }
+
+               rcd->rcvhdrq_size = amt;
+       }
+       /*
+        * These values are per-context:
+        *      RcvHdrCnt
+        *      RcvHdrEntSize
+        *      RcvHdrSize
+        */
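+       /* RcvHdrCnt is programmed in units of 2^HDRQ_SIZE_SHIFT entries */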
+       reg = ((u64)(rcd->rcvhdrq_cnt >> HDRQ_SIZE_SHIFT)
+                       & RCV_HDR_CNT_CNT_MASK)
+               << RCV_HDR_CNT_CNT_SHIFT;
+       write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_CNT, reg);
+       reg = (encode_rcv_header_entry_size(rcd->rcvhdrqentsize)
+                       & RCV_HDR_ENT_SIZE_ENT_SIZE_MASK)
+               << RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT;
+       write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_ENT_SIZE, reg);
+       reg = (dd->rcvhdrsize & RCV_HDR_SIZE_HDR_SIZE_MASK)
+               << RCV_HDR_SIZE_HDR_SIZE_SHIFT;
+       write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_SIZE, reg);
+
+       /*
+        * Program dummy tail address for every receive context
+        * before enabling any receive context
+        */
+       write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_TAIL_ADDR,
+                       dd->rcvhdrtail_dummy_physaddr);
+
+       return 0;
+
+bail_free:
+       dd_dev_err(dd,
+                  "attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n",
+                  rcd->ctxt);
+       vfree(rcd->user_event_mask);
+       rcd->user_event_mask = NULL;
+       dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq,
+                         rcd->rcvhdrq_phys);
+       rcd->rcvhdrq = NULL;
+bail:
+       return -ENOMEM;
+}
+
+/**
+ * hfi1_setup_eagerbufs - allocate eager buffers, both kernel and user contexts.
+ * @rcd: the context we are setting up.
+ *
+ * Allocate the eager TID buffers and program them into the chip.
+ * They are no longer completely contiguous; we do multiple allocation
+ * calls.  Otherwise we get the OOM code involved, by asking for too
+ * much per call, with disastrous results on some kernels.
+ */
+int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd)
+{
+       struct hfi1_devdata *dd = rcd->dd;
+       u32 max_entries, egrtop, alloced_bytes = 0, idx = 0;
+       gfp_t gfp_flags;
+       u16 order;
+       int ret = 0;
+       u16 round_mtu = roundup_pow_of_two(hfi1_max_mtu);
+
+       /*
+        * GFP_USER, but without GFP_FS, so the buffer cache can be
+        * coalesced (we hope); otherwise, even at order 4, heavy
+        * filesystem activity makes these fail.  Compound pages
+        * (__GFP_COMP) are used as well.
+        */
+       gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP;
+
+       /*
+        * The minimum size of the eager buffers is a group of MTU-sized
+        * buffers.
+        * The global eager_buffer_size parameter is checked against the
+        * theoretical lower limit of the value. Here, we check against the
+        * MTU.
+        */
+       if (rcd->egrbufs.size < (round_mtu * dd->rcv_entries.group_size))
+               rcd->egrbufs.size = round_mtu * dd->rcv_entries.group_size;
+       /*
+        * If using one-pkt-per-egr-buffer, lower the eager buffer
+        * size to the max MTU (page-aligned).
+        */
+       if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR))
+               rcd->egrbufs.rcvtid_size = round_mtu;
+
+       /*
+        * Eager buffers sizes of 1MB or less require smaller TID sizes
+        * to satisfy the "multiple of 8 RcvArray entries" requirement.
+        */
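+       /*
+        * For example, a 512 KiB eager region yields a rcvtid_size of
+        * 64 KiB (assuming the rounded MTU is not larger than that).
+        */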
+       if (rcd->egrbufs.size <= (1 << 20))
+               rcd->egrbufs.rcvtid_size = max((unsigned long)round_mtu,
+                       rounddown_pow_of_two(rcd->egrbufs.size / 8));
+
+       while (alloced_bytes < rcd->egrbufs.size &&
+              rcd->egrbufs.alloced < rcd->egrbufs.count) {
+               rcd->egrbufs.buffers[idx].addr =
+                       dma_zalloc_coherent(&dd->pcidev->dev,
+                                           rcd->egrbufs.rcvtid_size,
+                                           &rcd->egrbufs.buffers[idx].phys,
+                                           gfp_flags);
+               if (rcd->egrbufs.buffers[idx].addr) {
+                       rcd->egrbufs.buffers[idx].len =
+                               rcd->egrbufs.rcvtid_size;
+                       rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].addr =
+                               rcd->egrbufs.buffers[idx].addr;
+                       rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].phys =
+                               rcd->egrbufs.buffers[idx].phys;
+                       rcd->egrbufs.alloced++;
+                       alloced_bytes += rcd->egrbufs.rcvtid_size;
+                       idx++;
+               } else {
+                       u32 new_size, i, j;
+                       u64 offset = 0;
+
+                       /*
+                        * Fail the eager buffer allocation if:
+                        *   - we are already using the lowest acceptable size
+                        *   - we are using one-pkt-per-egr-buffer (this implies
+                        *     that we are accepting only one size)
+                        */
+                       if (rcd->egrbufs.rcvtid_size == round_mtu ||
+                           !HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) {
+                               dd_dev_err(dd, "ctxt%u: Failed to allocate eager buffers\n",
+                                          rcd->ctxt);
+                               goto bail_rcvegrbuf_phys;
+                       }
+
+                       new_size = rcd->egrbufs.rcvtid_size / 2;
+
+                       /*
+                        * If the first attempt to allocate memory failed, don't
+                        * fail everything but continue with the next lower
+                        * size.
+                        */
+                       if (idx == 0) {
+                               rcd->egrbufs.rcvtid_size = new_size;
+                               continue;
+                       }
+
+                       /*
+                        * Re-partition already allocated buffers to a smaller
+                        * size.
+                        */
+                       rcd->egrbufs.alloced = 0;
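+                       /*
+                        * Walk the buffers already allocated (index j),
+                        * carving each into new_size slices; each slice
+                        * becomes one rcvtid entry (index i), with offset
+                        * tracking the position within the current buffer.
+                        */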
+                       for (i = 0, j = 0, offset = 0; j < idx; i++) {
+                               if (i >= rcd->egrbufs.count)
+                                       break;
+                               rcd->egrbufs.rcvtids[i].phys =
+                                       rcd->egrbufs.buffers[j].phys + offset;
+                               rcd->egrbufs.rcvtids[i].addr =
+                                       rcd->egrbufs.buffers[j].addr + offset;
+                               rcd->egrbufs.alloced++;
+                               if ((rcd->egrbufs.buffers[j].phys + offset +
+                                    new_size) ==
+                                   (rcd->egrbufs.buffers[j].phys +
+                                    rcd->egrbufs.buffers[j].len)) {
+                                       j++;
+                                       offset = 0;
+                               } else {
+                                       offset += new_size;
+                               }
+                       }
+                       rcd->egrbufs.rcvtid_size = new_size;
+               }
+       }
+       rcd->egrbufs.numbufs = idx;
+       rcd->egrbufs.size = alloced_bytes;
+
+       hfi1_cdbg(PROC,
+                 "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %zuKB\n",
+                 rcd->ctxt, rcd->egrbufs.alloced, rcd->egrbufs.rcvtid_size,
+                 rcd->egrbufs.size);
+
+       /*
+        * Set the context's rcv array head update threshold to the closest
+        * power of 2 (so we can use a mask instead of modulo) below half
+        * the allocated entries.
+        */
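+       /* e.g. 2048 allocated entries give a threshold of 1024 */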
+       rcd->egrbufs.threshold =
+               rounddown_pow_of_two(rcd->egrbufs.alloced / 2);
+       /*
+        * Compute the expected RcvArray entry base. This is done after
+        * allocating the eager buffers in order to maximize the
+        * expected RcvArray entries for the context.
+        */
+       max_entries = rcd->rcv_array_groups * dd->rcv_entries.group_size;
+       egrtop = roundup(rcd->egrbufs.alloced, dd->rcv_entries.group_size);
+       rcd->expected_count = max_entries - egrtop;
+       if (rcd->expected_count > MAX_TID_PAIR_ENTRIES * 2)
+               rcd->expected_count = MAX_TID_PAIR_ENTRIES * 2;
+
+       rcd->expected_base = rcd->eager_base + egrtop;
+       hfi1_cdbg(PROC, "ctxt%u: eager:%u, exp:%u, egrbase:%u, expbase:%u\n",
+                 rcd->ctxt, rcd->egrbufs.alloced, rcd->expected_count,
+                 rcd->eager_base, rcd->expected_base);
+
+       if (!hfi1_rcvbuf_validate(rcd->egrbufs.rcvtid_size, PT_EAGER, &order)) {
+               hfi1_cdbg(PROC,
+                         "ctxt%u: current Eager buffer size is invalid %u\n",
+                         rcd->ctxt, rcd->egrbufs.rcvtid_size);
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       for (idx = 0; idx < rcd->egrbufs.alloced; idx++) {
+               hfi1_put_tid(dd, rcd->eager_base + idx, PT_EAGER,
+                            rcd->egrbufs.rcvtids[idx].phys, order);
+               cond_resched();
+       }
+       goto bail;
+
+bail_rcvegrbuf_phys:
+       for (idx = 0; idx < rcd->egrbufs.alloced &&
+            rcd->egrbufs.buffers[idx].addr;
+            idx++) {
+               dma_free_coherent(&dd->pcidev->dev,
+                                 rcd->egrbufs.buffers[idx].len,
+                                 rcd->egrbufs.buffers[idx].addr,
+                                 rcd->egrbufs.buffers[idx].phys);
+               rcd->egrbufs.buffers[idx].addr = NULL;
+               rcd->egrbufs.buffers[idx].phys = 0;
+               rcd->egrbufs.buffers[idx].len = 0;
+       }
+bail:
+       return ret;
+}
diff --git a/drivers/infiniband/hw/hfi1/intr.c b/drivers/infiniband/hw/hfi1/intr.c
new file mode 100644 (file)
index 0000000..65348d1
--- /dev/null
@@ -0,0 +1,200 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/pci.h>
+#include <linux/delay.h>
+
+#include "hfi.h"
+#include "common.h"
+#include "sdma.h"
+
+/**
+ * format_hwmsg - format a single hwerror message
+ * @msg: message buffer
+ * @msgl: length of message buffer
+ * @hwmsg: message to add to message buffer
+ */
+static void format_hwmsg(char *msg, size_t msgl, const char *hwmsg)
+{
+       strlcat(msg, "[", msgl);
+       strlcat(msg, hwmsg, msgl);
+       strlcat(msg, "]", msgl);
+}
+
+/**
+ * hfi1_format_hwerrors - format hardware error messages for display
+ * @hwerrs: hardware errors bit vector
+ * @hwerrmsgs: hardware error descriptions
+ * @nhwerrmsgs: number of hwerrmsgs
+ * @msg: message buffer
+ * @msgl: message buffer length
+ */
+void hfi1_format_hwerrors(u64 hwerrs, const struct hfi1_hwerror_msgs *hwerrmsgs,
+                         size_t nhwerrmsgs, char *msg, size_t msgl)
+{
+       int i;
+
+       for (i = 0; i < nhwerrmsgs; i++)
+               if (hwerrs & hwerrmsgs[i].mask)
+                       format_hwmsg(msg, msgl, hwerrmsgs[i].msg);
+}
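+
+/*
+ * Illustrative use (with a hypothetical table[] of struct hfi1_hwerror_msgs):
+ *
+ *      char msg[128] = "";
+ *
+ *      hfi1_format_hwerrors(hwerrs, table, ARRAY_SIZE(table),
+ *                           msg, sizeof(msg));
+ *
+ * Each set bit in hwerrs appends its bracketed description to msg.
+ */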
+
+static void signal_ib_event(struct hfi1_pportdata *ppd, enum ib_event_type ev)
+{
+       struct ib_event event;
+       struct hfi1_devdata *dd = ppd->dd;
+
+       /*
+        * Only call ib_dispatch_event() if the IB device has been
+        * registered.  HFI1_INITTED is set iff the driver has successfully
+        * registered with the IB core.
+        */
+       if (!(dd->flags & HFI1_INITTED))
+               return;
+       event.device = &dd->verbs_dev.rdi.ibdev;
+       event.element.port_num = ppd->port;
+       event.event = ev;
+       ib_dispatch_event(&event);
+}
+
+/*
+ * Handle a linkup or link down notification.
+ * This is called outside an interrupt.
+ */
+void handle_linkup_change(struct hfi1_devdata *dd, u32 linkup)
+{
+       struct hfi1_pportdata *ppd = &dd->pport[0];
+       enum ib_event_type ev;
+
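+       /* !!linkup normalizes the flag to 0/1 so the XOR detects a change */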
+       if (!(ppd->linkup ^ !!linkup))
+               return; /* no change, nothing to do */
+
+       if (linkup) {
+               /*
+                * Quick linkup and all link up on the simulator does not
+                * trigger or implement:
+                *      - VerifyCap interrupt
+                *      - VerifyCap frames
+                * But rather moves directly to LinkUp.
+                *
+                * Do the work of the VerifyCap interrupt handler,
+                * handle_verify_cap(), but do not try moving the state to
+                * LinkUp as we are already there.
+                *
+                * NOTE: This uses this device's vAU, vCU, and vl15_init for
+                * the remote values.  Both sides must be using the same values.
+                */
+               if (quick_linkup || dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
+                       set_up_vl15(dd, dd->vau, dd->vl15_init);
+                       assign_remote_cm_au_table(dd, dd->vcu);
+                       ppd->neighbor_guid =
+                               read_csr(dd, DC_DC8051_STS_REMOTE_GUID);
+                       ppd->neighbor_type =
+                               read_csr(dd, DC_DC8051_STS_REMOTE_NODE_TYPE) &
+                                       DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK;
+                       ppd->neighbor_port_number =
+                               read_csr(dd, DC_DC8051_STS_REMOTE_PORT_NO) &
+                                        DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK;
+                       dd_dev_info(dd, "Neighbor GUID: %llx Neighbor type %d\n",
+                                   ppd->neighbor_guid,
+                                   ppd->neighbor_type);
+               }
+
+               /* physical link went up */
+               ppd->linkup = 1;
+               ppd->offline_disabled_reason =
+                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE);
+
+               /* link widths are not available until the link is fully up */
+               get_linkup_link_widths(ppd);
+
+       } else {
+               /* physical link went down */
+               ppd->linkup = 0;
+
+               /* clear HW details of the previous connection */
+               reset_link_credits(dd);
+
+               /* freeze after a link down to guarantee a clean egress */
+               start_freeze_handling(ppd, FREEZE_SELF | FREEZE_LINK_DOWN);
+
+               ev = IB_EVENT_PORT_ERR;
+
+               hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LINKDOWN_BIT);
+
+               /* if we are down, the neighbor is down */
+               ppd->neighbor_normal = 0;
+
+               /* notify IB of the link change */
+               signal_ib_event(ppd, ev);
+       }
+}
+
+/*
+ * Handle receive or urgent interrupts for user contexts.  This means a user
+ * process was waiting for a packet to arrive, and didn't want to poll.
+ */
+void handle_user_interrupt(struct hfi1_ctxtdata *rcd)
+{
+       struct hfi1_devdata *dd = rcd->dd;
+       unsigned long flags;
+
+       spin_lock_irqsave(&dd->uctxt_lock, flags);
+       if (!rcd->cnt)
+               goto done;
+
+       if (test_and_clear_bit(HFI1_CTXT_WAITING_RCV, &rcd->event_flags)) {
+               wake_up_interruptible(&rcd->wait);
+               hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_DIS, rcd->ctxt);
+       } else if (test_and_clear_bit(HFI1_CTXT_WAITING_URG,
+                                                       &rcd->event_flags)) {
+               rcd->urgent++;
+               wake_up_interruptible(&rcd->wait);
+       }
+done:
+       spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+}
diff --git a/drivers/infiniband/hw/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h
new file mode 100644 (file)
index 0000000..2ec6ef3
--- /dev/null
@@ -0,0 +1,300 @@
+#ifndef _HFI1_IOWAIT_H
+#define _HFI1_IOWAIT_H
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/sched.h>
+
+#include "sdma_txreq.h"
+
+/*
+ * typedef (*restart_t)() - restart callback
+ * @work: pointer to work structure
+ */
+typedef void (*restart_t)(struct work_struct *work);
+
+struct sdma_txreq;
+struct sdma_engine;
+/**
+ * struct iowait - linkage for delayed progress/waiting
+ * @list: used to add/insert into QP/PQ wait lists
+ * @tx_head: overflow list of sdma_txreq's
+ * @sleep: no space callback
+ * @wakeup: space callback wakeup
+ * @sdma_drained: sdma count drained
+ * @iowork: workqueue overhead
+ * @wait_dma: wait for sdma_busy == 0
+ * @wait_pio: wait for pio_busy == 0
+ * @sdma_busy: # of packets in flight
+ * @count: total number of descriptors in tx_head'ed list
+ * @tx_limit: limit for overflow queuing
+ * @tx_count: number of tx entries in tx_head'ed list
+ *
+ * This is to be embedded in user's state structure
+ * (QP or PQ).
+ *
+ * The sleep and wakeup members are a
+ * bit misnamed.  They do not strictly
+ * speaking sleep or wake up, but they
+ * are callbacks for the ULP to implement
+ * whatever queuing/dequeuing of
+ * the embedded iowait and its containing struct
+ * is needed when a resource shortage like SDMA
+ * ring space is seen.
+ *
+ * Both are potentially called with locks held,
+ * so sleeping is not allowed.
+ *
+ * The wait_dma member along with the iow
+ */
+
+struct iowait {
+       struct list_head list;
+       struct list_head tx_head;
+       int (*sleep)(
+               struct sdma_engine *sde,
+               struct iowait *wait,
+               struct sdma_txreq *tx,
+               unsigned seq);
+       void (*wakeup)(struct iowait *wait, int reason);
+       void (*sdma_drained)(struct iowait *wait);
+       struct work_struct iowork;
+       wait_queue_head_t wait_dma;
+       wait_queue_head_t wait_pio;
+       atomic_t sdma_busy;
+       atomic_t pio_busy;
+       u32 count;
+       u32 tx_limit;
+       u32 tx_count;
+};
+
+#define SDMA_AVAIL_REASON 0
+
+/**
+ * iowait_init() - initialize wait structure
+ * @wait: wait struct to initialize
+ * @tx_limit: limit for overflow queuing
+ * @func: restart function for workqueue
+ * @sleep: sleep function for no space
+ * @wakeup: wakeup function for no space
+ * @sdma_drained: sdma count drained callback
+ *
+ * This function initializes the iowait
+ * structure embedded in the QP or PQ.
+ *
+ */
+
+static inline void iowait_init(
+       struct iowait *wait,
+       u32 tx_limit,
+       void (*func)(struct work_struct *work),
+       int (*sleep)(
+               struct sdma_engine *sde,
+               struct iowait *wait,
+               struct sdma_txreq *tx,
+               unsigned seq),
+       void (*wakeup)(struct iowait *wait, int reason),
+       void (*sdma_drained)(struct iowait *wait))
+{
+       wait->count = 0;
+       INIT_LIST_HEAD(&wait->list);
+       INIT_LIST_HEAD(&wait->tx_head);
+       INIT_WORK(&wait->iowork, func);
+       init_waitqueue_head(&wait->wait_dma);
+       init_waitqueue_head(&wait->wait_pio);
+       atomic_set(&wait->sdma_busy, 0);
+       atomic_set(&wait->pio_busy, 0);
+       wait->tx_limit = tx_limit;
+       wait->sleep = sleep;
+       wait->wakeup = wakeup;
+       wait->sdma_drained = sdma_drained;
+}
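+
+/*
+ * Example (hypothetical ULP code): embed an iowait in the ULP's private
+ * structure and initialize it once, e.g. at QP creation time:
+ *
+ *      struct my_qp_priv {
+ *              struct iowait wait;
+ *      };
+ *
+ *      iowait_init(&priv->wait, 0, my_restart_work, my_sleep,
+ *                  my_wakeup, my_sdma_drained);
+ */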
+
+/**
+ * iowait_schedule() - schedule the wait structure's work
+ * @wait: wait struct to schedule
+ * @wq: workqueue for schedule
+ * @cpu: cpu
+ */
+static inline void iowait_schedule(
+       struct iowait *wait,
+       struct workqueue_struct *wq,
+       int cpu)
+{
+       queue_work_on(cpu, wq, &wait->iowork);
+}
+
+/**
+ * iowait_sdma_drain() - wait for DMAs to drain
+ *
+ * @wait: iowait structure
+ *
+ * This will delay until the iowait sdmas have
+ * completed.
+ */
+static inline void iowait_sdma_drain(struct iowait *wait)
+{
+       wait_event(wait->wait_dma, !atomic_read(&wait->sdma_busy));
+}
+
+/**
+ * iowait_sdma_pending() - return sdma pending count
+ *
+ * @wait: iowait structure
+ *
+ */
+static inline int iowait_sdma_pending(struct iowait *wait)
+{
+       return atomic_read(&wait->sdma_busy);
+}
+
+/**
+ * iowait_sdma_inc - note sdma io pending
+ * @wait: iowait structure
+ */
+static inline void iowait_sdma_inc(struct iowait *wait)
+{
+       atomic_inc(&wait->sdma_busy);
+}
+
+/**
+ * iowait_sdma_add - add count to pending
+ * @wait: iowait structure
+ * @count: number of sdma descriptors to add to the pending count
+ */
+static inline void iowait_sdma_add(struct iowait *wait, int count)
+{
+       atomic_add(count, &wait->sdma_busy);
+}
+
+/**
+ * iowait_sdma_dec - note sdma complete
+ * @wait: iowait structure
+ */
+static inline int iowait_sdma_dec(struct iowait *wait)
+{
+       return atomic_dec_and_test(&wait->sdma_busy);
+}
+
+/**
+ * iowait_pio_drain() - wait for pios to drain
+ *
+ * @wait: iowait structure
+ *
+ * This will delay until the iowait pios have
+ * completed, or for at most one second (HZ jiffies).
+ */
+static inline void iowait_pio_drain(struct iowait *wait)
+{
+       wait_event_timeout(wait->wait_pio,
+                          !atomic_read(&wait->pio_busy),
+                          HZ);
+}
+
+/**
+ * iowait_pio_pending() - return pio pending count
+ *
+ * @wait: iowait structure
+ *
+ */
+static inline int iowait_pio_pending(struct iowait *wait)
+{
+       return atomic_read(&wait->pio_busy);
+}
+
+/**
+ * iowait_pio_inc - note pio pending
+ * @wait: iowait structure
+ */
+static inline void iowait_pio_inc(struct iowait *wait)
+{
+       atomic_inc(&wait->pio_busy);
+}
+
+/**
+ * iowait_pio_dec - note pio complete
+ * @wait: iowait structure
+ */
+static inline int iowait_pio_dec(struct iowait *wait)
+{
+       return atomic_dec_and_test(&wait->pio_busy);
+}
+
+/**
+ * iowait_drain_wakeup() - wake any sdma or pio drain waiters
+ *
+ * @wait: iowait structure
+ *
+ * This will trigger any waiters.
+ */
+static inline void iowait_drain_wakeup(struct iowait *wait)
+{
+       wake_up(&wait->wait_dma);
+       wake_up(&wait->wait_pio);
+       if (wait->sdma_drained)
+               wait->sdma_drained(wait);
+}
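+
+/*
+ * Note: a sketch of how these helpers are meant to compose (not mandated
+ * by this header): submitters call iowait_sdma_inc()/iowait_pio_inc()
+ * before handing work to the engine, completions call the matching
+ * _dec() helpers and then iowait_drain_wakeup() once a count reaches
+ * zero, and teardown blocks in iowait_sdma_drain()/iowait_pio_drain()
+ * until the counts are zero.
+ */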
+
+/**
+ * iowait_get_txhead() - get packet off of iowait list
+ *
+ * @wait: wait structure
+ */
+static inline struct sdma_txreq *iowait_get_txhead(struct iowait *wait)
+{
+       struct sdma_txreq *tx = NULL;
+
+       if (!list_empty(&wait->tx_head)) {
+               tx = list_first_entry(
+                       &wait->tx_head,
+                       struct sdma_txreq,
+                       list);
+               list_del_init(&tx->list);
+       }
+       return tx;
+}
+
+#endif
diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c
new file mode 100644 (file)
index 0000000..2190295
--- /dev/null
@@ -0,0 +1,4449 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/net.h>
+
+#include "hfi.h"
+#include "mad.h"
+#include "trace.h"
+#include "qp.h"
+
+#define OPA_NUM_PKEY_BLOCKS_PER_SMP (OPA_SMP_DR_DATA_SIZE \
+                       / (OPA_PARTITION_TABLE_BLK_SIZE * sizeof(u16)))
+
+/* the reset value from the FM is supposed to be 0xffff, handle both */
+#define OPA_LINK_WIDTH_RESET_OLD 0x0fff
+#define OPA_LINK_WIDTH_RESET 0xffff
+
+static int reply(struct ib_mad_hdr *smp)
+{
+       /*
+        * The verbs framework will handle the directed/LID route
+        * packet changes.
+        */
+       smp->method = IB_MGMT_METHOD_GET_RESP;
+       if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+               smp->status |= IB_SMP_DIRECTION;
+       return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+static inline void clear_opa_smp_data(struct opa_smp *smp)
+{
+       void *data = opa_get_smp_data(smp);
+       size_t size = opa_get_smp_data_size(smp);
+
+       memset(data, 0, size);
+}
+
+static void send_trap(struct hfi1_ibport *ibp, void *data, unsigned len)
+{
+       struct ib_mad_send_buf *send_buf;
+       struct ib_mad_agent *agent;
+       struct opa_smp *smp;
+       int ret;
+       unsigned long flags;
+       unsigned long timeout;
+       int pkey_idx;
+       u32 qpn = ppd_from_ibp(ibp)->sm_trap_qp;
+
+       agent = ibp->rvp.send_agent;
+       if (!agent)
+               return;
+
+       /* o14-3.2.1 */
+       if (ppd_from_ibp(ibp)->lstate != IB_PORT_ACTIVE)
+               return;
+
+       /* o14-2 */
+       if (ibp->rvp.trap_timeout && time_before(jiffies,
+                                                ibp->rvp.trap_timeout))
+               return;
+
+       pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY);
+       if (pkey_idx < 0) {
+               pr_warn("%s: failed to find limited mgmt pkey, defaulting to 0x%x\n",
+                       __func__, hfi1_get_pkey(ibp, 1));
+               pkey_idx = 1;
+       }
+
+       send_buf = ib_create_send_mad(agent, qpn, pkey_idx, 0,
+                                     IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
+                                     GFP_ATOMIC, IB_MGMT_BASE_VERSION);
+       if (IS_ERR(send_buf))
+               return;
+
+       smp = send_buf->mad;
+       smp->base_version = OPA_MGMT_BASE_VERSION;
+       smp->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+       smp->class_version = OPA_SMI_CLASS_VERSION;
+       smp->method = IB_MGMT_METHOD_TRAP;
+       ibp->rvp.tid++;
+       smp->tid = cpu_to_be64(ibp->rvp.tid);
+       smp->attr_id = IB_SMP_ATTR_NOTICE;
+       /* o14-1: smp->mkey = 0; */
+       memcpy(smp->route.lid.data, data, len);
+
+       spin_lock_irqsave(&ibp->rvp.lock, flags);
+       if (!ibp->rvp.sm_ah) {
+               if (ibp->rvp.sm_lid != be16_to_cpu(IB_LID_PERMISSIVE)) {
+                       struct ib_ah *ah;
+
+                       ah = hfi1_create_qp0_ah(ibp, ibp->rvp.sm_lid);
+                       if (IS_ERR(ah)) {
+                               ret = PTR_ERR(ah);
+                       } else {
+                               send_buf->ah = ah;
+                               ibp->rvp.sm_ah = ibah_to_rvtah(ah);
+                               ret = 0;
+                       }
+               } else {
+                       ret = -EINVAL;
+               }
+       } else {
+               send_buf->ah = &ibp->rvp.sm_ah->ibah;
+               ret = 0;
+       }
+       spin_unlock_irqrestore(&ibp->rvp.lock, flags);
+
+       if (!ret)
+               ret = ib_post_send_mad(send_buf, NULL);
+       if (!ret) {
+               /* 4.096 usec * 2^subnet_timeout (in ns), converted to usec */
+               timeout = (4096 * (1UL << ibp->rvp.subnet_timeout)) / 1000;
+               ibp->rvp.trap_timeout = jiffies + usecs_to_jiffies(timeout);
+       } else {
+               ib_free_send_mad(send_buf);
+               ibp->rvp.trap_timeout = 0;
+       }
+}
+
+/*
+ * Send a bad [PQ]_Key trap (ch. 14.3.8).
+ */
+void hfi1_bad_pqkey(struct hfi1_ibport *ibp, __be16 trap_num, u32 key, u32 sl,
+                   u32 qp1, u32 qp2, u16 lid1, u16 lid2)
+{
+       struct opa_mad_notice_attr data;
+       u32 lid = ppd_from_ibp(ibp)->lid;
+       u32 _lid1 = lid1;
+       u32 _lid2 = lid2;
+
+       memset(&data, 0, sizeof(data));
+
+       if (trap_num == OPA_TRAP_BAD_P_KEY)
+               ibp->rvp.pkey_violations++;
+       else
+               ibp->rvp.qkey_violations++;
+       ibp->rvp.n_pkt_drops++;
+
+       /* Send violation trap */
+       data.generic_type = IB_NOTICE_TYPE_SECURITY;
+       data.prod_type_lsb = IB_NOTICE_PROD_CA;
+       data.trap_num = trap_num;
+       data.issuer_lid = cpu_to_be32(lid);
+       data.ntc_257_258.lid1 = cpu_to_be32(_lid1);
+       data.ntc_257_258.lid2 = cpu_to_be32(_lid2);
+       data.ntc_257_258.key = cpu_to_be32(key);
+       data.ntc_257_258.sl = sl << 3;
+       data.ntc_257_258.qp1 = cpu_to_be32(qp1);
+       data.ntc_257_258.qp2 = cpu_to_be32(qp2);
+
+       send_trap(ibp, &data, sizeof(data));
+}
+
+/*
+ * Send a bad M_Key trap (ch. 14.3.9).
+ */
+static void bad_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad,
+                    __be64 mkey, __be32 dr_slid, u8 return_path[], u8 hop_cnt)
+{
+       struct opa_mad_notice_attr data;
+       u32 lid = ppd_from_ibp(ibp)->lid;
+
+       memset(&data, 0, sizeof(data));
+       /* Send violation trap */
+       data.generic_type = IB_NOTICE_TYPE_SECURITY;
+       data.prod_type_lsb = IB_NOTICE_PROD_CA;
+       data.trap_num = OPA_TRAP_BAD_M_KEY;
+       data.issuer_lid = cpu_to_be32(lid);
+       data.ntc_256.lid = data.issuer_lid;
+       data.ntc_256.method = mad->method;
+       data.ntc_256.attr_id = mad->attr_id;
+       data.ntc_256.attr_mod = mad->attr_mod;
+       data.ntc_256.mkey = mkey;
+       if (mad->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+               data.ntc_256.dr_slid = dr_slid;
+               data.ntc_256.dr_trunc_hop = IB_NOTICE_TRAP_DR_NOTICE;
+               if (hop_cnt > ARRAY_SIZE(data.ntc_256.dr_rtn_path)) {
+                       data.ntc_256.dr_trunc_hop |=
+                               IB_NOTICE_TRAP_DR_TRUNC;
+                       hop_cnt = ARRAY_SIZE(data.ntc_256.dr_rtn_path);
+               }
+               data.ntc_256.dr_trunc_hop |= hop_cnt;
+               memcpy(data.ntc_256.dr_rtn_path, return_path,
+                      hop_cnt);
+       }
+
+       send_trap(ibp, &data, sizeof(data));
+}
+
+/*
+ * Send a Port Capability Mask Changed trap (ch. 14.3.11).
+ */
+void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num)
+{
+       struct opa_mad_notice_attr data;
+       struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
+       struct hfi1_devdata *dd = dd_from_dev(verbs_dev);
+       struct hfi1_ibport *ibp = &dd->pport[port_num - 1].ibport_data;
+       u32 lid = ppd_from_ibp(ibp)->lid;
+
+       memset(&data, 0, sizeof(data));
+
+       data.generic_type = IB_NOTICE_TYPE_INFO;
+       data.prod_type_lsb = IB_NOTICE_PROD_CA;
+       data.trap_num = OPA_TRAP_CHANGE_CAPABILITY;
+       data.issuer_lid = cpu_to_be32(lid);
+       data.ntc_144.lid = data.issuer_lid;
+       data.ntc_144.new_cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags);
+
+       send_trap(ibp, &data, sizeof(data));
+}
+
+/*
+ * Send a System Image GUID Changed trap (ch. 14.3.12).
+ */
+void hfi1_sys_guid_chg(struct hfi1_ibport *ibp)
+{
+       struct opa_mad_notice_attr data;
+       u32 lid = ppd_from_ibp(ibp)->lid;
+
+       memset(&data, 0, sizeof(data));
+
+       data.generic_type = IB_NOTICE_TYPE_INFO;
+       data.prod_type_lsb = IB_NOTICE_PROD_CA;
+       data.trap_num = OPA_TRAP_CHANGE_SYSGUID;
+       data.issuer_lid = cpu_to_be32(lid);
+       data.ntc_145.new_sys_guid = ib_hfi1_sys_image_guid;
+       data.ntc_145.lid = data.issuer_lid;
+
+       send_trap(ibp, &data, sizeof(data));
+}
+
+/*
+ * Send a Node Description Changed trap (ch. 14.3.13).
+ */
+void hfi1_node_desc_chg(struct hfi1_ibport *ibp)
+{
+       struct opa_mad_notice_attr data;
+       u32 lid = ppd_from_ibp(ibp)->lid;
+
+       memset(&data, 0, sizeof(data));
+
+       data.generic_type = IB_NOTICE_TYPE_INFO;
+       data.prod_type_lsb = IB_NOTICE_PROD_CA;
+       data.trap_num = OPA_TRAP_CHANGE_CAPABILITY;
+       data.issuer_lid = cpu_to_be32(lid);
+       data.ntc_144.lid = data.issuer_lid;
+       data.ntc_144.change_flags =
+               cpu_to_be16(OPA_NOTICE_TRAP_NODE_DESC_CHG);
+
+       send_trap(ibp, &data, sizeof(data));
+}
+
+static int __subn_get_opa_nodedesc(struct opa_smp *smp, u32 am,
+                                  u8 *data, struct ib_device *ibdev,
+                                  u8 port, u32 *resp_len)
+{
+       struct opa_node_description *nd;
+
+       if (am) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       nd = (struct opa_node_description *)data;
+
+       memcpy(nd->data, ibdev->node_desc, sizeof(nd->data));
+
+       if (resp_len)
+               *resp_len += sizeof(*nd);
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_get_opa_nodeinfo(struct opa_smp *smp, u32 am, u8 *data,
+                                  struct ib_device *ibdev, u8 port,
+                                  u32 *resp_len)
+{
+       struct opa_node_info *ni;
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       unsigned pidx = port - 1; /* IB numbers ports from 1, hw from 0 */
+
+       ni = (struct opa_node_info *)data;
+
+       /* GUID 0 is illegal */
+       if (am || pidx >= dd->num_pports || dd->pport[pidx].guid == 0) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       ni->port_guid = cpu_to_be64(dd->pport[pidx].guid);
+       ni->base_version = OPA_MGMT_BASE_VERSION;
+       ni->class_version = OPA_SMI_CLASS_VERSION;
+       ni->node_type = 1;     /* channel adapter */
+       ni->num_ports = ibdev->phys_port_cnt;
+       /* This is already in network order */
+       ni->system_image_guid = ib_hfi1_sys_image_guid;
+       /* Use first-port GUID as node */
+       ni->node_guid = cpu_to_be64(dd->pport->guid);
+       ni->partition_cap = cpu_to_be16(hfi1_get_npkeys(dd));
+       ni->device_id = cpu_to_be16(dd->pcidev->device);
+       ni->revision = cpu_to_be32(dd->minrev);
+       ni->local_port_num = port;
+       ni->vendor_id[0] = dd->oui1;
+       ni->vendor_id[1] = dd->oui2;
+       ni->vendor_id[2] = dd->oui3;
+
+       if (resp_len)
+               *resp_len += sizeof(*ni);
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+static int subn_get_nodeinfo(struct ib_smp *smp, struct ib_device *ibdev,
+                            u8 port)
+{
+       struct ib_node_info *nip = (struct ib_node_info *)&smp->data;
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       unsigned pidx = port - 1; /* IB numbers ports from 1, hw from 0 */
+
+       /* GUID 0 is illegal */
+       if (smp->attr_mod || pidx >= dd->num_pports ||
+           dd->pport[pidx].guid == 0)
+               smp->status |= IB_SMP_INVALID_FIELD;
+       else
+               nip->port_guid = cpu_to_be64(dd->pport[pidx].guid);
+
+       nip->base_version = OPA_MGMT_BASE_VERSION;
+       nip->class_version = OPA_SMI_CLASS_VERSION;
+       nip->node_type = 1;     /* channel adapter */
+       nip->num_ports = ibdev->phys_port_cnt;
+       /* This is already in network order */
+       nip->sys_guid = ib_hfi1_sys_image_guid;
+       /* Use first-port GUID as node */
+       nip->node_guid = cpu_to_be64(dd->pport->guid);
+       nip->partition_cap = cpu_to_be16(hfi1_get_npkeys(dd));
+       nip->device_id = cpu_to_be16(dd->pcidev->device);
+       nip->revision = cpu_to_be32(dd->minrev);
+       nip->local_port_num = port;
+       nip->vendor_id[0] = dd->oui1;
+       nip->vendor_id[1] = dd->oui2;
+       nip->vendor_id[2] = dd->oui3;
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+static void set_link_width_enabled(struct hfi1_pportdata *ppd, u32 w)
+{
+       (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LWID_ENB, w);
+}
+
+static void set_link_width_downgrade_enabled(struct hfi1_pportdata *ppd, u32 w)
+{
+       (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LWID_DG_ENB, w);
+}
+
+static void set_link_speed_enabled(struct hfi1_pportdata *ppd, u32 s)
+{
+       (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_SPD_ENB, s);
+}
+
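+/*
+ * check_mkey - validate the M_Key of an incoming MAD
+ *
+ * Returns 0 if the MAD may be processed normally, or 1 if the M_Key
+ * check failed and a bad M_Key trap was generated via bad_mkey().
+ */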
+static int check_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad,
+                     int mad_flags, __be64 mkey, __be32 dr_slid,
+                     u8 return_path[], u8 hop_cnt)
+{
+       int valid_mkey = 0;
+       int ret = 0;
+
+       /* Is the mkey in the process of expiring? */
+       if (ibp->rvp.mkey_lease_timeout &&
+           time_after_eq(jiffies, ibp->rvp.mkey_lease_timeout)) {
+               /* Clear timeout and mkey protection field. */
+               ibp->rvp.mkey_lease_timeout = 0;
+               ibp->rvp.mkeyprot = 0;
+       }
+
+       if ((mad_flags & IB_MAD_IGNORE_MKEY) ||  ibp->rvp.mkey == 0 ||
+           ibp->rvp.mkey == mkey)
+               valid_mkey = 1;
+
+       /* Unset lease timeout on any valid Get/Set/TrapRepress */
+       if (valid_mkey && ibp->rvp.mkey_lease_timeout &&
+           (mad->method == IB_MGMT_METHOD_GET ||
+            mad->method == IB_MGMT_METHOD_SET ||
+            mad->method == IB_MGMT_METHOD_TRAP_REPRESS))
+               ibp->rvp.mkey_lease_timeout = 0;
+
+       if (!valid_mkey) {
+               switch (mad->method) {
+               case IB_MGMT_METHOD_GET:
+                       /* Bad mkey not a violation below level 2 */
+                       if (ibp->rvp.mkeyprot < 2)
+                               break;
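+                       /* fall through */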
+               case IB_MGMT_METHOD_SET:
+               case IB_MGMT_METHOD_TRAP_REPRESS:
+                       if (ibp->rvp.mkey_violations != 0xFFFF)
+                               ++ibp->rvp.mkey_violations;
+                       if (!ibp->rvp.mkey_lease_timeout &&
+                           ibp->rvp.mkey_lease_period)
+                               ibp->rvp.mkey_lease_timeout = jiffies +
+                                       ibp->rvp.mkey_lease_period * HZ;
+                       /* Generate a trap notice. */
+                       bad_mkey(ibp, mad, mkey, dr_slid, return_path,
+                                hop_cnt);
+                       ret = 1;
+               }
+       }
+
+       return ret;
+}
+
+/*
+ * The SMA caches reads from LCB registers in case the LCB is unavailable.
+ * (The LCB is unavailable in certain link states, for example.)
+ */
+struct lcb_datum {
+       u32 off;
+       u64 val;
+};
+
+static struct lcb_datum lcb_cache[] = {
+       { DC_LCB_STS_ROUND_TRIP_LTP_CNT, 0 },
+};
+
+static int write_lcb_cache(u32 off, u64 val)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(lcb_cache); i++) {
+               if (lcb_cache[i].off == off) {
+                       lcb_cache[i].val = val;
+                       return 0;
+               }
+       }
+
+       pr_warn("%s bad offset 0x%x\n", __func__, off);
+       return -1;
+}
+
+static int read_lcb_cache(u32 off, u64 *val)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(lcb_cache); i++) {
+               if (lcb_cache[i].off == off) {
+                       *val = lcb_cache[i].val;
+                       return 0;
+               }
+       }
+
+       pr_warn("%s bad offset 0x%x\n", __func__, off);
+       return -1;
+}
+
+void read_ltp_rtt(struct hfi1_devdata *dd)
+{
+       u64 reg;
+
+       if (read_lcb_csr(dd, DC_LCB_STS_ROUND_TRIP_LTP_CNT, &reg))
+               dd_dev_err(dd, "%s: unable to read LTP RTT\n", __func__);
+       else
+               write_lcb_cache(DC_LCB_STS_ROUND_TRIP_LTP_CNT, reg);
+}
+
+static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
+                                  struct ib_device *ibdev, u8 port,
+                                  u32 *resp_len)
+{
+       int i;
+       struct hfi1_devdata *dd;
+       struct hfi1_pportdata *ppd;
+       struct hfi1_ibport *ibp;
+       struct opa_port_info *pi = (struct opa_port_info *)data;
+       u8 mtu;
+       u8 credit_rate;
+       u8 is_beaconing_active;
+       u32 state;
+       u32 num_ports = OPA_AM_NPORT(am);
+       u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
+       u32 buffer_units;
+       u64 tmp = 0;
+
+       if (num_ports != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       dd = dd_from_ibdev(ibdev);
+       /* IB numbers ports from 1, hw from 0 */
+       ppd = dd->pport + (port - 1);
+       ibp = &ppd->ibport_data;
+
+       if (ppd->vls_supported / 2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) ||
+           ppd->vls_supported > ARRAY_SIZE(dd->vld)) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       pi->lid = cpu_to_be32(ppd->lid);
+
+       /* Only return the mkey if the protection field allows it. */
+       if (!(smp->method == IB_MGMT_METHOD_GET &&
+             ibp->rvp.mkey != smp->mkey &&
+             ibp->rvp.mkeyprot == 1))
+               pi->mkey = ibp->rvp.mkey;
+
+       pi->subnet_prefix = ibp->rvp.gid_prefix;
+       pi->sm_lid = cpu_to_be32(ibp->rvp.sm_lid);
+       pi->ib_cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags);
+       pi->mkey_lease_period = cpu_to_be16(ibp->rvp.mkey_lease_period);
+       pi->sm_trap_qp = cpu_to_be32(ppd->sm_trap_qp);
+       pi->sa_qp = cpu_to_be32(ppd->sa_qp);
+
+       pi->link_width.enabled = cpu_to_be16(ppd->link_width_enabled);
+       pi->link_width.supported = cpu_to_be16(ppd->link_width_supported);
+       pi->link_width.active = cpu_to_be16(ppd->link_width_active);
+
+       pi->link_width_downgrade.supported =
+                       cpu_to_be16(ppd->link_width_downgrade_supported);
+       pi->link_width_downgrade.enabled =
+                       cpu_to_be16(ppd->link_width_downgrade_enabled);
+       pi->link_width_downgrade.tx_active =
+                       cpu_to_be16(ppd->link_width_downgrade_tx_active);
+       pi->link_width_downgrade.rx_active =
+                       cpu_to_be16(ppd->link_width_downgrade_rx_active);
+
+       pi->link_speed.supported = cpu_to_be16(ppd->link_speed_supported);
+       pi->link_speed.active = cpu_to_be16(ppd->link_speed_active);
+       pi->link_speed.enabled = cpu_to_be16(ppd->link_speed_enabled);
+
+       state = driver_lstate(ppd);
+
+       if (start_of_sm_config && (state == IB_PORT_INIT))
+               ppd->is_sm_config_started = 1;
+
+       pi->port_phys_conf = (ppd->port_type & 0xf);
+
+#if PI_LED_ENABLE_SUP
+       pi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4;
+       pi->port_states.ledenable_offlinereason |=
+               ppd->is_sm_config_started << 5;
+       /*
+        * This pairs with the memory barrier in hfi1_start_led_override to
+        * ensure that we read the correct state of LED beaconing represented
+        * by led_override_timer_active
+        */
+       smp_rmb();
+       is_beaconing_active = !!atomic_read(&ppd->led_override_timer_active);
+       pi->port_states.ledenable_offlinereason |= is_beaconing_active << 6;
+       pi->port_states.ledenable_offlinereason |=
+               ppd->offline_disabled_reason;
+#else
+       pi->port_states.offline_reason = ppd->neighbor_normal << 4;
+       pi->port_states.offline_reason |= ppd->is_sm_config_started << 5;
+       pi->port_states.offline_reason |= ppd->offline_disabled_reason;
+#endif /* PI_LED_ENABLE_SUP */
+
+       pi->port_states.portphysstate_portstate =
+               (hfi1_ibphys_portstate(ppd) << 4) | state;
+
+       pi->mkeyprotect_lmc = (ibp->rvp.mkeyprot << 6) | ppd->lmc;
+
+       memset(pi->neigh_mtu.pvlx_to_mtu, 0, sizeof(pi->neigh_mtu.pvlx_to_mtu));
+       for (i = 0; i < ppd->vls_supported; i++) {
+               mtu = mtu_to_enum(dd->vld[i].mtu, HFI1_DEFAULT_ACTIVE_MTU);
+               if ((i % 2) == 0)
+                       pi->neigh_mtu.pvlx_to_mtu[i / 2] |= (mtu << 4);
+               else
+                       pi->neigh_mtu.pvlx_to_mtu[i / 2] |= mtu;
+       }
+       /* don't forget VL 15 */
+       mtu = mtu_to_enum(dd->vld[15].mtu, 2048);
+       pi->neigh_mtu.pvlx_to_mtu[15 / 2] |= mtu;
+       pi->smsl = ibp->rvp.sm_sl & OPA_PI_MASK_SMSL;
+       pi->operational_vls = hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_OP_VLS);
+       pi->partenforce_filterraw |=
+               (ppd->linkinit_reason & OPA_PI_MASK_LINKINIT_REASON);
+       if (ppd->part_enforce & HFI1_PART_ENFORCE_IN)
+               pi->partenforce_filterraw |= OPA_PI_MASK_PARTITION_ENFORCE_IN;
+       if (ppd->part_enforce & HFI1_PART_ENFORCE_OUT)
+               pi->partenforce_filterraw |= OPA_PI_MASK_PARTITION_ENFORCE_OUT;
+       pi->mkey_violations = cpu_to_be16(ibp->rvp.mkey_violations);
+       /* P_KeyViolations are counted by hardware. */
+       pi->pkey_violations = cpu_to_be16(ibp->rvp.pkey_violations);
+       pi->qkey_violations = cpu_to_be16(ibp->rvp.qkey_violations);
+
+       pi->vl.cap = ppd->vls_supported;
+       pi->vl.high_limit = cpu_to_be16(ibp->rvp.vl_high_limit);
+       pi->vl.arb_high_cap = (u8)hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_VL_HIGH_CAP);
+       pi->vl.arb_low_cap = (u8)hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_VL_LOW_CAP);
+
+       pi->clientrereg_subnettimeout = ibp->rvp.subnet_timeout;
+
+       pi->port_link_mode  = cpu_to_be16(OPA_PORT_LINK_MODE_OPA << 10 |
+                                         OPA_PORT_LINK_MODE_OPA << 5 |
+                                         OPA_PORT_LINK_MODE_OPA);
+
+       pi->port_ltp_crc_mode = cpu_to_be16(ppd->port_ltp_crc_mode);
+
+       pi->port_mode = cpu_to_be16(
+                               ppd->is_active_optimize_enabled ?
+                                       OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE : 0);
+
+       pi->port_packet_format.supported =
+               cpu_to_be16(OPA_PORT_PACKET_FORMAT_9B);
+       pi->port_packet_format.enabled =
+               cpu_to_be16(OPA_PORT_PACKET_FORMAT_9B);
+
+       /* flit_control.interleave is (OPA V1, version .76):
+        * bits         use
+        * ----         ---
+        * 2            res
+        * 2            DistanceSupported
+        * 2            DistanceEnabled
+        * 5            MaxNestLevelTxEnabled
+        * 5            MaxNestLevelRxSupported
+        *
+        * HFI supports only "distance mode 1" (see OPA V1, version .76,
+        * section 9.6.2), so set DistanceSupported, DistanceEnabled
+        * to 0x1.
+        */
+       pi->flit_control.interleave = cpu_to_be16(0x1400);
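+       /*
+        * 0x1400 == 0b0001 0100 0000 0000; with the fields above packed
+        * MSB-first this is DistanceSupported = 0x1 (bits 13:12) and
+        * DistanceEnabled = 0x1 (bits 11:10), all other fields zero.
+        */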
+
+       pi->link_down_reason = ppd->local_link_down_reason.sma;
+       pi->neigh_link_down_reason = ppd->neigh_link_down_reason.sma;
+       pi->port_error_action = cpu_to_be32(ppd->port_error_action);
+       pi->mtucap = mtu_to_enum(hfi1_max_mtu, IB_MTU_4096);
+
+       /* 32.768 usec. response time (guessing) */
+       pi->resptimevalue = 3;
+
+       pi->local_port_num = port;
+
+       /* buffer info for FM */
+       pi->overall_buffer_space = cpu_to_be16(dd->link_credits);
+
+       pi->neigh_node_guid = cpu_to_be64(ppd->neighbor_guid);
+       pi->neigh_port_num = ppd->neighbor_port_number;
+       pi->port_neigh_mode =
+               (ppd->neighbor_type & OPA_PI_MASK_NEIGH_NODE_TYPE) |
+               (ppd->mgmt_allowed ? OPA_PI_MASK_NEIGH_MGMT_ALLOWED : 0) |
+               (ppd->neighbor_fm_security ?
+                       OPA_PI_MASK_NEIGH_FW_AUTH_BYPASS : 0);
+
+       /* HFIs shall always return VL15 credits to their
+        * neighbor in a timely manner, without any credit return pacing.
+        */
+       credit_rate = 0;
+       buffer_units  = (dd->vau) & OPA_PI_MASK_BUF_UNIT_BUF_ALLOC;
+       buffer_units |= (dd->vcu << 3) & OPA_PI_MASK_BUF_UNIT_CREDIT_ACK;
+       buffer_units |= (credit_rate << 6) &
+                               OPA_PI_MASK_BUF_UNIT_VL15_CREDIT_RATE;
+       buffer_units |= (dd->vl15_init << 11) & OPA_PI_MASK_BUF_UNIT_VL15_INIT;
+       pi->buffer_units = cpu_to_be32(buffer_units);
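+       /*
+        * As assembled above: vau lives at bit 0, vcu at bit 3, the (always
+        * zero) VL15 credit rate at bit 6 and vl15_init at bit 11, each
+        * masked to its OPA_PI_MASK_BUF_UNIT_* field.
+        */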
+
+       pi->opa_cap_mask = cpu_to_be16(OPA_CAP_MASK3_IsSharedSpaceSupported);
+
+       /* HFI supports a replay buffer 128 LTPs in size */
+       pi->replay_depth.buffer = 0x80;
+       /* read the cached value of DC_LCB_STS_ROUND_TRIP_LTP_CNT */
+       read_lcb_cache(DC_LCB_STS_ROUND_TRIP_LTP_CNT, &tmp);
+
+       /*
+        * this counter is 16 bits wide, but the replay_depth.wire
+        * variable is only 8 bits
+        */
+       if (tmp > 0xff)
+               tmp = 0xff;
+       pi->replay_depth.wire = tmp;
+
+       if (resp_len)
+               *resp_len += sizeof(struct opa_port_info);
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+/**
+ * get_pkeys - return the PKEY table
+ * @dd: the hfi1_ib device
+ * @port: the IB port number
+ * @pkeys: the pkey table is placed here
+ */
+static int get_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys)
+{
+       struct hfi1_pportdata *ppd = dd->pport + port - 1;
+
+       memcpy(pkeys, ppd->pkeys, sizeof(ppd->pkeys));
+
+       return 0;
+}
+
+static int __subn_get_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data,
+                                   struct ib_device *ibdev, u8 port,
+                                   u32 *resp_len)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       u32 n_blocks_req = OPA_AM_NBLK(am);
+       u32 start_block = am & 0x7ff;
+       __be16 *p;
+       u16 *q;
+       int i;
+       u16 n_blocks_avail;
+       unsigned npkeys = hfi1_get_npkeys(dd);
+       size_t size;
+
+       if (n_blocks_req == 0) {
+               pr_warn("OPA Get PKey AM Invalid : P = %d; B = 0x%x; N = 0x%x\n",
+                       port, start_block, n_blocks_req);
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       n_blocks_avail = (u16)(npkeys / OPA_PARTITION_TABLE_BLK_SIZE) + 1;
+
+       size = (n_blocks_req * OPA_PARTITION_TABLE_BLK_SIZE) * sizeof(u16);
+
+       if (start_block + n_blocks_req > n_blocks_avail ||
+           n_blocks_req > OPA_NUM_PKEY_BLOCKS_PER_SMP) {
+               pr_warn("OPA Get PKey AM Invalid : s 0x%x; req 0x%x; avail 0x%x; blk/smp 0x%lx\n",
+                       start_block, n_blocks_req, n_blocks_avail,
+                       OPA_NUM_PKEY_BLOCKS_PER_SMP);
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       p = (__be16 *)data;
+       q = (u16 *)data;
+       /* get the real pkeys if we are requesting the first block */
+       if (start_block == 0) {
+               get_pkeys(dd, port, q);
+               for (i = 0; i < npkeys; i++)
+                       p[i] = cpu_to_be16(q[i]);
+               if (resp_len)
+                       *resp_len += size;
+       } else {
+               smp->status |= IB_SMP_INVALID_FIELD;
+       }
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+enum {
+       HFI_TRANSITION_DISALLOWED,
+       HFI_TRANSITION_IGNORED,
+       HFI_TRANSITION_ALLOWED,
+       HFI_TRANSITION_UNDEFINED,
+};
+
+/*
+ * Use shortened names to improve readability of
+ * {logical,physical}_state_transitions
+ */
+enum {
+       __D = HFI_TRANSITION_DISALLOWED,
+       __I = HFI_TRANSITION_IGNORED,
+       __A = HFI_TRANSITION_ALLOWED,
+       __U = HFI_TRANSITION_UNDEFINED,
+};
+
+/*
+ * IB_PORTPHYSSTATE_POLLING (2) through OPA_PORTPHYSSTATE_MAX (11) are
+ * represented in physical_state_transitions.
+ */
+#define __N_PHYSTATES (OPA_PORTPHYSSTATE_MAX - IB_PORTPHYSSTATE_POLLING + 1)
+
+/*
+ * Within physical_state_transitions, rows represent "old" states,
+ * columns "new" states, and physical_state_transitions.allowed[old][new]
+ * indicates if the transition from old state to new state is legal (see
+ * OPAg1v1, Table 6-4).
+ */
+static const struct {
+       u8 allowed[__N_PHYSTATES][__N_PHYSTATES];
+} physical_state_transitions = {
+       {
+               /* 2    3    4    5    6    7    8    9   10   11 */
+       /* 2 */ { __A, __A, __D, __D, __D, __D, __D, __D, __D, __D },
+       /* 3 */ { __A, __I, __D, __D, __D, __D, __D, __D, __D, __A },
+       /* 4 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
+       /* 5 */ { __A, __A, __D, __I, __D, __D, __D, __D, __D, __D },
+       /* 6 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
+       /* 7 */ { __D, __A, __D, __D, __D, __I, __D, __D, __D, __D },
+       /* 8 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
+       /* 9 */ { __I, __A, __D, __D, __D, __D, __D, __I, __D, __D },
+       /*10 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
+       /*11 */ { __D, __A, __D, __D, __D, __D, __D, __D, __D, __I },
+       }
+};
+
+/*
+ * IB_PORT_DOWN (1) through IB_PORT_ACTIVE_DEFER (5) are represented
+ * in logical_state_transitions.
+ */
+
+#define __N_LOGICAL_STATES (IB_PORT_ACTIVE_DEFER - IB_PORT_DOWN + 1)
+
+/*
+ * Within logical_state_transitions, rows represent "old" states,
+ * columns "new" states, and logical_state_transitions.allowed[old][new]
+ * indicates if the transition from old state to new state is legal (see
+ * OPAg1v1, Table 9-12).
+ */
+static const struct {
+       u8 allowed[__N_LOGICAL_STATES][__N_LOGICAL_STATES];
+} logical_state_transitions = {
+       {
+               /* 1    2    3    4    5 */
+       /* 1 */ { __I, __D, __D, __D, __U},
+       /* 2 */ { __D, __I, __A, __D, __U},
+       /* 3 */ { __D, __D, __I, __A, __U},
+       /* 4 */ { __D, __D, __I, __I, __U},
+       /* 5 */ { __U, __U, __U, __U, __U},
+       }
+};
+
+static int logical_transition_allowed(int old, int new)
+{
+       if (old < IB_PORT_NOP || old > IB_PORT_ACTIVE_DEFER ||
+           new < IB_PORT_NOP || new > IB_PORT_ACTIVE_DEFER) {
+               pr_warn("invalid logical state(s) (old %d new %d)\n",
+                       old, new);
+               return HFI_TRANSITION_UNDEFINED;
+       }
+
+       if (new == IB_PORT_NOP)
+               return HFI_TRANSITION_ALLOWED; /* always allowed */
+
+       /* adjust states for indexing into logical_state_transitions */
+       old -= IB_PORT_DOWN;
+       new -= IB_PORT_DOWN;
+
+       if (old < 0 || new < 0)
+               return HFI_TRANSITION_UNDEFINED;
+       return logical_state_transitions.allowed[old][new];
+}
+
+static int physical_transition_allowed(int old, int new)
+{
+       if (old < IB_PORTPHYSSTATE_NOP || old > OPA_PORTPHYSSTATE_MAX ||
+           new < IB_PORTPHYSSTATE_NOP || new > OPA_PORTPHYSSTATE_MAX) {
+               pr_warn("invalid physical state(s) (old %d new %d)\n",
+                       old, new);
+               return HFI_TRANSITION_UNDEFINED;
+       }
+
+       if (new == IB_PORTPHYSSTATE_NOP)
+               return HFI_TRANSITION_ALLOWED; /* always allowed */
+
+       /* adjust states for indexing into physical_state_transitions */
+       old -= IB_PORTPHYSSTATE_POLLING;
+       new -= IB_PORTPHYSSTATE_POLLING;
+
+       if (old < 0 || new < 0)
+               return HFI_TRANSITION_UNDEFINED;
+       return physical_state_transitions.allowed[old][new];
+}
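+
+/*
+ * For example, physical_transition_allowed(2, 3) indexes
+ * physical_state_transitions.allowed[2 - 2][3 - 2] == allowed[0][1],
+ * which is __A (allowed) in the table above.
+ */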
+
+static int port_states_transition_allowed(struct hfi1_pportdata *ppd,
+                                         u32 logical_new, u32 physical_new)
+{
+       u32 physical_old = driver_physical_state(ppd);
+       u32 logical_old = driver_logical_state(ppd);
+       int ret, logical_allowed, physical_allowed;
+
+       ret = logical_transition_allowed(logical_old, logical_new);
+       logical_allowed = ret;
+
+       if (ret == HFI_TRANSITION_DISALLOWED ||
+           ret == HFI_TRANSITION_UNDEFINED) {
+               pr_warn("invalid logical state transition %s -> %s\n",
+                       opa_lstate_name(logical_old),
+                       opa_lstate_name(logical_new));
+               return ret;
+       }
+
+       ret = physical_transition_allowed(physical_old, physical_new);
+       physical_allowed = ret;
+
+       if (ret == HFI_TRANSITION_DISALLOWED ||
+           ret == HFI_TRANSITION_UNDEFINED) {
+               pr_warn("invalid physical state transition %s -> %s\n",
+                       opa_pstate_name(physical_old),
+                       opa_pstate_name(physical_new));
+               return ret;
+       }
+
+       if (logical_allowed == HFI_TRANSITION_IGNORED &&
+           physical_allowed == HFI_TRANSITION_IGNORED)
+               return HFI_TRANSITION_IGNORED;
+
+       /*
+        * A change request of Physical Port State from
+        * 'Offline' to 'Polling' should be ignored.
+        */
+       if ((physical_old == OPA_PORTPHYSSTATE_OFFLINE) &&
+           (physical_new == IB_PORTPHYSSTATE_POLLING))
+               return HFI_TRANSITION_IGNORED;
+
+       /*
+        * Either physical_allowed or logical_allowed is
+        * HFI_TRANSITION_ALLOWED.
+        */
+       return HFI_TRANSITION_ALLOWED;
+}
+
+static int set_port_states(struct hfi1_pportdata *ppd, struct opa_smp *smp,
+                          u32 logical_state, u32 phys_state,
+                          int suppress_idle_sma)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u32 link_state;
+       int ret;
+
+       ret = port_states_transition_allowed(ppd, logical_state, phys_state);
+       if (ret == HFI_TRANSITION_DISALLOWED ||
+           ret == HFI_TRANSITION_UNDEFINED) {
+               /* error message emitted above */
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return 0;
+       }
+
+       if (ret == HFI_TRANSITION_IGNORED)
+               return 0;
+
+       if ((phys_state != IB_PORTPHYSSTATE_NOP) &&
+           !(logical_state == IB_PORT_DOWN ||
+             logical_state == IB_PORT_NOP)) {
+               pr_warn("SubnSet(OPA_PortInfo) port state invalid: logical_state 0x%x physical_state 0x%x\n",
+                       logical_state, phys_state);
+               smp->status |= IB_SMP_INVALID_FIELD;
+       }
+
+       /*
+        * Logical state changes are summarized in OPAv1g1 spec.,
+        * Table 9-12; physical state changes are summarized in
+        * OPAv1g1 spec., Table 6-4.
+        */
+       switch (logical_state) {
+       case IB_PORT_NOP:
+               if (phys_state == IB_PORTPHYSSTATE_NOP)
+                       break;
+               /* FALLTHROUGH */
+       case IB_PORT_DOWN:
+               if (phys_state == IB_PORTPHYSSTATE_NOP) {
+                       link_state = HLS_DN_DOWNDEF;
+               } else if (phys_state == IB_PORTPHYSSTATE_POLLING) {
+                       link_state = HLS_DN_POLL;
+                       set_link_down_reason(ppd, OPA_LINKDOWN_REASON_FM_BOUNCE,
+                                            0, OPA_LINKDOWN_REASON_FM_BOUNCE);
+               } else if (phys_state == IB_PORTPHYSSTATE_DISABLED) {
+                       link_state = HLS_DN_DISABLE;
+               } else {
+                       pr_warn("SubnSet(OPA_PortInfo) invalid physical state 0x%x\n",
+                               phys_state);
+                       smp->status |= IB_SMP_INVALID_FIELD;
+                       break;
+               }
+
+               if ((link_state == HLS_DN_POLL ||
+                    link_state == HLS_DN_DOWNDEF)) {
+                       /*
+                        * Going to poll.  No matter what the current state,
+                        * always move offline first, then tune and start the
+                        * link.  This correctly handles a FM link bounce and
+                        * a link enable.  Going offline is a no-op if already
+                        * offline.
+                        */
+                       set_link_state(ppd, HLS_DN_OFFLINE);
+                       tune_serdes(ppd);
+                       start_link(ppd);
+               } else {
+                       set_link_state(ppd, link_state);
+               }
+               if (link_state == HLS_DN_DISABLE &&
+                   (ppd->offline_disabled_reason >
+                    HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED) ||
+                    ppd->offline_disabled_reason ==
+                    HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE)))
+                       ppd->offline_disabled_reason =
+                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED);
+               /*
+                * Don't send a reply if the response would be sent
+                * through the disabled port.
+                */
+               if (link_state == HLS_DN_DISABLE && smp->hop_cnt)
+                       return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+               break;
+       case IB_PORT_ARMED:
+               ret = set_link_state(ppd, HLS_UP_ARMED);
+               if ((ret == 0) && (suppress_idle_sma == 0))
+                       send_idle_sma(dd, SMA_IDLE_ARM);
+               break;
+       case IB_PORT_ACTIVE:
+               if (ppd->neighbor_normal) {
+                       ret = set_link_state(ppd, HLS_UP_ACTIVE);
+                       if (ret == 0)
+                               send_idle_sma(dd, SMA_IDLE_ACTIVE);
+               } else {
+                       pr_warn("SubnSet(OPA_PortInfo) Cannot move to Active with NeighborNormal 0\n");
+                       smp->status |= IB_SMP_INVALID_FIELD;
+               }
+               break;
+       default:
+               pr_warn("SubnSet(OPA_PortInfo) invalid logical state 0x%x\n",
+                       logical_state);
+               smp->status |= IB_SMP_INVALID_FIELD;
+       }
+
+       return 0;
+}
+
+/**
+ * __subn_set_opa_portinfo - set port information
+ * @smp: the incoming SM packet
+ * @am: the attribute modifier
+ * @data: the OPA PortInfo payload (struct opa_port_info)
+ * @ibdev: the infiniband device
+ * @port: the port on the device
+ * @resp_len: the response length to update
+ *
+ */
+static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
+                                  struct ib_device *ibdev, u8 port,
+                                  u32 *resp_len)
+{
+       struct opa_port_info *pi = (struct opa_port_info *)data;
+       struct ib_event event;
+       struct hfi1_devdata *dd;
+       struct hfi1_pportdata *ppd;
+       struct hfi1_ibport *ibp;
+       u8 clientrereg;
+       unsigned long flags;
+       u32 smlid, opa_lid; /* tmp vars to hold LID values */
+       u16 lid;
+       u8 ls_old, ls_new, ps_new;
+       u8 vls;
+       u8 msl;
+       u8 crc_enabled;
+       u16 lse, lwe, mtu;
+       u32 num_ports = OPA_AM_NPORT(am);
+       u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
+       int ret, i, invalid = 0, call_set_mtu = 0;
+       int call_link_downgrade_policy = 0;
+
+       if (num_ports != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       opa_lid = be32_to_cpu(pi->lid);
+       if (opa_lid & 0xFFFF0000) {
+               pr_warn("OPA_PortInfo lid out of range: %X\n", opa_lid);
+               smp->status |= IB_SMP_INVALID_FIELD;
+               goto get_only;
+       }
+
+       lid = (u16)(opa_lid & 0x0000FFFF);
+
+       smlid = be32_to_cpu(pi->sm_lid);
+       if (smlid & 0xFFFF0000) {
+               pr_warn("OPA_PortInfo SM lid out of range: %X\n", smlid);
+               smp->status |= IB_SMP_INVALID_FIELD;
+               goto get_only;
+       }
+       smlid &= 0x0000FFFF;
+
+       clientrereg = (pi->clientrereg_subnettimeout &
+                       OPA_PI_MASK_CLIENT_REREGISTER);
+
+       dd = dd_from_ibdev(ibdev);
+       /* IB numbers ports from 1, hw from 0 */
+       ppd = dd->pport + (port - 1);
+       ibp = &ppd->ibport_data;
+       event.device = ibdev;
+       event.element.port_num = port;
+
+       ls_old = driver_lstate(ppd);
+
+       ibp->rvp.mkey = pi->mkey;
+       ibp->rvp.gid_prefix = pi->subnet_prefix;
+       ibp->rvp.mkey_lease_period = be16_to_cpu(pi->mkey_lease_period);
+
+       /* Must be a valid unicast LID address. */
+       if ((lid == 0 && ls_old > IB_PORT_INIT) ||
+           lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               pr_warn("SubnSet(OPA_PortInfo) lid invalid 0x%x\n",
+                       lid);
+       } else if (ppd->lid != lid ||
+                ppd->lmc != (pi->mkeyprotect_lmc & OPA_PI_MASK_LMC)) {
+               if (ppd->lid != lid)
+                       hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LID_CHANGE_BIT);
+               if (ppd->lmc != (pi->mkeyprotect_lmc & OPA_PI_MASK_LMC))
+                       hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LMC_CHANGE_BIT);
+               hfi1_set_lid(ppd, lid, pi->mkeyprotect_lmc & OPA_PI_MASK_LMC);
+               event.event = IB_EVENT_LID_CHANGE;
+               ib_dispatch_event(&event);
+       }
+
+       msl = pi->smsl & OPA_PI_MASK_SMSL;
+       if (pi->partenforce_filterraw & OPA_PI_MASK_LINKINIT_REASON)
+               ppd->linkinit_reason =
+                       (pi->partenforce_filterraw &
+                        OPA_PI_MASK_LINKINIT_REASON);
+       /* enable/disable SW pkey checking as per FM control */
+       if (pi->partenforce_filterraw & OPA_PI_MASK_PARTITION_ENFORCE_IN)
+               ppd->part_enforce |= HFI1_PART_ENFORCE_IN;
+       else
+               ppd->part_enforce &= ~HFI1_PART_ENFORCE_IN;
+
+       if (pi->partenforce_filterraw & OPA_PI_MASK_PARTITION_ENFORCE_OUT)
+               ppd->part_enforce |= HFI1_PART_ENFORCE_OUT;
+       else
+               ppd->part_enforce &= ~HFI1_PART_ENFORCE_OUT;
+
+       /* Must be a valid unicast LID address. */
+       if ((smlid == 0 && ls_old > IB_PORT_INIT) ||
+           smlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               pr_warn("SubnSet(OPA_PortInfo) smlid invalid 0x%x\n", smlid);
+       } else if (smlid != ibp->rvp.sm_lid || msl != ibp->rvp.sm_sl) {
+               pr_warn("SubnSet(OPA_PortInfo) smlid 0x%x\n", smlid);
+               spin_lock_irqsave(&ibp->rvp.lock, flags);
+               if (ibp->rvp.sm_ah) {
+                       if (smlid != ibp->rvp.sm_lid)
+                               ibp->rvp.sm_ah->attr.dlid = smlid;
+                       if (msl != ibp->rvp.sm_sl)
+                               ibp->rvp.sm_ah->attr.sl = msl;
+               }
+               spin_unlock_irqrestore(&ibp->rvp.lock, flags);
+               if (smlid != ibp->rvp.sm_lid)
+                       ibp->rvp.sm_lid = smlid;
+               if (msl != ibp->rvp.sm_sl)
+                       ibp->rvp.sm_sl = msl;
+               event.event = IB_EVENT_SM_CHANGE;
+               ib_dispatch_event(&event);
+       }
+
+       if (pi->link_down_reason == 0) {
+               ppd->local_link_down_reason.sma = 0;
+               ppd->local_link_down_reason.latest = 0;
+       }
+
+       if (pi->neigh_link_down_reason == 0) {
+               ppd->neigh_link_down_reason.sma = 0;
+               ppd->neigh_link_down_reason.latest = 0;
+       }
+
+       ppd->sm_trap_qp = be32_to_cpu(pi->sm_trap_qp);
+       ppd->sa_qp = be32_to_cpu(pi->sa_qp);
+
+       ppd->port_error_action = be32_to_cpu(pi->port_error_action);
+       lwe = be16_to_cpu(pi->link_width.enabled);
+       if (lwe) {
+               if (lwe == OPA_LINK_WIDTH_RESET ||
+                   lwe == OPA_LINK_WIDTH_RESET_OLD)
+                       set_link_width_enabled(ppd, ppd->link_width_supported);
+               else if ((lwe & ~ppd->link_width_supported) == 0)
+                       set_link_width_enabled(ppd, lwe);
+               else
+                       smp->status |= IB_SMP_INVALID_FIELD;
+       }
+       lwe = be16_to_cpu(pi->link_width_downgrade.enabled);
+       /* LWD.E is always applied - 0 means "disabled" */
+       if (lwe == OPA_LINK_WIDTH_RESET ||
+           lwe == OPA_LINK_WIDTH_RESET_OLD) {
+               set_link_width_downgrade_enabled(ppd,
+                               ppd->link_width_downgrade_supported);
+       } else if ((lwe & ~ppd->link_width_downgrade_supported) == 0) {
+               /* only set and apply if something changed */
+               if (lwe != ppd->link_width_downgrade_enabled) {
+                       set_link_width_downgrade_enabled(ppd, lwe);
+                       call_link_downgrade_policy = 1;
+               }
+       } else {
+               smp->status |= IB_SMP_INVALID_FIELD;
+       }
+       lse = be16_to_cpu(pi->link_speed.enabled);
+       if (lse) {
+               if (lse & be16_to_cpu(pi->link_speed.supported))
+                       set_link_speed_enabled(ppd, lse);
+               else
+                       smp->status |= IB_SMP_INVALID_FIELD;
+       }
+
+       ibp->rvp.mkeyprot =
+               (pi->mkeyprotect_lmc & OPA_PI_MASK_MKEY_PROT_BIT) >> 6;
+       ibp->rvp.vl_high_limit = be16_to_cpu(pi->vl.high_limit) & 0xFF;
+       (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_VL_HIGH_LIMIT,
+                                   ibp->rvp.vl_high_limit);
+
+       if (ppd->vls_supported / 2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) ||
+           ppd->vls_supported > ARRAY_SIZE(dd->vld)) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+       for (i = 0; i < ppd->vls_supported; i++) {
+               if ((i % 2) == 0)
+                       mtu = enum_to_mtu((pi->neigh_mtu.pvlx_to_mtu[i / 2] >>
+                                          4) & 0xF);
+               else
+                       mtu = enum_to_mtu(pi->neigh_mtu.pvlx_to_mtu[i / 2] &
+                                         0xF);
+               if (mtu == 0xffff) {
+                       pr_warn("SubnSet(OPA_PortInfo) mtu invalid %d (0x%x)\n",
+                               mtu,
+                               (pi->neigh_mtu.pvlx_to_mtu[0] >> 4) & 0xF);
+                       smp->status |= IB_SMP_INVALID_FIELD;
+                       mtu = hfi1_max_mtu; /* use a valid MTU */
+               }
+               if (dd->vld[i].mtu != mtu) {
+                       dd_dev_info(dd,
+                                   "MTU change on vl %d from %d to %d\n",
+                                   i, dd->vld[i].mtu, mtu);
+                       dd->vld[i].mtu = mtu;
+                       call_set_mtu++;
+               }
+       }
+       /* As per OPAV1 spec: VL15 must support and be configured
+        * for operation with a 2048 or larger MTU.
+        */
+       mtu = enum_to_mtu(pi->neigh_mtu.pvlx_to_mtu[15 / 2] & 0xF);
+       if (mtu < 2048 || mtu == 0xffff)
+               mtu = 2048;
+       if (dd->vld[15].mtu != mtu) {
+               dd_dev_info(dd,
+                           "MTU change on vl 15 from %d to %d\n",
+                           dd->vld[15].mtu, mtu);
+               dd->vld[15].mtu = mtu;
+               call_set_mtu++;
+       }
+       if (call_set_mtu)
+               set_mtu(ppd);
+
+       /* Set operational VLs */
+       vls = pi->operational_vls & OPA_PI_MASK_OPERATIONAL_VL;
+       if (vls) {
+               if (vls > ppd->vls_supported) {
+                       pr_warn("SubnSet(OPA_PortInfo) VL's supported invalid %d\n",
+                               pi->operational_vls);
+                       smp->status |= IB_SMP_INVALID_FIELD;
+               } else {
+                       if (hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_OP_VLS,
+                                           vls) == -EINVAL)
+                               smp->status |= IB_SMP_INVALID_FIELD;
+               }
+       }
+
+       if (pi->mkey_violations == 0)
+               ibp->rvp.mkey_violations = 0;
+
+       if (pi->pkey_violations == 0)
+               ibp->rvp.pkey_violations = 0;
+
+       if (pi->qkey_violations == 0)
+               ibp->rvp.qkey_violations = 0;
+
+       ibp->rvp.subnet_timeout =
+               pi->clientrereg_subnettimeout & OPA_PI_MASK_SUBNET_TIMEOUT;
+
+       crc_enabled = be16_to_cpu(pi->port_ltp_crc_mode);
+       crc_enabled >>= 4;
+       crc_enabled &= 0xf;
+
+       if (crc_enabled != 0)
+               ppd->port_crc_mode_enabled = port_ltp_to_cap(crc_enabled);
+
+       ppd->is_active_optimize_enabled =
+                       !!(be16_to_cpu(pi->port_mode)
+                                       & OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE);
+
+       ls_new = pi->port_states.portphysstate_portstate &
+                       OPA_PI_MASK_PORT_STATE;
+       ps_new = (pi->port_states.portphysstate_portstate &
+                       OPA_PI_MASK_PORT_PHYSICAL_STATE) >> 4;
+
+       if (ls_old == IB_PORT_INIT) {
+               if (start_of_sm_config) {
+                       if (ls_new == ls_old || (ls_new == IB_PORT_ARMED))
+                               ppd->is_sm_config_started = 1;
+               } else if (ls_new == IB_PORT_ARMED) {
+                       if (ppd->is_sm_config_started == 0)
+                               invalid = 1;
+               }
+       }
+
+       /* Handle CLIENT_REREGISTER event b/c SM asked us for it */
+       if (clientrereg) {
+               event.event = IB_EVENT_CLIENT_REREGISTER;
+               ib_dispatch_event(&event);
+       }
+
+       /*
+        * Do the port state change now that the other link parameters
+        * have been set.
+        * Changing the port physical state only makes sense if the link
+        * is down or is being set to down.
+        */
+
+       ret = set_port_states(ppd, smp, ls_new, ps_new, invalid);
+       if (ret)
+               return ret;
+
+       ret = __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len);
+
+       /* restore re-reg bit per o14-12.2.1 */
+       pi->clientrereg_subnettimeout |= clientrereg;
+
+       /*
+        * Apply the new link downgrade policy.  This may result in a link
+        * bounce.  Do this after everything else so things are settled.
+        * Possible problem: if setting the port state above fails, then
+        * the policy change is not applied.
+        */
+       if (call_link_downgrade_policy)
+               apply_link_downgrade_policy(ppd, 0);
+
+       return ret;
+
+get_only:
+       return __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len);
+}
+
+/**
+ * set_pkeys - set the PKEY table for ctxt 0
+ * @dd: the hfi1_ib device
+ * @port: the IB port number
+ * @pkeys: the PKEY table
+ */
+static int set_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys)
+{
+       struct hfi1_pportdata *ppd;
+       int i;
+       int changed = 0;
+       int update_includes_mgmt_partition = 0;
+
+       /*
+        * IB ports one/two always map to contexts zero/one, which are
+        * always kernel contexts, so no locking is needed.  If we get
+        * here with ppd set up, there is no need to check that rcd is
+        * valid.
+        */
+       ppd = dd->pport + (port - 1);
+       /*
+        * If the update does not include the management pkey, don't do it.
+        */
+       for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) {
+               if (pkeys[i] == LIM_MGMT_P_KEY) {
+                       update_includes_mgmt_partition = 1;
+                       break;
+               }
+       }
+
+       if (!update_includes_mgmt_partition)
+               return 1;
+
+       for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) {
+               u16 key = pkeys[i];
+               u16 okey = ppd->pkeys[i];
+
+               if (key == okey)
+                       continue;
+               /*
+                * Don't update pkeys[2] when the neighbor is a switch
+                * and has not granted MgmtAllowed to this HFI port.
+                */
+               if (i == 2 && !ppd->mgmt_allowed && ppd->neighbor_type == 1)
+                       continue;
+               /*
+                * The SM gives us the complete PKey table. We have
+                * to ensure that we put the PKeys in the matching
+                * slots.
+                */
+               ppd->pkeys[i] = key;
+               changed = 1;
+       }
+
+       if (changed) {
+               struct ib_event event;
+
+               (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0);
+
+               event.event = IB_EVENT_PKEY_CHANGE;
+               event.device = &dd->verbs_dev.rdi.ibdev;
+               event.element.port_num = port;
+               ib_dispatch_event(&event);
+       }
+       return 0;
+}
+
+static int __subn_set_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data,
+                                   struct ib_device *ibdev, u8 port,
+                                   u32 *resp_len)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       u32 n_blocks_sent = OPA_AM_NBLK(am);
+       u32 start_block = am & 0x7ff;
+       u16 *p = (u16 *)data;
+       __be16 *q = (__be16 *)data;
+       int i;
+       u16 n_blocks_avail;
+       unsigned npkeys = hfi1_get_npkeys(dd);
+
+       if (n_blocks_sent == 0) {
+               pr_warn("OPA Set PKey AM Invalid : P = %d; B = 0x%x; N = 0x%x\n",
+                       port, start_block, n_blocks_sent);
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       n_blocks_avail = (u16)(npkeys / OPA_PARTITION_TABLE_BLK_SIZE) + 1;
+
+       if (start_block + n_blocks_sent > n_blocks_avail ||
+           n_blocks_sent > OPA_NUM_PKEY_BLOCKS_PER_SMP) {
+               pr_warn("OPA Set PKey AM Invalid : s 0x%x; req 0x%x; avail 0x%x; blk/smp 0x%lx\n",
+                       start_block, n_blocks_sent, n_blocks_avail,
+                       OPA_NUM_PKEY_BLOCKS_PER_SMP);
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       for (i = 0; i < n_blocks_sent * OPA_PARTITION_TABLE_BLK_SIZE; i++)
+               p[i] = be16_to_cpu(q[i]);
+
+       if (start_block == 0 && set_pkeys(dd, port, p) != 0) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       return __subn_get_opa_pkeytable(smp, am, data, ibdev, port, resp_len);
+}
+
+static int get_sc2vlt_tables(struct hfi1_devdata *dd, void *data)
+{
+       u64 *val = data;
+
+       *val++ = read_csr(dd, SEND_SC2VLT0);
+       *val++ = read_csr(dd, SEND_SC2VLT1);
+       *val++ = read_csr(dd, SEND_SC2VLT2);
+       *val++ = read_csr(dd, SEND_SC2VLT3);
+       return 0;
+}
+
+#define ILLEGAL_VL 12
+/*
+ * filter_sc2vlt remaps any SC that maps to VL15 to ILLEGAL_VL (except
+ * SC15, which must map to VL15). If we don't remap things this way,
+ * it is possible for the VL15 counters to increment when we try to
+ * send on an SC which is mapped to an invalid VL.
+ */
+static void filter_sc2vlt(void *data)
+{
+       int i;
+       u8 *pd = data;
+
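+       /* one byte per SC; the VL is in the low 5 bits */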
+       for (i = 0; i < OPA_MAX_SCS; i++) {
+               if (i == 15)
+                       continue;
+               if ((pd[i] & 0x1f) == 0xf)
+                       pd[i] = ILLEGAL_VL;
+       }
+}
+
+static int set_sc2vlt_tables(struct hfi1_devdata *dd, void *data)
+{
+       u64 *val = data;
+
+       filter_sc2vlt(data);
+
+       write_csr(dd, SEND_SC2VLT0, *val++);
+       write_csr(dd, SEND_SC2VLT1, *val++);
+       write_csr(dd, SEND_SC2VLT2, *val++);
+       write_csr(dd, SEND_SC2VLT3, *val++);
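+       /* update the sc2vl shadow under the seqlock for consistent readers */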
+       write_seqlock_irq(&dd->sc2vl_lock);
+       memcpy(dd->sc2vl, data, sizeof(dd->sc2vl));
+       write_sequnlock_irq(&dd->sc2vl_lock);
+       return 0;
+}
+
+static int __subn_get_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data,
+                                  struct ib_device *ibdev, u8 port,
+                                  u32 *resp_len)
+{
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       u8 *p = data;
+       size_t size = ARRAY_SIZE(ibp->sl_to_sc); /* == 32 */
+       unsigned i;
+
+       if (am) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       for (i = 0; i < ARRAY_SIZE(ibp->sl_to_sc); i++)
+               *p++ = ibp->sl_to_sc[i];
+
+       if (resp_len)
+               *resp_len += size;
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data,
+                                  struct ib_device *ibdev, u8 port,
+                                  u32 *resp_len)
+{
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       u8 *p = data;
+       int i;
+       u8 sc;
+
+       if (am) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       for (i = 0; i <  ARRAY_SIZE(ibp->sl_to_sc); i++) {
+               sc = *p++;
+               if (ibp->sl_to_sc[i] != sc) {
+                       ibp->sl_to_sc[i] = sc;
+
+                       /* Put all stale qps into error state */
+                       hfi1_error_port_qps(ibp, i);
+               }
+       }
+
+       return __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port, resp_len);
+}
+
+static int __subn_get_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data,
+                                  struct ib_device *ibdev, u8 port,
+                                  u32 *resp_len)
+{
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       u8 *p = data;
+       size_t size = ARRAY_SIZE(ibp->sc_to_sl); /* == 32 */
+       unsigned i;
+
+       if (am) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       for (i = 0; i < ARRAY_SIZE(ibp->sc_to_sl); i++)
+               *p++ = ibp->sc_to_sl[i];
+
+       if (resp_len)
+               *resp_len += size;
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data,
+                                  struct ib_device *ibdev, u8 port,
+                                  u32 *resp_len)
+{
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       u8 *p = data;
+       int i;
+
+       if (am) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       for (i = 0; i < ARRAY_SIZE(ibp->sc_to_sl); i++)
+               ibp->sc_to_sl[i] = *p++;
+
+       return __subn_get_opa_sc_to_sl(smp, am, data, ibdev, port, resp_len);
+}
+
+static int __subn_get_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data,
+                                   struct ib_device *ibdev, u8 port,
+                                   u32 *resp_len)
+{
+       u32 n_blocks = OPA_AM_NBLK(am);
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       void *vp = (void *)data;
+       size_t size = 4 * sizeof(u64);
+
+       if (n_blocks != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       get_sc2vlt_tables(dd, vp);
+
+       if (resp_len)
+               *resp_len += size;
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data,
+                                   struct ib_device *ibdev, u8 port,
+                                   u32 *resp_len)
+{
+       u32 n_blocks = OPA_AM_NBLK(am);
+       int async_update = OPA_AM_ASYNC(am);
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       void *vp = (void *)data;
+       struct hfi1_pportdata *ppd;
+       int lstate;
+
+       if (n_blocks != 1 || async_update) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       /* IB numbers ports from 1, hw from 0 */
+       ppd = dd->pport + (port - 1);
+       lstate = driver_lstate(ppd);
+       /*
+        * it's known that async_update is 0 by this point, but include
+        * the explicit check for clarity
+        */
+       if (!async_update &&
+           (lstate == IB_PORT_ARMED || lstate == IB_PORT_ACTIVE)) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       set_sc2vlt_tables(dd, vp);
+
+       return __subn_get_opa_sc_to_vlt(smp, am, data, ibdev, port, resp_len);
+}
+
+static int __subn_get_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data,
+                                    struct ib_device *ibdev, u8 port,
+                                    u32 *resp_len)
+{
+       u32 n_blocks = OPA_AM_NPORT(am);
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct hfi1_pportdata *ppd;
+       void *vp = (void *)data;
+       int size;
+
+       if (n_blocks != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       ppd = dd->pport + (port - 1);
+
+       size = fm_get_table(ppd, FM_TBL_SC2VLNT, vp);
+
+       if (resp_len)
+               *resp_len += size;
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data,
+                                    struct ib_device *ibdev, u8 port,
+                                    u32 *resp_len)
+{
+       u32 n_blocks = OPA_AM_NPORT(am);
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct hfi1_pportdata *ppd;
+       void *vp = (void *)data;
+       int lstate;
+
+       if (n_blocks != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       /* IB numbers ports from 1, hw from 0 */
+       ppd = dd->pport + (port - 1);
+       lstate = driver_lstate(ppd);
+       if (lstate == IB_PORT_ARMED || lstate == IB_PORT_ACTIVE) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       fm_set_table(ppd, FM_TBL_SC2VLNT, vp);
+
+       return __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port,
+                                        resp_len);
+}
+
+static int __subn_get_opa_psi(struct opa_smp *smp, u32 am, u8 *data,
+                             struct ib_device *ibdev, u8 port,
+                             u32 *resp_len)
+{
+       u32 nports = OPA_AM_NPORT(am);
+       u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
+       u32 lstate;
+       struct hfi1_ibport *ibp;
+       struct hfi1_pportdata *ppd;
+       struct opa_port_state_info *psi = (struct opa_port_state_info *)data;
+
+       if (nports != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       ibp = to_iport(ibdev, port);
+       ppd = ppd_from_ibp(ibp);
+
+       lstate = driver_lstate(ppd);
+
+       if (start_of_sm_config && (lstate == IB_PORT_INIT))
+               ppd->is_sm_config_started = 1;
+
+#if PI_LED_ENABLE_SUP
+       psi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4;
+       psi->port_states.ledenable_offlinereason |=
+               ppd->is_sm_config_started << 5;
+       psi->port_states.ledenable_offlinereason |=
+               ppd->offline_disabled_reason;
+#else
+       psi->port_states.offline_reason = ppd->neighbor_normal << 4;
+       psi->port_states.offline_reason |= ppd->is_sm_config_started << 5;
+       psi->port_states.offline_reason |= ppd->offline_disabled_reason;
+#endif /* PI_LED_ENABLE_SUP */
+
+       psi->port_states.portphysstate_portstate =
+               (hfi1_ibphys_portstate(ppd) << 4) | (lstate & 0xf);
+       psi->link_width_downgrade_tx_active =
+               cpu_to_be16(ppd->link_width_downgrade_tx_active);
+       psi->link_width_downgrade_rx_active =
+               cpu_to_be16(ppd->link_width_downgrade_rx_active);
+       if (resp_len)
+               *resp_len += sizeof(struct opa_port_state_info);
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_psi(struct opa_smp *smp, u32 am, u8 *data,
+                             struct ib_device *ibdev, u8 port,
+                             u32 *resp_len)
+{
+       u32 nports = OPA_AM_NPORT(am);
+       u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
+       u32 ls_old;
+       u8 ls_new, ps_new;
+       struct hfi1_ibport *ibp;
+       struct hfi1_pportdata *ppd;
+       struct opa_port_state_info *psi = (struct opa_port_state_info *)data;
+       int ret, invalid = 0;
+
+       if (nports != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       ibp = to_iport(ibdev, port);
+       ppd = ppd_from_ibp(ibp);
+
+       ls_old = driver_lstate(ppd);
+
+       ls_new = port_states_to_logical_state(&psi->port_states);
+       ps_new = port_states_to_phys_state(&psi->port_states);
+
+       if (ls_old == IB_PORT_INIT) {
+               if (start_of_sm_config) {
+                       if (ls_new == ls_old || (ls_new == IB_PORT_ARMED))
+                               ppd->is_sm_config_started = 1;
+               } else if (ls_new == IB_PORT_ARMED) {
+                       if (ppd->is_sm_config_started == 0)
+                               invalid = 1;
+               }
+       }
+
+       ret = set_port_states(ppd, smp, ls_new, ps_new, invalid);
+       if (ret)
+               return ret;
+
+       if (invalid)
+               smp->status |= IB_SMP_INVALID_FIELD;
+
+       return __subn_get_opa_psi(smp, am, data, ibdev, port, resp_len);
+}
+
+static int __subn_get_opa_cable_info(struct opa_smp *smp, u32 am, u8 *data,
+                                    struct ib_device *ibdev, u8 port,
+                                    u32 *resp_len)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       u32 addr = OPA_AM_CI_ADDR(am);
+       u32 len = OPA_AM_CI_LEN(am) + 1;
+       int ret;
+
+#define __CI_PAGE_SIZE BIT(7) /* 128 bytes */
+#define __CI_PAGE_MASK ~(__CI_PAGE_SIZE - 1)
+#define __CI_PAGE_NUM(a) ((a) & __CI_PAGE_MASK)
+
+       /*
+        * check that addr is within spec, and
+        * addr and (addr + len - 1) are on the same "page"
+        */
+       if (addr >= 4096 ||
+           (__CI_PAGE_NUM(addr) != __CI_PAGE_NUM(addr + len - 1))) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       ret = get_cable_info(dd, port, addr, len, data);
+
+       if (ret == -ENODEV) {
+               smp->status |= IB_SMP_UNSUP_METH_ATTR;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       /* The address range for the CableInfo SMA query is wider than the
+        * memory available on the QSFP cable. We want to return a valid
+        * response, albeit zeroed out, for address ranges beyond the
+        * available memory but within the CableInfo query spec.
+        */
+       if (ret < 0 && ret != -ERANGE) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       if (resp_len)
+               *resp_len += len;
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_get_opa_bct(struct opa_smp *smp, u32 am, u8 *data,
+                             struct ib_device *ibdev, u8 port, u32 *resp_len)
+{
+       u32 num_ports = OPA_AM_NPORT(am);
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct hfi1_pportdata *ppd;
+       struct buffer_control *p = (struct buffer_control *)data;
+       int size;
+
+       if (num_ports != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
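+       /* IB numbers ports from 1, hw from 0 */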
+       ppd = dd->pport + (port - 1);
+       size = fm_get_table(ppd, FM_TBL_BUFFER_CONTROL, p);
+       trace_bct_get(dd, p);
+       if (resp_len)
+               *resp_len += size;
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_bct(struct opa_smp *smp, u32 am, u8 *data,
+                             struct ib_device *ibdev, u8 port, u32 *resp_len)
+{
+       u32 num_ports = OPA_AM_NPORT(am);
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct hfi1_pportdata *ppd;
+       struct buffer_control *p = (struct buffer_control *)data;
+
+       if (num_ports != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+       ppd = dd->pport + (port - 1);
+       trace_bct_set(dd, p);
+       if (fm_set_table(ppd, FM_TBL_BUFFER_CONTROL, p) < 0) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       return __subn_get_opa_bct(smp, am, data, ibdev, port, resp_len);
+}
+
+static int __subn_get_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data,
+                                struct ib_device *ibdev, u8 port,
+                                u32 *resp_len)
+{
+       struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
+       u32 num_ports = OPA_AM_NPORT(am);
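+       /* section selector is in bits 23:16 of the attribute modifier */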
+       u8 section = (am & 0x00ff0000) >> 16;
+       u8 *p = data;
+       int size = 0;
+
+       if (num_ports != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       switch (section) {
+       case OPA_VLARB_LOW_ELEMENTS:
+               size = fm_get_table(ppd, FM_TBL_VL_LOW_ARB, p);
+               break;
+       case OPA_VLARB_HIGH_ELEMENTS:
+               size = fm_get_table(ppd, FM_TBL_VL_HIGH_ARB, p);
+               break;
+       case OPA_VLARB_PREEMPT_ELEMENTS:
+               size = fm_get_table(ppd, FM_TBL_VL_PREEMPT_ELEMS, p);
+               break;
+       case OPA_VLARB_PREEMPT_MATRIX:
+               size = fm_get_table(ppd, FM_TBL_VL_PREEMPT_MATRIX, p);
+               break;
+       default:
+               pr_warn("OPA SubnGet(VL Arb) AM Invalid : 0x%x\n",
+                       be32_to_cpu(smp->attr_mod));
+               smp->status |= IB_SMP_INVALID_FIELD;
+               break;
+       }
+
+       if (size > 0 && resp_len)
+               *resp_len += size;
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data,
+                                struct ib_device *ibdev, u8 port,
+                                u32 *resp_len)
+{
+       struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
+       u32 num_ports = OPA_AM_NPORT(am);
+       u8 section = (am & 0x00ff0000) >> 16;
+       u8 *p = data;
+
+       if (num_ports != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       switch (section) {
+       case OPA_VLARB_LOW_ELEMENTS:
+               (void)fm_set_table(ppd, FM_TBL_VL_LOW_ARB, p);
+               break;
+       case OPA_VLARB_HIGH_ELEMENTS:
+               (void)fm_set_table(ppd, FM_TBL_VL_HIGH_ARB, p);
+               break;
+       /*
+        * Neither OPA_VLARB_PREEMPT_ELEMENTS nor OPA_VLARB_PREEMPT_MATRIX
+        * can be changed from the default values.
+        */
+       case OPA_VLARB_PREEMPT_ELEMENTS:
+               /* FALLTHROUGH */
+       case OPA_VLARB_PREEMPT_MATRIX:
+               smp->status |= IB_SMP_UNSUP_METH_ATTR;
+               break;
+       default:
+               pr_warn("OPA SubnSet(VL Arb) AM Invalid : 0x%x\n",
+                       be32_to_cpu(smp->attr_mod));
+               smp->status |= IB_SMP_INVALID_FIELD;
+               break;
+       }
+
+       return __subn_get_opa_vl_arb(smp, am, data, ibdev, port, resp_len);
+}
+
+struct opa_pma_mad {
+       struct ib_mad_hdr mad_hdr;
+       u8 data[2024];
+} __packed;
+
+struct opa_class_port_info {
+       u8 base_version;
+       u8 class_version;
+       __be16 cap_mask;
+       __be32 cap_mask2_resp_time;
+
+       u8 redirect_gid[16];
+       __be32 redirect_tc_fl;
+       __be32 redirect_lid;
+       __be32 redirect_sl_qp;
+       __be32 redirect_qkey;
+
+       u8 trap_gid[16];
+       __be32 trap_tc_fl;
+       __be32 trap_lid;
+       __be32 trap_hl_qp;
+       __be32 trap_qkey;
+
+       __be16 trap_pkey;
+       __be16 redirect_pkey;
+
+       u8 trap_sl_rsvd;
+       u8 reserved[3];
+} __packed;
+
+struct opa_port_status_req {
+       __u8 port_num;
+       __u8 reserved[3];
+       __be32 vl_select_mask;
+};
+
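+/* data VLs 0-7 plus VL15 */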
+#define VL_MASK_ALL            0x000080ff
+
+struct opa_port_status_rsp {
+       __u8 port_num;
+       __u8 reserved[3];
+       __be32  vl_select_mask;
+
+       /* Data counters */
+       __be64 port_xmit_data;
+       __be64 port_rcv_data;
+       __be64 port_xmit_pkts;
+       __be64 port_rcv_pkts;
+       __be64 port_multicast_xmit_pkts;
+       __be64 port_multicast_rcv_pkts;
+       __be64 port_xmit_wait;
+       __be64 sw_port_congestion;
+       __be64 port_rcv_fecn;
+       __be64 port_rcv_becn;
+       __be64 port_xmit_time_cong;
+       __be64 port_xmit_wasted_bw;
+       __be64 port_xmit_wait_data;
+       __be64 port_rcv_bubble;
+       __be64 port_mark_fecn;
+       /* Error counters */
+       __be64 port_rcv_constraint_errors;
+       __be64 port_rcv_switch_relay_errors;
+       __be64 port_xmit_discards;
+       __be64 port_xmit_constraint_errors;
+       __be64 port_rcv_remote_physical_errors;
+       __be64 local_link_integrity_errors;
+       __be64 port_rcv_errors;
+       __be64 excessive_buffer_overruns;
+       __be64 fm_config_errors;
+       __be32 link_error_recovery;
+       __be32 link_downed;
+       u8 uncorrectable_errors;
+
+       u8 link_quality_indicator; /* 5res, 3bit */
+       u8 res2[6];
+       struct _vls_pctrs {
+               /* per-VL Data counters */
+               __be64 port_vl_xmit_data;
+               __be64 port_vl_rcv_data;
+               __be64 port_vl_xmit_pkts;
+               __be64 port_vl_rcv_pkts;
+               __be64 port_vl_xmit_wait;
+               __be64 sw_port_vl_congestion;
+               __be64 port_vl_rcv_fecn;
+               __be64 port_vl_rcv_becn;
+               __be64 port_xmit_time_cong;
+               __be64 port_vl_xmit_wasted_bw;
+               __be64 port_vl_xmit_wait_data;
+               __be64 port_vl_rcv_bubble;
+               __be64 port_vl_mark_fecn;
+               __be64 port_vl_xmit_discards;
+       } vls[0]; /* real array size defined by # bits set in vl_select_mask */
+};
+
+enum counter_selects {
+       CS_PORT_XMIT_DATA                       = (1 << 31),
+       CS_PORT_RCV_DATA                        = (1 << 30),
+       CS_PORT_XMIT_PKTS                       = (1 << 29),
+       CS_PORT_RCV_PKTS                        = (1 << 28),
+       CS_PORT_MCAST_XMIT_PKTS                 = (1 << 27),
+       CS_PORT_MCAST_RCV_PKTS                  = (1 << 26),
+       CS_PORT_XMIT_WAIT                       = (1 << 25),
+       CS_SW_PORT_CONGESTION                   = (1 << 24),
+       CS_PORT_RCV_FECN                        = (1 << 23),
+       CS_PORT_RCV_BECN                        = (1 << 22),
+       CS_PORT_XMIT_TIME_CONG                  = (1 << 21),
+       CS_PORT_XMIT_WASTED_BW                  = (1 << 20),
+       CS_PORT_XMIT_WAIT_DATA                  = (1 << 19),
+       CS_PORT_RCV_BUBBLE                      = (1 << 18),
+       CS_PORT_MARK_FECN                       = (1 << 17),
+       CS_PORT_RCV_CONSTRAINT_ERRORS           = (1 << 16),
+       CS_PORT_RCV_SWITCH_RELAY_ERRORS         = (1 << 15),
+       CS_PORT_XMIT_DISCARDS                   = (1 << 14),
+       CS_PORT_XMIT_CONSTRAINT_ERRORS          = (1 << 13),
+       CS_PORT_RCV_REMOTE_PHYSICAL_ERRORS      = (1 << 12),
+       CS_LOCAL_LINK_INTEGRITY_ERRORS          = (1 << 11),
+       CS_PORT_RCV_ERRORS                      = (1 << 10),
+       CS_EXCESSIVE_BUFFER_OVERRUNS            = (1 << 9),
+       CS_FM_CONFIG_ERRORS                     = (1 << 8),
+       CS_LINK_ERROR_RECOVERY                  = (1 << 7),
+       CS_LINK_DOWNED                          = (1 << 6),
+       CS_UNCORRECTABLE_ERRORS                 = (1 << 5),
+};
+
+struct opa_clear_port_status {
+       __be64 port_select_mask[4];
+       __be32 counter_select_mask;
+};
+
+struct opa_aggregate {
+       __be16 attr_id;
+       __be16 err_reqlength;   /* 1 bit, 8 res, 7 bit */
+       __be32 attr_mod;
+       u8 data[0];
+};
+
+#define MSK_LLI 0x000000f0
+#define MSK_LLI_SFT 4
+#define MSK_LER 0x0000000f
+#define MSK_LER_SFT 0
+#define ADD_LLI 8
+#define ADD_LER 2
+
+/* Request contains first three fields, response contains those plus the rest */
+struct opa_port_data_counters_msg {
+       __be64 port_select_mask[4];
+       __be32 vl_select_mask;
+       __be32 resolution;
+
+       /* Response fields follow */
+       struct _port_dctrs {
+               u8 port_number;
+               u8 reserved2[3];
+               __be32 link_quality_indicator; /* 29res, 3bit */
+
+               /* Data counters */
+               __be64 port_xmit_data;
+               __be64 port_rcv_data;
+               __be64 port_xmit_pkts;
+               __be64 port_rcv_pkts;
+               __be64 port_multicast_xmit_pkts;
+               __be64 port_multicast_rcv_pkts;
+               __be64 port_xmit_wait;
+               __be64 sw_port_congestion;
+               __be64 port_rcv_fecn;
+               __be64 port_rcv_becn;
+               __be64 port_xmit_time_cong;
+               __be64 port_xmit_wasted_bw;
+               __be64 port_xmit_wait_data;
+               __be64 port_rcv_bubble;
+               __be64 port_mark_fecn;
+
+               __be64 port_error_counter_summary;
+               /* Sum of error counts/port */
+
+               struct _vls_dctrs {
+                       /* per-VL Data counters */
+                       __be64 port_vl_xmit_data;
+                       __be64 port_vl_rcv_data;
+                       __be64 port_vl_xmit_pkts;
+                       __be64 port_vl_rcv_pkts;
+                       __be64 port_vl_xmit_wait;
+                       __be64 sw_port_vl_congestion;
+                       __be64 port_vl_rcv_fecn;
+                       __be64 port_vl_rcv_becn;
+                       __be64 port_xmit_time_cong;
+                       __be64 port_vl_xmit_wasted_bw;
+                       __be64 port_vl_xmit_wait_data;
+                       __be64 port_vl_rcv_bubble;
+                       __be64 port_vl_mark_fecn;
+               } vls[0];
+               /* array size defined by #bits set in vl_select_mask */
+       } port[1]; /* array size defined by #ports in attribute modifier */
+};
+
+struct opa_port_error_counters64_msg {
+       /*
+        * Request contains the first two fields; the response contains
+        * the entire structure.
+        */
+       __be64 port_select_mask[4];
+       __be32 vl_select_mask;
+
+       /* Response-only fields follow */
+       __be32 reserved1;
+       struct _port_ectrs {
+               u8 port_number;
+               u8 reserved2[7];
+               __be64 port_rcv_constraint_errors;
+               __be64 port_rcv_switch_relay_errors;
+               __be64 port_xmit_discards;
+               __be64 port_xmit_constraint_errors;
+               __be64 port_rcv_remote_physical_errors;
+               __be64 local_link_integrity_errors;
+               __be64 port_rcv_errors;
+               __be64 excessive_buffer_overruns;
+               __be64 fm_config_errors;
+               __be32 link_error_recovery;
+               __be32 link_downed;
+               u8 uncorrectable_errors;
+               u8 reserved3[7];
+               struct _vls_ectrs {
+                       __be64 port_vl_xmit_discards;
+               } vls[0];
+               /* array size defined by #bits set in vl_select_mask */
+       } port[1]; /* array size defined by #ports in attribute modifier */
+};
+
+struct opa_port_error_info_msg {
+       __be64 port_select_mask[4];
+       __be32 error_info_select_mask;
+       __be32 reserved1;
+       struct _port_ei {
+               u8 port_number;
+               u8 reserved2[7];
+
+               /* PortRcvErrorInfo */
+               struct {
+                       u8 status_and_code;
+                       union {
+                               u8 raw[17];
+                               struct {
+                                       /* EI1to12 format */
+                                       u8 packet_flit1[8];
+                                       u8 packet_flit2[8];
+                                       u8 remaining_flit_bits12;
+                               } ei1to12;
+                               struct {
+                                       u8 packet_bytes[8];
+                                       u8 remaining_flit_bits;
+                               } ei13;
+                       } ei;
+                       u8 reserved3[6];
+               } __packed port_rcv_ei;
+
+               /* ExcessiveBufferOverrunInfo */
+               struct {
+                       u8 status_and_sc;
+                       u8 reserved4[7];
+               } __packed excessive_buffer_overrun_ei;
+
+               /* PortXmitConstraintErrorInfo */
+               struct {
+                       u8 status;
+                       u8 reserved5;
+                       __be16 pkey;
+                       __be32 slid;
+               } __packed port_xmit_constraint_ei;
+
+               /* PortRcvConstraintErrorInfo */
+               struct {
+                       u8 status;
+                       u8 reserved6;
+                       __be16 pkey;
+                       __be32 slid;
+               } __packed port_rcv_constraint_ei;
+
+               /* PortRcvSwitchRelayErrorInfo */
+               struct {
+                       u8 status_and_code;
+                       u8 reserved7[3];
+                       __u32 error_info;
+               } __packed port_rcv_switch_relay_ei;
+
+               /* UncorrectableErrorInfo */
+               struct {
+                       u8 status_and_code;
+                       u8 reserved8;
+               } __packed uncorrectable_ei;
+
+               /* FMConfigErrorInfo */
+               struct {
+                       u8 status_and_code;
+                       u8 error_info;
+               } __packed fm_config_ei;
+               __u32 reserved9;
+       } port[1]; /* actual array size defined by #ports in attr modifier */
+};
+
+/* opa_port_error_info_msg error_info_select_mask bit definitions */
+enum error_info_selects {
+       ES_PORT_RCV_ERROR_INFO                  = (1 << 31),
+       ES_EXCESSIVE_BUFFER_OVERRUN_INFO        = (1 << 30),
+       ES_PORT_XMIT_CONSTRAINT_ERROR_INFO      = (1 << 29),
+       ES_PORT_RCV_CONSTRAINT_ERROR_INFO       = (1 << 28),
+       ES_PORT_RCV_SWITCH_RELAY_ERROR_INFO     = (1 << 27),
+       ES_UNCORRECTABLE_ERROR_INFO             = (1 << 26),
+       ES_FM_CONFIG_ERROR_INFO                 = (1 << 25)
+};
+
+static int pma_get_opa_classportinfo(struct opa_pma_mad *pmp,
+                                    struct ib_device *ibdev, u32 *resp_len)
+{
+       struct opa_class_port_info *p =
+               (struct opa_class_port_info *)pmp->data;
+
+       memset(pmp->data, 0, sizeof(pmp->data));
+
+       if (pmp->mad_hdr.attr_mod != 0)
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+
+       p->base_version = OPA_MGMT_BASE_VERSION;
+       p->class_version = OPA_SMI_CLASS_VERSION;
+       /*
+        * Expected response time is 4.096 usec. * 2^18 == 1.073741824 sec.
+        */
+       p->cap_mask2_resp_time = cpu_to_be32(18);
+
+       if (resp_len)
+               *resp_len += sizeof(*p);
+
+       return reply((struct ib_mad_hdr *)pmp);
+}
+
+static void a0_portstatus(struct hfi1_pportdata *ppd,
+                         struct opa_port_status_rsp *rsp, u32 vl_select_mask)
+{
+       if (!is_bx(ppd->dd)) {
+               unsigned long vl;
+               u64 sum_vl_xmit_wait = 0;
+               u32 vl_all_mask = VL_MASK_ALL;
+
+               for_each_set_bit(vl, (unsigned long *)&(vl_all_mask),
+                                8 * sizeof(vl_all_mask)) {
+                       u64 tmp = sum_vl_xmit_wait +
+                                 read_port_cntr(ppd, C_TX_WAIT_VL,
+                                                idx_from_vl(vl));
+                       if (tmp < sum_vl_xmit_wait) {
+                               /* we wrapped */
+                               sum_vl_xmit_wait = (u64)~0;
+                               break;
+                       }
+                       sum_vl_xmit_wait = tmp;
+               }
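+               /* clamp port xmit wait to the per-VL sum */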
+               if (be64_to_cpu(rsp->port_xmit_wait) > sum_vl_xmit_wait)
+                       rsp->port_xmit_wait = cpu_to_be64(sum_vl_xmit_wait);
+       }
+}
+
+static int pma_get_opa_portstatus(struct opa_pma_mad *pmp,
+                                 struct ib_device *ibdev,
+                                 u8 port, u32 *resp_len)
+{
+       struct opa_port_status_req *req =
+               (struct opa_port_status_req *)pmp->data;
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct opa_port_status_rsp *rsp;
+       u32 vl_select_mask = be32_to_cpu(req->vl_select_mask);
+       unsigned long vl;
+       size_t response_data_size;
+       u32 nports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
+       u8 port_num = req->port_num;
+       u8 num_vls = hweight32(vl_select_mask);
+       struct _vls_pctrs *vlinfo;
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       int vfi;
+       u64 tmp, tmp2;
+
+       response_data_size = sizeof(struct opa_port_status_rsp) +
+                               num_vls * sizeof(struct _vls_pctrs);
+       if (response_data_size > sizeof(pmp->data)) {
+               pmp->mad_hdr.status |= OPA_PM_STATUS_REQUEST_TOO_LARGE;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       if (nports != 1 || (port_num && port_num != port) ||
+           num_vls > OPA_MAX_VLS || (vl_select_mask & ~VL_MASK_ALL)) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       memset(pmp->data, 0, sizeof(pmp->data));
+
+       rsp = (struct opa_port_status_rsp *)pmp->data;
+       if (port_num)
+               rsp->port_num = port_num;
+       else
+               rsp->port_num = port;
+
+       rsp->port_rcv_constraint_errors =
+               cpu_to_be64(read_port_cntr(ppd, C_SW_RCV_CSTR_ERR,
+                                          CNTR_INVALID_VL));
+
+       hfi1_read_link_quality(dd, &rsp->link_quality_indicator);
+
+       rsp->vl_select_mask = cpu_to_be32(vl_select_mask);
+       rsp->port_xmit_data = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_FLITS,
+                                         CNTR_INVALID_VL));
+       rsp->port_rcv_data = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FLITS,
+                                        CNTR_INVALID_VL));
+       rsp->port_xmit_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_PKTS,
+                                         CNTR_INVALID_VL));
+       rsp->port_rcv_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_PKTS,
+                                        CNTR_INVALID_VL));
+       rsp->port_multicast_xmit_pkts =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_MC_XMIT_PKTS,
+                                         CNTR_INVALID_VL));
+       rsp->port_multicast_rcv_pkts =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_MC_RCV_PKTS,
+                                         CNTR_INVALID_VL));
+       rsp->port_xmit_wait =
+               cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL));
+       rsp->port_rcv_fecn =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL));
+       rsp->port_rcv_becn =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL));
+       rsp->port_xmit_discards =
+               cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD,
+                                          CNTR_INVALID_VL));
+       rsp->port_xmit_constraint_errors =
+               cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR,
+                                          CNTR_INVALID_VL));
+       rsp->port_rcv_remote_physical_errors =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
+                                         CNTR_INVALID_VL));
+       tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL);
+       tmp2 = tmp + read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL);
+       if (tmp2 < tmp) {
+               /* overflow/wrapped */
+               rsp->local_link_integrity_errors = cpu_to_be64(~0);
+       } else {
+               rsp->local_link_integrity_errors = cpu_to_be64(tmp2);
+       }
+       tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL);
+       tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
+                                  CNTR_INVALID_VL);
+       if (tmp2 > (u32)UINT_MAX || tmp2 < tmp) {
+               /* overflow/wrapped */
+               rsp->link_error_recovery = cpu_to_be32(~0);
+       } else {
+               rsp->link_error_recovery = cpu_to_be32(tmp2);
+       }
+       rsp->port_rcv_errors =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL));
+       rsp->excessive_buffer_overruns =
+               cpu_to_be64(read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL));
+       rsp->fm_config_errors =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_FM_CFG_ERR,
+                                         CNTR_INVALID_VL));
+       rsp->link_downed = cpu_to_be32(read_port_cntr(ppd, C_SW_LINK_DOWN,
+                                                     CNTR_INVALID_VL));
+
+       /* rsp->uncorrectable_errors is 8 bits wide, and it pegs at 0xff */
+       tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL);
+       rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff;
+
+       vlinfo = &rsp->vls[0];
+       vfi = 0;
+       /* The vl_select_mask has been checked above, and we know
+        * that it contains only entries which represent valid VLs.
+        * So in the for_each_set_bit() loop below, we don't need
+        * any additional checks for vl.
+        */
+       for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
+                        8 * sizeof(vl_select_mask)) {
+               memset(vlinfo, 0, sizeof(*vlinfo));
+
+               tmp = read_dev_cntr(dd, C_DC_RX_FLIT_VL, idx_from_vl(vl));
+               rsp->vls[vfi].port_vl_rcv_data = cpu_to_be64(tmp);
+
+               rsp->vls[vfi].port_vl_rcv_pkts =
+                       cpu_to_be64(read_dev_cntr(dd, C_DC_RX_PKT_VL,
+                                                 idx_from_vl(vl)));
+
+               rsp->vls[vfi].port_vl_xmit_data =
+                       cpu_to_be64(read_port_cntr(ppd, C_TX_FLIT_VL,
+                                                  idx_from_vl(vl)));
+
+               rsp->vls[vfi].port_vl_xmit_pkts =
+                       cpu_to_be64(read_port_cntr(ppd, C_TX_PKT_VL,
+                                                  idx_from_vl(vl)));
+
+               rsp->vls[vfi].port_vl_xmit_wait =
+                       cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT_VL,
+                                                  idx_from_vl(vl)));
+
+               rsp->vls[vfi].port_vl_rcv_fecn =
+                       cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN_VL,
+                                                 idx_from_vl(vl)));
+
+               rsp->vls[vfi].port_vl_rcv_becn =
+                       cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL,
+                                                 idx_from_vl(vl)));
+
+               vlinfo++;
+               vfi++;
+       }
+
+       a0_portstatus(ppd, rsp, vl_select_mask);
+
+       if (resp_len)
+               *resp_len += response_data_size;
+
+       return reply((struct ib_mad_hdr *)pmp);
+}
+
+static u64 get_error_counter_summary(struct ib_device *ibdev, u8 port,
+                                    u8 res_lli, u8 res_ler)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       u64 error_counter_summary = 0, tmp;
+
+       error_counter_summary += read_port_cntr(ppd, C_SW_RCV_CSTR_ERR,
+                                               CNTR_INVALID_VL);
+       /* port_rcv_switch_relay_errors is 0 for HFIs */
+       error_counter_summary += read_port_cntr(ppd, C_SW_XMIT_DSCD,
+                                               CNTR_INVALID_VL);
+       error_counter_summary += read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR,
+                                               CNTR_INVALID_VL);
+       error_counter_summary += read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
+                                              CNTR_INVALID_VL);
+       /* local link integrity must be right-shifted by the lli resolution */
+       tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL);
+       tmp += read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL);
+       error_counter_summary += (tmp >> res_lli);
+       /* link error recovery must be right-shifted by the ler resolution */
+       tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL);
+       tmp += read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, CNTR_INVALID_VL);
+       error_counter_summary += (tmp >> res_ler);
+       error_counter_summary += read_dev_cntr(dd, C_DC_RCV_ERR,
+                                              CNTR_INVALID_VL);
+       error_counter_summary += read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL);
+       error_counter_summary += read_dev_cntr(dd, C_DC_FM_CFG_ERR,
+                                              CNTR_INVALID_VL);
+       /* ppd->link_downed is a 32-bit value */
+       error_counter_summary += read_port_cntr(ppd, C_SW_LINK_DOWN,
+                                               CNTR_INVALID_VL);
+       tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL);
+       /* this is an 8-bit quantity */
+       error_counter_summary += tmp < 0x100 ? (tmp & 0xff) : 0xff;
+
+       return error_counter_summary;
+}
+
+static void a0_datacounters(struct hfi1_pportdata *ppd, struct _port_dctrs *rsp,
+                           u32 vl_select_mask)
+{
+       if (!is_bx(ppd->dd)) {
+               unsigned long vl;
+               u64 sum_vl_xmit_wait = 0;
+               u32 vl_all_mask = VL_MASK_ALL;
+
+               for_each_set_bit(vl, (unsigned long *)&(vl_all_mask),
+                                8 * sizeof(vl_all_mask)) {
+                       u64 tmp = sum_vl_xmit_wait +
+                                 read_port_cntr(ppd, C_TX_WAIT_VL,
+                                                idx_from_vl(vl));
+                       if (tmp < sum_vl_xmit_wait) {
+                               /* we wrapped */
+                               sum_vl_xmit_wait = (u64)~0;
+                               break;
+                       }
+                       sum_vl_xmit_wait = tmp;
+               }
+               if (be64_to_cpu(rsp->port_xmit_wait) > sum_vl_xmit_wait)
+                       rsp->port_xmit_wait = cpu_to_be64(sum_vl_xmit_wait);
+       }
+}
+
+static void pma_get_opa_port_dctrs(struct ib_device *ibdev,
+                                  struct _port_dctrs *rsp)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+
+       rsp->port_xmit_data = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_FLITS,
+                                               CNTR_INVALID_VL));
+       rsp->port_rcv_data = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FLITS,
+                                               CNTR_INVALID_VL));
+       rsp->port_xmit_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_PKTS,
+                                               CNTR_INVALID_VL));
+       rsp->port_rcv_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_PKTS,
+                                               CNTR_INVALID_VL));
+       rsp->port_multicast_xmit_pkts =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_MC_XMIT_PKTS,
+                                         CNTR_INVALID_VL));
+       rsp->port_multicast_rcv_pkts =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_MC_RCV_PKTS,
+                                         CNTR_INVALID_VL));
+}
+
+static int pma_get_opa_datacounters(struct opa_pma_mad *pmp,
+                                   struct ib_device *ibdev,
+                                   u8 port, u32 *resp_len)
+{
+       struct opa_port_data_counters_msg *req =
+               (struct opa_port_data_counters_msg *)pmp->data;
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       struct _port_dctrs *rsp;
+       struct _vls_dctrs *vlinfo;
+       size_t response_data_size;
+       u32 num_ports;
+       u8 num_pslm;
+       u8 lq, num_vls;
+       u8 res_lli, res_ler;
+       u64 port_mask;
+       unsigned long port_num;
+       unsigned long vl;
+       u32 vl_select_mask;
+       int vfi;
+
+       num_ports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
+       num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
+       num_vls = hweight32(be32_to_cpu(req->vl_select_mask));
+       vl_select_mask = be32_to_cpu(req->vl_select_mask);
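+       /*
+        * a non-zero resolution field right-shifts the summed error
+        * counters by that value plus a fixed bias (ADD_LLI/ADD_LER)
+        */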
+       res_lli = (u8)(be32_to_cpu(req->resolution) & MSK_LLI) >> MSK_LLI_SFT;
+       res_lli = res_lli ? res_lli + ADD_LLI : 0;
+       res_ler = (u8)(be32_to_cpu(req->resolution) & MSK_LER) >> MSK_LER_SFT;
+       res_ler = res_ler ? res_ler + ADD_LER : 0;
+
+       if (num_ports != 1 || (vl_select_mask & ~VL_MASK_ALL)) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       /* Sanity check */
+       response_data_size = sizeof(struct opa_port_data_counters_msg) +
+                               num_vls * sizeof(struct _vls_dctrs);
+
+       if (response_data_size > sizeof(pmp->data)) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       /*
+        * The bit set in the mask needs to be consistent with the
+        * port the request came in on.
+        */
+       port_mask = be64_to_cpu(req->port_select_mask[3]);
+       port_num = find_first_bit((unsigned long *)&port_mask,
+                                 sizeof(port_mask));
+
+       if ((u8)port_num != port) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       rsp = &req->port[0];
+       memset(rsp, 0, sizeof(*rsp));
+
+       rsp->port_number = port;
+       /*
+        * Note that link_quality_indicator is a 32 bit quantity in
+        * 'datacounters' queries (as opposed to 'portinfo' queries,
+        * where it's a byte).
+        */
+       hfi1_read_link_quality(dd, &lq);
+       rsp->link_quality_indicator = cpu_to_be32((u32)lq);
+       pma_get_opa_port_dctrs(ibdev, rsp);
+
+       rsp->port_xmit_wait =
+               cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL));
+       rsp->port_rcv_fecn =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL));
+       rsp->port_rcv_becn =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL));
+       rsp->port_error_counter_summary =
+               cpu_to_be64(get_error_counter_summary(ibdev, port,
+                                                     res_lli, res_ler));
+
+       vlinfo = &rsp->vls[0];
+       vfi = 0;
+       /* The vl_select_mask has been checked above, and we know
+        * that it contains only entries which represent valid VLs.
+        * So in the for_each_set_bit() loop below, we don't need
+        * any additional checks for vl.
+        */
+       for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
+                        8 * sizeof(req->vl_select_mask)) {
+               memset(vlinfo, 0, sizeof(*vlinfo));
+
+               rsp->vls[vfi].port_vl_xmit_data =
+                       cpu_to_be64(read_port_cntr(ppd, C_TX_FLIT_VL,
+                                                  idx_from_vl(vl)));
+
+               rsp->vls[vfi].port_vl_rcv_data =
+                       cpu_to_be64(read_dev_cntr(dd, C_DC_RX_FLIT_VL,
+                                                 idx_from_vl(vl)));
+
+               rsp->vls[vfi].port_vl_xmit_pkts =
+                       cpu_to_be64(read_port_cntr(ppd, C_TX_PKT_VL,
+                                                  idx_from_vl(vl)));
+
+               rsp->vls[vfi].port_vl_rcv_pkts =
+                       cpu_to_be64(read_dev_cntr(dd, C_DC_RX_PKT_VL,
+                                                 idx_from_vl(vl)));
+
+               rsp->vls[vfi].port_vl_xmit_wait =
+                       cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT_VL,
+                                                  idx_from_vl(vl)));
+
+               rsp->vls[vfi].port_vl_rcv_fecn =
+                       cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN_VL,
+                                                 idx_from_vl(vl)));
+               rsp->vls[vfi].port_vl_rcv_becn =
+                       cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL,
+                                                 idx_from_vl(vl)));
+
+               /* rsp->port_vl_xmit_time_cong is 0 for HFIs */
+               /* rsp->port_vl_xmit_wasted_bw ??? */
+               /* port_vl_xmit_wait_data - TXE (table 13-9 HFI spec) ???
+                * does this differ from rsp->vls[vfi].port_vl_xmit_wait?
+                */
+               /*rsp->vls[vfi].port_vl_mark_fecn =
+                *      cpu_to_be64(read_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT
+                *              + offset));
+                */
+               vlinfo++;
+               vfi++;
+       }
+
+       a0_datacounters(ppd, rsp, vl_select_mask);
+
+       if (resp_len)
+               *resp_len += response_data_size;
+
+       return reply((struct ib_mad_hdr *)pmp);
+}
+
+static int pma_get_ib_portcounters_ext(struct ib_pma_mad *pmp,
+                                      struct ib_device *ibdev, u8 port)
+{
+       struct ib_pma_portcounters_ext *p = (struct ib_pma_portcounters_ext *)
+                                               pmp->data;
+       struct _port_dctrs rsp;
+
+       if (pmp->mad_hdr.attr_mod != 0 || p->port_select != port) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               goto bail;
+       }
+
+       memset(&rsp, 0, sizeof(rsp));
+       pma_get_opa_port_dctrs(ibdev, &rsp);
+
+       p->port_xmit_data = rsp.port_xmit_data;
+       p->port_rcv_data = rsp.port_rcv_data;
+       p->port_xmit_packets = rsp.port_xmit_pkts;
+       p->port_rcv_packets = rsp.port_rcv_pkts;
+       p->port_unicast_xmit_packets = 0;
+       p->port_unicast_rcv_packets =  0;
+       p->port_multicast_xmit_packets = rsp.port_multicast_xmit_pkts;
+       p->port_multicast_rcv_packets = rsp.port_multicast_rcv_pkts;
+
+bail:
+       return reply((struct ib_mad_hdr *)pmp);
+}
+
+static void pma_get_opa_port_ectrs(struct ib_device *ibdev,
+                                  struct _port_ectrs *rsp, u8 port)
+{
+       u64 tmp, tmp2;
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+       tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL);
+       tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
+                                       CNTR_INVALID_VL);
+       if (tmp2 > (u32)UINT_MAX || tmp2 < tmp) {
+               /* overflow/wrapped */
+               rsp->link_error_recovery = cpu_to_be32(~0);
+       } else {
+               rsp->link_error_recovery = cpu_to_be32(tmp2);
+       }
+
+       rsp->link_downed = cpu_to_be32(read_port_cntr(ppd, C_SW_LINK_DOWN,
+                                               CNTR_INVALID_VL));
+       rsp->port_rcv_errors =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL));
+       rsp->port_rcv_remote_physical_errors =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
+                                         CNTR_INVALID_VL));
+       rsp->port_rcv_switch_relay_errors = 0;
+       rsp->port_xmit_discards =
+               cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD,
+                                          CNTR_INVALID_VL));
+       rsp->port_xmit_constraint_errors =
+               cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR,
+                                          CNTR_INVALID_VL));
+       rsp->port_rcv_constraint_errors =
+               cpu_to_be64(read_port_cntr(ppd, C_SW_RCV_CSTR_ERR,
+                                          CNTR_INVALID_VL));
+       tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL);
+       tmp2 = tmp + read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL);
+       if (tmp2 < tmp) {
+               /* overflow/wrapped */
+               rsp->local_link_integrity_errors = cpu_to_be64(~0);
+       } else {
+               rsp->local_link_integrity_errors = cpu_to_be64(tmp2);
+       }
+       rsp->excessive_buffer_overruns =
+               cpu_to_be64(read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL));
+}
+
+static int pma_get_opa_porterrors(struct opa_pma_mad *pmp,
+                                 struct ib_device *ibdev,
+                                 u8 port, u32 *resp_len)
+{
+       size_t response_data_size;
+       struct _port_ectrs *rsp;
+       u8 port_num;
+       struct opa_port_error_counters64_msg *req;
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       u32 num_ports;
+       u8 num_pslm;
+       u8 num_vls;
+       struct hfi1_ibport *ibp;
+       struct hfi1_pportdata *ppd;
+       struct _vls_ectrs *vlinfo;
+       unsigned long vl;
+       u64 port_mask, tmp;
+       u32 vl_select_mask;
+       int vfi;
+
+       req = (struct opa_port_error_counters64_msg *)pmp->data;
+
+       num_ports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
+
+       num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
+       num_vls = hweight32(be32_to_cpu(req->vl_select_mask));
+
+       if (num_ports != 1 || num_ports != num_pslm) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       response_data_size = sizeof(struct opa_port_error_counters64_msg) +
+                               num_vls * sizeof(struct _vls_ectrs);
+
+       if (response_data_size > sizeof(pmp->data)) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+       /*
+        * The bit set in the mask needs to be consistent with the
+        * port the request came in on.
+        */
+       port_mask = be64_to_cpu(req->port_select_mask[3]);
+       port_num = find_first_bit((unsigned long *)&port_mask,
+                                 sizeof(port_mask));
+
+       if (port_num != port) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       rsp = &req->port[0];
+
+       ibp = to_iport(ibdev, port_num);
+       ppd = ppd_from_ibp(ibp);
+
+       memset(rsp, 0, sizeof(*rsp));
+       rsp->port_number = port_num;
+
+       pma_get_opa_port_ectrs(ibdev, rsp, port_num);
+
+       rsp->port_rcv_remote_physical_errors =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
+                                         CNTR_INVALID_VL));
+       rsp->fm_config_errors =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_FM_CFG_ERR,
+                                         CNTR_INVALID_VL));
+       tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL);
+
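+       /* uncorrectable_errors is 8 bits wide and pegs at 0xff */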
+       rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff;
+
+       vlinfo = &rsp->vls[0];
+       vfi = 0;
+       vl_select_mask = be32_to_cpu(req->vl_select_mask);
+       for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
+                        8 * sizeof(req->vl_select_mask)) {
+               memset(vlinfo, 0, sizeof(*vlinfo));
+               /* vlinfo->vls[vfi].port_vl_xmit_discards ??? */
+               vlinfo += 1;
+               vfi++;
+       }
+
+       if (resp_len)
+               *resp_len += response_data_size;
+
+       return reply((struct ib_mad_hdr *)pmp);
+}
+
+static int pma_get_ib_portcounters(struct ib_pma_mad *pmp,
+                                  struct ib_device *ibdev, u8 port)
+{
+       struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
+               pmp->data;
+       struct _port_ectrs rsp;
+       u64 temp_link_overrun_errors;
+       u64 temp_64;
+       u32 temp_32;
+
+       memset(&rsp, 0, sizeof(rsp));
+       pma_get_opa_port_ectrs(ibdev, &rsp, port);
+
+       if (pmp->mad_hdr.attr_mod != 0 || p->port_select != port) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               goto bail;
+       }
+
+       p->symbol_error_counter = 0; /* N/A for OPA */
+
+       temp_32 = be32_to_cpu(rsp.link_error_recovery);
+       if (temp_32 > 0xFFUL)
+               p->link_error_recovery_counter = 0xFF;
+       else
+               p->link_error_recovery_counter = (u8)temp_32;
+
+       temp_32 = be32_to_cpu(rsp.link_downed);
+       if (temp_32 > 0xFFUL)
+               p->link_downed_counter = 0xFF;
+       else
+               p->link_downed_counter = (u8)temp_32;
+
+       temp_64 = be64_to_cpu(rsp.port_rcv_errors);
+       if (temp_64 > 0xFFFFUL)
+               p->port_rcv_errors = cpu_to_be16(0xFFFF);
+       else
+               p->port_rcv_errors = cpu_to_be16((u16)temp_64);
+
+       temp_64 = be64_to_cpu(rsp.port_rcv_remote_physical_errors);
+       if (temp_64 > 0xFFFFUL)
+               p->port_rcv_remphys_errors = cpu_to_be16(0xFFFF);
+       else
+               p->port_rcv_remphys_errors = cpu_to_be16((u16)temp_64);
+
+       temp_64 = be64_to_cpu(rsp.port_rcv_switch_relay_errors);
+       p->port_rcv_switch_relay_errors = cpu_to_be16((u16)temp_64);
+
+       temp_64 = be64_to_cpu(rsp.port_xmit_discards);
+       if (temp_64 > 0xFFFFUL)
+               p->port_xmit_discards = cpu_to_be16(0xFFFF);
+       else
+               p->port_xmit_discards = cpu_to_be16((u16)temp_64);
+
+       temp_64 = be64_to_cpu(rsp.port_xmit_constraint_errors);
+       if (temp_64 > 0xFFUL)
+               p->port_xmit_constraint_errors = 0xFF;
+       else
+               p->port_xmit_constraint_errors = (u8)temp_64;
+
+       temp_64 = be64_to_cpu(rsp.port_rcv_constraint_errors);
+       if (temp_64 > 0xFFUL)
+               p->port_rcv_constraint_errors = 0xFF;
+       else
+               p->port_rcv_constraint_errors = (u8)temp_64;
+
+       /* LocalLink: 7:4, BufferOverrun: 3:0 */
+       temp_64 = be64_to_cpu(rsp.local_link_integrity_errors);
+       if (temp_64 > 0xFUL)
+               temp_64 = 0xFUL;
+
+       temp_link_overrun_errors = temp_64 << 4;
+
+       temp_64 = be64_to_cpu(rsp.excessive_buffer_overruns);
+       if (temp_64 > 0xFUL)
+               temp_64 = 0xFUL;
+       temp_link_overrun_errors |= temp_64;
+
+       p->link_overrun_errors = (u8)temp_link_overrun_errors;
+
+       p->vl15_dropped = 0; /* N/A for OPA */
+
+bail:
+       return reply((struct ib_mad_hdr *)pmp);
+}
+
+static int pma_get_opa_errorinfo(struct opa_pma_mad *pmp,
+                                struct ib_device *ibdev,
+                                u8 port, u32 *resp_len)
+{
+       size_t response_data_size;
+       struct _port_ei *rsp;
+       struct opa_port_error_info_msg *req;
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       u64 port_mask;
+       u32 num_ports;
+       u8 port_num;
+       u8 num_pslm;
+       u64 reg;
+
+       req = (struct opa_port_error_info_msg *)pmp->data;
+       rsp = &req->port[0];
+
+       num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod));
+       num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
+
+       memset(rsp, 0, sizeof(*rsp));
+
+       if (num_ports != 1 || num_ports != num_pslm) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       /* Sanity check */
+       response_data_size = sizeof(struct opa_port_error_info_msg);
+
+       if (response_data_size > sizeof(pmp->data)) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       /*
+        * The bit set in the mask needs to be consistent with the port
+        * the request came in on.
+        */
+       port_mask = be64_to_cpu(req->port_select_mask[3]);
+       port_num = find_first_bit((unsigned long *)&port_mask,
+                                 sizeof(port_mask) * 8);
+
+       if (port_num != port) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       /* PortRcvErrorInfo */
+       rsp->port_rcv_ei.status_and_code =
+               dd->err_info_rcvport.status_and_code;
+       memcpy(&rsp->port_rcv_ei.ei.ei1to12.packet_flit1,
+              &dd->err_info_rcvport.packet_flit1, sizeof(u64));
+       memcpy(&rsp->port_rcv_ei.ei.ei1to12.packet_flit2,
+              &dd->err_info_rcvport.packet_flit2, sizeof(u64));
+
+       /* ExcessiveBufferOverrunInfo */
+       reg = read_csr(dd, RCV_ERR_INFO);
+       if (reg & RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK) {
+               /*
+                * if the RcvExcessBufferOverrun bit is set, save SC of
+                * first pkt that encountered an excess buffer overrun
+                */
+               u8 tmp = (u8)reg;
+
+               tmp &=  RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SC_SMASK;
+               tmp <<= 2;
+               rsp->excessive_buffer_overrun_ei.status_and_sc = tmp;
+               /* set the status bit */
+               rsp->excessive_buffer_overrun_ei.status_and_sc |= 0x80;
+       }
+
+       rsp->port_xmit_constraint_ei.status =
+               dd->err_info_xmit_constraint.status;
+       rsp->port_xmit_constraint_ei.pkey =
+               cpu_to_be16(dd->err_info_xmit_constraint.pkey);
+       rsp->port_xmit_constraint_ei.slid =
+               cpu_to_be32(dd->err_info_xmit_constraint.slid);
+
+       rsp->port_rcv_constraint_ei.status =
+               dd->err_info_rcv_constraint.status;
+       rsp->port_rcv_constraint_ei.pkey =
+               cpu_to_be16(dd->err_info_rcv_constraint.pkey);
+       rsp->port_rcv_constraint_ei.slid =
+               cpu_to_be32(dd->err_info_rcv_constraint.slid);
+
+       /* UncorrectableErrorInfo */
+       rsp->uncorrectable_ei.status_and_code = dd->err_info_uncorrectable;
+
+       /* FMConfigErrorInfo */
+       rsp->fm_config_ei.status_and_code = dd->err_info_fmconfig;
+
+       if (resp_len)
+               *resp_len += response_data_size;
+
+       return reply((struct ib_mad_hdr *)pmp);
+}
+
+static int pma_set_opa_portstatus(struct opa_pma_mad *pmp,
+                                 struct ib_device *ibdev,
+                                 u8 port, u32 *resp_len)
+{
+       struct opa_clear_port_status *req =
+               (struct opa_clear_port_status *)pmp->data;
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       u32 nports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
+       u64 portn = be64_to_cpu(req->port_select_mask[3]);
+       u32 counter_select = be32_to_cpu(req->counter_select_mask);
+       u32 vl_select_mask = VL_MASK_ALL; /* clear all per-vl cnts */
+       unsigned long vl;
+
+       if ((nports != 1) || (portn != 1 << port)) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+       /*
+        * only counters returned by pma_get_opa_portstatus() are
+        * handled, so when pma_get_opa_portstatus() gets a fix,
+        * the corresponding change should be made here as well.
+        */
+
+       if (counter_select & CS_PORT_XMIT_DATA)
+               write_dev_cntr(dd, C_DC_XMIT_FLITS, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_PORT_RCV_DATA)
+               write_dev_cntr(dd, C_DC_RCV_FLITS, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_PORT_XMIT_PKTS)
+               write_dev_cntr(dd, C_DC_XMIT_PKTS, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_PORT_RCV_PKTS)
+               write_dev_cntr(dd, C_DC_RCV_PKTS, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_PORT_MCAST_XMIT_PKTS)
+               write_dev_cntr(dd, C_DC_MC_XMIT_PKTS, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_PORT_MCAST_RCV_PKTS)
+               write_dev_cntr(dd, C_DC_MC_RCV_PKTS, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_PORT_XMIT_WAIT)
+               write_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL, 0);
+
+       /* ignore cs_sw_portCongestion for HFIs */
+
+       if (counter_select & CS_PORT_RCV_FECN)
+               write_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_PORT_RCV_BECN)
+               write_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL, 0);
+
+       /* ignore cs_port_xmit_time_cong for HFIs */
+       /* ignore cs_port_xmit_wasted_bw for now */
+       /* ignore cs_port_xmit_wait_data for now */
+       if (counter_select & CS_PORT_RCV_BUBBLE)
+               write_dev_cntr(dd, C_DC_RCV_BBL, CNTR_INVALID_VL, 0);
+
+       /* Only applicable for switch */
+       /* if (counter_select & CS_PORT_MARK_FECN)
+        *      write_csr(dd, DCC_PRF_PORT_MARK_FECN_CNT, 0);
+        */
+
+       if (counter_select & CS_PORT_RCV_CONSTRAINT_ERRORS)
+               write_port_cntr(ppd, C_SW_RCV_CSTR_ERR, CNTR_INVALID_VL, 0);
+
+       /* ignore cs_port_rcv_switch_relay_errors for HFIs */
+       if (counter_select & CS_PORT_XMIT_DISCARDS)
+               write_port_cntr(ppd, C_SW_XMIT_DSCD, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_PORT_XMIT_CONSTRAINT_ERRORS)
+               write_port_cntr(ppd, C_SW_XMIT_CSTR_ERR, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_PORT_RCV_REMOTE_PHYSICAL_ERRORS)
+               write_dev_cntr(dd, C_DC_RMT_PHY_ERR, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_LOCAL_LINK_INTEGRITY_ERRORS) {
+               write_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL, 0);
+               write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0);
+       }
+
+       if (counter_select & CS_LINK_ERROR_RECOVERY) {
+               write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0);
+               write_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
+                              CNTR_INVALID_VL, 0);
+       }
+
+       if (counter_select & CS_PORT_RCV_ERRORS)
+               write_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_EXCESSIVE_BUFFER_OVERRUNS) {
+               write_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL, 0);
+               dd->rcv_ovfl_cnt = 0;
+       }
+
+       if (counter_select & CS_FM_CONFIG_ERRORS)
+               write_dev_cntr(dd, C_DC_FM_CFG_ERR, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_LINK_DOWNED)
+               write_port_cntr(ppd, C_SW_LINK_DOWN, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_UNCORRECTABLE_ERRORS)
+               write_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL, 0);
+
+       for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
+                        8 * sizeof(vl_select_mask)) {
+               if (counter_select & CS_PORT_XMIT_DATA)
+                       write_port_cntr(ppd, C_TX_FLIT_VL, idx_from_vl(vl), 0);
+
+               if (counter_select & CS_PORT_RCV_DATA)
+                       write_dev_cntr(dd, C_DC_RX_FLIT_VL, idx_from_vl(vl), 0);
+
+               if (counter_select & CS_PORT_XMIT_PKTS)
+                       write_port_cntr(ppd, C_TX_PKT_VL, idx_from_vl(vl), 0);
+
+               if (counter_select & CS_PORT_RCV_PKTS)
+                       write_dev_cntr(dd, C_DC_RX_PKT_VL, idx_from_vl(vl), 0);
+
+               if (counter_select & CS_PORT_XMIT_WAIT)
+                       write_port_cntr(ppd, C_TX_WAIT_VL, idx_from_vl(vl), 0);
+
+               /* sw_port_vl_congestion is 0 for HFIs */
+               if (counter_select & CS_PORT_RCV_FECN)
+                       write_dev_cntr(dd, C_DC_RCV_FCN_VL, idx_from_vl(vl), 0);
+
+               if (counter_select & CS_PORT_RCV_BECN)
+                       write_dev_cntr(dd, C_DC_RCV_BCN_VL, idx_from_vl(vl), 0);
+
+               /* port_vl_xmit_time_cong is 0 for HFIs */
+               /* port_vl_xmit_wasted_bw ??? */
+               /* port_vl_xmit_wait_data - TXE (table 13-9 HFI spec) ??? */
+               if (counter_select & CS_PORT_RCV_BUBBLE)
+                       write_dev_cntr(dd, C_DC_RCV_BBL_VL, idx_from_vl(vl), 0);
+
+               /* if (counter_select & CS_PORT_MARK_FECN)
+                *     write_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT + offset, 0);
+                */
+               /* port_vl_xmit_discards ??? */
+       }
+
+       if (resp_len)
+               *resp_len += sizeof(*req);
+
+       return reply((struct ib_mad_hdr *)pmp);
+}
+
+static int pma_set_opa_errorinfo(struct opa_pma_mad *pmp,
+                                struct ib_device *ibdev,
+                                u8 port, u32 *resp_len)
+{
+       struct _port_ei *rsp;
+       struct opa_port_error_info_msg *req;
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       u64 port_mask;
+       u32 num_ports;
+       u8 port_num;
+       u8 num_pslm;
+       u32 error_info_select;
+
+       req = (struct opa_port_error_info_msg *)pmp->data;
+       rsp = &req->port[0];
+
+       num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod));
+       num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
+
+       memset(rsp, 0, sizeof(*rsp));
+
+       if (num_ports != 1 || num_ports != num_pslm) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       /*
+        * The bit set in the mask needs to be consistent with the port
+        * the request came in on.
+        */
+       port_mask = be64_to_cpu(req->port_select_mask[3]);
+       port_num = find_first_bit((unsigned long *)&port_mask,
+                                 sizeof(port_mask) * 8);
+
+       if (port_num != port) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       error_info_select = be32_to_cpu(req->error_info_select_mask);
+
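+       /*
+        * For each selected error-info entry, clear only the status bit;
+        * the recorded error details are left in place.
+        */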
+       /* PortRcvErrorInfo */
+       if (error_info_select & ES_PORT_RCV_ERROR_INFO)
+               /* turn off status bit */
+               dd->err_info_rcvport.status_and_code &= ~OPA_EI_STATUS_SMASK;
+
+       /* ExcessiveBufferOverrunInfo */
+       if (error_info_select & ES_EXCESSIVE_BUFFER_OVERRUN_INFO)
+               /*
+                * status bit is essentially kept in the h/w - bit 5 of
+                * RCV_ERR_INFO
+                */
+               write_csr(dd, RCV_ERR_INFO,
+                         RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK);
+
+       if (error_info_select & ES_PORT_XMIT_CONSTRAINT_ERROR_INFO)
+               dd->err_info_xmit_constraint.status &= ~OPA_EI_STATUS_SMASK;
+
+       if (error_info_select & ES_PORT_RCV_CONSTRAINT_ERROR_INFO)
+               dd->err_info_rcv_constraint.status &= ~OPA_EI_STATUS_SMASK;
+
+       /* UncorrectableErrorInfo */
+       if (error_info_select & ES_UNCORRECTABLE_ERROR_INFO)
+               /* turn off status bit */
+               dd->err_info_uncorrectable &= ~OPA_EI_STATUS_SMASK;
+
+       /* FMConfigErrorInfo */
+       if (error_info_select & ES_FM_CONFIG_ERROR_INFO)
+               /* turn off status bit */
+               dd->err_info_fmconfig &= ~OPA_EI_STATUS_SMASK;
+
+       if (resp_len)
+               *resp_len += sizeof(*req);
+
+       return reply((struct ib_mad_hdr *)pmp);
+}
+
+struct opa_congestion_info_attr {
+       __be16 congestion_info;
+       u8 control_table_cap;   /* CCT capacity, in units of 64-entry blocks */
+       u8 congestion_log_length;
+} __packed;
+
+static int __subn_get_opa_cong_info(struct opa_smp *smp, u32 am, u8 *data,
+                                   struct ib_device *ibdev, u8 port,
+                                   u32 *resp_len)
+{
+       struct opa_congestion_info_attr *p =
+               (struct opa_congestion_info_attr *)data;
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+       p->congestion_info = 0;
+       p->control_table_cap = ppd->cc_max_table_entries;
+       p->congestion_log_length = OPA_CONG_LOG_ELEMS;
+
+       if (resp_len)
+               *resp_len += sizeof(*p);
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_get_opa_cong_setting(struct opa_smp *smp, u32 am,
+                                      u8 *data, struct ib_device *ibdev,
+                                      u8 port, u32 *resp_len)
+{
+       int i;
+       struct opa_congestion_setting_attr *p =
+               (struct opa_congestion_setting_attr *)data;
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       struct opa_congestion_setting_entry_shadow *entries;
+       struct cc_state *cc_state;
+
+       rcu_read_lock();
+
+       cc_state = get_cc_state(ppd);
+
+       if (!cc_state) {
+               rcu_read_unlock();
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       entries = cc_state->cong_setting.entries;
+       p->port_control = cpu_to_be16(cc_state->cong_setting.port_control);
+       p->control_map = cpu_to_be32(cc_state->cong_setting.control_map);
+       for (i = 0; i < OPA_MAX_SLS; i++) {
+               p->entries[i].ccti_increase = entries[i].ccti_increase;
+               p->entries[i].ccti_timer = cpu_to_be16(entries[i].ccti_timer);
+               p->entries[i].trigger_threshold =
+                       entries[i].trigger_threshold;
+               p->entries[i].ccti_min = entries[i].ccti_min;
+       }
+
+       rcu_read_unlock();
+
+       if (resp_len)
+               *resp_len += sizeof(*p);
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+/*
+ * Apply congestion control information stored in the ppd to the
+ * active structure.
+ */
+static void apply_cc_state(struct hfi1_pportdata *ppd)
+{
+       struct cc_state *old_cc_state, *new_cc_state;
+
+       new_cc_state = kzalloc(sizeof(*new_cc_state), GFP_KERNEL);
+       if (!new_cc_state)
+               return;
+
+       /*
+        * Hold the lock for updating *and* to prevent ppd information
+        * from changing during the update.
+        */
+       spin_lock(&ppd->cc_state_lock);
+
+       old_cc_state = get_cc_state(ppd);
+       if (!old_cc_state) {
+               /* never active, or shutting down */
+               spin_unlock(&ppd->cc_state_lock);
+               kfree(new_cc_state);
+               return;
+       }
+
+       *new_cc_state = *old_cc_state;
+
+       new_cc_state->cct.ccti_limit = ppd->total_cct_entry - 1;
+       memcpy(new_cc_state->cct.entries, ppd->ccti_entries,
+              ppd->total_cct_entry * sizeof(struct ib_cc_table_entry));
+
+       new_cc_state->cong_setting.port_control = IB_CC_CCS_PC_SL_BASED;
+       new_cc_state->cong_setting.control_map = ppd->cc_sl_control_map;
+       memcpy(new_cc_state->cong_setting.entries, ppd->congestion_entries,
+              OPA_MAX_SLS * sizeof(struct opa_congestion_setting_entry));
+
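+       /*
+        * Publish the new state; readers under rcu_read_lock() see either
+        * the old or the new copy, and the old copy is freed only after a
+        * grace period.
+        */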
+       rcu_assign_pointer(ppd->cc_state, new_cc_state);
+
+       spin_unlock(&ppd->cc_state_lock);
+
+       call_rcu(&old_cc_state->rcu, cc_state_reclaim);
+}
+
+static int __subn_set_opa_cong_setting(struct opa_smp *smp, u32 am, u8 *data,
+                                      struct ib_device *ibdev, u8 port,
+                                      u32 *resp_len)
+{
+       struct opa_congestion_setting_attr *p =
+               (struct opa_congestion_setting_attr *)data;
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       struct opa_congestion_setting_entry_shadow *entries;
+       int i;
+
+       /*
+        * Save details from packet into the ppd.  Hold the cc_state_lock so
+        * our information is consistent with anyone trying to apply the state.
+        */
+       spin_lock(&ppd->cc_state_lock);
+       ppd->cc_sl_control_map = be32_to_cpu(p->control_map);
+
+       entries = ppd->congestion_entries;
+       for (i = 0; i < OPA_MAX_SLS; i++) {
+               entries[i].ccti_increase = p->entries[i].ccti_increase;
+               entries[i].ccti_timer = be16_to_cpu(p->entries[i].ccti_timer);
+               entries[i].trigger_threshold =
+                       p->entries[i].trigger_threshold;
+               entries[i].ccti_min = p->entries[i].ccti_min;
+       }
+       spin_unlock(&ppd->cc_state_lock);
+
+       /* now apply the information */
+       apply_cc_state(ppd);
+
+       return __subn_get_opa_cong_setting(smp, am, data, ibdev, port,
+                                          resp_len);
+}
+
+static int __subn_get_opa_hfi1_cong_log(struct opa_smp *smp, u32 am,
+                                       u8 *data, struct ib_device *ibdev,
+                                       u8 port, u32 *resp_len)
+{
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       struct opa_hfi1_cong_log *cong_log = (struct opa_hfi1_cong_log *)data;
+       s64 ts;
+       int i;
+
+       if (am != 0) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       spin_lock_irq(&ppd->cc_log_lock);
+
+       cong_log->log_type = OPA_CC_LOG_TYPE_HFI;
+       cong_log->congestion_flags = 0;
+       cong_log->threshold_event_counter =
+               cpu_to_be16(ppd->threshold_event_counter);
+       memcpy(cong_log->threshold_cong_event_map,
+              ppd->threshold_cong_event_map,
+              sizeof(cong_log->threshold_cong_event_map));
+       /* keep timestamp in units of 1.024 usec */
+       ts = ktime_to_ns(ktime_get()) / 1024;
+       cong_log->current_time_stamp = cpu_to_be32(ts);
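+       /*
+        * Walk the congestion event ring starting at cc_mad_idx, wrapping
+        * at OPA_CONG_LOG_ELEMS, and copy each entry into the log.
+        */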
+       for (i = 0; i < OPA_CONG_LOG_ELEMS; i++) {
+               struct opa_hfi1_cong_log_event_internal *cce =
+                       &ppd->cc_events[ppd->cc_mad_idx++];
+               if (ppd->cc_mad_idx == OPA_CONG_LOG_ELEMS)
+                       ppd->cc_mad_idx = 0;
+               /*
+                * Entries which are older than twice the time
+                * required to wrap the counter are supposed to
+                * be zeroed (CA10-49 IBTA, release 1.2.1, V1).
+                */
+               if ((u64)(ts - cce->timestamp) > (2 * UINT_MAX))
+                       continue;
+               memcpy(cong_log->events[i].local_qp_cn_entry, &cce->lqpn, 3);
+               memcpy(cong_log->events[i].remote_qp_number_cn_entry,
+                      &cce->rqpn, 3);
+               cong_log->events[i].sl_svc_type_cn_entry =
+                       ((cce->sl & 0x1f) << 3) | (cce->svc_type & 0x7);
+               cong_log->events[i].remote_lid_cn_entry =
+                       cpu_to_be32(cce->rlid);
+               cong_log->events[i].timestamp_cn_entry =
+                       cpu_to_be32(cce->timestamp);
+       }
+
+       /*
+        * Reset threshold_cong_event_map, and threshold_event_counter
+        * to 0 when log is read.
+        */
+       memset(ppd->threshold_cong_event_map, 0x0,
+              sizeof(ppd->threshold_cong_event_map));
+       ppd->threshold_event_counter = 0;
+
+       spin_unlock_irq(&ppd->cc_log_lock);
+
+       if (resp_len)
+               *resp_len += sizeof(struct opa_hfi1_cong_log);
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_get_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data,
+                                  struct ib_device *ibdev, u8 port,
+                                  u32 *resp_len)
+{
+       struct ib_cc_table_attr *cc_table_attr =
+               (struct ib_cc_table_attr *)data;
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       u32 start_block = OPA_AM_START_BLK(am);
+       u32 n_blocks = OPA_AM_NBLK(am);
+       struct ib_cc_table_entry_shadow *entries;
+       int i, j;
+       u32 sentry, eentry;
+       struct cc_state *cc_state;
+
+       /* sanity check n_blocks, start_block */
+       if (n_blocks == 0 ||
+           start_block + n_blocks > ppd->cc_max_table_entries) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       rcu_read_lock();
+
+       cc_state = get_cc_state(ppd);
+
+       if (!cc_state) {
+               rcu_read_unlock();
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       sentry = start_block * IB_CCT_ENTRIES;
+       eentry = sentry + (IB_CCT_ENTRIES * n_blocks);
+
+       cc_table_attr->ccti_limit = cpu_to_be16(cc_state->cct.ccti_limit);
+
+       entries = cc_state->cct.entries;
+
+       /* return n_blocks, though the last block may not be full */
+       for (j = 0, i = sentry; i < eentry; j++, i++)
+               cc_table_attr->ccti_entries[j].entry =
+                       cpu_to_be16(entries[i].entry);
+
+       rcu_read_unlock();
+
+       if (resp_len)
+               *resp_len += sizeof(u16) * (IB_CCT_ENTRIES * n_blocks + 1);
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+void cc_state_reclaim(struct rcu_head *rcu)
+{
+       struct cc_state *cc_state = container_of(rcu, struct cc_state, rcu);
+
+       kfree(cc_state);
+}
+
+static int __subn_set_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data,
+                                  struct ib_device *ibdev, u8 port,
+                                  u32 *resp_len)
+{
+       struct ib_cc_table_attr *p = (struct ib_cc_table_attr *)data;
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       u32 start_block = OPA_AM_START_BLK(am);
+       u32 n_blocks = OPA_AM_NBLK(am);
+       struct ib_cc_table_entry_shadow *entries;
+       int i, j;
+       u32 sentry, eentry;
+       u16 ccti_limit;
+
+       /* sanity check n_blocks, start_block */
+       if (n_blocks == 0 ||
+           start_block + n_blocks > ppd->cc_max_table_entries) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
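+       /*
+        * ccti_limit is the index of the last valid table entry, so the
+        * final block may be only partially filled; eentry covers the full
+        * blocks plus the remainder of the last one.
+        */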
+       sentry = start_block * IB_CCT_ENTRIES;
+       eentry = sentry + ((n_blocks - 1) * IB_CCT_ENTRIES) +
+                (be16_to_cpu(p->ccti_limit)) % IB_CCT_ENTRIES + 1;
+
+       /* sanity check ccti_limit */
+       ccti_limit = be16_to_cpu(p->ccti_limit);
+       if (ccti_limit + 1 > eentry) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       /*
+        * Save details from packet into the ppd.  Hold the cc_state_lock so
+        * our information is consistent with anyone trying to apply the state.
+        */
+       spin_lock(&ppd->cc_state_lock);
+       ppd->total_cct_entry = ccti_limit + 1;
+       entries = ppd->ccti_entries;
+       for (j = 0, i = sentry; i < eentry; j++, i++)
+               entries[i].entry = be16_to_cpu(p->ccti_entries[j].entry);
+       spin_unlock(&ppd->cc_state_lock);
+
+       /* now apply the information */
+       apply_cc_state(ppd);
+
+       return __subn_get_opa_cc_table(smp, am, data, ibdev, port, resp_len);
+}
+
+struct opa_led_info {
+       __be32 rsvd_led_mask;
+       __be32 rsvd;
+};
+
+#define OPA_LED_SHIFT  31
+#define OPA_LED_MASK   BIT(OPA_LED_SHIFT)
+
+static int __subn_get_opa_led_info(struct opa_smp *smp, u32 am, u8 *data,
+                                  struct ib_device *ibdev, u8 port,
+                                  u32 *resp_len)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct hfi1_pportdata *ppd = dd->pport;
+       struct opa_led_info *p = (struct opa_led_info *)data;
+       u32 nport = OPA_AM_NPORT(am);
+       u32 is_beaconing_active;
+
+       if (nport != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       /*
+        * This pairs with the memory barrier in hfi1_start_led_override to
+        * ensure that we read the correct state of LED beaconing represented
+        * by led_override_timer_active
+        */
+       smp_rmb();
+       is_beaconing_active = !!atomic_read(&ppd->led_override_timer_active);
+       p->rsvd_led_mask = cpu_to_be32(is_beaconing_active << OPA_LED_SHIFT);
+
+       if (resp_len)
+               *resp_len += sizeof(struct opa_led_info);
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_led_info(struct opa_smp *smp, u32 am, u8 *data,
+                                  struct ib_device *ibdev, u8 port,
+                                  u32 *resp_len)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct opa_led_info *p = (struct opa_led_info *)data;
+       u32 nport = OPA_AM_NPORT(am);
+       int on = !!(be32_to_cpu(p->rsvd_led_mask) & OPA_LED_MASK);
+
+       if (nport != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       if (on)
+               hfi1_start_led_override(dd->pport, 2000, 1500);
+       else
+               shutdown_led_override(dd->pport);
+
+       return __subn_get_opa_led_info(smp, am, data, ibdev, port, resp_len);
+}
+
+static int subn_get_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am,
+                           u8 *data, struct ib_device *ibdev, u8 port,
+                           u32 *resp_len)
+{
+       int ret;
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+
+       switch (attr_id) {
+       case IB_SMP_ATTR_NODE_DESC:
+               ret = __subn_get_opa_nodedesc(smp, am, data, ibdev, port,
+                                             resp_len);
+               break;
+       case IB_SMP_ATTR_NODE_INFO:
+               ret = __subn_get_opa_nodeinfo(smp, am, data, ibdev, port,
+                                             resp_len);
+               break;
+       case IB_SMP_ATTR_PORT_INFO:
+               ret = __subn_get_opa_portinfo(smp, am, data, ibdev, port,
+                                             resp_len);
+               break;
+       case IB_SMP_ATTR_PKEY_TABLE:
+               ret = __subn_get_opa_pkeytable(smp, am, data, ibdev, port,
+                                              resp_len);
+               break;
+       case OPA_ATTRIB_ID_SL_TO_SC_MAP:
+               ret = __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port,
+                                             resp_len);
+               break;
+       case OPA_ATTRIB_ID_SC_TO_SL_MAP:
+               ret = __subn_get_opa_sc_to_sl(smp, am, data, ibdev, port,
+                                             resp_len);
+               break;
+       case OPA_ATTRIB_ID_SC_TO_VLT_MAP:
+               ret = __subn_get_opa_sc_to_vlt(smp, am, data, ibdev, port,
+                                              resp_len);
+               break;
+       case OPA_ATTRIB_ID_SC_TO_VLNT_MAP:
+               ret = __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port,
+                                               resp_len);
+               break;
+       case OPA_ATTRIB_ID_PORT_STATE_INFO:
+               ret = __subn_get_opa_psi(smp, am, data, ibdev, port,
+                                        resp_len);
+               break;
+       case OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE:
+               ret = __subn_get_opa_bct(smp, am, data, ibdev, port,
+                                        resp_len);
+               break;
+       case OPA_ATTRIB_ID_CABLE_INFO:
+               ret = __subn_get_opa_cable_info(smp, am, data, ibdev, port,
+                                               resp_len);
+               break;
+       case IB_SMP_ATTR_VL_ARB_TABLE:
+               ret = __subn_get_opa_vl_arb(smp, am, data, ibdev, port,
+                                           resp_len);
+               break;
+       case OPA_ATTRIB_ID_CONGESTION_INFO:
+               ret = __subn_get_opa_cong_info(smp, am, data, ibdev, port,
+                                              resp_len);
+               break;
+       case OPA_ATTRIB_ID_HFI_CONGESTION_SETTING:
+               ret = __subn_get_opa_cong_setting(smp, am, data, ibdev,
+                                                 port, resp_len);
+               break;
+       case OPA_ATTRIB_ID_HFI_CONGESTION_LOG:
+               ret = __subn_get_opa_hfi1_cong_log(smp, am, data, ibdev,
+                                                  port, resp_len);
+               break;
+       case OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE:
+               ret = __subn_get_opa_cc_table(smp, am, data, ibdev, port,
+                                             resp_len);
+               break;
+       case IB_SMP_ATTR_LED_INFO:
+               ret = __subn_get_opa_led_info(smp, am, data, ibdev, port,
+                                             resp_len);
+               break;
+       case IB_SMP_ATTR_SM_INFO:
+               if (ibp->rvp.port_cap_flags & IB_PORT_SM_DISABLED)
+                       return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+               if (ibp->rvp.port_cap_flags & IB_PORT_SM)
+                       return IB_MAD_RESULT_SUCCESS;
+               /* FALLTHROUGH */
+       default:
+               smp->status |= IB_SMP_UNSUP_METH_ATTR;
+               ret = reply((struct ib_mad_hdr *)smp);
+               break;
+       }
+       return ret;
+}
+
+static int subn_set_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am,
+                           u8 *data, struct ib_device *ibdev, u8 port,
+                           u32 *resp_len)
+{
+       int ret;
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+
+       switch (attr_id) {
+       case IB_SMP_ATTR_PORT_INFO:
+               ret = __subn_set_opa_portinfo(smp, am, data, ibdev, port,
+                                             resp_len);
+               break;
+       case IB_SMP_ATTR_PKEY_TABLE:
+               ret = __subn_set_opa_pkeytable(smp, am, data, ibdev, port,
+                                              resp_len);
+               break;
+       case OPA_ATTRIB_ID_SL_TO_SC_MAP:
+               ret = __subn_set_opa_sl_to_sc(smp, am, data, ibdev, port,
+                                             resp_len);
+               break;
+       case OPA_ATTRIB_ID_SC_TO_SL_MAP:
+               ret = __subn_set_opa_sc_to_sl(smp, am, data, ibdev, port,
+                                             resp_len);
+               break;
+       case OPA_ATTRIB_ID_SC_TO_VLT_MAP:
+               ret = __subn_set_opa_sc_to_vlt(smp, am, data, ibdev, port,
+                                              resp_len);
+               break;
+       case OPA_ATTRIB_ID_SC_TO_VLNT_MAP:
+               ret = __subn_set_opa_sc_to_vlnt(smp, am, data, ibdev, port,
+                                               resp_len);
+               break;
+       case OPA_ATTRIB_ID_PORT_STATE_INFO:
+               ret = __subn_set_opa_psi(smp, am, data, ibdev, port,
+                                        resp_len);
+               break;
+       case OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE:
+               ret = __subn_set_opa_bct(smp, am, data, ibdev, port,
+                                        resp_len);
+               break;
+       case IB_SMP_ATTR_VL_ARB_TABLE:
+               ret = __subn_set_opa_vl_arb(smp, am, data, ibdev, port,
+                                           resp_len);
+               break;
+       case OPA_ATTRIB_ID_HFI_CONGESTION_SETTING:
+               ret = __subn_set_opa_cong_setting(smp, am, data, ibdev,
+                                                 port, resp_len);
+               break;
+       case OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE:
+               ret = __subn_set_opa_cc_table(smp, am, data, ibdev, port,
+                                             resp_len);
+               break;
+       case IB_SMP_ATTR_LED_INFO:
+               ret = __subn_set_opa_led_info(smp, am, data, ibdev, port,
+                                             resp_len);
+               break;
+       case IB_SMP_ATTR_SM_INFO:
+               if (ibp->rvp.port_cap_flags & IB_PORT_SM_DISABLED)
+                       return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+               if (ibp->rvp.port_cap_flags & IB_PORT_SM)
+                       return IB_MAD_RESULT_SUCCESS;
+               /* FALLTHROUGH */
+       default:
+               smp->status |= IB_SMP_UNSUP_METH_ATTR;
+               ret = reply((struct ib_mad_hdr *)smp);
+               break;
+       }
+       return ret;
+}
+
+static inline void set_aggr_error(struct opa_aggregate *ag)
+{
+       ag->err_reqlength |= cpu_to_be16(0x8000);
+}
+
+static int subn_get_opa_aggregate(struct opa_smp *smp,
+                                 struct ib_device *ibdev, u8 port,
+                                 u32 *resp_len)
+{
+       int i;
+       u32 num_attr = be32_to_cpu(smp->attr_mod) & 0x000000ff;
+       u8 *next_smp = opa_get_smp_data(smp);
+
+       if (num_attr < 1 || num_attr > 117) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
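+       /*
+        * Each aggregate segment is an opa_aggregate header followed by a
+        * payload whose length, in 8-byte units, is carried in the low 7
+        * bits of err_reqlength.
+        */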
+       for (i = 0; i < num_attr; i++) {
+               struct opa_aggregate *agg;
+               size_t agg_data_len;
+               size_t agg_size;
+               u32 am;
+
+               agg = (struct opa_aggregate *)next_smp;
+               agg_data_len = (be16_to_cpu(agg->err_reqlength) & 0x007f) * 8;
+               agg_size = sizeof(*agg) + agg_data_len;
+               am = be32_to_cpu(agg->attr_mod);
+
+               *resp_len += agg_size;
+
+               if (next_smp + agg_size > ((u8 *)smp) + sizeof(*smp)) {
+                       smp->status |= IB_SMP_INVALID_FIELD;
+                       return reply((struct ib_mad_hdr *)smp);
+               }
+
+               /* zero the payload for this segment */
+               memset(next_smp + sizeof(*agg), 0, agg_data_len);
+
+               (void)subn_get_opa_sma(agg->attr_id, smp, am, agg->data,
+                                       ibdev, port, NULL);
+               if (smp->status & ~IB_SMP_DIRECTION) {
+                       set_aggr_error(agg);
+                       return reply((struct ib_mad_hdr *)smp);
+               }
+               next_smp += agg_size;
+       }
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+static int subn_set_opa_aggregate(struct opa_smp *smp,
+                                 struct ib_device *ibdev, u8 port,
+                                 u32 *resp_len)
+{
+       int i;
+       u32 num_attr = be32_to_cpu(smp->attr_mod) & 0x000000ff;
+       u8 *next_smp = opa_get_smp_data(smp);
+
+       if (num_attr < 1 || num_attr > 117) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       for (i = 0; i < num_attr; i++) {
+               struct opa_aggregate *agg;
+               size_t agg_data_len;
+               size_t agg_size;
+               u32 am;
+
+               agg = (struct opa_aggregate *)next_smp;
+               agg_data_len = (be16_to_cpu(agg->err_reqlength) & 0x007f) * 8;
+               agg_size = sizeof(*agg) + agg_data_len;
+               am = be32_to_cpu(agg->attr_mod);
+
+               *resp_len += agg_size;
+
+               if (next_smp + agg_size > ((u8 *)smp) + sizeof(*smp)) {
+                       smp->status |= IB_SMP_INVALID_FIELD;
+                       return reply((struct ib_mad_hdr *)smp);
+               }
+
+               (void)subn_set_opa_sma(agg->attr_id, smp, am, agg->data,
+                                       ibdev, port, NULL);
+               if (smp->status & ~IB_SMP_DIRECTION) {
+                       set_aggr_error(agg);
+                       return reply((struct ib_mad_hdr *)smp);
+               }
+               next_smp += agg_size;
+       }
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+/*
+ * OPAv1 specifies that, on the transition to link up, these counters
+ * are cleared:
+ *   PortRcvErrors [*]
+ *   LinkErrorRecovery
+ *   LocalLinkIntegrityErrors
+ *   ExcessiveBufferOverruns [*]
+ *
+ * [*] Error info associated with these counters is retained, but the
+ * error info status is reset to 0.
+ */
+void clear_linkup_counters(struct hfi1_devdata *dd)
+{
+       /* PortRcvErrors */
+       write_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL, 0);
+       dd->err_info_rcvport.status_and_code &= ~OPA_EI_STATUS_SMASK;
+       /* LinkErrorRecovery */
+       write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0);
+       write_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, CNTR_INVALID_VL, 0);
+       /* LocalLinkIntegrityErrors */
+       write_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL, 0);
+       write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0);
+       /* ExcessiveBufferOverruns */
+       write_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL, 0);
+       dd->rcv_ovfl_cnt = 0;
+       dd->err_info_xmit_constraint.status &= ~OPA_EI_STATUS_SMASK;
+}
+
+/*
+ * is_local_mad() returns 1 if 'mad' is sent from, and destined to, the
+ * local node, and 0 otherwise.
+ */
+static int is_local_mad(struct hfi1_ibport *ibp, const struct opa_mad *mad,
+                       const struct ib_wc *in_wc)
+{
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       const struct opa_smp *smp = (const struct opa_smp *)mad;
+
+       if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+               return (smp->hop_cnt == 0 &&
+                       smp->route.dr.dr_slid == OPA_LID_PERMISSIVE &&
+                       smp->route.dr.dr_dlid == OPA_LID_PERMISSIVE);
+       }
+
+       return (in_wc->slid == ppd->lid);
+}
+
+/*
+ * opa_local_smp_check() should only be called on MADs for which
+ * is_local_mad() returns true. It applies the SMP checks that are
+ * specific to SMPs which are sent from, and destined to this node.
+ * opa_local_smp_check() returns 0 if the SMP passes its checks, 1
+ * otherwise.
+ *
+ * SMPs which arrive from other nodes are instead checked by
+ * opa_smp_check().
+ */
+static int opa_local_smp_check(struct hfi1_ibport *ibp,
+                              const struct ib_wc *in_wc)
+{
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       u16 slid = in_wc->slid;
+       u16 pkey;
+
+       if (in_wc->pkey_index >= ARRAY_SIZE(ppd->pkeys))
+               return 1;
+
+       pkey = ppd->pkeys[in_wc->pkey_index];
+       /*
+        * We need to do the "node-local" checks specified in OPAv1,
+        * rev 0.90, section 9.10.26, which are:
+        *   - pkey is 0x7fff, or 0xffff
+        *   - Source QPN == 0 || Destination QPN == 0
+        *   - the MAD header's management class is either
+        *     IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE or
+        *     IB_MGMT_CLASS_SUBN_LID_ROUTED
+        *   - SLID != 0
+        *
+        * However, we know (and so don't need to check again) that,
+        * for local SMPs, the MAD stack passes MADs with:
+        *   - Source QPN of 0
+        *   - MAD mgmt_class is IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
+        *   - SLID is either: OPA_LID_PERMISSIVE (0xFFFFFFFF), or
+        *     our own port's lid
+        *
+        */
+       if (pkey == LIM_MGMT_P_KEY || pkey == FULL_MGMT_P_KEY)
+               return 0;
+       ingress_pkey_table_fail(ppd, pkey, slid);
+       return 1;
+}
+
+static int process_subn_opa(struct ib_device *ibdev, int mad_flags,
+                           u8 port, const struct opa_mad *in_mad,
+                           struct opa_mad *out_mad,
+                           u32 *resp_len)
+{
+       struct opa_smp *smp = (struct opa_smp *)out_mad;
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       u8 *data;
+       u32 am;
+       __be16 attr_id;
+       int ret;
+
+       *out_mad = *in_mad;
+       data = opa_get_smp_data(smp);
+
+       am = be32_to_cpu(smp->attr_mod);
+       attr_id = smp->attr_id;
+       if (smp->class_version != OPA_SMI_CLASS_VERSION) {
+               smp->status |= IB_SMP_UNSUP_VERSION;
+               ret = reply((struct ib_mad_hdr *)smp);
+               return ret;
+       }
+       ret = check_mkey(ibp, (struct ib_mad_hdr *)smp, mad_flags, smp->mkey,
+                        smp->route.dr.dr_slid, smp->route.dr.return_path,
+                        smp->hop_cnt);
+       if (ret) {
+               u32 port_num = be32_to_cpu(smp->attr_mod);
+
+               /*
+                * If this is a get/set of PortInfo aimed at another port,
+                * check the M_Key against that port as well. This is
+                * needed to increment the error counters when the M_Key
+                * fails to match on *both* ports.
+                */
+               if (attr_id == IB_SMP_ATTR_PORT_INFO &&
+                   (smp->method == IB_MGMT_METHOD_GET ||
+                    smp->method == IB_MGMT_METHOD_SET) &&
+                   port_num && port_num <= ibdev->phys_port_cnt &&
+                   port != port_num)
+                       (void)check_mkey(to_iport(ibdev, port_num),
+                                         (struct ib_mad_hdr *)smp, 0,
+                                         smp->mkey, smp->route.dr.dr_slid,
+                                         smp->route.dr.return_path,
+                                         smp->hop_cnt);
+               ret = IB_MAD_RESULT_FAILURE;
+               return ret;
+       }
+
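+       /* Start with the SMP header size; each handler adds its payload. */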
+       *resp_len = opa_get_smp_header_size(smp);
+
+       switch (smp->method) {
+       case IB_MGMT_METHOD_GET:
+               switch (attr_id) {
+               default:
+                       clear_opa_smp_data(smp);
+                       ret = subn_get_opa_sma(attr_id, smp, am, data,
+                                              ibdev, port, resp_len);
+                       break;
+               case OPA_ATTRIB_ID_AGGREGATE:
+                       ret = subn_get_opa_aggregate(smp, ibdev, port,
+                                                    resp_len);
+                       break;
+               }
+               break;
+       case IB_MGMT_METHOD_SET:
+               switch (attr_id) {
+               default:
+                       ret = subn_set_opa_sma(attr_id, smp, am, data,
+                                              ibdev, port, resp_len);
+                       break;
+               case OPA_ATTRIB_ID_AGGREGATE:
+                       ret = subn_set_opa_aggregate(smp, ibdev, port,
+                                                    resp_len);
+                       break;
+               }
+               break;
+       case IB_MGMT_METHOD_TRAP:
+       case IB_MGMT_METHOD_REPORT:
+       case IB_MGMT_METHOD_REPORT_RESP:
+       case IB_MGMT_METHOD_GET_RESP:
+               /*
+                * The ib_mad module will call us to process responses
+                * before checking for other consumers.
+                * Just tell the caller to process it normally.
+                */
+               ret = IB_MAD_RESULT_SUCCESS;
+               break;
+       default:
+               smp->status |= IB_SMP_UNSUP_METHOD;
+               ret = reply((struct ib_mad_hdr *)smp);
+               break;
+       }
+
+       return ret;
+}
+
+static int process_subn(struct ib_device *ibdev, int mad_flags,
+                       u8 port, const struct ib_mad *in_mad,
+                       struct ib_mad *out_mad)
+{
+       struct ib_smp *smp = (struct ib_smp *)out_mad;
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       int ret;
+
+       *out_mad = *in_mad;
+       if (smp->class_version != 1) {
+               smp->status |= IB_SMP_UNSUP_VERSION;
+               ret = reply((struct ib_mad_hdr *)smp);
+               return ret;
+       }
+
+       ret = check_mkey(ibp, (struct ib_mad_hdr *)smp, mad_flags,
+                        smp->mkey, (__force __be32)smp->dr_slid,
+                        smp->return_path, smp->hop_cnt);
+       if (ret) {
+               u32 port_num = be32_to_cpu(smp->attr_mod);
+
+               /*
+                * If this is a get/set of PortInfo aimed at another port,
+                * check the M_Key against that port as well. This is
+                * needed to increment the error counters when the M_Key
+                * fails to match on *both* ports.
+                */
+               if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO &&
+                   (smp->method == IB_MGMT_METHOD_GET ||
+                    smp->method == IB_MGMT_METHOD_SET) &&
+                   port_num && port_num <= ibdev->phys_port_cnt &&
+                   port != port_num)
+                       (void)check_mkey(to_iport(ibdev, port_num),
+                                        (struct ib_mad_hdr *)smp, 0,
+                                        smp->mkey,
+                                        (__force __be32)smp->dr_slid,
+                                        smp->return_path, smp->hop_cnt);
+               ret = IB_MAD_RESULT_FAILURE;
+               return ret;
+       }
+
+       switch (smp->method) {
+       case IB_MGMT_METHOD_GET:
+               switch (smp->attr_id) {
+               case IB_SMP_ATTR_NODE_INFO:
+                       ret = subn_get_nodeinfo(smp, ibdev, port);
+                       break;
+               default:
+                       smp->status |= IB_SMP_UNSUP_METH_ATTR;
+                       ret = reply((struct ib_mad_hdr *)smp);
+                       break;
+               }
+               break;
+       case IB_MGMT_METHOD_TRAP:
+       case IB_MGMT_METHOD_REPORT:
+       case IB_MGMT_METHOD_REPORT_RESP:
+       case IB_MGMT_METHOD_GET_RESP:
+               /* Let the ib_mad module process responses normally. */
+               ret = IB_MAD_RESULT_SUCCESS;
+               break;
+       default:
+               smp->status |= IB_SMP_UNSUP_METHOD;
+               ret = reply((struct ib_mad_hdr *)smp);
+               break;
+       }
+
+       return ret;
+}
+
+static int process_perf(struct ib_device *ibdev, u8 port,
+                       const struct ib_mad *in_mad,
+                       struct ib_mad *out_mad)
+{
+       struct ib_pma_mad *pmp = (struct ib_pma_mad *)out_mad;
+       struct ib_class_port_info *cpi = (struct ib_class_port_info *)
+                                               &pmp->data;
+       int ret = IB_MAD_RESULT_FAILURE;
+
+       *out_mad = *in_mad;
+       if (pmp->mad_hdr.class_version != 1) {
+               pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION;
+               ret = reply((struct ib_mad_hdr *)pmp);
+               return ret;
+       }
+
+       switch (pmp->mad_hdr.method) {
+       case IB_MGMT_METHOD_GET:
+               switch (pmp->mad_hdr.attr_id) {
+               case IB_PMA_PORT_COUNTERS:
+                       ret = pma_get_ib_portcounters(pmp, ibdev, port);
+                       break;
+               case IB_PMA_PORT_COUNTERS_EXT:
+                       ret = pma_get_ib_portcounters_ext(pmp, ibdev, port);
+                       break;
+               case IB_PMA_CLASS_PORT_INFO:
+                       cpi->capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH;
+                       ret = reply((struct ib_mad_hdr *)pmp);
+                       break;
+               default:
+                       pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+                       ret = reply((struct ib_mad_hdr *)pmp);
+                       break;
+               }
+               break;
+
+       case IB_MGMT_METHOD_SET:
+               if (pmp->mad_hdr.attr_id) {
+                       pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+                       ret = reply((struct ib_mad_hdr *)pmp);
+               }
+               break;
+
+       case IB_MGMT_METHOD_TRAP:
+       case IB_MGMT_METHOD_GET_RESP:
+               /*
+                * The ib_mad module will call us to process responses
+                * before checking for other consumers.
+                * Just tell the caller to process it normally.
+                */
+               ret = IB_MAD_RESULT_SUCCESS;
+               break;
+
+       default:
+               pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD;
+               ret = reply((struct ib_mad_hdr *)pmp);
+               break;
+       }
+
+       return ret;
+}
+
+static int process_perf_opa(struct ib_device *ibdev, u8 port,
+                           const struct opa_mad *in_mad,
+                           struct opa_mad *out_mad, u32 *resp_len)
+{
+       struct opa_pma_mad *pmp = (struct opa_pma_mad *)out_mad;
+       int ret;
+
+       *out_mad = *in_mad;
+
+       if (pmp->mad_hdr.class_version != OPA_SMI_CLASS_VERSION) {
+               pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       *resp_len = sizeof(pmp->mad_hdr);
+
+       switch (pmp->mad_hdr.method) {
+       case IB_MGMT_METHOD_GET:
+               switch (pmp->mad_hdr.attr_id) {
+               case IB_PMA_CLASS_PORT_INFO:
+                       ret = pma_get_opa_classportinfo(pmp, ibdev, resp_len);
+                       break;
+               case OPA_PM_ATTRIB_ID_PORT_STATUS:
+                       ret = pma_get_opa_portstatus(pmp, ibdev, port,
+                                                    resp_len);
+                       break;
+               case OPA_PM_ATTRIB_ID_DATA_PORT_COUNTERS:
+                       ret = pma_get_opa_datacounters(pmp, ibdev, port,
+                                                      resp_len);
+                       break;
+               case OPA_PM_ATTRIB_ID_ERROR_PORT_COUNTERS:
+                       ret = pma_get_opa_porterrors(pmp, ibdev, port,
+                                                    resp_len);
+                       break;
+               case OPA_PM_ATTRIB_ID_ERROR_INFO:
+                       ret = pma_get_opa_errorinfo(pmp, ibdev, port,
+                                                   resp_len);
+                       break;
+               default:
+                       pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+                       ret = reply((struct ib_mad_hdr *)pmp);
+                       break;
+               }
+               break;
+
+       case IB_MGMT_METHOD_SET:
+               switch (pmp->mad_hdr.attr_id) {
+               case OPA_PM_ATTRIB_ID_CLEAR_PORT_STATUS:
+                       ret = pma_set_opa_portstatus(pmp, ibdev, port,
+                                                    resp_len);
+                       break;
+               case OPA_PM_ATTRIB_ID_ERROR_INFO:
+                       ret = pma_set_opa_errorinfo(pmp, ibdev, port,
+                                                   resp_len);
+                       break;
+               default:
+                       pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+                       ret = reply((struct ib_mad_hdr *)pmp);
+                       break;
+               }
+               break;
+
+       case IB_MGMT_METHOD_TRAP:
+       case IB_MGMT_METHOD_GET_RESP:
+               /*
+                * The ib_mad module will call us to process responses
+                * before checking for other consumers.
+                * Just tell the caller to process it normally.
+                */
+               ret = IB_MAD_RESULT_SUCCESS;
+               break;
+
+       default:
+               pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD;
+               ret = reply((struct ib_mad_hdr *)pmp);
+               break;
+       }
+
+       return ret;
+}
+
+static int hfi1_process_opa_mad(struct ib_device *ibdev, int mad_flags,
+                               u8 port, const struct ib_wc *in_wc,
+                               const struct ib_grh *in_grh,
+                               const struct opa_mad *in_mad,
+                               struct opa_mad *out_mad, size_t *out_mad_size,
+                               u16 *out_mad_pkey_index)
+{
+       int ret;
+       int pkey_idx;
+       u32 resp_len = 0;
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+
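+       /*
+        * Responses are sent with the limited management P_Key; fall back
+        * to pkey index 1 if it cannot be found in the local table.
+        */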
+       pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY);
+       if (pkey_idx < 0) {
+               pr_warn("failed to find limited mgmt pkey, defaulting 0x%x\n",
+                       hfi1_get_pkey(ibp, 1));
+               pkey_idx = 1;
+       }
+       *out_mad_pkey_index = (u16)pkey_idx;
+
+       switch (in_mad->mad_hdr.mgmt_class) {
+       case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+       case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+               if (is_local_mad(ibp, in_mad, in_wc)) {
+                       ret = opa_local_smp_check(ibp, in_wc);
+                       if (ret)
+                               return IB_MAD_RESULT_FAILURE;
+               }
+               ret = process_subn_opa(ibdev, mad_flags, port, in_mad,
+                                      out_mad, &resp_len);
+               goto bail;
+       case IB_MGMT_CLASS_PERF_MGMT:
+               ret = process_perf_opa(ibdev, port, in_mad, out_mad,
+                                      &resp_len);
+               goto bail;
+
+       default:
+               ret = IB_MAD_RESULT_SUCCESS;
+       }
+
+bail:
+       if (ret & IB_MAD_RESULT_REPLY)
+               *out_mad_size = round_up(resp_len, 8);
+       else if (ret & IB_MAD_RESULT_SUCCESS)
+               *out_mad_size = in_wc->byte_len - sizeof(struct ib_grh);
+
+       return ret;
+}
+
+static int hfi1_process_ib_mad(struct ib_device *ibdev, int mad_flags, u8 port,
+                              const struct ib_wc *in_wc,
+                              const struct ib_grh *in_grh,
+                              const struct ib_mad *in_mad,
+                              struct ib_mad *out_mad)
+{
+       int ret;
+
+       switch (in_mad->mad_hdr.mgmt_class) {
+       case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+       case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+               ret = process_subn(ibdev, mad_flags, port, in_mad, out_mad);
+               break;
+       case IB_MGMT_CLASS_PERF_MGMT:
+               ret = process_perf(ibdev, port, in_mad, out_mad);
+               break;
+       default:
+               ret = IB_MAD_RESULT_SUCCESS;
+               break;
+       }
+
+       return ret;
+}
+
+/**
+ * hfi1_process_mad - process an incoming MAD packet
+ * @ibdev: the infiniband device this packet came in on
+ * @mad_flags: MAD flags
+ * @port: the port number this packet came in on
+ * @in_wc: the work completion entry for this packet
+ * @in_grh: the global route header for this packet
+ * @in_mad: the incoming MAD
+ * @out_mad: any outgoing MAD reply
+ *
+ * Returns IB_MAD_RESULT_SUCCESS if this is a MAD that we are not
+ * interested in processing.
+ *
+ * Note that the verbs framework has already done the MAD sanity checks,
+ * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
+ * MADs.
+ *
+ * This is called by the ib_mad module.
+ */
+int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port,
+                    const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+                    const struct ib_mad_hdr *in_mad, size_t in_mad_size,
+                    struct ib_mad_hdr *out_mad, size_t *out_mad_size,
+                    u16 *out_mad_pkey_index)
+{
+       switch (in_mad->base_version) {
+       case OPA_MGMT_BASE_VERSION:
+               if (unlikely(in_mad_size != sizeof(struct opa_mad))) {
+                       dev_err(ibdev->dma_device, "invalid in_mad_size\n");
+                       return IB_MAD_RESULT_FAILURE;
+               }
+               return hfi1_process_opa_mad(ibdev, mad_flags, port,
+                                           in_wc, in_grh,
+                                           (struct opa_mad *)in_mad,
+                                           (struct opa_mad *)out_mad,
+                                           out_mad_size,
+                                           out_mad_pkey_index);
+       case IB_MGMT_BASE_VERSION:
+               return hfi1_process_ib_mad(ibdev, mad_flags, port,
+                                         in_wc, in_grh,
+                                         (const struct ib_mad *)in_mad,
+                                         (struct ib_mad *)out_mad);
+       default:
+               break;
+       }
+
+       return IB_MAD_RESULT_FAILURE;
+}
diff --git a/drivers/infiniband/hw/hfi1/mad.h b/drivers/infiniband/hw/hfi1/mad.h
new file mode 100644 (file)
index 0000000..55ee086
--- /dev/null
@@ -0,0 +1,437 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef _HFI1_MAD_H
+#define _HFI1_MAD_H
+
+#include <rdma/ib_pma.h>
+#define USE_PI_LED_ENABLE      1 /*
+                                  * use led enabled bit in struct
+                                  * opa_port_states, if available
+                                  */
+#include <rdma/opa_smi.h>
+#include <rdma/opa_port_info.h>
+#ifndef PI_LED_ENABLE_SUP
+#define PI_LED_ENABLE_SUP 0
+#endif
+#include "opa_compat.h"
+
+/*
+ * OPA Traps
+ */
+#define OPA_TRAP_GID_NOW_IN_SERVICE             cpu_to_be16(64)
+#define OPA_TRAP_GID_OUT_OF_SERVICE             cpu_to_be16(65)
+#define OPA_TRAP_ADD_MULTICAST_GROUP            cpu_to_be16(66)
+#define OPA_TRAP_DEL_MULTICAST_GROUP            cpu_to_be16(67)
+#define OPA_TRAP_UNPATH                         cpu_to_be16(68)
+#define OPA_TRAP_REPATH                         cpu_to_be16(69)
+#define OPA_TRAP_PORT_CHANGE_STATE              cpu_to_be16(128)
+#define OPA_TRAP_LINK_INTEGRITY                 cpu_to_be16(129)
+#define OPA_TRAP_EXCESSIVE_BUFFER_OVERRUN       cpu_to_be16(130)
+#define OPA_TRAP_FLOW_WATCHDOG                  cpu_to_be16(131)
+#define OPA_TRAP_CHANGE_CAPABILITY              cpu_to_be16(144)
+#define OPA_TRAP_CHANGE_SYSGUID                 cpu_to_be16(145)
+#define OPA_TRAP_BAD_M_KEY                      cpu_to_be16(256)
+#define OPA_TRAP_BAD_P_KEY                      cpu_to_be16(257)
+#define OPA_TRAP_BAD_Q_KEY                      cpu_to_be16(258)
+#define OPA_TRAP_SWITCH_BAD_PKEY                cpu_to_be16(259)
+#define OPA_SMA_TRAP_DATA_LINK_WIDTH            cpu_to_be16(2048)
+
+/*
+ * Generic trap/notice other local changes flags (trap 144).
+ */
+#define        OPA_NOTICE_TRAP_LWDE_CHG        0x08 /* Link Width Downgrade Enable
+                                             * changed
+                                             */
+#define OPA_NOTICE_TRAP_LSE_CHG         0x04 /* Link Speed Enable changed */
+#define OPA_NOTICE_TRAP_LWE_CHG         0x02 /* Link Width Enable changed */
+#define OPA_NOTICE_TRAP_NODE_DESC_CHG   0x01
+
+struct opa_mad_notice_attr {
+       u8 generic_type;
+       u8 prod_type_msb;
+       __be16 prod_type_lsb;
+       __be16 trap_num;
+       __be16 toggle_count;
+       __be32 issuer_lid;
+       __be32 reserved1;
+       union ib_gid issuer_gid;
+
+       union {
+               struct {
+                       u8      details[64];
+               } raw_data;
+
+               struct {
+                       union ib_gid    gid;
+               } __packed ntc_64_65_66_67;
+
+               struct {
+                       __be32  lid;
+               } __packed ntc_128;
+
+               struct {
+                       __be32  lid;            /* where violation happened */
+                       u8      port_num;       /* where violation happened */
+               } __packed ntc_129_130_131;
+
+               struct {
+                       __be32  lid;            /* LID where change occurred */
+                       __be32  new_cap_mask;   /* new capability mask */
+                       __be16  reserved2;
+                       __be16  cap_mask;
+                       __be16  change_flags;   /* low 4 bits only */
+               } __packed ntc_144;
+
+               struct {
+                       __be64  new_sys_guid;
+                       __be32  lid;            /* lid where sys guid changed */
+               } __packed ntc_145;
+
+               struct {
+                       __be32  lid;
+                       __be32  dr_slid;
+                       u8      method;
+                       u8      dr_trunc_hop;
+                       __be16  attr_id;
+                       __be32  attr_mod;
+                       __be64  mkey;
+                       u8      dr_rtn_path[30];
+               } __packed ntc_256;
+
+               struct {
+                       __be32          lid1;
+                       __be32          lid2;
+                       __be32          key;
+                       u8              sl;     /* SL: high 5 bits */
+                       u8              reserved3[3];
+                       union ib_gid    gid1;
+                       union ib_gid    gid2;
+                       __be32          qp1;    /* high 8 bits reserved */
+                       __be32          qp2;    /* high 8 bits reserved */
+               } __packed ntc_257_258;
+
+               struct {
+                       __be16          flags;  /* low 8 bits reserved */
+                       __be16          pkey;
+                       __be32          lid1;
+                       __be32          lid2;
+                       u8              sl;     /* SL: high 5 bits */
+                       u8              reserved4[3];
+                       union ib_gid    gid1;
+                       union ib_gid    gid2;
+                       __be32          qp1;    /* high 8 bits reserved */
+                       __be32          qp2;    /* high 8 bits reserved */
+               } __packed ntc_259;
+
+               struct {
+                       __be32  lid;
+               } __packed ntc_2048;
+
+       };
+       u8      class_data[0];
+};
+
+#define IB_VLARB_LOWPRI_0_31    1
+#define IB_VLARB_LOWPRI_32_63   2
+#define IB_VLARB_HIGHPRI_0_31   3
+#define IB_VLARB_HIGHPRI_32_63  4
+
+#define OPA_MAX_PREEMPT_CAP         32
+#define OPA_VLARB_LOW_ELEMENTS       0
+#define OPA_VLARB_HIGH_ELEMENTS      1
+#define OPA_VLARB_PREEMPT_ELEMENTS   2
+#define OPA_VLARB_PREEMPT_MATRIX     3
+
+#define IB_PMA_PORT_COUNTERS_CONG       cpu_to_be16(0xFF00)
+
+struct ib_pma_portcounters_cong {
+       u8 reserved;
+       u8 reserved1;
+       __be16 port_check_rate;
+       __be16 symbol_error_counter;
+       u8 link_error_recovery_counter;
+       u8 link_downed_counter;
+       __be16 port_rcv_errors;
+       __be16 port_rcv_remphys_errors;
+       __be16 port_rcv_switch_relay_errors;
+       __be16 port_xmit_discards;
+       u8 port_xmit_constraint_errors;
+       u8 port_rcv_constraint_errors;
+       u8 reserved2;
+       u8 link_overrun_errors; /* LocalLink: 7:4, BufferOverrun: 3:0 */
+       __be16 reserved3;
+       __be16 vl15_dropped;
+       __be64 port_xmit_data;
+       __be64 port_rcv_data;
+       __be64 port_xmit_packets;
+       __be64 port_rcv_packets;
+       __be64 port_xmit_wait;
+       __be64 port_adr_events;
+} __packed;
+
+#define IB_SMP_UNSUP_VERSION    cpu_to_be16(0x0004)
+#define IB_SMP_UNSUP_METHOD     cpu_to_be16(0x0008)
+#define IB_SMP_UNSUP_METH_ATTR  cpu_to_be16(0x000C)
+#define IB_SMP_INVALID_FIELD    cpu_to_be16(0x001C)
+
+#define HFI1_XMIT_RATE_UNSUPPORTED               0x0
+#define HFI1_XMIT_RATE_PICO                      0x7
+/* number of 4nsec cycles equaling 2secs */
+#define HFI1_CONG_TIMER_PSINTERVAL               0x1DCD64EC
+
+#define IB_CC_SVCTYPE_RC 0x0
+#define IB_CC_SVCTYPE_UC 0x1
+#define IB_CC_SVCTYPE_RD 0x2
+#define IB_CC_SVCTYPE_UD 0x3
+
+/*
+ * There should be an equivalent IB #define for the following, but
+ * I cannot find it.
+ */
+#define OPA_CC_LOG_TYPE_HFI    2
+
+struct opa_hfi1_cong_log_event_internal {
+       u32 lqpn;
+       u32 rqpn;
+       u8 sl;
+       u8 svc_type;
+       u32 rlid;
+       s64 timestamp; /* wider than 32 bits to detect 32 bit rollover */
+};
+
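+/*
+ * Wire format of a congestion log entry as carried in the congestion
+ * log MAD: 24-bit QPNs packed into three bytes each, SL and service
+ * type sharing a single byte.
+ */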
+struct opa_hfi1_cong_log_event {
+       u8 local_qp_cn_entry[3];
+       u8 remote_qp_number_cn_entry[3];
+       u8 sl_svc_type_cn_entry; /* 5 bits SL, 3 bits svc type */
+       u8 reserved;
+       __be32 remote_lid_cn_entry;
+       __be32 timestamp_cn_entry;
+} __packed;
+
+#define OPA_CONG_LOG_ELEMS     96
+
+struct opa_hfi1_cong_log {
+       u8 log_type;
+       u8 congestion_flags;
+       __be16 threshold_event_counter;
+       __be32 current_time_stamp;
+       u8 threshold_cong_event_map[OPA_MAX_SLS / 8];
+       struct opa_hfi1_cong_log_event events[OPA_CONG_LOG_ELEMS];
+} __packed;
+
+#define IB_CC_TABLE_CAP_DEFAULT 31
+
+/* Port control flags */
+#define IB_CC_CCS_PC_SL_BASED 0x01
+
+struct opa_congestion_setting_entry {
+       u8 ccti_increase;
+       u8 reserved;
+       __be16 ccti_timer;
+       u8 trigger_threshold;
+       u8 ccti_min; /* min CCTI for cc table */
+} __packed;
+
+struct opa_congestion_setting_entry_shadow {
+       u8 ccti_increase;
+       u8 reserved;
+       u16 ccti_timer;
+       u8 trigger_threshold;
+       u8 ccti_min; /* min CCTI for cc table */
+} __packed;
+
+struct opa_congestion_setting_attr {
+       __be32 control_map;
+       __be16 port_control;
+       struct opa_congestion_setting_entry entries[OPA_MAX_SLS];
+} __packed;
+
+struct opa_congestion_setting_attr_shadow {
+       u32 control_map;
+       u16 port_control;
+       struct opa_congestion_setting_entry_shadow entries[OPA_MAX_SLS];
+} __packed;
+
+#define IB_CC_TABLE_ENTRY_INCREASE_DEFAULT 1
+#define IB_CC_TABLE_ENTRY_TIMER_DEFAULT 1
+
+/* 64 Congestion Control table entries in a single MAD */
+#define IB_CCT_ENTRIES 64
+#define IB_CCT_MIN_ENTRIES (IB_CCT_ENTRIES * 2)
+
+struct ib_cc_table_entry {
+       __be16 entry; /* shift:2, multiplier:14 */
+};
+
+struct ib_cc_table_entry_shadow {
+       u16 entry; /* shift:2, multiplier:14 */
+};
+
+struct ib_cc_table_attr {
+       __be16 ccti_limit; /* max CCTI for cc table */
+       struct ib_cc_table_entry ccti_entries[IB_CCT_ENTRIES];
+} __packed;
+
+struct ib_cc_table_attr_shadow {
+       u16 ccti_limit; /* max CCTI for cc table */
+       struct ib_cc_table_entry_shadow ccti_entries[IB_CCT_ENTRIES];
+} __packed;
+
+#define CC_TABLE_SHADOW_MAX \
+       (IB_CC_TABLE_CAP_DEFAULT * IB_CCT_ENTRIES)
+
+struct cc_table_shadow {
+       u16 ccti_limit; /* max CCTI for cc table */
+       struct ib_cc_table_entry_shadow entries[CC_TABLE_SHADOW_MAX];
+} __packed;
+
+/*
+ * struct cc_state combines the (active) per-port congestion control
+ * table, and the (active) per-SL congestion settings. cc_state data
+ * may need to be read in code paths that we want to be fast, so it
+ * is an RCU protected structure.
+ */
+struct cc_state {
+       struct rcu_head rcu;
+       struct cc_table_shadow cct;
+       struct opa_congestion_setting_attr_shadow cong_setting;
+};
+
+/*
+ * OPA BufferControl MAD
+ */
+
+/* attribute modifier macros */
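+/*
+ * Helpers for decoding fields packed into the 32-bit MAD
+ * AttributeModifier: port/block counts, start block, port number,
+ * async and SM-config bits, and cable-info address/length.
+ */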
+#define OPA_AM_NPORT_SHIFT     24
+#define OPA_AM_NPORT_MASK      0xff
+#define OPA_AM_NPORT_SMASK     (OPA_AM_NPORT_MASK << OPA_AM_NPORT_SHIFT)
+#define OPA_AM_NPORT(am)       (((am) >> OPA_AM_NPORT_SHIFT) & \
+                                       OPA_AM_NPORT_MASK)
+
+#define OPA_AM_NBLK_SHIFT      24
+#define OPA_AM_NBLK_MASK       0xff
+#define OPA_AM_NBLK_SMASK      (OPA_AM_NBLK_MASK << OPA_AM_NBLK_SHIFT)
+#define OPA_AM_NBLK(am)                (((am) >> OPA_AM_NBLK_SHIFT) & \
+                                       OPA_AM_NBLK_MASK)
+
+#define OPA_AM_START_BLK_SHIFT 0
+#define OPA_AM_START_BLK_MASK  0xff
+#define OPA_AM_START_BLK_SMASK (OPA_AM_START_BLK_MASK << \
+                                       OPA_AM_START_BLK_SHIFT)
+#define OPA_AM_START_BLK(am)   (((am) >> OPA_AM_START_BLK_SHIFT) & \
+                                       OPA_AM_START_BLK_MASK)
+
+#define OPA_AM_PORTNUM_SHIFT   0
+#define OPA_AM_PORTNUM_MASK    0xff
+#define OPA_AM_PORTNUM_SMASK   (OPA_AM_PORTNUM_MASK << OPA_AM_PORTNUM_SHIFT)
+#define OPA_AM_PORTNUM(am)     (((am) >> OPA_AM_PORTNUM_SHIFT) & \
+                                       OPA_AM_PORTNUM_MASK)
+
+#define OPA_AM_ASYNC_SHIFT     12
+#define OPA_AM_ASYNC_MASK      0x1
+#define OPA_AM_ASYNC_SMASK     (OPA_AM_ASYNC_MASK << OPA_AM_ASYNC_SHIFT)
+#define OPA_AM_ASYNC(am)       (((am) >> OPA_AM_ASYNC_SHIFT) & \
+                                       OPA_AM_ASYNC_MASK)
+
+#define OPA_AM_START_SM_CFG_SHIFT      9
+#define OPA_AM_START_SM_CFG_MASK       0x1
+#define OPA_AM_START_SM_CFG_SMASK      (OPA_AM_START_SM_CFG_MASK << \
+                                               OPA_AM_START_SM_CFG_SHIFT)
+#define OPA_AM_START_SM_CFG(am)                (((am) >> OPA_AM_START_SM_CFG_SHIFT) \
+                                               & OPA_AM_START_SM_CFG_MASK)
+
+#define OPA_AM_CI_ADDR_SHIFT   19
+#define OPA_AM_CI_ADDR_MASK    0xfff
+#define OPA_AM_CI_ADDR_SMASK   (OPA_AM_CI_ADDR_MASK << OPA_AM_CI_ADDR_SHIFT)
+#define OPA_AM_CI_ADDR(am)     (((am) >> OPA_AM_CI_ADDR_SHIFT) & \
+                                       OPA_AM_CI_ADDR_MASK)
+
+#define OPA_AM_CI_LEN_SHIFT    13
+#define OPA_AM_CI_LEN_MASK     0x3f
+#define OPA_AM_CI_LEN_SMASK    (OPA_AM_CI_LEN_MASK << OPA_AM_CI_LEN_SHIFT)
+#define OPA_AM_CI_LEN(am)      (((am) >> OPA_AM_CI_LEN_SHIFT) & \
+                                       OPA_AM_CI_LEN_MASK)
+
+/* error info macros */
+#define OPA_EI_STATUS_SMASK    0x80
+#define OPA_EI_CODE_SMASK      0x0f
+
+struct vl_limit {
+       __be16 dedicated;
+       __be16 shared;
+};
+
+struct buffer_control {
+       __be16 reserved;
+       __be16 overall_shared_limit;
+       struct vl_limit vl[OPA_MAX_VLS];
+};
+
+struct sc2vlnt {
+       u8 vlnt[32]; /* 5 bit VL, 3 bits reserved */
+};
+
+/*
+ * The PortSamplesControl.CounterMasks field is an array of 3 bit fields
+ * which specify the N'th counter's capabilities. See ch. 16.1.3.2.
+ * We support 5 counters which only count the mandatory quantities.
+ */
+#define COUNTER_MASK(q, n) ((q) << ((9 - (n)) * 3))
+#define COUNTER_MASK0_9 \
+       cpu_to_be32(COUNTER_MASK(1, 0) | \
+                   COUNTER_MASK(1, 1) | \
+                   COUNTER_MASK(1, 2) | \
+                   COUNTER_MASK(1, 3) | \
+                   COUNTER_MASK(1, 4))
+
+#endif                         /* _HFI1_MAD_H */
diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
new file mode 100644 (file)
index 0000000..b7a80aa
--- /dev/null
@@ -0,0 +1,325 @@
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/mmu_notifier.h>
+#include <linux/interval_tree_generic.h>
+
+#include "mmu_rb.h"
+#include "trace.h"
+
+struct mmu_rb_handler {
+       struct list_head list;
+       struct mmu_notifier mn;
+       struct rb_root *root;
+       spinlock_t lock;        /* protect the RB tree */
+       struct mmu_rb_ops *ops;
+};
+
+static LIST_HEAD(mmu_rb_handlers);
+static DEFINE_SPINLOCK(mmu_rb_lock); /* protect mmu_rb_handlers list */
+
+static unsigned long mmu_node_start(struct mmu_rb_node *);
+static unsigned long mmu_node_last(struct mmu_rb_node *);
+static struct mmu_rb_handler *find_mmu_handler(struct rb_root *);
+static inline void mmu_notifier_page(struct mmu_notifier *, struct mm_struct *,
+                                    unsigned long);
+static inline void mmu_notifier_range_start(struct mmu_notifier *,
+                                           struct mm_struct *,
+                                           unsigned long, unsigned long);
+static void mmu_notifier_mem_invalidate(struct mmu_notifier *,
+                                       struct mm_struct *,
+                                       unsigned long, unsigned long);
+static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *,
+                                          unsigned long, unsigned long);
+
+static struct mmu_notifier_ops mn_opts = {
+       .invalidate_page = mmu_notifier_page,
+       .invalidate_range_start = mmu_notifier_range_start,
+};
+
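+/*
+ * INTERVAL_TREE_DEFINE() generates the static __mmu_int_rb_{insert,
+ * remove, iter_first, iter_next} helpers used below, keyed on each
+ * node's page-aligned [mmu_node_start(), mmu_node_last()] range.
+ */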
+INTERVAL_TREE_DEFINE(struct mmu_rb_node, node, unsigned long, __last,
+                    mmu_node_start, mmu_node_last, static, __mmu_int_rb);
+
+static unsigned long mmu_node_start(struct mmu_rb_node *node)
+{
+       return node->addr & PAGE_MASK;
+}
+
+static unsigned long mmu_node_last(struct mmu_rb_node *node)
+{
+       return PAGE_ALIGN(node->addr + node->len) - 1;
+}
+
+int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops)
+{
+       struct mmu_rb_handler *handlr;
+
+       if (!ops->invalidate)
+               return -EINVAL;
+
+       handlr = kmalloc(sizeof(*handlr), GFP_KERNEL);
+       if (!handlr)
+               return -ENOMEM;
+
+       handlr->root = root;
+       handlr->ops = ops;
+       INIT_HLIST_NODE(&handlr->mn.hlist);
+       spin_lock_init(&handlr->lock);
+       handlr->mn.ops = &mn_opts;
+       spin_lock(&mmu_rb_lock);
+       list_add_tail_rcu(&handlr->list, &mmu_rb_handlers);
+       spin_unlock(&mmu_rb_lock);
+
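+       /*
+        * Register against the caller's mm so that invalidations of that
+        * address space reach mmu_notifier_mem_invalidate().
+        */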
+       return mmu_notifier_register(&handlr->mn, current->mm);
+}
+
+void hfi1_mmu_rb_unregister(struct rb_root *root)
+{
+       struct mmu_rb_handler *handler = find_mmu_handler(root);
+       unsigned long flags;
+
+       if (!handler)
+               return;
+
+       /* Unregister first so we don't get any more notifications. */
+       if (current->mm)
+               mmu_notifier_unregister(&handler->mn, current->mm);
+
+       spin_lock(&mmu_rb_lock);
+       list_del_rcu(&handler->list);
+       spin_unlock(&mmu_rb_lock);
+       synchronize_rcu();
+
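+       /* Drain any nodes still in the tree, calling ->remove() for each. */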
+       spin_lock_irqsave(&handler->lock, flags);
+       if (!RB_EMPTY_ROOT(root)) {
+               struct rb_node *node;
+               struct mmu_rb_node *rbnode;
+
+               while ((node = rb_first(root))) {
+                       rbnode = rb_entry(node, struct mmu_rb_node, node);
+                       rb_erase(node, root);
+                       if (handler->ops->remove)
+                               handler->ops->remove(root, rbnode, NULL);
+               }
+       }
+       spin_unlock_irqrestore(&handler->lock, flags);
+
+       kfree(handler);
+}
+
+int hfi1_mmu_rb_insert(struct rb_root *root, struct mmu_rb_node *mnode)
+{
+       struct mmu_rb_handler *handler = find_mmu_handler(root);
+       struct mmu_rb_node *node;
+       unsigned long flags;
+       int ret = 0;
+
+       if (!handler)
+               return -EINVAL;
+
+       spin_lock_irqsave(&handler->lock, flags);
+       hfi1_cdbg(MMU, "Inserting node addr 0x%lx, len %lu", mnode->addr,
+                 mnode->len);
+       node = __mmu_rb_search(handler, mnode->addr, mnode->len);
+       if (node) {
+               ret = -EINVAL;
+               goto unlock;
+       }
+       __mmu_int_rb_insert(mnode, root);
+
+       if (handler->ops->insert) {
+               ret = handler->ops->insert(root, mnode);
+               if (ret)
+                       __mmu_int_rb_remove(mnode, root);
+       }
+unlock:
+       spin_unlock_irqrestore(&handler->lock, flags);
+       return ret;
+}
+
+/* Caller must hold handler lock */
+static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *handler,
+                                          unsigned long addr,
+                                          unsigned long len)
+{
+       struct mmu_rb_node *node = NULL;
+
+       hfi1_cdbg(MMU, "Searching for addr 0x%lx, len %lu", addr, len);
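+       /*
+        * Without a filter callback any overlapping node matches;
+        * otherwise return the first overlapping node the filter accepts.
+        */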
+       if (!handler->ops->filter) {
+               node = __mmu_int_rb_iter_first(handler->root, addr,
+                                              (addr + len) - 1);
+       } else {
+               for (node = __mmu_int_rb_iter_first(handler->root, addr,
+                                                   (addr + len) - 1);
+                    node;
+                    node = __mmu_int_rb_iter_next(node, addr,
+                                                  (addr + len) - 1)) {
+                       if (handler->ops->filter(node, addr, len))
+                               return node;
+               }
+       }
+       return node;
+}
+
+/* Caller must *not* hold handler lock. */
+static void __mmu_rb_remove(struct mmu_rb_handler *handler,
+                           struct mmu_rb_node *node, struct mm_struct *mm)
+{
+       unsigned long flags;
+
+       /* Validity of handler and node pointers has been checked by caller. */
+       hfi1_cdbg(MMU, "Removing node addr 0x%lx, len %lu", node->addr,
+                 node->len);
+       spin_lock_irqsave(&handler->lock, flags);
+       __mmu_int_rb_remove(node, handler->root);
+       spin_unlock_irqrestore(&handler->lock, flags);
+
+       if (handler->ops->remove)
+               handler->ops->remove(handler->root, node, mm);
+}
+
+struct mmu_rb_node *hfi1_mmu_rb_search(struct rb_root *root, unsigned long addr,
+                                      unsigned long len)
+{
+       struct mmu_rb_handler *handler = find_mmu_handler(root);
+       struct mmu_rb_node *node;
+       unsigned long flags;
+
+       if (!handler)
+               return ERR_PTR(-EINVAL);
+
+       spin_lock_irqsave(&handler->lock, flags);
+       node = __mmu_rb_search(handler, addr, len);
+       spin_unlock_irqrestore(&handler->lock, flags);
+
+       return node;
+}
+
+struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *root,
+                                       unsigned long addr, unsigned long len)
+{
+       struct mmu_rb_handler *handler = find_mmu_handler(root);
+       struct mmu_rb_node *node;
+       unsigned long flags;
+
+       if (!handler)
+               return ERR_PTR(-EINVAL);
+
+       spin_lock_irqsave(&handler->lock, flags);
+       node = __mmu_rb_search(handler, addr, len);
+       if (node)
+               __mmu_int_rb_remove(node, handler->root);
+       spin_unlock_irqrestore(&handler->lock, flags);
+
+       return node;
+}
+
+void hfi1_mmu_rb_remove(struct rb_root *root, struct mmu_rb_node *node)
+{
+       struct mmu_rb_handler *handler = find_mmu_handler(root);
+
+       if (!handler || !node)
+               return;
+
+       __mmu_rb_remove(handler, node, NULL);
+}
+
+static struct mmu_rb_handler *find_mmu_handler(struct rb_root *root)
+{
+       struct mmu_rb_handler *handler;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(handler, &mmu_rb_handlers, list) {
+               if (handler->root == root)
+                       goto unlock;
+       }
+       handler = NULL;
+unlock:
+       rcu_read_unlock();
+       return handler;
+}
+
+static inline void mmu_notifier_page(struct mmu_notifier *mn,
+                                    struct mm_struct *mm, unsigned long addr)
+{
+       mmu_notifier_mem_invalidate(mn, mm, addr, addr + PAGE_SIZE);
+}
+
+static inline void mmu_notifier_range_start(struct mmu_notifier *mn,
+                                           struct mm_struct *mm,
+                                           unsigned long start,
+                                           unsigned long end)
+{
+       mmu_notifier_mem_invalidate(mn, mm, start, end);
+}
+
+static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn,
+                                       struct mm_struct *mm,
+                                       unsigned long start, unsigned long end)
+{
+       struct mmu_rb_handler *handler =
+               container_of(mn, struct mmu_rb_handler, mn);
+       struct rb_root *root = handler->root;
+       struct mmu_rb_node *node, *ptr = NULL;
+       unsigned long flags;
+
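+       /*
+        * The notifier hands us an exclusive [start, end) range; the
+        * interval tree iterators take an inclusive last address, hence
+        * the end - 1 below.
+        */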
+       spin_lock_irqsave(&handler->lock, flags);
+       for (node = __mmu_int_rb_iter_first(root, start, end - 1);
+            node; node = ptr) {
+               /* Guard against node removal. */
+               ptr = __mmu_int_rb_iter_next(node, start, end - 1);
+               hfi1_cdbg(MMU, "Invalidating node addr 0x%lx, len %lu",
+                         node->addr, node->len);
+               if (handler->ops->invalidate(root, node)) {
+                       __mmu_int_rb_remove(node, root);
+                       if (handler->ops->remove)
+                               handler->ops->remove(root, node, mm);
+               }
+       }
+       spin_unlock_irqrestore(&handler->lock, flags);
+}
diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.h b/drivers/infiniband/hw/hfi1/mmu_rb.h
new file mode 100644 (file)
index 0000000..7a57b9c
--- /dev/null
@@ -0,0 +1,76 @@
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef _HFI1_MMU_RB_H
+#define _HFI1_MMU_RB_H
+
+#include "hfi.h"
+
+struct mmu_rb_node {
+       unsigned long addr;
+       unsigned long len;
+       unsigned long __last;
+       struct rb_node node;
+};
+
+struct mmu_rb_ops {
+       bool (*filter)(struct mmu_rb_node *, unsigned long, unsigned long);
+       int (*insert)(struct rb_root *, struct mmu_rb_node *);
+       void (*remove)(struct rb_root *, struct mmu_rb_node *,
+                      struct mm_struct *);
+       int (*invalidate)(struct rb_root *, struct mmu_rb_node *);
+};
+
+int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops);
+void hfi1_mmu_rb_unregister(struct rb_root *);
+int hfi1_mmu_rb_insert(struct rb_root *, struct mmu_rb_node *);
+void hfi1_mmu_rb_remove(struct rb_root *, struct mmu_rb_node *);
+struct mmu_rb_node *hfi1_mmu_rb_search(struct rb_root *, unsigned long,
+                                      unsigned long);
+struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *, unsigned long,
+                                       unsigned long);
+
+#endif /* _HFI1_MMU_RB_H */
diff --git a/drivers/infiniband/hw/hfi1/opa_compat.h b/drivers/infiniband/hw/hfi1/opa_compat.h
new file mode 100644 (file)
index 0000000..6ef3c1c
--- /dev/null
@@ -0,0 +1,111 @@
+#ifndef _HFI1_OPA_COMPAT_H
+#define _HFI1_OPA_COMPAT_H
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * This header file is for OPA-specific definitions which are
+ * required by the HFI driver, and which aren't yet in the Linux
+ * IB core. We'll collect these all here, then merge them into
+ * the kernel when that's convenient.
+ */
+
+/* OPA SMA attribute IDs */
+#define OPA_ATTRIB_ID_CONGESTION_INFO          cpu_to_be16(0x008b)
+#define OPA_ATTRIB_ID_HFI_CONGESTION_LOG       cpu_to_be16(0x008f)
+#define OPA_ATTRIB_ID_HFI_CONGESTION_SETTING   cpu_to_be16(0x0090)
+#define OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE cpu_to_be16(0x0091)
+
+/* OPA PMA attribute IDs */
+#define OPA_PM_ATTRIB_ID_PORT_STATUS           cpu_to_be16(0x0040)
+#define OPA_PM_ATTRIB_ID_CLEAR_PORT_STATUS     cpu_to_be16(0x0041)
+#define OPA_PM_ATTRIB_ID_DATA_PORT_COUNTERS    cpu_to_be16(0x0042)
+#define OPA_PM_ATTRIB_ID_ERROR_PORT_COUNTERS   cpu_to_be16(0x0043)
+#define OPA_PM_ATTRIB_ID_ERROR_INFO            cpu_to_be16(0x0044)
+
+/* OPA status codes */
+#define OPA_PM_STATUS_REQUEST_TOO_LARGE                cpu_to_be16(0x100)
+
+static inline u8 port_states_to_logical_state(struct opa_port_states *ps)
+{
+       return ps->portphysstate_portstate & OPA_PI_MASK_PORT_STATE;
+}
+
+static inline u8 port_states_to_phys_state(struct opa_port_states *ps)
+{
+       return ((ps->portphysstate_portstate &
+                 OPA_PI_MASK_PORT_PHYSICAL_STATE) >> 4) & 0xf;
+}
+
+/*
+ * OPA port physical states
+ * IB Volume 1, Table 146 PortInfo/IB Volume 2 Section 5.4.2(1) PortPhysState
+ * values.
+ *
+ * When writing, only values 0-3 are valid, other values are ignored.
+ * When reading, 0 is reserved.
+ *
+ * Returned by the ibphys_portstate() routine.
+ */
+enum opa_port_phys_state {
+       IB_PORTPHYSSTATE_NOP = 0,
+       /* 1 is reserved */
+       IB_PORTPHYSSTATE_POLLING = 2,
+       IB_PORTPHYSSTATE_DISABLED = 3,
+       IB_PORTPHYSSTATE_TRAINING = 4,
+       IB_PORTPHYSSTATE_LINKUP = 5,
+       IB_PORTPHYSSTATE_LINK_ERROR_RECOVERY = 6,
+       IB_PORTPHYSSTATE_PHY_TEST = 7,
+       /* 8 is reserved */
+       OPA_PORTPHYSSTATE_OFFLINE = 9,
+       OPA_PORTPHYSSTATE_GANGED = 10,
+       OPA_PORTPHYSSTATE_TEST = 11,
+       OPA_PORTPHYSSTATE_MAX = 11,
+       /* values 12-15 are reserved/ignored */
+};
+
+#endif /* _HFI1_OPA_COMPAT_H */
diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c
new file mode 100644 (file)
index 0000000..0bac21e
--- /dev/null
@@ -0,0 +1,1338 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/pci.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+#include <linux/vmalloc.h>
+#include <linux/aer.h>
+#include <linux/module.h>
+
+#include "hfi.h"
+#include "chip_registers.h"
+#include "aspm.h"
+
+/* link speed vector for Gen3 speed - not in Linux headers */
+#define GEN1_SPEED_VECTOR 0x1
+#define GEN2_SPEED_VECTOR 0x2
+#define GEN3_SPEED_VECTOR 0x3
+
+/*
+ * This file contains PCIe utility routines.
+ */
+
+/*
+ * Code to adjust PCIe capabilities.
+ */
+static void tune_pcie_caps(struct hfi1_devdata *);
+
+/*
+ * Do all the common PCIe setup and initialization.
+ * devdata is not yet allocated, and is not allocated until after this
+ * routine returns success.  Therefore dd_dev_err() can't be used for error
+ * printing.
+ */
+int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+       int ret;
+
+       ret = pci_enable_device(pdev);
+       if (ret) {
+               /*
+                * This can happen (in theory) iff:
+                * We did a chip reset, and then failed to reprogram the
+                * BAR, or the chip reset due to an internal error.  We then
+                * unloaded the driver and reloaded it.
+                *
+                * Both reset cases set the BAR back to initial state.  For
+                * the latter case, the AER sticky error bit at offset 0x718
+                * should be set, but the Linux kernel doesn't yet know
+                * about that, it appears.  If the original BAR was retained
+                * in the kernel data structures, this may be OK.
+                */
+               hfi1_early_err(&pdev->dev, "pci enable failed: error %d\n",
+                              -ret);
+               goto done;
+       }
+
+       ret = pci_request_regions(pdev, DRIVER_NAME);
+       if (ret) {
+               hfi1_early_err(&pdev->dev,
+                              "pci_request_regions fails: err %d\n", -ret);
+               goto bail;
+       }
+
+       ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+       if (ret) {
+               /*
+                * If the 64 bit setup fails, try 32 bit.  Some systems
+                * with 2GB or less memory installed do not set up
+                * 64 bit DMA maps.
+                */
+               ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+               if (ret) {
+                       hfi1_early_err(&pdev->dev,
+                                      "Unable to set DMA mask: %d\n", ret);
+                       goto bail;
+               }
+               ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+       } else {
+               ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+       }
+       if (ret) {
+               hfi1_early_err(&pdev->dev,
+                              "Unable to set DMA consistent mask: %d\n", ret);
+               goto bail;
+       }
+
+       pci_set_master(pdev);
+       (void)pci_enable_pcie_error_reporting(pdev);
+       goto done;
+
+bail:
+       hfi1_pcie_cleanup(pdev);
+done:
+       return ret;
+}
+
+/*
+ * Clean what was done in hfi1_pcie_init()
+ */
+void hfi1_pcie_cleanup(struct pci_dev *pdev)
+{
+       pci_disable_device(pdev);
+       /*
+        * Release regions should be called after the disable. OK to
+        * call if request regions has not been called or failed.
+        */
+       pci_release_regions(pdev);
+}
+
+/*
+ * Do remaining PCIe setup, once dd is allocated, and save away
+ * fields required to re-initialize after a chip reset, or for
+ * various other purposes
+ */
+int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev,
+                    const struct pci_device_id *ent)
+{
+       unsigned long len;
+       resource_size_t addr;
+
+       dd->pcidev = pdev;
+       pci_set_drvdata(pdev, dd);
+
+       addr = pci_resource_start(pdev, 0);
+       len = pci_resource_len(pdev, 0);
+
+       /*
+        * The TXE PIO buffers are at the tail end of the chip space.
+        * Cut them off and map them separately.
+        */
+
+       /* sanity check vs expectations */
+       if (len != TXE_PIO_SEND + TXE_PIO_SIZE) {
+               dd_dev_err(dd, "chip PIO range does not match\n");
+               return -EINVAL;
+       }
+
+       dd->kregbase = ioremap_nocache(addr, TXE_PIO_SEND);
+       if (!dd->kregbase)
+               return -ENOMEM;
+
+       dd->piobase = ioremap_wc(addr + TXE_PIO_SEND, TXE_PIO_SIZE);
+       if (!dd->piobase) {
+               iounmap(dd->kregbase);
+               return -ENOMEM;
+       }
+
+       dd->flags |= HFI1_PRESENT;      /* now register routines work */
+
+       dd->kregend = dd->kregbase + TXE_PIO_SEND;
+       dd->physaddr = addr;        /* used for io_remap, etc. */
+
+       /*
+        * Re-map the chip's RcvArray as write-combining to allow us
+        * to write an entire cacheline worth of entries in one shot.
+        * If this re-map fails, just continue - the RcvArray programming
+        * function will handle both cases.
+        */
+       dd->chip_rcv_array_count = read_csr(dd, RCV_ARRAY_CNT);
+       dd->rcvarray_wc = ioremap_wc(addr + RCV_ARRAY,
+                                    dd->chip_rcv_array_count * 8);
+       dd_dev_info(dd, "WC Remapped RcvArray: %p\n", dd->rcvarray_wc);
+       /*
+        * Save BARs and command to rewrite after device reset.
+        */
+       dd->pcibar0 = addr;
+       dd->pcibar1 = addr >> 32;
+       pci_read_config_dword(dd->pcidev, PCI_ROM_ADDRESS, &dd->pci_rom);
+       pci_read_config_word(dd->pcidev, PCI_COMMAND, &dd->pci_command);
+       pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, &dd->pcie_devctl);
+       pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL, &dd->pcie_lnkctl);
+       pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL2,
+                                 &dd->pcie_devctl2);
+       pci_read_config_dword(dd->pcidev, PCI_CFG_MSIX0, &dd->pci_msix0);
+       pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE1, &dd->pci_lnkctl3);
+       pci_read_config_dword(dd->pcidev, PCIE_CFG_TPH2, &dd->pci_tph2);
+
+       return 0;
+}
+
+/*
+ * Do PCIe cleanup related to dd, after chip-specific cleanup, etc.  Just prior
+ * to releasing the dd memory.
+ * Void because all of the core pcie cleanup functions are void.
+ */
+void hfi1_pcie_ddcleanup(struct hfi1_devdata *dd)
+{
+       u64 __iomem *base = (void __iomem *)dd->kregbase;
+
+       dd->flags &= ~HFI1_PRESENT;
+       dd->kregbase = NULL;
+       iounmap(base);
+       if (dd->rcvarray_wc)
+               iounmap(dd->rcvarray_wc);
+       if (dd->piobase)
+               iounmap(dd->piobase);
+}
+
+/*
+ * Do a Function Level Reset (FLR) on the device.
+ * Based on static function drivers/pci/pci.c:pcie_flr().
+ */
+void hfi1_pcie_flr(struct hfi1_devdata *dd)
+{
+       int i;
+       u16 status;
+
+       /* no need to check for the capability - we know the device has it */
+
+       /* wait for Transaction Pending bit to clear, at most a few ms */
+       for (i = 0; i < 4; i++) {
+               if (i)
+                       msleep((1 << (i - 1)) * 100);
+
+               pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVSTA, &status);
+               if (!(status & PCI_EXP_DEVSTA_TRPND))
+                       goto clear;
+       }
+
+       dd_dev_err(dd, "Transaction Pending bit is not clearing, proceeding with reset anyway\n");
+
+clear:
+       pcie_capability_set_word(dd->pcidev, PCI_EXP_DEVCTL,
+                                PCI_EXP_DEVCTL_BCR_FLR);
+       /* PCIe spec requires the function to be back within 100ms */
+       msleep(100);
+}
+
+static void msix_setup(struct hfi1_devdata *dd, int pos, u32 *msixcnt,
+                      struct hfi1_msix_entry *hfi1_msix_entry)
+{
+       int ret;
+       int nvec = *msixcnt;
+       struct msix_entry *msix_entry;
+       int i;
+
+       /*
+        * We can't pass hfi1_msix_entry array to msix_setup
+        * so use a dummy msix_entry array and copy the allocated
+        * irq back to the hfi1_msix_entry array.
+        */
+       msix_entry = kmalloc_array(nvec, sizeof(*msix_entry), GFP_KERNEL);
+       if (!msix_entry) {
+               ret = -ENOMEM;
+               goto do_intx;
+       }
+
+       for (i = 0; i < nvec; i++)
+               msix_entry[i] = hfi1_msix_entry[i].msix;
+
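+       /*
+        * pci_enable_msix_range() returns the number of vectors actually
+        * granted (anywhere from 1 to nvec) or a negative errno.
+        */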
+       ret = pci_enable_msix_range(dd->pcidev, msix_entry, 1, nvec);
+       if (ret < 0)
+               goto free_msix_entry;
+       nvec = ret;
+
+       for (i = 0; i < nvec; i++)
+               hfi1_msix_entry[i].msix = msix_entry[i];
+
+       kfree(msix_entry);
+       *msixcnt = nvec;
+       return;
+
+free_msix_entry:
+       kfree(msix_entry);
+
+do_intx:
+       dd_dev_err(dd, "pci_enable_msix_range %d vectors failed: %d, falling back to INTx\n",
+                  nvec, ret);
+       *msixcnt = 0;
+       hfi1_enable_intx(dd->pcidev);
+}
+
+/* return the PCIe link speed from the given link status */
+static u32 extract_speed(u16 linkstat)
+{
+       u32 speed;
+
+       switch (linkstat & PCI_EXP_LNKSTA_CLS) {
+       default: /* not defined, assume Gen1 */
+       case PCI_EXP_LNKSTA_CLS_2_5GB:
+               speed = 2500; /* Gen 1, 2.5GHz */
+               break;
+       case PCI_EXP_LNKSTA_CLS_5_0GB:
+               speed = 5000; /* Gen 2, 5GHz */
+               break;
+       case GEN3_SPEED_VECTOR:
+               speed = 8000; /* Gen 3, 8GHz */
+               break;
+       }
+       return speed;
+}
+
+/* return the PCIe link speed from the given link status */
+static u32 extract_width(u16 linkstat)
+{
+       return (linkstat & PCI_EXP_LNKSTA_NLW) >> PCI_EXP_LNKSTA_NLW_SHIFT;
+}
+
+/* read the link status and set dd->{lbus_width,lbus_speed,lbus_info} */
+static void update_lbus_info(struct hfi1_devdata *dd)
+{
+       u16 linkstat;
+
+       pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKSTA, &linkstat);
+       dd->lbus_width = extract_width(linkstat);
+       dd->lbus_speed = extract_speed(linkstat);
+       snprintf(dd->lbus_info, sizeof(dd->lbus_info),
+                "PCIe,%uMHz,x%u", dd->lbus_speed, dd->lbus_width);
+}
+
+/*
+ * Read in the current PCIe link width and speed.  Find if the link is
+ * Gen3 capable.
+ */
+int pcie_speeds(struct hfi1_devdata *dd)
+{
+       u32 linkcap;
+       struct pci_dev *parent = dd->pcidev->bus->self;
+
+       if (!pci_is_pcie(dd->pcidev)) {
+               dd_dev_err(dd, "Can't find PCI Express capability!\n");
+               return -EINVAL;
+       }
+
+       /* find if our max speed is Gen3 and parent supports Gen3 speeds */
+       dd->link_gen3_capable = 1;
+
+       pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &linkcap);
+       if ((linkcap & PCI_EXP_LNKCAP_SLS) != GEN3_SPEED_VECTOR) {
+               dd_dev_info(dd,
+                           "This HFI is not Gen3 capable, max speed 0x%x, need 0x3\n",
+                           linkcap & PCI_EXP_LNKCAP_SLS);
+               dd->link_gen3_capable = 0;
+       }
+
+       /*
+        * bus->max_bus_speed is set from the bridge's linkcap Max Link Speed
+        */
+       if (parent && dd->pcidev->bus->max_bus_speed != PCIE_SPEED_8_0GT) {
+               dd_dev_info(dd, "Parent PCIe bridge does not support Gen3\n");
+               dd->link_gen3_capable = 0;
+       }
+
+       /* obtain the link width and current speed */
+       update_lbus_info(dd);
+
+       dd_dev_info(dd, "%s\n", dd->lbus_info);
+
+       return 0;
+}
+
+/*
+ * Returns in *nent:
+ *     - actual number of interrupts allocated
+ *     - 0 if we fell back to INTx.
+ */
+void request_msix(struct hfi1_devdata *dd, u32 *nent,
+                 struct hfi1_msix_entry *entry)
+{
+       int pos;
+
+       pos = dd->pcidev->msix_cap;
+       if (*nent && pos) {
+               msix_setup(dd, pos, nent, entry);
+               /* did it, either MSI-X or INTx */
+       } else {
+               *nent = 0;
+               hfi1_enable_intx(dd->pcidev);
+       }
+
+       tune_pcie_caps(dd);
+}
+
+void hfi1_enable_intx(struct pci_dev *pdev)
+{
+       /* first, turn on INTx */
+       pci_intx(pdev, 1);
+       /* then turn off MSI-X */
+       pci_disable_msix(pdev);
+}
+
+/* restore command and BARs after a reset has wiped them out */
+void restore_pci_variables(struct hfi1_devdata *dd)
+{
+       pci_write_config_word(dd->pcidev, PCI_COMMAND, dd->pci_command);
+       pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_0, dd->pcibar0);
+       pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_1, dd->pcibar1);
+       pci_write_config_dword(dd->pcidev, PCI_ROM_ADDRESS, dd->pci_rom);
+       pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, dd->pcie_devctl);
+       pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL, dd->pcie_lnkctl);
+       pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL2,
+                                  dd->pcie_devctl2);
+       pci_write_config_dword(dd->pcidev, PCI_CFG_MSIX0, dd->pci_msix0);
+       pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE1, dd->pci_lnkctl3);
+       pci_write_config_dword(dd->pcidev, PCIE_CFG_TPH2, dd->pci_tph2);
+}
+
+/*
+ * BIOS may not set PCIe bus-utilization parameters for best performance.
+ * Check and optionally adjust them to maximize our throughput.
+ */
+static int hfi1_pcie_caps;
+module_param_named(pcie_caps, hfi1_pcie_caps, int, S_IRUGO);
+MODULE_PARM_DESC(pcie_caps, "Max PCIe tuning: Payload (0..3), ReadReq (4..7)");
+
+uint aspm_mode = ASPM_MODE_DISABLED;
+module_param_named(aspm, aspm_mode, uint, S_IRUGO);
+MODULE_PARM_DESC(aspm, "PCIe ASPM: 0: disable, 1: enable, 2: dynamic");
+
+static void tune_pcie_caps(struct hfi1_devdata *dd)
+{
+       struct pci_dev *parent;
+       u16 rc_mpss, rc_mps, ep_mpss, ep_mps;
+       u16 rc_mrrs, ep_mrrs, max_mrrs, ectl;
+
+       /*
+        * Turn on extended tags in DevCtl, in case the BIOS has turned them
+        * off, to improve WFR SDMA bandwidth.
+        */
+       pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, &ectl);
+       if (!(ectl & PCI_EXP_DEVCTL_EXT_TAG)) {
+               dd_dev_info(dd, "Enabling PCIe extended tags\n");
+               ectl |= PCI_EXP_DEVCTL_EXT_TAG;
+               pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, ectl);
+       }
+       /* Find out supported and configured values for parent (root) */
+       parent = dd->pcidev->bus->self;
+       /*
+        * The driver cannot perform the tuning if it does not have
+        * access to the upstream component.
+        */
+       if (!parent)
+               return;
+       if (!pci_is_root_bus(parent->bus)) {
+               dd_dev_info(dd, "Parent not root\n");
+               return;
+       }
+
+       if (!pci_is_pcie(parent) || !pci_is_pcie(dd->pcidev))
+               return;
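+       /*
+        * pcie_get_mps() returns the payload size in bytes (128 << code);
+        * ffs(bytes) - 8 recovers the code used by the pcie_mpss encoding.
+        */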
+       rc_mpss = parent->pcie_mpss;
+       rc_mps = ffs(pcie_get_mps(parent)) - 8;
+       /* Find out supported and configured values for endpoint (us) */
+       ep_mpss = dd->pcidev->pcie_mpss;
+       ep_mps = ffs(pcie_get_mps(dd->pcidev)) - 8;
+
+       /* Find max payload supported by root, endpoint */
+       if (rc_mpss > ep_mpss)
+               rc_mpss = ep_mpss;
+
+       /* If Supported greater than limit in module param, limit it */
+       if (rc_mpss > (hfi1_pcie_caps & 7))
+               rc_mpss = hfi1_pcie_caps & 7;
+       /* If less than (allowed, supported), bump root payload */
+       if (rc_mpss > rc_mps) {
+               rc_mps = rc_mpss;
+               pcie_set_mps(parent, 128 << rc_mps);
+       }
+       /* If less than (allowed, supported), bump endpoint payload */
+       if (rc_mpss > ep_mps) {
+               ep_mps = rc_mpss;
+               pcie_set_mps(dd->pcidev, 128 << ep_mps);
+       }
+
+       /*
+        * Now the Read Request size.
+        * No field for max supported, but PCIe spec limits it to 4096,
+        * which is code '5' (log2(4096) - 7)
+        */
+       max_mrrs = 5;
+       if (max_mrrs > ((hfi1_pcie_caps >> 4) & 7))
+               max_mrrs = (hfi1_pcie_caps >> 4) & 7;
+
+       max_mrrs = 128 << max_mrrs;
+       rc_mrrs = pcie_get_readrq(parent);
+       ep_mrrs = pcie_get_readrq(dd->pcidev);
+
+       if (max_mrrs > rc_mrrs) {
+               rc_mrrs = max_mrrs;
+               pcie_set_readrq(parent, rc_mrrs);
+       }
+       if (max_mrrs > ep_mrrs) {
+               ep_mrrs = max_mrrs;
+               pcie_set_readrq(dd->pcidev, ep_mrrs);
+       }
+}
+
+/* End of PCIe capability tuning */
+
+/*
+ * Everything from here through the hfi1_pci_err_handler definition is
+ * invoked via the PCI error recovery infrastructure, registered with
+ * the PCI core.
+ */
+static pci_ers_result_t
+pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
+{
+       struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+       pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED;
+
+       switch (state) {
+       case pci_channel_io_normal:
+               dd_dev_info(dd, "State Normal, ignoring\n");
+               break;
+
+       case pci_channel_io_frozen:
+               dd_dev_info(dd, "State Frozen, requesting reset\n");
+               pci_disable_device(pdev);
+               ret = PCI_ERS_RESULT_NEED_RESET;
+               break;
+
+       case pci_channel_io_perm_failure:
+               if (dd) {
+                       dd_dev_info(dd, "State Permanent Failure, disabling\n");
+                       /* no more register accesses! */
+                       dd->flags &= ~HFI1_PRESENT;
+                       hfi1_disable_after_error(dd);
+               }
+               /* else early, or other problem */
+               ret = PCI_ERS_RESULT_DISCONNECT;
+               break;
+
+       default: /* shouldn't happen */
+               dd_dev_info(dd, "HFI1 PCI errors detected (state %d)\n",
+                           state);
+               break;
+       }
+       return ret;
+}
+
+static pci_ers_result_t
+pci_mmio_enabled(struct pci_dev *pdev)
+{
+       u64 words = 0U;
+       struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+       pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED;
+
+       if (dd && dd->pport) {
+               words = read_port_cntr(dd->pport, C_RX_WORDS, CNTR_INVALID_VL);
+               if (words == ~0ULL)
+                       ret = PCI_ERS_RESULT_NEED_RESET;
+               dd_dev_info(dd,
+                           "HFI1 mmio_enabled function called, read words counter %Lx, returning %d\n",
+                           words, ret);
+       }
+       return ret;
+}
+
+static pci_ers_result_t
+pci_slot_reset(struct pci_dev *pdev)
+{
+       struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+
+       dd_dev_info(dd, "HFI1 slot_reset function called, ignored\n");
+       return PCI_ERS_RESULT_CAN_RECOVER;
+}
+
+static pci_ers_result_t
+pci_link_reset(struct pci_dev *pdev)
+{
+       struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+
+       dd_dev_info(dd, "HFI1 link_reset function called, ignored\n");
+       return PCI_ERS_RESULT_CAN_RECOVER;
+}
+
+static void
+pci_resume(struct pci_dev *pdev)
+{
+       struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+
+       dd_dev_info(dd, "HFI1 resume function called\n");
+       pci_cleanup_aer_uncorrect_error_status(pdev);
+       /*
+        * Running jobs will fail, since this recovery is asynchronous,
+        * unlike a sysfs-requested reset.  Better than doing nothing.
+        */
+       hfi1_init(dd, 1); /* same as re-init after reset */
+}
+
+const struct pci_error_handlers hfi1_pci_err_handler = {
+       .error_detected = pci_error_detected,
+       .mmio_enabled = pci_mmio_enabled,
+       .link_reset = pci_link_reset,
+       .slot_reset = pci_slot_reset,
+       .resume = pci_resume,
+};
+
+/*============================================================================*/
+/* PCIe Gen3 support */
+
+/*
+ * This code is separated out because it is expected to be removed in the
+ * final shipping product.  If not, then it will be revisited and items
+ * will be moved to more standard locations.
+ */
+
+/* ASIC_PCI_SD_HOST_STATUS.FW_DNLD_STS field values */
+#define DL_STATUS_HFI0 0x1     /* hfi0 firmware download complete */
+#define DL_STATUS_HFI1 0x2     /* hfi1 firmware download complete */
+#define DL_STATUS_BOTH 0x3     /* hfi0 and hfi1 firmware download complete */
+
+/* ASIC_PCI_SD_HOST_STATUS.FW_DNLD_ERR field values */
+#define DL_ERR_NONE            0x0     /* no error */
+#define DL_ERR_SWAP_PARITY     0x1     /* parity error in SerDes interrupt */
+                                       /*   or response data */
+#define DL_ERR_DISABLED        0x2     /* hfi disabled */
+#define DL_ERR_SECURITY        0x3     /* security check failed */
+#define DL_ERR_SBUS            0x4     /* SBus status error */
+#define DL_ERR_XFR_PARITY      0x5     /* parity error during ROM transfer */
+
+/* gasket block secondary bus reset delay */
+#define SBR_DELAY_US 200000    /* 200ms */
+
+/* mask for PCIe capability register lnkctl2 target link speed */
+#define LNKCTL2_TARGET_LINK_SPEED_MASK 0xf
+
+static uint pcie_target = 3;
+module_param(pcie_target, uint, S_IRUGO);
+MODULE_PARM_DESC(pcie_target, "PCIe target speed (0 skip, 1-3 Gen1-3)");
+
+static uint pcie_force;
+module_param(pcie_force, uint, S_IRUGO);
+MODULE_PARM_DESC(pcie_force, "Force driver to do a PCIe firmware download even if already at target speed");
+
+static uint pcie_retry = 5;
+module_param(pcie_retry, uint, S_IRUGO);
+MODULE_PARM_DESC(pcie_retry, "Driver will try this many times to reach requested speed");
+
+#define UNSET_PSET 255
+#define DEFAULT_DISCRETE_PSET 2        /* discrete HFI */
+#define DEFAULT_MCP_PSET 4     /* MCP HFI */
+static uint pcie_pset = UNSET_PSET;
+module_param(pcie_pset, uint, S_IRUGO);
+MODULE_PARM_DESC(pcie_pset, "PCIe Eq Pset value to use, range is 0-10");
+
+/* equalization columns */
+#define PREC 0
+#define ATTN 1
+#define POST 2
+
+/* discrete silicon preliminary equalization values */
+static const u8 discrete_preliminary_eq[11][3] = {
+       /* prec   attn   post */
+       {  0x00,  0x00,  0x12 },        /* p0 */
+       {  0x00,  0x00,  0x0c },        /* p1 */
+       {  0x00,  0x00,  0x0f },        /* p2 */
+       {  0x00,  0x00,  0x09 },        /* p3 */
+       {  0x00,  0x00,  0x00 },        /* p4 */
+       {  0x06,  0x00,  0x00 },        /* p5 */
+       {  0x09,  0x00,  0x00 },        /* p6 */
+       {  0x06,  0x00,  0x0f },        /* p7 */
+       {  0x09,  0x00,  0x09 },        /* p8 */
+       {  0x0c,  0x00,  0x00 },        /* p9 */
+       {  0x00,  0x00,  0x18 },        /* p10 */
+};
+
+/* integrated silicon preliminary equalization values */
+static const u8 integrated_preliminary_eq[11][3] = {
+       /* prec   attn   post */
+       {  0x00,  0x1e,  0x07 },        /* p0 */
+       {  0x00,  0x1e,  0x05 },        /* p1 */
+       {  0x00,  0x1e,  0x06 },        /* p2 */
+       {  0x00,  0x1e,  0x04 },        /* p3 */
+       {  0x00,  0x1e,  0x00 },        /* p4 */
+       {  0x03,  0x1e,  0x00 },        /* p5 */
+       {  0x04,  0x1e,  0x00 },        /* p6 */
+       {  0x03,  0x1e,  0x06 },        /* p7 */
+       {  0x03,  0x1e,  0x04 },        /* p8 */
+       {  0x05,  0x1e,  0x00 },        /* p9 */
+       {  0x00,  0x1e,  0x0a },        /* p10 */
+};
+
+/* helper to format the value to write to hardware */
+#define eq_value(pre, curr, post) \
+       ((((u32)(pre)) << \
+                       PCIE_CFG_REG_PL102_GEN3_EQ_PRE_CURSOR_PSET_SHIFT) \
+       | (((u32)(curr)) << PCIE_CFG_REG_PL102_GEN3_EQ_CURSOR_PSET_SHIFT) \
+       | (((u32)(post)) << \
+               PCIE_CFG_REG_PL102_GEN3_EQ_POST_CURSOR_PSET_SHIFT))
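+
+/*
+ * Worked example (discrete table, fs = 24, div = 3): preset p0 is
+ * { prec 0x00, attn 0x00, post 0x12 }, giving c-1 = 0, c+1 = 0x12 / 3 = 6
+ * and c0 = 24 - 0 - 6 = 18, which load_eq_table() packs as
+ * eq_value(0, 18, 6).
+ */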
+
+/*
+ * Load the given EQ preset table into the PCIe hardware.
+ */
+static int load_eq_table(struct hfi1_devdata *dd, const u8 eq[11][3], u8 fs,
+                        u8 div)
+{
+       struct pci_dev *pdev = dd->pcidev;
+       u32 hit_error = 0;
+       u32 violation;
+       u32 i;
+       u8 c_minus1, c0, c_plus1;
+
+       for (i = 0; i < 11; i++) {
+               /* set index */
+               pci_write_config_dword(pdev, PCIE_CFG_REG_PL103, i);
+               /* write the value */
+               c_minus1 = eq[i][PREC] / div;
+               c0 = fs - (eq[i][PREC] / div) - (eq[i][POST] / div);
+               c_plus1 = eq[i][POST] / div;
+               pci_write_config_dword(pdev, PCIE_CFG_REG_PL102,
+                                      eq_value(c_minus1, c0, c_plus1));
+               /* check if these coefficients violate EQ rules */
+               pci_read_config_dword(dd->pcidev, PCIE_CFG_REG_PL105,
+                                     &violation);
+               if (violation
+                   & PCIE_CFG_REG_PL105_GEN3_EQ_VIOLATE_COEF_RULES_SMASK) {
+                       if (hit_error == 0) {
+                               dd_dev_err(dd,
+                                          "Gen3 EQ Table Coefficient rule violations\n");
+                               dd_dev_err(dd, "         prec   attn   post\n");
+                       }
+                       dd_dev_err(dd, "   p%02d:   %02x     %02x     %02x\n",
+                                  i, (u32)eq[i][0], (u32)eq[i][1],
+                                  (u32)eq[i][2]);
+                       dd_dev_err(dd, "            %02x     %02x     %02x\n",
+                                  (u32)c_minus1, (u32)c0, (u32)c_plus1);
+                       hit_error = 1;
+               }
+       }
+       if (hit_error)
+               return -EINVAL;
+       return 0;
+}
+
+/*
+ * Steps to be done after the PCIe firmware is downloaded and
+ * before the SBR for the PCIe Gen3 transition.
+ * The SBus resource is already being held.
+ */
+static void pcie_post_steps(struct hfi1_devdata *dd)
+{
+       int i;
+
+       set_sbus_fast_mode(dd);
+       /*
+        * Write to the PCIe PCSes to set the G3_LOCKED_NEXT bits to 1.
+        * This avoids a spurious framing error that can otherwise be
+        * generated by the MAC layer.
+        *
+        * Use individual addresses since no broadcast is set up.
+        */
+       for (i = 0; i < NUM_PCIE_SERDES; i++) {
+               sbus_request(dd, pcie_pcs_addrs[dd->hfi1_id][i],
+                            0x03, WRITE_SBUS_RECEIVER, 0x00022132);
+       }
+
+       clear_sbus_fast_mode(dd);
+}
+
+/*
+ * Trigger a secondary bus reset (SBR) on ourselves using our parent.
+ *
+ * Based on pci_parent_bus_reset() which is not exported by the
+ * kernel core.
+ */
+static int trigger_sbr(struct hfi1_devdata *dd)
+{
+       struct pci_dev *dev = dd->pcidev;
+       struct pci_dev *pdev;
+
+       /* need a parent */
+       if (!dev->bus->self) {
+               dd_dev_err(dd, "%s: no parent device\n", __func__);
+               return -ENOTTY;
+       }
+
+       /* should not be anyone else on the bus */
+       list_for_each_entry(pdev, &dev->bus->devices, bus_list)
+               if (pdev != dev) {
+                       dd_dev_err(dd,
+                                  "%s: another device is on the same bus\n",
+                                  __func__);
+                       return -ENOTTY;
+               }
+
+       /*
+        * A secondary bus reset (SBR) issues a hot reset to our device.
+        * The following routine does a 1s wait after the reset is dropped
+        * per PCI Trhfa (recovery time).  PCIe 3.0 section 6.6.1 -
+        * Conventional Reset, paragraph 3, line 35 also says that a 1s
+        * delay after a reset is required.  Per spec requirements,
+        * the link is either working or not after that point.
+        */
+       pci_reset_bridge_secondary_bus(dev->bus->self);
+
+       return 0;
+}
+
+/*
+ * Write the given gasket interrupt register.
+ */
+static void write_gasket_interrupt(struct hfi1_devdata *dd, int index,
+                                  u16 code, u16 data)
+{
+       write_csr(dd, ASIC_PCIE_SD_INTRPT_LIST + (index * 8),
+                 (((u64)code << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_CODE_SHIFT) |
+                  ((u64)data << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_DATA_SHIFT)));
+}
+
+/*
+ * Tell the gasket logic how to react to the reset.
+ */
+static void arm_gasket_logic(struct hfi1_devdata *dd)
+{
+       u64 reg;
+
+       reg = (((u64)1 << dd->hfi1_id) <<
+              ASIC_PCIE_SD_HOST_CMD_INTRPT_CMD_SHIFT) |
+             ((u64)pcie_serdes_broadcast[dd->hfi1_id] <<
+              ASIC_PCIE_SD_HOST_CMD_SBUS_RCVR_ADDR_SHIFT |
+              ASIC_PCIE_SD_HOST_CMD_SBR_MODE_SMASK |
+              ((u64)SBR_DELAY_US & ASIC_PCIE_SD_HOST_CMD_TIMER_MASK) <<
+              ASIC_PCIE_SD_HOST_CMD_TIMER_SHIFT);
+       write_csr(dd, ASIC_PCIE_SD_HOST_CMD, reg);
+       /* read back to push the write */
+       read_csr(dd, ASIC_PCIE_SD_HOST_CMD);
+}
+
+/*
+ * CCE_PCIE_CTRL long name helpers
+ * We redefine these shorter macros to use in the code while leaving
+ * chip_registers.h to be autogenerated from the hardware spec.
+ */
+#define LANE_BUNDLE_MASK              CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_MASK
+#define LANE_BUNDLE_SHIFT             CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_SHIFT
+#define LANE_DELAY_MASK               CCE_PCIE_CTRL_PCIE_LANE_DELAY_MASK
+#define LANE_DELAY_SHIFT              CCE_PCIE_CTRL_PCIE_LANE_DELAY_SHIFT
+#define MARGIN_OVERWRITE_ENABLE_SHIFT CCE_PCIE_CTRL_XMT_MARGIN_OVERWRITE_ENABLE_SHIFT
+#define MARGIN_SHIFT                  CCE_PCIE_CTRL_XMT_MARGIN_SHIFT
+#define MARGIN_G1_G2_OVERWRITE_MASK   CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_MASK
+#define MARGIN_G1_G2_OVERWRITE_SHIFT  CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_SHIFT
+#define MARGIN_GEN1_GEN2_MASK         CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_MASK
+#define MARGIN_GEN1_GEN2_SHIFT        CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_SHIFT
+
+/*
+ * Write xmt_margin for full-swing (WFR-B) or half-swing (WFR-C).
+ */
+static void write_xmt_margin(struct hfi1_devdata *dd, const char *fname)
+{
+       u64 pcie_ctrl;
+       u64 xmt_margin;
+       u64 xmt_margin_oe;
+       u64 lane_delay;
+       u64 lane_bundle;
+
+       pcie_ctrl = read_csr(dd, CCE_PCIE_CTRL);
+
+       /*
+        * For Discrete, use full-swing.
+        *  - PCIe TX defaults to full-swing.
+        *    Leave this register as default.
+        * For Integrated, use half-swing
+        *  - Copy xmt_margin and xmt_margin_oe
+        *    from Gen1/Gen2 to Gen3.
+        */
+       if (dd->pcidev->device == PCI_DEVICE_ID_INTEL1) { /* integrated */
+               /* extract initial fields */
+               xmt_margin = (pcie_ctrl >> MARGIN_GEN1_GEN2_SHIFT)
+                             & MARGIN_GEN1_GEN2_MASK;
+               xmt_margin_oe = (pcie_ctrl >> MARGIN_G1_G2_OVERWRITE_SHIFT)
+                                & MARGIN_G1_G2_OVERWRITE_MASK;
+               lane_delay = (pcie_ctrl >> LANE_DELAY_SHIFT) & LANE_DELAY_MASK;
+               lane_bundle = (pcie_ctrl >> LANE_BUNDLE_SHIFT)
+                              & LANE_BUNDLE_MASK;
+
+               /*
+                * For A0, EFUSE values are not set.  Override with the
+                * correct values.
+                */
+               if (is_ax(dd)) {
+                       /*
+                        * xmt_margin and OverwriteEnable should be the
+                        * same for Gen1/Gen2 and Gen3.
+                        */
+                       xmt_margin = 0x5;
+                       xmt_margin_oe = 0x1;
+                       lane_delay = 0xF; /* Delay 240ns. */
+                       lane_bundle = 0x0; /* Set to 1 lane. */
+               }
+
+               /* overwrite existing values */
+               pcie_ctrl = (xmt_margin << MARGIN_GEN1_GEN2_SHIFT)
+                       | (xmt_margin_oe << MARGIN_G1_G2_OVERWRITE_SHIFT)
+                       | (xmt_margin << MARGIN_SHIFT)
+                       | (xmt_margin_oe << MARGIN_OVERWRITE_ENABLE_SHIFT)
+                       | (lane_delay << LANE_DELAY_SHIFT)
+                       | (lane_bundle << LANE_BUNDLE_SHIFT);
+
+               write_csr(dd, CCE_PCIE_CTRL, pcie_ctrl);
+       }
+
+       dd_dev_dbg(dd, "%s: program XMT margin, CcePcieCtrl 0x%llx\n",
+                  fname, pcie_ctrl);
+}
+
+/*
+ * Do all the steps needed to transition the PCIe link to Gen3 speed.
+ */
+int do_pcie_gen3_transition(struct hfi1_devdata *dd)
+{
+       struct pci_dev *parent = dd->pcidev->bus->self;
+       u64 fw_ctrl;
+       u64 reg, therm;
+       u32 reg32, fs, lf;
+       u32 status, err;
+       int ret;
+       int do_retry, retry_count = 0;
+       uint default_pset;
+       u16 target_vector, target_speed;
+       u16 lnkctl2, vendor;
+       u8 div;
+       const u8 (*eq)[3];
+       int return_error = 0;
+
+       /* PCIe Gen3 is for the ASIC only */
+       if (dd->icode != ICODE_RTL_SILICON)
+               return 0;
+
+       if (pcie_target == 1) {                 /* target Gen1 */
+               target_vector = GEN1_SPEED_VECTOR;
+               target_speed = 2500;
+       } else if (pcie_target == 2) {          /* target Gen2 */
+               target_vector = GEN2_SPEED_VECTOR;
+               target_speed = 5000;
+       } else if (pcie_target == 3) {          /* target Gen3 */
+               target_vector = GEN3_SPEED_VECTOR;
+               target_speed = 8000;
+       } else {
+               /* off or invalid target - skip */
+               dd_dev_info(dd, "%s: Skipping PCIe transition\n", __func__);
+               return 0;
+       }
+
+       /* if already at target speed, done (unless forced) */
+       if (dd->lbus_speed == target_speed) {
+               dd_dev_info(dd, "%s: PCIe already at gen%d, %s\n", __func__,
+                           pcie_target,
+                           pcie_force ? "re-doing anyway" : "skipping");
+               if (!pcie_force)
+                       return 0;
+       }
+
+       /*
+        * The driver cannot do the transition if it has no access to the
+        * upstream component
+        */
+       if (!parent) {
+               dd_dev_info(dd, "%s: No upstream device, can't do Gen3 transition\n",
+                           __func__);
+               return 0;
+       }
+
+       /*
+        * Do the Gen3 transition.  Steps are those of the PCIe Gen3
+        * recipe.
+        */
+
+       /* step 1: pcie link working in gen1/gen2 */
+
+       /* step 2: if either side is not capable of Gen3, done */
+       if (pcie_target == 3 && !dd->link_gen3_capable) {
+               dd_dev_err(dd, "The PCIe link is not Gen3 capable\n");
+               ret = -ENOSYS;
+               goto done_no_mutex;
+       }
+
+       /* hold the SBus resource across the firmware download and SBR */
+       ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT);
+       if (ret) {
+               dd_dev_err(dd, "%s: unable to acquire SBus resource\n",
+                          __func__);
+               return ret;
+       }
+
+       /* make sure thermal polling is not causing interrupts */
+       therm = read_csr(dd, ASIC_CFG_THERM_POLL_EN);
+       if (therm) {
+               write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x0);
+               msleep(100);
+               dd_dev_info(dd, "%s: Disabled therm polling\n",
+                           __func__);
+       }
+
+retry:
+       /* the SBus download will reset the spico for thermal */
+
+       /* step 3: download SBus Master firmware */
+       /* step 4: download PCIe Gen3 SerDes firmware */
+       dd_dev_info(dd, "%s: downloading firmware\n", __func__);
+       ret = load_pcie_firmware(dd);
+       if (ret) {
+               /* do not proceed if the firmware cannot be downloaded */
+               return_error = 1;
+               goto done;
+       }
+
+       /* step 5: set up device parameter settings */
+       dd_dev_info(dd, "%s: setting PCIe registers\n", __func__);
+
+       /*
+        * PcieCfgSpcie1 - Link Control 3
+        * Leave at reset value.  No need to set PerfEq - link equalization
+        * will be performed automatically after the SBR when the target
+        * speed is 8GT/s.
+        */
+
+       /* clear all 16 per-lane error bits (PCIe: Lane Error Status) */
+       pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE2, 0xffff);
+
+       /* step 5a: Set Synopsys Port Logic registers */
+
+       /*
+        * PcieCfgRegPl2 - Port Force Link
+        *
+        * Set the low power field to 0x10 to avoid unnecessary power
+        * management messages.  All other fields are zero.
+        */
+       reg32 = 0x10ul << PCIE_CFG_REG_PL2_LOW_PWR_ENT_CNT_SHIFT;
+       pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL2, reg32);
+
+       /*
+        * PcieCfgRegPl100 - Gen3 Control
+        *
+        * turn off PcieCfgRegPl100.Gen3ZRxDcNonCompl
+        * turn on PcieCfgRegPl100.EqEieosCnt
+        * Everything else zero.
+        */
+       reg32 = PCIE_CFG_REG_PL100_EQ_EIEOS_CNT_SMASK;
+       pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL100, reg32);
+
+       /*
+        * PcieCfgRegPl101 - Gen3 EQ FS and LF
+        * PcieCfgRegPl102 - Gen3 EQ Presets to Coefficients Mapping
+        * PcieCfgRegPl103 - Gen3 EQ Preset Index
+        * PcieCfgRegPl105 - Gen3 EQ Status
+        *
+        * Give initial EQ settings.
+        */
+       if (dd->pcidev->device == PCI_DEVICE_ID_INTEL0) { /* discrete */
+               /* 1000mV, FS=24, LF = 8 */
+               fs = 24;
+               lf = 8;
+               div = 3;
+               eq = discrete_preliminary_eq;
+               default_pset = DEFAULT_DISCRETE_PSET;
+       } else {
+               /* 400mV, FS=29, LF = 9 */
+               fs = 29;
+               lf = 9;
+               div = 1;
+               eq = integrated_preliminary_eq;
+               default_pset = DEFAULT_MCP_PSET;
+       }
+       pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL101,
+                              (fs <<
+                               PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_FS_SHIFT) |
+                              (lf <<
+                               PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_LF_SHIFT));
+       ret = load_eq_table(dd, eq, fs, div);
+       if (ret)
+               goto done;
+
+       /*
+        * PcieCfgRegPl106 - Gen3 EQ Control
+        *
+        * Set Gen3EqPsetReqVec, leave other fields 0.
+        */
+       if (pcie_pset == UNSET_PSET)
+               pcie_pset = default_pset;
+       if (pcie_pset > 10) {   /* valid range is 0-10, inclusive */
+               dd_dev_err(dd, "%s: Invalid Eq Pset %u, setting to %d\n",
+                          __func__, pcie_pset, default_pset);
+               pcie_pset = default_pset;
+       }
+       dd_dev_info(dd, "%s: using EQ Pset %u\n", __func__, pcie_pset);
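+       /*
+        * The request vector written below is one bit per preset; e.g. the
+        * discrete default Pset 2 yields a Gen3EqPsetReqVec of 0x004.
+        */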
+       pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL106,
+                              ((1 << pcie_pset) <<
+                       PCIE_CFG_REG_PL106_GEN3_EQ_PSET_REQ_VEC_SHIFT) |
+                       PCIE_CFG_REG_PL106_GEN3_EQ_EVAL2MS_DISABLE_SMASK |
+                       PCIE_CFG_REG_PL106_GEN3_EQ_PHASE23_EXIT_MODE_SMASK);
+
+       /*
+        * step 5b: Do post firmware download steps via SBus
+        */
+       dd_dev_info(dd, "%s: doing pcie post steps\n", __func__);
+       pcie_post_steps(dd);
+
+       /*
+        * step 5c: Program gasket interrupts
+        */
+       /* set the Rx Bit Rate to REFCLK ratio */
+       write_gasket_interrupt(dd, 0, 0x0006, 0x0050);
+       /* disable pCal for PCIe Gen3 RX equalization */
+       write_gasket_interrupt(dd, 1, 0x0026, 0x5b01);
+       /*
+        * Enable iCal for PCIe Gen3 RX equalization, and set which
+        * evaluation of RX_EQ_EVAL will launch the iCal procedure.
+        */
+       write_gasket_interrupt(dd, 2, 0x0026, 0x5202);
+       /* terminate list */
+       write_gasket_interrupt(dd, 3, 0x0000, 0x0000);
+
+       /*
+        * step 5d: program XMT margin
+        */
+       write_xmt_margin(dd, __func__);
+
+       /*
+        * step 5e: disable active state power management (ASPM). It
+        * will be enabled if required later
+        */
+       dd_dev_info(dd, "%s: clearing ASPM\n", __func__);
+       aspm_hw_disable_l1(dd);
+
+       /*
+        * step 5f: clear DirectSpeedChange
+        * PcieCfgRegPl67.DirectSpeedChange must be zero to prevent the
+        * change in the speed target from starting before we are ready.
+        * This field defaults to 0 and we are not changing it, so nothing
+        * needs to be done.
+        */
+
+       /* step 5g: Set target link speed */
+       /*
+        * Set target link speed to be target on both device and parent.
+        * On setting the parent: Some system BIOSs "helpfully" set the
+        * parent target speed to Gen2 to match the ASIC's initial speed.
+        * We can set the target to Gen3 because we have already checked
+        * that it is Gen3 capable earlier.
+        */
+       dd_dev_info(dd, "%s: setting parent target link speed\n", __func__);
+       pcie_capability_read_word(parent, PCI_EXP_LNKCTL2, &lnkctl2);
+       dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__,
+                   (u32)lnkctl2);
+       /* only write to parent if target is not as high as ours */
+       if ((lnkctl2 & LNKCTL2_TARGET_LINK_SPEED_MASK) < target_vector) {
+               lnkctl2 &= ~LNKCTL2_TARGET_LINK_SPEED_MASK;
+               lnkctl2 |= target_vector;
+               dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__,
+                           (u32)lnkctl2);
+               pcie_capability_write_word(parent, PCI_EXP_LNKCTL2, lnkctl2);
+       } else {
+               dd_dev_info(dd, "%s: ..target speed is OK\n", __func__);
+       }
+
+       dd_dev_info(dd, "%s: setting target link speed\n", __func__);
+       pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL2, &lnkctl2);
+       dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__,
+                   (u32)lnkctl2);
+       lnkctl2 &= ~LNKCTL2_TARGET_LINK_SPEED_MASK;
+       lnkctl2 |= target_vector;
+       dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__,
+                   (u32)lnkctl2);
+       pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL2, lnkctl2);
+
+       /* step 5h: arm gasket logic */
+       /* hold DC in reset across the SBR */
+       write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_DC_RESET_SMASK);
+       (void)read_csr(dd, CCE_DC_CTRL); /* DC reset hold */
+       /* save firmware control across the SBR */
+       fw_ctrl = read_csr(dd, MISC_CFG_FW_CTRL);
+
+       dd_dev_info(dd, "%s: arming gasket logic\n", __func__);
+       arm_gasket_logic(dd);
+
+       /*
+        * step 6: quiesce PCIe link
+        * The chip has already been reset, so there will be no traffic
+        * from the chip.  Linux has no easy way to enforce that it will
+        * not try to access the device, so we just need to hope it doesn't
+        * do it while we are doing the reset.
+        */
+
+       /*
+        * step 7: initiate the secondary bus reset (SBR)
+        * step 8: hardware brings the links back up
+        * step 9: wait for link speed transition to be complete
+        */
+       dd_dev_info(dd, "%s: calling trigger_sbr\n", __func__);
+       ret = trigger_sbr(dd);
+       if (ret)
+               goto done;
+
+       /* step 10: decide what to do next */
+
+       /* check if we can read PCI space */
+       ret = pci_read_config_word(dd->pcidev, PCI_VENDOR_ID, &vendor);
+       if (ret) {
+               dd_dev_info(dd,
+                           "%s: read of VendorID failed after SBR, err %d\n",
+                           __func__, ret);
+               return_error = 1;
+               goto done;
+       }
+       if (vendor == 0xffff) {
+               dd_dev_info(dd, "%s: VendorID is all 1s after SBR\n", __func__);
+               return_error = 1;
+               ret = -EIO;
+               goto done;
+       }
+
+       /* restore PCI space registers we know were reset */
+       dd_dev_info(dd, "%s: calling restore_pci_variables\n", __func__);
+       restore_pci_variables(dd);
+       /* restore firmware control */
+       write_csr(dd, MISC_CFG_FW_CTRL, fw_ctrl);
+
+       /*
+        * Check the gasket block status.
+        *
+        * This is the first CSR read after the SBR.  If the read returns
+        * all 1s (fails), the link did not make it back.
+        *
+        * Once we're sure we can read and write, clear the DC reset after
+        * the SBR.  Then check for any per-lane errors. Then look over
+        * the status.
+        */
+       reg = read_csr(dd, ASIC_PCIE_SD_HOST_STATUS);
+       dd_dev_info(dd, "%s: gasket block status: 0x%llx\n", __func__, reg);
+       if (reg == ~0ull) {     /* PCIe read failed/timeout */
+               dd_dev_err(dd, "SBR failed - unable to read from device\n");
+               return_error = 1;
+               ret = -ENOSYS;
+               goto done;
+       }
+
+       /* clear the DC reset */
+       write_csr(dd, CCE_DC_CTRL, 0);
+
+       /* Set the LED off */
+       setextled(dd, 0);
+
+       /* check for any per-lane errors */
+       pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE2, &reg32);
+       dd_dev_info(dd, "%s: per-lane errors: 0x%x\n", __func__, reg32);
+
+       /* extract status, look for our HFI */
+       status = (reg >> ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_SHIFT)
+                       & ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_MASK;
+       if ((status & (1 << dd->hfi1_id)) == 0) {
+               dd_dev_err(dd,
+                          "%s: gasket status 0x%x, expecting 0x%x\n",
+                          __func__, status, 1 << dd->hfi1_id);
+               ret = -EIO;
+               goto done;
+       }
+
+       /* extract error */
+       err = (reg >> ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_SHIFT)
+               & ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_MASK;
+       if (err) {
+               dd_dev_err(dd, "%s: gasket error %d\n", __func__, err);
+               ret = -EIO;
+               goto done;
+       }
+
+       /* update our link information cache */
+       update_lbus_info(dd);
+       dd_dev_info(dd, "%s: new speed and width: %s\n", __func__,
+                   dd->lbus_info);
+
+       if (dd->lbus_speed != target_speed) { /* not target */
+               /* maybe retry */
+               do_retry = retry_count < pcie_retry;
+               dd_dev_err(dd, "PCIe link speed did not switch to Gen%d%s\n",
+                          pcie_target, do_retry ? ", retrying" : "");
+               retry_count++;
+               if (do_retry) {
+                       msleep(100); /* allow time to settle */
+                       goto retry;
+               }
+               ret = -EIO;
+       }
+
+done:
+       if (therm) {
+               write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x1);
+               msleep(100);
+               dd_dev_info(dd, "%s: Re-enable therm polling\n",
+                           __func__);
+       }
+       release_chip_resource(dd, CR_SBUS);
+done_no_mutex:
+       /* return no error if it is OK to be at current speed */
+       if (ret && !return_error) {
+               dd_dev_err(dd, "Proceeding at current PCIe speed\n");
+               ret = 0;
+       }
+
+       dd_dev_info(dd, "%s: done\n", __func__);
+       return ret;
+}
diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c
new file mode 100644 (file)
index 0000000..d5edb1a
--- /dev/null
@@ -0,0 +1,2072 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/delay.h>
+#include "hfi.h"
+#include "qp.h"
+#include "trace.h"
+
+#define SC_CTXT_PACKET_EGRESS_TIMEOUT 350 /* in chip cycles */
+
+#define SC(name) SEND_CTXT_##name
+/*
+ * Send Context functions
+ */
+static void sc_wait_for_packet_egress(struct send_context *sc, int pause);
+
+/*
+ * Set the CM reset bit and wait for it to clear.  Use the provided
+ * sendctrl register.  This routine has no locking.
+ */
+void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl)
+{
+       write_csr(dd, SEND_CTRL, sendctrl | SEND_CTRL_CM_RESET_SMASK);
+       while (1) {
+               udelay(1);
+               sendctrl = read_csr(dd, SEND_CTRL);
+               if ((sendctrl & SEND_CTRL_CM_RESET_SMASK) == 0)
+                       break;
+       }
+}
+
+/* defined in header release 48 and higher */
+#ifndef SEND_CTRL_UNSUPPORTED_VL_SHIFT
+#define SEND_CTRL_UNSUPPORTED_VL_SHIFT 3
+#define SEND_CTRL_UNSUPPORTED_VL_MASK 0xffull
+#define SEND_CTRL_UNSUPPORTED_VL_SMASK (SEND_CTRL_UNSUPPORTED_VL_MASK \
+               << SEND_CTRL_UNSUPPORTED_VL_SHIFT)
+#endif
+
+/* global control of PIO send */
+void pio_send_control(struct hfi1_devdata *dd, int op)
+{
+       u64 reg, mask;
+       unsigned long flags;
+       int write = 1;  /* write sendctrl back */
+       int flush = 0;  /* re-read sendctrl to make sure it is flushed */
+
+       spin_lock_irqsave(&dd->sendctrl_lock, flags);
+
+       reg = read_csr(dd, SEND_CTRL);
+       switch (op) {
+       case PSC_GLOBAL_ENABLE:
+               reg |= SEND_CTRL_SEND_ENABLE_SMASK;
+       /* Fall through */
+       case PSC_DATA_VL_ENABLE:
+               /* Disallow sending on VLs not enabled */
+               mask = (((~0ull) << num_vls) & SEND_CTRL_UNSUPPORTED_VL_MASK) <<
+                               SEND_CTRL_UNSUPPORTED_VL_SHIFT;
+               reg = (reg & ~SEND_CTRL_UNSUPPORTED_VL_SMASK) | mask;
+               break;
+       case PSC_GLOBAL_DISABLE:
+               reg &= ~SEND_CTRL_SEND_ENABLE_SMASK;
+               break;
+       case PSC_GLOBAL_VLARB_ENABLE:
+               reg |= SEND_CTRL_VL_ARBITER_ENABLE_SMASK;
+               break;
+       case PSC_GLOBAL_VLARB_DISABLE:
+               reg &= ~SEND_CTRL_VL_ARBITER_ENABLE_SMASK;
+               break;
+       case PSC_CM_RESET:
+               __cm_reset(dd, reg);
+               write = 0; /* CSR already written (and flushed) */
+               break;
+       case PSC_DATA_VL_DISABLE:
+               reg |= SEND_CTRL_UNSUPPORTED_VL_SMASK;
+               flush = 1;
+               break;
+       default:
+               dd_dev_err(dd, "%s: invalid control %d\n", __func__, op);
+               break;
+       }
+
+       if (write) {
+               write_csr(dd, SEND_CTRL, reg);
+               if (flush)
+                       (void)read_csr(dd, SEND_CTRL); /* flush write */
+       }
+
+       spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+}
+
+/* number of send context memory pools */
+#define NUM_SC_POOLS 2
+
+/* Send Context Size (SCS) wildcards */
+#define SCS_POOL_0 -1
+#define SCS_POOL_1 -2
+
+/* Send Context Count (SCC) wildcards */
+#define SCC_PER_VL -1
+#define SCC_PER_CPU  -2
+#define SCC_PER_KRCVQ  -3
+
+/* Send Context Size (SCS) constants */
+#define SCS_ACK_CREDITS  32
+#define SCS_VL15_CREDITS 102   /* 3 pkts of 2048B data + 128B header */
+
+#define PIO_THRESHOLD_CEILING 4096
+
+#define PIO_WAIT_BATCH_SIZE 5
+
+/* default send context sizes */
+static struct sc_config_sizes sc_config_sizes[SC_MAX] = {
+       [SC_KERNEL] = { .size  = SCS_POOL_0,    /* even divide, pool 0 */
+                       .count = SCC_PER_VL },  /* one per NUMA */
+       [SC_ACK]    = { .size  = SCS_ACK_CREDITS,
+                       .count = SCC_PER_KRCVQ },
+       [SC_USER]   = { .size  = SCS_POOL_0,    /* even divide, pool 0 */
+                       .count = SCC_PER_CPU }, /* one per CPU */
+       [SC_VL15]   = { .size  = SCS_VL15_CREDITS,
+                       .count = 1 },
+
+};
+
+/* send context memory pool configuration */
+struct mem_pool_config {
+       int centipercent;       /* % of memory, in 100ths of 1% */
+       int absolute_blocks;    /* absolute block count */
+};
+
+/* default memory pool configuration: 100% in pool 0 */
+static struct mem_pool_config sc_mem_pool_config[NUM_SC_POOLS] = {
+       /* centi%, abs blocks */
+       {  10000,     -1 },             /* pool 0 */
+       {      0,     -1 },             /* pool 1 */
+};
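+
+/*
+ * For example, a 60/40 split across the two pools would be {  6000, -1 }
+ * and {  4000, -1 }; mixing centipercent and absolute blocks across pools
+ * is rejected in init_sc_pools_and_sizes().
+ */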
+
+/* memory pool information, used when calculating final sizes */
+struct mem_pool_info {
+       int centipercent;       /*
+                                * 100th of 1% of memory to use, -1 if blocks
+                                * already set
+                                */
+       int count;              /* count of contexts in the pool */
+       int blocks;             /* block size of the pool */
+       int size;               /* context size, in blocks */
+};
+
+/*
+ * Convert a pool wildcard to a valid pool index.  The wildcards
+ * start at -1 and increase negatively.  Map them as:
+ *     -1 => 0
+ *     -2 => 1
+ *     etc.
+ *
+ * Return -1 on non-wildcard input, otherwise convert to a pool number.
+ */
+static int wildcard_to_pool(int wc)
+{
+       if (wc >= 0)
+               return -1;      /* non-wildcard */
+       return -wc - 1;
+}
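+
+/*
+ * For example, the SCS_POOL_0 (-1) and SCS_POOL_1 (-2) size wildcards map
+ * to pool indices 0 and 1, while a real (non-negative) size returns -1
+ * and is treated as a fixed allocation.
+ */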
+
+static const char *sc_type_names[SC_MAX] = {
+       "kernel",
+       "ack",
+       "user",
+       "vl15"
+};
+
+static const char *sc_type_name(int index)
+{
+       if (index < 0 || index >= SC_MAX)
+               return "unknown";
+       return sc_type_names[index];
+}
+
+/*
+ * Read the send context memory pool configuration and send context
+ * size configuration.  Replace any wildcards and come up with final
+ * counts and sizes for the send context types.
+ */
+int init_sc_pools_and_sizes(struct hfi1_devdata *dd)
+{
+       struct mem_pool_info mem_pool_info[NUM_SC_POOLS] = { { 0 } };
+       int total_blocks = (dd->chip_pio_mem_size / PIO_BLOCK_SIZE) - 1;
+       int total_contexts = 0;
+       int fixed_blocks;
+       int pool_blocks;
+       int used_blocks;
+       int cp_total;           /* centipercent total */
+       int ab_total;           /* absolute block total */
+       int extra;
+       int i;
+
+       /*
+        * When SDMA is enabled, kernel context pio packet size is capped by
+        * "piothreshold". Reduce pio buffer allocation for kernel context by
+        * setting it to a fixed size. The allocation allows 3-deep buffering
+        * of the largest pio packets plus up to 128 bytes header, sufficient
+        * to maintain verbs performance.
+        *
+        * When SDMA is disabled, keep the default pooling allocation.
+        */
+       if (HFI1_CAP_IS_KSET(SDMA)) {
+               u16 max_pkt_size = (piothreshold < PIO_THRESHOLD_CEILING) ?
+                                        piothreshold : PIO_THRESHOLD_CEILING;
+               sc_config_sizes[SC_KERNEL].size =
+                       3 * (max_pkt_size + 128) / PIO_BLOCK_SIZE;
+       }
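+       /*
+        * For example, with piothreshold at or above the 4096-byte ceiling
+        * and 64-byte PIO blocks, this works out to
+        * 3 * (4096 + 128) / 64 = 198 blocks per kernel context.
+        */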
+
+       /*
+        * Step 0:
+        *      - copy the centipercents/absolute sizes from the pool config
+        *      - sanity check these values
+        *      - add up centipercents, then later check for full value
+        *      - add up absolute blocks, then later check for over-commit
+        */
+       cp_total = 0;
+       ab_total = 0;
+       for (i = 0; i < NUM_SC_POOLS; i++) {
+               int cp = sc_mem_pool_config[i].centipercent;
+               int ab = sc_mem_pool_config[i].absolute_blocks;
+
+               /*
+                * A negative value is "unused" or "invalid".  Both *can*
+                * be valid, but centipercent wins, so check that first
+                */
+               if (cp >= 0) {                  /* centipercent valid */
+                       cp_total += cp;
+               } else if (ab >= 0) {           /* absolute blocks valid */
+                       ab_total += ab;
+               } else {                        /* neither valid */
+                       dd_dev_err(
+                               dd,
+                               "Send context memory pool %d: both the block count and centipercent are invalid\n",
+                               i);
+                       return -EINVAL;
+               }
+
+               mem_pool_info[i].centipercent = cp;
+               mem_pool_info[i].blocks = ab;
+       }
+
+       /* do not use both % and absolute blocks for different pools */
+       if (cp_total != 0 && ab_total != 0) {
+               dd_dev_err(
+                       dd,
+                       "All send context memory pools must be described as either centipercent or blocks, no mixing between pools\n");
+               return -EINVAL;
+       }
+
+       /* if any percentages are present, they must add up to 100% x 100 */
+       if (cp_total != 0 && cp_total != 10000) {
+               dd_dev_err(
+                       dd,
+                       "Send context memory pool centipercent is %d, expecting 10000\n",
+                       cp_total);
+               return -EINVAL;
+       }
+
+       /* the absolute pool total cannot be more than the mem total */
+       if (ab_total > total_blocks) {
+               dd_dev_err(
+                       dd,
+                       "Send context memory pool absolute block count %d is larger than the memory size %d\n",
+                       ab_total, total_blocks);
+               return -EINVAL;
+       }
+
+       /*
+        * Step 2:
+        *      - copy from the context size config
+        *      - replace context type wildcard counts with real values
+        *      - add up non-memory pool block sizes
+        *      - add up memory pool user counts
+        */
+       fixed_blocks = 0;
+       for (i = 0; i < SC_MAX; i++) {
+               int count = sc_config_sizes[i].count;
+               int size = sc_config_sizes[i].size;
+               int pool;
+
+               /*
+                * Sanity check count: Either a positive value or
+                * one of the expected wildcards is valid.  The positive
+                * value is checked later when we compare against total
+                * memory available.
+                */
+               if (i == SC_ACK) {
+                       count = dd->n_krcv_queues;
+               } else if (i == SC_KERNEL) {
+                       count = INIT_SC_PER_VL * num_vls;
+               } else if (count == SCC_PER_CPU) {
+                       count = dd->num_rcv_contexts - dd->n_krcv_queues;
+               } else if (count < 0) {
+                       dd_dev_err(
+                               dd,
+                               "%s send context invalid count wildcard %d\n",
+                               sc_type_name(i), count);
+                       return -EINVAL;
+               }
+               if (total_contexts + count > dd->chip_send_contexts)
+                       count = dd->chip_send_contexts - total_contexts;
+
+               total_contexts += count;
+
+               /*
+                * Sanity check pool: The conversion will return a pool
+                * number or -1 if a fixed (non-negative) value.  The fixed
+                * value is checked later when we compare against
+                * total memory available.
+                */
+               pool = wildcard_to_pool(size);
+               if (pool == -1) {                       /* non-wildcard */
+                       fixed_blocks += size * count;
+               } else if (pool < NUM_SC_POOLS) {       /* valid wildcard */
+                       mem_pool_info[pool].count += count;
+               } else {                                /* invalid wildcard */
+                       dd_dev_err(
+                               dd,
+                               "%s send context invalid pool wildcard %d\n",
+                               sc_type_name(i), size);
+                       return -EINVAL;
+               }
+
+               dd->sc_sizes[i].count = count;
+               dd->sc_sizes[i].size = size;
+       }
+       if (fixed_blocks > total_blocks) {
+               dd_dev_err(
+                       dd,
+                       "Send context fixed block count, %u, larger than total block count %u\n",
+                       fixed_blocks, total_blocks);
+               return -EINVAL;
+       }
+
+       /* step 3: calculate the blocks in the pools, and pool context sizes */
+       pool_blocks = total_blocks - fixed_blocks;
+       if (ab_total > pool_blocks) {
+               dd_dev_err(
+                       dd,
+                       "Send context fixed pool sizes, %u, larger than pool block count %u\n",
+                       ab_total, pool_blocks);
+               return -EINVAL;
+       }
+       /* subtract off the fixed pool blocks */
+       pool_blocks -= ab_total;
+
+       for (i = 0; i < NUM_SC_POOLS; i++) {
+               struct mem_pool_info *pi = &mem_pool_info[i];
+
+               /* % beats absolute blocks */
+               if (pi->centipercent >= 0)
+                       pi->blocks = (pool_blocks * pi->centipercent) / 10000;
+
+               if (pi->blocks == 0 && pi->count != 0) {
+                       dd_dev_err(
+                               dd,
+                               "Send context memory pool %d has %u contexts, but no blocks\n",
+                               i, pi->count);
+                       return -EINVAL;
+               }
+               if (pi->count == 0) {
+                       /* warn about wasted blocks */
+                       if (pi->blocks != 0)
+                               dd_dev_err(
+                                       dd,
+                                       "Send context memory pool %d has %u blocks, but zero contexts\n",
+                                       i, pi->blocks);
+                       pi->size = 0;
+               } else {
+                       pi->size = pi->blocks / pi->count;
+               }
+       }
+
+       /* step 4: fill in the context type sizes from the pool sizes */
+       used_blocks = 0;
+       for (i = 0; i < SC_MAX; i++) {
+               if (dd->sc_sizes[i].size < 0) {
+                       unsigned pool = wildcard_to_pool(dd->sc_sizes[i].size);
+
+                       WARN_ON_ONCE(pool >= NUM_SC_POOLS);
+                       dd->sc_sizes[i].size = mem_pool_info[pool].size;
+               }
+               /* make sure we are not larger than what is allowed by the HW */
+#define PIO_MAX_BLOCKS 1024
+               if (dd->sc_sizes[i].size > PIO_MAX_BLOCKS)
+                       dd->sc_sizes[i].size = PIO_MAX_BLOCKS;
+
+               /* calculate our total usage */
+               used_blocks += dd->sc_sizes[i].size * dd->sc_sizes[i].count;
+       }
+       extra = total_blocks - used_blocks;
+       if (extra != 0)
+               dd_dev_info(dd, "unused send context blocks: %d\n", extra);
+
+       return total_contexts;
+}
+
+int init_send_contexts(struct hfi1_devdata *dd)
+{
+       u16 base;
+       int ret, i, j, context;
+
+       ret = init_credit_return(dd);
+       if (ret)
+               return ret;
+
+       dd->hw_to_sw = kmalloc_array(TXE_NUM_CONTEXTS, sizeof(u8),
+                                       GFP_KERNEL);
+       dd->send_contexts = kcalloc(dd->num_send_contexts,
+                                       sizeof(struct send_context_info),
+                                       GFP_KERNEL);
+       if (!dd->send_contexts || !dd->hw_to_sw) {
+               kfree(dd->hw_to_sw);
+               kfree(dd->send_contexts);
+               free_credit_return(dd);
+               return -ENOMEM;
+       }
+
+       /* hardware context map starts with invalid send context indices */
+       for (i = 0; i < TXE_NUM_CONTEXTS; i++)
+               dd->hw_to_sw[i] = INVALID_SCI;
+
+       /*
+        * All send contexts have their credit sizes.  Allocate credits
+        * for each context one after another from the global space.
+        */
+       context = 0;
+       base = 1;
+       for (i = 0; i < SC_MAX; i++) {
+               struct sc_config_sizes *scs = &dd->sc_sizes[i];
+
+               for (j = 0; j < scs->count; j++) {
+                       struct send_context_info *sci =
+                                               &dd->send_contexts[context];
+                       sci->type = i;
+                       sci->base = base;
+                       sci->credits = scs->size;
+
+                       context++;
+                       base += scs->size;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Allocate a software index and hardware context of the given type.
+ *
+ * Must be called with dd->sc_lock held.
+ */
+static int sc_hw_alloc(struct hfi1_devdata *dd, int type, u32 *sw_index,
+                      u32 *hw_context)
+{
+       struct send_context_info *sci;
+       u32 index;
+       u32 context;
+
+       for (index = 0, sci = &dd->send_contexts[0];
+                       index < dd->num_send_contexts; index++, sci++) {
+               if (sci->type == type && sci->allocated == 0) {
+                       sci->allocated = 1;
+                       /* 1:1 map, reversed so sw_index != hw_context */
+                       context = dd->chip_send_contexts - index - 1;
+                       dd->hw_to_sw[context] = index;
+                       *sw_index = index;
+                       *hw_context = context;
+                       return 0; /* success */
+               }
+       }
+       dd_dev_err(dd, "Unable to locate a free type %d send context\n", type);
+       return -ENOSPC;
+}
+
+/*
+ * Free the send context given by its software index.
+ *
+ * Must be called with dd->sc_lock held.
+ */
+static void sc_hw_free(struct hfi1_devdata *dd, u32 sw_index, u32 hw_context)
+{
+       struct send_context_info *sci;
+
+       sci = &dd->send_contexts[sw_index];
+       if (!sci->allocated) {
+               dd_dev_err(dd, "%s: sw_index %u not allocated? hw_context %u\n",
+                          __func__, sw_index, hw_context);
+       }
+       sci->allocated = 0;
+       dd->hw_to_sw[hw_context] = INVALID_SCI;
+}
+
+/* return the base context of a context in a group */
+static inline u32 group_context(u32 context, u32 group)
+{
+       return (context >> group) << group;
+}
+
+/* return the size of a group */
+static inline u32 group_size(u32 group)
+{
+       return 1 << group;
+}
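+
+/*
+ * For example, with group = 2 a group spans group_size(2) = 4 contexts,
+ * and contexts 8..11 all share group_context(ctxt, 2) = 8.
+ */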
+
+/*
+ * Obtain the credit return addresses, kernel virtual and physical, for the
+ * given sc.
+ *
+ * To understand this routine:
+ * o va and pa are arrays of struct credit_return.  One for each physical
+ *   send context, per NUMA.
+ * o Each send context always looks in its relative location in a struct
+ *   credit_return for its credit return.
+ * o Each send context in a group must have its return address CSR programmed
+ *   with the same value.  Use the address of the first send context in the
+ *   group.
+ */
+static void cr_group_addresses(struct send_context *sc, dma_addr_t *pa)
+{
+       u32 gc = group_context(sc->hw_context, sc->group);
+       u32 index = sc->hw_context & 0x7;
+
+       sc->hw_free = &sc->dd->cr_base[sc->node].va[gc].cr[index];
+       *pa = (unsigned long)
+              &((struct credit_return *)sc->dd->cr_base[sc->node].pa)[gc];
+}
+
+/*
+ * Work queue function triggered in error interrupt routine for
+ * kernel contexts.
+ */
+static void sc_halted(struct work_struct *work)
+{
+       struct send_context *sc;
+
+       sc = container_of(work, struct send_context, halt_work);
+       sc_restart(sc);
+}
+
+/*
+ * Calculate PIO block threshold for this send context using the given MTU.
+ * Trigger a return when one MTU plus optional header of credits remain.
+ *
+ * Parameter mtu is in bytes.
+ * Parameter hdrqentsize is in DWORDs.
+ *
+ * Return value is what to write into the CSR: trigger return when
+ * unreturned credits pass this count.
+ */
+u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize)
+{
+       u32 release_credits;
+       u32 threshold;
+
+       /* add in the header size, then divide by the PIO block size */
+       mtu += hdrqentsize << 2;
+       release_credits = DIV_ROUND_UP(mtu, PIO_BLOCK_SIZE);
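+       /*
+        * For example, a 2048-byte MTU with a 32-DWORD header queue entry
+        * and 64-byte PIO blocks gives release_credits =
+        * DIV_ROUND_UP(2048 + 128, 64) = 34, so the threshold becomes
+        * sc->credits - 34 (assuming the context has more than 34 credits).
+        */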
+
+       /* check against this context's credits */
+       if (sc->credits <= release_credits)
+               threshold = 1;
+       else
+               threshold = sc->credits - release_credits;
+
+       return threshold;
+}
+
+/*
+ * Calculate credit threshold in terms of percent of the allocated credits.
+ * Trigger when unreturned credits equal or exceed the percentage of the whole.
+ *
+ * Return value is what to write into the CSR: trigger return when
+ * unreturned credits pass this count.
+ */
+u32 sc_percent_to_threshold(struct send_context *sc, u32 percent)
+{
+       return (sc->credits * percent) / 100;
+}
+
+/*
+ * Set the credit return threshold.
+ */
+void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold)
+{
+       unsigned long flags;
+       u32 old_threshold;
+       int force_return = 0;
+
+       spin_lock_irqsave(&sc->credit_ctrl_lock, flags);
+
+       old_threshold = (sc->credit_ctrl >>
+                               SC(CREDIT_CTRL_THRESHOLD_SHIFT))
+                        & SC(CREDIT_CTRL_THRESHOLD_MASK);
+
+       if (new_threshold != old_threshold) {
+               sc->credit_ctrl =
+                       (sc->credit_ctrl
+                               & ~SC(CREDIT_CTRL_THRESHOLD_SMASK))
+                       | ((new_threshold
+                               & SC(CREDIT_CTRL_THRESHOLD_MASK))
+                          << SC(CREDIT_CTRL_THRESHOLD_SHIFT));
+               write_kctxt_csr(sc->dd, sc->hw_context,
+                               SC(CREDIT_CTRL), sc->credit_ctrl);
+
+               /* force a credit return on change to avoid a possible stall */
+               force_return = 1;
+       }
+
+       spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags);
+
+       if (force_return)
+               sc_return_credits(sc);
+}
+
+/*
+ * set_pio_integrity
+ *
+ * Set the CHECK_ENABLE register for the send context 'sc'.
+ */
+void set_pio_integrity(struct send_context *sc)
+{
+       struct hfi1_devdata *dd = sc->dd;
+       u64 reg = 0;
+       u32 hw_context = sc->hw_context;
+       int type = sc->type;
+
+       /*
+        * No integrity checks if HFI1_CAP_NO_INTEGRITY is set, or if
+        * we're snooping.
+        */
+       if (likely(!HFI1_CAP_IS_KSET(NO_INTEGRITY)) &&
+           dd->hfi1_snoop.mode_flag != HFI1_PORT_SNOOP_MODE)
+               reg = hfi1_pkt_default_send_ctxt_mask(dd, type);
+
+       write_kctxt_csr(dd, hw_context, SC(CHECK_ENABLE), reg);
+}
+
+static u32 get_buffers_allocated(struct send_context *sc)
+{
+       int cpu;
+       u32 ret = 0;
+
+       for_each_possible_cpu(cpu)
+               ret += *per_cpu_ptr(sc->buffers_allocated, cpu);
+       return ret;
+}
+
+static void reset_buffers_allocated(struct send_context *sc)
+{
+       int cpu;
+
+       for_each_possible_cpu(cpu)
+               (*per_cpu_ptr(sc->buffers_allocated, cpu)) = 0;
+}
+
+/*
+ * Allocate a NUMA relative send context structure of the given type along
+ * with a HW context.
+ */
+struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
+                             uint hdrqentsize, int numa)
+{
+       struct send_context_info *sci;
+       struct send_context *sc = NULL;
+       dma_addr_t pa;
+       unsigned long flags;
+       u64 reg;
+       u32 thresh;
+       u32 sw_index;
+       u32 hw_context;
+       int ret;
+       u8 opval, opmask;
+
+       /* do not allocate while frozen */
+       if (dd->flags & HFI1_FROZEN)
+               return NULL;
+
+       sc = kzalloc_node(sizeof(*sc), GFP_KERNEL, numa);
+       if (!sc)
+               return NULL;
+
+       sc->buffers_allocated = alloc_percpu(u32);
+       if (!sc->buffers_allocated) {
+               kfree(sc);
+               dd_dev_err(dd,
+                          "Cannot allocate buffers_allocated per cpu counters\n"
+                         );
+               return NULL;
+       }
+
+       spin_lock_irqsave(&dd->sc_lock, flags);
+       ret = sc_hw_alloc(dd, type, &sw_index, &hw_context);
+       if (ret) {
+               spin_unlock_irqrestore(&dd->sc_lock, flags);
+               free_percpu(sc->buffers_allocated);
+               kfree(sc);
+               return NULL;
+       }
+
+       sci = &dd->send_contexts[sw_index];
+       sci->sc = sc;
+
+       sc->dd = dd;
+       sc->node = numa;
+       sc->type = type;
+       spin_lock_init(&sc->alloc_lock);
+       spin_lock_init(&sc->release_lock);
+       spin_lock_init(&sc->credit_ctrl_lock);
+       INIT_LIST_HEAD(&sc->piowait);
+       INIT_WORK(&sc->halt_work, sc_halted);
+       init_waitqueue_head(&sc->halt_wait);
+
+       /* grouping is always single context for now */
+       sc->group = 0;
+
+       sc->sw_index = sw_index;
+       sc->hw_context = hw_context;
+       cr_group_addresses(sc, &pa);
+       sc->credits = sci->credits;
+
+/* PIO Send Memory Address details */
+#define PIO_ADDR_CONTEXT_MASK 0xfful
+#define PIO_ADDR_CONTEXT_SHIFT 16
+       sc->base_addr = dd->piobase + ((hw_context & PIO_ADDR_CONTEXT_MASK)
+                                       << PIO_ADDR_CONTEXT_SHIFT);
+
+       /* set base and credits */
+       reg = ((sci->credits & SC(CTRL_CTXT_DEPTH_MASK))
+                                       << SC(CTRL_CTXT_DEPTH_SHIFT))
+               | ((sci->base & SC(CTRL_CTXT_BASE_MASK))
+                                       << SC(CTRL_CTXT_BASE_SHIFT));
+       write_kctxt_csr(dd, hw_context, SC(CTRL), reg);
+
+       set_pio_integrity(sc);
+
+       /* unmask all errors */
+       write_kctxt_csr(dd, hw_context, SC(ERR_MASK), (u64)-1);
+
+       /* set the default partition key */
+       write_kctxt_csr(dd, hw_context, SC(CHECK_PARTITION_KEY),
+                       (SC(CHECK_PARTITION_KEY_VALUE_MASK) &
+                        DEFAULT_PKEY) <<
+                       SC(CHECK_PARTITION_KEY_VALUE_SHIFT));
+
+       /* per context type checks */
+       if (type == SC_USER) {
+               opval = USER_OPCODE_CHECK_VAL;
+               opmask = USER_OPCODE_CHECK_MASK;
+       } else {
+               opval = OPCODE_CHECK_VAL_DISABLED;
+               opmask = OPCODE_CHECK_MASK_DISABLED;
+       }
+
+       /* set the send context check opcode mask and value */
+       write_kctxt_csr(dd, hw_context, SC(CHECK_OPCODE),
+                       ((u64)opmask << SC(CHECK_OPCODE_MASK_SHIFT)) |
+                       ((u64)opval << SC(CHECK_OPCODE_VALUE_SHIFT)));
+
+       /* set up credit return */
+       reg = pa & SC(CREDIT_RETURN_ADDR_ADDRESS_SMASK);
+       write_kctxt_csr(dd, hw_context, SC(CREDIT_RETURN_ADDR), reg);
+
+       /*
+        * Calculate the initial credit return threshold.
+        *
+        * For Ack contexts, set a threshold for half the credits.
+        * For User contexts use the given percentage.  This has been
+        * sanitized on driver start-up.
+        * For Kernel contexts, use the default MTU plus a header
+        * or half the credits, whichever is smaller. This should
+        * work for both the 3-deep buffering allocation and the
+        * pooling allocation.
+        */
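+       /*
+        * Illustration (made-up numbers): a kernel context with 160 credits
+        * compares half the credits (80) against the MTU-based value from
+        * sc_mtu_to_threshold() and keeps the smaller of the two; an ACK
+        * context with the same 160 credits simply uses 80.
+        */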
+       if (type == SC_ACK) {
+               thresh = sc_percent_to_threshold(sc, 50);
+       } else if (type == SC_USER) {
+               thresh = sc_percent_to_threshold(sc,
+                                                user_credit_return_threshold);
+       } else { /* kernel */
+               thresh = min(sc_percent_to_threshold(sc, 50),
+                            sc_mtu_to_threshold(sc, hfi1_max_mtu,
+                                                hdrqentsize));
+       }
+       reg = thresh << SC(CREDIT_CTRL_THRESHOLD_SHIFT);
+       /* add in early return */
+       if (type == SC_USER && HFI1_CAP_IS_USET(EARLY_CREDIT_RETURN))
+               reg |= SC(CREDIT_CTRL_EARLY_RETURN_SMASK);
+       else if (HFI1_CAP_IS_KSET(EARLY_CREDIT_RETURN)) /* kernel, ack */
+               reg |= SC(CREDIT_CTRL_EARLY_RETURN_SMASK);
+
+       /* set up write-through credit_ctrl */
+       sc->credit_ctrl = reg;
+       write_kctxt_csr(dd, hw_context, SC(CREDIT_CTRL), reg);
+
+       /* User send contexts should not allow sending on VL15 */
+       if (type == SC_USER) {
+               reg = 1ULL << 15;
+               write_kctxt_csr(dd, hw_context, SC(CHECK_VL), reg);
+       }
+
+       spin_unlock_irqrestore(&dd->sc_lock, flags);
+
+       /*
+        * Allocate shadow ring to track outstanding PIO buffers _after_
+        * unlocking.  We don't know the size until the lock is held and
+        * we can't allocate while the lock is held.  No one is using
+        * the context yet, so allocate it now.
+        *
+        * User contexts do not get a shadow ring.
+        */
+       if (type != SC_USER) {
+               /*
+                * Size the shadow ring 1 larger than the number of credits
+                * so head == tail can mean empty.
+                */
+               sc->sr_size = sci->credits + 1;
+               sc->sr = kzalloc_node(sizeof(union pio_shadow_ring) *
+                               sc->sr_size, GFP_KERNEL, numa);
+               if (!sc->sr) {
+                       sc_free(sc);
+                       return NULL;
+               }
+       }
+
+       hfi1_cdbg(PIO,
+                 "Send context %u(%u) %s group %u credits %u credit_ctrl 0x%llx threshold %u\n",
+                 sw_index,
+                 hw_context,
+                 sc_type_name(type),
+                 sc->group,
+                 sc->credits,
+                 sc->credit_ctrl,
+                 thresh);
+
+       return sc;
+}
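+
+/*
+ * Typical allocation sequence (sketch, mirroring init_pervl_scs() below):
+ *
+ *     sc = sc_alloc(dd, SC_KERNEL, dd->rcd[0]->rcvhdrqentsize, dd->node);
+ *     if (!sc)
+ *             return -ENOMEM;
+ *     ...
+ *     sc_enable(sc);
+ *
+ * sc_alloc() only sets the context up; it is not usable for PIO buffer
+ * allocation until sc_enable() has run the PIO init engine and set
+ * SCF_ENABLED.
+ */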
+
+/* free a per-NUMA send context structure */
+void sc_free(struct send_context *sc)
+{
+       struct hfi1_devdata *dd;
+       unsigned long flags;
+       u32 sw_index;
+       u32 hw_context;
+
+       if (!sc)
+               return;
+
+       sc->flags |= SCF_IN_FREE;       /* ensure no restarts */
+       dd = sc->dd;
+       if (!list_empty(&sc->piowait))
+               dd_dev_err(dd, "piowait list not empty!\n");
+       sw_index = sc->sw_index;
+       hw_context = sc->hw_context;
+       sc_disable(sc); /* make sure the HW is disabled */
+       flush_work(&sc->halt_work);
+
+       spin_lock_irqsave(&dd->sc_lock, flags);
+       dd->send_contexts[sw_index].sc = NULL;
+
+       /* clear/disable all registers set in sc_alloc */
+       write_kctxt_csr(dd, hw_context, SC(CTRL), 0);
+       write_kctxt_csr(dd, hw_context, SC(CHECK_ENABLE), 0);
+       write_kctxt_csr(dd, hw_context, SC(ERR_MASK), 0);
+       write_kctxt_csr(dd, hw_context, SC(CHECK_PARTITION_KEY), 0);
+       write_kctxt_csr(dd, hw_context, SC(CHECK_OPCODE), 0);
+       write_kctxt_csr(dd, hw_context, SC(CREDIT_RETURN_ADDR), 0);
+       write_kctxt_csr(dd, hw_context, SC(CREDIT_CTRL), 0);
+
+       /* release the index and context for re-use */
+       sc_hw_free(dd, sw_index, hw_context);
+       spin_unlock_irqrestore(&dd->sc_lock, flags);
+
+       kfree(sc->sr);
+       free_percpu(sc->buffers_allocated);
+       kfree(sc);
+}
+
+/* disable the context */
+void sc_disable(struct send_context *sc)
+{
+       u64 reg;
+       unsigned long flags;
+       struct pio_buf *pbuf;
+
+       if (!sc)
+               return;
+
+       /* do all steps, even if already disabled */
+       spin_lock_irqsave(&sc->alloc_lock, flags);
+       reg = read_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL));
+       reg &= ~SC(CTRL_CTXT_ENABLE_SMASK);
+       sc->flags &= ~SCF_ENABLED;
+       sc_wait_for_packet_egress(sc, 1);
+       write_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL), reg);
+       spin_unlock_irqrestore(&sc->alloc_lock, flags);
+
+       /*
+        * Flush any waiters.  Once the context is disabled,
+        * credit return interrupts are stopped (although there
+        * could be one in-process when the context is disabled).
+        * Wait one microsecond for any lingering interrupts, then
+        * proceed with the flush.
+        */
+       udelay(1);
+       spin_lock_irqsave(&sc->release_lock, flags);
+       if (sc->sr) {   /* this context has a shadow ring */
+               while (sc->sr_tail != sc->sr_head) {
+                       pbuf = &sc->sr[sc->sr_tail].pbuf;
+                       if (pbuf->cb)
+                               (*pbuf->cb)(pbuf->arg, PRC_SC_DISABLE);
+                       sc->sr_tail++;
+                       if (sc->sr_tail >= sc->sr_size)
+                               sc->sr_tail = 0;
+               }
+       }
+       spin_unlock_irqrestore(&sc->release_lock, flags);
+}
+
+/* return SendEgressCtxtStatus.PacketOccupancy */
+#define packet_occupancy(r) \
+       (((r) & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SMASK)\
+       >> SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SHIFT)
+
+/* is egress halted on the context? */
+#define egress_halted(r) \
+       ((r) & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_HALT_STATUS_SMASK)
+
+/* wait for packet egress, optionally pause for credit return  */
+static void sc_wait_for_packet_egress(struct send_context *sc, int pause)
+{
+       struct hfi1_devdata *dd = sc->dd;
+       u64 reg = 0;
+       u64 reg_prev;
+       u32 loop = 0;
+
+       while (1) {
+               reg_prev = reg;
+               reg = read_csr(dd, sc->hw_context * 8 +
+                              SEND_EGRESS_CTXT_STATUS);
+               /* done if egress is stopped */
+               if (egress_halted(reg))
+                       break;
+               reg = packet_occupancy(reg);
+               if (reg == 0)
+                       break;
+               /* counter is reset if occupancy count changes */
+               if (reg != reg_prev)
+                       loop = 0;
+               if (loop > 500) {
+                       /* timed out - bounce the link */
+                       dd_dev_err(dd,
+                                  "%s: context %u(%u) timeout waiting for packets to egress, remaining count %u, bouncing link\n",
+                                  __func__, sc->sw_index,
+                                  sc->hw_context, (u32)reg);
+                       queue_work(dd->pport->hfi1_wq,
+                                  &dd->pport->link_bounce_work);
+                       break;
+               }
+               loop++;
+               udelay(1);
+       }
+
+       if (pause)
+               /* Add additional delay to ensure chip returns all credits */
+               pause_for_credit_return(dd);
+}
+
+void sc_wait(struct hfi1_devdata *dd)
+{
+       int i;
+
+       for (i = 0; i < dd->num_send_contexts; i++) {
+               struct send_context *sc = dd->send_contexts[i].sc;
+
+               if (!sc)
+                       continue;
+               sc_wait_for_packet_egress(sc, 0);
+       }
+}
+
+/*
+ * Restart a context after it has been halted due to error.
+ *
+ * If the first step (waiting for the halt to be asserted) fails, return
+ * early.  Otherwise complain about timeouts but keep going.
+ *
+ * It is expected that allocations (enabled flag bit) have been shut off
+ * already (only applies to kernel contexts).
+ */
+int sc_restart(struct send_context *sc)
+{
+       struct hfi1_devdata *dd = sc->dd;
+       u64 reg;
+       u32 loop;
+       int count;
+
+       /* bounce off if not halted, or being free'd */
+       if (!(sc->flags & SCF_HALTED) || (sc->flags & SCF_IN_FREE))
+               return -EINVAL;
+
+       dd_dev_info(dd, "restarting send context %u(%u)\n", sc->sw_index,
+                   sc->hw_context);
+
+       /*
+        * Step 1: Wait for the context to actually halt.
+        *
+        * The error interrupt is asynchronous to actually setting halt
+        * on the context.
+        */
+       loop = 0;
+       while (1) {
+               reg = read_kctxt_csr(dd, sc->hw_context, SC(STATUS));
+               if (reg & SC(STATUS_CTXT_HALTED_SMASK))
+                       break;
+               if (loop > 100) {
+                       dd_dev_err(dd, "%s: context %u(%u) not halting, skipping\n",
+                                  __func__, sc->sw_index, sc->hw_context);
+                       return -ETIME;
+               }
+               loop++;
+               udelay(1);
+       }
+
+       /*
+        * Step 2: Ensure no users are still trying to write to PIO.
+        *
+        * For kernel contexts, we have already turned off buffer allocation.
+        * Now wait for the buffer count to go to zero.
+        *
+        * For user contexts, the user handling code has cut off write access
+        * to the context's PIO pages before calling this routine and will
+        * restore write access after this routine returns.
+        */
+       if (sc->type != SC_USER) {
+               /* kernel context */
+               loop = 0;
+               while (1) {
+                       count = get_buffers_allocated(sc);
+                       if (count == 0)
+                               break;
+                       if (loop > 100) {
+                               dd_dev_err(dd,
+                                          "%s: context %u(%u) timeout waiting for PIO buffers to zero, remaining %d\n",
+                                          __func__, sc->sw_index,
+                                          sc->hw_context, count);
+                       }
+                       loop++;
+                       udelay(1);
+               }
+       }
+
+       /*
+        * Step 3: Wait for all packets to egress.
+        * This is done while disabling the send context
+        *
+        * Step 4: Disable the context
+        *
+        * This is a superset of the halt.  After the disable, the
+        * errors can be cleared.
+        */
+       sc_disable(sc);
+
+       /*
+        * Step 5: Enable the context
+        *
+        * This enable will clear the halted flag and per-send context
+        * error flags.
+        */
+       return sc_enable(sc);
+}
+
+/*
+ * PIO freeze processing.  To be called after the TXE block is fully frozen.
+ * Go through all frozen send contexts and disable them.  The contexts are
+ * already stopped by the freeze.
+ */
+void pio_freeze(struct hfi1_devdata *dd)
+{
+       struct send_context *sc;
+       int i;
+
+       for (i = 0; i < dd->num_send_contexts; i++) {
+               sc = dd->send_contexts[i].sc;
+               /*
+                * Don't disable unallocated, unfrozen, or user send contexts.
+                * User send contexts will be disabled when the process
+                * calls into the driver to reset its context.
+                */
+               if (!sc || !(sc->flags & SCF_FROZEN) || sc->type == SC_USER)
+                       continue;
+
+               /* only need to disable, the context is already stopped */
+               sc_disable(sc);
+       }
+}
+
+/*
+ * Unfreeze PIO for kernel send contexts.  The precondition for calling this
+ * is that all PIO send contexts have been disabled and the SPC freeze has
+ * been cleared.  Now perform the last step and re-enable each kernel context.
+ * User (PSM) processing will occur when PSM calls into the kernel to
+ * acknowledge the freeze.
+ */
+void pio_kernel_unfreeze(struct hfi1_devdata *dd)
+{
+       struct send_context *sc;
+       int i;
+
+       for (i = 0; i < dd->num_send_contexts; i++) {
+               sc = dd->send_contexts[i].sc;
+               if (!sc || !(sc->flags & SCF_FROZEN) || sc->type == SC_USER)
+                       continue;
+
+               sc_enable(sc);  /* will clear the sc frozen flag */
+       }
+}
+
+/*
+ * Wait for the SendPioInitCtxt.PioInitInProgress bit to clear.
+ * Returns:
+ *     -ETIMEDOUT - if we wait too long
+ *     -EIO       - if there was an error
+ */
+static int pio_init_wait_progress(struct hfi1_devdata *dd)
+{
+       u64 reg;
+       int max, count = 0;
+
+       /* max is the longest possible HW init time / delay */
+       max = (dd->icode == ICODE_FPGA_EMULATION) ? 120 : 5;
+       while (1) {
+               reg = read_csr(dd, SEND_PIO_INIT_CTXT);
+               if (!(reg & SEND_PIO_INIT_CTXT_PIO_INIT_IN_PROGRESS_SMASK))
+                       break;
+               if (count >= max)
+                       return -ETIMEDOUT;
+               udelay(5);
+               count++;
+       }
+
+       return reg & SEND_PIO_INIT_CTXT_PIO_INIT_ERR_SMASK ? -EIO : 0;
+}
+
+/*
+ * Reset all of the send contexts to their power-on state.  Used
+ * only during manual init - no lock against sc_enable needed.
+ */
+void pio_reset_all(struct hfi1_devdata *dd)
+{
+       int ret;
+
+       /* make sure the init engine is not busy */
+       ret = pio_init_wait_progress(dd);
+       /* ignore any timeout */
+       if (ret == -EIO) {
+               /* clear the error */
+               write_csr(dd, SEND_PIO_ERR_CLEAR,
+                         SEND_PIO_ERR_CLEAR_PIO_INIT_SM_IN_ERR_SMASK);
+       }
+
+       /* reset init all */
+       write_csr(dd, SEND_PIO_INIT_CTXT,
+                 SEND_PIO_INIT_CTXT_PIO_ALL_CTXT_INIT_SMASK);
+       udelay(2);
+       ret = pio_init_wait_progress(dd);
+       if (ret < 0) {
+               dd_dev_err(dd,
+                          "PIO send context init %s while initializing all PIO blocks\n",
+                          ret == -ETIMEDOUT ? "is stuck" : "had an error");
+       }
+}
+
+/* enable the context */
+int sc_enable(struct send_context *sc)
+{
+       u64 sc_ctrl, reg, pio;
+       struct hfi1_devdata *dd;
+       unsigned long flags;
+       int ret = 0;
+
+       if (!sc)
+               return -EINVAL;
+       dd = sc->dd;
+
+       /*
+        * Obtain the allocator lock to guard against any allocation
+        * attempts (which should not happen prior to context being
+        * enabled). On the release/disable side we don't need to
+        * worry about locking since the releaser will not do anything
+        * if the context accounting values have not changed.
+        */
+       spin_lock_irqsave(&sc->alloc_lock, flags);
+       sc_ctrl = read_kctxt_csr(dd, sc->hw_context, SC(CTRL));
+       if ((sc_ctrl & SC(CTRL_CTXT_ENABLE_SMASK)))
+               goto unlock; /* already enabled */
+
+       /* IMPORTANT: only clear free and fill if transitioning 0 -> 1 */
+
+       *sc->hw_free = 0;
+       sc->free = 0;
+       sc->alloc_free = 0;
+       sc->fill = 0;
+       sc->sr_head = 0;
+       sc->sr_tail = 0;
+       sc->flags = 0;
+       /* the alloc lock ensures no fast path allocation */
+       reset_buffers_allocated(sc);
+
+       /*
+        * Clear all per-context errors.  Some of these will be set when
+        * we are re-enabling after a context halt.  Now that the context
+        * is disabled, the halt will not clear until after the PIO init
+        * engine runs below.
+        */
+       reg = read_kctxt_csr(dd, sc->hw_context, SC(ERR_STATUS));
+       if (reg)
+               write_kctxt_csr(dd, sc->hw_context, SC(ERR_CLEAR), reg);
+
+       /*
+        * The HW PIO initialization engine can handle only one init
+        * request at a time. Serialize access to each device's engine.
+        */
+       spin_lock(&dd->sc_init_lock);
+       /*
+        * Since access to this code block is serialized and
+        * each access waits for the initialization to complete
+        * before releasing the lock, the PIO initialization engine
+        * should not be in use, so we don't have to wait for the
+        * InProgress bit to go down.
+        */
+       pio = ((sc->hw_context & SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_MASK) <<
+              SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_SHIFT) |
+               SEND_PIO_INIT_CTXT_PIO_SINGLE_CTXT_INIT_SMASK;
+       write_csr(dd, SEND_PIO_INIT_CTXT, pio);
+       /*
+        * Wait until the engine is done.  Give the chip the required time
+        * so, hopefully, we read the register just once.
+        */
+       udelay(2);
+       ret = pio_init_wait_progress(dd);
+       spin_unlock(&dd->sc_init_lock);
+       if (ret) {
+               dd_dev_err(dd,
+                          "sctxt%u(%u): Context not enabled due to init failure %d\n",
+                          sc->sw_index, sc->hw_context, ret);
+               goto unlock;
+       }
+
+       /*
+        * All is well. Enable the context.
+        */
+       sc_ctrl |= SC(CTRL_CTXT_ENABLE_SMASK);
+       write_kctxt_csr(dd, sc->hw_context, SC(CTRL), sc_ctrl);
+       /*
+        * Read SendCtxtCtrl to force the write out and prevent a timing
+        * hazard where a PIO write may reach the context before the enable.
+        */
+       read_kctxt_csr(dd, sc->hw_context, SC(CTRL));
+       sc->flags |= SCF_ENABLED;
+
+unlock:
+       spin_unlock_irqrestore(&sc->alloc_lock, flags);
+
+       return ret;
+}
+
+/* force a credit return on the context */
+void sc_return_credits(struct send_context *sc)
+{
+       if (!sc)
+               return;
+
+       /* a 0->1 transition schedules a credit return */
+       write_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE),
+                       SC(CREDIT_FORCE_FORCE_RETURN_SMASK));
+       /*
+        * Ensure that the write is flushed and the credit return is
+        * scheduled. We care more about the 0 -> 1 transition.
+        */
+       read_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE));
+       /* set back to 0 for next time */
+       write_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE), 0);
+}
+
+/* allow all in-flight packets to drain on the context */
+void sc_flush(struct send_context *sc)
+{
+       if (!sc)
+               return;
+
+       sc_wait_for_packet_egress(sc, 1);
+}
+
+/* drop all packets on the context, no waiting until they are sent */
+void sc_drop(struct send_context *sc)
+{
+       if (!sc)
+               return;
+
+       dd_dev_info(sc->dd, "%s: context %u(%u) - not implemented\n",
+                   __func__, sc->sw_index, sc->hw_context);
+}
+
+/*
+ * Start the software reaction to a context halt or SPC freeze:
+ *     - mark the context as halted or frozen
+ *     - stop buffer allocations
+ *
+ * Called from the error interrupt.  Other work is deferred until
+ * out of the interrupt.
+ */
+void sc_stop(struct send_context *sc, int flag)
+{
+       unsigned long flags;
+
+       /* mark the context */
+       sc->flags |= flag;
+
+       /* stop buffer allocations */
+       spin_lock_irqsave(&sc->alloc_lock, flags);
+       sc->flags &= ~SCF_ENABLED;
+       spin_unlock_irqrestore(&sc->alloc_lock, flags);
+       wake_up(&sc->halt_wait);
+}
+
+#define BLOCK_DWORDS (PIO_BLOCK_SIZE / sizeof(u32))
+#define dwords_to_blocks(x) DIV_ROUND_UP(x, BLOCK_DWORDS)
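+
+/*
+ * Example (assuming 64-byte PIO blocks): BLOCK_DWORDS is 16, so a 70-dword
+ * packet (PBC included) occupies dwords_to_blocks(70) = 5 blocks.
+ */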
+
+/*
+ * The send context buffer "allocator".
+ *
+ * @sc: the PIO send context we are allocating from
+ * @dw_len: length of whole packet - including PBC - in dwords
+ * @cb: optional callback to call when the buffer is finished sending
+ * @arg: argument for cb
+ *
+ * Return a pointer to a PIO buffer if successful, NULL if not enough room.
+ */
+struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len,
+                               pio_release_cb cb, void *arg)
+{
+       struct pio_buf *pbuf = NULL;
+       unsigned long flags;
+       unsigned long avail;
+       unsigned long blocks = dwords_to_blocks(dw_len);
+       unsigned long start_fill;
+       int trycount = 0;
+       u32 head, next;
+
+       spin_lock_irqsave(&sc->alloc_lock, flags);
+       if (!(sc->flags & SCF_ENABLED)) {
+               spin_unlock_irqrestore(&sc->alloc_lock, flags);
+               goto done;
+       }
+
+retry:
+       avail = (unsigned long)sc->credits - (sc->fill - sc->alloc_free);
+       if (blocks > avail) {
+               /* not enough room */
+               if (unlikely(trycount)) { /* already tried to get more room */
+                       spin_unlock_irqrestore(&sc->alloc_lock, flags);
+                       goto done;
+               }
+               /* copy from the releaser's cache line and recalculate */
+               sc->alloc_free = ACCESS_ONCE(sc->free);
+               avail =
+                       (unsigned long)sc->credits -
+                       (sc->fill - sc->alloc_free);
+               if (blocks > avail) {
+                       /* still no room, actively update */
+                       spin_unlock_irqrestore(&sc->alloc_lock, flags);
+                       sc_release_update(sc);
+                       spin_lock_irqsave(&sc->alloc_lock, flags);
+                       sc->alloc_free = ACCESS_ONCE(sc->free);
+                       trycount++;
+                       goto retry;
+               }
+       }
+
+       /* there is enough room */
+
+       preempt_disable();
+       this_cpu_inc(*sc->buffers_allocated);
+
+       /* read this once */
+       head = sc->sr_head;
+
+       /* "allocate" the buffer */
+       start_fill = sc->fill;
+       sc->fill += blocks;
+
+       /*
+        * Fill the parts that the releaser looks at before moving the head.
+        * The only necessary piece is the sent_at field.  The credits
+        * we have just allocated cannot have been returned yet, so the
+        * cb and arg will not be looked at for a "while".  Put them
+        * on this side of the memory barrier anyway.
+        */
+       pbuf = &sc->sr[head].pbuf;
+       pbuf->sent_at = sc->fill;
+       pbuf->cb = cb;
+       pbuf->arg = arg;
+       pbuf->sc = sc;  /* could be filled in at sc->sr init time */
+       /* make sure this is in memory before updating the head */
+
+       /* calculate next head index, do not store */
+       next = head + 1;
+       if (next >= sc->sr_size)
+               next = 0;
+       /*
+        * update the head - must be last! - the releaser can look at fields
+        * in pbuf once we move the head
+        */
+       smp_wmb();
+       sc->sr_head = next;
+       spin_unlock_irqrestore(&sc->alloc_lock, flags);
+
+       /* finish filling in the buffer outside the lock */
+       pbuf->start = sc->base_addr + ((start_fill % sc->credits)
+                                                       * PIO_BLOCK_SIZE);
+       pbuf->size = sc->credits * PIO_BLOCK_SIZE;
+       pbuf->end = sc->base_addr + pbuf->size;
+       pbuf->block_count = blocks;
+       pbuf->qw_written = 0;
+       pbuf->carry_bytes = 0;
+       pbuf->carry.val64 = 0;
+done:
+       return pbuf;
+}
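+
+/*
+ * Caller sketch (illustrative; done_cb and done_arg are hypothetical names,
+ * and real callers add PBC and payload handling):
+ *
+ *     pbuf = sc_buffer_alloc(sc, dw_len, done_cb, done_arg);
+ *     if (!pbuf)
+ *             ... queue on sc->piowait and retry later ...
+ *
+ * done_cb(done_arg, code) is invoked from sc_release_update() once the
+ * credits for this buffer have been returned (or from sc_disable() with
+ * PRC_SC_DISABLE).
+ */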
+
+/*
+ * There are at least two entities that can turn on credit return
+ * interrupts and they can overlap.  Avoid problems by implementing
+ * a count scheme that is enforced by a lock.  The lock is needed because
+ * the count and CSR write must be paired.
+ */
+
+/*
+ * Start credit return interrupts.  This is managed by a count.  If already
+ * on, just increment the count.
+ */
+void sc_add_credit_return_intr(struct send_context *sc)
+{
+       unsigned long flags;
+
+       /* lock must surround both the count change and the CSR update */
+       spin_lock_irqsave(&sc->credit_ctrl_lock, flags);
+       if (sc->credit_intr_count == 0) {
+               sc->credit_ctrl |= SC(CREDIT_CTRL_CREDIT_INTR_SMASK);
+               write_kctxt_csr(sc->dd, sc->hw_context,
+                               SC(CREDIT_CTRL), sc->credit_ctrl);
+       }
+       sc->credit_intr_count++;
+       spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags);
+}
+
+/*
+ * Stop credit return interrupts.  This is managed by a count.  Decrement the
+ * count, if the last user, then turn the credit interrupts off.
+ */
+void sc_del_credit_return_intr(struct send_context *sc)
+{
+       unsigned long flags;
+
+       WARN_ON(sc->credit_intr_count == 0);
+
+       /* lock must surround both the count change and the CSR update */
+       spin_lock_irqsave(&sc->credit_ctrl_lock, flags);
+       sc->credit_intr_count--;
+       if (sc->credit_intr_count == 0) {
+               sc->credit_ctrl &= ~SC(CREDIT_CTRL_CREDIT_INTR_SMASK);
+               write_kctxt_csr(sc->dd, sc->hw_context,
+                               SC(CREDIT_CTRL), sc->credit_ctrl);
+       }
+       spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags);
+}
+
+/*
+ * The caller must be careful when calling this.  All needint calls
+ * must be paired with !needint.
+ */
+void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint)
+{
+       if (needint)
+               sc_add_credit_return_intr(sc);
+       else
+               sc_del_credit_return_intr(sc);
+       trace_hfi1_wantpiointr(sc, needint, sc->credit_ctrl);
+       if (needint) {
+               mmiowb();
+               sc_return_credits(sc);
+       }
+}
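+
+/*
+ * Pairing sketch (see sc_piobufavail() below): a caller that turned the
+ * interrupt on with hfi1_sc_wantpiobuf_intr(sc, 1) when a QP had to wait
+ * must later turn it off with hfi1_sc_wantpiobuf_intr(sc, 0) once the
+ * waiter has been woken, keeping the on/off count balanced.
+ */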
+
+/**
+ * sc_piobufavail - callback when a PIO buffer is available
+ * @sc: the send context
+ *
+ * This is called from the interrupt handler when a PIO buffer is
+ * available after hfi1_verbs_send() returned an error that no buffers were
+ * available. Disable the interrupt if there are no more QPs waiting.
+ */
+static void sc_piobufavail(struct send_context *sc)
+{
+       struct hfi1_devdata *dd = sc->dd;
+       struct hfi1_ibdev *dev = &dd->verbs_dev;
+       struct list_head *list;
+       struct rvt_qp *qps[PIO_WAIT_BATCH_SIZE];
+       struct rvt_qp *qp;
+       struct hfi1_qp_priv *priv;
+       unsigned long flags;
+       unsigned i, n = 0;
+
+       if (dd->send_contexts[sc->sw_index].type != SC_KERNEL &&
+           dd->send_contexts[sc->sw_index].type != SC_VL15)
+               return;
+       list = &sc->piowait;
+       /*
+        * Note: checking that the piowait list is empty and clearing
+        * the buffer available interrupt needs to be atomic or we
+        * could end up with QPs on the wait list with the interrupt
+        * disabled.
+        */
+       write_seqlock_irqsave(&dev->iowait_lock, flags);
+       while (!list_empty(list)) {
+               struct iowait *wait;
+
+               if (n == ARRAY_SIZE(qps))
+                       break;
+               wait = list_first_entry(list, struct iowait, list);
+               qp = iowait_to_qp(wait);
+               priv = qp->priv;
+               list_del_init(&priv->s_iowait.list);
+               /* refcount held until actual wake up */
+               qps[n++] = qp;
+       }
+       /*
+        * If there had been waiters and there are still more,
+        * ensure that we redo the force to avoid a potential hang.
+        */
+       if (n) {
+               hfi1_sc_wantpiobuf_intr(sc, 0);
+               if (!list_empty(list))
+                       hfi1_sc_wantpiobuf_intr(sc, 1);
+       }
+       write_sequnlock_irqrestore(&dev->iowait_lock, flags);
+
+       for (i = 0; i < n; i++)
+               hfi1_qp_wakeup(qps[i],
+                              RVT_S_WAIT_PIO | RVT_S_WAIT_PIO_DRAIN);
+}
+
+/* translate a send credit update to a bit code of reasons */
+static inline int fill_code(u64 hw_free)
+{
+       int code = 0;
+
+       if (hw_free & CR_STATUS_SMASK)
+               code |= PRC_STATUS_ERR;
+       if (hw_free & CR_CREDIT_RETURN_DUE_TO_PBC_SMASK)
+               code |= PRC_PBC;
+       if (hw_free & CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SMASK)
+               code |= PRC_THRESHOLD;
+       if (hw_free & CR_CREDIT_RETURN_DUE_TO_ERR_SMASK)
+               code |= PRC_FILL_ERR;
+       if (hw_free & CR_CREDIT_RETURN_DUE_TO_FORCE_SMASK)
+               code |= PRC_SC_DISABLE;
+       return code;
+}
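+
+/*
+ * For example, a credit return triggered by both the threshold and a PBC
+ * request would yield PRC_THRESHOLD | PRC_PBC (0x06).
+ */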
+
+/* use the jiffies compare to get the wrap right */
+#define sent_before(a, b) time_before(a, b)    /* a < b */
+
+/*
+ * The send context buffer "releaser".
+ */
+void sc_release_update(struct send_context *sc)
+{
+       struct pio_buf *pbuf;
+       u64 hw_free;
+       u32 head, tail;
+       unsigned long old_free;
+       unsigned long free;
+       unsigned long extra;
+       unsigned long flags;
+       int code;
+
+       if (!sc)
+               return;
+
+       spin_lock_irqsave(&sc->release_lock, flags);
+       /* update free */
+       hw_free = le64_to_cpu(*sc->hw_free);            /* volatile read */
+       old_free = sc->free;
+       extra = (((hw_free & CR_COUNTER_SMASK) >> CR_COUNTER_SHIFT)
+                       - (old_free & CR_COUNTER_MASK))
+                               & CR_COUNTER_MASK;
+       free = old_free + extra;
+       trace_hfi1_piofree(sc, extra);
+
+       /* call sent buffer callbacks */
+       code = -1;                              /* code not yet set */
+       head = ACCESS_ONCE(sc->sr_head);        /* snapshot the head */
+       tail = sc->sr_tail;
+       while (head != tail) {
+               pbuf = &sc->sr[tail].pbuf;
+
+               if (sent_before(free, pbuf->sent_at)) {
+                       /* not sent yet */
+                       break;
+               }
+               if (pbuf->cb) {
+                       if (code < 0) /* fill in code on first user */
+                               code = fill_code(hw_free);
+                       (*pbuf->cb)(pbuf->arg, code);
+               }
+
+               tail++;
+               if (tail >= sc->sr_size)
+                       tail = 0;
+       }
+       sc->sr_tail = tail;
+       /* make sure tail is updated before free */
+       smp_wmb();
+       sc->free = free;
+       spin_unlock_irqrestore(&sc->release_lock, flags);
+       sc_piobufavail(sc);
+}
+
+/*
+ * Send context group releaser.  Argument is the send context that caused
+ * the interrupt.  Called from the send context interrupt handler.
+ *
+ * Call release on all contexts in the group.
+ *
+ * This routine takes the sc_lock without an irqsave because it is only
+ * called from an interrupt handler.  Adjust if that changes.
+ */
+void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context)
+{
+       struct send_context *sc;
+       u32 sw_index;
+       u32 gc, gc_end;
+
+       spin_lock(&dd->sc_lock);
+       sw_index = dd->hw_to_sw[hw_context];
+       if (unlikely(sw_index >= dd->num_send_contexts)) {
+               dd_dev_err(dd, "%s: invalid hw (%u) to sw (%u) mapping\n",
+                          __func__, hw_context, sw_index);
+               goto done;
+       }
+       sc = dd->send_contexts[sw_index].sc;
+       if (unlikely(!sc))
+               goto done;
+
+       gc = group_context(hw_context, sc->group);
+       gc_end = gc + group_size(sc->group);
+       for (; gc < gc_end; gc++) {
+               sw_index = dd->hw_to_sw[gc];
+               if (unlikely(sw_index >= dd->num_send_contexts)) {
+                       dd_dev_err(dd,
+                                  "%s: invalid hw (%u) to sw (%u) mapping\n",
+                                  __func__, hw_context, sw_index);
+                       continue;
+               }
+               sc_release_update(dd->send_contexts[sw_index].sc);
+       }
+done:
+       spin_unlock(&dd->sc_lock);
+}
+
+/*
+ * pio_select_send_context_vl() - select send context
+ * @dd: devdata
+ * @selector: a spreading factor
+ * @vl: this vl
+ *
+ * This function returns a send context based on the selector and a vl.
+ * The mapping fields are protected by RCU
+ */
+struct send_context *pio_select_send_context_vl(struct hfi1_devdata *dd,
+                                               u32 selector, u8 vl)
+{
+       struct pio_vl_map *m;
+       struct pio_map_elem *e;
+       struct send_context *rval;
+
+       /*
+        * NOTE: this should only happen if SC->VL changed after the initial
+        * checks on the QP/AH.
+        * The default below will return VL0's send context.
+        */
+       if (unlikely(vl >= num_vls)) {
+               rval = NULL;
+               goto done;
+       }
+
+       rcu_read_lock();
+       m = rcu_dereference(dd->pio_map);
+       if (unlikely(!m)) {
+               rcu_read_unlock();
+               return dd->vld[0].sc;
+       }
+       e = m->map[vl & m->mask];
+       rval = e->ksc[selector & e->mask];
+       rcu_read_unlock();
+
+done:
+       rval = !rval ? dd->vld[0].sc : rval;
+       return rval;
+}
+
+/*
+ * pio_select_send_context_sc() - select send context
+ * @dd: devdata
+ * @selector: a spreading factor
+ * @sc5: the 5 bit sc
+ *
+ * This function returns a send context based on the selector and an sc.
+ */
+struct send_context *pio_select_send_context_sc(struct hfi1_devdata *dd,
+                                               u32 selector, u8 sc5)
+{
+       u8 vl = sc_to_vlt(dd, sc5);
+
+       return pio_select_send_context_vl(dd, selector, vl);
+}
+
+/*
+ * Free the indicated map struct
+ */
+static void pio_map_free(struct pio_vl_map *m)
+{
+       int i;
+
+       for (i = 0; m && i < m->actual_vls; i++)
+               kfree(m->map[i]);
+       kfree(m);
+}
+
+/*
+ * Handle RCU callback
+ */
+static void pio_map_rcu_callback(struct rcu_head *list)
+{
+       struct pio_vl_map *m = container_of(list, struct pio_vl_map, list);
+
+       pio_map_free(m);
+}
+
+/*
+ * pio_map_init - called when #vls change
+ * @dd: hfi1_devdata
+ * @port: port number
+ * @num_vls: number of vls
+ * @vl_scontexts: per vl send context mapping (optional)
+ *
+ * This routine changes the mapping based on the number of vls.
+ *
+ * vl_scontexts is used to specify a non-uniform vl/send context
+ * loading. NULL implies auto computing the loading and giving each
+ * VL a uniform distribution of send contexts per VL.
+ *
+ * The auto algorithm computes the sc_per_vl and the number of extra
+ * send contexts. Any extra send contexts are added from the last VL
+ * on down.
+ *
+ * rcu locking is used here to control access to the mapping fields.
+ *
+ * If either the num_vls or num_send_contexts are non-power of 2, the
+ * array sizes in the struct pio_vl_map and the struct pio_map_elem are
+ * rounded up to the next highest power of 2 and the first entry is
+ * reused in a round robin fashion.
+ *
+ * If an error occurs, the map change is not done and the mapping is not
+ * changed.
+ *
+ */
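+
+/*
+ * Auto-distribution example (hypothetical counts): with 8 kernel send
+ * contexts and num_vls = 3, sc_per_vl = 2 and extra = 2, so the extras go
+ * to the last VLs and vl_scontexts ends up as { 2, 3, 3 }.
+ */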
+int pio_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, u8 *vl_scontexts)
+{
+       int i, j;
+       int extra, sc_per_vl;
+       int scontext = 1;
+       int num_kernel_send_contexts = 0;
+       u8 lvl_scontexts[OPA_MAX_VLS];
+       struct pio_vl_map *oldmap, *newmap;
+
+       if (!vl_scontexts) {
+               for (i = 0; i < dd->num_send_contexts; i++)
+                       if (dd->send_contexts[i].type == SC_KERNEL)
+                               num_kernel_send_contexts++;
+               /* truncate divide */
+               sc_per_vl = num_kernel_send_contexts / num_vls;
+               /* extras */
+               extra = num_kernel_send_contexts % num_vls;
+               vl_scontexts = lvl_scontexts;
+               /* add extras from last vl down */
+               for (i = num_vls - 1; i >= 0; i--, extra--)
+                       vl_scontexts[i] = sc_per_vl + (extra > 0 ? 1 : 0);
+       }
+       /* build new map */
+       newmap = kzalloc(sizeof(*newmap) +
+                        roundup_pow_of_two(num_vls) *
+                        sizeof(struct pio_map_elem *),
+                        GFP_KERNEL);
+       if (!newmap)
+               goto bail;
+       newmap->actual_vls = num_vls;
+       newmap->vls = roundup_pow_of_two(num_vls);
+       newmap->mask = (1 << ilog2(newmap->vls)) - 1;
+       for (i = 0; i < newmap->vls; i++) {
+               /* save for wrap around */
+               int first_scontext = scontext;
+
+               if (i < newmap->actual_vls) {
+                       int sz = roundup_pow_of_two(vl_scontexts[i]);
+
+                       /* only allocate once */
+                       newmap->map[i] = kzalloc(sizeof(*newmap->map[i]) +
+                                                sz * sizeof(struct
+                                                            send_context *),
+                                                GFP_KERNEL);
+                       if (!newmap->map[i])
+                               goto bail;
+                       newmap->map[i]->mask = (1 << ilog2(sz)) - 1;
+                       /* assign send contexts */
+                       for (j = 0; j < sz; j++) {
+                               if (dd->kernel_send_context[scontext])
+                                       newmap->map[i]->ksc[j] =
+                                       dd->kernel_send_context[scontext];
+                               if (++scontext >= first_scontext +
+                                                 vl_scontexts[i])
+                                       /* wrap back to first send context */
+                                       scontext = first_scontext;
+                       }
+               } else {
+                       /* just re-use entry without allocating */
+                       newmap->map[i] = newmap->map[i % num_vls];
+               }
+               scontext = first_scontext + vl_scontexts[i];
+       }
+       /* newmap in hand, save old map */
+       spin_lock_irq(&dd->pio_map_lock);
+       oldmap = rcu_dereference_protected(dd->pio_map,
+                                          lockdep_is_held(&dd->pio_map_lock));
+
+       /* publish newmap */
+       rcu_assign_pointer(dd->pio_map, newmap);
+
+       spin_unlock_irq(&dd->pio_map_lock);
+       /* success, free any old map after grace period */
+       if (oldmap)
+               call_rcu(&oldmap->list, pio_map_rcu_callback);
+       return 0;
+bail:
+       /* free any partial allocation */
+       pio_map_free(newmap);
+       return -ENOMEM;
+}
+
+void free_pio_map(struct hfi1_devdata *dd)
+{
+       /* Free PIO map if allocated */
+       if (rcu_access_pointer(dd->pio_map)) {
+               spin_lock_irq(&dd->pio_map_lock);
+               pio_map_free(rcu_access_pointer(dd->pio_map));
+               RCU_INIT_POINTER(dd->pio_map, NULL);
+               spin_unlock_irq(&dd->pio_map_lock);
+               synchronize_rcu();
+       }
+       kfree(dd->kernel_send_context);
+       dd->kernel_send_context = NULL;
+}
+
+int init_pervl_scs(struct hfi1_devdata *dd)
+{
+       int i;
+       u64 mask, all_vl_mask = (u64)0x80ff; /* VLs 0-7, 15 */
+       u64 data_vls_mask = (u64)0x00ff; /* VLs 0-7 */
+       u32 ctxt;
+       struct hfi1_pportdata *ppd = dd->pport;
+
+       dd->vld[15].sc = sc_alloc(dd, SC_VL15,
+                                 dd->rcd[0]->rcvhdrqentsize, dd->node);
+       if (!dd->vld[15].sc)
+               goto nomem;
+       hfi1_init_ctxt(dd->vld[15].sc);
+       dd->vld[15].mtu = enum_to_mtu(OPA_MTU_2048);
+
+       /* zeroed so the error path can safely sc_free() unset entries */
+       dd->kernel_send_context = kzalloc_node(dd->num_send_contexts *
+                                       sizeof(struct send_context *),
+                                       GFP_KERNEL, dd->node);
+       if (!dd->kernel_send_context)
+               goto nomem;
+       dd->kernel_send_context[0] = dd->vld[15].sc;
+
+       for (i = 0; i < num_vls; i++) {
+               /*
+                * Since this function does not deal with a specific
+                * receive context but we need the RcvHdrQ entry size,
+                * use the size from rcd[0]. It is guaranteed to be
+                * valid at this point and will remain the same for all
+                * receive contexts.
+                */
+               dd->vld[i].sc = sc_alloc(dd, SC_KERNEL,
+                                        dd->rcd[0]->rcvhdrqentsize, dd->node);
+               if (!dd->vld[i].sc)
+                       goto nomem;
+               dd->kernel_send_context[i + 1] = dd->vld[i].sc;
+               hfi1_init_ctxt(dd->vld[i].sc);
+               /* non VL15 start with the max MTU */
+               dd->vld[i].mtu = hfi1_max_mtu;
+       }
+       for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++) {
+               dd->kernel_send_context[i + 1] =
+               sc_alloc(dd, SC_KERNEL, dd->rcd[0]->rcvhdrqentsize, dd->node);
+               if (!dd->kernel_send_context[i + 1])
+                       goto nomem;
+               hfi1_init_ctxt(dd->kernel_send_context[i + 1]);
+       }
+
+       sc_enable(dd->vld[15].sc);
+       ctxt = dd->vld[15].sc->hw_context;
+       mask = all_vl_mask & ~(1LL << 15);
+       write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask);
+       dd_dev_info(dd,
+                   "Using send context %u(%u) for VL15\n",
+                   dd->vld[15].sc->sw_index, ctxt);
+
+       for (i = 0; i < num_vls; i++) {
+               sc_enable(dd->vld[i].sc);
+               ctxt = dd->vld[i].sc->hw_context;
+               mask = all_vl_mask & ~(data_vls_mask);
+               write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask);
+       }
+       for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++) {
+               sc_enable(dd->kernel_send_context[i + 1]);
+               ctxt = dd->kernel_send_context[i + 1]->hw_context;
+               mask = all_vl_mask & ~(data_vls_mask);
+               write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask);
+       }
+
+       if (pio_map_init(dd, ppd->port - 1, num_vls, NULL))
+               goto nomem;
+       return 0;
+nomem:
+       sc_free(dd->vld[15].sc);
+       for (i = 0; i < num_vls; i++)
+               sc_free(dd->vld[i].sc);
+       if (dd->kernel_send_context) {
+               for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++)
+                       sc_free(dd->kernel_send_context[i + 1]);
+               kfree(dd->kernel_send_context);
+               dd->kernel_send_context = NULL;
+       }
+       return -ENOMEM;
+}
+
+int init_credit_return(struct hfi1_devdata *dd)
+{
+       int ret;
+       int num_numa;
+       int i;
+
+       num_numa = num_online_nodes();
+       /* enforce the expectation that the numas are compact */
+       for (i = 0; i < num_numa; i++) {
+               if (!node_online(i)) {
+                       dd_dev_err(dd, "NUMA nodes are not compact\n");
+                       ret = -EINVAL;
+                       goto done;
+               }
+       }
+
+       dd->cr_base = kcalloc(
+               num_numa,
+               sizeof(struct credit_return_base),
+               GFP_KERNEL);
+       if (!dd->cr_base) {
+               dd_dev_err(dd, "Unable to allocate credit return base\n");
+               ret = -ENOMEM;
+               goto done;
+       }
+       for (i = 0; i < num_numa; i++) {
+               int bytes = TXE_NUM_CONTEXTS * sizeof(struct credit_return);
+
+               set_dev_node(&dd->pcidev->dev, i);
+               dd->cr_base[i].va = dma_zalloc_coherent(
+                                       &dd->pcidev->dev,
+                                       bytes,
+                                       &dd->cr_base[i].pa,
+                                       GFP_KERNEL);
+               if (!dd->cr_base[i].va) {
+                       set_dev_node(&dd->pcidev->dev, dd->node);
+                       dd_dev_err(dd,
+                                  "Unable to allocate credit return DMA range for NUMA %d\n",
+                                  i);
+                       ret = -ENOMEM;
+                       goto done;
+               }
+       }
+       set_dev_node(&dd->pcidev->dev, dd->node);
+
+       ret = 0;
+done:
+       return ret;
+}
+
+void free_credit_return(struct hfi1_devdata *dd)
+{
+       int num_numa;
+       int i;
+
+       if (!dd->cr_base)
+               return;
+
+       num_numa = num_online_nodes();
+       for (i = 0; i < num_numa; i++) {
+               if (dd->cr_base[i].va) {
+                       dma_free_coherent(&dd->pcidev->dev,
+                                         TXE_NUM_CONTEXTS *
+                                         sizeof(struct credit_return),
+                                         dd->cr_base[i].va,
+                                         dd->cr_base[i].pa);
+               }
+       }
+       kfree(dd->cr_base);
+       dd->cr_base = NULL;
+}
diff --git a/drivers/infiniband/hw/hfi1/pio.h b/drivers/infiniband/hw/hfi1/pio.h
new file mode 100644 (file)
index 0000000..464cbd2
--- /dev/null
@@ -0,0 +1,328 @@
+#ifndef _PIO_H
+#define _PIO_H
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/* send context types */
+#define SC_KERNEL 0
+#define SC_VL15   1
+#define SC_ACK    2
+#define SC_USER   3    /* must be the last one: it may take all that is left */
+#define SC_MAX    4    /* count of send context types */
+
+/* invalid send context index */
+#define INVALID_SCI 0xff
+
+/* PIO buffer release callback function */
+typedef void (*pio_release_cb)(void *arg, int code);
+
+/* PIO release codes - in bits, as there could more than one that apply */
+#define PRC_OK         0       /* no known error */
+#define PRC_STATUS_ERR 0x01    /* credit return due to status error */
+#define PRC_PBC                0x02    /* credit return due to PBC */
+#define PRC_THRESHOLD  0x04    /* credit return due to threshold */
+#define PRC_FILL_ERR   0x08    /* credit return due to fill error */
+#define PRC_FORCE      0x10    /* credit return due to credit force */
+#define PRC_SC_DISABLE 0x20    /* clean-up after a context disable */
+
+/* byte helper */
+union mix {
+       u64 val64;
+       u32 val32[2];
+       u8  val8[8];
+};
+
+/* an allocated PIO buffer */
+struct pio_buf {
+       struct send_context *sc;/* back pointer to owning send context */
+       pio_release_cb cb;      /* called when the buffer is released */
+       void *arg;              /* argument for cb */
+       void __iomem *start;    /* buffer start address */
+       void __iomem *end;      /* context end address */
+       unsigned long size;     /* context size, in bytes */
+       unsigned long sent_at;  /* buffer is sent when <= free */
+       u32 block_count;        /* size of buffer, in blocks */
+       u32 qw_written;         /* QW written so far */
+       u32 carry_bytes;        /* number of valid bytes in carry */
+       union mix carry;        /* pending unwritten bytes */
+};
+
+/* cache line aligned pio buffer array */
+union pio_shadow_ring {
+       struct pio_buf pbuf;
+       u64 unused[16];         /* cache line spacer */
+} ____cacheline_aligned;
+
+/* per-NUMA send context */
+struct send_context {
+       /* read-only after init */
+       struct hfi1_devdata *dd;                /* device */
+       void __iomem *base_addr;        /* start of PIO memory */
+       union pio_shadow_ring *sr;      /* shadow ring */
+
+       volatile __le64 *hw_free;       /* HW free counter */
+       struct work_struct halt_work;   /* halted context work queue entry */
+       unsigned long flags;            /* flags */
+       int node;                       /* context home node */
+       int type;                       /* context type */
+       u32 sw_index;                   /* software index number */
+       u32 hw_context;                 /* hardware context number */
+       u32 credits;                    /* number of blocks in context */
+       u32 sr_size;                    /* size of the shadow ring */
+       u32 group;                      /* credit return group */
+       /* allocator fields */
+       spinlock_t alloc_lock ____cacheline_aligned_in_smp;
+       unsigned long fill;             /* official alloc count */
+       unsigned long alloc_free;       /* copy of free (less cache thrash) */
+       u32 sr_head;                    /* shadow ring head */
+       /* releaser fields */
+       spinlock_t release_lock ____cacheline_aligned_in_smp;
+       unsigned long free;             /* official free count */
+       u32 sr_tail;                    /* shadow ring tail */
+       /* list for PIO waiters */
+       struct list_head piowait  ____cacheline_aligned_in_smp;
+       spinlock_t credit_ctrl_lock ____cacheline_aligned_in_smp;
+       u64 credit_ctrl;                /* cache for credit control */
+       u32 credit_intr_count;          /* count of credit intr users */
+       u32 __percpu *buffers_allocated;/* count of buffers allocated */
+       wait_queue_head_t halt_wait;    /* wait until kernel sees interrupt */
+};
+
+/* send context flags */
+#define SCF_ENABLED 0x01
+#define SCF_IN_FREE 0x02
+#define SCF_HALTED  0x04
+#define SCF_FROZEN  0x08
+
+struct send_context_info {
+       struct send_context *sc;        /* allocated working context */
+       u16 allocated;                  /* has this been allocated? */
+       u16 type;                       /* context type */
+       u16 base;                       /* base in PIO array */
+       u16 credits;                    /* size in PIO array */
+};
+
+/* DMA credit return, index is always (context & 0x7) */
+struct credit_return {
+       volatile __le64 cr[8];
+};
+
+/* NUMA indexed credit return array */
+struct credit_return_base {
+       struct credit_return *va;
+       dma_addr_t pa;
+};
+
+/* send context configuration sizes (one per type) */
+struct sc_config_sizes {
+       short int size;
+       short int count;
+};
+
+/*
+ * The diagram below details the relationship of the mapping structures
+ *
+ * Since the mapping now allows for non-uniform send contexts per vl, the
+ * number of send contexts for a vl is either the vl_scontexts[vl] or
+ * a computation based on num_kernel_send_contexts/num_vls:
+ *
+ * For example:
+ * nactual = vl_scontexts ? vl_scontexts[vl] : num_kernel_send_contexts/num_vls
+ *
+ * n = roundup to next highest power of 2 using nactual
+ *
+ * In the case where num_kernel_send_contexts/num_vls doesn't divide
+ * evenly, the extras are added from the last vl downward.
+ *
+ * For the case where n > nactual, the send contexts are assigned
+ * in a round robin fashion wrapping back to the first send context
+ * for a particular vl.
+ *
+ *               dd->pio_map
+ *                    |                                   pio_map_elem[0]
+ *                    |                                +--------------------+
+ *                    v                                |       mask         |
+ *               pio_vl_map                            |--------------------|
+ *      +--------------------------+                   | ksc[0] -> sc 1     |
+ *      |    list (RCU)            |                   |--------------------|
+ *      |--------------------------|                 ->| ksc[1] -> sc 2     |
+ *      |    mask                  |              --/  |--------------------|
+ *      |--------------------------|            -/     |        *           |
+ *      |    actual_vls (max 8)    |          -/       |--------------------|
+ *      |--------------------------|       --/         | ksc[n] -> sc n     |
+ *      |    vls (max 8)           |     -/            +--------------------+
+ *      |--------------------------|  --/
+ *      |    map[0]                |-/
+ *      |--------------------------|                   +--------------------+
+ *      |    map[1]                |---                |       mask         |
+ *      |--------------------------|   \----           |--------------------|
+ *      |           *              |        \--        | ksc[0] -> sc 1+n   |
+ *      |           *              |           \----   |--------------------|
+ *      |           *              |                \->| ksc[1] -> sc 2+n   |
+ *      |--------------------------|                   |--------------------|
+ *      |   map[vls - 1]           |-                  |         *          |
+ *      +--------------------------+ \-                |--------------------|
+ *                                     \-              | ksc[m] -> sc m+n   |
+ *                                       \             +--------------------+
+ *                                        \-
+ *                                          \
+ *                                           \-        +--------------------+
+ *                                             \-      |       mask         |
+ *                                               \     |--------------------|
+ *                                                \-   | ksc[0] -> sc 1+m+n |
+ *                                                  \- |--------------------|
+ *                                                    >| ksc[1] -> sc 2+m+n |
+ *                                                     |--------------------|
+ *                                                     |         *          |
+ *                                                     |--------------------|
+ *                                                     | ksc[o] -> sc o+m+n |
+ *                                                     +--------------------+
+ *
+ */
+
+/* Initial number of send contexts per VL */
+#define INIT_SC_PER_VL 2
+
+/*
+ * struct pio_map_elem - mapping for a vl
+ * @mask - selector mask
+ * @ksc - array of kernel send contexts for this vl
+ *
+ * The mask is used to "mod" the selector to
+ * produce an index into the trailing array of
+ * kscs.
+ */
+struct pio_map_elem {
+       u32 mask;
+       struct send_context *ksc[0];
+};
+
+/*
+ * struct pio_vl_map - mapping for a vl
+ * @list - rcu head for free callback
+ * @mask - vl mask to "mod" the vl to produce an index into the map array
+ * @actual_vls - number of vls
+ * @vls - number of vls rounded to the next power of 2
+ * @map - array of pio_map_elem entries
+ *
+ * This is the parent mapping structure. The trailing members of the
+ * struct point to pio_map_elem entries, which in turn point to an
+ * array of kscs for that vl.
+ */
+struct pio_vl_map {
+       struct rcu_head list;
+       u32 mask;
+       u8 actual_vls;
+       u8 vls;
+       struct pio_map_elem *map[0];
+};
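+
+/*
+ * Sketch of the intended two-level lookup (illustration based on the mask
+ * descriptions above; the exact code is an assumption, not the driver's):
+ *
+ *     struct pio_vl_map *m = rcu_dereference(dd->pio_map);
+ *     struct pio_map_elem *e = m->map[vl & m->mask];
+ *     struct send_context *sc = e->ksc[selector & e->mask];
+ *
+ * where "vl" and "selector" stand for the values passed to
+ * pio_select_send_context_vl().
+ */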
+
+int pio_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls,
+                u8 *vl_scontexts);
+void free_pio_map(struct hfi1_devdata *dd);
+struct send_context *pio_select_send_context_vl(struct hfi1_devdata *dd,
+                                               u32 selector, u8 vl);
+struct send_context *pio_select_send_context_sc(struct hfi1_devdata *dd,
+                                               u32 selector, u8 sc5);
+
+/* send context functions */
+int init_credit_return(struct hfi1_devdata *dd);
+void free_credit_return(struct hfi1_devdata *dd);
+int init_sc_pools_and_sizes(struct hfi1_devdata *dd);
+int init_send_contexts(struct hfi1_devdata *dd);
+int init_pervl_scs(struct hfi1_devdata *dd);
+struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
+                             uint hdrqentsize, int numa);
+void sc_free(struct send_context *sc);
+int sc_enable(struct send_context *sc);
+void sc_disable(struct send_context *sc);
+int sc_restart(struct send_context *sc);
+void sc_return_credits(struct send_context *sc);
+void sc_flush(struct send_context *sc);
+void sc_drop(struct send_context *sc);
+void sc_stop(struct send_context *sc, int bit);
+struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len,
+                               pio_release_cb cb, void *arg);
+void sc_release_update(struct send_context *sc);
+void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context);
+void sc_add_credit_return_intr(struct send_context *sc);
+void sc_del_credit_return_intr(struct send_context *sc);
+void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold);
+u32 sc_percent_to_threshold(struct send_context *sc, u32 percent);
+u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize);
+void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint);
+void sc_wait(struct hfi1_devdata *dd);
+void set_pio_integrity(struct send_context *sc);
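+
+/*
+ * Illustrative use of the credit-return threshold helpers above (a sketch,
+ * not taken from a specific caller; "mtu" and "hdrqentsize" are assumed
+ * caller-provided values):
+ *
+ *     sc_set_cr_threshold(sc, sc_percent_to_threshold(sc, 50));
+ *     sc_set_cr_threshold(sc, sc_mtu_to_threshold(sc, mtu, hdrqentsize));
+ */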
+
+/* support functions */
+void pio_reset_all(struct hfi1_devdata *dd);
+void pio_freeze(struct hfi1_devdata *dd);
+void pio_kernel_unfreeze(struct hfi1_devdata *dd);
+
+/* global PIO send control operations */
+#define PSC_GLOBAL_ENABLE 0
+#define PSC_GLOBAL_DISABLE 1
+#define PSC_GLOBAL_VLARB_ENABLE 2
+#define PSC_GLOBAL_VLARB_DISABLE 3
+#define PSC_CM_RESET 4
+#define PSC_DATA_VL_ENABLE 5
+#define PSC_DATA_VL_DISABLE 6
+
+void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl);
+void pio_send_control(struct hfi1_devdata *dd, int op);
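+
+/*
+ * Example only: the PSC_* values above are the "op" argument to
+ * pio_send_control(), e.g.
+ *
+ *     pio_send_control(dd, PSC_GLOBAL_ENABLE);
+ *     pio_send_control(dd, PSC_DATA_VL_ENABLE);
+ */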
+
+/* PIO copy routines */
+void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc,
+             const void *from, size_t count);
+void seg_pio_copy_start(struct pio_buf *pbuf, u64 pbc,
+                       const void *from, size_t nbytes);
+void seg_pio_copy_mid(struct pio_buf *pbuf, const void *from, size_t nbytes);
+void seg_pio_copy_end(struct pio_buf *pbuf);
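+
+/*
+ * Typical segmented-copy sequence (a sketch; the buffer source and the
+ * names "hdr", "hdr_bytes", "payload" and "payload_bytes" are assumptions,
+ * not part of this header):
+ *
+ *     pbuf = sc_buffer_alloc(sc, dw_len, NULL, NULL);
+ *     seg_pio_copy_start(pbuf, pbc, hdr, hdr_bytes);
+ *     seg_pio_copy_mid(pbuf, payload, payload_bytes);
+ *     seg_pio_copy_end(pbuf);
+ */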
+
+#endif /* _PIO_H */
diff --git a/drivers/infiniband/hw/hfi1/pio_copy.c b/drivers/infiniband/hw/hfi1/pio_copy.c
new file mode 100644 (file)
index 0000000..8c25e1b
--- /dev/null
@@ -0,0 +1,867 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "hfi.h"
+
+/* additive distance between non-SOP and SOP space */
+#define SOP_DISTANCE (TXE_PIO_SIZE / 2)
+#define PIO_BLOCK_MASK (PIO_BLOCK_SIZE - 1)
+/* number of QUADWORDs in a block */
+#define PIO_BLOCK_QWS (PIO_BLOCK_SIZE / sizeof(u64))
+
+/**
+ * pio_copy - copy data block to MMIO space
+ * @pbuf: a number of blocks allocated within a PIO send context
+ * @pbc: PBC to send
+ * @from: source, must be 8 byte aligned
+ * @count: number of DWORD (32-bit) quantities to copy from source
+ *
+ * Copy data from source to PIO Send Buffer memory, 8 bytes at a time.
+ * Must always write full blocks of BLOCK_SIZE bytes.  The first block must
+ * be written to the corresponding SOP=1 address.
+ *
+ * Known:
+ * o pbuf->start always starts on a block boundary
+ * o pbuf can wrap only at a block boundary
+ */
+void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc,
+             const void *from, size_t count)
+{
+       void __iomem *dest = pbuf->start + SOP_DISTANCE;
+       void __iomem *send = dest + PIO_BLOCK_SIZE;
+       void __iomem *dend;                     /* 8-byte data end */
+
+       /* write the PBC */
+       writeq(pbc, dest);
+       dest += sizeof(u64);
+
+       /* calculate where the QWORD data ends - in SOP=1 space */
+       dend = dest + ((count >> 1) * sizeof(u64));
+
+       if (dend < send) {
+               /*
+                * all QWORD data is within the SOP block, does *not*
+                * reach the end of the SOP block
+                */
+
+               while (dest < dend) {
+                       writeq(*(u64 *)from, dest);
+                       from += sizeof(u64);
+                       dest += sizeof(u64);
+               }
+               /*
+                * No boundary checks are needed here:
+                * 0. We're not on the SOP block boundary
+                * 1. The possible DWORD dangle will still be within
+                *    the SOP block
+                * 2. We cannot wrap except on a block boundary.
+                */
+       } else {
+               /* QWORD data extends _to_ or beyond the SOP block */
+
+               /* write 8-byte SOP chunk data */
+               while (dest < send) {
+                       writeq(*(u64 *)from, dest);
+                       from += sizeof(u64);
+                       dest += sizeof(u64);
+               }
+               /* drop out of the SOP range */
+               dest -= SOP_DISTANCE;
+               dend -= SOP_DISTANCE;
+
+               /*
+                * If the wrap comes before or matches the data end,
+                * copy until the wrap, then wrap.
+                *
+                * If the data ends at the end of the SOP above and
+                * the buffer wraps, then pbuf->end == dend == dest
+                * and nothing will get written, but we will wrap in
+                * case there is a dangling DWORD.
+                */
+               if (pbuf->end <= dend) {
+                       while (dest < pbuf->end) {
+                               writeq(*(u64 *)from, dest);
+                               from += sizeof(u64);
+                               dest += sizeof(u64);
+                       }
+
+                       dest -= pbuf->size;
+                       dend -= pbuf->size;
+               }
+
+               /* write 8-byte non-SOP, non-wrap chunk data */
+               while (dest < dend) {
+                       writeq(*(u64 *)from, dest);
+                       from += sizeof(u64);
+                       dest += sizeof(u64);
+               }
+       }
+       /* at this point we have wrapped if we are going to wrap */
+
+       /* write dangling u32, if any */
+       if (count & 1) {
+               union mix val;
+
+               val.val64 = 0;
+               val.val32[0] = *(u32 *)from;
+               writeq(val.val64, dest);
+               dest += sizeof(u64);
+       }
+       /*
+        * fill in rest of block, no need to check pbuf->end
+        * as we only wrap on a block boundary
+        */
+       while (((unsigned long)dest & PIO_BLOCK_MASK) != 0) {
+               writeq(0, dest);
+               dest += sizeof(u64);
+       }
+
+       /* finished with this buffer */
+       this_cpu_dec(*pbuf->sc->buffers_allocated);
+       preempt_enable();
+}
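+
+/*
+ * Illustrative call (the caller and variable names are assumptions):
+ * "count" is in 32-bit DWORDs, so copying "len" bytes of QW-aligned data
+ * would look like
+ *
+ *     pio_copy(dd, pbuf, pbc, data, len >> 2);
+ *
+ * with pbuf previously obtained from sc_buffer_alloc().
+ */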
+
+/* USE_SHIFTS is faster in user-space tests on a Xeon X5570 @ 2.93GHz */
+#define USE_SHIFTS 1
+#ifdef USE_SHIFTS
+/*
+ * Handle carry bytes using shifts and masks.
+ *
+ * NOTE: the unused portion of carry is expected to always be zero.
+ */
+
+/*
+ * "zero" shift - bit shift used to zero out upper bytes.  Input is
+ * the count of LSB bytes to preserve.
+ */
+#define zshift(x) (8 * (8 - (x)))
+
+/*
+ * "merge" shift - bit shift used to merge with carry bytes.  Input is
+ * the LSB byte count to move beyond.
+ */
+#define mshift(x) (8 * (x))
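+
+/*
+ * Worked example (illustration only): for x = 3, zshift(3) = 40, so
+ * (v << 40) >> 40 keeps only the 3 LSB bytes of v; mshift(3) = 24, so
+ * (v << 24) moves v up past 3 existing carry bytes.
+ */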
+
+/*
+ * Read nbytes bytes from "from" and return them in the LSB bytes
+ * of pbuf->carry.  Other bytes are zeroed.  Any previous value
+ * pbuf->carry is lost.
+ *
+ * NOTES:
+ * o do not read from "from" if nbytes is zero
+ * o from may _not_ be u64 aligned
+ * o nbytes must not span a QW boundary
+ */
+static inline void read_low_bytes(struct pio_buf *pbuf, const void *from,
+                                 unsigned int nbytes)
+{
+       unsigned long off;
+
+       if (nbytes == 0) {
+               pbuf->carry.val64 = 0;
+       } else {
+               /* align our pointer */
+               off = (unsigned long)from & 0x7;
+               from = (void *)((unsigned long)from & ~0x7l);
+               pbuf->carry.val64 = ((*(u64 *)from)
+                               << zshift(nbytes + off))/* zero upper bytes */
+                               >> zshift(nbytes);      /* place at bottom */
+       }
+       pbuf->carry_bytes = nbytes;
+}
+
+/*
+ * Read nbytes bytes from "from" and put them at the next significant bytes
+ * of pbuf->carry.  Unused bytes are zeroed.  It is expected that the extra
+ * read does not overfill carry.
+ *
+ * NOTES:
+ * o from may _not_ be u64 aligned
+ * o nbytes may span a QW boundary
+ */
+static inline void read_extra_bytes(struct pio_buf *pbuf,
+                                   const void *from, unsigned int nbytes)
+{
+       unsigned long off = (unsigned long)from & 0x7;
+       unsigned int room, xbytes;
+
+       /* align our pointer */
+       from = (void *)((unsigned long)from & ~0x7l);
+
+       /* check count first - don't read anything if count is zero */
+       while (nbytes) {
+               /* find the number of bytes in this u64 */
+               room = 8 - off; /* this u64 has room for this many bytes */
+               xbytes = min(room, nbytes);
+
+               /*
+                * shift down to zero lower bytes, shift up to zero upper
+                * bytes, shift back down to move into place
+                */
+               pbuf->carry.val64 |= (((*(u64 *)from)
+                                       >> mshift(off))
+                                       << zshift(xbytes))
+                                       >> zshift(xbytes + pbuf->carry_bytes);
+               off = 0;
+               pbuf->carry_bytes += xbytes;
+               nbytes -= xbytes;
+               from += sizeof(u64);
+       }
+}
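+
+/*
+ * Worked example for the shift chain above (illustration only): with
+ * off = 0, carry_bytes = 2 and xbytes = 3, the expression
+ * ((v >> 0) << zshift(3)) >> zshift(5) keeps the 3 LSB bytes of v and
+ * places them at byte offsets 2..4 of carry, just above the 2 bytes
+ * already held.
+ */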
+
+/*
+ * Zero extra bytes from the end of pbuf->carry.
+ *
+ * NOTES:
+ * o zbytes <= old_bytes
+ */
+static inline void zero_extra_bytes(struct pio_buf *pbuf, unsigned int zbytes)
+{
+       unsigned int remaining;
+
+       if (zbytes == 0)        /* nothing to do */
+               return;
+
+       remaining = pbuf->carry_bytes - zbytes; /* remaining bytes */
+
+       /* NOTE: zshift only guaranteed to work if remaining != 0 */
+       if (remaining)
+               pbuf->carry.val64 = (pbuf->carry.val64 << zshift(remaining))
+                                       >> zshift(remaining);
+       else
+               pbuf->carry.val64 = 0;
+       pbuf->carry_bytes = remaining;
+}
+
+/*
+ * Write a quad word using parts of pbuf->carry and the next 8 bytes of src.
+ * Put the unused part of the next 8 bytes of src into the LSB bytes of
+ * pbuf->carry with the upper bytes zeroed.
+ *
+ * NOTES:
+ * o result must keep unused bytes zeroed
+ * o src must be u64 aligned
+ */
+static inline void merge_write8(
+       struct pio_buf *pbuf,
+       void __iomem *dest,
+       const void *src)
+{
+       u64 new, temp;
+
+       new = *(u64 *)src;
+       temp = pbuf->carry.val64 | (new << mshift(pbuf->carry_bytes));
+       writeq(temp, dest);
+       pbuf->carry.val64 = new >> zshift(pbuf->carry_bytes);
+}
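+
+/*
+ * Worked example (illustration only): with carry_bytes = 3,
+ * carry.val64 = 0x0000000000ccbbaa and *src = 0x8877665544332211, the
+ * quad word written is 0x5544332211ccbbaa and the new carry becomes
+ * 0x0000000000887766, i.e. the 3 source bytes that did not fit.
+ */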
+
+/*
+ * Write a quad word using all bytes of carry.
+ */
+static inline void carry8_write8(union mix carry, void __iomem *dest)
+{
+       writeq(carry.val64, dest);
+}
+
+/*
+ * Write a quad word using all the valid bytes of carry.  If carry
+ * has zero valid bytes, nothing is written.
+ * Returns 0 on nothing written, non-zero on quad word written.
+ */
+static inline int carry_write8(struct pio_buf *pbuf, void __iomem *dest)
+{
+       if (pbuf->carry_bytes) {
+               /* unused bytes are always kept zeroed, so just write */
+               writeq(pbuf->carry.val64, dest);
+               return 1;
+       }
+
+       return 0;
+}
+
+#else /* USE_SHIFTS */
+/*
+ * Handle carry bytes using byte copies.
+ *
+ * NOTE: the unused portion of carry is left uninitialized.
+ */
+
+/*
+ * Jump copy - no-loop copy for < 8 bytes.  The switch cases deliberately
+ * fall through so that exactly n bytes are copied.
+ */
+static inline void jcopy(u8 *dest, const u8 *src, u32 n)
+{
+       switch (n) {
+       case 7:
+               *dest++ = *src++;
+       case 6:
+               *dest++ = *src++;
+       case 5:
+               *dest++ = *src++;
+       case 4:
+               *dest++ = *src++;
+       case 3:
+               *dest++ = *src++;
+       case 2:
+               *dest++ = *src++;
+       case 1:
+               *dest++ = *src++;
+       }
+}
+
+/*
+ * Read nbytes bytes from "from" and place them in the low bytes
+ * of pbuf->carry.  Other bytes are left as-is.  Any previous
+ * value in pbuf->carry is lost.
+ *
+ * NOTES:
+ * o do not read from "from" if nbytes is zero
+ * o from may _not_ be u64 aligned.
+ */
+static inline void read_low_bytes(struct pio_buf *pbuf, const void *from,
+                                 unsigned int nbytes)
+{
+       jcopy(&pbuf->carry.val8[0], from, nbytes);
+       pbuf->carry_bytes = nbytes;
+}
+
+/*
+ * Read nbytes bytes from "from" and put them at the end of pbuf->carry.
+ * It is expected that the extra read does not overfill carry.
+ *
+ * NOTES:
+ * o from may _not_ be u64 aligned
+ * o nbytes may span a QW boundary
+ */
+static inline void read_extra_bytes(struct pio_buf *pbuf,
+                                   const void *from, unsigned int nbytes)
+{
+       jcopy(&pbuf->carry.val8[pbuf->carry_bytes], from, nbytes);
+       pbuf->carry_bytes += nbytes;
+}
+
+/*
+ * Zero extra bytes from the end of pbuf->carry.
+ *
+ * We do not care about the value of unused bytes in carry, so just
+ * reduce the byte count.
+ *
+ * NOTES:
+ * o zbytes <= old_bytes
+ */
+static inline void zero_extra_bytes(struct pio_buf *pbuf, unsigned int zbytes)
+{
+       pbuf->carry_bytes -= zbytes;
+}
+
+/*
+ * Write a quad word using parts of pbuf->carry and the next 8 bytes of src.
+ * Put the unused part of the next 8 bytes of src into the low bytes of
+ * pbuf->carry.
+ */
+static inline void merge_write8(
+       struct pio_buf *pbuf,
+       void *dest,
+       const void *src)
+{
+       u32 remainder = 8 - pbuf->carry_bytes;
+
+       jcopy(&pbuf->carry.val8[pbuf->carry_bytes], src, remainder);
+       writeq(pbuf->carry.val64, dest);
+       jcopy(&pbuf->carry.val8[0], src + remainder, pbuf->carry_bytes);
+}
+
+/*
+ * Write a quad word using all bytes of carry.
+ */
+static inline void carry8_write8(union mix carry, void *dest)
+{
+       writeq(carry.val64, dest);
+}
+
+/*
+ * Write a quad word using all the valid bytes of carry.  If carry
+ * has zero valid bytes, nothing is written.
+ * Returns 0 on nothing written, non-zero on quad word written.
+ */
+static inline int carry_write8(struct pio_buf *pbuf, void *dest)
+{
+       if (pbuf->carry_bytes) {
+               u64 zero = 0;
+
+               jcopy(&pbuf->carry.val8[pbuf->carry_bytes], (u8 *)&zero,
+                     8 - pbuf->carry_bytes);
+               writeq(pbuf->carry.val64, dest);
+               return 1;
+       }
+
+       return 0;
+}
+#endif /* USE_SHIFTS */
+
+/*
+ * Segmented PIO Copy - start
+ *
+ * Start a PIO copy.
+ *
+ * @pbuf: destination buffer
+ * @pbc: the PBC for the PIO buffer
+ * @from: data source, QWORD aligned
+ * @nbytes: bytes to copy
+ */
+void seg_pio_copy_start(struct pio_buf *pbuf, u64 pbc,
+                       const void *from, size_t nbytes)
+{
+       void __iomem *dest = pbuf->start + SOP_DISTANCE;
+       void __iomem *send = dest + PIO_BLOCK_SIZE;
+       void __iomem *dend;                     /* 8-byte data end */
+
+       writeq(pbc, dest);
+       dest += sizeof(u64);
+
+       /* calculate where the QWORD data ends - in SOP=1 space */
+       dend = dest + ((nbytes >> 3) * sizeof(u64));
+
+       if (dend < send) {
+               /*
+                * all QWORD data is within the SOP block, does *not*
+                * reach the end of the SOP block
+                */
+
+               while (dest < dend) {
+                       writeq(*(u64 *)from, dest);
+                       from += sizeof(u64);
+                       dest += sizeof(u64);
+               }
+               /*
+                * No boundary checks are needed here:
+                * 0. We're not on the SOP block boundary
+                * 1. The possible DWORD dangle will still be within
+                *    the SOP block
+                * 2. We cannot wrap except on a block boundary.
+                */
+       } else {
+               /* QWORD data extends _to_ or beyond the SOP block */
+
+               /* write 8-byte SOP chunk data */
+               while (dest < send) {
+                       writeq(*(u64 *)from, dest);
+                       from += sizeof(u64);
+                       dest += sizeof(u64);
+               }
+               /* drop out of the SOP range */
+               dest -= SOP_DISTANCE;
+               dend -= SOP_DISTANCE;
+
+               /*
+                * If the wrap comes before or matches the data end,
+                * copy until the wrap, then wrap.
+                *
+                * If the data ends at the end of the SOP above and
+                * the buffer wraps, then pbuf->end == dend == dest
+                * and nothing will get written, but we will wrap in
+                * case there is a dangling DWORD.
+                */
+               if (pbuf->end <= dend) {
+                       while (dest < pbuf->end) {
+                               writeq(*(u64 *)from, dest);
+                               from += sizeof(u64);
+                               dest += sizeof(u64);
+                       }
+
+                       dest -= pbuf->size;
+                       dend -= pbuf->size;
+               }
+
+               /* write 8-byte non-SOP, non-wrap chunk data */
+               while (dest < dend) {
+                       writeq(*(u64 *)from, dest);
+                       from += sizeof(u64);
+                       dest += sizeof(u64);
+               }
+       }
+       /* at this point we have wrapped if we are going to wrap */
+
+       /* ...but it doesn't matter as we're done writing */
+
+       /* save dangling bytes, if any */
+       read_low_bytes(pbuf, from, nbytes & 0x7);
+
+       pbuf->qw_written = 1 /*PBC*/ + (nbytes >> 3);
+}
+
+/*
+ * Mid copy helper, "mixed case" - source is 64-bit aligned but carry
+ * bytes are non-zero.
+ *
+ * Whole u64s must be written to the chip, so bytes must be manually merged.
+ *
+ * @pbuf: destination buffer
+ * @from: data source, is QWORD aligned.
+ * @nbytes: bytes to copy
+ *
+ * Must handle nbytes < 8.
+ */
+static void mid_copy_mix(struct pio_buf *pbuf, const void *from, size_t nbytes)
+{
+       void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
+       void __iomem *dend;                     /* 8-byte data end */
+       unsigned long qw_to_write = (pbuf->carry_bytes + nbytes) >> 3;
+       unsigned long bytes_left = (pbuf->carry_bytes + nbytes) & 0x7;
+
+       /* calculate 8-byte data end */
+       dend = dest + (qw_to_write * sizeof(u64));
+
+       if (pbuf->qw_written < PIO_BLOCK_QWS) {
+               /*
+                * Still within SOP block.  We don't need to check for
+                * wrap because we are still in the first block and
+                * can only wrap on block boundaries.
+                */
+               void __iomem *send;             /* SOP end */
+               void __iomem *xend;
+
+               /*
+                * calculate the end of data or end of block, whichever
+                * comes first
+                */
+               send = pbuf->start + PIO_BLOCK_SIZE;
+               xend = min(send, dend);
+
+               /* shift up to SOP=1 space */
+               dest += SOP_DISTANCE;
+               xend += SOP_DISTANCE;
+
+               /* write 8-byte chunk data */
+               while (dest < xend) {
+                       merge_write8(pbuf, dest, from);
+                       from += sizeof(u64);
+                       dest += sizeof(u64);
+               }
+
+               /* shift down to SOP=0 space */
+               dest -= SOP_DISTANCE;
+       }
+       /*
+        * At this point dest could be (either, both, or neither):
+        * - at dend
+        * - at the wrap
+        */
+
+       /*
+        * If the wrap comes before or matches the data end,
+        * copy until the wrap, then wrap.
+        *
+        * If dest is at the wrap, we will fall into the if,
+        * skip the loop body, and simply wrap.
+        *
+        * If the data ends at the end of the SOP above and
+        * the buffer wraps, then pbuf->end == dend == dest
+        * and nothing will get written.
+        */
+       if (pbuf->end <= dend) {
+               while (dest < pbuf->end) {
+                       merge_write8(pbuf, dest, from);
+                       from += sizeof(u64);
+                       dest += sizeof(u64);
+               }
+
+               dest -= pbuf->size;
+               dend -= pbuf->size;
+       }
+
+       /* write 8-byte non-SOP, non-wrap chunk data */
+       while (dest < dend) {
+               merge_write8(pbuf, dest, from);
+               from += sizeof(u64);
+               dest += sizeof(u64);
+       }
+
+       /* adjust carry */
+       if (pbuf->carry_bytes < bytes_left) {
+               /* need to read more */
+               read_extra_bytes(pbuf, from, bytes_left - pbuf->carry_bytes);
+       } else {
+               /* remove invalid bytes */
+               zero_extra_bytes(pbuf, pbuf->carry_bytes - bytes_left);
+       }
+
+       pbuf->qw_written += qw_to_write;
+}
+
+/*
+ * Mid copy helper, "straight case" - source pointer is 64-bit aligned
+ * with no carry bytes.
+ *
+ * @pbuf: destination buffer
+ * @from: data source, is QWORD aligned
+ * @nbytes: bytes to copy
+ *
+ * Must handle nbytes < 8.
+ */
+static void mid_copy_straight(struct pio_buf *pbuf,
+                             const void *from, size_t nbytes)
+{
+       void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
+       void __iomem *dend;                     /* 8-byte data end */
+
+       /* calculate 8-byte data end */
+       dend = dest + ((nbytes >> 3) * sizeof(u64));
+
+       if (pbuf->qw_written < PIO_BLOCK_QWS) {
+               /*
+                * Still within SOP block.  We don't need to check for
+                * wrap because we are still in the first block and
+                * can only wrap on block boundaries.
+                */
+               void __iomem *send;             /* SOP end */
+               void __iomem *xend;
+
+               /*
+                * calculate the end of data or end of block, whichever
+                * comes first
+                */
+               send = pbuf->start + PIO_BLOCK_SIZE;
+               xend = min(send, dend);
+
+               /* shift up to SOP=1 space */
+               dest += SOP_DISTANCE;
+               xend += SOP_DISTANCE;
+
+               /* write 8-byte chunk data */
+               while (dest < xend) {
+                       writeq(*(u64 *)from, dest);
+                       from += sizeof(u64);
+                       dest += sizeof(u64);
+               }
+
+               /* shift down to SOP=0 space */
+               dest -= SOP_DISTANCE;
+       }
+       /*
+        * At this point dest could be (either, both, or neither):
+        * - at dend
+        * - at the wrap
+        */
+
+       /*
+        * If the wrap comes before or matches the data end,
+        * copy until the wrap, then wrap.
+        *
+        * If dest is at the wrap, we will fall into the if,
+        * skip the loop body, and simply wrap.
+        *
+        * If the data ends at the end of the SOP above and
+        * the buffer wraps, then pbuf->end == dend == dest
+        * and nothing will get written.
+        */
+       if (pbuf->end <= dend) {
+               while (dest < pbuf->end) {
+                       writeq(*(u64 *)from, dest);
+                       from += sizeof(u64);
+                       dest += sizeof(u64);
+               }
+
+               dest -= pbuf->size;
+               dend -= pbuf->size;
+       }
+
+       /* write 8-byte non-SOP, non-wrap chunk data */
+       while (dest < dend) {
+               writeq(*(u64 *)from, dest);
+               from += sizeof(u64);
+               dest += sizeof(u64);
+       }
+
+       /* we know carry_bytes was zero on entry to this routine */
+       read_low_bytes(pbuf, from, nbytes & 0x7);
+
+       pbuf->qw_written += nbytes >> 3;
+}
+
+/*
+ * Segmented PIO Copy - middle
+ *
+ * Must handle a tail and a source of any alignment, with any byte count.
+ *
+ * @pbuf: a number of blocks allocated within a PIO send context
+ * @from: data source
+ * @nbytes: number of bytes to copy
+ */
+void seg_pio_copy_mid(struct pio_buf *pbuf, const void *from, size_t nbytes)
+{
+       unsigned long from_align = (unsigned long)from & 0x7;
+
+       if (pbuf->carry_bytes + nbytes < 8) {
+               /* not enough bytes to fill a QW */
+               read_extra_bytes(pbuf, from, nbytes);
+               return;
+       }
+
+       if (from_align) {
+               /* misaligned source pointer - align it */
+               unsigned long to_align;
+
+               /* bytes to read to align "from" */
+               to_align = 8 - from_align;
+
+               /*
+                * In the advance-to-alignment logic below, we do not need
+                * to check if we are using more than nbytes.  This is because
+                * if we are here, we already know that carry+nbytes will
+                * fill at least one QW.
+                */
+               if (pbuf->carry_bytes + to_align < 8) {
+                       /* not enough align bytes to fill a QW */
+                       read_extra_bytes(pbuf, from, to_align);
+                       from += to_align;
+                       nbytes -= to_align;
+               } else {
+                       /* bytes to fill carry */
+                       unsigned long to_fill = 8 - pbuf->carry_bytes;
+                       /* bytes left over to be read */
+                       unsigned long extra = to_align - to_fill;
+                       void __iomem *dest;
+
+                       /* fill carry... */
+                       read_extra_bytes(pbuf, from, to_fill);
+                       from += to_fill;
+                       nbytes -= to_fill;
+
+                       /* ...now write carry */
+                       dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
+
+                       /*
+                        * The two checks immediately below cannot both be
+                        * true, hence the else.  If we have wrapped, we
+                        * cannot still be within the first block.
+                        * Conversely, if we are still in the first block, we
+                        * cannot have wrapped.  We do the wrap check first
+                        * as that is more likely.
+                        */
+                       /* adjust if we've wrapped */
+                       if (dest >= pbuf->end)
+                               dest -= pbuf->size;
+                       /* jump to SOP range if within the first block */
+                       else if (pbuf->qw_written < PIO_BLOCK_QWS)
+                               dest += SOP_DISTANCE;
+
+                       carry8_write8(pbuf->carry, dest);
+                       pbuf->qw_written++;
+
+                       /* read any extra bytes to do final alignment */
+                       /* this will overwrite anything in pbuf->carry */
+                       read_low_bytes(pbuf, from, extra);
+                       from += extra;
+                       nbytes -= extra;
+               }
+
+               /* at this point, from is QW aligned */
+       }
+
+       if (pbuf->carry_bytes)
+               mid_copy_mix(pbuf, from, nbytes);
+       else
+               mid_copy_straight(pbuf, from, nbytes);
+}
+
+/*
+ * Segmented PIO Copy - end
+ *
+ * Write any remainder (in pbuf->carry) and finish writing the whole block.
+ *
+ * @pbuf: a number of blocks allocated within a PIO send context
+ */
+void seg_pio_copy_end(struct pio_buf *pbuf)
+{
+       void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
+
+       /*
+        * The two checks immediately below cannot both be true, hence the
+        * else.  If we have wrapped, we cannot still be within the first
+        * block.  Conversely, if we are still in the first block, we
+        * cannot have wrapped.  We do the wrap check first as that is
+        * more likely.
+        */
+       /* adjust if we have wrapped */
+       if (dest >= pbuf->end)
+               dest -= pbuf->size;
+       /* jump to the SOP range if within the first block */
+       else if (pbuf->qw_written < PIO_BLOCK_QWS)
+               dest += SOP_DISTANCE;
+
+       /* write final bytes, if any */
+       if (carry_write8(pbuf, dest)) {
+               dest += sizeof(u64);
+               /*
+                * NOTE: We do not need to recalculate whether dest needs
+                * SOP_DISTANCE or not.
+                *
+                * If we are in the first block and the dangle write
+                * keeps us in the same block, dest will need
+                * to retain SOP_DISTANCE in the loop below.
+                *
+                * If we are in the first block and the dangle write pushes
+                * us to the next block, then loop below will not run
+                * and dest is not used.  Hence we do not need to update
+                * it.
+                *
+                * If we are past the first block, then SOP_DISTANCE
+                * was never added, so there is nothing to do.
+                */
+       }
+
+       /* fill in rest of block */
+       while (((unsigned long)dest & PIO_BLOCK_MASK) != 0) {
+               writeq(0, dest);
+               dest += sizeof(u64);
+       }
+
+       /* finished with this buffer */
+       this_cpu_dec(*pbuf->sc->buffers_allocated);
+       preempt_enable();
+}
diff --git a/drivers/infiniband/hw/hfi1/platform.c b/drivers/infiniband/hw/hfi1/platform.c
new file mode 100644 (file)
index 0000000..03df932
--- /dev/null
@@ -0,0 +1,909 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "hfi.h"
+#include "efivar.h"
+
+void get_platform_config(struct hfi1_devdata *dd)
+{
+       int ret = 0;
+       unsigned long size = 0;
+       u8 *temp_platform_config = NULL;
+
+       ret = read_hfi1_efi_var(dd, "configuration", &size,
+                               (void **)&temp_platform_config);
+       if (ret) {
+               dd_dev_info(dd,
+                           "%s: Failed to get platform config from UEFI, falling back to request firmware\n",
+                           __func__);
+               /* fall back to request firmware */
+               platform_config_load = 1;
+               goto bail;
+       }
+
+       dd->platform_config.data = temp_platform_config;
+       dd->platform_config.size = size;
+
+bail:
+       /* exit */;
+}
+
+void free_platform_config(struct hfi1_devdata *dd)
+{
+       if (!platform_config_load) {
+               /*
+                * was loaded from EFI, release memory
+                * allocated by read_efi_var
+                */
+               kfree(dd->platform_config.data);
+       }
+       /*
+        * else do nothing, dispose_firmware will release
+        * struct firmware platform_config on driver exit
+        */
+}
+
+void get_port_type(struct hfi1_pportdata *ppd)
+{
+       int ret;
+
+       ret = get_platform_config_field(ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
+                                       PORT_TABLE_PORT_TYPE, &ppd->port_type,
+                                       4);
+       if (ret)
+               ppd->port_type = PORT_TYPE_UNKNOWN;
+}
+
+int set_qsfp_tx(struct hfi1_pportdata *ppd, int on)
+{
+       u8 tx_ctrl_byte = on ? 0x0 : 0xF;
+       int ret = 0;
+
+       ret = qsfp_write(ppd, ppd->dd->hfi1_id, QSFP_TX_CTRL_BYTE_OFFS,
+                        &tx_ctrl_byte, 1);
+       /* we expected 1, so consider 0 an error */
+       if (ret == 0)
+               ret = -EIO;
+       else if (ret == 1)
+               ret = 0;
+       return ret;
+}
+
+static int qual_power(struct hfi1_pportdata *ppd)
+{
+       u32 cable_power_class = 0, power_class_max = 0;
+       u8 *cache = ppd->qsfp_info.cache;
+       int ret = 0;
+
+       ret = get_platform_config_field(
+               ppd->dd, PLATFORM_CONFIG_SYSTEM_TABLE, 0,
+               SYSTEM_TABLE_QSFP_POWER_CLASS_MAX, &power_class_max, 4);
+       if (ret)
+               return ret;
+
+       cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
+
+       if (cable_power_class > power_class_max)
+               ppd->offline_disabled_reason =
+                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY);
+
+       if (ppd->offline_disabled_reason ==
+                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY)) {
+               dd_dev_info(
+                       ppd->dd,
+                       "%s: Port disabled due to system power restrictions\n",
+                       __func__);
+               ret = -EPERM;
+       }
+       return ret;
+}
+
+static int qual_bitrate(struct hfi1_pportdata *ppd)
+{
+       u16 lss = ppd->link_speed_supported, lse = ppd->link_speed_enabled;
+       u8 *cache = ppd->qsfp_info.cache;
+
+       if ((lss & OPA_LINK_SPEED_25G) && (lse & OPA_LINK_SPEED_25G) &&
+           cache[QSFP_NOM_BIT_RATE_250_OFFS] < 0x64)
+               ppd->offline_disabled_reason =
+                          HFI1_ODR_MASK(OPA_LINKDOWN_REASON_LINKSPEED_POLICY);
+
+       if ((lss & OPA_LINK_SPEED_12_5G) && (lse & OPA_LINK_SPEED_12_5G) &&
+           cache[QSFP_NOM_BIT_RATE_100_OFFS] < 0x7D)
+               ppd->offline_disabled_reason =
+                          HFI1_ODR_MASK(OPA_LINKDOWN_REASON_LINKSPEED_POLICY);
+
+       if (ppd->offline_disabled_reason ==
+                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_LINKSPEED_POLICY)) {
+               dd_dev_info(
+                       ppd->dd,
+                       "%s: Cable failed bitrate check, disabling port\n",
+                       __func__);
+               return -EPERM;
+       }
+       return 0;
+}
+
+static int set_qsfp_high_power(struct hfi1_pportdata *ppd)
+{
+       u8 cable_power_class = 0, power_ctrl_byte = 0;
+       u8 *cache = ppd->qsfp_info.cache;
+       int ret;
+
+       cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
+
+       if (cable_power_class > QSFP_POWER_CLASS_1) {
+               power_ctrl_byte = cache[QSFP_PWR_CTRL_BYTE_OFFS];
+
+               power_ctrl_byte |= 1;
+               power_ctrl_byte &= ~(0x2);
+
+               ret = qsfp_write(ppd, ppd->dd->hfi1_id,
+                                QSFP_PWR_CTRL_BYTE_OFFS,
+                                &power_ctrl_byte, 1);
+               if (ret != 1)
+                       return -EIO;
+
+               if (cable_power_class > QSFP_POWER_CLASS_4) {
+                       power_ctrl_byte |= (1 << 2);
+                       ret = qsfp_write(ppd, ppd->dd->hfi1_id,
+                                        QSFP_PWR_CTRL_BYTE_OFFS,
+                                        &power_ctrl_byte, 1);
+                       if (ret != 1)
+                               return -EIO;
+               }
+
+               /* SFF 8679 rev 1.7 LPMode Deassert time */
+               msleep(300);
+       }
+       return 0;
+}
+
+static void apply_rx_cdr(struct hfi1_pportdata *ppd,
+                        u32 rx_preset_index,
+                        u8 *cdr_ctrl_byte)
+{
+       u32 rx_preset;
+       u8 *cache = ppd->qsfp_info.cache;
+       int cable_power_class;
+
+       if (!((cache[QSFP_MOD_PWR_OFFS] & 0x4) &&
+             (cache[QSFP_CDR_INFO_OFFS] & 0x40)))
+               return;
+
+       /* RX CDR present, bypass supported */
+       cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
+
+       if (cable_power_class <= QSFP_POWER_CLASS_3) {
+               /* Power class <= 3, ignore config & turn RX CDR on */
+               *cdr_ctrl_byte |= 0xF;
+               return;
+       }
+
+       get_platform_config_field(
+               ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE,
+               rx_preset_index, RX_PRESET_TABLE_QSFP_RX_CDR_APPLY,
+               &rx_preset, 4);
+
+       if (!rx_preset) {
+               dd_dev_info(
+                       ppd->dd,
+                       "%s: RX_CDR_APPLY is set to disabled\n",
+                       __func__);
+               return;
+       }
+       get_platform_config_field(
+               ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE,
+               rx_preset_index, RX_PRESET_TABLE_QSFP_RX_CDR,
+               &rx_preset, 4);
+
+       /* Expand cdr setting to all 4 lanes */
+       rx_preset = (rx_preset | (rx_preset << 1) |
+                       (rx_preset << 2) | (rx_preset << 3));
+
+       if (rx_preset) {
+               *cdr_ctrl_byte |= rx_preset;
+       } else {
+               *cdr_ctrl_byte &= rx_preset;
+               /* Preserve current TX CDR status */
+               *cdr_ctrl_byte |= (cache[QSFP_CDR_CTRL_BYTE_OFFS] & 0xF0);
+       }
+}
+
+static void apply_tx_cdr(struct hfi1_pportdata *ppd,
+                        u32 tx_preset_index,
+                        u8 *cdr_ctrl_byte)
+{
+       u32 tx_preset;
+       u8 *cache = ppd->qsfp_info.cache;
+       int cable_power_class;
+
+       if (!((cache[QSFP_MOD_PWR_OFFS] & 0x8) &&
+             (cache[QSFP_CDR_INFO_OFFS] & 0x80)))
+               return;
+
+       /* TX CDR present, bypass supported */
+       cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
+
+       if (cable_power_class <= QSFP_POWER_CLASS_3) {
+               /* Power class <= 3, ignore config & turn TX CDR on */
+               *cdr_ctrl_byte |= 0xF0;
+               return;
+       }
+
+       get_platform_config_field(
+               ppd->dd,
+               PLATFORM_CONFIG_TX_PRESET_TABLE, tx_preset_index,
+               TX_PRESET_TABLE_QSFP_TX_CDR_APPLY, &tx_preset, 4);
+
+       if (!tx_preset) {
+               dd_dev_info(
+                       ppd->dd,
+                       "%s: TX_CDR_APPLY is set to disabled\n",
+                       __func__);
+               return;
+       }
+       get_platform_config_field(
+               ppd->dd,
+               PLATFORM_CONFIG_TX_PRESET_TABLE,
+               tx_preset_index,
+               TX_PRESET_TABLE_QSFP_TX_CDR, &tx_preset, 4);
+
+       /* Expand cdr setting to all 4 lanes */
+       tx_preset = (tx_preset | (tx_preset << 1) |
+                       (tx_preset << 2) | (tx_preset << 3));
+
+       if (tx_preset)
+               *cdr_ctrl_byte |= (tx_preset << 4);
+       else
+               /* Preserve current/determined RX CDR status */
+               *cdr_ctrl_byte &= ((tx_preset << 4) | 0xF);
+}
+
+static void apply_cdr_settings(
+               struct hfi1_pportdata *ppd, u32 rx_preset_index,
+               u32 tx_preset_index)
+{
+       u8 *cache = ppd->qsfp_info.cache;
+       u8 cdr_ctrl_byte = cache[QSFP_CDR_CTRL_BYTE_OFFS];
+
+       apply_rx_cdr(ppd, rx_preset_index, &cdr_ctrl_byte);
+
+       apply_tx_cdr(ppd, tx_preset_index, &cdr_ctrl_byte);
+
+       qsfp_write(ppd, ppd->dd->hfi1_id, QSFP_CDR_CTRL_BYTE_OFFS,
+                  &cdr_ctrl_byte, 1);
+}
+
+static void apply_tx_eq_auto(struct hfi1_pportdata *ppd)
+{
+       u8 *cache = ppd->qsfp_info.cache;
+       u8 tx_eq;
+
+       if (!(cache[QSFP_EQ_INFO_OFFS] & 0x8))
+               return;
+       /* Disable adaptive TX EQ if present */
+       tx_eq = cache[(128 * 3) + 241];
+       tx_eq &= 0xF0;
+       qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 241, &tx_eq, 1);
+}
+
+static void apply_tx_eq_prog(struct hfi1_pportdata *ppd, u32 tx_preset_index)
+{
+       u8 *cache = ppd->qsfp_info.cache;
+       u32 tx_preset;
+       u8 tx_eq;
+
+       if (!(cache[QSFP_EQ_INFO_OFFS] & 0x4))
+               return;
+
+       get_platform_config_field(
+               ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE,
+               tx_preset_index, TX_PRESET_TABLE_QSFP_TX_EQ_APPLY,
+               &tx_preset, 4);
+       if (!tx_preset) {
+               dd_dev_info(
+                       ppd->dd,
+                       "%s: TX_EQ_APPLY is set to disabled\n",
+                       __func__);
+               return;
+       }
+       get_platform_config_field(
+                       ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE,
+                       tx_preset_index, TX_PRESET_TABLE_QSFP_TX_EQ,
+                       &tx_preset, 4);
+
+       if (((cache[(128 * 3) + 224] & 0xF0) >> 4) < tx_preset) {
+               dd_dev_info(
+                       ppd->dd,
+                       "%s: TX EQ %x unsupported\n",
+                       __func__, tx_preset);
+
+               dd_dev_info(
+                       ppd->dd,
+                       "%s: Applying EQ %x\n",
+                       __func__, cache[608] & 0xF0);
+
+               tx_preset = (cache[608] & 0xF0) >> 4;
+       }
+
+       tx_eq = tx_preset | (tx_preset << 4);
+       qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 234, &tx_eq, 1);
+       qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 235, &tx_eq, 1);
+}
+
+static void apply_rx_eq_emp(struct hfi1_pportdata *ppd, u32 rx_preset_index)
+{
+       u32 rx_preset;
+       u8 rx_eq, *cache = ppd->qsfp_info.cache;
+
+       if (!(cache[QSFP_EQ_INFO_OFFS] & 0x2))
+               return;
+       get_platform_config_field(
+                       ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE,
+                       rx_preset_index, RX_PRESET_TABLE_QSFP_RX_EMP_APPLY,
+                       &rx_preset, 4);
+
+       if (!rx_preset) {
+               dd_dev_info(
+                       ppd->dd,
+                       "%s: RX_EMP_APPLY is set to disabled\n",
+                       __func__);
+               return;
+       }
+       get_platform_config_field(
+               ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE,
+               rx_preset_index, RX_PRESET_TABLE_QSFP_RX_EMP,
+               &rx_preset, 4);
+
+       if ((cache[(128 * 3) + 224] & 0xF) < rx_preset) {
+               dd_dev_info(
+                       ppd->dd,
+                       "%s: Requested RX EMP %x\n",
+                       __func__, rx_preset);
+
+               dd_dev_info(
+                       ppd->dd,
+                       "%s: Applying supported EMP %x\n",
+                       __func__, cache[608] & 0xF);
+
+               rx_preset = cache[608] & 0xF;
+       }
+
+       rx_eq = rx_preset | (rx_preset << 4);
+
+       qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 236, &rx_eq, 1);
+       qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 237, &rx_eq, 1);
+}
+
+static void apply_eq_settings(struct hfi1_pportdata *ppd,
+                             u32 rx_preset_index, u32 tx_preset_index)
+{
+       u8 *cache = ppd->qsfp_info.cache;
+
+       /* no point going on w/o a page 3 */
+       if (cache[2] & 4) {
+               dd_dev_info(ppd->dd,
+                           "%s: Upper page 03 not present\n",
+                           __func__);
+               return;
+       }
+
+       apply_tx_eq_auto(ppd);
+
+       apply_tx_eq_prog(ppd, tx_preset_index);
+
+       apply_rx_eq_emp(ppd, rx_preset_index);
+}
+
+static void apply_rx_amplitude_settings(
+               struct hfi1_pportdata *ppd, u32 rx_preset_index,
+               u32 tx_preset_index)
+{
+       u32 rx_preset;
+       u8 rx_amp = 0, i = 0, preferred = 0, *cache = ppd->qsfp_info.cache;
+
+       /* no point going on w/o a page 3 */
+       if (cache[2] & 4) {
+               dd_dev_info(ppd->dd,
+                           "%s: Upper page 03 not present\n",
+                           __func__);
+               return;
+       }
+       if (!(cache[QSFP_EQ_INFO_OFFS] & 0x1)) {
+               dd_dev_info(ppd->dd,
+                           "%s: RX_AMP_APPLY is set to disabled\n",
+                           __func__);
+               return;
+       }
+
+       get_platform_config_field(ppd->dd,
+                                 PLATFORM_CONFIG_RX_PRESET_TABLE,
+                                 rx_preset_index,
+                                 RX_PRESET_TABLE_QSFP_RX_AMP_APPLY,
+                                 &rx_preset, 4);
+
+       if (!rx_preset) {
+               dd_dev_info(ppd->dd,
+                           "%s: RX_AMP_APPLY is set to disabled\n",
+                           __func__);
+               return;
+       }
+       get_platform_config_field(ppd->dd,
+                                 PLATFORM_CONFIG_RX_PRESET_TABLE,
+                                 rx_preset_index,
+                                 RX_PRESET_TABLE_QSFP_RX_AMP,
+                                 &rx_preset, 4);
+
+       dd_dev_info(ppd->dd,
+                   "%s: Requested RX AMP %x\n",
+                   __func__,
+                   rx_preset);
+
+       for (i = 0; i < 4; i++) {
+               if (cache[(128 * 3) + 225] & (1 << i)) {
+                       preferred = i;
+                       if (preferred == rx_preset)
+                               break;
+               }
+       }
+
+       /*
+        * Verify that preferred RX amplitude is not just a
+        * fall through of the default
+        */
+       if (!preferred && !(cache[(128 * 3) + 225] & 0x1)) {
+               dd_dev_info(ppd->dd, "No supported RX AMP, not applying\n");
+               return;
+       }
+
+       dd_dev_info(ppd->dd,
+                   "%s: Applying RX AMP %x\n", __func__, preferred);
+
+       rx_amp = preferred | (preferred << 4);
+       qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 238, &rx_amp, 1);
+       qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 239, &rx_amp, 1);
+}
+
+#define OPA_INVALID_INDEX 0xFFF
+
+static void apply_tx_lanes(struct hfi1_pportdata *ppd, u8 field_id,
+                          u32 config_data, const char *message)
+{
+       u8 i;
+       int ret = HCMD_SUCCESS;
+
+       for (i = 0; i < 4; i++) {
+               ret = load_8051_config(ppd->dd, field_id, i, config_data);
+               if (ret != HCMD_SUCCESS) {
+                       dd_dev_err(
+                               ppd->dd,
+                               "%s: %s for lane %u failed\n",
+                               message, __func__, i);
+               }
+       }
+}
+
+static void apply_tunings(
+               struct hfi1_pportdata *ppd, u32 tx_preset_index,
+               u8 tuning_method, u32 total_atten, u8 limiting_active)
+{
+       int ret = 0;
+       u32 config_data = 0, tx_preset = 0;
+       u8 precur = 0, attn = 0, postcur = 0, external_device_config = 0;
+       u8 *cache = ppd->qsfp_info.cache;
+
+       /* Enable external device config if channel is limiting active */
+       read_8051_config(ppd->dd, LINK_OPTIMIZATION_SETTINGS,
+                        GENERAL_CONFIG, &config_data);
+       config_data &= ~(0xff << ENABLE_EXT_DEV_CONFIG_SHIFT);
+       config_data |= ((u32)limiting_active << ENABLE_EXT_DEV_CONFIG_SHIFT);
+       ret = load_8051_config(ppd->dd, LINK_OPTIMIZATION_SETTINGS,
+                              GENERAL_CONFIG, config_data);
+       if (ret != HCMD_SUCCESS)
+               dd_dev_err(
+                       ppd->dd,
+                       "%s: Failed to set enable external device config\n",
+                       __func__);
+
+       config_data = 0; /* re-init  */
+       /* Pass tuning method to 8051 */
+       read_8051_config(ppd->dd, LINK_TUNING_PARAMETERS, GENERAL_CONFIG,
+                        &config_data);
+       config_data &= ~(0xff << TUNING_METHOD_SHIFT);
+       config_data |= ((u32)tuning_method << TUNING_METHOD_SHIFT);
+       ret = load_8051_config(ppd->dd, LINK_TUNING_PARAMETERS, GENERAL_CONFIG,
+                              config_data);
+       if (ret != HCMD_SUCCESS)
+               dd_dev_err(ppd->dd, "%s: Failed to set tuning method\n",
+                          __func__);
+
+       /* Set same channel loss for both TX and RX */
+       config_data = 0 | (total_atten << 16) | (total_atten << 24);
+       apply_tx_lanes(ppd, CHANNEL_LOSS_SETTINGS, config_data,
+                      "Setting channel loss");
+
+       /* Inform 8051 of cable capabilities */
+       if (ppd->qsfp_info.cache_valid) {
+               external_device_config =
+                       ((cache[QSFP_MOD_PWR_OFFS] & 0x4) << 3) |
+                       ((cache[QSFP_MOD_PWR_OFFS] & 0x8) << 2) |
+                       ((cache[QSFP_EQ_INFO_OFFS] & 0x2) << 1) |
+                       (cache[QSFP_EQ_INFO_OFFS] & 0x4);
+               ret = read_8051_config(ppd->dd, DC_HOST_COMM_SETTINGS,
+                                      GENERAL_CONFIG, &config_data);
+               /* Clear, then set the external device config field */
+               config_data &= ~(u32)0xFF;
+               config_data |= external_device_config;
+               ret = load_8051_config(ppd->dd, DC_HOST_COMM_SETTINGS,
+                                      GENERAL_CONFIG, config_data);
+               if (ret != HCMD_SUCCESS)
+                       dd_dev_info(ppd->dd,
+                                   "%s: Failed to set ext device config params\n",
+                                   __func__);
+       }
+
+       if (tx_preset_index == OPA_INVALID_INDEX) {
+               if (ppd->port_type == PORT_TYPE_QSFP && limiting_active)
+                       dd_dev_info(ppd->dd, "%s: Invalid Tx preset index\n",
+                                   __func__);
+               return;
+       }
+
+       /* Following for limiting active channels only */
+       get_platform_config_field(
+               ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE, tx_preset_index,
+               TX_PRESET_TABLE_PRECUR, &tx_preset, 4);
+       precur = tx_preset;
+
+       get_platform_config_field(
+               ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE,
+               tx_preset_index, TX_PRESET_TABLE_ATTN, &tx_preset, 4);
+       attn = tx_preset;
+
+       get_platform_config_field(
+               ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE,
+               tx_preset_index, TX_PRESET_TABLE_POSTCUR, &tx_preset, 4);
+       postcur = tx_preset;
+
+       config_data = precur | (attn << 8) | (postcur << 16);
+
+       apply_tx_lanes(ppd, TX_EQ_SETTINGS, config_data,
+                      "Applying TX settings");
+}
+
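A minimal standalone sketch of the config_data packing used above: apply_tunings() places the precursor in byte 0, the attenuation in byte 1 and the postcursor in byte 2 before handing the word to apply_tx_lanes(). The helper name pack_tx_eq() is illustrative only, not a driver symbol.

#include <stdint.h>
#include <stdio.h>

/* Pack precursor, attenuation and postcursor into the byte layout of
 * config_data: precur in bits 0-7, attn in bits 8-15, postcur in 16-23. */
static uint32_t pack_tx_eq(uint8_t precur, uint8_t attn, uint8_t postcur)
{
        return (uint32_t)precur | ((uint32_t)attn << 8) |
               ((uint32_t)postcur << 16);
}

int main(void)
{
        uint32_t word = pack_tx_eq(0x11, 0x22, 0x33);

        /* Prints 0x00332211: postcur, attn, precur from the high byte down. */
        printf("packed TX EQ word: 0x%08x\n", (unsigned)word);
        return 0;
}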
+/* Must be holding the QSFP i2c resource */
+static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,
+                           u32 *ptr_rx_preset, u32 *ptr_total_atten)
+{
+       int ret;
+       u16 lss = ppd->link_speed_supported, lse = ppd->link_speed_enabled;
+       u8 *cache = ppd->qsfp_info.cache;
+
+       ppd->qsfp_info.limiting_active = 1;
+
+       ret = set_qsfp_tx(ppd, 0);
+       if (ret)
+               return ret;
+
+       ret = qual_power(ppd);
+       if (ret)
+               return ret;
+
+       ret = qual_bitrate(ppd);
+       if (ret)
+               return ret;
+
+       if (ppd->qsfp_info.reset_needed) {
+               reset_qsfp(ppd);
+               ppd->qsfp_info.reset_needed = 0;
+               refresh_qsfp_cache(ppd, &ppd->qsfp_info);
+       } else {
+               ppd->qsfp_info.reset_needed = 1;
+       }
+
+       ret = set_qsfp_high_power(ppd);
+       if (ret)
+               return ret;
+
+       if (cache[QSFP_EQ_INFO_OFFS] & 0x4) {
+               ret = get_platform_config_field(
+                       ppd->dd,
+                       PLATFORM_CONFIG_PORT_TABLE, 0,
+                       PORT_TABLE_TX_PRESET_IDX_ACTIVE_EQ,
+                       ptr_tx_preset, 4);
+               if (ret) {
+                       *ptr_tx_preset = OPA_INVALID_INDEX;
+                       return ret;
+               }
+       } else {
+               ret = get_platform_config_field(
+                       ppd->dd,
+                       PLATFORM_CONFIG_PORT_TABLE, 0,
+                       PORT_TABLE_TX_PRESET_IDX_ACTIVE_NO_EQ,
+                       ptr_tx_preset, 4);
+               if (ret) {
+                       *ptr_tx_preset = OPA_INVALID_INDEX;
+                       return ret;
+               }
+       }
+
+       ret = get_platform_config_field(
+               ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
+               PORT_TABLE_RX_PRESET_IDX, ptr_rx_preset, 4);
+       if (ret) {
+               *ptr_rx_preset = OPA_INVALID_INDEX;
+               return ret;
+       }
+
+       if ((lss & OPA_LINK_SPEED_25G) && (lse & OPA_LINK_SPEED_25G))
+               get_platform_config_field(
+                       ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
+                       PORT_TABLE_LOCAL_ATTEN_25G, ptr_total_atten, 4);
+       else if ((lss & OPA_LINK_SPEED_12_5G) && (lse & OPA_LINK_SPEED_12_5G))
+               get_platform_config_field(
+                       ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
+                       PORT_TABLE_LOCAL_ATTEN_12G, ptr_total_atten, 4);
+
+       apply_cdr_settings(ppd, *ptr_rx_preset, *ptr_tx_preset);
+
+       apply_eq_settings(ppd, *ptr_rx_preset, *ptr_tx_preset);
+
+       apply_rx_amplitude_settings(ppd, *ptr_rx_preset, *ptr_tx_preset);
+
+       ret = set_qsfp_tx(ppd, 1);
+
+       return ret;
+}
+
+static int tune_qsfp(struct hfi1_pportdata *ppd,
+                    u32 *ptr_tx_preset, u32 *ptr_rx_preset,
+                    u8 *ptr_tuning_method, u32 *ptr_total_atten)
+{
+       u32 cable_atten = 0, remote_atten = 0, platform_atten = 0;
+       u16 lss = ppd->link_speed_supported, lse = ppd->link_speed_enabled;
+       int ret = 0;
+       u8 *cache = ppd->qsfp_info.cache;
+
+       switch ((cache[QSFP_MOD_TECH_OFFS] & 0xF0) >> 4) {
+       case 0xA ... 0xB:
+               ret = get_platform_config_field(
+                       ppd->dd,
+                       PLATFORM_CONFIG_PORT_TABLE, 0,
+                       PORT_TABLE_LOCAL_ATTEN_25G,
+                       &platform_atten, 4);
+               if (ret)
+                       return ret;
+
+               if ((lss & OPA_LINK_SPEED_25G) && (lse & OPA_LINK_SPEED_25G))
+                       cable_atten = cache[QSFP_CU_ATTEN_12G_OFFS];
+               else if ((lss & OPA_LINK_SPEED_12_5G) &&
+                        (lse & OPA_LINK_SPEED_12_5G))
+                       cable_atten = cache[QSFP_CU_ATTEN_7G_OFFS];
+
+               /* Fallback to configured attenuation if cable memory is bad */
+               if (cable_atten == 0 || cable_atten > 36) {
+                       ret = get_platform_config_field(
+                               ppd->dd,
+                               PLATFORM_CONFIG_SYSTEM_TABLE, 0,
+                               SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_25G,
+                               &cable_atten, 4);
+                       if (ret)
+                               return ret;
+               }
+
+               ret = get_platform_config_field(
+                       ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
+                       PORT_TABLE_REMOTE_ATTEN_25G, &remote_atten, 4);
+               if (ret)
+                       return ret;
+
+               *ptr_total_atten = platform_atten + cable_atten + remote_atten;
+
+               *ptr_tuning_method = OPA_PASSIVE_TUNING;
+               break;
+       case 0x0 ... 0x9: /* fallthrough */
+       case 0xC: /* fallthrough */
+       case 0xE:
+               ret = tune_active_qsfp(ppd, ptr_tx_preset, ptr_rx_preset,
+                                      ptr_total_atten);
+               if (ret)
+                       return ret;
+
+               *ptr_tuning_method = OPA_ACTIVE_TUNING;
+               break;
+       case 0xD: /* fallthrough */
+       case 0xF:
+       default:
+               dd_dev_info(ppd->dd, "%s: Unknown/unsupported cable\n",
+                           __func__);
+               break;
+       }
+       return ret;
+}
+
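To make the cable classification in tune_qsfp() above easier to follow, here is a small user-space sketch of the same decision on the upper nibble of the QSFP transmitter-technology byte: 0xA-0xB select passive tuning, 0xD and 0xF are treated as unsupported, and the remaining encodings take the active path. The names and the enum are assumptions of this sketch, not driver definitions.

#include <stdint.h>
#include <stdio.h>

enum tuning { TUNE_PASSIVE, TUNE_ACTIVE, TUNE_UNSUPPORTED };

static enum tuning classify_tech(uint8_t tech_byte)
{
        switch ((tech_byte & 0xF0) >> 4) {
        case 0xA ... 0xB:       /* passive copper */
                return TUNE_PASSIVE;
        case 0xD:
        case 0xF:               /* unknown/unsupported */
                return TUNE_UNSUPPORTED;
        default:                /* 0x0-0x9, 0xC, 0xE: active cable */
                return TUNE_ACTIVE;
        }
}

int main(void)
{
        printf("0xA0 -> %d (passive)\n", classify_tech(0xA0));
        printf("0x00 -> %d (active)\n", classify_tech(0x00));
        printf("0xF0 -> %d (unsupported)\n", classify_tech(0xF0));
        return 0;
}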
+/*
+ * This function communicates its success or failure via ppd->driver_link_ready
+ * Thus, it depends on its association with start_link(...) which checks
+ * driver_link_ready before proceeding with the link negotiation and
+ * initialization process.
+ */
+void tune_serdes(struct hfi1_pportdata *ppd)
+{
+       int ret = 0;
+       u32 total_atten = 0;
+       u32 remote_atten = 0, platform_atten = 0;
+       u32 rx_preset_index, tx_preset_index;
+       u8 tuning_method = 0, limiting_active = 0;
+       struct hfi1_devdata *dd = ppd->dd;
+
+       rx_preset_index = OPA_INVALID_INDEX;
+       tx_preset_index = OPA_INVALID_INDEX;
+
+       /* the link defaults to enabled */
+       ppd->link_enabled = 1;
+       /* the driver link ready state defaults to not ready */
+       ppd->driver_link_ready = 0;
+       ppd->offline_disabled_reason = HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE);
+
+       /* Skip the tuning for testing (loopback != none) and simulations */
+       if (loopback != LOOPBACK_NONE ||
+           ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
+               ppd->driver_link_ready = 1;
+               return;
+       }
+
+       switch (ppd->port_type) {
+       case PORT_TYPE_DISCONNECTED:
+               ppd->offline_disabled_reason =
+                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_DISCONNECTED);
+               dd_dev_info(dd, "%s: Port disconnected, disabling port\n",
+                           __func__);
+               goto bail;
+       case PORT_TYPE_FIXED:
+               /* platform_atten, remote_atten pre-zeroed to catch error */
+               get_platform_config_field(
+                       ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
+                       PORT_TABLE_LOCAL_ATTEN_25G, &platform_atten, 4);
+
+               get_platform_config_field(
+                       ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
+                       PORT_TABLE_REMOTE_ATTEN_25G, &remote_atten, 4);
+
+               total_atten = platform_atten + remote_atten;
+
+               tuning_method = OPA_PASSIVE_TUNING;
+               break;
+       case PORT_TYPE_VARIABLE:
+               if (qsfp_mod_present(ppd)) {
+                       /*
+                        * platform_atten, remote_atten pre-zeroed to
+                        * catch error
+                        */
+                       get_platform_config_field(
+                               ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
+                               PORT_TABLE_LOCAL_ATTEN_25G,
+                               &platform_atten, 4);
+
+                       get_platform_config_field(
+                               ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
+                               PORT_TABLE_REMOTE_ATTEN_25G,
+                               &remote_atten, 4);
+
+                       total_atten = platform_atten + remote_atten;
+
+                       tuning_method = OPA_PASSIVE_TUNING;
+               } else {
+                       ppd->offline_disabled_reason =
+                            HFI1_ODR_MASK(OPA_LINKDOWN_REASON_CHASSIS_CONFIG);
+                       goto bail;
+               }
+               break;
+       case PORT_TYPE_QSFP:
+               if (qsfp_mod_present(ppd)) {
+                       ret = acquire_chip_resource(ppd->dd,
+                                                   qsfp_resource(ppd->dd),
+                                                   QSFP_WAIT);
+                       if (ret) {
+                               dd_dev_err(ppd->dd, "%s: hfi%d: cannot lock i2c chain\n",
+                                          __func__, (int)ppd->dd->hfi1_id);
+                               goto bail;
+                       }
+                       refresh_qsfp_cache(ppd, &ppd->qsfp_info);
+
+                       if (ppd->qsfp_info.cache_valid) {
+                               ret = tune_qsfp(ppd,
+                                               &tx_preset_index,
+                                               &rx_preset_index,
+                                               &tuning_method,
+                                               &total_atten);
+
+                               /*
+                                * We may have modified the QSFP memory, so
+                                * update the cache to reflect the changes
+                                */
+                               refresh_qsfp_cache(ppd, &ppd->qsfp_info);
+                               limiting_active =
+                                               ppd->qsfp_info.limiting_active;
+                       } else {
+                               dd_dev_err(dd,
+                                          "%s: Reading QSFP memory failed\n",
+                                          __func__);
+                               ret = -EINVAL; /* a fail indication */
+                       }
+                       release_chip_resource(ppd->dd, qsfp_resource(ppd->dd));
+                       if (ret)
+                               goto bail;
+               } else {
+                       ppd->offline_disabled_reason =
+                          HFI1_ODR_MASK(
+                               OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED);
+                       goto bail;
+               }
+               break;
+       default:
+               dd_dev_info(ppd->dd, "%s: Unknown port type\n", __func__);
+               ppd->port_type = PORT_TYPE_UNKNOWN;
+               tuning_method = OPA_UNKNOWN_TUNING;
+               total_atten = 0;
+               limiting_active = 0;
+               tx_preset_index = OPA_INVALID_INDEX;
+               break;
+       }
+
+       if (ppd->offline_disabled_reason ==
+                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE))
+               apply_tunings(ppd, tx_preset_index, tuning_method,
+                             total_atten, limiting_active);
+
+       if (!ret)
+               ppd->driver_link_ready = 1;
+
+       return;
+bail:
+       ppd->driver_link_ready = 0;
+}
diff --git a/drivers/infiniband/hw/hfi1/platform.h b/drivers/infiniband/hw/hfi1/platform.h
new file mode 100644 (file)
index 0000000..e2c2161
--- /dev/null
@@ -0,0 +1,305 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef __PLATFORM_H
+#define __PLATFORM_H
+
+#define METADATA_TABLE_FIELD_START_SHIFT               0
+#define METADATA_TABLE_FIELD_START_LEN_BITS            15
+#define METADATA_TABLE_FIELD_LEN_SHIFT                 16
+#define METADATA_TABLE_FIELD_LEN_LEN_BITS              16
+
+/* Header structure */
+#define PLATFORM_CONFIG_HEADER_RECORD_IDX_SHIFT                        0
+#define PLATFORM_CONFIG_HEADER_RECORD_IDX_LEN_BITS             6
+#define PLATFORM_CONFIG_HEADER_TABLE_LENGTH_SHIFT              16
+#define PLATFORM_CONFIG_HEADER_TABLE_LENGTH_LEN_BITS           12
+#define PLATFORM_CONFIG_HEADER_TABLE_TYPE_SHIFT                        28
+#define PLATFORM_CONFIG_HEADER_TABLE_TYPE_LEN_BITS             4
+
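The SHIFT/LEN_BITS pairs above describe bit-fields inside 32-bit words of the platform configuration image. A short standalone sketch of how such a pair is typically applied; extract_field() is a hypothetical helper, not a driver function.

#include <stdint.h>
#include <stdio.h>

static uint32_t extract_field(uint32_t word, unsigned int shift,
                              unsigned int len_bits)
{
        return (word >> shift) & ((1u << len_bits) - 1);
}

int main(void)
{
        /* Hypothetical header word: type 2, table length 0x30, record idx 5 */
        uint32_t header = (2u << 28) | (0x30u << 16) | 5u;

        printf("record idx   = %u\n", (unsigned)extract_field(header, 0, 6));
        printf("table length = %u\n", (unsigned)extract_field(header, 16, 12));
        printf("table type   = %u\n", (unsigned)extract_field(header, 28, 4));
        return 0;
}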
+enum platform_config_table_type_encoding {
+       PLATFORM_CONFIG_TABLE_RESERVED,
+       PLATFORM_CONFIG_SYSTEM_TABLE,
+       PLATFORM_CONFIG_PORT_TABLE,
+       PLATFORM_CONFIG_RX_PRESET_TABLE,
+       PLATFORM_CONFIG_TX_PRESET_TABLE,
+       PLATFORM_CONFIG_QSFP_ATTEN_TABLE,
+       PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE,
+       PLATFORM_CONFIG_TABLE_MAX
+};
+
+enum platform_config_system_table_fields {
+       SYSTEM_TABLE_RESERVED,
+       SYSTEM_TABLE_NODE_STRING,
+       SYSTEM_TABLE_SYSTEM_IMAGE_GUID,
+       SYSTEM_TABLE_NODE_GUID,
+       SYSTEM_TABLE_REVISION,
+       SYSTEM_TABLE_VENDOR_OUI,
+       SYSTEM_TABLE_META_VERSION,
+       SYSTEM_TABLE_DEVICE_ID,
+       SYSTEM_TABLE_PARTITION_ENFORCEMENT_CAP,
+       SYSTEM_TABLE_QSFP_POWER_CLASS_MAX,
+       SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_12G,
+       SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_25G,
+       SYSTEM_TABLE_VARIABLE_TABLE_ENTRIES_PER_PORT,
+       SYSTEM_TABLE_MAX
+};
+
+enum platform_config_port_table_fields {
+       PORT_TABLE_RESERVED,
+       PORT_TABLE_PORT_TYPE,
+       PORT_TABLE_LOCAL_ATTEN_12G,
+       PORT_TABLE_LOCAL_ATTEN_25G,
+       PORT_TABLE_LINK_SPEED_SUPPORTED,
+       PORT_TABLE_LINK_WIDTH_SUPPORTED,
+       PORT_TABLE_AUTO_LANE_SHEDDING_ENABLED,
+       PORT_TABLE_EXTERNAL_LOOPBACK_ALLOWED,
+       PORT_TABLE_VL_CAP,
+       PORT_TABLE_MTU_CAP,
+       PORT_TABLE_TX_LANE_ENABLE_MASK,
+       PORT_TABLE_LOCAL_MAX_TIMEOUT,
+       PORT_TABLE_REMOTE_ATTEN_12G,
+       PORT_TABLE_REMOTE_ATTEN_25G,
+       PORT_TABLE_TX_PRESET_IDX_ACTIVE_NO_EQ,
+       PORT_TABLE_TX_PRESET_IDX_ACTIVE_EQ,
+       PORT_TABLE_RX_PRESET_IDX,
+       PORT_TABLE_CABLE_REACH_CLASS,
+       PORT_TABLE_MAX
+};
+
+enum platform_config_rx_preset_table_fields {
+       RX_PRESET_TABLE_RESERVED,
+       RX_PRESET_TABLE_QSFP_RX_CDR_APPLY,
+       RX_PRESET_TABLE_QSFP_RX_EMP_APPLY,
+       RX_PRESET_TABLE_QSFP_RX_AMP_APPLY,
+       RX_PRESET_TABLE_QSFP_RX_CDR,
+       RX_PRESET_TABLE_QSFP_RX_EMP,
+       RX_PRESET_TABLE_QSFP_RX_AMP,
+       RX_PRESET_TABLE_MAX
+};
+
+enum platform_config_tx_preset_table_fields {
+       TX_PRESET_TABLE_RESERVED,
+       TX_PRESET_TABLE_PRECUR,
+       TX_PRESET_TABLE_ATTN,
+       TX_PRESET_TABLE_POSTCUR,
+       TX_PRESET_TABLE_QSFP_TX_CDR_APPLY,
+       TX_PRESET_TABLE_QSFP_TX_EQ_APPLY,
+       TX_PRESET_TABLE_QSFP_TX_CDR,
+       TX_PRESET_TABLE_QSFP_TX_EQ,
+       TX_PRESET_TABLE_MAX
+};
+
+enum platform_config_qsfp_attn_table_fields {
+       QSFP_ATTEN_TABLE_RESERVED,
+       QSFP_ATTEN_TABLE_TX_PRESET_IDX,
+       QSFP_ATTEN_TABLE_RX_PRESET_IDX,
+       QSFP_ATTEN_TABLE_MAX
+};
+
+enum platform_config_variable_settings_table_fields {
+       VARIABLE_SETTINGS_TABLE_RESERVED,
+       VARIABLE_SETTINGS_TABLE_TX_PRESET_IDX,
+       VARIABLE_SETTINGS_TABLE_RX_PRESET_IDX,
+       VARIABLE_SETTINGS_TABLE_MAX
+};
+
+struct platform_config {
+       size_t size;
+       const u8 *data;
+};
+
+struct platform_config_data {
+       u32 *table;
+       u32 *table_metadata;
+       u32 num_table;
+};
+
+/*
+ * This struct acts as a quick reference into the platform_data binary image
+ * and is populated by parse_platform_config(...) depending on the specific
+ * META_VERSION
+ */
+struct platform_config_cache {
+       u8  cache_valid;
+       struct platform_config_data config_tables[PLATFORM_CONFIG_TABLE_MAX];
+};
+
+static const u32 platform_config_table_limits[PLATFORM_CONFIG_TABLE_MAX] = {
+       0,
+       SYSTEM_TABLE_MAX,
+       PORT_TABLE_MAX,
+       RX_PRESET_TABLE_MAX,
+       TX_PRESET_TABLE_MAX,
+       QSFP_ATTEN_TABLE_MAX,
+       VARIABLE_SETTINGS_TABLE_MAX
+};
+
+/* This section defines default values and encodings for the
+ * fields defined for each table above
+ */
+
+/*
+ * =====================================================
+ *  System table encodings
+ * =====================================================
+ */
+#define PLATFORM_CONFIG_MAGIC_NUM              0x3d4f5041
+#define PLATFORM_CONFIG_MAGIC_NUMBER_LEN       4
+
+/*
+ * These power classes are the same as defined in SFF 8636 spec rev 2.4
+ * describing byte 129 in table 6-16, except enumerated in a different order
+ */
+enum platform_config_qsfp_power_class_encoding {
+       QSFP_POWER_CLASS_1 = 1,
+       QSFP_POWER_CLASS_2,
+       QSFP_POWER_CLASS_3,
+       QSFP_POWER_CLASS_4,
+       QSFP_POWER_CLASS_5,
+       QSFP_POWER_CLASS_6,
+       QSFP_POWER_CLASS_7
+};
+
+/*
+ * ====================================================
+ *  Port table encodings
+ * ====================================================
+ */
+enum platform_config_port_type_encoding {
+       PORT_TYPE_UNKNOWN,
+       PORT_TYPE_DISCONNECTED,
+       PORT_TYPE_FIXED,
+       PORT_TYPE_VARIABLE,
+       PORT_TYPE_QSFP,
+       PORT_TYPE_MAX
+};
+
+enum platform_config_link_speed_supported_encoding {
+       LINK_SPEED_SUPP_12G = 1,
+       LINK_SPEED_SUPP_25G,
+       LINK_SPEED_SUPP_12G_25G,
+       LINK_SPEED_SUPP_MAX
+};
+
+/*
+ * This is a subset (not strict) of the link downgrades
+ * supported. The link downgrades supported are expected
+ * to be supplied to the driver by another entity such as
+ * the fabric manager
+ */
+enum platform_config_link_width_supported_encoding {
+       LINK_WIDTH_SUPP_1X = 1,
+       LINK_WIDTH_SUPP_2X,
+       LINK_WIDTH_SUPP_2X_1X,
+       LINK_WIDTH_SUPP_3X,
+       LINK_WIDTH_SUPP_3X_1X,
+       LINK_WIDTH_SUPP_3X_2X,
+       LINK_WIDTH_SUPP_3X_2X_1X,
+       LINK_WIDTH_SUPP_4X,
+       LINK_WIDTH_SUPP_4X_1X,
+       LINK_WIDTH_SUPP_4X_2X,
+       LINK_WIDTH_SUPP_4X_2X_1X,
+       LINK_WIDTH_SUPP_4X_3X,
+       LINK_WIDTH_SUPP_4X_3X_1X,
+       LINK_WIDTH_SUPP_4X_3X_2X,
+       LINK_WIDTH_SUPP_4X_3X_2X_1X,
+       LINK_WIDTH_SUPP_MAX
+};
+
+enum platform_config_virtual_lane_capability_encoding {
+       VL_CAP_VL0 = 1,
+       VL_CAP_VL0_1,
+       VL_CAP_VL0_2,
+       VL_CAP_VL0_3,
+       VL_CAP_VL0_4,
+       VL_CAP_VL0_5,
+       VL_CAP_VL0_6,
+       VL_CAP_VL0_7,
+       VL_CAP_VL0_8,
+       VL_CAP_VL0_9,
+       VL_CAP_VL0_10,
+       VL_CAP_VL0_11,
+       VL_CAP_VL0_12,
+       VL_CAP_VL0_13,
+       VL_CAP_VL0_14,
+       VL_CAP_MAX
+};
+
+/* Max MTU */
+enum platform_config_mtu_capability_encoding {
+       MTU_CAP_256   = 1,
+       MTU_CAP_512   = 2,
+       MTU_CAP_1024  = 3,
+       MTU_CAP_2048  = 4,
+       MTU_CAP_4096  = 5,
+       MTU_CAP_8192  = 6,
+       MTU_CAP_10240 = 7
+};
+
+enum platform_config_local_max_timeout_encoding {
+       LOCAL_MAX_TIMEOUT_10_MS = 1,
+       LOCAL_MAX_TIMEOUT_100_MS,
+       LOCAL_MAX_TIMEOUT_1_S,
+       LOCAL_MAX_TIMEOUT_10_S,
+       LOCAL_MAX_TIMEOUT_100_S,
+       LOCAL_MAX_TIMEOUT_1000_S
+};
+
+enum link_tuning_encoding {
+       OPA_PASSIVE_TUNING,
+       OPA_ACTIVE_TUNING,
+       OPA_UNKNOWN_TUNING
+};
+
+/* platform.c */
+void get_platform_config(struct hfi1_devdata *dd);
+void free_platform_config(struct hfi1_devdata *dd);
+void get_port_type(struct hfi1_pportdata *ppd);
+int set_qsfp_tx(struct hfi1_pportdata *ppd, int on);
+void tune_serdes(struct hfi1_pportdata *ppd);
+
+#endif                 /*__PLATFORM_H*/
diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c
new file mode 100644 (file)
index 0000000..1a942ff
--- /dev/null
@@ -0,0 +1,974 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/vmalloc.h>
+#include <linux/hash.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <rdma/rdma_vt.h>
+#include <rdma/rdmavt_qp.h>
+
+#include "hfi.h"
+#include "qp.h"
+#include "trace.h"
+#include "verbs_txreq.h"
+
+unsigned int hfi1_qp_table_size = 256;
+module_param_named(qp_table_size, hfi1_qp_table_size, uint, S_IRUGO);
+MODULE_PARM_DESC(qp_table_size, "QP table size");
+
+static void flush_tx_list(struct rvt_qp *qp);
+static int iowait_sleep(
+       struct sdma_engine *sde,
+       struct iowait *wait,
+       struct sdma_txreq *stx,
+       unsigned seq);
+static void iowait_wakeup(struct iowait *wait, int reason);
+static void iowait_sdma_drained(struct iowait *wait);
+static void qp_pio_drain(struct rvt_qp *qp);
+
+static inline unsigned mk_qpn(struct rvt_qpn_table *qpt,
+                             struct rvt_qpn_map *map, unsigned off)
+{
+       return (map - qpt->map) * RVT_BITS_PER_PAGE + off;
+}
+
+/*
+ * Convert the AETH credit code into the number of credits.
+ */
+static const u16 credit_table[31] = {
+       0,                      /* 0 */
+       1,                      /* 1 */
+       2,                      /* 2 */
+       3,                      /* 3 */
+       4,                      /* 4 */
+       6,                      /* 5 */
+       8,                      /* 6 */
+       12,                     /* 7 */
+       16,                     /* 8 */
+       24,                     /* 9 */
+       32,                     /* A */
+       48,                     /* B */
+       64,                     /* C */
+       96,                     /* D */
+       128,                    /* E */
+       192,                    /* F */
+       256,                    /* 10 */
+       384,                    /* 11 */
+       512,                    /* 12 */
+       768,                    /* 13 */
+       1024,                   /* 14 */
+       1536,                   /* 15 */
+       2048,                   /* 16 */
+       3072,                   /* 17 */
+       4096,                   /* 18 */
+       6144,                   /* 19 */
+       8192,                   /* 1A */
+       12288,                  /* 1B */
+       16384,                  /* 1C */
+       24576,                  /* 1D */
+       32768                   /* 1E */
+};
+
+static void flush_tx_list(struct rvt_qp *qp)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+
+       while (!list_empty(&priv->s_iowait.tx_head)) {
+               struct sdma_txreq *tx;
+
+               tx = list_first_entry(
+                       &priv->s_iowait.tx_head,
+                       struct sdma_txreq,
+                       list);
+               list_del_init(&tx->list);
+               hfi1_put_txreq(
+                       container_of(tx, struct verbs_txreq, txreq));
+       }
+}
+
+static void flush_iowait(struct rvt_qp *qp)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+       struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+       unsigned long flags;
+
+       write_seqlock_irqsave(&dev->iowait_lock, flags);
+       if (!list_empty(&priv->s_iowait.list)) {
+               list_del_init(&priv->s_iowait.list);
+               if (atomic_dec_and_test(&qp->refcount))
+                       wake_up(&qp->wait);
+       }
+       write_sequnlock_irqrestore(&dev->iowait_lock, flags);
+}
+
+static inline int opa_mtu_enum_to_int(int mtu)
+{
+       switch (mtu) {
+       case OPA_MTU_8192:  return 8192;
+       case OPA_MTU_10240: return 10240;
+       default:            return -1;
+       }
+}
+
+/**
+ * This function is what we would push to the core layer if we wanted to be a
+ * "first class citizen".  Instead we hide this here and rely on Verbs ULPs
+ * to blindly pass the MTU enum value from the PathRecord to us.
+ */
+static inline int verbs_mtu_enum_to_int(struct ib_device *dev, enum ib_mtu mtu)
+{
+       int val;
+
+       /* Constraining 10KB packets to 8KB packets */
+       if (mtu == (enum ib_mtu)OPA_MTU_10240)
+               mtu = OPA_MTU_8192;
+       val = opa_mtu_enum_to_int((int)mtu);
+       if (val > 0)
+               return val;
+       return ib_mtu_enum_to_int(mtu);
+}
+
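A rough user-space sketch of the translation above: OPA extends the IB MTU enumeration with 8K and 10K entries, and a 10K request is clamped to 8K before conversion. The numeric enum values used here (256-byte MTU = 1 through 10240-byte MTU = 7) mirror the usual IB/OPA encoding but are an assumption of this sketch.

#include <stdio.h>

enum mtu_enc {
        MTU_256 = 1, MTU_512, MTU_1024, MTU_2048, MTU_4096,
        MTU_8192, MTU_10240
};

static int mtu_enum_to_bytes(int mtu)
{
        if (mtu == MTU_10240)           /* constrain 10KB packets to 8KB */
                mtu = MTU_8192;
        switch (mtu) {
        case MTU_256:   return 256;
        case MTU_512:   return 512;
        case MTU_1024:  return 1024;
        case MTU_2048:  return 2048;
        case MTU_4096:  return 4096;
        case MTU_8192:  return 8192;
        default:        return -1;
        }
}

int main(void)
{
        printf("enum %d -> %d bytes\n", MTU_10240, mtu_enum_to_bytes(MTU_10240));
        printf("enum %d -> %d bytes\n", MTU_2048, mtu_enum_to_bytes(MTU_2048));
        return 0;
}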
+int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
+                        int attr_mask, struct ib_udata *udata)
+{
+       struct ib_qp *ibqp = &qp->ibqp;
+       struct hfi1_ibdev *dev = to_idev(ibqp->device);
+       struct hfi1_devdata *dd = dd_from_dev(dev);
+       u8 sc;
+
+       if (attr_mask & IB_QP_AV) {
+               sc = ah_to_sc(ibqp->device, &attr->ah_attr);
+               if (sc == 0xf)
+                       return -EINVAL;
+
+               if (!qp_to_sdma_engine(qp, sc) &&
+                   dd->flags & HFI1_HAS_SEND_DMA)
+                       return -EINVAL;
+
+               if (!qp_to_send_context(qp, sc))
+                       return -EINVAL;
+       }
+
+       if (attr_mask & IB_QP_ALT_PATH) {
+               sc = ah_to_sc(ibqp->device, &attr->alt_ah_attr);
+               if (sc == 0xf)
+                       return -EINVAL;
+
+               if (!qp_to_sdma_engine(qp, sc) &&
+                   dd->flags & HFI1_HAS_SEND_DMA)
+                       return -EINVAL;
+
+               if (!qp_to_send_context(qp, sc))
+                       return -EINVAL;
+       }
+
+       return 0;
+}
+
+void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
+                   int attr_mask, struct ib_udata *udata)
+{
+       struct ib_qp *ibqp = &qp->ibqp;
+       struct hfi1_qp_priv *priv = qp->priv;
+
+       if (attr_mask & IB_QP_AV) {
+               priv->s_sc = ah_to_sc(ibqp->device, &qp->remote_ah_attr);
+               priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc);
+               priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc);
+       }
+
+       if (attr_mask & IB_QP_PATH_MIG_STATE &&
+           attr->path_mig_state == IB_MIG_MIGRATED &&
+           qp->s_mig_state == IB_MIG_ARMED) {
+               qp->s_flags |= RVT_S_AHG_CLEAR;
+               priv->s_sc = ah_to_sc(ibqp->device, &qp->remote_ah_attr);
+               priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc);
+               priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc);
+       }
+}
+
+/**
+ * hfi1_check_send_wqe - validate wqe
+ * @qp: the QP
+ * @wqe: the built WQE
+ *
+ * validate wqe.  This is called
+ * prior to inserting the wqe into
+ * the ring but after the wqe has been
+ * set up.
+ *
+ * Returns 0 on success, -EINVAL on failure
+ *
+ */
+int hfi1_check_send_wqe(struct rvt_qp *qp,
+                       struct rvt_swqe *wqe)
+{
+       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+       struct rvt_ah *ah;
+
+       switch (qp->ibqp.qp_type) {
+       case IB_QPT_RC:
+       case IB_QPT_UC:
+               if (wqe->length > 0x80000000U)
+                       return -EINVAL;
+               break;
+       case IB_QPT_SMI:
+               ah = ibah_to_rvtah(wqe->ud_wr.ah);
+               if (wqe->length > (1 << ah->log_pmtu))
+                       return -EINVAL;
+               break;
+       case IB_QPT_GSI:
+       case IB_QPT_UD:
+               ah = ibah_to_rvtah(wqe->ud_wr.ah);
+               if (wqe->length > (1 << ah->log_pmtu))
+                       return -EINVAL;
+               if (ibp->sl_to_sc[ah->attr.sl] == 0xf)
+                       return -EINVAL;
+       default:
+               break;
+       }
+       return wqe->length <= piothreshold;
+}
+
+/**
+ * hfi1_compute_aeth - compute the AETH (syndrome + MSN)
+ * @qp: the queue pair to compute the AETH for
+ *
+ * Returns the AETH.
+ */
+__be32 hfi1_compute_aeth(struct rvt_qp *qp)
+{
+       u32 aeth = qp->r_msn & HFI1_MSN_MASK;
+
+       if (qp->ibqp.srq) {
+               /*
+                * Shared receive queues don't generate credits.
+                * Set the credit field to the invalid value.
+                */
+               aeth |= HFI1_AETH_CREDIT_INVAL << HFI1_AETH_CREDIT_SHIFT;
+       } else {
+               u32 min, max, x;
+               u32 credits;
+               struct rvt_rwq *wq = qp->r_rq.wq;
+               u32 head;
+               u32 tail;
+
+               /* sanity check pointers before trusting them */
+               head = wq->head;
+               if (head >= qp->r_rq.size)
+                       head = 0;
+               tail = wq->tail;
+               if (tail >= qp->r_rq.size)
+                       tail = 0;
+               /*
+                * Compute the number of credits available (RWQEs).
+                * There is a small chance that the pair of reads are
+                * not atomic, which is OK, since the fuzziness is
+                * resolved as further ACKs go out.
+                */
+               credits = head - tail;
+               if ((int)credits < 0)
+                       credits += qp->r_rq.size;
+               /*
+                * Binary search the credit table to find the code to
+                * use.
+                */
+               min = 0;
+               max = 31;
+               for (;;) {
+                       x = (min + max) / 2;
+                       if (credit_table[x] == credits)
+                               break;
+                       if (credit_table[x] > credits) {
+                               max = x;
+                       } else {
+                               if (min == x)
+                                       break;
+                               min = x;
+                       }
+               }
+               aeth |= x << HFI1_AETH_CREDIT_SHIFT;
+       }
+       return cpu_to_be32(aeth);
+}
+
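The binary search in hfi1_compute_aeth() above maps the number of available receive WQEs onto the 5-bit AETH credit code. A standalone sketch of the same lookup, with the table copied from credit_table earlier in this file; for counts that fall between table entries it settles on the largest code whose value does not exceed the count.

#include <stdint.h>
#include <stdio.h>

static const uint16_t credit_table[31] = {
        0, 1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192,
        256, 384, 512, 768, 1024, 1536, 2048, 3072, 4096, 6144, 8192,
        12288, 16384, 24576, 32768
};

static uint32_t credits_to_code(uint32_t credits)
{
        uint32_t min = 0, max = 31, x;

        for (;;) {
                x = (min + max) / 2;
                if (credit_table[x] == credits)
                        break;
                if (credit_table[x] > credits) {
                        max = x;
                } else {
                        if (min == x)
                                break;
                        min = x;
                }
        }
        return x;
}

int main(void)
{
        /* 24 is an exact table entry; 100 falls between 96 and 128. */
        printf("credits=24  -> code %u\n", (unsigned)credits_to_code(24));   /* 9 */
        printf("credits=100 -> code %u\n", (unsigned)credits_to_code(100));  /* 13 */
        return 0;
}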
+/**
+ * _hfi1_schedule_send - schedule progress
+ * @qp: the QP
+ *
+ * This schedules qp progress w/o regard to the s_flags.
+ *
+ * It is only used in the post send, which doesn't hold
+ * the s_lock.
+ */
+void _hfi1_schedule_send(struct rvt_qp *qp)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+       struct hfi1_ibport *ibp =
+               to_iport(qp->ibqp.device, qp->port_num);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+
+       iowait_schedule(&priv->s_iowait, ppd->hfi1_wq,
+                       priv->s_sde ?
+                       priv->s_sde->cpu :
+                       cpumask_first(cpumask_of_node(dd->node)));
+}
+
+static void qp_pio_drain(struct rvt_qp *qp)
+{
+       struct hfi1_ibdev *dev;
+       struct hfi1_qp_priv *priv = qp->priv;
+
+       if (!priv->s_sendcontext)
+               return;
+       dev = to_idev(qp->ibqp.device);
+       while (iowait_pio_pending(&priv->s_iowait)) {
+               write_seqlock_irq(&dev->iowait_lock);
+               hfi1_sc_wantpiobuf_intr(priv->s_sendcontext, 1);
+               write_sequnlock_irq(&dev->iowait_lock);
+               iowait_pio_drain(&priv->s_iowait);
+               write_seqlock_irq(&dev->iowait_lock);
+               hfi1_sc_wantpiobuf_intr(priv->s_sendcontext, 0);
+               write_sequnlock_irq(&dev->iowait_lock);
+       }
+}
+
+/**
+ * hfi1_schedule_send - schedule progress
+ * @qp: the QP
+ *
+ * This schedules qp progress; the caller should hold
+ * the s_lock.
+ */
+void hfi1_schedule_send(struct rvt_qp *qp)
+{
+       if (hfi1_send_ok(qp))
+               _hfi1_schedule_send(qp);
+}
+
+/**
+ * hfi1_get_credit - process the credit field of an incoming AETH
+ * @qp: the QP whose credit state is being updated
+ * @aeth: the Acknowledge Extended Transport Header
+ *
+ * The QP s_lock should be held.
+ */
+void hfi1_get_credit(struct rvt_qp *qp, u32 aeth)
+{
+       u32 credit = (aeth >> HFI1_AETH_CREDIT_SHIFT) & HFI1_AETH_CREDIT_MASK;
+
+       /*
+        * If the credit is invalid, we can send
+        * as many packets as we like.  Otherwise, we have to
+        * honor the credit field.
+        */
+       if (credit == HFI1_AETH_CREDIT_INVAL) {
+               if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) {
+                       qp->s_flags |= RVT_S_UNLIMITED_CREDIT;
+                       if (qp->s_flags & RVT_S_WAIT_SSN_CREDIT) {
+                               qp->s_flags &= ~RVT_S_WAIT_SSN_CREDIT;
+                               hfi1_schedule_send(qp);
+                       }
+               }
+       } else if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) {
+               /* Compute new LSN (i.e., MSN + credit) */
+               credit = (aeth + credit_table[credit]) & HFI1_MSN_MASK;
+               if (cmp_msn(credit, qp->s_lsn) > 0) {
+                       qp->s_lsn = credit;
+                       if (qp->s_flags & RVT_S_WAIT_SSN_CREDIT) {
+                               qp->s_flags &= ~RVT_S_WAIT_SSN_CREDIT;
+                               hfi1_schedule_send(qp);
+                       }
+               }
+       }
+}
+
+void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+       if (qp->s_flags & flag) {
+               qp->s_flags &= ~flag;
+               trace_hfi1_qpwakeup(qp, flag);
+               hfi1_schedule_send(qp);
+       }
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+       /* Notify hfi1_destroy_qp() if it is waiting. */
+       if (atomic_dec_and_test(&qp->refcount))
+               wake_up(&qp->wait);
+}
+
+static int iowait_sleep(
+       struct sdma_engine *sde,
+       struct iowait *wait,
+       struct sdma_txreq *stx,
+       unsigned seq)
+{
+       struct verbs_txreq *tx = container_of(stx, struct verbs_txreq, txreq);
+       struct rvt_qp *qp;
+       struct hfi1_qp_priv *priv;
+       unsigned long flags;
+       int ret = 0;
+       struct hfi1_ibdev *dev;
+
+       qp = tx->qp;
+       priv = qp->priv;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+       if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
+               /*
+                * If we couldn't queue the DMA request, save the info
+                * and try again later rather than destroying the
+                * buffer and undoing the side effects of the copy.
+                */
+               /* Make a common routine? */
+               dev = &sde->dd->verbs_dev;
+               list_add_tail(&stx->list, &wait->tx_head);
+               write_seqlock(&dev->iowait_lock);
+               if (sdma_progress(sde, seq, stx))
+                       goto eagain;
+               if (list_empty(&priv->s_iowait.list)) {
+                       struct hfi1_ibport *ibp =
+                               to_iport(qp->ibqp.device, qp->port_num);
+
+                       ibp->rvp.n_dmawait++;
+                       qp->s_flags |= RVT_S_WAIT_DMA_DESC;
+                       list_add_tail(&priv->s_iowait.list, &sde->dmawait);
+                       trace_hfi1_qpsleep(qp, RVT_S_WAIT_DMA_DESC);
+                       atomic_inc(&qp->refcount);
+               }
+               write_sequnlock(&dev->iowait_lock);
+               qp->s_flags &= ~RVT_S_BUSY;
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+               ret = -EBUSY;
+       } else {
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+               hfi1_put_txreq(tx);
+       }
+       return ret;
+eagain:
+       write_sequnlock(&dev->iowait_lock);
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+       list_del_init(&stx->list);
+       return -EAGAIN;
+}
+
+static void iowait_wakeup(struct iowait *wait, int reason)
+{
+       struct rvt_qp *qp = iowait_to_qp(wait);
+
+       WARN_ON(reason != SDMA_AVAIL_REASON);
+       hfi1_qp_wakeup(qp, RVT_S_WAIT_DMA_DESC);
+}
+
+static void iowait_sdma_drained(struct iowait *wait)
+{
+       struct rvt_qp *qp = iowait_to_qp(wait);
+       unsigned long flags;
+
+       /*
+        * This happens when the send engine notes
+        * a QP in the error state and cannot
+        * do the flush work until that QP's
+        * sdma work has finished.
+        */
+       spin_lock_irqsave(&qp->s_lock, flags);
+       if (qp->s_flags & RVT_S_WAIT_DMA) {
+               qp->s_flags &= ~RVT_S_WAIT_DMA;
+               hfi1_schedule_send(qp);
+       }
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+}
+
+/**
+ * qp_to_sdma_engine - map a qp to a send engine
+ * @qp: the QP
+ * @sc5: the 5 bit sc
+ *
+ * Return:
+ * A send engine for the qp or NULL for SMI type qp.
+ */
+struct sdma_engine *qp_to_sdma_engine(struct rvt_qp *qp, u8 sc5)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+       struct sdma_engine *sde;
+
+       if (!(dd->flags & HFI1_HAS_SEND_DMA))
+               return NULL;
+       switch (qp->ibqp.qp_type) {
+       case IB_QPT_SMI:
+               return NULL;
+       default:
+               break;
+       }
+       sde = sdma_select_engine_sc(dd, qp->ibqp.qp_num >> dd->qos_shift, sc5);
+       return sde;
+}
+
+/*
+ * qp_to_send_context - map a qp to a send context
+ * @qp: the QP
+ * @sc5: the 5 bit sc
+ *
+ * Return:
+ * A send context for the qp
+ */
+struct send_context *qp_to_send_context(struct rvt_qp *qp, u8 sc5)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+
+       switch (qp->ibqp.qp_type) {
+       case IB_QPT_SMI:
+               /* SMA packets to VL15 */
+               return dd->vld[15].sc;
+       default:
+               break;
+       }
+
+       return pio_select_send_context_sc(dd, qp->ibqp.qp_num >> dd->qos_shift,
+                                         sc5);
+}
+
+struct qp_iter {
+       struct hfi1_ibdev *dev;
+       struct rvt_qp *qp;
+       int specials;
+       int n;
+};
+
+struct qp_iter *qp_iter_init(struct hfi1_ibdev *dev)
+{
+       struct qp_iter *iter;
+
+       iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+       if (!iter)
+               return NULL;
+
+       iter->dev = dev;
+       iter->specials = dev->rdi.ibdev.phys_port_cnt * 2;
+       if (qp_iter_next(iter)) {
+               kfree(iter);
+               return NULL;
+       }
+
+       return iter;
+}
+
+int qp_iter_next(struct qp_iter *iter)
+{
+       struct hfi1_ibdev *dev = iter->dev;
+       int n = iter->n;
+       int ret = 1;
+       struct rvt_qp *pqp = iter->qp;
+       struct rvt_qp *qp;
+
+       /*
+        * The approach is to consider the special qps
+        * as an additional table entries before the
+        * real hash table.  Since the qp code sets
+        * the qp->next hash link to NULL, this works just fine.
+        *
+        * iter->specials is 2 * # ports
+        *
+        * n = 0..iter->specials is the special qp indices
+        *
+        * n = iter->specials..dev->rdi.qp_dev->qp_table_size+iter->specials are
+        * the potential hash bucket entries
+        *
+        */
+       for (; n <  dev->rdi.qp_dev->qp_table_size + iter->specials; n++) {
+               if (pqp) {
+                       qp = rcu_dereference(pqp->next);
+               } else {
+                       if (n < iter->specials) {
+                               struct hfi1_pportdata *ppd;
+                               struct hfi1_ibport *ibp;
+                               int pidx;
+
+                               pidx = n % dev->rdi.ibdev.phys_port_cnt;
+                               ppd = &dd_from_dev(dev)->pport[pidx];
+                               ibp = &ppd->ibport_data;
+
+                               if (!(n & 1))
+                                       qp = rcu_dereference(ibp->rvp.qp[0]);
+                               else
+                                       qp = rcu_dereference(ibp->rvp.qp[1]);
+                       } else {
+                               qp = rcu_dereference(
+                                       dev->rdi.qp_dev->qp_table[
+                                               (n - iter->specials)]);
+                       }
+               }
+               pqp = qp;
+               if (qp) {
+                       iter->qp = qp;
+                       iter->n = n;
+                       return 0;
+               }
+       }
+       return ret;
+}
+
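A small sketch of the index space the iterator comment above describes, for the single-port case hfi1 uses: the first 2 * ports indices name the special QPs (QP0/QP1) of each port, and everything after that is an index into the QP hash table. The numbers here are purely illustrative.

#include <stdio.h>

int main(void)
{
        const int ports = 1;
        const int specials = 2 * ports;
        const int table_size = 8;       /* illustrative hash table size */
        int n;

        for (n = 0; n < specials + table_size; n++) {
                if (n < specials)
                        printf("n=%2d -> port %d, qp[%d] (special QP)\n",
                               n, n % ports, n & 1);
                else
                        printf("n=%2d -> hash bucket %d\n", n, n - specials);
        }
        return 0;
}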
+static const char * const qp_type_str[] = {
+       "SMI", "GSI", "RC", "UC", "UD",
+};
+
+static int qp_idle(struct rvt_qp *qp)
+{
+       return
+               qp->s_last == qp->s_acked &&
+               qp->s_acked == qp->s_cur &&
+               qp->s_cur == qp->s_tail &&
+               qp->s_tail == qp->s_head;
+}
+
+void qp_iter_print(struct seq_file *s, struct qp_iter *iter)
+{
+       struct rvt_swqe *wqe;
+       struct rvt_qp *qp = iter->qp;
+       struct hfi1_qp_priv *priv = qp->priv;
+       struct sdma_engine *sde;
+       struct send_context *send_context;
+
+       sde = qp_to_sdma_engine(qp, priv->s_sc);
+       wqe = rvt_get_swqe_ptr(qp, qp->s_last);
+       send_context = qp_to_send_context(qp, priv->s_sc);
+       seq_printf(s,
+                  "N %d %s QP %x R %u %s %u %u %u f=%x %u %u %u %u %u %u PSN %x %x %x %x %x (%u %u %u %u %u %u %u) RQP %x LID %x SL %u MTU %u %u %u %u SDE %p,%u SC %p,%u SCQ %u %u PID %d\n",
+                  iter->n,
+                  qp_idle(qp) ? "I" : "B",
+                  qp->ibqp.qp_num,
+                  atomic_read(&qp->refcount),
+                  qp_type_str[qp->ibqp.qp_type],
+                  qp->state,
+                  wqe ? wqe->wr.opcode : 0,
+                  qp->s_hdrwords,
+                  qp->s_flags,
+                  iowait_sdma_pending(&priv->s_iowait),
+                  iowait_pio_pending(&priv->s_iowait),
+                  !list_empty(&priv->s_iowait.list),
+                  qp->timeout,
+                  wqe ? wqe->ssn : 0,
+                  qp->s_lsn,
+                  qp->s_last_psn,
+                  qp->s_psn, qp->s_next_psn,
+                  qp->s_sending_psn, qp->s_sending_hpsn,
+                  qp->s_last, qp->s_acked, qp->s_cur,
+                  qp->s_tail, qp->s_head, qp->s_size,
+                  qp->s_avail,
+                  qp->remote_qpn,
+                  qp->remote_ah_attr.dlid,
+                  qp->remote_ah_attr.sl,
+                  qp->pmtu,
+                  qp->s_retry,
+                  qp->s_retry_cnt,
+                  qp->s_rnr_retry_cnt,
+                  sde,
+                  sde ? sde->this_idx : 0,
+                  send_context,
+                  send_context ? send_context->sw_index : 0,
+                  ibcq_to_rvtcq(qp->ibqp.send_cq)->queue->head,
+                  ibcq_to_rvtcq(qp->ibqp.send_cq)->queue->tail,
+                  qp->pid);
+}
+
+void qp_comm_est(struct rvt_qp *qp)
+{
+       qp->r_flags |= RVT_R_COMM_EST;
+       if (qp->ibqp.event_handler) {
+               struct ib_event ev;
+
+               ev.device = qp->ibqp.device;
+               ev.element.qp = &qp->ibqp;
+               ev.event = IB_EVENT_COMM_EST;
+               qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+       }
+}
+
+void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp,
+                   gfp_t gfp)
+{
+       struct hfi1_qp_priv *priv;
+
+       priv = kzalloc_node(sizeof(*priv), gfp, rdi->dparms.node);
+       if (!priv)
+               return ERR_PTR(-ENOMEM);
+
+       priv->owner = qp;
+
+       priv->s_hdr = kzalloc_node(sizeof(*priv->s_hdr), gfp, rdi->dparms.node);
+       if (!priv->s_hdr) {
+               kfree(priv);
+               return ERR_PTR(-ENOMEM);
+       }
+       setup_timer(&priv->s_rnr_timer, hfi1_rc_rnr_retry, (unsigned long)qp);
+       qp->s_timer.function = hfi1_rc_timeout;
+       return priv;
+}
+
+void qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+
+       kfree(priv->s_hdr);
+       kfree(priv);
+}
+
+unsigned free_all_qps(struct rvt_dev_info *rdi)
+{
+       struct hfi1_ibdev *verbs_dev = container_of(rdi,
+                                                   struct hfi1_ibdev,
+                                                   rdi);
+       struct hfi1_devdata *dd = container_of(verbs_dev,
+                                              struct hfi1_devdata,
+                                              verbs_dev);
+       int n;
+       unsigned qp_inuse = 0;
+
+       for (n = 0; n < dd->num_pports; n++) {
+               struct hfi1_ibport *ibp = &dd->pport[n].ibport_data;
+
+               rcu_read_lock();
+               if (rcu_dereference(ibp->rvp.qp[0]))
+                       qp_inuse++;
+               if (rcu_dereference(ibp->rvp.qp[1]))
+                       qp_inuse++;
+               rcu_read_unlock();
+       }
+
+       return qp_inuse;
+}
+
+void flush_qp_waiters(struct rvt_qp *qp)
+{
+       flush_iowait(qp);
+       hfi1_stop_rc_timers(qp);
+}
+
+void stop_send_queue(struct rvt_qp *qp)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+
+       cancel_work_sync(&priv->s_iowait.iowork);
+       hfi1_del_timers_sync(qp);
+}
+
+void quiesce_qp(struct rvt_qp *qp)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+
+       iowait_sdma_drain(&priv->s_iowait);
+       qp_pio_drain(qp);
+       flush_tx_list(qp);
+}
+
+void notify_qp_reset(struct rvt_qp *qp)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+
+       iowait_init(
+               &priv->s_iowait,
+               1,
+               _hfi1_do_send,
+               iowait_sleep,
+               iowait_wakeup,
+               iowait_sdma_drained);
+       priv->r_adefered = 0;
+       clear_ahg(qp);
+}
+
+/*
+ * Switch to alternate path.
+ * The QP s_lock should be held and interrupts disabled.
+ */
+void hfi1_migrate_qp(struct rvt_qp *qp)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+       struct ib_event ev;
+
+       qp->s_mig_state = IB_MIG_MIGRATED;
+       qp->remote_ah_attr = qp->alt_ah_attr;
+       qp->port_num = qp->alt_ah_attr.port_num;
+       qp->s_pkey_index = qp->s_alt_pkey_index;
+       qp->s_flags |= RVT_S_AHG_CLEAR;
+       priv->s_sc = ah_to_sc(qp->ibqp.device, &qp->remote_ah_attr);
+       priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc);
+
+       ev.device = qp->ibqp.device;
+       ev.element.qp = &qp->ibqp;
+       ev.event = IB_EVENT_PATH_MIG;
+       qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+}
+
+int mtu_to_path_mtu(u32 mtu)
+{
+       return mtu_to_enum(mtu, OPA_MTU_8192);
+}
+
+u32 mtu_from_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, u32 pmtu)
+{
+       u32 mtu;
+       struct hfi1_ibdev *verbs_dev = container_of(rdi,
+                                                   struct hfi1_ibdev,
+                                                   rdi);
+       struct hfi1_devdata *dd = container_of(verbs_dev,
+                                              struct hfi1_devdata,
+                                              verbs_dev);
+       struct hfi1_ibport *ibp;
+       u8 sc, vl;
+
+       ibp = &dd->pport[qp->port_num - 1].ibport_data;
+       sc = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+       vl = sc_to_vlt(dd, sc);
+
+       mtu = verbs_mtu_enum_to_int(qp->ibqp.device, pmtu);
+       if (vl < PER_VL_SEND_CONTEXTS)
+               mtu = min_t(u32, mtu, dd->vld[vl].mtu);
+       return mtu;
+}
+
+int get_pmtu_from_attr(struct rvt_dev_info *rdi, struct rvt_qp *qp,
+                      struct ib_qp_attr *attr)
+{
+       int mtu, pidx = qp->port_num - 1;
+       struct hfi1_ibdev *verbs_dev = container_of(rdi,
+                                                   struct hfi1_ibdev,
+                                                   rdi);
+       struct hfi1_devdata *dd = container_of(verbs_dev,
+                                              struct hfi1_devdata,
+                                              verbs_dev);
+       mtu = verbs_mtu_enum_to_int(qp->ibqp.device, attr->path_mtu);
+       if (mtu == -1)
+               return -1; /* values less than 0 are error */
+
+       if (mtu > dd->pport[pidx].ibmtu)
+               return mtu_to_enum(dd->pport[pidx].ibmtu, IB_MTU_2048);
+       else
+               return attr->path_mtu;
+}
+
+void notify_error_qp(struct rvt_qp *qp)
+{
+       struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+       struct hfi1_qp_priv *priv = qp->priv;
+
+       write_seqlock(&dev->iowait_lock);
+       if (!list_empty(&priv->s_iowait.list) && !(qp->s_flags & RVT_S_BUSY)) {
+               qp->s_flags &= ~RVT_S_ANY_WAIT_IO;
+               list_del_init(&priv->s_iowait.list);
+               if (atomic_dec_and_test(&qp->refcount))
+                       wake_up(&qp->wait);
+       }
+       write_sequnlock(&dev->iowait_lock);
+
+       if (!(qp->s_flags & RVT_S_BUSY)) {
+               qp->s_hdrwords = 0;
+               if (qp->s_rdma_mr) {
+                       rvt_put_mr(qp->s_rdma_mr);
+                       qp->s_rdma_mr = NULL;
+               }
+               flush_tx_list(qp);
+       }
+}
+
+/**
+ * hfi1_error_port_qps - put a port's RC/UC qps into error state
+ * @ibp: the ibport.
+ * @sl: the service level.
+ *
+ * This function places all RC/UC qps with a given service level into error
+ * state. It is generally called to force upper layer apps to abandon stale qps
+ * after an sl->sc mapping change.
+ */
+void hfi1_error_port_qps(struct hfi1_ibport *ibp, u8 sl)
+{
+       struct rvt_qp *qp = NULL;
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       struct hfi1_ibdev *dev = &ppd->dd->verbs_dev;
+       int n;
+       int lastwqe;
+       struct ib_event ev;
+
+       rcu_read_lock();
+
+       /* Deal only with RC/UC qps that use the given SL. */
+       for (n = 0; n < dev->rdi.qp_dev->qp_table_size; n++) {
+               for (qp = rcu_dereference(dev->rdi.qp_dev->qp_table[n]); qp;
+                       qp = rcu_dereference(qp->next)) {
+                       if (qp->port_num == ppd->port &&
+                           (qp->ibqp.qp_type == IB_QPT_UC ||
+                            qp->ibqp.qp_type == IB_QPT_RC) &&
+                           qp->remote_ah_attr.sl == sl &&
+                           (ib_rvt_state_ops[qp->state] &
+                            RVT_POST_SEND_OK)) {
+                               spin_lock_irq(&qp->r_lock);
+                               spin_lock(&qp->s_hlock);
+                               spin_lock(&qp->s_lock);
+                               lastwqe = rvt_error_qp(qp,
+                                                      IB_WC_WR_FLUSH_ERR);
+                               spin_unlock(&qp->s_lock);
+                               spin_unlock(&qp->s_hlock);
+                               spin_unlock_irq(&qp->r_lock);
+                               if (lastwqe) {
+                                       ev.device = qp->ibqp.device;
+                                       ev.element.qp = &qp->ibqp;
+                                       ev.event =
+                                               IB_EVENT_QP_LAST_WQE_REACHED;
+                                       qp->ibqp.event_handler(&ev,
+                                               qp->ibqp.qp_context);
+                               }
+                       }
+               }
+       }
+
+       rcu_read_unlock();
+}
diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h
new file mode 100644 (file)
index 0000000..e7bc8d6
--- /dev/null
@@ -0,0 +1,160 @@
+#ifndef _QP_H
+#define _QP_H
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/hash.h>
+#include <rdma/rdmavt_qp.h>
+#include "verbs.h"
+#include "sdma.h"
+
+extern unsigned int hfi1_qp_table_size;
+
+/*
+ * clear_ahg - clear the AHG state of a QP
+ */
+static inline void clear_ahg(struct rvt_qp *qp)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+
+       priv->s_hdr->ahgcount = 0;
+       qp->s_flags &= ~(RVT_S_AHG_VALID | RVT_S_AHG_CLEAR);
+       if (priv->s_sde && qp->s_ahgidx >= 0)
+               sdma_ahg_free(priv->s_sde, qp->s_ahgidx);
+       qp->s_ahgidx = -1;
+}
+
+/**
+ * hfi1_compute_aeth - compute the AETH (syndrome + MSN)
+ * @qp: the queue pair to compute the AETH for
+ *
+ * Returns the AETH.
+ */
+__be32 hfi1_compute_aeth(struct rvt_qp *qp);
+
+/**
+ * hfi1_create_qp - create a queue pair for a device
+ * @ibpd: the protection domain whose device we create the queue pair for
+ * @init_attr: the attributes of the queue pair
+ * @udata: user data for libibverbs.so
+ *
+ * Returns the queue pair on success, otherwise returns an errno.
+ *
+ * Called by the ib_create_qp() core verbs function.
+ */
+struct ib_qp *hfi1_create_qp(struct ib_pd *ibpd,
+                            struct ib_qp_init_attr *init_attr,
+                            struct ib_udata *udata);
+/**
+ * hfi1_get_credit - handle the send credit carried in an AETH
+ * @qp: the QP whose send credit state to update
+ * @aeth: the Acknowledge Extended Transport Header
+ *
+ * The QP s_lock should be held.
+ */
+void hfi1_get_credit(struct rvt_qp *qp, u32 aeth);
+
+/**
+ * hfi1_qp_wakeup - wake up on the indicated event
+ * @qp: the QP
+ * @flag: the flag the QP was stalled on
+ */
+void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag);
+
+struct sdma_engine *qp_to_sdma_engine(struct rvt_qp *qp, u8 sc5);
+struct send_context *qp_to_send_context(struct rvt_qp *qp, u8 sc5);
+
+struct qp_iter;
+
+/**
+ * qp_iter_init - initialize the iterator for the qp hash list
+ * @dev: the hfi1_ibdev
+ */
+struct qp_iter *qp_iter_init(struct hfi1_ibdev *dev);
+
+/**
+ * qp_iter_next - Find the next qp in the hash list
+ * @iter: the iterator for the qp hash list
+ */
+int qp_iter_next(struct qp_iter *iter);
+
+/**
+ * qp_iter_print - print the qp information to seq_file
+ * @s: the seq_file to emit the qp information on
+ * @iter: the iterator for the qp hash list
+ */
+void qp_iter_print(struct seq_file *s, struct qp_iter *iter);
+
+/**
+ * qp_comm_est - handle trap with QP established
+ * @qp: the QP
+ */
+void qp_comm_est(struct rvt_qp *qp);
+
+void _hfi1_schedule_send(struct rvt_qp *qp);
+void hfi1_schedule_send(struct rvt_qp *qp);
+
+void hfi1_migrate_qp(struct rvt_qp *qp);
+
+/*
+ * Functions provided by hfi1 driver for rdmavt to use
+ */
+void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp,
+                   gfp_t gfp);
+void qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp);
+unsigned free_all_qps(struct rvt_dev_info *rdi);
+void notify_qp_reset(struct rvt_qp *qp);
+int get_pmtu_from_attr(struct rvt_dev_info *rdi, struct rvt_qp *qp,
+                      struct ib_qp_attr *attr);
+void flush_qp_waiters(struct rvt_qp *qp);
+void notify_error_qp(struct rvt_qp *qp);
+void stop_send_queue(struct rvt_qp *qp);
+void quiesce_qp(struct rvt_qp *qp);
+u32 mtu_from_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, u32 pmtu);
+int mtu_to_path_mtu(u32 mtu);
+void hfi1_error_port_qps(struct hfi1_ibport *ibp, u8 sl);
+#endif /* _QP_H */
diff --git a/drivers/infiniband/hw/hfi1/qsfp.c b/drivers/infiniband/hw/hfi1/qsfp.c
new file mode 100644 (file)
index 0000000..2441669
--- /dev/null
@@ -0,0 +1,632 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+
+#include "hfi.h"
+#include "twsi.h"
+
+/*
+ * QSFP support for hfi driver, using "Two Wire Serial Interface" driver
+ * in twsi.c
+ */
+#define I2C_MAX_RETRY 4
+
+/*
+ * Raw i2c write.  No set-up or lock checking.
+ */
+static int __i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
+                      int offset, void *bp, int len)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       int ret, cnt;
+       u8 *buff = bp;
+
+       cnt = 0;
+       while (cnt < len) {
+               int wlen = len - cnt;
+
+               ret = hfi1_twsi_blk_wr(dd, target, i2c_addr, offset,
+                                      buff + cnt, wlen);
+               if (ret) {
+                       /* hfi1_twsi_blk_wr() returns 1 on error, else 0 */
+                       return -EIO;
+               }
+               offset += wlen;
+               cnt += wlen;
+       }
+
+       /* Must wait min 20us between qsfp i2c transactions */
+       udelay(20);
+
+       return cnt;
+}
+
+/*
+ * Caller must hold the i2c chain resource.
+ */
+int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
+             void *bp, int len)
+{
+       int ret;
+
+       if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
+               return -EACCES;
+
+       /* make sure the TWSI bus is in a sane state */
+       ret = hfi1_twsi_reset(ppd->dd, target);
+       if (ret) {
+               hfi1_dev_porterr(ppd->dd, ppd->port,
+                                "I2C chain %d write interface reset failed\n",
+                                target);
+               return ret;
+       }
+
+       return __i2c_write(ppd, target, i2c_addr, offset, bp, len);
+}
+
+/*
+ * Raw i2c read.  No set-up or lock checking.
+ */
+static int __i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
+                     int offset, void *bp, int len)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       int ret, cnt, pass = 0;
+       int orig_offset = offset;
+
+       cnt = 0;
+       while (cnt < len) {
+               int rlen = len - cnt;
+
+               ret = hfi1_twsi_blk_rd(dd, target, i2c_addr, offset,
+                                      bp + cnt, rlen);
+               /* Some QSFPs fail the first attempt; retry the initial read */
+               if (ret && cnt == 0 && ++pass < I2C_MAX_RETRY)
+                       continue;
+               if (ret) {
+                       /* hfi1_twsi_blk_rd() returns 1 on error, else 0 */
+                       ret = -EIO;
+                       goto exit;
+               }
+               offset += rlen;
+               cnt += rlen;
+       }
+
+       ret = cnt;
+
+exit:
+       if (ret < 0) {
+               hfi1_dev_porterr(dd, ppd->port,
+                                "I2C chain %d read failed, addr 0x%x, offset 0x%x, len %d\n",
+                                target, i2c_addr, orig_offset, len);
+       }
+
+       /* Must wait min 20us between qsfp i2c transactions */
+       udelay(20);
+
+       return ret;
+}
+
+/*
+ * Caller must hold the i2c chain resource.
+ */
+int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
+            void *bp, int len)
+{
+       int ret;
+
+       if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
+               return -EACCES;
+
+       /* make sure the TWSI bus is in a sane state */
+       ret = hfi1_twsi_reset(ppd->dd, target);
+       if (ret) {
+               hfi1_dev_porterr(ppd->dd, ppd->port,
+                                "I2C chain %d read interface reset failed\n",
+                                target);
+               return ret;
+       }
+
+       return __i2c_read(ppd, target, i2c_addr, offset, bp, len);
+}
+
+/*
+ * Write page n, offset m of QSFP memory as defined by SFF 8636
+ * by writing @addr = ((256 * n) + m)
+ *
+ * Caller must hold the i2c chain resource.
+ */
+int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+              int len)
+{
+       int count = 0;
+       int offset;
+       int nwrite;
+       int ret;
+       u8 page;
+
+       if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
+               return -EACCES;
+
+       /* make sure the TWSI bus is in a sane state */
+       ret = hfi1_twsi_reset(ppd->dd, target);
+       if (ret) {
+               hfi1_dev_porterr(ppd->dd, ppd->port,
+                                "QSFP chain %d write interface reset failed\n",
+                                target);
+               return ret;
+       }
+
+       while (count < len) {
+               /*
+                * Set the qsfp page based on a zero-based address
+                * and a page size of QSFP_PAGESIZE bytes.
+                */
+               page = (u8)(addr / QSFP_PAGESIZE);
+
+               ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE,
+                                 QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1);
+               if (ret != 1) {
+                       hfi1_dev_porterr(ppd->dd, ppd->port,
+                                        "QSFP chain %d can't write QSFP_PAGE_SELECT_BYTE: %d\n",
+                                        target, ret);
+                       ret = -EIO;
+                       break;
+               }
+
+               offset = addr % QSFP_PAGESIZE;
+               nwrite = len - count;
+               /* truncate write to boundary if crossing boundary */
+               if (((addr % QSFP_RW_BOUNDARY) + nwrite) > QSFP_RW_BOUNDARY)
+                       nwrite = QSFP_RW_BOUNDARY - (addr % QSFP_RW_BOUNDARY);
+
+               ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE,
+                                 offset, bp + count, nwrite);
+               if (ret <= 0)   /* stop on error or nothing written */
+                       break;
+
+               count += ret;
+               addr += ret;
+       }
+
+       if (ret < 0)
+               return ret;
+       return count;
+}
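+/*
+ * To illustrate the boundary handling above (numbers hypothetical): a
+ * 16-byte write starting at addr 120 is split into two transactions, 8
+ * bytes at page 00h offset 120 and 8 bytes at offset 128, since a single
+ * transfer may not cross a QSFP_RW_BOUNDARY (128-byte) boundary.
+ */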
+
+/*
+ * Perform a stand-alone single QSFP write.  Acquire the resource, do the
+ * write, then release the resource.
+ */
+int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+                  int len)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u32 resource = qsfp_resource(dd);
+       int ret;
+
+       ret = acquire_chip_resource(dd, resource, QSFP_WAIT);
+       if (ret)
+               return ret;
+       ret = qsfp_write(ppd, target, addr, bp, len);
+       release_chip_resource(dd, resource);
+
+       return ret;
+}
+
+/*
+ * Access page n, offset m of QSFP memory as defined by SFF 8636
+ * by reading @addr = ((256 * n) + m)
+ *
+ * Caller must hold the i2c chain resource.
+ */
+int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+             int len)
+{
+       int count = 0;
+       int offset;
+       int nread;
+       int ret;
+       u8 page;
+
+       if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
+               return -EACCES;
+
+       /* make sure the TWSI bus is in a sane state */
+       ret = hfi1_twsi_reset(ppd->dd, target);
+       if (ret) {
+               hfi1_dev_porterr(ppd->dd, ppd->port,
+                                "QSFP chain %d read interface reset failed\n",
+                                target);
+               return ret;
+       }
+
+       while (count < len) {
+               /*
+                * Set the qsfp page based on a zero-based address
+                * and a page size of QSFP_PAGESIZE bytes.
+                */
+               page = (u8)(addr / QSFP_PAGESIZE);
+               ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE,
+                                 QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1);
+               if (ret != 1) {
+                       hfi1_dev_porterr(ppd->dd, ppd->port,
+                                        "QSFP chain %d can't write QSFP_PAGE_SELECT_BYTE: %d\n",
+                                        target, ret);
+                       ret = -EIO;
+                       break;
+               }
+
+               offset = addr % QSFP_PAGESIZE;
+               nread = len - count;
+               /* truncate read to boundary if crossing boundary */
+               if (((addr % QSFP_RW_BOUNDARY) + nread) > QSFP_RW_BOUNDARY)
+                       nread = QSFP_RW_BOUNDARY - (addr % QSFP_RW_BOUNDARY);
+
+               /* QSFPs require a 5-10msec delay after write operations */
+               mdelay(5);
+               ret = __i2c_read(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE,
+                                offset, bp + count, nread);
+               if (ret <= 0)   /* stop on error or nothing read */
+                       break;
+
+               count += ret;
+               addr += ret;
+       }
+
+       if (ret < 0)
+               return ret;
+       return count;
+}
+
+/*
+ * Perform a stand-alone single QSFP read.  Acquire the resource, do the
+ * read, then release the resource.
+ */
+int one_qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+                 int len)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u32 resource = qsfp_resource(dd);
+       int ret;
+
+       ret = acquire_chip_resource(dd, resource, QSFP_WAIT);
+       if (ret)
+               return ret;
+       ret = qsfp_read(ppd, target, addr, bp, len);
+       release_chip_resource(dd, resource);
+
+       return ret;
+}
+
+/*
+ * This function caches the QSFP memory range in 128 byte chunks.
+ * As an example, the next byte after address 255 is byte 128 from
+ * upper page 01H (if present) rather than byte 0 from lower page 00H.
+ * Access page n, offset m of QSFP memory as defined by SFF 8636
+ * in the cache by reading byte ((128 * n) + m).
+ * The calls to qsfp_{read,write} in this function correctly handle the
+ * address map difference between this mapping and the mapping implemented
+ * by those functions.
+ *
+ * The caller must be holding the QSFP i2c chain resource.
+ */
+int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp)
+{
+       u32 target = ppd->dd->hfi1_id;
+       int ret;
+       unsigned long flags;
+       u8 *cache = &cp->cache[0];
+
+       /* ensure sane contents on invalid reads, for cable swaps */
+       memset(cache, 0, (QSFP_MAX_NUM_PAGES * 128));
+       spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+       ppd->qsfp_info.cache_valid = 0;
+       spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags);
+
+       if (!qsfp_mod_present(ppd)) {
+               ret = -ENODEV;
+               goto bail;
+       }
+
+       ret = qsfp_read(ppd, target, 0, cache, QSFP_PAGESIZE);
+       if (ret != QSFP_PAGESIZE) {
+               dd_dev_info(ppd->dd,
+                           "%s: Page 0 read failed, expected %d, got %d\n",
+                           __func__, QSFP_PAGESIZE, ret);
+               goto bail;
+       }
+
+       /* Is paging enabled? (byte 2 bit 2 set would mean flat, unpaged memory) */
+       if (!(cache[2] & 4)) {
+               /* Paging enabled, page 03 required */
+               if ((cache[195] & 0xC0) == 0xC0) {
+                       /* all */
+                       ret = qsfp_read(ppd, target, 384, cache + 256, 128);
+                       if (ret <= 0 || ret != 128) {
+                               dd_dev_info(ppd->dd, "%s failed\n", __func__);
+                               goto bail;
+                       }
+                       ret = qsfp_read(ppd, target, 640, cache + 384, 128);
+                       if (ret <= 0 || ret != 128) {
+                               dd_dev_info(ppd->dd, "%s failed\n", __func__);
+                               goto bail;
+                       }
+                       ret = qsfp_read(ppd, target, 896, cache + 512, 128);
+                       if (ret <= 0 || ret != 128) {
+                               dd_dev_info(ppd->dd, "%s failed\n", __func__);
+                               goto bail;
+                       }
+               } else if ((cache[195] & 0x80) == 0x80) {
+                       /* only page 2 and 3 */
+                       ret = qsfp_read(ppd, target, 640, cache + 384, 128);
+                       if (ret <= 0 || ret != 128) {
+                               dd_dev_info(ppd->dd, "%s failed\n", __func__);
+                               goto bail;
+                       }
+                       ret = qsfp_read(ppd, target, 896, cache + 512, 128);
+                       if (ret <= 0 || ret != 128) {
+                               dd_dev_info(ppd->dd, "%s failed\n", __func__);
+                               goto bail;
+                       }
+               } else if ((cache[195] & 0x40) == 0x40) {
+                       /* only page 1 and 3 */
+                       ret = qsfp_read(ppd, target, 384, cache + 256, 128);
+                       if (ret <= 0 || ret != 128) {
+                               dd_dev_info(ppd->dd, "%s failed\n", __func__);
+                               goto bail;
+                       }
+                       ret = qsfp_read(ppd, target, 896, cache + 512, 128);
+                       if (ret <= 0 || ret != 128) {
+                               dd_dev_info(ppd->dd, "%s failed\n", __func__);
+                               goto bail;
+                       }
+               } else {
+                       /* only page 3 */
+                       ret = qsfp_read(ppd, target, 896, cache + 512, 128);
+                       if (ret <= 0 || ret != 128) {
+                               dd_dev_info(ppd->dd, "%s failed\n", __func__);
+                               goto bail;
+                       }
+               }
+       }
+
+       spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+       ppd->qsfp_info.cache_valid = 1;
+       ppd->qsfp_info.cache_refresh_required = 0;
+       spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags);
+
+       return 0;
+
+bail:
+       memset(cache, 0, (QSFP_MAX_NUM_PAGES * 128));
+       return ret;
+}
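+/*
+ * For example (illustrative module behaviour): a paged module reporting
+ * 0x80 in byte 195 has the code above cache upper pages 02h and 03h from
+ * SFF-8636 read addresses 640 and 896 into cache offsets 384 and 512,
+ * while the page 01h region of the cache (offset 256) stays zeroed from
+ * the initial memset.
+ */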
+
+const char * const hfi1_qsfp_devtech[16] = {
+       "850nm VCSEL", "1310nm VCSEL", "1550nm VCSEL", "1310nm FP",
+       "1310nm DFB", "1550nm DFB", "1310nm EML", "1550nm EML",
+       "Cu Misc", "1490nm DFB", "Cu NoEq", "Cu Eq",
+       "Undef", "Cu Active BothEq", "Cu FarEq", "Cu NearEq"
+};
+
+#define QSFP_DUMP_CHUNK 16 /* Holds longest string */
+#define QSFP_DEFAULT_HDR_CNT 224
+
+#define QSFP_PWR(pbyte) (((pbyte) >> 6) & 3)
+#define QSFP_HIGH_PWR(pbyte) ((pbyte) & 3)
+/* For use with QSFP_HIGH_PWR macro */
+#define QSFP_HIGH_PWR_UNUSED   0 /* Bits [1:0] = 00 implies low power module */
+
+/*
+ * Takes power class byte [Page 00 Byte 129] in SFF 8636
+ * Returns power class as integer (1 through 7, per SFF 8636 rev 2.4)
+ */
+int get_qsfp_power_class(u8 power_byte)
+{
+       if (QSFP_HIGH_PWR(power_byte) == QSFP_HIGH_PWR_UNUSED)
+               /* power classes count from 1, their bit encodings from 0 */
+               return (QSFP_PWR(power_byte) + 1);
+       /*
+        * A high-power field of 00 means unused; otherwise its encodings
+        * 1..3 correspond to classes 5..7, so add 4 here to bridge the
+        * difference between the low and high power groups
+        */
+       return (QSFP_HIGH_PWR(power_byte) + 4);
+}
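+/*
+ * Example encodings (values illustrative): a byte 129 of 0x80 has bits
+ * [1:0] == 0, so the class comes from bits [7:6] == 2 and the function
+ * returns 3 (2.5W); a byte with bits [1:0] == 2 returns 2 + 4 = 6 (4.5W).
+ */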
+
+int qsfp_mod_present(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 reg;
+
+       reg = read_csr(dd, dd->hfi1_id ? ASIC_QSFP2_IN : ASIC_QSFP1_IN);
+       return !(reg & QSFP_HFI0_MODPRST_N);
+}
+
+/*
+ * This function maps QSFP memory addresses in 128 byte chunks in the following
+ * fashion per the CableInfo SMA query definition in the IBA 1.3 spec/OPA Gen 1
+ * spec
+ * For addr 000-127, lower page 00h
+ * For addr 128-255, upper page 00h
+ * For addr 256-383, upper page 01h
+ * For addr 384-511, upper page 02h
+ * For addr 512-639, upper page 03h
+ *
+ * For addresses beyond this range, the out-of-range portion of the data
+ * buffer is set to 0.
+ * For optional upper pages that are not present, the corresponding bytes in
+ * the data buffer are likewise set to 0.
+ */
+int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr, u32 len,
+                  u8 *data)
+{
+       struct hfi1_pportdata *ppd;
+       u32 excess_len = 0;
+       int ret = 0;
+
+       if (port_num > dd->num_pports || port_num < 1) {
+               dd_dev_info(dd, "%s: Invalid port number %d\n",
+                           __func__, port_num);
+               ret = -EINVAL;
+               goto set_zeroes;
+       }
+
+       ppd = dd->pport + (port_num - 1);
+       if (!qsfp_mod_present(ppd)) {
+               ret = -ENODEV;
+               goto set_zeroes;
+       }
+
+       if (!ppd->qsfp_info.cache_valid) {
+               ret = -EINVAL;
+               goto set_zeroes;
+       }
+
+       if (addr >= (QSFP_MAX_NUM_PAGES * 128)) {
+               ret = -ERANGE;
+               goto set_zeroes;
+       }
+
+       if ((addr + len) > (QSFP_MAX_NUM_PAGES * 128)) {
+               excess_len = (addr + len) - (QSFP_MAX_NUM_PAGES * 128);
+               memcpy(data, &ppd->qsfp_info.cache[addr], (len - excess_len));
+               data += (len - excess_len);
+               goto set_zeroes;
+       }
+
+       memcpy(data, &ppd->qsfp_info.cache[addr], len);
+       return 0;
+
+set_zeroes:
+       memset(data, 0, excess_len);
+       return ret;
+}
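+/*
+ * A worked example of the mapping above (addresses illustrative, assuming
+ * upper page 01h is present): a CableInfo query for addr 300 falls in upper
+ * page 01h; the byte returned is cache[300], which refresh_qsfp_cache()
+ * filled from SFF-8636 read address 384 + (300 - 256) = 428, i.e. page 01h,
+ * offset 172.
+ */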
+
+static const char *pwr_codes[8] = {"N/AW",
+                                 "1.5W",
+                                 "2.0W",
+                                 "2.5W",
+                                 "3.5W",
+                                 "4.0W",
+                                 "4.5W",
+                                 "5.0W"
+                                };
+
+int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len)
+{
+       u8 *cache = &ppd->qsfp_info.cache[0];
+       u8 bin_buff[QSFP_DUMP_CHUNK];
+       char lenstr[6];
+       int sofar;
+       int bidx = 0;
+       u8 *atten = &cache[QSFP_ATTEN_OFFS];
+       u8 *vendor_oui = &cache[QSFP_VOUI_OFFS];
+       u8 power_byte = 0;
+
+       sofar = 0;
+       lenstr[0] = ' ';
+       lenstr[1] = '\0';
+
+       if (ppd->qsfp_info.cache_valid) {
+               if (QSFP_IS_CU(cache[QSFP_MOD_TECH_OFFS]))
+                       sprintf(lenstr, "%dM ", cache[QSFP_MOD_LEN_OFFS]);
+
+               power_byte = cache[QSFP_MOD_PWR_OFFS];
+               sofar += scnprintf(buf + sofar, len - sofar, "PWR:%.3sW\n",
+                               pwr_codes[get_qsfp_power_class(power_byte)]);
+
+               sofar += scnprintf(buf + sofar, len - sofar, "TECH:%s%s\n",
+                               lenstr,
+                       hfi1_qsfp_devtech[(cache[QSFP_MOD_TECH_OFFS]) >> 4]);
+
+               sofar += scnprintf(buf + sofar, len - sofar, "Vendor:%.*s\n",
+                                  QSFP_VEND_LEN, &cache[QSFP_VEND_OFFS]);
+
+               sofar += scnprintf(buf + sofar, len - sofar, "OUI:%06X\n",
+                                  QSFP_OUI(vendor_oui));
+
+               sofar += scnprintf(buf + sofar, len - sofar, "Part#:%.*s\n",
+                                  QSFP_PN_LEN, &cache[QSFP_PN_OFFS]);
+
+               sofar += scnprintf(buf + sofar, len - sofar, "Rev:%.*s\n",
+                                  QSFP_REV_LEN, &cache[QSFP_REV_OFFS]);
+
+               if (QSFP_IS_CU(cache[QSFP_MOD_TECH_OFFS]))
+                       sofar += scnprintf(buf + sofar, len - sofar,
+                               "Atten:%d, %d\n",
+                               QSFP_ATTEN_SDR(atten),
+                               QSFP_ATTEN_DDR(atten));
+
+               sofar += scnprintf(buf + sofar, len - sofar, "Serial:%.*s\n",
+                                  QSFP_SN_LEN, &cache[QSFP_SN_OFFS]);
+
+               sofar += scnprintf(buf + sofar, len - sofar, "Date:%.*s\n",
+                                  QSFP_DATE_LEN, &cache[QSFP_DATE_OFFS]);
+
+               sofar += scnprintf(buf + sofar, len - sofar, "Lot:%.*s\n",
+                                  QSFP_LOT_LEN, &cache[QSFP_LOT_OFFS]);
+
+               while (bidx < QSFP_DEFAULT_HDR_CNT) {
+                       int iidx;
+
+                       memcpy(bin_buff, &cache[bidx], QSFP_DUMP_CHUNK);
+                       for (iidx = 0; iidx < QSFP_DUMP_CHUNK; ++iidx) {
+                               sofar += scnprintf(buf + sofar, len - sofar,
+                                       " %02X", bin_buff[iidx]);
+                       }
+                       sofar += scnprintf(buf + sofar, len - sofar, "\n");
+                       bidx += QSFP_DUMP_CHUNK;
+               }
+       }
+       return sofar;
+}
diff --git a/drivers/infiniband/hw/hfi1/qsfp.h b/drivers/infiniband/hw/hfi1/qsfp.h
new file mode 100644 (file)
index 0000000..dadc66c
--- /dev/null
@@ -0,0 +1,240 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+/* QSFP support common definitions, for hfi driver */
+
+#define QSFP_DEV 0xA0
+#define QSFP_PWR_LAG_MSEC 2000
+#define QSFP_MODPRS_LAG_MSEC 20
+/* 128 byte pages, per SFF 8636 rev 2.4 */
+#define QSFP_MAX_NUM_PAGES     5
+
+/*
+ * Below are masks for QSFP pins.  Pins are the same for HFI0 and HFI1.
+ * _N means asserted low
+ */
+#define QSFP_HFI0_I2CCLK    BIT(0)
+#define QSFP_HFI0_I2CDAT    BIT(1)
+#define QSFP_HFI0_RESET_N   BIT(2)
+#define QSFP_HFI0_INT_N            BIT(3)
+#define QSFP_HFI0_MODPRST_N BIT(4)
+
+/* QSFP is paged at 256 bytes */
+#define QSFP_PAGESIZE 256
+/* Reads/writes cannot cross 128 byte boundaries */
+#define QSFP_RW_BOUNDARY 128
+
+/* number of bytes in i2c offset for QSFP devices */
+#define __QSFP_OFFSET_SIZE 1                           /* num address bytes */
+#define QSFP_OFFSET_SIZE (__QSFP_OFFSET_SIZE << 8)     /* shifted value */
+
+/* Defined fields that Intel requires of qualified cables */
+/* Byte 0 is Identifier, not checked */
+/* Byte 1 is reserved "status MSB" */
+#define QSFP_TX_CTRL_BYTE_OFFS 86
+#define QSFP_PWR_CTRL_BYTE_OFFS 93
+#define QSFP_CDR_CTRL_BYTE_OFFS 98
+
+#define QSFP_PAGE_SELECT_BYTE_OFFS 127
+/* Byte 128 is Identifier: must be 0x0c for QSFP, or 0x0d for QSFP+ */
+#define QSFP_MOD_ID_OFFS 128
+/*
+ * Byte 129 is "Extended Identifier".
+ * For bits [7:6]: 0:1.5W, 1:2.0W, 2:2.5W, 3:3.5W
+ * For bits [1:0]: 0:Unused, 1:4W, 2:4.5W, 3:5W
+ */
+#define QSFP_MOD_PWR_OFFS 129
+/* Byte 130 is Connector type. Not Intel req'd */
+/* Bytes 131..138 are Transceiver types, bit maps for various tech, none IB */
+/* Byte 139 is encoding. code 0x01 is 8b10b. Not Intel req'd */
+/* byte 140 is nominal bit-rate, in units of 100Mbits/sec */
+#define QSFP_NOM_BIT_RATE_100_OFFS 140
+/* Byte 141 is Extended Rate Select. Not Intel req'd */
+/* Bytes 142..145 are lengths for various fiber types. Not Intel req'd */
+/* Byte 146 is length for Copper. Units of 1 meter */
+#define QSFP_MOD_LEN_OFFS 146
+/*
+ * Byte 147 is Device technology. D0..3 not Intel req'd
+ * D4..7 select from 15 choices, translated by table:
+ */
+#define QSFP_MOD_TECH_OFFS 147
+extern const char *const hfi1_qsfp_devtech[16];
+/* Active Equalization includes fiber, copper full EQ, and copper near Eq */
+#define QSFP_IS_ACTIVE(tech) ((0xA2FF >> ((tech) >> 4)) & 1)
+/* Active Equalization includes fiber, copper full EQ, and copper far Eq */
+#define QSFP_IS_ACTIVE_FAR(tech) ((0x32FF >> ((tech) >> 4)) & 1)
+/* Attenuation should be valid for copper other than full/near Eq */
+#define QSFP_HAS_ATTEN(tech) ((0x4D00 >> ((tech) >> 4)) & 1)
+/* Length is only valid if technology is "copper" */
+#define QSFP_IS_CU(tech) ((0xED00 >> ((tech) >> 4)) & 1)
+#define QSFP_TECH_1490 9
+
+#define QSFP_OUI(oui) (((unsigned)oui[0] << 16) | ((unsigned)oui[1] << 8) | \
+                       oui[2])
+#define QSFP_OUI_AMPHENOL 0x415048
+#define QSFP_OUI_FINISAR  0x009065
+#define QSFP_OUI_GORE     0x002177
+
+/* Bytes 148..163 are Vendor Name, Left-justified Blank-filled */
+#define QSFP_VEND_OFFS 148
+#define QSFP_VEND_LEN 16
+/* Byte 164 is IB Extended transceiver codes Bits D0..3 are SDR,DDR,QDR,EDR */
+#define QSFP_IBXCV_OFFS 164
+/* Bytes 165..167 are Vendor OUI number */
+#define QSFP_VOUI_OFFS 165
+#define QSFP_VOUI_LEN 3
+/* Bytes 168..183 are Vendor Part Number, string */
+#define QSFP_PN_OFFS 168
+#define QSFP_PN_LEN 16
+/* Bytes 184,185 are Vendor Rev. Left Justified, Blank-filled */
+#define QSFP_REV_OFFS 184
+#define QSFP_REV_LEN 2
+/*
+ * Bytes 186,187 are Wavelength, if Optical. Not Intel req'd
+ *  If copper, they are attenuation in dB:
+ * Byte 186 is at 2.5Gb/sec (SDR), Byte 187 at 5.0Gb/sec (DDR)
+ */
+#define QSFP_ATTEN_OFFS 186
+#define QSFP_ATTEN_LEN 2
+/*
+ * Bytes 188,189 are Wavelength tolerance, if optical
+ * If copper, they are attenuation in dB:
+ * Byte 188 is at 12.5 Gb/s, Byte 189 at 25 Gb/s
+ */
+#define QSFP_CU_ATTEN_7G_OFFS 188
+#define QSFP_CU_ATTEN_12G_OFFS 189
+/* Byte 190 is Max Case Temp. Not Intel req'd */
+/* Byte 191 is LSB of sum of bytes 128..190. Not Intel req'd */
+#define QSFP_CC_OFFS 191
+#define QSFP_EQ_INFO_OFFS 193
+#define QSFP_CDR_INFO_OFFS 194
+/* Bytes 196..211 are Serial Number, String */
+#define QSFP_SN_OFFS 196
+#define QSFP_SN_LEN 16
+/* Bytes 212..219 are date-code YYMMDD (MM==1 for Jan) */
+#define QSFP_DATE_OFFS 212
+#define QSFP_DATE_LEN 6
+/* Bytes 218,219 are optional lot-code, string */
+#define QSFP_LOT_OFFS 218
+#define QSFP_LOT_LEN 2
+/* Bytes 220, 221 indicate monitoring options, Not Intel req'd */
+/* Byte 222 indicates nominal bitrate in units of 250Mbits/sec */
+#define QSFP_NOM_BIT_RATE_250_OFFS 222
+/* Byte 223 is LSB of sum of bytes 192..222 */
+#define QSFP_CC_EXT_OFFS 223
+
+/*
+ * Interrupt flag masks
+ */
+#define QSFP_DATA_NOT_READY            0x01
+
+#define QSFP_HIGH_TEMP_ALARM           0x80
+#define QSFP_LOW_TEMP_ALARM            0x40
+#define QSFP_HIGH_TEMP_WARNING         0x20
+#define QSFP_LOW_TEMP_WARNING          0x10
+
+#define QSFP_HIGH_VCC_ALARM            0x80
+#define QSFP_LOW_VCC_ALARM             0x40
+#define QSFP_HIGH_VCC_WARNING          0x20
+#define QSFP_LOW_VCC_WARNING           0x10
+
+#define QSFP_HIGH_POWER_ALARM          0x88
+#define QSFP_LOW_POWER_ALARM           0x44
+#define QSFP_HIGH_POWER_WARNING                0x22
+#define QSFP_LOW_POWER_WARNING         0x11
+
+#define QSFP_HIGH_BIAS_ALARM           0x88
+#define QSFP_LOW_BIAS_ALARM            0x44
+#define QSFP_HIGH_BIAS_WARNING         0x22
+#define QSFP_LOW_BIAS_WARNING          0x11
+
+#define QSFP_ATTEN_SDR(attenarray) (attenarray[0])
+#define QSFP_ATTEN_DDR(attenarray) (attenarray[1])
+
+/*
+ * struct qsfp_data encapsulates state of QSFP device for one port.
+ * It will be part of port-specific data if a board supports QSFP.
+ *
+ * Since multiple board-types use QSFP, and their pport_data structs
+ * differ (in the chip-specific section), we need a pointer to its head.
+ *
+ * Avoiding premature optimization, we will have one work_struct per port,
+ * and let the qsfp_lock arbitrate access to common resources.
+ *
+ */
+struct qsfp_data {
+       /* Helps to find our way */
+       struct hfi1_pportdata *ppd;
+       struct work_struct qsfp_work;
+       u8 cache[QSFP_MAX_NUM_PAGES * 128];
+       /* protect qsfp data */
+       spinlock_t qsfp_lock;
+       u8 check_interrupt_flags;
+       u8 reset_needed;
+       u8 limiting_active;
+       u8 cache_valid;
+       u8 cache_refresh_required;
+};
+
+int refresh_qsfp_cache(struct hfi1_pportdata *ppd,
+                      struct qsfp_data *cp);
+int get_qsfp_power_class(u8 power_byte);
+int qsfp_mod_present(struct hfi1_pportdata *ppd);
+int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr,
+                  u32 len, u8 *data);
+
+int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
+             int offset, void *bp, int len);
+int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
+            int offset, void *bp, int len);
+int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+              int len);
+int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+             int len);
+int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+                  int len);
+int one_qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+                 int len);
diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c
new file mode 100644 (file)
index 0000000..792f15e
--- /dev/null
@@ -0,0 +1,2580 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/io.h>
+#include <rdma/rdma_vt.h>
+#include <rdma/rdmavt_qp.h>
+
+#include "hfi.h"
+#include "qp.h"
+#include "verbs_txreq.h"
+#include "trace.h"
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_RC_##x
+
+/**
+ * hfi1_add_retry_timer - add/start a retry timer
+ * @qp - the QP
+ *
+ * add a retry timer on the QP
+ */
+static inline void hfi1_add_retry_timer(struct rvt_qp *qp)
+{
+       struct ib_qp *ibqp = &qp->ibqp;
+       struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
+
+       qp->s_flags |= RVT_S_TIMER;
+       /* 4.096 usec. * (1 << qp->timeout) */
+       qp->s_timer.expires = jiffies + qp->timeout_jiffies +
+                             rdi->busy_jiffies;
+       add_timer(&qp->s_timer);
+}
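+/*
+ * As a sense check (example value): for a qp->timeout of 14 the interval
+ * is 4.096 usec * (1 << 14), roughly 67 msec, plus rdi->busy_jiffies.
+ */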
+
+/**
+ * hfi1_add_rnr_timer - add/start an rnr timer
+ * @qp - the QP
+ * @to - timeout in usecs
+ *
+ * add an rnr timer on the QP
+ */
+void hfi1_add_rnr_timer(struct rvt_qp *qp, u32 to)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+
+       qp->s_flags |= RVT_S_WAIT_RNR;
+       qp->s_timer.expires = jiffies + usecs_to_jiffies(to);
+       add_timer(&priv->s_rnr_timer);
+}
+
+/**
+ * hfi1_mod_retry_timer - mod a retry timer
+ * @qp - the QP
+ *
+ * Modify a potentially already running retry
+ * timer
+ */
+static inline void hfi1_mod_retry_timer(struct rvt_qp *qp)
+{
+       struct ib_qp *ibqp = &qp->ibqp;
+       struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
+
+       qp->s_flags |= RVT_S_TIMER;
+       /* 4.096 usec. * (1 << qp->timeout) */
+       mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies +
+                 rdi->busy_jiffies);
+}
+
+/**
+ * hfi1_stop_retry_timer - stop a retry timer
+ * @qp - the QP
+ *
+ * stop a retry timer and return whether the timer
+ * had been pending.
+ */
+static inline int hfi1_stop_retry_timer(struct rvt_qp *qp)
+{
+       int rval = 0;
+
+       /* Remove QP from retry */
+       if (qp->s_flags & RVT_S_TIMER) {
+               qp->s_flags &= ~RVT_S_TIMER;
+               rval = del_timer(&qp->s_timer);
+       }
+       return rval;
+}
+
+/**
+ * hfi1_stop_rc_timers - stop all timers
+ * @qp - the QP
+ *
+ * stop any pending timers
+ */
+void hfi1_stop_rc_timers(struct rvt_qp *qp)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+
+       /* Remove QP from all timers */
+       if (qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR)) {
+               qp->s_flags &= ~(RVT_S_TIMER | RVT_S_WAIT_RNR);
+               del_timer(&qp->s_timer);
+               del_timer(&priv->s_rnr_timer);
+       }
+}
+
+/**
+ * hfi1_stop_rnr_timer - stop an rnr timer
+ * @qp - the QP
+ *
+ * stop an rnr timer and return whether the timer
+ * had been pending.
+ */
+static inline int hfi1_stop_rnr_timer(struct rvt_qp *qp)
+{
+       int rval = 0;
+       struct hfi1_qp_priv *priv = qp->priv;
+
+       /* Remove QP from rnr timer */
+       if (qp->s_flags & RVT_S_WAIT_RNR) {
+               qp->s_flags &= ~RVT_S_WAIT_RNR;
+               rval = del_timer(&priv->s_rnr_timer);
+       }
+       return rval;
+}
+
+/**
+ * hfi1_del_timers_sync - wait for any timeout routines to exit
+ * @qp - the QP
+ */
+void hfi1_del_timers_sync(struct rvt_qp *qp)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+
+       del_timer_sync(&qp->s_timer);
+       del_timer_sync(&priv->s_rnr_timer);
+}
+
+/* only opcode mask for adaptive pio */
+const u32 rc_only_opcode =
+       BIT(OP(SEND_ONLY) & 0x1f) |
+       BIT(OP(SEND_ONLY_WITH_IMMEDIATE) & 0x1f) |
+       BIT(OP(RDMA_WRITE_ONLY) & 0x1f) |
+       BIT(OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE) & 0x1f) |
+       BIT(OP(RDMA_READ_REQUEST) & 0x1f) |
+       BIT(OP(ACKNOWLEDGE) & 0x1f) |
+       BIT(OP(ATOMIC_ACKNOWLEDGE) & 0x1f) |
+       BIT(OP(COMPARE_SWAP) & 0x1f) |
+       BIT(OP(FETCH_ADD) & 0x1f);
+
+static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
+                      u32 psn, u32 pmtu)
+{
+       u32 len;
+
+       len = delta_psn(psn, wqe->psn) * pmtu;
+       ss->sge = wqe->sg_list[0];
+       ss->sg_list = wqe->sg_list + 1;
+       ss->num_sge = wqe->wr.num_sge;
+       ss->total_len = wqe->length;
+       hfi1_skip_sge(ss, len, 0);
+       return wqe->length - len;
+}
+
+/**
+ * make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
+ * @dev: the device for this QP
+ * @qp: a pointer to the QP
+ * @ohdr: a pointer to the IB header being constructed
+ * @ps: the xmit packet state
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ * Note that we are in the responder's side of the QP context.
+ * Note the QP s_lock must be held.
+ */
+static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
+                      struct hfi1_other_headers *ohdr,
+                      struct hfi1_pkt_state *ps)
+{
+       struct rvt_ack_entry *e;
+       u32 hwords;
+       u32 len;
+       u32 bth0;
+       u32 bth2;
+       int middle = 0;
+       u32 pmtu = qp->pmtu;
+       struct hfi1_qp_priv *priv = qp->priv;
+
+       /* Don't send an ACK if we aren't supposed to. */
+       if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
+               goto bail;
+
+       /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+       hwords = 5;
+
+       switch (qp->s_ack_state) {
+       case OP(RDMA_READ_RESPONSE_LAST):
+       case OP(RDMA_READ_RESPONSE_ONLY):
+               e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+               if (e->rdma_sge.mr) {
+                       rvt_put_mr(e->rdma_sge.mr);
+                       e->rdma_sge.mr = NULL;
+               }
+               /* FALLTHROUGH */
+       case OP(ATOMIC_ACKNOWLEDGE):
+               /*
+                * We can increment the tail pointer now that the last
+                * response has been sent instead of only being
+                * constructed.
+                */
+               if (++qp->s_tail_ack_queue > HFI1_MAX_RDMA_ATOMIC)
+                       qp->s_tail_ack_queue = 0;
+               /* FALLTHROUGH */
+       case OP(SEND_ONLY):
+       case OP(ACKNOWLEDGE):
+               /* Check for no next entry in the queue. */
+               if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
+                       if (qp->s_flags & RVT_S_ACK_PENDING)
+                               goto normal;
+                       goto bail;
+               }
+
+               e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+               if (e->opcode == OP(RDMA_READ_REQUEST)) {
+                       /*
+                        * If an RDMA read response is being resent and
+                        * we haven't seen the duplicate request yet,
+                        * then stop sending the remaining responses the
+                        * responder has seen until the requester re-sends it.
+                        */
+                       len = e->rdma_sge.sge_length;
+                       if (len && !e->rdma_sge.mr) {
+                               qp->s_tail_ack_queue = qp->r_head_ack_queue;
+                               goto bail;
+                       }
+                       /* Copy SGE state in case we need to resend */
+                       ps->s_txreq->mr = e->rdma_sge.mr;
+                       if (ps->s_txreq->mr)
+                               rvt_get_mr(ps->s_txreq->mr);
+                       qp->s_ack_rdma_sge.sge = e->rdma_sge;
+                       qp->s_ack_rdma_sge.num_sge = 1;
+                       qp->s_cur_sge = &qp->s_ack_rdma_sge;
+                       if (len > pmtu) {
+                               len = pmtu;
+                               qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
+                       } else {
+                               qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
+                               e->sent = 1;
+                       }
+                       ohdr->u.aeth = hfi1_compute_aeth(qp);
+                       hwords++;
+                       qp->s_ack_rdma_psn = e->psn;
+                       bth2 = mask_psn(qp->s_ack_rdma_psn++);
+               } else {
+                       /* COMPARE_SWAP or FETCH_ADD */
+                       qp->s_cur_sge = NULL;
+                       len = 0;
+                       qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
+                       ohdr->u.at.aeth = hfi1_compute_aeth(qp);
+                       ohdr->u.at.atomic_ack_eth[0] =
+                               cpu_to_be32(e->atomic_data >> 32);
+                       ohdr->u.at.atomic_ack_eth[1] =
+                               cpu_to_be32(e->atomic_data);
+                       hwords += sizeof(ohdr->u.at) / sizeof(u32);
+                       bth2 = mask_psn(e->psn);
+                       e->sent = 1;
+               }
+               bth0 = qp->s_ack_state << 24;
+               break;
+
+       case OP(RDMA_READ_RESPONSE_FIRST):
+               qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
+               /* FALLTHROUGH */
+       case OP(RDMA_READ_RESPONSE_MIDDLE):
+               qp->s_cur_sge = &qp->s_ack_rdma_sge;
+               ps->s_txreq->mr = qp->s_ack_rdma_sge.sge.mr;
+               if (ps->s_txreq->mr)
+                       rvt_get_mr(ps->s_txreq->mr);
+               len = qp->s_ack_rdma_sge.sge.sge_length;
+               if (len > pmtu) {
+                       len = pmtu;
+                       middle = HFI1_CAP_IS_KSET(SDMA_AHG);
+               } else {
+                       ohdr->u.aeth = hfi1_compute_aeth(qp);
+                       hwords++;
+                       qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
+                       e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+                       e->sent = 1;
+               }
+               bth0 = qp->s_ack_state << 24;
+               bth2 = mask_psn(qp->s_ack_rdma_psn++);
+               break;
+
+       default:
+normal:
+               /*
+                * Send a regular ACK.
+                * Set the s_ack_state so we wait until after sending
+                * the ACK before setting s_ack_state to ACKNOWLEDGE
+                * (see above).
+                */
+               qp->s_ack_state = OP(SEND_ONLY);
+               qp->s_flags &= ~RVT_S_ACK_PENDING;
+               qp->s_cur_sge = NULL;
+               if (qp->s_nak_state)
+                       ohdr->u.aeth =
+                               cpu_to_be32((qp->r_msn & HFI1_MSN_MASK) |
+                                           (qp->s_nak_state <<
+                                            HFI1_AETH_CREDIT_SHIFT));
+               else
+                       ohdr->u.aeth = hfi1_compute_aeth(qp);
+               hwords++;
+               len = 0;
+               bth0 = OP(ACKNOWLEDGE) << 24;
+               bth2 = mask_psn(qp->s_ack_psn);
+       }
+       qp->s_rdma_ack_cnt++;
+       qp->s_hdrwords = hwords;
+       ps->s_txreq->sde = priv->s_sde;
+       qp->s_cur_size = len;
+       hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle, ps);
+       /* pbc */
+       ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
+       return 1;
+
+bail:
+       qp->s_ack_state = OP(ACKNOWLEDGE);
+       /*
+        * Ensure s_rdma_ack_cnt changes are committed prior to resetting
+        * RVT_S_RESP_PENDING
+        */
+       smp_wmb();
+       qp->s_flags &= ~(RVT_S_RESP_PENDING
+                               | RVT_S_ACK_PENDING
+                               | RVT_S_AHG_VALID);
+       return 0;
+}
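+/*
+ * To trace the state machine above with an example (sizes illustrative): a
+ * queued RDMA read of three MTUs is emitted as RDMA_READ_RESPONSE_FIRST,
+ * then _MIDDLE, then _LAST; the tail of s_ack_queue only advances once the
+ * LAST (or ONLY) response has actually been sent, not merely constructed.
+ */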
+
+/**
+ * hfi1_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
+ * @qp: a pointer to the QP
+ *
+ * Assumes s_lock is held.
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+       struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+       struct hfi1_other_headers *ohdr;
+       struct rvt_sge_state *ss;
+       struct rvt_swqe *wqe;
+       /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+       u32 hwords = 5;
+       u32 len;
+       u32 bth0 = 0;
+       u32 bth2;
+       u32 pmtu = qp->pmtu;
+       char newreq;
+       int middle = 0;
+       int delta;
+
+       ps->s_txreq = get_txreq(ps->dev, qp);
+       if (IS_ERR(ps->s_txreq))
+               goto bail_no_tx;
+
+       ohdr = &ps->s_txreq->phdr.hdr.u.oth;
+       if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+               ohdr = &ps->s_txreq->phdr.hdr.u.l.oth;
+
+       /* Sending responses has higher priority than sending requests. */
+       if ((qp->s_flags & RVT_S_RESP_PENDING) &&
+           make_rc_ack(dev, qp, ohdr, ps))
+               return 1;
+
+       if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) {
+               if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
+                       goto bail;
+               /* We are in the error state, flush the work request. */
+               smp_read_barrier_depends(); /* see post_one_send() */
+               if (qp->s_last == ACCESS_ONCE(qp->s_head))
+                       goto bail;
+               /* If DMAs are in progress, we can't flush immediately. */
+               if (iowait_sdma_pending(&priv->s_iowait)) {
+                       qp->s_flags |= RVT_S_WAIT_DMA;
+                       goto bail;
+               }
+               clear_ahg(qp);
+               wqe = rvt_get_swqe_ptr(qp, qp->s_last);
+               hfi1_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
+                       IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
+               /* will get called again */
+               goto done_free_tx;
+       }
+
+       if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK))
+               goto bail;
+
+       if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) {
+               if (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) {
+                       qp->s_flags |= RVT_S_WAIT_PSN;
+                       goto bail;
+               }
+               qp->s_sending_psn = qp->s_psn;
+               qp->s_sending_hpsn = qp->s_psn - 1;
+       }
+
+       /* Send a request. */
+       wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
+       switch (qp->s_state) {
+       default:
+               if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK))
+                       goto bail;
+               /*
+                * Resend an old request or start a new one.
+                *
+                * We keep track of the current SWQE so that
+                * we don't reset the "furthest progress" state
+                * if we need to back up.
+                */
+               newreq = 0;
+               if (qp->s_cur == qp->s_tail) {
+                       /* Check if send work queue is empty. */
+                       if (qp->s_tail == qp->s_head) {
+                               clear_ahg(qp);
+                               goto bail;
+                       }
+                       /*
+                        * If a fence is requested, wait for previous
+                        * RDMA read and atomic operations to finish.
+                        */
+                       if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
+                           qp->s_num_rd_atomic) {
+                               qp->s_flags |= RVT_S_WAIT_FENCE;
+                               goto bail;
+                       }
+                       newreq = 1;
+                       qp->s_psn = wqe->psn;
+               }
+               /*
+                * Note that we have to be careful not to modify the
+                * original work request since we may need to resend
+                * it.
+                */
+               len = wqe->length;
+               ss = &qp->s_sge;
+               bth2 = mask_psn(qp->s_psn);
+               switch (wqe->wr.opcode) {
+               case IB_WR_SEND:
+               case IB_WR_SEND_WITH_IMM:
+                       /* If no credit, return. */
+                       if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
+                           cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
+                               qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
+                               goto bail;
+                       }
+                       if (len > pmtu) {
+                               qp->s_state = OP(SEND_FIRST);
+                               len = pmtu;
+                               break;
+                       }
+                       if (wqe->wr.opcode == IB_WR_SEND) {
+                               qp->s_state = OP(SEND_ONLY);
+                       } else {
+                               qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
+                               /* Immediate data comes after the BTH */
+                               ohdr->u.imm_data = wqe->wr.ex.imm_data;
+                               hwords += 1;
+                       }
+                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                               bth0 |= IB_BTH_SOLICITED;
+                       bth2 |= IB_BTH_REQ_ACK;
+                       if (++qp->s_cur == qp->s_size)
+                               qp->s_cur = 0;
+                       break;
+
+               case IB_WR_RDMA_WRITE:
+                       if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+                               qp->s_lsn++;
+                       /* FALLTHROUGH */
+               case IB_WR_RDMA_WRITE_WITH_IMM:
+                       /* If no credit, return. */
+                       if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
+                           cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
+                               qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
+                               goto bail;
+                       }
+                       ohdr->u.rc.reth.vaddr =
+                               cpu_to_be64(wqe->rdma_wr.remote_addr);
+                       ohdr->u.rc.reth.rkey =
+                               cpu_to_be32(wqe->rdma_wr.rkey);
+                       ohdr->u.rc.reth.length = cpu_to_be32(len);
+                       hwords += sizeof(struct ib_reth) / sizeof(u32);
+                       if (len > pmtu) {
+                               qp->s_state = OP(RDMA_WRITE_FIRST);
+                               len = pmtu;
+                               break;
+                       }
+                       if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
+                               qp->s_state = OP(RDMA_WRITE_ONLY);
+                       } else {
+                               qp->s_state =
+                                       OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
+                               /* Immediate data comes after RETH */
+                               ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
+                               hwords += 1;
+                               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                                       bth0 |= IB_BTH_SOLICITED;
+                       }
+                       bth2 |= IB_BTH_REQ_ACK;
+                       if (++qp->s_cur == qp->s_size)
+                               qp->s_cur = 0;
+                       break;
+
+               case IB_WR_RDMA_READ:
+                       /*
+                        * Don't allow more operations to be started
+                        * than the QP limits allow.
+                        */
+                       if (newreq) {
+                               if (qp->s_num_rd_atomic >=
+                                   qp->s_max_rd_atomic) {
+                                       qp->s_flags |= RVT_S_WAIT_RDMAR;
+                                       goto bail;
+                               }
+                               qp->s_num_rd_atomic++;
+                               if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+                                       qp->s_lsn++;
+                       }
+                       ohdr->u.rc.reth.vaddr =
+                               cpu_to_be64(wqe->rdma_wr.remote_addr);
+                       ohdr->u.rc.reth.rkey =
+                               cpu_to_be32(wqe->rdma_wr.rkey);
+                       ohdr->u.rc.reth.length = cpu_to_be32(len);
+                       qp->s_state = OP(RDMA_READ_REQUEST);
+                       hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
+                       ss = NULL;
+                       len = 0;
+                       bth2 |= IB_BTH_REQ_ACK;
+                       if (++qp->s_cur == qp->s_size)
+                               qp->s_cur = 0;
+                       break;
+
+               case IB_WR_ATOMIC_CMP_AND_SWP:
+               case IB_WR_ATOMIC_FETCH_AND_ADD:
+                       /*
+                        * Don't allow more operations to be started
+                        * than the QP limits allow.
+                        */
+                       if (newreq) {
+                               if (qp->s_num_rd_atomic >=
+                                   qp->s_max_rd_atomic) {
+                                       qp->s_flags |= RVT_S_WAIT_RDMAR;
+                                       goto bail;
+                               }
+                               qp->s_num_rd_atomic++;
+                               if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+                                       qp->s_lsn++;
+                       }
+                       if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+                               qp->s_state = OP(COMPARE_SWAP);
+                               ohdr->u.atomic_eth.swap_data = cpu_to_be64(
+                                       wqe->atomic_wr.swap);
+                               ohdr->u.atomic_eth.compare_data = cpu_to_be64(
+                                       wqe->atomic_wr.compare_add);
+                       } else {
+                               qp->s_state = OP(FETCH_ADD);
+                               ohdr->u.atomic_eth.swap_data = cpu_to_be64(
+                                       wqe->atomic_wr.compare_add);
+                               ohdr->u.atomic_eth.compare_data = 0;
+                       }
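+                       /* The 64-bit remote address is sent as two big-endian 32-bit words. */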
+                       ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32(
+                               wqe->atomic_wr.remote_addr >> 32);
+                       ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32(
+                               wqe->atomic_wr.remote_addr);
+                       ohdr->u.atomic_eth.rkey = cpu_to_be32(
+                               wqe->atomic_wr.rkey);
+                       hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
+                       ss = NULL;
+                       len = 0;
+                       bth2 |= IB_BTH_REQ_ACK;
+                       if (++qp->s_cur == qp->s_size)
+                               qp->s_cur = 0;
+                       break;
+
+               default:
+                       goto bail;
+               }
+               qp->s_sge.sge = wqe->sg_list[0];
+               qp->s_sge.sg_list = wqe->sg_list + 1;
+               qp->s_sge.num_sge = wqe->wr.num_sge;
+               qp->s_sge.total_len = wqe->length;
+               qp->s_len = wqe->length;
+               if (newreq) {
+                       qp->s_tail++;
+                       if (qp->s_tail >= qp->s_size)
+                               qp->s_tail = 0;
+               }
+               if (wqe->wr.opcode == IB_WR_RDMA_READ)
+                       qp->s_psn = wqe->lpsn + 1;
+               else
+                       qp->s_psn++;
+               break;
+
+       case OP(RDMA_READ_RESPONSE_FIRST):
+               /*
+                * qp->s_state is normally set to the opcode of the
+                * last packet constructed for new requests and therefore
+                * is never set to RDMA read response.
+                * RDMA_READ_RESPONSE_FIRST is used by the ACK processing
+                * thread to indicate a SEND needs to be restarted from an
+                * earlier PSN without interfering with the sending thread.
+                * See restart_rc().
+                */
+               qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
+               /* FALLTHROUGH */
+       case OP(SEND_FIRST):
+               qp->s_state = OP(SEND_MIDDLE);
+               /* FALLTHROUGH */
+       case OP(SEND_MIDDLE):
+               bth2 = mask_psn(qp->s_psn++);
+               ss = &qp->s_sge;
+               len = qp->s_len;
+               if (len > pmtu) {
+                       len = pmtu;
+                       middle = HFI1_CAP_IS_KSET(SDMA_AHG);
+                       break;
+               }
+               if (wqe->wr.opcode == IB_WR_SEND) {
+                       qp->s_state = OP(SEND_LAST);
+               } else {
+                       qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
+                       /* Immediate data comes after the BTH */
+                       ohdr->u.imm_data = wqe->wr.ex.imm_data;
+                       hwords += 1;
+               }
+               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                       bth0 |= IB_BTH_SOLICITED;
+               bth2 |= IB_BTH_REQ_ACK;
+               qp->s_cur++;
+               if (qp->s_cur >= qp->s_size)
+                       qp->s_cur = 0;
+               break;
+
+       case OP(RDMA_READ_RESPONSE_LAST):
+               /*
+                * qp->s_state is normally set to the opcode of the
+                * last packet constructed for new requests and therefore
+                * is never set to RDMA read response.
+                * RDMA_READ_RESPONSE_LAST is used by the ACK processing
+                * thread to indicate an RDMA write needs to be restarted from
+                * an earlier PSN without interfering with the sending thread.
+                * See restart_rc().
+                */
+               qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
+               /* FALLTHROUGH */
+       case OP(RDMA_WRITE_FIRST):
+               qp->s_state = OP(RDMA_WRITE_MIDDLE);
+               /* FALLTHROUGH */
+       case OP(RDMA_WRITE_MIDDLE):
+               bth2 = mask_psn(qp->s_psn++);
+               ss = &qp->s_sge;
+               len = qp->s_len;
+               if (len > pmtu) {
+                       len = pmtu;
+                       middle = HFI1_CAP_IS_KSET(SDMA_AHG);
+                       break;
+               }
+               if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
+                       qp->s_state = OP(RDMA_WRITE_LAST);
+               } else {
+                       qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
+                       /* Immediate data comes after the BTH */
+                       ohdr->u.imm_data = wqe->wr.ex.imm_data;
+                       hwords += 1;
+                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                               bth0 |= IB_BTH_SOLICITED;
+               }
+               bth2 |= IB_BTH_REQ_ACK;
+               qp->s_cur++;
+               if (qp->s_cur >= qp->s_size)
+                       qp->s_cur = 0;
+               break;
+
+       case OP(RDMA_READ_RESPONSE_MIDDLE):
+               /*
+                * qp->s_state is normally set to the opcode of the
+                * last packet constructed for new requests and therefore
+                * is never set to RDMA read response.
+                * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing
+                * thread to indicate an RDMA read needs to be restarted from
+                * an earlier PSN without interfering with the sending thread.
+                * See restart_rc().
+                */
+               len = (delta_psn(qp->s_psn, wqe->psn)) * pmtu;
+               ohdr->u.rc.reth.vaddr =
+                       cpu_to_be64(wqe->rdma_wr.remote_addr + len);
+               ohdr->u.rc.reth.rkey =
+                       cpu_to_be32(wqe->rdma_wr.rkey);
+               ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len);
+               qp->s_state = OP(RDMA_READ_REQUEST);
+               hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
+               bth2 = mask_psn(qp->s_psn) | IB_BTH_REQ_ACK;
+               qp->s_psn = wqe->lpsn + 1;
+               ss = NULL;
+               len = 0;
+               qp->s_cur++;
+               if (qp->s_cur == qp->s_size)
+                       qp->s_cur = 0;
+               break;
+       }
+       qp->s_sending_hpsn = bth2;
+       delta = delta_psn(bth2, wqe->psn);
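+       /* Request an ACK every HFI1_PSN_CREDIT packets within a request. */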
+       if (delta && delta % HFI1_PSN_CREDIT == 0)
+               bth2 |= IB_BTH_REQ_ACK;
+       if (qp->s_flags & RVT_S_SEND_ONE) {
+               qp->s_flags &= ~RVT_S_SEND_ONE;
+               qp->s_flags |= RVT_S_WAIT_ACK;
+               bth2 |= IB_BTH_REQ_ACK;
+       }
+       qp->s_len -= len;
+       qp->s_hdrwords = hwords;
+       ps->s_txreq->sde = priv->s_sde;
+       qp->s_cur_sge = ss;
+       qp->s_cur_size = len;
+       hfi1_make_ruc_header(
+               qp,
+               ohdr,
+               bth0 | (qp->s_state << 24),
+               bth2,
+               middle,
+               ps);
+       /* pbc */
+       ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
+       return 1;
+
+done_free_tx:
+       hfi1_put_txreq(ps->s_txreq);
+       ps->s_txreq = NULL;
+       return 1;
+
+bail:
+       hfi1_put_txreq(ps->s_txreq);
+
+bail_no_tx:
+       ps->s_txreq = NULL;
+       qp->s_flags &= ~RVT_S_BUSY;
+       qp->s_hdrwords = 0;
+       return 0;
+}
+
+/**
+ * hfi1_send_rc_ack - Construct an ACK packet and send it
+ * @qp: a pointer to the QP
+ *
+ * This is called from hfi1_rc_rcv() and handle_receive_interrupt().
+ * Note that RDMA reads and atomics are handled in the
+ * send side QP state and tasklet.
+ */
+void hfi1_send_rc_ack(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp,
+                     int is_fecn)
+{
+       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       u64 pbc, pbc_flags = 0;
+       u16 lrh0;
+       u16 sc5;
+       u32 bth0;
+       u32 hwords;
+       u32 vl, plen;
+       struct send_context *sc;
+       struct pio_buf *pbuf;
+       struct hfi1_ib_header hdr;
+       struct hfi1_other_headers *ohdr;
+       unsigned long flags;
+
+       /* Don't send ACK or NAK if an RDMA read or atomic is pending. */
+       if (qp->s_flags & RVT_S_RESP_PENDING)
+               goto queue_ack;
+
+       /* Ensure s_rdma_ack_cnt changes are committed */
+       smp_read_barrier_depends();
+       if (qp->s_rdma_ack_cnt)
+               goto queue_ack;
+
+       /* Construct the header */
+       /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4 */
+       hwords = 6;
+       if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
+               hwords += hfi1_make_grh(ibp, &hdr.u.l.grh,
+                                      &qp->remote_ah_attr.grh, hwords, 0);
+               ohdr = &hdr.u.l.oth;
+               lrh0 = HFI1_LRH_GRH;
+       } else {
+               ohdr = &hdr.u.oth;
+               lrh0 = HFI1_LRH_BTH;
+       }
+       /* read pkey_index w/o lock (it's atomic) */
+       bth0 = hfi1_get_pkey(ibp, qp->s_pkey_index) | (OP(ACKNOWLEDGE) << 24);
+       if (qp->s_mig_state == IB_MIG_MIGRATED)
+               bth0 |= IB_BTH_MIG_REQ;
+       if (qp->r_nak_state)
+               ohdr->u.aeth = cpu_to_be32((qp->r_msn & HFI1_MSN_MASK) |
+                                           (qp->r_nak_state <<
+                                            HFI1_AETH_CREDIT_SHIFT));
+       else
+               ohdr->u.aeth = hfi1_compute_aeth(qp);
+       sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+       /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
+       pbc_flags |= ((!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT);
+       lrh0 |= (sc5 & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4;
+       hdr.lrh[0] = cpu_to_be16(lrh0);
+       hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
+       hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
+       hdr.lrh[3] = cpu_to_be16(ppd->lid | qp->remote_ah_attr.src_path_bits);
+       ohdr->bth[0] = cpu_to_be32(bth0);
+       ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
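+       /* Reflect a received FECN back to the sender as a BECN. */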
+       ohdr->bth[1] |= cpu_to_be32((!!is_fecn) << HFI1_BECN_SHIFT);
+       ohdr->bth[2] = cpu_to_be32(mask_psn(qp->r_ack_psn));
+
+       /* Don't try to send ACKs if the link isn't ACTIVE */
+       if (driver_lstate(ppd) != IB_PORT_ACTIVE)
+               return;
+
+       sc = rcd->sc;
+       plen = 2 /* PBC */ + hwords;
+       vl = sc_to_vlt(ppd->dd, sc5);
+       pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
+
+       pbuf = sc_buffer_alloc(sc, plen, NULL, NULL);
+       if (!pbuf) {
+               /*
+                * We have no room to send at the moment.  Pass
+                * responsibility for sending the ACK to the send tasklet
+                * so that when enough buffer space becomes available,
+                * the ACK is sent ahead of other outgoing packets.
+                */
+               goto queue_ack;
+       }
+
+       trace_ack_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &hdr);
+
+       /* write the pbc and data */
+       ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc, &hdr, hwords);
+
+       return;
+
+queue_ack:
+       this_cpu_inc(*ibp->rvp.rc_qacks);
+       spin_lock_irqsave(&qp->s_lock, flags);
+       qp->s_flags |= RVT_S_ACK_PENDING | RVT_S_RESP_PENDING;
+       qp->s_nak_state = qp->r_nak_state;
+       qp->s_ack_psn = qp->r_ack_psn;
+       if (is_fecn)
+               qp->s_flags |= RVT_S_ECN;
+
+       /* Schedule the send tasklet. */
+       hfi1_schedule_send(qp);
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+}
+
+/**
+ * reset_psn - reset the QP state to send starting from PSN
+ * @qp: the QP
+ * @psn: the packet sequence number to restart at
+ *
+ * This is called from hfi1_rc_rcv() to process an incoming RC ACK
+ * for the given QP.
+ * Called at interrupt level with the QP s_lock held.
+ */
+static void reset_psn(struct rvt_qp *qp, u32 psn)
+{
+       u32 n = qp->s_acked;
+       struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n);
+       u32 opcode;
+
+       qp->s_cur = n;
+
+       /*
+        * If we are starting the request from the beginning,
+        * let the normal send code handle initialization.
+        */
+       if (cmp_psn(psn, wqe->psn) <= 0) {
+               qp->s_state = OP(SEND_LAST);
+               goto done;
+       }
+
+       /* Find the work request opcode corresponding to the given PSN. */
+       opcode = wqe->wr.opcode;
+       for (;;) {
+               int diff;
+
+               if (++n == qp->s_size)
+                       n = 0;
+               if (n == qp->s_tail)
+                       break;
+               wqe = rvt_get_swqe_ptr(qp, n);
+               diff = cmp_psn(psn, wqe->psn);
+               if (diff < 0)
+                       break;
+               qp->s_cur = n;
+               /*
+                * If we are starting the request from the beginning,
+                * let the normal send code handle initialization.
+                */
+               if (diff == 0) {
+                       qp->s_state = OP(SEND_LAST);
+                       goto done;
+               }
+               opcode = wqe->wr.opcode;
+       }
+
+       /*
+        * Set the state to restart in the middle of a request.
+        * Don't change the s_sge, s_cur_sge, or s_cur_size.
+        * See hfi1_make_rc_req().
+        */
+       switch (opcode) {
+       case IB_WR_SEND:
+       case IB_WR_SEND_WITH_IMM:
+               qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
+               break;
+
+       case IB_WR_RDMA_WRITE:
+       case IB_WR_RDMA_WRITE_WITH_IMM:
+               qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
+               break;
+
+       case IB_WR_RDMA_READ:
+               qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
+               break;
+
+       default:
+               /*
+                * This case shouldn't happen since there is only
+                * one PSN per request.
+                */
+               qp->s_state = OP(SEND_LAST);
+       }
+done:
+       qp->s_psn = psn;
+       /*
+        * Set RVT_S_WAIT_PSN as rc_complete() may start the timer
+        * asynchronously before the send tasklet can get scheduled.
+        * Doing it in hfi1_make_rc_req() is too late.
+        */
+       if ((cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) &&
+           (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
+               qp->s_flags |= RVT_S_WAIT_PSN;
+       qp->s_flags &= ~RVT_S_AHG_VALID;
+}
+
+/*
+ * Back up requester to resend the last un-ACKed request.
+ * The QP r_lock and s_lock should be held and interrupts disabled.
+ */
+static void restart_rc(struct rvt_qp *qp, u32 psn, int wait)
+{
+       struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+       struct hfi1_ibport *ibp;
+
+       if (qp->s_retry == 0) {
+               if (qp->s_mig_state == IB_MIG_ARMED) {
+                       hfi1_migrate_qp(qp);
+                       qp->s_retry = qp->s_retry_cnt;
+               } else if (qp->s_last == qp->s_acked) {
+                       hfi1_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
+                       rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+                       return;
+               } else { /* need to handle delayed completion */
+                       return;
+               }
+       } else {
+               qp->s_retry--;
+       }
+
+       ibp = to_iport(qp->ibqp.device, qp->port_num);
+       if (wqe->wr.opcode == IB_WR_RDMA_READ)
+               ibp->rvp.n_rc_resends++;
+       else
+               ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
+
+       qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR |
+                        RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN |
+                        RVT_S_WAIT_ACK);
+       if (wait)
+               qp->s_flags |= RVT_S_SEND_ONE;
+       reset_psn(qp, psn);
+}
+
+/*
+ * This is called from s_timer for missing responses.
+ */
+void hfi1_rc_timeout(unsigned long arg)
+{
+       struct rvt_qp *qp = (struct rvt_qp *)arg;
+       struct hfi1_ibport *ibp;
+       unsigned long flags;
+
+       spin_lock_irqsave(&qp->r_lock, flags);
+       spin_lock(&qp->s_lock);
+       if (qp->s_flags & RVT_S_TIMER) {
+               ibp = to_iport(qp->ibqp.device, qp->port_num);
+               ibp->rvp.n_rc_timeouts++;
+               qp->s_flags &= ~RVT_S_TIMER;
+               del_timer(&qp->s_timer);
+               trace_hfi1_rc_timeout(qp, qp->s_last_psn + 1);
+               restart_rc(qp, qp->s_last_psn + 1, 1);
+               hfi1_schedule_send(qp);
+       }
+       spin_unlock(&qp->s_lock);
+       spin_unlock_irqrestore(&qp->r_lock, flags);
+}
+
+/*
+ * This is called from s_timer for RNR timeouts.
+ */
+void hfi1_rc_rnr_retry(unsigned long arg)
+{
+       struct rvt_qp *qp = (struct rvt_qp *)arg;
+       unsigned long flags;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+       hfi1_stop_rnr_timer(qp);
+       hfi1_schedule_send(qp);
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+}
+
+/*
+ * Set qp->s_sending_psn to the next PSN after the given one.
+ * This would be psn+1 except when RDMA reads are present.
+ */
+static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
+{
+       struct rvt_swqe *wqe;
+       u32 n = qp->s_last;
+
+       /* Find the work request corresponding to the given PSN. */
+       for (;;) {
+               wqe = rvt_get_swqe_ptr(qp, n);
+               if (cmp_psn(psn, wqe->lpsn) <= 0) {
+                       if (wqe->wr.opcode == IB_WR_RDMA_READ)
+                               qp->s_sending_psn = wqe->lpsn + 1;
+                       else
+                               qp->s_sending_psn = psn + 1;
+                       break;
+               }
+               if (++n == qp->s_size)
+                       n = 0;
+               if (n == qp->s_tail)
+                       break;
+       }
+}
+
+/*
+ * This should be called with the QP s_lock held and interrupts disabled.
+ */
+void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_ib_header *hdr)
+{
+       struct hfi1_other_headers *ohdr;
+       struct rvt_swqe *wqe;
+       struct ib_wc wc;
+       unsigned i;
+       u32 opcode;
+       u32 psn;
+
+       if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND))
+               return;
+
+       /* Find out where the BTH is */
+       if ((be16_to_cpu(hdr->lrh[0]) & 3) == HFI1_LRH_BTH)
+               ohdr = &hdr->u.oth;
+       else
+               ohdr = &hdr->u.l.oth;
+
+       opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+       if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
+           opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
+               WARN_ON(!qp->s_rdma_ack_cnt);
+               qp->s_rdma_ack_cnt--;
+               return;
+       }
+
+       psn = be32_to_cpu(ohdr->bth[2]);
+       reset_sending_psn(qp, psn);
+
+       /*
+        * Start timer after a packet requesting an ACK has been sent and
+        * there are still requests that haven't been acked.
+        */
+       if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
+           !(qp->s_flags &
+               (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
+               (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
+               hfi1_add_retry_timer(qp);
+
+       while (qp->s_last != qp->s_acked) {
+               u32 s_last;
+
+               wqe = rvt_get_swqe_ptr(qp, qp->s_last);
+               if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 &&
+                   cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
+                       break;
+               s_last = qp->s_last;
+               if (++s_last >= qp->s_size)
+                       s_last = 0;
+               qp->s_last = s_last;
+               /* see post_send() */
+               barrier();
+               for (i = 0; i < wqe->wr.num_sge; i++) {
+                       struct rvt_sge *sge = &wqe->sg_list[i];
+
+                       rvt_put_mr(sge->mr);
+               }
+               /* Post a send completion queue entry if requested. */
+               if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) ||
+                   (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
+                       memset(&wc, 0, sizeof(wc));
+                       wc.wr_id = wqe->wr.wr_id;
+                       wc.status = IB_WC_SUCCESS;
+                       wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
+                       wc.byte_len = wqe->length;
+                       wc.qp = &qp->ibqp;
+                       rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc, 0);
+               }
+       }
+       /*
+        * If we were waiting for sends to complete before re-sending,
+        * and they are now complete, restart sending.
+        */
+       trace_hfi1_rc_sendcomplete(qp, psn);
+       if (qp->s_flags & RVT_S_WAIT_PSN &&
+           cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
+               qp->s_flags &= ~RVT_S_WAIT_PSN;
+               qp->s_sending_psn = qp->s_psn;
+               qp->s_sending_hpsn = qp->s_psn - 1;
+               hfi1_schedule_send(qp);
+       }
+}
+
+static inline void update_last_psn(struct rvt_qp *qp, u32 psn)
+{
+       qp->s_last_psn = psn;
+}
+
+/*
+ * Generate a SWQE completion.
+ * This is similar to hfi1_send_complete but has to check to be sure
+ * that the SGEs are not being referenced if the SWQE is being resent.
+ */
+static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
+                                        struct rvt_swqe *wqe,
+                                        struct hfi1_ibport *ibp)
+{
+       struct ib_wc wc;
+       unsigned i;
+
+       /*
+        * Don't decrement refcount and don't generate a
+        * completion if the SWQE is being resent until the send
+        * is finished.
+        */
+       if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 ||
+           cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
+               u32 s_last;
+
+               for (i = 0; i < wqe->wr.num_sge; i++) {
+                       struct rvt_sge *sge = &wqe->sg_list[i];
+
+                       rvt_put_mr(sge->mr);
+               }
+               s_last = qp->s_last;
+               if (++s_last >= qp->s_size)
+                       s_last = 0;
+               qp->s_last = s_last;
+               /* see post_send() */
+               barrier();
+               /* Post a send completion queue entry if requested. */
+               if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) ||
+                   (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
+                       memset(&wc, 0, sizeof(wc));
+                       wc.wr_id = wqe->wr.wr_id;
+                       wc.status = IB_WC_SUCCESS;
+                       wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
+                       wc.byte_len = wqe->length;
+                       wc.qp = &qp->ibqp;
+                       rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc, 0);
+               }
+       } else {
+               struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+               this_cpu_inc(*ibp->rvp.rc_delayed_comp);
+               /*
+                * If send progress is not running, attempt to make
+                * progress on the SDMA queue.
+                */
+               if (ppd->dd->flags & HFI1_HAS_SEND_DMA) {
+                       struct sdma_engine *engine;
+                       u8 sc5;
+
+                       /* For now use sc to find engine */
+                       sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+                       engine = qp_to_sdma_engine(qp, sc5);
+                       sdma_engine_progress_schedule(engine);
+               }
+       }
+
+       qp->s_retry = qp->s_retry_cnt;
+       update_last_psn(qp, wqe->lpsn);
+
+       /*
+        * If we are completing a request which is in the process of
+        * being resent, we can stop re-sending it since we know the
+        * responder has already seen it.
+        */
+       if (qp->s_acked == qp->s_cur) {
+               if (++qp->s_cur >= qp->s_size)
+                       qp->s_cur = 0;
+               qp->s_acked = qp->s_cur;
+               wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
+               if (qp->s_acked != qp->s_tail) {
+                       qp->s_state = OP(SEND_LAST);
+                       qp->s_psn = wqe->psn;
+               }
+       } else {
+               if (++qp->s_acked >= qp->s_size)
+                       qp->s_acked = 0;
+               if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur)
+                       qp->s_draining = 0;
+               wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+       }
+       return wqe;
+}
+
+/**
+ * do_rc_ack - process an incoming RC ACK
+ * @qp: the QP the ACK came in on
+ * @psn: the packet sequence number of the ACK
+ * @opcode: the opcode of the request that resulted in the ACK
+ *
+ * This is called from rc_rcv_resp() to process an incoming RC ACK
+ * for the given QP.
+ * May be called at interrupt level, with the QP s_lock held.
+ * Returns 1 if OK, 0 if current operation should be aborted (NAK).
+ */
+static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
+                    u64 val, struct hfi1_ctxtdata *rcd)
+{
+       struct hfi1_ibport *ibp;
+       enum ib_wc_status status;
+       struct rvt_swqe *wqe;
+       int ret = 0;
+       u32 ack_psn;
+       int diff;
+       unsigned long to;
+
+       /*
+        * Note that NAKs implicitly ACK outstanding SEND and RDMA write
+        * requests and implicitly NAK RDMA read and atomic requests issued
+        * before the NAK'ed request.  The MSN won't include the NAK'ed
+        * request but will include an ACK'ed request(s).
+        */
+       ack_psn = psn;
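+       /* A NAK or RNR NAK does not acknowledge the NAK'ed PSN itself. */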
+       if (aeth >> 29)
+               ack_psn--;
+       wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+       ibp = to_iport(qp->ibqp.device, qp->port_num);
+
+       /*
+        * The MSN might be for a later WQE than the PSN indicates so
+        * only complete WQEs that the PSN finishes.
+        */
+       while ((diff = delta_psn(ack_psn, wqe->lpsn)) >= 0) {
+               /*
+                * RDMA_READ_RESPONSE_ONLY is a special case since
+                * we want to generate completion events for everything
+                * before the RDMA read, copy the data, then generate
+                * the completion for the read.
+                */
+               if (wqe->wr.opcode == IB_WR_RDMA_READ &&
+                   opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
+                   diff == 0) {
+                       ret = 1;
+                       goto bail_stop;
+               }
+               /*
+                * If this request is an RDMA read or atomic, and the ACK is
+                * for a later operation, this ACK NAKs the RDMA read or
+                * atomic.  In other words, only an RDMA_READ_LAST or ONLY
+                * can ACK an RDMA read, and likewise for atomic ops.  Note
+                * that the NAK case can only happen if relaxed ordering is
+                * used and requests are sent after an RDMA read or atomic
+                * is sent but before the response is received.
+                */
+               if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
+                    (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
+                   ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+                     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
+                    (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
+                       /* Retry this request. */
+                       if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
+                               qp->r_flags |= RVT_R_RDMAR_SEQ;
+                               restart_rc(qp, qp->s_last_psn + 1, 0);
+                               if (list_empty(&qp->rspwait)) {
+                                       qp->r_flags |= RVT_R_RSP_SEND;
+                                       atomic_inc(&qp->refcount);
+                                       list_add_tail(&qp->rspwait,
+                                                     &rcd->qp_wait_list);
+                               }
+                       }
+                       /*
+                        * No need to process the ACK/NAK since we are
+                        * restarting an earlier request.
+                        */
+                       goto bail_stop;
+               }
+               if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+                   wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
+                       u64 *vaddr = wqe->sg_list[0].vaddr;
+                       *vaddr = val;
+               }
+               if (qp->s_num_rd_atomic &&
+                   (wqe->wr.opcode == IB_WR_RDMA_READ ||
+                    wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+                    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
+                       qp->s_num_rd_atomic--;
+                       /* Restart sending task if fence is complete */
+                       if ((qp->s_flags & RVT_S_WAIT_FENCE) &&
+                           !qp->s_num_rd_atomic) {
+                               qp->s_flags &= ~(RVT_S_WAIT_FENCE |
+                                                RVT_S_WAIT_ACK);
+                               hfi1_schedule_send(qp);
+                       } else if (qp->s_flags & RVT_S_WAIT_RDMAR) {
+                               qp->s_flags &= ~(RVT_S_WAIT_RDMAR |
+                                                RVT_S_WAIT_ACK);
+                               hfi1_schedule_send(qp);
+                       }
+               }
+               wqe = do_rc_completion(qp, wqe, ibp);
+               if (qp->s_acked == qp->s_tail)
+                       break;
+       }
+
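+       /* The top three bits of the AETH encode the response type. */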
+       switch (aeth >> 29) {
+       case 0:         /* ACK */
+               this_cpu_inc(*ibp->rvp.rc_acks);
+               if (qp->s_acked != qp->s_tail) {
+                       /*
+                        * We are expecting more ACKs so
+                        * mod the retry timer.
+                        */
+                       hfi1_mod_retry_timer(qp);
+                       /*
+                        * We can stop re-sending the earlier packets and
+                        * continue with the next packet the receiver wants.
+                        */
+                       if (cmp_psn(qp->s_psn, psn) <= 0)
+                               reset_psn(qp, psn + 1);
+               } else {
+                       /* No more acks - kill all timers */
+                       hfi1_stop_rc_timers(qp);
+                       if (cmp_psn(qp->s_psn, psn) <= 0) {
+                               qp->s_state = OP(SEND_LAST);
+                               qp->s_psn = psn + 1;
+                       }
+               }
+               if (qp->s_flags & RVT_S_WAIT_ACK) {
+                       qp->s_flags &= ~RVT_S_WAIT_ACK;
+                       hfi1_schedule_send(qp);
+               }
+               hfi1_get_credit(qp, aeth);
+               qp->s_rnr_retry = qp->s_rnr_retry_cnt;
+               qp->s_retry = qp->s_retry_cnt;
+               update_last_psn(qp, psn);
+               return 1;
+
+       case 1:         /* RNR NAK */
+               ibp->rvp.n_rnr_naks++;
+               if (qp->s_acked == qp->s_tail)
+                       goto bail_stop;
+               if (qp->s_flags & RVT_S_WAIT_RNR)
+                       goto bail_stop;
+               if (qp->s_rnr_retry == 0) {
+                       status = IB_WC_RNR_RETRY_EXC_ERR;
+                       goto class_b;
+               }
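+               /* An RNR retry count of 7 means retry indefinitely. */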
+               if (qp->s_rnr_retry_cnt < 7)
+                       qp->s_rnr_retry--;
+
+               /* The last valid PSN is the previous PSN. */
+               update_last_psn(qp, psn - 1);
+
+               ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
+
+               reset_psn(qp, psn);
+
+               qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK);
+               hfi1_stop_rc_timers(qp);
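+               /* The RNR delay is encoded in the AETH credit field. */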
+               to =
+                       ib_hfi1_rnr_table[(aeth >> HFI1_AETH_CREDIT_SHIFT) &
+                                          HFI1_AETH_CREDIT_MASK];
+               hfi1_add_rnr_timer(qp, to);
+               return 0;
+
+       case 3:         /* NAK */
+               if (qp->s_acked == qp->s_tail)
+                       goto bail_stop;
+               /* The last valid PSN is the previous PSN. */
+               update_last_psn(qp, psn - 1);
+               switch ((aeth >> HFI1_AETH_CREDIT_SHIFT) &
+                       HFI1_AETH_CREDIT_MASK) {
+               case 0: /* PSN sequence error */
+                       ibp->rvp.n_seq_naks++;
+                       /*
+                        * Back up to the responder's expected PSN.
+                        * Note that we might get a NAK in the middle of an
+                        * RDMA READ response which terminates the RDMA
+                        * READ.
+                        */
+                       restart_rc(qp, psn, 0);
+                       hfi1_schedule_send(qp);
+                       break;
+
+               case 1: /* Invalid Request */
+                       status = IB_WC_REM_INV_REQ_ERR;
+                       ibp->rvp.n_other_naks++;
+                       goto class_b;
+
+               case 2: /* Remote Access Error */
+                       status = IB_WC_REM_ACCESS_ERR;
+                       ibp->rvp.n_other_naks++;
+                       goto class_b;
+
+               case 3: /* Remote Operation Error */
+                       status = IB_WC_REM_OP_ERR;
+                       ibp->rvp.n_other_naks++;
+class_b:
+                       if (qp->s_last == qp->s_acked) {
+                               hfi1_send_complete(qp, wqe, status);
+                               rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+                       }
+                       break;
+
+               default:
+                       /* Ignore other reserved NAK error codes */
+                       goto reserved;
+               }
+               qp->s_retry = qp->s_retry_cnt;
+               qp->s_rnr_retry = qp->s_rnr_retry_cnt;
+               goto bail_stop;
+
+       default:                /* 2: reserved */
+reserved:
+               /* Ignore reserved NAK codes. */
+               goto bail_stop;
+       }
+       /* cannot be reached  */
+bail_stop:
+       hfi1_stop_rc_timers(qp);
+       return ret;
+}
+
+/*
+ * We have seen an out-of-sequence RDMA read middle or last packet.
+ * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE.
+ */
+static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn,
+                        struct hfi1_ctxtdata *rcd)
+{
+       struct rvt_swqe *wqe;
+
+       /* Remove QP from retry timer */
+       hfi1_stop_rc_timers(qp);
+
+       wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+
+       while (cmp_psn(psn, wqe->lpsn) > 0) {
+               if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+                   wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+                   wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
+                       break;
+               wqe = do_rc_completion(qp, wqe, ibp);
+       }
+
+       ibp->rvp.n_rdma_seq++;
+       qp->r_flags |= RVT_R_RDMAR_SEQ;
+       restart_rc(qp, qp->s_last_psn + 1, 0);
+       if (list_empty(&qp->rspwait)) {
+               qp->r_flags |= RVT_R_RSP_SEND;
+               atomic_inc(&qp->refcount);
+               list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+       }
+}
+
+/**
+ * rc_rcv_resp - process an incoming RC response packet
+ * @ibp: the port this packet came in on
+ * @ohdr: the other headers for this packet
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP for this packet
+ * @opcode: the opcode for this packet
+ * @psn: the packet sequence number for this packet
+ * @hdrsize: the header length
+ * @pmtu: the path MTU
+ *
+ * This is called from hfi1_rc_rcv() to process an incoming RC response
+ * packet for the given QP.
+ * Called at interrupt level.
+ */
+static void rc_rcv_resp(struct hfi1_ibport *ibp,
+                       struct hfi1_other_headers *ohdr,
+                       void *data, u32 tlen, struct rvt_qp *qp,
+                       u32 opcode, u32 psn, u32 hdrsize, u32 pmtu,
+                       struct hfi1_ctxtdata *rcd)
+{
+       struct rvt_swqe *wqe;
+       enum ib_wc_status status;
+       unsigned long flags;
+       int diff;
+       u32 pad;
+       u32 aeth;
+       u64 val;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+
+       trace_hfi1_rc_ack(qp, psn);
+
+       /* Ignore invalid responses. */
+       smp_read_barrier_depends(); /* see post_one_send */
+       if (cmp_psn(psn, ACCESS_ONCE(qp->s_next_psn)) >= 0)
+               goto ack_done;
+
+       /* Ignore duplicate responses. */
+       diff = cmp_psn(psn, qp->s_last_psn);
+       if (unlikely(diff <= 0)) {
+               /* Update credits for "ghost" ACKs */
+               if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
+                       aeth = be32_to_cpu(ohdr->u.aeth);
+                       if ((aeth >> 29) == 0)
+                               hfi1_get_credit(qp, aeth);
+               }
+               goto ack_done;
+       }
+
+       /*
+        * Skip everything other than the PSN we expect, if we are waiting
+        * for a reply to a restarted RDMA read or atomic op.
+        */
+       if (qp->r_flags & RVT_R_RDMAR_SEQ) {
+               if (cmp_psn(psn, qp->s_last_psn + 1) != 0)
+                       goto ack_done;
+               qp->r_flags &= ~RVT_R_RDMAR_SEQ;
+       }
+
+       if (unlikely(qp->s_acked == qp->s_tail))
+               goto ack_done;
+       wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+       status = IB_WC_SUCCESS;
+
+       switch (opcode) {
+       case OP(ACKNOWLEDGE):
+       case OP(ATOMIC_ACKNOWLEDGE):
+       case OP(RDMA_READ_RESPONSE_FIRST):
+               aeth = be32_to_cpu(ohdr->u.aeth);
+               if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {
+                       __be32 *p = ohdr->u.at.atomic_ack_eth;
+
+                       val = ((u64)be32_to_cpu(p[0]) << 32) |
+                               be32_to_cpu(p[1]);
+               } else {
+                       val = 0;
+               }
+               if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) ||
+                   opcode != OP(RDMA_READ_RESPONSE_FIRST))
+                       goto ack_done;
+               wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+               if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+                       goto ack_op_err;
+               /*
+                * If this is a response to a resent RDMA read, we
+                * have to be careful to copy the data to the right
+                * location.
+                */
+               qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
+                                                 wqe, psn, pmtu);
+               goto read_middle;
+
+       case OP(RDMA_READ_RESPONSE_MIDDLE):
+               /* no AETH, no ACK */
+               if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
+                       goto ack_seq_err;
+               if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+                       goto ack_op_err;
+read_middle:
+               if (unlikely(tlen != (hdrsize + pmtu + 4)))
+                       goto ack_len_err;
+               if (unlikely(pmtu >= qp->s_rdma_read_len))
+                       goto ack_len_err;
+
+               /*
+                * We got a response so update the timeout.
+                * 4.096 usec. * (1 << qp->timeout)
+                */
+               qp->s_flags |= RVT_S_TIMER;
+               mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies);
+               if (qp->s_flags & RVT_S_WAIT_ACK) {
+                       qp->s_flags &= ~RVT_S_WAIT_ACK;
+                       hfi1_schedule_send(qp);
+               }
+
+               if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
+                       qp->s_retry = qp->s_retry_cnt;
+
+               /*
+                * Update the RDMA receive state but do the copy w/o
+                * holding the locks and blocking interrupts.
+                */
+               qp->s_rdma_read_len -= pmtu;
+               update_last_psn(qp, psn);
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+               hfi1_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0, 0);
+               goto bail;
+
+       case OP(RDMA_READ_RESPONSE_ONLY):
+               aeth = be32_to_cpu(ohdr->u.aeth);
+               if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
+                       goto ack_done;
+               /* Get the number of bytes the message was padded by. */
+               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+               /*
+                * Check that the data size is >= 0 && <= pmtu.
+                * Remember to account for ICRC (4).
+                */
+               if (unlikely(tlen < (hdrsize + pad + 4)))
+                       goto ack_len_err;
+               /*
+                * If this is a response to a resent RDMA read, we
+                * have to be careful to copy the data to the right
+                * location.
+                */
+               wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+               qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
+                                                 wqe, psn, pmtu);
+               goto read_last;
+
+       case OP(RDMA_READ_RESPONSE_LAST):
+               /* ACKs READ req. */
+               if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
+                       goto ack_seq_err;
+               if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+                       goto ack_op_err;
+               /* Get the number of bytes the message was padded by. */
+               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+               /*
+                * Check that the data size is >= 1 && <= pmtu.
+                * Remember to account for ICRC (4).
+                */
+               if (unlikely(tlen <= (hdrsize + pad + 4)))
+                       goto ack_len_err;
+read_last:
+               tlen -= hdrsize + pad + 4;
+               if (unlikely(tlen != qp->s_rdma_read_len))
+                       goto ack_len_err;
+               aeth = be32_to_cpu(ohdr->u.aeth);
+               hfi1_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0, 0);
+               WARN_ON(qp->s_rdma_read_sge.num_sge);
+               (void)do_rc_ack(qp, aeth, psn,
+                                OP(RDMA_READ_RESPONSE_LAST), 0, rcd);
+               goto ack_done;
+       }
+
+ack_op_err:
+       status = IB_WC_LOC_QP_OP_ERR;
+       goto ack_err;
+
+ack_seq_err:
+       rdma_seq_err(qp, ibp, psn, rcd);
+       goto ack_done;
+
+ack_len_err:
+       status = IB_WC_LOC_LEN_ERR;
+ack_err:
+       if (qp->s_last == qp->s_acked) {
+               hfi1_send_complete(qp, wqe, status);
+               rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+       }
+ack_done:
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+bail:
+       return;
+}
+
+static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd,
+                                 struct rvt_qp *qp)
+{
+       if (list_empty(&qp->rspwait)) {
+               qp->r_flags |= RVT_R_RSP_NAK;
+               atomic_inc(&qp->refcount);
+               list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+       }
+}
+
+static inline void rc_cancel_ack(struct rvt_qp *qp)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+
+       priv->r_adefered = 0;
+       if (list_empty(&qp->rspwait))
+               return;
+       list_del_init(&qp->rspwait);
+       qp->r_flags &= ~RVT_R_RSP_NAK;
+       if (atomic_dec_and_test(&qp->refcount))
+               wake_up(&qp->wait);
+}
+
+/**
+ * rc_rcv_error - process an incoming duplicate or error RC packet
+ * @ohdr: the other headers for this packet
+ * @data: the packet data
+ * @qp: the QP for this packet
+ * @opcode: the opcode for this packet
+ * @psn: the packet sequence number for this packet
+ * @diff: the difference between the PSN and the expected PSN
+ *
+ * This is called from hfi1_rc_rcv() to process an unexpected
+ * incoming RC packet for the given QP.
+ * Called at interrupt level.
+ * Return 1 if no more processing is needed; otherwise return 0 to
+ * schedule a response to be sent.
+ */
+static noinline int rc_rcv_error(struct hfi1_other_headers *ohdr, void *data,
+                                struct rvt_qp *qp, u32 opcode, u32 psn,
+                                int diff, struct hfi1_ctxtdata *rcd)
+{
+       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+       struct rvt_ack_entry *e;
+       unsigned long flags;
+       u8 i, prev;
+       int old_req;
+
+       trace_hfi1_rc_rcv_error(qp, psn);
+       if (diff > 0) {
+               /*
+                * Packet sequence error.
+                * A NAK will ACK earlier sends and RDMA writes.
+                * Don't queue the NAK if we already sent one.
+                */
+               if (!qp->r_nak_state) {
+                       ibp->rvp.n_rc_seqnak++;
+                       qp->r_nak_state = IB_NAK_PSN_ERROR;
+                       /* Use the expected PSN. */
+                       qp->r_ack_psn = qp->r_psn;
+                       /*
+                        * Wait to send the sequence NAK until all packets
+                        * in the receive queue have been processed.
+                        * Otherwise, we end up propagating congestion.
+                        */
+                       rc_defered_ack(rcd, qp);
+               }
+               goto done;
+       }
+
+       /*
+        * Handle a duplicate request.  Don't re-execute SEND, RDMA
+        * write or atomic op.  Don't NAK errors, just silently drop
+        * the duplicate request.  Note that r_sge, r_len, and
+        * r_rcv_len may be in use so don't modify them.
+        *
+        * We are supposed to ACK the earliest duplicate PSN but we
+        * can coalesce an outstanding duplicate ACK.  We have to
+        * send the earliest so that RDMA reads can be restarted at
+        * the requester's expected PSN.
+        *
+        * First, find where this duplicate PSN falls within the
+        * ACKs previously sent.
+        * old_req is true if there is an older response that is scheduled
+        * to be sent before sending this one.
+        */
+       e = NULL;
+       old_req = 1;
+       ibp->rvp.n_rc_dupreq++;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+
+       for (i = qp->r_head_ack_queue; ; i = prev) {
+               if (i == qp->s_tail_ack_queue)
+                       old_req = 0;
+               if (i)
+                       prev = i - 1;
+               else
+                       prev = HFI1_MAX_RDMA_ATOMIC;
+               if (prev == qp->r_head_ack_queue) {
+                       e = NULL;
+                       break;
+               }
+               e = &qp->s_ack_queue[prev];
+               if (!e->opcode) {
+                       e = NULL;
+                       break;
+               }
+               if (cmp_psn(psn, e->psn) >= 0) {
+                       if (prev == qp->s_tail_ack_queue &&
+                           cmp_psn(psn, e->lpsn) <= 0)
+                               old_req = 0;
+                       break;
+               }
+       }
+       switch (opcode) {
+       case OP(RDMA_READ_REQUEST): {
+               struct ib_reth *reth;
+               u32 offset;
+               u32 len;
+
+               /*
+                * If we didn't find the RDMA read request in the ack queue,
+                * we can ignore this request.
+                */
+               if (!e || e->opcode != OP(RDMA_READ_REQUEST))
+                       goto unlock_done;
+               /* RETH comes after BTH */
+               reth = &ohdr->u.rc.reth;
+               /*
+                * Address range must be a subset of the original
+                * request and start on pmtu boundaries.
+                * We reuse the old ack_queue slot since the requester
+                * should not back up and request an earlier PSN for the
+                * same request.
+                */
+               offset = delta_psn(psn, e->psn) * qp->pmtu;
+               len = be32_to_cpu(reth->length);
+               if (unlikely(offset + len != e->rdma_sge.sge_length))
+                       goto unlock_done;
+               if (e->rdma_sge.mr) {
+                       rvt_put_mr(e->rdma_sge.mr);
+                       e->rdma_sge.mr = NULL;
+               }
+               if (len != 0) {
+                       u32 rkey = be32_to_cpu(reth->rkey);
+                       u64 vaddr = be64_to_cpu(reth->vaddr);
+                       int ok;
+
+                       ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
+                                        IB_ACCESS_REMOTE_READ);
+                       if (unlikely(!ok))
+                               goto unlock_done;
+               } else {
+                       e->rdma_sge.vaddr = NULL;
+                       e->rdma_sge.length = 0;
+                       e->rdma_sge.sge_length = 0;
+               }
+               e->psn = psn;
+               if (old_req)
+                       goto unlock_done;
+               qp->s_tail_ack_queue = prev;
+               break;
+       }
+
+       case OP(COMPARE_SWAP):
+       case OP(FETCH_ADD): {
+               /*
+                * If we didn't find the atomic request in the ack queue
+                * or the send tasklet is already backed up to send an
+                * earlier entry, we can ignore this request.
+                */
+               if (!e || e->opcode != (u8)opcode || old_req)
+                       goto unlock_done;
+               qp->s_tail_ack_queue = prev;
+               break;
+       }
+
+       default:
+               /*
+                * Ignore this operation if it doesn't request an ACK
+                * or if an earlier RDMA read or atomic is going to be resent.
+                */
+               if (!(psn & IB_BTH_REQ_ACK) || old_req)
+                       goto unlock_done;
+               /*
+                * Resend the most recent ACK if this request is
+                * after all the previous RDMA reads and atomics.
+                */
+               if (i == qp->r_head_ack_queue) {
+                       spin_unlock_irqrestore(&qp->s_lock, flags);
+                       qp->r_nak_state = 0;
+                       qp->r_ack_psn = qp->r_psn - 1;
+                       goto send_ack;
+               }
+
+               /*
+                * Resend the RDMA read or atomic op which
+                * ACKs this duplicate request.
+                */
+               qp->s_tail_ack_queue = i;
+               break;
+       }
+       qp->s_ack_state = OP(ACKNOWLEDGE);
+       qp->s_flags |= RVT_S_RESP_PENDING;
+       qp->r_nak_state = 0;
+       hfi1_schedule_send(qp);
+
+unlock_done:
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+done:
+       return 1;
+
+send_ack:
+       return 0;
+}
+
+void hfi1_rc_error(struct rvt_qp *qp, enum ib_wc_status err)
+{
+       unsigned long flags;
+       int lastwqe;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+       lastwqe = rvt_error_qp(qp, err);
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+
+       if (lastwqe) {
+               struct ib_event ev;
+
+               ev.device = qp->ibqp.device;
+               ev.element.qp = &qp->ibqp;
+               ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+               qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+       }
+}
+
+static inline void update_ack_queue(struct rvt_qp *qp, unsigned n)
+{
+       unsigned next;
+
+       next = n + 1;
+       if (next > HFI1_MAX_RDMA_ATOMIC)
+               next = 0;
+       qp->s_tail_ack_queue = next;
+       qp->s_ack_state = OP(ACKNOWLEDGE);
+}
+
+static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid,
+                         u32 lqpn, u32 rqpn, u8 svc_type)
+{
+       struct opa_hfi1_cong_log_event_internal *cc_event;
+       unsigned long flags;
+
+       if (sl >= OPA_MAX_SLS)
+               return;
+
+       spin_lock_irqsave(&ppd->cc_log_lock, flags);
+
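+       /*
+        * Record a threshold congestion event for this SL in the
+        * per-port bitmap and bump the event counter.
+        */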
+       ppd->threshold_cong_event_map[sl / 8] |= 1 << (sl % 8);
+       ppd->threshold_event_counter++;
+
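+       /* the congestion log is a ring; wrap and overwrite the oldest entry */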
+       cc_event = &ppd->cc_events[ppd->cc_log_idx++];
+       if (ppd->cc_log_idx == OPA_CONG_LOG_ELEMS)
+               ppd->cc_log_idx = 0;
+       cc_event->lqpn = lqpn & RVT_QPN_MASK;
+       cc_event->rqpn = rqpn & RVT_QPN_MASK;
+       cc_event->sl = sl;
+       cc_event->svc_type = svc_type;
+       cc_event->rlid = rlid;
+       /* keep timestamp in units of 1.024 usec */
+       cc_event->timestamp = ktime_to_ns(ktime_get()) / 1024;
+
+       spin_unlock_irqrestore(&ppd->cc_log_lock, flags);
+}
+
+void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,
+                 u32 rqpn, u8 svc_type)
+{
+       struct cca_timer *cca_timer;
+       u16 ccti, ccti_incr, ccti_timer, ccti_limit;
+       u8 trigger_threshold;
+       struct cc_state *cc_state;
+       unsigned long flags;
+
+       if (sl >= OPA_MAX_SLS)
+               return;
+
+       cc_state = get_cc_state(ppd);
+
+       if (!cc_state)
+               return;
+
+       /*
+        * 1) increase CCTI (for this SL)
+        * 2) select IPG (i.e., call set_link_ipg())
+        * 3) start timer
+        */
+       ccti_limit = cc_state->cct.ccti_limit;
+       ccti_incr = cc_state->cong_setting.entries[sl].ccti_increase;
+       ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer;
+       trigger_threshold =
+               cc_state->cong_setting.entries[sl].trigger_threshold;
+
+       spin_lock_irqsave(&ppd->cca_timer_lock, flags);
+
+       cca_timer = &ppd->cca_timer[sl];
+       if (cca_timer->ccti < ccti_limit) {
+               if (cca_timer->ccti + ccti_incr <= ccti_limit)
+                       cca_timer->ccti += ccti_incr;
+               else
+                       cca_timer->ccti = ccti_limit;
+               set_link_ipg(ppd);
+       }
+
+       ccti = cca_timer->ccti;
+
+       if (!hrtimer_active(&cca_timer->hrtimer)) {
+               /* ccti_timer is in units of 1.024 usec */
+               unsigned long nsec = 1024 * ccti_timer;
+
+               hrtimer_start(&cca_timer->hrtimer, ns_to_ktime(nsec),
+                             HRTIMER_MODE_REL);
+       }
+
+       spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
+
+       if ((trigger_threshold != 0) && (ccti >= trigger_threshold))
+               log_cca_event(ppd, sl, rlid, lqpn, rqpn, svc_type);
+}
+
+/**
+ * hfi1_rc_rcv - process an incoming RC packet
+ * @packet: data packet information
+ *
+ * This is called from qp_rcv() to process an incoming RC packet
+ * for the given QP.
+ * May be called at interrupt level.
+ */
+void hfi1_rc_rcv(struct hfi1_packet *packet)
+{
+       struct hfi1_ctxtdata *rcd = packet->rcd;
+       struct hfi1_ib_header *hdr = packet->hdr;
+       u32 rcv_flags = packet->rcv_flags;
+       void *data = packet->ebuf;
+       u32 tlen = packet->tlen;
+       struct rvt_qp *qp = packet->qp;
+       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       struct hfi1_other_headers *ohdr = packet->ohdr;
+       u32 bth0, opcode;
+       u32 hdrsize = packet->hlen;
+       u32 psn;
+       u32 pad;
+       struct ib_wc wc;
+       u32 pmtu = qp->pmtu;
+       int diff;
+       struct ib_reth *reth;
+       unsigned long flags;
+       u32 bth1;
+       int ret, is_fecn = 0;
+       int copy_last = 0;
+
+       bth0 = be32_to_cpu(ohdr->bth[0]);
+       if (hfi1_ruc_check_hdr(ibp, hdr, rcv_flags & HFI1_HAS_GRH, qp, bth0))
+               return;
+
+       bth1 = be32_to_cpu(ohdr->bth[1]);
+       if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) {
+               if (bth1 & HFI1_BECN_SMASK) {
+                       u16 rlid = qp->remote_ah_attr.dlid;
+                       u32 lqpn, rqpn;
+
+                       lqpn = qp->ibqp.qp_num;
+                       rqpn = qp->remote_qpn;
+                       process_becn(
+                               ppd,
+                               qp->remote_ah_attr.sl,
+                               rlid, lqpn, rqpn,
+                               IB_CC_SVCTYPE_RC);
+               }
+               is_fecn = bth1 & HFI1_FECN_SMASK;
+       }
+
+       psn = be32_to_cpu(ohdr->bth[2]);
+       opcode = (bth0 >> 24) & 0xff;
+
+       /*
+        * Process responses (ACKs) before anything else.  Note that the
+        * packet sequence number will be for something in the send work
+        * queue rather than the expected receive packet sequence number.
+        * In other words, this QP is the requester.
+        */
+       if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
+           opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
+               rc_rcv_resp(ibp, ohdr, data, tlen, qp, opcode, psn,
+                           hdrsize, pmtu, rcd);
+               if (is_fecn)
+                       goto send_ack;
+               return;
+       }
+
+       /* Compute 24 bits worth of difference. */
+       diff = delta_psn(psn, qp->r_psn);
+       if (unlikely(diff)) {
+               if (rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd))
+                       return;
+               goto send_ack;
+       }
+
+       /* Check for opcode sequence errors. */
+       switch (qp->r_state) {
+       case OP(SEND_FIRST):
+       case OP(SEND_MIDDLE):
+               if (opcode == OP(SEND_MIDDLE) ||
+                   opcode == OP(SEND_LAST) ||
+                   opcode == OP(SEND_LAST_WITH_IMMEDIATE))
+                       break;
+               goto nack_inv;
+
+       case OP(RDMA_WRITE_FIRST):
+       case OP(RDMA_WRITE_MIDDLE):
+               if (opcode == OP(RDMA_WRITE_MIDDLE) ||
+                   opcode == OP(RDMA_WRITE_LAST) ||
+                   opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+                       break;
+               goto nack_inv;
+
+       default:
+               if (opcode == OP(SEND_MIDDLE) ||
+                   opcode == OP(SEND_LAST) ||
+                   opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
+                   opcode == OP(RDMA_WRITE_MIDDLE) ||
+                   opcode == OP(RDMA_WRITE_LAST) ||
+                   opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+                       goto nack_inv;
+               /*
+                * Note that it is up to the requester to not send a new
+                * RDMA read or atomic operation before receiving an ACK
+                * for the previous operation.
+                */
+               break;
+       }
+
+       if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
+               qp_comm_est(qp);
+
+       /* OK, process the packet. */
+       switch (opcode) {
+       case OP(SEND_FIRST):
+               ret = hfi1_rvt_get_rwqe(qp, 0);
+               if (ret < 0)
+                       goto nack_op_err;
+               if (!ret)
+                       goto rnr_nak;
+               qp->r_rcv_len = 0;
+               /* FALLTHROUGH */
+       case OP(SEND_MIDDLE):
+       case OP(RDMA_WRITE_MIDDLE):
+send_middle:
+               /* Check for invalid length PMTU or posted rwqe len. */
+               if (unlikely(tlen != (hdrsize + pmtu + 4)))
+                       goto nack_inv;
+               qp->r_rcv_len += pmtu;
+               if (unlikely(qp->r_rcv_len > qp->r_len))
+                       goto nack_inv;
+               hfi1_copy_sge(&qp->r_sge, data, pmtu, 1, 0);
+               break;
+
+       case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
+               /* consume RWQE */
+               ret = hfi1_rvt_get_rwqe(qp, 1);
+               if (ret < 0)
+                       goto nack_op_err;
+               if (!ret)
+                       goto rnr_nak;
+               goto send_last_imm;
+
+       case OP(SEND_ONLY):
+       case OP(SEND_ONLY_WITH_IMMEDIATE):
+               ret = hfi1_rvt_get_rwqe(qp, 0);
+               if (ret < 0)
+                       goto nack_op_err;
+               if (!ret)
+                       goto rnr_nak;
+               qp->r_rcv_len = 0;
+               if (opcode == OP(SEND_ONLY))
+                       goto no_immediate_data;
+               /* FALLTHROUGH for SEND_ONLY_WITH_IMMEDIATE */
+       case OP(SEND_LAST_WITH_IMMEDIATE):
+send_last_imm:
+               wc.ex.imm_data = ohdr->u.imm_data;
+               wc.wc_flags = IB_WC_WITH_IMM;
+               goto send_last;
+       case OP(RDMA_WRITE_LAST):
+               copy_last = ibpd_to_rvtpd(qp->ibqp.pd)->user;
+               /* fall through */
+       case OP(SEND_LAST):
+no_immediate_data:
+               wc.wc_flags = 0;
+               wc.ex.imm_data = 0;
+send_last:
+               /* Get the number of bytes the message was padded by. */
+               pad = (bth0 >> 20) & 3;
+               /* Check for invalid length. */
+               /* LAST len should be >= 1 */
+               if (unlikely(tlen < (hdrsize + pad + 4)))
+                       goto nack_inv;
+               /* Don't count the CRC. */
+               tlen -= (hdrsize + pad + 4);
+               wc.byte_len = tlen + qp->r_rcv_len;
+               if (unlikely(wc.byte_len > qp->r_len))
+                       goto nack_inv;
+               hfi1_copy_sge(&qp->r_sge, data, tlen, 1, copy_last);
+               rvt_put_ss(&qp->r_sge);
+               qp->r_msn++;
+               if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
+                       break;
+               wc.wr_id = qp->r_wr_id;
+               wc.status = IB_WC_SUCCESS;
+               if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
+                   opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
+                       wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+               else
+                       wc.opcode = IB_WC_RECV;
+               wc.qp = &qp->ibqp;
+               wc.src_qp = qp->remote_qpn;
+               wc.slid = qp->remote_ah_attr.dlid;
+               /*
+                * It seems that IB mandates the presence of an SL in a
+                * work completion only for the UD transport (see section
+                * 11.4.2 of IBTA Vol. 1).
+                *
+                * However, the way the SL is chosen below is consistent
+                * with the way that IB/qib works and is trying to avoid
+                * introducing incompatibilities.
+                *
+                * See also OPA Vol. 1, section 9.7.6, and table 9-17.
+                */
+               wc.sl = qp->remote_ah_attr.sl;
+               /* zero fields that are N/A */
+               wc.vendor_err = 0;
+               wc.pkey_index = 0;
+               wc.dlid_path_bits = 0;
+               wc.port_num = 0;
+               /* Signal completion event if the solicited bit is set. */
+               rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
+                            (bth0 & IB_BTH_SOLICITED) != 0);
+               break;
+
+       case OP(RDMA_WRITE_ONLY):
+               copy_last = 1;
+               /* fall through */
+       case OP(RDMA_WRITE_FIRST):
+       case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+                       goto nack_inv;
+               /* consume RWQE */
+               reth = &ohdr->u.rc.reth;
+               qp->r_len = be32_to_cpu(reth->length);
+               qp->r_rcv_len = 0;
+               qp->r_sge.sg_list = NULL;
+               if (qp->r_len != 0) {
+                       u32 rkey = be32_to_cpu(reth->rkey);
+                       u64 vaddr = be64_to_cpu(reth->vaddr);
+                       int ok;
+
+                       /* Check rkey & NAK */
+                       ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr,
+                                        rkey, IB_ACCESS_REMOTE_WRITE);
+                       if (unlikely(!ok))
+                               goto nack_acc;
+                       qp->r_sge.num_sge = 1;
+               } else {
+                       qp->r_sge.num_sge = 0;
+                       qp->r_sge.sge.mr = NULL;
+                       qp->r_sge.sge.vaddr = NULL;
+                       qp->r_sge.sge.length = 0;
+                       qp->r_sge.sge.sge_length = 0;
+               }
+               if (opcode == OP(RDMA_WRITE_FIRST))
+                       goto send_middle;
+               else if (opcode == OP(RDMA_WRITE_ONLY))
+                       goto no_immediate_data;
+               ret = hfi1_rvt_get_rwqe(qp, 1);
+               if (ret < 0)
+                       goto nack_op_err;
+               if (!ret)
+                       goto rnr_nak;
+               wc.ex.imm_data = ohdr->u.rc.imm_data;
+               wc.wc_flags = IB_WC_WITH_IMM;
+               goto send_last;
+
+       case OP(RDMA_READ_REQUEST): {
+               struct rvt_ack_entry *e;
+               u32 len;
+               u8 next;
+
+               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
+                       goto nack_inv;
+               next = qp->r_head_ack_queue + 1;
+               /* s_ack_queue is size HFI1_MAX_RDMA_ATOMIC+1 so use > not >= */
+               if (next > HFI1_MAX_RDMA_ATOMIC)
+                       next = 0;
+               spin_lock_irqsave(&qp->s_lock, flags);
+               if (unlikely(next == qp->s_tail_ack_queue)) {
+                       if (!qp->s_ack_queue[next].sent)
+                               goto nack_inv_unlck;
+                       update_ack_queue(qp, next);
+               }
+               e = &qp->s_ack_queue[qp->r_head_ack_queue];
+               if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
+                       rvt_put_mr(e->rdma_sge.mr);
+                       e->rdma_sge.mr = NULL;
+               }
+               reth = &ohdr->u.rc.reth;
+               len = be32_to_cpu(reth->length);
+               if (len) {
+                       u32 rkey = be32_to_cpu(reth->rkey);
+                       u64 vaddr = be64_to_cpu(reth->vaddr);
+                       int ok;
+
+                       /* Check rkey & NAK */
+                       ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr,
+                                        rkey, IB_ACCESS_REMOTE_READ);
+                       if (unlikely(!ok))
+                               goto nack_acc_unlck;
+                       /*
+                        * Update the next expected PSN.  We add 1 later
+                        * below, so only add the remainder here.
+                        */
+                       if (len > pmtu)
+                               qp->r_psn += (len - 1) / pmtu;
+               } else {
+                       e->rdma_sge.mr = NULL;
+                       e->rdma_sge.vaddr = NULL;
+                       e->rdma_sge.length = 0;
+                       e->rdma_sge.sge_length = 0;
+               }
+               e->opcode = opcode;
+               e->sent = 0;
+               e->psn = psn;
+               e->lpsn = qp->r_psn;
+               /*
+                * We need to increment the MSN here instead of when we
+                * finish sending the result since a duplicate request would
+                * increment it more than once.
+                */
+               qp->r_msn++;
+               qp->r_psn++;
+               qp->r_state = opcode;
+               qp->r_nak_state = 0;
+               qp->r_head_ack_queue = next;
+
+               /* Schedule the send tasklet. */
+               qp->s_flags |= RVT_S_RESP_PENDING;
+               hfi1_schedule_send(qp);
+
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+               if (is_fecn)
+                       goto send_ack;
+               return;
+       }
+
+       case OP(COMPARE_SWAP):
+       case OP(FETCH_ADD): {
+               struct ib_atomic_eth *ateth;
+               struct rvt_ack_entry *e;
+               u64 vaddr;
+               atomic64_t *maddr;
+               u64 sdata;
+               u32 rkey;
+               u8 next;
+
+               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
+                       goto nack_inv;
+               next = qp->r_head_ack_queue + 1;
+               if (next > HFI1_MAX_RDMA_ATOMIC)
+                       next = 0;
+               spin_lock_irqsave(&qp->s_lock, flags);
+               if (unlikely(next == qp->s_tail_ack_queue)) {
+                       if (!qp->s_ack_queue[next].sent)
+                               goto nack_inv_unlck;
+                       update_ack_queue(qp, next);
+               }
+               e = &qp->s_ack_queue[qp->r_head_ack_queue];
+               if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
+                       rvt_put_mr(e->rdma_sge.mr);
+                       e->rdma_sge.mr = NULL;
+               }
+               ateth = &ohdr->u.atomic_eth;
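+               /*
+                * Reassemble the 64-bit remote address from the two
+                * big-endian 32-bit halves in the atomic ETH.
+                */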
+               vaddr = ((u64)be32_to_cpu(ateth->vaddr[0]) << 32) |
+                       be32_to_cpu(ateth->vaddr[1]);
+               if (unlikely(vaddr & (sizeof(u64) - 1)))
+                       goto nack_inv_unlck;
+               rkey = be32_to_cpu(ateth->rkey);
+               /* Check rkey & NAK */
+               if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
+                                         vaddr, rkey,
+                                         IB_ACCESS_REMOTE_ATOMIC)))
+                       goto nack_acc_unlck;
+               /* Perform atomic OP and save result. */
+               maddr = (atomic64_t *)qp->r_sge.sge.vaddr;
+               sdata = be64_to_cpu(ateth->swap_data);
+               e->atomic_data = (opcode == OP(FETCH_ADD)) ?
+                       (u64)atomic64_add_return(sdata, maddr) - sdata :
+                       (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr,
+                                     be64_to_cpu(ateth->compare_data),
+                                     sdata);
+               rvt_put_mr(qp->r_sge.sge.mr);
+               qp->r_sge.num_sge = 0;
+               e->opcode = opcode;
+               e->sent = 0;
+               e->psn = psn;
+               e->lpsn = psn;
+               qp->r_msn++;
+               qp->r_psn++;
+               qp->r_state = opcode;
+               qp->r_nak_state = 0;
+               qp->r_head_ack_queue = next;
+
+               /* Schedule the send tasklet. */
+               qp->s_flags |= RVT_S_RESP_PENDING;
+               hfi1_schedule_send(qp);
+
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+               if (is_fecn)
+                       goto send_ack;
+               return;
+       }
+
+       default:
+               /* NAK unknown opcodes. */
+               goto nack_inv;
+       }
+       qp->r_psn++;
+       qp->r_state = opcode;
+       qp->r_ack_psn = psn;
+       qp->r_nak_state = 0;
+       /* Send an ACK if requested or required. */
+       if (psn & IB_BTH_REQ_ACK) {
+               struct hfi1_qp_priv *priv = qp->priv;
+
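+               /*
+                * ACKs are normally deferred and coalesced; send one
+                * immediately once the deferred-ACK credit is exhausted
+                * or a FECN needs to be answered.
+                */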
+               if (packet->numpkt == 0) {
+                       rc_cancel_ack(qp);
+                       goto send_ack;
+               }
+               if (priv->r_adefered >= HFI1_PSN_CREDIT) {
+                       rc_cancel_ack(qp);
+                       goto send_ack;
+               }
+               if (unlikely(is_fecn)) {
+                       rc_cancel_ack(qp);
+                       goto send_ack;
+               }
+               priv->r_adefered++;
+               rc_defered_ack(rcd, qp);
+       }
+       return;
+
+rnr_nak:
+       qp->r_nak_state = qp->r_min_rnr_timer | IB_RNR_NAK;
+       qp->r_ack_psn = qp->r_psn;
+       /* Queue RNR NAK for later */
+       rc_defered_ack(rcd, qp);
+       return;
+
+nack_op_err:
+       hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+       qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
+       qp->r_ack_psn = qp->r_psn;
+       /* Queue NAK for later */
+       rc_defered_ack(rcd, qp);
+       return;
+
+nack_inv_unlck:
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_inv:
+       hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+       qp->r_nak_state = IB_NAK_INVALID_REQUEST;
+       qp->r_ack_psn = qp->r_psn;
+       /* Queue NAK for later */
+       rc_defered_ack(rcd, qp);
+       return;
+
+nack_acc_unlck:
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_acc:
+       hfi1_rc_error(qp, IB_WC_LOC_PROT_ERR);
+       qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
+       qp->r_ack_psn = qp->r_psn;
+send_ack:
+       hfi1_send_rc_ack(rcd, qp, is_fecn);
+}
+
+void hfi1_rc_hdrerr(
+       struct hfi1_ctxtdata *rcd,
+       struct hfi1_ib_header *hdr,
+       u32 rcv_flags,
+       struct rvt_qp *qp)
+{
+       int has_grh = rcv_flags & HFI1_HAS_GRH;
+       struct hfi1_other_headers *ohdr;
+       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+       int diff;
+       u32 opcode;
+       u32 psn, bth0;
+
+       /* Check for GRH */
+       ohdr = &hdr->u.oth;
+       if (has_grh)
+               ohdr = &hdr->u.l.oth;
+
+       bth0 = be32_to_cpu(ohdr->bth[0]);
+       if (hfi1_ruc_check_hdr(ibp, hdr, has_grh, qp, bth0))
+               return;
+
+       psn = be32_to_cpu(ohdr->bth[2]);
+       opcode = (bth0 >> 24) & 0xff;
+
+       /* Only deal with RDMA Writes for now */
+       if (opcode < IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) {
+               diff = delta_psn(psn, qp->r_psn);
+               if (!qp->r_nak_state && diff >= 0) {
+                       ibp->rvp.n_rc_seqnak++;
+                       qp->r_nak_state = IB_NAK_PSN_ERROR;
+                       /* Use the expected PSN. */
+                       qp->r_ack_psn = qp->r_psn;
+                       /*
+                        * Wait to send the sequence
+                        * NAK until all packets
+                        * in the receive queue have
+                        * been processed.
+                        * Otherwise, we end up
+                        * propagating congestion.
+                        */
+                       rc_defered_ack(rcd, qp);
+               } /* Out of sequence NAK */
+       } /* QP Request NAKs */
+}
diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c
new file mode 100644 (file)
index 0000000..a659aec
--- /dev/null
@@ -0,0 +1,979 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/spinlock.h>
+
+#include "hfi.h"
+#include "mad.h"
+#include "qp.h"
+#include "verbs_txreq.h"
+#include "trace.h"
+
+/*
+ * Convert the AETH RNR timeout code into the number of microseconds.
+ */
+const u32 ib_hfi1_rnr_table[32] = {
+       655360, /* 00: 655.36 */
+       10,     /* 01:    .01 */
+       20,     /* 02:    .02 */
+       30,     /* 03:    .03 */
+       40,     /* 04:    .04 */
+       60,     /* 05:    .06 */
+       80,     /* 06:    .08 */
+       120,    /* 07:    .12 */
+       160,    /* 08:    .16 */
+       240,    /* 09:    .24 */
+       320,    /* 0A:    .32 */
+       480,    /* 0B:    .48 */
+       640,    /* 0C:    .64 */
+       960,    /* 0D:    .96 */
+       1280,   /* 0E:   1.28 */
+       1920,   /* 0F:   1.92 */
+       2560,   /* 10:   2.56 */
+       3840,   /* 11:   3.84 */
+       5120,   /* 12:   5.12 */
+       7680,   /* 13:   7.68 */
+       10240,  /* 14:  10.24 */
+       15360,  /* 15:  15.36 */
+       20480,  /* 16:  20.48 */
+       30720,  /* 17:  30.72 */
+       40960,  /* 18:  40.96 */
+       61440,  /* 19:  61.44 */
+       81920,  /* 1A:  81.92 */
+       122880, /* 1B: 122.88 */
+       163840, /* 1C: 163.84 */
+       245760, /* 1D: 245.76 */
+       327680, /* 1E: 327.68 */
+       491520  /* 1F: 491.52 */
+};
+
+/*
+ * Validate a RWQE and fill in the SGE state.
+ * Return 1 if OK.
+ */
+static int init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe)
+{
+       int i, j, ret;
+       struct ib_wc wc;
+       struct rvt_lkey_table *rkt;
+       struct rvt_pd *pd;
+       struct rvt_sge_state *ss;
+
+       rkt = &to_idev(qp->ibqp.device)->rdi.lkey_table;
+       pd = ibpd_to_rvtpd(qp->ibqp.srq ? qp->ibqp.srq->pd : qp->ibqp.pd);
+       ss = &qp->r_sge;
+       ss->sg_list = qp->r_sg_list;
+       qp->r_len = 0;
+       for (i = j = 0; i < wqe->num_sge; i++) {
+               if (wqe->sg_list[i].length == 0)
+                       continue;
+               /* Check LKEY */
+               if (!rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge,
+                                &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE))
+                       goto bad_lkey;
+               qp->r_len += wqe->sg_list[i].length;
+               j++;
+       }
+       ss->num_sge = j;
+       ss->total_len = qp->r_len;
+       ret = 1;
+       goto bail;
+
+bad_lkey:
+       while (j) {
+               struct rvt_sge *sge = --j ? &ss->sg_list[j - 1] : &ss->sge;
+
+               rvt_put_mr(sge->mr);
+       }
+       ss->num_sge = 0;
+       memset(&wc, 0, sizeof(wc));
+       wc.wr_id = wqe->wr_id;
+       wc.status = IB_WC_LOC_PROT_ERR;
+       wc.opcode = IB_WC_RECV;
+       wc.qp = &qp->ibqp;
+       /* Signal solicited completion event. */
+       rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
+       ret = 0;
+bail:
+       return ret;
+}
+
+/**
+ * hfi1_rvt_get_rwqe - copy the next RWQE into the QP's RWQE
+ * @qp: the QP
+ * @wr_id_only: update qp->r_wr_id only, not qp->r_sge
+ *
+ * Return -1 if there is a local error, 0 if no RWQE is available,
+ * otherwise return 1.
+ *
+ * Can be called from interrupt level.
+ */
+int hfi1_rvt_get_rwqe(struct rvt_qp *qp, int wr_id_only)
+{
+       unsigned long flags;
+       struct rvt_rq *rq;
+       struct rvt_rwq *wq;
+       struct rvt_srq *srq;
+       struct rvt_rwqe *wqe;
+       void (*handler)(struct ib_event *, void *);
+       u32 tail;
+       int ret;
+
+       if (qp->ibqp.srq) {
+               srq = ibsrq_to_rvtsrq(qp->ibqp.srq);
+               handler = srq->ibsrq.event_handler;
+               rq = &srq->rq;
+       } else {
+               srq = NULL;
+               handler = NULL;
+               rq = &qp->r_rq;
+       }
+
+       spin_lock_irqsave(&rq->lock, flags);
+       if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
+               ret = 0;
+               goto unlock;
+       }
+
+       wq = rq->wq;
+       tail = wq->tail;
+       /* Validate tail before using it since it is user writable. */
+       if (tail >= rq->size)
+               tail = 0;
+       if (unlikely(tail == wq->head)) {
+               ret = 0;
+               goto unlock;
+       }
+       /* Make sure entry is read after head index is read. */
+       smp_rmb();
+       wqe = rvt_get_rwqe_ptr(rq, tail);
+       /*
+        * Even though we update the tail index in memory, the verbs
+        * consumer is not supposed to post more entries until a
+        * completion is generated.
+        */
+       if (++tail >= rq->size)
+               tail = 0;
+       wq->tail = tail;
+       if (!wr_id_only && !init_sge(qp, wqe)) {
+               ret = -1;
+               goto unlock;
+       }
+       qp->r_wr_id = wqe->wr_id;
+
+       ret = 1;
+       set_bit(RVT_R_WRID_VALID, &qp->r_aflags);
+       if (handler) {
+               u32 n;
+
+               /*
+                * Validate head pointer value and compute
+                * the number of remaining WQEs.
+                */
+               n = wq->head;
+               if (n >= rq->size)
+                       n = 0;
+               if (n < tail)
+                       n += rq->size - tail;
+               else
+                       n -= tail;
+               if (n < srq->limit) {
+                       struct ib_event ev;
+
+                       srq->limit = 0;
+                       spin_unlock_irqrestore(&rq->lock, flags);
+                       ev.device = qp->ibqp.device;
+                       ev.element.srq = qp->ibqp.srq;
+                       ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
+                       handler(&ev, srq->ibsrq.srq_context);
+                       goto bail;
+               }
+       }
+unlock:
+       spin_unlock_irqrestore(&rq->lock, flags);
+bail:
+       return ret;
+}
+
+static __be64 get_sguid(struct hfi1_ibport *ibp, unsigned index)
+{
+       if (!index) {
+               struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+               return cpu_to_be64(ppd->guid);
+       }
+       return ibp->guids[index - 1];
+}
+
+static int gid_ok(union ib_gid *gid, __be64 gid_prefix, __be64 id)
+{
+       return (gid->global.interface_id == id &&
+               (gid->global.subnet_prefix == gid_prefix ||
+                gid->global.subnet_prefix == IB_DEFAULT_GID_PREFIX));
+}
+
+/*
+ * This should be called with the QP r_lock held.
+ *
+ * The s_lock will be acquired around the hfi1_migrate_qp() call.
+ */
+int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_ib_header *hdr,
+                      int has_grh, struct rvt_qp *qp, u32 bth0)
+{
+       __be64 guid;
+       unsigned long flags;
+       u8 sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+
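+       /*
+        * A migration request on an armed QP is validated against the
+        * alternate path and triggers hfi1_migrate_qp(); everything else
+        * is validated against the primary path.
+        */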
+       if (qp->s_mig_state == IB_MIG_ARMED && (bth0 & IB_BTH_MIG_REQ)) {
+               if (!has_grh) {
+                       if (qp->alt_ah_attr.ah_flags & IB_AH_GRH)
+                               goto err;
+               } else {
+                       if (!(qp->alt_ah_attr.ah_flags & IB_AH_GRH))
+                               goto err;
+                       guid = get_sguid(ibp, qp->alt_ah_attr.grh.sgid_index);
+                       if (!gid_ok(&hdr->u.l.grh.dgid, ibp->rvp.gid_prefix,
+                                   guid))
+                               goto err;
+                       if (!gid_ok(
+                               &hdr->u.l.grh.sgid,
+                               qp->alt_ah_attr.grh.dgid.global.subnet_prefix,
+                               qp->alt_ah_attr.grh.dgid.global.interface_id))
+                               goto err;
+               }
+               if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), (u16)bth0,
+                                           sc5, be16_to_cpu(hdr->lrh[3])))) {
+                       hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY,
+                                      (u16)bth0,
+                                      (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
+                                      0, qp->ibqp.qp_num,
+                                      be16_to_cpu(hdr->lrh[3]),
+                                      be16_to_cpu(hdr->lrh[1]));
+                       goto err;
+               }
+               /* Validate the SLID. See Ch. 9.6.1.5 and 17.2.8 */
+               if (be16_to_cpu(hdr->lrh[3]) != qp->alt_ah_attr.dlid ||
+                   ppd_from_ibp(ibp)->port != qp->alt_ah_attr.port_num)
+                       goto err;
+               spin_lock_irqsave(&qp->s_lock, flags);
+               hfi1_migrate_qp(qp);
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+       } else {
+               if (!has_grh) {
+                       if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+                               goto err;
+               } else {
+                       if (!(qp->remote_ah_attr.ah_flags & IB_AH_GRH))
+                               goto err;
+                       guid = get_sguid(ibp,
+                                        qp->remote_ah_attr.grh.sgid_index);
+                       if (!gid_ok(&hdr->u.l.grh.dgid, ibp->rvp.gid_prefix,
+                                   guid))
+                               goto err;
+                       if (!gid_ok(
+                            &hdr->u.l.grh.sgid,
+                            qp->remote_ah_attr.grh.dgid.global.subnet_prefix,
+                            qp->remote_ah_attr.grh.dgid.global.interface_id))
+                               goto err;
+               }
+               if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), (u16)bth0,
+                                           sc5, be16_to_cpu(hdr->lrh[3])))) {
+                       hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY,
+                                      (u16)bth0,
+                                      (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
+                                      0, qp->ibqp.qp_num,
+                                      be16_to_cpu(hdr->lrh[3]),
+                                      be16_to_cpu(hdr->lrh[1]));
+                       goto err;
+               }
+               /* Validate the SLID. See Ch. 9.6.1.5 */
+               if (be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid ||
+                   ppd_from_ibp(ibp)->port != qp->port_num)
+                       goto err;
+               if (qp->s_mig_state == IB_MIG_REARM &&
+                   !(bth0 & IB_BTH_MIG_REQ))
+                       qp->s_mig_state = IB_MIG_ARMED;
+       }
+
+       return 0;
+
+err:
+       return 1;
+}
+
+/**
+ * ruc_loopback - handle UC and RC loopback requests
+ * @sqp: the sending QP
+ *
+ * This is called from hfi1_do_send() to
+ * forward a WQE addressed to the same HFI.
+ * Note that although we are single threaded due to the tasklet, we still
+ * have to protect against post_send().  We don't have to worry about
+ * receive interrupts since this is a connected protocol and all packets
+ * will pass through here.
+ */
+static void ruc_loopback(struct rvt_qp *sqp)
+{
+       struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num);
+       struct rvt_qp *qp;
+       struct rvt_swqe *wqe;
+       struct rvt_sge *sge;
+       unsigned long flags;
+       struct ib_wc wc;
+       u64 sdata;
+       atomic64_t *maddr;
+       enum ib_wc_status send_status;
+       int release;
+       int ret;
+       int copy_last = 0;
+       u32 to;
+
+       rcu_read_lock();
+
+       /*
+        * Note that we check the responder QP state after
+        * checking the requester's state.
+        */
+       qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp,
+                           sqp->remote_qpn);
+
+       spin_lock_irqsave(&sqp->s_lock, flags);
+
+       /* Return if we are already busy processing a work request. */
+       if ((sqp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT)) ||
+           !(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND))
+               goto unlock;
+
+       sqp->s_flags |= RVT_S_BUSY;
+
+again:
+       smp_read_barrier_depends(); /* see post_one_send() */
+       if (sqp->s_last == ACCESS_ONCE(sqp->s_head))
+               goto clr_busy;
+       wqe = rvt_get_swqe_ptr(sqp, sqp->s_last);
+
+       /* Return if it is not OK to start a new work request. */
+       if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_NEXT_SEND_OK)) {
+               if (!(ib_rvt_state_ops[sqp->state] & RVT_FLUSH_SEND))
+                       goto clr_busy;
+               /* We are in the error state, flush the work request. */
+               send_status = IB_WC_WR_FLUSH_ERR;
+               goto flush_send;
+       }
+
+       /*
+        * We can rely on the entry not changing without the s_lock
+        * being held until we update s_last.
+        * We increment s_cur to indicate s_last is in progress.
+        */
+       if (sqp->s_last == sqp->s_cur) {
+               if (++sqp->s_cur >= sqp->s_size)
+                       sqp->s_cur = 0;
+       }
+       spin_unlock_irqrestore(&sqp->s_lock, flags);
+
+       if (!qp || !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) ||
+           qp->ibqp.qp_type != sqp->ibqp.qp_type) {
+               ibp->rvp.n_pkt_drops++;
+               /*
+                * For RC, the requester would timeout and retry so
+                * shortcut the timeouts and just signal too many retries.
+                */
+               if (sqp->ibqp.qp_type == IB_QPT_RC)
+                       send_status = IB_WC_RETRY_EXC_ERR;
+               else
+                       send_status = IB_WC_SUCCESS;
+               goto serr;
+       }
+
+       memset(&wc, 0, sizeof(wc));
+       send_status = IB_WC_SUCCESS;
+
+       release = 1;
+       sqp->s_sge.sge = wqe->sg_list[0];
+       sqp->s_sge.sg_list = wqe->sg_list + 1;
+       sqp->s_sge.num_sge = wqe->wr.num_sge;
+       sqp->s_len = wqe->length;
+       switch (wqe->wr.opcode) {
+       case IB_WR_SEND_WITH_IMM:
+               wc.wc_flags = IB_WC_WITH_IMM;
+               wc.ex.imm_data = wqe->wr.ex.imm_data;
+               /* FALLTHROUGH */
+       case IB_WR_SEND:
+               ret = hfi1_rvt_get_rwqe(qp, 0);
+               if (ret < 0)
+                       goto op_err;
+               if (!ret)
+                       goto rnr_nak;
+               break;
+
+       case IB_WR_RDMA_WRITE_WITH_IMM:
+               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+                       goto inv_err;
+               wc.wc_flags = IB_WC_WITH_IMM;
+               wc.ex.imm_data = wqe->wr.ex.imm_data;
+               ret = hfi1_rvt_get_rwqe(qp, 1);
+               if (ret < 0)
+                       goto op_err;
+               if (!ret)
+                       goto rnr_nak;
+               /* skip copy_last set and qp_access_flags recheck */
+               goto do_write;
+       case IB_WR_RDMA_WRITE:
+               copy_last = ibpd_to_rvtpd(qp->ibqp.pd)->user;
+               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+                       goto inv_err;
+do_write:
+               if (wqe->length == 0)
+                       break;
+               if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, wqe->length,
+                                         wqe->rdma_wr.remote_addr,
+                                         wqe->rdma_wr.rkey,
+                                         IB_ACCESS_REMOTE_WRITE)))
+                       goto acc_err;
+               qp->r_sge.sg_list = NULL;
+               qp->r_sge.num_sge = 1;
+               qp->r_sge.total_len = wqe->length;
+               break;
+
+       case IB_WR_RDMA_READ:
+               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
+                       goto inv_err;
+               if (unlikely(!rvt_rkey_ok(qp, &sqp->s_sge.sge, wqe->length,
+                                         wqe->rdma_wr.remote_addr,
+                                         wqe->rdma_wr.rkey,
+                                         IB_ACCESS_REMOTE_READ)))
+                       goto acc_err;
+               release = 0;
+               sqp->s_sge.sg_list = NULL;
+               sqp->s_sge.num_sge = 1;
+               qp->r_sge.sge = wqe->sg_list[0];
+               qp->r_sge.sg_list = wqe->sg_list + 1;
+               qp->r_sge.num_sge = wqe->wr.num_sge;
+               qp->r_sge.total_len = wqe->length;
+               break;
+
+       case IB_WR_ATOMIC_CMP_AND_SWP:
+       case IB_WR_ATOMIC_FETCH_AND_ADD:
+               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
+                       goto inv_err;
+               if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
+                                         wqe->atomic_wr.remote_addr,
+                                         wqe->atomic_wr.rkey,
+                                         IB_ACCESS_REMOTE_ATOMIC)))
+                       goto acc_err;
+               /* Perform atomic OP and save result. */
+               maddr = (atomic64_t *)qp->r_sge.sge.vaddr;
+               sdata = wqe->atomic_wr.compare_add;
+               *(u64 *)sqp->s_sge.sge.vaddr =
+                       (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ?
+                       (u64)atomic64_add_return(sdata, maddr) - sdata :
+                       (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr,
+                                     sdata, wqe->atomic_wr.swap);
+               rvt_put_mr(qp->r_sge.sge.mr);
+               qp->r_sge.num_sge = 0;
+               goto send_comp;
+
+       default:
+               send_status = IB_WC_LOC_QP_OP_ERR;
+               goto serr;
+       }
+
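+       /*
+        * Copy the data described by sqp->s_sge into qp->r_sge,
+        * one SGE segment at a time.
+        */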
+       sge = &sqp->s_sge.sge;
+       while (sqp->s_len) {
+               u32 len = sqp->s_len;
+
+               if (len > sge->length)
+                       len = sge->length;
+               if (len > sge->sge_length)
+                       len = sge->sge_length;
+               WARN_ON_ONCE(len == 0);
+               hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, release, copy_last);
+               sge->vaddr += len;
+               sge->length -= len;
+               sge->sge_length -= len;
+               if (sge->sge_length == 0) {
+                       if (!release)
+                               rvt_put_mr(sge->mr);
+                       if (--sqp->s_sge.num_sge)
+                               *sge = *sqp->s_sge.sg_list++;
+               } else if (sge->length == 0 && sge->mr->lkey) {
+                       if (++sge->n >= RVT_SEGSZ) {
+                               if (++sge->m >= sge->mr->mapsz)
+                                       break;
+                               sge->n = 0;
+                       }
+                       sge->vaddr =
+                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
+                       sge->length =
+                               sge->mr->map[sge->m]->segs[sge->n].length;
+               }
+               sqp->s_len -= len;
+       }
+       if (release)
+               rvt_put_ss(&qp->r_sge);
+
+       if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
+               goto send_comp;
+
+       if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+               wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+       else
+               wc.opcode = IB_WC_RECV;
+       wc.wr_id = qp->r_wr_id;
+       wc.status = IB_WC_SUCCESS;
+       wc.byte_len = wqe->length;
+       wc.qp = &qp->ibqp;
+       wc.src_qp = qp->remote_qpn;
+       wc.slid = qp->remote_ah_attr.dlid;
+       wc.sl = qp->remote_ah_attr.sl;
+       wc.port_num = 1;
+       /* Signal completion event if the solicited bit is set. */
+       rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
+                    wqe->wr.send_flags & IB_SEND_SOLICITED);
+
+send_comp:
+       spin_lock_irqsave(&sqp->s_lock, flags);
+       ibp->rvp.n_loop_pkts++;
+flush_send:
+       sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
+       hfi1_send_complete(sqp, wqe, send_status);
+       goto again;
+
+rnr_nak:
+       /* Handle RNR NAK */
+       if (qp->ibqp.qp_type == IB_QPT_UC)
+               goto send_comp;
+       ibp->rvp.n_rnr_naks++;
+       /*
+        * Note: we don't need the s_lock held since the BUSY flag
+        * makes this single threaded.
+        */
+       if (sqp->s_rnr_retry == 0) {
+               send_status = IB_WC_RNR_RETRY_EXC_ERR;
+               goto serr;
+       }
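+       /* an RNR retry count of 7 means retry forever; only decrement finite counts */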
+       if (sqp->s_rnr_retry_cnt < 7)
+               sqp->s_rnr_retry--;
+       spin_lock_irqsave(&sqp->s_lock, flags);
+       if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK))
+               goto clr_busy;
+       to = ib_hfi1_rnr_table[qp->r_min_rnr_timer];
+       hfi1_add_rnr_timer(sqp, to);
+       goto clr_busy;
+
+op_err:
+       send_status = IB_WC_REM_OP_ERR;
+       wc.status = IB_WC_LOC_QP_OP_ERR;
+       goto err;
+
+inv_err:
+       send_status = IB_WC_REM_INV_REQ_ERR;
+       wc.status = IB_WC_LOC_QP_OP_ERR;
+       goto err;
+
+acc_err:
+       send_status = IB_WC_REM_ACCESS_ERR;
+       wc.status = IB_WC_LOC_PROT_ERR;
+err:
+       /* responder goes to error state */
+       hfi1_rc_error(qp, wc.status);
+
+serr:
+       spin_lock_irqsave(&sqp->s_lock, flags);
+       hfi1_send_complete(sqp, wqe, send_status);
+       if (sqp->ibqp.qp_type == IB_QPT_RC) {
+               int lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
+
+               sqp->s_flags &= ~RVT_S_BUSY;
+               spin_unlock_irqrestore(&sqp->s_lock, flags);
+               if (lastwqe) {
+                       struct ib_event ev;
+
+                       ev.device = sqp->ibqp.device;
+                       ev.element.qp = &sqp->ibqp;
+                       ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+                       sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context);
+               }
+               goto done;
+       }
+clr_busy:
+       sqp->s_flags &= ~RVT_S_BUSY;
+unlock:
+       spin_unlock_irqrestore(&sqp->s_lock, flags);
+done:
+       rcu_read_unlock();
+}
+
+/**
+ * hfi1_make_grh - construct a GRH header
+ * @ibp: a pointer to the IB port
+ * @hdr: a pointer to the GRH header being constructed
+ * @grh: the global route address to send to
+ * @hwords: the number of 32 bit words of header being sent
+ * @nwords: the number of 32 bit words of data being sent
+ *
+ * Return the size of the header in 32 bit words.
+ */
+u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr,
+                 struct ib_global_route *grh, u32 hwords, u32 nwords)
+{
+       hdr->version_tclass_flow =
+               cpu_to_be32((IB_GRH_VERSION << IB_GRH_VERSION_SHIFT) |
+                           (grh->traffic_class << IB_GRH_TCLASS_SHIFT) |
+                           (grh->flow_label << IB_GRH_FLOW_SHIFT));
+       hdr->paylen = cpu_to_be16((hwords - 2 + nwords + SIZE_OF_CRC) << 2);
+       /* next_hdr is defined by C8-7 in ch. 8.4.1 */
+       hdr->next_hdr = IB_GRH_NEXT_HDR;
+       hdr->hop_limit = grh->hop_limit;
+       /* The SGID is 32-bit aligned. */
+       hdr->sgid.global.subnet_prefix = ibp->rvp.gid_prefix;
+       hdr->sgid.global.interface_id =
+               grh->sgid_index && grh->sgid_index < ARRAY_SIZE(ibp->guids) ?
+               ibp->guids[grh->sgid_index - 1] :
+                       cpu_to_be64(ppd_from_ibp(ibp)->guid);
+       hdr->dgid = grh->dgid;
+
+       /* GRH header size in 32-bit words. */
+       return sizeof(struct ib_grh) / sizeof(u32);
+}
+
+#define BTH2_OFFSET (offsetof(struct hfi1_pio_header, hdr.u.oth.bth[2]) / 4)
+
+/**
+ * build_ahg - create ahg in s_hdr
+ * @qp: a pointer to QP
+ * @npsn: the next PSN for the request/response
+ *
+ * This routine handles the AHG by allocating an ahg entry and having the
+ * first middle packet trigger a copy of the full header.
+ *
+ * Subsequent middles reuse the copied entry, editing only the
+ * PSN with 1 or 2 edits.
+ */
+static inline void build_ahg(struct rvt_qp *qp, u32 npsn)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+
+       if (unlikely(qp->s_flags & RVT_S_AHG_CLEAR))
+               clear_ahg(qp);
+       if (!(qp->s_flags & RVT_S_AHG_VALID)) {
+               /* first middle that needs copy  */
+               if (qp->s_ahgidx < 0)
+                       qp->s_ahgidx = sdma_ahg_alloc(priv->s_sde);
+               if (qp->s_ahgidx >= 0) {
+                       qp->s_ahgpsn = npsn;
+                       priv->s_hdr->tx_flags |= SDMA_TXREQ_F_AHG_COPY;
+                       /* save to protect a change in another thread */
+                       priv->s_hdr->sde = priv->s_sde;
+                       priv->s_hdr->ahgidx = qp->s_ahgidx;
+                       qp->s_flags |= RVT_S_AHG_VALID;
+               }
+       } else {
+               /* subsequent middle after valid */
+               if (qp->s_ahgidx >= 0) {
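+                       /*
+                        * Rewrite the PSN in BTH2: one AHG edit for the low
+                        * 16 bits, plus a second edit only when the upper
+                        * 16 bits have changed since the header was copied.
+                        */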
+                       priv->s_hdr->tx_flags |= SDMA_TXREQ_F_USE_AHG;
+                       priv->s_hdr->ahgidx = qp->s_ahgidx;
+                       priv->s_hdr->ahgcount++;
+                       priv->s_hdr->ahgdesc[0] =
+                               sdma_build_ahg_descriptor(
+                                       (__force u16)cpu_to_be16((u16)npsn),
+                                       BTH2_OFFSET,
+                                       16,
+                                       16);
+                       if ((npsn & 0xffff0000) !=
+                                       (qp->s_ahgpsn & 0xffff0000)) {
+                               priv->s_hdr->ahgcount++;
+                               priv->s_hdr->ahgdesc[1] =
+                                       sdma_build_ahg_descriptor(
+                                               (__force u16)cpu_to_be16(
+                                                       (u16)(npsn >> 16)),
+                                               BTH2_OFFSET,
+                                               0,
+                                               16);
+                       }
+               }
+       }
+}
+
+void hfi1_make_ruc_header(struct rvt_qp *qp, struct hfi1_other_headers *ohdr,
+                         u32 bth0, u32 bth2, int middle,
+                         struct hfi1_pkt_state *ps)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+       struct hfi1_ibport *ibp = ps->ibp;
+       u16 lrh0;
+       u32 nwords;
+       u32 extra_bytes;
+       u32 bth1;
+
+       /* Construct the header. */
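+       /*
+        * Pad the payload to a 4-byte boundary; nwords is the padded
+        * payload length in 32-bit words.
+        */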
+       extra_bytes = -qp->s_cur_size & 3;
+       nwords = (qp->s_cur_size + extra_bytes) >> 2;
+       lrh0 = HFI1_LRH_BTH;
+       if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
+               qp->s_hdrwords += hfi1_make_grh(ibp,
+                                               &ps->s_txreq->phdr.hdr.u.l.grh,
+                                               &qp->remote_ah_attr.grh,
+                                               qp->s_hdrwords, nwords);
+               lrh0 = HFI1_LRH_GRH;
+               middle = 0;
+       }
+       lrh0 |= (priv->s_sc & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4;
+       /*
+        * Reset the s_hdr/AHG fields.
+        *
+        * This ensures that the ahgentry/ahgcount
+        * are at a non-AHG default to protect
+        * build_verbs_tx_desc() from using
+        * a stale ahgidx.
+        *
+        * build_ahg() will modify them as appropriate
+        * to use the AHG feature.
+        */
+       priv->s_hdr->tx_flags = 0;
+       priv->s_hdr->ahgcount = 0;
+       priv->s_hdr->ahgidx = 0;
+       priv->s_hdr->sde = NULL;
+       if (qp->s_mig_state == IB_MIG_MIGRATED)
+               bth0 |= IB_BTH_MIG_REQ;
+       else
+               middle = 0;
+       if (middle)
+               build_ahg(qp, bth2);
+       else
+               qp->s_flags &= ~RVT_S_AHG_VALID;
+       ps->s_txreq->phdr.hdr.lrh[0] = cpu_to_be16(lrh0);
+       ps->s_txreq->phdr.hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
+       ps->s_txreq->phdr.hdr.lrh[2] =
+               cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC);
+       ps->s_txreq->phdr.hdr.lrh[3] = cpu_to_be16(ppd_from_ibp(ibp)->lid |
+                                      qp->remote_ah_attr.src_path_bits);
+       bth0 |= hfi1_get_pkey(ibp, qp->s_pkey_index);
+       bth0 |= extra_bytes << 20;
+       ohdr->bth[0] = cpu_to_be32(bth0);
+       bth1 = qp->remote_qpn;
+       if (qp->s_flags & RVT_S_ECN) {
+               qp->s_flags &= ~RVT_S_ECN;
+               /* we recently received a FECN, so return a BECN */
+               bth1 |= (HFI1_BECN_MASK << HFI1_BECN_SHIFT);
+       }
+       ohdr->bth[1] = cpu_to_be32(bth1);
+       ohdr->bth[2] = cpu_to_be32(bth2);
+}
+
+/* when sending, force a reschedule every one of these periods */
+#define SEND_RESCHED_TIMEOUT (5 * HZ)  /* 5s in jiffies */
+
+void _hfi1_do_send(struct work_struct *work)
+{
+       struct iowait *wait = container_of(work, struct iowait, iowork);
+       struct rvt_qp *qp = iowait_to_qp(wait);
+
+       hfi1_do_send(qp);
+}
+
+/**
+ * hfi1_do_send - perform a send on a QP
+ * @qp: a pointer to the QP to send on
+ *
+ * Process entries in the send work queue until credit or queue is
+ * exhausted.  Only allow one CPU to send a packet per QP (tasklet).
+ * Otherwise, two threads could send packets out of order.
+ */
+void hfi1_do_send(struct rvt_qp *qp)
+{
+       struct hfi1_pkt_state ps;
+       struct hfi1_qp_priv *priv = qp->priv;
+       int (*make_req)(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
+       unsigned long timeout;
+       unsigned long timeout_int;
+       int cpu;
+
+       ps.dev = to_idev(qp->ibqp.device);
+       ps.ibp = to_iport(qp->ibqp.device, qp->port_num);
+       ps.ppd = ppd_from_ibp(ps.ibp);
+
+       switch (qp->ibqp.qp_type) {
+       case IB_QPT_RC:
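+               /*
+                * A destination LID that matches this port's LID (with the
+                * LMC low bits masked off) is handled internally via
+                * ruc_loopback().
+                */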
+               if (!loopback && ((qp->remote_ah_attr.dlid & ~((1 << ps.ppd->lmc
+                                                               ) - 1)) ==
+                                ps.ppd->lid)) {
+                       ruc_loopback(qp);
+                       return;
+               }
+               make_req = hfi1_make_rc_req;
+               timeout_int = (qp->timeout_jiffies);
+               break;
+       case IB_QPT_UC:
+               if (!loopback && ((qp->remote_ah_attr.dlid & ~((1 << ps.ppd->lmc
+                                                               ) - 1)) ==
+                                ps.ppd->lid)) {
+                       ruc_loopback(qp);
+                       return;
+               }
+               make_req = hfi1_make_uc_req;
+               timeout_int = SEND_RESCHED_TIMEOUT;
+               break;
+       default:
+               make_req = hfi1_make_ud_req;
+               timeout_int = SEND_RESCHED_TIMEOUT;
+       }
+
+       spin_lock_irqsave(&qp->s_lock, ps.flags);
+
+       /* Return if we are already busy processing a work request. */
+       if (!hfi1_send_ok(qp)) {
+               spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+               return;
+       }
+
+       qp->s_flags |= RVT_S_BUSY;
+
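+       /*
+        * timeout bounds how long the loop below runs before it
+        * considers yielding; cpu is the SDMA engine's CPU (or the
+        * first CPU of the device's NUMA node) and is used for the
+        * workqueue congestion check.
+        */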
+       timeout = jiffies + (timeout_int) / 8;
+       cpu = priv->s_sde ? priv->s_sde->cpu :
+                       cpumask_first(cpumask_of_node(ps.ppd->dd->node));
+       /* ensure a pre-built packet is handled */
+       ps.s_txreq = get_waiting_verbs_txreq(qp);
+       do {
+               /* Check for a constructed packet to be sent. */
+               if (qp->s_hdrwords != 0) {
+                       spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+                       /*
+                        * If the packet cannot be sent now, return and
+                        * the send tasklet will be woken up later.
+                        */
+                       if (hfi1_verbs_send(qp, &ps))
+                               return;
+                       /* Record that s_hdr is empty. */
+                       qp->s_hdrwords = 0;
+                       /* allow other tasks to run */
+                       if (unlikely(time_after(jiffies, timeout))) {
+                               if (workqueue_congested(cpu,
+                                                       ps.ppd->hfi1_wq)) {
+                                       spin_lock_irqsave(
+                                               &qp->s_lock,
+                                               ps.flags);
+                                       qp->s_flags &= ~RVT_S_BUSY;
+                                       hfi1_schedule_send(qp);
+                                       spin_unlock_irqrestore(
+                                               &qp->s_lock,
+                                               ps.flags);
+                                       this_cpu_inc(
+                                               *ps.ppd->dd->send_schedule);
+                                       return;
+                               }
+                               if (!irqs_disabled()) {
+                                       cond_resched();
+                                       this_cpu_inc(
+                                          *ps.ppd->dd->send_schedule);
+                               }
+                               timeout = jiffies + (timeout_int) / 8;
+                       }
+                       spin_lock_irqsave(&qp->s_lock, ps.flags);
+               }
+       } while (make_req(qp, &ps));
+
+       spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+}
+
+/*
+ * This should be called with s_lock held.
+ */
+void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
+                       enum ib_wc_status status)
+{
+       u32 old_last, last;
+       unsigned i;
+
+       if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND))
+               return;
+
+       last = qp->s_last;
+       old_last = last;
+       if (++last >= qp->s_size)
+               last = 0;
+       qp->s_last = last;
+       /* See post_send() */
+       barrier();
+       for (i = 0; i < wqe->wr.num_sge; i++) {
+               struct rvt_sge *sge = &wqe->sg_list[i];
+
+               rvt_put_mr(sge->mr);
+       }
+       if (qp->ibqp.qp_type == IB_QPT_UD ||
+           qp->ibqp.qp_type == IB_QPT_SMI ||
+           qp->ibqp.qp_type == IB_QPT_GSI)
+               atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount);
+
+       /* See ch. 11.2.4.1 and 10.7.3.1 */
+       if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) ||
+           (wqe->wr.send_flags & IB_SEND_SIGNALED) ||
+           status != IB_WC_SUCCESS) {
+               struct ib_wc wc;
+
+               memset(&wc, 0, sizeof(wc));
+               wc.wr_id = wqe->wr.wr_id;
+               wc.status = status;
+               wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
+               wc.qp = &qp->ibqp;
+               if (status == IB_WC_SUCCESS)
+                       wc.byte_len = wqe->length;
+               rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc,
+                            status != IB_WC_SUCCESS);
+       }
+
+       if (qp->s_acked == old_last)
+               qp->s_acked = last;
+       if (qp->s_cur == old_last)
+               qp->s_cur = last;
+       if (qp->s_tail == old_last)
+               qp->s_tail = last;
+       if (qp->state == IB_QPS_SQD && last == qp->s_cur)
+               qp->s_draining = 0;
+}
diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c
new file mode 100644 (file)
index 0000000..f9befc0
--- /dev/null
@@ -0,0 +1,3054 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/spinlock.h>
+#include <linux/seqlock.h>
+#include <linux/netdevice.h>
+#include <linux/moduleparam.h>
+#include <linux/bitops.h>
+#include <linux/timer.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+
+#include "hfi.h"
+#include "common.h"
+#include "qp.h"
+#include "sdma.h"
+#include "iowait.h"
+#include "trace.h"
+
+/* must be a power of 2, >= 64 and <= 32768 */
+#define SDMA_DESCQ_CNT 2048
+#define SDMA_DESC_INTR 64
+#define INVALID_TAIL 0xffff
+
+static uint sdma_descq_cnt = SDMA_DESCQ_CNT;
+module_param(sdma_descq_cnt, uint, S_IRUGO);
+MODULE_PARM_DESC(sdma_descq_cnt, "Number of SDMA descq entries");
+
+static uint sdma_idle_cnt = 250;
+module_param(sdma_idle_cnt, uint, S_IRUGO);
+MODULE_PARM_DESC(sdma_idle_cnt, "sdma interrupt idle delay (ns, default 250)");
+
+uint mod_num_sdma;
+module_param_named(num_sdma, mod_num_sdma, uint, S_IRUGO);
+MODULE_PARM_DESC(num_sdma, "Set max number of SDMA engines to use");
+
+static uint sdma_desct_intr = SDMA_DESC_INTR;
+module_param_named(desct_intr, sdma_desct_intr, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(desct_intr, "Number of SDMA descriptors before an interrupt");
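+
+/*
+ * Illustrative module load using the parameters above (values are
+ * examples only):
+ *   modprobe hfi1 sdma_descq_cnt=4096 num_sdma=8 desct_intr=64
+ * desct_intr is S_IWUSR and can also be adjusted at runtime via
+ * /sys/module/hfi1/parameters/desct_intr.
+ */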
+
+#define SDMA_WAIT_BATCH_SIZE 20
+/* max wait time for a SDMA engine to indicate it has halted */
+#define SDMA_ERR_HALT_TIMEOUT 10 /* ms */
+/* all SDMA engine errors that cause a halt */
+
+#define SD(name) SEND_DMA_##name
+#define ALL_SDMA_ENG_HALT_ERRS \
+       (SD(ENG_ERR_STATUS_SDMA_WRONG_DW_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_GEN_MISMATCH_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_TOO_LONG_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_TAIL_OUT_OF_BOUNDS_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_FIRST_DESC_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_MEM_READ_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_LENGTH_MISMATCH_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_PACKET_DESC_OVERFLOW_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_HEADER_SELECT_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_HEADER_ADDRESS_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_HEADER_LENGTH_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_TIMEOUT_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_DESC_TABLE_UNC_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_ASSEMBLY_UNC_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_PACKET_TRACKING_UNC_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_HEADER_STORAGE_UNC_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SMASK))
+
+/* sdma_sendctrl operations */
+#define SDMA_SENDCTRL_OP_ENABLE    BIT(0)
+#define SDMA_SENDCTRL_OP_INTENABLE BIT(1)
+#define SDMA_SENDCTRL_OP_HALT      BIT(2)
+#define SDMA_SENDCTRL_OP_CLEANUP   BIT(3)
+
+/* handle long defines */
+#define SDMA_EGRESS_PACKET_OCCUPANCY_SMASK \
+SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SMASK
+#define SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT \
+SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT
+
+static const char * const sdma_state_names[] = {
+       [sdma_state_s00_hw_down]                = "s00_HwDown",
+       [sdma_state_s10_hw_start_up_halt_wait]  = "s10_HwStartUpHaltWait",
+       [sdma_state_s15_hw_start_up_clean_wait] = "s15_HwStartUpCleanWait",
+       [sdma_state_s20_idle]                   = "s20_Idle",
+       [sdma_state_s30_sw_clean_up_wait]       = "s30_SwCleanUpWait",
+       [sdma_state_s40_hw_clean_up_wait]       = "s40_HwCleanUpWait",
+       [sdma_state_s50_hw_halt_wait]           = "s50_HwHaltWait",
+       [sdma_state_s60_idle_halt_wait]         = "s60_IdleHaltWait",
+       [sdma_state_s80_hw_freeze]              = "s80_HwFreeze",
+       [sdma_state_s82_freeze_sw_clean]        = "s82_FreezeSwClean",
+       [sdma_state_s99_running]                = "s99_Running",
+};
+
+#ifdef CONFIG_SDMA_VERBOSITY
+static const char * const sdma_event_names[] = {
+       [sdma_event_e00_go_hw_down]   = "e00_GoHwDown",
+       [sdma_event_e10_go_hw_start]  = "e10_GoHwStart",
+       [sdma_event_e15_hw_halt_done] = "e15_HwHaltDone",
+       [sdma_event_e25_hw_clean_up_done] = "e25_HwCleanUpDone",
+       [sdma_event_e30_go_running]   = "e30_GoRunning",
+       [sdma_event_e40_sw_cleaned]   = "e40_SwCleaned",
+       [sdma_event_e50_hw_cleaned]   = "e50_HwCleaned",
+       [sdma_event_e60_hw_halted]    = "e60_HwHalted",
+       [sdma_event_e70_go_idle]      = "e70_GoIdle",
+       [sdma_event_e80_hw_freeze]    = "e80_HwFreeze",
+       [sdma_event_e81_hw_frozen]    = "e81_HwFrozen",
+       [sdma_event_e82_hw_unfreeze]  = "e82_HwUnfreeze",
+       [sdma_event_e85_link_down]    = "e85_LinkDown",
+       [sdma_event_e90_sw_halted]    = "e90_SwHalted",
+};
+#endif
+
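+/*
+ * Per-state control template: sdma_set_state() turns the op_* and
+ * go_s99_running_* fields for the next state into SDMA_SENDCTRL_OP_*
+ * bits and hands them to sdma_sendctrl().
+ */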
+static const struct sdma_set_state_action sdma_action_table[] = {
+       [sdma_state_s00_hw_down] = {
+               .go_s99_running_tofalse = 1,
+               .op_enable = 0,
+               .op_intenable = 0,
+               .op_halt = 0,
+               .op_cleanup = 0,
+       },
+       [sdma_state_s10_hw_start_up_halt_wait] = {
+               .op_enable = 0,
+               .op_intenable = 0,
+               .op_halt = 1,
+               .op_cleanup = 0,
+       },
+       [sdma_state_s15_hw_start_up_clean_wait] = {
+               .op_enable = 0,
+               .op_intenable = 1,
+               .op_halt = 0,
+               .op_cleanup = 1,
+       },
+       [sdma_state_s20_idle] = {
+               .op_enable = 0,
+               .op_intenable = 1,
+               .op_halt = 0,
+               .op_cleanup = 0,
+       },
+       [sdma_state_s30_sw_clean_up_wait] = {
+               .op_enable = 0,
+               .op_intenable = 0,
+               .op_halt = 0,
+               .op_cleanup = 0,
+       },
+       [sdma_state_s40_hw_clean_up_wait] = {
+               .op_enable = 0,
+               .op_intenable = 0,
+               .op_halt = 0,
+               .op_cleanup = 1,
+       },
+       [sdma_state_s50_hw_halt_wait] = {
+               .op_enable = 0,
+               .op_intenable = 0,
+               .op_halt = 0,
+               .op_cleanup = 0,
+       },
+       [sdma_state_s60_idle_halt_wait] = {
+               .go_s99_running_tofalse = 1,
+               .op_enable = 0,
+               .op_intenable = 0,
+               .op_halt = 1,
+               .op_cleanup = 0,
+       },
+       [sdma_state_s80_hw_freeze] = {
+               .op_enable = 0,
+               .op_intenable = 0,
+               .op_halt = 0,
+               .op_cleanup = 0,
+       },
+       [sdma_state_s82_freeze_sw_clean] = {
+               .op_enable = 0,
+               .op_intenable = 0,
+               .op_halt = 0,
+               .op_cleanup = 0,
+       },
+       [sdma_state_s99_running] = {
+               .op_enable = 1,
+               .op_intenable = 1,
+               .op_halt = 0,
+               .op_cleanup = 0,
+               .go_s99_running_totrue = 1,
+       },
+};
+
+#define SDMA_TAIL_UPDATE_THRESH 0x1F
+
+/* declare all statics here rather than keep sorting */
+static void sdma_complete(struct kref *);
+static void sdma_finalput(struct sdma_state *);
+static void sdma_get(struct sdma_state *);
+static void sdma_hw_clean_up_task(unsigned long);
+static void sdma_put(struct sdma_state *);
+static void sdma_set_state(struct sdma_engine *, enum sdma_states);
+static void sdma_start_hw_clean_up(struct sdma_engine *);
+static void sdma_sw_clean_up_task(unsigned long);
+static void sdma_sendctrl(struct sdma_engine *, unsigned);
+static void init_sdma_regs(struct sdma_engine *, u32, uint);
+static void sdma_process_event(
+       struct sdma_engine *sde,
+       enum sdma_events event);
+static void __sdma_process_event(
+       struct sdma_engine *sde,
+       enum sdma_events event);
+static void dump_sdma_state(struct sdma_engine *sde);
+static void sdma_make_progress(struct sdma_engine *sde, u64 status);
+static void sdma_desc_avail(struct sdma_engine *sde, unsigned avail);
+static void sdma_flush_descq(struct sdma_engine *sde);
+
+/**
+ * sdma_state_name() - return state string from enum
+ * @state: state
+ */
+static const char *sdma_state_name(enum sdma_states state)
+{
+       return sdma_state_names[state];
+}
+
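+/*
+ * State machine reference counting: sdma_get() and sdma_put() take
+ * and drop references on the state; sdma_finalput() drops a reference
+ * and waits for sdma_complete() to signal that the last one is gone.
+ */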
+static void sdma_get(struct sdma_state *ss)
+{
+       kref_get(&ss->kref);
+}
+
+static void sdma_complete(struct kref *kref)
+{
+       struct sdma_state *ss =
+               container_of(kref, struct sdma_state, kref);
+
+       complete(&ss->comp);
+}
+
+static void sdma_put(struct sdma_state *ss)
+{
+       kref_put(&ss->kref, sdma_complete);
+}
+
+static void sdma_finalput(struct sdma_state *ss)
+{
+       sdma_put(ss);
+       wait_for_completion(&ss->comp);
+}
+
+static inline void write_sde_csr(
+       struct sdma_engine *sde,
+       u32 offset0,
+       u64 value)
+{
+       write_kctxt_csr(sde->dd, sde->this_idx, offset0, value);
+}
+
+static inline u64 read_sde_csr(
+       struct sdma_engine *sde,
+       u32 offset0)
+{
+       return read_kctxt_csr(sde->dd, sde->this_idx, offset0);
+}
+
+/*
+ * sdma_wait_for_packet_egress() - wait for the VL FIFO occupancy for
+ * sdma engine 'sde' to drop to 0.
+ */
+static void sdma_wait_for_packet_egress(struct sdma_engine *sde,
+                                       int pause)
+{
+       u64 off = 8 * sde->this_idx;
+       struct hfi1_devdata *dd = sde->dd;
+       int lcnt = 0;
+       u64 reg_prev;
+       u64 reg = 0;
+
+       while (1) {
+               reg_prev = reg;
+               reg = read_csr(dd, off + SEND_EGRESS_SEND_DMA_STATUS);
+
+               reg &= SDMA_EGRESS_PACKET_OCCUPANCY_SMASK;
+               reg >>= SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT;
+               if (reg == 0)
+                       break;
+               /* counter is reset if the occupancy count changes */
+               if (reg != reg_prev)
+                       lcnt = 0;
+               if (lcnt++ > 500) {
+                       /* timed out - bounce the link */
+                       dd_dev_err(dd, "%s: engine %u timeout waiting for packets to egress, remaining count %u, bouncing link\n",
+                                  __func__, sde->this_idx, (u32)reg);
+                       queue_work(dd->pport->hfi1_wq,
+                                  &dd->pport->link_bounce_work);
+                       break;
+               }
+               udelay(1);
+       }
+}
+
+/*
+ * sdma_wait() - wait for packet egress to complete for all SDMA engines,
+ * and pause for credit return.
+ */
+void sdma_wait(struct hfi1_devdata *dd)
+{
+       int i;
+
+       for (i = 0; i < dd->num_sdma; i++) {
+               struct sdma_engine *sde = &dd->per_sdma[i];
+
+               sdma_wait_for_packet_egress(sde, 0);
+       }
+}
+
+static inline void sdma_set_desc_cnt(struct sdma_engine *sde, unsigned cnt)
+{
+       u64 reg;
+
+       if (!(sde->dd->flags & HFI1_HAS_SDMA_TIMEOUT))
+               return;
+       reg = cnt;
+       reg &= SD(DESC_CNT_CNT_MASK);
+       reg <<= SD(DESC_CNT_CNT_SHIFT);
+       write_sde_csr(sde, SD(DESC_CNT), reg);
+}
+
+static inline void complete_tx(struct sdma_engine *sde,
+                              struct sdma_txreq *tx,
+                              int res)
+{
+       /* save wait and complete before the callback can modify the txreq */
+       struct iowait *wait = tx->wait;
+       callback_t complete = tx->complete;
+
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+       trace_hfi1_sdma_out_sn(sde, tx->sn);
+       if (WARN_ON_ONCE(sde->head_sn != tx->sn))
+               dd_dev_err(sde->dd, "expected %llu got %llu\n",
+                          sde->head_sn, tx->sn);
+       sde->head_sn++;
+#endif
+       sdma_txclean(sde->dd, tx);
+       if (complete)
+               (*complete)(tx, res);
+       if (wait && iowait_sdma_dec(wait))
+               iowait_drain_wakeup(wait);
+}
+
+/*
+ * Complete all the sdma requests with a SDMA_TXREQ_S_ABORTED status
+ *
+ * Depending on timing there can be txreqs in two places:
+ * - in the descq ring
+ * - in the flush list
+ *
+ * To avoid ordering issues the descq ring needs to be flushed
+ * first followed by the flush list.
+ *
+ * This routine is called from two places
+ * - From a work queue item
+ * - Directly from the state machine just before setting the
+ *   state to running
+ *
+ * Must be called with head_lock held
+ *
+ */
+static void sdma_flush(struct sdma_engine *sde)
+{
+       struct sdma_txreq *txp, *txp_next;
+       LIST_HEAD(flushlist);
+       unsigned long flags;
+
+       /* flush from head to tail */
+       sdma_flush_descq(sde);
+       spin_lock_irqsave(&sde->flushlist_lock, flags);
+       /* copy flush list */
+       list_for_each_entry_safe(txp, txp_next, &sde->flushlist, list) {
+               list_del_init(&txp->list);
+               list_add_tail(&txp->list, &flushlist);
+       }
+       spin_unlock_irqrestore(&sde->flushlist_lock, flags);
+       /* flush from flush list */
+       list_for_each_entry_safe(txp, txp_next, &flushlist, list)
+               complete_tx(sde, txp, SDMA_TXREQ_S_ABORTED);
+}
+
+/*
+ * Fields a work request for flushing the descq ring
+ * and the flush list
+ *
+ * If the engine has been brought to running during
+ * the scheduling delay, the flush is ignored, assuming
+ * that the process of bringing the engine to running
+ * would have done this flush prior to going to running.
+ *
+ */
+static void sdma_field_flush(struct work_struct *work)
+{
+       unsigned long flags;
+       struct sdma_engine *sde =
+               container_of(work, struct sdma_engine, flush_worker);
+
+       write_seqlock_irqsave(&sde->head_lock, flags);
+       if (!__sdma_running(sde))
+               sdma_flush(sde);
+       write_sequnlock_irqrestore(&sde->head_lock, flags);
+}
+
+static void sdma_err_halt_wait(struct work_struct *work)
+{
+       struct sdma_engine *sde = container_of(work, struct sdma_engine,
+                                               err_halt_worker);
+       u64 statuscsr;
+       unsigned long timeout;
+
+       timeout = jiffies + msecs_to_jiffies(SDMA_ERR_HALT_TIMEOUT);
+       while (1) {
+               statuscsr = read_sde_csr(sde, SD(STATUS));
+               statuscsr &= SD(STATUS_ENG_HALTED_SMASK);
+               if (statuscsr)
+                       break;
+               if (time_after(jiffies, timeout)) {
+                       dd_dev_err(sde->dd,
+                                  "SDMA engine %d - timeout waiting for engine to halt\n",
+                                  sde->this_idx);
+                       /*
+                        * Continue anyway.  This could happen if there was
+                        * an uncorrectable error in the wrong spot.
+                        */
+                       break;
+               }
+               usleep_range(80, 120);
+       }
+
+       sdma_process_event(sde, sdma_event_e15_hw_halt_done);
+}
+
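+/*
+ * Schedule a progress check: snapshot descq_head on every other
+ * engine and arm err_progress_check_timer so that
+ * sdma_err_progress_check() can detect engines that have not
+ * advanced (only when !is_bx() and the SDMA_AHG capability is set).
+ */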
+static void sdma_err_progress_check_schedule(struct sdma_engine *sde)
+{
+       if (!is_bx(sde->dd) && HFI1_CAP_IS_KSET(SDMA_AHG)) {
+               unsigned index;
+               struct hfi1_devdata *dd = sde->dd;
+
+               for (index = 0; index < dd->num_sdma; index++) {
+                       struct sdma_engine *curr_sdma = &dd->per_sdma[index];
+
+                       if (curr_sdma != sde)
+                               curr_sdma->progress_check_head =
+                                                       curr_sdma->descq_head;
+               }
+               dd_dev_err(sde->dd,
+                          "SDMA engine %d - check scheduled\n",
+                               sde->this_idx);
+               mod_timer(&sde->err_progress_check_timer, jiffies + 10);
+       }
+}
+
+static void sdma_err_progress_check(unsigned long data)
+{
+       unsigned index;
+       struct sdma_engine *sde = (struct sdma_engine *)data;
+
+       dd_dev_err(sde->dd, "SDE progress check event\n");
+       for (index = 0; index < sde->dd->num_sdma; index++) {
+               struct sdma_engine *curr_sde = &sde->dd->per_sdma[index];
+               unsigned long flags;
+
+               /* check progress on each engine except the current one */
+               if (curr_sde == sde)
+                       continue;
+               /*
+                * We must lock interrupts when acquiring sde->lock,
+                * to avoid a deadlock if an interrupt triggers and spins on
+                * the same lock on the same CPU
+                */
+               spin_lock_irqsave(&curr_sde->tail_lock, flags);
+               write_seqlock(&curr_sde->head_lock);
+
+               /* skip non-running queues */
+               if (curr_sde->state.current_state != sdma_state_s99_running) {
+                       write_sequnlock(&curr_sde->head_lock);
+                       spin_unlock_irqrestore(&curr_sde->tail_lock, flags);
+                       continue;
+               }
+
+               if ((curr_sde->descq_head != curr_sde->descq_tail) &&
+                   (curr_sde->descq_head ==
+                               curr_sde->progress_check_head))
+                       __sdma_process_event(curr_sde,
+                                            sdma_event_e90_sw_halted);
+               write_sequnlock(&curr_sde->head_lock);
+               spin_unlock_irqrestore(&curr_sde->tail_lock, flags);
+       }
+       schedule_work(&sde->err_halt_worker);
+}
+
+static void sdma_hw_clean_up_task(unsigned long opaque)
+{
+       struct sdma_engine *sde = (struct sdma_engine *)opaque;
+       u64 statuscsr;
+
+       while (1) {
+#ifdef CONFIG_SDMA_VERBOSITY
+               dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
+                          sde->this_idx, slashstrip(__FILE__), __LINE__,
+                       __func__);
+#endif
+               statuscsr = read_sde_csr(sde, SD(STATUS));
+               statuscsr &= SD(STATUS_ENG_CLEANED_UP_SMASK);
+               if (statuscsr)
+                       break;
+               udelay(10);
+       }
+
+       sdma_process_event(sde, sdma_event_e25_hw_clean_up_done);
+}
+
+static inline struct sdma_txreq *get_txhead(struct sdma_engine *sde)
+{
+       smp_read_barrier_depends(); /* see sdma_update_tail() */
+       return sde->tx_ring[sde->tx_head & sde->sdma_mask];
+}
+
+/*
+ * flush ring for recovery
+ */
+static void sdma_flush_descq(struct sdma_engine *sde)
+{
+       u16 head, tail;
+       int progress = 0;
+       struct sdma_txreq *txp = get_txhead(sde);
+
+       /* The reason for some of the complexity of this code is that
+        * not all descriptors have corresponding txps.  So, we have to
+        * be able to skip over descs until we wander into the range of
+        * the next txp on the list.
+        */
+       head = sde->descq_head & sde->sdma_mask;
+       tail = sde->descq_tail & sde->sdma_mask;
+       while (head != tail) {
+               /* advance head, wrap if needed */
+               head = ++sde->descq_head & sde->sdma_mask;
+               /* if now past this txp's descs, do the callback */
+               if (txp && txp->next_descq_idx == head) {
+                       /* remove from list */
+                       sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL;
+                       complete_tx(sde, txp, SDMA_TXREQ_S_ABORTED);
+                       trace_hfi1_sdma_progress(sde, head, tail, txp);
+                       txp = get_txhead(sde);
+               }
+               progress++;
+       }
+       if (progress)
+               sdma_desc_avail(sde, sdma_descq_freecnt(sde));
+}
+
+static void sdma_sw_clean_up_task(unsigned long opaque)
+{
+       struct sdma_engine *sde = (struct sdma_engine *)opaque;
+       unsigned long flags;
+
+       spin_lock_irqsave(&sde->tail_lock, flags);
+       write_seqlock(&sde->head_lock);
+
+       /*
+        * At this point, the following should always be true:
+        * - We are halted, so no more descriptors are getting retired.
+        * - We are not running, so no one is submitting new work.
+        * - Only we can send the e40_sw_cleaned, so we can't start
+        *   running again until we say so.  So, the active list and
+        *   descq are ours to play with.
+        */
+
+       /*
+        * In the error clean up sequence, software clean must be called
+        * before the hardware clean so we can use the hardware head in
+        * the progress routine.  A hardware clean or SPC unfreeze will
+        * reset the hardware head.
+        *
+        * Process all retired requests. The progress routine will use the
+        * latest physical hardware head - we are not running so speed does
+        * not matter.
+        */
+       sdma_make_progress(sde, 0);
+
+       sdma_flush(sde);
+
+       /*
+        * Reset our notion of head and tail.
+        * Note that the HW registers have been reset via an earlier
+        * clean up.
+        */
+       sde->descq_tail = 0;
+       sde->descq_head = 0;
+       sde->desc_avail = sdma_descq_freecnt(sde);
+       *sde->head_dma = 0;
+
+       __sdma_process_event(sde, sdma_event_e40_sw_cleaned);
+
+       write_sequnlock(&sde->head_lock);
+       spin_unlock_irqrestore(&sde->tail_lock, flags);
+}
+
+static void sdma_sw_tear_down(struct sdma_engine *sde)
+{
+       struct sdma_state *ss = &sde->state;
+
+       /* Releasing this reference means the state machine has stopped. */
+       sdma_put(ss);
+
+       /* stop waiting for all unfreeze events to complete */
+       atomic_set(&sde->dd->sdma_unfreeze_count, -1);
+       wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
+}
+
+static void sdma_start_hw_clean_up(struct sdma_engine *sde)
+{
+       tasklet_hi_schedule(&sde->sdma_hw_clean_up_task);
+}
+
+static void sdma_set_state(struct sdma_engine *sde,
+                          enum sdma_states next_state)
+{
+       struct sdma_state *ss = &sde->state;
+       const struct sdma_set_state_action *action = sdma_action_table;
+       unsigned op = 0;
+
+       trace_hfi1_sdma_state(
+               sde,
+               sdma_state_names[ss->current_state],
+               sdma_state_names[next_state]);
+
+       /* debugging bookkeeping */
+       ss->previous_state = ss->current_state;
+       ss->previous_op = ss->current_op;
+       ss->current_state = next_state;
+
+       if (ss->previous_state != sdma_state_s99_running &&
+           next_state == sdma_state_s99_running)
+               sdma_flush(sde);
+
+       if (action[next_state].op_enable)
+               op |= SDMA_SENDCTRL_OP_ENABLE;
+
+       if (action[next_state].op_intenable)
+               op |= SDMA_SENDCTRL_OP_INTENABLE;
+
+       if (action[next_state].op_halt)
+               op |= SDMA_SENDCTRL_OP_HALT;
+
+       if (action[next_state].op_cleanup)
+               op |= SDMA_SENDCTRL_OP_CLEANUP;
+
+       if (action[next_state].go_s99_running_tofalse)
+               ss->go_s99_running = 0;
+
+       if (action[next_state].go_s99_running_totrue)
+               ss->go_s99_running = 1;
+
+       ss->current_op = op;
+       sdma_sendctrl(sde, ss->current_op);
+}
+
+/**
+ * sdma_get_descq_cnt() - called when device probed
+ *
+ * Return a validated descq count.
+ *
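+ * For example, sdma_descq_cnt=4096 is returned unchanged, while 1000
+ * (not a power of 2) or 16 (below 64) fall back to SDMA_DESCQ_CNT.
+ *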
+ * This is currently only used in the verbs initialization to build the tx
+ * list.
+ *
+ * This will probably be deleted in favor of a more scalable approach to
+ * alloc tx's.
+ *
+ */
+u16 sdma_get_descq_cnt(void)
+{
+       u16 count = sdma_descq_cnt;
+
+       if (!count)
+               return SDMA_DESCQ_CNT;
+       /* count must be a power of 2, at least 64 and at most 32768.
+        * Otherwise return the default.
+        */
+       if (!is_power_of_2(count))
+               return SDMA_DESCQ_CNT;
+       if (count < 64 || count > 32768)
+               return SDMA_DESCQ_CNT;
+       return count;
+}
+
+/**
+ * sdma_select_engine_vl() - select sdma engine
+ * @dd: devdata
+ * @selector: a spreading factor
+ * @vl: this vl
+ *
+ *
+ * This function returns an engine based on the selector and a vl.  The
+ * mapping fields are protected by RCU.
+ */
+struct sdma_engine *sdma_select_engine_vl(
+       struct hfi1_devdata *dd,
+       u32 selector,
+       u8 vl)
+{
+       struct sdma_vl_map *m;
+       struct sdma_map_elem *e;
+       struct sdma_engine *rval;
+
+       /* NOTE: This should only happen if SC->VL changed after the initial
+        *      checks on the QP/AH.
+        *      The default below will return engine 0.
+        */
+       if (vl >= num_vls) {
+               rval = NULL;
+               goto done;
+       }
+
+       rcu_read_lock();
+       m = rcu_dereference(dd->sdma_map);
+       if (unlikely(!m)) {
+               rcu_read_unlock();
+               return &dd->per_sdma[0];
+       }
+       e = m->map[vl & m->mask];
+       rval = e->sde[selector & e->mask];
+       rcu_read_unlock();
+
+done:
+       rval =  !rval ? &dd->per_sdma[0] : rval;
+       trace_hfi1_sdma_engine_select(dd, selector, vl, rval->this_idx);
+       return rval;
+}
+
+/**
+ * sdma_select_engine_sc() - select sdma engine
+ * @dd: devdata
+ * @selector: a spreading factor
+ * @sc5: the 5 bit sc
+ *
+ *
+ * This function returns an engine based on the selector and an sc.
+ */
+struct sdma_engine *sdma_select_engine_sc(
+       struct hfi1_devdata *dd,
+       u32 selector,
+       u8 sc5)
+{
+       u8 vl = sc_to_vlt(dd, sc5);
+
+       return sdma_select_engine_vl(dd, selector, vl);
+}
+
+/*
+ * Free the indicated map struct
+ */
+static void sdma_map_free(struct sdma_vl_map *m)
+{
+       int i;
+
+       for (i = 0; m && i < m->actual_vls; i++)
+               kfree(m->map[i]);
+       kfree(m);
+}
+
+/*
+ * Handle RCU callback
+ */
+static void sdma_map_rcu_callback(struct rcu_head *list)
+{
+       struct sdma_vl_map *m = container_of(list, struct sdma_vl_map, list);
+
+       sdma_map_free(m);
+}
+
+/**
+ * sdma_map_init - called when # vls change
+ * @dd: hfi1_devdata
+ * @port: port number
+ * @num_vls: number of vls
+ * @vl_engines: per vl engine mapping (optional)
+ *
+ * This routine changes the mapping based on the number of vls.
+ *
+ * vl_engines is used to specify a non-uniform vl/engine loading. NULL
+ * implies auto computing the loading and giving each VL a uniform
+ * distribution of engines per VL.
+ *
+ * The auto algorithm computes the sde_per_vl and the number of extra
+ * engines.  Any extra engines are added from the last VL on down.
+ *
+ * rcu locking is used here to control access to the mapping fields.
+ *
+ * If either the num_vls or num_sdma are non-power of 2, the array sizes
+ * in the struct sdma_vl_map and the struct sdma_map_elem are rounded
+ * up to the next highest power of 2 and the first entry is reused
+ * in a round robin fashion.
+ *
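+ * For example, with num_sdma = 8 and num_vls = 3 the auto algorithm
+ * computes sde_per_vl = 2 with 2 extra engines, giving vl_engines =
+ * {2, 3, 3}: VL0 gets 2 engines and VL1/VL2 get 3 each, the extras
+ * being assigned from the last VL down.
+ *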
+ * If an error occurs, the existing mapping is left unchanged.
+ *
+ */
+int sdma_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, u8 *vl_engines)
+{
+       int i, j;
+       int extra, sde_per_vl;
+       int engine = 0;
+       u8 lvl_engines[OPA_MAX_VLS];
+       struct sdma_vl_map *oldmap, *newmap;
+
+       if (!(dd->flags & HFI1_HAS_SEND_DMA))
+               return 0;
+
+       if (!vl_engines) {
+               /* truncate divide */
+               sde_per_vl = dd->num_sdma / num_vls;
+               /* extras */
+               extra = dd->num_sdma % num_vls;
+               vl_engines = lvl_engines;
+               /* add extras from last vl down */
+               for (i = num_vls - 1; i >= 0; i--, extra--)
+                       vl_engines[i] = sde_per_vl + (extra > 0 ? 1 : 0);
+       }
+       /* build new map */
+       newmap = kzalloc(
+               sizeof(struct sdma_vl_map) +
+                       roundup_pow_of_two(num_vls) *
+                       sizeof(struct sdma_map_elem *),
+               GFP_KERNEL);
+       if (!newmap)
+               goto bail;
+       newmap->actual_vls = num_vls;
+       newmap->vls = roundup_pow_of_two(num_vls);
+       newmap->mask = (1 << ilog2(newmap->vls)) - 1;
+       /* initialize back-map */
+       for (i = 0; i < TXE_NUM_SDMA_ENGINES; i++)
+               newmap->engine_to_vl[i] = -1;
+       for (i = 0; i < newmap->vls; i++) {
+               /* save for wrap around */
+               int first_engine = engine;
+
+               if (i < newmap->actual_vls) {
+                       int sz = roundup_pow_of_two(vl_engines[i]);
+
+                       /* only allocate once */
+                       newmap->map[i] = kzalloc(
+                               sizeof(struct sdma_map_elem) +
+                                       sz * sizeof(struct sdma_engine *),
+                               GFP_KERNEL);
+                       if (!newmap->map[i])
+                               goto bail;
+                       newmap->map[i]->mask = (1 << ilog2(sz)) - 1;
+                       /* assign engines */
+                       for (j = 0; j < sz; j++) {
+                               newmap->map[i]->sde[j] =
+                                       &dd->per_sdma[engine];
+                               if (++engine >= first_engine + vl_engines[i])
+                                       /* wrap back to first engine */
+                                       engine = first_engine;
+                       }
+                       /* assign back-map */
+                       for (j = 0; j < vl_engines[i]; j++)
+                               newmap->engine_to_vl[first_engine + j] = i;
+               } else {
+                       /* just re-use entry without allocating */
+                       newmap->map[i] = newmap->map[i % num_vls];
+               }
+               engine = first_engine + vl_engines[i];
+       }
+       /* newmap in hand, save old map */
+       spin_lock_irq(&dd->sde_map_lock);
+       oldmap = rcu_dereference_protected(dd->sdma_map,
+                                          lockdep_is_held(&dd->sde_map_lock));
+
+       /* publish newmap */
+       rcu_assign_pointer(dd->sdma_map, newmap);
+
+       spin_unlock_irq(&dd->sde_map_lock);
+       /* success, free any old map after grace period */
+       if (oldmap)
+               call_rcu(&oldmap->list, sdma_map_rcu_callback);
+       return 0;
+bail:
+       /* free any partial allocation */
+       sdma_map_free(newmap);
+       return -ENOMEM;
+}
+
+/*
+ * Clean up allocated memory.
+ *
+ * This routine can be called regardless of the success of sdma_init().
+ *
+ */
+static void sdma_clean(struct hfi1_devdata *dd, size_t num_engines)
+{
+       size_t i;
+       struct sdma_engine *sde;
+
+       if (dd->sdma_pad_dma) {
+               dma_free_coherent(&dd->pcidev->dev, 4,
+                                 (void *)dd->sdma_pad_dma,
+                                 dd->sdma_pad_phys);
+               dd->sdma_pad_dma = NULL;
+               dd->sdma_pad_phys = 0;
+       }
+       if (dd->sdma_heads_dma) {
+               dma_free_coherent(&dd->pcidev->dev, dd->sdma_heads_size,
+                                 (void *)dd->sdma_heads_dma,
+                                 dd->sdma_heads_phys);
+               dd->sdma_heads_dma = NULL;
+               dd->sdma_heads_phys = 0;
+       }
+       for (i = 0; dd->per_sdma && i < num_engines; ++i) {
+               sde = &dd->per_sdma[i];
+
+               sde->head_dma = NULL;
+               sde->head_phys = 0;
+
+               if (sde->descq) {
+                       dma_free_coherent(
+                               &dd->pcidev->dev,
+                               sde->descq_cnt * sizeof(u64[2]),
+                               sde->descq,
+                               sde->descq_phys
+                       );
+                       sde->descq = NULL;
+                       sde->descq_phys = 0;
+               }
+               kvfree(sde->tx_ring);
+               sde->tx_ring = NULL;
+       }
+       spin_lock_irq(&dd->sde_map_lock);
+       sdma_map_free(rcu_access_pointer(dd->sdma_map));
+       RCU_INIT_POINTER(dd->sdma_map, NULL);
+       spin_unlock_irq(&dd->sde_map_lock);
+       synchronize_rcu();
+       kfree(dd->per_sdma);
+       dd->per_sdma = NULL;
+}
+
+/**
+ * sdma_init() - called when device probed
+ * @dd: hfi1_devdata
+ * @port: port number (currently only zero)
+ *
+ * sdma_init initializes the specified number of engines.
+ *
+ * The code initializes each sde and its csrs.  Interrupts
+ * are not required to be enabled.
+ *
+ * Returns:
+ * 0 - success, -errno on failure
+ */
+int sdma_init(struct hfi1_devdata *dd, u8 port)
+{
+       unsigned this_idx;
+       struct sdma_engine *sde;
+       u16 descq_cnt;
+       void *curr_head;
+       struct hfi1_pportdata *ppd = dd->pport + port;
+       u32 per_sdma_credits;
+       uint idle_cnt = sdma_idle_cnt;
+       size_t num_engines = dd->chip_sdma_engines;
+
+       if (!HFI1_CAP_IS_KSET(SDMA)) {
+               HFI1_CAP_CLEAR(SDMA_AHG);
+               return 0;
+       }
+       if (mod_num_sdma &&
+           /* can't exceed chip support */
+           mod_num_sdma <= dd->chip_sdma_engines &&
+           /* count must be >= vls */
+           mod_num_sdma >= num_vls)
+               num_engines = mod_num_sdma;
+
+       dd_dev_info(dd, "SDMA mod_num_sdma: %u\n", mod_num_sdma);
+       dd_dev_info(dd, "SDMA chip_sdma_engines: %u\n", dd->chip_sdma_engines);
+       dd_dev_info(dd, "SDMA chip_sdma_mem_size: %u\n",
+                   dd->chip_sdma_mem_size);
+
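+       /*
+        * Split the chip's SDMA send memory evenly across the engines;
+        * credits are counted in SDMA_BLOCK_SIZE units.
+        */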
+       per_sdma_credits =
+               dd->chip_sdma_mem_size / (num_engines * SDMA_BLOCK_SIZE);
+
+       /* set up freeze waitqueue */
+       init_waitqueue_head(&dd->sdma_unfreeze_wq);
+       atomic_set(&dd->sdma_unfreeze_count, 0);
+
+       descq_cnt = sdma_get_descq_cnt();
+       dd_dev_info(dd, "SDMA engines %zu descq_cnt %u\n",
+                   num_engines, descq_cnt);
+
+       /* alloc memory for array of send engines */
+       dd->per_sdma = kcalloc(num_engines, sizeof(*dd->per_sdma), GFP_KERNEL);
+       if (!dd->per_sdma)
+               return -ENOMEM;
+
+       idle_cnt = ns_to_cclock(dd, idle_cnt);
+       if (!sdma_desct_intr)
+               sdma_desct_intr = SDMA_DESC_INTR;
+
+       /* Allocate memory for SendDMA descriptor FIFOs */
+       for (this_idx = 0; this_idx < num_engines; ++this_idx) {
+               sde = &dd->per_sdma[this_idx];
+               sde->dd = dd;
+               sde->ppd = ppd;
+               sde->this_idx = this_idx;
+               sde->descq_cnt = descq_cnt;
+               sde->desc_avail = sdma_descq_freecnt(sde);
+               sde->sdma_shift = ilog2(descq_cnt);
+               sde->sdma_mask = (1 << sde->sdma_shift) - 1;
+
+               /* Create a mask specifically for each interrupt source */
+               sde->int_mask = (u64)1 << (0 * TXE_NUM_SDMA_ENGINES +
+                                          this_idx);
+               sde->progress_mask = (u64)1 << (1 * TXE_NUM_SDMA_ENGINES +
+                                               this_idx);
+               sde->idle_mask = (u64)1 << (2 * TXE_NUM_SDMA_ENGINES +
+                                           this_idx);
+               /* Create a combined mask to cover all 3 interrupt sources */
+               sde->imask = sde->int_mask | sde->progress_mask |
+                            sde->idle_mask;
+
+               spin_lock_init(&sde->tail_lock);
+               seqlock_init(&sde->head_lock);
+               spin_lock_init(&sde->senddmactrl_lock);
+               spin_lock_init(&sde->flushlist_lock);
+               /* ensure there is always a zero bit */
+               sde->ahg_bits = 0xfffffffe00000000ULL;
+
+               sdma_set_state(sde, sdma_state_s00_hw_down);
+
+               /* set up reference counting */
+               kref_init(&sde->state.kref);
+               init_completion(&sde->state.comp);
+
+               INIT_LIST_HEAD(&sde->flushlist);
+               INIT_LIST_HEAD(&sde->dmawait);
+
+               sde->tail_csr =
+                       get_kctxt_csr_addr(dd, this_idx, SD(TAIL));
+
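+               /*
+                * default_desc1: descriptors default to the head-to-host
+                * flag when an idle timeout is configured, otherwise to
+                * the interrupt-request flag.
+                */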
+               if (idle_cnt)
+                       dd->default_desc1 =
+                               SDMA_DESC1_HEAD_TO_HOST_FLAG;
+               else
+                       dd->default_desc1 =
+                               SDMA_DESC1_INT_REQ_FLAG;
+
+               tasklet_init(&sde->sdma_hw_clean_up_task, sdma_hw_clean_up_task,
+                            (unsigned long)sde);
+
+               tasklet_init(&sde->sdma_sw_clean_up_task, sdma_sw_clean_up_task,
+                            (unsigned long)sde);
+               INIT_WORK(&sde->err_halt_worker, sdma_err_halt_wait);
+               INIT_WORK(&sde->flush_worker, sdma_field_flush);
+
+               sde->progress_check_head = 0;
+
+               setup_timer(&sde->err_progress_check_timer,
+                           sdma_err_progress_check, (unsigned long)sde);
+
+               sde->descq = dma_zalloc_coherent(
+                       &dd->pcidev->dev,
+                       descq_cnt * sizeof(u64[2]),
+                       &sde->descq_phys,
+                       GFP_KERNEL
+               );
+               if (!sde->descq)
+                       goto bail;
+               sde->tx_ring =
+                       kcalloc(descq_cnt, sizeof(struct sdma_txreq *),
+                               GFP_KERNEL);
+               if (!sde->tx_ring)
+                       sde->tx_ring =
+                               vzalloc(
+                                       sizeof(struct sdma_txreq *) *
+                                       descq_cnt);
+               if (!sde->tx_ring)
+                       goto bail;
+       }
+
+       dd->sdma_heads_size = L1_CACHE_BYTES * num_engines;
+       /* Allocate memory for DMA of head registers to memory */
+       dd->sdma_heads_dma = dma_zalloc_coherent(
+               &dd->pcidev->dev,
+               dd->sdma_heads_size,
+               &dd->sdma_heads_phys,
+               GFP_KERNEL
+       );
+       if (!dd->sdma_heads_dma) {
+               dd_dev_err(dd, "failed to allocate SendDMA head memory\n");
+               goto bail;
+       }
+
+       /* Allocate memory for pad */
+       dd->sdma_pad_dma = dma_zalloc_coherent(
+               &dd->pcidev->dev,
+               sizeof(u32),
+               &dd->sdma_pad_phys,
+               GFP_KERNEL
+       );
+       if (!dd->sdma_pad_dma) {
+               dd_dev_err(dd, "failed to allocate SendDMA pad memory\n");
+               goto bail;
+       }
+
+       /* assign each engine to different cacheline and init registers */
+       curr_head = (void *)dd->sdma_heads_dma;
+       for (this_idx = 0; this_idx < num_engines; ++this_idx) {
+               unsigned long phys_offset;
+
+               sde = &dd->per_sdma[this_idx];
+
+               sde->head_dma = curr_head;
+               curr_head += L1_CACHE_BYTES;
+               phys_offset = (unsigned long)sde->head_dma -
+                             (unsigned long)dd->sdma_heads_dma;
+               sde->head_phys = dd->sdma_heads_phys + phys_offset;
+               init_sdma_regs(sde, per_sdma_credits, idle_cnt);
+       }
+       dd->flags |= HFI1_HAS_SEND_DMA;
+       dd->flags |= idle_cnt ? HFI1_HAS_SDMA_TIMEOUT : 0;
+       dd->num_sdma = num_engines;
+       if (sdma_map_init(dd, port, ppd->vls_operational, NULL))
+               goto bail;
+       dd_dev_info(dd, "SDMA num_sdma: %u\n", dd->num_sdma);
+       return 0;
+
+bail:
+       sdma_clean(dd, num_engines);
+       return -ENOMEM;
+}
+
+/**
+ * sdma_all_running() - called when the link goes up
+ * @dd: hfi1_devdata
+ *
+ * This routine moves all engines to the running state.
+ */
+void sdma_all_running(struct hfi1_devdata *dd)
+{
+       struct sdma_engine *sde;
+       unsigned int i;
+
+       /* move all engines to running */
+       for (i = 0; i < dd->num_sdma; ++i) {
+               sde = &dd->per_sdma[i];
+               sdma_process_event(sde, sdma_event_e30_go_running);
+       }
+}
+
+/**
+ * sdma_all_idle() - called when the link goes down
+ * @dd: hfi1_devdata
+ *
+ * This routine moves all engines to the idle state.
+ */
+void sdma_all_idle(struct hfi1_devdata *dd)
+{
+       struct sdma_engine *sde;
+       unsigned int i;
+
+       /* idle all engines */
+       for (i = 0; i < dd->num_sdma; ++i) {
+               sde = &dd->per_sdma[i];
+               sdma_process_event(sde, sdma_event_e70_go_idle);
+       }
+}
+
+/**
+ * sdma_start() - called to kick off state processing for all engines
+ * @dd: hfi1_devdata
+ *
+ * This routine is for kicking off the state processing for all required
+ * sdma engines.  Interrupts need to be working at this point.
+ *
+ */
+void sdma_start(struct hfi1_devdata *dd)
+{
+       unsigned i;
+       struct sdma_engine *sde;
+
+       /* kick off the engines state processing */
+       for (i = 0; i < dd->num_sdma; ++i) {
+               sde = &dd->per_sdma[i];
+               sdma_process_event(sde, sdma_event_e10_go_hw_start);
+       }
+}
+
+/**
+ * sdma_exit() - used when module is removed
+ * @dd: hfi1_devdata
+ */
+void sdma_exit(struct hfi1_devdata *dd)
+{
+       unsigned this_idx;
+       struct sdma_engine *sde;
+
+       for (this_idx = 0; dd->per_sdma && this_idx < dd->num_sdma;
+                       ++this_idx) {
+               sde = &dd->per_sdma[this_idx];
+               if (!list_empty(&sde->dmawait))
+                       dd_dev_err(dd, "sde %u: dmawait list not empty!\n",
+                                  sde->this_idx);
+               sdma_process_event(sde, sdma_event_e00_go_hw_down);
+
+               del_timer_sync(&sde->err_progress_check_timer);
+
+               /*
+                * This waits for the state machine to exit so it is not
+                * necessary to kill the sdma_sw_clean_up_task to make sure
+                * it is not running.
+                */
+               sdma_finalput(&sde->state);
+       }
+       sdma_clean(dd, dd->num_sdma);
+}
+
+/*
+ * unmap the indicated descriptor
+ */
+static inline void sdma_unmap_desc(
+       struct hfi1_devdata *dd,
+       struct sdma_desc *descp)
+{
+       switch (sdma_mapping_type(descp)) {
+       case SDMA_MAP_SINGLE:
+               dma_unmap_single(
+                       &dd->pcidev->dev,
+                       sdma_mapping_addr(descp),
+                       sdma_mapping_len(descp),
+                       DMA_TO_DEVICE);
+               break;
+       case SDMA_MAP_PAGE:
+               dma_unmap_page(
+                       &dd->pcidev->dev,
+                       sdma_mapping_addr(descp),
+                       sdma_mapping_len(descp),
+                       DMA_TO_DEVICE);
+               break;
+       }
+}
+
+/*
+ * return the mode as indicated by the first
+ * descriptor in the tx.
+ */
+static inline u8 ahg_mode(struct sdma_txreq *tx)
+{
+       return (tx->descp[0].qw[1] & SDMA_DESC1_HEADER_MODE_SMASK)
+               >> SDMA_DESC1_HEADER_MODE_SHIFT;
+}
+
+/**
+ * sdma_txclean() - clean tx of mappings, descp *kmalloc's
+ * @dd: hfi1_devdata for unmapping
+ * @tx: tx request to clean
+ *
+ * This is used in the progress routine to clean the tx or
+ * by the ULP to toss an in-process tx build.
+ *
+ * The code can be called multiple times without issue.
+ *
+ */
+void sdma_txclean(
+       struct hfi1_devdata *dd,
+       struct sdma_txreq *tx)
+{
+       u16 i;
+
+       if (tx->num_desc) {
+               u8 skip = 0, mode = ahg_mode(tx);
+
+               /* unmap first */
+               sdma_unmap_desc(dd, &tx->descp[0]);
+               /* determine number of AHG descriptors to skip */
+               if (mode > SDMA_AHG_APPLY_UPDATE1)
+                       skip = mode >> 1;
+               for (i = 1 + skip; i < tx->num_desc; i++)
+                       sdma_unmap_desc(dd, &tx->descp[i]);
+               tx->num_desc = 0;
+       }
+       kfree(tx->coalesce_buf);
+       tx->coalesce_buf = NULL;
+       /* kmalloc'ed descp */
+       if (unlikely(tx->desc_limit > ARRAY_SIZE(tx->descs))) {
+               tx->desc_limit = ARRAY_SIZE(tx->descs);
+               kfree(tx->descp);
+       }
+}
+
+static inline u16 sdma_gethead(struct sdma_engine *sde)
+{
+       struct hfi1_devdata *dd = sde->dd;
+       int use_dmahead;
+       u16 hwhead;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+       dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
+                  sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
+#endif
+
+retry:
+       use_dmahead = HFI1_CAP_IS_KSET(USE_SDMA_HEAD) && __sdma_running(sde) &&
+                                       (dd->flags & HFI1_HAS_SDMA_TIMEOUT);
+       hwhead = use_dmahead ?
+               (u16)le64_to_cpu(*sde->head_dma) :
+               (u16)read_sde_csr(sde, SD(HEAD));
+
+       if (unlikely(HFI1_CAP_IS_KSET(SDMA_HEAD_CHECK))) {
+               u16 cnt;
+               u16 swtail;
+               u16 swhead;
+               int sane;
+
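+               /*
+                * Sanity-check hwhead against the software head/tail
+                * window, e.g. with cnt=2048, swhead=2000 and swtail=100
+                * (wrapped), hwhead values 2000..2047 and 0..100 are sane.
+                */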
+               swhead = sde->descq_head & sde->sdma_mask;
+               /* this code is really bad for cache line trading */
+               swtail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
+               cnt = sde->descq_cnt;
+
+               if (swhead < swtail)
+                       /* not wrapped */
+                       sane = (hwhead >= swhead) & (hwhead <= swtail);
+               else if (swhead > swtail)
+                       /* wrapped around */
+                       sane = ((hwhead >= swhead) && (hwhead < cnt)) ||
+                               (hwhead <= swtail);
+               else
+                       /* empty */
+                       sane = (hwhead == swhead);
+
+               if (unlikely(!sane)) {
+                       dd_dev_err(dd, "SDMA(%u) bad head (%s) hwhd=%hu swhd=%hu swtl=%hu cnt=%hu\n",
+                                  sde->this_idx,
+                                  use_dmahead ? "dma" : "kreg",
+                                  hwhead, swhead, swtail, cnt);
+                       if (use_dmahead) {
+                               /* try one more time, using csr */
+                               use_dmahead = 0;
+                               goto retry;
+                       }
+                       /* proceed as if no progress */
+                       hwhead = swhead;
+               }
+       }
+       return hwhead;
+}
+
+/*
+ * This is called when there are send DMA descriptors that might be
+ * available.
+ *
+ * This is called with head_lock held.
+ */
+static void sdma_desc_avail(struct sdma_engine *sde, unsigned avail)
+{
+       struct iowait *wait, *nw;
+       struct iowait *waits[SDMA_WAIT_BATCH_SIZE];
+       unsigned i, n = 0, seq;
+       struct sdma_txreq *stx;
+       struct hfi1_ibdev *dev = &sde->dd->verbs_dev;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+       dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
+                  slashstrip(__FILE__), __LINE__, __func__);
+       dd_dev_err(sde->dd, "avail: %u\n", avail);
+#endif
+
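+       /*
+        * Read-side seqlock loop: peek at the dmawait list and only take
+        * the write lock when there is at least one waiter to harvest.
+        */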
+       do {
+               seq = read_seqbegin(&dev->iowait_lock);
+               if (!list_empty(&sde->dmawait)) {
+                       /* at least one item */
+                       write_seqlock(&dev->iowait_lock);
+                       /* Harvest waiters wanting DMA descriptors */
+                       list_for_each_entry_safe(
+                                       wait,
+                                       nw,
+                                       &sde->dmawait,
+                                       list) {
+                               u16 num_desc = 0;
+
+                               if (!wait->wakeup)
+                                       continue;
+                               if (n == ARRAY_SIZE(waits))
+                                       break;
+                               if (!list_empty(&wait->tx_head)) {
+                                       stx = list_first_entry(
+                                               &wait->tx_head,
+                                               struct sdma_txreq,
+                                               list);
+                                       num_desc = stx->num_desc;
+                               }
+                               if (num_desc > avail)
+                                       break;
+                               avail -= num_desc;
+                               list_del_init(&wait->list);
+                               waits[n++] = wait;
+                       }
+                       write_sequnlock(&dev->iowait_lock);
+                       break;
+               }
+       } while (read_seqretry(&dev->iowait_lock, seq));
+
+       for (i = 0; i < n; i++)
+               waits[i]->wakeup(waits[i], SDMA_AVAIL_REASON);
+}
+
+/* head_lock must be held */
+static void sdma_make_progress(struct sdma_engine *sde, u64 status)
+{
+       struct sdma_txreq *txp = NULL;
+       int progress = 0;
+       u16 hwhead, swhead;
+       int idle_check_done = 0;
+
+       hwhead = sdma_gethead(sde);
+
+       /* The reason for some of the complexity of this code is that
+        * not all descriptors have corresponding txps.  So, we have to
+        * be able to skip over descs until we wander into the range of
+        * the next txp on the list.
+        */
+
+retry:
+       txp = get_txhead(sde);
+       swhead = sde->descq_head & sde->sdma_mask;
+       trace_hfi1_sdma_progress(sde, hwhead, swhead, txp);
+       while (swhead != hwhead) {
+               /* advance head, wrap if needed */
+               swhead = ++sde->descq_head & sde->sdma_mask;
+
+               /* if now past this txp's descs, do the callback */
+               if (txp && txp->next_descq_idx == swhead) {
+                       /* remove from list */
+                       sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL;
+                       complete_tx(sde, txp, SDMA_TXREQ_S_OK);
+                       /* see if there is another txp */
+                       txp = get_txhead(sde);
+               }
+               trace_hfi1_sdma_progress(sde, hwhead, swhead, txp);
+               progress++;
+       }
+
+       /*
+        * The SDMA idle interrupt is not guaranteed to be ordered with respect
+        * to updates to the dma_head location in host memory. The head
+        * value read might not be fully up to date. If there are pending
+        * descriptors and the SDMA idle interrupt fired, then read from the
+        * CSR SDMA head instead to get the latest value from the hardware.
+        * The hardware SDMA head should be read at most once in this
+        * invocation of sdma_make_progress(), which is ensured by the
+        * idle_check_done flag.
+        */
+       if ((status & sde->idle_mask) && !idle_check_done) {
+               u16 swtail;
+
+               swtail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
+               if (swtail != hwhead) {
+                       hwhead = (u16)read_sde_csr(sde, SD(HEAD));
+                       idle_check_done = 1;
+                       goto retry;
+               }
+       }
+
+       sde->last_status = status;
+       if (progress)
+               sdma_desc_avail(sde, sdma_descq_freecnt(sde));
+}
+
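sdma_make_progress() walks the software head toward the hardware head using a free-running counter masked by the power-of-two ring size. A small standalone sketch of that wraparound arithmetic (the ring size and starting values below are made up for illustration):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        const uint16_t descq_cnt = 8;       /* assumed power-of-two ring size */
        const uint16_t sdma_mask = descq_cnt - 1;
        uint16_t descq_head = 6;            /* free-running software head counter */
        const uint16_t hwhead = 2;          /* hardware head, already masked */
        uint16_t swhead = descq_head & sdma_mask;

        while (swhead != hwhead) {
            /* advance head, wrap if needed */
            swhead = ++descq_head & sdma_mask;
            printf("retired descriptor, swhead is now %u\n", swhead);
        }
        return 0;
    }
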
+/*
+ * sdma_engine_interrupt() - interrupt handler for engine
+ * @sde: sdma engine
+ * @status: sdma interrupt reason
+ *
+ * Status is a mask of the 3 possible interrupts for this engine.  It will
+ * contain bits _only_ for this SDMA engine.  It will contain at least one
+ * bit; it may contain more.
+ */
+void sdma_engine_interrupt(struct sdma_engine *sde, u64 status)
+{
+       trace_hfi1_sdma_engine_interrupt(sde, status);
+       write_seqlock(&sde->head_lock);
+       sdma_set_desc_cnt(sde, sdma_desct_intr);
+       if (status & sde->idle_mask)
+               sde->idle_int_cnt++;
+       else if (status & sde->progress_mask)
+               sde->progress_int_cnt++;
+       else if (status & sde->int_mask)
+               sde->sdma_int_cnt++;
+       sdma_make_progress(sde, status);
+       write_sequnlock(&sde->head_lock);
+}
+
+/**
+ * sdma_engine_error() - error handler for engine
+ * @sde: sdma engine
+ * @status: sdma interrupt reason
+ */
+void sdma_engine_error(struct sdma_engine *sde, u64 status)
+{
+       unsigned long flags;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+       dd_dev_err(sde->dd, "CONFIG SDMA(%u) error status 0x%llx state %s\n",
+                  sde->this_idx,
+                  (unsigned long long)status,
+                  sdma_state_names[sde->state.current_state]);
+#endif
+       spin_lock_irqsave(&sde->tail_lock, flags);
+       write_seqlock(&sde->head_lock);
+       if (status & ALL_SDMA_ENG_HALT_ERRS)
+               __sdma_process_event(sde, sdma_event_e60_hw_halted);
+       if (status & ~SD(ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK)) {
+               dd_dev_err(sde->dd,
+                          "SDMA (%u) engine error: 0x%llx state %s\n",
+                          sde->this_idx,
+                          (unsigned long long)status,
+                          sdma_state_names[sde->state.current_state]);
+               dump_sdma_state(sde);
+       }
+       write_sequnlock(&sde->head_lock);
+       spin_unlock_irqrestore(&sde->tail_lock, flags);
+}
+
+static void sdma_sendctrl(struct sdma_engine *sde, unsigned op)
+{
+       u64 set_senddmactrl = 0;
+       u64 clr_senddmactrl = 0;
+       unsigned long flags;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+       dd_dev_err(sde->dd, "CONFIG SDMA(%u) senddmactrl E=%d I=%d H=%d C=%d\n",
+                  sde->this_idx,
+                  (op & SDMA_SENDCTRL_OP_ENABLE) ? 1 : 0,
+                  (op & SDMA_SENDCTRL_OP_INTENABLE) ? 1 : 0,
+                  (op & SDMA_SENDCTRL_OP_HALT) ? 1 : 0,
+                  (op & SDMA_SENDCTRL_OP_CLEANUP) ? 1 : 0);
+#endif
+
+       if (op & SDMA_SENDCTRL_OP_ENABLE)
+               set_senddmactrl |= SD(CTRL_SDMA_ENABLE_SMASK);
+       else
+               clr_senddmactrl |= SD(CTRL_SDMA_ENABLE_SMASK);
+
+       if (op & SDMA_SENDCTRL_OP_INTENABLE)
+               set_senddmactrl |= SD(CTRL_SDMA_INT_ENABLE_SMASK);
+       else
+               clr_senddmactrl |= SD(CTRL_SDMA_INT_ENABLE_SMASK);
+
+       if (op & SDMA_SENDCTRL_OP_HALT)
+               set_senddmactrl |= SD(CTRL_SDMA_HALT_SMASK);
+       else
+               clr_senddmactrl |= SD(CTRL_SDMA_HALT_SMASK);
+
+       spin_lock_irqsave(&sde->senddmactrl_lock, flags);
+
+       sde->p_senddmactrl |= set_senddmactrl;
+       sde->p_senddmactrl &= ~clr_senddmactrl;
+
+       if (op & SDMA_SENDCTRL_OP_CLEANUP)
+               write_sde_csr(sde, SD(CTRL),
+                             sde->p_senddmactrl |
+                             SD(CTRL_SDMA_CLEANUP_SMASK));
+       else
+               write_sde_csr(sde, SD(CTRL), sde->p_senddmactrl);
+
+       spin_unlock_irqrestore(&sde->senddmactrl_lock, flags);
+
+#ifdef CONFIG_SDMA_VERBOSITY
+       sdma_dumpstate(sde);
+#endif
+}
+
+static void sdma_setlengen(struct sdma_engine *sde)
+{
+#ifdef CONFIG_SDMA_VERBOSITY
+       dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
+                  sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
+#endif
+
+       /*
+        * Set SendDmaLenGen and clear-then-set the MSB of the generation
+        * count to enable generation checking and load the internal
+        * generation counter.
+        */
+       write_sde_csr(sde, SD(LEN_GEN),
+                     (sde->descq_cnt / 64) << SD(LEN_GEN_LENGTH_SHIFT));
+       write_sde_csr(sde, SD(LEN_GEN),
+                     ((sde->descq_cnt / 64) << SD(LEN_GEN_LENGTH_SHIFT)) |
+                     (4ULL << SD(LEN_GEN_GENERATION_SHIFT)));
+}
+
+static inline void sdma_update_tail(struct sdma_engine *sde, u16 tail)
+{
+       /* Commit writes to memory and advance the tail on the chip */
+       smp_wmb(); /* see get_txhead() */
+       writeq(tail, sde->tail_csr);
+}
+
+/*
+ * This is called when changing to state s10_hw_start_up_halt_wait as
+ * a result of send buffer errors or send DMA descriptor errors.
+ */
+static void sdma_hw_start_up(struct sdma_engine *sde)
+{
+       u64 reg;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+       dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
+                  sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
+#endif
+
+       sdma_setlengen(sde);
+       sdma_update_tail(sde, 0); /* Set SendDmaTail */
+       *sde->head_dma = 0;
+
+       reg = SD(ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_MASK) <<
+             SD(ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SHIFT);
+       write_sde_csr(sde, SD(ENG_ERR_CLEAR), reg);
+}
+
+#define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \
+(r &= ~SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
+
+#define SET_STATIC_RATE_CONTROL_SMASK(r) \
+(r |= SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
+/*
+ * set_sdma_integrity
+ *
+ * Set the SEND_DMA_CHECK_ENABLE register for send DMA engine 'sde'.
+ */
+static void set_sdma_integrity(struct sdma_engine *sde)
+{
+       struct hfi1_devdata *dd = sde->dd;
+       u64 reg;
+
+       if (unlikely(HFI1_CAP_IS_KSET(NO_INTEGRITY)))
+               return;
+
+       reg = hfi1_pkt_base_sdma_integrity(dd);
+
+       if (HFI1_CAP_IS_KSET(STATIC_RATE_CTRL))
+               CLEAR_STATIC_RATE_CONTROL_SMASK(reg);
+       else
+               SET_STATIC_RATE_CONTROL_SMASK(reg);
+
+       write_sde_csr(sde, SD(CHECK_ENABLE), reg);
+}
+
+static void init_sdma_regs(
+       struct sdma_engine *sde,
+       u32 credits,
+       uint idle_cnt)
+{
+       u8 opval, opmask;
+#ifdef CONFIG_SDMA_VERBOSITY
+       struct hfi1_devdata *dd = sde->dd;
+
+       dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n",
+                  sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
+#endif
+
+       write_sde_csr(sde, SD(BASE_ADDR), sde->descq_phys);
+       sdma_setlengen(sde);
+       sdma_update_tail(sde, 0); /* Set SendDmaTail */
+       write_sde_csr(sde, SD(RELOAD_CNT), idle_cnt);
+       write_sde_csr(sde, SD(DESC_CNT), 0);
+       write_sde_csr(sde, SD(HEAD_ADDR), sde->head_phys);
+       write_sde_csr(sde, SD(MEMORY),
+                     ((u64)credits << SD(MEMORY_SDMA_MEMORY_CNT_SHIFT)) |
+                     ((u64)(credits * sde->this_idx) <<
+                      SD(MEMORY_SDMA_MEMORY_INDEX_SHIFT)));
+       write_sde_csr(sde, SD(ENG_ERR_MASK), ~0ull);
+       set_sdma_integrity(sde);
+       opmask = OPCODE_CHECK_MASK_DISABLED;
+       opval = OPCODE_CHECK_VAL_DISABLED;
+       write_sde_csr(sde, SD(CHECK_OPCODE),
+                     (opmask << SEND_CTXT_CHECK_OPCODE_MASK_SHIFT) |
+                     (opval << SEND_CTXT_CHECK_OPCODE_VALUE_SHIFT));
+}
+
+#ifdef CONFIG_SDMA_VERBOSITY
+
+#define sdma_dumpstate_helper0(reg) do { \
+               csr = read_csr(sde->dd, reg); \
+               dd_dev_err(sde->dd, "%36s     0x%016llx\n", #reg, csr); \
+       } while (0)
+
+#define sdma_dumpstate_helper(reg) do { \
+               csr = read_sde_csr(sde, reg); \
+               dd_dev_err(sde->dd, "%36s[%02u] 0x%016llx\n", \
+                       #reg, sde->this_idx, csr); \
+       } while (0)
+
+#define sdma_dumpstate_helper2(reg) do { \
+               csr = read_csr(sde->dd, reg + (8 * i)); \
+               dd_dev_err(sde->dd, "%33s_%02u     0x%016llx\n", \
+                               #reg, i, csr); \
+       } while (0)
+
+void sdma_dumpstate(struct sdma_engine *sde)
+{
+       u64 csr;
+       unsigned i;
+
+       sdma_dumpstate_helper(SD(CTRL));
+       sdma_dumpstate_helper(SD(STATUS));
+       sdma_dumpstate_helper0(SD(ERR_STATUS));
+       sdma_dumpstate_helper0(SD(ERR_MASK));
+       sdma_dumpstate_helper(SD(ENG_ERR_STATUS));
+       sdma_dumpstate_helper(SD(ENG_ERR_MASK));
+
+       for (i = 0; i < CCE_NUM_INT_CSRS; ++i) {
+               sdma_dumpstate_helper2(CCE_INT_STATUS);
+               sdma_dumpstate_helper2(CCE_INT_MASK);
+               sdma_dumpstate_helper2(CCE_INT_BLOCKED);
+       }
+
+       sdma_dumpstate_helper(SD(TAIL));
+       sdma_dumpstate_helper(SD(HEAD));
+       sdma_dumpstate_helper(SD(PRIORITY_THLD));
+       sdma_dumpstate_helper(SD(IDLE_CNT));
+       sdma_dumpstate_helper(SD(RELOAD_CNT));
+       sdma_dumpstate_helper(SD(DESC_CNT));
+       sdma_dumpstate_helper(SD(DESC_FETCHED_CNT));
+       sdma_dumpstate_helper(SD(MEMORY));
+       sdma_dumpstate_helper0(SD(ENGINES));
+       sdma_dumpstate_helper0(SD(MEM_SIZE));
+       /* sdma_dumpstate_helper(SEND_EGRESS_SEND_DMA_STATUS);  */
+       sdma_dumpstate_helper(SD(BASE_ADDR));
+       sdma_dumpstate_helper(SD(LEN_GEN));
+       sdma_dumpstate_helper(SD(HEAD_ADDR));
+       sdma_dumpstate_helper(SD(CHECK_ENABLE));
+       sdma_dumpstate_helper(SD(CHECK_VL));
+       sdma_dumpstate_helper(SD(CHECK_JOB_KEY));
+       sdma_dumpstate_helper(SD(CHECK_PARTITION_KEY));
+       sdma_dumpstate_helper(SD(CHECK_SLID));
+       sdma_dumpstate_helper(SD(CHECK_OPCODE));
+}
+#endif
+
+static void dump_sdma_state(struct sdma_engine *sde)
+{
+       struct hw_sdma_desc *descq;
+       struct hw_sdma_desc *descqp;
+       u64 desc[2];
+       u64 addr;
+       u8 gen;
+       u16 len;
+       u16 head, tail, cnt;
+
+       head = sde->descq_head & sde->sdma_mask;
+       tail = sde->descq_tail & sde->sdma_mask;
+       cnt = sdma_descq_freecnt(sde);
+       descq = sde->descq;
+
+       dd_dev_err(sde->dd,
+                  "SDMA (%u) descq_head: %u descq_tail: %u freecnt: %u FLE %d\n",
+                  sde->this_idx, head, tail, cnt,
+                  !list_empty(&sde->flushlist));
+
+       /* print info for each entry in the descriptor queue */
+       while (head != tail) {
+               char flags[6] = { 'x', 'x', 'x', 'x', 0 };
+
+               descqp = &sde->descq[head];
+               desc[0] = le64_to_cpu(descqp->qw[0]);
+               desc[1] = le64_to_cpu(descqp->qw[1]);
+               flags[0] = (desc[1] & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-';
+               flags[1] = (desc[1] & SDMA_DESC1_HEAD_TO_HOST_FLAG) ?
+                               'H' : '-';
+               flags[2] = (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-';
+               flags[3] = (desc[0] & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-';
+               addr = (desc[0] >> SDMA_DESC0_PHY_ADDR_SHIFT)
+                       & SDMA_DESC0_PHY_ADDR_MASK;
+               gen = (desc[1] >> SDMA_DESC1_GENERATION_SHIFT)
+                       & SDMA_DESC1_GENERATION_MASK;
+               len = (desc[0] >> SDMA_DESC0_BYTE_COUNT_SHIFT)
+                       & SDMA_DESC0_BYTE_COUNT_MASK;
+               dd_dev_err(sde->dd,
+                          "SDMA sdmadesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n",
+                          head, flags, addr, gen, len);
+               dd_dev_err(sde->dd,
+                          "\tdesc0:0x%016llx desc1 0x%016llx\n",
+                          desc[0], desc[1]);
+               if (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG)
+                       dd_dev_err(sde->dd,
+                                  "\taidx: %u amode: %u alen: %u\n",
+                                  (u8)((desc[1] &
+                                        SDMA_DESC1_HEADER_INDEX_SMASK) >>
+                                       SDMA_DESC1_HEADER_INDEX_SHIFT),
+                                  (u8)((desc[1] &
+                                        SDMA_DESC1_HEADER_MODE_SMASK) >>
+                                       SDMA_DESC1_HEADER_MODE_SHIFT),
+                                  (u8)((desc[1] &
+                                        SDMA_DESC1_HEADER_DWS_SMASK) >>
+                                       SDMA_DESC1_HEADER_DWS_SHIFT));
+               head++;
+               head &= sde->sdma_mask;
+       }
+}
+
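dump_sdma_state() above (and sdma_seqfile_dump_sde() below) recover the address, generation, and byte count of each descriptor by shifting and masking the two quadwords. The SDMA_DESC0_*/SDMA_DESC1_* constants are defined elsewhere in the driver; the shift and mask values in this standalone sketch are assumptions chosen only to show the decode pattern:

    #include <stdint.h>
    #include <stdio.h>

    /* assumed layout for illustration; the real SDMA_DESC0_* values may differ */
    #define BYTE_COUNT_SHIFT 48
    #define BYTE_COUNT_MASK  0x3fffULL
    #define PHY_ADDR_SHIFT   0
    #define PHY_ADDR_MASK    0xffffffffffffULL

    int main(void)
    {
        /* build a descriptor qword describing a 100-byte buffer at address 0x1000 */
        uint64_t desc0 = (100ULL << BYTE_COUNT_SHIFT) |
                         (0x1000ULL << PHY_ADDR_SHIFT);

        printf("addr=0x%llx len=%llu bytes\n",
               (unsigned long long)((desc0 >> PHY_ADDR_SHIFT) & PHY_ADDR_MASK),
               (unsigned long long)((desc0 >> BYTE_COUNT_SHIFT) & BYTE_COUNT_MASK));
        return 0;
    }
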
+#define SDE_FMT \
+       "SDE %u CPU %d STE %s C 0x%llx S 0x%016llx E 0x%llx T(HW) 0x%llx T(SW) 0x%x H(HW) 0x%llx H(SW) 0x%x H(D) 0x%llx DM 0x%llx GL 0x%llx R 0x%llx LIS 0x%llx AHGI 0x%llx TXT %u TXH %u DT %u DH %u FLNE %d DQF %u SLC 0x%llx\n"
+/**
+ * sdma_seqfile_dump_sde() - debugfs dump of sde
+ * @s: seq file
+ * @sde: send dma engine to dump
+ *
+ * This routine dumps the sde to the indicated seq file.
+ */
+void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *sde)
+{
+       u16 head, tail;
+       struct hw_sdma_desc *descqp;
+       u64 desc[2];
+       u64 addr;
+       u8 gen;
+       u16 len;
+
+       head = sde->descq_head & sde->sdma_mask;
+       tail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
+       seq_printf(s, SDE_FMT, sde->this_idx,
+                  sde->cpu,
+                  sdma_state_name(sde->state.current_state),
+                  (unsigned long long)read_sde_csr(sde, SD(CTRL)),
+                  (unsigned long long)read_sde_csr(sde, SD(STATUS)),
+                  (unsigned long long)read_sde_csr(sde, SD(ENG_ERR_STATUS)),
+                  (unsigned long long)read_sde_csr(sde, SD(TAIL)), tail,
+                  (unsigned long long)read_sde_csr(sde, SD(HEAD)), head,
+                  (unsigned long long)le64_to_cpu(*sde->head_dma),
+                  (unsigned long long)read_sde_csr(sde, SD(MEMORY)),
+                  (unsigned long long)read_sde_csr(sde, SD(LEN_GEN)),
+                  (unsigned long long)read_sde_csr(sde, SD(RELOAD_CNT)),
+                  (unsigned long long)sde->last_status,
+                  (unsigned long long)sde->ahg_bits,
+                  sde->tx_tail,
+                  sde->tx_head,
+                  sde->descq_tail,
+                  sde->descq_head,
+                  !list_empty(&sde->flushlist),
+                  sde->descq_full_count,
+                  (unsigned long long)read_sde_csr(sde, SEND_DMA_CHECK_SLID));
+
+       /* print info for each entry in the descriptor queue */
+       while (head != tail) {
+               char flags[6] = { 'x', 'x', 'x', 'x', 0 };
+
+               descqp = &sde->descq[head];
+               desc[0] = le64_to_cpu(descqp->qw[0]);
+               desc[1] = le64_to_cpu(descqp->qw[1]);
+               flags[0] = (desc[1] & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-';
+               flags[1] = (desc[1] & SDMA_DESC1_HEAD_TO_HOST_FLAG) ?
+                               'H' : '-';
+               flags[2] = (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-';
+               flags[3] = (desc[0] & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-';
+               addr = (desc[0] >> SDMA_DESC0_PHY_ADDR_SHIFT)
+                       & SDMA_DESC0_PHY_ADDR_MASK;
+               gen = (desc[1] >> SDMA_DESC1_GENERATION_SHIFT)
+                       & SDMA_DESC1_GENERATION_MASK;
+               len = (desc[0] >> SDMA_DESC0_BYTE_COUNT_SHIFT)
+                       & SDMA_DESC0_BYTE_COUNT_MASK;
+               seq_printf(s,
+                          "\tdesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n",
+                          head, flags, addr, gen, len);
+               if (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG)
+                       seq_printf(s, "\t\tahgidx: %u ahgmode: %u\n",
+                                  (u8)((desc[1] &
+                                        SDMA_DESC1_HEADER_INDEX_SMASK) >>
+                                       SDMA_DESC1_HEADER_INDEX_SHIFT),
+                                  (u8)((desc[1] &
+                                        SDMA_DESC1_HEADER_MODE_SMASK) >>
+                                       SDMA_DESC1_HEADER_MODE_SHIFT));
+               head = (head + 1) & sde->sdma_mask;
+       }
+}
+
+/*
+ * add the generation number into
+ * the qw1 and return
+ */
+static inline u64 add_gen(struct sdma_engine *sde, u64 qw1)
+{
+       u8 generation = (sde->descq_tail >> sde->sdma_shift) & 3;
+
+       qw1 &= ~SDMA_DESC1_GENERATION_SMASK;
+       qw1 |= ((u64)generation & SDMA_DESC1_GENERATION_MASK)
+                       << SDMA_DESC1_GENERATION_SHIFT;
+       return qw1;
+}
+
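add_gen() derives the 2-bit generation field from the free-running tail counter, so the value changes each time the tail wraps the ring and lets the hardware detect stale descriptors. A standalone sketch of that computation, assuming an 8-entry ring (sdma_shift = 3):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        const unsigned sdma_shift = 3;              /* assumed: ring of 1 << 3 entries */
        const uint16_t sdma_mask = (1 << sdma_shift) - 1;
        uint16_t descq_tail;

        for (descq_tail = 0; descq_tail < 2u << sdma_shift; descq_tail++) {
            uint8_t generation = (descq_tail >> sdma_shift) & 3;

            printf("tail=%2u slot=%u gen=%u\n",
                   descq_tail, descq_tail & sdma_mask, generation);
        }
        return 0;
    }
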
+/*
+ * This routine submits the indicated tx
+ *
+ * Space has already been guaranteed and
+ * tail side of ring is locked.
+ *
+ * The hardware tail update is done
+ * in the caller and that is facilitated
+ * by returning the new tail.
+ *
+ * There is special case logic for ahg
+ * to not add the generation number for
+ * up to 2 descriptors that follow the
+ * first descriptor.
+ *
+ */
+static inline u16 submit_tx(struct sdma_engine *sde, struct sdma_txreq *tx)
+{
+       int i;
+       u16 tail;
+       struct sdma_desc *descp = tx->descp;
+       u8 skip = 0, mode = ahg_mode(tx);
+
+       tail = sde->descq_tail & sde->sdma_mask;
+       sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]);
+       sde->descq[tail].qw[1] = cpu_to_le64(add_gen(sde, descp->qw[1]));
+       trace_hfi1_sdma_descriptor(sde, descp->qw[0], descp->qw[1],
+                                  tail, &sde->descq[tail]);
+       tail = ++sde->descq_tail & sde->sdma_mask;
+       descp++;
+       if (mode > SDMA_AHG_APPLY_UPDATE1)
+               skip = mode >> 1;
+       for (i = 1; i < tx->num_desc; i++, descp++) {
+               u64 qw1;
+
+               sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]);
+               if (skip) {
+                       /* edits don't have generation */
+                       qw1 = descp->qw[1];
+                       skip--;
+               } else {
+                       /* replace generation with real one for non-edits */
+                       qw1 = add_gen(sde, descp->qw[1]);
+               }
+               sde->descq[tail].qw[1] = cpu_to_le64(qw1);
+               trace_hfi1_sdma_descriptor(sde, descp->qw[0], qw1,
+                                          tail, &sde->descq[tail]);
+               tail = ++sde->descq_tail & sde->sdma_mask;
+       }
+       tx->next_descq_idx = tail;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+       tx->sn = sde->tail_sn++;
+       trace_hfi1_sdma_in_sn(sde, tx->sn);
+       WARN_ON_ONCE(sde->tx_ring[sde->tx_tail & sde->sdma_mask]);
+#endif
+       sde->tx_ring[sde->tx_tail++ & sde->sdma_mask] = tx;
+       sde->desc_avail -= tx->num_desc;
+       return tail;
+}
+
+/*
+ * Check for progress
+ */
+static int sdma_check_progress(
+       struct sdma_engine *sde,
+       struct iowait *wait,
+       struct sdma_txreq *tx)
+{
+       int ret;
+
+       sde->desc_avail = sdma_descq_freecnt(sde);
+       if (tx->num_desc <= sde->desc_avail)
+               return -EAGAIN;
+       /* pulse the head_lock */
+       if (wait && wait->sleep) {
+               unsigned seq;
+
+               seq = raw_seqcount_begin(
+                       (const seqcount_t *)&sde->head_lock.seqcount);
+               ret = wait->sleep(sde, wait, tx, seq);
+               if (ret == -EAGAIN)
+                       sde->desc_avail = sdma_descq_freecnt(sde);
+       } else {
+               ret = -EBUSY;
+       }
+       return ret;
+}
+
+/**
+ * sdma_send_txreq() - submit a tx req to ring
+ * @sde: sdma engine to use
+ * @wait: wait structure to use when full (may be NULL)
+ * @tx: sdma_txreq to submit
+ *
+ * The call submits the tx into the ring.  If an iowait structure is non-NULL
+ * the packet will be queued to the list in wait.
+ *
+ * Return:
+ * 0 - Success,
+ * -EINVAL - sdma_txreq incomplete,
+ * -EBUSY - no space in ring (wait == NULL),
+ * -EIOCBQUEUED - tx queued to iowait,
+ * -ECOMM - bad sdma state
+ */
+int sdma_send_txreq(struct sdma_engine *sde,
+                   struct iowait *wait,
+                   struct sdma_txreq *tx)
+{
+       int ret = 0;
+       u16 tail;
+       unsigned long flags;
+
+       /* user should have supplied entire packet */
+       if (unlikely(tx->tlen))
+               return -EINVAL;
+       tx->wait = wait;
+       spin_lock_irqsave(&sde->tail_lock, flags);
+retry:
+       if (unlikely(!__sdma_running(sde)))
+               goto unlock_noconn;
+       if (unlikely(tx->num_desc > sde->desc_avail))
+               goto nodesc;
+       tail = submit_tx(sde, tx);
+       if (wait)
+               iowait_sdma_inc(wait);
+       sdma_update_tail(sde, tail);
+unlock:
+       spin_unlock_irqrestore(&sde->tail_lock, flags);
+       return ret;
+unlock_noconn:
+       if (wait)
+               iowait_sdma_inc(wait);
+       tx->next_descq_idx = 0;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+       tx->sn = sde->tail_sn++;
+       trace_hfi1_sdma_in_sn(sde, tx->sn);
+#endif
+       spin_lock(&sde->flushlist_lock);
+       list_add_tail(&tx->list, &sde->flushlist);
+       spin_unlock(&sde->flushlist_lock);
+       if (wait) {
+               wait->tx_count++;
+               wait->count += tx->num_desc;
+       }
+       schedule_work(&sde->flush_worker);
+       ret = -ECOMM;
+       goto unlock;
+nodesc:
+       ret = sdma_check_progress(sde, wait, tx);
+       if (ret == -EAGAIN) {
+               ret = 0;
+               goto retry;
+       }
+       sde->descq_full_count++;
+       goto unlock;
+}
+
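A caller of sdma_send_txreq() is expected to dispatch on the return values documented above. The sketch below is a standalone, user-space illustration of that dispatch; fake_send_txreq() is a hypothetical stand-in for the real call, and EIOCBQUEUED is defined locally only because it is a kernel-internal errno value:

    #include <errno.h>
    #include <stdio.h>

    #ifndef EIOCBQUEUED
    #define EIOCBQUEUED 529     /* kernel-internal value, defined only for this sketch */
    #endif

    /* hypothetical stand-in for sdma_send_txreq(); always reports a full ring */
    static int fake_send_txreq(void)
    {
        return -EBUSY;
    }

    int main(void)
    {
        switch (fake_send_txreq()) {
        case 0:
            puts("queued to the descriptor ring");
            break;
        case -EINVAL:
            puts("txreq incomplete (tlen != 0)");
            break;
        case -EBUSY:
            puts("ring full and no iowait supplied; retry later");
            break;
        case -EIOCBQUEUED:
            puts("queued on the iowait list; will be resubmitted later");
            break;
        case -ECOMM:
            puts("engine not running; txreq moved to the flush list");
            break;
        }
        return 0;
    }
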
+/**
+ * sdma_send_txlist() - submit a list of tx req to ring
+ * @sde: sdma engine to use
+ * @wait: wait structure to use when full (may be NULL)
+ * @tx_list: list of sdma_txreqs to submit
+ *
+ * The call submits the list into the ring.
+ *
+ * If the iowait structure is non-NULL and not equal to the iowait list,
+ * the unprocessed part of the list will be appended to the list in wait.
+ *
+ * In all cases, the tx_list will be updated so the head of the tx_list is
+ * the list of descriptors that have yet to be transmitted.
+ *
+ * The intent of this call is to provide a more efficient
+ * way of submitting multiple packets to SDMA while holding the tail
+ * side lock.
+ *
+ * Return:
+ * > 0 - Success (value is the number of sdma_txreqs submitted),
+ * -EINVAL - sdma_txreq incomplete,
+ * -EBUSY - no space in ring (wait == NULL),
+ * -EIOCBQUEUED - tx queued to iowait,
+ * -ECOMM - bad sdma state
+ */
+int sdma_send_txlist(struct sdma_engine *sde, struct iowait *wait,
+                    struct list_head *tx_list)
+{
+       struct sdma_txreq *tx, *tx_next;
+       int ret = 0;
+       unsigned long flags;
+       u16 tail = INVALID_TAIL;
+       int count = 0;
+
+       spin_lock_irqsave(&sde->tail_lock, flags);
+retry:
+       list_for_each_entry_safe(tx, tx_next, tx_list, list) {
+               tx->wait = wait;
+               if (unlikely(!__sdma_running(sde)))
+                       goto unlock_noconn;
+               if (unlikely(tx->num_desc > sde->desc_avail))
+                       goto nodesc;
+               if (unlikely(tx->tlen)) {
+                       ret = -EINVAL;
+                       goto update_tail;
+               }
+               list_del_init(&tx->list);
+               tail = submit_tx(sde, tx);
+               count++;
+               if (tail != INVALID_TAIL &&
+                   (count & SDMA_TAIL_UPDATE_THRESH) == 0) {
+                       sdma_update_tail(sde, tail);
+                       tail = INVALID_TAIL;
+               }
+       }
+update_tail:
+       if (wait)
+               iowait_sdma_add(wait, count);
+       if (tail != INVALID_TAIL)
+               sdma_update_tail(sde, tail);
+       spin_unlock_irqrestore(&sde->tail_lock, flags);
+       return ret == 0 ? count : ret;
+unlock_noconn:
+       spin_lock(&sde->flushlist_lock);
+       list_for_each_entry_safe(tx, tx_next, tx_list, list) {
+               tx->wait = wait;
+               list_del_init(&tx->list);
+               if (wait)
+                       iowait_sdma_inc(wait);
+               tx->next_descq_idx = 0;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+               tx->sn = sde->tail_sn++;
+               trace_hfi1_sdma_in_sn(sde, tx->sn);
+#endif
+               list_add_tail(&tx->list, &sde->flushlist);
+               if (wait) {
+                       wait->tx_count++;
+                       wait->count += tx->num_desc;
+               }
+       }
+       spin_unlock(&sde->flushlist_lock);
+       schedule_work(&sde->flush_worker);
+       ret = -ECOMM;
+       goto update_tail;
+nodesc:
+       ret = sdma_check_progress(sde, wait, tx);
+       if (ret == -EAGAIN) {
+               ret = 0;
+               goto retry;
+       }
+       sde->descq_full_count++;
+       goto update_tail;
+}
+
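sdma_send_txlist() amortizes the tail CSR (doorbell) write: it only writes the tail when (count & SDMA_TAIL_UPDATE_THRESH) == 0, and issues one final write at the end for any remainder. A standalone sketch of that batching pattern; the threshold value below is an assumption for illustration, not necessarily the driver's constant:

    #include <stdio.h>

    #define TAIL_UPDATE_THRESH 0x1f     /* assumed mask: doorbell every 32 submissions */

    int main(void)
    {
        const int total = 100;
        int count, last = 0;

        for (count = 1; count <= total; count++) {
            if ((count & TAIL_UPDATE_THRESH) == 0) {
                printf("doorbell after %d submissions\n", count);
                last = count;
            }
        }
        /* a final doorbell covers whatever was submitted since the last one */
        if (total != last)
            printf("final doorbell flushes the remaining %d submissions\n",
                   total - last);
        return 0;
    }
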
+static void sdma_process_event(struct sdma_engine *sde, enum sdma_events event)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&sde->tail_lock, flags);
+       write_seqlock(&sde->head_lock);
+
+       __sdma_process_event(sde, event);
+
+       if (sde->state.current_state == sdma_state_s99_running)
+               sdma_desc_avail(sde, sdma_descq_freecnt(sde));
+
+       write_sequnlock(&sde->head_lock);
+       spin_unlock_irqrestore(&sde->tail_lock, flags);
+}
+
+static void __sdma_process_event(struct sdma_engine *sde,
+                                enum sdma_events event)
+{
+       struct sdma_state *ss = &sde->state;
+       int need_progress = 0;
+
+       /* CONFIG SDMA temporary */
+#ifdef CONFIG_SDMA_VERBOSITY
+       dd_dev_err(sde->dd, "CONFIG SDMA(%u) [%s] %s\n", sde->this_idx,
+                  sdma_state_names[ss->current_state],
+                  sdma_event_names[event]);
+#endif
+
+       switch (ss->current_state) {
+       case sdma_state_s00_hw_down:
+               switch (event) {
+               case sdma_event_e00_go_hw_down:
+                       break;
+               case sdma_event_e30_go_running:
+                       /*
+                        * If down, but running requested (usually a result
+                        * of link up), then we need to start up.
+                        * This can happen when hw down is requested while
+                        * bringing the link up with traffic active on the
+                        * 7220, for example.
+                        */
+                       ss->go_s99_running = 1;
+                       /* fall through and start dma engine */
+               case sdma_event_e10_go_hw_start:
+                       /* This reference means the state machine is started */
+                       sdma_get(&sde->state);
+                       sdma_set_state(sde,
+                                      sdma_state_s10_hw_start_up_halt_wait);
+                       break;
+               case sdma_event_e15_hw_halt_done:
+                       break;
+               case sdma_event_e25_hw_clean_up_done:
+                       break;
+               case sdma_event_e40_sw_cleaned:
+                       sdma_sw_tear_down(sde);
+                       break;
+               case sdma_event_e50_hw_cleaned:
+                       break;
+               case sdma_event_e60_hw_halted:
+                       break;
+               case sdma_event_e70_go_idle:
+                       break;
+               case sdma_event_e80_hw_freeze:
+                       break;
+               case sdma_event_e81_hw_frozen:
+                       break;
+               case sdma_event_e82_hw_unfreeze:
+                       break;
+               case sdma_event_e85_link_down:
+                       break;
+               case sdma_event_e90_sw_halted:
+                       break;
+               }
+               break;
+
+       case sdma_state_s10_hw_start_up_halt_wait:
+               switch (event) {
+               case sdma_event_e00_go_hw_down:
+                       sdma_set_state(sde, sdma_state_s00_hw_down);
+                       sdma_sw_tear_down(sde);
+                       break;
+               case sdma_event_e10_go_hw_start:
+                       break;
+               case sdma_event_e15_hw_halt_done:
+                       sdma_set_state(sde,
+                                      sdma_state_s15_hw_start_up_clean_wait);
+                       sdma_start_hw_clean_up(sde);
+                       break;
+               case sdma_event_e25_hw_clean_up_done:
+                       break;
+               case sdma_event_e30_go_running:
+                       ss->go_s99_running = 1;
+                       break;
+               case sdma_event_e40_sw_cleaned:
+                       break;
+               case sdma_event_e50_hw_cleaned:
+                       break;
+               case sdma_event_e60_hw_halted:
+                       schedule_work(&sde->err_halt_worker);
+                       break;
+               case sdma_event_e70_go_idle:
+                       ss->go_s99_running = 0;
+                       break;
+               case sdma_event_e80_hw_freeze:
+                       break;
+               case sdma_event_e81_hw_frozen:
+                       break;
+               case sdma_event_e82_hw_unfreeze:
+                       break;
+               case sdma_event_e85_link_down:
+                       break;
+               case sdma_event_e90_sw_halted:
+                       break;
+               }
+               break;
+
+       case sdma_state_s15_hw_start_up_clean_wait:
+               switch (event) {
+               case sdma_event_e00_go_hw_down:
+                       sdma_set_state(sde, sdma_state_s00_hw_down);
+                       sdma_sw_tear_down(sde);
+                       break;
+               case sdma_event_e10_go_hw_start:
+                       break;
+               case sdma_event_e15_hw_halt_done:
+                       break;
+               case sdma_event_e25_hw_clean_up_done:
+                       sdma_hw_start_up(sde);
+                       sdma_set_state(sde, ss->go_s99_running ?
+                                      sdma_state_s99_running :
+                                      sdma_state_s20_idle);
+                       break;
+               case sdma_event_e30_go_running:
+                       ss->go_s99_running = 1;
+                       break;
+               case sdma_event_e40_sw_cleaned:
+                       break;
+               case sdma_event_e50_hw_cleaned:
+                       break;
+               case sdma_event_e60_hw_halted:
+                       break;
+               case sdma_event_e70_go_idle:
+                       ss->go_s99_running = 0;
+                       break;
+               case sdma_event_e80_hw_freeze:
+                       break;
+               case sdma_event_e81_hw_frozen:
+                       break;
+               case sdma_event_e82_hw_unfreeze:
+                       break;
+               case sdma_event_e85_link_down:
+                       break;
+               case sdma_event_e90_sw_halted:
+                       break;
+               }
+               break;
+
+       case sdma_state_s20_idle:
+               switch (event) {
+               case sdma_event_e00_go_hw_down:
+                       sdma_set_state(sde, sdma_state_s00_hw_down);
+                       sdma_sw_tear_down(sde);
+                       break;
+               case sdma_event_e10_go_hw_start:
+                       break;
+               case sdma_event_e15_hw_halt_done:
+                       break;
+               case sdma_event_e25_hw_clean_up_done:
+                       break;
+               case sdma_event_e30_go_running:
+                       sdma_set_state(sde, sdma_state_s99_running);
+                       ss->go_s99_running = 1;
+                       break;
+               case sdma_event_e40_sw_cleaned:
+                       break;
+               case sdma_event_e50_hw_cleaned:
+                       break;
+               case sdma_event_e60_hw_halted:
+                       sdma_set_state(sde, sdma_state_s50_hw_halt_wait);
+                       schedule_work(&sde->err_halt_worker);
+                       break;
+               case sdma_event_e70_go_idle:
+                       break;
+               case sdma_event_e85_link_down:
+                       /* fall through */
+               case sdma_event_e80_hw_freeze:
+                       sdma_set_state(sde, sdma_state_s80_hw_freeze);
+                       atomic_dec(&sde->dd->sdma_unfreeze_count);
+                       wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
+                       break;
+               case sdma_event_e81_hw_frozen:
+                       break;
+               case sdma_event_e82_hw_unfreeze:
+                       break;
+               case sdma_event_e90_sw_halted:
+                       break;
+               }
+               break;
+
+       case sdma_state_s30_sw_clean_up_wait:
+               switch (event) {
+               case sdma_event_e00_go_hw_down:
+                       sdma_set_state(sde, sdma_state_s00_hw_down);
+                       break;
+               case sdma_event_e10_go_hw_start:
+                       break;
+               case sdma_event_e15_hw_halt_done:
+                       break;
+               case sdma_event_e25_hw_clean_up_done:
+                       break;
+               case sdma_event_e30_go_running:
+                       ss->go_s99_running = 1;
+                       break;
+               case sdma_event_e40_sw_cleaned:
+                       sdma_set_state(sde, sdma_state_s40_hw_clean_up_wait);
+                       sdma_start_hw_clean_up(sde);
+                       break;
+               case sdma_event_e50_hw_cleaned:
+                       break;
+               case sdma_event_e60_hw_halted:
+                       break;
+               case sdma_event_e70_go_idle:
+                       ss->go_s99_running = 0;
+                       break;
+               case sdma_event_e80_hw_freeze:
+                       break;
+               case sdma_event_e81_hw_frozen:
+                       break;
+               case sdma_event_e82_hw_unfreeze:
+                       break;
+               case sdma_event_e85_link_down:
+                       ss->go_s99_running = 0;
+                       break;
+               case sdma_event_e90_sw_halted:
+                       break;
+               }
+               break;
+
+       case sdma_state_s40_hw_clean_up_wait:
+               switch (event) {
+               case sdma_event_e00_go_hw_down:
+                       sdma_set_state(sde, sdma_state_s00_hw_down);
+                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
+                       break;
+               case sdma_event_e10_go_hw_start:
+                       break;
+               case sdma_event_e15_hw_halt_done:
+                       break;
+               case sdma_event_e25_hw_clean_up_done:
+                       sdma_hw_start_up(sde);
+                       sdma_set_state(sde, ss->go_s99_running ?
+                                      sdma_state_s99_running :
+                                      sdma_state_s20_idle);
+                       break;
+               case sdma_event_e30_go_running:
+                       ss->go_s99_running = 1;
+                       break;
+               case sdma_event_e40_sw_cleaned:
+                       break;
+               case sdma_event_e50_hw_cleaned:
+                       break;
+               case sdma_event_e60_hw_halted:
+                       break;
+               case sdma_event_e70_go_idle:
+                       ss->go_s99_running = 0;
+                       break;
+               case sdma_event_e80_hw_freeze:
+                       break;
+               case sdma_event_e81_hw_frozen:
+                       break;
+               case sdma_event_e82_hw_unfreeze:
+                       break;
+               case sdma_event_e85_link_down:
+                       ss->go_s99_running = 0;
+                       break;
+               case sdma_event_e90_sw_halted:
+                       break;
+               }
+               break;
+
+       case sdma_state_s50_hw_halt_wait:
+               switch (event) {
+               case sdma_event_e00_go_hw_down:
+                       sdma_set_state(sde, sdma_state_s00_hw_down);
+                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
+                       break;
+               case sdma_event_e10_go_hw_start:
+                       break;
+               case sdma_event_e15_hw_halt_done:
+                       sdma_set_state(sde, sdma_state_s30_sw_clean_up_wait);
+                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
+                       break;
+               case sdma_event_e25_hw_clean_up_done:
+                       break;
+               case sdma_event_e30_go_running:
+                       ss->go_s99_running = 1;
+                       break;
+               case sdma_event_e40_sw_cleaned:
+                       break;
+               case sdma_event_e50_hw_cleaned:
+                       break;
+               case sdma_event_e60_hw_halted:
+                       schedule_work(&sde->err_halt_worker);
+                       break;
+               case sdma_event_e70_go_idle:
+                       ss->go_s99_running = 0;
+                       break;
+               case sdma_event_e80_hw_freeze:
+                       break;
+               case sdma_event_e81_hw_frozen:
+                       break;
+               case sdma_event_e82_hw_unfreeze:
+                       break;
+               case sdma_event_e85_link_down:
+                       ss->go_s99_running = 0;
+                       break;
+               case sdma_event_e90_sw_halted:
+                       break;
+               }
+               break;
+
+       case sdma_state_s60_idle_halt_wait:
+               switch (event) {
+               case sdma_event_e00_go_hw_down:
+                       sdma_set_state(sde, sdma_state_s00_hw_down);
+                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
+                       break;
+               case sdma_event_e10_go_hw_start:
+                       break;
+               case sdma_event_e15_hw_halt_done:
+                       sdma_set_state(sde, sdma_state_s30_sw_clean_up_wait);
+                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
+                       break;
+               case sdma_event_e25_hw_clean_up_done:
+                       break;
+               case sdma_event_e30_go_running:
+                       ss->go_s99_running = 1;
+                       break;
+               case sdma_event_e40_sw_cleaned:
+                       break;
+               case sdma_event_e50_hw_cleaned:
+                       break;
+               case sdma_event_e60_hw_halted:
+                       schedule_work(&sde->err_halt_worker);
+                       break;
+               case sdma_event_e70_go_idle:
+                       ss->go_s99_running = 0;
+                       break;
+               case sdma_event_e80_hw_freeze:
+                       break;
+               case sdma_event_e81_hw_frozen:
+                       break;
+               case sdma_event_e82_hw_unfreeze:
+                       break;
+               case sdma_event_e85_link_down:
+                       break;
+               case sdma_event_e90_sw_halted:
+                       break;
+               }
+               break;
+
+       case sdma_state_s80_hw_freeze:
+               switch (event) {
+               case sdma_event_e00_go_hw_down:
+                       sdma_set_state(sde, sdma_state_s00_hw_down);
+                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
+                       break;
+               case sdma_event_e10_go_hw_start:
+                       break;
+               case sdma_event_e15_hw_halt_done:
+                       break;
+               case sdma_event_e25_hw_clean_up_done:
+                       break;
+               case sdma_event_e30_go_running:
+                       ss->go_s99_running = 1;
+                       break;
+               case sdma_event_e40_sw_cleaned:
+                       break;
+               case sdma_event_e50_hw_cleaned:
+                       break;
+               case sdma_event_e60_hw_halted:
+                       break;
+               case sdma_event_e70_go_idle:
+                       ss->go_s99_running = 0;
+                       break;
+               case sdma_event_e80_hw_freeze:
+                       break;
+               case sdma_event_e81_hw_frozen:
+                       sdma_set_state(sde, sdma_state_s82_freeze_sw_clean);
+                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
+                       break;
+               case sdma_event_e82_hw_unfreeze:
+                       break;
+               case sdma_event_e85_link_down:
+                       break;
+               case sdma_event_e90_sw_halted:
+                       break;
+               }
+               break;
+
+       case sdma_state_s82_freeze_sw_clean:
+               switch (event) {
+               case sdma_event_e00_go_hw_down:
+                       sdma_set_state(sde, sdma_state_s00_hw_down);
+                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
+                       break;
+               case sdma_event_e10_go_hw_start:
+                       break;
+               case sdma_event_e15_hw_halt_done:
+                       break;
+               case sdma_event_e25_hw_clean_up_done:
+                       break;
+               case sdma_event_e30_go_running:
+                       ss->go_s99_running = 1;
+                       break;
+               case sdma_event_e40_sw_cleaned:
+                       /* notify caller this engine is done cleaning */
+                       atomic_dec(&sde->dd->sdma_unfreeze_count);
+                       wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
+                       break;
+               case sdma_event_e50_hw_cleaned:
+                       break;
+               case sdma_event_e60_hw_halted:
+                       break;
+               case sdma_event_e70_go_idle:
+                       ss->go_s99_running = 0;
+                       break;
+               case sdma_event_e80_hw_freeze:
+                       break;
+               case sdma_event_e81_hw_frozen:
+                       break;
+               case sdma_event_e82_hw_unfreeze:
+                       sdma_hw_start_up(sde);
+                       sdma_set_state(sde, ss->go_s99_running ?
+                                      sdma_state_s99_running :
+                                      sdma_state_s20_idle);
+                       break;
+               case sdma_event_e85_link_down:
+                       break;
+               case sdma_event_e90_sw_halted:
+                       break;
+               }
+               break;
+
+       case sdma_state_s99_running:
+               switch (event) {
+               case sdma_event_e00_go_hw_down:
+                       sdma_set_state(sde, sdma_state_s00_hw_down);
+                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
+                       break;
+               case sdma_event_e10_go_hw_start:
+                       break;
+               case sdma_event_e15_hw_halt_done:
+                       break;
+               case sdma_event_e25_hw_clean_up_done:
+                       break;
+               case sdma_event_e30_go_running:
+                       break;
+               case sdma_event_e40_sw_cleaned:
+                       break;
+               case sdma_event_e50_hw_cleaned:
+                       break;
+               case sdma_event_e60_hw_halted:
+                       need_progress = 1;
+                       sdma_err_progress_check_schedule(sde);
+                       /* fall through */
+               case sdma_event_e90_sw_halted:
+                       /*
+                        * A SW-initiated halt does not perform the engine
+                        * progress check.
+                        */
+                       sdma_set_state(sde, sdma_state_s50_hw_halt_wait);
+                       schedule_work(&sde->err_halt_worker);
+                       break;
+               case sdma_event_e70_go_idle:
+                       sdma_set_state(sde, sdma_state_s60_idle_halt_wait);
+                       break;
+               case sdma_event_e85_link_down:
+                       ss->go_s99_running = 0;
+                       /* fall through */
+               case sdma_event_e80_hw_freeze:
+                       sdma_set_state(sde, sdma_state_s80_hw_freeze);
+                       atomic_dec(&sde->dd->sdma_unfreeze_count);
+                       wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
+                       break;
+               case sdma_event_e81_hw_frozen:
+                       break;
+               case sdma_event_e82_hw_unfreeze:
+                       break;
+               }
+               break;
+       }
+
+       ss->last_event = event;
+       if (need_progress)
+               sdma_make_progress(sde, 0);
+}
+
+/*
+ * _extend_sdma_tx_descs() - helper to extend txreq
+ *
+ * This is called once the initial nominal allocation
+ * of descriptors in the sdma_txreq is exhausted.
+ *
+ * The code will bump the allocation up to the max
+ * of MAX_DESC (64) descriptors. There doesn't seem to be
+ * much point in an interim step. The last descriptor
+ * is reserved for the coalesce buffer in order to support
+ * cases where the input packet has >MAX_DESC iovecs.
+ *
+ */
+static int _extend_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
+{
+       int i;
+
+       /* Handle last descriptor */
+       if (unlikely((tx->num_desc == (MAX_DESC - 1)))) {
+               /* if tlen is 0, it is for padding, release last descriptor */
+               if (!tx->tlen) {
+                       tx->desc_limit = MAX_DESC;
+               } else if (!tx->coalesce_buf) {
+                       /* allocate coalesce buffer with space for padding */
+                       tx->coalesce_buf = kmalloc(tx->tlen + sizeof(u32),
+                                                  GFP_ATOMIC);
+                       if (!tx->coalesce_buf)
+                               goto enomem;
+                       tx->coalesce_idx = 0;
+               }
+               return 0;
+       }
+
+       if (unlikely(tx->num_desc == MAX_DESC))
+               goto enomem;
+
+       tx->descp = kmalloc_array(
+                       MAX_DESC,
+                       sizeof(struct sdma_desc),
+                       GFP_ATOMIC);
+       if (!tx->descp)
+               goto enomem;
+
+       /* reserve last descriptor for coalescing */
+       tx->desc_limit = MAX_DESC - 1;
+       /* copy ones already built */
+       for (i = 0; i < tx->num_desc; i++)
+               tx->descp[i] = tx->descs[i];
+       return 0;
+enomem:
+       sdma_txclean(dd, tx);
+       return -ENOMEM;
+}
+
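When the embedded descriptor array in an sdma_txreq runs out, _extend_sdma_tx_descs() switches to a heap allocation sized for MAX_DESC and copies over the descriptors already built. A standalone user-space sketch of that grow-and-copy pattern (the structure layout, sizes, and names are assumptions, not the driver's types):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define NUM_INLINE 6                    /* assumed nominal embedded allocation */
    #define MAX_DESC   64

    struct desc { unsigned long long qw[2]; };

    struct txreq {
        struct desc descs[NUM_INLINE];      /* embedded descriptors */
        struct desc *descp;                 /* points at descs[] or at a heap array */
        unsigned int num_desc;
        unsigned int desc_limit;
    };

    /* grow from the embedded array to a heap array, copying what was built so far */
    static int extend(struct txreq *tx)
    {
        struct desc *p = malloc(MAX_DESC * sizeof(*p));

        if (!p)
            return -1;
        memcpy(p, tx->descs, tx->num_desc * sizeof(*p));
        tx->descp = p;
        tx->desc_limit = MAX_DESC - 1;      /* last slot reserved for a coalesce buffer */
        return 0;
    }

    int main(void)
    {
        struct txreq tx = { .num_desc = NUM_INLINE, .desc_limit = NUM_INLINE };

        tx.descp = tx.descs;
        if (!extend(&tx))
            printf("descriptor limit is now %u\n", tx.desc_limit);
        if (tx.descp != tx.descs)
            free(tx.descp);
        return 0;
    }
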
+/*
+ * ext_coal_sdma_tx_descs() - extend or coalesce sdma tx descriptors
+ *
+ * This is called once the initial nominal allocation of descriptors
+ * in the sdma_txreq is exhausted.
+ *
+ * This function calls _extend_sdma_tx_descs to extend or allocate a
+ * coalesce buffer. If there is an allocated coalesce buffer, it will
+ * copy the input packet data into the coalesce buffer. It also adds
+ * the coalesce buffer descriptor once the whole packet is received.
+ *
+ * Return:
+ * <0 - error
+ * 0 - coalescing, don't populate descriptor
+ * 1 - continue with populating descriptor
+ */
+int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx,
+                          int type, void *kvaddr, struct page *page,
+                          unsigned long offset, u16 len)
+{
+       int pad_len, rval;
+       dma_addr_t addr;
+
+       rval = _extend_sdma_tx_descs(dd, tx);
+       if (rval) {
+               sdma_txclean(dd, tx);
+               return rval;
+       }
+
+       /* If coalesce buffer is allocated, copy data into it */
+       if (tx->coalesce_buf) {
+               if (type == SDMA_MAP_NONE) {
+                       sdma_txclean(dd, tx);
+                       return -EINVAL;
+               }
+
+               if (type == SDMA_MAP_PAGE) {
+                       kvaddr = kmap(page);
+                       kvaddr += offset;
+               } else if (WARN_ON(!kvaddr)) {
+                       sdma_txclean(dd, tx);
+                       return -EINVAL;
+               }
+
+               memcpy(tx->coalesce_buf + tx->coalesce_idx, kvaddr, len);
+               tx->coalesce_idx += len;
+               if (type == SDMA_MAP_PAGE)
+                       kunmap(page);
+
+               /* If there is more data, return */
+               if (tx->tlen - tx->coalesce_idx)
+                       return 0;
+
+               /* Whole packet is received; add any padding */
+               pad_len = tx->packet_len & (sizeof(u32) - 1);
+               if (pad_len) {
+                       pad_len = sizeof(u32) - pad_len;
+                       memset(tx->coalesce_buf + tx->coalesce_idx, 0, pad_len);
+                       /* padding is taken care of for coalescing case */
+                       tx->packet_len += pad_len;
+                       tx->tlen += pad_len;
+               }
+
+               /* dma map the coalesce buffer */
+               addr = dma_map_single(&dd->pcidev->dev,
+                                     tx->coalesce_buf,
+                                     tx->tlen,
+                                     DMA_TO_DEVICE);
+
+               if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) {
+                       sdma_txclean(dd, tx);
+                       return -ENOSPC;
+               }
+
+               /* Add descriptor for coalesce buffer */
+               tx->desc_limit = MAX_DESC;
+               return _sdma_txadd_daddr(dd, SDMA_MAP_SINGLE, tx,
+                                        addr, tx->tlen);
+       }
+
+       return 1;
+}
+
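Both the coalesce path above and _pad_sdma_tx_descs() below round the packet out to a 4-byte (dword) boundary before handing it to the hardware. A standalone sketch of that padding computation:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int packet_len;

        for (packet_len = 61; packet_len <= 64; packet_len++) {
            unsigned int pad_len = packet_len & (sizeof(uint32_t) - 1);

            if (pad_len)
                pad_len = sizeof(uint32_t) - pad_len;
            printf("len=%u pad=%u padded=%u\n",
                   packet_len, pad_len, packet_len + pad_len);
        }
        return 0;
    }
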
+/* Update sdes when the lmc changes */
+void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid)
+{
+       struct sdma_engine *sde;
+       int i;
+       u64 sreg;
+
+       sreg = ((mask & SD(CHECK_SLID_MASK_MASK)) <<
+               SD(CHECK_SLID_MASK_SHIFT)) |
+               (((lid & mask) & SD(CHECK_SLID_VALUE_MASK)) <<
+               SD(CHECK_SLID_VALUE_SHIFT));
+
+       for (i = 0; i < dd->num_sdma; i++) {
+               hfi1_cdbg(LINKVERB, "SendDmaEngine[%d].SLID_CHECK = 0x%x",
+                         i, (u32)sreg);
+               sde = &dd->per_sdma[i];
+               write_sde_csr(sde, SD(CHECK_SLID), sreg);
+       }
+}
+
+/* tx not dword sized - pad */
+int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
+{
+       int rval = 0;
+
+       tx->num_desc++;
+       if ((unlikely(tx->num_desc == tx->desc_limit))) {
+               rval = _extend_sdma_tx_descs(dd, tx);
+               if (rval) {
+                       sdma_txclean(dd, tx);
+                       return rval;
+               }
+       }
+       /* finish the one just added */
+       make_tx_sdma_desc(
+               tx,
+               SDMA_MAP_NONE,
+               dd->sdma_pad_phys,
+               sizeof(u32) - (tx->packet_len & (sizeof(u32) - 1)));
+       _sdma_close_tx(dd, tx);
+       return rval;
+}
+
+/*
+ * Add ahg to the sdma_txreq
+ *
+ * The logic will consume up to 3
+ * descriptors at the beginning of
+ * sdma_txreq.
+ */
+void _sdma_txreq_ahgadd(
+       struct sdma_txreq *tx,
+       u8 num_ahg,
+       u8 ahg_entry,
+       u32 *ahg,
+       u8 ahg_hlen)
+{
+       u32 i, shift = 0, desc = 0;
+       u8 mode;
+
+       WARN_ON_ONCE(num_ahg > 9 || (ahg_hlen & 3) || ahg_hlen == 4);
+       /* compute mode */
+       if (num_ahg == 1)
+               mode = SDMA_AHG_APPLY_UPDATE1;
+       else if (num_ahg <= 5)
+               mode = SDMA_AHG_APPLY_UPDATE2;
+       else
+               mode = SDMA_AHG_APPLY_UPDATE3;
+       tx->num_desc++;
+       /* initialize the consumed descriptors to zero */
+       switch (mode) {
+       case SDMA_AHG_APPLY_UPDATE3:
+               tx->num_desc++;
+               tx->descs[2].qw[0] = 0;
+               tx->descs[2].qw[1] = 0;
+               /* FALLTHROUGH */
+       case SDMA_AHG_APPLY_UPDATE2:
+               tx->num_desc++;
+               tx->descs[1].qw[0] = 0;
+               tx->descs[1].qw[1] = 0;
+               break;
+       }
+       ahg_hlen >>= 2;
+       tx->descs[0].qw[1] |=
+               (((u64)ahg_entry & SDMA_DESC1_HEADER_INDEX_MASK)
+                       << SDMA_DESC1_HEADER_INDEX_SHIFT) |
+               (((u64)ahg_hlen & SDMA_DESC1_HEADER_DWS_MASK)
+                       << SDMA_DESC1_HEADER_DWS_SHIFT) |
+               (((u64)mode & SDMA_DESC1_HEADER_MODE_MASK)
+                       << SDMA_DESC1_HEADER_MODE_SHIFT) |
+               (((u64)ahg[0] & SDMA_DESC1_HEADER_UPDATE1_MASK)
+                       << SDMA_DESC1_HEADER_UPDATE1_SHIFT);
+       for (i = 0; i < (num_ahg - 1); i++) {
+               if (!shift && !(i & 2))
+                       desc++;
+               tx->descs[desc].qw[!!(i & 2)] |=
+                       (((u64)ahg[i + 1])
+                               << shift);
+               shift = (shift + 32) & 63;
+       }
+}
+
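The loop at the end of _sdma_txreq_ahgadd() packs the remaining 32-bit AHG update words two per quadword, alternating between qw[0] and qw[1] of the extra descriptors. A standalone sketch of that packing logic (the ahg[] values are arbitrary test patterns):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* ahg[0] is consumed by descriptor 0 separately; entries 1..8 are packed below */
        uint32_t ahg[9] = { 0, 0x11111111, 0x22222222, 0x33333333, 0x44444444,
                            0x55555555, 0x66666666, 0x77777777, 0x88888888 };
        uint64_t qw[3][2] = { { 0 } };      /* descs[0..2].qw[0..1] */
        unsigned int num_ahg = 9, i, shift = 0, desc = 0;

        for (i = 0; i < num_ahg - 1; i++) {
            if (!shift && !(i & 2))
                desc++;
            qw[desc][!!(i & 2)] |= (uint64_t)ahg[i + 1] << shift;
            shift = (shift + 32) & 63;
        }
        for (desc = 1; desc <= 2; desc++)
            printf("desc%u qw0=%016llx qw1=%016llx\n", desc,
                   (unsigned long long)qw[desc][0],
                   (unsigned long long)qw[desc][1]);
        return 0;
    }
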
+/**
+ * sdma_ahg_alloc - allocate an AHG entry
+ * @sde: engine to allocate from
+ *
+ * Return:
+ * 0-31 when successful, -EOPNOTSUPP if AHG is not enabled,
+ * -ENOSPC if an entry is not available
+ */
+int sdma_ahg_alloc(struct sdma_engine *sde)
+{
+       int nr;
+       int oldbit;
+
+       if (!sde) {
+               trace_hfi1_ahg_allocate(sde, -EINVAL);
+               return -EINVAL;
+       }
+       while (1) {
+               nr = ffz(ACCESS_ONCE(sde->ahg_bits));
+               if (nr > 31) {
+                       trace_hfi1_ahg_allocate(sde, -ENOSPC);
+                       return -ENOSPC;
+               }
+               oldbit = test_and_set_bit(nr, &sde->ahg_bits);
+               if (!oldbit)
+                       break;
+               cpu_relax();
+       }
+       trace_hfi1_ahg_allocate(sde, nr);
+       return nr;
+}
+
+/**
+ * sdma_ahg_free - free an AHG entry
+ * @sde: engine to return AHG entry
+ * @ahg_index: index to free
+ *
+ * This routine frees the indicated AHG entry.
+ */
+void sdma_ahg_free(struct sdma_engine *sde, int ahg_index)
+{
+       if (!sde)
+               return;
+       trace_hfi1_ahg_deallocate(sde, ahg_index);
+       if (ahg_index < 0 || ahg_index > 31)
+               return;
+       clear_bit(ahg_index, &sde->ahg_bits);
+}
+
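sdma_ahg_alloc() above scans ahg_bits for the first zero bit with ffz() and claims it atomically with test_and_set_bit(), retrying if it races; sdma_ahg_free() simply clears the bit. The standalone sketch below shows the same first-free-bit bookkeeping without the atomics (and is therefore not safe for concurrent callers):

    #include <stdio.h>

    /* simplified, non-atomic stand-in for ffz() + test_and_set_bit() on 32 entries */
    static int ahg_alloc(unsigned long *bits)
    {
        int nr;

        for (nr = 0; nr < 32; nr++) {
            if (!(*bits & (1UL << nr))) {
                *bits |= 1UL << nr;
                return nr;
            }
        }
        return -1;                  /* the driver returns -ENOSPC here */
    }

    static void ahg_free(unsigned long *bits, int nr)
    {
        if (nr >= 0 && nr <= 31)
            *bits &= ~(1UL << nr);
    }

    int main(void)
    {
        unsigned long bits = 0;
        int a = ahg_alloc(&bits);
        int b = ahg_alloc(&bits);

        printf("allocated entries %d and %d\n", a, b);
        ahg_free(&bits, a);
        printf("after freeing %d, the next allocation is %d\n", a, ahg_alloc(&bits));
        return 0;
    }
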
+/*
+ * SPC freeze handling for SDMA engines.  Called when the driver knows
+ * the SPC is going into a freeze but before the freeze is fully
+ * settled.  Generally an error interrupt.
+ *
+ * This event will pull the engine out of running so no more entries can be
+ * added to the engine's queue.
+ */
+void sdma_freeze_notify(struct hfi1_devdata *dd, int link_down)
+{
+       int i;
+       enum sdma_events event = link_down ? sdma_event_e85_link_down :
+                                            sdma_event_e80_hw_freeze;
+
+       /* set up the wait but do not wait here */
+       atomic_set(&dd->sdma_unfreeze_count, dd->num_sdma);
+
+       /* tell all engines to stop running and wait */
+       for (i = 0; i < dd->num_sdma; i++)
+               sdma_process_event(&dd->per_sdma[i], event);
+
+       /* sdma_freeze() will wait for all engines to have stopped */
+}
+
+/*
+ * SPC freeze handling for SDMA engines.  Called when the driver knows
+ * the SPC is fully frozen.
+ */
+void sdma_freeze(struct hfi1_devdata *dd)
+{
+       int i;
+       int ret;
+
+       /*
+        * Make sure all engines have moved out of the running state before
+        * continuing.
+        */
+       ret = wait_event_interruptible(dd->sdma_unfreeze_wq,
+                                      atomic_read(&dd->sdma_unfreeze_count) <=
+                                      0);
+       /* interrupted, or the count went negative (unloading) - just exit */
+       if (ret || atomic_read(&dd->sdma_unfreeze_count) < 0)
+               return;
+
+       /* set up the count for the next wait */
+       atomic_set(&dd->sdma_unfreeze_count, dd->num_sdma);
+
+       /* tell all engines that the SPC is frozen, they can start cleaning */
+       for (i = 0; i < dd->num_sdma; i++)
+               sdma_process_event(&dd->per_sdma[i], sdma_event_e81_hw_frozen);
+
+       /*
+        * Wait for everyone to finish software clean before exiting.  The
+        * software clean will read engine CSRs, so must be completed before
+        * the next step, which will clear the engine CSRs.
+        */
+       (void)wait_event_interruptible(dd->sdma_unfreeze_wq,
+                               atomic_read(&dd->sdma_unfreeze_count) <= 0);
+       /* no need to check results - done no matter what */
+}
+
+/*
+ * SPC freeze handling for the SDMA engines.  Called after the SPC is unfrozen.
+ *
+ * The SPC freeze acts like a SDMA halt and a hardware clean combined.  All
+ * that is left is a software clean.  We could do it after the SPC is fully
+ * frozen, but then we'd have to add another state to wait for the unfreeze.
+ * Instead, just defer the software clean until the unfreeze step.
+ */
+void sdma_unfreeze(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /* tell all engines start freeze clean up */
+       for (i = 0; i < dd->num_sdma; i++)
+               sdma_process_event(&dd->per_sdma[i],
+                                  sdma_event_e82_hw_unfreeze);
+}
+
+/**
+ * _sdma_engine_progress_schedule() - schedule progress on engine
+ * @sde: sdma_engine to schedule progress
+ *
+ */
+void _sdma_engine_progress_schedule(
+       struct sdma_engine *sde)
+{
+       trace_hfi1_sdma_engine_progress(sde, sde->progress_mask);
+       /* assume we have selected a good cpu */
+       write_csr(sde->dd,
+                 CCE_INT_FORCE + (8 * (IS_SDMA_START / 64)),
+                 sde->progress_mask);
+}
diff --git a/drivers/infiniband/hw/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h
new file mode 100644 (file)
index 0000000..8f50c99
--- /dev/null
@@ -0,0 +1,1082 @@
+#ifndef _HFI1_SDMA_H
+#define _HFI1_SDMA_H
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <asm/byteorder.h>
+#include <linux/workqueue.h>
+#include <linux/rculist.h>
+
+#include "hfi.h"
+#include "verbs.h"
+#include "sdma_txreq.h"
+
+/* Hardware limit */
+#define MAX_DESC 64
+/* Hardware limit for SDMA packet size */
+#define MAX_SDMA_PKT_SIZE ((16 * 1024) - 1)
+
+#define SDMA_TXREQ_S_OK        0
+#define SDMA_TXREQ_S_SENDERROR 1
+#define SDMA_TXREQ_S_ABORTED   2
+#define SDMA_TXREQ_S_SHUTDOWN  3
+
+/* flags bits */
+#define SDMA_TXREQ_F_URGENT       0x0001
+#define SDMA_TXREQ_F_AHG_COPY     0x0002
+#define SDMA_TXREQ_F_USE_AHG      0x0004
+
+#define SDMA_MAP_NONE          0
+#define SDMA_MAP_SINGLE        1
+#define SDMA_MAP_PAGE          2
+
+#define SDMA_AHG_VALUE_MASK          0xffff
+#define SDMA_AHG_VALUE_SHIFT         0
+#define SDMA_AHG_INDEX_MASK          0xf
+#define SDMA_AHG_INDEX_SHIFT         16
+#define SDMA_AHG_FIELD_LEN_MASK      0xf
+#define SDMA_AHG_FIELD_LEN_SHIFT     20
+#define SDMA_AHG_FIELD_START_MASK    0x1f
+#define SDMA_AHG_FIELD_START_SHIFT   24
+#define SDMA_AHG_UPDATE_ENABLE_MASK  0x1
+#define SDMA_AHG_UPDATE_ENABLE_SHIFT 31
+
+/* AHG modes */
+
+/*
+ * Be aware the ordering and values
+ * for SDMA_AHG_APPLY_UPDATE[123]
+ * are assumed in generating a skip
+ * count in submit_tx() in sdma.c
+ */
+#define SDMA_AHG_NO_AHG              0
+#define SDMA_AHG_COPY                1
+#define SDMA_AHG_APPLY_UPDATE1       2
+#define SDMA_AHG_APPLY_UPDATE2       3
+#define SDMA_AHG_APPLY_UPDATE3       4
+
+/*
+ * Bits defined in the send DMA descriptor.
+ */
+#define SDMA_DESC0_FIRST_DESC_FLAG      BIT_ULL(63)
+#define SDMA_DESC0_LAST_DESC_FLAG       BIT_ULL(62)
+#define SDMA_DESC0_BYTE_COUNT_SHIFT     48
+#define SDMA_DESC0_BYTE_COUNT_WIDTH     14
+#define SDMA_DESC0_BYTE_COUNT_MASK \
+       ((1ULL << SDMA_DESC0_BYTE_COUNT_WIDTH) - 1)
+#define SDMA_DESC0_BYTE_COUNT_SMASK \
+       (SDMA_DESC0_BYTE_COUNT_MASK << SDMA_DESC0_BYTE_COUNT_SHIFT)
+#define SDMA_DESC0_PHY_ADDR_SHIFT       0
+#define SDMA_DESC0_PHY_ADDR_WIDTH       48
+#define SDMA_DESC0_PHY_ADDR_MASK \
+       ((1ULL << SDMA_DESC0_PHY_ADDR_WIDTH) - 1)
+#define SDMA_DESC0_PHY_ADDR_SMASK \
+       (SDMA_DESC0_PHY_ADDR_MASK << SDMA_DESC0_PHY_ADDR_SHIFT)
+
+#define SDMA_DESC1_HEADER_UPDATE1_SHIFT 32
+#define SDMA_DESC1_HEADER_UPDATE1_WIDTH 32
+#define SDMA_DESC1_HEADER_UPDATE1_MASK \
+       ((1ULL << SDMA_DESC1_HEADER_UPDATE1_WIDTH) - 1)
+#define SDMA_DESC1_HEADER_UPDATE1_SMASK \
+       (SDMA_DESC1_HEADER_UPDATE1_MASK << SDMA_DESC1_HEADER_UPDATE1_SHIFT)
+#define SDMA_DESC1_HEADER_MODE_SHIFT    13
+#define SDMA_DESC1_HEADER_MODE_WIDTH    3
+#define SDMA_DESC1_HEADER_MODE_MASK \
+       ((1ULL << SDMA_DESC1_HEADER_MODE_WIDTH) - 1)
+#define SDMA_DESC1_HEADER_MODE_SMASK \
+       (SDMA_DESC1_HEADER_MODE_MASK << SDMA_DESC1_HEADER_MODE_SHIFT)
+#define SDMA_DESC1_HEADER_INDEX_SHIFT   8
+#define SDMA_DESC1_HEADER_INDEX_WIDTH   5
+#define SDMA_DESC1_HEADER_INDEX_MASK \
+       ((1ULL << SDMA_DESC1_HEADER_INDEX_WIDTH) - 1)
+#define SDMA_DESC1_HEADER_INDEX_SMASK \
+       (SDMA_DESC1_HEADER_INDEX_MASK << SDMA_DESC1_HEADER_INDEX_SHIFT)
+#define SDMA_DESC1_HEADER_DWS_SHIFT     4
+#define SDMA_DESC1_HEADER_DWS_WIDTH     4
+#define SDMA_DESC1_HEADER_DWS_MASK \
+       ((1ULL << SDMA_DESC1_HEADER_DWS_WIDTH) - 1)
+#define SDMA_DESC1_HEADER_DWS_SMASK \
+       (SDMA_DESC1_HEADER_DWS_MASK << SDMA_DESC1_HEADER_DWS_SHIFT)
+#define SDMA_DESC1_GENERATION_SHIFT     2
+#define SDMA_DESC1_GENERATION_WIDTH     2
+#define SDMA_DESC1_GENERATION_MASK \
+       ((1ULL << SDMA_DESC1_GENERATION_WIDTH) - 1)
+#define SDMA_DESC1_GENERATION_SMASK \
+       (SDMA_DESC1_GENERATION_MASK << SDMA_DESC1_GENERATION_SHIFT)
+#define SDMA_DESC1_INT_REQ_FLAG         BIT_ULL(1)
+#define SDMA_DESC1_HEAD_TO_HOST_FLAG    BIT_ULL(0)
+
+enum sdma_states {
+       sdma_state_s00_hw_down,
+       sdma_state_s10_hw_start_up_halt_wait,
+       sdma_state_s15_hw_start_up_clean_wait,
+       sdma_state_s20_idle,
+       sdma_state_s30_sw_clean_up_wait,
+       sdma_state_s40_hw_clean_up_wait,
+       sdma_state_s50_hw_halt_wait,
+       sdma_state_s60_idle_halt_wait,
+       sdma_state_s80_hw_freeze,
+       sdma_state_s82_freeze_sw_clean,
+       sdma_state_s99_running,
+};
+
+enum sdma_events {
+       sdma_event_e00_go_hw_down,
+       sdma_event_e10_go_hw_start,
+       sdma_event_e15_hw_halt_done,
+       sdma_event_e25_hw_clean_up_done,
+       sdma_event_e30_go_running,
+       sdma_event_e40_sw_cleaned,
+       sdma_event_e50_hw_cleaned,
+       sdma_event_e60_hw_halted,
+       sdma_event_e70_go_idle,
+       sdma_event_e80_hw_freeze,
+       sdma_event_e81_hw_frozen,
+       sdma_event_e82_hw_unfreeze,
+       sdma_event_e85_link_down,
+       sdma_event_e90_sw_halted,
+};
+
+struct sdma_set_state_action {
+       unsigned op_enable:1;
+       unsigned op_intenable:1;
+       unsigned op_halt:1;
+       unsigned op_cleanup:1;
+       unsigned go_s99_running_tofalse:1;
+       unsigned go_s99_running_totrue:1;
+};
+
+struct sdma_state {
+       struct kref          kref;
+       struct completion    comp;
+       enum sdma_states current_state;
+       unsigned             current_op;
+       unsigned             go_s99_running;
+       /* debugging/development */
+       enum sdma_states previous_state;
+       unsigned             previous_op;
+       enum sdma_events last_event;
+};
+
+/**
+ * DOC: sdma exported routines
+ *
+ * These sdma routines fit into three categories:
+ * - The SDMA API for building and submitting packets
+ *   to the ring
+ *
+ * - Initialization and tear down routines to build up
+ *   and tear down SDMA
+ *
+ * - ISR entrances to handle interrupts, state changes
+ *   and errors
+ */
+
+/**
+ * DOC: sdma PSM/verbs API
+ *
+ * The sdma API is designed to be used by both PSM
+ * and verbs to supply packets to the SDMA ring.
+ *
+ * The usage of the API is as follows:
+ *
+ * Embed a struct iowait in the QP or
+ * PQ.  The iowait should be initialized with a
+ * call to iowait_init().
+ *
+ * The user of the API should create an allocation method
+ * for their version of the txreq.  Slabs, pre-allocated lists,
+ * and dma pools can be used.  Once the user's overload of
+ * the sdma_txreq has been allocated, the sdma_txreq member
+ * must be initialized with sdma_txinit() or sdma_txinit_ahg().
+ *
+ * The user's txreq must be declared with the struct sdma_txreq
+ * as its first member.
+ *
+ * The tx request, once initialized, is manipulated with calls to
+ * sdma_txadd_daddr(), sdma_txadd_page(), or sdma_txadd_kvaddr()
+ * for each disjoint memory location.  It is the user's responsibility
+ * to understand the packet boundaries and page boundaries to do the
+ * appropriate number of sdma_txadd_* calls.  The user
+ * must be prepared to deal with failures from these routines due to
+ * either memory allocation or dma_mapping failures.
+ *
+ * The mapping specifics for each memory location are recorded
+ * in the tx. Memory locations added with sdma_txadd_page()
+ * and sdma_txadd_kvaddr() are automatically mapped when added
+ * to the tx and unmapped as part of the progress processing in the
+ * SDMA interrupt handling.
+ *
+ * sdma_txadd_daddr() is used to add a dma_addr_t memory to the
+ * tx.   An example of a use case would be a pre-allocated
+ * set of headers allocated via dma_pool_alloc() or
+ * dma_alloc_coherent().  For these memory locations, it
+ * is the responsibility of the user to handle that unmapping.
+ * (This would usually be at an unload or job termination.)
+ *
+ * The routine sdma_send_txreq() is used to submit
+ * a tx to the ring after the appropriate number of
+ * sdma_txadd_* have been done.
+ *
+ * If it is desired to send a burst of sdma_txreqs, sdma_send_txlist()
+ * can be used to submit a list of packets.
+ *
+ * The user is free to use the link overhead in the struct sdma_txreq as
+ * long as the tx isn't in flight.
+ *
+ * The extreme degenerate case of the number of descriptors
+ * exceeding the ring size is automatically handled as
+ * memory locations are added.  An overflow of the descriptor
+ * array that is part of the sdma_txreq is also automatically
+ * handled.
+ *
+ */
+
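+/*
+ * Illustrative usage sketch of the flow described above.  This is not
+ * part of the driver API: example_txreq, example_complete and
+ * example_send are hypothetical names, error handling is abbreviated,
+ * and a real caller would pass its embedded iowait instead of NULL to
+ * sdma_send_txreq().
+ *
+ *     struct example_txreq {
+ *             struct sdma_txreq txreq;   // must be the first member
+ *             void *hdr;                 // caller-owned header buffer
+ *     };
+ *
+ *     static void example_complete(struct sdma_txreq *tx, int status)
+ *     {
+ *             // runs from ISR/tasklet/thread context - no sleeping
+ *     }
+ *
+ *     static int example_send(struct sdma_engine *sde,
+ *                             struct example_txreq *etx, u16 hdrlen,
+ *                             struct page *page, u16 datalen)
+ *     {
+ *             int ret;
+ *
+ *             ret = sdma_txinit(&etx->txreq, 0, hdrlen + datalen,
+ *                               example_complete);
+ *             if (ret)
+ *                     return ret;
+ *             ret = sdma_txadd_kvaddr(sde->dd, &etx->txreq, etx->hdr,
+ *                                     hdrlen);
+ *             if (!ret)
+ *                     ret = sdma_txadd_page(sde->dd, &etx->txreq, page,
+ *                                           0, datalen);
+ *             if (!ret)
+ *                     ret = sdma_send_txreq(sde, NULL, &etx->txreq);
+ *             return ret;
+ *     }
+ */
+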
+/**
+ * DOC: Infrastructure calls
+ *
+ * sdma_init() is used to initialize data structures and
+ * CSRs for the desired number of SDMA engines.
+ *
+ * sdma_start() is used to kick the SDMA engines initialized
+ * with sdma_init().   Interrupts must be enabled at this
+ * point since aspects of the state machine are interrupt
+ * driven.
+ *
+ * sdma_engine_error() and sdma_engine_interrupt() are
+ * entrances for interrupts.
+ *
+ * sdma_map_init() is for the management of the mapping
+ * table when the number of vls is changed.
+ *
+ */
+
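+/*
+ * Illustrative bring-up ordering for the infrastructure calls above.
+ * example_bringup is a hypothetical helper, not the driver's actual
+ * init path; interrupt setup is elided.
+ *
+ *     static int example_bringup(struct hfi1_devdata *dd, u8 port)
+ *     {
+ *             int ret = sdma_init(dd, port);
+ *
+ *             if (ret)
+ *                     return ret;
+ *             // enable interrupts here - the state machine needs them
+ *             sdma_start(dd);
+ *             return 0;
+ *     }
+ */
+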
+/*
+ * struct hw_sdma_desc - raw 128 bit SDMA descriptor
+ *
+ * This is the raw descriptor in the SDMA ring
+ */
+struct hw_sdma_desc {
+       /* private:  don't use directly */
+       __le64 qw[2];
+};
+
+/**
+ * struct sdma_engine - Data pertaining to each SDMA engine.
+ * @dd: a back-pointer to the device data
+ * @ppd: per port back-pointer
+ * @imask: mask for irq manipulation
+ * @idle_mask: mask for determining if an interrupt is due to sdma_idle
+ *
+ * This structure has the state for each sdma_engine.
+ *
+ * Accessing non-public fields is not supported
+ * since the private members are subject to change.
+ */
+struct sdma_engine {
+       /* read mostly */
+       struct hfi1_devdata *dd;
+       struct hfi1_pportdata *ppd;
+       /* private: */
+       void __iomem *tail_csr;
+       u64 imask;                      /* clear interrupt mask */
+       u64 idle_mask;
+       u64 progress_mask;
+       u64 int_mask;
+       /* private: */
+       volatile __le64      *head_dma; /* DMA'ed by chip */
+       /* private: */
+       dma_addr_t            head_phys;
+       /* private: */
+       struct hw_sdma_desc *descq;
+       /* private: */
+       unsigned descq_full_count;
+       struct sdma_txreq **tx_ring;
+       /* private: */
+       dma_addr_t            descq_phys;
+       /* private */
+       u32 sdma_mask;
+       /* private */
+       struct sdma_state state;
+       /* private */
+       int cpu;
+       /* private: */
+       u8 sdma_shift;
+       /* private: */
+       u8 this_idx; /* zero relative engine */
+       /* protect changes to senddmactrl shadow */
+       spinlock_t senddmactrl_lock;
+       /* private: */
+       u64 p_senddmactrl;              /* shadow per-engine SendDmaCtrl */
+
+       /* read/write using tail_lock */
+       spinlock_t            tail_lock ____cacheline_aligned_in_smp;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+       /* private: */
+       u64                   tail_sn;
+#endif
+       /* private: */
+       u32                   descq_tail;
+       /* private: */
+       unsigned long         ahg_bits;
+       /* private: */
+       u16                   desc_avail;
+       /* private: */
+       u16                   tx_tail;
+       /* private: */
+       u16 descq_cnt;
+
+       /* read/write using head_lock */
+       /* private: */
+       seqlock_t            head_lock ____cacheline_aligned_in_smp;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+       /* private: */
+       u64                   head_sn;
+#endif
+       /* private: */
+       u32                   descq_head;
+       /* private: */
+       u16                   tx_head;
+       /* private: */
+       u64                   last_status;
+       /* private */
+       u64                     err_cnt;
+       /* private */
+       u64                     sdma_int_cnt;
+       u64                     idle_int_cnt;
+       u64                     progress_int_cnt;
+
+       /* private: */
+       struct list_head      dmawait;
+
+       /* CONFIG SDMA for now, just blindly duplicate */
+       /* private: */
+       struct tasklet_struct sdma_hw_clean_up_task
+               ____cacheline_aligned_in_smp;
+
+       /* private: */
+       struct tasklet_struct sdma_sw_clean_up_task
+               ____cacheline_aligned_in_smp;
+       /* private: */
+       struct work_struct err_halt_worker;
+       /* private */
+       struct timer_list     err_progress_check_timer;
+       u32                   progress_check_head;
+       /* private: */
+       struct work_struct flush_worker;
+       /* protect flush list */
+       spinlock_t flushlist_lock;
+       /* private: */
+       struct list_head flushlist;
+};
+
+int sdma_init(struct hfi1_devdata *dd, u8 port);
+void sdma_start(struct hfi1_devdata *dd);
+void sdma_exit(struct hfi1_devdata *dd);
+void sdma_all_running(struct hfi1_devdata *dd);
+void sdma_all_idle(struct hfi1_devdata *dd);
+void sdma_freeze_notify(struct hfi1_devdata *dd, int go_idle);
+void sdma_freeze(struct hfi1_devdata *dd);
+void sdma_unfreeze(struct hfi1_devdata *dd);
+void sdma_wait(struct hfi1_devdata *dd);
+
+/**
+ * sdma_empty() - idle engine test
+ * @sde: sdma engine
+ *
+ * Currently used by verbs as a latency optimization.
+ *
+ * Return:
+ * 1 - empty, 0 - non-empty
+ */
+static inline int sdma_empty(struct sdma_engine *sde)
+{
+       return sde->descq_tail == sde->descq_head;
+}
+
+static inline u16 sdma_descq_freecnt(struct sdma_engine *sde)
+{
+       return sde->descq_cnt -
+               (sde->descq_tail -
+                ACCESS_ONCE(sde->descq_head)) - 1;
+}
+
+static inline u16 sdma_descq_inprocess(struct sdma_engine *sde)
+{
+       return sde->descq_cnt - sdma_descq_freecnt(sde);
+}
+
+/*
+ * Either head_lock or tail_lock is required to see
+ * a steady state.
+ */
+static inline int __sdma_running(struct sdma_engine *engine)
+{
+       return engine->state.current_state == sdma_state_s99_running;
+}
+
+/**
+ * sdma_running() - state suitability test
+ * @engine: sdma engine
+ *
+ * sdma_running probes the internal state to determine if it is suitable
+ * for submitting packets.
+ *
+ * Return:
+ * 1 - ok to submit, 0 - not ok to submit
+ *
+ */
+static inline int sdma_running(struct sdma_engine *engine)
+{
+       unsigned long flags;
+       int ret;
+
+       spin_lock_irqsave(&engine->tail_lock, flags);
+       ret = __sdma_running(engine);
+       spin_unlock_irqrestore(&engine->tail_lock, flags);
+       return ret;
+}
+
+void _sdma_txreq_ahgadd(
+       struct sdma_txreq *tx,
+       u8 num_ahg,
+       u8 ahg_entry,
+       u32 *ahg,
+       u8 ahg_hlen);
+
+/**
+ * sdma_txinit_ahg() - initialize an sdma_txreq struct with AHG
+ * @tx: tx request to initialize
+ * @flags: flags to key last descriptor additions
+ * @tlen: total packet length (pbc + headers + data)
+ * @ahg_entry: ahg entry to use (0 - 31)
+ * @num_ahg: number of AHG descriptors (0 - 9)
+ * @ahg: array of AHG descriptors (up to 9 entries)
+ * @ahg_hlen: number of bytes from ASIC entry to use
+ * @cb: callback
+ *
+ * The allocation of the sdma_txreq and its enclosing structure is user
+ * dependent.  This routine must be called to initialize the user independent
+ * fields.
+ *
+ * The currently supported flags are SDMA_TXREQ_F_URGENT,
+ * SDMA_TXREQ_F_AHG_COPY, and SDMA_TXREQ_F_USE_AHG.
+ *
+ * SDMA_TXREQ_F_URGENT is used for latency sensitive situations where the
+ * completion is desired as soon as possible.
+ *
+ * SDMA_TXREQ_F_AHG_COPY causes the header in the first descriptor to be
+ * copied to chip entry. SDMA_TXREQ_F_USE_AHG causes the code to add in
+ * the AHG descriptors into the first 1 to 3 descriptors.
+ *
+ * Completions of submitted requests can be gotten on selected
+ * txreqs by giving a completion routine callback to sdma_txinit() or
+ * sdma_txinit_ahg().  The environment in which the callback runs
+ * can be from an ISR, a tasklet, or a thread, so no sleeping
+ * kernel routines can be used.   Aspects of the sdma ring may
+ * be locked so care should be taken with locking.
+ *
+ * The callback pointer can be NULL to avoid any callback for the packet
+ * being submitted. The callback will be provided this tx, a status, and a flag.
+ *
+ * The status will be one of SDMA_TXREQ_S_OK, SDMA_TXREQ_S_SENDERROR,
+ * SDMA_TXREQ_S_ABORTED, or SDMA_TXREQ_S_SHUTDOWN.
+ *
+ * The flag, if an iowait was used, indicates that the iowait's
+ * sdma_busy count has reached zero.
+ *
+ * The user data portion of tlen should be precise.  The sdma_txadd_*
+ * entrances will pad with a descriptor referencing 1 - 3 bytes when the
+ * number of bytes specified in tlen has been supplied to the sdma_txreq.
+ *
+ * ahg_hlen is used to determine the number of on-chip entry bytes to
+ * use as the header.   This is for cases where the stored header is
+ * larger than the header to be used in a packet.  This is typical
+ * for verbs where an RDMA_WRITE_FIRST is larger than the packet in
+ * an RDMA_WRITE_MIDDLE.
+ *
+ */
+static inline int sdma_txinit_ahg(
+       struct sdma_txreq *tx,
+       u16 flags,
+       u16 tlen,
+       u8 ahg_entry,
+       u8 num_ahg,
+       u32 *ahg,
+       u8 ahg_hlen,
+       void (*cb)(struct sdma_txreq *, int))
+{
+       if (tlen == 0)
+               return -ENODATA;
+       if (tlen > MAX_SDMA_PKT_SIZE)
+               return -EMSGSIZE;
+       tx->desc_limit = ARRAY_SIZE(tx->descs);
+       tx->descp = &tx->descs[0];
+       INIT_LIST_HEAD(&tx->list);
+       tx->num_desc = 0;
+       tx->flags = flags;
+       tx->complete = cb;
+       tx->coalesce_buf = NULL;
+       tx->wait = NULL;
+       tx->packet_len = tlen;
+       tx->tlen = tx->packet_len;
+       tx->descs[0].qw[0] = SDMA_DESC0_FIRST_DESC_FLAG;
+       tx->descs[0].qw[1] = 0;
+       if (flags & SDMA_TXREQ_F_AHG_COPY)
+               tx->descs[0].qw[1] |=
+                       (((u64)ahg_entry & SDMA_DESC1_HEADER_INDEX_MASK)
+                               << SDMA_DESC1_HEADER_INDEX_SHIFT) |
+                       (((u64)SDMA_AHG_COPY & SDMA_DESC1_HEADER_MODE_MASK)
+                               << SDMA_DESC1_HEADER_MODE_SHIFT);
+       else if (flags & SDMA_TXREQ_F_USE_AHG && num_ahg)
+               _sdma_txreq_ahgadd(tx, num_ahg, ahg_entry, ahg, ahg_hlen);
+       return 0;
+}
+
+/**
+ * sdma_txinit() - initialize an sdma_txreq struct (no AHG)
+ * @tx: tx request to initialize
+ * @flags: flags to key last descriptor additions
+ * @tlen: total packet length (pbc + headers + data)
+ * @cb: callback pointer
+ *
+ * The allocation of the sdma_txreq and its enclosing structure is user
+ * dependent.  This routine must be called to initialize the user
+ * independent fields.
+ *
+ * The only currently supported flag is SDMA_TXREQ_F_URGENT.
+ *
+ * SDMA_TXREQ_F_URGENT is used for latency sensitive situations where the
+ * completion is desired as soon as possible.
+ *
+ * Completions of submitted requests can be gotten on selected
+ * txreqs by giving a completion routine callback to sdma_txinit() or
+ * sdma_txinit_ahg().  The environment in which the callback runs
+ * can be from an ISR, a tasklet, or a thread, so no sleeping
+ * kernel routines can be used.   Aspects of the sdma ring may
+ * be locked so care should be taken with locking.
+ *
+ * The callback pointer can be NULL to avoid any callback for the packet
+ * being submitted.
+ *
+ * The callback, if non-NULL,  will be provided this tx and a status.  The
+ * status will be one of SDMA_TXREQ_S_OK, SDMA_TXREQ_S_SENDERROR,
+ * SDMA_TXREQ_S_ABORTED, or SDMA_TXREQ_S_SHUTDOWN.
+ *
+ */
+static inline int sdma_txinit(
+       struct sdma_txreq *tx,
+       u16 flags,
+       u16 tlen,
+       void (*cb)(struct sdma_txreq *, int))
+{
+       return sdma_txinit_ahg(tx, flags, tlen, 0, 0, NULL, 0, cb);
+}
+
+/* helpers - don't use */
+static inline int sdma_mapping_type(struct sdma_desc *d)
+{
+       return (d->qw[1] & SDMA_DESC1_GENERATION_SMASK)
+               >> SDMA_DESC1_GENERATION_SHIFT;
+}
+
+static inline size_t sdma_mapping_len(struct sdma_desc *d)
+{
+       return (d->qw[0] & SDMA_DESC0_BYTE_COUNT_SMASK)
+               >> SDMA_DESC0_BYTE_COUNT_SHIFT;
+}
+
+static inline dma_addr_t sdma_mapping_addr(struct sdma_desc *d)
+{
+       return (d->qw[0] & SDMA_DESC0_PHY_ADDR_SMASK)
+               >> SDMA_DESC0_PHY_ADDR_SHIFT;
+}
+
+static inline void make_tx_sdma_desc(
+       struct sdma_txreq *tx,
+       int type,
+       dma_addr_t addr,
+       size_t len)
+{
+       struct sdma_desc *desc = &tx->descp[tx->num_desc];
+
+       if (!tx->num_desc) {
+               /* qw[0] zero; qw[1] first, ahg mode already in from init */
+               desc->qw[1] |= ((u64)type & SDMA_DESC1_GENERATION_MASK)
+                               << SDMA_DESC1_GENERATION_SHIFT;
+       } else {
+               desc->qw[0] = 0;
+               desc->qw[1] = ((u64)type & SDMA_DESC1_GENERATION_MASK)
+                               << SDMA_DESC1_GENERATION_SHIFT;
+       }
+       desc->qw[0] |= (((u64)addr & SDMA_DESC0_PHY_ADDR_MASK)
+                               << SDMA_DESC0_PHY_ADDR_SHIFT) |
+                       (((u64)len & SDMA_DESC0_BYTE_COUNT_MASK)
+                               << SDMA_DESC0_BYTE_COUNT_SHIFT);
+}
+
+/* helper to extend txreq */
+int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx,
+                          int type, void *kvaddr, struct page *page,
+                          unsigned long offset, u16 len);
+int _pad_sdma_tx_descs(struct hfi1_devdata *, struct sdma_txreq *);
+void sdma_txclean(struct hfi1_devdata *, struct sdma_txreq *);
+
+/* helpers used by public routines */
+static inline void _sdma_close_tx(struct hfi1_devdata *dd,
+                                 struct sdma_txreq *tx)
+{
+       tx->descp[tx->num_desc].qw[0] |=
+               SDMA_DESC0_LAST_DESC_FLAG;
+       tx->descp[tx->num_desc].qw[1] |=
+               dd->default_desc1;
+       if (tx->flags & SDMA_TXREQ_F_URGENT)
+               tx->descp[tx->num_desc].qw[1] |=
+                       (SDMA_DESC1_HEAD_TO_HOST_FLAG |
+                        SDMA_DESC1_INT_REQ_FLAG);
+}
+
+static inline int _sdma_txadd_daddr(
+       struct hfi1_devdata *dd,
+       int type,
+       struct sdma_txreq *tx,
+       dma_addr_t addr,
+       u16 len)
+{
+       int rval = 0;
+
+       make_tx_sdma_desc(
+               tx,
+               type,
+               addr, len);
+       WARN_ON(len > tx->tlen);
+       tx->tlen -= len;
+       /* special cases for last */
+       if (!tx->tlen) {
+               if (tx->packet_len & (sizeof(u32) - 1)) {
+                       rval = _pad_sdma_tx_descs(dd, tx);
+                       if (rval)
+                               return rval;
+               } else {
+                       _sdma_close_tx(dd, tx);
+               }
+       }
+       tx->num_desc++;
+       return rval;
+}
+
+/**
+ * sdma_txadd_page() - add a page to the sdma_txreq
+ * @dd: the device to use for mapping
+ * @tx: tx request to which the page is added
+ * @page: page to map
+ * @offset: offset within the page
+ * @len: length in bytes
+ *
+ * This is used to add a page/offset/length descriptor.
+ *
+ * The mapping/unmapping of the page/offset/len is automatically handled.
+ *
+ * Return:
+ * 0 - success, -ENOSPC - mapping fail, -ENOMEM - couldn't
+ * extend/coalesce descriptor array
+ */
+static inline int sdma_txadd_page(
+       struct hfi1_devdata *dd,
+       struct sdma_txreq *tx,
+       struct page *page,
+       unsigned long offset,
+       u16 len)
+{
+       dma_addr_t addr;
+       int rval;
+
+       if ((unlikely(tx->num_desc == tx->desc_limit))) {
+               rval = ext_coal_sdma_tx_descs(dd, tx, SDMA_MAP_PAGE,
+                                             NULL, page, offset, len);
+               if (rval <= 0)
+                       return rval;
+       }
+
+       addr = dma_map_page(
+                      &dd->pcidev->dev,
+                      page,
+                      offset,
+                      len,
+                      DMA_TO_DEVICE);
+
+       if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) {
+               sdma_txclean(dd, tx);
+               return -ENOSPC;
+       }
+
+       return _sdma_txadd_daddr(
+                       dd, SDMA_MAP_PAGE, tx, addr, len);
+}
+
+/**
+ * sdma_txadd_daddr() - add a dma address to the sdma_txreq
+ * @dd: the device to use for mapping
+ * @tx: sdma_txreq to which the page is added
+ * @addr: dma address mapped by caller
+ * @len: length in bytes
+ *
+ * This is used to add a descriptor for memory that is already dma mapped.
+ *
+ * In this case, there is no unmapping as part of the progress processing for
+ * this memory location.
+ *
+ * Return:
+ * 0 - success, -ENOMEM - couldn't extend descriptor array
+ */
+
+static inline int sdma_txadd_daddr(
+       struct hfi1_devdata *dd,
+       struct sdma_txreq *tx,
+       dma_addr_t addr,
+       u16 len)
+{
+       int rval;
+
+       if ((unlikely(tx->num_desc == tx->desc_limit))) {
+               rval = ext_coal_sdma_tx_descs(dd, tx, SDMA_MAP_NONE,
+                                             NULL, NULL, 0, 0);
+               if (rval <= 0)
+                       return rval;
+       }
+
+       return _sdma_txadd_daddr(dd, SDMA_MAP_NONE, tx, addr, len);
+}
+
+/**
+ * sdma_txadd_kvaddr() - add a kernel virtual address to sdma_txreq
+ * @dd: the device to use for mapping
+ * @tx: sdma_txreq to which the page is added
+ * @kvaddr: the kernel virtual address
+ * @len: length in bytes
+ *
+ * This is used to add a descriptor referenced by the indicated kvaddr and
+ * len.
+ *
+ * The mapping/unmapping of the kvaddr and len is automatically handled.
+ *
+ * Return:
+ * 0 - success, -ENOSPC - mapping fail, -ENOMEM - couldn't extend/coalesce
+ * descriptor array
+ */
+static inline int sdma_txadd_kvaddr(
+       struct hfi1_devdata *dd,
+       struct sdma_txreq *tx,
+       void *kvaddr,
+       u16 len)
+{
+       dma_addr_t addr;
+       int rval;
+
+       if ((unlikely(tx->num_desc == tx->desc_limit))) {
+               rval = ext_coal_sdma_tx_descs(dd, tx, SDMA_MAP_SINGLE,
+                                             kvaddr, NULL, 0, len);
+               if (rval <= 0)
+                       return rval;
+       }
+
+       addr = dma_map_single(
+                      &dd->pcidev->dev,
+                      kvaddr,
+                      len,
+                      DMA_TO_DEVICE);
+
+       if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) {
+               sdma_txclean(dd, tx);
+               return -ENOSPC;
+       }
+
+       return _sdma_txadd_daddr(
+                       dd, SDMA_MAP_SINGLE, tx, addr, len);
+}
+
+struct iowait;
+
+int sdma_send_txreq(struct sdma_engine *sde,
+                   struct iowait *wait,
+                   struct sdma_txreq *tx);
+int sdma_send_txlist(struct sdma_engine *sde,
+                    struct iowait *wait,
+                    struct list_head *tx_list);
+
+int sdma_ahg_alloc(struct sdma_engine *sde);
+void sdma_ahg_free(struct sdma_engine *sde, int ahg_index);
+
+/**
+ * sdma_build_ahg - build ahg descriptor
+ * @data
+ * @dwindex
+ * @startbit
+ * @bits
+ *
+ * Build and return a 32 bit descriptor.
+ */
+static inline u32 sdma_build_ahg_descriptor(
+       u16 data,
+       u8 dwindex,
+       u8 startbit,
+       u8 bits)
+{
+       return (u32)(1UL << SDMA_AHG_UPDATE_ENABLE_SHIFT |
+               ((startbit & SDMA_AHG_FIELD_START_MASK) <<
+               SDMA_AHG_FIELD_START_SHIFT) |
+               ((bits & SDMA_AHG_FIELD_LEN_MASK) <<
+               SDMA_AHG_FIELD_LEN_SHIFT) |
+               ((dwindex & SDMA_AHG_INDEX_MASK) <<
+               SDMA_AHG_INDEX_SHIFT) |
+               ((data & SDMA_AHG_VALUE_MASK) <<
+               SDMA_AHG_VALUE_SHIFT));
+}
+
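+/*
+ * Illustrative AHG sketch.  example_ahg_init is a hypothetical helper,
+ * not driver code: it allocates a header entry on the engine, builds a
+ * single field update and initializes a txreq that applies it.  The
+ * dwindex, startbit and nbits values are caller-chosen placeholders.
+ *
+ *     static int example_ahg_init(struct sdma_engine *sde,
+ *                                 struct sdma_txreq *tx, u16 tlen,
+ *                                 u8 hdr_bytes, u16 new_value,
+ *                                 u8 dwindex, u8 startbit, u8 nbits)
+ *     {
+ *             u32 ahg[1];
+ *             int entry = sdma_ahg_alloc(sde);
+ *
+ *             if (entry < 0)
+ *                     return entry;
+ *             ahg[0] = sdma_build_ahg_descriptor(new_value, dwindex,
+ *                                                startbit, nbits);
+ *             // pair with sdma_ahg_free(sde, entry) when done with it
+ *             return sdma_txinit_ahg(tx, SDMA_TXREQ_F_USE_AHG, tlen,
+ *                                    entry, 1, ahg, hdr_bytes, NULL);
+ *     }
+ */
+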
+/**
+ * sdma_progress - use seq number to detect head progress
+ * @sde: sdma_engine to check
+ * @seq: base seq count
+ * @tx: txreq for which we need to check descriptor availability
+ *
+ * This is used in the appropriate spot in the sleep routine
+ * to check for potential ring progress.  This routine gets the
+ * seqcount before queuing the iowait structure for progress.
+ *
+ * If the seqcount indicates that progress needs to be checked,
+ * re-submission is detected by checking whether the descriptor
+ * queue has enough descriptor for the txreq.
+ */
+static inline unsigned sdma_progress(struct sdma_engine *sde, unsigned seq,
+                                    struct sdma_txreq *tx)
+{
+       if (read_seqretry(&sde->head_lock, seq)) {
+               sde->desc_avail = sdma_descq_freecnt(sde);
+               if (tx->num_desc > sde->desc_avail)
+                       return 0;
+               return 1;
+       }
+       return 0;
+}
+
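+/*
+ * Illustrative sleep-path sketch (hypothetical caller code, not the
+ * driver's actual verbs/PSM sleep routine): sample the head seqcount
+ * before queuing the iowait and bail out of the sleep if the ring has
+ * made progress in the meantime.
+ *
+ *     unsigned seq = read_seqbegin(&sde->head_lock);
+ *
+ *     // ... prepare to queue the iowait on sde->dmawait ...
+ *     if (sdma_progress(sde, seq, tx)) {
+ *             // descriptors freed up - retry the submit instead
+ *             return -EAGAIN;
+ *     }
+ *     // otherwise queue the wait and sleep
+ */
+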
+/**
+ * sdma_iowait_schedule() - schedule an iowait for progress
+ * @sde: sdma_engine to schedule on
+ * @wait: wait struct to schedule
+ *
+ * This function schedules the iowait structure embedded in the
+ * QP or PQ on the port's workqueue.
+ *
+ */
+static inline void sdma_iowait_schedule(
+       struct sdma_engine *sde,
+       struct iowait *wait)
+{
+       struct hfi1_pportdata *ppd = sde->dd->pport;
+
+       iowait_schedule(wait, ppd->hfi1_wq, sde->cpu);
+}
+
+/* for use by interrupt handling */
+void sdma_engine_error(struct sdma_engine *sde, u64 status);
+void sdma_engine_interrupt(struct sdma_engine *sde, u64 status);
+
+/*
+ *
+ * The diagram below details the relationship of the mapping structures
+ *
+ * Since the mapping now allows for non-uniform engines per vl, the
+ * number of engines for a vl is either the vl_engines[vl] or
+ * a computation based on num_sdma/num_vls:
+ *
+ * For example:
+ * nactual = vl_engines ? vl_engines[vl] : num_sdma/num_vls
+ *
+ * n = roundup to next highest power of 2 using nactual
+ *
+ * In the case where num_sdma/num_vls doesn't divide
+ * evenly, the extras are added from the last vl downward.
+ *
+ * For the case where n > nactual, the engines are assigned
+ * in a round robin fashion wrapping back to the first engine
+ * for a particular vl.
+ *
+ *               dd->sdma_map
+ *                    |                                   sdma_map_elem[0]
+ *                    |                                +--------------------+
+ *                    v                                |       mask         |
+ *               sdma_vl_map                           |--------------------|
+ *      +--------------------------+                   | sde[0] -> eng 1    |
+ *      |    list (RCU)            |                   |--------------------|
+ *      |--------------------------|                 ->| sde[1] -> eng 2    |
+ *      |    mask                  |              --/  |--------------------|
+ *      |--------------------------|            -/     |        *           |
+ *      |    actual_vls (max 8)    |          -/       |--------------------|
+ *      |--------------------------|       --/         | sde[n] -> eng n    |
+ *      |    vls (max 8)           |     -/            +--------------------+
+ *      |--------------------------|  --/
+ *      |    map[0]                |-/
+ *      |--------------------------|                   +--------------------+
+ *      |    map[1]                |---                |       mask         |
+ *      |--------------------------|   \----           |--------------------|
+ *      |           *              |        \--        | sde[0] -> eng 1+n  |
+ *      |           *              |           \----   |--------------------|
+ *      |           *              |                \->| sde[1] -> eng 2+n  |
+ *      |--------------------------|                   |--------------------|
+ *      |   map[vls - 1]           |-                  |         *          |
+ *      +--------------------------+ \-                |--------------------|
+ *                                     \-              | sde[m] -> eng m+n  |
+ *                                       \             +--------------------+
+ *                                        \-
+ *                                          \
+ *                                           \-        +--------------------+
+ *                                             \-      |       mask         |
+ *                                               \     |--------------------|
+ *                                                \-   | sde[0] -> eng 1+m+n|
+ *                                                  \- |--------------------|
+ *                                                    >| sde[1] -> eng 2+m+n|
+ *                                                     |--------------------|
+ *                                                     |         *          |
+ *                                                     |--------------------|
+ *                                                     | sde[o] -> eng o+m+n|
+ *                                                     +--------------------+
+ *
+ */
+
+/**
+ * struct sdma_map_elem - mapping for a vl
+ * @mask: selector mask
+ * @sde: array of engines for this vl
+ *
+ * The mask is used to "mod" the selector
+ * to produce index into the trailing
+ * array of sdes.
+ */
+struct sdma_map_elem {
+       u32 mask;
+       struct sdma_engine *sde[0];
+};
+
+/**
+ * struct sdma_vl_map - mapping of vls to engines
+ * @engine_to_vl: map of an engine to a vl
+ * @list: rcu head for free callback
+ * @mask: vl mask to "mod" the vl to produce an index to map array
+ * @actual_vls: number of vls
+ * @vls: number of vls rounded to next power of 2
+ * @map: array of sdma_map_elem entries
+ *
+ * This is the parent mapping structure.  The trailing
+ * members of the struct point to sdma_map_elem entries, which
+ * in turn point to an array of sde's for that vl.
+ */
+struct sdma_vl_map {
+       s8 engine_to_vl[TXE_NUM_SDMA_ENGINES];
+       struct rcu_head list;
+       u32 mask;
+       u8 actual_vls;
+       u8 vls;
+       struct sdma_map_elem *map[0];
+};
+
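+/*
+ * Illustrative lookup sketch showing how the diagram above is walked.
+ * example_pick_engine is a hypothetical helper; the driver's real
+ * selection is done by sdma_select_engine_vl()/sdma_select_engine_sc().
+ *
+ *     static struct sdma_engine *example_pick_engine(
+ *             struct hfi1_devdata *dd, u32 selector, u8 vl)
+ *     {
+ *             struct sdma_vl_map *m;
+ *             struct sdma_map_elem *e;
+ *             struct sdma_engine *sde;
+ *
+ *             rcu_read_lock();
+ *             m = rcu_dereference(dd->sdma_map);
+ *             e = m->map[vl & m->mask];          // "mod" the vl
+ *             sde = e->sde[selector & e->mask];  // "mod" the selector
+ *             rcu_read_unlock();
+ *             return sde;
+ *     }
+ */
+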
+int sdma_map_init(
+       struct hfi1_devdata *dd,
+       u8 port,
+       u8 num_vls,
+       u8 *vl_engines);
+
+/* slow path */
+void _sdma_engine_progress_schedule(struct sdma_engine *sde);
+
+/**
+ * sdma_engine_progress_schedule() - schedule progress on engine
+ * @sde: sdma_engine to schedule progress
+ *
+ * This is the fast path.
+ *
+ */
+static inline void sdma_engine_progress_schedule(
+       struct sdma_engine *sde)
+{
+       if (!sde || sdma_descq_inprocess(sde) < (sde->descq_cnt / 8))
+               return;
+       _sdma_engine_progress_schedule(sde);
+}
+
+struct sdma_engine *sdma_select_engine_sc(
+       struct hfi1_devdata *dd,
+       u32 selector,
+       u8 sc5);
+
+struct sdma_engine *sdma_select_engine_vl(
+       struct hfi1_devdata *dd,
+       u32 selector,
+       u8 vl);
+
+void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *);
+
+#ifdef CONFIG_SDMA_VERBOSITY
+void sdma_dumpstate(struct sdma_engine *);
+#endif
+static inline char *slashstrip(char *s)
+{
+       char *r = s;
+
+       while (*s)
+               if (*s++ == '/')
+                       r = s;
+       return r;
+}
+
+u16 sdma_get_descq_cnt(void);
+
+extern uint mod_num_sdma;
+
+void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid);
+
+#endif
diff --git a/drivers/infiniband/hw/hfi1/sdma_txreq.h b/drivers/infiniband/hw/hfi1/sdma_txreq.h
new file mode 100644 (file)
index 0000000..bf7d777
--- /dev/null
@@ -0,0 +1,135 @@
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef HFI1_SDMA_TXREQ_H
+#define HFI1_SDMA_TXREQ_H
+
+/* increased for AHG */
+#define NUM_DESC 6
+
+/*
+ * struct sdma_desc - canonical fragment descriptor
+ *
+ * This is the descriptor carried in the tx request
+ * corresponding to each fragment.
+ *
+ */
+struct sdma_desc {
+       /* private:  don't use directly */
+       u64 qw[2];
+};
+
+/**
+ * struct sdma_txreq - the sdma_txreq structure (one per packet)
+ * @list: for use by user and by queuing for wait
+ *
+ * This is the representation of a packet which consists of some
+ * number of fragments.   Storage is provided within the structure
+ * for all fragments.
+ *
+ * The storage for the descriptors is automatically extended as needed
+ * when the current allocation is exceeded.
+ *
+ * The user (Verbs or PSM) may overload this structure with fields
+ * specific to their use by putting this struct first in their struct.
+ * The method of allocation of the overloaded structure is user dependent.
+ *
+ * The list is the only public field in the structure.
+ *
+ */
+
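+/*
+ * Illustrative overload sketch (hypothetical names, not driver code):
+ * the caller's request type places struct sdma_txreq first so the
+ * completion callback can recover the enclosing structure.
+ *
+ *     struct example_pkt {
+ *             struct sdma_txreq txreq;   // must be the first member
+ *             // ... user-specific fields ...
+ *     };
+ *
+ *     static void example_pkt_complete(struct sdma_txreq *tx, int status)
+ *     {
+ *             struct example_pkt *pkt =
+ *                     container_of(tx, struct example_pkt, txreq);
+ *
+ *             // release or recycle pkt here
+ *     }
+ */
+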
+#define SDMA_TXREQ_S_OK        0
+#define SDMA_TXREQ_S_SENDERROR 1
+#define SDMA_TXREQ_S_ABORTED   2
+#define SDMA_TXREQ_S_SHUTDOWN  3
+
+/* flags bits */
+#define SDMA_TXREQ_F_URGENT       0x0001
+#define SDMA_TXREQ_F_AHG_COPY     0x0002
+#define SDMA_TXREQ_F_USE_AHG      0x0004
+
+struct sdma_txreq;
+typedef void (*callback_t)(struct sdma_txreq *, int);
+
+struct iowait;
+struct sdma_txreq {
+       struct list_head list;
+       /* private: */
+       struct sdma_desc *descp;
+       /* private: */
+       void *coalesce_buf;
+       /* private: */
+       struct iowait *wait;
+       /* private: */
+       callback_t                  complete;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+       u64 sn;
+#endif
+       /* private: - used in coalesce/pad processing */
+       u16                         packet_len;
+       /* private: - down-counted to trigger last */
+       u16                         tlen;
+       /* private: */
+       u16                         num_desc;
+       /* private: */
+       u16                         desc_limit;
+       /* private: */
+       u16                         next_descq_idx;
+       /* private: */
+       u16 coalesce_idx;
+       /* private: flags */
+       u16                         flags;
+       /* private: */
+       struct sdma_desc descs[NUM_DESC];
+};
+
+static inline int sdma_txreq_built(struct sdma_txreq *tx)
+{
+       return tx->num_desc;
+}
+
+#endif                          /* HFI1_SDMA_TXREQ_H */
diff --git a/drivers/infiniband/hw/hfi1/sysfs.c b/drivers/infiniband/hw/hfi1/sysfs.c
new file mode 100644 (file)
index 0000000..91fc2ae
--- /dev/null
@@ -0,0 +1,785 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/ctype.h>
+
+#include "hfi.h"
+#include "mad.h"
+#include "trace.h"
+
+/*
+ * Start of per-port congestion control structures and support code
+ */
+
+/*
+ * Congestion control table size followed by table entries
+ */
+static ssize_t read_cc_table_bin(struct file *filp, struct kobject *kobj,
+                                struct bin_attribute *bin_attr,
+                                char *buf, loff_t pos, size_t count)
+{
+       int ret;
+       struct hfi1_pportdata *ppd =
+               container_of(kobj, struct hfi1_pportdata, pport_cc_kobj);
+       struct cc_state *cc_state;
+
+       ret = ppd->total_cct_entry * sizeof(struct ib_cc_table_entry_shadow)
+                + sizeof(__be16);
+
+       if (pos > ret)
+               return -EINVAL;
+
+       if (count > ret - pos)
+               count = ret - pos;
+
+       if (!count)
+               return count;
+
+       rcu_read_lock();
+       cc_state = get_cc_state(ppd);
+       if (!cc_state) {
+               rcu_read_unlock();
+               return -EINVAL;
+       }
+       memcpy(buf, (void *)&cc_state->cct + pos, count);
+       rcu_read_unlock();
+
+       return count;
+}
+
+static void port_release(struct kobject *kobj)
+{
+       /* nothing to do since memory is freed by hfi1_free_devdata() */
+}
+
+static struct bin_attribute cc_table_bin_attr = {
+       .attr = {.name = "cc_table_bin", .mode = 0444},
+       .read = read_cc_table_bin,
+       .size = PAGE_SIZE,
+};
+
+/*
+ * Congestion settings: port control, control map and an array of 16
+ * entries for the congestion entries - increase, timer, event log
+ * trigger threshold and the minimum injection rate delay.
+ */
+static ssize_t read_cc_setting_bin(struct file *filp, struct kobject *kobj,
+                                  struct bin_attribute *bin_attr,
+                                  char *buf, loff_t pos, size_t count)
+{
+       int ret;
+       struct hfi1_pportdata *ppd =
+               container_of(kobj, struct hfi1_pportdata, pport_cc_kobj);
+       struct cc_state *cc_state;
+
+       ret = sizeof(struct opa_congestion_setting_attr_shadow);
+
+       if (pos > ret)
+               return -EINVAL;
+       if (count > ret - pos)
+               count = ret - pos;
+
+       if (!count)
+               return count;
+
+       rcu_read_lock();
+       cc_state = get_cc_state(ppd);
+       if (!cc_state) {
+               rcu_read_unlock();
+               return -EINVAL;
+       }
+       memcpy(buf, (void *)&cc_state->cong_setting + pos, count);
+       rcu_read_unlock();
+
+       return count;
+}
+
+static struct bin_attribute cc_setting_bin_attr = {
+       .attr = {.name = "cc_settings_bin", .mode = 0444},
+       .read = read_cc_setting_bin,
+       .size = PAGE_SIZE,
+};
+
+struct hfi1_port_attr {
+       struct attribute attr;
+       ssize_t (*show)(struct hfi1_pportdata *, char *);
+       ssize_t (*store)(struct hfi1_pportdata *, const char *, size_t);
+};
+
+static ssize_t cc_prescan_show(struct hfi1_pportdata *ppd, char *buf)
+{
+       return sprintf(buf, "%s\n", ppd->cc_prescan ? "on" : "off");
+}
+
+static ssize_t cc_prescan_store(struct hfi1_pportdata *ppd, const char *buf,
+                               size_t count)
+{
+       if (!memcmp(buf, "on", 2))
+               ppd->cc_prescan = true;
+       else if (!memcmp(buf, "off", 3))
+               ppd->cc_prescan = false;
+
+       return count;
+}
+
+static struct hfi1_port_attr cc_prescan_attr =
+               __ATTR(cc_prescan, 0600, cc_prescan_show, cc_prescan_store);
+
+static ssize_t cc_attr_show(struct kobject *kobj, struct attribute *attr,
+                           char *buf)
+{
+       struct hfi1_port_attr *port_attr =
+               container_of(attr, struct hfi1_port_attr, attr);
+       struct hfi1_pportdata *ppd =
+               container_of(kobj, struct hfi1_pportdata, pport_cc_kobj);
+
+       return port_attr->show(ppd, buf);
+}
+
+static ssize_t cc_attr_store(struct kobject *kobj, struct attribute *attr,
+                            const char *buf, size_t count)
+{
+       struct hfi1_port_attr *port_attr =
+               container_of(attr, struct hfi1_port_attr, attr);
+       struct hfi1_pportdata *ppd =
+               container_of(kobj, struct hfi1_pportdata, pport_cc_kobj);
+
+       return port_attr->store(ppd, buf, count);
+}
+
+static const struct sysfs_ops port_cc_sysfs_ops = {
+       .show = cc_attr_show,
+       .store = cc_attr_store
+};
+
+static struct attribute *port_cc_default_attributes[] = {
+       &cc_prescan_attr.attr,
+       NULL
+};
+
+static struct kobj_type port_cc_ktype = {
+       .release = port_release,
+       .sysfs_ops = &port_cc_sysfs_ops,
+       .default_attrs = port_cc_default_attributes
+};
+
+/* Start sc2vl */
+#define HFI1_SC2VL_ATTR(N)                                 \
+       static struct hfi1_sc2vl_attr hfi1_sc2vl_attr_##N = { \
+               .attr = { .name = __stringify(N), .mode = 0444 }, \
+               .sc = N \
+       }
+
+struct hfi1_sc2vl_attr {
+       struct attribute attr;
+       int sc;
+};
+
+HFI1_SC2VL_ATTR(0);
+HFI1_SC2VL_ATTR(1);
+HFI1_SC2VL_ATTR(2);
+HFI1_SC2VL_ATTR(3);
+HFI1_SC2VL_ATTR(4);
+HFI1_SC2VL_ATTR(5);
+HFI1_SC2VL_ATTR(6);
+HFI1_SC2VL_ATTR(7);
+HFI1_SC2VL_ATTR(8);
+HFI1_SC2VL_ATTR(9);
+HFI1_SC2VL_ATTR(10);
+HFI1_SC2VL_ATTR(11);
+HFI1_SC2VL_ATTR(12);
+HFI1_SC2VL_ATTR(13);
+HFI1_SC2VL_ATTR(14);
+HFI1_SC2VL_ATTR(15);
+HFI1_SC2VL_ATTR(16);
+HFI1_SC2VL_ATTR(17);
+HFI1_SC2VL_ATTR(18);
+HFI1_SC2VL_ATTR(19);
+HFI1_SC2VL_ATTR(20);
+HFI1_SC2VL_ATTR(21);
+HFI1_SC2VL_ATTR(22);
+HFI1_SC2VL_ATTR(23);
+HFI1_SC2VL_ATTR(24);
+HFI1_SC2VL_ATTR(25);
+HFI1_SC2VL_ATTR(26);
+HFI1_SC2VL_ATTR(27);
+HFI1_SC2VL_ATTR(28);
+HFI1_SC2VL_ATTR(29);
+HFI1_SC2VL_ATTR(30);
+HFI1_SC2VL_ATTR(31);
+
+static struct attribute *sc2vl_default_attributes[] = {
+       &hfi1_sc2vl_attr_0.attr,
+       &hfi1_sc2vl_attr_1.attr,
+       &hfi1_sc2vl_attr_2.attr,
+       &hfi1_sc2vl_attr_3.attr,
+       &hfi1_sc2vl_attr_4.attr,
+       &hfi1_sc2vl_attr_5.attr,
+       &hfi1_sc2vl_attr_6.attr,
+       &hfi1_sc2vl_attr_7.attr,
+       &hfi1_sc2vl_attr_8.attr,
+       &hfi1_sc2vl_attr_9.attr,
+       &hfi1_sc2vl_attr_10.attr,
+       &hfi1_sc2vl_attr_11.attr,
+       &hfi1_sc2vl_attr_12.attr,
+       &hfi1_sc2vl_attr_13.attr,
+       &hfi1_sc2vl_attr_14.attr,
+       &hfi1_sc2vl_attr_15.attr,
+       &hfi1_sc2vl_attr_16.attr,
+       &hfi1_sc2vl_attr_17.attr,
+       &hfi1_sc2vl_attr_18.attr,
+       &hfi1_sc2vl_attr_19.attr,
+       &hfi1_sc2vl_attr_20.attr,
+       &hfi1_sc2vl_attr_21.attr,
+       &hfi1_sc2vl_attr_22.attr,
+       &hfi1_sc2vl_attr_23.attr,
+       &hfi1_sc2vl_attr_24.attr,
+       &hfi1_sc2vl_attr_25.attr,
+       &hfi1_sc2vl_attr_26.attr,
+       &hfi1_sc2vl_attr_27.attr,
+       &hfi1_sc2vl_attr_28.attr,
+       &hfi1_sc2vl_attr_29.attr,
+       &hfi1_sc2vl_attr_30.attr,
+       &hfi1_sc2vl_attr_31.attr,
+       NULL
+};
+
+static ssize_t sc2vl_attr_show(struct kobject *kobj, struct attribute *attr,
+                              char *buf)
+{
+       struct hfi1_sc2vl_attr *sattr =
+               container_of(attr, struct hfi1_sc2vl_attr, attr);
+       struct hfi1_pportdata *ppd =
+               container_of(kobj, struct hfi1_pportdata, sc2vl_kobj);
+       struct hfi1_devdata *dd = ppd->dd;
+
+       return sprintf(buf, "%u\n", *((u8 *)dd->sc2vl + sattr->sc));
+}
+
+static const struct sysfs_ops hfi1_sc2vl_ops = {
+       .show = sc2vl_attr_show,
+};
+
+static struct kobj_type hfi1_sc2vl_ktype = {
+       .release = port_release,
+       .sysfs_ops = &hfi1_sc2vl_ops,
+       .default_attrs = sc2vl_default_attributes
+};
+
+/* End sc2vl */
+
+/* Start sl2sc */
+#define HFI1_SL2SC_ATTR(N)                                 \
+       static struct hfi1_sl2sc_attr hfi1_sl2sc_attr_##N = {     \
+               .attr = { .name = __stringify(N), .mode = 0444 }, \
+               .sl = N                                           \
+       }
+
+struct hfi1_sl2sc_attr {
+       struct attribute attr;
+       int sl;
+};
+
+HFI1_SL2SC_ATTR(0);
+HFI1_SL2SC_ATTR(1);
+HFI1_SL2SC_ATTR(2);
+HFI1_SL2SC_ATTR(3);
+HFI1_SL2SC_ATTR(4);
+HFI1_SL2SC_ATTR(5);
+HFI1_SL2SC_ATTR(6);
+HFI1_SL2SC_ATTR(7);
+HFI1_SL2SC_ATTR(8);
+HFI1_SL2SC_ATTR(9);
+HFI1_SL2SC_ATTR(10);
+HFI1_SL2SC_ATTR(11);
+HFI1_SL2SC_ATTR(12);
+HFI1_SL2SC_ATTR(13);
+HFI1_SL2SC_ATTR(14);
+HFI1_SL2SC_ATTR(15);
+HFI1_SL2SC_ATTR(16);
+HFI1_SL2SC_ATTR(17);
+HFI1_SL2SC_ATTR(18);
+HFI1_SL2SC_ATTR(19);
+HFI1_SL2SC_ATTR(20);
+HFI1_SL2SC_ATTR(21);
+HFI1_SL2SC_ATTR(22);
+HFI1_SL2SC_ATTR(23);
+HFI1_SL2SC_ATTR(24);
+HFI1_SL2SC_ATTR(25);
+HFI1_SL2SC_ATTR(26);
+HFI1_SL2SC_ATTR(27);
+HFI1_SL2SC_ATTR(28);
+HFI1_SL2SC_ATTR(29);
+HFI1_SL2SC_ATTR(30);
+HFI1_SL2SC_ATTR(31);
+
+static struct attribute *sl2sc_default_attributes[] = {
+       &hfi1_sl2sc_attr_0.attr,
+       &hfi1_sl2sc_attr_1.attr,
+       &hfi1_sl2sc_attr_2.attr,
+       &hfi1_sl2sc_attr_3.attr,
+       &hfi1_sl2sc_attr_4.attr,
+       &hfi1_sl2sc_attr_5.attr,
+       &hfi1_sl2sc_attr_6.attr,
+       &hfi1_sl2sc_attr_7.attr,
+       &hfi1_sl2sc_attr_8.attr,
+       &hfi1_sl2sc_attr_9.attr,
+       &hfi1_sl2sc_attr_10.attr,
+       &hfi1_sl2sc_attr_11.attr,
+       &hfi1_sl2sc_attr_12.attr,
+       &hfi1_sl2sc_attr_13.attr,
+       &hfi1_sl2sc_attr_14.attr,
+       &hfi1_sl2sc_attr_15.attr,
+       &hfi1_sl2sc_attr_16.attr,
+       &hfi1_sl2sc_attr_17.attr,
+       &hfi1_sl2sc_attr_18.attr,
+       &hfi1_sl2sc_attr_19.attr,
+       &hfi1_sl2sc_attr_20.attr,
+       &hfi1_sl2sc_attr_21.attr,
+       &hfi1_sl2sc_attr_22.attr,
+       &hfi1_sl2sc_attr_23.attr,
+       &hfi1_sl2sc_attr_24.attr,
+       &hfi1_sl2sc_attr_25.attr,
+       &hfi1_sl2sc_attr_26.attr,
+       &hfi1_sl2sc_attr_27.attr,
+       &hfi1_sl2sc_attr_28.attr,
+       &hfi1_sl2sc_attr_29.attr,
+       &hfi1_sl2sc_attr_30.attr,
+       &hfi1_sl2sc_attr_31.attr,
+       NULL
+};
+
+static ssize_t sl2sc_attr_show(struct kobject *kobj, struct attribute *attr,
+                              char *buf)
+{
+       struct hfi1_sl2sc_attr *sattr =
+               container_of(attr, struct hfi1_sl2sc_attr, attr);
+       struct hfi1_pportdata *ppd =
+               container_of(kobj, struct hfi1_pportdata, sl2sc_kobj);
+       struct hfi1_ibport *ibp = &ppd->ibport_data;
+
+       return sprintf(buf, "%u\n", ibp->sl_to_sc[sattr->sl]);
+}
+
+static const struct sysfs_ops hfi1_sl2sc_ops = {
+       .show = sl2sc_attr_show,
+};
+
+static struct kobj_type hfi1_sl2sc_ktype = {
+       .release = port_release,
+       .sysfs_ops = &hfi1_sl2sc_ops,
+       .default_attrs = sl2sc_default_attributes
+};
+
+/* End sl2sc */
+
+/* Start vl2mtu */
+
+#define HFI1_VL2MTU_ATTR(N) \
+       static struct hfi1_vl2mtu_attr hfi1_vl2mtu_attr_##N = { \
+               .attr = { .name = __stringify(N), .mode = 0444 }, \
+               .vl = N                                           \
+       }
+
+struct hfi1_vl2mtu_attr {
+       struct attribute attr;
+       int vl;
+};
+
+HFI1_VL2MTU_ATTR(0);
+HFI1_VL2MTU_ATTR(1);
+HFI1_VL2MTU_ATTR(2);
+HFI1_VL2MTU_ATTR(3);
+HFI1_VL2MTU_ATTR(4);
+HFI1_VL2MTU_ATTR(5);
+HFI1_VL2MTU_ATTR(6);
+HFI1_VL2MTU_ATTR(7);
+HFI1_VL2MTU_ATTR(8);
+HFI1_VL2MTU_ATTR(9);
+HFI1_VL2MTU_ATTR(10);
+HFI1_VL2MTU_ATTR(11);
+HFI1_VL2MTU_ATTR(12);
+HFI1_VL2MTU_ATTR(13);
+HFI1_VL2MTU_ATTR(14);
+HFI1_VL2MTU_ATTR(15);
+
+static struct attribute *vl2mtu_default_attributes[] = {
+       &hfi1_vl2mtu_attr_0.attr,
+       &hfi1_vl2mtu_attr_1.attr,
+       &hfi1_vl2mtu_attr_2.attr,
+       &hfi1_vl2mtu_attr_3.attr,
+       &hfi1_vl2mtu_attr_4.attr,
+       &hfi1_vl2mtu_attr_5.attr,
+       &hfi1_vl2mtu_attr_6.attr,
+       &hfi1_vl2mtu_attr_7.attr,
+       &hfi1_vl2mtu_attr_8.attr,
+       &hfi1_vl2mtu_attr_9.attr,
+       &hfi1_vl2mtu_attr_10.attr,
+       &hfi1_vl2mtu_attr_11.attr,
+       &hfi1_vl2mtu_attr_12.attr,
+       &hfi1_vl2mtu_attr_13.attr,
+       &hfi1_vl2mtu_attr_14.attr,
+       &hfi1_vl2mtu_attr_15.attr,
+       NULL
+};
+
+static ssize_t vl2mtu_attr_show(struct kobject *kobj, struct attribute *attr,
+                               char *buf)
+{
+       struct hfi1_vl2mtu_attr *vlattr =
+               container_of(attr, struct hfi1_vl2mtu_attr, attr);
+       struct hfi1_pportdata *ppd =
+               container_of(kobj, struct hfi1_pportdata, vl2mtu_kobj);
+       struct hfi1_devdata *dd = ppd->dd;
+
+       return sprintf(buf, "%u\n", dd->vld[vlattr->vl].mtu);
+}
+
+static const struct sysfs_ops hfi1_vl2mtu_ops = {
+       .show = vl2mtu_attr_show,
+};
+
+static struct kobj_type hfi1_vl2mtu_ktype = {
+       .release = port_release,
+       .sysfs_ops = &hfi1_vl2mtu_ops,
+       .default_attrs = vl2mtu_default_attributes
+};
+
+/* end of per-port file structures and support code */
+
+/*
+ * Start of per-unit functions (or driver-wide in some cases, but
+ * replicated per unit); these take a struct device *.
+ */
+static ssize_t show_rev(struct device *device, struct device_attribute *attr,
+                       char *buf)
+{
+       struct hfi1_ibdev *dev =
+               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+
+       return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev);
+}
+
+static ssize_t show_hfi(struct device *device, struct device_attribute *attr,
+                       char *buf)
+{
+       struct hfi1_ibdev *dev =
+               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+       struct hfi1_devdata *dd = dd_from_dev(dev);
+       int ret;
+
+       if (!dd->boardname)
+               ret = -EINVAL;
+       else
+               ret = scnprintf(buf, PAGE_SIZE, "%s\n", dd->boardname);
+       return ret;
+}
+
+static ssize_t show_boardversion(struct device *device,
+                                struct device_attribute *attr, char *buf)
+{
+       struct hfi1_ibdev *dev =
+               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+       struct hfi1_devdata *dd = dd_from_dev(dev);
+
+       /* The string printed here is already newline-terminated. */
+       return scnprintf(buf, PAGE_SIZE, "%s", dd->boardversion);
+}
+
+static ssize_t show_nctxts(struct device *device,
+                          struct device_attribute *attr, char *buf)
+{
+       struct hfi1_ibdev *dev =
+               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+       struct hfi1_devdata *dd = dd_from_dev(dev);
+
+       /*
+        * Return the smaller of the send and receive context counts.
+        * Normally, a user-level application requires both a send
+        * and a receive context, so the smaller of the two counts
+        * gives a more accurate picture of the total contexts available.
+        */
+       return scnprintf(buf, PAGE_SIZE, "%u\n",
+                        min(dd->num_rcv_contexts - dd->first_user_ctxt,
+                            (u32)dd->sc_sizes[SC_USER].count));
+}
+
+static ssize_t show_nfreectxts(struct device *device,
+                              struct device_attribute *attr, char *buf)
+{
+       struct hfi1_ibdev *dev =
+               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+       struct hfi1_devdata *dd = dd_from_dev(dev);
+
+       /* Return the number of free user ports (contexts) available. */
+       return scnprintf(buf, PAGE_SIZE, "%u\n", dd->freectxts);
+}
+
+static ssize_t show_serial(struct device *device,
+                          struct device_attribute *attr, char *buf)
+{
+       struct hfi1_ibdev *dev =
+               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+       struct hfi1_devdata *dd = dd_from_dev(dev);
+
+       return scnprintf(buf, PAGE_SIZE, "%s", dd->serial);
+}
+
+static ssize_t store_chip_reset(struct device *device,
+                               struct device_attribute *attr, const char *buf,
+                               size_t count)
+{
+       struct hfi1_ibdev *dev =
+               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+       struct hfi1_devdata *dd = dd_from_dev(dev);
+       int ret;
+
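+       /* The write must begin with "reset" and a diag client must be attached. */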
+       if (count < 5 || memcmp(buf, "reset", 5) || !dd->diag_client) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       ret = hfi1_reset_device(dd->unit);
+bail:
+       return ret < 0 ? ret : count;
+}
+
+/*
+ * Convert the reported temperature from an integer (reported in
+ * units of 0.25C) to a decimal string with two fractional digits.
+ */
+#define temp2str(temp, buf, size, idx)                                 \
+       scnprintf((buf) + (idx), (size) - (idx), "%u.%02u ",            \
+                             ((temp) >> 2), ((temp) & 0x3) * 25)
+
+/*
+ * Dump the tempsense values in decimal so they are easy to parse
+ * from shell scripts.
+ */
+static ssize_t show_tempsense(struct device *device,
+                             struct device_attribute *attr, char *buf)
+{
+       struct hfi1_ibdev *dev =
+               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+       struct hfi1_devdata *dd = dd_from_dev(dev);
+       struct hfi1_temp temp;
+       int ret;
+
+       ret = hfi1_tempsense_rd(dd, &temp);
+       if (!ret) {
+               int idx = 0;
+
+               idx += temp2str(temp.curr, buf, PAGE_SIZE, idx);
+               idx += temp2str(temp.lo_lim, buf, PAGE_SIZE, idx);
+               idx += temp2str(temp.hi_lim, buf, PAGE_SIZE, idx);
+               idx += temp2str(temp.crit_lim, buf, PAGE_SIZE, idx);
+               idx += scnprintf(buf + idx, PAGE_SIZE - idx,
+                               "%u %u %u\n", temp.triggers & 0x1,
+                               temp.triggers & 0x2, temp.triggers & 0x4);
+               ret = idx;
+       }
+       return ret;
+}
+
+/*
+ * End of per-unit (or driver-wide, but replicated per unit)
+ * functions.
+ */
+
+/* start of per-unit file structures and support code */
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_hfi, NULL);
+static DEVICE_ATTR(nctxts, S_IRUGO, show_nctxts, NULL);
+static DEVICE_ATTR(nfreectxts, S_IRUGO, show_nfreectxts, NULL);
+static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL);
+static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL);
+static DEVICE_ATTR(tempsense, S_IRUGO, show_tempsense, NULL);
+static DEVICE_ATTR(chip_reset, S_IWUSR, NULL, store_chip_reset);
+
+static struct device_attribute *hfi1_attributes[] = {
+       &dev_attr_hw_rev,
+       &dev_attr_board_id,
+       &dev_attr_nctxts,
+       &dev_attr_nfreectxts,
+       &dev_attr_serial,
+       &dev_attr_boardversion,
+       &dev_attr_tempsense,
+       &dev_attr_chip_reset,
+};
+
+int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num,
+                          struct kobject *kobj)
+{
+       struct hfi1_pportdata *ppd;
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       int ret;
+
+       if (!port_num || port_num > dd->num_pports) {
+               dd_dev_err(dd,
+                          "Skipping infiniband class with invalid port %u\n",
+                          port_num);
+               return -ENODEV;
+       }
+       ppd = &dd->pport[port_num - 1];
+
+       ret = kobject_init_and_add(&ppd->sc2vl_kobj, &hfi1_sc2vl_ktype, kobj,
+                                  "sc2vl");
+       if (ret) {
+               dd_dev_err(dd,
+                          "Skipping sc2vl sysfs info, (err %d) port %u\n",
+                          ret, port_num);
+               goto bail;
+       }
+       kobject_uevent(&ppd->sc2vl_kobj, KOBJ_ADD);
+
+       ret = kobject_init_and_add(&ppd->sl2sc_kobj, &hfi1_sl2sc_ktype, kobj,
+                                  "sl2sc");
+       if (ret) {
+               dd_dev_err(dd,
+                          "Skipping sl2sc sysfs info, (err %d) port %u\n",
+                          ret, port_num);
+               goto bail_sc2vl;
+       }
+       kobject_uevent(&ppd->sl2sc_kobj, KOBJ_ADD);
+
+       ret = kobject_init_and_add(&ppd->vl2mtu_kobj, &hfi1_vl2mtu_ktype, kobj,
+                                  "vl2mtu");
+       if (ret) {
+               dd_dev_err(dd,
+                          "Skipping vl2mtu sysfs info, (err %d) port %u\n",
+                          ret, port_num);
+               goto bail_sl2sc;
+       }
+       kobject_uevent(&ppd->vl2mtu_kobj, KOBJ_ADD);
+
+       ret = kobject_init_and_add(&ppd->pport_cc_kobj, &port_cc_ktype,
+                                  kobj, "CCMgtA");
+       if (ret) {
+               dd_dev_err(dd,
+                          "Skipping Congestion Control sysfs info, (err %d) port %u\n",
+                          ret, port_num);
+               goto bail_vl2mtu;
+       }
+
+       kobject_uevent(&ppd->pport_cc_kobj, KOBJ_ADD);
+
+       ret = sysfs_create_bin_file(&ppd->pport_cc_kobj, &cc_setting_bin_attr);
+       if (ret) {
+               dd_dev_err(dd,
+                          "Skipping Congestion Control setting sysfs info, (err %d) port %u\n",
+                          ret, port_num);
+               goto bail_cc;
+       }
+
+       ret = sysfs_create_bin_file(&ppd->pport_cc_kobj, &cc_table_bin_attr);
+       if (ret) {
+               dd_dev_err(dd,
+                          "Skipping Congestion Control table sysfs info, (err %d) port %u\n",
+                          ret, port_num);
+               goto bail_cc_entry_bin;
+       }
+
+       dd_dev_info(dd,
+                   "Congestion Control Agent enabled for port %d\n",
+                   port_num);
+
+       return 0;
+
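+       /* Error paths: unwind in the reverse order of creation. */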
+bail_cc_entry_bin:
+       sysfs_remove_bin_file(&ppd->pport_cc_kobj,
+                             &cc_setting_bin_attr);
+bail_cc:
+       kobject_put(&ppd->pport_cc_kobj);
+bail_vl2mtu:
+       kobject_put(&ppd->vl2mtu_kobj);
+bail_sl2sc:
+       kobject_put(&ppd->sl2sc_kobj);
+bail_sc2vl:
+       kobject_put(&ppd->sc2vl_kobj);
+bail:
+       return ret;
+}
+
+/*
+ * Register and create our files in /sys/class/infiniband.
+ */
+int hfi1_verbs_register_sysfs(struct hfi1_devdata *dd)
+{
+       struct ib_device *dev = &dd->verbs_dev.rdi.ibdev;
+       int i, ret;
+
+       for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i) {
+               ret = device_create_file(&dev->dev, hfi1_attributes[i]);
+               if (ret)
+                       goto bail;
+       }
+
+       return 0;
+bail:
+       for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i)
+               device_remove_file(&dev->dev, hfi1_attributes[i]);
+       return ret;
+}
+
+/*
+ * Unregister and remove our files in /sys/class/infiniband.
+ */
+void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *dd)
+{
+       struct hfi1_pportdata *ppd;
+       int i;
+
+       for (i = 0; i < dd->num_pports; i++) {
+               ppd = &dd->pport[i];
+
+               sysfs_remove_bin_file(&ppd->pport_cc_kobj,
+                                     &cc_setting_bin_attr);
+               sysfs_remove_bin_file(&ppd->pport_cc_kobj,
+                                     &cc_table_bin_attr);
+               kobject_put(&ppd->pport_cc_kobj);
+               kobject_put(&ppd->vl2mtu_kobj);
+               kobject_put(&ppd->sl2sc_kobj);
+               kobject_put(&ppd->sc2vl_kobj);
+       }
+}
diff --git a/drivers/infiniband/hw/hfi1/trace.c b/drivers/infiniband/hw/hfi1/trace.c
new file mode 100644 (file)
index 0000000..79b2952
--- /dev/null
@@ -0,0 +1,243 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
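+/*
+ * Length of the extended headers for this opcode: the per-opcode
+ * header length minus the 8-byte LRH and 12-byte BTH, or 0 if the
+ * opcode is unknown.
+ */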
+u8 ibhdr_exhdr_len(struct hfi1_ib_header *hdr)
+{
+       struct hfi1_other_headers *ohdr;
+       u8 opcode;
+       u8 lnh = (u8)(be16_to_cpu(hdr->lrh[0]) & 3);
+
+       if (lnh == HFI1_LRH_BTH)
+               ohdr = &hdr->u.oth;
+       else
+               ohdr = &hdr->u.l.oth;
+       opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+       return hdr_len_by_opcode[opcode] == 0 ?
+              0 : hdr_len_by_opcode[opcode] - (12 + 8);
+}
+
+#define IMM_PRN  "imm %d"
+#define RETH_PRN "reth vaddr 0x%.16llx rkey 0x%.8x dlen 0x%.8x"
+#define AETH_PRN "aeth syn 0x%.2x %s msn 0x%.8x"
+#define DETH_PRN "deth qkey 0x%.8x sqpn 0x%.6x"
+#define IETH_PRN "ieth rkey 0x%.8x"
+#define ATOMICACKETH_PRN "origdata %lld"
+#define ATOMICETH_PRN "vaddr 0x%llx rkey 0x%.8x sdata %lld cdata %lld"
+
+#define OP(transport, op) IB_OPCODE_## transport ## _ ## op
+
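+/* Combine two consecutive big-endian 32-bit words into a host-order u64. */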
+static u64 ib_u64_get(__be32 *p)
+{
+       return ((u64)be32_to_cpu(p[0]) << 32) | be32_to_cpu(p[1]);
+}
+
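+/* Decode the upper three bits of the AETH syndrome into ACK/RNRNAK/NAK. */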
+static const char *parse_syndrome(u8 syndrome)
+{
+       switch (syndrome >> 5) {
+       case 0:
+               return "ACK";
+       case 1:
+               return "RNRNAK";
+       case 3:
+               return "NAK";
+       }
+       return "";
+}
+
+const char *parse_everbs_hdrs(
+       struct trace_seq *p,
+       u8 opcode,
+       void *ehdrs)
+{
+       union ib_ehdrs *eh = ehdrs;
+       const char *ret = trace_seq_buffer_ptr(p);
+
+       switch (opcode) {
+       /* imm */
+       case OP(RC, SEND_LAST_WITH_IMMEDIATE):
+       case OP(UC, SEND_LAST_WITH_IMMEDIATE):
+       case OP(RC, SEND_ONLY_WITH_IMMEDIATE):
+       case OP(UC, SEND_ONLY_WITH_IMMEDIATE):
+       case OP(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE):
+       case OP(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE):
+               trace_seq_printf(p, IMM_PRN,
+                                be32_to_cpu(eh->imm_data));
+               break;
+       /* reth + imm */
+       case OP(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+       case OP(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+               trace_seq_printf(p, RETH_PRN " " IMM_PRN,
+                                (unsigned long long)ib_u64_get(
+                                (__be32 *)&eh->rc.reth.vaddr),
+                                be32_to_cpu(eh->rc.reth.rkey),
+                                be32_to_cpu(eh->rc.reth.length),
+                                be32_to_cpu(eh->rc.imm_data));
+               break;
+       /* reth */
+       case OP(RC, RDMA_READ_REQUEST):
+       case OP(RC, RDMA_WRITE_FIRST):
+       case OP(UC, RDMA_WRITE_FIRST):
+       case OP(RC, RDMA_WRITE_ONLY):
+       case OP(UC, RDMA_WRITE_ONLY):
+               trace_seq_printf(p, RETH_PRN,
+                                (unsigned long long)ib_u64_get(
+                                (__be32 *)&eh->rc.reth.vaddr),
+                                be32_to_cpu(eh->rc.reth.rkey),
+                                be32_to_cpu(eh->rc.reth.length));
+               break;
+       case OP(RC, RDMA_READ_RESPONSE_FIRST):
+       case OP(RC, RDMA_READ_RESPONSE_LAST):
+       case OP(RC, RDMA_READ_RESPONSE_ONLY):
+       case OP(RC, ACKNOWLEDGE):
+               trace_seq_printf(p, AETH_PRN, be32_to_cpu(eh->aeth) >> 24,
+                                parse_syndrome(be32_to_cpu(eh->aeth) >> 24),
+                                be32_to_cpu(eh->aeth) & HFI1_MSN_MASK);
+               break;
+       /* aeth + atomicacketh */
+       case OP(RC, ATOMIC_ACKNOWLEDGE):
+               trace_seq_printf(p, AETH_PRN " " ATOMICACKETH_PRN,
+                                be32_to_cpu(eh->at.aeth) >> 24,
+                                parse_syndrome(be32_to_cpu(eh->at.aeth) >> 24),
+                                be32_to_cpu(eh->at.aeth) & HFI1_MSN_MASK,
+                                (unsigned long long)
+                                ib_u64_get(eh->at.atomic_ack_eth));
+               break;
+       /* atomiceth */
+       case OP(RC, COMPARE_SWAP):
+       case OP(RC, FETCH_ADD):
+               trace_seq_printf(p, ATOMICETH_PRN,
+                                (unsigned long long)ib_u64_get(
+                                eh->atomic_eth.vaddr),
+                                eh->atomic_eth.rkey,
+                                (unsigned long long)ib_u64_get(
+                                (__be32 *)&eh->atomic_eth.swap_data),
+                                (unsigned long long)ib_u64_get(
+                                (__be32 *)&eh->atomic_eth.compare_data));
+               break;
+       /* deth */
+       case OP(UD, SEND_ONLY):
+       case OP(UD, SEND_ONLY_WITH_IMMEDIATE):
+               trace_seq_printf(p, DETH_PRN,
+                                be32_to_cpu(eh->ud.deth[0]),
+                                be32_to_cpu(eh->ud.deth[1]) & RVT_QPN_MASK);
+               break;
+       /* ieth */
+       case OP(RC, SEND_LAST_WITH_INVALIDATE):
+       case OP(RC, SEND_ONLY_WITH_INVALIDATE):
+               trace_seq_printf(p, IETH_PRN,
+                                be32_to_cpu(eh->ieth));
+               break;
+       }
+       trace_seq_putc(p, 0);
+       return ret;
+}
+
+const char *parse_sdma_flags(
+       struct trace_seq *p,
+       u64 desc0, u64 desc1)
+{
+       const char *ret = trace_seq_buffer_ptr(p);
+       char flags[5] = { 'x', 'x', 'x', 'x', 0 };
+
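+       /* I: interrupt request, H: head-to-host, F: first descriptor, L: last descriptor; '-' when clear. */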
+       flags[0] = (desc1 & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-';
+       flags[1] = (desc1 & SDMA_DESC1_HEAD_TO_HOST_FLAG) ? 'H' : '-';
+       flags[2] = (desc0 & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-';
+       flags[3] = (desc0 & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-';
+       trace_seq_printf(p, "%s", flags);
+       if (desc0 & SDMA_DESC0_FIRST_DESC_FLAG)
+               trace_seq_printf(p, " amode:%u aidx:%u alen:%u",
+                                (u8)((desc1 >> SDMA_DESC1_HEADER_MODE_SHIFT) &
+                                     SDMA_DESC1_HEADER_MODE_MASK),
+                                (u8)((desc1 >> SDMA_DESC1_HEADER_INDEX_SHIFT) &
+                                     SDMA_DESC1_HEADER_INDEX_MASK),
+                                (u8)((desc1 >> SDMA_DESC1_HEADER_DWS_SHIFT) &
+                                     SDMA_DESC1_HEADER_DWS_MASK));
+       return ret;
+}
+
+const char *print_u32_array(
+       struct trace_seq *p,
+       u32 *arr, int len)
+{
+       int i;
+       const char *ret = trace_seq_buffer_ptr(p);
+
+       for (i = 0; i < len; i++)
+               trace_seq_printf(p, "%s%#x", i == 0 ? "" : " ", arr[i]);
+       trace_seq_putc(p, 0);
+       return ret;
+}
+
+const char *print_u64_array(
+       struct trace_seq *p,
+       u64 *arr, int len)
+{
+       int i;
+       const char *ret = trace_seq_buffer_ptr(p);
+
+       for (i = 0; i < len; i++)
+               trace_seq_printf(p, "%s0x%016llx", i == 0 ? "" : " ", arr[i]);
+       trace_seq_putc(p, 0);
+       return ret;
+}
+
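+/* Instantiate the simple per-category debug trace helpers declared in trace.h. */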
+__hfi1_trace_fn(PKT);
+__hfi1_trace_fn(PROC);
+__hfi1_trace_fn(SDMA);
+__hfi1_trace_fn(LINKVERB);
+__hfi1_trace_fn(DEBUG);
+__hfi1_trace_fn(SNOOP);
+__hfi1_trace_fn(CNTR);
+__hfi1_trace_fn(PIO);
+__hfi1_trace_fn(DC8051);
+__hfi1_trace_fn(FIRMWARE);
+__hfi1_trace_fn(RCVCTRL);
+__hfi1_trace_fn(TID);
+__hfi1_trace_fn(MMU);
+__hfi1_trace_fn(IOCTL);
diff --git a/drivers/infiniband/hw/hfi1/trace.h b/drivers/infiniband/hw/hfi1/trace.h
new file mode 100644 (file)
index 0000000..28c1d08
--- /dev/null
@@ -0,0 +1,1372 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#undef TRACE_SYSTEM_VAR
+#define TRACE_SYSTEM_VAR hfi1
+
+#if !defined(__HFI1_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include "hfi.h"
+#include "mad.h"
+#include "sdma.h"
+
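+/* Record the PCI device name with every trace entry. */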
+#define DD_DEV_ENTRY(dd)       __string(dev, dev_name(&(dd)->pcidev->dev))
+#define DD_DEV_ASSIGN(dd)      __assign_str(dev, dev_name(&(dd)->pcidev->dev))
+
+#define packettype_name(etype) { RHF_RCV_TYPE_##etype, #etype }
+#define show_packettype(etype)                  \
+__print_symbolic(etype,                         \
+       packettype_name(EXPECTED),              \
+       packettype_name(EAGER),                 \
+       packettype_name(IB),                    \
+       packettype_name(ERROR),                 \
+       packettype_name(BYPASS))
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_rx
+
+TRACE_EVENT(hfi1_rcvhdr,
+           TP_PROTO(struct hfi1_devdata *dd,
+                    u32 ctxt,
+                    u64 eflags,
+                    u32 etype,
+                    u32 hlen,
+                    u32 tlen,
+                    u32 updegr,
+                    u32 etail
+                    ),
+           TP_ARGS(dd, ctxt, eflags, etype, hlen, tlen, updegr, etail),
+           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+                            __field(u64, eflags)
+                            __field(u32, ctxt)
+                            __field(u32, etype)
+                            __field(u32, hlen)
+                            __field(u32, tlen)
+                            __field(u32, updegr)
+                            __field(u32, etail)
+                            ),
+           TP_fast_assign(DD_DEV_ASSIGN(dd);
+                          __entry->eflags = eflags;
+                          __entry->ctxt = ctxt;
+                          __entry->etype = etype;
+                          __entry->hlen = hlen;
+                          __entry->tlen = tlen;
+                          __entry->updegr = updegr;
+                          __entry->etail = etail;
+                          ),
+           TP_printk(
+                     "[%s] ctxt %d eflags 0x%llx etype %d,%s hlen %d tlen %d updegr %d etail %d",
+                     __get_str(dev),
+                     __entry->ctxt,
+                     __entry->eflags,
+                     __entry->etype, show_packettype(__entry->etype),
+                     __entry->hlen,
+                     __entry->tlen,
+                     __entry->updegr,
+                     __entry->etail
+                     )
+);
+
+TRACE_EVENT(hfi1_receive_interrupt,
+           TP_PROTO(struct hfi1_devdata *dd, u32 ctxt),
+           TP_ARGS(dd, ctxt),
+           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+                            __field(u32, ctxt)
+                            __field(u8, slow_path)
+                            __field(u8, dma_rtail)
+                            ),
+           TP_fast_assign(DD_DEV_ASSIGN(dd);
+                          __entry->ctxt = ctxt;
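+                          /* dma_rtail is reported as 0xFF when the slow-path handler is active. */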
+                          if (dd->rcd[ctxt]->do_interrupt ==
+                              &handle_receive_interrupt) {
+                               __entry->slow_path = 1;
+                               __entry->dma_rtail = 0xFF;
+                          } else if (dd->rcd[ctxt]->do_interrupt ==
+                                     &handle_receive_interrupt_dma_rtail) {
+                               __entry->dma_rtail = 1;
+                               __entry->slow_path = 0;
+                          } else if (dd->rcd[ctxt]->do_interrupt ==
+                                     &handle_receive_interrupt_nodma_rtail) {
+                               __entry->dma_rtail = 0;
+                               __entry->slow_path = 0;
+                          }
+                          ),
+           TP_printk("[%s] ctxt %d SlowPath: %d DmaRtail: %d",
+                     __get_str(dev),
+                     __entry->ctxt,
+                     __entry->slow_path,
+                     __entry->dma_rtail
+                     )
+);
+
+TRACE_EVENT(hfi1_exp_tid_reg,
+           TP_PROTO(unsigned ctxt, u16 subctxt, u32 rarr,
+                    u32 npages, unsigned long va, unsigned long pa,
+                    dma_addr_t dma),
+           TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma),
+           TP_STRUCT__entry(
+                   __field(unsigned, ctxt)
+                   __field(u16, subctxt)
+                   __field(u32, rarr)
+                   __field(u32, npages)
+                   __field(unsigned long, va)
+                   __field(unsigned long, pa)
+                   __field(dma_addr_t, dma)
+                   ),
+           TP_fast_assign(
+                   __entry->ctxt = ctxt;
+                   __entry->subctxt = subctxt;
+                   __entry->rarr = rarr;
+                   __entry->npages = npages;
+                   __entry->va = va;
+                   __entry->pa = pa;
+                   __entry->dma = dma;
+                   ),
+           TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx",
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __entry->rarr,
+                     __entry->npages,
+                     __entry->pa,
+                     __entry->va,
+                     __entry->dma
+                   )
+       );
+
+TRACE_EVENT(hfi1_exp_tid_unreg,
+           TP_PROTO(unsigned ctxt, u16 subctxt, u32 rarr, u32 npages,
+                    unsigned long va, unsigned long pa, dma_addr_t dma),
+           TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma),
+           TP_STRUCT__entry(
+                   __field(unsigned, ctxt)
+                   __field(u16, subctxt)
+                   __field(u32, rarr)
+                   __field(u32, npages)
+                   __field(unsigned long, va)
+                   __field(unsigned long, pa)
+                   __field(dma_addr_t, dma)
+                   ),
+           TP_fast_assign(
+                   __entry->ctxt = ctxt;
+                   __entry->subctxt = subctxt;
+                   __entry->rarr = rarr;
+                   __entry->npages = npages;
+                   __entry->va = va;
+                   __entry->pa = pa;
+                   __entry->dma = dma;
+                   ),
+           TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx",
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __entry->rarr,
+                     __entry->npages,
+                     __entry->pa,
+                     __entry->va,
+                     __entry->dma
+                   )
+       );
+
+TRACE_EVENT(hfi1_exp_tid_inval,
+           TP_PROTO(unsigned ctxt, u16 subctxt, unsigned long va, u32 rarr,
+                    u32 npages, dma_addr_t dma),
+           TP_ARGS(ctxt, subctxt, va, rarr, npages, dma),
+           TP_STRUCT__entry(
+                   __field(unsigned, ctxt)
+                   __field(u16, subctxt)
+                   __field(unsigned long, va)
+                   __field(u32, rarr)
+                   __field(u32, npages)
+                   __field(dma_addr_t, dma)
+                   ),
+           TP_fast_assign(
+                   __entry->ctxt = ctxt;
+                   __entry->subctxt = subctxt;
+                   __entry->va = va;
+                   __entry->rarr = rarr;
+                   __entry->npages = npages;
+                   __entry->dma = dma;
+                   ),
+           TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx dma: 0x%llx",
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __entry->rarr,
+                     __entry->npages,
+                     __entry->va,
+                     __entry->dma
+                   )
+       );
+
+TRACE_EVENT(hfi1_mmu_invalidate,
+           TP_PROTO(unsigned ctxt, u16 subctxt, const char *type,
+                    unsigned long start, unsigned long end),
+           TP_ARGS(ctxt, subctxt, type, start, end),
+           TP_STRUCT__entry(
+                   __field(unsigned, ctxt)
+                   __field(u16, subctxt)
+                   __string(type, type)
+                   __field(unsigned long, start)
+                   __field(unsigned long, end)
+                   ),
+           TP_fast_assign(
+                   __entry->ctxt = ctxt;
+                   __entry->subctxt = subctxt;
+                   __assign_str(type, type);
+                   __entry->start = start;
+                   __entry->end = end;
+                   ),
+           TP_printk("[%3u:%02u] MMU Invalidate (%s) 0x%lx - 0x%lx",
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __get_str(type),
+                     __entry->start,
+                     __entry->end
+                   )
+       );
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_tx
+
+TRACE_EVENT(hfi1_piofree,
+           TP_PROTO(struct send_context *sc, int extra),
+           TP_ARGS(sc, extra),
+           TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd)
+                            __field(u32, sw_index)
+                            __field(u32, hw_context)
+                            __field(int, extra)
+                            ),
+           TP_fast_assign(DD_DEV_ASSIGN(sc->dd);
+                          __entry->sw_index = sc->sw_index;
+                          __entry->hw_context = sc->hw_context;
+                          __entry->extra = extra;
+                          ),
+           TP_printk("[%s] ctxt %u(%u) extra %d",
+                     __get_str(dev),
+                     __entry->sw_index,
+                     __entry->hw_context,
+                     __entry->extra
+                     )
+);
+
+TRACE_EVENT(hfi1_wantpiointr,
+           TP_PROTO(struct send_context *sc, u32 needint, u64 credit_ctrl),
+           TP_ARGS(sc, needint, credit_ctrl),
+           TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd)
+                            __field(u32, sw_index)
+                            __field(u32, hw_context)
+                            __field(u32, needint)
+                            __field(u64, credit_ctrl)
+                            ),
+           TP_fast_assign(DD_DEV_ASSIGN(sc->dd);
+                          __entry->sw_index = sc->sw_index;
+                          __entry->hw_context = sc->hw_context;
+                          __entry->needint = needint;
+                          __entry->credit_ctrl = credit_ctrl;
+                          ),
+           TP_printk("[%s] ctxt %u(%u) on %d credit_ctrl 0x%llx",
+                     __get_str(dev),
+                     __entry->sw_index,
+                     __entry->hw_context,
+                     __entry->needint,
+                     (unsigned long long)__entry->credit_ctrl
+                      )
+);
+
+DECLARE_EVENT_CLASS(hfi1_qpsleepwakeup_template,
+                   TP_PROTO(struct rvt_qp *qp, u32 flags),
+                   TP_ARGS(qp, flags),
+                   TP_STRUCT__entry(
+                           DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+                           __field(u32, qpn)
+                           __field(u32, flags)
+                           __field(u32, s_flags)
+                           ),
+                   TP_fast_assign(
+                           DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+                           __entry->flags = flags;
+                           __entry->qpn = qp->ibqp.qp_num;
+                           __entry->s_flags = qp->s_flags;
+                           ),
+                   TP_printk(
+                           "[%s] qpn 0x%x flags 0x%x s_flags 0x%x",
+                           __get_str(dev),
+                           __entry->qpn,
+                           __entry->flags,
+                           __entry->s_flags
+                           )
+);
+
+DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpwakeup,
+            TP_PROTO(struct rvt_qp *qp, u32 flags),
+            TP_ARGS(qp, flags));
+
+DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpsleep,
+            TP_PROTO(struct rvt_qp *qp, u32 flags),
+            TP_ARGS(qp, flags));
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_ibhdrs
+
+u8 ibhdr_exhdr_len(struct hfi1_ib_header *hdr);
+const char *parse_everbs_hdrs(struct trace_seq *p, u8 opcode, void *ehdrs);
+
+#define __parse_ib_ehdrs(op, ehdrs) parse_everbs_hdrs(p, op, ehdrs)
+
+const char *parse_sdma_flags(struct trace_seq *p, u64 desc0, u64 desc1);
+
+#define __parse_sdma_flags(desc0, desc1) parse_sdma_flags(p, desc0, desc1)
+
+#define lrh_name(lrh) { HFI1_##lrh, #lrh }
+#define show_lnh(lrh)                    \
+__print_symbolic(lrh,                    \
+       lrh_name(LRH_BTH),               \
+       lrh_name(LRH_GRH))
+
+#define ib_opcode_name(opcode) { IB_OPCODE_##opcode, #opcode  }
+#define show_ib_opcode(opcode)                             \
+__print_symbolic(opcode,                                   \
+       ib_opcode_name(RC_SEND_FIRST),                     \
+       ib_opcode_name(RC_SEND_MIDDLE),                    \
+       ib_opcode_name(RC_SEND_LAST),                      \
+       ib_opcode_name(RC_SEND_LAST_WITH_IMMEDIATE),       \
+       ib_opcode_name(RC_SEND_ONLY),                      \
+       ib_opcode_name(RC_SEND_ONLY_WITH_IMMEDIATE),       \
+       ib_opcode_name(RC_RDMA_WRITE_FIRST),               \
+       ib_opcode_name(RC_RDMA_WRITE_MIDDLE),              \
+       ib_opcode_name(RC_RDMA_WRITE_LAST),                \
+       ib_opcode_name(RC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \
+       ib_opcode_name(RC_RDMA_WRITE_ONLY),                \
+       ib_opcode_name(RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \
+       ib_opcode_name(RC_RDMA_READ_REQUEST),              \
+       ib_opcode_name(RC_RDMA_READ_RESPONSE_FIRST),       \
+       ib_opcode_name(RC_RDMA_READ_RESPONSE_MIDDLE),      \
+       ib_opcode_name(RC_RDMA_READ_RESPONSE_LAST),        \
+       ib_opcode_name(RC_RDMA_READ_RESPONSE_ONLY),        \
+       ib_opcode_name(RC_ACKNOWLEDGE),                    \
+       ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE),             \
+       ib_opcode_name(RC_COMPARE_SWAP),                   \
+       ib_opcode_name(RC_FETCH_ADD),                      \
+       ib_opcode_name(RC_SEND_LAST_WITH_INVALIDATE),      \
+       ib_opcode_name(RC_SEND_ONLY_WITH_INVALIDATE),      \
+       ib_opcode_name(UC_SEND_FIRST),                     \
+       ib_opcode_name(UC_SEND_MIDDLE),                    \
+       ib_opcode_name(UC_SEND_LAST),                      \
+       ib_opcode_name(UC_SEND_LAST_WITH_IMMEDIATE),       \
+       ib_opcode_name(UC_SEND_ONLY),                      \
+       ib_opcode_name(UC_SEND_ONLY_WITH_IMMEDIATE),       \
+       ib_opcode_name(UC_RDMA_WRITE_FIRST),               \
+       ib_opcode_name(UC_RDMA_WRITE_MIDDLE),              \
+       ib_opcode_name(UC_RDMA_WRITE_LAST),                \
+       ib_opcode_name(UC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \
+       ib_opcode_name(UC_RDMA_WRITE_ONLY),                \
+       ib_opcode_name(UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \
+       ib_opcode_name(UD_SEND_ONLY),                      \
+       ib_opcode_name(UD_SEND_ONLY_WITH_IMMEDIATE),       \
+       ib_opcode_name(CNP))
+
+#define LRH_PRN "vl %d lver %d sl %d lnh %d,%s dlid %.4x len %d slid %.4x"
+#define BTH_PRN \
+       "op 0x%.2x,%s se %d m %d pad %d tver %d pkey 0x%.4x " \
+       "f %d b %d qpn 0x%.6x a %d psn 0x%.8x"
+#define EHDR_PRN "%s"
+
+DECLARE_EVENT_CLASS(hfi1_ibhdr_template,
+                   TP_PROTO(struct hfi1_devdata *dd,
+                            struct hfi1_ib_header *hdr),
+                   TP_ARGS(dd, hdr),
+                   TP_STRUCT__entry(
+                           DD_DEV_ENTRY(dd)
+                           /* LRH */
+                           __field(u8, vl)
+                           __field(u8, lver)
+                           __field(u8, sl)
+                           __field(u8, lnh)
+                           __field(u16, dlid)
+                           __field(u16, len)
+                           __field(u16, slid)
+                           /* BTH */
+                           __field(u8, opcode)
+                           __field(u8, se)
+                           __field(u8, m)
+                           __field(u8, pad)
+                           __field(u8, tver)
+                           __field(u16, pkey)
+                           __field(u8, f)
+                           __field(u8, b)
+                           __field(u32, qpn)
+                           __field(u8, a)
+                           __field(u32, psn)
+                           /* extended headers */
+                           __dynamic_array(u8, ehdrs, ibhdr_exhdr_len(hdr))
+                           ),
+                   TP_fast_assign(
+                          struct hfi1_other_headers *ohdr;
+
+                          DD_DEV_ASSIGN(dd);
+                          /* LRH */
+                          __entry->vl =
+                          (u8)(be16_to_cpu(hdr->lrh[0]) >> 12);
+                          __entry->lver =
+                          (u8)(be16_to_cpu(hdr->lrh[0]) >> 8) & 0xf;
+                          __entry->sl =
+                          (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf;
+                          __entry->lnh =
+                          (u8)(be16_to_cpu(hdr->lrh[0]) & 3);
+                          __entry->dlid =
+                          be16_to_cpu(hdr->lrh[1]);
+                          /* allow for larger len */
+                          __entry->len =
+                          be16_to_cpu(hdr->lrh[2]);
+                          __entry->slid =
+                          be16_to_cpu(hdr->lrh[3]);
+                          /* BTH */
+                          if (__entry->lnh == HFI1_LRH_BTH)
+                               ohdr = &hdr->u.oth;
+                          else
+                               ohdr = &hdr->u.l.oth;
+                         __entry->opcode =
+                         (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+                         __entry->se =
+                         (be32_to_cpu(ohdr->bth[0]) >> 23) & 1;
+                         __entry->m =
+                         (be32_to_cpu(ohdr->bth[0]) >> 22) & 1;
+                         __entry->pad =
+                         (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+                         __entry->tver =
+                         (be32_to_cpu(ohdr->bth[0]) >> 16) & 0xf;
+                         __entry->pkey =
+                         be32_to_cpu(ohdr->bth[0]) & 0xffff;
+                         __entry->f =
+                         (be32_to_cpu(ohdr->bth[1]) >> HFI1_FECN_SHIFT) &
+                         HFI1_FECN_MASK;
+                         __entry->b =
+                         (be32_to_cpu(ohdr->bth[1]) >> HFI1_BECN_SHIFT) &
+                         HFI1_BECN_MASK;
+                         __entry->qpn =
+                         be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
+                         __entry->a =
+                         (be32_to_cpu(ohdr->bth[2]) >> 31) & 1;
+                         /* allow for larger PSN */
+                         __entry->psn =
+                         be32_to_cpu(ohdr->bth[2]) & 0x7fffffff;
+                         /* extended headers */
+                         memcpy(__get_dynamic_array(ehdrs), &ohdr->u,
+                                ibhdr_exhdr_len(hdr));
+                        ),
+                   TP_printk("[%s] " LRH_PRN " " BTH_PRN " " EHDR_PRN,
+                             __get_str(dev),
+                             /* LRH */
+                             __entry->vl,
+                             __entry->lver,
+                             __entry->sl,
+                             __entry->lnh, show_lnh(__entry->lnh),
+                             __entry->dlid,
+                             __entry->len,
+                             __entry->slid,
+                             /* BTH */
+                             __entry->opcode, show_ib_opcode(__entry->opcode),
+                             __entry->se,
+                             __entry->m,
+                             __entry->pad,
+                             __entry->tver,
+                             __entry->pkey,
+                             __entry->f,
+                             __entry->b,
+                             __entry->qpn,
+                             __entry->a,
+                             __entry->psn,
+                             /* extended headers */
+                             __parse_ib_ehdrs(
+                                       __entry->opcode,
+                                       (void *)__get_dynamic_array(ehdrs))
+                            )
+);
+
+DEFINE_EVENT(hfi1_ibhdr_template, input_ibhdr,
+            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
+            TP_ARGS(dd, hdr));
+
+DEFINE_EVENT(hfi1_ibhdr_template, pio_output_ibhdr,
+            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
+            TP_ARGS(dd, hdr));
+
+DEFINE_EVENT(hfi1_ibhdr_template, ack_output_ibhdr,
+            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
+            TP_ARGS(dd, hdr));
+
+DEFINE_EVENT(hfi1_ibhdr_template, sdma_output_ibhdr,
+            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
+            TP_ARGS(dd, hdr));
+
+#define SNOOP_PRN \
+       "slid %.4x dlid %.4x qpn 0x%.6x opcode 0x%.2x,%s " \
+       "svc lvl %d pkey 0x%.4x [header = %d bytes] [data = %d bytes]"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_snoop
+
+TRACE_EVENT(snoop_capture,
+           TP_PROTO(struct hfi1_devdata *dd,
+                    int hdr_len,
+                    struct hfi1_ib_header *hdr,
+                    int data_len,
+                    void *data),
+           TP_ARGS(dd, hdr_len, hdr, data_len, data),
+           TP_STRUCT__entry(
+               DD_DEV_ENTRY(dd)
+               __field(u16, slid)
+               __field(u16, dlid)
+               __field(u32, qpn)
+               __field(u8, opcode)
+               __field(u8, sl)
+               __field(u16, pkey)
+               __field(u32, hdr_len)
+               __field(u32, data_len)
+               __field(u8, lnh)
+               __dynamic_array(u8, raw_hdr, hdr_len)
+               __dynamic_array(u8, raw_pkt, data_len)
+               ),
+           TP_fast_assign(
+               struct hfi1_other_headers *ohdr;
+
+               __entry->lnh = (u8)(be16_to_cpu(hdr->lrh[0]) & 3);
+               if (__entry->lnh == HFI1_LRH_BTH)
+                       ohdr = &hdr->u.oth;
+               else
+                       ohdr = &hdr->u.l.oth;
+               DD_DEV_ASSIGN(dd);
+               __entry->slid = be16_to_cpu(hdr->lrh[3]);
+               __entry->dlid = be16_to_cpu(hdr->lrh[1]);
+               __entry->qpn = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
+               __entry->opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+               __entry->sl = (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf;
+               __entry->pkey = be32_to_cpu(ohdr->bth[0]) & 0xffff;
+               __entry->hdr_len = hdr_len;
+               __entry->data_len = data_len;
+               memcpy(__get_dynamic_array(raw_hdr), hdr, hdr_len);
+               memcpy(__get_dynamic_array(raw_pkt), data, data_len);
+               ),
+           TP_printk(
+               "[%s] " SNOOP_PRN,
+               __get_str(dev),
+               __entry->slid,
+               __entry->dlid,
+               __entry->qpn,
+               __entry->opcode,
+               show_ib_opcode(__entry->opcode),
+               __entry->sl,
+               __entry->pkey,
+               __entry->hdr_len,
+               __entry->data_len
+               )
+);
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_ctxts
+
+#define UCTXT_FMT \
+       "cred:%u, credaddr:0x%llx, piobase:0x%llx, rcvhdr_cnt:%u, "     \
+       "rcvbase:0x%llx, rcvegrc:%u, rcvegrb:0x%llx"
+TRACE_EVENT(hfi1_uctxtdata,
+           TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ctxtdata *uctxt),
+           TP_ARGS(dd, uctxt),
+           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+                            __field(unsigned, ctxt)
+                            __field(u32, credits)
+                            __field(u64, hw_free)
+                            __field(u64, piobase)
+                            __field(u16, rcvhdrq_cnt)
+                            __field(u64, rcvhdrq_phys)
+                            __field(u32, eager_cnt)
+                            __field(u64, rcvegr_phys)
+                            ),
+           TP_fast_assign(DD_DEV_ASSIGN(dd);
+                          __entry->ctxt = uctxt->ctxt;
+                          __entry->credits = uctxt->sc->credits;
+                          __entry->hw_free = (u64)uctxt->sc->hw_free;
+                          __entry->piobase = (u64)uctxt->sc->base_addr;
+                          __entry->rcvhdrq_cnt = uctxt->rcvhdrq_cnt;
+                          __entry->rcvhdrq_phys = uctxt->rcvhdrq_phys;
+                          __entry->eager_cnt = uctxt->egrbufs.alloced;
+                          __entry->rcvegr_phys =
+                          uctxt->egrbufs.rcvtids[0].phys;
+                          ),
+           TP_printk("[%s] ctxt %u " UCTXT_FMT,
+                     __get_str(dev),
+                     __entry->ctxt,
+                     __entry->credits,
+                     __entry->hw_free,
+                     __entry->piobase,
+                     __entry->rcvhdrq_cnt,
+                     __entry->rcvhdrq_phys,
+                     __entry->eager_cnt,
+                     __entry->rcvegr_phys
+                     )
+);
+
+#define CINFO_FMT \
+       "egrtids:%u, egr_size:%u, hdrq_cnt:%u, hdrq_size:%u, sdma_ring_size:%u"
+TRACE_EVENT(hfi1_ctxt_info,
+           TP_PROTO(struct hfi1_devdata *dd, unsigned ctxt, unsigned subctxt,
+                    struct hfi1_ctxt_info cinfo),
+           TP_ARGS(dd, ctxt, subctxt, cinfo),
+           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+                            __field(unsigned, ctxt)
+                            __field(unsigned, subctxt)
+                            __field(u16, egrtids)
+                            __field(u16, rcvhdrq_cnt)
+                            __field(u16, rcvhdrq_size)
+                            __field(u16, sdma_ring_size)
+                            __field(u32, rcvegr_size)
+                            ),
+           TP_fast_assign(DD_DEV_ASSIGN(dd);
+                           __entry->ctxt = ctxt;
+                           __entry->subctxt = subctxt;
+                           __entry->egrtids = cinfo.egrtids;
+                           __entry->rcvhdrq_cnt = cinfo.rcvhdrq_cnt;
+                           __entry->rcvhdrq_size = cinfo.rcvhdrq_entsize;
+                           __entry->sdma_ring_size = cinfo.sdma_ring_size;
+                           __entry->rcvegr_size = cinfo.rcvegr_size;
+                           ),
+           TP_printk("[%s] ctxt %u:%u " CINFO_FMT,
+                     __get_str(dev),
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __entry->egrtids,
+                     __entry->rcvegr_size,
+                     __entry->rcvhdrq_cnt,
+                     __entry->rcvhdrq_size,
+                     __entry->sdma_ring_size
+                     )
+);
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_sma
+
+#define BCT_FORMAT \
+       "shared_limit %x vls 0-7 [%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x] 15 [%x,%x]"
+
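+/* Extract a big-endian 16-bit field from the raw buffer_control snapshot. */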
+#define BCT(field) \
+       be16_to_cpu( \
+               ((struct buffer_control *)__get_dynamic_array(bct))->field \
+       )
+
+DECLARE_EVENT_CLASS(hfi1_bct_template,
+                   TP_PROTO(struct hfi1_devdata *dd,
+                            struct buffer_control *bc),
+                   TP_ARGS(dd, bc),
+                   TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+                                    __dynamic_array(u8, bct, sizeof(*bc))
+                                    ),
+                   TP_fast_assign(DD_DEV_ASSIGN(dd);
+                                  memcpy(__get_dynamic_array(bct), bc,
+                                         sizeof(*bc));
+                                  ),
+                   TP_printk(BCT_FORMAT,
+                             BCT(overall_shared_limit),
+
+                             BCT(vl[0].dedicated),
+                             BCT(vl[0].shared),
+
+                             BCT(vl[1].dedicated),
+                             BCT(vl[1].shared),
+
+                             BCT(vl[2].dedicated),
+                             BCT(vl[2].shared),
+
+                             BCT(vl[3].dedicated),
+                             BCT(vl[3].shared),
+
+                             BCT(vl[4].dedicated),
+                             BCT(vl[4].shared),
+
+                             BCT(vl[5].dedicated),
+                             BCT(vl[5].shared),
+
+                             BCT(vl[6].dedicated),
+                             BCT(vl[6].shared),
+
+                             BCT(vl[7].dedicated),
+                             BCT(vl[7].shared),
+
+                             BCT(vl[15].dedicated),
+                             BCT(vl[15].shared)
+                             )
+);
+
+DEFINE_EVENT(hfi1_bct_template, bct_set,
+            TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc),
+            TP_ARGS(dd, bc));
+
+DEFINE_EVENT(hfi1_bct_template, bct_get,
+            TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc),
+            TP_ARGS(dd, bc));
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_sdma
+
+TRACE_EVENT(hfi1_sdma_descriptor,
+           TP_PROTO(struct sdma_engine *sde,
+                    u64 desc0,
+                    u64 desc1,
+                    u16 e,
+                    void *descp),
+       TP_ARGS(sde, desc0, desc1, e, descp),
+       TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+                        __field(void *, descp)
+                        __field(u64, desc0)
+                        __field(u64, desc1)
+                        __field(u16, e)
+                        __field(u8, idx)
+                        ),
+       TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+                      __entry->desc0 = desc0;
+                      __entry->desc1 = desc1;
+                      __entry->idx = sde->this_idx;
+                      __entry->descp = descp;
+                      __entry->e = e;
+                      ),
+       TP_printk(
+                 "[%s] SDE(%u) flags:%s addr:0x%016llx gen:%u len:%u d0:%016llx d1:%016llx to %p,%u",
+                 __get_str(dev),
+                 __entry->idx,
+                 __parse_sdma_flags(__entry->desc0, __entry->desc1),
+                 (__entry->desc0 >> SDMA_DESC0_PHY_ADDR_SHIFT) &
+                 SDMA_DESC0_PHY_ADDR_MASK,
+                 (u8)((__entry->desc1 >> SDMA_DESC1_GENERATION_SHIFT) &
+                      SDMA_DESC1_GENERATION_MASK),
+                 (u16)((__entry->desc0 >> SDMA_DESC0_BYTE_COUNT_SHIFT) &
+                       SDMA_DESC0_BYTE_COUNT_MASK),
+                 __entry->desc0,
+                 __entry->desc1,
+                 __entry->descp,
+                 __entry->e
+                 )
+);
+
+TRACE_EVENT(hfi1_sdma_engine_select,
+           TP_PROTO(struct hfi1_devdata *dd, u32 sel, u8 vl, u8 idx),
+           TP_ARGS(dd, sel, vl, idx),
+           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+                            __field(u32, sel)
+                            __field(u8, vl)
+                            __field(u8, idx)
+                            ),
+           TP_fast_assign(DD_DEV_ASSIGN(dd);
+                          __entry->sel = sel;
+                          __entry->vl = vl;
+                          __entry->idx = idx;
+                          ),
+           TP_printk("[%s] selecting SDE %u sel 0x%x vl %u",
+                     __get_str(dev),
+                     __entry->idx,
+                     __entry->sel,
+                     __entry->vl
+                     )
+);
+
+DECLARE_EVENT_CLASS(hfi1_sdma_engine_class,
+                   TP_PROTO(struct sdma_engine *sde, u64 status),
+                   TP_ARGS(sde, status),
+                   TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+                                    __field(u64, status)
+                                    __field(u8, idx)
+                                    ),
+                   TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+                                  __entry->status = status;
+                                  __entry->idx = sde->this_idx;
+                                  ),
+                   TP_printk("[%s] SDE(%u) status %llx",
+                             __get_str(dev),
+                             __entry->idx,
+                             (unsigned long long)__entry->status
+                             )
+);
+
+DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_interrupt,
+            TP_PROTO(struct sdma_engine *sde, u64 status),
+            TP_ARGS(sde, status)
+);
+
+DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_progress,
+            TP_PROTO(struct sdma_engine *sde, u64 status),
+            TP_ARGS(sde, status)
+);
+
+DECLARE_EVENT_CLASS(hfi1_sdma_ahg_ad,
+                   TP_PROTO(struct sdma_engine *sde, int aidx),
+                   TP_ARGS(sde, aidx),
+                   TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+                                    __field(int, aidx)
+                                    __field(u8, idx)
+                                    ),
+                   TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+                                  __entry->idx = sde->this_idx;
+                                  __entry->aidx = aidx;
+                                  ),
+                   TP_printk("[%s] SDE(%u) aidx %d",
+                             __get_str(dev),
+                             __entry->idx,
+                             __entry->aidx
+                             )
+);
+
+DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_allocate,
+            TP_PROTO(struct sdma_engine *sde, int aidx),
+            TP_ARGS(sde, aidx));
+
+DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_deallocate,
+            TP_PROTO(struct sdma_engine *sde, int aidx),
+            TP_ARGS(sde, aidx));
+
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+TRACE_EVENT(hfi1_sdma_progress,
+           TP_PROTO(struct sdma_engine *sde,
+                    u16 hwhead,
+                    u16 swhead,
+                    struct sdma_txreq *txp
+                    ),
+           TP_ARGS(sde, hwhead, swhead, txp),
+           TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+                            __field(u64, sn)
+                            __field(u16, hwhead)
+                            __field(u16, swhead)
+                            __field(u16, txnext)
+                            __field(u16, tx_tail)
+                            __field(u16, tx_head)
+                            __field(u8, idx)
+                            ),
+           TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+                          __entry->hwhead = hwhead;
+                          __entry->swhead = swhead;
+                          __entry->tx_tail = sde->tx_tail;
+                          __entry->tx_head = sde->tx_head;
+                          __entry->txnext = txp ? txp->next_descq_idx : ~0;
+                          __entry->idx = sde->this_idx;
+                          __entry->sn = txp ? txp->sn : ~0;
+                          ),
+           TP_printk(
+                     "[%s] SDE(%u) sn %llu hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u",
+                     __get_str(dev),
+                     __entry->idx,
+                     __entry->sn,
+                     __entry->hwhead,
+                     __entry->swhead,
+                     __entry->txnext,
+                     __entry->tx_head,
+                     __entry->tx_tail
+                     )
+);
+#else
+TRACE_EVENT(hfi1_sdma_progress,
+           TP_PROTO(struct sdma_engine *sde,
+                    u16 hwhead, u16 swhead,
+                    struct sdma_txreq *txp
+           ),
+       TP_ARGS(sde, hwhead, swhead, txp),
+       TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+                        __field(u16, hwhead)
+                        __field(u16, swhead)
+                        __field(u16, txnext)
+                        __field(u16, tx_tail)
+                        __field(u16, tx_head)
+                        __field(u8, idx)
+                        ),
+       TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+                      __entry->hwhead = hwhead;
+                      __entry->swhead = swhead;
+                      __entry->tx_tail = sde->tx_tail;
+                      __entry->tx_head = sde->tx_head;
+                      __entry->txnext = txp ? txp->next_descq_idx : ~0;
+                      __entry->idx = sde->this_idx;
+                      ),
+       TP_printk(
+                 "[%s] SDE(%u) hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u",
+                 __get_str(dev),
+                 __entry->idx,
+                 __entry->hwhead,
+                 __entry->swhead,
+                 __entry->txnext,
+                 __entry->tx_head,
+                 __entry->tx_tail
+                 )
+);
+#endif
+
+DECLARE_EVENT_CLASS(hfi1_sdma_sn,
+                   TP_PROTO(struct sdma_engine *sde, u64 sn),
+                   TP_ARGS(sde, sn),
+                   TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+                                    __field(u64, sn)
+                                    __field(u8, idx)
+                                    ),
+                   TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+                                  __entry->sn = sn;
+                                  __entry->idx = sde->this_idx;
+                                  ),
+                   TP_printk("[%s] SDE(%u) sn %llu",
+                             __get_str(dev),
+                             __entry->idx,
+                             __entry->sn
+                             )
+);
+
+DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_out_sn,
+            TP_PROTO(
+               struct sdma_engine *sde,
+               u64 sn
+            ),
+            TP_ARGS(sde, sn)
+);
+
+DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_in_sn,
+            TP_PROTO(struct sdma_engine *sde, u64 sn),
+            TP_ARGS(sde, sn)
+);
+
+#define USDMA_HDR_FORMAT \
+       "[%s:%u:%u:%u] PBC=(0x%x 0x%x) LRH=(0x%x 0x%x) BTH=(0x%x 0x%x 0x%x) KDETH=(0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x) TIDVal=0x%x"
+
+TRACE_EVENT(hfi1_sdma_user_header,
+           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req,
+                    struct hfi1_pkt_header *hdr, u32 tidval),
+           TP_ARGS(dd, ctxt, subctxt, req, hdr, tidval),
+           TP_STRUCT__entry(
+                   DD_DEV_ENTRY(dd)
+                   __field(u16, ctxt)
+                   __field(u8, subctxt)
+                   __field(u16, req)
+                   __field(__le32, pbc0)
+                   __field(__le32, pbc1)
+                   __field(__be32, lrh0)
+                   __field(__be32, lrh1)
+                   __field(__be32, bth0)
+                   __field(__be32, bth1)
+                   __field(__be32, bth2)
+                   __field(__le32, kdeth0)
+                   __field(__le32, kdeth1)
+                   __field(__le32, kdeth2)
+                   __field(__le32, kdeth3)
+                   __field(__le32, kdeth4)
+                   __field(__le32, kdeth5)
+                   __field(__le32, kdeth6)
+                   __field(__le32, kdeth7)
+                   __field(__le32, kdeth8)
+                   __field(u32, tidval)
+                   ),
+           TP_fast_assign(
+                   __le32 *pbc = (__le32 *)hdr->pbc;
+                   __be32 *lrh = (__be32 *)hdr->lrh;
+                   __be32 *bth = (__be32 *)hdr->bth;
+                   __le32 *kdeth = (__le32 *)&hdr->kdeth;
+
+                   DD_DEV_ASSIGN(dd);
+                   __entry->ctxt = ctxt;
+                   __entry->subctxt = subctxt;
+                   __entry->req = req;
+                   __entry->pbc0 = pbc[0];
+                   __entry->pbc1 = pbc[1];
+                   __entry->lrh0 = be32_to_cpu(lrh[0]);
+                   __entry->lrh1 = be32_to_cpu(lrh[1]);
+                   __entry->bth0 = be32_to_cpu(bth[0]);
+                   __entry->bth1 = be32_to_cpu(bth[1]);
+                   __entry->bth2 = be32_to_cpu(bth[2]);
+                   __entry->kdeth0 = kdeth[0];
+                   __entry->kdeth1 = kdeth[1];
+                   __entry->kdeth2 = kdeth[2];
+                   __entry->kdeth3 = kdeth[3];
+                   __entry->kdeth4 = kdeth[4];
+                   __entry->kdeth5 = kdeth[5];
+                   __entry->kdeth6 = kdeth[6];
+                   __entry->kdeth7 = kdeth[7];
+                   __entry->kdeth8 = kdeth[8];
+                   __entry->tidval = tidval;
+                   ),
+           TP_printk(USDMA_HDR_FORMAT,
+                     __get_str(dev),
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __entry->req,
+                     __entry->pbc1,
+                     __entry->pbc0,
+                     __entry->lrh0,
+                     __entry->lrh1,
+                     __entry->bth0,
+                     __entry->bth1,
+                     __entry->bth2,
+                     __entry->kdeth0,
+                     __entry->kdeth1,
+                     __entry->kdeth2,
+                     __entry->kdeth3,
+                     __entry->kdeth4,
+                     __entry->kdeth5,
+                     __entry->kdeth6,
+                     __entry->kdeth7,
+                     __entry->kdeth8,
+                     __entry->tidval
+                   )
+       );
+
+#define SDMA_UREQ_FMT \
+       "[%s:%u:%u] ver/op=0x%x, iovcnt=%u, npkts=%u, frag=%u, idx=%u"
+TRACE_EVENT(hfi1_sdma_user_reqinfo,
+           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 *i),
+           TP_ARGS(dd, ctxt, subctxt, i),
+           TP_STRUCT__entry(
+                   DD_DEV_ENTRY(dd);
+                   __field(u16, ctxt)
+                   __field(u8, subctxt)
+                   __field(u8, ver_opcode)
+                   __field(u8, iovcnt)
+                   __field(u16, npkts)
+                   __field(u16, fragsize)
+                   __field(u16, comp_idx)
+                   ),
+           TP_fast_assign(
+                   DD_DEV_ASSIGN(dd);
+                   __entry->ctxt = ctxt;
+                   __entry->subctxt = subctxt;
+                   __entry->ver_opcode = i[0] & 0xff;
+                   __entry->iovcnt = (i[0] >> 8) & 0xff;
+                   __entry->npkts = i[1];
+                   __entry->fragsize = i[2];
+                   __entry->comp_idx = i[3];
+                   ),
+           TP_printk(SDMA_UREQ_FMT,
+                     __get_str(dev),
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __entry->ver_opcode,
+                     __entry->iovcnt,
+                     __entry->npkts,
+                     __entry->fragsize,
+                     __entry->comp_idx
+                   )
+       );
+
+#define usdma_complete_name(st) { st, #st }
+#define show_usdma_complete_state(st)                  \
+       __print_symbolic(st,                            \
+                        usdma_complete_name(FREE),     \
+                        usdma_complete_name(QUEUED),   \
+                        usdma_complete_name(COMPLETE), \
+                        usdma_complete_name(ERROR))
+
+TRACE_EVENT(hfi1_sdma_user_completion,
+           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 idx,
+                    u8 state, int code),
+           TP_ARGS(dd, ctxt, subctxt, idx, state, code),
+           TP_STRUCT__entry(
+                   DD_DEV_ENTRY(dd)
+                   __field(u16, ctxt)
+                   __field(u8, subctxt)
+                   __field(u16, idx)
+                   __field(u8, state)
+                   __field(int, code)
+                   ),
+           TP_fast_assign(
+                   DD_DEV_ASSIGN(dd);
+                   __entry->ctxt = ctxt;
+                   __entry->subctxt = subctxt;
+                   __entry->idx = idx;
+                   __entry->state = state;
+                   __entry->code = code;
+                   ),
+           TP_printk("[%s:%u:%u:%u] SDMA completion state %s (%d)",
+                     __get_str(dev), __entry->ctxt, __entry->subctxt,
+                     __entry->idx, show_usdma_complete_state(__entry->state),
+                     __entry->code)
+       );
+
+const char *print_u32_array(struct trace_seq *, u32 *, int);
+#define __print_u32_hex(arr, len) print_u32_array(p, arr, len)
+
+TRACE_EVENT(hfi1_sdma_user_header_ahg,
+           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req,
+                    u8 sde, u8 ahgidx, u32 *ahg, int len, u32 tidval),
+           TP_ARGS(dd, ctxt, subctxt, req, sde, ahgidx, ahg, len, tidval),
+           TP_STRUCT__entry(
+                   DD_DEV_ENTRY(dd)
+                   __field(u16, ctxt)
+                   __field(u8, subctxt)
+                   __field(u16, req)
+                   __field(u8, sde)
+                   __field(u8, idx)
+                   __field(int, len)
+                   __field(u32, tidval)
+                   __array(u32, ahg, 10)
+                   ),
+           TP_fast_assign(
+                   DD_DEV_ASSIGN(dd);
+                   __entry->ctxt = ctxt;
+                   __entry->subctxt = subctxt;
+                   __entry->req = req;
+                   __entry->sde = sde;
+                   __entry->idx = ahgidx;
+                   __entry->len = len;
+                   __entry->tidval = tidval;
+                   memcpy(__entry->ahg, ahg, len * sizeof(u32));
+                   ),
+           TP_printk("[%s:%u:%u:%u] (SDE%u/AHG%u) ahg[0-%d]=(%s) TIDVal=0x%x",
+                     __get_str(dev),
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __entry->req,
+                     __entry->sde,
+                     __entry->idx,
+                     __entry->len - 1,
+                     __print_u32_hex(__entry->ahg, __entry->len),
+                     __entry->tidval
+                   )
+       );
+
+TRACE_EVENT(hfi1_sdma_state,
+           TP_PROTO(struct sdma_engine *sde,
+                    const char *cstate,
+                    const char *nstate
+                    ),
+           TP_ARGS(sde, cstate, nstate),
+           TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+                            __string(curstate, cstate)
+                            __string(newstate, nstate)
+                            ),
+       TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+                      __assign_str(curstate, cstate);
+                      __assign_str(newstate, nstate);
+                      ),
+       TP_printk("[%s] current state %s new state %s",
+                 __get_str(dev),
+                 __get_str(curstate),
+                 __get_str(newstate)
+                 )
+);
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_rc
+
+DECLARE_EVENT_CLASS(hfi1_rc_template,
+                   TP_PROTO(struct rvt_qp *qp, u32 psn),
+                   TP_ARGS(qp, psn),
+                   TP_STRUCT__entry(
+                       DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+                       __field(u32, qpn)
+                       __field(u32, s_flags)
+                       __field(u32, psn)
+                       __field(u32, s_psn)
+                       __field(u32, s_next_psn)
+                       __field(u32, s_sending_psn)
+                       __field(u32, s_sending_hpsn)
+                       __field(u32, r_psn)
+                       ),
+                   TP_fast_assign(
+                       DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+                       __entry->qpn = qp->ibqp.qp_num;
+                       __entry->s_flags = qp->s_flags;
+                       __entry->psn = psn;
+                       __entry->s_psn = qp->s_psn;
+                       __entry->s_next_psn = qp->s_next_psn;
+                       __entry->s_sending_psn = qp->s_sending_psn;
+                       __entry->s_sending_hpsn = qp->s_sending_hpsn;
+                       __entry->r_psn = qp->r_psn;
+                       ),
+                   TP_printk(
+                       "[%s] qpn 0x%x s_flags 0x%x psn 0x%x s_psn 0x%x s_next_psn 0x%x s_sending_psn 0x%x sending_hpsn 0x%x r_psn 0x%x",
+                       __get_str(dev),
+                       __entry->qpn,
+                       __entry->s_flags,
+                       __entry->psn,
+                       __entry->s_psn,
+                       __entry->s_next_psn,
+                       __entry->s_sending_psn,
+                       __entry->s_sending_hpsn,
+                       __entry->r_psn
+                       )
+);
+
+DEFINE_EVENT(hfi1_rc_template, hfi1_rc_sendcomplete,
+            TP_PROTO(struct rvt_qp *qp, u32 psn),
+            TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(hfi1_rc_template, hfi1_rc_ack,
+            TP_PROTO(struct rvt_qp *qp, u32 psn),
+            TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(hfi1_rc_template, hfi1_rc_timeout,
+            TP_PROTO(struct rvt_qp *qp, u32 psn),
+            TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(hfi1_rc_template, hfi1_rc_rcv_error,
+            TP_PROTO(struct rvt_qp *qp, u32 psn),
+            TP_ARGS(qp, psn)
+);
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_misc
+
+TRACE_EVENT(hfi1_interrupt,
+           TP_PROTO(struct hfi1_devdata *dd, const struct is_table *is_entry,
+                    int src),
+           TP_ARGS(dd, is_entry, src),
+           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+                            __array(char, buf, 64)
+                            __field(int, src)
+                            ),
+           TP_fast_assign(DD_DEV_ASSIGN(dd)
+                          is_entry->is_name(__entry->buf, 64,
+                                            src - is_entry->start);
+                          __entry->src = src;
+                          ),
+           TP_printk("[%s] source: %s [%d]", __get_str(dev), __entry->buf,
+                     __entry->src)
+);
+
+/*
+ * Note:
+ * This produces a REALLY ugly trace in the console output when the string is
+ * too long.
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_trace
+
+#define MAX_MSG_LEN 512
+
+DECLARE_EVENT_CLASS(hfi1_trace_template,
+                   TP_PROTO(const char *function, struct va_format *vaf),
+                   TP_ARGS(function, vaf),
+                   TP_STRUCT__entry(__string(function, function)
+                                    __dynamic_array(char, msg, MAX_MSG_LEN)
+                                    ),
+                   TP_fast_assign(__assign_str(function, function);
+                                  WARN_ON_ONCE(vsnprintf
+                                               (__get_dynamic_array(msg),
+                                                MAX_MSG_LEN, vaf->fmt,
+                                                *vaf->va) >=
+                                               MAX_MSG_LEN);
+                                  ),
+                   TP_printk("(%s) %s",
+                             __get_str(function),
+                             __get_str(msg))
+);
+
+/*
+ * It would be nice to macroize __hfi1_trace, but the va_* handling requires
+ * an actual function and cannot live in a macro.
+ */
+#define __hfi1_trace_def(lvl) \
+void __hfi1_trace_##lvl(const char *funct, char *fmt, ...);            \
+                                                                       \
+DEFINE_EVENT(hfi1_trace_template, hfi1_ ##lvl,                         \
+       TP_PROTO(const char *function, struct va_format *vaf),          \
+       TP_ARGS(function, vaf))
+
+#define __hfi1_trace_fn(lvl) \
+void __hfi1_trace_##lvl(const char *func, char *fmt, ...)              \
+{                                                                      \
+       struct va_format vaf = {                                        \
+               .fmt = fmt,                                             \
+       };                                                              \
+       va_list args;                                                   \
+                                                                       \
+       va_start(args, fmt);                                            \
+       vaf.va = &args;                                                 \
+       trace_hfi1_ ##lvl(func, &vaf);                                  \
+       va_end(args);                                                   \
+       return;                                                         \
+}
+
+/*
+ * To create a new trace level, simply define it below and as a __hfi1_trace_fn
+ * in trace.c. This creates all the hooks for calling
+ * hfi1_cdbg(LVL, fmt, ...) (see the usage sketch below), as well as taking
+ * care of all the debugfs stuff.
+ */
+__hfi1_trace_def(PKT);
+__hfi1_trace_def(PROC);
+__hfi1_trace_def(SDMA);
+__hfi1_trace_def(LINKVERB);
+__hfi1_trace_def(DEBUG);
+__hfi1_trace_def(SNOOP);
+__hfi1_trace_def(CNTR);
+__hfi1_trace_def(PIO);
+__hfi1_trace_def(DC8051);
+__hfi1_trace_def(FIRMWARE);
+__hfi1_trace_def(RCVCTRL);
+__hfi1_trace_def(TID);
+__hfi1_trace_def(MMU);
+__hfi1_trace_def(IOCTL);
+
+#define hfi1_cdbg(which, fmt, ...) \
+       __hfi1_trace_##which(__func__, fmt, ##__VA_ARGS__)
+
+#define hfi1_dbg(fmt, ...) \
+       hfi1_cdbg(DEBUG, fmt, ##__VA_ARGS__)
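+
+/*
+ * Illustrative usage of the hooks above (a sketch only; the variables
+ * shown, such as idx, head and state_name, are hypothetical):
+ *
+ *     hfi1_cdbg(SDMA, "engine %u stalled at head %u", idx, head);
+ *     hfi1_dbg("unexpected state %s", state_name);
+ */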
+
+/*
+ * Define HFI1_EARLY_DBG at compile time or here to enable early trace
+ * messages. Do not check in a change that enables this.
+ */
+
+#ifdef HFI1_EARLY_DBG
+#define hfi1_dbg_early(fmt, ...) \
+       trace_printk(fmt, ##__VA_ARGS__)
+#else
+#define hfi1_dbg_early(fmt, ...)
+#endif
+
+#endif /* __HFI1_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
diff --git a/drivers/infiniband/hw/hfi1/twsi.c b/drivers/infiniband/hw/hfi1/twsi.c
new file mode 100644 (file)
index 0000000..e82e52a
--- /dev/null
@@ -0,0 +1,489 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+
+#include "hfi.h"
+#include "twsi.h"
+
+/*
+ * "Two Wire Serial Interface" support.
+ *
+ * Originally written for a not-quite-i2c serial eeprom, which is
+ * still used on some supported boards. Later boards have added a
+ * variety of other uses, most board-specific, so the bit-banging
+ * part has been split off to this file, while the other parts
+ * have been moved to chip-specific files.
+ *
+ * We have also dropped all pretense of a fully generic interface
+ * (e.g. pretending we don't know whether '1' is the higher voltage),
+ * as the restrictions of the generic i2c interface (e.g. no access
+ * from the driver itself) make it unsuitable for this use.
+ */
+
+#define READ_CMD 1
+#define WRITE_CMD 0
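+
+/*
+ * For orientation, a single-byte register read built from the helpers in
+ * this file has roughly this shape (a sketch only; error handling is
+ * omitted and 'dev', 'offset' and 'val' are illustrative):
+ *
+ *     start_seq(dd, target);                   START
+ *     wr_byte(dd, target, dev | WRITE_CMD);    select device for writing
+ *     wr_byte(dd, target, offset);             register offset within device
+ *     start_seq(dd, target);                   repeated START
+ *     wr_byte(dd, target, dev | READ_CMD);     re-select device for reading
+ *     val = rd_byte(dd, target, 1);            last byte, so send STOP
+ */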
+
+/**
+ * i2c_wait_for_writes - wait for a write
+ * @dd: the hfi1_ib device
+ *
+ * We use this instead of udelay directly, so we can make sure
+ * that previous register writes have been flushed all the way
+ * to the chip.  Since we are delaying anyway, the cost doesn't
+ * hurt, and it makes the bit twiddling more regular.
+ */
+static void i2c_wait_for_writes(struct hfi1_devdata *dd, u32 target)
+{
+       /*
+        * implicit read of EXTStatus is as good as explicit
+        * read of scratch, if all we want to do is flush
+        * writes.
+        */
+       hfi1_gpio_mod(dd, target, 0, 0, 0);
+       rmb(); /* inlined, so prevent compiler reordering */
+}
+
+/*
+ * QSFP modules are allowed to hold SCL low for 500uSec. Allow twice that
+ * for "almost compliant" modules
+ */
+#define SCL_WAIT_USEC 1000
+
+/* TWSI_BUF_WAIT_USEC is the time the bus must be free between a STOP or ACK
+ * and the next START. Should be 20, but some chips need more.
+ */
+#define TWSI_BUF_WAIT_USEC 60
+
+static void scl_out(struct hfi1_devdata *dd, u32 target, u8 bit)
+{
+       u32 mask;
+
+       udelay(1);
+
+       mask = QSFP_HFI0_I2CCLK;
+
+       /* SCL is meant to be bare-drain, so never set "OUT", just DIR */
+       hfi1_gpio_mod(dd, target, 0, bit ? 0 : mask, mask);
+
+       /*
+        * Allow for slow slaves with a simple delay on the
+        * falling edge, sampling on the rise.
+        */
+       if (!bit) {
+               udelay(2);
+       } else {
+               int rise_usec;
+
+               for (rise_usec = SCL_WAIT_USEC; rise_usec > 0; rise_usec -= 2) {
+                       if (mask & hfi1_gpio_mod(dd, target, 0, 0, 0))
+                               break;
+                       udelay(2);
+               }
+               if (rise_usec <= 0)
+                       dd_dev_err(dd, "SCL interface stuck low > %d uSec\n",
+                                  SCL_WAIT_USEC);
+       }
+       i2c_wait_for_writes(dd, target);
+}
+
+static u8 scl_in(struct hfi1_devdata *dd, u32 target, int wait)
+{
+       u32 read_val, mask;
+
+       mask = QSFP_HFI0_I2CCLK;
+       /* SCL is meant to be bare-drain, so never set "OUT", just DIR */
+       hfi1_gpio_mod(dd, target, 0, 0, mask);
+       read_val = hfi1_gpio_mod(dd, target, 0, 0, 0);
+       if (wait)
+               i2c_wait_for_writes(dd, target);
+       return (read_val & mask) >> GPIO_SCL_NUM;
+}
+
+static void sda_out(struct hfi1_devdata *dd, u32 target, u8 bit)
+{
+       u32 mask;
+
+       mask = QSFP_HFI0_I2CDAT;
+
+       /* SDA is meant to be bare-drain, so never set "OUT", just DIR */
+       hfi1_gpio_mod(dd, target, 0, bit ? 0 : mask, mask);
+
+       i2c_wait_for_writes(dd, target);
+       udelay(2);
+}
+
+static u8 sda_in(struct hfi1_devdata *dd, u32 target, int wait)
+{
+       u32 read_val, mask;
+
+       mask = QSFP_HFI0_I2CDAT;
+       /* SDA is meant to be bare-drain, so never set "OUT", just DIR */
+       hfi1_gpio_mod(dd, target, 0, 0, mask);
+       read_val = hfi1_gpio_mod(dd, target, 0, 0, 0);
+       if (wait)
+               i2c_wait_for_writes(dd, target);
+       return (read_val & mask) >> GPIO_SDA_NUM;
+}
+
+/**
+ * i2c_ackrcv - see if ack following write is true
+ * @dd: the hfi1_ib device
+ */
+static int i2c_ackrcv(struct hfi1_devdata *dd, u32 target)
+{
+       u8 ack_received;
+
+       /* AT ENTRY SCL = LOW */
+       /* change direction, ignore data */
+       ack_received = sda_in(dd, target, 1);
+       scl_out(dd, target, 1);
+       ack_received = sda_in(dd, target, 1) == 0;
+       scl_out(dd, target, 0);
+       return ack_received;
+}
+
+static void stop_cmd(struct hfi1_devdata *dd, u32 target);
+
+/**
+ * rd_byte - read a byte, sending STOP on last, else ACK
+ * @dd: the hfi1_ib device
+ *
+ * Returns byte shifted out of device
+ */
+static int rd_byte(struct hfi1_devdata *dd, u32 target, int last)
+{
+       int bit_cntr, data;
+
+       data = 0;
+
+       for (bit_cntr = 7; bit_cntr >= 0; --bit_cntr) {
+               data <<= 1;
+               scl_out(dd, target, 1);
+               data |= sda_in(dd, target, 0);
+               scl_out(dd, target, 0);
+       }
+       if (last) {
+               scl_out(dd, target, 1);
+               stop_cmd(dd, target);
+       } else {
+               sda_out(dd, target, 0);
+               scl_out(dd, target, 1);
+               scl_out(dd, target, 0);
+               sda_out(dd, target, 1);
+       }
+       return data;
+}
+
+/**
+ * wr_byte - write a byte, one bit at a time
+ * @dd: the hfi1_ib device
+ * @data: the byte to write
+ *
+ * Returns 0 if we got the following ack, otherwise 1
+ */
+static int wr_byte(struct hfi1_devdata *dd, u32 target, u8 data)
+{
+       int bit_cntr;
+       u8 bit;
+
+       for (bit_cntr = 7; bit_cntr >= 0; bit_cntr--) {
+               bit = (data >> bit_cntr) & 1;
+               sda_out(dd, target, bit);
+               scl_out(dd, target, 1);
+               scl_out(dd, target, 0);
+       }
+       return (!i2c_ackrcv(dd, target)) ? 1 : 0;
+}
+
+/*
+ * issue TWSI start sequence:
+ * (both clock/data high, clock high, data low while clock is high)
+ */
+static void start_seq(struct hfi1_devdata *dd, u32 target)
+{
+       sda_out(dd, target, 1);
+       scl_out(dd, target, 1);
+       sda_out(dd, target, 0);
+       udelay(1);
+       scl_out(dd, target, 0);
+}
+
+/**
+ * stop_seq - transmit the stop sequence
+ * @dd: the hfi1_ib device
+ *
+ * (both clock/data low, clock high, data high while clock is high)
+ */
+static void stop_seq(struct hfi1_devdata *dd, u32 target)
+{
+       scl_out(dd, target, 0);
+       sda_out(dd, target, 0);
+       scl_out(dd, target, 1);
+       sda_out(dd, target, 1);
+}
+
+/**
+ * stop_cmd - transmit the stop condition
+ * @dd: the hfi1_ib device
+ *
+ * (both clock/data low, clock high, data high while clock is high)
+ */
+static void stop_cmd(struct hfi1_devdata *dd, u32 target)
+{
+       stop_seq(dd, target);
+       udelay(TWSI_BUF_WAIT_USEC);
+}
+
+/**
+ * hfi1_twsi_reset - reset I2C communication
+ * @dd: the hfi1_ib device
+ *
+ * Returns 0 if OK, -EIO on error.
+ */
+int hfi1_twsi_reset(struct hfi1_devdata *dd, u32 target)
+{
+       int clock_cycles_left = 9;
+       u32 mask;
+
+       /* Both SCL and SDA should be high. If not, there
+        * is something wrong.
+        */
+       mask = QSFP_HFI0_I2CCLK | QSFP_HFI0_I2CDAT;
+
+       /*
+        * Force pins to desired innocuous state.
+        * This is the default power-on state with out=0 and dir=0,
+        * so they are tri-stated and should float high (barring HW problems).
+        */
+       hfi1_gpio_mod(dd, target, 0, 0, mask);
+
+       /* Check if SCL is low; if so, a slave device is misbehaving
+        * and there is not much we can do.
+        */
+       if (!scl_in(dd, target, 0))
+               return -EIO;
+
+       /* Check if SDA is low; if so, clock SCL up to 9 times so the
+        * device releases SDA and frees the bus.
+        */
+       while (clock_cycles_left--) {
+               if (sda_in(dd, target, 0))
+                       return 0;
+               scl_out(dd, target, 0);
+               scl_out(dd, target, 1);
+       }
+
+       return -EIO;
+}
+
+#define HFI1_TWSI_START 0x100
+#define HFI1_TWSI_STOP 0x200
+
+/* Write byte to TWSI, optionally prefixed with START or suffixed with
+ * STOP.
+ * Returns 0 if OK (ACK received), else != 0.
+ */
+static int twsi_wr(struct hfi1_devdata *dd, u32 target, int data, int flags)
+{
+       int ret = 1;
+
+       if (flags & HFI1_TWSI_START)
+               start_seq(dd, target);
+
+       /* Leaves SCL low (from i2c_ackrcv()) */
+       ret = wr_byte(dd, target, data);
+
+       if (flags & HFI1_TWSI_STOP)
+               stop_cmd(dd, target);
+       return ret;
+}
+
+/* Added functionality for IBA7220-based cards */
+#define HFI1_TEMP_DEV 0x98
+
+/*
+ * hfi1_twsi_blk_rd
+ * General interface for data transfer from twsi devices.
+ * One vestige of its former role is that it recognizes a device
+ * HFI1_TWSI_NO_DEV and does the correct operation for the legacy part,
+ * which responded to all TWSI device codes, interpreting them as an
+ * address within the device. On all other devices on boards handled by
+ * this driver, the device is followed by an N-byte "address" which selects
+ * the "register" or "offset" within the device from which data should
+ * be read.
+ */
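+/*
+ * For example (hypothetical values): to read 2 bytes starting at register
+ * offset 0x10 of a device at I2C address 0xA0, using a 1-byte offset, a
+ * caller would pack the offset size into the upper byte of 'dev':
+ *
+ *     hfi1_twsi_blk_rd(dd, target, (1 << 8) | 0xA0, 0x10, buf, 2);
+ */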
+int hfi1_twsi_blk_rd(struct hfi1_devdata *dd, u32 target, int dev, int addr,
+                    void *buffer, int len)
+{
+       u8 *bp = buffer;
+       int ret = 1;
+       int i;
+       int offset_size;
+
+       /* obtain the offset size, strip it from the device address */
+       offset_size = (dev >> 8) & 0xff;
+       dev &= 0xff;
+
+       /* allow at most a 2 byte offset */
+       if (offset_size > 2)
+               goto bail;
+
+       if (dev == HFI1_TWSI_NO_DEV) {
+               /* legacy not-really-I2C */
+               addr = (addr << 1) | READ_CMD;
+               ret = twsi_wr(dd, target, addr, HFI1_TWSI_START);
+       } else {
+               /* Actual I2C */
+               if (offset_size) {
+                       ret = twsi_wr(dd, target,
+                                     dev | WRITE_CMD, HFI1_TWSI_START);
+                       if (ret) {
+                               stop_cmd(dd, target);
+                               goto bail;
+                       }
+
+                       for (i = 0; i < offset_size; i++) {
+                               ret = twsi_wr(dd, target,
+                                             (addr >> (i * 8)) & 0xff, 0);
+                               udelay(TWSI_BUF_WAIT_USEC);
+                               if (ret) {
+                                       dd_dev_err(dd, "Failed to write byte %d of offset 0x%04X\n",
+                                                  i, addr);
+                                       goto bail;
+                               }
+                       }
+               }
+               ret = twsi_wr(dd, target, dev | READ_CMD, HFI1_TWSI_START);
+       }
+       if (ret) {
+               stop_cmd(dd, target);
+               goto bail;
+       }
+
+       /*
+        * Block devices keep clocking data out as long as we ACK,
+        * automatically incrementing the address. Some have "pages"
+        * whose boundaries will not be crossed, but the handling
+        * of these is left to the caller, who is in a better
+        * position to know.
+        */
+       while (len-- > 0) {
+               /*
+                * Get and store data, sending ACK if length remaining,
+                * else STOP
+                */
+               *bp++ = rd_byte(dd, target, !len);
+       }
+
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+/*
+ * hfi1_twsi_blk_wr
+ * General interface for data transfer to twsi devices.
+ * One vestige of its former role is that it recognizes a device
+ * HFI1_TWSI_NO_DEV and does the correct operation for the legacy part,
+ * which responded to all TWSI device codes, interpreting them as an
+ * address within the device. On all other devices on boards handled by
+ * this driver, the device is followed by an N-byte "address" which selects
+ * the "register" or "offset" within the device to which data should
+ * be written.
+ */
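+/*
+ * For example (hypothetical values): to write one byte, 0x5A, to register
+ * offset 0x07 of the same device, again with a 1-byte offset:
+ *
+ *     u8 v = 0x5A;
+ *
+ *     hfi1_twsi_blk_wr(dd, target, (1 << 8) | 0xA0, 0x07, &v, 1);
+ */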
+int hfi1_twsi_blk_wr(struct hfi1_devdata *dd, u32 target, int dev, int addr,
+                    const void *buffer, int len)
+{
+       const u8 *bp = buffer;
+       int ret = 1;
+       int i;
+       int offset_size;
+
+       /* obtain the offset size, strip it from the device address */
+       offset_size = (dev >> 8) & 0xff;
+       dev &= 0xff;
+
+       /* allow at most a 2 byte offset */
+       if (offset_size > 2)
+               goto bail;
+
+       if (dev == HFI1_TWSI_NO_DEV) {
+               if (twsi_wr(dd, target, (addr << 1) | WRITE_CMD,
+                           HFI1_TWSI_START)) {
+                       goto failed_write;
+               }
+       } else {
+               /* Real I2C */
+               if (twsi_wr(dd, target, dev | WRITE_CMD, HFI1_TWSI_START))
+                       goto failed_write;
+       }
+
+       for (i = 0; i < offset_size; i++) {
+               ret = twsi_wr(dd, target, (addr >> (i * 8)) & 0xff, 0);
+               udelay(TWSI_BUF_WAIT_USEC);
+               if (ret) {
+                       dd_dev_err(dd, "Failed to write byte %d of offset 0x%04X\n",
+                                  i, addr);
+                       goto bail;
+               }
+       }
+
+       for (i = 0; i < len; i++)
+               if (twsi_wr(dd, target, *bp++, 0))
+                       goto failed_write;
+
+       ret = 0;
+
+failed_write:
+       stop_cmd(dd, target);
+
+bail:
+       return ret;
+}
diff --git a/drivers/infiniband/hw/hfi1/twsi.h b/drivers/infiniband/hw/hfi1/twsi.h
new file mode 100644 (file)
index 0000000..5b8a5b5
--- /dev/null
@@ -0,0 +1,65 @@
+#ifndef _TWSI_H
+#define _TWSI_H
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define HFI1_TWSI_NO_DEV 0xFF
+
+struct hfi1_devdata;
+
+/* Bit position of SDA/SCL pins in ASIC_QSFP* registers  */
+#define  GPIO_SDA_NUM 1
+#define  GPIO_SCL_NUM 0
+
+/* these functions must be called with qsfp_lock held */
+int hfi1_twsi_reset(struct hfi1_devdata *dd, u32 target);
+int hfi1_twsi_blk_rd(struct hfi1_devdata *dd, u32 target, int dev, int addr,
+                    void *buffer, int len);
+int hfi1_twsi_blk_wr(struct hfi1_devdata *dd, u32 target, int dev, int addr,
+                    const void *buffer, int len);
+
+#endif /* _TWSI_H */
diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c
new file mode 100644 (file)
index 0000000..df773d4
--- /dev/null
@@ -0,0 +1,604 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "hfi.h"
+#include "verbs_txreq.h"
+#include "qp.h"
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_UC_##x
+
+/* only opcode mask for adaptive pio */
+const u32 uc_only_opcode =
+       BIT(OP(SEND_ONLY) & 0x1f) |
+       BIT(OP(SEND_ONLY_WITH_IMMEDIATE & 0x1f)) |
+       BIT(OP(RDMA_WRITE_ONLY & 0x1f)) |
+       BIT(OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE & 0x1f));
+
+/**
+ * hfi1_make_uc_req - construct a request packet (SEND, RDMA write)
+ * @qp: a pointer to the QP
+ * @ps: the current packet state
+ *
+ * Assumes the qp->s_lock is held.
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+       struct hfi1_other_headers *ohdr;
+       struct rvt_swqe *wqe;
+       u32 hwords = 5;
+       u32 bth0 = 0;
+       u32 len;
+       u32 pmtu = qp->pmtu;
+       int middle = 0;
+
+       ps->s_txreq = get_txreq(ps->dev, qp);
+       if (IS_ERR(ps->s_txreq))
+               goto bail_no_tx;
+
+       if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) {
+               if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
+                       goto bail;
+               /* We are in the error state, flush the work request. */
+               smp_read_barrier_depends(); /* see post_one_send() */
+               if (qp->s_last == ACCESS_ONCE(qp->s_head))
+                       goto bail;
+               /* If DMAs are in progress, we can't flush immediately. */
+               if (iowait_sdma_pending(&priv->s_iowait)) {
+                       qp->s_flags |= RVT_S_WAIT_DMA;
+                       goto bail;
+               }
+               clear_ahg(qp);
+               wqe = rvt_get_swqe_ptr(qp, qp->s_last);
+               hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+               goto done_free_tx;
+       }
+
+       ohdr = &ps->s_txreq->phdr.hdr.u.oth;
+       if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+               ohdr = &ps->s_txreq->phdr.hdr.u.l.oth;
+
+       /* Get the next send request. */
+       wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
+       qp->s_wqe = NULL;
+       switch (qp->s_state) {
+       default:
+               if (!(ib_rvt_state_ops[qp->state] &
+                   RVT_PROCESS_NEXT_SEND_OK))
+                       goto bail;
+               /* Check if send work queue is empty. */
+               smp_read_barrier_depends(); /* see post_one_send() */
+               if (qp->s_cur == ACCESS_ONCE(qp->s_head)) {
+                       clear_ahg(qp);
+                       goto bail;
+               }
+               /*
+                * Start a new request.
+                */
+               qp->s_psn = wqe->psn;
+               qp->s_sge.sge = wqe->sg_list[0];
+               qp->s_sge.sg_list = wqe->sg_list + 1;
+               qp->s_sge.num_sge = wqe->wr.num_sge;
+               qp->s_sge.total_len = wqe->length;
+               len = wqe->length;
+               qp->s_len = len;
+               switch (wqe->wr.opcode) {
+               case IB_WR_SEND:
+               case IB_WR_SEND_WITH_IMM:
+                       if (len > pmtu) {
+                               qp->s_state = OP(SEND_FIRST);
+                               len = pmtu;
+                               break;
+                       }
+                       if (wqe->wr.opcode == IB_WR_SEND) {
+                               qp->s_state = OP(SEND_ONLY);
+                       } else {
+                               qp->s_state =
+                                       OP(SEND_ONLY_WITH_IMMEDIATE);
+                               /* Immediate data comes after the BTH */
+                               ohdr->u.imm_data = wqe->wr.ex.imm_data;
+                               hwords += 1;
+                       }
+                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                               bth0 |= IB_BTH_SOLICITED;
+                       qp->s_wqe = wqe;
+                       if (++qp->s_cur >= qp->s_size)
+                               qp->s_cur = 0;
+                       break;
+
+               case IB_WR_RDMA_WRITE:
+               case IB_WR_RDMA_WRITE_WITH_IMM:
+                       ohdr->u.rc.reth.vaddr =
+                               cpu_to_be64(wqe->rdma_wr.remote_addr);
+                       ohdr->u.rc.reth.rkey =
+                               cpu_to_be32(wqe->rdma_wr.rkey);
+                       ohdr->u.rc.reth.length = cpu_to_be32(len);
+                       hwords += sizeof(struct ib_reth) / 4;
+                       if (len > pmtu) {
+                               qp->s_state = OP(RDMA_WRITE_FIRST);
+                               len = pmtu;
+                               break;
+                       }
+                       if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
+                               qp->s_state = OP(RDMA_WRITE_ONLY);
+                       } else {
+                               qp->s_state =
+                                       OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
+                               /* Immediate data comes after the RETH */
+                               ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
+                               hwords += 1;
+                               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                                       bth0 |= IB_BTH_SOLICITED;
+                       }
+                       qp->s_wqe = wqe;
+                       if (++qp->s_cur >= qp->s_size)
+                               qp->s_cur = 0;
+                       break;
+
+               default:
+                       goto bail;
+               }
+               break;
+
+       case OP(SEND_FIRST):
+               qp->s_state = OP(SEND_MIDDLE);
+               /* FALLTHROUGH */
+       case OP(SEND_MIDDLE):
+               len = qp->s_len;
+               if (len > pmtu) {
+                       len = pmtu;
+                       middle = HFI1_CAP_IS_KSET(SDMA_AHG);
+                       break;
+               }
+               if (wqe->wr.opcode == IB_WR_SEND) {
+                       qp->s_state = OP(SEND_LAST);
+               } else {
+                       qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
+                       /* Immediate data comes after the BTH */
+                       ohdr->u.imm_data = wqe->wr.ex.imm_data;
+                       hwords += 1;
+               }
+               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                       bth0 |= IB_BTH_SOLICITED;
+               qp->s_wqe = wqe;
+               if (++qp->s_cur >= qp->s_size)
+                       qp->s_cur = 0;
+               break;
+
+       case OP(RDMA_WRITE_FIRST):
+               qp->s_state = OP(RDMA_WRITE_MIDDLE);
+               /* FALLTHROUGH */
+       case OP(RDMA_WRITE_MIDDLE):
+               len = qp->s_len;
+               if (len > pmtu) {
+                       len = pmtu;
+                       middle = HFI1_CAP_IS_KSET(SDMA_AHG);
+                       break;
+               }
+               if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
+                       qp->s_state = OP(RDMA_WRITE_LAST);
+               } else {
+                       qp->s_state =
+                               OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
+                       /* Immediate data comes after the BTH */
+                       ohdr->u.imm_data = wqe->wr.ex.imm_data;
+                       hwords += 1;
+                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                               bth0 |= IB_BTH_SOLICITED;
+               }
+               qp->s_wqe = wqe;
+               if (++qp->s_cur >= qp->s_size)
+                       qp->s_cur = 0;
+               break;
+       }
+       qp->s_len -= len;
+       qp->s_hdrwords = hwords;
+       ps->s_txreq->sde = priv->s_sde;
+       qp->s_cur_sge = &qp->s_sge;
+       qp->s_cur_size = len;
+       hfi1_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24),
+                            mask_psn(qp->s_psn++), middle, ps);
+       /* pbc */
+       ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
+       return 1;
+
+done_free_tx:
+       hfi1_put_txreq(ps->s_txreq);
+       ps->s_txreq = NULL;
+       return 1;
+
+bail:
+       hfi1_put_txreq(ps->s_txreq);
+
+bail_no_tx:
+       ps->s_txreq = NULL;
+       qp->s_flags &= ~RVT_S_BUSY;
+       qp->s_hdrwords = 0;
+       return 0;
+}
+
+/**
+ * hfi1_uc_rcv - handle an incoming UC packet
+ * @packet: the packet structure; carries the header, receive flags, data,
+ *          length, and QP for the incoming UC packet
+ *
+ * This is called from qp_rcv() to process an incoming UC packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void hfi1_uc_rcv(struct hfi1_packet *packet)
+{
+       struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
+       struct hfi1_ib_header *hdr = packet->hdr;
+       u32 rcv_flags = packet->rcv_flags;
+       void *data = packet->ebuf;
+       u32 tlen = packet->tlen;
+       struct rvt_qp *qp = packet->qp;
+       struct hfi1_other_headers *ohdr = packet->ohdr;
+       u32 bth0, opcode;
+       u32 hdrsize = packet->hlen;
+       u32 psn;
+       u32 pad;
+       struct ib_wc wc;
+       u32 pmtu = qp->pmtu;
+       struct ib_reth *reth;
+       int has_grh = rcv_flags & HFI1_HAS_GRH;
+       int ret;
+       u32 bth1;
+
+       bth0 = be32_to_cpu(ohdr->bth[0]);
+       if (hfi1_ruc_check_hdr(ibp, hdr, has_grh, qp, bth0))
+               return;
+
+       bth1 = be32_to_cpu(ohdr->bth[1]);
+       if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) {
+               if (bth1 & HFI1_BECN_SMASK) {
+                       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+                       u32 rqpn, lqpn;
+                       u16 rlid = be16_to_cpu(hdr->lrh[3]);
+                       u8 sl, sc5;
+
+                       lqpn = bth1 & RVT_QPN_MASK;
+                       rqpn = qp->remote_qpn;
+
+                       sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+                       sl = ibp->sc_to_sl[sc5];
+
+                       process_becn(ppd, sl, rlid, lqpn, rqpn,
+                                    IB_CC_SVCTYPE_UC);
+               }
+
+               if (bth1 & HFI1_FECN_SMASK) {
+                       struct ib_grh *grh = NULL;
+                       u16 pkey = (u16)be32_to_cpu(ohdr->bth[0]);
+                       u16 slid = be16_to_cpu(hdr->lrh[3]);
+                       u16 dlid = be16_to_cpu(hdr->lrh[1]);
+                       u32 src_qp = qp->remote_qpn;
+                       u8 sc5;
+
+                       sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+                       if (has_grh)
+                               grh = &hdr->u.l.grh;
+
+                       return_cnp(ibp, qp, src_qp, pkey, dlid, slid, sc5,
+                                  grh);
+               }
+       }
+
+       psn = be32_to_cpu(ohdr->bth[2]);
+       opcode = (bth0 >> 24) & 0xff;
+
+       /* Compare the PSN versus the expected PSN. */
+       if (unlikely(cmp_psn(psn, qp->r_psn) != 0)) {
+               /*
+                * Handle a sequence error.
+                * Silently drop any current message.
+                */
+               qp->r_psn = psn;
+inv:
+               if (qp->r_state == OP(SEND_FIRST) ||
+                   qp->r_state == OP(SEND_MIDDLE)) {
+                       set_bit(RVT_R_REWIND_SGE, &qp->r_aflags);
+                       qp->r_sge.num_sge = 0;
+               } else {
+                       rvt_put_ss(&qp->r_sge);
+               }
+               qp->r_state = OP(SEND_LAST);
+               switch (opcode) {
+               case OP(SEND_FIRST):
+               case OP(SEND_ONLY):
+               case OP(SEND_ONLY_WITH_IMMEDIATE):
+                       goto send_first;
+
+               case OP(RDMA_WRITE_FIRST):
+               case OP(RDMA_WRITE_ONLY):
+               case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+                       goto rdma_first;
+
+               default:
+                       goto drop;
+               }
+       }
+
+       /* Check for opcode sequence errors. */
+       switch (qp->r_state) {
+       case OP(SEND_FIRST):
+       case OP(SEND_MIDDLE):
+               if (opcode == OP(SEND_MIDDLE) ||
+                   opcode == OP(SEND_LAST) ||
+                   opcode == OP(SEND_LAST_WITH_IMMEDIATE))
+                       break;
+               goto inv;
+
+       case OP(RDMA_WRITE_FIRST):
+       case OP(RDMA_WRITE_MIDDLE):
+               if (opcode == OP(RDMA_WRITE_MIDDLE) ||
+                   opcode == OP(RDMA_WRITE_LAST) ||
+                   opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+                       break;
+               goto inv;
+
+       default:
+               if (opcode == OP(SEND_FIRST) ||
+                   opcode == OP(SEND_ONLY) ||
+                   opcode == OP(SEND_ONLY_WITH_IMMEDIATE) ||
+                   opcode == OP(RDMA_WRITE_FIRST) ||
+                   opcode == OP(RDMA_WRITE_ONLY) ||
+                   opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
+                       break;
+               goto inv;
+       }
+
+       if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
+               qp_comm_est(qp);
+
+       /* OK, process the packet. */
+       switch (opcode) {
+       case OP(SEND_FIRST):
+       case OP(SEND_ONLY):
+       case OP(SEND_ONLY_WITH_IMMEDIATE):
+send_first:
+               if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) {
+                       qp->r_sge = qp->s_rdma_read_sge;
+               } else {
+                       ret = hfi1_rvt_get_rwqe(qp, 0);
+                       if (ret < 0)
+                               goto op_err;
+                       if (!ret)
+                               goto drop;
+                       /*
+                        * qp->s_rdma_read_sge will be the owner
+                        * of the mr references.
+                        */
+                       qp->s_rdma_read_sge = qp->r_sge;
+               }
+               qp->r_rcv_len = 0;
+               if (opcode == OP(SEND_ONLY))
+                       goto no_immediate_data;
+               else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE))
+                       goto send_last_imm;
+               /* FALLTHROUGH */
+       case OP(SEND_MIDDLE):
+               /* Check for an invalid length: PMTU mismatch or posted rwqe overrun. */
+               if (unlikely(tlen != (hdrsize + pmtu + 4)))
+                       goto rewind;
+               qp->r_rcv_len += pmtu;
+               if (unlikely(qp->r_rcv_len > qp->r_len))
+                       goto rewind;
+               hfi1_copy_sge(&qp->r_sge, data, pmtu, 0, 0);
+               break;
+
+       case OP(SEND_LAST_WITH_IMMEDIATE):
+send_last_imm:
+               wc.ex.imm_data = ohdr->u.imm_data;
+               wc.wc_flags = IB_WC_WITH_IMM;
+               goto send_last;
+       case OP(SEND_LAST):
+no_immediate_data:
+               wc.ex.imm_data = 0;
+               wc.wc_flags = 0;
+send_last:
+               /* Get the number of bytes the message was padded by. */
+               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
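+               /*
+                * pad comes from the 2-bit PadCnt field in bth[0] (bits
+                * 21:20), so 0-3 pad bytes; e.g., a 1-byte payload is
+                * padded with 3 bytes to reach a 4-byte boundary.
+                */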
+               /* Check for invalid length. */
+               /* LAST len should be >= 1 */
+               if (unlikely(tlen < (hdrsize + pad + 4)))
+                       goto rewind;
+               /* Don't count the CRC. */
+               tlen -= (hdrsize + pad + 4);
+               wc.byte_len = tlen + qp->r_rcv_len;
+               if (unlikely(wc.byte_len > qp->r_len))
+                       goto rewind;
+               wc.opcode = IB_WC_RECV;
+               hfi1_copy_sge(&qp->r_sge, data, tlen, 0, 0);
+               rvt_put_ss(&qp->s_rdma_read_sge);
+last_imm:
+               wc.wr_id = qp->r_wr_id;
+               wc.status = IB_WC_SUCCESS;
+               wc.qp = &qp->ibqp;
+               wc.src_qp = qp->remote_qpn;
+               wc.slid = qp->remote_ah_attr.dlid;
+               /*
+                * It seems that IB mandates the presence of an SL in a
+                * work completion only for the UD transport (see section
+                * 11.4.2 of IBTA Vol. 1).
+                *
+                * However, the way the SL is chosen below is consistent
+                * with the way that IB/qib works and is trying to avoid
+                * introducing incompatibilities.
+                *
+                * See also OPA Vol. 1, section 9.7.6, and table 9-17.
+                */
+               wc.sl = qp->remote_ah_attr.sl;
+               /* zero fields that are N/A */
+               wc.vendor_err = 0;
+               wc.pkey_index = 0;
+               wc.dlid_path_bits = 0;
+               wc.port_num = 0;
+               /* Signal completion event if the solicited bit is set. */
+               rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
+                            (ohdr->bth[0] &
+                             cpu_to_be32(IB_BTH_SOLICITED)) != 0);
+               break;
+
+       case OP(RDMA_WRITE_FIRST):
+       case OP(RDMA_WRITE_ONLY):
+       case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): /* consume RWQE */
+rdma_first:
+               if (unlikely(!(qp->qp_access_flags &
+                              IB_ACCESS_REMOTE_WRITE))) {
+                       goto drop;
+               }
+               reth = &ohdr->u.rc.reth;
+               qp->r_len = be32_to_cpu(reth->length);
+               qp->r_rcv_len = 0;
+               qp->r_sge.sg_list = NULL;
+               if (qp->r_len != 0) {
+                       u32 rkey = be32_to_cpu(reth->rkey);
+                       u64 vaddr = be64_to_cpu(reth->vaddr);
+                       int ok;
+
+                       /* Check rkey */
+                       ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len,
+                                        vaddr, rkey, IB_ACCESS_REMOTE_WRITE);
+                       if (unlikely(!ok))
+                               goto drop;
+                       qp->r_sge.num_sge = 1;
+               } else {
+                       qp->r_sge.num_sge = 0;
+                       qp->r_sge.sge.mr = NULL;
+                       qp->r_sge.sge.vaddr = NULL;
+                       qp->r_sge.sge.length = 0;
+                       qp->r_sge.sge.sge_length = 0;
+               }
+               if (opcode == OP(RDMA_WRITE_ONLY)) {
+                       goto rdma_last;
+               } else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) {
+                       wc.ex.imm_data = ohdr->u.rc.imm_data;
+                       goto rdma_last_imm;
+               }
+               /* FALLTHROUGH */
+       case OP(RDMA_WRITE_MIDDLE):
+               /* Check for an invalid length: PMTU mismatch or posted rwqe overrun. */
+               if (unlikely(tlen != (hdrsize + pmtu + 4)))
+                       goto drop;
+               qp->r_rcv_len += pmtu;
+               if (unlikely(qp->r_rcv_len > qp->r_len))
+                       goto drop;
+               hfi1_copy_sge(&qp->r_sge, data, pmtu, 1, 0);
+               break;
+
+       case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
+               wc.ex.imm_data = ohdr->u.imm_data;
+rdma_last_imm:
+               wc.wc_flags = IB_WC_WITH_IMM;
+
+               /* Get the number of bytes the message was padded by. */
+               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+               /* Check for invalid length. */
+               /* LAST len should be >= 1 */
+               if (unlikely(tlen < (hdrsize + pad + 4)))
+                       goto drop;
+               /* Don't count the CRC. */
+               tlen -= (hdrsize + pad + 4);
+               if (unlikely(tlen + qp->r_rcv_len != qp->r_len))
+                       goto drop;
+               if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) {
+                       rvt_put_ss(&qp->s_rdma_read_sge);
+               } else {
+                       ret = hfi1_rvt_get_rwqe(qp, 1);
+                       if (ret < 0)
+                               goto op_err;
+                       if (!ret)
+                               goto drop;
+               }
+               wc.byte_len = qp->r_len;
+               wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+               hfi1_copy_sge(&qp->r_sge, data, tlen, 1, 0);
+               rvt_put_ss(&qp->r_sge);
+               goto last_imm;
+
+       case OP(RDMA_WRITE_LAST):
+rdma_last:
+               /* Get the number of bytes the message was padded by. */
+               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+               /* Check for invalid length. */
+               /* LAST len should be >= 1 */
+               if (unlikely(tlen < (hdrsize + pad + 4)))
+                       goto drop;
+               /* Don't count the CRC. */
+               tlen -= (hdrsize + pad + 4);
+               if (unlikely(tlen + qp->r_rcv_len != qp->r_len))
+                       goto drop;
+               hfi1_copy_sge(&qp->r_sge, data, tlen, 1, 0);
+               rvt_put_ss(&qp->r_sge);
+               break;
+
+       default:
+               /* Drop packet for unknown opcodes. */
+               goto drop;
+       }
+       qp->r_psn++;
+       qp->r_state = opcode;
+       return;
+
+rewind:
+       set_bit(RVT_R_REWIND_SGE, &qp->r_aflags);
+       qp->r_sge.num_sge = 0;
+drop:
+       ibp->rvp.n_pkt_drops++;
+       return;
+
+op_err:
+       hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+}
diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c
new file mode 100644 (file)
index 0000000..1e503ad
--- /dev/null
@@ -0,0 +1,911 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/net.h>
+#include <rdma/ib_smi.h>
+
+#include "hfi.h"
+#include "mad.h"
+#include "verbs_txreq.h"
+#include "qp.h"
+
+/**
+ * ud_loopback - handle send on loopback QPs
+ * @sqp: the sending QP
+ * @swqe: the send work request
+ *
+ * This is called from hfi1_make_ud_req() to forward a WQE addressed
+ * to the same HFI.
+ * Note that the receive interrupt handler may be calling hfi1_ud_rcv()
+ * while this is being called.
+ */
+static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
+{
+       struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num);
+       struct hfi1_pportdata *ppd;
+       struct rvt_qp *qp;
+       struct ib_ah_attr *ah_attr;
+       unsigned long flags;
+       struct rvt_sge_state ssge;
+       struct rvt_sge *sge;
+       struct ib_wc wc;
+       u32 length;
+       enum ib_qp_type sqptype, dqptype;
+
+       rcu_read_lock();
+
+       qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp,
+                           swqe->ud_wr.remote_qpn);
+       if (!qp) {
+               ibp->rvp.n_pkt_drops++;
+               rcu_read_unlock();
+               return;
+       }
+
+       sqptype = sqp->ibqp.qp_type == IB_QPT_GSI ?
+                       IB_QPT_UD : sqp->ibqp.qp_type;
+       dqptype = qp->ibqp.qp_type == IB_QPT_GSI ?
+                       IB_QPT_UD : qp->ibqp.qp_type;
+
+       if (dqptype != sqptype ||
+           !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
+               ibp->rvp.n_pkt_drops++;
+               goto drop;
+       }
+
+       ah_attr = &ibah_to_rvtah(swqe->ud_wr.ah)->attr;
+       ppd = ppd_from_ibp(ibp);
+
+       if (qp->ibqp.qp_num > 1) {
+               u16 pkey;
+               u16 slid;
+               u8 sc5 = ibp->sl_to_sc[ah_attr->sl];
+
+               pkey = hfi1_get_pkey(ibp, sqp->s_pkey_index);
+               slid = ppd->lid | (ah_attr->src_path_bits &
+                                  ((1 << ppd->lmc) - 1));
+               if (unlikely(ingress_pkey_check(ppd, pkey, sc5,
+                                               qp->s_pkey_index, slid))) {
+                       hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY, pkey,
+                                      ah_attr->sl,
+                                      sqp->ibqp.qp_num, qp->ibqp.qp_num,
+                                      slid, ah_attr->dlid);
+                       goto drop;
+               }
+       }
+
+       /*
+        * Check that the qkey matches (except for QP0, see 9.6.1.4.1).
+        * Qkeys with the high order bit set mean use the
+        * qkey from the QP context instead of the WR (see 10.2.5).
+        */
+       if (qp->ibqp.qp_num) {
+               u32 qkey;
+
+               qkey = (int)swqe->ud_wr.remote_qkey < 0 ?
+                       sqp->qkey : swqe->ud_wr.remote_qkey;
+               if (unlikely(qkey != qp->qkey)) {
+                       u16 lid;
+
+                       lid = ppd->lid | (ah_attr->src_path_bits &
+                                         ((1 << ppd->lmc) - 1));
+                       hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_Q_KEY, qkey,
+                                      ah_attr->sl,
+                                      sqp->ibqp.qp_num, qp->ibqp.qp_num,
+                                      lid,
+                                      ah_attr->dlid);
+                       goto drop;
+               }
+       }
+
+       /*
+        * A GRH is expected to precede the data even if not
+        * present on the wire.
+        */
+       length = swqe->length;
+       memset(&wc, 0, sizeof(wc));
+       wc.byte_len = length + sizeof(struct ib_grh);
+
+       if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
+               wc.wc_flags = IB_WC_WITH_IMM;
+               wc.ex.imm_data = swqe->wr.ex.imm_data;
+       }
+
+       spin_lock_irqsave(&qp->r_lock, flags);
+
+       /*
+        * Get the next work request entry to find where to put the data.
+        */
+       if (qp->r_flags & RVT_R_REUSE_SGE) {
+               qp->r_flags &= ~RVT_R_REUSE_SGE;
+       } else {
+               int ret;
+
+               ret = hfi1_rvt_get_rwqe(qp, 0);
+               if (ret < 0) {
+                       hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+                       goto bail_unlock;
+               }
+               if (!ret) {
+                       if (qp->ibqp.qp_num == 0)
+                               ibp->rvp.n_vl15_dropped++;
+                       goto bail_unlock;
+               }
+       }
+       /* Silently drop packets which are too big. */
+       if (unlikely(wc.byte_len > qp->r_len)) {
+               qp->r_flags |= RVT_R_REUSE_SGE;
+               ibp->rvp.n_pkt_drops++;
+               goto bail_unlock;
+       }
+
+       if (ah_attr->ah_flags & IB_AH_GRH) {
+               hfi1_copy_sge(&qp->r_sge, &ah_attr->grh,
+                             sizeof(struct ib_grh), 1, 0);
+               wc.wc_flags |= IB_WC_GRH;
+       } else {
+               hfi1_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1);
+       }
+       ssge.sg_list = swqe->sg_list + 1;
+       ssge.sge = *swqe->sg_list;
+       ssge.num_sge = swqe->wr.num_sge;
+       sge = &ssge.sge;
+       while (length) {
+               u32 len = sge->length;
+
+               if (len > length)
+                       len = length;
+               if (len > sge->sge_length)
+                       len = sge->sge_length;
+               WARN_ON_ONCE(len == 0);
+               hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, 1, 0);
+               sge->vaddr += len;
+               sge->length -= len;
+               sge->sge_length -= len;
+               if (sge->sge_length == 0) {
+                       if (--ssge.num_sge)
+                               *sge = *ssge.sg_list++;
+               } else if (sge->length == 0 && sge->mr->lkey) {
+                       if (++sge->n >= RVT_SEGSZ) {
+                               if (++sge->m >= sge->mr->mapsz)
+                                       break;
+                               sge->n = 0;
+                       }
+                       sge->vaddr =
+                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
+                       sge->length =
+                               sge->mr->map[sge->m]->segs[sge->n].length;
+               }
+               length -= len;
+       }
+       rvt_put_ss(&qp->r_sge);
+       if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
+               goto bail_unlock;
+       wc.wr_id = qp->r_wr_id;
+       wc.status = IB_WC_SUCCESS;
+       wc.opcode = IB_WC_RECV;
+       wc.qp = &qp->ibqp;
+       wc.src_qp = sqp->ibqp.qp_num;
+       if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) {
+               if (sqp->ibqp.qp_type == IB_QPT_GSI ||
+                   sqp->ibqp.qp_type == IB_QPT_SMI)
+                       wc.pkey_index = swqe->ud_wr.pkey_index;
+               else
+                       wc.pkey_index = sqp->s_pkey_index;
+       } else {
+               wc.pkey_index = 0;
+       }
+       wc.slid = ppd->lid | (ah_attr->src_path_bits & ((1 << ppd->lmc) - 1));
+       /* Check for loopback when the port lid is not set */
+       if (wc.slid == 0 && sqp->ibqp.qp_type == IB_QPT_GSI)
+               wc.slid = be16_to_cpu(IB_LID_PERMISSIVE);
+       wc.sl = ah_attr->sl;
+       wc.dlid_path_bits = ah_attr->dlid & ((1 << ppd->lmc) - 1);
+       wc.port_num = qp->port_num;
+       /* Signal completion event if the solicited bit is set. */
+       rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
+                    swqe->wr.send_flags & IB_SEND_SOLICITED);
+       ibp->rvp.n_loop_pkts++;
+bail_unlock:
+       spin_unlock_irqrestore(&qp->r_lock, flags);
+drop:
+       rcu_read_unlock();
+}
+
+/**
+ * hfi1_make_ud_req - construct a UD request packet
+ * @qp: the QP
+ * @ps: the current packet send state
+ *
+ * Assume s_lock is held.
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+       struct hfi1_other_headers *ohdr;
+       struct ib_ah_attr *ah_attr;
+       struct hfi1_pportdata *ppd;
+       struct hfi1_ibport *ibp;
+       struct rvt_swqe *wqe;
+       u32 nwords;
+       u32 extra_bytes;
+       u32 bth0;
+       u16 lrh0;
+       u16 lid;
+       int next_cur;
+       u8 sc5;
+
+       ps->s_txreq = get_txreq(ps->dev, qp);
+       if (IS_ERR(ps->s_txreq))
+               goto bail_no_tx;
+
+       if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK)) {
+               if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
+                       goto bail;
+               /* We are in the error state, flush the work request. */
+               smp_read_barrier_depends(); /* see post_one_send */
+               if (qp->s_last == ACCESS_ONCE(qp->s_head))
+                       goto bail;
+               /* If DMAs are in progress, we can't flush immediately. */
+               if (iowait_sdma_pending(&priv->s_iowait)) {
+                       qp->s_flags |= RVT_S_WAIT_DMA;
+                       goto bail;
+               }
+               wqe = rvt_get_swqe_ptr(qp, qp->s_last);
+               hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+               goto done_free_tx;
+       }
+
+       /* see post_one_send() */
+       smp_read_barrier_depends();
+       if (qp->s_cur == ACCESS_ONCE(qp->s_head))
+               goto bail;
+
+       wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
+       next_cur = qp->s_cur + 1;
+       if (next_cur >= qp->s_size)
+               next_cur = 0;
+
+       /* Construct the header. */
+       ibp = to_iport(qp->ibqp.device, qp->port_num);
+       ppd = ppd_from_ibp(ibp);
+       ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr;
+       if (ah_attr->dlid < be16_to_cpu(IB_MULTICAST_LID_BASE) ||
+           ah_attr->dlid == be16_to_cpu(IB_LID_PERMISSIVE)) {
+               lid = ah_attr->dlid & ~((1 << ppd->lmc) - 1);
+               if (unlikely(!loopback &&
+                            (lid == ppd->lid ||
+                             (lid == be16_to_cpu(IB_LID_PERMISSIVE) &&
+                             qp->ibqp.qp_type == IB_QPT_GSI)))) {
+                       unsigned long tflags = ps->flags;
+                       /*
+                        * If DMAs are in progress, we can't generate
+                        * a completion for the loopback packet since
+                        * it would be out of order.
+                        * Instead of waiting, we could queue a
+                        * zero length descriptor so we get a callback.
+                        */
+                       if (iowait_sdma_pending(&priv->s_iowait)) {
+                               qp->s_flags |= RVT_S_WAIT_DMA;
+                               goto bail;
+                       }
+                       qp->s_cur = next_cur;
+                       spin_unlock_irqrestore(&qp->s_lock, tflags);
+                       ud_loopback(qp, wqe);
+                       spin_lock_irqsave(&qp->s_lock, tflags);
+                       ps->flags = tflags;
+                       hfi1_send_complete(qp, wqe, IB_WC_SUCCESS);
+                       goto done_free_tx;
+               }
+       }
+
+       qp->s_cur = next_cur;
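+       /*
+        * Round the payload up to a 4-byte boundary: e.g., a 9-byte
+        * payload gives extra_bytes = 3 and nwords = 3 (12 bytes).
+        */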
+       extra_bytes = -wqe->length & 3;
+       nwords = (wqe->length + extra_bytes) >> 2;
+
+       /* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */
+       qp->s_hdrwords = 7;
+       qp->s_cur_size = wqe->length;
+       qp->s_cur_sge = &qp->s_sge;
+       qp->s_srate = ah_attr->static_rate;
+       qp->srate_mbps = ib_rate_to_mbps(qp->s_srate);
+       qp->s_wqe = wqe;
+       qp->s_sge.sge = wqe->sg_list[0];
+       qp->s_sge.sg_list = wqe->sg_list + 1;
+       qp->s_sge.num_sge = wqe->wr.num_sge;
+       qp->s_sge.total_len = wqe->length;
+
+       if (ah_attr->ah_flags & IB_AH_GRH) {
+               /* Header size in 32-bit words. */
+               qp->s_hdrwords += hfi1_make_grh(ibp,
+                                               &ps->s_txreq->phdr.hdr.u.l.grh,
+                                               &ah_attr->grh,
+                                               qp->s_hdrwords, nwords);
+               lrh0 = HFI1_LRH_GRH;
+               ohdr = &ps->s_txreq->phdr.hdr.u.l.oth;
+               /*
+                * Don't worry about sending to locally attached multicast
+                * QPs.  The spec leaves that behavior unspecified.
+                */
+       } else {
+               /* Header size in 32-bit words. */
+               lrh0 = HFI1_LRH_BTH;
+               ohdr = &ps->s_txreq->phdr.hdr.u.oth;
+       }
+       if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
+               qp->s_hdrwords++;
+               ohdr->u.ud.imm_data = wqe->wr.ex.imm_data;
+               bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24;
+       } else {
+               bth0 = IB_OPCODE_UD_SEND_ONLY << 24;
+       }
+       sc5 = ibp->sl_to_sc[ah_attr->sl];
+       lrh0 |= (ah_attr->sl & 0xf) << 4;
+       if (qp->ibqp.qp_type == IB_QPT_SMI) {
+               lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */
+               priv->s_sc = 0xf;
+       } else {
+               lrh0 |= (sc5 & 0xf) << 12;
+               priv->s_sc = sc5;
+       }
+       priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc);
+       ps->s_txreq->sde = priv->s_sde;
+       priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc);
+       ps->s_txreq->psc = priv->s_sendcontext;
+       ps->s_txreq->phdr.hdr.lrh[0] = cpu_to_be16(lrh0);
+       ps->s_txreq->phdr.hdr.lrh[1] = cpu_to_be16(ah_attr->dlid);
+       ps->s_txreq->phdr.hdr.lrh[2] =
+               cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC);
+       if (ah_attr->dlid == be16_to_cpu(IB_LID_PERMISSIVE)) {
+               ps->s_txreq->phdr.hdr.lrh[3] = IB_LID_PERMISSIVE;
+       } else {
+               lid = ppd->lid;
+               if (lid) {
+                       lid |= ah_attr->src_path_bits & ((1 << ppd->lmc) - 1);
+                       ps->s_txreq->phdr.hdr.lrh[3] = cpu_to_be16(lid);
+               } else {
+                       ps->s_txreq->phdr.hdr.lrh[3] = IB_LID_PERMISSIVE;
+               }
+       }
+       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+               bth0 |= IB_BTH_SOLICITED;
+       bth0 |= extra_bytes << 20;
+       if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI)
+               bth0 |= hfi1_get_pkey(ibp, wqe->ud_wr.pkey_index);
+       else
+               bth0 |= hfi1_get_pkey(ibp, qp->s_pkey_index);
+       ohdr->bth[0] = cpu_to_be32(bth0);
+       ohdr->bth[1] = cpu_to_be32(wqe->ud_wr.remote_qpn);
+       ohdr->bth[2] = cpu_to_be32(mask_psn(wqe->psn));
+       /*
+        * Qkeys with the high order bit set mean use the
+        * qkey from the QP context instead of the WR (see 10.2.5).
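+        * For example, a WR qkey of 0x80000000 (high bit set) selects
+        * qp->qkey rather than the WR's qkey.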
+        */
+       ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.remote_qkey < 0 ?
+                                        qp->qkey : wqe->ud_wr.remote_qkey);
+       ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
+       /* disarm any ahg */
+       priv->s_hdr->ahgcount = 0;
+       priv->s_hdr->ahgidx = 0;
+       priv->s_hdr->tx_flags = 0;
+       priv->s_hdr->sde = NULL;
+       /* pbc */
+       ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
+
+       return 1;
+
+done_free_tx:
+       hfi1_put_txreq(ps->s_txreq);
+       ps->s_txreq = NULL;
+       return 1;
+
+bail:
+       hfi1_put_txreq(ps->s_txreq);
+
+bail_no_tx:
+       ps->s_txreq = NULL;
+       qp->s_flags &= ~RVT_S_BUSY;
+       qp->s_hdrwords = 0;
+       return 0;
+}
+
+/*
+ * Hardware can't check this so we do it here.
+ *
+ * This is a slightly different algorithm than the standard pkey check.  It
+ * special cases the management keys and allows for 0x7fff and 0xffff to be in
+ * the table at the same time.
+ *
+ * @returns the index found or -1 if not found
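+ *
+ * For example, a lookup of FULL_MGMT_P_KEY (0xffff) in a table that
+ * only contains LIM_MGMT_P_KEY (0x7fff) returns the index of the
+ * 0x7fff entry.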
+ */
+int hfi1_lookup_pkey_idx(struct hfi1_ibport *ibp, u16 pkey)
+{
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       unsigned i;
+
+       if (pkey == FULL_MGMT_P_KEY || pkey == LIM_MGMT_P_KEY) {
+               unsigned lim_idx = -1;
+
+               for (i = 0; i < ARRAY_SIZE(ppd->pkeys); ++i) {
+                       /* here we look for an exact match */
+                       if (ppd->pkeys[i] == pkey)
+                               return i;
+                       if (ppd->pkeys[i] == LIM_MGMT_P_KEY)
+                               lim_idx = i;
+               }
+
+               /* 0xffff not found; return the 0x7fff index if one was found, else -1 */
+               if (pkey == FULL_MGMT_P_KEY)
+                       return lim_idx;
+
+               /* no match...  */
+               return -1;
+       }
+
+       pkey &= 0x7fff; /* remove limited/full membership bit */
+
+       for (i = 0; i < ARRAY_SIZE(ppd->pkeys); ++i)
+               if ((ppd->pkeys[i] & 0x7fff) == pkey)
+                       return i;
+
+       /*
+        * Should not get here, this means hardware failed to validate pkeys.
+        */
+       return -1;
+}
+
+void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn,
+               u32 pkey, u32 slid, u32 dlid, u8 sc5,
+               const struct ib_grh *old_grh)
+{
+       u64 pbc, pbc_flags = 0;
+       u32 bth0, plen, vl, hwords = 5;
+       u16 lrh0;
+       u8 sl = ibp->sc_to_sl[sc5];
+       struct hfi1_ib_header hdr;
+       struct hfi1_other_headers *ohdr;
+       struct pio_buf *pbuf;
+       struct send_context *ctxt = qp_to_send_context(qp, sc5);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+       if (old_grh) {
+               struct ib_grh *grh = &hdr.u.l.grh;
+
+               grh->version_tclass_flow = old_grh->version_tclass_flow;
+               grh->paylen = cpu_to_be16((hwords - 2 + SIZE_OF_CRC) << 2);
+               grh->hop_limit = 0xff;
+               grh->sgid = old_grh->dgid;
+               grh->dgid = old_grh->sgid;
+               ohdr = &hdr.u.l.oth;
+               lrh0 = HFI1_LRH_GRH;
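+               /* a GRH is 40 bytes, i.e. 10 additional 32-bit header words */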
+               hwords += sizeof(struct ib_grh) / sizeof(u32);
+       } else {
+               ohdr = &hdr.u.oth;
+               lrh0 = HFI1_LRH_BTH;
+       }
+
+       lrh0 |= (sc5 & 0xf) << 12 | sl << 4;
+
+       bth0 = pkey | (IB_OPCODE_CNP << 24);
+       ohdr->bth[0] = cpu_to_be32(bth0);
+
+       ohdr->bth[1] = cpu_to_be32(remote_qpn | (1 << HFI1_BECN_SHIFT));
+       ohdr->bth[2] = 0; /* PSN 0 */
+
+       hdr.lrh[0] = cpu_to_be16(lrh0);
+       hdr.lrh[1] = cpu_to_be16(dlid);
+       hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
+       hdr.lrh[3] = cpu_to_be16(slid);
+
+       plen = 2 /* PBC */ + hwords;
+       pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
+       vl = sc_to_vlt(ppd->dd, sc5);
+       pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
+       if (ctxt) {
+               pbuf = sc_buffer_alloc(ctxt, plen, NULL, NULL);
+               if (pbuf)
+                       ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc,
+                                                &hdr, hwords);
+       }
+}
+
+/*
+ * opa_smp_check() - Do the regular pkey checking, and the additional
+ * checks for SMPs specified in OPAv1 rev 0.90, section 9.10.26
+ * ("SMA Packet Checks").
+ *
+ * Note that:
+ *   - Checks are done using the pkey directly from the packet's BTH,
+ *     and specifically _not_ the pkey that we attach to the completion,
+ *     which may be different.
+ *   - These checks are specifically for "non-local" SMPs (i.e., SMPs
+ *     which originated on another node). SMPs which are sent from, and
+ *     destined to this node are checked in opa_local_smp_check().
+ *
+ * At the point where opa_smp_check() is called, we know:
+ *   - destination QP is QP0
+ *
+ * opa_smp_check() returns 0 if all checks succeed, 1 otherwise.
+ */
+static int opa_smp_check(struct hfi1_ibport *ibp, u16 pkey, u8 sc5,
+                        struct rvt_qp *qp, u16 slid, struct opa_smp *smp)
+{
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+       /*
+        * I don't think it's possible for us to get here with sc5 != 0xf,
+        * but check it to be certain.
+        */
+       if (sc5 != 0xf)
+               return 1;
+
+       if (rcv_pkey_check(ppd, pkey, sc5, slid))
+               return 1;
+
+       /*
+        * At this point we know (and so don't need to check again) that
+        * the pkey is either LIM_MGMT_P_KEY, or FULL_MGMT_P_KEY
+        * (see ingress_pkey_check).
+        */
+       if (smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE &&
+           smp->mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED) {
+               ingress_pkey_table_fail(ppd, pkey, slid);
+               return 1;
+       }
+
+       /*
+        * SMPs fall into one of four (disjoint) categories:
+        * SMA request, SMA response, trap, or trap repress.
+        * Our response depends, in part, on which type of
+        * SMP we're processing.
+        *
+        * If this is not an SMA request, or trap repress:
+        *   - accept MAD if the port is running an SM
+        *   - pkey == FULL_MGMT_P_KEY =>
+        *       reply with unsupported method (i.e., just mark
+        *       the smp's status field here, and let it be
+        *       processed normally)
+        *   - pkey != LIM_MGMT_P_KEY =>
+        *       increment port recv constraint errors, drop MAD
+        * If this is an SMA request or trap repress:
+        *   - pkey != FULL_MGMT_P_KEY =>
+        *       increment port recv constraint errors, drop MAD
+        */
+       switch (smp->method) {
+       case IB_MGMT_METHOD_GET:
+       case IB_MGMT_METHOD_SET:
+       case IB_MGMT_METHOD_REPORT:
+       case IB_MGMT_METHOD_TRAP_REPRESS:
+               if (pkey != FULL_MGMT_P_KEY) {
+                       ingress_pkey_table_fail(ppd, pkey, slid);
+                       return 1;
+               }
+               break;
+       case IB_MGMT_METHOD_SEND:
+       case IB_MGMT_METHOD_TRAP:
+       case IB_MGMT_METHOD_GET_RESP:
+       case IB_MGMT_METHOD_REPORT_RESP:
+               if (ibp->rvp.port_cap_flags & IB_PORT_SM)
+                       return 0;
+               if (pkey == FULL_MGMT_P_KEY) {
+                       smp->status |= IB_SMP_UNSUP_METHOD;
+                       return 0;
+               }
+               if (pkey != LIM_MGMT_P_KEY) {
+                       ingress_pkey_table_fail(ppd, pkey, slid);
+                       return 1;
+               }
+               break;
+       default:
+               break;
+       }
+       return 0;
+}
+
+/**
+ * hfi1_ud_rcv - receive an incoming UD packet
+ * @packet: the packet to process, including the header, receive flags,
+ *          data, length, and the QP the packet arrived on
+ *
+ * This is called from qp_rcv() to process an incoming UD packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void hfi1_ud_rcv(struct hfi1_packet *packet)
+{
+       struct hfi1_other_headers *ohdr = packet->ohdr;
+       int opcode;
+       u32 hdrsize = packet->hlen;
+       u32 pad;
+       struct ib_wc wc;
+       u32 qkey;
+       u32 src_qp;
+       u16 dlid, pkey;
+       int mgmt_pkey_idx = -1;
+       struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
+       struct hfi1_ib_header *hdr = packet->hdr;
+       u32 rcv_flags = packet->rcv_flags;
+       void *data = packet->ebuf;
+       u32 tlen = packet->tlen;
+       struct rvt_qp *qp = packet->qp;
+       bool has_grh = rcv_flags & HFI1_HAS_GRH;
+       bool sc4_bit = has_sc4_bit(packet);
+       u8 sc;
+       u32 bth1;
+       int is_mcast;
+       struct ib_grh *grh = NULL;
+
+       qkey = be32_to_cpu(ohdr->u.ud.deth[0]);
+       src_qp = be32_to_cpu(ohdr->u.ud.deth[1]) & RVT_QPN_MASK;
+       dlid = be16_to_cpu(hdr->lrh[1]);
+       is_mcast = (dlid > be16_to_cpu(IB_MULTICAST_LID_BASE)) &&
+                       (dlid != be16_to_cpu(IB_LID_PERMISSIVE));
+       bth1 = be32_to_cpu(ohdr->bth[1]);
+       if (unlikely(bth1 & HFI1_BECN_SMASK)) {
+               /*
+                * In pre-B0 h/w the CNP_OPCODE is handled via an
+                * error path.
+                */
+               struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+               u32 lqpn =  be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
+               u8 sl, sc5;
+
+               sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
+               sc5 |= sc4_bit;
+               sl = ibp->sc_to_sl[sc5];
+
+               process_becn(ppd, sl, 0, lqpn, 0, IB_CC_SVCTYPE_UD);
+       }
+
+       /*
+        * The opcode is in the low byte when it's in network order
+        * (top byte when in host order).
+        */
+       opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+       opcode &= 0xff;
+
+       pkey = (u16)be32_to_cpu(ohdr->bth[0]);
+
+       if (!is_mcast && (opcode != IB_OPCODE_CNP) && bth1 & HFI1_FECN_SMASK) {
+               u16 slid = be16_to_cpu(hdr->lrh[3]);
+               u8 sc5;
+
+               sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
+               sc5 |= sc4_bit;
+
+               return_cnp(ibp, qp, src_qp, pkey, dlid, slid, sc5, grh);
+       }
+       /*
+        * Get the number of bytes the message was padded by
+        * and drop incomplete packets.
+        */
+       pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+       if (unlikely(tlen < (hdrsize + pad + 4)))
+               goto drop;
+
+       tlen -= hdrsize + pad + 4;
+
+       /*
+        * Check that the permissive LID is only used on QP0
+        * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1).
+        */
+       if (qp->ibqp.qp_num) {
+               if (unlikely(hdr->lrh[1] == IB_LID_PERMISSIVE ||
+                            hdr->lrh[3] == IB_LID_PERMISSIVE))
+                       goto drop;
+               if (qp->ibqp.qp_num > 1) {
+                       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+                       u16 slid;
+                       u8 sc5;
+
+                       sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
+                       sc5 |= sc4_bit;
+
+                       slid = be16_to_cpu(hdr->lrh[3]);
+                       if (unlikely(rcv_pkey_check(ppd, pkey, sc5, slid))) {
+                               /*
+                                * Traps will not be sent for packets dropped
+                                * by the HW. This is fine, as sending a trap
+                                * for an invalid pkey is optional according to
+                                * the IB spec (release 1.3, section 10.9.4).
+                                */
+                               hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY,
+                                              pkey,
+                                              (be16_to_cpu(hdr->lrh[0]) >> 4) &
+                                               0xF,
+                                              src_qp, qp->ibqp.qp_num,
+                                              be16_to_cpu(hdr->lrh[3]),
+                                              be16_to_cpu(hdr->lrh[1]));
+                               return;
+                       }
+               } else {
+                       /* GSI packet */
+                       mgmt_pkey_idx = hfi1_lookup_pkey_idx(ibp, pkey);
+                       if (mgmt_pkey_idx < 0)
+                               goto drop;
+               }
+               if (unlikely(qkey != qp->qkey)) {
+                       hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_Q_KEY, qkey,
+                                      (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
+                                      src_qp, qp->ibqp.qp_num,
+                                      be16_to_cpu(hdr->lrh[3]),
+                                      be16_to_cpu(hdr->lrh[1]));
+                       return;
+               }
+               /* Drop invalid MAD packets (see 13.5.3.1). */
+               if (unlikely(qp->ibqp.qp_num == 1 &&
+                            (tlen > 2048 ||
+                             (be16_to_cpu(hdr->lrh[0]) >> 12) == 15)))
+                       goto drop;
+       } else {
+               /* Received on QP0, and so by definition, this is an SMP */
+               struct opa_smp *smp = (struct opa_smp *)data;
+               u16 slid = be16_to_cpu(hdr->lrh[3]);
+               u8 sc5;
+
+               sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
+               sc5 |= sc4_bit;
+
+               if (opa_smp_check(ibp, pkey, sc5, qp, slid, smp))
+                       goto drop;
+
+               if (tlen > 2048)
+                       goto drop;
+               if ((hdr->lrh[1] == IB_LID_PERMISSIVE ||
+                    hdr->lrh[3] == IB_LID_PERMISSIVE) &&
+                   smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+                       goto drop;
+
+               /* look up SMI pkey */
+               mgmt_pkey_idx = hfi1_lookup_pkey_idx(ibp, pkey);
+               if (mgmt_pkey_idx < 0)
+                       goto drop;
+       }
+
+       if (qp->ibqp.qp_num > 1 &&
+           opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) {
+               wc.ex.imm_data = ohdr->u.ud.imm_data;
+               wc.wc_flags = IB_WC_WITH_IMM;
+               tlen -= sizeof(u32);
+       } else if (opcode == IB_OPCODE_UD_SEND_ONLY) {
+               wc.ex.imm_data = 0;
+               wc.wc_flags = 0;
+       } else {
+               goto drop;
+       }
+
+       /*
+        * A GRH is expected to precede the data even if not
+        * present on the wire.
+        */
+       wc.byte_len = tlen + sizeof(struct ib_grh);
+
+       /*
+        * Get the next work request entry to find where to put the data.
+        */
+       if (qp->r_flags & RVT_R_REUSE_SGE) {
+               qp->r_flags &= ~RVT_R_REUSE_SGE;
+       } else {
+               int ret;
+
+               ret = hfi1_rvt_get_rwqe(qp, 0);
+               if (ret < 0) {
+                       hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+                       return;
+               }
+               if (!ret) {
+                       if (qp->ibqp.qp_num == 0)
+                               ibp->rvp.n_vl15_dropped++;
+                       return;
+               }
+       }
+       /* Silently drop packets which are too big. */
+       if (unlikely(wc.byte_len > qp->r_len)) {
+               qp->r_flags |= RVT_R_REUSE_SGE;
+               goto drop;
+       }
+       if (has_grh) {
+               hfi1_copy_sge(&qp->r_sge, &hdr->u.l.grh,
+                             sizeof(struct ib_grh), 1, 0);
+               wc.wc_flags |= IB_WC_GRH;
+       } else {
+               hfi1_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1);
+       }
+       hfi1_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh),
+                     1, 0);
+       rvt_put_ss(&qp->r_sge);
+       if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
+               return;
+       wc.wr_id = qp->r_wr_id;
+       wc.status = IB_WC_SUCCESS;
+       wc.opcode = IB_WC_RECV;
+       wc.vendor_err = 0;
+       wc.qp = &qp->ibqp;
+       wc.src_qp = src_qp;
+
+       if (qp->ibqp.qp_type == IB_QPT_GSI ||
+           qp->ibqp.qp_type == IB_QPT_SMI) {
+               if (mgmt_pkey_idx < 0) {
+                       if (net_ratelimit()) {
+                               struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+                               struct hfi1_devdata *dd = ppd->dd;
+
+                               dd_dev_err(dd, "QP type %d mgmt_pkey_idx < 0 and packet not dropped???\n",
+                                          qp->ibqp.qp_type);
+                               mgmt_pkey_idx = 0;
+                       }
+               }
+               wc.pkey_index = (unsigned)mgmt_pkey_idx;
+       } else {
+               wc.pkey_index = 0;
+       }
+
+       wc.slid = be16_to_cpu(hdr->lrh[3]);
+       sc = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
+       sc |= sc4_bit;
+       wc.sl = ibp->sc_to_sl[sc];
+
+       /*
+        * Save the LMC lower bits if the destination LID is a unicast LID.
+        */
+       wc.dlid_path_bits = dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE) ? 0 :
+               dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1);
+       wc.port_num = qp->port_num;
+       /* Signal completion event if the solicited bit is set. */
+       rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
+                    (ohdr->bth[0] &
+                     cpu_to_be32(IB_BTH_SOLICITED)) != 0);
+       return;
+
+drop:
+       ibp->rvp.n_pkt_drops++;
+}
diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
new file mode 100644 (file)
index 0000000..1b640a3
--- /dev/null
@@ -0,0 +1,1050 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <asm/page.h>
+
+#include "user_exp_rcv.h"
+#include "trace.h"
+#include "mmu_rb.h"
+
+struct tid_group {
+       struct list_head list;
+       unsigned base;
+       u8 size;
+       u8 used;
+       u8 map;
+};
+
+struct tid_rb_node {
+       struct mmu_rb_node mmu;
+       unsigned long phys;
+       struct tid_group *grp;
+       u32 rcventry;
+       dma_addr_t dma_addr;
+       bool freed;
+       unsigned npages;
+       struct page *pages[0];
+};
+
+struct tid_pageset {
+       u16 idx;
+       u16 count;
+};
+
+#define EXP_TID_SET_EMPTY(set) (set.count == 0 && list_empty(&set.list))
+
+#define num_user_pages(vaddr, len)                                    \
+       (1 + (((((unsigned long)(vaddr) +                              \
+                (unsigned long)(len) - 1) & PAGE_MASK) -              \
+              ((unsigned long)vaddr & PAGE_MASK)) >> PAGE_SHIFT))
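+/*
+ * For example, with 4 KiB pages a 0x20-byte buffer that starts 0xff0
+ * bytes into a page crosses a page boundary and counts as 2 pages.
+ */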
+
+static void unlock_exp_tids(struct hfi1_ctxtdata *, struct exp_tid_set *,
+                           struct rb_root *);
+static u32 find_phys_blocks(struct page **, unsigned, struct tid_pageset *);
+static int set_rcvarray_entry(struct file *, unsigned long, u32,
+                             struct tid_group *, struct page **, unsigned);
+static int mmu_rb_insert(struct rb_root *, struct mmu_rb_node *);
+static void mmu_rb_remove(struct rb_root *, struct mmu_rb_node *,
+                         struct mm_struct *);
+static int mmu_rb_invalidate(struct rb_root *, struct mmu_rb_node *);
+static int program_rcvarray(struct file *, unsigned long, struct tid_group *,
+                           struct tid_pageset *, unsigned, u16, struct page **,
+                           u32 *, unsigned *, unsigned *);
+static int unprogram_rcvarray(struct file *, u32, struct tid_group **);
+static void clear_tid_node(struct hfi1_filedata *, u16, struct tid_rb_node *);
+
+static struct mmu_rb_ops tid_rb_ops = {
+       .insert = mmu_rb_insert,
+       .remove = mmu_rb_remove,
+       .invalidate = mmu_rb_invalidate
+};
+
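+/*
+ * Each pair of RcvArray entries shares a TID index: e.g., rcventries 10
+ * and 11 both yield IDX 5, with CTRL 1 for the even entry and CTRL 2
+ * for the odd one.
+ */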
+static inline u32 rcventry2tidinfo(u32 rcventry)
+{
+       u32 pair = rcventry & ~0x1;
+
+       return EXP_TID_SET(IDX, pair >> 1) |
+               EXP_TID_SET(CTRL, 1 << (rcventry - pair));
+}
+
+static inline void exp_tid_group_init(struct exp_tid_set *set)
+{
+       INIT_LIST_HEAD(&set->list);
+       set->count = 0;
+}
+
+static inline void tid_group_remove(struct tid_group *grp,
+                                   struct exp_tid_set *set)
+{
+       list_del_init(&grp->list);
+       set->count--;
+}
+
+static inline void tid_group_add_tail(struct tid_group *grp,
+                                     struct exp_tid_set *set)
+{
+       list_add_tail(&grp->list, &set->list);
+       set->count++;
+}
+
+static inline struct tid_group *tid_group_pop(struct exp_tid_set *set)
+{
+       struct tid_group *grp =
+               list_first_entry(&set->list, struct tid_group, list);
+       list_del_init(&grp->list);
+       set->count--;
+       return grp;
+}
+
+static inline void tid_group_move(struct tid_group *group,
+                                 struct exp_tid_set *s1,
+                                 struct exp_tid_set *s2)
+{
+       tid_group_remove(group, s1);
+       tid_group_add_tail(group, s2);
+}
+
+/*
+ * Initialize context and file private data needed for Expected
+ * receive caching. This needs to be done after the context has
+ * been configured with the eager/expected RcvEntry counts.
+ */
+int hfi1_user_exp_rcv_init(struct file *fp)
+{
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       struct hfi1_devdata *dd = uctxt->dd;
+       unsigned tidbase;
+       int i, ret = 0;
+
+       spin_lock_init(&fd->tid_lock);
+       spin_lock_init(&fd->invalid_lock);
+       fd->tid_rb_root = RB_ROOT;
+
+       if (!uctxt->subctxt_cnt || !fd->subctxt) {
+               exp_tid_group_init(&uctxt->tid_group_list);
+               exp_tid_group_init(&uctxt->tid_used_list);
+               exp_tid_group_init(&uctxt->tid_full_list);
+
+               tidbase = uctxt->expected_base;
+               for (i = 0; i < uctxt->expected_count /
+                            dd->rcv_entries.group_size; i++) {
+                       struct tid_group *grp;
+
+                       grp = kzalloc(sizeof(*grp), GFP_KERNEL);
+                       if (!grp) {
+                               /*
+                                * If we fail here, the groups already
+                                * allocated will be freed by the close
+                                * call.
+                                */
+                               ret = -ENOMEM;
+                               goto done;
+                       }
+                       grp->size = dd->rcv_entries.group_size;
+                       grp->base = tidbase;
+                       tid_group_add_tail(grp, &uctxt->tid_group_list);
+                       tidbase += dd->rcv_entries.group_size;
+               }
+       }
+
+       fd->entry_to_rb = kcalloc(uctxt->expected_count,
+                                    sizeof(struct rb_node *),
+                                    GFP_KERNEL);
+       if (!fd->entry_to_rb)
+               return -ENOMEM;
+
+       if (!HFI1_CAP_IS_USET(TID_UNMAP)) {
+               fd->invalid_tid_idx = 0;
+               fd->invalid_tids = kzalloc(uctxt->expected_count *
+                                          sizeof(u32), GFP_KERNEL);
+               if (!fd->invalid_tids) {
+                       ret = -ENOMEM;
+                       goto done;
+               }
+
+               /*
+                * Register MMU notifier callbacks. If the registration
+                * fails, continue but turn off the TID caching for
+                * all user contexts.
+                */
+               ret = hfi1_mmu_rb_register(&fd->tid_rb_root, &tid_rb_ops);
+               if (ret) {
+                       dd_dev_info(dd,
+                                   "Failed MMU notifier registration %d\n",
+                                   ret);
+                       HFI1_CAP_USET(TID_UNMAP);
+                       ret = 0;
+               }
+       }
+
+       /*
+        * PSM does not have a good way to separate, count, and
+        * effectively enforce a limit on RcvArray entries used by
+        * subctxts (when context sharing is used) when TID caching
+        * is enabled. To help with that, we calculate a per-process
+        * RcvArray entry share and enforce that.
+        * If TID caching is not in use, PSM deals with usage on its
+        * own. In that case, we allow any subctxt to take all of the
+        * entries.
+        *
+        * Make sure that we set the tid counts only after successful
+        * init.
+        */
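+       /*
+        * For example, 2048 expected entries shared by 3 subctxts gives
+        * per-process limits of 683, 683, and 682.
+        */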
+       spin_lock(&fd->tid_lock);
+       if (uctxt->subctxt_cnt && !HFI1_CAP_IS_USET(TID_UNMAP)) {
+               u16 remainder;
+
+               fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
+               remainder = uctxt->expected_count % uctxt->subctxt_cnt;
+               if (remainder && fd->subctxt < remainder)
+                       fd->tid_limit++;
+       } else {
+               fd->tid_limit = uctxt->expected_count;
+       }
+       spin_unlock(&fd->tid_lock);
+done:
+       return ret;
+}
+
+int hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
+{
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       struct tid_group *grp, *gptr;
+
+       if (!test_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags))
+               return 0;
+       /*
+        * The notifier would have been removed when the process's mm
+        * was freed.
+        */
+       if (!HFI1_CAP_IS_USET(TID_UNMAP))
+               hfi1_mmu_rb_unregister(&fd->tid_rb_root);
+
+       kfree(fd->invalid_tids);
+
+       if (!uctxt->cnt) {
+               if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
+                       unlock_exp_tids(uctxt, &uctxt->tid_full_list,
+                                       &fd->tid_rb_root);
+               if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
+                       unlock_exp_tids(uctxt, &uctxt->tid_used_list,
+                                       &fd->tid_rb_root);
+               list_for_each_entry_safe(grp, gptr, &uctxt->tid_group_list.list,
+                                        list) {
+                       list_del_init(&grp->list);
+                       kfree(grp);
+               }
+               hfi1_clear_tids(uctxt);
+       }
+
+       kfree(fd->entry_to_rb);
+       return 0;
+}
+
+/*
+ * Write an "empty" RcvArray entry.
+ * This function exists so the TID registration code can use it
+ * to write to unused/unneeded entries and still take advantage
+ * of the WC performance improvements. The HFI will ignore this
+ * write to the RcvArray entry.
+ */
+static inline void rcv_array_wc_fill(struct hfi1_devdata *dd, u32 index)
+{
+       /*
+        * Doing the WC fill writes only makes sense if the device is
+        * present and the RcvArray has been mapped as WC memory.
+        */
+       if ((dd->flags & HFI1_PRESENT) && dd->rcvarray_wc)
+               writeq(0, dd->rcvarray_wc + (index * 8));
+}
+
+/*
+ * RcvArray entry allocation for Expected Receives is done by the
+ * following algorithm:
+ *
+ * The context keeps 3 lists of groups of RcvArray entries:
+ *   1. List of empty groups - tid_group_list
+ *      This list is created during user context creation and
+ *      contains elements which describe sets (of 8) of empty
+ *      RcvArray entries.
+ *   2. List of partially used groups - tid_used_list
+ *      This list contains sets of RcvArray entries which are
+ *      not completely used up. Another mapping request could
+ *      use some or all of the remaining entries.
+ *   3. List of full groups - tid_full_list
+ *      This is the list where sets that are completely used
+ *      up go.
+ *
+ * An attempt to optimize the usage of RcvArray entries is
+ * made by finding all sets of physically contiguous pages in a
+ * user's buffer.
+ * These physically contiguous sets are further split into
+ * sizes supported by the receive engine of the HFI. The
+ * resulting sets of pages are stored in struct tid_pageset,
+ * which describes the sets as:
+ *    * .count - number of pages in this set
+ *    * .idx - starting index into struct page ** array
+ *                    of this set
+ *
+ * From this point on, the algorithm deals with the page sets
+ * described above. The number of pagesets is divided by the
+ * RcvArray group size to produce the number of full groups
+ * needed.
+ *
+ * Groups from the 3 lists are manipulated using the following
+ * rules:
+ *   1. For each set of 8 pagesets, a complete group from
+ *      tid_group_list is taken, programmed, and moved to
+ *      the tid_full_list list.
+ *   2. For all remaining pagesets:
+ *      2.1 If the tid_used_list is empty and the tid_group_list
+ *          is empty, stop processing pagesets and return only
+ *          what has been programmed up to this point.
+ *      2.2 If the tid_used_list is empty and the tid_group_list
+ *          is not empty, move a group from tid_group_list to
+ *          tid_used_list.
+ *      2.3 For each group in tid_used_list, program as much as
+ *          can fit into the group. If the group becomes fully
+ *          used, move it to tid_full_list.
+ */
+int hfi1_user_exp_rcv_setup(struct file *fp, struct hfi1_tid_info *tinfo)
+{
+       int ret = 0, need_group = 0, pinned;
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       struct hfi1_devdata *dd = uctxt->dd;
+       unsigned npages, ngroups, pageidx = 0, pageset_count, npagesets,
+               tididx = 0, mapped, mapped_pages = 0;
+       unsigned long vaddr = tinfo->vaddr;
+       struct page **pages = NULL;
+       u32 *tidlist = NULL;
+       struct tid_pageset *pagesets = NULL;
+
+       /* Get the number of pages the user buffer spans */
+       npages = num_user_pages(vaddr, tinfo->length);
+       if (!npages)
+               return -EINVAL;
+
+       if (npages > uctxt->expected_count) {
+               dd_dev_err(dd, "Expected buffer too big\n");
+               return -EINVAL;
+       }
+
+       /* Verify that access is OK for the user buffer */
+       if (!access_ok(VERIFY_WRITE, (void __user *)vaddr,
+                      npages * PAGE_SIZE)) {
+               dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n",
+                          (void *)vaddr, npages);
+               return -EFAULT;
+       }
+
+       pagesets = kcalloc(uctxt->expected_count, sizeof(*pagesets),
+                          GFP_KERNEL);
+       if (!pagesets)
+               return -ENOMEM;
+
+       /* Allocate the array of struct page pointers needed for pinning */
+       pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
+       if (!pages) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+
+       /*
+        * Pin all the pages of the user buffer. If we can't pin all the
+        * pages, accept the amount pinned so far and program only that.
+        * User space knows how to deal with partially programmed buffers.
+        */
+       if (!hfi1_can_pin_pages(dd, fd->tid_n_pinned, npages)) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+
+       pinned = hfi1_acquire_user_pages(vaddr, npages, true, pages);
+       if (pinned <= 0) {
+               ret = pinned;
+               goto bail;
+       }
+       fd->tid_n_pinned += npages;
+
+       /* Find sets of physically contiguous pages */
+       npagesets = find_phys_blocks(pages, pinned, pagesets);
+
+       /*
+        * We don't need to access this under a lock since tid_used is per
+        * process and the same process cannot be in hfi1_user_exp_rcv_clear()
+        * and hfi1_user_exp_rcv_setup() at the same time.
+        */
+       spin_lock(&fd->tid_lock);
+       if (fd->tid_used + npagesets > fd->tid_limit)
+               pageset_count = fd->tid_limit - fd->tid_used;
+       else
+               pageset_count = npagesets;
+       spin_unlock(&fd->tid_lock);
+
+       if (!pageset_count)
+               goto bail;
+
+       ngroups = pageset_count / dd->rcv_entries.group_size;
+       tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
+       if (!tidlist) {
+               ret = -ENOMEM;
+               goto nomem;
+       }
+
+       tididx = 0;
+
+       /*
+        * From this point on, we are going to be using shared (between master
+        * and subcontexts) context resources. We need to take the lock.
+        */
+       mutex_lock(&uctxt->exp_lock);
+       /*
+        * The first step is to program the RcvArray entries which are complete
+        * groups.
+        */
+       while (ngroups && uctxt->tid_group_list.count) {
+               struct tid_group *grp =
+                       tid_group_pop(&uctxt->tid_group_list);
+
+               ret = program_rcvarray(fp, vaddr, grp, pagesets,
+                                      pageidx, dd->rcv_entries.group_size,
+                                      pages, tidlist, &tididx, &mapped);
+               /*
+                * If there was a failure to program the RcvArray
+                * entries for the entire group, reset the grp fields
+                * and add the grp back to the free group list.
+                */
+               if (ret <= 0) {
+                       tid_group_add_tail(grp, &uctxt->tid_group_list);
+                       hfi1_cdbg(TID,
+                                 "Failed to program RcvArray group %d", ret);
+                       goto unlock;
+               }
+
+               tid_group_add_tail(grp, &uctxt->tid_full_list);
+               ngroups--;
+               pageidx += ret;
+               mapped_pages += mapped;
+       }
+
+       while (pageidx < pageset_count) {
+               struct tid_group *grp, *ptr;
+               /*
+                * If we don't have any partially used tid groups, check
+                * if we have empty groups. If so, take one from there and
+                * put it in the partially used list.
+                */
+               if (!uctxt->tid_used_list.count || need_group) {
+                       if (!uctxt->tid_group_list.count)
+                               goto unlock;
+
+                       grp = tid_group_pop(&uctxt->tid_group_list);
+                       tid_group_add_tail(grp, &uctxt->tid_used_list);
+                       need_group = 0;
+               }
+               /*
+                * There is an optimization opportunity here - instead of
+                * fitting as many page sets as we can, check for a group
+                * later on in the list that could fit all of them.
+                */
+               list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
+                                        list) {
+                       unsigned use = min_t(unsigned, pageset_count - pageidx,
+                                            grp->size - grp->used);
+
+                       ret = program_rcvarray(fp, vaddr, grp, pagesets,
+                                              pageidx, use, pages, tidlist,
+                                              &tididx, &mapped);
+                       if (ret < 0) {
+                               hfi1_cdbg(TID,
+                                         "Failed to program RcvArray entries %d",
+                                         ret);
+                               ret = -EFAULT;
+                               goto unlock;
+                       } else if (ret > 0) {
+                               if (grp->used == grp->size)
+                                       tid_group_move(grp,
+                                                      &uctxt->tid_used_list,
+                                                      &uctxt->tid_full_list);
+                               pageidx += ret;
+                               mapped_pages += mapped;
+                               need_group = 0;
+                               /* Check if we are done so we break out early */
+                               if (pageidx >= pageset_count)
+                                       break;
+                       } else if (WARN_ON(ret == 0)) {
+                               /*
+                                * If ret is 0, we did not program any entries
+                                * into this group, which can only happen if
+                                * we've screwed up the accounting somewhere.
+                                * Warn and try to continue.
+                                */
+                               need_group = 1;
+                       }
+               }
+       }
+unlock:
+       mutex_unlock(&uctxt->exp_lock);
+nomem:
+       hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
+                 mapped_pages, ret);
+       if (tididx) {
+               spin_lock(&fd->tid_lock);
+               fd->tid_used += tididx;
+               spin_unlock(&fd->tid_lock);
+               tinfo->tidcnt = tididx;
+               tinfo->length = mapped_pages * PAGE_SIZE;
+
+               if (copy_to_user((void __user *)(unsigned long)tinfo->tidlist,
+                                tidlist, sizeof(tidlist[0]) * tididx)) {
+                       /*
+                        * On failure to copy to the user level, we need to undo
+                        * everything done so far so we don't leak resources.
+                        */
+                       tinfo->tidlist = (unsigned long)&tidlist;
+                       hfi1_user_exp_rcv_clear(fp, tinfo);
+                       tinfo->tidlist = 0;
+                       ret = -EFAULT;
+                       goto bail;
+               }
+       }
+
+       /*
+        * If not everything was mapped (due to insufficient RcvArray entries,
+        * for example), unpin all unmapped pages so we can pin them next time.
+        */
+       if (mapped_pages != pinned) {
+               hfi1_release_user_pages(current->mm, &pages[mapped_pages],
+                                       pinned - mapped_pages,
+                                       false);
+               fd->tid_n_pinned -= pinned - mapped_pages;
+       }
+bail:
+       kfree(pagesets);
+       kfree(pages);
+       kfree(tidlist);
+       return ret > 0 ? 0 : ret;
+}
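
To illustrate the two-pass programming above (complete groups first, then partially used groups), here is a standalone sketch of the grouping arithmetic only, assuming an example group size of 8; it is not driver code:

    /* Sketch of the grouping arithmetic in hfi1_user_exp_rcv_setup(); values are examples. */
    #include <stdio.h>

    int main(void)
    {
        unsigned pageset_count = 19; /* example number of page sets to program */
        unsigned group_size = 8;     /* example RcvArray entries per group */
        unsigned ngroups = pageset_count / group_size;
        unsigned pageidx = 0;

        /* Pass 1: whole groups taken from tid_group_list and filled completely. */
        while (ngroups--) {
            printf("full group: pagesets %u..%u\n", pageidx, pageidx + group_size - 1);
            pageidx += group_size;
        }

        /* Pass 2: the remainder goes into a partially used group. */
        if (pageidx < pageset_count)
            printf("partial group: pagesets %u..%u\n", pageidx, pageset_count - 1);
        return 0;
    }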
+
+int hfi1_user_exp_rcv_clear(struct file *fp, struct hfi1_tid_info *tinfo)
+{
+       int ret = 0;
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       u32 *tidinfo;
+       unsigned tididx;
+
+       tidinfo = kcalloc(tinfo->tidcnt, sizeof(*tidinfo), GFP_KERNEL);
+       if (!tidinfo)
+               return -ENOMEM;
+
+       if (copy_from_user(tidinfo, (void __user *)(unsigned long)
+                          tinfo->tidlist, sizeof(tidinfo[0]) *
+                          tinfo->tidcnt)) {
+               ret = -EFAULT;
+               goto done;
+       }
+
+       mutex_lock(&uctxt->exp_lock);
+       for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
+               ret = unprogram_rcvarray(fp, tidinfo[tididx], NULL);
+               if (ret) {
+                       hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
+                                 ret);
+                       break;
+               }
+       }
+       spin_lock(&fd->tid_lock);
+       fd->tid_used -= tididx;
+       spin_unlock(&fd->tid_lock);
+       tinfo->tidcnt = tididx;
+       mutex_unlock(&uctxt->exp_lock);
+done:
+       kfree(tidinfo);
+       return ret;
+}
+
+int hfi1_user_exp_rcv_invalid(struct file *fp, struct hfi1_tid_info *tinfo)
+{
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       unsigned long *ev = uctxt->dd->events +
+               (((uctxt->ctxt - uctxt->dd->first_user_ctxt) *
+                 HFI1_MAX_SHARED_CTXTS) + fd->subctxt);
+       u32 *array;
+       int ret = 0;
+
+       if (!fd->invalid_tids)
+               return -EINVAL;
+
+       /*
+        * copy_to_user() can sleep, which will leave the invalid_lock
+        * locked and cause the MMU notifier to be blocked on the lock
+        * for a long time.
+        * Copy the data to a local buffer so we can release the lock.
+        */
+       array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL);
+       if (!array)
+               return -EFAULT;
+
+       spin_lock(&fd->invalid_lock);
+       if (fd->invalid_tid_idx) {
+               memcpy(array, fd->invalid_tids, sizeof(*array) *
+                      fd->invalid_tid_idx);
+               memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) *
+                      fd->invalid_tid_idx);
+               tinfo->tidcnt = fd->invalid_tid_idx;
+               fd->invalid_tid_idx = 0;
+               /*
+                * Reset the user flag while still holding the lock.
+                * Otherwise, PSM can miss events.
+                */
+               clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
+       } else {
+               tinfo->tidcnt = 0;
+       }
+       spin_unlock(&fd->invalid_lock);
+
+       if (tinfo->tidcnt) {
+               if (copy_to_user((void __user *)tinfo->tidlist,
+                                array, sizeof(*array) * tinfo->tidcnt))
+                       ret = -EFAULT;
+       }
+       kfree(array);
+
+       return ret;
+}
+
+static u32 find_phys_blocks(struct page **pages, unsigned npages,
+                           struct tid_pageset *list)
+{
+       unsigned pagecount, pageidx, setcount = 0, i;
+       unsigned long pfn, this_pfn;
+
+       if (!npages)
+               return 0;
+
+       /*
+        * Look for sets of physically contiguous pages in the user buffer.
+        * This will allow us to optimize Expected RcvArray entry usage by
+        * using the bigger supported sizes.
+        */
+       pfn = page_to_pfn(pages[0]);
+       for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
+               this_pfn = i < npages ? page_to_pfn(pages[i]) : 0;
+
+               /*
+                * If the pfn's are not sequential, pages are not physically
+                * contiguous.
+                */
+               if (this_pfn != ++pfn) {
+                       /*
+                        * At this point we have to loop over the set of
+                        * physically contiguous pages and break them down into
+                        * sizes supported by the HW.
+                        * There are two main constraints:
+                        *     1. The max buffer size is MAX_EXPECTED_BUFFER.
+                        *        If the total set size is bigger than that
+                        *        program only a MAX_EXPECTED_BUFFER chunk.
+                        *     2. The buffer size has to be a power of two. If
+                        *        it is not, round down to the closest power of
+                        *        2 and program that size.
+                        */
+                       while (pagecount) {
+                               int maxpages = pagecount;
+                               u32 bufsize = pagecount * PAGE_SIZE;
+
+                               if (bufsize > MAX_EXPECTED_BUFFER)
+                                       maxpages =
+                                               MAX_EXPECTED_BUFFER >>
+                                               PAGE_SHIFT;
+                               else if (!is_power_of_2(bufsize))
+                                       maxpages =
+                                               rounddown_pow_of_two(bufsize) >>
+                                               PAGE_SHIFT;
+
+                               list[setcount].idx = pageidx;
+                               list[setcount].count = maxpages;
+                               pagecount -= maxpages;
+                               pageidx += maxpages;
+                               setcount++;
+                       }
+                       pageidx = i;
+                       pagecount = 1;
+                       pfn = this_pfn;
+               } else {
+                       pagecount++;
+               }
+       }
+       return setcount;
+}
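
The splitting rules in the loop above can be reproduced outside the driver. The following sketch assumes a 4 KiB page size and an example 1 MiB cap standing in for MAX_EXPECTED_BUFFER (assumed value, not taken from the patch); with those numbers an 11-page contiguous run is emitted as chunks of 8, 2, and 1 pages:

    /* Standalone sketch of the power-of-two splitting in find_phys_blocks(). */
    #include <stdio.h>

    #define SKETCH_PAGE_SHIFT 12
    #define SKETCH_PAGE_SIZE  (1u << SKETCH_PAGE_SHIFT)
    #define SKETCH_MAX_BUFFER (1u << 20)  /* assumed stand-in for MAX_EXPECTED_BUFFER */

    static unsigned rounddown_pow2(unsigned v)
    {
        unsigned r = 1;

        while ((r << 1) <= v)
            r <<= 1;
        return r;
    }

    int main(void)
    {
        unsigned pagecount = 11; /* length of one physically contiguous run */

        while (pagecount) {
            unsigned maxpages = pagecount;
            unsigned bufsize = pagecount * SKETCH_PAGE_SIZE;

            if (bufsize > SKETCH_MAX_BUFFER)
                maxpages = SKETCH_MAX_BUFFER >> SKETCH_PAGE_SHIFT;
            else if (bufsize & (bufsize - 1)) /* not a power of two */
                maxpages = rounddown_pow2(bufsize) >> SKETCH_PAGE_SHIFT;

            printf("chunk of %u pages\n", maxpages); /* 8, 2, 1 for 11 pages */
            pagecount -= maxpages;
        }
        return 0;
    }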
+
+/**
+ * program_rcvarray() - program an RcvArray group with receive buffers
+ * @fp: file pointer
+ * @vaddr: starting user virtual address
+ * @grp: RcvArray group
+ * @sets: array of struct tid_pageset holding information on physically
+ *        contiguous chunks from the user buffer
+ * @start: starting index into sets array
+ * @count: number of struct tid_pageset's to program
+ * @pages: an array of struct page * for the user buffer
+ * @tidlist: the array of u32 elements where the information about the
+ *           programmed RcvArray entries is to be encoded.
+ * @tididx: starting offset into tidlist
+ * @pmapped: (output parameter) number of pages programmed into the RcvArray
+ *           entries.
+ *
+ * This function will program up to 'count' number of RcvArray entries from the
+ * group 'grp'. To make best use of write-combining writes, the function will
+ * perform writes to the unused RcvArray entries which will be ignored by the
+ * HW. Each RcvArray entry will be programmed with a physically contiguous
+ * buffer chunk from the user's virtual buffer.
+ *
+ * Return:
+ * -EINVAL if the requested count is larger than the size of the group,
+ * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or
+ * number of RcvArray entries programmed.
+ */
+static int program_rcvarray(struct file *fp, unsigned long vaddr,
+                           struct tid_group *grp,
+                           struct tid_pageset *sets,
+                           unsigned start, u16 count, struct page **pages,
+                           u32 *tidlist, unsigned *tididx, unsigned *pmapped)
+{
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       struct hfi1_devdata *dd = uctxt->dd;
+       u16 idx;
+       u32 tidinfo = 0, rcventry, useidx = 0;
+       int mapped = 0;
+
+       /* Count should never be larger than the group size */
+       if (count > grp->size)
+               return -EINVAL;
+
+       /* Find the first unused entry in the group */
+       for (idx = 0; idx < grp->size; idx++) {
+               if (!(grp->map & (1 << idx))) {
+                       useidx = idx;
+                       break;
+               }
+               rcv_array_wc_fill(dd, grp->base + idx);
+       }
+
+       idx = 0;
+       while (idx < count) {
+               u16 npages, pageidx, setidx = start + idx;
+               int ret = 0;
+
+               /*
+                * If this entry in the group is used, move to the next one.
+                * If we go past the end of the group, exit the loop.
+                */
+               if (useidx >= grp->size) {
+                       break;
+               } else if (grp->map & (1 << useidx)) {
+                       rcv_array_wc_fill(dd, grp->base + useidx);
+                       useidx++;
+                       continue;
+               }
+
+               rcventry = grp->base + useidx;
+               npages = sets[setidx].count;
+               pageidx = sets[setidx].idx;
+
+               ret = set_rcvarray_entry(fp, vaddr + (pageidx * PAGE_SIZE),
+                                        rcventry, grp, pages + pageidx,
+                                        npages);
+               if (ret)
+                       return ret;
+               mapped += npages;
+
+               tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) |
+                       EXP_TID_SET(LEN, npages);
+               tidlist[(*tididx)++] = tidinfo;
+               grp->used++;
+               grp->map |= 1 << useidx++;
+               idx++;
+       }
+
+       /* Fill the rest of the group with "blank" writes */
+       for (; useidx < grp->size; useidx++)
+               rcv_array_wc_fill(dd, grp->base + useidx);
+       *pmapped = mapped;
+       return idx;
+}
+
+static int set_rcvarray_entry(struct file *fp, unsigned long vaddr,
+                             u32 rcventry, struct tid_group *grp,
+                             struct page **pages, unsigned npages)
+{
+       int ret;
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       struct tid_rb_node *node;
+       struct hfi1_devdata *dd = uctxt->dd;
+       struct rb_root *root = &fd->tid_rb_root;
+       dma_addr_t phys;
+
+       /*
+        * Allocate the node first so we can handle a potential
+        * failure before we've programmed anything.
+        */
+       node = kzalloc(sizeof(*node) + (sizeof(struct page *) * npages),
+                      GFP_KERNEL);
+       if (!node)
+               return -ENOMEM;
+
+       phys = pci_map_single(dd->pcidev,
+                             __va(page_to_phys(pages[0])),
+                             npages * PAGE_SIZE, PCI_DMA_FROMDEVICE);
+       if (dma_mapping_error(&dd->pcidev->dev, phys)) {
+               dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n",
+                          phys);
+               kfree(node);
+               return -EFAULT;
+       }
+
+       node->mmu.addr = vaddr;
+       node->mmu.len = npages * PAGE_SIZE;
+       node->phys = page_to_phys(pages[0]);
+       node->npages = npages;
+       node->rcventry = rcventry;
+       node->dma_addr = phys;
+       node->grp = grp;
+       node->freed = false;
+       memcpy(node->pages, pages, sizeof(struct page *) * npages);
+
+       if (HFI1_CAP_IS_USET(TID_UNMAP))
+               ret = mmu_rb_insert(root, &node->mmu);
+       else
+               ret = hfi1_mmu_rb_insert(root, &node->mmu);
+
+       if (ret) {
+               hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
+                         node->rcventry, node->mmu.addr, node->phys, ret);
+               pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE,
+                                PCI_DMA_FROMDEVICE);
+               kfree(node);
+               return -EFAULT;
+       }
+       hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
+       trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages,
+                              node->mmu.addr, node->phys, phys);
+       return 0;
+}
+
+static int unprogram_rcvarray(struct file *fp, u32 tidinfo,
+                             struct tid_group **grp)
+{
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       struct hfi1_devdata *dd = uctxt->dd;
+       struct tid_rb_node *node;
+       u8 tidctrl = EXP_TID_GET(tidinfo, CTRL);
+       u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;
+
+       if (tididx >= uctxt->expected_count) {
+               dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
+                          tididx, uctxt->ctxt);
+               return -EINVAL;
+       }
+
+       if (tidctrl == 0x3)
+               return -EINVAL;
+
+       rcventry = tididx + (tidctrl - 1);
+
+       node = fd->entry_to_rb[rcventry];
+       if (!node || node->rcventry != (uctxt->expected_base + rcventry))
+               return -EBADF;
+       if (HFI1_CAP_IS_USET(TID_UNMAP))
+               mmu_rb_remove(&fd->tid_rb_root, &node->mmu, NULL);
+       else
+               hfi1_mmu_rb_remove(&fd->tid_rb_root, &node->mmu);
+
+       if (grp)
+               *grp = node->grp;
+       clear_tid_node(fd, fd->subctxt, node);
+       return 0;
+}
+
+static void clear_tid_node(struct hfi1_filedata *fd, u16 subctxt,
+                          struct tid_rb_node *node)
+{
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       struct hfi1_devdata *dd = uctxt->dd;
+
+       trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
+                                node->npages, node->mmu.addr, node->phys,
+                                node->dma_addr);
+
+       hfi1_put_tid(dd, node->rcventry, PT_INVALID, 0, 0);
+       /*
+        * Make sure device has seen the write before we unpin the
+        * pages.
+        */
+       flush_wc();
+
+       pci_unmap_single(dd->pcidev, node->dma_addr, node->mmu.len,
+                        PCI_DMA_FROMDEVICE);
+       hfi1_release_user_pages(current->mm, node->pages, node->npages, true);
+       fd->tid_n_pinned -= node->npages;
+
+       node->grp->used--;
+       node->grp->map &= ~(1 << (node->rcventry - node->grp->base));
+
+       if (node->grp->used == node->grp->size - 1)
+               tid_group_move(node->grp, &uctxt->tid_full_list,
+                              &uctxt->tid_used_list);
+       else if (!node->grp->used)
+               tid_group_move(node->grp, &uctxt->tid_used_list,
+                              &uctxt->tid_group_list);
+       kfree(node);
+}
+
+static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
+                           struct exp_tid_set *set, struct rb_root *root)
+{
+       struct tid_group *grp, *ptr;
+       struct hfi1_filedata *fd = container_of(root, struct hfi1_filedata,
+                                               tid_rb_root);
+       int i;
+
+       list_for_each_entry_safe(grp, ptr, &set->list, list) {
+               list_del_init(&grp->list);
+
+               for (i = 0; i < grp->size; i++) {
+                       if (grp->map & (1 << i)) {
+                               u16 rcventry = grp->base + i;
+                               struct tid_rb_node *node;
+
+                               node = fd->entry_to_rb[rcventry -
+                                                         uctxt->expected_base];
+                               if (!node || node->rcventry != rcventry)
+                                       continue;
+                               if (HFI1_CAP_IS_USET(TID_UNMAP))
+                                       mmu_rb_remove(&fd->tid_rb_root,
+                                                     &node->mmu, NULL);
+                               else
+                                       hfi1_mmu_rb_remove(&fd->tid_rb_root,
+                                                          &node->mmu);
+                               clear_tid_node(fd, -1, node);
+                       }
+               }
+       }
+}
+
+static int mmu_rb_invalidate(struct rb_root *root, struct mmu_rb_node *mnode)
+{
+       struct hfi1_filedata *fdata =
+               container_of(root, struct hfi1_filedata, tid_rb_root);
+       struct hfi1_ctxtdata *uctxt = fdata->uctxt;
+       struct tid_rb_node *node =
+               container_of(mnode, struct tid_rb_node, mmu);
+
+       if (node->freed)
+               return 0;
+
+       trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt, node->mmu.addr,
+                                node->rcventry, node->npages, node->dma_addr);
+       node->freed = true;
+
+       spin_lock(&fdata->invalid_lock);
+       if (fdata->invalid_tid_idx < uctxt->expected_count) {
+               fdata->invalid_tids[fdata->invalid_tid_idx] =
+                       rcventry2tidinfo(node->rcventry - uctxt->expected_base);
+               fdata->invalid_tids[fdata->invalid_tid_idx] |=
+                       EXP_TID_SET(LEN, node->npages);
+               if (!fdata->invalid_tid_idx) {
+                       unsigned long *ev;
+
+                       /*
+                        * hfi1_set_uevent_bits() sets a user event flag
+                        * for all processes. Because calling into the
+                        * driver to process TID cache invalidations is
+                        * expensive and TID cache invalidations are
+                        * handled on a per-process basis, we can
+                        * optimize this to set the flag only for the
+                        * process in question.
+                        */
+                       ev = uctxt->dd->events +
+                               (((uctxt->ctxt - uctxt->dd->first_user_ctxt) *
+                                 HFI1_MAX_SHARED_CTXTS) + fdata->subctxt);
+                       set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
+               }
+               fdata->invalid_tid_idx++;
+       }
+       spin_unlock(&fdata->invalid_lock);
+       return 0;
+}
+
+static int mmu_rb_insert(struct rb_root *root, struct mmu_rb_node *node)
+{
+       struct hfi1_filedata *fdata =
+               container_of(root, struct hfi1_filedata, tid_rb_root);
+       struct tid_rb_node *tnode =
+               container_of(node, struct tid_rb_node, mmu);
+       u32 base = fdata->uctxt->expected_base;
+
+       fdata->entry_to_rb[tnode->rcventry - base] = tnode;
+       return 0;
+}
+
+static void mmu_rb_remove(struct rb_root *root, struct mmu_rb_node *node,
+                         struct mm_struct *mm)
+{
+       struct hfi1_filedata *fdata =
+               container_of(root, struct hfi1_filedata, tid_rb_root);
+       struct tid_rb_node *tnode =
+               container_of(node, struct tid_rb_node, mmu);
+       u32 base = fdata->uctxt->expected_base;
+
+       fdata->entry_to_rb[tnode->rcventry - base] = NULL;
+}
diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.h b/drivers/infiniband/hw/hfi1/user_exp_rcv.h
new file mode 100644 (file)
index 0000000..9bc8d9f
--- /dev/null
@@ -0,0 +1,79 @@
+#ifndef _HFI1_USER_EXP_RCV_H
+#define _HFI1_USER_EXP_RCV_H
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "hfi.h"
+
+#define EXP_TID_TIDLEN_MASK   0x7FFULL
+#define EXP_TID_TIDLEN_SHIFT  0
+#define EXP_TID_TIDCTRL_MASK  0x3ULL
+#define EXP_TID_TIDCTRL_SHIFT 20
+#define EXP_TID_TIDIDX_MASK   0x3FFULL
+#define EXP_TID_TIDIDX_SHIFT  22
+#define EXP_TID_GET(tid, field)        \
+       (((tid) >> EXP_TID_TID##field##_SHIFT) & EXP_TID_TID##field##_MASK)
+
+#define EXP_TID_SET(field, value)                      \
+       (((value) & EXP_TID_TID##field##_MASK) <<       \
+        EXP_TID_TID##field##_SHIFT)
+#define EXP_TID_CLEAR(tid, field) ({                                   \
+               (tid) &= ~(EXP_TID_TID##field##_MASK <<                 \
+                          EXP_TID_TID##field##_SHIFT);                 \
+               })
+#define EXP_TID_RESET(tid, field, value) do {                          \
+               EXP_TID_CLEAR(tid, field);                              \
+               (tid) |= EXP_TID_SET(field, (value));                   \
+       } while (0)
+
+int hfi1_user_exp_rcv_init(struct file *);
+int hfi1_user_exp_rcv_free(struct hfi1_filedata *);
+int hfi1_user_exp_rcv_setup(struct file *, struct hfi1_tid_info *);
+int hfi1_user_exp_rcv_clear(struct file *, struct hfi1_tid_info *);
+int hfi1_user_exp_rcv_invalid(struct file *, struct hfi1_tid_info *);
+
+#endif /* _HFI1_USER_EXP_RCV_H */
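
The mask/shift constants above pack a TID's length, control bits, and index into a single u32. A minimal host-side sketch of the encode/decode roundtrip, using the same field layout (the example values are made up):

    /* Sketch of the EXP_TID_* packing defined in user_exp_rcv.h. */
    #include <stdio.h>
    #include <stdint.h>

    #define TIDLEN_MASK   0x7FFu
    #define TIDLEN_SHIFT  0
    #define TIDCTRL_MASK  0x3u
    #define TIDCTRL_SHIFT 20
    #define TIDIDX_MASK   0x3FFu
    #define TIDIDX_SHIFT  22

    int main(void)
    {
        /* Example values: entry index 5, control 2, length 16 pages. */
        uint32_t tid = ((5u & TIDIDX_MASK) << TIDIDX_SHIFT) |
                       ((2u & TIDCTRL_MASK) << TIDCTRL_SHIFT) |
                       ((16u & TIDLEN_MASK) << TIDLEN_SHIFT);

        printf("idx=%u ctrl=%u len=%u\n",
               (tid >> TIDIDX_SHIFT) & TIDIDX_MASK,
               (tid >> TIDCTRL_SHIFT) & TIDCTRL_MASK,
               (tid >> TIDLEN_SHIFT) & TIDLEN_MASK);
        return 0;
    }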
diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c
new file mode 100644 (file)
index 0000000..88e10b5
--- /dev/null
@@ -0,0 +1,135 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/device.h>
+#include <linux/module.h>
+
+#include "hfi.h"
+
+static unsigned long cache_size = 256;
+module_param(cache_size, ulong, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(cache_size, "Send and receive side cache size limit (in MB)");
+
+/*
+ * Determine whether the caller can pin pages.
+ *
+ * This function should be used in the implementation of buffer caches.
+ * The cache implementation should call this function prior to attempting
+ * to pin buffer pages in order to determine whether they should do so.
+ * The function computes cache limits based on the configured ulimit and
+ * cache size. Use of this function is especially important for caches
+ * which are not limited in any other way (e.g. by HW resources) and, thus,
+ * could keep caching buffers.
+ *
+ */
+bool hfi1_can_pin_pages(struct hfi1_devdata *dd, u32 nlocked, u32 npages)
+{
+       unsigned long ulimit = rlimit(RLIMIT_MEMLOCK), pinned, cache_limit,
+               size = (cache_size * (1UL << 20)); /* convert to bytes */
+       unsigned usr_ctxts = dd->num_rcv_contexts - dd->first_user_ctxt;
+       bool can_lock = capable(CAP_IPC_LOCK);
+
+       /*
+        * Calculate per-cache size. The calculation below uses only a quarter
+        * of the available per-context limit. This leaves space for other
+        * pinning. Should we worry about shared ctxts?
+        */
+       cache_limit = (ulimit / usr_ctxts) / 4;
+
+       /* If ulimit isn't set to "unlimited" and is smaller than cache_size. */
+       if (ulimit != (-1UL) && size > cache_limit)
+               size = cache_limit;
+
+       /* Convert to number of pages */
+       size = DIV_ROUND_UP(size, PAGE_SIZE);
+
+       down_read(&current->mm->mmap_sem);
+       pinned = current->mm->pinned_vm;
+       up_read(&current->mm->mmap_sem);
+
+       /* First, check the absolute limit against all pinned pages. */
+       if (pinned + npages >= ulimit && !can_lock)
+               return false;
+
+       return ((nlocked + npages) <= size) || can_lock;
+}
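
As a worked example of the limit calculation above (the numbers are assumptions, apart from the default cache_size of 256 MB): with RLIMIT_MEMLOCK at 64 MiB and 16 user contexts, cache_limit is (64 MiB / 16) / 4 = 1 MiB, which is smaller than the 256 MiB cache_size, so the per-cache cap becomes 1 MiB, i.e. 256 pages of 4 KiB. A standalone sketch of just that arithmetic:

    /* Sketch of the per-cache page cap computed in hfi1_can_pin_pages(); example numbers. */
    #include <stdio.h>

    int main(void)
    {
        unsigned long page_size = 4096;
        unsigned long ulimit = 64ul << 20;   /* assumed RLIMIT_MEMLOCK: 64 MiB */
        unsigned long usr_ctxts = 16;        /* assumed number of user contexts */
        unsigned long size = 256ul << 20;    /* default cache_size (256 MB) in bytes */
        unsigned long cache_limit = (ulimit / usr_ctxts) / 4;

        if (ulimit != (unsigned long)-1 && size > cache_limit)
            size = cache_limit;

        printf("per-cache cap: %lu pages\n", (size + page_size - 1) / page_size); /* 256 */
        return 0;
    }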
+
+int hfi1_acquire_user_pages(unsigned long vaddr, size_t npages, bool writable,
+                           struct page **pages)
+{
+       int ret;
+
+       ret = get_user_pages_fast(vaddr, npages, writable, pages);
+       if (ret < 0)
+               return ret;
+
+       down_write(&current->mm->mmap_sem);
+       current->mm->pinned_vm += ret;
+       up_write(&current->mm->mmap_sem);
+
+       return ret;
+}
+
+void hfi1_release_user_pages(struct mm_struct *mm, struct page **p,
+                            size_t npages, bool dirty)
+{
+       size_t i;
+
+       for (i = 0; i < npages; i++) {
+               if (dirty)
+                       set_page_dirty_lock(p[i]);
+               put_page(p[i]);
+       }
+
+       if (mm) { /* during close after signal, mm can be NULL */
+               down_write(&mm->mmap_sem);
+               mm->pinned_vm -= npages;
+               up_write(&mm->mmap_sem);
+       }
+}
diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c
new file mode 100644 (file)
index 0000000..29f4795
--- /dev/null
@@ -0,0 +1,1625 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/dmapool.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <linux/uio.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/mmu_context.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+
+#include "hfi.h"
+#include "sdma.h"
+#include "user_sdma.h"
+#include "verbs.h"  /* for the headers */
+#include "common.h" /* for struct hfi1_tid_info */
+#include "trace.h"
+#include "mmu_rb.h"
+
+static uint hfi1_sdma_comp_ring_size = 128;
+module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
+MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");
+
+/* The maximum number of Data io vectors per message/request */
+#define MAX_VECTORS_PER_REQ 8
+/*
+ * Maximum number of packets to send from each message/request
+ * before moving to the next one.
+ */
+#define MAX_PKTS_PER_QUEUE 16
+
+#define num_pages(x) (1 + ((((x) - 1) & PAGE_MASK) >> PAGE_SHIFT))
+
+#define req_opcode(x) \
+       (((x) >> HFI1_SDMA_REQ_OPCODE_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK)
+#define req_version(x) \
+       (((x) >> HFI1_SDMA_REQ_VERSION_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK)
+#define req_iovcnt(x) \
+       (((x) >> HFI1_SDMA_REQ_IOVCNT_SHIFT) & HFI1_SDMA_REQ_IOVCNT_MASK)
+
+/* Number of BTH.PSN bits used for sequence number in expected rcvs */
+#define BTH_SEQ_MASK 0x7ffull
+
+/*
+ * Define fields in the KDETH header so we can update the header
+ * template.
+ */
+#define KDETH_OFFSET_SHIFT        0
+#define KDETH_OFFSET_MASK         0x7fff
+#define KDETH_OM_SHIFT            15
+#define KDETH_OM_MASK             0x1
+#define KDETH_TID_SHIFT           16
+#define KDETH_TID_MASK            0x3ff
+#define KDETH_TIDCTRL_SHIFT       26
+#define KDETH_TIDCTRL_MASK        0x3
+#define KDETH_INTR_SHIFT          28
+#define KDETH_INTR_MASK           0x1
+#define KDETH_SH_SHIFT            29
+#define KDETH_SH_MASK             0x1
+#define KDETH_HCRC_UPPER_SHIFT    16
+#define KDETH_HCRC_UPPER_MASK     0xff
+#define KDETH_HCRC_LOWER_SHIFT    24
+#define KDETH_HCRC_LOWER_MASK     0xff
+
+#define PBC2LRH(x) ((((x) & 0xfff) << 2) - 4)
+#define LRH2PBC(x) ((((x) >> 2) + 1) & 0xfff)
+
+#define KDETH_GET(val, field)                                          \
+       (((le32_to_cpu((val))) >> KDETH_##field##_SHIFT) & KDETH_##field##_MASK)
+#define KDETH_SET(dw, field, val) do {                                 \
+               u32 dwval = le32_to_cpu(dw);                            \
+               dwval &= ~(KDETH_##field##_MASK << KDETH_##field##_SHIFT); \
+               dwval |= (((val) & KDETH_##field##_MASK) << \
+                         KDETH_##field##_SHIFT);                       \
+               dw = cpu_to_le32(dwval);                                \
+       } while (0)
+
+#define AHG_HEADER_SET(arr, idx, dw, bit, width, value)                        \
+       do {                                                            \
+               if ((idx) < ARRAY_SIZE((arr)))                          \
+                       (arr)[(idx++)] = sdma_build_ahg_descriptor(     \
+                               (__force u16)(value), (dw), (bit),      \
+                                                       (width));       \
+               else                                                    \
+                       return -ERANGE;                                 \
+       } while (0)
+
+/* KDETH OM multipliers and switch over point */
+#define KDETH_OM_SMALL     4
+#define KDETH_OM_LARGE     64
+#define KDETH_OM_MAX_SIZE  (1 << ((KDETH_OM_LARGE / KDETH_OM_SMALL) + 1))
+
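
The KDETH accessors above read-modify-write fields in little-endian header dwords. A minimal host-side sketch of the same get/set pattern, assuming a little-endian host so the byte-order conversions collapse to no-ops (field names and values are illustrative only):

    /* Sketch of the KDETH field get/set pattern; assumes a little-endian host. */
    #include <stdio.h>
    #include <stdint.h>

    #define OFFSET_SHIFT 0
    #define OFFSET_MASK  0x7fffu
    #define OM_SHIFT     15
    #define OM_MASK      0x1u

    static uint32_t kdeth_set(uint32_t dw, uint32_t mask, uint32_t shift, uint32_t val)
    {
        dw &= ~(mask << shift);          /* clear the field */
        dw |= (val & mask) << shift;     /* insert the new value */
        return dw;
    }

    int main(void)
    {
        uint32_t dw = 0;

        dw = kdeth_set(dw, OFFSET_MASK, OFFSET_SHIFT, 0x123); /* KDETH.Offset */
        dw = kdeth_set(dw, OM_MASK, OM_SHIFT, 1);             /* KDETH.OM */
        printf("offset=0x%x om=%u\n",
               (dw >> OFFSET_SHIFT) & OFFSET_MASK, (dw >> OM_SHIFT) & OM_MASK);
        return 0;
    }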
+/* Last packet in the request */
+#define TXREQ_FLAGS_REQ_LAST_PKT BIT(0)
+
+#define SDMA_REQ_IN_USE     0
+#define SDMA_REQ_FOR_THREAD 1
+#define SDMA_REQ_SEND_DONE  2
+#define SDMA_REQ_HAVE_AHG   3
+#define SDMA_REQ_HAS_ERROR  4
+#define SDMA_REQ_DONE_ERROR 5
+
+#define SDMA_PKT_Q_INACTIVE BIT(0)
+#define SDMA_PKT_Q_ACTIVE   BIT(1)
+#define SDMA_PKT_Q_DEFERRED BIT(2)
+
+/*
+ * Maximum retry attempts to submit a TX request
+ * before putting the process to sleep.
+ */
+#define MAX_DEFER_RETRY_COUNT 1
+
+static unsigned initial_pkt_count = 8;
+
+#define SDMA_IOWAIT_TIMEOUT 1000 /* in milliseconds */
+
+struct sdma_mmu_node;
+
+struct user_sdma_iovec {
+       struct list_head list;
+       struct iovec iov;
+       /* number of pages in this vector */
+       unsigned npages;
+       /* array of pinned pages for this vector */
+       struct page **pages;
+       /*
+        * offset into the virtual address space of the vector at
+        * which we last left off.
+        */
+       u64 offset;
+       struct sdma_mmu_node *node;
+};
+
+#define SDMA_CACHE_NODE_EVICT BIT(0)
+
+struct sdma_mmu_node {
+       struct mmu_rb_node rb;
+       struct list_head list;
+       struct hfi1_user_sdma_pkt_q *pq;
+       atomic_t refcount;
+       struct page **pages;
+       unsigned npages;
+       unsigned long flags;
+};
+
+struct user_sdma_request {
+       struct sdma_req_info info;
+       struct hfi1_user_sdma_pkt_q *pq;
+       struct hfi1_user_sdma_comp_q *cq;
+       /* This is the original header from user space */
+       struct hfi1_pkt_header hdr;
+       /*
+        * Pointer to the SDMA engine for this request.
+        * Since different requests could be on different VLs,
+        * each request will need its own engine pointer.
+        */
+       struct sdma_engine *sde;
+       u8 ahg_idx;
+       u32 ahg[9];
+       /*
+        * KDETH.Offset (Eager) field
+        * We need to remember the initial value so the headers
+        * can be updated properly.
+        */
+       u32 koffset;
+       /*
+        * KDETH.OFFSET (TID) field
+        * The offset can cover multiple packets, depending on the
+        * size of the TID entry.
+        */
+       u32 tidoffset;
+       /*
+        * KDETH.OM
+        * Remember this because the header template always sets it
+        * to 0.
+        */
+       u8 omfactor;
+       /*
+        * We copy the iovs for this request (based on
+        * info.iovcnt). These are only the data vectors
+        */
+       unsigned data_iovs;
+       /* total length of the data in the request */
+       u32 data_len;
+       /* progress index moving along the iovs array */
+       unsigned iov_idx;
+       struct user_sdma_iovec iovs[MAX_VECTORS_PER_REQ];
+       /* number of elements copied to the tids array */
+       u16 n_tids;
+       /* TID array values copied from the tid_iov vector */
+       u32 *tids;
+       u16 tididx;
+       u32 sent;
+       u64 seqnum;
+       u64 seqcomp;
+       u64 seqsubmitted;
+       struct list_head txps;
+       unsigned long flags;
+       /* status of the last txreq completed */
+       int status;
+};
+
+/*
+ * A single txreq could span up to 3 physical pages when the MTU
+ * is sufficiently large (> 4K). Each of the IOV pointers also
+ * needs its own set of flags so the vectors can be handled
+ * independently of each other.
+ */
+struct user_sdma_txreq {
+       /* Packet header for the txreq */
+       struct hfi1_pkt_header hdr;
+       struct sdma_txreq txreq;
+       struct list_head list;
+       struct user_sdma_request *req;
+       u16 flags;
+       unsigned busycount;
+       u64 seqnum;
+};
+
+#define SDMA_DBG(req, fmt, ...)                                     \
+       hfi1_cdbg(SDMA, "[%u:%u:%u:%u] " fmt, (req)->pq->dd->unit, \
+                (req)->pq->ctxt, (req)->pq->subctxt, (req)->info.comp_idx, \
+                ##__VA_ARGS__)
+#define SDMA_Q_DBG(pq, fmt, ...)                        \
+       hfi1_cdbg(SDMA, "[%u:%u:%u] " fmt, (pq)->dd->unit, (pq)->ctxt, \
+                (pq)->subctxt, ##__VA_ARGS__)
+
+static int user_sdma_send_pkts(struct user_sdma_request *, unsigned);
+static int num_user_pages(const struct iovec *);
+static void user_sdma_txreq_cb(struct sdma_txreq *, int);
+static inline void pq_update(struct hfi1_user_sdma_pkt_q *);
+static void user_sdma_free_request(struct user_sdma_request *, bool);
+static int pin_vector_pages(struct user_sdma_request *,
+                           struct user_sdma_iovec *);
+static void unpin_vector_pages(struct mm_struct *, struct page **, unsigned,
+                              unsigned);
+static int check_header_template(struct user_sdma_request *,
+                                struct hfi1_pkt_header *, u32, u32);
+static int set_txreq_header(struct user_sdma_request *,
+                           struct user_sdma_txreq *, u32);
+static int set_txreq_header_ahg(struct user_sdma_request *,
+                               struct user_sdma_txreq *, u32);
+static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *,
+                                 struct hfi1_user_sdma_comp_q *,
+                                 u16, enum hfi1_sdma_comp_state, int);
+static inline u32 set_pkt_bth_psn(__be32, u8, u32);
+static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);
+
+static int defer_packet_queue(
+       struct sdma_engine *,
+       struct iowait *,
+       struct sdma_txreq *,
+       unsigned seq);
+static void activate_packet_queue(struct iowait *, int);
+static bool sdma_rb_filter(struct mmu_rb_node *, unsigned long, unsigned long);
+static int sdma_rb_insert(struct rb_root *, struct mmu_rb_node *);
+static void sdma_rb_remove(struct rb_root *, struct mmu_rb_node *,
+                          struct mm_struct *);
+static int sdma_rb_invalidate(struct rb_root *, struct mmu_rb_node *);
+
+static struct mmu_rb_ops sdma_rb_ops = {
+       .filter = sdma_rb_filter,
+       .insert = sdma_rb_insert,
+       .remove = sdma_rb_remove,
+       .invalidate = sdma_rb_invalidate
+};
+
+static int defer_packet_queue(
+       struct sdma_engine *sde,
+       struct iowait *wait,
+       struct sdma_txreq *txreq,
+       unsigned seq)
+{
+       struct hfi1_user_sdma_pkt_q *pq =
+               container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
+       struct hfi1_ibdev *dev = &pq->dd->verbs_dev;
+       struct user_sdma_txreq *tx =
+               container_of(txreq, struct user_sdma_txreq, txreq);
+
+       if (sdma_progress(sde, seq, txreq)) {
+               if (tx->busycount++ < MAX_DEFER_RETRY_COUNT)
+                       goto eagain;
+       }
+       /*
+        * We are assuming that if the list is enqueued somewhere, it
+        * is to the dmawait list since that is the only place where
+        * it is supposed to be enqueued.
+        */
+       xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
+       write_seqlock(&dev->iowait_lock);
+       if (list_empty(&pq->busy.list))
+               list_add_tail(&pq->busy.list, &sde->dmawait);
+       write_sequnlock(&dev->iowait_lock);
+       return -EBUSY;
+eagain:
+       return -EAGAIN;
+}
+
+static void activate_packet_queue(struct iowait *wait, int reason)
+{
+       struct hfi1_user_sdma_pkt_q *pq =
+               container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
+       xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
+       wake_up(&wait->wait_dma);
+};
+
+static void sdma_kmem_cache_ctor(void *obj)
+{
+       struct user_sdma_txreq *tx = obj;
+
+       memset(tx, 0, sizeof(*tx));
+}
+
+int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp)
+{
+       struct hfi1_filedata *fd;
+       int ret = 0;
+       unsigned memsize;
+       char buf[64];
+       struct hfi1_devdata *dd;
+       struct hfi1_user_sdma_comp_q *cq;
+       struct hfi1_user_sdma_pkt_q *pq;
+       unsigned long flags;
+
+       if (!uctxt || !fp) {
+               ret = -EBADF;
+               goto done;
+       }
+
+       fd = fp->private_data;
+
+       if (!hfi1_sdma_comp_ring_size) {
+               ret = -EINVAL;
+               goto done;
+       }
+
+       dd = uctxt->dd;
+
+       pq = kzalloc(sizeof(*pq), GFP_KERNEL);
+       if (!pq)
+               goto pq_nomem;
+
+       memsize = sizeof(*pq->reqs) * hfi1_sdma_comp_ring_size;
+       pq->reqs = kzalloc(memsize, GFP_KERNEL);
+       if (!pq->reqs)
+               goto pq_reqs_nomem;
+
+       INIT_LIST_HEAD(&pq->list);
+       pq->dd = dd;
+       pq->ctxt = uctxt->ctxt;
+       pq->subctxt = fd->subctxt;
+       pq->n_max_reqs = hfi1_sdma_comp_ring_size;
+       pq->state = SDMA_PKT_Q_INACTIVE;
+       atomic_set(&pq->n_reqs, 0);
+       init_waitqueue_head(&pq->wait);
+       pq->sdma_rb_root = RB_ROOT;
+       INIT_LIST_HEAD(&pq->evict);
+       spin_lock_init(&pq->evict_lock);
+
+       iowait_init(&pq->busy, 0, NULL, defer_packet_queue,
+                   activate_packet_queue, NULL);
+       pq->reqidx = 0;
+       snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
+                fd->subctxt);
+       pq->txreq_cache = kmem_cache_create(buf,
+                              sizeof(struct user_sdma_txreq),
+                                           L1_CACHE_BYTES,
+                                           SLAB_HWCACHE_ALIGN,
+                                           sdma_kmem_cache_ctor);
+       if (!pq->txreq_cache) {
+               dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
+                          uctxt->ctxt);
+               goto pq_txreq_nomem;
+       }
+       fd->pq = pq;
+       cq = kzalloc(sizeof(*cq), GFP_KERNEL);
+       if (!cq)
+               goto cq_nomem;
+
+       memsize = PAGE_ALIGN(sizeof(*cq->comps) * hfi1_sdma_comp_ring_size);
+       cq->comps = vmalloc_user(memsize);
+       if (!cq->comps)
+               goto cq_comps_nomem;
+
+       cq->nentries = hfi1_sdma_comp_ring_size;
+       fd->cq = cq;
+
+       ret = hfi1_mmu_rb_register(&pq->sdma_rb_root, &sdma_rb_ops);
+       if (ret) {
+               dd_dev_err(dd, "Failed to register with MMU %d", ret);
+               goto done;
+       }
+
+       spin_lock_irqsave(&uctxt->sdma_qlock, flags);
+       list_add(&pq->list, &uctxt->sdma_queues);
+       spin_unlock_irqrestore(&uctxt->sdma_qlock, flags);
+       goto done;
+
+cq_comps_nomem:
+       kfree(cq);
+cq_nomem:
+       kmem_cache_destroy(pq->txreq_cache);
+pq_txreq_nomem:
+       kfree(pq->reqs);
+pq_reqs_nomem:
+       kfree(pq);
+       fd->pq = NULL;
+pq_nomem:
+       ret = -ENOMEM;
+done:
+       return ret;
+}
+
+int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd)
+{
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       struct hfi1_user_sdma_pkt_q *pq;
+       unsigned long flags;
+
+       hfi1_cdbg(SDMA, "[%u:%u:%u] Freeing user SDMA queues", uctxt->dd->unit,
+                 uctxt->ctxt, fd->subctxt);
+       pq = fd->pq;
+       hfi1_mmu_rb_unregister(&pq->sdma_rb_root);
+       if (pq) {
+               spin_lock_irqsave(&uctxt->sdma_qlock, flags);
+               if (!list_empty(&pq->list))
+                       list_del_init(&pq->list);
+               spin_unlock_irqrestore(&uctxt->sdma_qlock, flags);
+               iowait_sdma_drain(&pq->busy);
+               /* Wait until all requests have been freed. */
+               wait_event_interruptible(
+                       pq->wait,
+                       (ACCESS_ONCE(pq->state) == SDMA_PKT_Q_INACTIVE));
+               kfree(pq->reqs);
+               kmem_cache_destroy(pq->txreq_cache);
+               kfree(pq);
+               fd->pq = NULL;
+       }
+       if (fd->cq) {
+               vfree(fd->cq->comps);
+               kfree(fd->cq);
+               fd->cq = NULL;
+       }
+       return 0;
+}
+
+int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
+                                  unsigned long dim, unsigned long *count)
+{
+       int ret = 0, i = 0;
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       struct hfi1_user_sdma_pkt_q *pq = fd->pq;
+       struct hfi1_user_sdma_comp_q *cq = fd->cq;
+       struct hfi1_devdata *dd = pq->dd;
+       unsigned long idx = 0;
+       u8 pcount = initial_pkt_count;
+       struct sdma_req_info info;
+       struct user_sdma_request *req;
+       u8 opcode, sc, vl;
+       int req_queued = 0;
+
+       if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
+               hfi1_cdbg(
+                  SDMA,
+                  "[%u:%u:%u] First vector not big enough for header %lu/%lu",
+                  dd->unit, uctxt->ctxt, fd->subctxt,
+                  iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
+               return -EINVAL;
+       }
+       ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
+       if (ret) {
+               hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
+                         dd->unit, uctxt->ctxt, fd->subctxt, ret);
+               return -EFAULT;
+       }
+
+       trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
+                                    (u16 *)&info);
+       if (cq->comps[info.comp_idx].status == QUEUED ||
+           test_bit(SDMA_REQ_IN_USE, &pq->reqs[info.comp_idx].flags)) {
+               hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in QUEUED state",
+                         dd->unit, uctxt->ctxt, fd->subctxt,
+                         info.comp_idx);
+               return -EBADSLT;
+       }
+       if (!info.fragsize) {
+               hfi1_cdbg(SDMA,
+                         "[%u:%u:%u:%u] Request does not specify fragsize",
+                         dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
+               return -EINVAL;
+       }
+       /*
+        * We've done all the safety checks that we can up to this point,
+        * so "allocate" the request entry.
+        */
+       hfi1_cdbg(SDMA, "[%u:%u:%u] Using req/comp entry %u\n", dd->unit,
+                 uctxt->ctxt, fd->subctxt, info.comp_idx);
+       req = pq->reqs + info.comp_idx;
+       memset(req, 0, sizeof(*req));
+       /* Mark the request as IN_USE before we start filling it in. */
+       set_bit(SDMA_REQ_IN_USE, &req->flags);
+       req->data_iovs = req_iovcnt(info.ctrl) - 1;
+       req->pq = pq;
+       req->cq = cq;
+       req->status = -1;
+       INIT_LIST_HEAD(&req->txps);
+
+       memcpy(&req->info, &info, sizeof(info));
+
+       if (req_opcode(info.ctrl) == EXPECTED)
+               req->data_iovs--;
+
+       if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
+               SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
+                        MAX_VECTORS_PER_REQ);
+               return -EINVAL;
+       }
+       /* Copy the header from the user buffer */
+       ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
+                            sizeof(req->hdr));
+       if (ret) {
+               SDMA_DBG(req, "Failed to copy header template (%d)", ret);
+               ret = -EFAULT;
+               goto free_req;
+       }
+
+       /* If Static rate control is not enabled, sanitize the header. */
+       if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
+               req->hdr.pbc[2] = 0;
+
+       /* Validate the opcode. Do not trust packets from user space blindly. */
+       opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
+       if ((opcode & USER_OPCODE_CHECK_MASK) !=
+            USER_OPCODE_CHECK_VAL) {
+               SDMA_DBG(req, "Invalid opcode (%d)", opcode);
+               ret = -EINVAL;
+               goto free_req;
+       }
+       /*
+        * Validate the vl. Do not trust packets from user space blindly.
+        * VL comes from PBC, SC comes from LRH, and the VL needs to
+        * match the SC lookup.
+        */
+       vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
+       sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
+             (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
+       if (vl >= dd->pport->vls_operational ||
+           vl != sc_to_vlt(dd, sc)) {
+               SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
+               ret = -EINVAL;
+               goto free_req;
+       }
+
+       /* Checking P_KEY for requests from user-space */
+       if (egress_pkey_check(dd->pport, req->hdr.lrh, req->hdr.bth, sc,
+                             PKEY_CHECK_INVALID)) {
+               ret = -EINVAL;
+               goto free_req;
+       }
+
+       /*
+        * We should also check the BTH.lnh. If it says the next header is
+        * GRH then the RXE parsing will be off and will land in the middle
+        * of the KDETH or miss it entirely.
+        */
+       if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
+               SDMA_DBG(req, "User tried to pass in a GRH");
+               ret = -EINVAL;
+               goto free_req;
+       }
+
+       req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
+       /*
+        * Calculate the initial TID offset based on the values of
+        * KDETH.OFFSET and KDETH.OM that are passed in.
+        */
+       req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
+               (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
+                KDETH_OM_LARGE : KDETH_OM_SMALL);
+       SDMA_DBG(req, "Initial TID offset %u", req->tidoffset);
+       idx++;
+
+       /* Save all the IO vector structures */
+       while (i < req->data_iovs) {
+               INIT_LIST_HEAD(&req->iovs[i].list);
+               memcpy(&req->iovs[i].iov, iovec + idx++, sizeof(struct iovec));
+               ret = pin_vector_pages(req, &req->iovs[i]);
+               if (ret) {
+                       req->status = ret;
+                       goto free_req;
+               }
+               req->data_len += req->iovs[i++].iov.iov_len;
+       }
+       SDMA_DBG(req, "total data length %u", req->data_len);
+
+       if (pcount > req->info.npkts)
+               pcount = req->info.npkts;
+       /*
+        * Copy any TID info
+        * User space will provide the TID info only when the
+        * request type is EXPECTED. This is true even if there is
+        * only one packet in the request and the header is already
+        * set up. The reason for the singular TID case is that the
+        * driver needs to perform safety checks.
+        */
+       if (req_opcode(req->info.ctrl) == EXPECTED) {
+               u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
+
+               if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
+                       ret = -EINVAL;
+                       goto free_req;
+               }
+               req->tids = kcalloc(ntids, sizeof(*req->tids), GFP_KERNEL);
+               if (!req->tids) {
+                       ret = -ENOMEM;
+                       goto free_req;
+               }
+               /*
+                * We have to copy all of the tids because they may vary
+                * in size and, therefore, the TID count might not be
+                * equal to the pkt count. However, there is no way to
+                * tell at this point.
+                */
+               ret = copy_from_user(req->tids, iovec[idx].iov_base,
+                                    ntids * sizeof(*req->tids));
+               if (ret) {
+                       SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
+                                ntids, ret);
+                       ret = -EFAULT;
+                       goto free_req;
+               }
+               req->n_tids = ntids;
+               idx++;
+       }
+
+       /* Have to select the engine */
+       req->sde = sdma_select_engine_vl(dd,
+                                        (u32)(uctxt->ctxt + fd->subctxt),
+                                        vl);
+       if (!req->sde || !sdma_running(req->sde)) {
+               ret = -ECOMM;
+               goto free_req;
+       }
+
+       /* We don't need an AHG entry if the request contains only one packet */
+       if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG)) {
+               int ahg = sdma_ahg_alloc(req->sde);
+
+               if (likely(ahg >= 0)) {
+                       req->ahg_idx = (u8)ahg;
+                       set_bit(SDMA_REQ_HAVE_AHG, &req->flags);
+               }
+       }
+
+       set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
+       atomic_inc(&pq->n_reqs);
+       req_queued = 1;
+       /* Send the first N packets in the request to buy us some time */
+       ret = user_sdma_send_pkts(req, pcount);
+       if (unlikely(ret < 0 && ret != -EBUSY)) {
+               req->status = ret;
+               goto free_req;
+       }
+
+       /*
+        * It is possible that the SDMA engine would have processed all the
+        * submitted packets by the time we get here. Therefore, only set
+        * packet queue state to ACTIVE if there are still uncompleted
+        * requests.
+        */
+       if (atomic_read(&pq->n_reqs))
+               xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
+
+       /*
+        * This is a somewhat blocking send implementation.
+        * The driver will block the caller until all packets of the
+        * request have been submitted to the SDMA engine. However, it
+        * will not wait for send completions.
+        */
+       while (!test_bit(SDMA_REQ_SEND_DONE, &req->flags)) {
+               ret = user_sdma_send_pkts(req, pcount);
+               if (ret < 0) {
+                       if (ret != -EBUSY) {
+                               req->status = ret;
+                               set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
+                               if (ACCESS_ONCE(req->seqcomp) ==
+                                   req->seqsubmitted - 1)
+                                       goto free_req;
+                               return ret;
+                       }
+                       wait_event_interruptible_timeout(
+                               pq->busy.wait_dma,
+                               (pq->state == SDMA_PKT_Q_ACTIVE),
+                               msecs_to_jiffies(
+                                       SDMA_IOWAIT_TIMEOUT));
+               }
+       }
+       *count += idx;
+       return 0;
+free_req:
+       user_sdma_free_request(req, true);
+       if (req_queued)
+               pq_update(pq);
+       set_comp_state(pq, cq, info.comp_idx, ERROR, req->status);
+       return ret;
+}
+
+static inline u32 compute_data_length(struct user_sdma_request *req,
+                                     struct user_sdma_txreq *tx)
+{
+       /*
+        * Determine the proper size of the packet data.
+        * The size of the data of the first packet is in the header
+        * template. However, it includes the header and ICRC, which need
+        * to be subtracted.
+        * The size of the remaining packets is the minimum of the frag
+        * size (MTU) and the remaining data in the request.
+        */
+       u32 len;
+
+       if (!req->seqnum) {
+               len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
+                      (sizeof(tx->hdr) - 4));
+       } else if (req_opcode(req->info.ctrl) == EXPECTED) {
+               u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
+                       PAGE_SIZE;
+               /*
+                * Get the data length based on the remaining space in the
+                * TID pair.
+                */
+               len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
+               /* If we've filled up the TID pair, move to the next one. */
+               if (unlikely(!len) && ++req->tididx < req->n_tids &&
+                   req->tids[req->tididx]) {
+                       tidlen = EXP_TID_GET(req->tids[req->tididx],
+                                            LEN) * PAGE_SIZE;
+                       req->tidoffset = 0;
+                       len = min_t(u32, tidlen, req->info.fragsize);
+               }
+               /*
+                * Since the TID pairs map entire pages, make sure that we
+                * are not going to try to send more data than we have
+                * remaining.
+                */
+               len = min(len, req->data_len - req->sent);
+       } else {
+               len = min(req->data_len - req->sent, (u32)req->info.fragsize);
+       }
+       SDMA_DBG(req, "Data Length = %u", len);
+       return len;
+}
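+
+/*
+ * Worked example for the EXPECTED branch above (illustrative numbers only):
+ * with a TID pair covering two pages (tidlen = 8192), tidoffset = 7000 and
+ * fragsize = 4096, len = min(8192 - 7000, 4096) = 1192 bytes, which is then
+ * clamped to the data still remaining in the request.
+ */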
+
+static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
+{
+       /* (Size of complete header - size of PBC) + 4B ICRC + data length */
+       return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
+}
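+
+/*
+ * For illustration (assuming an 8-byte PBC, hence the separate 4-byte ICRC
+ * term): a payload of, say, 1000 bytes with a 64-byte total header would
+ * give (64 - 8) + 4 + 1000 = 1060 bytes from the LRH onwards.
+ */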
+
+static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
+{
+       int ret = 0;
+       unsigned npkts = 0;
+       struct user_sdma_txreq *tx = NULL;
+       struct hfi1_user_sdma_pkt_q *pq = NULL;
+       struct user_sdma_iovec *iovec = NULL;
+
+       if (!req->pq)
+               return -EINVAL;
+
+       pq = req->pq;
+
+       /* If tx completion has reported an error, we are done. */
+       if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) {
+               set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
+               return -EFAULT;
+       }
+
+       /*
+        * Check if we might have sent the entire request already
+        */
+       if (unlikely(req->seqnum == req->info.npkts)) {
+               if (!list_empty(&req->txps))
+                       goto dosend;
+               return ret;
+       }
+
+       if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
+               maxpkts = req->info.npkts - req->seqnum;
+
+       while (npkts < maxpkts) {
+               u32 datalen = 0, queued = 0, data_sent = 0;
+               u64 iov_offset = 0;
+
+               /*
+                * Check whether any of the completions have come back
+                * with errors. If so, we are not going to process any
+                * more packets from this request.
+                */
+               if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) {
+                       set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
+                       return -EFAULT;
+               }
+
+               tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
+               if (!tx)
+                       return -ENOMEM;
+
+               tx->flags = 0;
+               tx->req = req;
+               tx->busycount = 0;
+               INIT_LIST_HEAD(&tx->list);
+
+               if (req->seqnum == req->info.npkts - 1)
+                       tx->flags |= TXREQ_FLAGS_REQ_LAST_PKT;
+
+               /*
+                * Calculate the payload size - this is the minimum of the
+                * fragment (MTU) size and the remaining bytes in the request,
+                * but only if we have payload data.
+                */
+               if (req->data_len) {
+                       iovec = &req->iovs[req->iov_idx];
+                       if (ACCESS_ONCE(iovec->offset) == iovec->iov.iov_len) {
+                               if (++req->iov_idx == req->data_iovs) {
+                                       ret = -EFAULT;
+                                       goto free_txreq;
+                               }
+                               iovec = &req->iovs[req->iov_idx];
+                               WARN_ON(iovec->offset);
+                       }
+
+                       datalen = compute_data_length(req, tx);
+                       if (!datalen) {
+                               SDMA_DBG(req,
+                                        "Request has data but pkt len is 0");
+                               ret = -EFAULT;
+                               goto free_tx;
+                       }
+               }
+
+               if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags)) {
+                       if (!req->seqnum) {
+                               u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
+                               u32 lrhlen = get_lrh_len(req->hdr, datalen);
+                               /*
+                                * Copy the request header into the tx header
+                                * because the HW needs a cacheline-aligned
+                                * address.
+                                * This copy could be optimized out if the hdr
+                                * member of user_sdma_request were also
+                                * cacheline aligned.
+                                */
+                               memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
+                               if (PBC2LRH(pbclen) != lrhlen) {
+                                       pbclen = (pbclen & 0xf000) |
+                                               LRH2PBC(lrhlen);
+                                       tx->hdr.pbc[0] = cpu_to_le16(pbclen);
+                               }
+                               ret = sdma_txinit_ahg(&tx->txreq,
+                                                     SDMA_TXREQ_F_AHG_COPY,
+                                                     sizeof(tx->hdr) + datalen,
+                                                     req->ahg_idx, 0, NULL, 0,
+                                                     user_sdma_txreq_cb);
+                               if (ret)
+                                       goto free_tx;
+                               ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq,
+                                                       &tx->hdr,
+                                                       sizeof(tx->hdr));
+                               if (ret)
+                                       goto free_txreq;
+                       } else {
+                               int changes;
+
+                               changes = set_txreq_header_ahg(req, tx,
+                                                              datalen);
+                               if (changes < 0)
+                                       goto free_tx;
+                               sdma_txinit_ahg(&tx->txreq,
+                                               SDMA_TXREQ_F_USE_AHG,
+                                               datalen, req->ahg_idx, changes,
+                                               req->ahg, sizeof(req->hdr),
+                                               user_sdma_txreq_cb);
+                       }
+               } else {
+                       ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
+                                         datalen, user_sdma_txreq_cb);
+                       if (ret)
+                               goto free_tx;
+                       /*
+                        * Modify the header for this packet. This only needs
+                        * to be done if we are not going to use AHG. Otherwise,
+                        * the HW will do it based on the changes we gave it
+                        * during sdma_txinit_ahg().
+                        */
+                       ret = set_txreq_header(req, tx, datalen);
+                       if (ret)
+                               goto free_txreq;
+               }
+
+               /*
+                * If the request contains any data vectors, add up to
+                * fragsize bytes to the descriptor.
+                */
+               while (queued < datalen &&
+                      (req->sent + data_sent) < req->data_len) {
+                       unsigned long base, offset;
+                       unsigned pageidx, len;
+
+                       base = (unsigned long)iovec->iov.iov_base;
+                       offset = offset_in_page(base + iovec->offset +
+                                               iov_offset);
+                       pageidx = (((iovec->offset + iov_offset +
+                                    base) - (base & PAGE_MASK)) >> PAGE_SHIFT);
+                       len = offset + req->info.fragsize > PAGE_SIZE ?
+                               PAGE_SIZE - offset : req->info.fragsize;
+                       len = min((datalen - queued), len);
+                       ret = sdma_txadd_page(pq->dd, &tx->txreq,
+                                             iovec->pages[pageidx],
+                                             offset, len);
+                       if (ret) {
+                               SDMA_DBG(req, "SDMA txreq add page failed %d\n",
+                                        ret);
+                               goto free_txreq;
+                       }
+                       iov_offset += len;
+                       queued += len;
+                       data_sent += len;
+                       if (unlikely(queued < datalen &&
+                                    pageidx == iovec->npages &&
+                                    req->iov_idx < req->data_iovs - 1)) {
+                               iovec->offset += iov_offset;
+                               iovec = &req->iovs[++req->iov_idx];
+                               iov_offset = 0;
+                       }
+               }
+               /*
+                * The txreq was submitted successfully so we can update
+                * the counters.
+                */
+               req->koffset += datalen;
+               if (req_opcode(req->info.ctrl) == EXPECTED)
+                       req->tidoffset += datalen;
+               req->sent += data_sent;
+               if (req->data_len)
+                       iovec->offset += iov_offset;
+               list_add_tail(&tx->txreq.list, &req->txps);
+               /*
+                * It is important to increment this here as it is used to
+                * generate the BTH.PSN and, therefore, can't be bulk-updated
+                * outside of the loop.
+                */
+               tx->seqnum = req->seqnum++;
+               npkts++;
+       }
+dosend:
+       ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps);
+       if (list_empty(&req->txps)) {
+               req->seqsubmitted = req->seqnum;
+               if (req->seqnum == req->info.npkts) {
+                       set_bit(SDMA_REQ_SEND_DONE, &req->flags);
+                       /*
+                        * The txreq has already been submitted to the HW queue
+                        * so we can free the AHG entry now. Corruption will not
+                        * happen due to the sequential manner in which
+                        * descriptors are processed.
+                        */
+                       if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags))
+                               sdma_ahg_free(req->sde, req->ahg_idx);
+               }
+       } else if (ret > 0) {
+               req->seqsubmitted += ret;
+               ret = 0;
+       }
+       return ret;
+
+free_txreq:
+       sdma_txclean(pq->dd, &tx->txreq);
+free_tx:
+       kmem_cache_free(pq->txreq_cache, tx);
+       return ret;
+}
+
+/*
+ * How many pages in this iovec element?
+ */
+static inline int num_user_pages(const struct iovec *iov)
+{
+       const unsigned long addr  = (unsigned long)iov->iov_base;
+       const unsigned long len   = iov->iov_len;
+       const unsigned long spage = addr & PAGE_MASK;
+       const unsigned long epage = (addr + len - 1) & PAGE_MASK;
+
+       return 1 + ((epage - spage) >> PAGE_SHIFT);
+}
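+
+/*
+ * For illustration, with 4 KiB pages: iov_base = 0x1800 and iov_len = 0x1000
+ * give spage = 0x1000 and epage = 0x2000, so the vector spans
+ * 1 + ((0x2000 - 0x1000) >> 12) = 2 pages.
+ */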
+
+static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
+{
+       u32 cleared = 0;
+       struct sdma_mmu_node *node, *ptr;
+       struct list_head to_evict = LIST_HEAD_INIT(to_evict);
+
+       spin_lock(&pq->evict_lock);
+       list_for_each_entry_safe_reverse(node, ptr, &pq->evict, list) {
+               /* Make sure that no one is still using the node. */
+               if (!atomic_read(&node->refcount)) {
+                       set_bit(SDMA_CACHE_NODE_EVICT, &node->flags);
+                       list_del_init(&node->list);
+                       list_add(&node->list, &to_evict);
+                       cleared += node->npages;
+                       if (cleared >= npages)
+                               break;
+               }
+       }
+       spin_unlock(&pq->evict_lock);
+
+       list_for_each_entry_safe(node, ptr, &to_evict, list)
+               hfi1_mmu_rb_remove(&pq->sdma_rb_root, &node->rb);
+
+       return cleared;
+}
+
+static int pin_vector_pages(struct user_sdma_request *req,
+                           struct user_sdma_iovec *iovec) {
+       int ret = 0, pinned, npages, cleared;
+       struct page **pages;
+       struct hfi1_user_sdma_pkt_q *pq = req->pq;
+       struct sdma_mmu_node *node = NULL;
+       struct mmu_rb_node *rb_node;
+
+       rb_node = hfi1_mmu_rb_extract(&pq->sdma_rb_root,
+                                     (unsigned long)iovec->iov.iov_base,
+                                     iovec->iov.iov_len);
+       if (rb_node && !IS_ERR(rb_node))
+               node = container_of(rb_node, struct sdma_mmu_node, rb);
+       else
+               rb_node = NULL;
+
+       if (!node) {
+               node = kzalloc(sizeof(*node), GFP_KERNEL);
+               if (!node)
+                       return -ENOMEM;
+
+               node->rb.addr = (unsigned long)iovec->iov.iov_base;
+               node->pq = pq;
+               atomic_set(&node->refcount, 0);
+               INIT_LIST_HEAD(&node->list);
+       }
+
+       npages = num_user_pages(&iovec->iov);
+       if (node->npages < npages) {
+               pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
+               if (!pages) {
+                       SDMA_DBG(req, "Failed page array alloc");
+                       ret = -ENOMEM;
+                       goto bail;
+               }
+               memcpy(pages, node->pages, node->npages * sizeof(*pages));
+
+               npages -= node->npages;
+
+               /*
+                * If rb_node is NULL, it means that this is a brand new node
+                * and, therefore, not on the eviction list.
+                * If, however, the rb_node is non-NULL, it means that the
+                * node is already in the RB tree and, therefore, on the eviction
+                * list (nodes are unconditionally inserted in the eviction
+                * list). In that case, we have to remove the node prior to
+                * calling the eviction function in order to prevent it from
+                * freeing this node.
+                */
+               if (rb_node) {
+                       spin_lock(&pq->evict_lock);
+                       list_del_init(&node->list);
+                       spin_unlock(&pq->evict_lock);
+               }
+retry:
+               if (!hfi1_can_pin_pages(pq->dd, pq->n_locked, npages)) {
+                       cleared = sdma_cache_evict(pq, npages);
+                       if (cleared >= npages)
+                               goto retry;
+               }
+               pinned = hfi1_acquire_user_pages(
+                       ((unsigned long)iovec->iov.iov_base +
+                        (node->npages * PAGE_SIZE)), npages, 0,
+                       pages + node->npages);
+               if (pinned < 0) {
+                       kfree(pages);
+                       ret = pinned;
+                       goto bail;
+               }
+               if (pinned != npages) {
+                       unpin_vector_pages(current->mm, pages, node->npages,
+                                          pinned);
+                       ret = -EFAULT;
+                       goto bail;
+               }
+               kfree(node->pages);
+               node->rb.len = iovec->iov.iov_len;
+               node->pages = pages;
+               node->npages += pinned;
+               npages = node->npages;
+               spin_lock(&pq->evict_lock);
+               list_add(&node->list, &pq->evict);
+               pq->n_locked += pinned;
+               spin_unlock(&pq->evict_lock);
+       }
+       iovec->pages = node->pages;
+       iovec->npages = npages;
+       iovec->node = node;
+
+       ret = hfi1_mmu_rb_insert(&req->pq->sdma_rb_root, &node->rb);
+       if (ret) {
+               spin_lock(&pq->evict_lock);
+               if (!list_empty(&node->list))
+                       list_del(&node->list);
+               pq->n_locked -= node->npages;
+               spin_unlock(&pq->evict_lock);
+               goto bail;
+       }
+       return 0;
+bail:
+       if (rb_node)
+               unpin_vector_pages(current->mm, node->pages, 0, node->npages);
+       kfree(node);
+       return ret;
+}
+
+static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
+                              unsigned start, unsigned npages)
+{
+       hfi1_release_user_pages(mm, pages + start, npages, 0);
+       kfree(pages);
+}
+
+static int check_header_template(struct user_sdma_request *req,
+                                struct hfi1_pkt_header *hdr, u32 lrhlen,
+                                u32 datalen)
+{
+       /*
+        * Perform safety checks for any type of packet:
+        *    - transfer size is a multiple of 64 bytes
+        *    - packet length is a multiple of 4 bytes
+        *    - entire request length is a multiple of 4 bytes
+        *    - packet length is not larger than MTU size
+        *
+        * These checks are only done for the first packet of the
+        * transfer since the header is "given" to us by user space.
+        * For the remainder of the packets we compute the values.
+        */
+       if (req->info.fragsize % PIO_BLOCK_SIZE ||
+           lrhlen & 0x3 || req->data_len & 0x3  ||
+           lrhlen > get_lrh_len(*hdr, req->info.fragsize))
+               return -EINVAL;
+
+       if (req_opcode(req->info.ctrl) == EXPECTED) {
+               /*
+                * The header is checked only on the first packet. Furthermore,
+                * we ensure that at least one TID entry is copied when the
+                * request is submitted. Therefore, we don't have to verify that
+                * tididx points to something sane.
+                */
+               u32 tidval = req->tids[req->tididx],
+                       tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
+                       tididx = EXP_TID_GET(tidval, IDX),
+                       tidctrl = EXP_TID_GET(tidval, CTRL),
+                       tidoff;
+               __le32 kval = hdr->kdeth.ver_tid_offset;
+
+               tidoff = KDETH_GET(kval, OFFSET) *
+                         (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
+                          KDETH_OM_LARGE : KDETH_OM_SMALL);
+               /*
+                * Expected receive packets have the following
+                * additional checks:
+                *     - offset is not larger than the TID size
+                *     - TIDCtrl values match between header and TID array
+                *     - TID indexes match between header and TID array
+                */
+               if ((tidoff + datalen > tidlen) ||
+                   KDETH_GET(kval, TIDCTRL) != tidctrl ||
+                   KDETH_GET(kval, TID) != tididx)
+                       return -EINVAL;
+       }
+       return 0;
+}
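+
+/*
+ * Example of the first-packet checks above (illustrative values): a fragsize
+ * of 4096 passes the 64-byte multiple check, whereas 4100 would be rejected;
+ * likewise lrhlen and the total request length must be 4-byte aligned.
+ */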
+
+/*
+ * Correctly set the BTH.PSN field based on type of
+ * transfer - eager packets can just increment the PSN, but
+ * expected packets encode generation and sequence in the
+ * BTH.PSN field, so just incrementing will result in errors.
+ */
+static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
+{
+       u32 val = be32_to_cpu(bthpsn),
+               mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
+                       0xffffffull),
+               psn = val & mask;
+       if (expct)
+               psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK);
+       else
+               psn = psn + frags;
+       return psn & mask;
+}
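+
+/*
+ * For illustration, with the 24-bit (non-extended) PSN mask: an eager
+ * transfer with psn = 0x00ff00 and frags = 3 yields 0x00ff03, while an
+ * expected transfer only advances the low BTH_SEQ_MASK bits, so the
+ * generation bits above them are preserved.
+ */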
+
+static int set_txreq_header(struct user_sdma_request *req,
+                           struct user_sdma_txreq *tx, u32 datalen)
+{
+       struct hfi1_user_sdma_pkt_q *pq = req->pq;
+       struct hfi1_pkt_header *hdr = &tx->hdr;
+       u16 pbclen;
+       int ret;
+       u32 tidval = 0, lrhlen = get_lrh_len(*hdr, datalen);
+
+       /* Copy the header template to the request before modification */
+       memcpy(hdr, &req->hdr, sizeof(*hdr));
+
+       /*
+        * Check if the PBC and LRH lengths are mismatched. If so,
+        * adjust both in the header.
+        */
+       pbclen = le16_to_cpu(hdr->pbc[0]);
+       if (PBC2LRH(pbclen) != lrhlen) {
+               pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
+               hdr->pbc[0] = cpu_to_le16(pbclen);
+               hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
+               /*
+                * Third packet
+                * This is the first packet in the sequence that has
+                * a "static" size that can be used for the rest of
+                * the packets (besides the last one).
+                */
+               if (unlikely(req->seqnum == 2)) {
+                       /*
+                        * From this point on the lengths in both the
+                        * PBC and LRH are the same until the last
+                        * packet.
+                        * Adjust the template so we don't have to update
+                        * every packet.
+                        */
+                       req->hdr.pbc[0] = hdr->pbc[0];
+                       req->hdr.lrh[2] = hdr->lrh[2];
+               }
+       }
+       /*
+        * We only have to modify the header if this is not the
+        * first packet in the request. Otherwise, we use the
+        * header given to us.
+        */
+       if (unlikely(!req->seqnum)) {
+               ret = check_header_template(req, hdr, lrhlen, datalen);
+               if (ret)
+                       return ret;
+               goto done;
+       }
+
+       hdr->bth[2] = cpu_to_be32(
+               set_pkt_bth_psn(hdr->bth[2],
+                               (req_opcode(req->info.ctrl) == EXPECTED),
+                               req->seqnum));
+
+       /* Set ACK request on last packet */
+       if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
+               hdr->bth[2] |= cpu_to_be32(1UL << 31);
+
+       /* Set the new offset */
+       hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
+       /* Expected packets have to fill in the new TID information */
+       if (req_opcode(req->info.ctrl) == EXPECTED) {
+               tidval = req->tids[req->tididx];
+               /*
+                * If the offset puts us at the end of the current TID,
+                * advance everything.
+                */
+               if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
+                                        PAGE_SIZE)) {
+                       req->tidoffset = 0;
+                       /*
+                        * Since we don't copy all the TIDs all at once,
+                        * we have to check again.
+                        */
+                       if (++req->tididx > req->n_tids - 1 ||
+                           !req->tids[req->tididx]) {
+                               return -EINVAL;
+                       }
+                       tidval = req->tids[req->tididx];
+               }
+               req->omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
+                       KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE : KDETH_OM_SMALL;
+               /* Set KDETH.TIDCtrl based on value for this TID. */
+               KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
+                         EXP_TID_GET(tidval, CTRL));
+               /* Set KDETH.TID based on value for this TID */
+               KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
+                         EXP_TID_GET(tidval, IDX));
+               /* Clear KDETH.SH only on the last packet */
+               if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
+                       KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
+               /*
+                * Set the KDETH.OFFSET and KDETH.OM based on size of
+                * transfer.
+                */
+               SDMA_DBG(req, "TID offset %ubytes %uunits om%u",
+                        req->tidoffset, req->tidoffset / req->omfactor,
+                        !!(req->omfactor - KDETH_OM_SMALL));
+               KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
+                         req->tidoffset / req->omfactor);
+               KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
+                         !!(req->omfactor - KDETH_OM_SMALL));
+       }
+done:
+       trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
+                                   req->info.comp_idx, hdr, tidval);
+       return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
+}
+
+static int set_txreq_header_ahg(struct user_sdma_request *req,
+                               struct user_sdma_txreq *tx, u32 len)
+{
+       int diff = 0;
+       struct hfi1_user_sdma_pkt_q *pq = req->pq;
+       struct hfi1_pkt_header *hdr = &req->hdr;
+       u16 pbclen = le16_to_cpu(hdr->pbc[0]);
+       u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, len);
+
+       if (PBC2LRH(pbclen) != lrhlen) {
+               /* PBC.PbcLengthDWs */
+               AHG_HEADER_SET(req->ahg, diff, 0, 0, 12,
+                              cpu_to_le16(LRH2PBC(lrhlen)));
+               /* LRH.PktLen (we need the full 16 bits due to byte swap) */
+               AHG_HEADER_SET(req->ahg, diff, 3, 0, 16,
+                              cpu_to_be16(lrhlen >> 2));
+       }
+
+       /*
+        * Do the common updates
+        */
+       /* BTH.PSN and BTH.A */
+       val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
+               (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
+       if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
+               val32 |= 1UL << 31;
+       AHG_HEADER_SET(req->ahg, diff, 6, 0, 16, cpu_to_be16(val32 >> 16));
+       AHG_HEADER_SET(req->ahg, diff, 6, 16, 16, cpu_to_be16(val32 & 0xffff));
+       /* KDETH.Offset */
+       AHG_HEADER_SET(req->ahg, diff, 15, 0, 16,
+                      cpu_to_le16(req->koffset & 0xffff));
+       AHG_HEADER_SET(req->ahg, diff, 15, 16, 16,
+                      cpu_to_le16(req->koffset >> 16));
+       if (req_opcode(req->info.ctrl) == EXPECTED) {
+               __le16 val;
+
+               tidval = req->tids[req->tididx];
+
+               /*
+                * If the offset puts us at the end of the current TID,
+                * advance everything.
+                */
+               if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
+                                        PAGE_SIZE)) {
+                       req->tidoffset = 0;
+                       /*
+                        * Since we don't copy all the TIDs all at once,
+                        * we have to check again.
+                        */
+                       if (++req->tididx > req->n_tids - 1 ||
+                           !req->tids[req->tididx]) {
+                               return -EINVAL;
+                       }
+                       tidval = req->tids[req->tididx];
+               }
+               req->omfactor = ((EXP_TID_GET(tidval, LEN) *
+                                 PAGE_SIZE) >=
+                                KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE :
+                       KDETH_OM_SMALL;
+               /* KDETH.OM and KDETH.OFFSET (TID) */
+               AHG_HEADER_SET(req->ahg, diff, 7, 0, 16,
+                              ((!!(req->omfactor - KDETH_OM_SMALL)) << 15 |
+                               ((req->tidoffset / req->omfactor) & 0x7fff)));
+               /* KDETH.TIDCtrl, KDETH.TID */
+               val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
+                                       (EXP_TID_GET(tidval, IDX) & 0x3ff));
+               /* Clear KDETH.SH on last packet */
+               if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT)) {
+                       val |= cpu_to_le16(KDETH_GET(hdr->kdeth.ver_tid_offset,
+                                                               INTR) >> 16);
+                       val &= cpu_to_le16(~(1U << 13));
+                       AHG_HEADER_SET(req->ahg, diff, 7, 16, 14, val);
+               } else {
+                       AHG_HEADER_SET(req->ahg, diff, 7, 16, 12, val);
+               }
+       }
+
+       trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
+                                       req->info.comp_idx, req->sde->this_idx,
+                                       req->ahg_idx, req->ahg, diff, tidval);
+       return diff;
+}
+
+/*
+ * SDMA tx request completion callback. Called when the SDMA progress
+ * state machine gets notification that the SDMA descriptors for this
+ * tx request have been processed by the DMA engine. Called in
+ * interrupt context.
+ */
+static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
+{
+       struct user_sdma_txreq *tx =
+               container_of(txreq, struct user_sdma_txreq, txreq);
+       struct user_sdma_request *req;
+       struct hfi1_user_sdma_pkt_q *pq;
+       struct hfi1_user_sdma_comp_q *cq;
+       u16 idx;
+
+       if (!tx->req)
+               return;
+
+       req = tx->req;
+       pq = req->pq;
+       cq = req->cq;
+
+       if (status != SDMA_TXREQ_S_OK) {
+               SDMA_DBG(req, "SDMA completion with error %d",
+                        status);
+               set_bit(SDMA_REQ_HAS_ERROR, &req->flags);
+       }
+
+       req->seqcomp = tx->seqnum;
+       kmem_cache_free(pq->txreq_cache, tx);
+       tx = NULL;
+
+       idx = req->info.comp_idx;
+       if (req->status == -1 && status == SDMA_TXREQ_S_OK) {
+               if (req->seqcomp == req->info.npkts - 1) {
+                       req->status = 0;
+                       user_sdma_free_request(req, false);
+                       pq_update(pq);
+                       set_comp_state(pq, cq, idx, COMPLETE, 0);
+               }
+       } else {
+               if (status != SDMA_TXREQ_S_OK)
+                       req->status = status;
+               if (req->seqcomp == (ACCESS_ONCE(req->seqsubmitted) - 1) &&
+                   (test_bit(SDMA_REQ_SEND_DONE, &req->flags) ||
+                    test_bit(SDMA_REQ_DONE_ERROR, &req->flags))) {
+                       user_sdma_free_request(req, false);
+                       pq_update(pq);
+                       set_comp_state(pq, cq, idx, ERROR, req->status);
+               }
+       }
+}
+
+static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
+{
+       if (atomic_dec_and_test(&pq->n_reqs)) {
+               xchg(&pq->state, SDMA_PKT_Q_INACTIVE);
+               wake_up(&pq->wait);
+       }
+}
+
+static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
+{
+       if (!list_empty(&req->txps)) {
+               struct sdma_txreq *t, *p;
+
+               list_for_each_entry_safe(t, p, &req->txps, list) {
+                       struct user_sdma_txreq *tx =
+                               container_of(t, struct user_sdma_txreq, txreq);
+                       list_del_init(&t->list);
+                       sdma_txclean(req->pq->dd, t);
+                       kmem_cache_free(req->pq->txreq_cache, tx);
+               }
+       }
+       if (req->data_iovs) {
+               struct sdma_mmu_node *node;
+               int i;
+
+               for (i = 0; i < req->data_iovs; i++) {
+                       node = req->iovs[i].node;
+                       if (!node)
+                               continue;
+
+                       if (unpin)
+                               hfi1_mmu_rb_remove(&req->pq->sdma_rb_root,
+                                                  &node->rb);
+                       else
+                               atomic_dec(&node->refcount);
+               }
+       }
+       kfree(req->tids);
+       clear_bit(SDMA_REQ_IN_USE, &req->flags);
+}
+
+static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
+                                 struct hfi1_user_sdma_comp_q *cq,
+                                 u16 idx, enum hfi1_sdma_comp_state state,
+                                 int ret)
+{
+       hfi1_cdbg(SDMA, "[%u:%u:%u:%u] Setting completion status %u %d",
+                 pq->dd->unit, pq->ctxt, pq->subctxt, idx, state, ret);
+       cq->comps[idx].status = state;
+       if (state == ERROR)
+               cq->comps[idx].errcode = -ret;
+       trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
+                                       idx, state, ret);
+}
+
+static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
+                          unsigned long len)
+{
+       return (bool)(node->addr == addr);
+}
+
+static int sdma_rb_insert(struct rb_root *root, struct mmu_rb_node *mnode)
+{
+       struct sdma_mmu_node *node =
+               container_of(mnode, struct sdma_mmu_node, rb);
+
+       atomic_inc(&node->refcount);
+       return 0;
+}
+
+static void sdma_rb_remove(struct rb_root *root, struct mmu_rb_node *mnode,
+                          struct mm_struct *mm)
+{
+       struct sdma_mmu_node *node =
+               container_of(mnode, struct sdma_mmu_node, rb);
+
+       spin_lock(&node->pq->evict_lock);
+       /*
+        * If we have been called by the MMU notifier and this node has
+        * already been scheduled for eviction, the eviction function
+        * will take care of freeing it, so bail out here.
+        * We have to take the above lock first because we are racing
+        * against the setting of the bit in the eviction function.
+        */
+       if (mm && test_bit(SDMA_CACHE_NODE_EVICT, &node->flags)) {
+               spin_unlock(&node->pq->evict_lock);
+               return;
+       }
+
+       if (!list_empty(&node->list))
+               list_del(&node->list);
+       node->pq->n_locked -= node->npages;
+       spin_unlock(&node->pq->evict_lock);
+
+       /*
+        * If mm is set, we are being called by the MMU notifier and we
+        * should not pass a mm_struct to unpin_vector_pages(). This is to
+        * prevent a deadlock when hfi1_release_user_pages() attempts to
+        * take the mmap_sem, which the MMU notifier has already taken.
+        */
+       unpin_vector_pages(mm ? NULL : current->mm, node->pages, 0,
+                          node->npages);
+       /*
+        * If called by the MMU notifier, we have to adjust the pinned
+        * page count ourselves.
+        */
+       if (mm)
+               mm->pinned_vm -= node->npages;
+       kfree(node);
+}
+
+static int sdma_rb_invalidate(struct rb_root *root, struct mmu_rb_node *mnode)
+{
+       struct sdma_mmu_node *node =
+               container_of(mnode, struct sdma_mmu_node, rb);
+
+       if (!atomic_read(&node->refcount))
+               return 1;
+       return 0;
+}
diff --git a/drivers/infiniband/hw/hfi1/user_sdma.h b/drivers/infiniband/hw/hfi1/user_sdma.h
new file mode 100644 (file)
index 0000000..b9240e3
--- /dev/null
@@ -0,0 +1,84 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/device.h>
+#include <linux/wait.h>
+
+#include "common.h"
+#include "iowait.h"
+#include "user_exp_rcv.h"
+
+extern uint extended_psn;
+
+struct hfi1_user_sdma_pkt_q {
+       struct list_head list;
+       unsigned ctxt;
+       unsigned subctxt;
+       u16 n_max_reqs;
+       atomic_t n_reqs;
+       u16 reqidx;
+       struct hfi1_devdata *dd;
+       struct kmem_cache *txreq_cache;
+       struct user_sdma_request *reqs;
+       struct iowait busy;
+       unsigned state;
+       wait_queue_head_t wait;
+       unsigned long unpinned;
+       struct rb_root sdma_rb_root;
+       u32 n_locked;
+       struct list_head evict;
+       spinlock_t evict_lock; /* protect evict and n_locked */
+};
+
+struct hfi1_user_sdma_comp_q {
+       u16 nentries;
+       struct hfi1_sdma_comp_entry *comps;
+};
+
+int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *, struct file *);
+int hfi1_user_sdma_free_queues(struct hfi1_filedata *);
+int hfi1_user_sdma_process_request(struct file *, struct iovec *, unsigned long,
+                                  unsigned long *);
diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c
new file mode 100644 (file)
index 0000000..849c4b9
--- /dev/null
@@ -0,0 +1,1764 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_user_verbs.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/utsname.h>
+#include <linux/rculist.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+
+#include "hfi.h"
+#include "common.h"
+#include "device.h"
+#include "trace.h"
+#include "qp.h"
+#include "verbs_txreq.h"
+
+static unsigned int hfi1_lkey_table_size = 16;
+module_param_named(lkey_table_size, hfi1_lkey_table_size, uint,
+                  S_IRUGO);
+MODULE_PARM_DESC(lkey_table_size,
+                "LKEY table size in bits (2^n, 1 <= n <= 23)");
+
+static unsigned int hfi1_max_pds = 0xFFFF;
+module_param_named(max_pds, hfi1_max_pds, uint, S_IRUGO);
+MODULE_PARM_DESC(max_pds,
+                "Maximum number of protection domains to support");
+
+static unsigned int hfi1_max_ahs = 0xFFFF;
+module_param_named(max_ahs, hfi1_max_ahs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support");
+
+unsigned int hfi1_max_cqes = 0x2FFFF;
+module_param_named(max_cqes, hfi1_max_cqes, uint, S_IRUGO);
+MODULE_PARM_DESC(max_cqes,
+                "Maximum number of completion queue entries to support");
+
+unsigned int hfi1_max_cqs = 0x1FFFF;
+module_param_named(max_cqs, hfi1_max_cqs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support");
+
+unsigned int hfi1_max_qp_wrs = 0x3FFF;
+module_param_named(max_qp_wrs, hfi1_max_qp_wrs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support");
+
+unsigned int hfi1_max_qps = 16384;
+module_param_named(max_qps, hfi1_max_qps, uint, S_IRUGO);
+MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support");
+
+unsigned int hfi1_max_sges = 0x60;
+module_param_named(max_sges, hfi1_max_sges, uint, S_IRUGO);
+MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support");
+
+unsigned int hfi1_max_mcast_grps = 16384;
+module_param_named(max_mcast_grps, hfi1_max_mcast_grps, uint, S_IRUGO);
+MODULE_PARM_DESC(max_mcast_grps,
+                "Maximum number of multicast groups to support");
+
+unsigned int hfi1_max_mcast_qp_attached = 16;
+module_param_named(max_mcast_qp_attached, hfi1_max_mcast_qp_attached,
+                  uint, S_IRUGO);
+MODULE_PARM_DESC(max_mcast_qp_attached,
+                "Maximum number of attached QPs to support");
+
+unsigned int hfi1_max_srqs = 1024;
+module_param_named(max_srqs, hfi1_max_srqs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support");
+
+unsigned int hfi1_max_srq_sges = 128;
+module_param_named(max_srq_sges, hfi1_max_srq_sges, uint, S_IRUGO);
+MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support");
+
+unsigned int hfi1_max_srq_wrs = 0x1FFFF;
+module_param_named(max_srq_wrs, hfi1_max_srq_wrs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs to support");
+
+unsigned short piothreshold = 256;
+module_param(piothreshold, ushort, S_IRUGO);
+MODULE_PARM_DESC(piothreshold, "size used to determine sdma vs. pio");
+
+#define COPY_CACHELESS 1
+#define COPY_ADAPTIVE  2
+static unsigned int sge_copy_mode;
+module_param(sge_copy_mode, uint, S_IRUGO);
+MODULE_PARM_DESC(sge_copy_mode,
+                "Verbs copy mode: 0 use memcpy, 1 use cacheless copy, 2 adapt based on WSS");
+
+static void verbs_sdma_complete(
+       struct sdma_txreq *cookie,
+       int status);
+
+static int pio_wait(struct rvt_qp *qp,
+                   struct send_context *sc,
+                   struct hfi1_pkt_state *ps,
+                   u32 flag);
+
+/* Length of buffer to create verbs txreq cache name */
+#define TXREQ_NAME_LEN 24
+
+static uint wss_threshold;
+module_param(wss_threshold, uint, S_IRUGO);
+MODULE_PARM_DESC(wss_threshold, "Percentage (1-100) of LLC to use as a threshold for a cacheless copy");
+static uint wss_clean_period = 256;
+module_param(wss_clean_period, uint, S_IRUGO);
+MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the page copy table is cleaned");
+
+/* memory working set size */
+struct hfi1_wss {
+       unsigned long *entries;
+       atomic_t total_count;
+       atomic_t clean_counter;
+       atomic_t clean_entry;
+
+       int threshold;
+       int num_entries;
+       long pages_mask;
+};
+
+static struct hfi1_wss wss;
+
+int hfi1_wss_init(void)
+{
+       long llc_size;
+       long llc_bits;
+       long table_size;
+       long table_bits;
+
+       /* check for a valid percent range - default to 80 if none or invalid */
+       if (wss_threshold < 1 || wss_threshold > 100)
+               wss_threshold = 80;
+       /* reject a wildly large period */
+       if (wss_clean_period > 1000000)
+               wss_clean_period = 256;
+       /* reject a zero period */
+       if (wss_clean_period == 0)
+               wss_clean_period = 1;
+
+       /*
+        * Calculate the table size - the next power of 2 larger than the
+        * LLC size.  LLC size is in KiB.
+        */
+       llc_size = wss_llc_size() * 1024;
+       table_size = roundup_pow_of_two(llc_size);
+
+       /* one bit per page in rounded up table */
+       llc_bits = llc_size / PAGE_SIZE;
+       table_bits = table_size / PAGE_SIZE;
+       wss.pages_mask = table_bits - 1;
+       wss.num_entries = table_bits / BITS_PER_LONG;
+
+       wss.threshold = (llc_bits * wss_threshold) / 100;
+       if (wss.threshold == 0)
+               wss.threshold = 1;
+
+       atomic_set(&wss.clean_counter, wss_clean_period);
+
+       wss.entries = kcalloc(wss.num_entries, sizeof(*wss.entries),
+                             GFP_KERNEL);
+       if (!wss.entries) {
+               hfi1_wss_exit();
+               return -ENOMEM;
+       }
+
+       return 0;
+}
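+
+/*
+ * Sizing sketch with hypothetical numbers (assuming 4 KiB pages and a
+ * 64-bit build): a 30720 KiB LLC rounds up to a 32 MiB table, i.e. 8192
+ * page bits in 128 unsigned longs, and the default 80% threshold works
+ * out to 6144 distinct resident pages before copies go cacheless.
+ */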
+
+void hfi1_wss_exit(void)
+{
+       /* coded to handle partially initialized and repeat callers */
+       kfree(wss.entries);
+       wss.entries = NULL;
+}
+
+/*
+ * Advance the clean counter.  When the clean period has expired,
+ * clean an entry.
+ *
+ * This is implemented in atomics to avoid locking.  Because multiple
+ * variables are involved, it can be racy which can lead to slightly
+ * inaccurate information.  Since this is only a heuristic, this is
+ * OK.  Any inaccuracies will clean themselves out as the counter
+ * advances.  That said, it is unlikely the entry clean operation will
+ * race - the next possible racer will not start until the next clean
+ * period.
+ *
+ * The clean counter is implemented as a decrement to zero.  When zero
+ * is reached an entry is cleaned.
+ */
+static void wss_advance_clean_counter(void)
+{
+       int entry;
+       int weight;
+       unsigned long bits;
+
+       /* become the cleaner if we decrement the counter to zero */
+       if (atomic_dec_and_test(&wss.clean_counter)) {
+               /*
+                * Set, not add, the clean period.  This avoids an issue
+                * where the counter could decrement below the clean period.
+                * Doing a set can result in lost decrements, slowing the
+                * clean advance.  Since this is a heuristic, this possible
+                * slowdown is OK.
+                *
+                * An alternative is to loop, advancing the counter by a
+                * clean period until the result is > 0. However, this could
+                * lead to several threads keeping another in the clean loop.
+                * This could be mitigated by limiting the number of times
+                * we stay in the loop.
+                */
+               atomic_set(&wss.clean_counter, wss_clean_period);
+
+               /*
+                * Uniquely grab the entry to clean and move to next.
+                * The current entry is always the lower bits of
+                * wss.clean_entry.  The table size, wss.num_entries,
+                * is always a power-of-2.
+                */
+               entry = (atomic_inc_return(&wss.clean_entry) - 1)
+                       & (wss.num_entries - 1);
+
+               /* clear the entry and count the bits */
+               bits = xchg(&wss.entries[entry], 0);
+               weight = hweight64((u64)bits);
+               /* only adjust the contended total count if needed */
+               if (weight)
+                       atomic_sub(weight, &wss.total_count);
+       }
+}
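+
+/*
+ * With the default wss_clean_period of 256, one table entry (BITS_PER_LONG
+ * pages of history) is aged out per 256 copies, so a 128-entry table like
+ * the hypothetical one sketched above turns over about every 32768 copies.
+ */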
+
+/*
+ * Insert the given address into the working set array.
+ */
+static void wss_insert(void *address)
+{
+       u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss.pages_mask;
+       u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */
+       u32 nr = page & (BITS_PER_LONG - 1);
+
+       if (!test_and_set_bit(nr, &wss.entries[entry]))
+               atomic_inc(&wss.total_count);
+
+       wss_advance_clean_counter();
+}
+
+/*
+ * Is the working set larger than the threshold?
+ */
+static inline int wss_exceeds_threshold(void)
+{
+       return atomic_read(&wss.total_count) >= wss.threshold;
+}
+
+/*
+ * Translate ib_wr_opcode into ib_wc_opcode.
+ */
+const enum ib_wc_opcode ib_hfi1_wc_opcode[] = {
+       [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
+       [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
+       [IB_WR_SEND] = IB_WC_SEND,
+       [IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
+       [IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
+       [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
+       [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD
+};
+
+/*
+ * Length of header by opcode, 0 --> not supported
+ * (values are bytes: LRH + BTH (20) plus any extended transport headers)
+ */
+const u8 hdr_len_by_opcode[256] = {
+       /* RC */
+       [IB_OPCODE_RC_SEND_FIRST]                     = 12 + 8,
+       [IB_OPCODE_RC_SEND_MIDDLE]                    = 12 + 8,
+       [IB_OPCODE_RC_SEND_LAST]                      = 12 + 8,
+       [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE]       = 12 + 8 + 4,
+       [IB_OPCODE_RC_SEND_ONLY]                      = 12 + 8,
+       [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 4,
+       [IB_OPCODE_RC_RDMA_WRITE_FIRST]               = 12 + 8 + 16,
+       [IB_OPCODE_RC_RDMA_WRITE_MIDDLE]              = 12 + 8,
+       [IB_OPCODE_RC_RDMA_WRITE_LAST]                = 12 + 8,
+       [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
+       [IB_OPCODE_RC_RDMA_WRITE_ONLY]                = 12 + 8 + 16,
+       [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20,
+       [IB_OPCODE_RC_RDMA_READ_REQUEST]              = 12 + 8 + 16,
+       [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST]       = 12 + 8 + 4,
+       [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE]      = 12 + 8,
+       [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST]        = 12 + 8 + 4,
+       [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY]        = 12 + 8 + 4,
+       [IB_OPCODE_RC_ACKNOWLEDGE]                    = 12 + 8 + 4,
+       [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE]             = 12 + 8 + 4,
+       [IB_OPCODE_RC_COMPARE_SWAP]                   = 12 + 8 + 28,
+       [IB_OPCODE_RC_FETCH_ADD]                      = 12 + 8 + 28,
+       [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE]      = 12 + 8 + 4,
+       [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE]      = 12 + 8 + 4,
+       /* UC */
+       [IB_OPCODE_UC_SEND_FIRST]                     = 12 + 8,
+       [IB_OPCODE_UC_SEND_MIDDLE]                    = 12 + 8,
+       [IB_OPCODE_UC_SEND_LAST]                      = 12 + 8,
+       [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE]       = 12 + 8 + 4,
+       [IB_OPCODE_UC_SEND_ONLY]                      = 12 + 8,
+       [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 4,
+       [IB_OPCODE_UC_RDMA_WRITE_FIRST]               = 12 + 8 + 16,
+       [IB_OPCODE_UC_RDMA_WRITE_MIDDLE]              = 12 + 8,
+       [IB_OPCODE_UC_RDMA_WRITE_LAST]                = 12 + 8,
+       [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
+       [IB_OPCODE_UC_RDMA_WRITE_ONLY]                = 12 + 8 + 16,
+       [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20,
+       /* UD */
+       [IB_OPCODE_UD_SEND_ONLY]                      = 12 + 8 + 8,
+       [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 12
+};
+
+static const opcode_handler opcode_handler_tbl[256] = {
+       /* RC */
+       [IB_OPCODE_RC_SEND_FIRST]                     = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_SEND_MIDDLE]                    = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_SEND_LAST]                      = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE]       = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_SEND_ONLY]                      = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_RDMA_WRITE_FIRST]               = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_RDMA_WRITE_MIDDLE]              = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_RDMA_WRITE_LAST]                = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_RDMA_WRITE_ONLY]                = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_RDMA_READ_REQUEST]              = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST]       = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE]      = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST]        = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY]        = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_ACKNOWLEDGE]                    = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE]             = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_COMPARE_SWAP]                   = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_FETCH_ADD]                      = &hfi1_rc_rcv,
+       /* UC */
+       [IB_OPCODE_UC_SEND_FIRST]                     = &hfi1_uc_rcv,
+       [IB_OPCODE_UC_SEND_MIDDLE]                    = &hfi1_uc_rcv,
+       [IB_OPCODE_UC_SEND_LAST]                      = &hfi1_uc_rcv,
+       [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE]       = &hfi1_uc_rcv,
+       [IB_OPCODE_UC_SEND_ONLY]                      = &hfi1_uc_rcv,
+       [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_uc_rcv,
+       [IB_OPCODE_UC_RDMA_WRITE_FIRST]               = &hfi1_uc_rcv,
+       [IB_OPCODE_UC_RDMA_WRITE_MIDDLE]              = &hfi1_uc_rcv,
+       [IB_OPCODE_UC_RDMA_WRITE_LAST]                = &hfi1_uc_rcv,
+       [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_uc_rcv,
+       [IB_OPCODE_UC_RDMA_WRITE_ONLY]                = &hfi1_uc_rcv,
+       [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_uc_rcv,
+       /* UD */
+       [IB_OPCODE_UD_SEND_ONLY]                      = &hfi1_ud_rcv,
+       [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_ud_rcv,
+       /* CNP */
+       [IB_OPCODE_CNP]                               = &hfi1_cnp_rcv
+};
+
+/*
+ * System image GUID.
+ */
+__be64 ib_hfi1_sys_image_guid;
+
+/**
+ * hfi1_copy_sge - copy data to SGE memory
+ * @ss: the SGE state
+ * @data: the data to copy
+ * @length: the length of the data
+ * @release: boolean; if set, release the MR reference as each SGE completes
+ * @copy_last: do a separate copy of the last 8 bytes
+ */
+void hfi1_copy_sge(
+       struct rvt_sge_state *ss,
+       void *data, u32 length,
+       int release,
+       int copy_last)
+{
+       struct rvt_sge *sge = &ss->sge;
+       int in_last = 0;
+       int i;
+       int cacheless_copy = 0;
+
+       if (sge_copy_mode == COPY_CACHELESS) {
+               cacheless_copy = length >= PAGE_SIZE;
+       } else if (sge_copy_mode == COPY_ADAPTIVE) {
+               if (length >= PAGE_SIZE) {
+                       /*
+                        * NOTE: this *assumes*:
+                        * o The first vaddr is the dest.
+                        * o If multiple pages, then vaddr is sequential.
+                        */
+                       wss_insert(sge->vaddr);
+                       if (length >= (2 * PAGE_SIZE))
+                               wss_insert(sge->vaddr + PAGE_SIZE);
+
+                       cacheless_copy = wss_exceeds_threshold();
+               } else {
+                       wss_advance_clean_counter();
+               }
+       }
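+       /*
+        * When copy_last is set, the final 8 bytes are held back and copied
+        * separately, byte by byte, after the bulk copy so they land last.
+        */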
+       if (copy_last) {
+               if (length > 8) {
+                       length -= 8;
+               } else {
+                       copy_last = 0;
+                       in_last = 1;
+               }
+       }
+
+again:
+       while (length) {
+               u32 len = sge->length;
+
+               if (len > length)
+                       len = length;
+               if (len > sge->sge_length)
+                       len = sge->sge_length;
+               WARN_ON_ONCE(len == 0);
+               if (unlikely(in_last)) {
+                       /* enforce byte transfer ordering */
+                       for (i = 0; i < len; i++)
+                               ((u8 *)sge->vaddr)[i] = ((u8 *)data)[i];
+               } else if (cacheless_copy) {
+                       cacheless_memcpy(sge->vaddr, data, len);
+               } else {
+                       memcpy(sge->vaddr, data, len);
+               }
+               sge->vaddr += len;
+               sge->length -= len;
+               sge->sge_length -= len;
+               if (sge->sge_length == 0) {
+                       if (release)
+                               rvt_put_mr(sge->mr);
+                       if (--ss->num_sge)
+                               *sge = *ss->sg_list++;
+               } else if (sge->length == 0 && sge->mr->lkey) {
+                       if (++sge->n >= RVT_SEGSZ) {
+                               if (++sge->m >= sge->mr->mapsz)
+                                       break;
+                               sge->n = 0;
+                       }
+                       sge->vaddr =
+                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
+                       sge->length =
+                               sge->mr->map[sge->m]->segs[sge->n].length;
+               }
+               data += len;
+               length -= len;
+       }
+
+       if (copy_last) {
+               copy_last = 0;
+               in_last = 1;
+               length = 8;
+               goto again;
+       }
+}
+
+/**
+ * hfi1_skip_sge - skip over SGE memory
+ * @ss: the SGE state
+ * @length: the number of bytes to skip
+ * @release: boolean; if set, release the MR reference as each SGE completes
+ */
+void hfi1_skip_sge(struct rvt_sge_state *ss, u32 length, int release)
+{
+       struct rvt_sge *sge = &ss->sge;
+
+       while (length) {
+               u32 len = sge->length;
+
+               if (len > length)
+                       len = length;
+               if (len > sge->sge_length)
+                       len = sge->sge_length;
+               WARN_ON_ONCE(len == 0);
+               sge->vaddr += len;
+               sge->length -= len;
+               sge->sge_length -= len;
+               if (sge->sge_length == 0) {
+                       if (release)
+                               rvt_put_mr(sge->mr);
+                       if (--ss->num_sge)
+                               *sge = *ss->sg_list++;
+               } else if (sge->length == 0 && sge->mr->lkey) {
+                       if (++sge->n >= RVT_SEGSZ) {
+                               if (++sge->m >= sge->mr->mapsz)
+                                       break;
+                               sge->n = 0;
+                       }
+                       sge->vaddr =
+                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
+                       sge->length =
+                               sge->mr->map[sge->m]->segs[sge->n].length;
+               }
+               length -= len;
+       }
+}
+
+/*
+ * Make sure the QP is ready and able to accept the given opcode.
+ */
+static inline int qp_ok(int opcode, struct hfi1_packet *packet)
+{
+       struct hfi1_ibport *ibp;
+
+       if (!(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK))
+               goto dropit;
+       if (((opcode & RVT_OPCODE_QP_MASK) == packet->qp->allowed_ops) ||
+           (opcode == IB_OPCODE_CNP))
+               return 1;
+dropit:
+       ibp = &packet->rcd->ppd->ibport_data;
+       ibp->rvp.n_pkt_drops++;
+       return 0;
+}
+
+/**
+ * hfi1_ib_rcv - process an incoming packet
+ * @packet: data packet information
+ *
+ * This is called to process an incoming packet at interrupt level.
+ *
+ * Tlen is the length of the header + data + CRC in bytes.
+ */
+void hfi1_ib_rcv(struct hfi1_packet *packet)
+{
+       struct hfi1_ctxtdata *rcd = packet->rcd;
+       struct hfi1_ib_header *hdr = packet->hdr;
+       u32 tlen = packet->tlen;
+       struct hfi1_pportdata *ppd = rcd->ppd;
+       struct hfi1_ibport *ibp = &ppd->ibport_data;
+       struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi;
+       unsigned long flags;
+       u32 qp_num;
+       int lnh;
+       u8 opcode;
+       u16 lid;
+
+       /* Check for GRH */
+       lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+       if (lnh == HFI1_LRH_BTH) {
+               packet->ohdr = &hdr->u.oth;
+       } else if (lnh == HFI1_LRH_GRH) {
+               u32 vtf;
+
+               packet->ohdr = &hdr->u.l.oth;
+               if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR)
+                       goto drop;
+               vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow);
+               if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION)
+                       goto drop;
+               packet->rcv_flags |= HFI1_HAS_GRH;
+       } else {
+               goto drop;
+       }
+
+       trace_input_ibhdr(rcd->dd, hdr);
+
+       opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24);
+       inc_opstats(tlen, &rcd->opstats->stats[opcode]);
+
+       /* Get the destination QP number. */
+       qp_num = be32_to_cpu(packet->ohdr->bth[1]) & RVT_QPN_MASK;
+       lid = be16_to_cpu(hdr->lrh[1]);
+       if (unlikely((lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) &&
+                    (lid != be16_to_cpu(IB_LID_PERMISSIVE)))) {
+               struct rvt_mcast *mcast;
+               struct rvt_mcast_qp *p;
+
+               if (lnh != HFI1_LRH_GRH)
+                       goto drop;
+               mcast = rvt_mcast_find(&ibp->rvp, &hdr->u.l.grh.dgid);
+               if (!mcast)
+                       goto drop;
+               list_for_each_entry_rcu(p, &mcast->qp_list, list) {
+                       packet->qp = p->qp;
+                       spin_lock_irqsave(&packet->qp->r_lock, flags);
+                       if (likely((qp_ok(opcode, packet))))
+                               opcode_handler_tbl[opcode](packet);
+                       spin_unlock_irqrestore(&packet->qp->r_lock, flags);
+               }
+               /*
+                * Notify rvt_multicast_detach() if it is waiting for us
+                * to finish.
+                */
+               if (atomic_dec_return(&mcast->refcount) <= 1)
+                       wake_up(&mcast->wait);
+       } else {
+               rcu_read_lock();
+               packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
+               if (!packet->qp) {
+                       rcu_read_unlock();
+                       goto drop;
+               }
+               spin_lock_irqsave(&packet->qp->r_lock, flags);
+               if (likely((qp_ok(opcode, packet))))
+                       opcode_handler_tbl[opcode](packet);
+               spin_unlock_irqrestore(&packet->qp->r_lock, flags);
+               rcu_read_unlock();
+       }
+       return;
+
+drop:
+       ibp->rvp.n_pkt_drops++;
+}
+
+/*
+ * This is called from a timer to check for QPs
+ * which need kernel memory in order to send a packet.
+ */
+static void mem_timer(unsigned long data)
+{
+       struct hfi1_ibdev *dev = (struct hfi1_ibdev *)data;
+       struct list_head *list = &dev->memwait;
+       struct rvt_qp *qp = NULL;
+       struct iowait *wait;
+       unsigned long flags;
+       struct hfi1_qp_priv *priv;
+
+       write_seqlock_irqsave(&dev->iowait_lock, flags);
+       if (!list_empty(list)) {
+               wait = list_first_entry(list, struct iowait, list);
+               qp = iowait_to_qp(wait);
+               priv = qp->priv;
+               list_del_init(&priv->s_iowait.list);
+               /* refcount held until actual wake up */
+               if (!list_empty(list))
+                       mod_timer(&dev->mem_timer, jiffies + 1);
+       }
+       write_sequnlock_irqrestore(&dev->iowait_lock, flags);
+
+       if (qp)
+               hfi1_qp_wakeup(qp, RVT_S_WAIT_KMEM);
+}
+
+void update_sge(struct rvt_sge_state *ss, u32 length)
+{
+       struct rvt_sge *sge = &ss->sge;
+
+       sge->vaddr += length;
+       sge->length -= length;
+       sge->sge_length -= length;
+       if (sge->sge_length == 0) {
+               if (--ss->num_sge)
+                       *sge = *ss->sg_list++;
+       } else if (sge->length == 0 && sge->mr->lkey) {
+               if (++sge->n >= RVT_SEGSZ) {
+                       if (++sge->m >= sge->mr->mapsz)
+                               return;
+                       sge->n = 0;
+               }
+               sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
+               sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
+       }
+}
+
+/*
+ * This is called with progress side lock held.
+ */
+/* New API */
+static void verbs_sdma_complete(
+       struct sdma_txreq *cookie,
+       int status)
+{
+       struct verbs_txreq *tx =
+               container_of(cookie, struct verbs_txreq, txreq);
+       struct rvt_qp *qp = tx->qp;
+
+       spin_lock(&qp->s_lock);
+       if (tx->wqe) {
+               hfi1_send_complete(qp, tx->wqe, IB_WC_SUCCESS);
+       } else if (qp->ibqp.qp_type == IB_QPT_RC) {
+               struct hfi1_ib_header *hdr;
+
+               hdr = &tx->phdr.hdr;
+               hfi1_rc_send_complete(qp, hdr);
+       }
+       spin_unlock(&qp->s_lock);
+
+       hfi1_put_txreq(tx);
+}
+
+static int wait_kmem(struct hfi1_ibdev *dev,
+                    struct rvt_qp *qp,
+                    struct hfi1_pkt_state *ps)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+       unsigned long flags;
+       int ret = 0;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+       if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
+               write_seqlock(&dev->iowait_lock);
+               list_add_tail(&ps->s_txreq->txreq.list,
+                             &priv->s_iowait.tx_head);
+               if (list_empty(&priv->s_iowait.list)) {
+                       if (list_empty(&dev->memwait))
+                               mod_timer(&dev->mem_timer, jiffies + 1);
+                       qp->s_flags |= RVT_S_WAIT_KMEM;
+                       list_add_tail(&priv->s_iowait.list, &dev->memwait);
+                       trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM);
+                       atomic_inc(&qp->refcount);
+               }
+               write_sequnlock(&dev->iowait_lock);
+               qp->s_flags &= ~RVT_S_BUSY;
+               ret = -EBUSY;
+       }
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+
+       return ret;
+}
+
+/*
+ * This routine calls txadds for each sg entry.
+ *
+ * Add failures will revert the sge cursor
+ */
+static noinline int build_verbs_ulp_payload(
+       struct sdma_engine *sde,
+       struct rvt_sge_state *ss,
+       u32 length,
+       struct verbs_txreq *tx)
+{
+       struct rvt_sge *sg_list = ss->sg_list;
+       struct rvt_sge sge = ss->sge;
+       u8 num_sge = ss->num_sge;
+       u32 len;
+       int ret = 0;
+
+       while (length) {
+               len = ss->sge.length;
+               if (len > length)
+                       len = length;
+               if (len > ss->sge.sge_length)
+                       len = ss->sge.sge_length;
+               WARN_ON_ONCE(len == 0);
+               ret = sdma_txadd_kvaddr(
+                       sde->dd,
+                       &tx->txreq,
+                       ss->sge.vaddr,
+                       len);
+               if (ret)
+                       goto bail_txadd;
+               update_sge(ss, len);
+               length -= len;
+       }
+       return ret;
+bail_txadd:
+       /* unwind cursor */
+       ss->sge = sge;
+       ss->num_sge = num_sge;
+       ss->sg_list = sg_list;
+       return ret;
+}
+
+/*
+ * Build the DMA descriptors needed to send length bytes of data.
+ *
+ * NOTE: DMA mapping is held in the tx until completed in the ring or
+ *       the tx desc is freed without having been submitted to the ring
+ *
+ * This routine ensures all the helper routine calls succeed.
+ */
+/* New API */
+static int build_verbs_tx_desc(
+       struct sdma_engine *sde,
+       struct rvt_sge_state *ss,
+       u32 length,
+       struct verbs_txreq *tx,
+       struct ahg_ib_header *ahdr,
+       u64 pbc)
+{
+       int ret = 0;
+       struct hfi1_pio_header *phdr = &tx->phdr;
+       u16 hdrbytes = tx->hdr_dwords << 2;
+
+       if (!ahdr->ahgcount) {
+               ret = sdma_txinit_ahg(
+                       &tx->txreq,
+                       ahdr->tx_flags,
+                       hdrbytes + length,
+                       ahdr->ahgidx,
+                       0,
+                       NULL,
+                       0,
+                       verbs_sdma_complete);
+               if (ret)
+                       goto bail_txadd;
+               phdr->pbc = cpu_to_le64(pbc);
+               ret = sdma_txadd_kvaddr(
+                       sde->dd,
+                       &tx->txreq,
+                       phdr,
+                       hdrbytes);
+               if (ret)
+                       goto bail_txadd;
+       } else {
+               ret = sdma_txinit_ahg(
+                       &tx->txreq,
+                       ahdr->tx_flags,
+                       length,
+                       ahdr->ahgidx,
+                       ahdr->ahgcount,
+                       ahdr->ahgdesc,
+                       hdrbytes,
+                       verbs_sdma_complete);
+               if (ret)
+                       goto bail_txadd;
+       }
+
+       /* add the ulp payload - if any.  ss can be NULL for acks */
+       if (ss)
+               ret = build_verbs_ulp_payload(sde, ss, length, tx);
+bail_txadd:
+       return ret;
+}
+
+int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+                       u64 pbc)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+       struct ahg_ib_header *ahdr = priv->s_hdr;
+       u32 hdrwords = qp->s_hdrwords;
+       struct rvt_sge_state *ss = qp->s_cur_sge;
+       u32 len = qp->s_cur_size;
+       u32 plen = hdrwords + ((len + 3) >> 2) + 2; /* includes pbc */
+       struct hfi1_ibdev *dev = ps->dev;
+       struct hfi1_pportdata *ppd = ps->ppd;
+       struct verbs_txreq *tx;
+       u64 pbc_flags = 0;
+       u8 sc5 = priv->s_sc;
+
+       int ret;
+
+       tx = ps->s_txreq;
+       if (!sdma_txreq_built(&tx->txreq)) {
+               if (likely(pbc == 0)) {
+                       u32 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5);
+                       /* No vl15 here */
+                       /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
+                       pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
+
+                       pbc = create_pbc(ppd,
+                                        pbc_flags,
+                                        qp->srate_mbps,
+                                        vl,
+                                        plen);
+               }
+               tx->wqe = qp->s_wqe;
+               ret = build_verbs_tx_desc(tx->sde, ss, len, tx, ahdr, pbc);
+               if (unlikely(ret))
+                       goto bail_build;
+       }
+       ret =  sdma_send_txreq(tx->sde, &priv->s_iowait, &tx->txreq);
+       if (unlikely(ret < 0)) {
+               if (ret == -ECOMM)
+                       goto bail_ecomm;
+               return ret;
+       }
+       trace_sdma_output_ibhdr(dd_from_ibdev(qp->ibqp.device),
+                               &ps->s_txreq->phdr.hdr);
+       return ret;
+
+bail_ecomm:
+       /* The current one got "sent" */
+       return 0;
+bail_build:
+       ret = wait_kmem(dev, qp, ps);
+       if (!ret) {
+               /* free txreq - bad state */
+               hfi1_put_txreq(ps->s_txreq);
+               ps->s_txreq = NULL;
+       }
+       return ret;
+}
+
+/*
+ * If we are now in the error state, return zero to flush the
+ * send work request.
+ */
+static int pio_wait(struct rvt_qp *qp,
+                   struct send_context *sc,
+                   struct hfi1_pkt_state *ps,
+                   u32 flag)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+       struct hfi1_devdata *dd = sc->dd;
+       struct hfi1_ibdev *dev = &dd->verbs_dev;
+       unsigned long flags;
+       int ret = 0;
+
+       /*
+        * Note that as soon as want_buffer() is called and
+        * possibly before it returns, sc_piobufavail()
+        * could be called. Therefore, put QP on the I/O wait list before
+        * enabling the PIO avail interrupt.
+        */
+       spin_lock_irqsave(&qp->s_lock, flags);
+       if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
+               write_seqlock(&dev->iowait_lock);
+               list_add_tail(&ps->s_txreq->txreq.list,
+                             &priv->s_iowait.tx_head);
+               if (list_empty(&priv->s_iowait.list)) {
+                       struct hfi1_ibdev *dev = &dd->verbs_dev;
+                       int was_empty;
+
+                       dev->n_piowait += !!(flag & RVT_S_WAIT_PIO);
+                       dev->n_piodrain += !!(flag & RVT_S_WAIT_PIO_DRAIN);
+                       qp->s_flags |= flag;
+                       was_empty = list_empty(&sc->piowait);
+                       list_add_tail(&priv->s_iowait.list, &sc->piowait);
+                       trace_hfi1_qpsleep(qp, RVT_S_WAIT_PIO);
+                       atomic_inc(&qp->refcount);
+                       /* counting: only call wantpiobuf_intr if first user */
+                       if (was_empty)
+                               hfi1_sc_wantpiobuf_intr(sc, 1);
+               }
+               write_sequnlock(&dev->iowait_lock);
+               qp->s_flags &= ~RVT_S_BUSY;
+               ret = -EBUSY;
+       }
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+       return ret;
+}
+
+static void verbs_pio_complete(void *arg, int code)
+{
+       struct rvt_qp *qp = (struct rvt_qp *)arg;
+       struct hfi1_qp_priv *priv = qp->priv;
+
+       if (iowait_pio_dec(&priv->s_iowait))
+               iowait_drain_wakeup(&priv->s_iowait);
+}
+
+int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+                       u64 pbc)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+       u32 hdrwords = qp->s_hdrwords;
+       struct rvt_sge_state *ss = qp->s_cur_sge;
+       u32 len = qp->s_cur_size;
+       u32 dwords = (len + 3) >> 2;
+       u32 plen = hdrwords + dwords + 2; /* includes pbc */
+       struct hfi1_pportdata *ppd = ps->ppd;
+       u32 *hdr = (u32 *)&ps->s_txreq->phdr.hdr;
+       u64 pbc_flags = 0;
+       u8 sc5;
+       unsigned long flags = 0;
+       struct send_context *sc;
+       struct pio_buf *pbuf;
+       int wc_status = IB_WC_SUCCESS;
+       int ret = 0;
+       pio_release_cb cb = NULL;
+
+       /* only RC/UC use complete */
+       switch (qp->ibqp.qp_type) {
+       case IB_QPT_RC:
+       case IB_QPT_UC:
+               cb = verbs_pio_complete;
+               break;
+       default:
+               break;
+       }
+
+       /* vl15 special case taken care of in ud.c */
+       sc5 = priv->s_sc;
+       sc = ps->s_txreq->psc;
+
+       if (likely(pbc == 0)) {
+               u8 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5);
+               /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
+               pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
+               pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
+       }
+       if (cb)
+               iowait_pio_inc(&priv->s_iowait);
+       pbuf = sc_buffer_alloc(sc, plen, cb, qp);
+       if (unlikely(!pbuf)) {
+               if (cb)
+                       verbs_pio_complete(qp, 0);
+               if (ppd->host_link_state != HLS_UP_ACTIVE) {
+                       /*
+                        * If we have filled the PIO buffers to capacity and are
+                        * not in an active state, this request is not going to
+                        * go out, so just complete it with an error or else a
+                        * ULP or the core may be stuck waiting.
+                        */
+                       hfi1_cdbg(
+                               PIO,
+                               "alloc failed. state not active, completing");
+                       wc_status = IB_WC_GENERAL_ERR;
+                       goto pio_bail;
+               } else {
+                       /*
+                        * This is a normal occurrence. The PIO buffers are
+                        * full, but we are still in an active state and may
+                        * keep sending, so queue the request.
+                        */
+                       hfi1_cdbg(PIO, "alloc failed. state active, queuing");
+                       ret = pio_wait(qp, sc, ps, RVT_S_WAIT_PIO);
+                       if (!ret)
+                               /* txreq not queued - free */
+                               goto bail;
+                       /* tx consumed in wait */
+                       return ret;
+               }
+       }
+
+       if (len == 0) {
+               pio_copy(ppd->dd, pbuf, pbc, hdr, hdrwords);
+       } else {
+               if (ss) {
+                       seg_pio_copy_start(pbuf, pbc, hdr, hdrwords * 4);
+                       while (len) {
+                               void *addr = ss->sge.vaddr;
+                               u32 slen = ss->sge.length;
+
+                               if (slen > len)
+                                       slen = len;
+                               update_sge(ss, slen);
+                               seg_pio_copy_mid(pbuf, addr, slen);
+                               len -= slen;
+                       }
+                       seg_pio_copy_end(pbuf);
+               }
+       }
+
+       trace_pio_output_ibhdr(dd_from_ibdev(qp->ibqp.device),
+                              &ps->s_txreq->phdr.hdr);
+
+pio_bail:
+       if (qp->s_wqe) {
+               spin_lock_irqsave(&qp->s_lock, flags);
+               hfi1_send_complete(qp, qp->s_wqe, wc_status);
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+       } else if (qp->ibqp.qp_type == IB_QPT_RC) {
+               spin_lock_irqsave(&qp->s_lock, flags);
+               hfi1_rc_send_complete(qp, &ps->s_txreq->phdr.hdr);
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+       }
+
+       ret = 0;
+
+bail:
+       hfi1_put_txreq(ps->s_txreq);
+       return ret;
+}
+
+/*
+ * egress_pkey_matches_entry - return 1 if the pkey matches ent (ent
+ * being an entry from the partition key table), return 0
+ * otherwise. Use the matching criteria for egress partition keys
+ * specified in the OPAv1 spec., section 9.1l.7.
+ */
+static inline int egress_pkey_matches_entry(u16 pkey, u16 ent)
+{
+       u16 mkey = pkey & PKEY_LOW_15_MASK;
+       u16 mentry = ent & PKEY_LOW_15_MASK;
+
+       if (mkey == mentry) {
+               /*
+                * If pkey[15] is set (full partition member),
+                * is bit 15 in the corresponding table element
+                * clear (limited member)?
+                */
+               if (pkey & PKEY_MEMBER_MASK)
+                       return !!(ent & PKEY_MEMBER_MASK);
+               return 1;
+       }
+       return 0;
+}
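+
+/*
+ * For illustration only (made-up values, not from any real partition
+ * table): a full-member pkey such as 0x8001 matches only a full-member
+ * entry 0x8001, while a limited-member pkey 0x0001 matches either 0x0001
+ * or 0x8001.
+ */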
+
+/**
+ * egress_pkey_check - check P_KEY of a packet
+ * @ppd:    Physical IB port data
+ * @lrh: Local route header
+ * @bth: Base transport header
+ * @sc5:    SC for packet
+ * @s_pkey_index: used as a lookup optimization for kernel contexts only.
+ * A negative value means a user context is calling this function.
+ *
+ * It checks whether the header's pkey is valid.
+ *
+ * Return: 0 on success, 1 otherwise
+ */
+int egress_pkey_check(struct hfi1_pportdata *ppd, __be16 *lrh, __be32 *bth,
+                     u8 sc5, int8_t s_pkey_index)
+{
+       struct hfi1_devdata *dd;
+       int i;
+       u16 pkey;
+       int is_user_ctxt_mechanism = (s_pkey_index < 0);
+
+       if (!(ppd->part_enforce & HFI1_PART_ENFORCE_OUT))
+               return 0;
+
+       pkey = (u16)be32_to_cpu(bth[0]);
+
+       /* If SC15, pkey[0:14] must be 0x7fff */
+       if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
+               goto bad;
+
+       /* Is the pkey = 0x0, or 0x8000? */
+       if ((pkey & PKEY_LOW_15_MASK) == 0)
+               goto bad;
+
+       /*
+        * For the kernel contexts only, if a qp is passed into the function,
+        * the most likely matching pkey has index qp->s_pkey_index
+        */
+       if (!is_user_ctxt_mechanism &&
+           egress_pkey_matches_entry(pkey, ppd->pkeys[s_pkey_index])) {
+               return 0;
+       }
+
+       for (i = 0; i < MAX_PKEY_VALUES; i++) {
+               if (egress_pkey_matches_entry(pkey, ppd->pkeys[i]))
+                       return 0;
+       }
+bad:
+       /*
+        * For the user-context mechanism, the P_KEY check would only happen
+        * once per SDMA request, not once per packet.  Therefore, there's no
+        * need to increment the counter for the user-context mechanism.
+        */
+       if (!is_user_ctxt_mechanism) {
+               incr_cntr64(&ppd->port_xmit_constraint_errors);
+               dd = ppd->dd;
+               if (!(dd->err_info_xmit_constraint.status &
+                     OPA_EI_STATUS_SMASK)) {
+                       u16 slid = be16_to_cpu(lrh[3]);
+
+                       dd->err_info_xmit_constraint.status |=
+                               OPA_EI_STATUS_SMASK;
+                       dd->err_info_xmit_constraint.slid = slid;
+                       dd->err_info_xmit_constraint.pkey = pkey;
+               }
+       }
+       return 1;
+}
+
+/**
+ * get_send_routine - choose an egress routine
+ *
+ * Choose an egress routine based on QP type
+ * and size
+ */
+static inline send_routine get_send_routine(struct rvt_qp *qp,
+                                           struct verbs_txreq *tx)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+       struct hfi1_qp_priv *priv = qp->priv;
+       struct hfi1_ib_header *h = &tx->phdr.hdr;
+
+       if (unlikely(!(dd->flags & HFI1_HAS_SEND_DMA)))
+               return dd->process_pio_send;
+       switch (qp->ibqp.qp_type) {
+       case IB_QPT_SMI:
+               return dd->process_pio_send;
+       case IB_QPT_GSI:
+       case IB_QPT_UD:
+               break;
+       case IB_QPT_RC:
+               if (piothreshold &&
+                   qp->s_cur_size <= min(piothreshold, qp->pmtu) &&
+                   (BIT(get_opcode(h) & 0x1f) & rc_only_opcode) &&
+                   iowait_sdma_pending(&priv->s_iowait) == 0 &&
+                   !sdma_txreq_built(&tx->txreq))
+                       return dd->process_pio_send;
+               break;
+       case IB_QPT_UC:
+               if (piothreshold &&
+                   qp->s_cur_size <= min(piothreshold, qp->pmtu) &&
+                   (BIT(get_opcode(h) & 0x1f) & uc_only_opcode) &&
+                   iowait_sdma_pending(&priv->s_iowait) == 0 &&
+                   !sdma_txreq_built(&tx->txreq))
+                       return dd->process_pio_send;
+               break;
+       default:
+               break;
+       }
+       return dd->process_dma_send;
+}
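+
+/*
+ * Rough illustration with the default piothreshold of 256: SMI always
+ * goes out via PIO, and a small RC or UC packet (payload <= 256 bytes and
+ * <= the QP's PMTU, opcode in the matching *_only_opcode mask, no SDMA
+ * work pending, txreq not yet built) does too; when the device has send
+ * DMA, GSI, UD and larger RC/UC packets fall through to the SDMA routine.
+ */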
+
+/**
+ * hfi1_verbs_send - send a packet
+ * @qp: the QP to send on
+ * @ps: the state of the packet to send
+ *
+ * Return zero if packet is sent or queued OK.
+ * Return non-zero and clear qp->s_flags RVT_S_BUSY otherwise.
+ */
+int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+       struct hfi1_qp_priv *priv = qp->priv;
+       struct hfi1_other_headers *ohdr;
+       struct hfi1_ib_header *hdr;
+       send_routine sr;
+       int ret;
+       u8 lnh;
+
+       hdr = &ps->s_txreq->phdr.hdr;
+       /* locate the pkey within the headers */
+       lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+       if (lnh == HFI1_LRH_GRH)
+               ohdr = &hdr->u.l.oth;
+       else
+               ohdr = &hdr->u.oth;
+
+       sr = get_send_routine(qp, ps->s_txreq);
+       ret = egress_pkey_check(dd->pport,
+                               hdr->lrh,
+                               ohdr->bth,
+                               priv->s_sc,
+                               qp->s_pkey_index);
+       if (unlikely(ret)) {
+               /*
+                * The value we are returning here does not get propagated to
+                * the verbs caller. Thus we need to complete the request with
+                * error otherwise the caller could be sitting waiting on the
+                * completion event. Only do this for PIO. SDMA has its own
+                * mechanism for handling the errors. So for SDMA we can just
+                * return.
+                */
+               if (sr == dd->process_pio_send) {
+                       unsigned long flags;
+
+                       hfi1_cdbg(PIO, "%s() Failed. Completing with err",
+                                 __func__);
+                       spin_lock_irqsave(&qp->s_lock, flags);
+                       hfi1_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR);
+                       spin_unlock_irqrestore(&qp->s_lock, flags);
+               }
+               return -EINVAL;
+       }
+       if (sr == dd->process_dma_send && iowait_pio_pending(&priv->s_iowait))
+               return pio_wait(qp,
+                               ps->s_txreq->psc,
+                               ps,
+                               RVT_S_WAIT_PIO_DRAIN);
+       return sr(qp, ps, 0);
+}
+
+/**
+ * hfi1_fill_device_attr - Fill in rvt dev info device attributes.
+ * @dd: the device data structure
+ */
+static void hfi1_fill_device_attr(struct hfi1_devdata *dd)
+{
+       struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
+
+       memset(&rdi->dparms.props, 0, sizeof(rdi->dparms.props));
+
+       rdi->dparms.props.device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
+                       IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
+                       IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
+                       IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE;
+       rdi->dparms.props.page_size_cap = PAGE_SIZE;
+       rdi->dparms.props.vendor_id = dd->oui1 << 16 | dd->oui2 << 8 | dd->oui3;
+       rdi->dparms.props.vendor_part_id = dd->pcidev->device;
+       rdi->dparms.props.hw_ver = dd->minrev;
+       rdi->dparms.props.sys_image_guid = ib_hfi1_sys_image_guid;
+       rdi->dparms.props.max_mr_size = ~0ULL;
+       rdi->dparms.props.max_qp = hfi1_max_qps;
+       rdi->dparms.props.max_qp_wr = hfi1_max_qp_wrs;
+       rdi->dparms.props.max_sge = hfi1_max_sges;
+       rdi->dparms.props.max_sge_rd = hfi1_max_sges;
+       rdi->dparms.props.max_cq = hfi1_max_cqs;
+       rdi->dparms.props.max_ah = hfi1_max_ahs;
+       rdi->dparms.props.max_cqe = hfi1_max_cqes;
+       rdi->dparms.props.max_mr = rdi->lkey_table.max;
+       rdi->dparms.props.max_fmr = rdi->lkey_table.max;
+       rdi->dparms.props.max_map_per_fmr = 32767;
+       rdi->dparms.props.max_pd = hfi1_max_pds;
+       rdi->dparms.props.max_qp_rd_atom = HFI1_MAX_RDMA_ATOMIC;
+       rdi->dparms.props.max_qp_init_rd_atom = 255;
+       rdi->dparms.props.max_srq = hfi1_max_srqs;
+       rdi->dparms.props.max_srq_wr = hfi1_max_srq_wrs;
+       rdi->dparms.props.max_srq_sge = hfi1_max_srq_sges;
+       rdi->dparms.props.atomic_cap = IB_ATOMIC_GLOB;
+       rdi->dparms.props.max_pkeys = hfi1_get_npkeys(dd);
+       rdi->dparms.props.max_mcast_grp = hfi1_max_mcast_grps;
+       rdi->dparms.props.max_mcast_qp_attach = hfi1_max_mcast_qp_attached;
+       rdi->dparms.props.max_total_mcast_qp_attach =
+                                       rdi->dparms.props.max_mcast_qp_attach *
+                                       rdi->dparms.props.max_mcast_grp;
+}
+
+static inline u16 opa_speed_to_ib(u16 in)
+{
+       u16 out = 0;
+
+       if (in & OPA_LINK_SPEED_25G)
+               out |= IB_SPEED_EDR;
+       if (in & OPA_LINK_SPEED_12_5G)
+               out |= IB_SPEED_FDR;
+
+       return out;
+}
+
+/*
+ * Convert a single OPA link width (no multiple flags) to an IB value.
+ * A zero OPA link width means link down, which means the IB width value
+ * is a don't care.
+ */
+static inline u16 opa_width_to_ib(u16 in)
+{
+       switch (in) {
+       case OPA_LINK_WIDTH_1X:
+       /* map 2x and 3x to 1x as they don't exist in IB */
+       case OPA_LINK_WIDTH_2X:
+       case OPA_LINK_WIDTH_3X:
+               return IB_WIDTH_1X;
+       default: /* link down or unknown, return our largest width */
+       case OPA_LINK_WIDTH_4X:
+               return IB_WIDTH_4X;
+       }
+}
+
+static int query_port(struct rvt_dev_info *rdi, u8 port_num,
+                     struct ib_port_attr *props)
+{
+       struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
+       struct hfi1_devdata *dd = dd_from_dev(verbs_dev);
+       struct hfi1_pportdata *ppd = &dd->pport[port_num - 1];
+       u16 lid = ppd->lid;
+
+       props->lid = lid ? lid : 0;
+       props->lmc = ppd->lmc;
+       /* OPA logical states match IB logical states */
+       props->state = driver_lstate(ppd);
+       props->phys_state = hfi1_ibphys_portstate(ppd);
+       props->gid_tbl_len = HFI1_GUIDS_PER_PORT;
+       props->active_width = (u8)opa_width_to_ib(ppd->link_width_active);
+       /* see rate_show() in ib core/sysfs.c */
+       props->active_speed = (u8)opa_speed_to_ib(ppd->link_speed_active);
+       props->max_vl_num = ppd->vls_supported;
+
+       /* Once we are a "first class" citizen and have added the OPA MTUs to
+        * the core we can advertise the larger MTU enum to the ULPs, for now
+        * advertise only 4K.
+        *
+        * Those applications which are either OPA aware or pass the MTU enum
+        * from the Path Records to us will get the new 8k MTU.  Those that
+        * attempt to process the MTU enum may fail in various ways.
+        */
+       props->max_mtu = mtu_to_enum((!valid_ib_mtu(hfi1_max_mtu) ?
+                                     4096 : hfi1_max_mtu), IB_MTU_4096);
+       props->active_mtu = !valid_ib_mtu(ppd->ibmtu) ? props->max_mtu :
+               mtu_to_enum(ppd->ibmtu, IB_MTU_2048);
+
+       return 0;
+}
+
+static int modify_device(struct ib_device *device,
+                        int device_modify_mask,
+                        struct ib_device_modify *device_modify)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(device);
+       unsigned i;
+       int ret;
+
+       if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID |
+                                  IB_DEVICE_MODIFY_NODE_DESC)) {
+               ret = -EOPNOTSUPP;
+               goto bail;
+       }
+
+       if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC) {
+               memcpy(device->node_desc, device_modify->node_desc, 64);
+               for (i = 0; i < dd->num_pports; i++) {
+                       struct hfi1_ibport *ibp = &dd->pport[i].ibport_data;
+
+                       hfi1_node_desc_chg(ibp);
+               }
+       }
+
+       if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) {
+               ib_hfi1_sys_image_guid =
+                       cpu_to_be64(device_modify->sys_image_guid);
+               for (i = 0; i < dd->num_pports; i++) {
+                       struct hfi1_ibport *ibp = &dd->pport[i].ibport_data;
+
+                       hfi1_sys_guid_chg(ibp);
+               }
+       }
+
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+static int shut_down_port(struct rvt_dev_info *rdi, u8 port_num)
+{
+       struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
+       struct hfi1_devdata *dd = dd_from_dev(verbs_dev);
+       struct hfi1_pportdata *ppd = &dd->pport[port_num - 1];
+       int ret;
+
+       set_link_down_reason(ppd, OPA_LINKDOWN_REASON_UNKNOWN, 0,
+                            OPA_LINKDOWN_REASON_UNKNOWN);
+       ret = set_link_state(ppd, HLS_DN_DOWNDEF);
+       return ret;
+}
+
+static int hfi1_get_guid_be(struct rvt_dev_info *rdi, struct rvt_ibport *rvp,
+                           int guid_index, __be64 *guid)
+{
+       struct hfi1_ibport *ibp = container_of(rvp, struct hfi1_ibport, rvp);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+       if (guid_index == 0)
+               *guid = cpu_to_be64(ppd->guid);
+       else if (guid_index < HFI1_GUIDS_PER_PORT)
+               *guid = ibp->guids[guid_index - 1];
+       else
+               return -EINVAL;
+
+       return 0;
+}
+
+/*
+ * convert ah port,sl to sc
+ */
+u8 ah_to_sc(struct ib_device *ibdev, struct ib_ah_attr *ah)
+{
+       struct hfi1_ibport *ibp = to_iport(ibdev, ah->port_num);
+
+       return ibp->sl_to_sc[ah->sl];
+}
+
+static int hfi1_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr)
+{
+       struct hfi1_ibport *ibp;
+       struct hfi1_pportdata *ppd;
+       struct hfi1_devdata *dd;
+       u8 sc5;
+
+       /* test the mapping for validity */
+       ibp = to_iport(ibdev, ah_attr->port_num);
+       ppd = ppd_from_ibp(ibp);
+       sc5 = ibp->sl_to_sc[ah_attr->sl];
+       dd = dd_from_ppd(ppd);
+       if (sc_to_vlt(dd, sc5) > num_vls && sc_to_vlt(dd, sc5) != 0xf)
+               return -EINVAL;
+       return 0;
+}
+
+static void hfi1_notify_new_ah(struct ib_device *ibdev,
+                              struct ib_ah_attr *ah_attr,
+                              struct rvt_ah *ah)
+{
+       struct hfi1_ibport *ibp;
+       struct hfi1_pportdata *ppd;
+       struct hfi1_devdata *dd;
+       u8 sc5;
+
+       /*
+        * Do not trust reading anything from rvt_ah at this point as it is not
+        * done being set up. We can, however, modify things which we need to set.
+        */
+
+       ibp = to_iport(ibdev, ah_attr->port_num);
+       ppd = ppd_from_ibp(ibp);
+       sc5 = ibp->sl_to_sc[ah->attr.sl];
+       dd = dd_from_ppd(ppd);
+       ah->vl = sc_to_vlt(dd, sc5);
+       if (ah->vl < num_vls || ah->vl == 15)
+               ah->log_pmtu = ilog2(dd->vld[ah->vl].mtu);
+}
+
+struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u16 dlid)
+{
+       struct ib_ah_attr attr;
+       struct ib_ah *ah = ERR_PTR(-EINVAL);
+       struct rvt_qp *qp0;
+
+       memset(&attr, 0, sizeof(attr));
+       attr.dlid = dlid;
+       attr.port_num = ppd_from_ibp(ibp)->port;
+       rcu_read_lock();
+       qp0 = rcu_dereference(ibp->rvp.qp[0]);
+       if (qp0)
+               ah = ib_create_ah(qp0->ibqp.pd, &attr);
+       rcu_read_unlock();
+       return ah;
+}
+
+/**
+ * hfi1_get_npkeys - return the size of the PKEY table for context 0
+ * @dd: the hfi1_ib device
+ */
+unsigned hfi1_get_npkeys(struct hfi1_devdata *dd)
+{
+       return ARRAY_SIZE(dd->pport[0].pkeys);
+}
+
+static void init_ibport(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_ibport *ibp = &ppd->ibport_data;
+       size_t sz = ARRAY_SIZE(ibp->sl_to_sc);
+       int i;
+
+       for (i = 0; i < sz; i++) {
+               ibp->sl_to_sc[i] = i;
+               ibp->sc_to_sl[i] = i;
+       }
+
+       spin_lock_init(&ibp->rvp.lock);
+       /* Set the prefix to the default value (see ch. 4.1.1) */
+       ibp->rvp.gid_prefix = IB_DEFAULT_GID_PREFIX;
+       ibp->rvp.sm_lid = 0;
+       /* Below should only set bits defined in OPA PortInfo.CapabilityMask */
+       ibp->rvp.port_cap_flags = IB_PORT_AUTO_MIGR_SUP |
+               IB_PORT_CAP_MASK_NOTICE_SUP;
+       ibp->rvp.pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA;
+       ibp->rvp.pma_counter_select[1] = IB_PMA_PORT_RCV_DATA;
+       ibp->rvp.pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS;
+       ibp->rvp.pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS;
+       ibp->rvp.pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT;
+
+       RCU_INIT_POINTER(ibp->rvp.qp[0], NULL);
+       RCU_INIT_POINTER(ibp->rvp.qp[1], NULL);
+}
+
+/**
+ * hfi1_register_ib_device - register our device with the infiniband core
+ * @dd: the device data structure
+ * Return 0 if successful, errno if unsuccessful.
+ */
+int hfi1_register_ib_device(struct hfi1_devdata *dd)
+{
+       struct hfi1_ibdev *dev = &dd->verbs_dev;
+       struct ib_device *ibdev = &dev->rdi.ibdev;
+       struct hfi1_pportdata *ppd = dd->pport;
+       unsigned i;
+       int ret;
+       size_t lcpysz = IB_DEVICE_NAME_MAX;
+
+       for (i = 0; i < dd->num_pports; i++)
+               init_ibport(ppd + i);
+
+       /* Only need to initialize non-zero fields. */
+
+       setup_timer(&dev->mem_timer, mem_timer, (unsigned long)dev);
+
+       seqlock_init(&dev->iowait_lock);
+       INIT_LIST_HEAD(&dev->txwait);
+       INIT_LIST_HEAD(&dev->memwait);
+
+       ret = verbs_txreq_init(dev);
+       if (ret)
+               goto err_verbs_txreq;
+
+       /*
+        * The system image GUID is supposed to be the same for all
+        * HFIs in a single system but since there can be other
+        * device types in the system, we can't be sure this is unique.
+        */
+       if (!ib_hfi1_sys_image_guid)
+               ib_hfi1_sys_image_guid = cpu_to_be64(ppd->guid);
+       lcpysz = strlcpy(ibdev->name, class_name(), lcpysz);
+       strlcpy(ibdev->name + lcpysz, "_%d", IB_DEVICE_NAME_MAX - lcpysz);
+       ibdev->owner = THIS_MODULE;
+       ibdev->node_guid = cpu_to_be64(ppd->guid);
+       ibdev->phys_port_cnt = dd->num_pports;
+       ibdev->dma_device = &dd->pcidev->dev;
+       ibdev->modify_device = modify_device;
+
+       /* keep process_mad in the driver */
+       ibdev->process_mad = hfi1_process_mad;
+
+       strncpy(ibdev->node_desc, init_utsname()->nodename,
+               sizeof(ibdev->node_desc));
+
+       /*
+        * Fill in rvt info object.
+        */
+       dd->verbs_dev.rdi.driver_f.port_callback = hfi1_create_port_files;
+       dd->verbs_dev.rdi.driver_f.get_card_name = get_card_name;
+       dd->verbs_dev.rdi.driver_f.get_pci_dev = get_pci_dev;
+       dd->verbs_dev.rdi.driver_f.check_ah = hfi1_check_ah;
+       dd->verbs_dev.rdi.driver_f.notify_new_ah = hfi1_notify_new_ah;
+       dd->verbs_dev.rdi.driver_f.get_guid_be = hfi1_get_guid_be;
+       dd->verbs_dev.rdi.driver_f.query_port_state = query_port;
+       dd->verbs_dev.rdi.driver_f.shut_down_port = shut_down_port;
+       dd->verbs_dev.rdi.driver_f.cap_mask_chg = hfi1_cap_mask_chg;
+       /*
+        * Fill in rvt info device attributes.
+        */
+       hfi1_fill_device_attr(dd);
+
+       /* queue pair */
+       dd->verbs_dev.rdi.dparms.qp_table_size = hfi1_qp_table_size;
+       dd->verbs_dev.rdi.dparms.qpn_start = 0;
+       dd->verbs_dev.rdi.dparms.qpn_inc = 1;
+       dd->verbs_dev.rdi.dparms.qos_shift = dd->qos_shift;
+       dd->verbs_dev.rdi.dparms.qpn_res_start = kdeth_qp << 16;
+       dd->verbs_dev.rdi.dparms.qpn_res_end =
+       dd->verbs_dev.rdi.dparms.qpn_res_start + 65535;
+       dd->verbs_dev.rdi.dparms.max_rdma_atomic = HFI1_MAX_RDMA_ATOMIC;
+       dd->verbs_dev.rdi.dparms.psn_mask = PSN_MASK;
+       dd->verbs_dev.rdi.dparms.psn_shift = PSN_SHIFT;
+       dd->verbs_dev.rdi.dparms.psn_modify_mask = PSN_MODIFY_MASK;
+       dd->verbs_dev.rdi.dparms.core_cap_flags = RDMA_CORE_PORT_INTEL_OPA;
+       dd->verbs_dev.rdi.dparms.max_mad_size = OPA_MGMT_MAD_SIZE;
+
+       dd->verbs_dev.rdi.driver_f.qp_priv_alloc = qp_priv_alloc;
+       dd->verbs_dev.rdi.driver_f.qp_priv_free = qp_priv_free;
+       dd->verbs_dev.rdi.driver_f.free_all_qps = free_all_qps;
+       dd->verbs_dev.rdi.driver_f.notify_qp_reset = notify_qp_reset;
+       dd->verbs_dev.rdi.driver_f.do_send = hfi1_do_send;
+       dd->verbs_dev.rdi.driver_f.schedule_send = hfi1_schedule_send;
+       dd->verbs_dev.rdi.driver_f.schedule_send_no_lock = _hfi1_schedule_send;
+       dd->verbs_dev.rdi.driver_f.get_pmtu_from_attr = get_pmtu_from_attr;
+       dd->verbs_dev.rdi.driver_f.notify_error_qp = notify_error_qp;
+       dd->verbs_dev.rdi.driver_f.flush_qp_waiters = flush_qp_waiters;
+       dd->verbs_dev.rdi.driver_f.stop_send_queue = stop_send_queue;
+       dd->verbs_dev.rdi.driver_f.quiesce_qp = quiesce_qp;
+       dd->verbs_dev.rdi.driver_f.notify_error_qp = notify_error_qp;
+       dd->verbs_dev.rdi.driver_f.mtu_from_qp = mtu_from_qp;
+       dd->verbs_dev.rdi.driver_f.mtu_to_path_mtu = mtu_to_path_mtu;
+       dd->verbs_dev.rdi.driver_f.check_modify_qp = hfi1_check_modify_qp;
+       dd->verbs_dev.rdi.driver_f.modify_qp = hfi1_modify_qp;
+       dd->verbs_dev.rdi.driver_f.check_send_wqe = hfi1_check_send_wqe;
+
+       /* completion queue */
+       snprintf(dd->verbs_dev.rdi.dparms.cq_name,
+                sizeof(dd->verbs_dev.rdi.dparms.cq_name),
+                "hfi1_cq%d", dd->unit);
+       dd->verbs_dev.rdi.dparms.node = dd->node;
+
+       /* misc settings */
+       dd->verbs_dev.rdi.flags = 0; /* Let rdmavt handle it all */
+       dd->verbs_dev.rdi.dparms.lkey_table_size = hfi1_lkey_table_size;
+       dd->verbs_dev.rdi.dparms.nports = dd->num_pports;
+       dd->verbs_dev.rdi.dparms.npkeys = hfi1_get_npkeys(dd);
+
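+       /* register each physical port with rdmavt, including its pkey table */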
+       ppd = dd->pport;
+       for (i = 0; i < dd->num_pports; i++, ppd++)
+               rvt_init_port(&dd->verbs_dev.rdi,
+                             &ppd->ibport_data.rvp,
+                             i,
+                             ppd->pkeys);
+
+       ret = rvt_register_device(&dd->verbs_dev.rdi);
+       if (ret)
+               goto err_verbs_txreq;
+
+       ret = hfi1_verbs_register_sysfs(dd);
+       if (ret)
+               goto err_class;
+
+       return ret;
+
+err_class:
+       rvt_unregister_device(&dd->verbs_dev.rdi);
+err_verbs_txreq:
+       verbs_txreq_exit(dev);
+       dd_dev_err(dd, "cannot register verbs: %d!\n", -ret);
+       return ret;
+}
+
+void hfi1_unregister_ib_device(struct hfi1_devdata *dd)
+{
+       struct hfi1_ibdev *dev = &dd->verbs_dev;
+
+       hfi1_verbs_unregister_sysfs(dd);
+
+       rvt_unregister_device(&dd->verbs_dev.rdi);
+
+       if (!list_empty(&dev->txwait))
+               dd_dev_err(dd, "txwait list not empty!\n");
+       if (!list_empty(&dev->memwait))
+               dd_dev_err(dd, "memwait list not empty!\n");
+
+       del_timer_sync(&dev->mem_timer);
+       verbs_txreq_exit(dev);
+}
+
+void hfi1_cnp_rcv(struct hfi1_packet *packet)
+{
+       struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       struct hfi1_ib_header *hdr = packet->hdr;
+       struct rvt_qp *qp = packet->qp;
+       u32 lqpn, rqpn = 0;
+       u16 rlid = 0;
+       u8 sl, sc5, sc4_bit, svc_type;
+       bool sc4_set = has_sc4_bit(packet);
+
+       switch (packet->qp->ibqp.qp_type) {
+       case IB_QPT_UC:
+               rlid = qp->remote_ah_attr.dlid;
+               rqpn = qp->remote_qpn;
+               svc_type = IB_CC_SVCTYPE_UC;
+               break;
+       case IB_QPT_RC:
+               rlid = qp->remote_ah_attr.dlid;
+               rqpn = qp->remote_qpn;
+               svc_type = IB_CC_SVCTYPE_RC;
+               break;
+       case IB_QPT_SMI:
+       case IB_QPT_GSI:
+       case IB_QPT_UD:
+               svc_type = IB_CC_SVCTYPE_UD;
+               break;
+       default:
+               ibp->rvp.n_pkt_drops++;
+               return;
+       }
+
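+       /*
+        * Rebuild the 5-bit SC from SC[3:0] in the LRH plus the SC4 bit
+        * carried in the packet, then map SC to SL for BECN processing.
+        */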
+       sc4_bit = sc4_set << 4;
+       sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
+       sc5 |= sc4_bit;
+       sl = ibp->sc_to_sl[sc5];
+       lqpn = qp->ibqp.qp_num;
+
+       process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type);
+}
diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h
new file mode 100644 (file)
index 0000000..4883567
--- /dev/null
@@ -0,0 +1,531 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef HFI1_VERBS_H
+#define HFI1_VERBS_H
+
+#include <linux/types.h>
+#include <linux/seqlock.h>
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/kref.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_mad.h>
+#include <rdma/rdma_vt.h>
+#include <rdma/rdmavt_qp.h>
+#include <rdma/rdmavt_cq.h>
+
+struct hfi1_ctxtdata;
+struct hfi1_pportdata;
+struct hfi1_devdata;
+struct hfi1_packet;
+
+#include "iowait.h"
+
+#define HFI1_MAX_RDMA_ATOMIC     16
+#define HFI1_GUIDS_PER_PORT    5
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define HFI1_UVERBS_ABI_VERSION       2
+
+#define IB_SEQ_NAK     (3 << 29)
+
+/* AETH NAK opcode values */
+#define IB_RNR_NAK                      0x20
+#define IB_NAK_PSN_ERROR                0x60
+#define IB_NAK_INVALID_REQUEST          0x61
+#define IB_NAK_REMOTE_ACCESS_ERROR      0x62
+#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63
+#define IB_NAK_INVALID_RD_REQUEST       0x64
+
+/* IB Performance Manager status values */
+#define IB_PMA_SAMPLE_STATUS_DONE       0x00
+#define IB_PMA_SAMPLE_STATUS_STARTED    0x01
+#define IB_PMA_SAMPLE_STATUS_RUNNING    0x02
+
+/* Mandatory IB performance counter select values. */
+#define IB_PMA_PORT_XMIT_DATA   cpu_to_be16(0x0001)
+#define IB_PMA_PORT_RCV_DATA    cpu_to_be16(0x0002)
+#define IB_PMA_PORT_XMIT_PKTS   cpu_to_be16(0x0003)
+#define IB_PMA_PORT_RCV_PKTS    cpu_to_be16(0x0004)
+#define IB_PMA_PORT_XMIT_WAIT   cpu_to_be16(0x0005)
+
+#define HFI1_VENDOR_IPG                cpu_to_be16(0xFFA0)
+
+#define IB_BTH_REQ_ACK         BIT(31)
+#define IB_BTH_SOLICITED       BIT(23)
+#define IB_BTH_MIG_REQ         BIT(22)
+
+#define IB_GRH_VERSION         6
+#define IB_GRH_VERSION_MASK    0xF
+#define IB_GRH_VERSION_SHIFT   28
+#define IB_GRH_TCLASS_MASK     0xFF
+#define IB_GRH_TCLASS_SHIFT    20
+#define IB_GRH_FLOW_MASK       0xFFFFF
+#define IB_GRH_FLOW_SHIFT      0
+#define IB_GRH_NEXT_HDR                0x1B
+
+#define IB_DEFAULT_GID_PREFIX  cpu_to_be64(0xfe80000000000000ULL)
+
+/* flags passed by hfi1_ib_rcv() */
+enum {
+       HFI1_HAS_GRH = (1 << 0),
+};
+
+struct ib_reth {
+       __be64 vaddr;
+       __be32 rkey;
+       __be32 length;
+} __packed;
+
+struct ib_atomic_eth {
+       __be32 vaddr[2];        /* unaligned so access as 2 32-bit words */
+       __be32 rkey;
+       __be64 swap_data;
+       __be64 compare_data;
+} __packed;
+
+union ib_ehdrs {
+       struct {
+               __be32 deth[2];
+               __be32 imm_data;
+       } ud;
+       struct {
+               struct ib_reth reth;
+               __be32 imm_data;
+       } rc;
+       struct {
+               __be32 aeth;
+               __be32 atomic_ack_eth[2];
+       } at;
+       __be32 imm_data;
+       __be32 aeth;
+       __be32 ieth;
+       struct ib_atomic_eth atomic_eth;
+}  __packed;
+
+struct hfi1_other_headers {
+       __be32 bth[3];
+       union ib_ehdrs u;
+} __packed;
+
+/*
+ * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes
+ * long (72 w/ imm_data).  Only the first 56 bytes of the IB header
+ * will be in the eager header buffer.  The remaining 12 or 16 bytes
+ * are in the data buffer.
+ */
+struct hfi1_ib_header {
+       __be16 lrh[4];
+       union {
+               struct {
+                       struct ib_grh grh;
+                       struct hfi1_other_headers oth;
+               } l;
+               struct hfi1_other_headers oth;
+       } u;
+} __packed;
+
+struct ahg_ib_header {
+       struct sdma_engine *sde;
+       u32 ahgdesc[2];
+       u16 tx_flags;
+       u8 ahgcount;
+       u8 ahgidx;
+       struct hfi1_ib_header ibh;
+};
+
+struct hfi1_pio_header {
+       __le64 pbc;
+       struct hfi1_ib_header hdr;
+} __packed;
+
+/*
+ * hfi1 specific data structures that will be hidden from rvt after the queue
+ * pair is made common
+ */
+struct hfi1_qp_priv {
+       struct ahg_ib_header *s_hdr;              /* next header to send */
+       struct sdma_engine *s_sde;                /* current sde */
+       struct send_context *s_sendcontext;       /* current sendcontext */
+       u8 s_sc;                                  /* SC[0..4] for next packet */
+       u8 r_adefered;                            /* number of acks deferred */
+       struct iowait s_iowait;
+       struct timer_list s_rnr_timer;
+       struct rvt_qp *owner;
+};
+
+/*
+ * This structure holds commonly looked up and computed values during
+ * send engine progress.
+ */
+struct hfi1_pkt_state {
+       struct hfi1_ibdev *dev;
+       struct hfi1_ibport *ibp;
+       struct hfi1_pportdata *ppd;
+       struct verbs_txreq *s_txreq;
+       unsigned long flags;
+};
+
+#define HFI1_PSN_CREDIT  16
+
+struct hfi1_opcode_stats {
+       u64 n_packets;          /* number of packets */
+       u64 n_bytes;            /* total number of bytes */
+};
+
+struct hfi1_opcode_stats_perctx {
+       struct hfi1_opcode_stats stats[256];
+};
+
+static inline void inc_opstats(
+       u32 tlen,
+       struct hfi1_opcode_stats *stats)
+{
+#ifdef CONFIG_DEBUG_FS
+       stats->n_bytes += tlen;
+       stats->n_packets++;
+#endif
+}
+
+struct hfi1_ibport {
+       struct rvt_qp __rcu *qp[2];
+       struct rvt_ibport rvp;
+
+       __be64 guids[HFI1_GUIDS_PER_PORT - 1];  /* writable GUIDs */
+
+       /* the first 16 entries are sl_to_vl for !OPA */
+       u8 sl_to_sc[32];
+       u8 sc_to_sl[32];
+};
+
+struct hfi1_ibdev {
+       struct rvt_dev_info rdi; /* Must be first */
+
+       /* QP numbers are shared by all IB ports */
+       /* protect wait lists */
+       seqlock_t iowait_lock;
+       struct list_head txwait;        /* list for wait verbs_txreq */
+       struct list_head memwait;       /* list for wait kernel memory */
+       struct list_head txreq_free;
+       struct kmem_cache *verbs_txreq_cache;
+       struct timer_list mem_timer;
+
+       u64 n_piowait;
+       u64 n_piodrain;
+       u64 n_txwait;
+       u64 n_kmem_wait;
+
+#ifdef CONFIG_DEBUG_FS
+       /* per HFI debugfs */
+       struct dentry *hfi1_ibdev_dbg;
+       /* per HFI symlinks to above */
+       struct dentry *hfi1_ibdev_link;
+#endif
+};
+
+static inline struct hfi1_ibdev *to_idev(struct ib_device *ibdev)
+{
+       struct rvt_dev_info *rdi;
+
+       rdi = container_of(ibdev, struct rvt_dev_info, ibdev);
+       return container_of(rdi, struct hfi1_ibdev, rdi);
+}
+
+static inline struct rvt_qp *iowait_to_qp(struct  iowait *s_iowait)
+{
+       struct hfi1_qp_priv *priv;
+
+       priv = container_of(s_iowait, struct hfi1_qp_priv, s_iowait);
+       return priv->owner;
+}
+
+/*
+ * Send if not busy or waiting for I/O and either
+ * an RC response is pending or we can process send work requests.
+ */
+static inline int hfi1_send_ok(struct rvt_qp *qp)
+{
+       return !(qp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT_IO)) &&
+               (qp->s_hdrwords || (qp->s_flags & RVT_S_RESP_PENDING) ||
+                !(qp->s_flags & RVT_S_ANY_WAIT_SEND));
+}
+
+/*
+ * This must be called with s_lock held.
+ */
+void hfi1_bad_pqkey(struct hfi1_ibport *ibp, __be16 trap_num, u32 key, u32 sl,
+                   u32 qp1, u32 qp2, u16 lid1, u16 lid2);
+void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num);
+void hfi1_sys_guid_chg(struct hfi1_ibport *ibp);
+void hfi1_node_desc_chg(struct hfi1_ibport *ibp);
+int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port,
+                    const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+                    const struct ib_mad_hdr *in_mad, size_t in_mad_size,
+                    struct ib_mad_hdr *out_mad, size_t *out_mad_size,
+                    u16 *out_mad_pkey_index);
+
+/*
+ * The PSN_MASK and PSN_SHIFT allow for
+ * 1) comparing two PSNs
+ * 2) returning the PSN with any upper bits masked
+ * 3) returning the difference between two PSNs
+ *
+ * The number of significant bits in the PSN must
+ * necessarily be at least one bit less than
+ * the container holding the PSN.
+ */
+#ifndef CONFIG_HFI1_VERBS_31BIT_PSN
+#define PSN_MASK 0xFFFFFF
+#define PSN_SHIFT 8
+#else
+#define PSN_MASK 0x7FFFFFFF
+#define PSN_SHIFT 1
+#endif
+#define PSN_MODIFY_MASK 0xFFFFFF
+
+/*
+ * Compare the lower 24 bits of the msn values.
+ * Returns an integer <, ==, or > than zero.
+ */
+static inline int cmp_msn(u32 a, u32 b)
+{
+       return (((int)a) - ((int)b)) << 8;
+}
+
+/*
+ * Compare two PSNs
+ * Returns an integer <, ==, or > than zero.
+ */
+static inline int cmp_psn(u32 a, u32 b)
+{
+       return (((int)a) - ((int)b)) << PSN_SHIFT;
+}
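+
+/*
+ * In cmp_psn() and delta_psn() the left shift discards any bits above the
+ * PSN field, so results are evaluated modulo the PSN space; e.g. with a
+ * 24-bit PSN, cmp_psn(0, 0xFFFFFF) > 0 because 0 follows 0xFFFFFF after
+ * wraparound.
+ */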
+
+/*
+ * Return masked PSN
+ */
+static inline u32 mask_psn(u32 a)
+{
+       return a & PSN_MASK;
+}
+
+/*
+ * Return delta between two PSNs
+ */
+static inline u32 delta_psn(u32 a, u32 b)
+{
+       return (((int)a - (int)b) << PSN_SHIFT) >> PSN_SHIFT;
+}
+
+struct verbs_txreq;
+void hfi1_put_txreq(struct verbs_txreq *tx);
+
+int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
+
+void hfi1_copy_sge(struct rvt_sge_state *ss, void *data, u32 length,
+                  int release, int copy_last);
+
+void hfi1_skip_sge(struct rvt_sge_state *ss, u32 length, int release);
+
+void hfi1_cnp_rcv(struct hfi1_packet *packet);
+
+void hfi1_uc_rcv(struct hfi1_packet *packet);
+
+void hfi1_rc_rcv(struct hfi1_packet *packet);
+
+void hfi1_rc_hdrerr(
+       struct hfi1_ctxtdata *rcd,
+       struct hfi1_ib_header *hdr,
+       u32 rcv_flags,
+       struct rvt_qp *qp);
+
+u8 ah_to_sc(struct ib_device *ibdev, struct ib_ah_attr *ah_attr);
+
+struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u16 dlid);
+
+void hfi1_rc_rnr_retry(unsigned long arg);
+void hfi1_add_rnr_timer(struct rvt_qp *qp, u32 to);
+void hfi1_rc_timeout(unsigned long arg);
+void hfi1_del_timers_sync(struct rvt_qp *qp);
+void hfi1_stop_rc_timers(struct rvt_qp *qp);
+
+void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_ib_header *hdr);
+
+void hfi1_rc_error(struct rvt_qp *qp, enum ib_wc_status err);
+
+void hfi1_ud_rcv(struct hfi1_packet *packet);
+
+int hfi1_lookup_pkey_idx(struct hfi1_ibport *ibp, u16 pkey);
+
+int hfi1_rvt_get_rwqe(struct rvt_qp *qp, int wr_id_only);
+
+void hfi1_migrate_qp(struct rvt_qp *qp);
+
+int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
+                        int attr_mask, struct ib_udata *udata);
+
+void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
+                   int attr_mask, struct ib_udata *udata);
+
+int hfi1_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe);
+
+extern const u32 rc_only_opcode;
+extern const u32 uc_only_opcode;
+
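+/* The opcode is the top byte of BTH0; the LNH bits select which BTH to use. */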
+static inline u8 get_opcode(struct hfi1_ib_header *h)
+{
+       u16 lnh = be16_to_cpu(h->lrh[0]) & 3;
+
+       if (lnh == IB_LNH_IBA_LOCAL)
+               return be32_to_cpu(h->u.oth.bth[0]) >> 24;
+       else
+               return be32_to_cpu(h->u.l.oth.bth[0]) >> 24;
+}
+
+int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_ib_header *hdr,
+                      int has_grh, struct rvt_qp *qp, u32 bth0);
+
+u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr,
+                 struct ib_global_route *grh, u32 hwords, u32 nwords);
+
+void hfi1_make_ruc_header(struct rvt_qp *qp, struct hfi1_other_headers *ohdr,
+                         u32 bth0, u32 bth2, int middle,
+                         struct hfi1_pkt_state *ps);
+
+void _hfi1_do_send(struct work_struct *work);
+
+void hfi1_do_send(struct rvt_qp *qp);
+
+void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
+                       enum ib_wc_status status);
+
+void hfi1_send_rc_ack(struct hfi1_ctxtdata *, struct rvt_qp *qp, int is_fecn);
+
+int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
+
+int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
+
+int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
+
+int hfi1_register_ib_device(struct hfi1_devdata *);
+
+void hfi1_unregister_ib_device(struct hfi1_devdata *);
+
+void hfi1_ib_rcv(struct hfi1_packet *packet);
+
+unsigned hfi1_get_npkeys(struct hfi1_devdata *);
+
+int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+                       u64 pbc);
+
+int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+                       u64 pbc);
+
+int hfi1_wss_init(void);
+void hfi1_wss_exit(void);
+
+/* platform specific: return the lowest level cache (llc) size, in KiB */
+static inline int wss_llc_size(void)
+{
+       /* assume that the boot CPU value is universal for all CPUs */
+       return boot_cpu_data.x86_cache_size;
+}
+
+/* platform specific: cacheless copy */
+static inline void cacheless_memcpy(void *dst, void *src, size_t n)
+{
+       /*
+        * Use the only available X64 cacheless copy.  Add a __user cast
+        * to quiet sparse.  The src argument is already in the kernel so
+        * there are no security issues.  The extra fault recovery machinery
+        * is not invoked.
+        */
+       __copy_user_nocache(dst, (void __user *)src, n, 0);
+}
+
+extern const enum ib_wc_opcode ib_hfi1_wc_opcode[];
+
+extern const u8 hdr_len_by_opcode[];
+
+extern const int ib_rvt_state_ops[];
+
+extern __be64 ib_hfi1_sys_image_guid;    /* in network order */
+
+extern unsigned int hfi1_max_cqes;
+
+extern unsigned int hfi1_max_cqs;
+
+extern unsigned int hfi1_max_qp_wrs;
+
+extern unsigned int hfi1_max_qps;
+
+extern unsigned int hfi1_max_sges;
+
+extern unsigned int hfi1_max_mcast_grps;
+
+extern unsigned int hfi1_max_mcast_qp_attached;
+
+extern unsigned int hfi1_max_srqs;
+
+extern unsigned int hfi1_max_srq_sges;
+
+extern unsigned int hfi1_max_srq_wrs;
+
+extern unsigned short piothreshold;
+
+extern const u32 ib_hfi1_rnr_table[];
+
+#endif                          /* HFI1_VERBS_H */
diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.c b/drivers/infiniband/hw/hfi1/verbs_txreq.c
new file mode 100644 (file)
index 0000000..bc95c41
--- /dev/null
@@ -0,0 +1,149 @@
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "hfi.h"
+#include "verbs_txreq.h"
+#include "qp.h"
+#include "trace.h"
+
+#define TXREQ_LEN 24
+
+void hfi1_put_txreq(struct verbs_txreq *tx)
+{
+       struct hfi1_ibdev *dev;
+       struct rvt_qp *qp;
+       unsigned long flags;
+       unsigned int seq;
+       struct hfi1_qp_priv *priv;
+
+       qp = tx->qp;
+       dev = to_idev(qp->ibqp.device);
+
+       if (tx->mr)
+               rvt_put_mr(tx->mr);
+
+       sdma_txclean(dd_from_dev(dev), &tx->txreq);
+
+       /* Free verbs_txreq and return to slab cache */
+       kmem_cache_free(dev->verbs_txreq_cache, tx);
+
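+       /*
+        * A txreq was just freed; if any QP is queued on the txwait list,
+        * remove the first waiter and wake it.  The list is sampled under a
+        * seqlock read section and only locked for write when a waiter is
+        * actually present.
+        */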
+       do {
+               seq = read_seqbegin(&dev->iowait_lock);
+               if (!list_empty(&dev->txwait)) {
+                       struct iowait *wait;
+
+                       write_seqlock_irqsave(&dev->iowait_lock, flags);
+                       wait = list_first_entry(&dev->txwait, struct iowait,
+                                               list);
+                       qp = iowait_to_qp(wait);
+                       priv = qp->priv;
+                       list_del_init(&priv->s_iowait.list);
+                       /* refcount held until actual wake up */
+                       write_sequnlock_irqrestore(&dev->iowait_lock, flags);
+                       hfi1_qp_wakeup(qp, RVT_S_WAIT_TX);
+                       break;
+               }
+       } while (read_seqretry(&dev->iowait_lock, seq));
+}
+
+struct verbs_txreq *__get_txreq(struct hfi1_ibdev *dev,
+                               struct rvt_qp *qp)
+{
+       struct verbs_txreq *tx = ERR_PTR(-EBUSY);
+       unsigned long flags;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+       write_seqlock(&dev->iowait_lock);
+       if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
+               struct hfi1_qp_priv *priv;
+
+               tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC);
+               if (tx)
+                       goto out;
+               priv = qp->priv;
+               if (list_empty(&priv->s_iowait.list)) {
+                       dev->n_txwait++;
+                       qp->s_flags |= RVT_S_WAIT_TX;
+                       list_add_tail(&priv->s_iowait.list, &dev->txwait);
+                       trace_hfi1_qpsleep(qp, RVT_S_WAIT_TX);
+                       atomic_inc(&qp->refcount);
+               }
+               qp->s_flags &= ~RVT_S_BUSY;
+       }
+out:
+       write_sequnlock(&dev->iowait_lock);
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+       return tx;
+}
+
+static void verbs_txreq_kmem_cache_ctor(void *obj)
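+/*
+ * Slab constructor: runs once when an object is added to the cache, not on
+ * every allocation, so per-use fields are (re)initialized in get_txreq().
+ */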
+{
+       struct verbs_txreq *tx = (struct verbs_txreq *)obj;
+
+       memset(tx, 0, sizeof(*tx));
+}
+
+int verbs_txreq_init(struct hfi1_ibdev *dev)
+{
+       char buf[TXREQ_LEN];
+       struct hfi1_devdata *dd = dd_from_dev(dev);
+
+       snprintf(buf, sizeof(buf), "hfi1_%u_vtxreq_cache", dd->unit);
+       dev->verbs_txreq_cache = kmem_cache_create(buf,
+                                                  sizeof(struct verbs_txreq),
+                                                  0, SLAB_HWCACHE_ALIGN,
+                                                  verbs_txreq_kmem_cache_ctor);
+       if (!dev->verbs_txreq_cache)
+               return -ENOMEM;
+       return 0;
+}
+
+void verbs_txreq_exit(struct hfi1_ibdev *dev)
+{
+       kmem_cache_destroy(dev->verbs_txreq_cache);
+       dev->verbs_txreq_cache = NULL;
+}
diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.h b/drivers/infiniband/hw/hfi1/verbs_txreq.h
new file mode 100644 (file)
index 0000000..1cf69b2
--- /dev/null
@@ -0,0 +1,116 @@
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef HFI1_VERBS_TXREQ_H
+#define HFI1_VERBS_TXREQ_H
+
+#include <linux/types.h>
+#include <linux/slab.h>
+
+#include "verbs.h"
+#include "sdma_txreq.h"
+#include "iowait.h"
+
+struct verbs_txreq {
+       struct hfi1_pio_header  phdr;
+       struct sdma_txreq       txreq;
+       struct rvt_qp           *qp;
+       struct rvt_swqe         *wqe;
+       struct rvt_mregion      *mr;
+       struct rvt_sge_state    *ss;
+       struct sdma_engine     *sde;
+       struct send_context     *psc;
+       u16                     hdr_dwords;
+};
+
+struct hfi1_ibdev;
+struct verbs_txreq *__get_txreq(struct hfi1_ibdev *dev,
+                               struct rvt_qp *qp);
+
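+/*
+ * Fast path: allocate straight from the txreq cache.  On failure fall back
+ * to __get_txreq(), which retries under the QP s_lock and queues the QP on
+ * the device txwait list if memory is still unavailable.
+ */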
+static inline struct verbs_txreq *get_txreq(struct hfi1_ibdev *dev,
+                                           struct rvt_qp *qp)
+{
+       struct verbs_txreq *tx;
+       struct hfi1_qp_priv *priv = qp->priv;
+
+       tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC);
+       if (unlikely(!tx)) {
+               /* call slow path to get the lock */
+               tx = __get_txreq(dev, qp);
+               if (IS_ERR(tx))
+                       return tx;
+       }
+       tx->qp = qp;
+       tx->mr = NULL;
+       tx->sde = priv->s_sde;
+       tx->psc = priv->s_sendcontext;
+       /* so that we can test if the sdma descriptors are there */
+       tx->txreq.num_desc = 0;
+       return tx;
+}
+
+static inline struct sdma_txreq *get_sdma_txreq(struct verbs_txreq *tx)
+{
+       return &tx->txreq;
+}
+
+static inline struct verbs_txreq *get_waiting_verbs_txreq(struct rvt_qp *qp)
+{
+       struct sdma_txreq *stx;
+       struct hfi1_qp_priv *priv = qp->priv;
+
+       stx = iowait_get_txhead(&priv->s_iowait);
+       if (stx)
+               return container_of(stx, struct verbs_txreq, txreq);
+       return NULL;
+}
+
+void hfi1_put_txreq(struct verbs_txreq *tx);
+int verbs_txreq_init(struct hfi1_ibdev *dev);
+void verbs_txreq_exit(struct hfi1_ibdev *dev);
+
+#endif                         /* HFI1_VERBS_TXREQ_H */
index 4a740f7..02a735b 100644 (file)
@@ -2361,58 +2361,130 @@ static int i40iw_port_immutable(struct ib_device *ibdev, u8 port_num,
        return 0;
 }
 
+static const char * const i40iw_hw_stat_names[] = {
+       /* 32-bit counter names */
+       [I40IW_HW_STAT_INDEX_IP4RXDISCARD] = "ip4InDiscards",
+       [I40IW_HW_STAT_INDEX_IP4RXTRUNC] = "ip4InTruncatedPkts",
+       [I40IW_HW_STAT_INDEX_IP4TXNOROUTE] = "ip4OutNoRoutes",
+       [I40IW_HW_STAT_INDEX_IP6RXDISCARD] = "ip6InDiscards",
+       [I40IW_HW_STAT_INDEX_IP6RXTRUNC] = "ip6InTruncatedPkts",
+       [I40IW_HW_STAT_INDEX_IP6TXNOROUTE] = "ip6OutNoRoutes",
+       [I40IW_HW_STAT_INDEX_TCPRTXSEG] = "tcpRetransSegs",
+       [I40IW_HW_STAT_INDEX_TCPRXOPTERR] = "tcpInOptErrors",
+       [I40IW_HW_STAT_INDEX_TCPRXPROTOERR] = "tcpInProtoErrors",
+       /* 64-bit counter names, offset by I40IW_HW_STAT_INDEX_MAX_32 */
+       [I40IW_HW_STAT_INDEX_IP4RXOCTS + I40IW_HW_STAT_INDEX_MAX_32] =
+               "ip4InOctets",
+       [I40IW_HW_STAT_INDEX_IP4RXPKTS + I40IW_HW_STAT_INDEX_MAX_32] =
+               "ip4InPkts",
+       [I40IW_HW_STAT_INDEX_IP4RXFRAGS + I40IW_HW_STAT_INDEX_MAX_32] =
+               "ip4InReasmRqd",
+       [I40IW_HW_STAT_INDEX_IP4RXMCPKTS + I40IW_HW_STAT_INDEX_MAX_32] =
+               "ip4InMcastPkts",
+       [I40IW_HW_STAT_INDEX_IP4TXOCTS + I40IW_HW_STAT_INDEX_MAX_32] =
+               "ip4OutOctets",
+       [I40IW_HW_STAT_INDEX_IP4TXPKTS + I40IW_HW_STAT_INDEX_MAX_32] =
+               "ip4OutPkts",
+       [I40IW_HW_STAT_INDEX_IP4TXFRAGS + I40IW_HW_STAT_INDEX_MAX_32] =
+               "ip4OutSegRqd",
+       [I40IW_HW_STAT_INDEX_IP4TXMCPKTS + I40IW_HW_STAT_INDEX_MAX_32] =
+               "ip4OutMcastPkts",
+       [I40IW_HW_STAT_INDEX_IP6RXOCTS + I40IW_HW_STAT_INDEX_MAX_32] =
+               "ip6InOctets",
+       [I40IW_HW_STAT_INDEX_IP6RXPKTS + I40IW_HW_STAT_INDEX_MAX_32] =
+               "ip6InPkts",
+       [I40IW_HW_STAT_INDEX_IP6RXFRAGS + I40IW_HW_STAT_INDEX_MAX_32] =
+               "ip6InReasmRqd",
+       [I40IW_HW_STAT_INDEX_IP6RXMCPKTS + I40IW_HW_STAT_INDEX_MAX_32] =
+               "ip6InMcastPkts",
+       [I40IW_HW_STAT_INDEX_IP6TXOCTS + I40IW_HW_STAT_INDEX_MAX_32] =
+               "ip6OutOctets",
+       [I40IW_HW_STAT_INDEX_IP6TXPKTS + I40IW_HW_STAT_INDEX_MAX_32] =
+               "ip6OutPkts",
+       [I40IW_HW_STAT_INDEX_IP6TXFRAGS + I40IW_HW_STAT_INDEX_MAX_32] =
+               "ip6OutSegRqd",
+       [I40IW_HW_STAT_INDEX_IP6TXMCPKTS + I40IW_HW_STAT_INDEX_MAX_32] =
+               "ip6OutMcastPkts",
+       [I40IW_HW_STAT_INDEX_TCPRXSEGS + I40IW_HW_STAT_INDEX_MAX_32] =
+               "tcpInSegs",
+       [I40IW_HW_STAT_INDEX_TCPTXSEG + I40IW_HW_STAT_INDEX_MAX_32] =
+               "tcpOutSegs",
+       [I40IW_HW_STAT_INDEX_RDMARXRDS + I40IW_HW_STAT_INDEX_MAX_32] =
+               "iwInRdmaReads",
+       [I40IW_HW_STAT_INDEX_RDMARXSNDS + I40IW_HW_STAT_INDEX_MAX_32] =
+               "iwInRdmaSends",
+       [I40IW_HW_STAT_INDEX_RDMARXWRS + I40IW_HW_STAT_INDEX_MAX_32] =
+               "iwInRdmaWrites",
+       [I40IW_HW_STAT_INDEX_RDMATXRDS + I40IW_HW_STAT_INDEX_MAX_32] =
+               "iwOutRdmaReads",
+       [I40IW_HW_STAT_INDEX_RDMATXSNDS + I40IW_HW_STAT_INDEX_MAX_32] =
+               "iwOutRdmaSends",
+       [I40IW_HW_STAT_INDEX_RDMATXWRS + I40IW_HW_STAT_INDEX_MAX_32] =
+               "iwOutRdmaWrites",
+       [I40IW_HW_STAT_INDEX_RDMAVBND + I40IW_HW_STAT_INDEX_MAX_32] =
+               "iwRdmaBnd",
+       [I40IW_HW_STAT_INDEX_RDMAVINV + I40IW_HW_STAT_INDEX_MAX_32] =
+               "iwRdmaInv"
+};
+
 /**
- * i40iw_get_protocol_stats - Populates the rdma_stats structure
- * @ibdev: ib dev struct
- * @stats: iw protocol stats struct
+ * i40iw_alloc_hw_stats - Allocate a hw stats structure
+ * @ibdev: device pointer from stack
+ * @port_num: port number
  */
-static int i40iw_get_protocol_stats(struct ib_device *ibdev,
-                                   union rdma_protocol_stats *stats)
+static struct rdma_hw_stats *i40iw_alloc_hw_stats(struct ib_device *ibdev,
+                                                 u8 port_num)
+{
+       struct i40iw_device *iwdev = to_iwdev(ibdev);
+       struct i40iw_sc_dev *dev = &iwdev->sc_dev;
+       int num_counters = I40IW_HW_STAT_INDEX_MAX_32 +
+               I40IW_HW_STAT_INDEX_MAX_64;
+       unsigned long lifespan = RDMA_HW_STATS_DEFAULT_LIFESPAN;
+
+       BUILD_BUG_ON(ARRAY_SIZE(i40iw_hw_stat_names) !=
+                    (I40IW_HW_STAT_INDEX_MAX_32 +
+                     I40IW_HW_STAT_INDEX_MAX_64));
+
+       /*
+        * PFs get the default update lifespan, but VFs only update once
+        * per second
+        */
+       if (!dev->is_pf)
+               lifespan = 1000;
+       return rdma_alloc_hw_stats_struct(i40iw_hw_stat_names, num_counters,
+                                         lifespan);
+}
+
+/**
+ * i40iw_get_hw_stats - Populates the rdma_hw_stats structure
+ * @ibdev: device pointer from stack
+ * @stats: stats pointer from stack
+ * @port_num: port number
+ * @index: which hw counter the stack is requesting we update
+ */
+static int i40iw_get_hw_stats(struct ib_device *ibdev,
+                             struct rdma_hw_stats *stats,
+                             u8 port_num, int index)
 {
        struct i40iw_device *iwdev = to_iwdev(ibdev);
        struct i40iw_sc_dev *dev = &iwdev->sc_dev;
        struct i40iw_dev_pestat *devstat = &dev->dev_pestat;
        struct i40iw_dev_hw_stats *hw_stats = &devstat->hw_stats;
-       struct timespec curr_time;
-       static struct timespec last_rd_time = {0, 0};
        unsigned long flags;
 
-       curr_time = current_kernel_time();
-       memset(stats, 0, sizeof(*stats));
-
        if (dev->is_pf) {
                spin_lock_irqsave(&devstat->stats_lock, flags);
                devstat->ops.iw_hw_stat_read_all(devstat,
                        &devstat->hw_stats);
                spin_unlock_irqrestore(&devstat->stats_lock, flags);
        } else {
-               if (((u64)curr_time.tv_sec - (u64)last_rd_time.tv_sec) > 1)
-                       if (i40iw_vchnl_vf_get_pe_stats(dev, &devstat->hw_stats))
-                               return -ENOSYS;
+               if (i40iw_vchnl_vf_get_pe_stats(dev, &devstat->hw_stats))
+                       return -ENOSYS;
        }
 
-       stats->iw.ipInReceives = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP4RXPKTS] +
-                                hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP6RXPKTS];
-       stats->iw.ipInTruncatedPkts = hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_IP4RXTRUNC] +
-                                     hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_IP6RXTRUNC];
-       stats->iw.ipInDiscards = hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_IP4RXDISCARD] +
-                                hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_IP6RXDISCARD];
-       stats->iw.ipOutNoRoutes = hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_IP4TXNOROUTE] +
-                                 hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_IP6TXNOROUTE];
-       stats->iw.ipReasmReqds = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP4RXFRAGS] +
-                                hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP6RXFRAGS];
-       stats->iw.ipFragCreates = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP4TXFRAGS] +
-                                 hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP6TXFRAGS];
-       stats->iw.ipInMcastPkts = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP4RXMCPKTS] +
-                                 hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP6RXMCPKTS];
-       stats->iw.ipOutMcastPkts = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP4TXMCPKTS] +
-                                  hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP6TXMCPKTS];
-       stats->iw.tcpOutSegs = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_TCPTXSEG];
-       stats->iw.tcpInSegs = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_TCPRXSEGS];
-       stats->iw.tcpRetransSegs = hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_TCPRTXSEG];
-
-       last_rd_time = curr_time;
-       return 0;
+       memcpy(&stats->value[0], hw_stats, sizeof(*hw_stats));
+
+       return stats->num_counters;
 }
 
 /**
@@ -2551,7 +2623,8 @@ static struct i40iw_ib_device *i40iw_init_rdma_device(struct i40iw_device *iwdev
        iwibdev->ibdev.get_dma_mr = i40iw_get_dma_mr;
        iwibdev->ibdev.reg_user_mr = i40iw_reg_user_mr;
        iwibdev->ibdev.dereg_mr = i40iw_dereg_mr;
-       iwibdev->ibdev.get_protocol_stats = i40iw_get_protocol_stats;
+       iwibdev->ibdev.alloc_hw_stats = i40iw_alloc_hw_stats;
+       iwibdev->ibdev.get_hw_stats = i40iw_get_hw_stats;
        iwibdev->ibdev.query_device = i40iw_query_device;
        iwibdev->ibdev.create_ah = i40iw_create_ah;
        iwibdev->ibdev.destroy_ah = i40iw_destroy_ah;
index 82d7c4b..ce40340 100644 (file)
@@ -1308,21 +1308,6 @@ static const struct  qib_hwerror_msgs qib_7322p_error_msgs[] = {
        SYM_LSB(IntMask, fldname##17IntMask)), \
        .msg = #fldname "_C", .sz = sizeof(#fldname "_C") }
 
-static const struct  qib_hwerror_msgs qib_7322_intr_msgs[] = {
-       INTR_AUTO_P(SDmaInt),
-       INTR_AUTO_P(SDmaProgressInt),
-       INTR_AUTO_P(SDmaIdleInt),
-       INTR_AUTO_P(SDmaCleanupDone),
-       INTR_AUTO_C(RcvUrg),
-       INTR_AUTO_P(ErrInt),
-       INTR_AUTO(ErrInt),      /* non-port-specific errs */
-       INTR_AUTO(AssertGPIOInt),
-       INTR_AUTO_P(SendDoneInt),
-       INTR_AUTO(SendBufAvailInt),
-       INTR_AUTO_C(RcvAvail),
-       { .mask = 0, .sz = 0 }
-};
-
 #define TXSYMPTOM_AUTO_P(fldname) \
        { .mask = SYM_MASK(SendHdrErrSymptom_0, fldname), \
        .msg = #fldname, .sz = sizeof(#fldname) }
index 0bd1837..d2ac298 100644 (file)
@@ -1172,11 +1172,13 @@ static int pma_get_classportinfo(struct ib_pma_mad *pmp,
         * Set the most significant bit of CM2 to indicate support for
         * congestion statistics
         */
-       p->reserved[0] = dd->psxmitwait_supported << 7;
+       ib_set_cpi_capmask2(p,
+                           dd->psxmitwait_supported <<
+                           (31 - IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE));
        /*
         * Expected response time is 4.096 usec. * 2^18 == 1.073741824 sec.
         */
-       p->resp_time_value = 18;
+       ib_set_cpi_resp_time(p, 18);
 
        return reply((struct ib_smp *) pmp);
 }
index 6888f03..4f87815 100644 (file)
@@ -159,6 +159,7 @@ struct qib_other_headers {
                } at;
                __be32 imm_data;
                __be32 aeth;
+               __be32 ieth;
                struct ib_atomic_eth atomic_eth;
        } u;
 } __packed;
index b1ffc8b..6ca6fa8 100644 (file)
@@ -525,6 +525,7 @@ int rvt_driver_cq_init(struct rvt_dev_info *rdi)
                return PTR_ERR(task);
        }
 
+       set_user_nice(task, MIN_NICE);
        cpu = cpumask_first(cpumask_of_node(rdi->dparms.node));
        kthread_bind(task, cpu);
        wake_up_process(task);
index 0ff765b..0f4d450 100644 (file)
@@ -124,11 +124,13 @@ static int rvt_init_mregion(struct rvt_mregion *mr, struct ib_pd *pd,
                            int count)
 {
        int m, i = 0;
+       struct rvt_dev_info *dev = ib_to_rvt(pd->device);
 
        mr->mapsz = 0;
        m = (count + RVT_SEGSZ - 1) / RVT_SEGSZ;
        for (; i < m; i++) {
-               mr->map[i] = kzalloc(sizeof(*mr->map[0]), GFP_KERNEL);
+               mr->map[i] = kzalloc_node(sizeof(*mr->map[0]), GFP_KERNEL,
+                                         dev->dparms.node);
                if (!mr->map[i]) {
                        rvt_deinit_mregion(mr);
                        return -ENOMEM;
index 0f12c21..5fa4d4d 100644 (file)
@@ -397,6 +397,7 @@ static void free_qpn(struct rvt_qpn_table *qpt, u32 qpn)
 static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends)
 {
        unsigned n;
+       struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
 
        if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags))
                rvt_put_ss(&qp->s_rdma_read_sge);
@@ -431,7 +432,7 @@ static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends)
        if (qp->ibqp.qp_type != IB_QPT_RC)
                return;
 
-       for (n = 0; n < ARRAY_SIZE(qp->s_ack_queue); n++) {
+       for (n = 0; n < rvt_max_atomic(rdi); n++) {
                struct rvt_ack_entry *e = &qp->s_ack_queue[n];
 
                if (e->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST &&
@@ -569,7 +570,12 @@ static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
        qp->s_ssn = 1;
        qp->s_lsn = 0;
        qp->s_mig_state = IB_MIG_MIGRATED;
-       memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue));
+       if (qp->s_ack_queue)
+               memset(
+                       qp->s_ack_queue,
+                       0,
+                       rvt_max_atomic(rdi) *
+                               sizeof(*qp->s_ack_queue));
        qp->r_head_ack_queue = 0;
        qp->s_tail_ack_queue = 0;
        qp->s_num_rd_atomic = 0;
@@ -653,9 +659,9 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
                if (gfp == GFP_NOIO)
                        swq = __vmalloc(
                                (init_attr->cap.max_send_wr + 1) * sz,
-                               gfp, PAGE_KERNEL);
+                               gfp | __GFP_ZERO, PAGE_KERNEL);
                else
-                       swq = vmalloc_node(
+                       swq = vzalloc_node(
                                (init_attr->cap.max_send_wr + 1) * sz,
                                rdi->dparms.node);
                if (!swq)
@@ -677,6 +683,16 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
                        goto bail_swq;
 
                RCU_INIT_POINTER(qp->next, NULL);
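+               /*
+                * For RC QPs the responder ack queue is sized at run time
+                * from rvt_max_atomic(rdi) rather than being a fixed array.
+                */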
+               if (init_attr->qp_type == IB_QPT_RC) {
+                       qp->s_ack_queue =
+                               kzalloc_node(
+                                       sizeof(*qp->s_ack_queue) *
+                                        rvt_max_atomic(rdi),
+                                       gfp,
+                                       rdi->dparms.node);
+                       if (!qp->s_ack_queue)
+                               goto bail_qp;
+               }
 
                /*
                 * Driver needs to set up it's private QP structure and do any
@@ -704,9 +720,9 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
                                qp->r_rq.wq = __vmalloc(
                                                sizeof(struct rvt_rwq) +
                                                qp->r_rq.size * sz,
-                                               gfp, PAGE_KERNEL);
+                                               gfp | __GFP_ZERO, PAGE_KERNEL);
                        else
-                               qp->r_rq.wq = vmalloc_node(
+                               qp->r_rq.wq = vzalloc_node(
                                                sizeof(struct rvt_rwq) +
                                                qp->r_rq.size * sz,
                                                rdi->dparms.node);
@@ -857,6 +873,7 @@ bail_driver_priv:
        rdi->driver_f.qp_priv_free(rdi, qp);
 
 bail_qp:
+       kfree(qp->s_ack_queue);
        kfree(qp);
 
 bail_swq:
@@ -1284,6 +1301,7 @@ int rvt_destroy_qp(struct ib_qp *ibqp)
                vfree(qp->r_rq.wq);
        vfree(qp->s_wq);
        rdi->driver_f.qp_priv_free(rdi, qp);
+       kfree(qp->s_ack_queue);
        kfree(qp);
        return 0;
 }
index caec8e9..bab7db6 100644 (file)
@@ -92,6 +92,8 @@ enum {
        IPOIB_FLAG_UMCAST         = 10,
        IPOIB_STOP_NEIGH_GC       = 11,
        IPOIB_NEIGH_TBL_FLUSH     = 12,
+       IPOIB_FLAG_DEV_ADDR_SET   = 13,
+       IPOIB_FLAG_DEV_ADDR_CTRL  = 14,
 
        IPOIB_MAX_BACKOFF_SECONDS = 16,
 
@@ -392,6 +394,7 @@ struct ipoib_dev_priv {
        struct ipoib_ethtool_st ethtool;
        struct timer_list poll_timer;
        unsigned max_send_sge;
+       bool sm_fullmember_sendonly_support;
 };
 
 struct ipoib_ah {
@@ -476,6 +479,7 @@ void ipoib_reap_ah(struct work_struct *work);
 
 void ipoib_mark_paths_invalid(struct net_device *dev);
 void ipoib_flush_paths(struct net_device *dev);
+int ipoib_check_sm_sendonly_fullmember_support(struct ipoib_dev_priv *priv);
 struct ipoib_dev_priv *ipoib_intf_alloc(const char *format);
 
 int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
index 418e5a1..45c40a1 100644 (file)
@@ -997,6 +997,106 @@ static inline int update_child_pkey(struct ipoib_dev_priv *priv)
        return 0;
 }
 
+/*
+ * Returns true if the device address of the ipoib interface has changed and
+ * the new address is a valid one (i.e. in the gid table), false otherwise.
+ */
+static bool ipoib_dev_addr_changed_valid(struct ipoib_dev_priv *priv)
+{
+       union ib_gid search_gid;
+       union ib_gid gid0;
+       union ib_gid *netdev_gid;
+       int err;
+       u16 index;
+       u8 port;
+       bool ret = false;
+
+       netdev_gid = (union ib_gid *)(priv->dev->dev_addr + 4);
+       if (ib_query_gid(priv->ca, priv->port, 0, &gid0, NULL))
+               return false;
+
+       netif_addr_lock(priv->dev);
+
+       /* The subnet prefix may have changed, update it now so we won't have
+        * to do it later
+        */
+       priv->local_gid.global.subnet_prefix = gid0.global.subnet_prefix;
+       netdev_gid->global.subnet_prefix = gid0.global.subnet_prefix;
+       search_gid.global.subnet_prefix = gid0.global.subnet_prefix;
+
+       search_gid.global.interface_id = priv->local_gid.global.interface_id;
+
+       netif_addr_unlock(priv->dev);
+
+       err = ib_find_gid(priv->ca, &search_gid, IB_GID_TYPE_IB,
+                         priv->dev, &port, &index);
+
+       netif_addr_lock(priv->dev);
+
+       if (search_gid.global.interface_id !=
+           priv->local_gid.global.interface_id)
+               /* There was a change while we were looking up the gid, bail
+                * here and let the next work sort this out
+                */
+               goto out;
+
+       /* The next section of code needs some background:
+        * Per IB spec the port GUID can't change if the HCA is powered on.
+        * The port GUID is the basis for the GID at index 0, which in turn is
+        * the basis for the default device address of an ipoib interface.
+        *
+        * So the expected flow is:
+        * if user_changed_dev_addr && gid in gid tbl
+        *      set bit dev_addr_set
+        *      return true
+        * else
+        *      return false
+        *
+        * The issue is that some devices don't follow the spec: they change
+        * the port GUID when the HCA is powered on.  So, in order not to
+        * break userspace applications, we need to check whether the user
+        * wanted to control the device address, and we assume that if the
+        * user sets the device address back to be based on GID index 0,
+        * they no longer wish to control it.
+        *
+        * If the user doesn't control the device address,
+        * IPOIB_FLAG_DEV_ADDR_SET is set, and ib_find_gid failed, it means
+        * the port GUID has changed and the GID at index 0 has changed,
+        * so we need to change priv->local_gid and priv->dev->dev_addr
+        * to reflect the new GID.
+        */
+       if (!test_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags)) {
+               if (!err && port == priv->port) {
+                       set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags);
+                       if (index == 0)
+                               clear_bit(IPOIB_FLAG_DEV_ADDR_CTRL,
+                                         &priv->flags);
+                       else
+                               set_bit(IPOIB_FLAG_DEV_ADDR_CTRL, &priv->flags);
+                       ret = true;
+               } else {
+                       ret = false;
+               }
+       } else {
+               if (!err && port == priv->port) {
+                       ret = true;
+               } else {
+                       if (!test_bit(IPOIB_FLAG_DEV_ADDR_CTRL, &priv->flags)) {
+                               memcpy(&priv->local_gid, &gid0,
+                                      sizeof(priv->local_gid));
+                               memcpy(priv->dev->dev_addr + 4, &gid0,
+                                      sizeof(priv->local_gid));
+                               ret = true;
+                       }
+               }
+       }
+
+out:
+       netif_addr_unlock(priv->dev);
+
+       return ret;
+}
+
 static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
                                enum ipoib_flush_level level,
                                int nesting)
@@ -1018,6 +1118,9 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
 
        if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags) &&
            level != IPOIB_FLUSH_HEAVY) {
+               /* Make sure the dev_addr is set even if not flushing */
+               if (level == IPOIB_FLUSH_LIGHT)
+                       ipoib_dev_addr_changed_valid(priv);
                ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n");
                return;
        }
@@ -1029,7 +1132,8 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
                                update_parent_pkey(priv);
                        else
                                update_child_pkey(priv);
-               }
+               } else if (level == IPOIB_FLUSH_LIGHT)
+                       ipoib_dev_addr_changed_valid(priv);
                ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n");
                return;
        }
@@ -1081,7 +1185,8 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
        if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
                if (level >= IPOIB_FLUSH_NORMAL)
                        ipoib_ib_dev_up(dev);
-               ipoib_mcast_restart_task(&priv->restart_task);
+               if (ipoib_dev_addr_changed_valid(priv))
+                       ipoib_mcast_restart_task(&priv->restart_task);
        }
 }
 
index b940ef1..2d7c163 100644 (file)
@@ -99,6 +99,7 @@ static struct net_device *ipoib_get_net_dev_by_params(
                struct ib_device *dev, u8 port, u16 pkey,
                const union ib_gid *gid, const struct sockaddr *addr,
                void *client_data);
+static int ipoib_set_mac(struct net_device *dev, void *addr);
 
 static struct ib_client ipoib_client = {
        .name   = "ipoib",
@@ -117,6 +118,8 @@ int ipoib_open(struct net_device *dev)
 
        set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
 
+       priv->sm_fullmember_sendonly_support = false;
+
        if (ipoib_ib_dev_open(dev)) {
                if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
                        return 0;
@@ -629,6 +632,77 @@ void ipoib_mark_paths_invalid(struct net_device *dev)
        spin_unlock_irq(&priv->lock);
 }
 
+struct classport_info_context {
+       struct ipoib_dev_priv   *priv;
+       struct completion       done;
+       struct ib_sa_query      *sa_query;
+};
+
+static void classport_info_query_cb(int status, struct ib_class_port_info *rec,
+                                   void *context)
+{
+       struct classport_info_context *cb_ctx = context;
+       struct ipoib_dev_priv *priv;
+
+       WARN_ON(!context);
+
+       priv = cb_ctx->priv;
+
+       if (status || !rec) {
+               pr_debug("device: %s failed query classport_info status: %d\n",
+                        priv->dev->name, status);
+               /* keeps the default, will try next mcast_restart */
+               priv->sm_fullmember_sendonly_support = false;
+               goto out;
+       }
+
+       if (ib_get_cpi_capmask2(rec) &
+           IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT) {
+               pr_debug("device: %s enabled fullmember-sendonly for sendonly MCG\n",
+                        priv->dev->name);
+               priv->sm_fullmember_sendonly_support = true;
+       } else {
+               pr_debug("device: %s disabled fullmember-sendonly for sendonly MCG\n",
+                        priv->dev->name);
+               priv->sm_fullmember_sendonly_support = false;
+       }
+
+out:
+       complete(&cb_ctx->done);
+}
+
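+/*
+ * Issue a synchronous SA ClassPortInfo query and cache whether the SM
+ * advertises support for send-only full-member multicast joins in
+ * priv->sm_fullmember_sendonly_support.
+ */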
+int ipoib_check_sm_sendonly_fullmember_support(struct ipoib_dev_priv *priv)
+{
+       struct classport_info_context *callback_context;
+       int ret;
+
+       callback_context = kmalloc(sizeof(*callback_context), GFP_KERNEL);
+       if (!callback_context)
+               return -ENOMEM;
+
+       callback_context->priv = priv;
+       init_completion(&callback_context->done);
+
+       ret = ib_sa_classport_info_rec_query(&ipoib_sa_client,
+                                            priv->ca, priv->port, 3000,
+                                            GFP_KERNEL,
+                                            classport_info_query_cb,
+                                            callback_context,
+                                            &callback_context->sa_query);
+       if (ret < 0) {
+               pr_info("%s failed to send ib_sa_classport_info query, ret: %d\n",
+                       priv->dev->name, ret);
+               kfree(callback_context);
+               return ret;
+       }
+
+       /* wait for the callback to finish before returning */
+       wait_for_completion(&callback_context->done);
+       kfree(callback_context);
+
+       return ret;
+}
+
 void ipoib_flush_paths(struct net_device *dev)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -1649,6 +1723,7 @@ static const struct net_device_ops ipoib_netdev_ops_pf = {
        .ndo_get_vf_config       = ipoib_get_vf_config,
        .ndo_get_vf_stats        = ipoib_get_vf_stats,
        .ndo_set_vf_guid         = ipoib_set_vf_guid,
+       .ndo_set_mac_address     = ipoib_set_mac,
 };
 
 static const struct net_device_ops ipoib_netdev_ops_vf = {
@@ -1771,6 +1846,70 @@ int ipoib_add_umcast_attr(struct net_device *dev)
        return device_create_file(&dev->dev, &dev_attr_umcast);
 }
 
+static void set_base_guid(struct ipoib_dev_priv *priv, union ib_gid *gid)
+{
+       struct ipoib_dev_priv *child_priv;
+       struct net_device *netdev = priv->dev;
+
+       netif_addr_lock(netdev);
+
+       memcpy(&priv->local_gid.global.interface_id,
+              &gid->global.interface_id,
+              sizeof(gid->global.interface_id));
+       memcpy(netdev->dev_addr + 4, &priv->local_gid, sizeof(priv->local_gid));
+       clear_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags);
+
+       netif_addr_unlock(netdev);
+
+       if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
+               down_read(&priv->vlan_rwsem);
+               list_for_each_entry(child_priv, &priv->child_intfs, list)
+                       set_base_guid(child_priv, gid);
+               up_read(&priv->vlan_rwsem);
+       }
+}
+
+static int ipoib_check_lladdr(struct net_device *dev,
+                             struct sockaddr_storage *ss)
+{
+       union ib_gid *gid = (union ib_gid *)(ss->__data + 4);
+       int ret = 0;
+
+       netif_addr_lock(dev);
+
+       /* Make sure the QPN, reserved and subnet prefix match the current
+        * lladdr; it also makes sure the lladdr is unicast.
+        */
+       if (memcmp(dev->dev_addr, ss->__data,
+                  4 + sizeof(gid->global.subnet_prefix)) ||
+           gid->global.interface_id == 0)
+               ret = -EINVAL;
+
+       netif_addr_unlock(dev);
+
+       return ret;
+}
+
+static int ipoib_set_mac(struct net_device *dev, void *addr)
+{
+       struct ipoib_dev_priv *priv = netdev_priv(dev);
+       struct sockaddr_storage *ss = addr;
+       int ret;
+
+       if (!(dev->priv_flags & IFF_LIVE_ADDR_CHANGE) && netif_running(dev))
+               return -EBUSY;
+
+       ret = ipoib_check_lladdr(dev, ss);
+       if (ret)
+               return ret;
+
+       set_base_guid(priv, (union ib_gid *)(ss->__data + 4));
+
+       queue_work(ipoib_workqueue, &priv->flush_light);
+
+       return 0;
+}
+
 static ssize_t create_child(struct device *dev,
                            struct device_attribute *attr,
                            const char *buf, size_t count)
@@ -1894,6 +2033,7 @@ static struct net_device *ipoib_add_port(const char *format,
                goto device_init_failed;
        } else
                memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
+       set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags);
 
        result = ipoib_dev_init(priv->dev, hca, port);
        if (result < 0) {
index 2588931..82fbc94 100644 (file)
@@ -64,6 +64,9 @@ struct ipoib_mcast_iter {
        unsigned int       send_only;
 };
 
+/* Join state that allows creating an MCG with a sendonly-member join request */
+#define SENDONLY_FULLMEMBER_JOIN       8
+
 /*
  * This should be called with the priv->lock held
  */
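
A brief reference for the JoinState value used here: SENDONLY_FULLMEMBER_JOIN (8) is bit 3 of the MCMemberRecord JoinState field. The enum below is an illustrative sketch only, not part of this patch, and its names are not kernel identifiers.

/* Illustrative only -- not part of this patch. */
enum mcmember_join_state_bits {
	JOIN_STATE_FULL_MEMBER          = 1 << 0, /* 1 */
	JOIN_STATE_NON_MEMBER           = 1 << 1, /* 2 */
	JOIN_STATE_SENDONLY_NON_MEMBER  = 1 << 2, /* 4, the value in the old "#if 0" code below */
	JOIN_STATE_SENDONLY_FULL_MEMBER = 1 << 3, /* 8, i.e. SENDONLY_FULLMEMBER_JOIN */
};
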
@@ -326,12 +329,23 @@ void ipoib_mcast_carrier_on_task(struct work_struct *work)
        struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
                                                   carrier_on_task);
        struct ib_port_attr attr;
+       int ret;
 
        if (ib_query_port(priv->ca, priv->port, &attr) ||
            attr.state != IB_PORT_ACTIVE) {
                ipoib_dbg(priv, "Keeping carrier off until IB port is active\n");
                return;
        }
+       /*
+        * Check whether the SM supports the sendonly-fullmember join state
+        * for sendonly MCGs. This is done after successfully joining the
+        * broadcast group, because the broadcast group must always be
+        * joined first and is always re-joined if the SM changes
+        * substantially.
+        */
+       ret = ipoib_check_sm_sendonly_fullmember_support(priv);
+       if (ret < 0)
+               pr_debug("%s failed query sm support for sendonly-fullmember (ret: %d)\n",
+                        priv->dev->name, ret);
 
        /*
         * Take rtnl_lock to avoid racing with ipoib_stop() and
@@ -515,22 +529,20 @@ static int ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
                rec.hop_limit     = priv->broadcast->mcmember.hop_limit;
 
                /*
-                * Send-only IB Multicast joins do not work at the core
-                * IB layer yet, so we can't use them here.  However,
-                * we are emulating an Ethernet multicast send, which
-                * does not require a multicast subscription and will
-                * still send properly.  The most appropriate thing to
+                * Send-only IB Multicast joins work at the core IB layer but
+                * require specific SM support; we can use them here only if
+                * the current SM supports that feature. If it does not, we
+                * emulate an Ethernet multicast send, which does not require
+                * a multicast subscription and will still send properly.
+                * The most appropriate thing to
                 * do is to create the group if it doesn't exist as that
                 * most closely emulates the behavior, from a user space
-                * application perspecitive, of Ethernet multicast
-                * operation.  For now, we do a full join, maybe later
-                * when the core IB layers support send only joins we
-                * will use them.
+                * application perspective, of Ethernet multicast operation.
                 */
-#if 0
-               if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
-                       rec.join_state = 4;
-#endif
+               if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) &&
+                   priv->sm_fullmember_sendonly_support)
+                       /* SM supports sendonly-fullmember, otherwise fall back to full-member */
+                       rec.join_state = SENDONLY_FULLMEMBER_JOIN;
        }
        spin_unlock_irq(&priv->lock);
 
@@ -570,11 +582,13 @@ void ipoib_mcast_join_task(struct work_struct *work)
                return;
        }
        priv->local_lid = port_attr.lid;
+       netif_addr_lock(dev);
 
-       if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid, NULL))
-               ipoib_warn(priv, "ib_query_gid() failed\n");
-       else
-               memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
+       if (!test_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags)) {
+               netif_addr_unlock(dev);
+               return;
+       }
+       netif_addr_unlock(dev);
 
        spin_lock_irq(&priv->lock);
        if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags))
index b809c37..1e7cbba 100644 (file)
@@ -307,5 +307,8 @@ void ipoib_event(struct ib_event_handler *handler,
                queue_work(ipoib_workqueue, &priv->flush_normal);
        } else if (record->event == IB_EVENT_PKEY_CHANGE) {
                queue_work(ipoib_workqueue, &priv->flush_heavy);
+       } else if (record->event == IB_EVENT_GID_CHANGE &&
+                  !test_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags)) {
+               queue_work(ipoib_workqueue, &priv->flush_light);
        }
 }
index fca1a88..64a3559 100644 (file)
@@ -68,6 +68,8 @@ int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv,
        priv->pkey = pkey;
 
        memcpy(priv->dev->dev_addr, ppriv->dev->dev_addr, INFINIBAND_ALEN);
+       memcpy(&priv->local_gid, &ppriv->local_gid, sizeof(priv->local_gid));
+       set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags);
        priv->dev->broadcast[8] = pkey >> 8;
        priv->dev->broadcast[9] = pkey & 0xff;
 
index 897b5a4..a990c04 100644 (file)
@@ -2596,9 +2596,19 @@ static void isert_free_conn(struct iscsi_conn *conn)
        isert_put_conn(isert_conn);
 }
 
+static void isert_get_rx_pdu(struct iscsi_conn *conn)
+{
+       struct completion comp;
+
+       init_completion(&comp);
+
+       wait_for_completion_interruptible(&comp);
+}
+
 static struct iscsit_transport iser_target_transport = {
        .name                   = "IB/iSER",
        .transport_type         = ISCSI_INFINIBAND,
+       .rdma_shutdown          = true,
        .priv_size              = sizeof(struct isert_cmd),
        .owner                  = THIS_MODULE,
        .iscsit_setup_np        = isert_setup_np,
@@ -2614,6 +2624,7 @@ static struct iscsit_transport iser_target_transport = {
        .iscsit_queue_data_in   = isert_put_datain,
        .iscsit_queue_status    = isert_put_response,
        .iscsit_aborted_task    = isert_aborted_task,
+       .iscsit_get_rx_pdu      = isert_get_rx_pdu,
        .iscsit_get_sup_prot_ops = isert_get_sup_prot_ops,
 };
 
index 2843f1a..e68b20c 100644 (file)
@@ -254,8 +254,8 @@ static void srpt_get_class_port_info(struct ib_dm_mad *mad)
        memset(cif, 0, sizeof(*cif));
        cif->base_version = 1;
        cif->class_version = 1;
-       cif->resp_time_value = 20;
 
+       ib_set_cpi_resp_time(cif, 20);
        mad->mad_hdr.status = 0;
 }
 
@@ -1767,14 +1767,6 @@ static void __srpt_close_all_ch(struct srpt_device *sdev)
        }
 }
 
-/**
- * srpt_shutdown_session() - Whether or not a session may be shut down.
- */
-static int srpt_shutdown_session(struct se_session *se_sess)
-{
-       return 1;
-}
-
 static void srpt_free_ch(struct kref *kref)
 {
        struct srpt_rdma_ch *ch = container_of(kref, struct srpt_rdma_ch, kref);
@@ -3064,7 +3056,6 @@ static const struct target_core_fabric_ops srpt_template = {
        .tpg_get_inst_index             = srpt_tpg_get_inst_index,
        .release_cmd                    = srpt_release_cmd,
        .check_stop_free                = srpt_check_stop_free,
-       .shutdown_session               = srpt_shutdown_session,
        .close_session                  = srpt_close_session,
        .sess_get_index                 = srpt_sess_get_index,
        .sess_get_initiator_sid         = NULL,
index 1142a93..804dbcc 100644 (file)
@@ -87,7 +87,7 @@
 #define DRIVER_AUTHOR "Marko Friedemann <mfr@bmx-chemnitz.de>"
 #define DRIVER_DESC "X-Box pad driver"
 
-#define XPAD_PKT_LEN 32
+#define XPAD_PKT_LEN 64
 
 /* xbox d-pads should map to buttons, as is required for DDR pads
    but we map them to axes when possible to simplify things */
@@ -129,6 +129,7 @@ static const struct xpad_device {
        { 0x045e, 0x028e, "Microsoft X-Box 360 pad", 0, XTYPE_XBOX360 },
        { 0x045e, 0x02d1, "Microsoft X-Box One pad", 0, XTYPE_XBOXONE },
        { 0x045e, 0x02dd, "Microsoft X-Box One pad (Firmware 2015)", 0, XTYPE_XBOXONE },
+       { 0x045e, 0x02e3, "Microsoft X-Box One Elite pad", 0, XTYPE_XBOXONE },
        { 0x045e, 0x0291, "Xbox 360 Wireless Receiver (XBOX)", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX360W },
        { 0x045e, 0x0719, "Xbox 360 Wireless Receiver", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX360W },
        { 0x044f, 0x0f07, "Thrustmaster, Inc. Controller", 0, XTYPE_XBOX },
@@ -173,9 +174,11 @@ static const struct xpad_device {
        { 0x0e6f, 0x0006, "Edge wireless Controller", 0, XTYPE_XBOX },
        { 0x0e6f, 0x0105, "HSM3 Xbox360 dancepad", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX360 },
        { 0x0e6f, 0x0113, "Afterglow AX.1 Gamepad for Xbox 360", 0, XTYPE_XBOX360 },
+       { 0x0e6f, 0x0139, "Afterglow Prismatic Wired Controller", 0, XTYPE_XBOXONE },
        { 0x0e6f, 0x0201, "Pelican PL-3601 'TSZ' Wired Xbox 360 Controller", 0, XTYPE_XBOX360 },
        { 0x0e6f, 0x0213, "Afterglow Gamepad for Xbox 360", 0, XTYPE_XBOX360 },
        { 0x0e6f, 0x021f, "Rock Candy Gamepad for Xbox 360", 0, XTYPE_XBOX360 },
+       { 0x0e6f, 0x0146, "Rock Candy Wired Controller for Xbox One", 0, XTYPE_XBOXONE },
        { 0x0e6f, 0x0301, "Logic3 Controller", 0, XTYPE_XBOX360 },
        { 0x0e6f, 0x0401, "Logic3 Controller", 0, XTYPE_XBOX360 },
        { 0x0e8f, 0x0201, "SmartJoy Frag Xpad/PS2 adaptor", 0, XTYPE_XBOX },
@@ -183,6 +186,7 @@ static const struct xpad_device {
        { 0x0f0d, 0x000a, "Hori Co. DOA4 FightStick", 0, XTYPE_XBOX360 },
        { 0x0f0d, 0x000d, "Hori Fighting Stick EX2", MAP_TRIGGERS_TO_BUTTONS, XTYPE_XBOX360 },
        { 0x0f0d, 0x0016, "Hori Real Arcade Pro.EX", MAP_TRIGGERS_TO_BUTTONS, XTYPE_XBOX360 },
+       { 0x0f0d, 0x0067, "HORIPAD ONE", 0, XTYPE_XBOXONE },
        { 0x0f30, 0x0202, "Joytech Advanced Controller", 0, XTYPE_XBOX },
        { 0x0f30, 0x8888, "BigBen XBMiniPad Controller", 0, XTYPE_XBOX },
        { 0x102c, 0xff0c, "Joytech Wireless Advanced Controller", 0, XTYPE_XBOX },
@@ -199,6 +203,7 @@ static const struct xpad_device {
        { 0x162e, 0xbeef, "Joytech Neo-Se Take2", 0, XTYPE_XBOX360 },
        { 0x1689, 0xfd00, "Razer Onza Tournament Edition", 0, XTYPE_XBOX360 },
        { 0x1689, 0xfd01, "Razer Onza Classic Edition", 0, XTYPE_XBOX360 },
+       { 0x24c6, 0x542a, "Xbox ONE spectra", 0, XTYPE_XBOXONE },
        { 0x24c6, 0x5d04, "Razer Sabertooth", 0, XTYPE_XBOX360 },
        { 0x1bad, 0x0002, "Harmonix Rock Band Guitar", 0, XTYPE_XBOX360 },
        { 0x1bad, 0x0003, "Harmonix Rock Band Drumkit", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX360 },
@@ -212,6 +217,8 @@ static const struct xpad_device {
        { 0x24c6, 0x5000, "Razer Atrox Arcade Stick", MAP_TRIGGERS_TO_BUTTONS, XTYPE_XBOX360 },
        { 0x24c6, 0x5300, "PowerA MINI PROEX Controller", 0, XTYPE_XBOX360 },
        { 0x24c6, 0x5303, "Xbox Airflo wired controller", 0, XTYPE_XBOX360 },
+       { 0x24c6, 0x541a, "PowerA Xbox One Mini Wired Controller", 0, XTYPE_XBOXONE },
+       { 0x24c6, 0x543a, "PowerA Xbox One wired controller", 0, XTYPE_XBOXONE },
        { 0x24c6, 0x5500, "Hori XBOX 360 EX 2 with Turbo", 0, XTYPE_XBOX360 },
        { 0x24c6, 0x5501, "Hori Real Arcade Pro VX-SA", 0, XTYPE_XBOX360 },
        { 0x24c6, 0x5506, "Hori SOULCALIBUR V Stick", 0, XTYPE_XBOX360 },
@@ -307,13 +314,16 @@ static struct usb_device_id xpad_table[] = {
        { USB_DEVICE(0x0738, 0x4540) },         /* Mad Catz Beat Pad */
        XPAD_XBOXONE_VENDOR(0x0738),            /* Mad Catz FightStick TE 2 */
        XPAD_XBOX360_VENDOR(0x0e6f),            /* 0x0e6f X-Box 360 controllers */
+       XPAD_XBOXONE_VENDOR(0x0e6f),            /* 0x0e6f X-Box One controllers */
        XPAD_XBOX360_VENDOR(0x12ab),            /* X-Box 360 dance pads */
        XPAD_XBOX360_VENDOR(0x1430),            /* RedOctane X-Box 360 controllers */
        XPAD_XBOX360_VENDOR(0x146b),            /* BigBen Interactive Controllers */
        XPAD_XBOX360_VENDOR(0x1bad),            /* Harminix Rock Band Guitar and Drums */
        XPAD_XBOX360_VENDOR(0x0f0d),            /* Hori Controllers */
+       XPAD_XBOXONE_VENDOR(0x0f0d),            /* Hori Controllers */
        XPAD_XBOX360_VENDOR(0x1689),            /* Razer Onza */
        XPAD_XBOX360_VENDOR(0x24c6),            /* PowerA Controllers */
+       XPAD_XBOXONE_VENDOR(0x24c6),            /* PowerA Controllers */
        XPAD_XBOX360_VENDOR(0x1532),            /* Razer Sabertooth */
        XPAD_XBOX360_VENDOR(0x15e4),            /* Numark X-Box 360 controllers */
        XPAD_XBOX360_VENDOR(0x162e),            /* Joytech X-Box 360 controllers */
@@ -457,6 +467,10 @@ static void xpad_process_packet(struct usb_xpad *xpad, u16 cmd, unsigned char *d
 static void xpad360_process_packet(struct usb_xpad *xpad, struct input_dev *dev,
                                   u16 cmd, unsigned char *data)
 {
+       /* valid pad data */
+       if (data[0] != 0x00)
+               return;
+
        /* digital pad */
        if (xpad->mapping & MAP_DPAD_TO_BUTTONS) {
                /* dpad as buttons (left, right, up, down) */
@@ -756,6 +770,7 @@ static bool xpad_prepare_next_out_packet(struct usb_xpad *xpad)
        if (packet) {
                memcpy(xpad->odata, packet->data, packet->len);
                xpad->irq_out->transfer_buffer_length = packet->len;
+               packet->pending = false;
                return true;
        }
 
@@ -797,7 +812,6 @@ static void xpad_irq_out(struct urb *urb)
        switch (status) {
        case 0:
                /* success */
-               xpad->out_packets[xpad->last_out_packet].pending = false;
                xpad->irq_out_active = xpad_prepare_next_out_packet(xpad);
                break;
 
index 6d96bff..29ddeb7 100644 (file)
@@ -70,10 +70,13 @@ struct max77693_haptic {
 
 static int max77693_haptic_set_duty_cycle(struct max77693_haptic *haptic)
 {
-       int delta = (haptic->pwm_dev->period + haptic->pwm_duty) / 2;
+       struct pwm_args pargs;
+       int delta;
        int error;
 
-       error = pwm_config(haptic->pwm_dev, delta, haptic->pwm_dev->period);
+       pwm_get_args(haptic->pwm_dev, &pargs);
+       delta = (pargs.period + haptic->pwm_duty) / 2;
+       error = pwm_config(haptic->pwm_dev, delta, pargs.period);
        if (error) {
                dev_err(haptic->dev, "failed to configure pwm: %d\n", error);
                return error;
@@ -234,6 +237,7 @@ static int max77693_haptic_play_effect(struct input_dev *dev, void *data,
                                       struct ff_effect *effect)
 {
        struct max77693_haptic *haptic = input_get_drvdata(dev);
+       struct pwm_args pargs;
        u64 period_mag_multi;
 
        haptic->magnitude = effect->u.rumble.strong_magnitude;
@@ -245,7 +249,8 @@ static int max77693_haptic_play_effect(struct input_dev *dev, void *data,
         * The formula to convert magnitude to pwm_duty as follows:
         * - pwm_duty = (magnitude * pwm_period) / MAX_MAGNITUDE(0xFFFF)
         */
-       period_mag_multi = (u64)haptic->pwm_dev->period * haptic->magnitude;
+       pwm_get_args(haptic->pwm_dev, &pargs);
+       period_mag_multi = (u64)pargs.period * haptic->magnitude;
        haptic->pwm_duty = (unsigned int)(period_mag_multi >>
                                                MAX_MAGNITUDE_SHIFT);
 
@@ -329,6 +334,12 @@ static int max77693_haptic_probe(struct platform_device *pdev)
                return PTR_ERR(haptic->pwm_dev);
        }
 
+       /*
+        * FIXME: pwm_apply_args() should be removed when switching to the
+        * atomic PWM API.
+        */
+       pwm_apply_args(haptic->pwm_dev);
+
        haptic->motor_reg = devm_regulator_get(&pdev->dev, "haptic");
        if (IS_ERR(haptic->motor_reg)) {
                dev_err(&pdev->dev, "failed to get regulator\n");
index 8d6326d..99bc762 100644 (file)
@@ -306,6 +306,12 @@ static int max8997_haptic_probe(struct platform_device *pdev)
                                error);
                        goto err_free_mem;
                }
+
+               /*
+                * FIXME: pwm_apply_args() should be removed when switching to
+                * the atomic PWM API.
+                */
+               pwm_apply_args(chip->pwm);
                break;
 
        default:
index f2261ab..5f9655d 100644 (file)
 #include <linux/platform_device.h>
 #include <linux/pwm.h>
 #include <linux/slab.h>
+#include <linux/workqueue.h>
 
 struct pwm_beeper {
        struct input_dev *input;
        struct pwm_device *pwm;
+       struct work_struct work;
        unsigned long period;
 };
 
 #define HZ_TO_NANOSECONDS(x) (1000000000UL/(x))
 
+static void __pwm_beeper_set(struct pwm_beeper *beeper)
+{
+       unsigned long period = beeper->period;
+
+       if (period) {
+               pwm_config(beeper->pwm, period / 2, period);
+               pwm_enable(beeper->pwm);
+       } else
+               pwm_disable(beeper->pwm);
+}
+
+static void pwm_beeper_work(struct work_struct *work)
+{
+       struct pwm_beeper *beeper =
+               container_of(work, struct pwm_beeper, work);
+
+       __pwm_beeper_set(beeper);
+}
+
 static int pwm_beeper_event(struct input_dev *input,
                            unsigned int type, unsigned int code, int value)
 {
-       int ret = 0;
        struct pwm_beeper *beeper = input_get_drvdata(input);
-       unsigned long period;
 
        if (type != EV_SND || value < 0)
                return -EINVAL;
@@ -49,22 +68,31 @@ static int pwm_beeper_event(struct input_dev *input,
                return -EINVAL;
        }
 
-       if (value == 0) {
-               pwm_disable(beeper->pwm);
-       } else {
-               period = HZ_TO_NANOSECONDS(value);
-               ret = pwm_config(beeper->pwm, period / 2, period);
-               if (ret)
-                       return ret;
-               ret = pwm_enable(beeper->pwm);
-               if (ret)
-                       return ret;
-               beeper->period = period;
-       }
+       if (value == 0)
+               beeper->period = 0;
+       else
+               beeper->period = HZ_TO_NANOSECONDS(value);
+
+       schedule_work(&beeper->work);
 
        return 0;
 }
 
+static void pwm_beeper_stop(struct pwm_beeper *beeper)
+{
+       cancel_work_sync(&beeper->work);
+
+       if (beeper->period)
+               pwm_disable(beeper->pwm);
+}
+
+static void pwm_beeper_close(struct input_dev *input)
+{
+       struct pwm_beeper *beeper = input_get_drvdata(input);
+
+       pwm_beeper_stop(beeper);
+}
+
 static int pwm_beeper_probe(struct platform_device *pdev)
 {
        unsigned long pwm_id = (unsigned long)dev_get_platdata(&pdev->dev);
@@ -87,6 +115,14 @@ static int pwm_beeper_probe(struct platform_device *pdev)
                goto err_free;
        }
 
+       /*
+        * FIXME: pwm_apply_args() should be removed when switching to
+        * the atomic PWM API.
+        */
+       pwm_apply_args(beeper->pwm);
+
+       INIT_WORK(&beeper->work, pwm_beeper_work);
+
        beeper->input = input_allocate_device();
        if (!beeper->input) {
                dev_err(&pdev->dev, "Failed to allocate input device\n");
@@ -106,6 +142,7 @@ static int pwm_beeper_probe(struct platform_device *pdev)
        beeper->input->sndbit[0] = BIT(SND_TONE) | BIT(SND_BELL);
 
        beeper->input->event = pwm_beeper_event;
+       beeper->input->close = pwm_beeper_close;
 
        input_set_drvdata(beeper->input, beeper);
 
@@ -135,7 +172,6 @@ static int pwm_beeper_remove(struct platform_device *pdev)
 
        input_unregister_device(beeper->input);
 
-       pwm_disable(beeper->pwm);
        pwm_free(beeper->pwm);
 
        kfree(beeper);
@@ -147,8 +183,7 @@ static int __maybe_unused pwm_beeper_suspend(struct device *dev)
 {
        struct pwm_beeper *beeper = dev_get_drvdata(dev);
 
-       if (beeper->period)
-               pwm_disable(beeper->pwm);
+       pwm_beeper_stop(beeper);
 
        return 0;
 }
@@ -157,10 +192,8 @@ static int __maybe_unused pwm_beeper_resume(struct device *dev)
 {
        struct pwm_beeper *beeper = dev_get_drvdata(dev);
 
-       if (beeper->period) {
-               pwm_config(beeper->pwm, beeper->period / 2, beeper->period);
-               pwm_enable(beeper->pwm);
-       }
+       if (beeper->period)
+               __pwm_beeper_set(beeper);
 
        return 0;
 }
index abe1a92..65ebbd1 100644 (file)
@@ -981,9 +981,15 @@ static long uinput_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 }
 
 #ifdef CONFIG_COMPAT
+
+#define UI_SET_PHYS_COMPAT     _IOW(UINPUT_IOCTL_BASE, 108, compat_uptr_t)
+
 static long uinput_compat_ioctl(struct file *file,
                                unsigned int cmd, unsigned long arg)
 {
+       if (cmd == UI_SET_PHYS_COMPAT)
+               cmd = UI_SET_PHYS;
+
        return uinput_ioctl_handler(file, cmd, arg, compat_ptr(arg));
 }
 #endif
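
The two lines added to uinput_compat_ioctl() above translate the 32-bit flavour of UI_SET_PHYS to the native command. A hedged illustration of why the numbers differ, assuming UI_SET_PHYS is defined with a char * argument as in include/uapi/linux/uinput.h:

/* Illustration only -- not part of this patch. */
#include <linux/compat.h>
#include <linux/ioctl.h>
#include <linux/uinput.h>

/*
 * _IOW() encodes the size of its argument type in the ioctl number, so a
 * request built against an 8-byte pointer (64-bit userspace) differs from
 * one built against a 4-byte pointer (32-bit userspace).  The latter is
 * what UI_SET_PHYS_COMPAT above spells out explicitly via compat_uptr_t.
 */
#define EXAMPLE_UI_SET_PHYS_NATIVE _IOW(UINPUT_IOCTL_BASE, 108, char *)        /* native pointer size */
#define EXAMPLE_UI_SET_PHYS_32BIT  _IOW(UINPUT_IOCTL_BASE, 108, compat_uptr_t) /* 4-byte size */
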
index 4857943..d07dd29 100644 (file)
 struct sun4i_ts_data {
        struct device *dev;
        struct input_dev *input;
-       struct thermal_zone_device *tz;
        void __iomem *base;
        unsigned int irq;
        bool ignore_fifo_data;
@@ -366,10 +365,7 @@ static int sun4i_ts_probe(struct platform_device *pdev)
        if (IS_ERR(hwmon))
                return PTR_ERR(hwmon);
 
-       ts->tz = thermal_zone_of_sensor_register(ts->dev, 0, ts,
-                                                &sun4i_ts_tz_ops);
-       if (IS_ERR(ts->tz))
-               ts->tz = NULL;
+       devm_thermal_zone_of_sensor_register(ts->dev, 0, ts, &sun4i_ts_tz_ops);
 
        writel(TEMP_IRQ_EN(1), ts->base + TP_INT_FIFOC);
 
@@ -377,7 +373,6 @@ static int sun4i_ts_probe(struct platform_device *pdev)
                error = input_register_device(ts->input);
                if (error) {
                        writel(0, ts->base + TP_INT_FIFOC);
-                       thermal_zone_of_sensor_unregister(ts->dev, ts->tz);
                        return error;
                }
        }
@@ -394,8 +389,6 @@ static int sun4i_ts_remove(struct platform_device *pdev)
        if (ts->input)
                input_unregister_device(ts->input);
 
-       thermal_zone_of_sensor_unregister(ts->dev, ts->tz);
-
        /* Deactivate all IRQs */
        writel(0, ts->base + TP_INT_FIFOC);
 
index ebab33e..94b6821 100644 (file)
@@ -1477,7 +1477,7 @@ static int arm_smmu_domain_finalise_s1(struct arm_smmu_domain *smmu_domain,
        struct arm_smmu_s1_cfg *cfg = &smmu_domain->s1_cfg;
 
        asid = arm_smmu_bitmap_alloc(smmu->asid_map, smmu->asid_bits);
-       if (IS_ERR_VALUE(asid))
+       if (asid < 0)
                return asid;
 
        cfg->cdptr = dmam_alloc_coherent(smmu->dev, CTXDESC_CD_DWORDS << 3,
@@ -1508,7 +1508,7 @@ static int arm_smmu_domain_finalise_s2(struct arm_smmu_domain *smmu_domain,
        struct arm_smmu_s2_cfg *cfg = &smmu_domain->s2_cfg;
 
        vmid = arm_smmu_bitmap_alloc(smmu->vmid_map, smmu->vmid_bits);
-       if (IS_ERR_VALUE(vmid))
+       if (vmid < 0)
                return vmid;
 
        cfg->vmid       = (u16)vmid;
@@ -1569,7 +1569,7 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain)
        smmu_domain->pgtbl_ops = pgtbl_ops;
 
        ret = finalise_stage_fn(smmu_domain, &pgtbl_cfg);
-       if (IS_ERR_VALUE(ret))
+       if (ret < 0)
                free_io_pgtable_ops(pgtbl_ops);
 
        return ret;
@@ -1642,7 +1642,7 @@ static void arm_smmu_detach_dev(struct device *dev)
        struct arm_smmu_group *smmu_group = arm_smmu_group_get(dev);
 
        smmu_group->ste.bypass = true;
-       if (IS_ERR_VALUE(arm_smmu_install_ste_for_group(smmu_group)))
+       if (arm_smmu_install_ste_for_group(smmu_group) < 0)
                dev_warn(dev, "failed to install bypass STE\n");
 
        smmu_group->domain = NULL;
@@ -1694,7 +1694,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
        smmu_group->ste.bypass  = domain->type == IOMMU_DOMAIN_DMA;
 
        ret = arm_smmu_install_ste_for_group(smmu_group);
-       if (IS_ERR_VALUE(ret))
+       if (ret < 0)
                smmu_group->domain = NULL;
 
 out_unlock:
@@ -2235,7 +2235,7 @@ static int arm_smmu_setup_irqs(struct arm_smmu_device *smmu)
                                                arm_smmu_evtq_handler,
                                                arm_smmu_evtq_thread,
                                                0, "arm-smmu-v3-evtq", smmu);
-               if (IS_ERR_VALUE(ret))
+               if (ret < 0)
                        dev_warn(smmu->dev, "failed to enable evtq irq\n");
        }
 
@@ -2244,7 +2244,7 @@ static int arm_smmu_setup_irqs(struct arm_smmu_device *smmu)
                ret = devm_request_irq(smmu->dev, irq,
                                       arm_smmu_cmdq_sync_handler, 0,
                                       "arm-smmu-v3-cmdq-sync", smmu);
-               if (IS_ERR_VALUE(ret))
+               if (ret < 0)
                        dev_warn(smmu->dev, "failed to enable cmdq-sync irq\n");
        }
 
@@ -2252,7 +2252,7 @@ static int arm_smmu_setup_irqs(struct arm_smmu_device *smmu)
        if (irq) {
                ret = devm_request_irq(smmu->dev, irq, arm_smmu_gerror_handler,
                                       0, "arm-smmu-v3-gerror", smmu);
-               if (IS_ERR_VALUE(ret))
+               if (ret < 0)
                        dev_warn(smmu->dev, "failed to enable gerror irq\n");
        }
 
@@ -2264,7 +2264,7 @@ static int arm_smmu_setup_irqs(struct arm_smmu_device *smmu)
                                                        arm_smmu_priq_thread,
                                                        0, "arm-smmu-v3-priq",
                                                        smmu);
-                       if (IS_ERR_VALUE(ret))
+                       if (ret < 0)
                                dev_warn(smmu->dev,
                                         "failed to enable priq irq\n");
                        else
index e206ce7..9345a3f 100644 (file)
@@ -950,7 +950,7 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain,
 
        ret = __arm_smmu_alloc_bitmap(smmu->context_map, start,
                                      smmu->num_context_banks);
-       if (IS_ERR_VALUE(ret))
+       if (ret < 0)
                goto out_unlock;
 
        cfg->cbndx = ret;
@@ -989,7 +989,7 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain,
        irq = smmu->irqs[smmu->num_global_irqs + cfg->irptndx];
        ret = request_irq(irq, arm_smmu_context_fault, IRQF_SHARED,
                          "arm-smmu-context-fault", domain);
-       if (IS_ERR_VALUE(ret)) {
+       if (ret < 0) {
                dev_err(smmu->dev, "failed to request context IRQ %d (%u)\n",
                        cfg->irptndx, irq);
                cfg->irptndx = INVALID_IRPTNDX;
@@ -1099,7 +1099,7 @@ static int arm_smmu_master_configure_smrs(struct arm_smmu_device *smmu,
        for (i = 0; i < cfg->num_streamids; ++i) {
                int idx = __arm_smmu_alloc_bitmap(smmu->smr_map, 0,
                                                  smmu->num_mapping_groups);
-               if (IS_ERR_VALUE(idx)) {
+               if (idx < 0) {
                        dev_err(smmu->dev, "failed to allocate free SMR\n");
                        goto err_free_smrs;
                }
@@ -1233,7 +1233,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
 
        /* Ensure that the domain is finalised */
        ret = arm_smmu_init_domain_context(domain, smmu);
-       if (IS_ERR_VALUE(ret))
+       if (ret < 0)
                return ret;
 
        /*
index b2bfb95..a644d0c 100644 (file)
@@ -33,6 +33,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/mempool.h>
 #include <linux/memory.h>
+#include <linux/cpu.h>
 #include <linux/timer.h>
 #include <linux/io.h>
 #include <linux/iova.h>
@@ -390,6 +391,7 @@ struct dmar_domain {
                                         * domain ids are 16 bit wide according
                                         * to VT-d spec, section 9.3 */
 
+       bool has_iotlb_device;
        struct list_head devices;       /* all devices' list */
        struct iova_domain iovad;       /* iova's that belong to this domain */
 
@@ -456,27 +458,32 @@ static LIST_HEAD(dmar_rmrr_units);
 
 static void flush_unmaps_timeout(unsigned long data);
 
-static DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
+struct deferred_flush_entry {
+       unsigned long iova_pfn;
+       unsigned long nrpages;
+       struct dmar_domain *domain;
+       struct page *freelist;
+};
 
 #define HIGH_WATER_MARK 250
-struct deferred_flush_tables {
+struct deferred_flush_table {
        int next;
-       struct iova *iova[HIGH_WATER_MARK];
-       struct dmar_domain *domain[HIGH_WATER_MARK];
-       struct page *freelist[HIGH_WATER_MARK];
+       struct deferred_flush_entry entries[HIGH_WATER_MARK];
+};
+
+struct deferred_flush_data {
+       spinlock_t lock;
+       int timer_on;
+       struct timer_list timer;
+       long size;
+       struct deferred_flush_table *tables;
 };
 
-static struct deferred_flush_tables *deferred_flush;
+DEFINE_PER_CPU(struct deferred_flush_data, deferred_flush);
 
 /* bitmap for indexing intel_iommus */
 static int g_num_of_iommus;
 
-static DEFINE_SPINLOCK(async_umap_flush_lock);
-static LIST_HEAD(unmaps_to_do);
-
-static int timer_on;
-static long list_size;
-
 static void domain_exit(struct dmar_domain *domain);
 static void domain_remove_dev_info(struct dmar_domain *domain);
 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
@@ -1458,10 +1465,35 @@ iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
        return NULL;
 }
 
+static void domain_update_iotlb(struct dmar_domain *domain)
+{
+       struct device_domain_info *info;
+       bool has_iotlb_device = false;
+
+       assert_spin_locked(&device_domain_lock);
+
+       list_for_each_entry(info, &domain->devices, link) {
+               struct pci_dev *pdev;
+
+               if (!info->dev || !dev_is_pci(info->dev))
+                       continue;
+
+               pdev = to_pci_dev(info->dev);
+               if (pdev->ats_enabled) {
+                       has_iotlb_device = true;
+                       break;
+               }
+       }
+
+       domain->has_iotlb_device = has_iotlb_device;
+}
+
 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
 {
        struct pci_dev *pdev;
 
+       assert_spin_locked(&device_domain_lock);
+
        if (!info || !dev_is_pci(info->dev))
                return;
 
@@ -1481,6 +1513,7 @@ static void iommu_enable_dev_iotlb(struct device_domain_info *info)
 #endif
        if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
                info->ats_enabled = 1;
+               domain_update_iotlb(info->domain);
                info->ats_qdep = pci_ats_queue_depth(pdev);
        }
 }
@@ -1489,6 +1522,8 @@ static void iommu_disable_dev_iotlb(struct device_domain_info *info)
 {
        struct pci_dev *pdev;
 
+       assert_spin_locked(&device_domain_lock);
+
        if (!dev_is_pci(info->dev))
                return;
 
@@ -1497,6 +1532,7 @@ static void iommu_disable_dev_iotlb(struct device_domain_info *info)
        if (info->ats_enabled) {
                pci_disable_ats(pdev);
                info->ats_enabled = 0;
+               domain_update_iotlb(info->domain);
        }
 #ifdef CONFIG_INTEL_IOMMU_SVM
        if (info->pri_enabled) {
@@ -1517,6 +1553,9 @@ static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
        unsigned long flags;
        struct device_domain_info *info;
 
+       if (!domain->has_iotlb_device)
+               return;
+
        spin_lock_irqsave(&device_domain_lock, flags);
        list_for_each_entry(info, &domain->devices, link) {
                if (!info->ats_enabled)
@@ -1734,6 +1773,7 @@ static struct dmar_domain *alloc_domain(int flags)
        memset(domain, 0, sizeof(*domain));
        domain->nid = -1;
        domain->flags = flags;
+       domain->has_iotlb_device = false;
        INIT_LIST_HEAD(&domain->devices);
 
        return domain;
@@ -1918,8 +1958,12 @@ static void domain_exit(struct dmar_domain *domain)
                return;
 
        /* Flush any lazy unmaps that may reference this domain */
-       if (!intel_iommu_strict)
-               flush_unmaps_timeout(0);
+       if (!intel_iommu_strict) {
+               int cpu;
+
+               for_each_possible_cpu(cpu)
+                       flush_unmaps_timeout(cpu);
+       }
 
        /* Remove associated devices and clear attached or cached domains */
        rcu_read_lock();
@@ -3077,7 +3121,7 @@ static int __init init_dmars(void)
        bool copied_tables = false;
        struct device *dev;
        struct intel_iommu *iommu;
-       int i, ret;
+       int i, ret, cpu;
 
        /*
         * for each drhd
@@ -3110,11 +3154,20 @@ static int __init init_dmars(void)
                goto error;
        }
 
-       deferred_flush = kzalloc(g_num_of_iommus *
-               sizeof(struct deferred_flush_tables), GFP_KERNEL);
-       if (!deferred_flush) {
-               ret = -ENOMEM;
-               goto free_g_iommus;
+       for_each_possible_cpu(cpu) {
+               struct deferred_flush_data *dfd = per_cpu_ptr(&deferred_flush,
+                                                             cpu);
+
+               dfd->tables = kzalloc(g_num_of_iommus *
+                                     sizeof(struct deferred_flush_table),
+                                     GFP_KERNEL);
+               if (!dfd->tables) {
+                       ret = -ENOMEM;
+                       goto free_g_iommus;
+               }
+
+               spin_lock_init(&dfd->lock);
+               setup_timer(&dfd->timer, flush_unmaps_timeout, cpu);
        }
 
        for_each_active_iommu(iommu, drhd) {
@@ -3291,19 +3344,20 @@ free_iommu:
                disable_dmar_iommu(iommu);
                free_dmar_iommu(iommu);
        }
-       kfree(deferred_flush);
 free_g_iommus:
+       for_each_possible_cpu(cpu)
+               kfree(per_cpu_ptr(&deferred_flush, cpu)->tables);
        kfree(g_iommus);
 error:
        return ret;
 }
 
 /* This takes a number of _MM_ pages, not VTD pages */
-static struct iova *intel_alloc_iova(struct device *dev,
+static unsigned long intel_alloc_iova(struct device *dev,
                                     struct dmar_domain *domain,
                                     unsigned long nrpages, uint64_t dma_mask)
 {
-       struct iova *iova = NULL;
+       unsigned long iova_pfn = 0;
 
        /* Restrict dma_mask to the width that the iommu can handle */
        dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
@@ -3316,19 +3370,19 @@ static struct iova *intel_alloc_iova(struct device *dev,
                 * DMA_BIT_MASK(32) and if that fails then try allocating
                 * from higher range
                 */
-               iova = alloc_iova(&domain->iovad, nrpages,
-                                 IOVA_PFN(DMA_BIT_MASK(32)), 1);
-               if (iova)
-                       return iova;
+               iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
+                                          IOVA_PFN(DMA_BIT_MASK(32)));
+               if (iova_pfn)
+                       return iova_pfn;
        }
-       iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
-       if (unlikely(!iova)) {
+       iova_pfn = alloc_iova_fast(&domain->iovad, nrpages, IOVA_PFN(dma_mask));
+       if (unlikely(!iova_pfn)) {
                pr_err("Allocating %ld-page iova for %s failed",
                       nrpages, dev_name(dev));
-               return NULL;
+               return 0;
        }
 
-       return iova;
+       return iova_pfn;
 }
 
 static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
@@ -3426,7 +3480,7 @@ static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
 {
        struct dmar_domain *domain;
        phys_addr_t start_paddr;
-       struct iova *iova;
+       unsigned long iova_pfn;
        int prot = 0;
        int ret;
        struct intel_iommu *iommu;
@@ -3444,8 +3498,8 @@ static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
        iommu = domain_get_iommu(domain);
        size = aligned_nrpages(paddr, size);
 
-       iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
-       if (!iova)
+       iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
+       if (!iova_pfn)
                goto error;
 
        /*
@@ -3463,7 +3517,7 @@ static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
         * might have two guest_addr mapping to the same host paddr, but this
         * is not a big problem
         */
-       ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
+       ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
                                 mm_to_dma_pfn(paddr_pfn), size, prot);
        if (ret)
                goto error;
@@ -3471,18 +3525,18 @@ static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
        /* it's a non-present to present mapping. Only flush if caching mode */
        if (cap_caching_mode(iommu->cap))
                iommu_flush_iotlb_psi(iommu, domain,
-                                     mm_to_dma_pfn(iova->pfn_lo),
+                                     mm_to_dma_pfn(iova_pfn),
                                      size, 0, 1);
        else
                iommu_flush_write_buffer(iommu);
 
-       start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
+       start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
        start_paddr += paddr & ~PAGE_MASK;
        return start_paddr;
 
 error:
-       if (iova)
-               __free_iova(&domain->iovad, iova);
+       if (iova_pfn)
+               free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
        pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
                dev_name(dev), size, (unsigned long long)paddr, dir);
        return 0;
@@ -3497,91 +3551,120 @@ static dma_addr_t intel_map_page(struct device *dev, struct page *page,
                                  dir, *dev->dma_mask);
 }
 
-static void flush_unmaps(void)
+static void flush_unmaps(struct deferred_flush_data *flush_data)
 {
        int i, j;
 
-       timer_on = 0;
+       flush_data->timer_on = 0;
 
        /* just flush them all */
        for (i = 0; i < g_num_of_iommus; i++) {
                struct intel_iommu *iommu = g_iommus[i];
+               struct deferred_flush_table *flush_table =
+                               &flush_data->tables[i];
                if (!iommu)
                        continue;
 
-               if (!deferred_flush[i].next)
+               if (!flush_table->next)
                        continue;
 
                /* In caching mode, global flushes turn emulation expensive */
                if (!cap_caching_mode(iommu->cap))
                        iommu->flush.flush_iotlb(iommu, 0, 0, 0,
                                         DMA_TLB_GLOBAL_FLUSH);
-               for (j = 0; j < deferred_flush[i].next; j++) {
+               for (j = 0; j < flush_table->next; j++) {
                        unsigned long mask;
-                       struct iova *iova = deferred_flush[i].iova[j];
-                       struct dmar_domain *domain = deferred_flush[i].domain[j];
+                       struct deferred_flush_entry *entry =
+                                               &flush_table->entries[j];
+                       unsigned long iova_pfn = entry->iova_pfn;
+                       unsigned long nrpages = entry->nrpages;
+                       struct dmar_domain *domain = entry->domain;
+                       struct page *freelist = entry->freelist;
 
                        /* On real hardware multiple invalidations are expensive */
                        if (cap_caching_mode(iommu->cap))
                                iommu_flush_iotlb_psi(iommu, domain,
-                                       iova->pfn_lo, iova_size(iova),
-                                       !deferred_flush[i].freelist[j], 0);
+                                       mm_to_dma_pfn(iova_pfn),
+                                       nrpages, !freelist, 0);
                        else {
-                               mask = ilog2(mm_to_dma_pfn(iova_size(iova)));
-                               iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
-                                               (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
+                               mask = ilog2(nrpages);
+                               iommu_flush_dev_iotlb(domain,
+                                               (uint64_t)iova_pfn << PAGE_SHIFT, mask);
                        }
-                       __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
-                       if (deferred_flush[i].freelist[j])
-                               dma_free_pagelist(deferred_flush[i].freelist[j]);
+                       free_iova_fast(&domain->iovad, iova_pfn, nrpages);
+                       if (freelist)
+                               dma_free_pagelist(freelist);
                }
-               deferred_flush[i].next = 0;
+               flush_table->next = 0;
        }
 
-       list_size = 0;
+       flush_data->size = 0;
 }
 
-static void flush_unmaps_timeout(unsigned long data)
+static void flush_unmaps_timeout(unsigned long cpuid)
 {
+       struct deferred_flush_data *flush_data = per_cpu_ptr(&deferred_flush, cpuid);
        unsigned long flags;
 
-       spin_lock_irqsave(&async_umap_flush_lock, flags);
-       flush_unmaps();
-       spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+       spin_lock_irqsave(&flush_data->lock, flags);
+       flush_unmaps(flush_data);
+       spin_unlock_irqrestore(&flush_data->lock, flags);
 }
 
-static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
+static void add_unmap(struct dmar_domain *dom, unsigned long iova_pfn,
+                     unsigned long nrpages, struct page *freelist)
 {
        unsigned long flags;
-       int next, iommu_id;
+       int entry_id, iommu_id;
        struct intel_iommu *iommu;
+       struct deferred_flush_entry *entry;
+       struct deferred_flush_data *flush_data;
+       unsigned int cpuid;
 
-       spin_lock_irqsave(&async_umap_flush_lock, flags);
-       if (list_size == HIGH_WATER_MARK)
-               flush_unmaps();
+       cpuid = get_cpu();
+       flush_data = per_cpu_ptr(&deferred_flush, cpuid);
+
+       /* Flush all CPUs' entries to avoid deferring too much.  If
+        * this becomes a bottleneck, we can flush only this CPU's
+        * entries and rely on the flush timer for the rest.
+        */
+       if (flush_data->size == HIGH_WATER_MARK) {
+               int cpu;
+
+               for_each_online_cpu(cpu)
+                       flush_unmaps_timeout(cpu);
+       }
+
+       spin_lock_irqsave(&flush_data->lock, flags);
 
        iommu = domain_get_iommu(dom);
        iommu_id = iommu->seq_id;
 
-       next = deferred_flush[iommu_id].next;
-       deferred_flush[iommu_id].domain[next] = dom;
-       deferred_flush[iommu_id].iova[next] = iova;
-       deferred_flush[iommu_id].freelist[next] = freelist;
-       deferred_flush[iommu_id].next++;
+       entry_id = flush_data->tables[iommu_id].next;
+       ++(flush_data->tables[iommu_id].next);
 
-       if (!timer_on) {
-               mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
-               timer_on = 1;
+       entry = &flush_data->tables[iommu_id].entries[entry_id];
+       entry->domain = dom;
+       entry->iova_pfn = iova_pfn;
+       entry->nrpages = nrpages;
+       entry->freelist = freelist;
+
+       if (!flush_data->timer_on) {
+               mod_timer(&flush_data->timer, jiffies + msecs_to_jiffies(10));
+               flush_data->timer_on = 1;
        }
-       list_size++;
-       spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+       flush_data->size++;
+       spin_unlock_irqrestore(&flush_data->lock, flags);
+
+       put_cpu();
 }
 
-static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
+static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
 {
        struct dmar_domain *domain;
        unsigned long start_pfn, last_pfn;
-       struct iova *iova;
+       unsigned long nrpages;
+       unsigned long iova_pfn;
        struct intel_iommu *iommu;
        struct page *freelist;
 
@@ -3593,13 +3676,11 @@ static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
 
        iommu = domain_get_iommu(domain);
 
-       iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
-       if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
-                     (unsigned long long)dev_addr))
-               return;
+       iova_pfn = IOVA_PFN(dev_addr);
 
-       start_pfn = mm_to_dma_pfn(iova->pfn_lo);
-       last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
+       nrpages = aligned_nrpages(dev_addr, size);
+       start_pfn = mm_to_dma_pfn(iova_pfn);
+       last_pfn = start_pfn + nrpages - 1;
 
        pr_debug("Device %s unmapping: pfn %lx-%lx\n",
                 dev_name(dev), start_pfn, last_pfn);
@@ -3608,12 +3689,12 @@ static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
 
        if (intel_iommu_strict) {
                iommu_flush_iotlb_psi(iommu, domain, start_pfn,
-                                     last_pfn - start_pfn + 1, !freelist, 0);
+                                     nrpages, !freelist, 0);
                /* free iova */
-               __free_iova(&domain->iovad, iova);
+               free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
                dma_free_pagelist(freelist);
        } else {
-               add_unmap(domain, iova, freelist);
+               add_unmap(domain, iova_pfn, nrpages, freelist);
                /*
                 * queue up the release of the unmap to save the 1/6th of the
                 * cpu used up by the iotlb flush operation...
@@ -3625,7 +3706,7 @@ static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
                             size_t size, enum dma_data_direction dir,
                             struct dma_attrs *attrs)
 {
-       intel_unmap(dev, dev_addr);
+       intel_unmap(dev, dev_addr, size);
 }
 
 static void *intel_alloc_coherent(struct device *dev, size_t size,
@@ -3684,7 +3765,7 @@ static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
        size = PAGE_ALIGN(size);
        order = get_order(size);
 
-       intel_unmap(dev, dma_handle);
+       intel_unmap(dev, dma_handle, size);
        if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
                __free_pages(page, order);
 }
@@ -3693,7 +3774,16 @@ static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
                           int nelems, enum dma_data_direction dir,
                           struct dma_attrs *attrs)
 {
-       intel_unmap(dev, sglist[0].dma_address);
+       dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
+       unsigned long nrpages = 0;
+       struct scatterlist *sg;
+       int i;
+
+       for_each_sg(sglist, sg, nelems, i) {
+               nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
+       }
+
+       intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
 }
 
 static int intel_nontranslate_map_sg(struct device *hddev,
@@ -3717,7 +3807,7 @@ static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nele
        struct dmar_domain *domain;
        size_t size = 0;
        int prot = 0;
-       struct iova *iova = NULL;
+       unsigned long iova_pfn;
        int ret;
        struct scatterlist *sg;
        unsigned long start_vpfn;
@@ -3736,9 +3826,9 @@ static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nele
        for_each_sg(sglist, sg, nelems, i)
                size += aligned_nrpages(sg->offset, sg->length);
 
-       iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
+       iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
                                *dev->dma_mask);
-       if (!iova) {
+       if (!iova_pfn) {
                sglist->dma_length = 0;
                return 0;
        }
@@ -3753,13 +3843,13 @@ static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nele
        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
                prot |= DMA_PTE_WRITE;
 
-       start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
+       start_vpfn = mm_to_dma_pfn(iova_pfn);
 
        ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
        if (unlikely(ret)) {
                dma_pte_free_pagetable(domain, start_vpfn,
                                       start_vpfn + size - 1);
-               __free_iova(&domain->iovad, iova);
+               free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
                return 0;
        }
 
@@ -4498,6 +4588,46 @@ static struct notifier_block intel_iommu_memory_nb = {
        .priority = 0
 };
 
+static void free_all_cpu_cached_iovas(unsigned int cpu)
+{
+       int i;
+
+       for (i = 0; i < g_num_of_iommus; i++) {
+               struct intel_iommu *iommu = g_iommus[i];
+               struct dmar_domain *domain;
+               u16 did;
+
+               if (!iommu)
+                       continue;
+
+               for (did = 0; did < 0xffff; did++) {
+                       domain = get_iommu_domain(iommu, did);
+
+                       if (!domain)
+                               continue;
+                       free_cpu_cached_iovas(cpu, &domain->iovad);
+               }
+       }
+}
+
+static int intel_iommu_cpu_notifier(struct notifier_block *nfb,
+                                   unsigned long action, void *v)
+{
+       unsigned int cpu = (unsigned long)v;
+
+       switch (action) {
+       case CPU_DEAD:
+       case CPU_DEAD_FROZEN:
+               free_all_cpu_cached_iovas(cpu);
+               flush_unmaps_timeout(cpu);
+               break;
+       }
+       return NOTIFY_OK;
+}
+
+static struct notifier_block intel_iommu_cpu_nb = {
+       .notifier_call = intel_iommu_cpu_notifier,
+};
 
 static ssize_t intel_iommu_show_version(struct device *dev,
                                        struct device_attribute *attr,
@@ -4631,7 +4761,6 @@ int __init intel_iommu_init(void)
        up_write(&dmar_global_lock);
        pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
 
-       init_timer(&unmap_timer);
 #ifdef CONFIG_SWIOTLB
        swiotlb = 0;
 #endif
@@ -4648,6 +4777,7 @@ int __init intel_iommu_init(void)
        bus_register_notifier(&pci_bus_type, &device_nb);
        if (si_domain && !hw_pass_through)
                register_memory_notifier(&intel_iommu_memory_nb);
+       register_hotcpu_notifier(&intel_iommu_cpu_nb);
 
        intel_iommu_enabled = 1;
 
index fa0adef..ba764a0 100644 (file)
 #include <linux/iova.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/bitops.h>
+
+static bool iova_rcache_insert(struct iova_domain *iovad,
+                              unsigned long pfn,
+                              unsigned long size);
+static unsigned long iova_rcache_get(struct iova_domain *iovad,
+                                    unsigned long size,
+                                    unsigned long limit_pfn);
+static void init_iova_rcaches(struct iova_domain *iovad);
+static void free_iova_rcaches(struct iova_domain *iovad);
 
 void
 init_iova_domain(struct iova_domain *iovad, unsigned long granule,
@@ -38,6 +49,7 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule,
        iovad->granule = granule;
        iovad->start_pfn = start_pfn;
        iovad->dma_32bit_pfn = pfn_32bit;
+       init_iova_rcaches(iovad);
 }
 EXPORT_SYMBOL_GPL(init_iova_domain);
 
@@ -291,33 +303,18 @@ alloc_iova(struct iova_domain *iovad, unsigned long size,
 }
 EXPORT_SYMBOL_GPL(alloc_iova);
 
-/**
- * find_iova - find's an iova for a given pfn
- * @iovad: - iova domain in question.
- * @pfn: - page frame number
- * This function finds and returns an iova belonging to the
- * given doamin which matches the given pfn.
- */
-struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn)
+static struct iova *
+private_find_iova(struct iova_domain *iovad, unsigned long pfn)
 {
-       unsigned long flags;
-       struct rb_node *node;
+       struct rb_node *node = iovad->rbroot.rb_node;
+
+       assert_spin_locked(&iovad->iova_rbtree_lock);
 
-       /* Take the lock so that no other thread is manipulating the rbtree */
-       spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
-       node = iovad->rbroot.rb_node;
        while (node) {
                struct iova *iova = container_of(node, struct iova, node);
 
                /* If pfn falls within iova's range, return iova */
                if ((pfn >= iova->pfn_lo) && (pfn <= iova->pfn_hi)) {
-                       spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
-                       /* We are not holding the lock while this iova
-                        * is referenced by the caller as the same thread
-                        * which called this function also calls __free_iova()
-                        * and it is by design that only one thread can possibly
-                        * reference a particular iova and hence no conflict.
-                        */
                        return iova;
                }
 
@@ -327,9 +324,35 @@ struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn)
                        node = node->rb_right;
        }
 
-       spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
        return NULL;
 }
+
+static void private_free_iova(struct iova_domain *iovad, struct iova *iova)
+{
+       assert_spin_locked(&iovad->iova_rbtree_lock);
+       __cached_rbnode_delete_update(iovad, iova);
+       rb_erase(&iova->node, &iovad->rbroot);
+       free_iova_mem(iova);
+}
+
+/**
+ * find_iova - finds an iova for a given pfn
+ * @iovad: - iova domain in question.
+ * @pfn: - page frame number
+ * This function finds and returns an iova belonging to the
+ * given domain which matches the given pfn.
+ */
+struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn)
+{
+       unsigned long flags;
+       struct iova *iova;
+
+       /* Take the lock so that no other thread is manipulating the rbtree */
+       spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+       iova = private_find_iova(iovad, pfn);
+       spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+       return iova;
+}
 EXPORT_SYMBOL_GPL(find_iova);
 
 /**
@@ -344,10 +367,8 @@ __free_iova(struct iova_domain *iovad, struct iova *iova)
        unsigned long flags;
 
        spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
-       __cached_rbnode_delete_update(iovad, iova);
-       rb_erase(&iova->node, &iovad->rbroot);
+       private_free_iova(iovad, iova);
        spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
-       free_iova_mem(iova);
 }
 EXPORT_SYMBOL_GPL(__free_iova);
 
@@ -369,6 +390,63 @@ free_iova(struct iova_domain *iovad, unsigned long pfn)
 }
 EXPORT_SYMBOL_GPL(free_iova);
 
+/**
+ * alloc_iova_fast - allocates an iova from rcache
+ * @iovad: - iova domain in question
+ * @size: - size of page frames to allocate
+ * @limit_pfn: - max limit address
+ * This function tries to satisfy an iova allocation from the rcache,
+ * and falls back to regular allocation on failure.
+ */
+unsigned long
+alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
+               unsigned long limit_pfn)
+{
+       bool flushed_rcache = false;
+       unsigned long iova_pfn;
+       struct iova *new_iova;
+
+       iova_pfn = iova_rcache_get(iovad, size, limit_pfn);
+       if (iova_pfn)
+               return iova_pfn;
+
+retry:
+       new_iova = alloc_iova(iovad, size, limit_pfn, true);
+       if (!new_iova) {
+               unsigned int cpu;
+
+               if (flushed_rcache)
+                       return 0;
+
+               /* Try replenishing IOVAs by flushing rcache. */
+               flushed_rcache = true;
+               for_each_online_cpu(cpu)
+                       free_cpu_cached_iovas(cpu, iovad);
+               goto retry;
+       }
+
+       return new_iova->pfn_lo;
+}
+EXPORT_SYMBOL_GPL(alloc_iova_fast);
+
+/**
+ * free_iova_fast - free iova pfn range into rcache
+ * @iovad: - iova domain in question.
+ * @pfn: - pfn that is allocated previously
+ * @size: - # of pages in range
+ * This function frees an iova range by trying to put it into the rcache,
+ * falling back to regular iova deallocation via free_iova() if this fails.
+ */
+void
+free_iova_fast(struct iova_domain *iovad, unsigned long pfn, unsigned long size)
+{
+       if (iova_rcache_insert(iovad, pfn, size))
+               return;
+
+       free_iova(iovad, pfn);
+}
+EXPORT_SYMBOL_GPL(free_iova_fast);
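
To illustrate how the two fast-path entry points above are meant to be paired, here is a minimal caller sketch. It is not part of this patch; it assumes an iova_domain already set up with init_iova_domain(), and 'iovad', 'nr_pages' and 'dma_limit' are stand-ins supplied by the surrounding DMA layer.

    /* Illustrative only: allocate and release a cached IOVA range. */
    static dma_addr_t example_alloc(struct iova_domain *iovad,
                                    unsigned long nr_pages,
                                    dma_addr_t dma_limit)
    {
            unsigned long pfn;

            /* Per-CPU rcache first; rbtree allocator (with retry) on a miss. */
            pfn = alloc_iova_fast(iovad, nr_pages,
                                  dma_limit >> iova_shift(iovad));
            if (!pfn)
                    return 0;       /* failed even after flushing all rcaches */

            return (dma_addr_t)pfn << iova_shift(iovad);
    }

    static void example_free(struct iova_domain *iovad, dma_addr_t addr,
                             unsigned long nr_pages)
    {
            /* Goes back into the rcache, or to the rbtree if the cache is full. */
            free_iova_fast(iovad, addr >> iova_shift(iovad), nr_pages);
    }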
+
 /**
  * put_iova_domain - destroys the iova domain
  * @iovad: - iova domain in question.
@@ -379,6 +457,7 @@ void put_iova_domain(struct iova_domain *iovad)
        struct rb_node *node;
        unsigned long flags;
 
+       free_iova_rcaches(iovad);
        spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
        node = rb_first(&iovad->rbroot);
        while (node) {
@@ -550,5 +629,295 @@ error:
        return NULL;
 }
 
+/*
+ * Magazine caches for IOVA ranges.  For an introduction to magazines,
+ * see the USENIX 2001 paper "Magazines and Vmem: Extending the Slab
+ * Allocator to Many CPUs and Arbitrary Resources" by Bonwick and Adams.
+ * For simplicity, we use a static magazine size and don't implement the
+ * dynamic size tuning described in the paper.
+ */
+
+#define IOVA_MAG_SIZE 128
+
+struct iova_magazine {
+       unsigned long size;
+       unsigned long pfns[IOVA_MAG_SIZE];
+};
+
+struct iova_cpu_rcache {
+       spinlock_t lock;
+       struct iova_magazine *loaded;
+       struct iova_magazine *prev;
+};
+
+static struct iova_magazine *iova_magazine_alloc(gfp_t flags)
+{
+       return kzalloc(sizeof(struct iova_magazine), flags);
+}
+
+static void iova_magazine_free(struct iova_magazine *mag)
+{
+       kfree(mag);
+}
+
+static void
+iova_magazine_free_pfns(struct iova_magazine *mag, struct iova_domain *iovad)
+{
+       unsigned long flags;
+       int i;
+
+       if (!mag)
+               return;
+
+       spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+
+       for (i = 0; i < mag->size; ++i) {
+               struct iova *iova = private_find_iova(iovad, mag->pfns[i]);
+
+               BUG_ON(!iova);
+               private_free_iova(iovad, iova);
+       }
+
+       spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+
+       mag->size = 0;
+}
+
+static bool iova_magazine_full(struct iova_magazine *mag)
+{
+       return (mag && mag->size == IOVA_MAG_SIZE);
+}
+
+static bool iova_magazine_empty(struct iova_magazine *mag)
+{
+       return (!mag || mag->size == 0);
+}
+
+static unsigned long iova_magazine_pop(struct iova_magazine *mag,
+                                      unsigned long limit_pfn)
+{
+       BUG_ON(iova_magazine_empty(mag));
+
+       if (mag->pfns[mag->size - 1] >= limit_pfn)
+               return 0;
+
+       return mag->pfns[--mag->size];
+}
+
+static void iova_magazine_push(struct iova_magazine *mag, unsigned long pfn)
+{
+       BUG_ON(iova_magazine_full(mag));
+
+       mag->pfns[mag->size++] = pfn;
+}
+
+static void init_iova_rcaches(struct iova_domain *iovad)
+{
+       struct iova_cpu_rcache *cpu_rcache;
+       struct iova_rcache *rcache;
+       unsigned int cpu;
+       int i;
+
+       for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
+               rcache = &iovad->rcaches[i];
+               spin_lock_init(&rcache->lock);
+               rcache->depot_size = 0;
+               rcache->cpu_rcaches = __alloc_percpu(sizeof(*cpu_rcache), cache_line_size());
+               if (WARN_ON(!rcache->cpu_rcaches))
+                       continue;
+               for_each_possible_cpu(cpu) {
+                       cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
+                       spin_lock_init(&cpu_rcache->lock);
+                       cpu_rcache->loaded = iova_magazine_alloc(GFP_KERNEL);
+                       cpu_rcache->prev = iova_magazine_alloc(GFP_KERNEL);
+               }
+       }
+}
+
+/*
+ * Try inserting IOVA range starting with 'iova_pfn' into 'rcache', and
+ * return true on success.  Can fail if rcache is full and we can't free
+ * space, in which case the caller (free_iova_fast()) returns the IOVA
+ * range to the rbtree via free_iova() instead.
+ */
+static bool __iova_rcache_insert(struct iova_domain *iovad,
+                                struct iova_rcache *rcache,
+                                unsigned long iova_pfn)
+{
+       struct iova_magazine *mag_to_free = NULL;
+       struct iova_cpu_rcache *cpu_rcache;
+       bool can_insert = false;
+       unsigned long flags;
+
+       cpu_rcache = this_cpu_ptr(rcache->cpu_rcaches);
+       spin_lock_irqsave(&cpu_rcache->lock, flags);
+
+       if (!iova_magazine_full(cpu_rcache->loaded)) {
+               can_insert = true;
+       } else if (!iova_magazine_full(cpu_rcache->prev)) {
+               swap(cpu_rcache->prev, cpu_rcache->loaded);
+               can_insert = true;
+       } else {
+               struct iova_magazine *new_mag = iova_magazine_alloc(GFP_ATOMIC);
+
+               if (new_mag) {
+                       spin_lock(&rcache->lock);
+                       if (rcache->depot_size < MAX_GLOBAL_MAGS) {
+                               rcache->depot[rcache->depot_size++] =
+                                               cpu_rcache->loaded;
+                       } else {
+                               mag_to_free = cpu_rcache->loaded;
+                       }
+                       spin_unlock(&rcache->lock);
+
+                       cpu_rcache->loaded = new_mag;
+                       can_insert = true;
+               }
+       }
+
+       if (can_insert)
+               iova_magazine_push(cpu_rcache->loaded, iova_pfn);
+
+       spin_unlock_irqrestore(&cpu_rcache->lock, flags);
+
+       if (mag_to_free) {
+               iova_magazine_free_pfns(mag_to_free, iovad);
+               iova_magazine_free(mag_to_free);
+       }
+
+       return can_insert;
+}
+
+static bool iova_rcache_insert(struct iova_domain *iovad, unsigned long pfn,
+                              unsigned long size)
+{
+       unsigned int log_size = order_base_2(size);
+
+       if (log_size >= IOVA_RANGE_CACHE_MAX_SIZE)
+               return false;
+
+       return __iova_rcache_insert(iovad, &iovad->rcaches[log_size], pfn);
+}
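
As a quick worked example of the bucketing above: order_base_2() rounds the requested size up to the next power of two and takes its log, so a size of 1 lands in rcaches[0], 2 in rcaches[1], 3-4 in rcaches[2], and so on. Any request whose order reaches IOVA_RANGE_CACHE_MAX_SIZE (defined in the iova header, which is not shown in this hunk) bypasses the cache entirely and is handled by the rbtree allocator.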
+
+/*
+ * Caller wants to allocate a new IOVA range from 'rcache'.  If we can
+ * satisfy the request, return a matching pfn (non-zero) and remove
+ * it from the 'rcache'.
+ */
+static unsigned long __iova_rcache_get(struct iova_rcache *rcache,
+                                      unsigned long limit_pfn)
+{
+       struct iova_cpu_rcache *cpu_rcache;
+       unsigned long iova_pfn = 0;
+       bool has_pfn = false;
+       unsigned long flags;
+
+       cpu_rcache = this_cpu_ptr(rcache->cpu_rcaches);
+       spin_lock_irqsave(&cpu_rcache->lock, flags);
+
+       if (!iova_magazine_empty(cpu_rcache->loaded)) {
+               has_pfn = true;
+       } else if (!iova_magazine_empty(cpu_rcache->prev)) {
+               swap(cpu_rcache->prev, cpu_rcache->loaded);
+               has_pfn = true;
+       } else {
+               spin_lock(&rcache->lock);
+               if (rcache->depot_size > 0) {
+                       iova_magazine_free(cpu_rcache->loaded);
+                       cpu_rcache->loaded = rcache->depot[--rcache->depot_size];
+                       has_pfn = true;
+               }
+               spin_unlock(&rcache->lock);
+       }
+
+       if (has_pfn)
+               iova_pfn = iova_magazine_pop(cpu_rcache->loaded, limit_pfn);
+
+       spin_unlock_irqrestore(&cpu_rcache->lock, flags);
+
+       return iova_pfn;
+}
+
+/*
+ * Try to satisfy IOVA allocation range from rcache.  Fail if requested
+ * size is too big or the DMA limit we are given isn't satisfied by the
+ * top element in the magazine.
+ */
+static unsigned long iova_rcache_get(struct iova_domain *iovad,
+                                    unsigned long size,
+                                    unsigned long limit_pfn)
+{
+       unsigned int log_size = order_base_2(size);
+
+       if (log_size >= IOVA_RANGE_CACHE_MAX_SIZE)
+               return 0;
+
+       return __iova_rcache_get(&iovad->rcaches[log_size], limit_pfn);
+}
+
+/*
+ * Free a cpu's rcache.
+ */
+static void free_cpu_iova_rcache(unsigned int cpu, struct iova_domain *iovad,
+                                struct iova_rcache *rcache)
+{
+       struct iova_cpu_rcache *cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
+       unsigned long flags;
+
+       spin_lock_irqsave(&cpu_rcache->lock, flags);
+
+       iova_magazine_free_pfns(cpu_rcache->loaded, iovad);
+       iova_magazine_free(cpu_rcache->loaded);
+
+       iova_magazine_free_pfns(cpu_rcache->prev, iovad);
+       iova_magazine_free(cpu_rcache->prev);
+
+       spin_unlock_irqrestore(&cpu_rcache->lock, flags);
+}
+
+/*
+ * free rcache data structures.
+ */
+static void free_iova_rcaches(struct iova_domain *iovad)
+{
+       struct iova_rcache *rcache;
+       unsigned long flags;
+       unsigned int cpu;
+       int i, j;
+
+       for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
+               rcache = &iovad->rcaches[i];
+               for_each_possible_cpu(cpu)
+                       free_cpu_iova_rcache(cpu, iovad, rcache);
+               spin_lock_irqsave(&rcache->lock, flags);
+               free_percpu(rcache->cpu_rcaches);
+               for (j = 0; j < rcache->depot_size; ++j) {
+                       iova_magazine_free_pfns(rcache->depot[j], iovad);
+                       iova_magazine_free(rcache->depot[j]);
+               }
+               spin_unlock_irqrestore(&rcache->lock, flags);
+       }
+}
+
+/*
+ * free all the IOVA ranges cached by a cpu (used when cpu is unplugged)
+ */
+void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad)
+{
+       struct iova_cpu_rcache *cpu_rcache;
+       struct iova_rcache *rcache;
+       unsigned long flags;
+       int i;
+
+       for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
+               rcache = &iovad->rcaches[i];
+               cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
+               spin_lock_irqsave(&cpu_rcache->lock, flags);
+               iova_magazine_free_pfns(cpu_rcache->loaded, iovad);
+               iova_magazine_free_pfns(cpu_rcache->prev, iovad);
+               spin_unlock_irqrestore(&cpu_rcache->lock, flags);
+       }
+}
+
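This entry point exists for the CPU hotplug path (and is also what alloc_iova_fast() calls when it flushes every CPU's cache after a failed allocation). A hedged sketch of how an IOMMU driver might wire it up with the notifier API available in this kernel; the notifier, its registration and 'example_domain' are illustrative only and not part of this patch.

    #include <linux/cpu.h>
    #include <linux/notifier.h>

    static struct iova_domain example_domain;       /* illustrative only */

    static int iova_cpu_notify(struct notifier_block *nfb,
                               unsigned long action, void *hcpu)
    {
            unsigned int cpu = (unsigned long)hcpu;

            /* Drop the dead CPU's loaded/prev magazines back into the rbtree. */
            if ((action & ~CPU_TASKS_FROZEN) == CPU_DEAD)
                    free_cpu_cached_iovas(cpu, &example_domain);

            return NOTIFY_OK;
    }

    static struct notifier_block iova_cpu_nb = {
            .notifier_call = iova_cpu_notify,
    };

    /* registered once at init time, e.g. register_hotcpu_notifier(&iova_cpu_nb); */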
 MODULE_AUTHOR("Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>");
 MODULE_LICENSE("GPL");
index eb5eb0c..2223b3f 100644 (file)
@@ -182,7 +182,7 @@ static int __init _clps711x_intc_init(struct device_node *np,
        writel_relaxed(0, clps711x_intc->intmr[2]);
 
        err = irq_alloc_descs(-1, 0, ARRAY_SIZE(clps711x_irqs), numa_node_id());
-       if (IS_ERR_VALUE(err))
+       if (err < 0)
                goto out_iounmap;
 
        clps711x_intc->ops.map = clps711x_intc_irq_map;
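
The same conversion recurs throughout this merge: irq_alloc_descs() returns either a valid (non-negative) descriptor base or a negative errno in a plain int, so a direct sign test is the idiomatic check; IS_ERR_VALUE() is intended for error codes packed into unsigned long / pointer-sized values and is easy to misuse on int returns. A minimal sketch of the pattern (only irq_alloc_descs() is taken from this patch; 'nr_irqs' stands in for the driver's own count):

    int irq_base = irq_alloc_descs(-1, 0, nr_irqs, numa_node_id());

    if (irq_base < 0)           /* a negative errno such as -ENOMEM */
            return irq_base;    /* was: if (IS_ERR_VALUE(irq_base)) ... */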
index 6bd881b..5eb1f9e 100644 (file)
@@ -41,6 +41,7 @@
 
 #define ITS_FLAGS_CMDQ_NEEDS_FLUSHING          (1ULL << 0)
 #define ITS_FLAGS_WORKAROUND_CAVIUM_22375      (1ULL << 1)
+#define ITS_FLAGS_WORKAROUND_CAVIUM_23144      (1ULL << 2)
 
 #define RDIST_FLAGS_PROPBASE_NEEDS_FLUSHING    (1 << 0)
 
@@ -82,6 +83,7 @@ struct its_node {
        u64                     flags;
        u32                     ite_size;
        u32                     device_ids;
+       int                     numa_node;
 };
 
 #define ITS_ITT_ALIGN          SZ_256
@@ -613,11 +615,23 @@ static void its_unmask_irq(struct irq_data *d)
 static int its_set_affinity(struct irq_data *d, const struct cpumask *mask_val,
                            bool force)
 {
-       unsigned int cpu = cpumask_any_and(mask_val, cpu_online_mask);
+       unsigned int cpu;
+       const struct cpumask *cpu_mask = cpu_online_mask;
        struct its_device *its_dev = irq_data_get_irq_chip_data(d);
        struct its_collection *target_col;
        u32 id = its_get_event_id(d);
 
+       /* lpi cannot be routed to a redistributor that is on a foreign node */
+       if (its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144) {
+               if (its_dev->its->numa_node >= 0) {
+                       cpu_mask = cpumask_of_node(its_dev->its->numa_node);
+                       if (!cpumask_intersects(mask_val, cpu_mask))
+                               return -EINVAL;
+               }
+       }
+
+       cpu = cpumask_any_and(mask_val, cpu_mask);
+
        if (cpu >= nr_cpu_ids)
                return -EINVAL;
 
@@ -1101,6 +1115,16 @@ static void its_cpu_init_collection(void)
        list_for_each_entry(its, &its_nodes, entry) {
                u64 target;
 
+               /* avoid cross-node collections and their mapping */
+               if (its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144) {
+                       struct device_node *cpu_node;
+
+                       cpu_node = of_get_cpu_node(cpu, NULL);
+                       if (its->numa_node != NUMA_NO_NODE &&
+                               its->numa_node != of_node_to_nid(cpu_node))
+                               continue;
+               }
+
                /*
                 * We now have to bind each collection to its target
                 * redistributor.
@@ -1351,9 +1375,14 @@ static void its_irq_domain_activate(struct irq_domain *domain,
 {
        struct its_device *its_dev = irq_data_get_irq_chip_data(d);
        u32 event = its_get_event_id(d);
+       const struct cpumask *cpu_mask = cpu_online_mask;
+
+       /* get the cpu_mask of local node */
+       if (its_dev->its->numa_node >= 0)
+               cpu_mask = cpumask_of_node(its_dev->its->numa_node);
 
        /* Bind the LPI to the first possible CPU */
-       its_dev->event_map.col_map[event] = cpumask_first(cpu_online_mask);
+       its_dev->event_map.col_map[event] = cpumask_first(cpu_mask);
 
        /* Map the GIC IRQ and event to the device */
        its_send_mapvi(its_dev, d->hwirq, event);
@@ -1443,6 +1472,13 @@ static void __maybe_unused its_enable_quirk_cavium_22375(void *data)
        its->flags |= ITS_FLAGS_WORKAROUND_CAVIUM_22375;
 }
 
+static void __maybe_unused its_enable_quirk_cavium_23144(void *data)
+{
+       struct its_node *its = data;
+
+       its->flags |= ITS_FLAGS_WORKAROUND_CAVIUM_23144;
+}
+
 static const struct gic_quirk its_quirks[] = {
 #ifdef CONFIG_CAVIUM_ERRATUM_22375
        {
@@ -1451,6 +1487,14 @@ static const struct gic_quirk its_quirks[] = {
                .mask   = 0xffff0fff,
                .init   = its_enable_quirk_cavium_22375,
        },
+#endif
+#ifdef CONFIG_CAVIUM_ERRATUM_23144
+       {
+               .desc   = "ITS: Cavium erratum 23144",
+               .iidr   = 0xa100034c,   /* ThunderX pass 1.x */
+               .mask   = 0xffff0fff,
+               .init   = its_enable_quirk_cavium_23144,
+       },
 #endif
        {
        }
@@ -1514,6 +1558,7 @@ static int __init its_probe(struct device_node *node,
        its->base = its_base;
        its->phys_base = res.start;
        its->ite_size = ((readl_relaxed(its_base + GITS_TYPER) >> 4) & 0xf) + 1;
+       its->numa_node = of_node_to_nid(node);
 
        its->cmd_base = kzalloc(ITS_CMD_QUEUE_SZ, GFP_KERNEL);
        if (!its->cmd_base) {
index fb042ba..2c5ba0e 100644 (file)
@@ -155,7 +155,7 @@ static void gic_enable_redist(bool enable)
 
        while (count--) {
                val = readl_relaxed(rbase + GICR_WAKER);
-               if (enable ^ (val & GICR_WAKER_ChildrenAsleep))
+               if (enable ^ (bool)(val & GICR_WAKER_ChildrenAsleep))
                        break;
                cpu_relax();
                udelay(1);
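
To see why the cast matters: GICR_WAKER_ChildrenAsleep is a single bit above bit 0 (bit 2), while 'enable' is a bool. With enable set and the bit still asserted, the old expression evaluated 1 ^ 0x4 == 0x5, which is true, so the poll loop gave up before the redistributor had actually woken; with the cast it evaluates 1 ^ 1 == 0 and keeps waiting until the bit really clears.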
index b4e6471..fbc4ae2 100644 (file)
@@ -1123,7 +1123,7 @@ static int __init __gic_init_bases(struct gic_chip_data *gic, int irq_start,
 
                irq_base = irq_alloc_descs(irq_start, 16, gic_irqs,
                                           numa_node_id());
-               if (IS_ERR_VALUE(irq_base)) {
+               if (irq_base < 0) {
                        WARN(1, "Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
                             irq_start);
                        irq_base = irq_start;
index 9688d2e..9e25d8c 100644 (file)
@@ -402,7 +402,7 @@ hip04_of_init(struct device_node *node, struct device_node *parent)
        nr_irqs -= hwirq_base; /* calculate # of irqs to allocate */
 
        irq_base = irq_alloc_descs(-1, hwirq_base, nr_irqs, numa_node_id());
-       if (IS_ERR_VALUE(irq_base)) {
+       if (irq_base < 0) {
                pr_err("failed to allocate IRQ numbers\n");
                return -EINVAL;
        }
index c089f49..3b5e10a 100644 (file)
@@ -968,7 +968,7 @@ static void __init __gic_init(unsigned long gic_base_addr,
                              unsigned int cpu_vec, unsigned int irqbase,
                              struct device_node *node)
 {
-       unsigned int gicconfig;
+       unsigned int gicconfig, cpu;
        unsigned int v[2];
 
        __gic_base_addr = gic_base_addr;
@@ -985,6 +985,14 @@ static void __init __gic_init(unsigned long gic_base_addr,
        gic_vpes = gic_vpes + 1;
 
        if (cpu_has_veic) {
+               /* Set EIC mode for all VPEs */
+               for_each_present_cpu(cpu) {
+                       gic_write(GIC_REG(VPE_LOCAL, GIC_VPE_OTHER_ADDR),
+                                 mips_cm_vp_id(cpu));
+                       gic_write(GIC_REG(VPE_OTHER, GIC_VPE_CTL),
+                                 GIC_VPE_CTL_EIC_MODE_MSK);
+               }
+
                /* Always use vector 1 in EIC mode */
                gic_cpu_pin = 0;
                timer_cpu_pin = gic_cpu_pin;
index e7155db..73addb4 100644 (file)
@@ -91,7 +91,7 @@ static int pic32_set_type_edge(struct irq_data *data,
        /* set polarity for external interrupts only */
        for (i = 0; i < ARRAY_SIZE(priv->ext_irqs); i++) {
                if (priv->ext_irqs[i] == data->hwirq) {
-                       ret = pic32_set_ext_polarity(i + 1, flow_type);
+                       ret = pic32_set_ext_polarity(i, flow_type);
                        if (ret)
                                return ret;
                }
index 1ccd2ab..1518ba3 100644 (file)
@@ -232,7 +232,7 @@ static int __init shirq_init(struct spear_shirq **shirq_blocks, int block_nr,
                nr_irqs += shirq_blocks[i]->nr_irqs;
 
        virq_base = irq_alloc_descs(-1, 0, nr_irqs, 0);
-       if (IS_ERR_VALUE(virq_base)) {
+       if (virq_base < 0) {
                pr_err("%s: irq desc alloc failed\n", __func__);
                goto err_unmap;
        }
index 4783bac..a9145aa 100644 (file)
@@ -91,6 +91,7 @@ static int led_pwm_add(struct device *dev, struct led_pwm_priv *priv,
                       struct led_pwm *led, struct device_node *child)
 {
        struct led_pwm_data *led_data = &priv->leds[priv->num_leds];
+       struct pwm_args pargs;
        int ret;
 
        led_data->active_low = led->active_low;
@@ -117,7 +118,15 @@ static int led_pwm_add(struct device *dev, struct led_pwm_priv *priv,
        else
                led_data->cdev.brightness_set_blocking = led_pwm_set_blocking;
 
-       led_data->period = pwm_get_period(led_data->pwm);
+       /*
+        * FIXME: pwm_apply_args() should be removed when switching to the
+        * atomic PWM API.
+        */
+       pwm_apply_args(led_data->pwm);
+
+       pwm_get_args(led_data->pwm, &pargs);
+
+       led_data->period = pargs.period;
        if (!led_data->period && (led->pwm_period_ns > 0))
                led_data->period = led->pwm_period_ns;
 
index 8eeab72..ca4abe1 100644 (file)
@@ -64,7 +64,6 @@
 #include "btree.h"
 
 #include <linux/blkdev.h>
-#include <linux/freezer.h>
 #include <linux/kthread.h>
 #include <linux/random.h>
 #include <trace/events/bcache.h>
@@ -288,7 +287,6 @@ do {                                                                        \
                if (kthread_should_stop())                              \
                        return 0;                                       \
                                                                        \
-               try_to_freeze();                                        \
                schedule();                                             \
                mutex_lock(&(ca)->set->bucket_lock);                    \
        }                                                               \
index 22b9e34..eab505e 100644 (file)
@@ -27,7 +27,6 @@
 
 #include <linux/slab.h>
 #include <linux/bitops.h>
-#include <linux/freezer.h>
 #include <linux/hash.h>
 #include <linux/kthread.h>
 #include <linux/prefetch.h>
@@ -1787,7 +1786,6 @@ again:
 
                mutex_unlock(&c->bucket_lock);
 
-               try_to_freeze();
                schedule();
        }
 
index b9346cd..6012367 100644 (file)
@@ -12,7 +12,6 @@
 #include "writeback.h"
 
 #include <linux/delay.h>
-#include <linux/freezer.h>
 #include <linux/kthread.h>
 #include <trace/events/bcache.h>
 
@@ -228,7 +227,6 @@ static void read_dirty(struct cached_dev *dc)
         */
 
        while (!kthread_should_stop()) {
-               try_to_freeze();
 
                w = bch_keybuf_next(&dc->writeback_keys);
                if (!w)
@@ -433,7 +431,6 @@ static int bch_writeback_thread(void *arg)
                        if (kthread_should_stop())
                                return 0;
 
-                       try_to_freeze();
                        schedule();
                        continue;
                }
index 9e1731c..e191e29 100644 (file)
@@ -95,7 +95,7 @@ static int adp1653_get_fault(struct adp1653_flash *flash)
        int rval;
 
        fault = i2c_smbus_read_byte_data(client, ADP1653_REG_FAULT);
-       if (IS_ERR_VALUE(fault))
+       if (fault < 0)
                return fault;
 
        flash->fault |= fault;
@@ -105,13 +105,13 @@ static int adp1653_get_fault(struct adp1653_flash *flash)
 
        /* Clear faults. */
        rval = i2c_smbus_write_byte_data(client, ADP1653_REG_OUT_SEL, 0);
-       if (IS_ERR_VALUE(rval))
+       if (rval < 0)
                return rval;
 
        flash->led_mode->val = V4L2_FLASH_LED_MODE_NONE;
 
        rval = adp1653_update_hw(flash);
-       if (IS_ERR_VALUE(rval))
+       if (rval)
                return rval;
 
        return flash->fault;
@@ -158,7 +158,7 @@ static int adp1653_get_ctrl(struct v4l2_ctrl *ctrl)
        int rval;
 
        rval = adp1653_get_fault(flash);
-       if (IS_ERR_VALUE(rval))
+       if (rval)
                return rval;
 
        ctrl->cur.val = 0;
@@ -184,7 +184,7 @@ static int adp1653_set_ctrl(struct v4l2_ctrl *ctrl)
        int rval;
 
        rval = adp1653_get_fault(flash);
-       if (IS_ERR_VALUE(rval))
+       if (rval)
                return rval;
        if ((rval & (ADP1653_REG_FAULT_FLT_SCP |
                     ADP1653_REG_FAULT_FLT_OT |
index 5ef6777..8a5d194 100644 (file)
@@ -146,7 +146,7 @@ int mxr_power_get(struct mxr_device *mdev)
 
        /* returning 1 means that power is already enabled,
         * so zero is returned on success */
-       if (IS_ERR_VALUE(ret))
+       if (ret < 0)
                return ret;
        return 0;
 }
index 95a7388..09e0f58 100644 (file)
@@ -398,6 +398,8 @@ error:
 }
 
 #define AF9015_EEPROM_SIZE 256
+/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
+#define GOLDEN_RATIO_PRIME_32 0x9e370001UL
 
 /* hash (and dump) eeprom */
 static int af9015_eeprom_hash(struct dvb_usb_device *d)
index c61a284..81ddb17 100644 (file)
@@ -51,6 +51,7 @@ config TI_EMIF
 
 config OMAP_GPMC
        bool
+       select GPIOLIB
        help
          This driver is for the General Purpose Memory Controller (GPMC)
          present on Texas Instruments SoCs (e.g. OMAP2+). GPMC allows
index 2a691da..904b4af 100644 (file)
@@ -59,11 +59,11 @@ int fsl_ifc_find(phys_addr_t addr_base)
 {
        int i = 0;
 
-       if (!fsl_ifc_ctrl_dev || !fsl_ifc_ctrl_dev->regs)
+       if (!fsl_ifc_ctrl_dev || !fsl_ifc_ctrl_dev->gregs)
                return -ENODEV;
 
        for (i = 0; i < fsl_ifc_ctrl_dev->banks; i++) {
-               u32 cspr = ifc_in32(&fsl_ifc_ctrl_dev->regs->cspr_cs[i].cspr);
+               u32 cspr = ifc_in32(&fsl_ifc_ctrl_dev->gregs->cspr_cs[i].cspr);
                if (cspr & CSPR_V && (cspr & CSPR_BA) ==
                                convert_ifc_address(addr_base))
                        return i;
@@ -75,7 +75,7 @@ EXPORT_SYMBOL(fsl_ifc_find);
 
 static int fsl_ifc_ctrl_init(struct fsl_ifc_ctrl *ctrl)
 {
-       struct fsl_ifc_regs __iomem *ifc = ctrl->regs;
+       struct fsl_ifc_global __iomem *ifc = ctrl->gregs;
 
        /*
         * Clear all the common status and event registers
@@ -104,7 +104,7 @@ static int fsl_ifc_ctrl_remove(struct platform_device *dev)
        irq_dispose_mapping(ctrl->nand_irq);
        irq_dispose_mapping(ctrl->irq);
 
-       iounmap(ctrl->regs);
+       iounmap(ctrl->gregs);
 
        dev_set_drvdata(&dev->dev, NULL);
        kfree(ctrl);
@@ -122,7 +122,7 @@ static DEFINE_SPINLOCK(nand_irq_lock);
 
 static u32 check_nand_stat(struct fsl_ifc_ctrl *ctrl)
 {
-       struct fsl_ifc_regs __iomem *ifc = ctrl->regs;
+       struct fsl_ifc_runtime __iomem *ifc = ctrl->rregs;
        unsigned long flags;
        u32 stat;
 
@@ -157,7 +157,7 @@ static irqreturn_t fsl_ifc_nand_irq(int irqno, void *data)
 static irqreturn_t fsl_ifc_ctrl_irq(int irqno, void *data)
 {
        struct fsl_ifc_ctrl *ctrl = data;
-       struct fsl_ifc_regs __iomem *ifc = ctrl->regs;
+       struct fsl_ifc_global __iomem *ifc = ctrl->gregs;
        u32 err_axiid, err_srcid, status, cs_err, err_addr;
        irqreturn_t ret = IRQ_NONE;
 
@@ -215,6 +215,7 @@ static int fsl_ifc_ctrl_probe(struct platform_device *dev)
 {
        int ret = 0;
        int version, banks;
+       void __iomem *addr;
 
        dev_info(&dev->dev, "Freescale Integrated Flash Controller\n");
 
@@ -225,22 +226,13 @@ static int fsl_ifc_ctrl_probe(struct platform_device *dev)
        dev_set_drvdata(&dev->dev, fsl_ifc_ctrl_dev);
 
        /* IOMAP the entire IFC region */
-       fsl_ifc_ctrl_dev->regs = of_iomap(dev->dev.of_node, 0);
-       if (!fsl_ifc_ctrl_dev->regs) {
+       fsl_ifc_ctrl_dev->gregs = of_iomap(dev->dev.of_node, 0);
+       if (!fsl_ifc_ctrl_dev->gregs) {
                dev_err(&dev->dev, "failed to get memory region\n");
                ret = -ENODEV;
                goto err;
        }
 
-       version = ifc_in32(&fsl_ifc_ctrl_dev->regs->ifc_rev) &
-                       FSL_IFC_VERSION_MASK;
-       banks = (version == FSL_IFC_VERSION_1_0_0) ? 4 : 8;
-       dev_info(&dev->dev, "IFC version %d.%d, %d banks\n",
-               version >> 24, (version >> 16) & 0xf, banks);
-
-       fsl_ifc_ctrl_dev->version = version;
-       fsl_ifc_ctrl_dev->banks = banks;
-
        if (of_property_read_bool(dev->dev.of_node, "little-endian")) {
                fsl_ifc_ctrl_dev->little_endian = true;
                dev_dbg(&dev->dev, "IFC REGISTERS are LITTLE endian\n");
@@ -249,8 +241,9 @@ static int fsl_ifc_ctrl_probe(struct platform_device *dev)
                dev_dbg(&dev->dev, "IFC REGISTERS are BIG endian\n");
        }
 
-       version = ioread32be(&fsl_ifc_ctrl_dev->regs->ifc_rev) &
+       version = ifc_in32(&fsl_ifc_ctrl_dev->gregs->ifc_rev) &
                        FSL_IFC_VERSION_MASK;
+
        banks = (version == FSL_IFC_VERSION_1_0_0) ? 4 : 8;
        dev_info(&dev->dev, "IFC version %d.%d, %d banks\n",
                version >> 24, (version >> 16) & 0xf, banks);
@@ -258,6 +251,13 @@ static int fsl_ifc_ctrl_probe(struct platform_device *dev)
        fsl_ifc_ctrl_dev->version = version;
        fsl_ifc_ctrl_dev->banks = banks;
 
+       addr = fsl_ifc_ctrl_dev->gregs;
+       if (version >= FSL_IFC_VERSION_2_0_0)
+               addr += PGOFFSET_64K;
+       else
+               addr += PGOFFSET_4K;
+       fsl_ifc_ctrl_dev->rregs = addr;
+
        /* get the Controller level irq */
        fsl_ifc_ctrl_dev->irq = irq_of_parse_and_map(dev->dev.of_node, 0);
        if (fsl_ifc_ctrl_dev->irq == 0) {
index 21825dd..af4884b 100644 (file)
 #include <linux/spinlock.h>
 #include <linux/io.h>
 #include <linux/module.h>
+#include <linux/gpio/driver.h>
 #include <linux/interrupt.h>
+#include <linux/irqdomain.h>
 #include <linux/platform_device.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
-#include <linux/of_mtd.h>
 #include <linux/of_device.h>
 #include <linux/of_platform.h>
 #include <linux/omap-gpmc.h>
-#include <linux/mtd/nand.h>
 #include <linux/pm_runtime.h>
 
 #include <linux/platform_data/mtd-nand-omap2.h>
@@ -81,6 +81,8 @@
 
 #define GPMC_CONFIG_LIMITEDADDRESS             BIT(1)
 
+#define GPMC_STATUS_EMPTYWRITEBUFFERSTATUS     BIT(0)
+
 #define        GPMC_CONFIG2_CSEXTRADELAY               BIT(7)
 #define        GPMC_CONFIG3_ADVEXTRADELAY              BIT(7)
 #define        GPMC_CONFIG4_OEEXTRADELAY               BIT(7)
 #define GPMC_CS_SIZE           0x30
 #define        GPMC_BCH_SIZE           0x10
 
+/*
+ * The first 1MB of GPMC address space is typically mapped to
+ * the internal ROM. Never allocate the first page, to
+ * facilitate bug detection, even if we didn't boot from ROM.
+ * As the GPMC minimum partition size is 16MB, we can only start
+ * from there.
+ */
+#define GPMC_MEM_START         0x1000000
 #define GPMC_MEM_END           0x3FFFFFFF
 
 #define GPMC_CHUNK_SHIFT       24              /* 16 MB */
 #define GPMC_CONFIG_RDY_BSY    0x00000001
 #define GPMC_CONFIG_DEV_SIZE   0x00000002
 #define GPMC_CONFIG_DEV_TYPE   0x00000003
-#define GPMC_SET_IRQ_STATUS    0x00000004
 
 #define GPMC_CONFIG1_WRAPBURST_SUPP     (1 << 31)
 #define GPMC_CONFIG1_READMULTIPLE_SUPP  (1 << 30)
 #define GPMC_CONFIG_WRITEPROTECT       0x00000010
 #define WR_RD_PIN_MONITORING           0x00600000
 
-#define GPMC_ENABLE_IRQ                0x0000000d
-
 /* ECC commands */
 #define GPMC_ECC_READ          0 /* Reset Hardware ECC for read */
 #define GPMC_ECC_WRITE         1 /* Reset Hardware ECC for write */
 #define GPMC_ECC_READSYN       2 /* Reset before syndrom is read back */
 
-/* XXX: Only NAND irq has been considered,currently these are the only ones used
- */
-#define        GPMC_NR_IRQ             2
+#define        GPMC_NR_NAND_IRQS       2 /* number of NAND specific IRQs */
 
 enum gpmc_clk_domain {
        GPMC_CD_FCLK,
@@ -199,11 +204,6 @@ struct gpmc_cs_data {
        struct resource mem;
 };
 
-struct gpmc_client_irq {
-       unsigned                irq;
-       u32                     bitmask;
-};
-
 /* Structure to save gpmc cs context */
 struct gpmc_cs_config {
        u32 config1;
@@ -231,9 +231,15 @@ struct omap3_gpmc_regs {
        struct gpmc_cs_config cs_context[GPMC_CS_NUM];
 };
 
-static struct gpmc_client_irq gpmc_client_irq[GPMC_NR_IRQ];
-static struct irq_chip gpmc_irq_chip;
-static int gpmc_irq_start;
+struct gpmc_device {
+       struct device *dev;
+       int irq;
+       struct irq_chip irq_chip;
+       struct gpio_chip gpio_chip;
+       int nirqs;
+};
+
+static struct irq_domain *gpmc_irq_domain;
 
 static struct resource gpmc_mem_root;
 static struct gpmc_cs_data gpmc_cs[GPMC_CS_NUM];
@@ -241,8 +247,6 @@ static DEFINE_SPINLOCK(gpmc_mem_lock);
 /* Define chip-selects as reserved by default until probe completes */
 static unsigned int gpmc_cs_num = GPMC_CS_NUM;
 static unsigned int gpmc_nr_waitpins;
-static struct device *gpmc_dev;
-static int gpmc_irq;
 static resource_size_t phys_base, mem_size;
 static unsigned gpmc_capability;
 static void __iomem *gpmc_base;
@@ -1054,14 +1058,6 @@ int gpmc_configure(int cmd, int wval)
        u32 regval;
 
        switch (cmd) {
-       case GPMC_ENABLE_IRQ:
-               gpmc_write_reg(GPMC_IRQENABLE, wval);
-               break;
-
-       case GPMC_SET_IRQ_STATUS:
-               gpmc_write_reg(GPMC_IRQSTATUS, wval);
-               break;
-
        case GPMC_CONFIG_WP:
                regval = gpmc_read_reg(GPMC_CONFIG);
                if (wval)
@@ -1084,7 +1080,7 @@ void gpmc_update_nand_reg(struct gpmc_nand_regs *reg, int cs)
 {
        int i;
 
-       reg->gpmc_status = gpmc_base + GPMC_STATUS;
+       reg->gpmc_status = NULL;        /* deprecated */
        reg->gpmc_nand_command = gpmc_base + GPMC_CS0_OFFSET +
                                GPMC_CS_NAND_COMMAND + GPMC_CS_SIZE * cs;
        reg->gpmc_nand_address = gpmc_base + GPMC_CS0_OFFSET +
@@ -1118,87 +1114,201 @@ void gpmc_update_nand_reg(struct gpmc_nand_regs *reg, int cs)
        }
 }
 
-int gpmc_get_client_irq(unsigned irq_config)
+static bool gpmc_nand_writebuffer_empty(void)
 {
-       int i;
+       if (gpmc_read_reg(GPMC_STATUS) & GPMC_STATUS_EMPTYWRITEBUFFERSTATUS)
+               return true;
 
-       if (hweight32(irq_config) > 1)
+       return false;
+}
+
+static struct gpmc_nand_ops nand_ops = {
+       .nand_writebuffer_empty = gpmc_nand_writebuffer_empty,
+};
+
+/**
+ * gpmc_omap_get_nand_ops - Get the GPMC NAND interface
+ * @reg: the GPMC NAND register map exclusive for NAND use.
+ * @cs: GPMC chip select number on which the NAND sits. The
+ *      register map returned will be specific to this chip select.
+ *
+ * Returns NULL on error e.g. invalid cs.
+ */
+struct gpmc_nand_ops *gpmc_omap_get_nand_ops(struct gpmc_nand_regs *reg, int cs)
+{
+       if (cs >= gpmc_cs_num)
+               return NULL;
+
+       gpmc_update_nand_reg(reg, cs);
+
+       return &nand_ops;
+}
+EXPORT_SYMBOL_GPL(gpmc_omap_get_nand_ops);
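
A hedged sketch of how the new interface is meant to be consumed from the NAND driver side; the chip-select variable and the polling loop are illustrative, not taken from this patch:

    struct gpmc_nand_regs regs;
    struct gpmc_nand_ops *ops;

    ops = gpmc_omap_get_nand_ops(&regs, cs);    /* cs: chip select the NAND sits on */
    if (!ops)
            return -ENODEV;                     /* invalid chip select */

    /* Wait for the GPMC write buffer to drain before issuing the next command. */
    while (!ops->nand_writebuffer_empty())
            cpu_relax();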
+
+int gpmc_get_client_irq(unsigned irq_config)
+{
+       if (!gpmc_irq_domain) {
+               pr_warn("%s called before GPMC IRQ domain available\n",
+                       __func__);
                return 0;
+       }
 
-       for (i = 0; i < GPMC_NR_IRQ; i++)
-               if (gpmc_client_irq[i].bitmask & irq_config)
-                       return gpmc_client_irq[i].irq;
+       /* we restrict this to NAND IRQs only */
+       if (irq_config >= GPMC_NR_NAND_IRQS)
+               return 0;
 
-       return 0;
+       return irq_create_mapping(gpmc_irq_domain, irq_config);
 }
 
-static int gpmc_irq_endis(unsigned irq, bool endis)
+static int gpmc_irq_endis(unsigned long hwirq, bool endis)
 {
-       int i;
        u32 regval;
 
-       for (i = 0; i < GPMC_NR_IRQ; i++)
-               if (irq == gpmc_client_irq[i].irq) {
-                       regval = gpmc_read_reg(GPMC_IRQENABLE);
-                       if (endis)
-                               regval |= gpmc_client_irq[i].bitmask;
-                       else
-                               regval &= ~gpmc_client_irq[i].bitmask;
-                       gpmc_write_reg(GPMC_IRQENABLE, regval);
-                       break;
-               }
+       /* bits GPMC_NR_NAND_IRQS to 8 are reserved */
+       if (hwirq >= GPMC_NR_NAND_IRQS)
+               hwirq += 8 - GPMC_NR_NAND_IRQS;
+
+       regval = gpmc_read_reg(GPMC_IRQENABLE);
+       if (endis)
+               regval |= BIT(hwirq);
+       else
+               regval &= ~BIT(hwirq);
+       gpmc_write_reg(GPMC_IRQENABLE, regval);
 
        return 0;
 }
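
The offset arithmetic above encodes the register layout: the two NAND events occupy the low bits of IRQENABLE/IRQSTATUS, the bits in between are reserved, and the wait-pin edge events start at bit 8, so every non-NAND hwirq is shifted by 8 - GPMC_NR_NAND_IRQS before a register bit is touched. A small illustrative helper (not part of the patch) makes the mapping explicit; e.g. hwirq 2, the first wait pin, maps to register bit 8.

    static unsigned int gpmc_hwirq_to_reg_bit(unsigned long hwirq)
    {
            /* NAND events use the low bits; wait-pin events start at bit 8. */
            if (hwirq >= GPMC_NR_NAND_IRQS)
                    hwirq += 8 - GPMC_NR_NAND_IRQS;

            return hwirq;
    }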
 
 static void gpmc_irq_disable(struct irq_data *p)
 {
-       gpmc_irq_endis(p->irq, false);
+       gpmc_irq_endis(p->hwirq, false);
 }
 
 static void gpmc_irq_enable(struct irq_data *p)
 {
-       gpmc_irq_endis(p->irq, true);
+       gpmc_irq_endis(p->hwirq, true);
 }
 
-static void gpmc_irq_noop(struct irq_data *data) { }
+static void gpmc_irq_mask(struct irq_data *d)
+{
+       gpmc_irq_endis(d->hwirq, false);
+}
 
-static unsigned int gpmc_irq_noop_ret(struct irq_data *data) { return 0; }
+static void gpmc_irq_unmask(struct irq_data *d)
+{
+       gpmc_irq_endis(d->hwirq, true);
+}
 
-static int gpmc_setup_irq(void)
+static void gpmc_irq_edge_config(unsigned long hwirq, bool rising_edge)
 {
-       int i;
        u32 regval;
 
-       if (!gpmc_irq)
+       /* NAND IRQs polarity is not configurable */
+       if (hwirq < GPMC_NR_NAND_IRQS)
+               return;
+
+       /* WAITPIN starts at BIT 8 */
+       hwirq += 8 - GPMC_NR_NAND_IRQS;
+
+       regval = gpmc_read_reg(GPMC_CONFIG);
+       if (rising_edge)
+               regval &= ~BIT(hwirq);
+       else
+               regval |= BIT(hwirq);
+
+       gpmc_write_reg(GPMC_CONFIG, regval);
+}
+
+static void gpmc_irq_ack(struct irq_data *d)
+{
+       unsigned int hwirq = d->hwirq;
+
+       /* skip reserved bits */
+       if (hwirq >= GPMC_NR_NAND_IRQS)
+               hwirq += 8 - GPMC_NR_NAND_IRQS;
+
+       /* Setting bit to 1 clears (or Acks) the interrupt */
+       gpmc_write_reg(GPMC_IRQSTATUS, BIT(hwirq));
+}
+
+static int gpmc_irq_set_type(struct irq_data *d, unsigned int trigger)
+{
+       /* can't set type for NAND IRQs */
+       if (d->hwirq < GPMC_NR_NAND_IRQS)
                return -EINVAL;
 
-       gpmc_irq_start = irq_alloc_descs(-1, 0, GPMC_NR_IRQ, 0);
-       if (gpmc_irq_start < 0) {
-               pr_err("irq_alloc_descs failed\n");
-               return gpmc_irq_start;
+       /* We can support either rising or falling edge at a time */
+       if (trigger == IRQ_TYPE_EDGE_FALLING)
+               gpmc_irq_edge_config(d->hwirq, false);
+       else if (trigger == IRQ_TYPE_EDGE_RISING)
+               gpmc_irq_edge_config(d->hwirq, true);
+       else
+               return -EINVAL;
+
+       return 0;
+}
+
+static int gpmc_irq_map(struct irq_domain *d, unsigned int virq,
+                       irq_hw_number_t hw)
+{
+       struct gpmc_device *gpmc = d->host_data;
+
+       irq_set_chip_data(virq, gpmc);
+       if (hw < GPMC_NR_NAND_IRQS) {
+               irq_modify_status(virq, IRQ_NOREQUEST, IRQ_NOAUTOEN);
+               irq_set_chip_and_handler(virq, &gpmc->irq_chip,
+                                        handle_simple_irq);
+       } else {
+               irq_set_chip_and_handler(virq, &gpmc->irq_chip,
+                                        handle_edge_irq);
        }
 
-       gpmc_irq_chip.name = "gpmc";
-       gpmc_irq_chip.irq_startup = gpmc_irq_noop_ret;
-       gpmc_irq_chip.irq_enable = gpmc_irq_enable;
-       gpmc_irq_chip.irq_disable = gpmc_irq_disable;
-       gpmc_irq_chip.irq_shutdown = gpmc_irq_noop;
-       gpmc_irq_chip.irq_ack = gpmc_irq_noop;
-       gpmc_irq_chip.irq_mask = gpmc_irq_noop;
-       gpmc_irq_chip.irq_unmask = gpmc_irq_noop;
-
-       gpmc_client_irq[0].bitmask = GPMC_IRQ_FIFOEVENTENABLE;
-       gpmc_client_irq[1].bitmask = GPMC_IRQ_COUNT_EVENT;
-
-       for (i = 0; i < GPMC_NR_IRQ; i++) {
-               gpmc_client_irq[i].irq = gpmc_irq_start + i;
-               irq_set_chip_and_handler(gpmc_client_irq[i].irq,
-                                       &gpmc_irq_chip, handle_simple_irq);
-               irq_modify_status(gpmc_client_irq[i].irq, IRQ_NOREQUEST,
-                                 IRQ_NOAUTOEN);
+       return 0;
+}
+
+static const struct irq_domain_ops gpmc_irq_domain_ops = {
+       .map    = gpmc_irq_map,
+       .xlate  = irq_domain_xlate_twocell,
+};
+
+static irqreturn_t gpmc_handle_irq(int irq, void *data)
+{
+       int hwirq, virq;
+       u32 regval, regvalx;
+       struct gpmc_device *gpmc = data;
+
+       regval = gpmc_read_reg(GPMC_IRQSTATUS);
+       regvalx = regval;
+
+       if (!regval)
+               return IRQ_NONE;
+
+       for (hwirq = 0; hwirq < gpmc->nirqs; hwirq++) {
+               /* skip reserved status bits */
+               if (hwirq == GPMC_NR_NAND_IRQS)
+                       regvalx >>= 8 - GPMC_NR_NAND_IRQS;
+
+               if (regvalx & BIT(hwirq)) {
+                       virq = irq_find_mapping(gpmc_irq_domain, hwirq);
+                       if (!virq) {
+                               dev_warn(gpmc->dev,
+                                        "spurious irq detected hwirq %d, virq %d\n",
+                                        hwirq, virq);
+                       }
+
+                       generic_handle_irq(virq);
+               }
        }
 
+       gpmc_write_reg(GPMC_IRQSTATUS, regval);
+
+       return IRQ_HANDLED;
+}
+
+static int gpmc_setup_irq(struct gpmc_device *gpmc)
+{
+       u32 regval;
+       int rc;
+
        /* Disable interrupts */
        gpmc_write_reg(GPMC_IRQENABLE, 0);
 
@@ -1206,22 +1316,45 @@ static int gpmc_setup_irq(void)
        regval = gpmc_read_reg(GPMC_IRQSTATUS);
        gpmc_write_reg(GPMC_IRQSTATUS, regval);
 
-       return request_irq(gpmc_irq, gpmc_handle_irq, 0, "gpmc", NULL);
+       gpmc->irq_chip.name = "gpmc";
+       gpmc->irq_chip.irq_enable = gpmc_irq_enable;
+       gpmc->irq_chip.irq_disable = gpmc_irq_disable;
+       gpmc->irq_chip.irq_ack = gpmc_irq_ack;
+       gpmc->irq_chip.irq_mask = gpmc_irq_mask;
+       gpmc->irq_chip.irq_unmask = gpmc_irq_unmask;
+       gpmc->irq_chip.irq_set_type = gpmc_irq_set_type;
+
+       gpmc_irq_domain = irq_domain_add_linear(gpmc->dev->of_node,
+                                               gpmc->nirqs,
+                                               &gpmc_irq_domain_ops,
+                                               gpmc);
+       if (!gpmc_irq_domain) {
+               dev_err(gpmc->dev, "IRQ domain add failed\n");
+               return -ENODEV;
+       }
+
+       rc = request_irq(gpmc->irq, gpmc_handle_irq, 0, "gpmc", gpmc);
+       if (rc) {
+               dev_err(gpmc->dev, "failed to request irq %d: %d\n",
+                       gpmc->irq, rc);
+               irq_domain_remove(gpmc_irq_domain);
+               gpmc_irq_domain = NULL;
+       }
+
+       return rc;
 }
 
-static int gpmc_free_irq(void)
+static int gpmc_free_irq(struct gpmc_device *gpmc)
 {
-       int i;
+       int hwirq;
 
-       if (gpmc_irq)
-               free_irq(gpmc_irq, NULL);
+       free_irq(gpmc->irq, gpmc);
 
-       for (i = 0; i < GPMC_NR_IRQ; i++) {
-               irq_set_handler(gpmc_client_irq[i].irq, NULL);
-               irq_set_chip(gpmc_client_irq[i].irq, &no_irq_chip);
-       }
+       for (hwirq = 0; hwirq < gpmc->nirqs; hwirq++)
+               irq_dispose_mapping(irq_find_mapping(gpmc_irq_domain, hwirq));
 
-       irq_free_descs(gpmc_irq_start, GPMC_NR_IRQ);
+       irq_domain_remove(gpmc_irq_domain);
+       gpmc_irq_domain = NULL;
 
        return 0;
 }
@@ -1242,12 +1375,7 @@ static void gpmc_mem_init(void)
 {
        int cs;
 
-       /*
-        * The first 1MB of GPMC address space is typically mapped to
-        * the internal ROM. Never allocate the first page, to
-        * facilitate bug detection; even if we didn't boot from ROM.
-        */
-       gpmc_mem_root.start = SZ_1M;
+       gpmc_mem_root.start = GPMC_MEM_START;
        gpmc_mem_root.end = GPMC_MEM_END;
 
        /* Reserve all regions that has been set up by bootloader */
@@ -1796,105 +1924,6 @@ static void __maybe_unused gpmc_read_timings_dt(struct device_node *np,
                of_property_read_bool(np, "gpmc,time-para-granularity");
 }
 
-#if IS_ENABLED(CONFIG_MTD_NAND)
-
-static const char * const nand_xfer_types[] = {
-       [NAND_OMAP_PREFETCH_POLLED]             = "prefetch-polled",
-       [NAND_OMAP_POLLED]                      = "polled",
-       [NAND_OMAP_PREFETCH_DMA]                = "prefetch-dma",
-       [NAND_OMAP_PREFETCH_IRQ]                = "prefetch-irq",
-};
-
-static int gpmc_probe_nand_child(struct platform_device *pdev,
-                                struct device_node *child)
-{
-       u32 val;
-       const char *s;
-       struct gpmc_timings gpmc_t;
-       struct omap_nand_platform_data *gpmc_nand_data;
-
-       if (of_property_read_u32(child, "reg", &val) < 0) {
-               dev_err(&pdev->dev, "%s has no 'reg' property\n",
-                       child->full_name);
-               return -ENODEV;
-       }
-
-       gpmc_nand_data = devm_kzalloc(&pdev->dev, sizeof(*gpmc_nand_data),
-                                     GFP_KERNEL);
-       if (!gpmc_nand_data)
-               return -ENOMEM;
-
-       gpmc_nand_data->cs = val;
-       gpmc_nand_data->of_node = child;
-
-       /* Detect availability of ELM module */
-       gpmc_nand_data->elm_of_node = of_parse_phandle(child, "ti,elm-id", 0);
-       if (gpmc_nand_data->elm_of_node == NULL)
-               gpmc_nand_data->elm_of_node =
-                                       of_parse_phandle(child, "elm_id", 0);
-
-       /* select ecc-scheme for NAND */
-       if (of_property_read_string(child, "ti,nand-ecc-opt", &s)) {
-               pr_err("%s: ti,nand-ecc-opt not found\n", __func__);
-               return -ENODEV;
-       }
-
-       if (!strcmp(s, "sw"))
-               gpmc_nand_data->ecc_opt = OMAP_ECC_HAM1_CODE_SW;
-       else if (!strcmp(s, "ham1") ||
-                !strcmp(s, "hw") || !strcmp(s, "hw-romcode"))
-               gpmc_nand_data->ecc_opt =
-                               OMAP_ECC_HAM1_CODE_HW;
-       else if (!strcmp(s, "bch4"))
-               if (gpmc_nand_data->elm_of_node)
-                       gpmc_nand_data->ecc_opt =
-                               OMAP_ECC_BCH4_CODE_HW;
-               else
-                       gpmc_nand_data->ecc_opt =
-                               OMAP_ECC_BCH4_CODE_HW_DETECTION_SW;
-       else if (!strcmp(s, "bch8"))
-               if (gpmc_nand_data->elm_of_node)
-                       gpmc_nand_data->ecc_opt =
-                               OMAP_ECC_BCH8_CODE_HW;
-               else
-                       gpmc_nand_data->ecc_opt =
-                               OMAP_ECC_BCH8_CODE_HW_DETECTION_SW;
-       else if (!strcmp(s, "bch16"))
-               if (gpmc_nand_data->elm_of_node)
-                       gpmc_nand_data->ecc_opt =
-                               OMAP_ECC_BCH16_CODE_HW;
-               else
-                       pr_err("%s: BCH16 requires ELM support\n", __func__);
-       else
-               pr_err("%s: ti,nand-ecc-opt invalid value\n", __func__);
-
-       /* select data transfer mode for NAND controller */
-       if (!of_property_read_string(child, "ti,nand-xfer-type", &s))
-               for (val = 0; val < ARRAY_SIZE(nand_xfer_types); val++)
-                       if (!strcasecmp(s, nand_xfer_types[val])) {
-                               gpmc_nand_data->xfer_type = val;
-                               break;
-                       }
-
-       gpmc_nand_data->flash_bbt = of_get_nand_on_flash_bbt(child);
-
-       val = of_get_nand_bus_width(child);
-       if (val == 16)
-               gpmc_nand_data->devsize = NAND_BUSWIDTH_16;
-
-       gpmc_read_timings_dt(child, &gpmc_t);
-       gpmc_nand_init(gpmc_nand_data, &gpmc_t);
-
-       return 0;
-}
-#else
-static int gpmc_probe_nand_child(struct platform_device *pdev,
-                                struct device_node *child)
-{
-       return 0;
-}
-#endif
-
 #if IS_ENABLED(CONFIG_MTD_ONENAND)
 static int gpmc_probe_onenand_child(struct platform_device *pdev,
                                 struct device_node *child)
@@ -1950,6 +1979,8 @@ static int gpmc_probe_generic_child(struct platform_device *pdev,
        const char *name;
        int ret, cs;
        u32 val;
+       struct gpio_desc *waitpin_desc = NULL;
+       struct gpmc_device *gpmc = platform_get_drvdata(pdev);
 
        if (of_property_read_u32(child, "reg", &cs) < 0) {
                dev_err(&pdev->dev, "%s has no 'reg' property\n",
@@ -2010,23 +2041,80 @@ static int gpmc_probe_generic_child(struct platform_device *pdev,
        if (ret < 0) {
                dev_err(&pdev->dev, "cannot remap GPMC CS %d to %pa\n",
                        cs, &res.start);
+               if (res.start < GPMC_MEM_START) {
+                       dev_info(&pdev->dev,
+                                "GPMC CS %d start cannot be lesser than 0x%x\n",
+                                cs, GPMC_MEM_START);
+               } else if (res.end > GPMC_MEM_END) {
+                       dev_info(&pdev->dev,
+                                "GPMC CS %d end cannot be greater than 0x%x\n",
+                                cs, GPMC_MEM_END);
+               }
                goto err;
        }
 
-       ret = of_property_read_u32(child, "bank-width", &gpmc_s.device_width);
-       if (ret < 0)
-               goto err;
+       if (of_node_cmp(child->name, "nand") == 0) {
+               /* Warn about older DT blobs with no compatible property */
+               if (!of_property_read_bool(child, "compatible")) {
+                       dev_warn(&pdev->dev,
+                                "Incompatible NAND node: missing compatible");
+                       ret = -EINVAL;
+                       goto err;
+               }
+       }
+
+       if (of_device_is_compatible(child, "ti,omap2-nand")) {
+               /* NAND specific setup */
+               val = 8;
+               of_property_read_u32(child, "nand-bus-width", &val);
+               switch (val) {
+               case 8:
+                       gpmc_s.device_width = GPMC_DEVWIDTH_8BIT;
+                       break;
+               case 16:
+                       gpmc_s.device_width = GPMC_DEVWIDTH_16BIT;
+                       break;
+               default:
+                       dev_err(&pdev->dev, "%s: invalid 'nand-bus-width'\n",
+                               child->name);
+                       ret = -EINVAL;
+                       goto err;
+               }
+
+               /* disable write protect */
+               gpmc_configure(GPMC_CONFIG_WP, 0);
+               gpmc_s.device_nand = true;
+       } else {
+               ret = of_property_read_u32(child, "bank-width",
+                                          &gpmc_s.device_width);
+               if (ret < 0)
+                       goto err;
+       }
+
+       /* Reserve wait pin if it is required and valid */
+       if (gpmc_s.wait_on_read || gpmc_s.wait_on_write) {
+               unsigned int wait_pin = gpmc_s.wait_pin;
+
+               waitpin_desc = gpiochip_request_own_desc(&gpmc->gpio_chip,
+                                                        wait_pin, "WAITPIN");
+               if (IS_ERR(waitpin_desc)) {
+                       dev_err(&pdev->dev, "invalid wait-pin: %d\n", wait_pin);
+                       ret = PTR_ERR(waitpin_desc);
+                       goto err;
+               }
+       }
 
        gpmc_cs_show_timings(cs, "before gpmc_cs_program_settings");
+
        ret = gpmc_cs_program_settings(cs, &gpmc_s);
        if (ret < 0)
-               goto err;
+               goto err_cs;
 
        ret = gpmc_cs_set_timings(cs, &gpmc_t, &gpmc_s);
        if (ret) {
                dev_err(&pdev->dev, "failed to set gpmc timings for: %s\n",
                        child->name);
-               goto err;
+               goto err_cs;
        }
 
        /* Clear limited address i.e. enable A26-A11 */
@@ -2057,16 +2145,81 @@ err_child_fail:
        dev_err(&pdev->dev, "failed to create gpmc child %s\n", child->name);
        ret = -ENODEV;
 
+err_cs:
+       if (waitpin_desc)
+               gpiochip_free_own_desc(waitpin_desc);
+
 err:
        gpmc_cs_free(cs);
 
        return ret;
 }
 
+static int gpmc_gpio_get_direction(struct gpio_chip *chip, unsigned int offset)
+{
+       return 1;       /* we're input only */
+}
+
+static int gpmc_gpio_direction_input(struct gpio_chip *chip,
+                                    unsigned int offset)
+{
+       return 0;       /* we're input only */
+}
+
+static int gpmc_gpio_direction_output(struct gpio_chip *chip,
+                                     unsigned int offset, int value)
+{
+       return -EINVAL; /* we're input only */
+}
+
+static void gpmc_gpio_set(struct gpio_chip *chip, unsigned int offset,
+                         int value)
+{
+}
+
+static int gpmc_gpio_get(struct gpio_chip *chip, unsigned int offset)
+{
+       u32 reg;
+
+       offset += 8;
+
+       reg = gpmc_read_reg(GPMC_STATUS) & BIT(offset);
+
+       return !!reg;
+}
+
+static int gpmc_gpio_init(struct gpmc_device *gpmc)
+{
+       int ret;
+
+       gpmc->gpio_chip.parent = gpmc->dev;
+       gpmc->gpio_chip.owner = THIS_MODULE;
+       gpmc->gpio_chip.label = DEVICE_NAME;
+       gpmc->gpio_chip.ngpio = gpmc_nr_waitpins;
+       gpmc->gpio_chip.get_direction = gpmc_gpio_get_direction;
+       gpmc->gpio_chip.direction_input = gpmc_gpio_direction_input;
+       gpmc->gpio_chip.direction_output = gpmc_gpio_direction_output;
+       gpmc->gpio_chip.set = gpmc_gpio_set;
+       gpmc->gpio_chip.get = gpmc_gpio_get;
+       gpmc->gpio_chip.base = -1;
+
+       ret = gpiochip_add(&gpmc->gpio_chip);
+       if (ret < 0) {
+               dev_err(gpmc->dev, "could not register gpio chip: %d\n", ret);
+               return ret;
+       }
+
+       return 0;
+}
+
+static void gpmc_gpio_exit(struct gpmc_device *gpmc)
+{
+       gpiochip_remove(&gpmc->gpio_chip);
+}
+
 static int gpmc_probe_dt(struct platform_device *pdev)
 {
        int ret;
-       struct device_node *child;
        const struct of_device_id *of_id =
                of_match_device(gpmc_dt_ids, &pdev->dev);
 
@@ -2094,17 +2247,26 @@ static int gpmc_probe_dt(struct platform_device *pdev)
                return ret;
        }
 
+       return 0;
+}
+
+static int gpmc_probe_dt_children(struct platform_device *pdev)
+{
+       int ret;
+       struct device_node *child;
+
        for_each_available_child_of_node(pdev->dev.of_node, child) {
 
                if (!child->name)
                        continue;
 
-               if (of_node_cmp(child->name, "nand") == 0)
-                       ret = gpmc_probe_nand_child(pdev, child);
-               else if (of_node_cmp(child->name, "onenand") == 0)
+               if (of_node_cmp(child->name, "onenand") == 0)
                        ret = gpmc_probe_onenand_child(pdev, child);
                else
                        ret = gpmc_probe_generic_child(pdev, child);
+
+               if (ret)
+                       return ret;
        }
 
        return 0;
@@ -2114,6 +2276,11 @@ static int gpmc_probe_dt(struct platform_device *pdev)
 {
        return 0;
 }
+
+static int gpmc_probe_dt_children(struct platform_device *pdev)
+{
+       return 0;
+}
 #endif
 
 static int gpmc_probe(struct platform_device *pdev)
@@ -2121,6 +2288,14 @@ static int gpmc_probe(struct platform_device *pdev)
        int rc;
        u32 l;
        struct resource *res;
+       struct gpmc_device *gpmc;
+
+       gpmc = devm_kzalloc(&pdev->dev, sizeof(*gpmc), GFP_KERNEL);
+       if (!gpmc)
+               return -ENOMEM;
+
+       gpmc->dev = &pdev->dev;
+       platform_set_drvdata(pdev, gpmc);
 
        res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
        if (res == NULL)
@@ -2134,15 +2309,16 @@ static int gpmc_probe(struct platform_device *pdev)
                return PTR_ERR(gpmc_base);
 
        res = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
-       if (res == NULL)
-               dev_warn(&pdev->dev, "Failed to get resource: irq\n");
-       else
-               gpmc_irq = res->start;
+       if (!res) {
+               dev_err(&pdev->dev, "Failed to get resource: irq\n");
+               return -ENOENT;
+       }
+
+       gpmc->irq = res->start;
 
        gpmc_l3_clk = devm_clk_get(&pdev->dev, "fck");
        if (IS_ERR(gpmc_l3_clk)) {
                dev_err(&pdev->dev, "Failed to get GPMC fck\n");
-               gpmc_irq = 0;
                return PTR_ERR(gpmc_l3_clk);
        }
 
@@ -2151,11 +2327,18 @@ static int gpmc_probe(struct platform_device *pdev)
                return -EINVAL;
        }
 
+       if (pdev->dev.of_node) {
+               rc = gpmc_probe_dt(pdev);
+               if (rc)
+                       return rc;
+       } else {
+               gpmc_cs_num = GPMC_CS_NUM;
+               gpmc_nr_waitpins = GPMC_NR_WAITPINS;
+       }
+
        pm_runtime_enable(&pdev->dev);
        pm_runtime_get_sync(&pdev->dev);
 
-       gpmc_dev = &pdev->dev;
-
        l = gpmc_read_reg(GPMC_REVISION);
 
        /*
@@ -2174,36 +2357,51 @@ static int gpmc_probe(struct platform_device *pdev)
                gpmc_capability = GPMC_HAS_WR_ACCESS | GPMC_HAS_WR_DATA_MUX_BUS;
        if (GPMC_REVISION_MAJOR(l) > 0x5)
                gpmc_capability |= GPMC_HAS_MUX_AAD;
-       dev_info(gpmc_dev, "GPMC revision %d.%d\n", GPMC_REVISION_MAJOR(l),
+       dev_info(gpmc->dev, "GPMC revision %d.%d\n", GPMC_REVISION_MAJOR(l),
                 GPMC_REVISION_MINOR(l));
 
        gpmc_mem_init();
-
-       if (gpmc_setup_irq() < 0)
-               dev_warn(gpmc_dev, "gpmc_setup_irq failed\n");
-
-       if (!pdev->dev.of_node) {
-               gpmc_cs_num      = GPMC_CS_NUM;
-               gpmc_nr_waitpins = GPMC_NR_WAITPINS;
+       rc = gpmc_gpio_init(gpmc);
+       if (rc)
+               goto gpio_init_failed;
+
+       gpmc->nirqs = GPMC_NR_NAND_IRQS + gpmc_nr_waitpins;
+       rc = gpmc_setup_irq(gpmc);
+       if (rc) {
+               dev_err(gpmc->dev, "gpmc_setup_irq failed\n");
+               goto setup_irq_failed;
        }
 
-       rc = gpmc_probe_dt(pdev);
+       rc = gpmc_probe_dt_children(pdev);
        if (rc < 0) {
-               pm_runtime_put_sync(&pdev->dev);
-               dev_err(gpmc_dev, "failed to probe DT parameters\n");
-               return rc;
+               dev_err(gpmc->dev, "failed to probe DT children\n");
+               goto dt_children_failed;
        }
 
        return 0;
+
+dt_children_failed:
+       gpmc_free_irq(gpmc);
+setup_irq_failed:
+       gpmc_gpio_exit(gpmc);
+gpio_init_failed:
+       gpmc_mem_exit();
+       pm_runtime_put_sync(&pdev->dev);
+       pm_runtime_disable(&pdev->dev);
+
+       return rc;
 }
 
 static int gpmc_remove(struct platform_device *pdev)
 {
-       gpmc_free_irq();
+       struct gpmc_device *gpmc = platform_get_drvdata(pdev);
+
+       gpmc_free_irq(gpmc);
+       gpmc_gpio_exit(gpmc);
        gpmc_mem_exit();
        pm_runtime_put_sync(&pdev->dev);
        pm_runtime_disable(&pdev->dev);
-       gpmc_dev = NULL;
+
        return 0;
 }
 
@@ -2249,25 +2447,6 @@ static __exit void gpmc_exit(void)
 postcore_initcall(gpmc_init);
 module_exit(gpmc_exit);
 
-static irqreturn_t gpmc_handle_irq(int irq, void *dev)
-{
-       int i;
-       u32 regval;
-
-       regval = gpmc_read_reg(GPMC_IRQSTATUS);
-
-       if (!regval)
-               return IRQ_NONE;
-
-       for (i = 0; i < GPMC_NR_IRQ; i++)
-               if (regval & gpmc_client_irq[i].bitmask)
-                       generic_handle_irq(gpmc_client_irq[i].irq);
-
-       gpmc_write_reg(GPMC_IRQSTATUS, regval);
-
-       return IRQ_HANDLED;
-}
-
 static struct omap3_gpmc_regs gpmc_context;
 
 void omap3_gpmc_save_context(void)
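The reworked gpmc_probe() above unwinds in strict reverse order of setup: each failure label undoes only the steps that already succeeded. A minimal, self-contained C sketch of that goto-unwind idiom, using ordinary libc calls purely for illustration (the resource names have nothing to do with GPMC):

#include <stdio.h>
#include <stdlib.h>

/* Illustrative only: acquire two resources and undo them in reverse order
 * on failure, mirroring the gpio_init_failed/setup_irq_failed labels above. */
int example_probe(const char *path)
{
        char *buf;
        FILE *f;
        int rc = 0;

        buf = malloc(4096);             /* step 1, like gpmc_mem_init() */
        if (!buf)
                return -1;

        f = fopen(path, "r");           /* step 2, like gpmc_setup_irq() */
        if (!f) {
                rc = -1;
                goto open_failed;
        }

        /* ... use the resources, then tear down normally ... */
        fclose(f);
        free(buf);
        return 0;

open_failed:
        free(buf);                      /* undo step 1 only */
        return rc;
}

Because the error path tears down in exactly reverse order, gpmc_remove() above can reuse the same sequence of calls unchanged.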
index 40e51b0..b46c0cf 100644 (file)
@@ -696,7 +696,7 @@ int twl4030_init_irq(struct device *dev, int irq_num)
        nr_irqs = TWL4030_PWR_NR_IRQS + TWL4030_CORE_NR_IRQS;
 
        irq_base = irq_alloc_descs(-1, 0, nr_irqs, 0);
-       if (IS_ERR_VALUE(irq_base)) {
+       if (irq_base < 0) {
                dev_err(dev, "Fail to allocate IRQ descs\n");
                return irq_base;
        }
index ddc9620..e62fde3 100644 (file)
@@ -618,6 +618,10 @@ static int mmc_blk_ioctl_cmd(struct block_device *bdev,
 
        ioc_err = __mmc_blk_ioctl_cmd(card, md, idata);
 
+       /* Always switch back to main area after RPMB access */
+       if (md->area_type & MMC_BLK_DATA_AREA_RPMB)
+               mmc_blk_part_switch(card, dev_get_drvdata(&card->dev));
+
        mmc_put_card(card);
 
        err = mmc_blk_ioctl_copy_to_user(ic_ptr, idata);
@@ -685,6 +689,10 @@ static int mmc_blk_ioctl_multi_cmd(struct block_device *bdev,
        for (i = 0; i < num_of_cmds && !ioc_err; i++)
                ioc_err = __mmc_blk_ioctl_cmd(card, md, idata[i]);
 
+       /* Always switch back to main area after RPMB access */
+       if (md->area_type & MMC_BLK_DATA_AREA_RPMB)
+               mmc_blk_part_switch(card, dev_get_drvdata(&card->dev));
+
        mmc_put_card(card);
 
        /* copy to user if data and response */
@@ -748,16 +756,25 @@ static inline int mmc_blk_part_switch(struct mmc_card *card,
        if (mmc_card_mmc(card)) {
                u8 part_config = card->ext_csd.part_config;
 
+               if (md->part_type == EXT_CSD_PART_CONFIG_ACC_RPMB)
+                       mmc_retune_pause(card->host);
+
                part_config &= ~EXT_CSD_PART_CONFIG_ACC_MASK;
                part_config |= md->part_type;
 
                ret = mmc_switch(card, EXT_CSD_CMD_SET_NORMAL,
                                 EXT_CSD_PART_CONFIG, part_config,
                                 card->ext_csd.part_time);
-               if (ret)
+               if (ret) {
+                       if (md->part_type == EXT_CSD_PART_CONFIG_ACC_RPMB)
+                               mmc_retune_unpause(card->host);
                        return ret;
+               }
 
                card->ext_csd.part_config = part_config;
+
+               if (main_md->part_curr == EXT_CSD_PART_CONFIG_ACC_RPMB)
+                       mmc_retune_unpause(card->host);
        }
 
        main_md->part_curr = md->part_type;
@@ -2519,11 +2536,12 @@ static const struct mmc_fixup blk_fixups[] =
                  MMC_QUIRK_BLK_NO_CMD23),
 
        /*
-        * Some Micron MMC cards needs longer data read timeout than
-        * indicated in CSD.
+        * Some MMC cards need longer data read timeout than indicated in CSD.
         */
        MMC_FIXUP(CID_NAME_ANY, CID_MANFID_MICRON, 0x200, add_quirk_mmc,
                  MMC_QUIRK_LONG_READ_TIME),
+       MMC_FIXUP("008GE0", CID_MANFID_TOSHIBA, CID_OEMID_ANY, add_quirk_mmc,
+                 MMC_QUIRK_LONG_READ_TIME),
 
        /*
         * On these Samsung MoviNAND parts, performing secure erase or
index 99275e4..8b4dfd4 100644 (file)
@@ -875,11 +875,11 @@ void mmc_set_data_timeout(struct mmc_data *data, const struct mmc_card *card)
        /*
         * Some cards require longer data read timeout than indicated in CSD.
         * Address this by setting the read timeout to a "reasonably high"
-        * value. For the cards tested, 300ms has proven enough. If necessary,
+        * value. For the cards tested, 600ms has proven enough. If necessary,
         * this value can be increased if other problematic cards require this.
         */
        if (mmc_card_long_read_time(card) && data->flags & MMC_DATA_READ) {
-               data->timeout_ns = 300000000;
+               data->timeout_ns = 600000000;
                data->timeout_clks = 0;
        }
 
index e0a3ee1..1be42fa 100644 (file)
@@ -68,8 +68,32 @@ void mmc_retune_enable(struct mmc_host *host)
                          jiffies + host->retune_period * HZ);
 }
 
+/*
+ * Pause re-tuning for a small set of operations.  The pause begins after the
+ * next command and after first doing re-tuning.
+ */
+void mmc_retune_pause(struct mmc_host *host)
+{
+       if (!host->retune_paused) {
+               host->retune_paused = 1;
+               mmc_retune_needed(host);
+               mmc_retune_hold(host);
+       }
+}
+EXPORT_SYMBOL(mmc_retune_pause);
+
+void mmc_retune_unpause(struct mmc_host *host)
+{
+       if (host->retune_paused) {
+               host->retune_paused = 0;
+               mmc_retune_release(host);
+       }
+}
+EXPORT_SYMBOL(mmc_retune_unpause);
+
 void mmc_retune_disable(struct mmc_host *host)
 {
+       mmc_retune_unpause(host);
        host->can_retune = 0;
        del_timer_sync(&host->retune_timer);
        host->retune_now = 0;
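mmc_retune_pause() and mmc_retune_unpause() are meant to bracket a short command sequence; the RPMB partition switch in mmc_blk_part_switch() above is the caller this series adds. A hedged sketch of the pairing, where do_rpmb_sequence() is a hypothetical placeholder for the commands issued while re-tuning is held:

/* Sketch only: pair the new pause/unpause helpers around a sequence that
 * must not be interrupted by re-tuning (e.g. an RPMB partition access).
 * do_rpmb_sequence() is a hypothetical placeholder. */
static int rpmb_access(struct mmc_host *host)
{
        int err;

        mmc_retune_pause(host);         /* re-tune once, then hold */
        err = do_rpmb_sequence(host);   /* commands issued while paused */
        mmc_retune_unpause(host);       /* release the hold */

        return err;
}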
index b81b08f..5d438ad 100644 (file)
@@ -1276,7 +1276,7 @@ static int mmc_select_hs200(struct mmc_card *card)
         * switch to HS200 mode if bus width is set successfully.
         */
        err = mmc_select_bus_width(card);
-       if (!IS_ERR_VALUE(err)) {
+       if (err >= 0) {
                val = EXT_CSD_TIMING_HS200 |
                      card->drive_strength << EXT_CSD_DRV_STR_SHIFT;
                err = __mmc_switch(card, EXT_CSD_CMD_SET_NORMAL,
@@ -1583,7 +1583,7 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr,
        } else if (mmc_card_hs(card)) {
                /* Select the desired bus width optionally */
                err = mmc_select_bus_width(card);
-               if (!IS_ERR_VALUE(err)) {
+               if (err >= 0) {
                        err = mmc_select_hs_ddr(card);
                        if (err)
                                goto free_card;
index 8c20b81..358b0dc 100644 (file)
@@ -66,6 +66,70 @@ static void dw_mci_rk3288_set_ios(struct dw_mci *host, struct mmc_ios *ios)
        /* Make sure we use phases which we can enumerate with */
        if (!IS_ERR(priv->sample_clk))
                clk_set_phase(priv->sample_clk, priv->default_sample_phase);
+
+       /*
+        * Set the drive phase offset based on speed mode to achieve hold times.
+        *
+        * NOTE: this is _not_ a value that is dynamically tuned and is also
+        * _not_ a value that will vary from board to board.  It is a value
+        * that could vary between different SoC models if they had massively
+        * different output clock delays inside their dw_mmc IP block (delay_o),
+        * but since it's OK to overshoot a little we don't need to do complex
+        * calculations and can pick values that will just work for everyone.
+        *
+        * When picking values we'll stick with picking 0/90/180/270 since
+        * those can be made very accurately on all known Rockchip SoCs.
+        *
+        * Note that these values match values from the DesignWare Databook
+        * tables for the most part except for SDR12 and "ID mode".  For those
+        * two modes the databook calculations assume a clock in of 50MHz.  As
+        * seen above, we always use a clock in rate that is exactly the
+        * card's input clock (times RK3288_CLKGEN_DIV, but that gets divided
+        * back out before the controller sees it).
+        *
+        * From measurement of a single device, it appears that delay_o is
+        * about .5 ns.  Since we try to leave a bit of margin, it's expected
+        * that numbers here will be fine even with much larger delay_o
+        * (the 1.4 ns assumed by the DesignWare Databook would result in the
+        * same results, for instance).
+        */
+       if (!IS_ERR(priv->drv_clk)) {
+               int phase;
+
+               /*
+                * In almost all cases a 90 degree phase offset will provide
+                * sufficient hold times across all valid input clock rates
+                * assuming delay_o is not absurd for a given SoC.  We'll use
+                * that as a default.
+                */
+               phase = 90;
+
+               switch (ios->timing) {
+               case MMC_TIMING_MMC_DDR52:
+                       /*
+                        * Since clock in rate with MMC_DDR52 is doubled when
+                        * bus width is 8 we need to double the phase offset
+                        * to get the same timings.
+                        */
+                       if (ios->bus_width == MMC_BUS_WIDTH_8)
+                               phase = 180;
+                       break;
+               case MMC_TIMING_UHS_SDR104:
+               case MMC_TIMING_MMC_HS200:
+                       /*
+                        * In the case of 150 MHz clock (typical max for
+                        * Rockchip SoCs), 90 degree offset will add a delay
+                        * of 1.67 ns.  That will meet min hold time of .8 ns
+                        * as long as clock output delay is < .87 ns.  On
+                        * SoCs measured this seems to be OK, but it doesn't
+                        * hurt to give margin here, so we use 180.
+                        */
+                       phase = 180;
+                       break;
+               }
+
+               clk_set_phase(priv->drv_clk, phase);
+       }
 }
 
 #define NUM_PHASES                     360
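The 1.67 ns figure quoted in the comment above follows directly from treating the phase offset as a fraction of the clock period. A small standalone sketch of that arithmetic (illustrative only, not driver code):

#include <stdio.h>

/* delay_ns = (phase / 360) * (1e9 / freq_hz); for 90 degrees at 150 MHz
 * this gives ~1.67 ns, matching the comment above. */
static double phase_to_delay_ns(double phase_deg, double freq_hz)
{
        return (phase_deg / 360.0) * (1e9 / freq_hz);
}

int main(void)
{
        printf("%.2f ns\n", phase_to_delay_ns(90, 150000000));   /* ~1.67 */
        printf("%.2f ns\n", phase_to_delay_ns(180, 150000000));  /* ~3.33 */
        return 0;
}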
@@ -233,10 +297,10 @@ static int dw_mci_rockchip_init(struct dw_mci *host)
 
 /* Common capabilities of RK3288 SoC */
 static unsigned long dw_mci_rk3288_dwmmc_caps[4] = {
-       MMC_CAP_ERASE,
-       MMC_CAP_ERASE,
-       MMC_CAP_ERASE,
-       MMC_CAP_ERASE,
+       MMC_CAP_ERASE | MMC_CAP_CMD23,
+       MMC_CAP_ERASE | MMC_CAP_CMD23,
+       MMC_CAP_ERASE | MMC_CAP_CMD23,
+       MMC_CAP_ERASE | MMC_CAP_CMD23,
 };
 
 static const struct dw_mci_drv_data rk2928_drv_data = {
index 9dd1bd3..2cc6123 100644 (file)
@@ -1431,7 +1431,7 @@ static int dw_mci_get_ro(struct mmc_host *mmc)
        int gpio_ro = mmc_gpio_get_ro(mmc);
 
        /* Use platform get_ro function, else try on board write protect */
-       if (!IS_ERR_VALUE(gpio_ro))
+       if (gpio_ro >= 0)
                read_only = gpio_ro;
        else
                read_only =
@@ -1454,7 +1454,7 @@ static int dw_mci_get_cd(struct mmc_host *mmc)
        if ((mmc->caps & MMC_CAP_NEEDS_POLL) ||
            (mmc->caps & MMC_CAP_NONREMOVABLE))
                present = 1;
-       else if (!IS_ERR_VALUE(gpio_cd))
+       else if (gpio_cd >= 0)
                present = gpio_cd;
        else
                present = (mci_readl(slot->host, CDETECT) & (1 << slot->id))
@@ -2595,13 +2595,13 @@ static int dw_mci_init_slot(struct dw_mci *host, unsigned int id)
        /* Useful defaults if platform data is unset. */
        if (host->use_dma == TRANS_MODE_IDMAC) {
                mmc->max_segs = host->ring_size;
-               mmc->max_blk_size = 65536;
+               mmc->max_blk_size = 65535;
                mmc->max_seg_size = 0x1000;
                mmc->max_req_size = mmc->max_seg_size * host->ring_size;
                mmc->max_blk_count = mmc->max_req_size / 512;
        } else if (host->use_dma == TRANS_MODE_EDMAC) {
                mmc->max_segs = 64;
-               mmc->max_blk_size = 65536;
+               mmc->max_blk_size = 65535;
                mmc->max_blk_count = 65535;
                mmc->max_req_size =
                                mmc->max_blk_size * mmc->max_blk_count;
@@ -2609,7 +2609,7 @@ static int dw_mci_init_slot(struct dw_mci *host, unsigned int id)
        } else {
                /* TRANS_MODE_PIO */
                mmc->max_segs = 64;
-               mmc->max_blk_size = 65536; /* BLKSIZ is 16 bits */
+               mmc->max_blk_size = 65535; /* BLKSIZ is 16 bits */
                mmc->max_blk_count = 512;
                mmc->max_req_size = mmc->max_blk_size *
                                    mmc->max_blk_count;
@@ -2927,7 +2927,7 @@ static void dw_mci_enable_cd(struct dw_mci *host)
                if (slot->mmc->caps & MMC_CAP_NEEDS_POLL)
                        return;
 
-               if (IS_ERR_VALUE(mmc_gpio_get_cd(slot->mmc)))
+               if (mmc_gpio_get_cd(slot->mmc) < 0)
                        break;
        }
        if (i == host->num_slots)
index b2d70ba..458ffb7 100644 (file)
@@ -274,7 +274,7 @@ static const struct sdhci_acpi_slot sdhci_acpi_slot_int_emmc = {
        .chip    = &sdhci_acpi_chip_int,
        .caps    = MMC_CAP_8_BIT_DATA | MMC_CAP_NONREMOVABLE |
                   MMC_CAP_HW_RESET | MMC_CAP_1_8V_DDR |
-                  MMC_CAP_BUS_WIDTH_TEST | MMC_CAP_WAIT_WHILE_BUSY,
+                  MMC_CAP_WAIT_WHILE_BUSY,
        .caps2   = MMC_CAP2_HC_ERASE_SZ,
        .flags   = SDHCI_ACPI_RUNTIME_PM,
        .quirks  = SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC,
@@ -289,7 +289,7 @@ static const struct sdhci_acpi_slot sdhci_acpi_slot_int_sdio = {
                   SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC,
        .quirks2 = SDHCI_QUIRK2_HOST_OFF_CARD_ON,
        .caps    = MMC_CAP_NONREMOVABLE | MMC_CAP_POWER_OFF_CARD |
-                  MMC_CAP_BUS_WIDTH_TEST | MMC_CAP_WAIT_WHILE_BUSY,
+                  MMC_CAP_WAIT_WHILE_BUSY,
        .flags   = SDHCI_ACPI_RUNTIME_PM,
        .pm_caps = MMC_PM_KEEP_POWER,
        .probe_slot     = sdhci_acpi_sdio_probe_slot,
@@ -301,7 +301,7 @@ static const struct sdhci_acpi_slot sdhci_acpi_slot_int_sd = {
        .quirks  = SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC,
        .quirks2 = SDHCI_QUIRK2_CARD_ON_NEEDS_BUS_ON |
                   SDHCI_QUIRK2_STOP_WITH_TC,
-       .caps    = MMC_CAP_BUS_WIDTH_TEST | MMC_CAP_WAIT_WHILE_BUSY,
+       .caps    = MMC_CAP_WAIT_WHILE_BUSY,
        .probe_slot     = sdhci_acpi_sd_probe_slot,
 };
 
@@ -378,7 +378,7 @@ static int sdhci_acpi_probe(struct platform_device *pdev)
 {
        struct device *dev = &pdev->dev;
        acpi_handle handle = ACPI_HANDLE(dev);
-       struct acpi_device *device;
+       struct acpi_device *device, *child;
        struct sdhci_acpi_host *c;
        struct sdhci_host *host;
        struct resource *iomem;
@@ -390,6 +390,11 @@ static int sdhci_acpi_probe(struct platform_device *pdev)
        if (acpi_bus_get_device(handle, &device))
                return -ENODEV;
 
+       /* Power on the SDHCI controller and its children */
+       acpi_device_fix_up_power(device);
+       list_for_each_entry(child, &device->children, node)
+               acpi_device_fix_up_power(child);
+
        if (acpi_bus_get_status(device) || !device->status.present)
                return -ENODEV;
 
index 2d300d8..9d3ae1f 100644 (file)
@@ -1011,7 +1011,7 @@ sdhci_esdhc_imx_probe_dt(struct platform_device *pdev,
        if (ret)
                return ret;
 
-       if (!IS_ERR_VALUE(mmc_gpio_get_cd(host->mmc)))
+       if (mmc_gpio_get_cd(host->mmc) >= 0)
                host->quirks &= ~SDHCI_QUIRK_BROKEN_CARD_DETECTION;
 
        return 0;
index 25f779e..d4cef71 100644 (file)
@@ -289,7 +289,7 @@ static int sdhci_at91_probe(struct platform_device *pdev)
         * to enable polling via device tree with broken-cd property.
         */
        if (!(host->mmc->caps & MMC_CAP_NONREMOVABLE) &&
-           IS_ERR_VALUE(mmc_gpio_get_cd(host->mmc))) {
+           mmc_gpio_get_cd(host->mmc) < 0) {
                host->mmc->caps |= MMC_CAP_NEEDS_POLL;
                host->quirks &= ~SDHCI_QUIRK_BROKEN_CARD_DETECTION;
        }
index 97d4eeb..a4dbf74 100644 (file)
@@ -356,7 +356,6 @@ static int byt_emmc_probe_slot(struct sdhci_pci_slot *slot)
 {
        slot->host->mmc->caps |= MMC_CAP_8_BIT_DATA | MMC_CAP_NONREMOVABLE |
                                 MMC_CAP_HW_RESET | MMC_CAP_1_8V_DDR |
-                                MMC_CAP_BUS_WIDTH_TEST |
                                 MMC_CAP_WAIT_WHILE_BUSY;
        slot->host->mmc->caps2 |= MMC_CAP2_HC_ERASE_SZ;
        slot->hw_reset = sdhci_pci_int_hw_reset;
@@ -372,15 +371,13 @@ static int byt_emmc_probe_slot(struct sdhci_pci_slot *slot)
 static int byt_sdio_probe_slot(struct sdhci_pci_slot *slot)
 {
        slot->host->mmc->caps |= MMC_CAP_POWER_OFF_CARD | MMC_CAP_NONREMOVABLE |
-                                MMC_CAP_BUS_WIDTH_TEST |
                                 MMC_CAP_WAIT_WHILE_BUSY;
        return 0;
 }
 
 static int byt_sd_probe_slot(struct sdhci_pci_slot *slot)
 {
-       slot->host->mmc->caps |= MMC_CAP_BUS_WIDTH_TEST |
-                                MMC_CAP_WAIT_WHILE_BUSY;
+       slot->host->mmc->caps |= MMC_CAP_WAIT_WHILE_BUSY;
        slot->cd_con_id = NULL;
        slot->cd_idx = 0;
        slot->cd_override_level = true;
index e010ea4..0e3d7c0 100644 (file)
@@ -1624,7 +1624,7 @@ static int sdhci_get_cd(struct mmc_host *mmc)
         * Try slot gpio detect, if defined it take precedence
         * over build in controller functionality
         */
-       if (!IS_ERR_VALUE(gpio_cd))
+       if (gpio_cd >= 0)
                return !!gpio_cd;
 
        /* If polling, assume that the card is always present. */
@@ -3077,7 +3077,7 @@ int sdhci_add_host(struct sdhci_host *host)
 
        if ((host->quirks & SDHCI_QUIRK_BROKEN_CARD_DETECTION) &&
            !(mmc->caps & MMC_CAP_NONREMOVABLE) &&
-           IS_ERR_VALUE(mmc_gpio_get_cd(host->mmc)))
+           mmc_gpio_get_cd(host->mmc) < 0)
                mmc->caps |= MMC_CAP_NEEDS_POLL;
 
        /* If there are external regulators, get them */
index 7fc8b7a..2ee4c21 100644 (file)
@@ -970,8 +970,8 @@ static const struct sunxi_mmc_clk_delay sun9i_mmc_clk_delays[] = {
        [SDXC_CLK_400K]         = { .output = 180, .sample = 180 },
        [SDXC_CLK_25M]          = { .output = 180, .sample =  75 },
        [SDXC_CLK_50M]          = { .output = 150, .sample = 120 },
-       [SDXC_CLK_50M_DDR]      = { .output =  90, .sample = 120 },
-       [SDXC_CLK_50M_DDR_8BIT] = { .output =  90, .sample = 120 },
+       [SDXC_CLK_50M_DDR]      = { .output =  54, .sample =  36 },
+       [SDXC_CLK_50M_DDR_8BIT] = { .output =  72, .sample =  72 },
 };
 
 static int sunxi_mmc_resource_request(struct sunxi_mmc_host *host,
@@ -1129,11 +1129,6 @@ static int sunxi_mmc_probe(struct platform_device *pdev)
                                  MMC_CAP_1_8V_DDR |
                                  MMC_CAP_ERASE | MMC_CAP_SDIO_IRQ;
 
-       /* TODO MMC DDR is not working on A80 */
-       if (of_device_is_compatible(pdev->dev.of_node,
-                                   "allwinner,sun9i-a80-mmc"))
-               mmc->caps &= ~MMC_CAP_1_8V_DDR;
-
        ret = mmc_of_parse(mmc);
        if (ret)
                goto error_free_dma;
index 3b3dabc..bbfa1f1 100644 (file)
@@ -115,6 +115,7 @@ config MTD_MAP_BANK_WIDTH_16
 
 config MTD_MAP_BANK_WIDTH_32
        bool "Support 256-bit buswidth" if MTD_CFI_GEOMETRY
+       select MTD_COMPLEX_MAPPINGS if HAS_IOMEM
        default n
        help
          If you wish to support CFI devices on a physical bus which is
index 347bb83..1c65c15 100644 (file)
@@ -2,6 +2,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/delay.h>
+#include <linux/ioport.h>
 #include <linux/mtd/mtd.h>
 #include <linux/platform_device.h>
 #include <linux/bcma/bcma.h>
@@ -109,8 +110,7 @@ static int bcm47xxsflash_read(struct mtd_info *mtd, loff_t from, size_t len,
        if ((from + len) > mtd->size)
                return -EINVAL;
 
-       memcpy_fromio(buf, (void __iomem *)KSEG0ADDR(b47s->window + from),
-                     len);
+       memcpy_fromio(buf, b47s->window + from, len);
        *retlen = len;
 
        return len;
@@ -275,15 +275,33 @@ static void bcm47xxsflash_bcma_cc_write(struct bcm47xxsflash *b47s, u16 offset,
 
 static int bcm47xxsflash_bcma_probe(struct platform_device *pdev)
 {
-       struct bcma_sflash *sflash = dev_get_platdata(&pdev->dev);
+       struct device *dev = &pdev->dev;
+       struct bcma_sflash *sflash = dev_get_platdata(dev);
        struct bcm47xxsflash *b47s;
+       struct resource *res;
        int err;
 
-       b47s = devm_kzalloc(&pdev->dev, sizeof(*b47s), GFP_KERNEL);
+       b47s = devm_kzalloc(dev, sizeof(*b47s), GFP_KERNEL);
        if (!b47s)
                return -ENOMEM;
        sflash->priv = b47s;
 
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       if (!res) {
+               dev_err(dev, "invalid resource\n");
+               return -EINVAL;
+       }
+       if (!devm_request_mem_region(dev, res->start, resource_size(res),
+                                    res->name)) {
+               dev_err(dev, "can't request region for resource %pR\n", res);
+               return -EBUSY;
+       }
+       b47s->window = ioremap_cache(res->start, resource_size(res));
+       if (!b47s->window) {
+               dev_err(dev, "ioremap failed for resource %pR\n", res);
+               return -ENOMEM;
+       }
+
        b47s->bcma_cc = container_of(sflash, struct bcma_drv_cc, sflash);
        b47s->cc_read = bcm47xxsflash_bcma_cc_read;
        b47s->cc_write = bcm47xxsflash_bcma_cc_write;
@@ -297,7 +315,6 @@ static int bcm47xxsflash_bcma_probe(struct platform_device *pdev)
                break;
        }
 
-       b47s->window = sflash->window;
        b47s->blocksize = sflash->blocksize;
        b47s->numblocks = sflash->numblocks;
        b47s->size = sflash->size;
@@ -306,6 +323,7 @@ static int bcm47xxsflash_bcma_probe(struct platform_device *pdev)
        err = mtd_device_parse_register(&b47s->mtd, probes, NULL, NULL, 0);
        if (err) {
                pr_err("Failed to register MTD device: %d\n", err);
+               iounmap(b47s->window);
                return err;
        }
 
@@ -321,6 +339,7 @@ static int bcm47xxsflash_bcma_remove(struct platform_device *pdev)
        struct bcm47xxsflash *b47s = sflash->priv;
 
        mtd_device_unregister(&b47s->mtd);
+       iounmap(b47s->window);
 
        return 0;
 }
index fe93daf..1564b62 100644 (file)
@@ -65,7 +65,8 @@ struct bcm47xxsflash {
 
        enum bcm47xxsflash_type type;
 
-       u32 window;
+       void __iomem *window;
+
        u32 blocksize;
        u16 numblocks;
        u32 size;
index e7b2e43..b833e6c 100644 (file)
@@ -67,16 +67,40 @@ module_param(reliable_mode, uint, 0);
 MODULE_PARM_DESC(reliable_mode, "Set the docg3 mode (0=normal MLC, 1=fast, "
                 "2=reliable) : MLC normal operations are in normal mode");
 
-/**
- * struct docg3_oobinfo - DiskOnChip G3 OOB layout
- * @eccbytes: 8 bytes are used (1 for Hamming ECC, 7 for BCH ECC)
- * @eccpos: ecc positions (byte 7 is Hamming ECC, byte 8-14 are BCH ECC)
- * @oobfree: free pageinfo bytes (byte 0 until byte 6, byte 15
- */
-static struct nand_ecclayout docg3_oobinfo = {
-       .eccbytes = 8,
-       .eccpos = {7, 8, 9, 10, 11, 12, 13, 14},
-       .oobfree = {{0, 7}, {15, 1} },
+static int docg3_ooblayout_ecc(struct mtd_info *mtd, int section,
+                              struct mtd_oob_region *oobregion)
+{
+       if (section)
+               return -ERANGE;
+
+       /* byte 7 is Hamming ECC, byte 8-14 are BCH ECC */
+       oobregion->offset = 7;
+       oobregion->length = 8;
+
+       return 0;
+}
+
+static int docg3_ooblayout_free(struct mtd_info *mtd, int section,
+                               struct mtd_oob_region *oobregion)
+{
+       if (section > 1)
+               return -ERANGE;
+
+       /* free bytes: byte 0 until byte 6, byte 15 */
+       if (!section) {
+               oobregion->offset = 0;
+               oobregion->length = 7;
+       } else {
+               oobregion->offset = 15;
+               oobregion->length = 1;
+       }
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops nand_ooblayout_docg3_ops = {
+       .ecc = docg3_ooblayout_ecc,
+       .free = docg3_ooblayout_free,
 };
 
 static inline u8 doc_readb(struct docg3 *docg3, u16 reg)
@@ -1857,7 +1881,7 @@ static int __init doc_set_driver_info(int chip_id, struct mtd_info *mtd)
        mtd->_read_oob = doc_read_oob;
        mtd->_write_oob = doc_write_oob;
        mtd->_block_isbad = doc_block_isbad;
-       mtd->ecclayout = &docg3_oobinfo;
+       mtd_set_ooblayout(mtd, &nand_ooblayout_docg3_ops);
        mtd->oobavail = 8;
        mtd->ecc_strength = DOC_ECC_BCH_T;
 
index c9c3b7f..9d68544 100644 (file)
@@ -131,6 +131,28 @@ static int m25p80_read(struct spi_nor *nor, loff_t from, size_t len,
        /* convert the dummy cycles to the number of bytes */
        dummy /= 8;
 
+       if (spi_flash_read_supported(spi)) {
+               struct spi_flash_read_message msg;
+               int ret;
+
+               memset(&msg, 0, sizeof(msg));
+
+               msg.buf = buf;
+               msg.from = from;
+               msg.len = len;
+               msg.read_opcode = nor->read_opcode;
+               msg.addr_width = nor->addr_width;
+               msg.dummy_bytes = dummy;
+               /* TODO: Support other combinations */
+               msg.opcode_nbits = SPI_NBITS_SINGLE;
+               msg.addr_nbits = SPI_NBITS_SINGLE;
+               msg.data_nbits = m25p80_rx_nbits(nor);
+
+               ret = spi_flash_read(spi, &msg);
+               *retlen = msg.retlen;
+               return ret;
+       }
+
        spi_message_init(&m);
        memset(t, 0, (sizeof t));
 
index 708b7e8..220f920 100644 (file)
@@ -353,7 +353,7 @@ static int pmc551_write(struct mtd_info *mtd, loff_t to, size_t len,
  * mechanism
  * returns the size of the memory region found.
  */
-static int fixup_pmc551(struct pci_dev *dev)
+static int __init fixup_pmc551(struct pci_dev *dev)
 {
 #ifdef CONFIG_MTD_PMC551_BUGFIX
        u32 dram_data;
index 0455166..4f206a9 100644 (file)
@@ -112,8 +112,8 @@ static void ck804xrom_cleanup(struct ck804xrom_window *window)
 }
 
 
-static int ck804xrom_init_one(struct pci_dev *pdev,
-                             const struct pci_device_id *ent)
+static int __init ck804xrom_init_one(struct pci_dev *pdev,
+                                    const struct pci_device_id *ent)
 {
        static char *rom_probe_types[] = { "cfi_probe", "jedec_probe", NULL };
        u8 byte;
index 76ed651..9646b07 100644 (file)
@@ -144,8 +144,8 @@ static void esb2rom_cleanup(struct esb2rom_window *window)
        pci_dev_put(window->pdev);
 }
 
-static int esb2rom_init_one(struct pci_dev *pdev,
-                           const struct pci_device_id *ent)
+static int __init esb2rom_init_one(struct pci_dev *pdev,
+                                  const struct pci_device_id *ent)
 {
        static char *rom_probe_types[] = { "cfi_probe", "jedec_probe", NULL };
        struct esb2rom_window *window = &esb2rom_window;
index 8636bba..e17d02a 100644 (file)
@@ -84,8 +84,8 @@ static void ichxrom_cleanup(struct ichxrom_window *window)
 }
 
 
-static int ichxrom_init_one(struct pci_dev *pdev,
-                           const struct pci_device_id *ent)
+static int __init ichxrom_init_one(struct pci_dev *pdev,
+                                  const struct pci_device_id *ent)
 {
        static char *rom_probe_types[] = { "cfi_probe", "jedec_probe", NULL };
        struct ichxrom_window *window = &ichxrom_window;
index c1af83d..00a8190 100644 (file)
@@ -4,11 +4,13 @@
  *     uclinux.c -- generic memory mapped MTD driver for uclinux
  *
  *     (C) Copyright 2002, Greg Ungerer (gerg@snapgear.com)
+ *
+ *      License: GPL
  */
 
 /****************************************************************************/
 
-#include <linux/module.h>
+#include <linux/moduleparam.h>
 #include <linux/types.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -117,27 +119,6 @@ static int __init uclinux_mtd_init(void)
 
        return(0);
 }
-
-/****************************************************************************/
-
-static void __exit uclinux_mtd_cleanup(void)
-{
-       if (uclinux_ram_mtdinfo) {
-               mtd_device_unregister(uclinux_ram_mtdinfo);
-               map_destroy(uclinux_ram_mtdinfo);
-               uclinux_ram_mtdinfo = NULL;
-       }
-       if (uclinux_ram_map.virt)
-               uclinux_ram_map.virt = 0;
-}
-
-/****************************************************************************/
-
-module_init(uclinux_mtd_init);
-module_exit(uclinux_mtd_cleanup);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Greg Ungerer <gerg@snapgear.com>");
-MODULE_DESCRIPTION("Generic MTD for uClinux");
+device_initcall(uclinux_mtd_init);
 
 /****************************************************************************/
index 6d19835..2a47a3f 100644 (file)
@@ -465,35 +465,108 @@ static int mtdchar_readoob(struct file *file, struct mtd_info *mtd,
 }
 
 /*
- * Copies (and truncates, if necessary) data from the larger struct,
- * nand_ecclayout, to the smaller, deprecated layout struct,
- * nand_ecclayout_user. This is necessary only to support the deprecated
- * API ioctl ECCGETLAYOUT while allowing all new functionality to use
- * nand_ecclayout flexibly (i.e. the struct may change size in new
- * releases without requiring major rewrites).
+ * Copies (and truncates, if necessary) OOB layout information to the
+ * deprecated layout struct, nand_ecclayout_user. This is necessary only to
+ * support the deprecated API ioctl ECCGETLAYOUT while allowing all new
+ * functionality to use mtd_ooblayout_ops flexibly (i.e. mtd_ooblayout_ops
+ * can describe any kind of OOB layout with almost zero overhead from a
+ * memory usage point of view).
  */
-static int shrink_ecclayout(const struct nand_ecclayout *from,
-               struct nand_ecclayout_user *to)
+static int shrink_ecclayout(struct mtd_info *mtd,
+                           struct nand_ecclayout_user *to)
 {
-       int i;
+       struct mtd_oob_region oobregion;
+       int i, section = 0, ret;
 
-       if (!from || !to)
+       if (!mtd || !to)
                return -EINVAL;
 
        memset(to, 0, sizeof(*to));
 
-       to->eccbytes = min((int)from->eccbytes, MTD_MAX_ECCPOS_ENTRIES);
-       for (i = 0; i < to->eccbytes; i++)
-               to->eccpos[i] = from->eccpos[i];
+       to->eccbytes = 0;
+       for (i = 0; i < MTD_MAX_ECCPOS_ENTRIES;) {
+               u32 eccpos;
+
+               ret = mtd_ooblayout_ecc(mtd, section++, &oobregion);
+               if (ret < 0) {
+                       if (ret != -ERANGE)
+                               return ret;
+
+                       break;
+               }
+
+               eccpos = oobregion.offset;
+               for (; i < MTD_MAX_ECCPOS_ENTRIES &&
+                      eccpos < oobregion.offset + oobregion.length; i++) {
+                       to->eccpos[i] = eccpos++;
+                       to->eccbytes++;
+               }
+       }
 
        for (i = 0; i < MTD_MAX_OOBFREE_ENTRIES; i++) {
-               if (from->oobfree[i].length == 0 &&
-                               from->oobfree[i].offset == 0)
+               ret = mtd_ooblayout_free(mtd, i, &oobregion);
+               if (ret < 0) {
+                       if (ret != -ERANGE)
+                               return ret;
+
+                       break;
+               }
+
+               to->oobfree[i].offset = oobregion.offset;
+               to->oobfree[i].length = oobregion.length;
+               to->oobavail += to->oobfree[i].length;
+       }
+
+       return 0;
+}
+
+static int get_oobinfo(struct mtd_info *mtd, struct nand_oobinfo *to)
+{
+       struct mtd_oob_region oobregion;
+       int i, section = 0, ret;
+
+       if (!mtd || !to)
+               return -EINVAL;
+
+       memset(to, 0, sizeof(*to));
+
+       to->eccbytes = 0;
+       for (i = 0; i < ARRAY_SIZE(to->eccpos);) {
+               u32 eccpos;
+
+               ret = mtd_ooblayout_ecc(mtd, section++, &oobregion);
+               if (ret < 0) {
+                       if (ret != -ERANGE)
+                               return ret;
+
                        break;
-               to->oobavail += from->oobfree[i].length;
-               to->oobfree[i] = from->oobfree[i];
+               }
+
+               if (oobregion.length + i > ARRAY_SIZE(to->eccpos))
+                       return -EINVAL;
+
+               eccpos = oobregion.offset;
+               for (; eccpos < oobregion.offset + oobregion.length; i++) {
+                       to->eccpos[i] = eccpos++;
+                       to->eccbytes++;
+               }
        }
 
+       for (i = 0; i < 8; i++) {
+               ret = mtd_ooblayout_free(mtd, i, &oobregion);
+               if (ret < 0) {
+                       if (ret != -ERANGE)
+                               return ret;
+
+                       break;
+               }
+
+               to->oobfree[i][0] = oobregion.offset;
+               to->oobfree[i][1] = oobregion.length;
+       }
+
+       to->useecc = MTD_NANDECC_AUTOPLACE;
+
        return 0;
 }
 
@@ -815,16 +888,12 @@ static int mtdchar_ioctl(struct file *file, u_int cmd, u_long arg)
        {
                struct nand_oobinfo oi;
 
-               if (!mtd->ecclayout)
+               if (!mtd->ooblayout)
                        return -EOPNOTSUPP;
-               if (mtd->ecclayout->eccbytes > ARRAY_SIZE(oi.eccpos))
-                       return -EINVAL;
 
-               oi.useecc = MTD_NANDECC_AUTOPLACE;
-               memcpy(&oi.eccpos, mtd->ecclayout->eccpos, sizeof(oi.eccpos));
-               memcpy(&oi.oobfree, mtd->ecclayout->oobfree,
-                      sizeof(oi.oobfree));
-               oi.eccbytes = mtd->ecclayout->eccbytes;
+               ret = get_oobinfo(mtd, &oi);
+               if (ret)
+                       return ret;
 
                if (copy_to_user(argp, &oi, sizeof(struct nand_oobinfo)))
                        return -EFAULT;
@@ -913,14 +982,14 @@ static int mtdchar_ioctl(struct file *file, u_int cmd, u_long arg)
        {
                struct nand_ecclayout_user *usrlay;
 
-               if (!mtd->ecclayout)
+               if (!mtd->ooblayout)
                        return -EOPNOTSUPP;
 
                usrlay = kmalloc(sizeof(*usrlay), GFP_KERNEL);
                if (!usrlay)
                        return -ENOMEM;
 
-               shrink_ecclayout(mtd->ecclayout, usrlay);
+               shrink_ecclayout(mtd, usrlay);
 
                if (copy_to_user(argp, usrlay, sizeof(*usrlay)))
                        ret = -EFAULT;
index 239a8c8..d573606 100644 (file)
@@ -777,7 +777,7 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[],       /* subdevices to c
 
        }
 
-       concat->mtd.ecclayout = subdev[0]->ecclayout;
+       mtd_set_ooblayout(&concat->mtd, subdev[0]->ooblayout);
 
        concat->num_subdev = num_devs;
        concat->mtd.name = name;
index bee180b..e3936b8 100644 (file)
@@ -1016,6 +1016,366 @@ int mtd_write_oob(struct mtd_info *mtd, loff_t to,
 }
 EXPORT_SYMBOL_GPL(mtd_write_oob);
 
+/**
+ * mtd_ooblayout_ecc - Get the OOB region definition of a specific ECC section
+ * @mtd: MTD device structure
+ * @section: ECC section. Depending on the layout you may have all the ECC
+ *          bytes stored in a single contiguous section, or one section
+ *          per ECC chunk (and sometimes several sections for a single
+ *          ECC chunk)
+ * @oobecc: OOB region struct filled with the appropriate ECC position
+ *         information
+ *
+ * This function returns ECC section information in the OOB area. If you want
+ * to get all the ECC bytes information, then you should call
+ * mtd_ooblayout_ecc(mtd, section++, oobecc) until it returns -ERANGE.
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+int mtd_ooblayout_ecc(struct mtd_info *mtd, int section,
+                     struct mtd_oob_region *oobecc)
+{
+       memset(oobecc, 0, sizeof(*oobecc));
+
+       if (!mtd || section < 0)
+               return -EINVAL;
+
+       if (!mtd->ooblayout || !mtd->ooblayout->ecc)
+               return -ENOTSUPP;
+
+       return mtd->ooblayout->ecc(mtd, section, oobecc);
+}
+EXPORT_SYMBOL_GPL(mtd_ooblayout_ecc);
+
+/**
+ * mtd_ooblayout_free - Get the OOB region definition of a specific free
+ *                     section
+ * @mtd: MTD device structure
+ * @section: Free section you are interested in. Depending on the layout
+ *          you may have all the free bytes stored in a single contiguous
+ *          section, or one section per ECC chunk plus an extra section
+ *          for the remaining bytes (or other funky layout).
+ * @oobfree: OOB region struct filled with the appropriate free position
+ *          information
+ *
+ * This function returns free bytes position in the OOB area. If you want
+ * to get all the free bytes information, then you should call
+ * mtd_ooblayout_free(mtd, section++, oobfree) until it returns -ERANGE.
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+int mtd_ooblayout_free(struct mtd_info *mtd, int section,
+                      struct mtd_oob_region *oobfree)
+{
+       memset(oobfree, 0, sizeof(*oobfree));
+
+       if (!mtd || section < 0)
+               return -EINVAL;
+
+       if (!mtd->ooblayout || !mtd->ooblayout->free)
+               return -ENOTSUPP;
+
+       return mtd->ooblayout->free(mtd, section, oobfree);
+}
+EXPORT_SYMBOL_GPL(mtd_ooblayout_free);
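The kerneldoc above describes the intended calling pattern: keep asking for the next section until -ERANGE signals the end of the layout. A hedged sketch of such a caller (dump_ecc_regions() is illustrative; only mtd_ooblayout_ecc() comes from this patch):

/* Walk every ECC region of an MTD device until -ERANGE says there are
 * no more sections.  Purely illustrative caller code. */
static int dump_ecc_regions(struct mtd_info *mtd)
{
        struct mtd_oob_region region;
        int section = 0, ret;

        while (!(ret = mtd_ooblayout_ecc(mtd, section, &region))) {
                pr_info("ECC section %d: offset %u, length %u\n",
                        section, region.offset, region.length);
                section++;
        }

        /* -ERANGE simply marks the end of the layout, not an error */
        return ret == -ERANGE ? 0 : ret;
}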
+
+/**
+ * mtd_ooblayout_find_region - Find the region attached to a specific byte
+ * @mtd: mtd info structure
+ * @byte: the byte we are searching for
+ * @sectionp: pointer where the section id will be stored
+ * @oobregion: used to retrieve the ECC position
+ * @iter: iterator function. Should be either mtd_ooblayout_free or
+ *       mtd_ooblayout_ecc depending on the region type you're searching for
+ *
+ * This function returns the section id and oobregion information of a
+ * specific byte. For example, say you want to know where the 4th ECC byte is
+ * stored, you'll use:
+ *
+ * mtd_ooblayout_find_region(mtd, 3, &section, &oobregion, mtd_ooblayout_ecc);
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+static int mtd_ooblayout_find_region(struct mtd_info *mtd, int byte,
+                               int *sectionp, struct mtd_oob_region *oobregion,
+                               int (*iter)(struct mtd_info *,
+                                           int section,
+                                           struct mtd_oob_region *oobregion))
+{
+       int pos = 0, ret, section = 0;
+
+       memset(oobregion, 0, sizeof(*oobregion));
+
+       while (1) {
+               ret = iter(mtd, section, oobregion);
+               if (ret)
+                       return ret;
+
+               if (pos + oobregion->length > byte)
+                       break;
+
+               pos += oobregion->length;
+               section++;
+       }
+
+       /*
+        * Adjust region info to make it start at the beginning of the
+        * 'start' ECC byte.
+        */
+       oobregion->offset += byte - pos;
+       oobregion->length -= byte - pos;
+       *sectionp = section;
+
+       return 0;
+}
+
+/**
+ * mtd_ooblayout_find_eccregion - Find the ECC region attached to a specific
+ *                               ECC byte
+ * @mtd: mtd info structure
+ * @eccbyte: the byte we are searching for
+ * @sectionp: pointer where the section id will be stored
+ * @oobregion: OOB region information
+ *
+ * Works like mtd_ooblayout_find_region() except it searches for a specific ECC
+ * byte.
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+int mtd_ooblayout_find_eccregion(struct mtd_info *mtd, int eccbyte,
+                                int *section,
+                                struct mtd_oob_region *oobregion)
+{
+       return mtd_ooblayout_find_region(mtd, eccbyte, section, oobregion,
+                                        mtd_ooblayout_ecc);
+}
+EXPORT_SYMBOL_GPL(mtd_ooblayout_find_eccregion);
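As a concrete example of the adjustment performed in mtd_ooblayout_find_region(): with a single 8-byte ECC section starting at OOB offset 7 (the docg3 layout earlier in this series), looking up ECC byte 3 yields section 0 with the region trimmed to offset 10 and length 5. A hedged sketch (find_fourth_ecc_byte() is illustrative):

/* Illustrative only: with an 8-byte ECC section at OOB offset 7, ECC byte 3
 * maps to offset 7 + 3 = 10 with 8 - 3 = 5 bytes remaining in the region. */
static void find_fourth_ecc_byte(struct mtd_info *mtd)
{
        struct mtd_oob_region region;
        int section;

        if (!mtd_ooblayout_find_eccregion(mtd, 3, &section, &region))
                pr_info("ECC byte 3: section %d, OOB offset %u, %u bytes left\n",
                        section, region.offset, region.length);
}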
+
+/**
+ * mtd_ooblayout_get_bytes - Extract OOB bytes from the oob buffer
+ * @mtd: mtd info structure
+ * @buf: destination buffer to store OOB bytes
+ * @oobbuf: OOB buffer
+ * @start: first byte to retrieve
+ * @nbytes: number of bytes to retrieve
+ * @iter: section iterator
+ *
+ * Extract bytes attached to a specific category (ECC or free)
+ * from the OOB buffer and copy them into buf.
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+static int mtd_ooblayout_get_bytes(struct mtd_info *mtd, u8 *buf,
+                               const u8 *oobbuf, int start, int nbytes,
+                               int (*iter)(struct mtd_info *,
+                                           int section,
+                                           struct mtd_oob_region *oobregion))
+{
+       struct mtd_oob_region oobregion = { };
+       int section = 0, ret;
+
+       ret = mtd_ooblayout_find_region(mtd, start, &section,
+                                       &oobregion, iter);
+
+       while (!ret) {
+               int cnt;
+
+               cnt = oobregion.length > nbytes ? nbytes : oobregion.length;
+               memcpy(buf, oobbuf + oobregion.offset, cnt);
+               buf += cnt;
+               nbytes -= cnt;
+
+               if (!nbytes)
+                       break;
+
+               ret = iter(mtd, ++section, &oobregion);
+       }
+
+       return ret;
+}
+
+/**
+ * mtd_ooblayout_set_bytes - put OOB bytes into the oob buffer
+ * @mtd: mtd info structure
+ * @buf: source buffer to get OOB bytes from
+ * @oobbuf: OOB buffer
+ * @start: first OOB byte to set
+ * @nbytes: number of OOB bytes to set
+ * @iter: section iterator
+ *
+ * Fill the OOB buffer with data provided in buf. The category (ECC or free)
+ * is selected by passing the appropriate iterator.
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+static int mtd_ooblayout_set_bytes(struct mtd_info *mtd, const u8 *buf,
+                               u8 *oobbuf, int start, int nbytes,
+                               int (*iter)(struct mtd_info *,
+                                           int section,
+                                           struct mtd_oob_region *oobregion))
+{
+       struct mtd_oob_region oobregion = { };
+       int section = 0, ret;
+
+       ret = mtd_ooblayout_find_region(mtd, start, &section,
+                                       &oobregion, iter);
+
+       while (!ret) {
+               int cnt;
+
+               cnt = oobregion.length > nbytes ? nbytes : oobregion.length;
+               memcpy(oobbuf + oobregion.offset, buf, cnt);
+               buf += cnt;
+               nbytes -= cnt;
+
+               if (!nbytes)
+                       break;
+
+               ret = iter(mtd, ++section, &oobregion);
+       }
+
+       return ret;
+}
+
+/**
+ * mtd_ooblayout_count_bytes - count the number of bytes in a OOB category
+ * @mtd: mtd info structure
+ * @iter: category iterator
+ *
+ * Count the number of bytes in a given category.
+ *
+ * Returns a positive value on success, a negative error code otherwise.
+ */
+static int mtd_ooblayout_count_bytes(struct mtd_info *mtd,
+                               int (*iter)(struct mtd_info *,
+                                           int section,
+                                           struct mtd_oob_region *oobregion))
+{
+       struct mtd_oob_region oobregion = { };
+       int section = 0, ret, nbytes = 0;
+
+       while (1) {
+               ret = iter(mtd, section++, &oobregion);
+               if (ret) {
+                       if (ret == -ERANGE)
+                               ret = nbytes;
+                       break;
+               }
+
+               nbytes += oobregion.length;
+       }
+
+       return ret;
+}
+
+/**
+ * mtd_ooblayout_get_eccbytes - extract ECC bytes from the oob buffer
+ * @mtd: mtd info structure
+ * @eccbuf: destination buffer to store ECC bytes
+ * @oobbuf: OOB buffer
+ * @start: first ECC byte to retrieve
+ * @nbytes: number of ECC bytes to retrieve
+ *
+ * Works like mtd_ooblayout_get_bytes(), except it acts on ECC bytes.
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+int mtd_ooblayout_get_eccbytes(struct mtd_info *mtd, u8 *eccbuf,
+                              const u8 *oobbuf, int start, int nbytes)
+{
+       return mtd_ooblayout_get_bytes(mtd, eccbuf, oobbuf, start, nbytes,
+                                      mtd_ooblayout_ecc);
+}
+EXPORT_SYMBOL_GPL(mtd_ooblayout_get_eccbytes);
+
+/**
+ * mtd_ooblayout_set_eccbytes - set ECC bytes into the oob buffer
+ * @mtd: mtd info structure
+ * @eccbuf: source buffer to get ECC bytes from
+ * @oobbuf: OOB buffer
+ * @start: first ECC byte to set
+ * @nbytes: number of ECC bytes to set
+ *
+ * Works like mtd_ooblayout_set_bytes(), except it acts on ECC bytes.
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+int mtd_ooblayout_set_eccbytes(struct mtd_info *mtd, const u8 *eccbuf,
+                              u8 *oobbuf, int start, int nbytes)
+{
+       return mtd_ooblayout_set_bytes(mtd, eccbuf, oobbuf, start, nbytes,
+                                      mtd_ooblayout_ecc);
+}
+EXPORT_SYMBOL_GPL(mtd_ooblayout_set_eccbytes);
+
+/**
+ * mtd_ooblayout_get_databytes - extract data bytes from the oob buffer
+ * @mtd: mtd info structure
+ * @databuf: destination buffer to store data bytes
+ * @oobbuf: OOB buffer
+ * @start: first data byte to retrieve
+ * @nbytes: number of data bytes to retrieve
+ *
+ * Works like mtd_ooblayout_get_bytes(), except it acts on free bytes.
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+int mtd_ooblayout_get_databytes(struct mtd_info *mtd, u8 *databuf,
+                               const u8 *oobbuf, int start, int nbytes)
+{
+       return mtd_ooblayout_get_bytes(mtd, databuf, oobbuf, start, nbytes,
+                                      mtd_ooblayout_free);
+}
+EXPORT_SYMBOL_GPL(mtd_ooblayout_get_databytes);
+
+/**
+ * mtd_ooblayout_set_databytes - set data bytes into the oob buffer
+ * @mtd: mtd info structure
+ * @databuf: source buffer to get data bytes from
+ * @oobbuf: OOB buffer
+ * @start: first data byte to set
+ * @nbytes: number of data bytes to set
+ *
+ * Works like mtd_ooblayout_set_bytes(), except it acts on free bytes.
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+int mtd_ooblayout_set_databytes(struct mtd_info *mtd, const u8 *databuf,
+                               u8 *oobbuf, int start, int nbytes)
+{
+       return mtd_ooblayout_set_bytes(mtd, databuf, oobbuf, start, nbytes,
+                                      mtd_ooblayout_free);
+}
+EXPORT_SYMBOL_GPL(mtd_ooblayout_set_databytes);
+
+/**
+ * mtd_ooblayout_count_freebytes - count the number of free bytes in OOB
+ * @mtd: mtd info structure
+ *
+ * Works like mtd_ooblayout_count_bytes(), except it counts free bytes.
+ *
+ * Returns a positive value on success, a negative error code otherwise.
+ */
+int mtd_ooblayout_count_freebytes(struct mtd_info *mtd)
+{
+       return mtd_ooblayout_count_bytes(mtd, mtd_ooblayout_free);
+}
+EXPORT_SYMBOL_GPL(mtd_ooblayout_count_freebytes);
+
+/**
+ * mtd_ooblayout_count_eccbytes - count the number of ECC bytes in OOB
+ * @mtd: mtd info structure
+ *
+ * Works like mtd_ooblayout_count_bytes(), except it counts ECC bytes.
+ *
+ * Returns a positive value on success, a negative error code otherwise.
+ */
+int mtd_ooblayout_count_eccbytes(struct mtd_info *mtd)
+{
+       return mtd_ooblayout_count_bytes(mtd, mtd_ooblayout_ecc);
+}
+EXPORT_SYMBOL_GPL(mtd_ooblayout_count_eccbytes);
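Taken together, the helpers above let a caller pull every ECC byte out of a raw OOB buffer without knowing how the layout scatters them. A hedged sketch of such a caller (gather_ecc() and its buffer handling are illustrative):

/* Illustrative caller: copy every ECC byte from a raw OOB buffer into a
 * contiguous scratch buffer, regardless of how the layout scatters them. */
static int gather_ecc(struct mtd_info *mtd, const u8 *oobbuf)
{
        int nbytes = mtd_ooblayout_count_eccbytes(mtd);
        u8 *eccbuf;
        int ret;

        if (nbytes <= 0)
                return nbytes;

        eccbuf = kmalloc(nbytes, GFP_KERNEL);
        if (!eccbuf)
                return -ENOMEM;

        ret = mtd_ooblayout_get_eccbytes(mtd, eccbuf, oobbuf, 0, nbytes);

        /* ... hand eccbuf to the ECC engine here ... */

        kfree(eccbuf);
        return ret;
}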
+
 /*
  * Method to access the protection register area, present in some flash
  * devices. The user data is one time programmable but the factory data is read
index 08de4b2..1f13e32 100644 (file)
@@ -317,6 +317,27 @@ static int part_block_markbad(struct mtd_info *mtd, loff_t ofs)
        return res;
 }
 
+static int part_ooblayout_ecc(struct mtd_info *mtd, int section,
+                             struct mtd_oob_region *oobregion)
+{
+       struct mtd_part *part = mtd_to_part(mtd);
+
+       return mtd_ooblayout_ecc(part->master, section, oobregion);
+}
+
+static int part_ooblayout_free(struct mtd_info *mtd, int section,
+                              struct mtd_oob_region *oobregion)
+{
+       struct mtd_part *part = mtd_to_part(mtd);
+
+       return mtd_ooblayout_free(part->master, section, oobregion);
+}
+
+static const struct mtd_ooblayout_ops part_ooblayout_ops = {
+       .ecc = part_ooblayout_ecc,
+       .free = part_ooblayout_free,
+};
+
 static inline void free_partition(struct mtd_part *p)
 {
        kfree(p->mtd.name);
@@ -533,7 +554,7 @@ static struct mtd_part *allocate_partition(struct mtd_info *master,
                        part->name);
        }
 
-       slave->mtd.ecclayout = master->ecclayout;
+       mtd_set_ooblayout(&slave->mtd, &part_ooblayout_ops);
        slave->mtd.ecc_step_size = master->ecc_step_size;
        slave->mtd.ecc_strength = master->ecc_strength;
        slave->mtd.bitflip_threshold = master->bitflip_threshold;
index 68b58c8..78e12cc 100644 (file)
@@ -224,6 +224,7 @@ static int ams_delta_init(struct platform_device *pdev)
        /* 25 us command delay time */
        this->chip_delay = 30;
        this->ecc.mode = NAND_ECC_SOFT;
+       this->ecc.algo = NAND_ECC_HAMMING;
 
        platform_set_drvdata(pdev, io_base);
 
index 20cbaab..68b9160 100644 (file)
@@ -36,7 +36,6 @@
 #include <linux/of.h>
 #include <linux/of_device.h>
 #include <linux/of_gpio.h>
-#include <linux/of_mtd.h>
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/nand.h>
 #include <linux/mtd/partitions.h>
@@ -68,34 +67,44 @@ struct atmel_nand_caps {
        uint8_t pmecc_max_correction;
 };
 
-struct atmel_nand_nfc_caps {
-       uint32_t rb_mask;
-};
-
-/* oob layout for large page size
+/*
+ * oob layout for large page size
  * bad block info is on bytes 0 and 1
  * the bytes have to be consecutives to avoid
  * several NAND_CMD_RNDOUT during read
- */
-static struct nand_ecclayout atmel_oobinfo_large = {
-       .eccbytes = 4,
-       .eccpos = {60, 61, 62, 63},
-       .oobfree = {
-               {2, 58}
-       },
-};
-
-/* oob layout for small page size
+ *
+ * oob layout for small page size
  * bad block info is on bytes 4 and 5
  * the bytes have to be consecutives to avoid
  * several NAND_CMD_RNDOUT during read
  */
-static struct nand_ecclayout atmel_oobinfo_small = {
-       .eccbytes = 4,
-       .eccpos = {0, 1, 2, 3},
-       .oobfree = {
-               {6, 10}
-       },
+static int atmel_ooblayout_ecc_sp(struct mtd_info *mtd, int section,
+                                 struct mtd_oob_region *oobregion)
+{
+       if (section)
+               return -ERANGE;
+
+       oobregion->length = 4;
+       oobregion->offset = 0;
+
+       return 0;
+}
+
+static int atmel_ooblayout_free_sp(struct mtd_info *mtd, int section,
+                                  struct mtd_oob_region *oobregion)
+{
+       if (section)
+               return -ERANGE;
+
+       oobregion->offset = 6;
+       oobregion->length = mtd->oobsize - oobregion->offset;
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops atmel_ooblayout_sp_ops = {
+       .ecc = atmel_ooblayout_ecc_sp,
+       .free = atmel_ooblayout_free_sp,
 };
 
 struct atmel_nfc {
@@ -116,7 +125,6 @@ struct atmel_nfc {
        /* Point to the sram bank which include readed data via NFC */
        void                    *data_in_sram;
        bool                    will_write_sram;
-       const struct atmel_nand_nfc_caps *caps;
 };
 static struct atmel_nfc        nand_nfc;
 
@@ -163,8 +171,6 @@ struct atmel_nand_host {
        int                     *pmecc_delta;
 };
 
-static struct nand_ecclayout atmel_pmecc_oobinfo;
-
 /*
  * Enable NAND.
  */
@@ -434,14 +440,13 @@ err_buf:
 static void atmel_read_buf(struct mtd_info *mtd, u8 *buf, int len)
 {
        struct nand_chip *chip = mtd_to_nand(mtd);
-       struct atmel_nand_host *host = nand_get_controller_data(chip);
 
        if (use_dma && len > mtd->oobsize)
                /* only use DMA for bigger than oob size: better performances */
                if (atmel_nand_dma_op(mtd, buf, len, 1) == 0)
                        return;
 
-       if (host->board.bus_width_16)
+       if (chip->options & NAND_BUSWIDTH_16)
                atmel_read_buf16(mtd, buf, len);
        else
                atmel_read_buf8(mtd, buf, len);
@@ -450,14 +455,13 @@ static void atmel_read_buf(struct mtd_info *mtd, u8 *buf, int len)
 static void atmel_write_buf(struct mtd_info *mtd, const u8 *buf, int len)
 {
        struct nand_chip *chip = mtd_to_nand(mtd);
-       struct atmel_nand_host *host = nand_get_controller_data(chip);
 
        if (use_dma && len > mtd->oobsize)
                /* only use DMA for bigger than oob size: better performances */
                if (atmel_nand_dma_op(mtd, (void *)buf, len, 0) == 0)
                        return;
 
-       if (host->board.bus_width_16)
+       if (chip->options & NAND_BUSWIDTH_16)
                atmel_write_buf16(mtd, buf, len);
        else
                atmel_write_buf8(mtd, buf, len);
@@ -483,22 +487,6 @@ static int pmecc_get_ecc_bytes(int cap, int sector_size)
        return (m * cap + 7) / 8;
 }
 
-static void pmecc_config_ecc_layout(struct nand_ecclayout *layout,
-                                   int oobsize, int ecc_len)
-{
-       int i;
-
-       layout->eccbytes = ecc_len;
-
-       /* ECC will occupy the last ecc_len bytes continuously */
-       for (i = 0; i < ecc_len; i++)
-               layout->eccpos[i] = oobsize - ecc_len + i;
-
-       layout->oobfree[0].offset = PMECC_OOB_RESERVED_BYTES;
-       layout->oobfree[0].length =
-               oobsize - ecc_len - layout->oobfree[0].offset;
-}
-
 static void __iomem *pmecc_get_alpha_to(struct atmel_nand_host *host)
 {
        int table_size;
@@ -836,13 +824,16 @@ static void pmecc_correct_data(struct mtd_info *mtd, uint8_t *buf, uint8_t *ecc,
                        dev_dbg(host->dev, "Bit flip in data area, byte_pos: %d, bit_pos: %d, 0x%02x -> 0x%02x\n",
                                pos, bit_pos, err_byte, *(buf + byte_pos));
                } else {
+                       struct mtd_oob_region oobregion;
+
                        /* Bit flip in OOB area */
                        tmp = sector_num * nand_chip->ecc.bytes
                                        + (byte_pos - sector_size);
                        err_byte = ecc[tmp];
                        ecc[tmp] ^= (1 << bit_pos);
 
-                       pos = tmp + nand_chip->ecc.layout->eccpos[0];
+                       mtd_ooblayout_ecc(mtd, 0, &oobregion);
+                       pos = tmp + oobregion.offset;
                        dev_dbg(host->dev, "Bit flip in OOB, oob_byte_pos: %d, bit_pos: %d, 0x%02x -> 0x%02x\n",
                                pos, bit_pos, err_byte, ecc[tmp]);
                }
@@ -863,17 +854,6 @@ static int pmecc_correction(struct mtd_info *mtd, u32 pmecc_stat, uint8_t *buf,
        uint8_t *buf_pos;
        int max_bitflips = 0;
 
-       /* If can correct bitfilps from erased page, do the normal check */
-       if (host->caps->pmecc_correct_erase_page)
-               goto normal_check;
-
-       for (i = 0; i < nand_chip->ecc.total; i++)
-               if (ecc[i] != 0xff)
-                       goto normal_check;
-       /* Erased page, return OK */
-       return 0;
-
-normal_check:
        for (i = 0; i < nand_chip->ecc.steps; i++) {
                err_nbr = 0;
                if (pmecc_stat & 0x1) {
@@ -884,16 +864,30 @@ normal_check:
                        pmecc_get_sigma(mtd);
 
                        err_nbr = pmecc_err_location(mtd);
-                       if (err_nbr == -1) {
+                       if (err_nbr >= 0) {
+                               pmecc_correct_data(mtd, buf_pos, ecc, i,
+                                                  nand_chip->ecc.bytes,
+                                                  err_nbr);
+                       } else if (!host->caps->pmecc_correct_erase_page) {
+                               u8 *ecc_pos = ecc + (i * nand_chip->ecc.bytes);
+
+                               /* Try to detect erased pages */
+                               err_nbr = nand_check_erased_ecc_chunk(buf_pos,
+                                                       host->pmecc_sector_size,
+                                                       ecc_pos,
+                                                       nand_chip->ecc.bytes,
+                                                       NULL, 0,
+                                                       nand_chip->ecc.strength);
+                       }
+
+                       if (err_nbr < 0) {
                                dev_err(host->dev, "PMECC: Too many errors\n");
                                mtd->ecc_stats.failed++;
                                return -EIO;
-                       } else {
-                               pmecc_correct_data(mtd, buf_pos, ecc, i,
-                                       nand_chip->ecc.bytes, err_nbr);
-                               mtd->ecc_stats.corrected += err_nbr;
-                               max_bitflips = max_t(int, max_bitflips, err_nbr);
                        }
+
+                       mtd->ecc_stats.corrected += err_nbr;
+                       max_bitflips = max_t(int, max_bitflips, err_nbr);
                }
                pmecc_stat >>= 1;
        }
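
The hunk above replaces the driver's open-coded "all ECC bytes are 0xff" test with nand_check_erased_ecc_chunk(), which also accepts an erased sector containing a few bitflips. Below is a minimal user-space sketch of that idea, assuming only the helper's documented behaviour (count the bits that read back as 0, compare against the ECC strength, and report clean 0xff data when the chunk passes); the function names, buffer sizes and the 13-byte ECC length are illustrative, not part of the kernel API.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Count bits that read back as 0 in a chunk that should be all 0xff. */
static int count_zero_bits(const uint8_t *buf, size_t len)
{
    int flips = 0;
    size_t i;

    for (i = 0; i < len; i++)
        flips += 8 - __builtin_popcount(buf[i]);

    return flips;
}

/*
 * Decide whether a sector that failed BCH decoding is really an erased
 * sector with a few bitflips.  Returns the bitflip count, or -1 when the
 * chunk has too many zero bits to be treated as erased.
 */
static int check_erased_chunk(uint8_t *data, size_t datalen,
                              uint8_t *ecc, size_t ecclen, int strength)
{
    int flips = count_zero_bits(data, datalen) + count_zero_bits(ecc, ecclen);

    if (flips > strength)
        return -1;

    /* Report clean 0xff content to the caller, as the kernel helper does. */
    memset(data, 0xff, datalen);
    memset(ecc, 0xff, ecclen);

    return flips;
}

int main(void)
{
    uint8_t data[512], ecc[13];

    memset(data, 0xff, sizeof(data));
    memset(ecc, 0xff, sizeof(ecc));
    data[100] = 0xfb;   /* one bit stuck at 0 */

    printf("bitflips: %d\n",
           check_erased_chunk(data, sizeof(data), ecc, sizeof(ecc), 8));

    return 0;
}
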
@@ -931,7 +925,6 @@ static int atmel_nand_pmecc_read_page(struct mtd_info *mtd,
        struct atmel_nand_host *host = nand_get_controller_data(chip);
        int eccsize = chip->ecc.size * chip->ecc.steps;
        uint8_t *oob = chip->oob_poi;
-       uint32_t *eccpos = chip->ecc.layout->eccpos;
        uint32_t stat;
        unsigned long end_time;
        int bitflips = 0;
@@ -953,7 +946,11 @@ static int atmel_nand_pmecc_read_page(struct mtd_info *mtd,
 
        stat = pmecc_readl_relaxed(host->ecc, ISR);
        if (stat != 0) {
-               bitflips = pmecc_correction(mtd, stat, buf, &oob[eccpos[0]]);
+               struct mtd_oob_region oobregion;
+
+               mtd_ooblayout_ecc(mtd, 0, &oobregion);
+               bitflips = pmecc_correction(mtd, stat, buf,
+                                           &oob[oobregion.offset]);
                if (bitflips < 0)
                        /* uncorrectable errors */
                        return 0;
@@ -967,8 +964,8 @@ static int atmel_nand_pmecc_write_page(struct mtd_info *mtd,
                int page)
 {
        struct atmel_nand_host *host = nand_get_controller_data(chip);
-       uint32_t *eccpos = chip->ecc.layout->eccpos;
-       int i, j;
+       struct mtd_oob_region oobregion = { };
+       int i, j, section = 0;
        unsigned long end_time;
 
        if (!host->nfc || !host->nfc->write_by_sram) {
@@ -987,11 +984,14 @@ static int atmel_nand_pmecc_write_page(struct mtd_info *mtd,
 
        for (i = 0; i < chip->ecc.steps; i++) {
                for (j = 0; j < chip->ecc.bytes; j++) {
-                       int pos;
+                       if (!oobregion.length)
+                               mtd_ooblayout_ecc(mtd, section, &oobregion);
 
-                       pos = i * chip->ecc.bytes + j;
-                       chip->oob_poi[eccpos[pos]] =
+                       chip->oob_poi[oobregion.offset] =
                                pmecc_readb_ecc_relaxed(host->ecc, i, j);
+                       oobregion.length--;
+                       oobregion.offset++;
+                       section++;
                }
        }
        chip->write_buf(mtd, chip->oob_poi, mtd->oobsize);
@@ -1003,8 +1003,9 @@ static void atmel_pmecc_core_init(struct mtd_info *mtd)
 {
        struct nand_chip *nand_chip = mtd_to_nand(mtd);
        struct atmel_nand_host *host = nand_get_controller_data(nand_chip);
+       int eccbytes = mtd_ooblayout_count_eccbytes(mtd);
        uint32_t val = 0;
-       struct nand_ecclayout *ecc_layout;
+       struct mtd_oob_region oobregion;
 
        pmecc_writel(host->ecc, CTRL, PMECC_CTRL_RST);
        pmecc_writel(host->ecc, CTRL, PMECC_CTRL_DISABLE);
@@ -1054,11 +1055,11 @@ static void atmel_pmecc_core_init(struct mtd_info *mtd)
                | PMECC_CFG_AUTO_DISABLE);
        pmecc_writel(host->ecc, CFG, val);
 
-       ecc_layout = nand_chip->ecc.layout;
        pmecc_writel(host->ecc, SAREA, mtd->oobsize - 1);
-       pmecc_writel(host->ecc, SADDR, ecc_layout->eccpos[0]);
+       mtd_ooblayout_ecc(mtd, 0, &oobregion);
+       pmecc_writel(host->ecc, SADDR, oobregion.offset);
        pmecc_writel(host->ecc, EADDR,
-                       ecc_layout->eccpos[ecc_layout->eccbytes - 1]);
+                    oobregion.offset + eccbytes - 1);
        /* See datasheet about PMECC Clock Control Register */
        pmecc_writel(host->ecc, CLK, 2);
        pmecc_writel(host->ecc, IDR, 0xff);
@@ -1206,6 +1207,7 @@ static int atmel_pmecc_nand_init_params(struct platform_device *pdev,
                dev_warn(host->dev,
                        "Can't get I/O resource regs for PMECC controller, rolling back on software ECC\n");
                nand_chip->ecc.mode = NAND_ECC_SOFT;
+               nand_chip->ecc.algo = NAND_ECC_HAMMING;
                return 0;
        }
 
@@ -1280,11 +1282,8 @@ static int atmel_pmecc_nand_init_params(struct platform_device *pdev,
                        err_no = -EINVAL;
                        goto err;
                }
-               pmecc_config_ecc_layout(&atmel_pmecc_oobinfo,
-                                       mtd->oobsize,
-                                       nand_chip->ecc.total);
 
-               nand_chip->ecc.layout = &atmel_pmecc_oobinfo;
+               mtd_set_ooblayout(mtd, &nand_ooblayout_lp_ops);
                break;
        default:
                dev_warn(host->dev,
@@ -1292,6 +1291,7 @@ static int atmel_pmecc_nand_init_params(struct platform_device *pdev,
                /* page size not handled by HW ECC */
                /* switching back to soft ECC */
                nand_chip->ecc.mode = NAND_ECC_SOFT;
+               nand_chip->ecc.algo = NAND_ECC_HAMMING;
                return 0;
        }
 
@@ -1359,12 +1359,12 @@ static int atmel_nand_read_page(struct mtd_info *mtd, struct nand_chip *chip,
 {
        int eccsize = chip->ecc.size;
        int eccbytes = chip->ecc.bytes;
-       uint32_t *eccpos = chip->ecc.layout->eccpos;
        uint8_t *p = buf;
        uint8_t *oob = chip->oob_poi;
        uint8_t *ecc_pos;
        int stat;
        unsigned int max_bitflips = 0;
+       struct mtd_oob_region oobregion = {};
 
        /*
         * Errata: ALE is incorrectly wired up to the ECC controller
@@ -1382,19 +1382,20 @@ static int atmel_nand_read_page(struct mtd_info *mtd, struct nand_chip *chip,
        chip->read_buf(mtd, p, eccsize);
 
        /* move to ECC position if needed */
-       if (eccpos[0] != 0) {
-               /* This only works on large pages
-                * because the ECC controller waits for
-                * NAND_CMD_RNDOUTSTART after the
-                * NAND_CMD_RNDOUT.
-                * anyway, for small pages, the eccpos[0] == 0
+       mtd_ooblayout_ecc(mtd, 0, &oobregion);
+       if (oobregion.offset != 0) {
+               /*
+                * This only works on large pages because the ECC controller
+                * waits for NAND_CMD_RNDOUTSTART after the NAND_CMD_RNDOUT.
+                * Anyway, for small pages, the first ECC byte is at offset
+                * 0 in the OOB area.
                 */
                chip->cmdfunc(mtd, NAND_CMD_RNDOUT,
-                               mtd->writesize + eccpos[0], -1);
+                             mtd->writesize + oobregion.offset, -1);
        }
 
        /* the ECC controller needs to read the ECC just after the data */
-       ecc_pos = oob + eccpos[0];
+       ecc_pos = oob + oobregion.offset;
        chip->read_buf(mtd, ecc_pos, eccbytes);
 
        /* check if there's an error */
@@ -1504,58 +1505,17 @@ static void atmel_nand_hwctl(struct mtd_info *mtd, int mode)
                ecc_writel(host->ecc, CR, ATMEL_ECC_RST);
 }
 
-static int atmel_of_init_port(struct atmel_nand_host *host,
-                             struct device_node *np)
+static int atmel_of_init_ecc(struct atmel_nand_host *host,
+                            struct device_node *np)
 {
-       u32 val;
        u32 offset[2];
-       int ecc_mode;
-       struct atmel_nand_data *board = &host->board;
-       enum of_gpio_flags flags = 0;
-
-       host->caps = (struct atmel_nand_caps *)
-               of_device_get_match_data(host->dev);
-
-       if (of_property_read_u32(np, "atmel,nand-addr-offset", &val) == 0) {
-               if (val >= 32) {
-                       dev_err(host->dev, "invalid addr-offset %u\n", val);
-                       return -EINVAL;
-               }
-               board->ale = val;
-       }
-
-       if (of_property_read_u32(np, "atmel,nand-cmd-offset", &val) == 0) {
-               if (val >= 32) {
-                       dev_err(host->dev, "invalid cmd-offset %u\n", val);
-                       return -EINVAL;
-               }
-               board->cle = val;
-       }
-
-       ecc_mode = of_get_nand_ecc_mode(np);
-
-       board->ecc_mode = ecc_mode < 0 ? NAND_ECC_SOFT : ecc_mode;
-
-       board->on_flash_bbt = of_get_nand_on_flash_bbt(np);
-
-       board->has_dma = of_property_read_bool(np, "atmel,nand-has-dma");
-
-       if (of_get_nand_bus_width(np) == 16)
-               board->bus_width_16 = 1;
-
-       board->rdy_pin = of_get_gpio_flags(np, 0, &flags);
-       board->rdy_pin_active_low = (flags == OF_GPIO_ACTIVE_LOW);
-
-       board->enable_pin = of_get_gpio(np, 1);
-       board->det_pin = of_get_gpio(np, 2);
+       u32 val;
 
        host->has_pmecc = of_property_read_bool(np, "atmel,has-pmecc");
 
-       /* load the nfc driver if there is */
-       of_platform_populate(np, NULL, NULL, host->dev);
-
-       if (!(board->ecc_mode == NAND_ECC_HW) || !host->has_pmecc)
-               return 0;       /* Not using PMECC */
+       /* Not using PMECC */
+       if (!(host->nand_chip.ecc.mode == NAND_ECC_HW) || !host->has_pmecc)
+               return 0;
 
        /* use PMECC, get correction capability, sector size and lookup
         * table offset.
@@ -1596,16 +1556,65 @@ static int atmel_of_init_port(struct atmel_nand_host *host,
                /* Will build a lookup table and initialize the offset later */
                return 0;
        }
+
        if (!offset[0] && !offset[1]) {
                dev_err(host->dev, "Invalid PMECC lookup table offset\n");
                return -EINVAL;
        }
+
        host->pmecc_lookup_table_offset_512 = offset[0];
        host->pmecc_lookup_table_offset_1024 = offset[1];
 
        return 0;
 }
 
+static int atmel_of_init_port(struct atmel_nand_host *host,
+                             struct device_node *np)
+{
+       u32 val;
+       struct atmel_nand_data *board = &host->board;
+       enum of_gpio_flags flags = 0;
+
+       host->caps = (struct atmel_nand_caps *)
+               of_device_get_match_data(host->dev);
+
+       if (of_property_read_u32(np, "atmel,nand-addr-offset", &val) == 0) {
+               if (val >= 32) {
+                       dev_err(host->dev, "invalid addr-offset %u\n", val);
+                       return -EINVAL;
+               }
+               board->ale = val;
+       }
+
+       if (of_property_read_u32(np, "atmel,nand-cmd-offset", &val) == 0) {
+               if (val >= 32) {
+                       dev_err(host->dev, "invalid cmd-offset %u\n", val);
+                       return -EINVAL;
+               }
+               board->cle = val;
+       }
+
+       board->has_dma = of_property_read_bool(np, "atmel,nand-has-dma");
+
+       board->rdy_pin = of_get_gpio_flags(np, 0, &flags);
+       board->rdy_pin_active_low = (flags == OF_GPIO_ACTIVE_LOW);
+
+       board->enable_pin = of_get_gpio(np, 1);
+       board->det_pin = of_get_gpio(np, 2);
+
+       /* load the NFC driver if there is one */
+       of_platform_populate(np, NULL, NULL, host->dev);
+
+       /*
+        * Initialize ECC mode to NAND_ECC_SOFT so that we have a correct value
+        * even if the nand-ecc-mode property is not defined.
+        */
+       host->nand_chip.ecc.mode = NAND_ECC_SOFT;
+       host->nand_chip.ecc.algo = NAND_ECC_HAMMING;
+
+       return 0;
+}
+
 static int atmel_hw_nand_init_params(struct platform_device *pdev,
                                         struct atmel_nand_host *host)
 {
@@ -1618,6 +1627,7 @@ static int atmel_hw_nand_init_params(struct platform_device *pdev,
                dev_err(host->dev,
                        "Can't get I/O resource regs, use software ECC\n");
                nand_chip->ecc.mode = NAND_ECC_SOFT;
+               nand_chip->ecc.algo = NAND_ECC_HAMMING;
                return 0;
        }
 
@@ -1631,25 +1641,26 @@ static int atmel_hw_nand_init_params(struct platform_device *pdev,
        /* set ECC page size and oob layout */
        switch (mtd->writesize) {
        case 512:
-               nand_chip->ecc.layout = &atmel_oobinfo_small;
+               mtd_set_ooblayout(mtd, &atmel_ooblayout_sp_ops);
                ecc_writel(host->ecc, MR, ATMEL_ECC_PAGESIZE_528);
                break;
        case 1024:
-               nand_chip->ecc.layout = &atmel_oobinfo_large;
+               mtd_set_ooblayout(mtd, &nand_ooblayout_lp_ops);
                ecc_writel(host->ecc, MR, ATMEL_ECC_PAGESIZE_1056);
                break;
        case 2048:
-               nand_chip->ecc.layout = &atmel_oobinfo_large;
+               mtd_set_ooblayout(mtd, &nand_ooblayout_lp_ops);
                ecc_writel(host->ecc, MR, ATMEL_ECC_PAGESIZE_2112);
                break;
        case 4096:
-               nand_chip->ecc.layout = &atmel_oobinfo_large;
+               mtd_set_ooblayout(mtd, &nand_ooblayout_lp_ops);
                ecc_writel(host->ecc, MR, ATMEL_ECC_PAGESIZE_4224);
                break;
        default:
                /* page size not handled by HW ECC */
                /* switching back to soft ECC */
                nand_chip->ecc.mode = NAND_ECC_SOFT;
+               nand_chip->ecc.algo = NAND_ECC_HAMMING;
                return 0;
        }
 
@@ -1699,9 +1710,9 @@ static irqreturn_t hsmc_interrupt(int irq, void *dev_id)
                nfc_writel(host->nfc->hsmc_regs, IDR, NFC_SR_XFR_DONE);
                ret = IRQ_HANDLED;
        }
-       if (pending & host->nfc->caps->rb_mask) {
+       if (pending & NFC_SR_RB_EDGE) {
                complete(&host->nfc->comp_ready);
-               nfc_writel(host->nfc->hsmc_regs, IDR, host->nfc->caps->rb_mask);
+               nfc_writel(host->nfc->hsmc_regs, IDR, NFC_SR_RB_EDGE);
                ret = IRQ_HANDLED;
        }
        if (pending & NFC_SR_CMD_DONE) {
@@ -1719,7 +1730,7 @@ static void nfc_prepare_interrupt(struct atmel_nand_host *host, u32 flag)
        if (flag & NFC_SR_XFR_DONE)
                init_completion(&host->nfc->comp_xfer_done);
 
-       if (flag & host->nfc->caps->rb_mask)
+       if (flag & NFC_SR_RB_EDGE)
                init_completion(&host->nfc->comp_ready);
 
        if (flag & NFC_SR_CMD_DONE)
@@ -1737,7 +1748,7 @@ static int nfc_wait_interrupt(struct atmel_nand_host *host, u32 flag)
        if (flag & NFC_SR_XFR_DONE)
                comp[index++] = &host->nfc->comp_xfer_done;
 
-       if (flag & host->nfc->caps->rb_mask)
+       if (flag & NFC_SR_RB_EDGE)
                comp[index++] = &host->nfc->comp_ready;
 
        if (flag & NFC_SR_CMD_DONE)
@@ -1805,7 +1816,7 @@ static int nfc_device_ready(struct mtd_info *mtd)
                dev_err(host->dev, "Lost the interrupt flags: 0x%08x\n",
                                mask & status);
 
-       return status & host->nfc->caps->rb_mask;
+       return status & NFC_SR_RB_EDGE;
 }
 
 static void nfc_select_chip(struct mtd_info *mtd, int chip)
@@ -1978,8 +1989,8 @@ static void nfc_nand_command(struct mtd_info *mtd, unsigned int command,
                }
                /* fall through */
        default:
-               nfc_prepare_interrupt(host, host->nfc->caps->rb_mask);
-               nfc_wait_interrupt(host, host->nfc->caps->rb_mask);
+               nfc_prepare_interrupt(host, NFC_SR_RB_EDGE);
+               nfc_wait_interrupt(host, NFC_SR_RB_EDGE);
        }
 }
 
@@ -2147,6 +2158,19 @@ static int atmel_nand_probe(struct platform_device *pdev)
        } else {
                memcpy(&host->board, dev_get_platdata(&pdev->dev),
                       sizeof(struct atmel_nand_data));
+               nand_chip->ecc.mode = host->board.ecc_mode;
+
+               /*
+                * When using software ECC, every supported avr32 board uses the
+                * Hamming algorithm. If that ever changes we'll need to add an
+                * ecc_algo field to struct atmel_nand_data.
+                */
+               if (nand_chip->ecc.mode == NAND_ECC_SOFT)
+                       nand_chip->ecc.algo = NAND_ECC_HAMMING;
+
+               /* 16-bit bus width */
+               if (host->board.bus_width_16)
+                       nand_chip->options |= NAND_BUSWIDTH_16;
        }
 
         /* link the private data structures */
@@ -2188,11 +2212,8 @@ static int atmel_nand_probe(struct platform_device *pdev)
                nand_chip->cmd_ctrl = atmel_nand_cmd_ctrl;
        }
 
-       nand_chip->ecc.mode = host->board.ecc_mode;
        nand_chip->chip_delay = 40;             /* 40us command delay time */
 
-       if (host->board.bus_width_16)   /* 16-bit bus width */
-               nand_chip->options |= NAND_BUSWIDTH_16;
 
        nand_chip->read_buf = atmel_read_buf;
        nand_chip->write_buf = atmel_write_buf;
@@ -2225,11 +2246,6 @@ static int atmel_nand_probe(struct platform_device *pdev)
                }
        }
 
-       if (host->board.on_flash_bbt || on_flash_bbt) {
-               dev_info(&pdev->dev, "Use On Flash BBT\n");
-               nand_chip->bbt_options |= NAND_BBT_USE_FLASH;
-       }
-
        if (!host->board.has_dma)
                use_dma = 0;
 
@@ -2256,6 +2272,18 @@ static int atmel_nand_probe(struct platform_device *pdev)
                goto err_scan_ident;
        }
 
+       if (host->board.on_flash_bbt || on_flash_bbt)
+               nand_chip->bbt_options |= NAND_BBT_USE_FLASH;
+
+       if (nand_chip->bbt_options & NAND_BBT_USE_FLASH)
+               dev_info(&pdev->dev, "Use On Flash BBT\n");
+
+       if (IS_ENABLED(CONFIG_OF) && pdev->dev.of_node) {
+               res = atmel_of_init_ecc(host, pdev->dev.of_node);
+               if (res)
+                       goto err_hw_ecc;
+       }
+
        if (nand_chip->ecc.mode == NAND_ECC_HW) {
                if (host->has_pmecc)
                        res = atmel_pmecc_nand_init_params(pdev, host);
@@ -2393,11 +2421,6 @@ static int atmel_nand_nfc_probe(struct platform_device *pdev)
                }
        }
 
-       nfc->caps = (const struct atmel_nand_nfc_caps *)
-               of_device_get_match_data(&pdev->dev);
-       if (!nfc->caps)
-               return -ENODEV;
-
        nfc_writel(nfc->hsmc_regs, IDR, 0xffffffff);
        nfc_readl(nfc->hsmc_regs, SR);  /* clear the NFC_SR */
 
@@ -2426,17 +2449,8 @@ static int atmel_nand_nfc_remove(struct platform_device *pdev)
        return 0;
 }
 
-static const struct atmel_nand_nfc_caps sama5d3_nfc_caps = {
-       .rb_mask = NFC_SR_RB_EDGE0,
-};
-
-static const struct atmel_nand_nfc_caps sama5d4_nfc_caps = {
-       .rb_mask = NFC_SR_RB_EDGE3,
-};
-
 static const struct of_device_id atmel_nand_nfc_match[] = {
-       { .compatible = "atmel,sama5d3-nfc", .data = &sama5d3_nfc_caps },
-       { .compatible = "atmel,sama5d4-nfc", .data = &sama5d4_nfc_caps },
+       { .compatible = "atmel,sama5d3-nfc" },
        { /* sentinel */ }
 };
 MODULE_DEVICE_TABLE(of, atmel_nand_nfc_match);
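
Most hunks in this file perform the same conversion: the static struct nand_ecclayout (eccpos[] / oobfree[] arrays) disappears and callers query the OOB layout through mtd_ooblayout_ecc() / mtd_ooblayout_free(), backed by a mtd_ooblayout_ops table registered with mtd_set_ooblayout(). The following user-space sketch mirrors that callback model for the common large-page case (ECC packed at the end of the OOB area, free bytes after the bad-block marker); the structure names are local stand-ins, not the kernel definitions, and the 64-byte OOB / 52 ECC byte numbers are picked only for illustration.

#include <stdio.h>
#include <errno.h>

/* Local stand-ins for the kernel's mtd_oob_region / mtd_ooblayout_ops. */
struct oob_region {
    unsigned int offset;
    unsigned int length;
};

struct oob_layout_ops {
    int (*ecc)(unsigned int oobsize, unsigned int eccbytes,
               int section, struct oob_region *r);
    int (*free)(unsigned int oobsize, unsigned int eccbytes,
                int section, struct oob_region *r);
};

/* Large-page style layout: ECC at the end, free area after the BBM. */
static int lp_ecc(unsigned int oobsize, unsigned int eccbytes,
                  int section, struct oob_region *r)
{
    if (section)
        return -ERANGE;

    r->offset = oobsize - eccbytes;
    r->length = eccbytes;

    return 0;
}

static int lp_free(unsigned int oobsize, unsigned int eccbytes,
                   int section, struct oob_region *r)
{
    if (section)
        return -ERANGE;

    r->offset = 2;      /* skip the bad-block marker bytes */
    r->length = oobsize - eccbytes - 2;

    return 0;
}

static const struct oob_layout_ops lp_ops = { .ecc = lp_ecc, .free = lp_free };

int main(void)
{
    struct oob_region r;

    /* e.g. 64-byte OOB, 4 sectors x 13 ECC bytes = 52 ECC bytes */
    lp_ops.ecc(64, 52, 0, &r);
    printf("ecc : offset %u, length %u\n", r.offset, r.length);

    lp_ops.free(64, 52, 0, &r);
    printf("free: offset %u, length %u\n", r.offset, r.length);

    return 0;
}

The kernel's generic nand_ooblayout_lp_ops that the driver now relies on handles more corner cases (multiple sections, ECC spilling into the marker area), but the query pattern is the same.
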
index 0bbc1fa..4d5d262 100644 (file)
@@ -42,8 +42,7 @@
 #define                NFC_SR_UNDEF            (1 << 21)
 #define                NFC_SR_AWB              (1 << 22)
 #define                NFC_SR_ASE              (1 << 23)
-#define                NFC_SR_RB_EDGE0         (1 << 24)
-#define                NFC_SR_RB_EDGE3         (1 << 27)
+#define                NFC_SR_RB_EDGE          (1 << 24)
 
 #define ATMEL_HSMC_NFC_IER     0x0c
 #define ATMEL_HSMC_NFC_IDR     0x10
index 341ea49..9bf6d99 100644 (file)
@@ -459,6 +459,7 @@ static int au1550nd_probe(struct platform_device *pdev)
        /* 30 us command delay time */
        this->chip_delay = 30;
        this->ecc.mode = NAND_ECC_SOFT;
+       this->ecc.algo = NAND_ECC_HAMMING;
 
        if (pd->devwidth)
                this->options |= NAND_BUSWIDTH_16;
index 7f6b30e..37da423 100644 (file)
@@ -109,28 +109,33 @@ static const unsigned short bfin_nfc_pin_req[] =
         0};
 
 #ifdef CONFIG_MTD_NAND_BF5XX_BOOTROM_ECC
-static struct nand_ecclayout bootrom_ecclayout = {
-       .eccbytes = 24,
-       .eccpos = {
-               0x8 * 0, 0x8 * 0 + 1, 0x8 * 0 + 2,
-               0x8 * 1, 0x8 * 1 + 1, 0x8 * 1 + 2,
-               0x8 * 2, 0x8 * 2 + 1, 0x8 * 2 + 2,
-               0x8 * 3, 0x8 * 3 + 1, 0x8 * 3 + 2,
-               0x8 * 4, 0x8 * 4 + 1, 0x8 * 4 + 2,
-               0x8 * 5, 0x8 * 5 + 1, 0x8 * 5 + 2,
-               0x8 * 6, 0x8 * 6 + 1, 0x8 * 6 + 2,
-               0x8 * 7, 0x8 * 7 + 1, 0x8 * 7 + 2
-       },
-       .oobfree = {
-               { 0x8 * 0 + 3, 5 },
-               { 0x8 * 1 + 3, 5 },
-               { 0x8 * 2 + 3, 5 },
-               { 0x8 * 3 + 3, 5 },
-               { 0x8 * 4 + 3, 5 },
-               { 0x8 * 5 + 3, 5 },
-               { 0x8 * 6 + 3, 5 },
-               { 0x8 * 7 + 3, 5 },
-       }
+static int bootrom_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                struct mtd_oob_region *oobregion)
+{
+       if (section > 7)
+               return -ERANGE;
+
+       oobregion->offset = section * 8;
+       oobregion->length = 3;
+
+       return 0;
+}
+
+static int bootrom_ooblayout_free(struct mtd_info *mtd, int section,
+                                 struct mtd_oob_region *oobregion)
+{
+       if (section > 7)
+               return -ERANGE;
+
+       oobregion->offset = (section * 8) + 3;
+       oobregion->length = 5;
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops bootrom_ooblayout_ops = {
+       .ecc = bootrom_ooblayout_ecc,
+       .free = bootrom_ooblayout_free,
 };
 #endif
 
@@ -800,7 +805,7 @@ static int bf5xx_nand_probe(struct platform_device *pdev)
        /* setup hardware ECC data struct */
        if (hardware_ecc) {
 #ifdef CONFIG_MTD_NAND_BF5XX_BOOTROM_ECC
-               chip->ecc.layout = &bootrom_ecclayout;
+               mtd_set_ooblayout(mtd, &bootrom_ooblayout_ops);
 #endif
                chip->read_buf      = bf5xx_nand_dma_read_buf;
                chip->write_buf     = bf5xx_nand_dma_write_buf;
@@ -812,6 +817,7 @@ static int bf5xx_nand_probe(struct platform_device *pdev)
                chip->ecc.write_page_raw = bf5xx_nand_write_page_raw;
        } else {
                chip->ecc.mode      = NAND_ECC_SOFT;
+               chip->ecc.algo  = NAND_ECC_HAMMING;
        }
 
        /* scan hardware nand chip and setup mtd info data struct */
index e052839..b76ad7c 100644 (file)
@@ -32,7 +32,6 @@
 #include <linux/mtd/nand.h>
 #include <linux/mtd/partitions.h>
 #include <linux/of.h>
-#include <linux/of_mtd.h>
 #include <linux/of_platform.h>
 #include <linux/slab.h>
 #include <linux/list.h>
@@ -601,7 +600,7 @@ static void brcmnand_wr_corr_thresh(struct brcmnand_host *host, u8 val)
 
 static inline int brcmnand_cmd_shift(struct brcmnand_controller *ctrl)
 {
-       if (ctrl->nand_version < 0x0700)
+       if (ctrl->nand_version < 0x0602)
                return 24;
        return 0;
 }
@@ -781,127 +780,183 @@ static inline bool is_hamming_ecc(struct brcmnand_cfg *cfg)
 }
 
 /*
- * Returns a nand_ecclayout strucutre for the given layout/configuration.
- * Returns NULL on failure.
+ * Set mtd->ooblayout to the appropriate mtd_ooblayout_ops given
+ * the layout/configuration.
+ * Returns -ERRCODE on failure.
  */
-static struct nand_ecclayout *brcmnand_create_layout(int ecc_level,
-                                                    struct brcmnand_host *host)
+static int brcmnand_hamming_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                         struct mtd_oob_region *oobregion)
 {
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct brcmnand_host *host = nand_get_controller_data(chip);
        struct brcmnand_cfg *cfg = &host->hwcfg;
-       int i, j;
-       struct nand_ecclayout *layout;
-       int req;
-       int sectors;
-       int sas;
-       int idx1, idx2;
-
-       layout = devm_kzalloc(&host->pdev->dev, sizeof(*layout), GFP_KERNEL);
-       if (!layout)
-               return NULL;
-
-       sectors = cfg->page_size / (512 << cfg->sector_size_1k);
-       sas = cfg->spare_area_size << cfg->sector_size_1k;
-
-       /* Hamming */
-       if (is_hamming_ecc(cfg)) {
-               for (i = 0, idx1 = 0, idx2 = 0; i < sectors; i++) {
-                       /* First sector of each page may have BBI */
-                       if (i == 0) {
-                               layout->oobfree[idx2].offset = i * sas + 1;
-                               /* Small-page NAND use byte 6 for BBI */
-                               if (cfg->page_size == 512)
-                                       layout->oobfree[idx2].offset--;
-                               layout->oobfree[idx2].length = 5;
-                       } else {
-                               layout->oobfree[idx2].offset = i * sas;
-                               layout->oobfree[idx2].length = 6;
-                       }
-                       idx2++;
-                       layout->eccpos[idx1++] = i * sas + 6;
-                       layout->eccpos[idx1++] = i * sas + 7;
-                       layout->eccpos[idx1++] = i * sas + 8;
-                       layout->oobfree[idx2].offset = i * sas + 9;
-                       layout->oobfree[idx2].length = 7;
-                       idx2++;
-                       /* Leave zero-terminated entry for OOBFREE */
-                       if (idx1 >= MTD_MAX_ECCPOS_ENTRIES_LARGE ||
-                                   idx2 >= MTD_MAX_OOBFREE_ENTRIES_LARGE - 1)
-                               break;
-               }
+       int sas = cfg->spare_area_size << cfg->sector_size_1k;
+       int sectors = cfg->page_size / (512 << cfg->sector_size_1k);
 
-               return layout;
-       }
+       if (section >= sectors)
+               return -ERANGE;
 
-       /*
-        * CONTROLLER_VERSION:
-        *   < v5.0: ECC_REQ = ceil(BCH_T * 13/8)
-        *  >= v5.0: ECC_REQ = ceil(BCH_T * 14/8)
-        * But we will just be conservative.
-        */
-       req = DIV_ROUND_UP(ecc_level * 14, 8);
-       if (req >= sas) {
-               dev_err(&host->pdev->dev,
-                       "error: ECC too large for OOB (ECC bytes %d, spare sector %d)\n",
-                       req, sas);
-               return NULL;
-       }
+       oobregion->offset = (section * sas) + 6;
+       oobregion->length = 3;
+
+       return 0;
+}
+
+static int brcmnand_hamming_ooblayout_free(struct mtd_info *mtd, int section,
+                                          struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct brcmnand_host *host = nand_get_controller_data(chip);
+       struct brcmnand_cfg *cfg = &host->hwcfg;
+       int sas = cfg->spare_area_size << cfg->sector_size_1k;
+       int sectors = cfg->page_size / (512 << cfg->sector_size_1k);
 
-       layout->eccbytes = req * sectors;
-       for (i = 0, idx1 = 0, idx2 = 0; i < sectors; i++) {
-               for (j = sas - req; j < sas && idx1 <
-                               MTD_MAX_ECCPOS_ENTRIES_LARGE; j++, idx1++)
-                       layout->eccpos[idx1] = i * sas + j;
+       if (section >= sectors * 2)
+               return -ERANGE;
+
+       oobregion->offset = (section / 2) * sas;
+
+       if (section & 1) {
+               oobregion->offset += 9;
+               oobregion->length = 7;
+       } else {
+               oobregion->length = 6;
 
                /* First sector of each page may have BBI */
-               if (i == 0) {
-                       if (cfg->page_size == 512 && (sas - req >= 6)) {
-                               /* Small-page NAND use byte 6 for BBI */
-                               layout->oobfree[idx2].offset = 0;
-                               layout->oobfree[idx2].length = 5;
-                               idx2++;
-                               if (sas - req > 6) {
-                                       layout->oobfree[idx2].offset = 6;
-                                       layout->oobfree[idx2].length =
-                                               sas - req - 6;
-                                       idx2++;
-                               }
-                       } else if (sas > req + 1) {
-                               layout->oobfree[idx2].offset = i * sas + 1;
-                               layout->oobfree[idx2].length = sas - req - 1;
-                               idx2++;
-                       }
-               } else if (sas > req) {
-                       layout->oobfree[idx2].offset = i * sas;
-                       layout->oobfree[idx2].length = sas - req;
-                       idx2++;
+               if (!section) {
+                       /*
+                        * Small-page NAND uses byte 6 for the BBI while
+                        * large-page NAND uses byte 0.
+                        */
+                       if (cfg->page_size > 512)
+                               oobregion->offset++;
+                       oobregion->length--;
                }
-               /* Leave zero-terminated entry for OOBFREE */
-               if (idx1 >= MTD_MAX_ECCPOS_ENTRIES_LARGE ||
-                               idx2 >= MTD_MAX_OOBFREE_ENTRIES_LARGE - 1)
-                       break;
        }
 
-       return layout;
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops brcmnand_hamming_ooblayout_ops = {
+       .ecc = brcmnand_hamming_ooblayout_ecc,
+       .free = brcmnand_hamming_ooblayout_free,
+};
+
+static int brcmnand_bch_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                     struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct brcmnand_host *host = nand_get_controller_data(chip);
+       struct brcmnand_cfg *cfg = &host->hwcfg;
+       int sas = cfg->spare_area_size << cfg->sector_size_1k;
+       int sectors = cfg->page_size / (512 << cfg->sector_size_1k);
+
+       if (section >= sectors)
+               return -ERANGE;
+
+       oobregion->offset = (section * (sas + 1)) - chip->ecc.bytes;
+       oobregion->length = chip->ecc.bytes;
+
+       return 0;
+}
+
+static int brcmnand_bch_ooblayout_free_lp(struct mtd_info *mtd, int section,
+                                         struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct brcmnand_host *host = nand_get_controller_data(chip);
+       struct brcmnand_cfg *cfg = &host->hwcfg;
+       int sas = cfg->spare_area_size << cfg->sector_size_1k;
+       int sectors = cfg->page_size / (512 << cfg->sector_size_1k);
+
+       if (section >= sectors)
+               return -ERANGE;
+
+       if (sas <= chip->ecc.bytes)
+               return 0;
+
+       oobregion->offset = section * sas;
+       oobregion->length = sas - chip->ecc.bytes;
+
+       if (!section) {
+               oobregion->offset++;
+               oobregion->length--;
+       }
+
+       return 0;
 }
 
-static struct nand_ecclayout *brcmstb_choose_ecc_layout(
-               struct brcmnand_host *host)
+static int brcmnand_bch_ooblayout_free_sp(struct mtd_info *mtd, int section,
+                                         struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct brcmnand_host *host = nand_get_controller_data(chip);
+       struct brcmnand_cfg *cfg = &host->hwcfg;
+       int sas = cfg->spare_area_size << cfg->sector_size_1k;
+
+       if (section > 1 || sas - chip->ecc.bytes < 6 ||
+           (section && sas - chip->ecc.bytes == 6))
+               return -ERANGE;
+
+       if (!section) {
+               oobregion->offset = 0;
+               oobregion->length = 5;
+       } else {
+               oobregion->offset = 6;
+               oobregion->length = sas - chip->ecc.bytes - 6;
+       }
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops brcmnand_bch_lp_ooblayout_ops = {
+       .ecc = brcmnand_bch_ooblayout_ecc,
+       .free = brcmnand_bch_ooblayout_free_lp,
+};
+
+static const struct mtd_ooblayout_ops brcmnand_bch_sp_ooblayout_ops = {
+       .ecc = brcmnand_bch_ooblayout_ecc,
+       .free = brcmnand_bch_ooblayout_free_sp,
+};
+
+static int brcmstb_choose_ecc_layout(struct brcmnand_host *host)
 {
-       struct nand_ecclayout *layout;
        struct brcmnand_cfg *p = &host->hwcfg;
+       struct mtd_info *mtd = nand_to_mtd(&host->chip);
+       struct nand_ecc_ctrl *ecc = &host->chip.ecc;
        unsigned int ecc_level = p->ecc_level;
+       int sas = p->spare_area_size << p->sector_size_1k;
+       int sectors = p->page_size / (512 << p->sector_size_1k);
 
        if (p->sector_size_1k)
                ecc_level <<= 1;
 
-       layout = brcmnand_create_layout(ecc_level, host);
-       if (!layout) {
+       if (is_hamming_ecc(p)) {
+               ecc->bytes = 3 * sectors;
+               mtd_set_ooblayout(mtd, &brcmnand_hamming_ooblayout_ops);
+               return 0;
+       }
+
+       /*
+        * CONTROLLER_VERSION:
+        *   < v5.0: ECC_REQ = ceil(BCH_T * 13/8)
+        *  >= v5.0: ECC_REQ = ceil(BCH_T * 14/8)
+        * But we will just be conservative.
+        */
+       ecc->bytes = DIV_ROUND_UP(ecc_level * 14, 8);
+       if (p->page_size == 512)
+               mtd_set_ooblayout(mtd, &brcmnand_bch_sp_ooblayout_ops);
+       else
+               mtd_set_ooblayout(mtd, &brcmnand_bch_lp_ooblayout_ops);
+
+       if (ecc->bytes >= sas) {
                dev_err(&host->pdev->dev,
-                               "no proper ecc_layout for this NAND cfg\n");
-               return NULL;
+                       "error: ECC too large for OOB (ECC bytes %d, spare sector %d)\n",
+                       ecc->bytes, sas);
+               return -EINVAL;
        }
 
-       return layout;
+       return 0;
 }
 
 static void brcmnand_wp(struct mtd_info *mtd, int wp)
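
As a worked example of the ECC_REQ comment above (numbers chosen for illustration, not taken from the patch): a strength-8 BCH code over 512-byte sectors needs ECC_REQ = ceil(8 * 14 / 8) = 14 bytes per sector, and with 1 KiB sectors the strength is doubled first (ecc_level <<= 1), giving ceil(16 * 14 / 8) = 28 bytes per step; if that ever exceeds the per-sector spare area, brcmstb_choose_ecc_layout() now fails with -EINVAL instead of returning a NULL layout.
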
@@ -1870,9 +1925,31 @@ static int brcmnand_setup_dev(struct brcmnand_host *host)
        cfg->col_adr_bytes = 2;
        cfg->blk_adr_bytes = get_blk_adr_bytes(mtd->size, mtd->writesize);
 
+       if (chip->ecc.mode != NAND_ECC_HW) {
+               dev_err(ctrl->dev, "only HW ECC supported; selected: %d\n",
+                       chip->ecc.mode);
+               return -EINVAL;
+       }
+
+       if (chip->ecc.algo == NAND_ECC_UNKNOWN) {
+               if (chip->ecc.strength == 1 && chip->ecc.size == 512)
+                       /* Default to Hamming for 1-bit ECC, if unspecified */
+                       chip->ecc.algo = NAND_ECC_HAMMING;
+               else
+                       /* Otherwise, BCH */
+                       chip->ecc.algo = NAND_ECC_BCH;
+       }
+
+       if (chip->ecc.algo == NAND_ECC_HAMMING && (chip->ecc.strength != 1 ||
+                                                  chip->ecc.size != 512)) {
+               dev_err(ctrl->dev, "invalid Hamming params: %d bits per %d bytes\n",
+                       chip->ecc.strength, chip->ecc.size);
+               return -EINVAL;
+       }
+
        switch (chip->ecc.size) {
        case 512:
-               if (chip->ecc.strength == 1) /* Hamming */
+               if (chip->ecc.algo == NAND_ECC_HAMMING)
                        cfg->ecc_level = 15;
                else
                        cfg->ecc_level = chip->ecc.strength;
@@ -2001,8 +2078,8 @@ static int brcmnand_init_cs(struct brcmnand_host *host, struct device_node *dn)
         */
        chip->options |= NAND_USE_BOUNCE_BUFFER;
 
-       if (of_get_nand_on_flash_bbt(dn))
-               chip->bbt_options |= NAND_BBT_USE_FLASH | NAND_BBT_NO_OOB;
+       if (chip->bbt_options & NAND_BBT_USE_FLASH)
+               chip->bbt_options |= NAND_BBT_NO_OOB;
 
        if (brcmnand_setup_dev(host))
                return -ENXIO;
@@ -2011,9 +2088,9 @@ static int brcmnand_init_cs(struct brcmnand_host *host, struct device_node *dn)
        /* only use our internal HW threshold */
        mtd->bitflip_threshold = 1;
 
-       chip->ecc.layout = brcmstb_choose_ecc_layout(host);
-       if (!chip->ecc.layout)
-               return -ENXIO;
+       ret = brcmstb_choose_ecc_layout(host);
+       if (ret)
+               return ret;
 
        if (nand_scan_tail(mtd))
                return -ENXIO;
@@ -2115,6 +2192,7 @@ static const struct of_device_id brcmnand_of_match[] = {
        { .compatible = "brcm,brcmnand-v5.0" },
        { .compatible = "brcm,brcmnand-v6.0" },
        { .compatible = "brcm,brcmnand-v6.1" },
+       { .compatible = "brcm,brcmnand-v6.2" },
        { .compatible = "brcm,brcmnand-v7.0" },
        { .compatible = "brcm,brcmnand-v7.1" },
        {},
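
For reference, a short stand-alone sketch that reproduces the per-section arithmetic of the new Hamming ooblayout callbacks above. The sas = 16 spare bytes per 512-byte sector, four sectors and large-page assumption are illustrative values, not something stated in the patch.

#include <stdio.h>

int main(void)
{
    const int sas = 16, sectors = 4, large_page = 1;
    int section;

    /* ECC sections: 3 bytes at offset 6 of each sector's spare area. */
    for (section = 0; section < sectors; section++)
        printf("ecc  %d: offset %2d, length 3\n", section, section * sas + 6);

    /* Free sections: two per sector, around the 3 ECC bytes. */
    for (section = 0; section < sectors * 2; section++) {
        int offset = (section / 2) * sas;
        int length;

        if (section & 1) {
            offset += 9;
            length = 7;
        } else {
            length = 6;
            if (!section) {         /* first sector carries the BBI */
                if (large_page)
                    offset++;
                length--;
            }
        }
        printf("free %d: offset %2d, length %d\n", section, offset, length);
    }

    return 0;
}

Running it yields the same map the old brcmnand_create_layout() built by hand for this geometry: ECC at bytes 6-8, 22-24, 38-40 and 54-56, with the remaining spare bytes reported as free.
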
index e553aff..0b0c937 100644 (file)
@@ -459,10 +459,37 @@ static int cafe_nand_read_page(struct mtd_info *mtd, struct nand_chip *chip,
        return max_bitflips;
 }
 
-static struct nand_ecclayout cafe_oobinfo_2048 = {
-       .eccbytes = 14,
-       .eccpos = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13},
-       .oobfree = {{14, 50}}
+static int cafe_ooblayout_ecc(struct mtd_info *mtd, int section,
+                             struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+
+       if (section)
+               return -ERANGE;
+
+       oobregion->offset = 0;
+       oobregion->length = chip->ecc.total;
+
+       return 0;
+}
+
+static int cafe_ooblayout_free(struct mtd_info *mtd, int section,
+                              struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+
+       if (section)
+               return -ERANGE;
+
+       oobregion->offset = chip->ecc.total;
+       oobregion->length = mtd->oobsize - chip->ecc.total;
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops cafe_ooblayout_ops = {
+       .ecc = cafe_ooblayout_ecc,
+       .free = cafe_ooblayout_free,
 };
 
 /* Ick. The BBT code really ought to be able to work this bit out
@@ -494,12 +521,6 @@ static struct nand_bbt_descr cafe_bbt_mirror_descr_2048 = {
        .pattern = cafe_mirror_pattern_2048
 };
 
-static struct nand_ecclayout cafe_oobinfo_512 = {
-       .eccbytes = 14,
-       .eccpos = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13},
-       .oobfree = {{14, 2}}
-};
-
 static struct nand_bbt_descr cafe_bbt_main_descr_512 = {
        .options = NAND_BBT_LASTBLOCK | NAND_BBT_CREATE | NAND_BBT_WRITE
                | NAND_BBT_2BIT | NAND_BBT_VERSION,
@@ -743,12 +764,11 @@ static int cafe_nand_probe(struct pci_dev *pdev,
                cafe->ctl2 |= 1<<29; /* 2KiB page size */
 
        /* Set up ECC according to the type of chip we found */
+       mtd_set_ooblayout(mtd, &cafe_ooblayout_ops);
        if (mtd->writesize == 2048) {
-               cafe->nand.ecc.layout = &cafe_oobinfo_2048;
                cafe->nand.bbt_td = &cafe_bbt_main_descr_2048;
                cafe->nand.bbt_md = &cafe_bbt_mirror_descr_2048;
        } else if (mtd->writesize == 512) {
-               cafe->nand.ecc.layout = &cafe_oobinfo_512;
                cafe->nand.bbt_td = &cafe_bbt_main_descr_512;
                cafe->nand.bbt_md = &cafe_bbt_mirror_descr_512;
        } else {
index 6f97ebb..4913378 100644 (file)
@@ -187,6 +187,7 @@ static int __init cmx270_init(void)
        /* 15 us command delay time */
        this->chip_delay = 20;
        this->ecc.mode = NAND_ECC_SOFT;
+       this->ecc.algo = NAND_ECC_HAMMING;
 
        /* read/write functions */
        this->read_byte = cmx270_read_byte;
index 8cb821b..cc07ba0 100644 (file)
@@ -34,7 +34,6 @@
 #include <linux/slab.h>
 #include <linux/of_device.h>
 #include <linux/of.h>
-#include <linux/of_mtd.h>
 
 #include <linux/platform_data/mtd-davinci.h>
 #include <linux/platform_data/mtd-davinci-aemif.h>
@@ -54,7 +53,6 @@
  */
 struct davinci_nand_info {
        struct nand_chip        chip;
-       struct nand_ecclayout   ecclayout;
 
        struct device           *dev;
        struct clk              *clk;
@@ -480,63 +478,46 @@ static int nand_davinci_dev_ready(struct mtd_info *mtd)
  * ten ECC bytes plus the manufacturer's bad block marker byte,
  * and not overlapping the default BBT markers.
  */
-static struct nand_ecclayout hwecc4_small = {
-       .eccbytes = 10,
-       .eccpos = { 0, 1, 2, 3, 4,
-               /* offset 5 holds the badblock marker */
-               6, 7,
-               13, 14, 15, },
-       .oobfree = {
-               {.offset = 8, .length = 5, },
-               {.offset = 16, },
-       },
-};
+static int hwecc4_ooblayout_small_ecc(struct mtd_info *mtd, int section,
+                                     struct mtd_oob_region *oobregion)
+{
+       if (section > 2)
+               return -ERANGE;
+
+       if (!section) {
+               oobregion->offset = 0;
+               oobregion->length = 5;
+       } else if (section == 1) {
+               oobregion->offset = 6;
+               oobregion->length = 2;
+       } else {
+               oobregion->offset = 13;
+               oobregion->length = 3;
+       }
 
-/* An ECC layout for using 4-bit ECC with large-page (2048bytes) flash,
- * storing ten ECC bytes plus the manufacturer's bad block marker byte,
- * and not overlapping the default BBT markers.
- */
-static struct nand_ecclayout hwecc4_2048 = {
-       .eccbytes = 40,
-       .eccpos = {
-               /* at the end of spare sector */
-               24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
-               34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
-               44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
-               54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-               },
-       .oobfree = {
-               /* 2 bytes at offset 0 hold manufacturer badblock markers */
-               {.offset = 2, .length = 22, },
-               /* 5 bytes at offset 8 hold BBT markers */
-               /* 8 bytes at offset 16 hold JFFS2 clean markers */
-       },
-};
+       return 0;
+}
 
-/*
- * An ECC layout for using 4-bit ECC with large-page (4096bytes) flash,
- * storing ten ECC bytes plus the manufacturer's bad block marker byte,
- * and not overlapping the default BBT markers.
- */
-static struct nand_ecclayout hwecc4_4096 = {
-       .eccbytes = 80,
-       .eccpos = {
-               /* at the end of spare sector */
-               48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
-               58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
-               68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
-               78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
-               88, 89, 90, 91, 92, 93, 94, 95, 96, 97,
-               98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
-               108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
-               118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
-       },
-       .oobfree = {
-               /* 2 bytes at offset 0 hold manufacturer badblock markers */
-               {.offset = 2, .length = 46, },
-               /* 5 bytes at offset 8 hold BBT markers */
-               /* 8 bytes at offset 16 hold JFFS2 clean markers */
-       },
+static int hwecc4_ooblayout_small_free(struct mtd_info *mtd, int section,
+                                      struct mtd_oob_region *oobregion)
+{
+       if (section > 1)
+               return -ERANGE;
+
+       if (!section) {
+               oobregion->offset = 8;
+               oobregion->length = 5;
+       } else {
+               oobregion->offset = 16;
+               oobregion->length = mtd->oobsize - 16;
+       }
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops hwecc4_small_ooblayout_ops = {
+       .ecc = hwecc4_ooblayout_small_ecc,
+       .free = hwecc4_ooblayout_small_free,
 };
 
 #if defined(CONFIG_OF)
@@ -577,8 +558,6 @@ static struct davinci_nand_pdata
                        "ti,davinci-mask-chipsel", &prop))
                        pdata->mask_chipsel = prop;
                if (!of_property_read_string(pdev->dev.of_node,
-                       "nand-ecc-mode", &mode) ||
-                   !of_property_read_string(pdev->dev.of_node,
                        "ti,davinci-ecc-mode", &mode)) {
                        if (!strncmp("none", mode, 4))
                                pdata->ecc_mode = NAND_ECC_NONE;
@@ -591,14 +570,11 @@ static struct davinci_nand_pdata
                        "ti,davinci-ecc-bits", &prop))
                        pdata->ecc_bits = prop;
 
-               prop = of_get_nand_bus_width(pdev->dev.of_node);
-               if (0 < prop || !of_property_read_u32(pdev->dev.of_node,
-                       "ti,davinci-nand-buswidth", &prop))
-                       if (prop == 16)
-                               pdata->options |= NAND_BUSWIDTH_16;
+               if (!of_property_read_u32(pdev->dev.of_node,
+                       "ti,davinci-nand-buswidth", &prop) && prop == 16)
+                       pdata->options |= NAND_BUSWIDTH_16;
+
                if (of_property_read_bool(pdev->dev.of_node,
-                       "nand-on-flash-bbt") ||
-                   of_property_read_bool(pdev->dev.of_node,
                        "ti,davinci-nand-use-bbt"))
                        pdata->bbt_options = NAND_BBT_USE_FLASH;
 
@@ -628,7 +604,6 @@ static int nand_davinci_probe(struct platform_device *pdev)
        void __iomem                    *base;
        int                             ret;
        uint32_t                        val;
-       nand_ecc_modes_t                ecc_mode;
        struct mtd_info                 *mtd;
 
        pdata = nand_davinci_get_pdata(pdev);
@@ -712,13 +687,53 @@ static int nand_davinci_probe(struct platform_device *pdev)
        info->chip.write_buf    = nand_davinci_write_buf;
 
        /* Use board-specific ECC config */
-       ecc_mode                = pdata->ecc_mode;
+       info->chip.ecc.mode     = pdata->ecc_mode;
 
        ret = -EINVAL;
-       switch (ecc_mode) {
+
+       info->clk = devm_clk_get(&pdev->dev, "aemif");
+       if (IS_ERR(info->clk)) {
+               ret = PTR_ERR(info->clk);
+               dev_dbg(&pdev->dev, "unable to get AEMIF clock, err %d\n", ret);
+               return ret;
+       }
+
+       ret = clk_prepare_enable(info->clk);
+       if (ret < 0) {
+               dev_dbg(&pdev->dev, "unable to enable AEMIF clock, err %d\n",
+                       ret);
+               goto err_clk_enable;
+       }
+
+       spin_lock_irq(&davinci_nand_lock);
+
+       /* put CSxNAND into NAND mode */
+       val = davinci_nand_readl(info, NANDFCR_OFFSET);
+       val |= BIT(info->core_chipsel);
+       davinci_nand_writel(info, NANDFCR_OFFSET, val);
+
+       spin_unlock_irq(&davinci_nand_lock);
+
+       /* Scan to find existence of the device(s) */
+       ret = nand_scan_ident(mtd, pdata->mask_chipsel ? 2 : 1, NULL);
+       if (ret < 0) {
+               dev_dbg(&pdev->dev, "no NAND chip(s) found\n");
+               goto err;
+       }
+
+       switch (info->chip.ecc.mode) {
        case NAND_ECC_NONE:
+               pdata->ecc_bits = 0;
+               break;
        case NAND_ECC_SOFT:
                pdata->ecc_bits = 0;
+               /*
+                * This driver expects Hamming based ECC when ecc_mode is set
+                * to NAND_ECC_SOFT. Force ecc.algo to NAND_ECC_HAMMING to
+                * avoid adding an extra ->ecc_algo field to
+                * davinci_nand_pdata.
+                */
+               info->chip.ecc.algo = NAND_ECC_HAMMING;
                break;
        case NAND_ECC_HW:
                if (pdata->ecc_bits == 4) {
@@ -754,37 +769,6 @@ static int nand_davinci_probe(struct platform_device *pdev)
        default:
                return -EINVAL;
        }
-       info->chip.ecc.mode = ecc_mode;
-
-       info->clk = devm_clk_get(&pdev->dev, "aemif");
-       if (IS_ERR(info->clk)) {
-               ret = PTR_ERR(info->clk);
-               dev_dbg(&pdev->dev, "unable to get AEMIF clock, err %d\n", ret);
-               return ret;
-       }
-
-       ret = clk_prepare_enable(info->clk);
-       if (ret < 0) {
-               dev_dbg(&pdev->dev, "unable to enable AEMIF clock, err %d\n",
-                       ret);
-               goto err_clk_enable;
-       }
-
-       spin_lock_irq(&davinci_nand_lock);
-
-       /* put CSxNAND into NAND mode */
-       val = davinci_nand_readl(info, NANDFCR_OFFSET);
-       val |= BIT(info->core_chipsel);
-       davinci_nand_writel(info, NANDFCR_OFFSET, val);
-
-       spin_unlock_irq(&davinci_nand_lock);
-
-       /* Scan to find existence of the device(s) */
-       ret = nand_scan_ident(mtd, pdata->mask_chipsel ? 2 : 1, NULL);
-       if (ret < 0) {
-               dev_dbg(&pdev->dev, "no NAND chip(s) found\n");
-               goto err;
-       }
 
        /* Update ECC layout if needed ... for 1-bit HW ECC, the default
         * is OK, but it allocates 6 bytes when only 3 are needed (for
@@ -805,26 +789,14 @@ static int nand_davinci_probe(struct platform_device *pdev)
                 * table marker fits in the free bytes.
                 */
                if (chunks == 1) {
-                       info->ecclayout = hwecc4_small;
-                       info->ecclayout.oobfree[1].length = mtd->oobsize - 16;
-                       goto syndrome_done;
-               }
-               if (chunks == 4) {
-                       info->ecclayout = hwecc4_2048;
-                       info->chip.ecc.mode = NAND_ECC_HW_OOB_FIRST;
-                       goto syndrome_done;
-               }
-               if (chunks == 8) {
-                       info->ecclayout = hwecc4_4096;
+                       mtd_set_ooblayout(mtd, &hwecc4_small_ooblayout_ops);
+               } else if (chunks == 4 || chunks == 8) {
+                       mtd_set_ooblayout(mtd, &nand_ooblayout_lp_ops);
                        info->chip.ecc.mode = NAND_ECC_HW_OOB_FIRST;
-                       goto syndrome_done;
+               } else {
+                       ret = -EIO;
+                       goto err;
                }
-
-               ret = -EIO;
-               goto err;
-
-syndrome_done:
-               info->chip.ecc.layout = &info->ecclayout;
        }
 
        ret = nand_scan_tail(mtd);
@@ -850,7 +822,7 @@ err:
 
 err_clk_enable:
        spin_lock_irq(&davinci_nand_lock);
-       if (ecc_mode == NAND_ECC_HW_SYNDROME)
+       if (info->chip.ecc.mode == NAND_ECC_HW_SYNDROME)
                ecc4_busy = false;
        spin_unlock_irq(&davinci_nand_lock);
        return ret;
index 30bf5f6..0476ae8 100644 (file)
@@ -1374,13 +1374,41 @@ static void denali_hw_init(struct denali_nand_info *denali)
  * correction
  */
 #define ECC_8BITS      14
-static struct nand_ecclayout nand_8bit_oob = {
-       .eccbytes = 14,
-};
-
 #define ECC_15BITS     26
-static struct nand_ecclayout nand_15bit_oob = {
-       .eccbytes = 26,
+
+static int denali_ooblayout_ecc(struct mtd_info *mtd, int section,
+                               struct mtd_oob_region *oobregion)
+{
+       struct denali_nand_info *denali = mtd_to_denali(mtd);
+       struct nand_chip *chip = mtd_to_nand(mtd);
+
+       if (section)
+               return -ERANGE;
+
+       oobregion->offset = denali->bbtskipbytes;
+       oobregion->length = chip->ecc.total;
+
+       return 0;
+}
+
+static int denali_ooblayout_free(struct mtd_info *mtd, int section,
+                                struct mtd_oob_region *oobregion)
+{
+       struct denali_nand_info *denali = mtd_to_denali(mtd);
+       struct nand_chip *chip = mtd_to_nand(mtd);
+
+       if (section)
+               return -ERANGE;
+
+       oobregion->offset = chip->ecc.total + denali->bbtskipbytes;
+       oobregion->length = mtd->oobsize - oobregion->offset;
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops denali_ooblayout_ops = {
+       .ecc = denali_ooblayout_ecc,
+       .free = denali_ooblayout_free,
 };
 
 static uint8_t bbt_pattern[] = {'B', 'b', 't', '0' };
@@ -1561,7 +1589,6 @@ int denali_init(struct denali_nand_info *denali)
                        ECC_SECTOR_SIZE)))) {
                /* if MLC OOB size is large enough, use 15bit ECC*/
                denali->nand.ecc.strength = 15;
-               denali->nand.ecc.layout = &nand_15bit_oob;
                denali->nand.ecc.bytes = ECC_15BITS;
                iowrite32(15, denali->flash_reg + ECC_CORRECTION);
        } else if (mtd->oobsize < (denali->bbtskipbytes +
@@ -1571,20 +1598,13 @@ int denali_init(struct denali_nand_info *denali)
                goto failed_req_irq;
        } else {
                denali->nand.ecc.strength = 8;
-               denali->nand.ecc.layout = &nand_8bit_oob;
                denali->nand.ecc.bytes = ECC_8BITS;
                iowrite32(8, denali->flash_reg + ECC_CORRECTION);
        }
 
+       mtd_set_ooblayout(mtd, &denali_ooblayout_ops);
        denali->nand.ecc.bytes *= denali->devnum;
        denali->nand.ecc.strength *= denali->devnum;
-       denali->nand.ecc.layout->eccbytes *=
-               mtd->writesize / ECC_SECTOR_SIZE;
-       denali->nand.ecc.layout->oobfree[0].offset =
-               denali->bbtskipbytes + denali->nand.ecc.layout->eccbytes;
-       denali->nand.ecc.layout->oobfree[0].length =
-               mtd->oobsize - denali->nand.ecc.layout->eccbytes -
-               denali->bbtskipbytes;
 
        /*
         * Let driver know the total blocks number and how many blocks
index 547c100..a023ab9 100644 (file)
@@ -950,20 +950,50 @@ static int doc200x_correct_data(struct mtd_info *mtd, u_char *dat,
 
 //u_char mydatabuf[528];
 
-/* The strange out-of-order .oobfree list below is a (possibly unneeded)
- * attempt to retain compatibility.  It used to read:
- *     .oobfree = { {8, 8} }
- * Since that leaves two bytes unusable, it was changed.  But the following
- * scheme might affect existing jffs2 installs by moving the cleanmarker:
- *     .oobfree = { {6, 10} }
- * jffs2 seems to handle the above gracefully, but the current scheme seems
- * safer.  The only problem with it is that any code that parses oobfree must
- * be able to handle out-of-order segments.
- */
-static struct nand_ecclayout doc200x_oobinfo = {
-       .eccbytes = 6,
-       .eccpos = {0, 1, 2, 3, 4, 5},
-       .oobfree = {{8, 8}, {6, 2}}
+static int doc200x_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                struct mtd_oob_region *oobregion)
+{
+       if (section)
+               return -ERANGE;
+
+       oobregion->offset = 0;
+       oobregion->length = 6;
+
+       return 0;
+}
+
+static int doc200x_ooblayout_free(struct mtd_info *mtd, int section,
+                                 struct mtd_oob_region *oobregion)
+{
+       if (section > 1)
+               return -ERANGE;
+
+       /*
+        * The strange out-of-order free bytes definition is a (possibly
+        * unneeded) attempt to retain compatibility.  It used to read:
+        *      .oobfree = { {8, 8} }
+        * Since that leaves two bytes unusable, it was changed.  But the
+        * following scheme might affect existing jffs2 installs by moving the
+        * cleanmarker:
+        *      .oobfree = { {6, 10} }
+        * jffs2 seems to handle the above gracefully, but the current scheme
+        * seems safer. The only problem with it is that any code retrieving
+        * free bytes position must be able to handle out-of-order segments.
+        */
+       if (!section) {
+               oobregion->offset = 8;
+               oobregion->length = 8;
+       } else {
+               oobregion->offset = 6;
+               oobregion->length = 2;
+       }
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops doc200x_ooblayout_ops = {
+       .ecc = doc200x_ooblayout_ecc,
+       .free = doc200x_ooblayout_free,
 };
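
The comment above preserves the historical out-of-order free list, {8,8}
before {6,2}; with the callback interface that ordering is simply the order of
section indices, and any consumer iterating the free regions has to cope with
non-ascending offsets. A standalone restatement in plain C, illustrative only,
using the region values from doc200x_ooblayout_free() above:

    #include <stdio.h>

    struct oob_region { int offset; int length; };

    /* Plain-C restatement of doc200x_ooblayout_free(). */
    static int doc200x_free(int section, struct oob_region *r)
    {
        if (section > 1)
            return -1;                  /* -ERANGE in the kernel */
        if (section == 0) {
            r->offset = 8;
            r->length = 8;
        } else {
            r->offset = 6;
            r->length = 2;
        }
        return 0;
    }

    int main(void)
    {
        struct oob_region r;
        int s, total = 0;

        /* Consumers must not assume the regions come back in offset order. */
        for (s = 0; doc200x_free(s, &r) == 0; s++) {
            printf("free[%d]: offset=%d length=%d\n", s, r.offset, r.length);
            total += r.length;
        }
        printf("total free OOB bytes: %d\n", total);
        return 0;
    }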
 
 /* Find the (I)NFTL Media Header, and optionally also the mirror media header.
@@ -1537,6 +1567,7 @@ static int __init doc_probe(unsigned long physadr)
        nand->bbt_md            = nand->bbt_td + 1;
 
        mtd->owner              = THIS_MODULE;
+       mtd_set_ooblayout(mtd, &doc200x_ooblayout_ops);
 
        nand_set_controller_data(nand, doc);
        nand->select_chip       = doc200x_select_chip;
@@ -1548,7 +1579,6 @@ static int __init doc_probe(unsigned long physadr)
        nand->ecc.calculate     = doc200x_calculate_ecc;
        nand->ecc.correct       = doc200x_correct_data;
 
-       nand->ecc.layout        = &doc200x_oobinfo;
        nand->ecc.mode          = NAND_ECC_HW_SYNDROME;
        nand->ecc.size          = 512;
        nand->ecc.bytes         = 6;
index d86a60e..4731699 100644 (file)
@@ -222,10 +222,33 @@ struct docg4_priv {
  * Bytes 8 - 14 are hw-generated ecc covering entire page + oob bytes 0 - 14.
  * Byte 15 (the last) is used by the driver as a "page written" flag.
  */
-static struct nand_ecclayout docg4_oobinfo = {
-       .eccbytes = 9,
-       .eccpos = {7, 8, 9, 10, 11, 12, 13, 14, 15},
-       .oobfree = { {.offset = 2, .length = 5} }
+static int docg4_ooblayout_ecc(struct mtd_info *mtd, int section,
+                              struct mtd_oob_region *oobregion)
+{
+       if (section)
+               return -ERANGE;
+
+       oobregion->offset = 7;
+       oobregion->length = 9;
+
+       return 0;
+}
+
+static int docg4_ooblayout_free(struct mtd_info *mtd, int section,
+                               struct mtd_oob_region *oobregion)
+{
+       if (section)
+               return -ERANGE;
+
+       oobregion->offset = 2;
+       oobregion->length = 5;
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops docg4_ooblayout_ops = {
+       .ecc = docg4_ooblayout_ecc,
+       .free = docg4_ooblayout_free,
 };
 
 /*
@@ -1209,6 +1232,7 @@ static void __init init_mtd_structs(struct mtd_info *mtd)
        mtd->writesize = DOCG4_PAGE_SIZE;
        mtd->erasesize = DOCG4_BLOCK_SIZE;
        mtd->oobsize = DOCG4_OOB_SIZE;
+       mtd_set_ooblayout(mtd, &docg4_ooblayout_ops);
        nand->chipsize = DOCG4_CHIP_SIZE;
        nand->chip_shift = DOCG4_CHIP_SHIFT;
        nand->bbt_erase_shift = nand->phys_erase_shift = DOCG4_ERASE_SHIFT;
@@ -1217,7 +1241,6 @@ static void __init init_mtd_structs(struct mtd_info *mtd)
        nand->pagemask = 0x3ffff;
        nand->badblockpos = NAND_LARGE_BADBLOCK_POS;
        nand->badblockbits = 8;
-       nand->ecc.layout = &docg4_oobinfo;
        nand->ecc.mode = NAND_ECC_HW_SYNDROME;
        nand->ecc.size = DOCG4_PAGE_SIZE;
        nand->ecc.prepad = 8;
index 059d5f7..60a88f2 100644 (file)
@@ -79,32 +79,53 @@ struct fsl_elbc_fcm_ctrl {
 
 /* These map to the positions used by the FCM hardware ECC generator */
 
-/* Small Page FLASH with FMR[ECCM] = 0 */
-static struct nand_ecclayout fsl_elbc_oob_sp_eccm0 = {
-       .eccbytes = 3,
-       .eccpos = {6, 7, 8},
-       .oobfree = { {0, 5}, {9, 7} },
-};
+static int fsl_elbc_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                 struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct fsl_elbc_mtd *priv = nand_get_controller_data(chip);
 
-/* Small Page FLASH with FMR[ECCM] = 1 */
-static struct nand_ecclayout fsl_elbc_oob_sp_eccm1 = {
-       .eccbytes = 3,
-       .eccpos = {8, 9, 10},
-       .oobfree = { {0, 5}, {6, 2}, {11, 5} },
-};
+       if (section >= chip->ecc.steps)
+               return -ERANGE;
 
-/* Large Page FLASH with FMR[ECCM] = 0 */
-static struct nand_ecclayout fsl_elbc_oob_lp_eccm0 = {
-       .eccbytes = 12,
-       .eccpos = {6, 7, 8, 22, 23, 24, 38, 39, 40, 54, 55, 56},
-       .oobfree = { {1, 5}, {9, 13}, {25, 13}, {41, 13}, {57, 7} },
-};
+       oobregion->offset = (16 * section) + 6;
+       if (priv->fmr & FMR_ECCM)
+               oobregion->offset += 2;
 
-/* Large Page FLASH with FMR[ECCM] = 1 */
-static struct nand_ecclayout fsl_elbc_oob_lp_eccm1 = {
-       .eccbytes = 12,
-       .eccpos = {8, 9, 10, 24, 25, 26, 40, 41, 42, 56, 57, 58},
-       .oobfree = { {1, 7}, {11, 13}, {27, 13}, {43, 13}, {59, 5} },
+       oobregion->length = chip->ecc.bytes;
+
+       return 0;
+}
+
+static int fsl_elbc_ooblayout_free(struct mtd_info *mtd, int section,
+                                  struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct fsl_elbc_mtd *priv = nand_get_controller_data(chip);
+
+       if (section > chip->ecc.steps)
+               return -ERANGE;
+
+       if (!section) {
+               oobregion->offset = 0;
+               if (mtd->writesize > 512)
+                       oobregion->offset++;
+               oobregion->length = (priv->fmr & FMR_ECCM) ? 7 : 5;
+       } else {
+               oobregion->offset = (16 * section) -
+                                   ((priv->fmr & FMR_ECCM) ? 5 : 7);
+               if (section < chip->ecc.steps)
+                       oobregion->length = 13;
+               else
+                       oobregion->length = mtd->oobsize - oobregion->offset;
+       }
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops fsl_elbc_ooblayout_ops = {
+       .ecc = fsl_elbc_ooblayout_ecc,
+       .free = fsl_elbc_ooblayout_free,
 };
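
These two callbacks fold the four static eccm0/eccm1 small-page and large-page
tables into arithmetic on the section index and the FMR[ECCM] bit. A
standalone sketch that reproduces the large-page tables removed above for a
64-byte OOB; with eccm = 1 it prints ecc regions {8,3}, {24,3}, {40,3}, {56,3}
and free regions {1,7}, {11,13}, {27,13}, {43,13}, {59,5}, matching the
deleted fsl_elbc_oob_lp_eccm1 definition:

    #include <stdio.h>

    int main(void)
    {
        int eccm = 1;                   /* FMR[ECCM]: 0 or 1 */
        int steps = 4, eccbytes = 3, oobsize = 64;
        int s;

        for (s = 0; s < steps; s++)
            printf("ecc[%d]:  offset=%2d length=%d\n",
                   s, 16 * s + 6 + (eccm ? 2 : 0), eccbytes);

        for (s = 0; s <= steps; s++) {
            int off, len;

            if (s == 0) {
                off = 1;                /* skip the bad-block marker on large pages */
                len = eccm ? 7 : 5;
            } else {
                off = 16 * s - (eccm ? 5 : 7);
                len = (s < steps) ? 13 : oobsize - off;
            }
            printf("free[%d]: offset=%2d length=%d\n", s, off, len);
        }
        return 0;
    }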
 
 /*
@@ -657,8 +678,8 @@ static int fsl_elbc_chip_init_tail(struct mtd_info *mtd)
                chip->ecc.bytes);
        dev_dbg(priv->dev, "fsl_elbc_init: nand->ecc.total = %d\n",
                chip->ecc.total);
-       dev_dbg(priv->dev, "fsl_elbc_init: nand->ecc.layout = %p\n",
-               chip->ecc.layout);
+       dev_dbg(priv->dev, "fsl_elbc_init: mtd->ooblayout = %p\n",
+               mtd->ooblayout);
        dev_dbg(priv->dev, "fsl_elbc_init: mtd->flags = %08x\n", mtd->flags);
        dev_dbg(priv->dev, "fsl_elbc_init: mtd->size = %lld\n", mtd->size);
        dev_dbg(priv->dev, "fsl_elbc_init: mtd->erasesize = %d\n",
@@ -675,14 +696,6 @@ static int fsl_elbc_chip_init_tail(struct mtd_info *mtd)
        } else if (mtd->writesize == 2048) {
                priv->page_size = 1;
                setbits32(&lbc->bank[priv->bank].or, OR_FCM_PGS);
-               /* adjust ecc setup if needed */
-               if ((in_be32(&lbc->bank[priv->bank].br) & BR_DECC) ==
-                   BR_DECC_CHK_GEN) {
-                       chip->ecc.size = 512;
-                       chip->ecc.layout = (priv->fmr & FMR_ECCM) ?
-                                          &fsl_elbc_oob_lp_eccm1 :
-                                          &fsl_elbc_oob_lp_eccm0;
-               }
        } else {
                dev_err(priv->dev,
                        "fsl_elbc_init: page size %d is not supported\n",
@@ -780,15 +793,14 @@ static int fsl_elbc_chip_init(struct fsl_elbc_mtd *priv)
        if ((in_be32(&lbc->bank[priv->bank].br) & BR_DECC) ==
            BR_DECC_CHK_GEN) {
                chip->ecc.mode = NAND_ECC_HW;
-               /* put in small page settings and adjust later if needed */
-               chip->ecc.layout = (priv->fmr & FMR_ECCM) ?
-                               &fsl_elbc_oob_sp_eccm1 : &fsl_elbc_oob_sp_eccm0;
+               mtd_set_ooblayout(mtd, &fsl_elbc_ooblayout_ops);
                chip->ecc.size = 512;
                chip->ecc.bytes = 3;
                chip->ecc.strength = 1;
        } else {
                /* otherwise fall back to default software ECC */
                chip->ecc.mode = NAND_ECC_SOFT;
+               chip->ecc.algo = NAND_ECC_HAMMING;
        }
 
        return 0;
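
The other recurring change in this series shows up in the else branch above:
drivers that fall back to software ECC now set chip->ecc.algo explicitly
(NAND_ECC_HAMMING here, NAND_ECC_BCH where the old NAND_ECC_SOFT_BCH mode was
used), because the algorithm is no longer encoded in the ecc.mode enum. A
hypothetical mapping of the old single-enum selection onto the new pair, with
simplified stand-in enums rather than the kernel's:

    #include <stdio.h>

    enum old_mode { OLD_SOFT, OLD_SOFT_BCH, OLD_HW };
    enum ecc_mode { ECC_SOFT, ECC_HW };
    enum ecc_algo { ECC_ALGO_UNKNOWN, ECC_ALGO_HAMMING, ECC_ALGO_BCH };

    static void map_mode(enum old_mode old, enum ecc_mode *mode, enum ecc_algo *algo)
    {
        switch (old) {
        case OLD_SOFT:     *mode = ECC_SOFT; *algo = ECC_ALGO_HAMMING; break;
        case OLD_SOFT_BCH: *mode = ECC_SOFT; *algo = ECC_ALGO_BCH;     break;
        default:           *mode = ECC_HW;   *algo = ECC_ALGO_UNKNOWN; break;
        }
    }

    int main(void)
    {
        enum ecc_mode m;
        enum ecc_algo a;

        map_mode(OLD_SOFT_BCH, &m, &a);
        printf("mode=%d algo=%d\n", m, a);    /* soft mode, BCH algorithm */
        return 0;
    }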
index 43f5a3a..4e9e5fd 100644 (file)
@@ -67,136 +67,6 @@ struct fsl_ifc_nand_ctrl {
 
 static struct fsl_ifc_nand_ctrl *ifc_nand_ctrl;
 
-/* 512-byte page with 4-bit ECC, 8-bit */
-static struct nand_ecclayout oob_512_8bit_ecc4 = {
-       .eccbytes = 8,
-       .eccpos = {8, 9, 10, 11, 12, 13, 14, 15},
-       .oobfree = { {0, 5}, {6, 2} },
-};
-
-/* 512-byte page with 4-bit ECC, 16-bit */
-static struct nand_ecclayout oob_512_16bit_ecc4 = {
-       .eccbytes = 8,
-       .eccpos = {8, 9, 10, 11, 12, 13, 14, 15},
-       .oobfree = { {2, 6}, },
-};
-
-/* 2048-byte page size with 4-bit ECC */
-static struct nand_ecclayout oob_2048_ecc4 = {
-       .eccbytes = 32,
-       .eccpos = {
-               8, 9, 10, 11, 12, 13, 14, 15,
-               16, 17, 18, 19, 20, 21, 22, 23,
-               24, 25, 26, 27, 28, 29, 30, 31,
-               32, 33, 34, 35, 36, 37, 38, 39,
-       },
-       .oobfree = { {2, 6}, {40, 24} },
-};
-
-/* 4096-byte page size with 4-bit ECC */
-static struct nand_ecclayout oob_4096_ecc4 = {
-       .eccbytes = 64,
-       .eccpos = {
-               8, 9, 10, 11, 12, 13, 14, 15,
-               16, 17, 18, 19, 20, 21, 22, 23,
-               24, 25, 26, 27, 28, 29, 30, 31,
-               32, 33, 34, 35, 36, 37, 38, 39,
-               40, 41, 42, 43, 44, 45, 46, 47,
-               48, 49, 50, 51, 52, 53, 54, 55,
-               56, 57, 58, 59, 60, 61, 62, 63,
-               64, 65, 66, 67, 68, 69, 70, 71,
-       },
-       .oobfree = { {2, 6}, {72, 56} },
-};
-
-/* 4096-byte page size with 8-bit ECC -- requires 218-byte OOB */
-static struct nand_ecclayout oob_4096_ecc8 = {
-       .eccbytes = 128,
-       .eccpos = {
-               8, 9, 10, 11, 12, 13, 14, 15,
-               16, 17, 18, 19, 20, 21, 22, 23,
-               24, 25, 26, 27, 28, 29, 30, 31,
-               32, 33, 34, 35, 36, 37, 38, 39,
-               40, 41, 42, 43, 44, 45, 46, 47,
-               48, 49, 50, 51, 52, 53, 54, 55,
-               56, 57, 58, 59, 60, 61, 62, 63,
-               64, 65, 66, 67, 68, 69, 70, 71,
-               72, 73, 74, 75, 76, 77, 78, 79,
-               80, 81, 82, 83, 84, 85, 86, 87,
-               88, 89, 90, 91, 92, 93, 94, 95,
-               96, 97, 98, 99, 100, 101, 102, 103,
-               104, 105, 106, 107, 108, 109, 110, 111,
-               112, 113, 114, 115, 116, 117, 118, 119,
-               120, 121, 122, 123, 124, 125, 126, 127,
-               128, 129, 130, 131, 132, 133, 134, 135,
-       },
-       .oobfree = { {2, 6}, {136, 82} },
-};
-
-/* 8192-byte page size with 4-bit ECC */
-static struct nand_ecclayout oob_8192_ecc4 = {
-       .eccbytes = 128,
-       .eccpos = {
-               8, 9, 10, 11, 12, 13, 14, 15,
-               16, 17, 18, 19, 20, 21, 22, 23,
-               24, 25, 26, 27, 28, 29, 30, 31,
-               32, 33, 34, 35, 36, 37, 38, 39,
-               40, 41, 42, 43, 44, 45, 46, 47,
-               48, 49, 50, 51, 52, 53, 54, 55,
-               56, 57, 58, 59, 60, 61, 62, 63,
-               64, 65, 66, 67, 68, 69, 70, 71,
-               72, 73, 74, 75, 76, 77, 78, 79,
-               80, 81, 82, 83, 84, 85, 86, 87,
-               88, 89, 90, 91, 92, 93, 94, 95,
-               96, 97, 98, 99, 100, 101, 102, 103,
-               104, 105, 106, 107, 108, 109, 110, 111,
-               112, 113, 114, 115, 116, 117, 118, 119,
-               120, 121, 122, 123, 124, 125, 126, 127,
-               128, 129, 130, 131, 132, 133, 134, 135,
-       },
-       .oobfree = { {2, 6}, {136, 208} },
-};
-
-/* 8192-byte page size with 8-bit ECC -- requires 218-byte OOB */
-static struct nand_ecclayout oob_8192_ecc8 = {
-       .eccbytes = 256,
-       .eccpos = {
-               8, 9, 10, 11, 12, 13, 14, 15,
-               16, 17, 18, 19, 20, 21, 22, 23,
-               24, 25, 26, 27, 28, 29, 30, 31,
-               32, 33, 34, 35, 36, 37, 38, 39,
-               40, 41, 42, 43, 44, 45, 46, 47,
-               48, 49, 50, 51, 52, 53, 54, 55,
-               56, 57, 58, 59, 60, 61, 62, 63,
-               64, 65, 66, 67, 68, 69, 70, 71,
-               72, 73, 74, 75, 76, 77, 78, 79,
-               80, 81, 82, 83, 84, 85, 86, 87,
-               88, 89, 90, 91, 92, 93, 94, 95,
-               96, 97, 98, 99, 100, 101, 102, 103,
-               104, 105, 106, 107, 108, 109, 110, 111,
-               112, 113, 114, 115, 116, 117, 118, 119,
-               120, 121, 122, 123, 124, 125, 126, 127,
-               128, 129, 130, 131, 132, 133, 134, 135,
-               136, 137, 138, 139, 140, 141, 142, 143,
-               144, 145, 146, 147, 148, 149, 150, 151,
-               152, 153, 154, 155, 156, 157, 158, 159,
-               160, 161, 162, 163, 164, 165, 166, 167,
-               168, 169, 170, 171, 172, 173, 174, 175,
-               176, 177, 178, 179, 180, 181, 182, 183,
-               184, 185, 186, 187, 188, 189, 190, 191,
-               192, 193, 194, 195, 196, 197, 198, 199,
-               200, 201, 202, 203, 204, 205, 206, 207,
-               208, 209, 210, 211, 212, 213, 214, 215,
-               216, 217, 218, 219, 220, 221, 222, 223,
-               224, 225, 226, 227, 228, 229, 230, 231,
-               232, 233, 234, 235, 236, 237, 238, 239,
-               240, 241, 242, 243, 244, 245, 246, 247,
-               248, 249, 250, 251, 252, 253, 254, 255,
-               256, 257, 258, 259, 260, 261, 262, 263,
-       },
-       .oobfree = { {2, 6}, {264, 80} },
-};
-
 /*
  * Generic flash bbt descriptors
  */
@@ -223,6 +93,57 @@ static struct nand_bbt_descr bbt_mirror_descr = {
        .pattern = mirror_pattern,
 };
 
+static int fsl_ifc_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+
+       if (section)
+               return -ERANGE;
+
+       oobregion->offset = 8;
+       oobregion->length = chip->ecc.total;
+
+       return 0;
+}
+
+static int fsl_ifc_ooblayout_free(struct mtd_info *mtd, int section,
+                                 struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+
+       if (section > 1)
+               return -ERANGE;
+
+       if (mtd->writesize == 512 &&
+           !(chip->options & NAND_BUSWIDTH_16)) {
+               if (!section) {
+                       oobregion->offset = 0;
+                       oobregion->length = 5;
+               } else {
+                       oobregion->offset = 6;
+                       oobregion->length = 2;
+               }
+
+               return 0;
+       }
+
+       if (!section) {
+               oobregion->offset = 2;
+               oobregion->length = 6;
+       } else {
+               oobregion->offset = chip->ecc.total + 8;
+               oobregion->length = mtd->oobsize - oobregion->offset;
+       }
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops fsl_ifc_ooblayout_ops = {
+       .ecc = fsl_ifc_ooblayout_ecc,
+       .free = fsl_ifc_ooblayout_free,
+};
+
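
fsl_ifc_ooblayout_free() keeps two cases from the deleted tables: 512-byte
pages on an 8-bit bus expose {0,5} and {6,2}, dodging the bad-block marker at
offset 5, while every other geometry exposes {2,6} plus whatever follows the
ECC bytes. A standalone check for a 2 KiB page with 32 ECC bytes, which
reproduces the removed oob_2048_ecc4 free regions {2,6} and {40,24}:

    #include <stdio.h>

    int main(void)
    {
        int writesize = 2048, oobsize = 64;
        int ecc_total = 32;             /* 8 ECC bytes per 512-byte step, 4 steps */
        int sixteen_bit = 0;            /* NAND_BUSWIDTH_16 set? */
        int s;

        for (s = 0; s <= 1; s++) {
            int off, len;

            if (writesize == 512 && !sixteen_bit) {
                off = s ? 6 : 0;
                len = s ? 2 : 5;
            } else {
                off = s ? ecc_total + 8 : 2;
                len = s ? oobsize - off : 6;
            }
            printf("free[%d]: offset=%d length=%d\n", s, off, len);
        }
        return 0;
    }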
 /*
  * Set up the IFC hardware block and page address fields, and the ifc nand
  * structure addr field to point to the correct IFC buffer in memory
@@ -232,7 +153,7 @@ static void set_addr(struct mtd_info *mtd, int column, int page_addr, int oob)
        struct nand_chip *chip = mtd_to_nand(mtd);
        struct fsl_ifc_mtd *priv = nand_get_controller_data(chip);
        struct fsl_ifc_ctrl *ctrl = priv->ctrl;
-       struct fsl_ifc_regs __iomem *ifc = ctrl->regs;
+       struct fsl_ifc_runtime __iomem *ifc = ctrl->rregs;
        int buf_num;
 
        ifc_nand_ctrl->page = page_addr;
@@ -257,18 +178,22 @@ static int is_blank(struct mtd_info *mtd, unsigned int bufnum)
        u8 __iomem *addr = priv->vbase + bufnum * (mtd->writesize * 2);
        u32 __iomem *mainarea = (u32 __iomem *)addr;
        u8 __iomem *oob = addr + mtd->writesize;
-       int i;
+       struct mtd_oob_region oobregion = { };
+       int i, section = 0;
 
        for (i = 0; i < mtd->writesize / 4; i++) {
                if (__raw_readl(&mainarea[i]) != 0xffffffff)
                        return 0;
        }
 
-       for (i = 0; i < chip->ecc.layout->eccbytes; i++) {
-               int pos = chip->ecc.layout->eccpos[i];
+       mtd_ooblayout_ecc(mtd, section++, &oobregion);
+       while (oobregion.length) {
+               for (i = 0; i < oobregion.length; i++) {
+                       if (__raw_readb(&oob[oobregion.offset + i]) != 0xff)
+                               return 0;
+               }
 
-               if (__raw_readb(&oob[pos]) != 0xff)
-                       return 0;
+               mtd_ooblayout_ecc(mtd, section++, &oobregion);
        }
 
        return 1;
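
With eccpos[] gone, is_blank() now walks the ECC bytes by calling
mtd_ooblayout_ecc() with an increasing section index until the returned region
is empty. The same loop shape in a standalone sketch; the single region used
by the stand-in callback is invented and is not the IFC layout:

    #include <stdio.h>
    #include <string.h>

    struct oob_region { int offset; int length; };

    /* Stand-in for mtd_ooblayout_ecc(): one ECC region at offset 8, 32 bytes long. */
    static int get_ecc_region(int section, struct oob_region *r)
    {
        if (section)
            return -1;
        r->offset = 8;
        r->length = 32;
        return 0;
    }

    /* Returns 1 if every ECC byte in oob[] reads 0xff, as on an erased page. */
    static int ecc_bytes_blank(const unsigned char *oob)
    {
        struct oob_region r;
        int s, i;

        for (s = 0; get_ecc_region(s, &r) == 0; s++)
            for (i = 0; i < r.length; i++)
                if (oob[r.offset + i] != 0xff)
                    return 0;
        return 1;
    }

    int main(void)
    {
        unsigned char oob[64];

        memset(oob, 0xff, sizeof(oob));
        printf("blank: %d\n", ecc_bytes_blank(oob));
        oob[10] = 0x5a;
        printf("blank: %d\n", ecc_bytes_blank(oob));
        return 0;
    }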
@@ -295,7 +220,7 @@ static void fsl_ifc_run_command(struct mtd_info *mtd)
        struct fsl_ifc_mtd *priv = nand_get_controller_data(chip);
        struct fsl_ifc_ctrl *ctrl = priv->ctrl;
        struct fsl_ifc_nand_ctrl *nctrl = ifc_nand_ctrl;
-       struct fsl_ifc_regs __iomem *ifc = ctrl->regs;
+       struct fsl_ifc_runtime __iomem *ifc = ctrl->rregs;
        u32 eccstat[4];
        int i;
 
@@ -371,7 +296,7 @@ static void fsl_ifc_do_read(struct nand_chip *chip,
 {
        struct fsl_ifc_mtd *priv = nand_get_controller_data(chip);
        struct fsl_ifc_ctrl *ctrl = priv->ctrl;
-       struct fsl_ifc_regs __iomem *ifc = ctrl->regs;
+       struct fsl_ifc_runtime __iomem *ifc = ctrl->rregs;
 
        /* Program FIR/IFC_NAND_FCR0 for Small/Large page */
        if (mtd->writesize > 512) {
@@ -411,7 +336,7 @@ static void fsl_ifc_cmdfunc(struct mtd_info *mtd, unsigned int command,
        struct nand_chip *chip = mtd_to_nand(mtd);
        struct fsl_ifc_mtd *priv = nand_get_controller_data(chip);
        struct fsl_ifc_ctrl *ctrl = priv->ctrl;
-       struct fsl_ifc_regs __iomem *ifc = ctrl->regs;
+       struct fsl_ifc_runtime __iomem *ifc = ctrl->rregs;
 
        /* clear the read buffer */
        ifc_nand_ctrl->read_bytes = 0;
@@ -723,7 +648,7 @@ static int fsl_ifc_wait(struct mtd_info *mtd, struct nand_chip *chip)
 {
        struct fsl_ifc_mtd *priv = nand_get_controller_data(chip);
        struct fsl_ifc_ctrl *ctrl = priv->ctrl;
-       struct fsl_ifc_regs __iomem *ifc = ctrl->regs;
+       struct fsl_ifc_runtime __iomem *ifc = ctrl->rregs;
        u32 nand_fsr;
 
        /* Use READ_STATUS command, but wait for the device to be ready */
@@ -808,8 +733,8 @@ static int fsl_ifc_chip_init_tail(struct mtd_info *mtd)
                                                        chip->ecc.bytes);
        dev_dbg(priv->dev, "%s: nand->ecc.total = %d\n", __func__,
                                                        chip->ecc.total);
-       dev_dbg(priv->dev, "%s: nand->ecc.layout = %p\n", __func__,
-                                                       chip->ecc.layout);
+       dev_dbg(priv->dev, "%s: mtd->ooblayout = %p\n", __func__,
+                                                       mtd->ooblayout);
        dev_dbg(priv->dev, "%s: mtd->flags = %08x\n", __func__, mtd->flags);
        dev_dbg(priv->dev, "%s: mtd->size = %lld\n", __func__, mtd->size);
        dev_dbg(priv->dev, "%s: mtd->erasesize = %d\n", __func__,
@@ -825,39 +750,42 @@ static int fsl_ifc_chip_init_tail(struct mtd_info *mtd)
 static void fsl_ifc_sram_init(struct fsl_ifc_mtd *priv)
 {
        struct fsl_ifc_ctrl *ctrl = priv->ctrl;
-       struct fsl_ifc_regs __iomem *ifc = ctrl->regs;
+       struct fsl_ifc_runtime __iomem *ifc_runtime = ctrl->rregs;
+       struct fsl_ifc_global __iomem *ifc_global = ctrl->gregs;
        uint32_t csor = 0, csor_8k = 0, csor_ext = 0;
        uint32_t cs = priv->bank;
 
        /* Save CSOR and CSOR_ext */
-       csor = ifc_in32(&ifc->csor_cs[cs].csor);
-       csor_ext = ifc_in32(&ifc->csor_cs[cs].csor_ext);
+       csor = ifc_in32(&ifc_global->csor_cs[cs].csor);
+       csor_ext = ifc_in32(&ifc_global->csor_cs[cs].csor_ext);
 
        /* change PageSize 8K and SpareSize 1K */
        csor_8k = (csor & ~(CSOR_NAND_PGS_MASK)) | 0x0018C000;
-       ifc_out32(csor_8k, &ifc->csor_cs[cs].csor);
-       ifc_out32(0x0000400, &ifc->csor_cs[cs].csor_ext);
+       ifc_out32(csor_8k, &ifc_global->csor_cs[cs].csor);
+       ifc_out32(0x0000400, &ifc_global->csor_cs[cs].csor_ext);
 
        /* READID */
        ifc_out32((IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
-                 (IFC_FIR_OP_UA  << IFC_NAND_FIR0_OP1_SHIFT) |
-                 (IFC_FIR_OP_RB << IFC_NAND_FIR0_OP2_SHIFT),
-                 &ifc->ifc_nand.nand_fir0);
+                   (IFC_FIR_OP_UA  << IFC_NAND_FIR0_OP1_SHIFT) |
+                   (IFC_FIR_OP_RB << IFC_NAND_FIR0_OP2_SHIFT),
+                   &ifc_runtime->ifc_nand.nand_fir0);
        ifc_out32(NAND_CMD_READID << IFC_NAND_FCR0_CMD0_SHIFT,
-                 &ifc->ifc_nand.nand_fcr0);
-       ifc_out32(0x0, &ifc->ifc_nand.row3);
+                   &ifc_runtime->ifc_nand.nand_fcr0);
+       ifc_out32(0x0, &ifc_runtime->ifc_nand.row3);
 
-       ifc_out32(0x0, &ifc->ifc_nand.nand_fbcr);
+       ifc_out32(0x0, &ifc_runtime->ifc_nand.nand_fbcr);
 
        /* Program ROW0/COL0 */
-       ifc_out32(0x0, &ifc->ifc_nand.row0);
-       ifc_out32(0x0, &ifc->ifc_nand.col0);
+       ifc_out32(0x0, &ifc_runtime->ifc_nand.row0);
+       ifc_out32(0x0, &ifc_runtime->ifc_nand.col0);
 
        /* set the chip select for NAND Transaction */
-       ifc_out32(cs << IFC_NAND_CSEL_SHIFT, &ifc->ifc_nand.nand_csel);
+       ifc_out32(cs << IFC_NAND_CSEL_SHIFT,
+               &ifc_runtime->ifc_nand.nand_csel);
 
        /* start read seq */
-       ifc_out32(IFC_NAND_SEQ_STRT_FIR_STRT, &ifc->ifc_nand.nandseq_strt);
+       ifc_out32(IFC_NAND_SEQ_STRT_FIR_STRT,
+               &ifc_runtime->ifc_nand.nandseq_strt);
 
        /* wait for command complete flag or timeout */
        wait_event_timeout(ctrl->nand_wait, ctrl->nand_stat,
@@ -867,17 +795,17 @@ static void fsl_ifc_sram_init(struct fsl_ifc_mtd *priv)
                printk(KERN_ERR "fsl-ifc: Failed to Initialise SRAM\n");
 
        /* Restore CSOR and CSOR_ext */
-       ifc_out32(csor, &ifc->csor_cs[cs].csor);
-       ifc_out32(csor_ext, &ifc->csor_cs[cs].csor_ext);
+       ifc_out32(csor, &ifc_global->csor_cs[cs].csor);
+       ifc_out32(csor_ext, &ifc_global->csor_cs[cs].csor_ext);
 }
 
 static int fsl_ifc_chip_init(struct fsl_ifc_mtd *priv)
 {
        struct fsl_ifc_ctrl *ctrl = priv->ctrl;
-       struct fsl_ifc_regs __iomem *ifc = ctrl->regs;
+       struct fsl_ifc_global __iomem *ifc_global = ctrl->gregs;
+       struct fsl_ifc_runtime __iomem *ifc_runtime = ctrl->rregs;
        struct nand_chip *chip = &priv->chip;
        struct mtd_info *mtd = nand_to_mtd(&priv->chip);
-       struct nand_ecclayout *layout;
        u32 csor;
 
        /* Fill in fsl_ifc_mtd structure */
@@ -886,7 +814,8 @@ static int fsl_ifc_chip_init(struct fsl_ifc_mtd *priv)
 
        /* fill in nand_chip structure */
        /* set up function call table */
-       if ((ifc_in32(&ifc->cspr_cs[priv->bank].cspr)) & CSPR_PORT_SIZE_16)
+       if ((ifc_in32(&ifc_global->cspr_cs[priv->bank].cspr))
+               & CSPR_PORT_SIZE_16)
                chip->read_byte = fsl_ifc_read_byte16;
        else
                chip->read_byte = fsl_ifc_read_byte;
@@ -900,13 +829,14 @@ static int fsl_ifc_chip_init(struct fsl_ifc_mtd *priv)
        chip->bbt_td = &bbt_main_descr;
        chip->bbt_md = &bbt_mirror_descr;
 
-       ifc_out32(0x0, &ifc->ifc_nand.ncfgr);
+       ifc_out32(0x0, &ifc_runtime->ifc_nand.ncfgr);
 
        /* set up nand options */
        chip->bbt_options = NAND_BBT_USE_FLASH;
        chip->options = NAND_NO_SUBPAGE_WRITE;
 
-       if (ifc_in32(&ifc->cspr_cs[priv->bank].cspr) & CSPR_PORT_SIZE_16) {
+       if (ifc_in32(&ifc_global->cspr_cs[priv->bank].cspr)
+               & CSPR_PORT_SIZE_16) {
                chip->read_byte = fsl_ifc_read_byte16;
                chip->options |= NAND_BUSWIDTH_16;
        } else {
@@ -919,20 +849,11 @@ static int fsl_ifc_chip_init(struct fsl_ifc_mtd *priv)
        chip->ecc.read_page = fsl_ifc_read_page;
        chip->ecc.write_page = fsl_ifc_write_page;
 
-       csor = ifc_in32(&ifc->csor_cs[priv->bank].csor);
-
-       /* Hardware generates ECC per 512 Bytes */
-       chip->ecc.size = 512;
-       chip->ecc.bytes = 8;
-       chip->ecc.strength = 4;
+       csor = ifc_in32(&ifc_global->csor_cs[priv->bank].csor);
 
        switch (csor & CSOR_NAND_PGS_MASK) {
        case CSOR_NAND_PGS_512:
-               if (chip->options & NAND_BUSWIDTH_16) {
-                       layout = &oob_512_16bit_ecc4;
-               } else {
-                       layout = &oob_512_8bit_ecc4;
-
+               if (!(chip->options & NAND_BUSWIDTH_16)) {
                        /* Avoid conflict with bad block marker */
                        bbt_main_descr.offs = 0;
                        bbt_mirror_descr.offs = 0;
@@ -942,35 +863,16 @@ static int fsl_ifc_chip_init(struct fsl_ifc_mtd *priv)
                break;
 
        case CSOR_NAND_PGS_2K:
-               layout = &oob_2048_ecc4;
                priv->bufnum_mask = 3;
                break;
 
        case CSOR_NAND_PGS_4K:
-               if ((csor & CSOR_NAND_ECC_MODE_MASK) ==
-                   CSOR_NAND_ECC_MODE_4) {
-                       layout = &oob_4096_ecc4;
-               } else {
-                       layout = &oob_4096_ecc8;
-                       chip->ecc.bytes = 16;
-                       chip->ecc.strength = 8;
-               }
-
                priv->bufnum_mask = 1;
                break;
 
        case CSOR_NAND_PGS_8K:
-               if ((csor & CSOR_NAND_ECC_MODE_MASK) ==
-                   CSOR_NAND_ECC_MODE_4) {
-                       layout = &oob_8192_ecc4;
-               } else {
-                       layout = &oob_8192_ecc8;
-                       chip->ecc.bytes = 16;
-                       chip->ecc.strength = 8;
-               }
-
                priv->bufnum_mask = 0;
-       break;
+               break;
 
        default:
                dev_err(priv->dev, "bad csor %#x: bad page size\n", csor);
@@ -980,9 +882,20 @@ static int fsl_ifc_chip_init(struct fsl_ifc_mtd *priv)
        /* Must also set CSOR_NAND_ECC_ENC_EN if DEC_EN set */
        if (csor & CSOR_NAND_ECC_DEC_EN) {
                chip->ecc.mode = NAND_ECC_HW;
-               chip->ecc.layout = layout;
+               mtd_set_ooblayout(mtd, &fsl_ifc_ooblayout_ops);
+
+               /* Hardware generates ECC per 512 Bytes */
+               chip->ecc.size = 512;
+               if ((csor & CSOR_NAND_ECC_MODE_MASK) == CSOR_NAND_ECC_MODE_4) {
+                       chip->ecc.bytes = 8;
+                       chip->ecc.strength = 4;
+               } else {
+                       chip->ecc.bytes = 16;
+                       chip->ecc.strength = 8;
+               }
        } else {
                chip->ecc.mode = NAND_ECC_SOFT;
+               chip->ecc.algo = NAND_ECC_HAMMING;
        }
 
        if (ctrl->version == FSL_IFC_VERSION_1_1_0)
@@ -1007,10 +920,10 @@ static int fsl_ifc_chip_remove(struct fsl_ifc_mtd *priv)
        return 0;
 }
 
-static int match_bank(struct fsl_ifc_regs __iomem *ifc, int bank,
+static int match_bank(struct fsl_ifc_global __iomem *ifc_global, int bank,
                      phys_addr_t addr)
 {
-       u32 cspr = ifc_in32(&ifc->cspr_cs[bank].cspr);
+       u32 cspr = ifc_in32(&ifc_global->cspr_cs[bank].cspr);
 
        if (!(cspr & CSPR_V))
                return 0;
@@ -1024,7 +937,7 @@ static DEFINE_MUTEX(fsl_ifc_nand_mutex);
 
 static int fsl_ifc_nand_probe(struct platform_device *dev)
 {
-       struct fsl_ifc_regs __iomem *ifc;
+       struct fsl_ifc_runtime __iomem *ifc;
        struct fsl_ifc_mtd *priv;
        struct resource res;
        static const char *part_probe_types[]
@@ -1034,9 +947,9 @@ static int fsl_ifc_nand_probe(struct platform_device *dev)
        struct device_node *node = dev->dev.of_node;
        struct mtd_info *mtd;
 
-       if (!fsl_ifc_ctrl_dev || !fsl_ifc_ctrl_dev->regs)
+       if (!fsl_ifc_ctrl_dev || !fsl_ifc_ctrl_dev->rregs)
                return -ENODEV;
-       ifc = fsl_ifc_ctrl_dev->regs;
+       ifc = fsl_ifc_ctrl_dev->rregs;
 
        /* get, allocate and map the memory resource */
        ret = of_address_to_resource(node, 0, &res);
@@ -1047,7 +960,7 @@ static int fsl_ifc_nand_probe(struct platform_device *dev)
 
        /* find which chip select it is connected to */
        for (bank = 0; bank < fsl_ifc_ctrl_dev->banks; bank++) {
-               if (match_bank(ifc, bank, res.start))
+               if (match_bank(fsl_ifc_ctrl_dev->gregs, bank, res.start))
                        break;
        }
 
index cafd12d..d85fa25 100644 (file)
@@ -170,6 +170,7 @@ static int fun_chip_init(struct fsl_upm_nand *fun,
        fun->chip.read_buf = fun_read_buf;
        fun->chip.write_buf = fun_write_buf;
        fun->chip.ecc.mode = NAND_ECC_SOFT;
+       fun->chip.ecc.algo = NAND_ECC_HAMMING;
        if (fun->mchip_count > 1)
                fun->chip.select_chip = fun_select_chip;
 
index 1bdcd4f..d4f454a 100644 (file)
 #include <linux/amba/bus.h>
 #include <mtd/mtd-abi.h>
 
-static struct nand_ecclayout fsmc_ecc1_128_layout = {
-       .eccbytes = 24,
-       .eccpos = {2, 3, 4, 18, 19, 20, 34, 35, 36, 50, 51, 52,
-               66, 67, 68, 82, 83, 84, 98, 99, 100, 114, 115, 116},
-       .oobfree = {
-               {.offset = 8, .length = 8},
-               {.offset = 24, .length = 8},
-               {.offset = 40, .length = 8},
-               {.offset = 56, .length = 8},
-               {.offset = 72, .length = 8},
-               {.offset = 88, .length = 8},
-               {.offset = 104, .length = 8},
-               {.offset = 120, .length = 8}
-       }
-};
+static int fsmc_ecc1_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                  struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
 
-static struct nand_ecclayout fsmc_ecc1_64_layout = {
-       .eccbytes = 12,
-       .eccpos = {2, 3, 4, 18, 19, 20, 34, 35, 36, 50, 51, 52},
-       .oobfree = {
-               {.offset = 8, .length = 8},
-               {.offset = 24, .length = 8},
-               {.offset = 40, .length = 8},
-               {.offset = 56, .length = 8},
-       }
-};
+       if (section >= chip->ecc.steps)
+               return -ERANGE;
 
-static struct nand_ecclayout fsmc_ecc1_16_layout = {
-       .eccbytes = 3,
-       .eccpos = {2, 3, 4},
-       .oobfree = {
-               {.offset = 8, .length = 8},
-       }
-};
+       oobregion->offset = (section * 16) + 2;
+       oobregion->length = 3;
 
-/*
- * ECC4 layout for NAND of pagesize 8192 bytes & OOBsize 256 bytes. 13*16 bytes
- * of OB size is reserved for ECC, Byte no. 0 & 1 reserved for bad block and 46
- * bytes are free for use.
- */
-static struct nand_ecclayout fsmc_ecc4_256_layout = {
-       .eccbytes = 208,
-       .eccpos = {  2,   3,   4,   5,   6,   7,   8,
-               9,  10,  11,  12,  13,  14,
-               18,  19,  20,  21,  22,  23,  24,
-               25,  26,  27,  28,  29,  30,
-               34,  35,  36,  37,  38,  39,  40,
-               41,  42,  43,  44,  45,  46,
-               50,  51,  52,  53,  54,  55,  56,
-               57,  58,  59,  60,  61,  62,
-               66,  67,  68,  69,  70,  71,  72,
-               73,  74,  75,  76,  77,  78,
-               82,  83,  84,  85,  86,  87,  88,
-               89,  90,  91,  92,  93,  94,
-               98,  99, 100, 101, 102, 103, 104,
-               105, 106, 107, 108, 109, 110,
-               114, 115, 116, 117, 118, 119, 120,
-               121, 122, 123, 124, 125, 126,
-               130, 131, 132, 133, 134, 135, 136,
-               137, 138, 139, 140, 141, 142,
-               146, 147, 148, 149, 150, 151, 152,
-               153, 154, 155, 156, 157, 158,
-               162, 163, 164, 165, 166, 167, 168,
-               169, 170, 171, 172, 173, 174,
-               178, 179, 180, 181, 182, 183, 184,
-               185, 186, 187, 188, 189, 190,
-               194, 195, 196, 197, 198, 199, 200,
-               201, 202, 203, 204, 205, 206,
-               210, 211, 212, 213, 214, 215, 216,
-               217, 218, 219, 220, 221, 222,
-               226, 227, 228, 229, 230, 231, 232,
-               233, 234, 235, 236, 237, 238,
-               242, 243, 244, 245, 246, 247, 248,
-               249, 250, 251, 252, 253, 254
-       },
-       .oobfree = {
-               {.offset = 15, .length = 3},
-               {.offset = 31, .length = 3},
-               {.offset = 47, .length = 3},
-               {.offset = 63, .length = 3},
-               {.offset = 79, .length = 3},
-               {.offset = 95, .length = 3},
-               {.offset = 111, .length = 3},
-               {.offset = 127, .length = 3},
-               {.offset = 143, .length = 3},
-               {.offset = 159, .length = 3},
-               {.offset = 175, .length = 3},
-               {.offset = 191, .length = 3},
-               {.offset = 207, .length = 3},
-               {.offset = 223, .length = 3},
-               {.offset = 239, .length = 3},
-               {.offset = 255, .length = 1}
-       }
-};
+       return 0;
+}
 
-/*
- * ECC4 layout for NAND of pagesize 4096 bytes & OOBsize 224 bytes. 13*8 bytes
- * of OOB size is reserved for ECC, Byte no. 0 & 1 reserved for bad block & 118
- * bytes are free for use.
- */
-static struct nand_ecclayout fsmc_ecc4_224_layout = {
-       .eccbytes = 104,
-       .eccpos = {  2,   3,   4,   5,   6,   7,   8,
-               9,  10,  11,  12,  13,  14,
-               18,  19,  20,  21,  22,  23,  24,
-               25,  26,  27,  28,  29,  30,
-               34,  35,  36,  37,  38,  39,  40,
-               41,  42,  43,  44,  45,  46,
-               50,  51,  52,  53,  54,  55,  56,
-               57,  58,  59,  60,  61,  62,
-               66,  67,  68,  69,  70,  71,  72,
-               73,  74,  75,  76,  77,  78,
-               82,  83,  84,  85,  86,  87,  88,
-               89,  90,  91,  92,  93,  94,
-               98,  99, 100, 101, 102, 103, 104,
-               105, 106, 107, 108, 109, 110,
-               114, 115, 116, 117, 118, 119, 120,
-               121, 122, 123, 124, 125, 126
-       },
-       .oobfree = {
-               {.offset = 15, .length = 3},
-               {.offset = 31, .length = 3},
-               {.offset = 47, .length = 3},
-               {.offset = 63, .length = 3},
-               {.offset = 79, .length = 3},
-               {.offset = 95, .length = 3},
-               {.offset = 111, .length = 3},
-               {.offset = 127, .length = 97}
-       }
-};
+static int fsmc_ecc1_ooblayout_free(struct mtd_info *mtd, int section,
+                                   struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
 
-/*
- * ECC4 layout for NAND of pagesize 4096 bytes & OOBsize 128 bytes. 13*8 bytes
- * of OOB size is reserved for ECC, Byte no. 0 & 1 reserved for bad block & 22
- * bytes are free for use.
- */
-static struct nand_ecclayout fsmc_ecc4_128_layout = {
-       .eccbytes = 104,
-       .eccpos = {  2,   3,   4,   5,   6,   7,   8,
-               9,  10,  11,  12,  13,  14,
-               18,  19,  20,  21,  22,  23,  24,
-               25,  26,  27,  28,  29,  30,
-               34,  35,  36,  37,  38,  39,  40,
-               41,  42,  43,  44,  45,  46,
-               50,  51,  52,  53,  54,  55,  56,
-               57,  58,  59,  60,  61,  62,
-               66,  67,  68,  69,  70,  71,  72,
-               73,  74,  75,  76,  77,  78,
-               82,  83,  84,  85,  86,  87,  88,
-               89,  90,  91,  92,  93,  94,
-               98,  99, 100, 101, 102, 103, 104,
-               105, 106, 107, 108, 109, 110,
-               114, 115, 116, 117, 118, 119, 120,
-               121, 122, 123, 124, 125, 126
-       },
-       .oobfree = {
-               {.offset = 15, .length = 3},
-               {.offset = 31, .length = 3},
-               {.offset = 47, .length = 3},
-               {.offset = 63, .length = 3},
-               {.offset = 79, .length = 3},
-               {.offset = 95, .length = 3},
-               {.offset = 111, .length = 3},
-               {.offset = 127, .length = 1}
-       }
-};
+       if (section >= chip->ecc.steps)
+               return -ERANGE;
 
-/*
- * ECC4 layout for NAND of pagesize 2048 bytes & OOBsize 64 bytes. 13*4 bytes of
- * OOB size is reserved for ECC, Byte no. 0 & 1 reserved for bad block and 10
- * bytes are free for use.
- */
-static struct nand_ecclayout fsmc_ecc4_64_layout = {
-       .eccbytes = 52,
-       .eccpos = {  2,   3,   4,   5,   6,   7,   8,
-               9,  10,  11,  12,  13,  14,
-               18,  19,  20,  21,  22,  23,  24,
-               25,  26,  27,  28,  29,  30,
-               34,  35,  36,  37,  38,  39,  40,
-               41,  42,  43,  44,  45,  46,
-               50,  51,  52,  53,  54,  55,  56,
-               57,  58,  59,  60,  61,  62,
-       },
-       .oobfree = {
-               {.offset = 15, .length = 3},
-               {.offset = 31, .length = 3},
-               {.offset = 47, .length = 3},
-               {.offset = 63, .length = 1},
-       }
-};
+       oobregion->offset = (section * 16) + 8;
 
-/*
- * ECC4 layout for NAND of pagesize 512 bytes & OOBsize 16 bytes. 13 bytes of
- * OOB size is reserved for ECC, Byte no. 4 & 5 reserved for bad block and One
- * byte is free for use.
- */
-static struct nand_ecclayout fsmc_ecc4_16_layout = {
-       .eccbytes = 13,
-       .eccpos = { 0,  1,  2,  3,  6,  7, 8,
-               9, 10, 11, 12, 13, 14
-       },
-       .oobfree = {
-               {.offset = 15, .length = 1},
-       }
+       if (section < chip->ecc.steps - 1)
+               oobregion->length = 8;
+       else
+               oobregion->length = mtd->oobsize - oobregion->offset;
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops fsmc_ecc1_ooblayout_ops = {
+       .ecc = fsmc_ecc1_ooblayout_ecc,
+       .free = fsmc_ecc1_ooblayout_free,
 };
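
For the 1-bit hardware ECC case the callbacks encode the deleted tables as
plain arithmetic: 3 ECC bytes at section*16 + 2 and free bytes at
section*16 + 8, with the last free region running to the end of the OOB. A
standalone check against the removed fsmc_ecc1_64_layout (64-byte OOB, four
steps):

    #include <stdio.h>

    int main(void)
    {
        int oobsize = 64, steps = 4;    /* the removed fsmc_ecc1_64_layout case */
        int s;

        for (s = 0; s < steps; s++) {
            int ecc_off  = s * 16 + 2;
            int free_off = s * 16 + 8;
            int free_len = (s < steps - 1) ? 8 : oobsize - free_off;

            printf("ecc {%d,3}  free {%d,%d}\n", ecc_off, free_off, free_len);
        }
        return 0;
    }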
 
 /*
@@ -250,28 +81,46 @@ static struct nand_ecclayout fsmc_ecc4_16_layout = {
  * There are 13 bytes of ecc for every 512 byte block and it has to be read
  * consecutively and immediately after the 512 byte data block for hardware to
  * generate the error bit offsets in 512 byte data.
- * Managing the ecc bytes in the following way makes it easier for software to
- * read ecc bytes consecutive to data bytes. This way is similar to
- * oobfree structure maintained already in generic nand driver
  */
-static struct fsmc_eccplace fsmc_ecc4_lp_place = {
-       .eccplace = {
-               {.offset = 2, .length = 13},
-               {.offset = 18, .length = 13},
-               {.offset = 34, .length = 13},
-               {.offset = 50, .length = 13},
-               {.offset = 66, .length = 13},
-               {.offset = 82, .length = 13},
-               {.offset = 98, .length = 13},
-               {.offset = 114, .length = 13}
-       }
-};
+static int fsmc_ecc4_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                  struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
 
-static struct fsmc_eccplace fsmc_ecc4_sp_place = {
-       .eccplace = {
-               {.offset = 0, .length = 4},
-               {.offset = 6, .length = 9}
-       }
+       if (section >= chip->ecc.steps)
+               return -ERANGE;
+
+       oobregion->length = chip->ecc.bytes;
+
+       if (!section && mtd->writesize <= 512)
+               oobregion->offset = 0;
+       else
+               oobregion->offset = (section * 16) + 2;
+
+       return 0;
+}
+
+static int fsmc_ecc4_ooblayout_free(struct mtd_info *mtd, int section,
+                                   struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+
+       if (section >= chip->ecc.steps)
+               return -ERANGE;
+
+       oobregion->offset = (section * 16) + 15;
+
+       if (section < chip->ecc.steps - 1)
+               oobregion->length = 3;
+       else
+               oobregion->length = mtd->oobsize - oobregion->offset;
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops fsmc_ecc4_ooblayout_ops = {
+       .ecc = fsmc_ecc4_ooblayout_ecc,
+       .free = fsmc_ecc4_ooblayout_free,
 };
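
The BCH8 case packs 13 ECC bytes per 512-byte step into bytes 2..14 of each
16-byte OOB slice (byte 0 of the first slice on small-page parts) and leaves
byte 15 of each slice free, the last free region absorbing whatever OOB
remains. Reproducing the removed fsmc_ecc4_64_layout numbers as a standalone
check:

    #include <stdio.h>

    int main(void)
    {
        int oobsize = 64, steps = 4, writesize = 2048, eccbytes = 13;
        int s;

        for (s = 0; s < steps; s++) {
            int ecc_off  = (!s && writesize <= 512) ? 0 : s * 16 + 2;
            int free_off = s * 16 + 15;
            int free_len = (s < steps - 1) ? 3 : oobsize - free_off;

            printf("ecc {%d,%d}  free {%d,%d}\n", ecc_off, eccbytes, free_off, free_len);
        }
        return 0;
    }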
 
 /**
@@ -283,7 +132,6 @@ static struct fsmc_eccplace fsmc_ecc4_sp_place = {
  * @partitions:                Partition info for a NAND Flash.
  * @nr_partitions:     Total number of partition of a NAND flash.
  *
- * @ecc_place:         ECC placing locations in oobfree type format.
  * @bank:              Bank number for probed device.
  * @clk:               Clock structure for FSMC.
  *
@@ -303,7 +151,6 @@ struct fsmc_nand_data {
        struct mtd_partition    *partitions;
        unsigned int            nr_partitions;
 
-       struct fsmc_eccplace    *ecc_place;
        unsigned int            bank;
        struct device           *dev;
        enum access_mode        mode;
@@ -710,8 +557,6 @@ static void fsmc_write_buf_dma(struct mtd_info *mtd, const uint8_t *buf,
 static int fsmc_read_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip,
                                 uint8_t *buf, int oob_required, int page)
 {
-       struct fsmc_nand_data *host = mtd_to_fsmc(mtd);
-       struct fsmc_eccplace *ecc_place = host->ecc_place;
        int i, j, s, stat, eccsize = chip->ecc.size;
        int eccbytes = chip->ecc.bytes;
        int eccsteps = chip->ecc.steps;
@@ -734,9 +579,15 @@ static int fsmc_read_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip,
                chip->read_buf(mtd, p, eccsize);
 
                for (j = 0; j < eccbytes;) {
-                       off = ecc_place->eccplace[group].offset;
-                       len = ecc_place->eccplace[group].length;
-                       group++;
+                       struct mtd_oob_region oobregion;
+                       int ret;
+
+                       ret = mtd_ooblayout_ecc(mtd, group++, &oobregion);
+                       if (ret)
+                               return ret;
+
+                       off = oobregion.offset;
+                       len = oobregion.length;
 
                        /*
                         * length is intentionally kept a higher multiple of 2
@@ -1084,24 +935,10 @@ static int __init fsmc_nand_probe(struct platform_device *pdev)
        if (AMBA_REV_BITS(host->pid) >= 8) {
                switch (mtd->oobsize) {
                case 16:
-                       nand->ecc.layout = &fsmc_ecc4_16_layout;
-                       host->ecc_place = &fsmc_ecc4_sp_place;
-                       break;
                case 64:
-                       nand->ecc.layout = &fsmc_ecc4_64_layout;
-                       host->ecc_place = &fsmc_ecc4_lp_place;
-                       break;
                case 128:
-                       nand->ecc.layout = &fsmc_ecc4_128_layout;
-                       host->ecc_place = &fsmc_ecc4_lp_place;
-                       break;
                case 224:
-                       nand->ecc.layout = &fsmc_ecc4_224_layout;
-                       host->ecc_place = &fsmc_ecc4_lp_place;
-                       break;
                case 256:
-                       nand->ecc.layout = &fsmc_ecc4_256_layout;
-                       host->ecc_place = &fsmc_ecc4_lp_place;
                        break;
                default:
                        dev_warn(&pdev->dev, "No oob scheme defined for oobsize %d\n",
@@ -1109,6 +946,8 @@ static int __init fsmc_nand_probe(struct platform_device *pdev)
                        ret = -EINVAL;
                        goto err_probe;
                }
+
+               mtd_set_ooblayout(mtd, &fsmc_ecc4_ooblayout_ops);
        } else {
                switch (nand->ecc.mode) {
                case NAND_ECC_HW:
@@ -1119,9 +958,11 @@ static int __init fsmc_nand_probe(struct platform_device *pdev)
                        nand->ecc.strength = 1;
                        break;
 
-               case NAND_ECC_SOFT_BCH:
-                       dev_info(&pdev->dev, "Using 4-bit SW BCH ECC scheme\n");
-                       break;
+               case NAND_ECC_SOFT:
+                       if (nand->ecc.algo == NAND_ECC_BCH) {
+                               dev_info(&pdev->dev, "Using 4-bit SW BCH ECC scheme\n");
+                               break;
+                       }
 
                default:
                        dev_err(&pdev->dev, "Unsupported ECC mode!\n");
@@ -1132,16 +973,13 @@ static int __init fsmc_nand_probe(struct platform_device *pdev)
                 * Don't set layout for BCH4 SW ECC. This will be
                 * generated later in nand_bch_init() later.
                 */
-               if (nand->ecc.mode != NAND_ECC_SOFT_BCH) {
+               if (nand->ecc.mode == NAND_ECC_HW) {
                        switch (mtd->oobsize) {
                        case 16:
-                               nand->ecc.layout = &fsmc_ecc1_16_layout;
-                               break;
                        case 64:
-                               nand->ecc.layout = &fsmc_ecc1_64_layout;
-                               break;
                        case 128:
-                               nand->ecc.layout = &fsmc_ecc1_128_layout;
+                               mtd_set_ooblayout(mtd,
+                                                 &fsmc_ecc1_ooblayout_ops);
                                break;
                        default:
                                dev_warn(&pdev->dev,
index ded658f..6317f68 100644 (file)
@@ -273,6 +273,7 @@ static int gpio_nand_probe(struct platform_device *pdev)
        nand_set_flash_node(chip, pdev->dev.of_node);
        chip->IO_ADDR_W         = chip->IO_ADDR_R;
        chip->ecc.mode          = NAND_ECC_SOFT;
+       chip->ecc.algo          = NAND_ECC_HAMMING;
        chip->options           = gpiomtd->plat.options;
        chip->chip_delay        = gpiomtd->plat.chip_delay;
        chip->cmd_ctrl          = gpio_nand_cmd_ctrl;
index 8122c69..6e46156 100644 (file)
@@ -25,7 +25,6 @@
 #include <linux/mtd/partitions.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
-#include <linux/of_mtd.h>
 #include "gpmi-nand.h"
 #include "bch-regs.h"
 
@@ -47,10 +46,44 @@ static struct nand_bbt_descr gpmi_bbt_descr = {
  * We may change the layout if we can get the ECC info from the datasheet,
  * else we will use all the (page + OOB).
  */
-static struct nand_ecclayout gpmi_hw_ecclayout = {
-       .eccbytes = 0,
-       .eccpos = { 0, },
-       .oobfree = { {.offset = 0, .length = 0} }
+static int gpmi_ooblayout_ecc(struct mtd_info *mtd, int section,
+                             struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct gpmi_nand_data *this = nand_get_controller_data(chip);
+       struct bch_geometry *geo = &this->bch_geometry;
+
+       if (section)
+               return -ERANGE;
+
+       oobregion->offset = 0;
+       oobregion->length = geo->page_size - mtd->writesize;
+
+       return 0;
+}
+
+static int gpmi_ooblayout_free(struct mtd_info *mtd, int section,
+                              struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct gpmi_nand_data *this = nand_get_controller_data(chip);
+       struct bch_geometry *geo = &this->bch_geometry;
+
+       if (section)
+               return -ERANGE;
+
+       /* Whatever OOB space the BCH geometry does not consume is free. */
+       if (geo->page_size < mtd->writesize + mtd->oobsize) {
+               oobregion->offset = geo->page_size - mtd->writesize;
+               oobregion->length = mtd->oobsize - oobregion->offset;
+       }
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops gpmi_ooblayout_ops = {
+       .ecc = gpmi_ooblayout_ecc,
+       .free = gpmi_ooblayout_free,
 };
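
gpmi reports a single ECC region covering everything the BCH engine appends
after the in-band data (geo->page_size - mtd->writesize), and a free region
only when that total falls short of writesize + oobsize. A standalone sketch
of the arithmetic; the geometry numbers are invented and only the page_size
formula follows set_geometry_by_ecc_info() below:

    #include <stdio.h>

    int main(void)
    {
        int writesize = 4096, oobsize = 224;
        int metadata = 10, gf_len = 13, strength = 8, chunks = 8;
        int page_size = writesize + metadata + (gf_len * strength * chunks) / 8;
        int ecc_len = page_size - writesize;

        printf("ecc region:  offset=0 length=%d\n", ecc_len);
        if (page_size < writesize + oobsize)
            printf("free region: offset=%d length=%d\n",
                   ecc_len, writesize + oobsize - page_size);
        else
            printf("no free OOB left\n");
        return 0;
    }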
 
 static const struct gpmi_devdata gpmi_devdata_imx23 = {
@@ -141,7 +174,6 @@ static int set_geometry_by_ecc_info(struct gpmi_nand_data *this)
        struct bch_geometry *geo = &this->bch_geometry;
        struct nand_chip *chip = &this->nand;
        struct mtd_info *mtd = nand_to_mtd(chip);
-       struct nand_oobfree *of = gpmi_hw_ecclayout.oobfree;
        unsigned int block_mark_bit_offset;
 
        if (!(chip->ecc_strength_ds > 0 && chip->ecc_step_ds > 0))
@@ -229,12 +261,6 @@ static int set_geometry_by_ecc_info(struct gpmi_nand_data *this)
        geo->page_size = mtd->writesize + geo->metadata_size +
                (geo->gf_len * geo->ecc_strength * geo->ecc_chunk_count) / 8;
 
-       /* The available oob size we have. */
-       if (geo->page_size < mtd->writesize + mtd->oobsize) {
-               of->offset = geo->page_size - mtd->writesize;
-               of->length = mtd->oobsize - of->offset;
-       }
-
        geo->payload_size = mtd->writesize;
 
        geo->auxiliary_status_offset = ALIGN(geo->metadata_size, 4);
@@ -797,6 +823,7 @@ static void gpmi_free_dma_buffer(struct gpmi_nand_data *this)
 
        this->cmd_buffer        = NULL;
        this->data_buffer_dma   = NULL;
+       this->raw_buffer        = NULL;
        this->page_buffer_virt  = NULL;
        this->page_buffer_size  =  0;
 }
@@ -1037,14 +1064,87 @@ static int gpmi_ecc_read_page(struct mtd_info *mtd, struct nand_chip *chip,
        /* Loop over status bytes, accumulating ECC status. */
        status = auxiliary_virt + nfc_geo->auxiliary_status_offset;
 
+       read_page_swap_end(this, buf, nfc_geo->payload_size,
+                          this->payload_virt, this->payload_phys,
+                          nfc_geo->payload_size,
+                          payload_virt, payload_phys);
+
        for (i = 0; i < nfc_geo->ecc_chunk_count; i++, status++) {
                if ((*status == STATUS_GOOD) || (*status == STATUS_ERASED))
                        continue;
 
                if (*status == STATUS_UNCORRECTABLE) {
+                       int eccbits = nfc_geo->ecc_strength * nfc_geo->gf_len;
+                       u8 *eccbuf = this->raw_buffer;
+                       int offset, bitoffset;
+                       int eccbytes;
+                       int flips;
+
+                       /* Read ECC bytes into our internal raw_buffer */
+                       offset = nfc_geo->metadata_size * 8;
+                       offset += ((8 * nfc_geo->ecc_chunk_size) + eccbits) * (i + 1);
+                       offset -= eccbits;
+                       bitoffset = offset % 8;
+                       eccbytes = DIV_ROUND_UP(offset + eccbits, 8);
+                       offset /= 8;
+                       eccbytes -= offset;
+                       chip->cmdfunc(mtd, NAND_CMD_RNDOUT, offset, -1);
+                       chip->read_buf(mtd, eccbuf, eccbytes);
+
+                       /*
+                        * ECC data are not byte aligned and we may have
+                        * in-band data in the first and last byte of
+                        * eccbuf. Set non-eccbits to one so that
+                        * nand_check_erased_ecc_chunk() does not count them
+                        * as bitflips.
+                        */
+                       if (bitoffset)
+                               eccbuf[0] |= GENMASK(bitoffset - 1, 0);
+
+                       bitoffset = (bitoffset + eccbits) % 8;
+                       if (bitoffset)
+                               eccbuf[eccbytes - 1] |= GENMASK(7, bitoffset);
+
+                       /*
+                        * The ECC hardware has an uncorrectable ECC status
+                        * code in case we have bitflips in an erased page. As
+                        * nothing was written into this subpage the ECC is
+                        * obviously wrong and we cannot trust it. We assume
+                        * at this point that we are reading an erased page and
+                        * try to correct the bitflips in the buffer up to
+                        * ecc_strength bitflips. If this is a page with random
+                        * data, we exceed this number of bitflips and have an
+                        * ECC failure. Otherwise we use the corrected buffer.
+                        */
+                       if (i == 0) {
+                               /* The first block includes metadata */
+                               flips = nand_check_erased_ecc_chunk(
+                                               buf + i * nfc_geo->ecc_chunk_size,
+                                               nfc_geo->ecc_chunk_size,
+                                               eccbuf, eccbytes,
+                                               auxiliary_virt,
+                                               nfc_geo->metadata_size,
+                                               nfc_geo->ecc_strength);
+                       } else {
+                               flips = nand_check_erased_ecc_chunk(
+                                               buf + i * nfc_geo->ecc_chunk_size,
+                                               nfc_geo->ecc_chunk_size,
+                                               eccbuf, eccbytes,
+                                               NULL, 0,
+                                               nfc_geo->ecc_strength);
+                       }
+
+                       if (flips > 0) {
+                               max_bitflips = max_t(unsigned int, max_bitflips,
+                                                    flips);
+                               mtd->ecc_stats.corrected += flips;
+                               continue;
+                       }
+
                        mtd->ecc_stats.failed++;
                        continue;
                }
+
                mtd->ecc_stats.corrected += *status;
                max_bitflips = max_t(unsigned int, max_bitflips, *status);
        }
@@ -1064,11 +1164,6 @@ static int gpmi_ecc_read_page(struct mtd_info *mtd, struct nand_chip *chip,
                chip->oob_poi[0] = ((uint8_t *) auxiliary_virt)[0];
        }
 
-       read_page_swap_end(this, buf, nfc_geo->payload_size,
-                       this->payload_virt, this->payload_phys,
-                       nfc_geo->payload_size,
-                       payload_virt, payload_phys);
-
        return max_bitflips;
 }
 
@@ -1327,18 +1422,19 @@ static int gpmi_ecc_read_oob(struct mtd_info *mtd, struct nand_chip *chip,
 static int
 gpmi_ecc_write_oob(struct mtd_info *mtd, struct nand_chip *chip, int page)
 {
-       struct nand_oobfree *of = mtd->ecclayout->oobfree;
+       struct mtd_oob_region of = { };
        int status = 0;
 
        /* Do we have available oob area? */
-       if (!of->length)
+       mtd_ooblayout_free(mtd, 0, &of);
+       if (!of.length)
                return -EPERM;
 
        if (!nand_is_slc(chip))
                return -EPERM;
 
-       chip->cmdfunc(mtd, NAND_CMD_SEQIN, mtd->writesize + of->offset, page);
-       chip->write_buf(mtd, chip->oob_poi + of->offset, of->length);
+       chip->cmdfunc(mtd, NAND_CMD_SEQIN, mtd->writesize + of.offset, page);
+       chip->write_buf(mtd, chip->oob_poi + of.offset, of.length);
        chip->cmdfunc(mtd, NAND_CMD_PAGEPROG, -1, -1);
 
        status = chip->waitfunc(mtd, chip);
@@ -1840,6 +1936,7 @@ static void gpmi_nand_exit(struct gpmi_nand_data *this)
 static int gpmi_init_last(struct gpmi_nand_data *this)
 {
        struct nand_chip *chip = &this->nand;
+       struct mtd_info *mtd = nand_to_mtd(chip);
        struct nand_ecc_ctrl *ecc = &chip->ecc;
        struct bch_geometry *bch_geo = &this->bch_geometry;
        int ret;
@@ -1861,7 +1958,7 @@ static int gpmi_init_last(struct gpmi_nand_data *this)
        ecc->mode       = NAND_ECC_HW;
        ecc->size       = bch_geo->ecc_chunk_size;
        ecc->strength   = bch_geo->ecc_strength;
-       ecc->layout     = &gpmi_hw_ecclayout;
+       mtd_set_ooblayout(mtd, &gpmi_ooblayout_ops);
 
        /*
         * We only enable the subpage read when:
@@ -1914,16 +2011,6 @@ static int gpmi_nand_init(struct gpmi_nand_data *this)
        /* Set up swap_block_mark, must be set before the gpmi_set_geometry() */
        this->swap_block_mark = !GPMI_IS_MX23(this);
 
-       if (of_get_nand_on_flash_bbt(this->dev->of_node)) {
-               chip->bbt_options |= NAND_BBT_USE_FLASH | NAND_BBT_NO_OOB;
-
-               if (of_property_read_bool(this->dev->of_node,
-                                               "fsl,no-blockmark-swap"))
-                       this->swap_block_mark = false;
-       }
-       dev_dbg(this->dev, "Blockmark swapping %sabled\n",
-               this->swap_block_mark ? "en" : "dis");
-
        /*
         * Allocate a temporary DMA buffer for reading ID in the
         * nand_scan_ident().
@@ -1938,6 +2025,16 @@ static int gpmi_nand_init(struct gpmi_nand_data *this)
        if (ret)
                goto err_out;
 
+       if (chip->bbt_options & NAND_BBT_USE_FLASH) {
+               chip->bbt_options |= NAND_BBT_NO_OOB;
+
+               if (of_property_read_bool(this->dev->of_node,
+                                               "fsl,no-blockmark-swap"))
+                       this->swap_block_mark = false;
+       }
+       dev_dbg(this->dev, "Blockmark swapping %sabled\n",
+               this->swap_block_mark ? "en" : "dis");
+
        ret = gpmi_init_last(this);
        if (ret)
                goto err_out;
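
Illustrative aside: the on-flash-BBT check moves below nand_scan_ident() because the core now handles the generic DT properties itself. A rough, non-authoritative sketch of the assumed call flow (nand_dt_init() is the helper added later in this patch):

    int nand_scan_ident_sketch(struct mtd_info *mtd, int maxchips,
                               struct nand_flash_dev *table)
    {
            struct nand_chip *chip = mtd_to_nand(mtd);
            int ret;

            /* Parses nand-ecc-*, nand-bus-width, nand-on-flash-bbt, ... */
            ret = nand_dt_init(chip);
            if (ret)
                    return ret;

            /*
             * Chip detection continues here; afterwards the driver may
             * safely test chip->bbt_options & NAND_BBT_USE_FLASH.
             */
            return 0;
    }
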
index 96502b6..9432546 100644
@@ -19,7 +19,6 @@
  * GNU General Public License for more details.
  */
 #include <linux/of.h>
-#include <linux/of_mtd.h>
 #include <linux/mtd/mtd.h>
 #include <linux/sizes.h>
 #include <linux/clk.h>
@@ -631,8 +630,28 @@ static void hisi_nfc_host_init(struct hinfc_host *host)
        hinfc_write(host, HINFC504_INTEN_DMA, HINFC504_INTEN);
 }
 
-static struct nand_ecclayout nand_ecc_2K_16bits = {
-       .oobfree = { {2, 6} },
+static int hisi_ooblayout_ecc(struct mtd_info *mtd, int section,
+                             struct mtd_oob_region *oobregion)
+{
+       /* FIXME: add ECC bytes position */
+       return -ENOTSUPP;
+}
+
+static int hisi_ooblayout_free(struct mtd_info *mtd, int section,
+                              struct mtd_oob_region *oobregion)
+{
+       if (section)
+               return -ERANGE;
+
+       oobregion->offset = 2;
+       oobregion->length = 6;
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops hisi_ooblayout_ops = {
+       .ecc = hisi_ooblayout_ecc,
+       .free = hisi_ooblayout_free,
 };
 
 static int hisi_nfc_ecc_probe(struct hinfc_host *host)
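
Illustrative aside: callers discover the regions exposed by hisi_ooblayout_ops by iterating sections until a callback returns -ERANGE. A small sketch (count_free_oob_bytes is a made-up name):

    static int count_free_oob_bytes(struct mtd_info *mtd)
    {
            struct mtd_oob_region region;
            int section = 0, total = 0;

            while (!mtd_ooblayout_free(mtd, section++, &region))
                    total += region.length;

            return total;   /* 6 bytes for the single {2, 6} region above */
    }
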
@@ -642,10 +661,9 @@ static int hisi_nfc_ecc_probe(struct hinfc_host *host)
        struct device *dev = host->dev;
        struct nand_chip *chip = &host->chip;
        struct mtd_info *mtd = nand_to_mtd(chip);
-       struct device_node *np = host->dev->of_node;
 
-       size = of_get_nand_ecc_step_size(np);
-       strength = of_get_nand_ecc_strength(np);
+       size = chip->ecc.size;
+       strength = chip->ecc.strength;
        if (size != 1024) {
                dev_err(dev, "error ecc size: %d\n", size);
                return -EINVAL;
@@ -668,7 +686,7 @@ static int hisi_nfc_ecc_probe(struct hinfc_host *host)
        case 16:
                ecc_bits = 6;
                if (mtd->writesize == 2048)
-                       chip->ecc.layout = &nand_ecc_2K_16bits;
+                       mtd_set_ooblayout(mtd, &hisi_ooblayout_ops);
 
                /* TODO: add more page size support */
                break;
@@ -695,7 +713,7 @@ static int hisi_nfc_ecc_probe(struct hinfc_host *host)
 
 static int hisi_nfc_probe(struct platform_device *pdev)
 {
-       int ret = 0, irq, buswidth, flag, max_chips = HINFC504_MAX_CHIP;
+       int ret = 0, irq, flag, max_chips = HINFC504_MAX_CHIP;
        struct device *dev = &pdev->dev;
        struct hinfc_host *host;
        struct nand_chip  *chip;
@@ -747,12 +765,6 @@ static int hisi_nfc_probe(struct platform_device *pdev)
        chip->read_buf          = hisi_nfc_read_buf;
        chip->chip_delay        = HINFC504_CHIP_DELAY;
 
-       chip->ecc.mode = of_get_nand_ecc_mode(np);
-
-       buswidth = of_get_nand_bus_width(np);
-       if (buswidth == 16)
-               chip->options |= NAND_BUSWIDTH_16;
-
        hisi_nfc_host_init(host);
 
        ret = devm_request_irq(dev, irq, hinfc_irq_handle, 0x0, "nandc", host);
index 673ceb2..5551c36 100644
@@ -221,7 +221,6 @@ static int jz_nand_correct_ecc_rs(struct mtd_info *mtd, uint8_t *dat,
        struct jz_nand *nand = mtd_to_jz_nand(mtd);
        int i, error_count, index;
        uint32_t reg, status, error;
-       uint32_t t;
        unsigned int timeout = 1000;
 
        for (i = 0; i < 9; ++i)
@@ -476,7 +475,7 @@ static int jz_nand_probe(struct platform_device *pdev)
        }
 
        if (pdata && pdata->ident_callback) {
-               pdata->ident_callback(pdev, chip, &pdata->partitions,
+               pdata->ident_callback(pdev, mtd, &pdata->partitions,
                                        &pdata->num_partitions);
        }
 
index 755499c..d74f4ba 100644
@@ -287,7 +287,6 @@ static struct jz4780_bch *jz4780_bch_get(struct device_node *np)
        bch = platform_get_drvdata(pdev);
        clk_prepare_enable(bch->clk);
 
-       bch->dev = &pdev->dev;
        return bch;
 }
 
index e1c016c..daf3c42 100644
@@ -17,7 +17,6 @@
 #include <linux/of.h>
 #include <linux/of_address.h>
 #include <linux/gpio/consumer.h>
-#include <linux/of_mtd.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
 #include <linux/mtd/mtd.h>
@@ -56,8 +55,6 @@ struct jz4780_nand_chip {
        struct nand_chip chip;
        struct list_head chip_list;
 
-       struct nand_ecclayout ecclayout;
-
        struct gpio_desc *busy_gpio;
        struct gpio_desc *wp_gpio;
        unsigned int reading: 1;
@@ -165,8 +162,7 @@ static int jz4780_nand_init_ecc(struct jz4780_nand_chip *nand, struct device *de
        struct nand_chip *chip = &nand->chip;
        struct mtd_info *mtd = nand_to_mtd(chip);
        struct jz4780_nand_controller *nfc = to_jz4780_nand_controller(chip->controller);
-       struct nand_ecclayout *layout = &nand->ecclayout;
-       u32 start, i;
+       int eccbytes;
 
        chip->ecc.bytes = fls((1 + 8) * chip->ecc.size) *
                                (chip->ecc.strength / 8);
@@ -183,7 +179,6 @@ static int jz4780_nand_init_ecc(struct jz4780_nand_chip *nand, struct device *de
                chip->ecc.correct = jz4780_nand_ecc_correct;
                /* fall through */
        case NAND_ECC_SOFT:
-       case NAND_ECC_SOFT_BCH:
                dev_info(dev, "using %s (strength %d, size %d, bytes %d)\n",
                        (nfc->bch) ? "hardware BCH" : "software ECC",
                        chip->ecc.strength, chip->ecc.size, chip->ecc.bytes);
@@ -201,23 +196,17 @@ static int jz4780_nand_init_ecc(struct jz4780_nand_chip *nand, struct device *de
                return 0;
 
        /* Generate ECC layout. ECC codes are right aligned in the OOB area. */
-       layout->eccbytes = mtd->writesize / chip->ecc.size * chip->ecc.bytes;
+       eccbytes = mtd->writesize / chip->ecc.size * chip->ecc.bytes;
 
-       if (layout->eccbytes > mtd->oobsize - 2) {
+       if (eccbytes > mtd->oobsize - 2) {
                dev_err(dev,
                        "invalid ECC config: required %d ECC bytes, but only %d are available",
-                       layout->eccbytes, mtd->oobsize - 2);
+                       eccbytes, mtd->oobsize - 2);
                return -EINVAL;
        }
 
-       start = mtd->oobsize - layout->eccbytes;
-       for (i = 0; i < layout->eccbytes; i++)
-               layout->eccpos[i] = start + i;
-
-       layout->oobfree[0].offset = 2;
-       layout->oobfree[0].length = mtd->oobsize - layout->eccbytes - 2;
+       mtd->ooblayout = &nand_ooblayout_lp_ops;
 
-       chip->ecc.layout = layout;
        return 0;
 }
 
index d8c3e7a..8523881 100644
@@ -35,7 +35,6 @@
 #include <linux/completion.h>
 #include <linux/interrupt.h>
 #include <linux/of.h>
-#include <linux/of_mtd.h>
 #include <linux/of_gpio.h>
 #include <linux/mtd/lpc32xx_mlc.h>
 #include <linux/io.h>
@@ -139,22 +138,37 @@ struct lpc32xx_nand_cfg_mlc {
        unsigned num_parts;
 };
 
-static struct nand_ecclayout lpc32xx_nand_oob = {
-       .eccbytes = 40,
-       .eccpos = { 6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-                  22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-                  38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-                  54, 55, 56, 57, 58, 59, 60, 61, 62, 63 },
-       .oobfree = {
-               { .offset = 0,
-                 .length = 6, },
-               { .offset = 16,
-                 .length = 6, },
-               { .offset = 32,
-                 .length = 6, },
-               { .offset = 48,
-                 .length = 6, },
-               },
+static int lpc32xx_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *nand_chip = mtd_to_nand(mtd);
+
+       if (section >= nand_chip->ecc.steps)
+               return -ERANGE;
+
+       oobregion->offset = ((section + 1) * 16) - nand_chip->ecc.bytes;
+       oobregion->length = nand_chip->ecc.bytes;
+
+       return 0;
+}
+
+static int lpc32xx_ooblayout_free(struct mtd_info *mtd, int section,
+                                 struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *nand_chip = mtd_to_nand(mtd);
+
+       if (section >= nand_chip->ecc.steps)
+               return -ERANGE;
+
+       oobregion->offset = 16 * section;
+       oobregion->length = 16 - nand_chip->ecc.bytes;
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops lpc32xx_ooblayout_ops = {
+       .ecc = lpc32xx_ooblayout_ecc,
+       .free = lpc32xx_ooblayout_free,
 };
 
 static struct nand_bbt_descr lpc32xx_nand_bbt = {
@@ -713,6 +727,7 @@ static int lpc32xx_nand_probe(struct platform_device *pdev)
        nand_chip->ecc.write_oob = lpc32xx_write_oob;
        nand_chip->ecc.read_oob = lpc32xx_read_oob;
        nand_chip->ecc.strength = 4;
+       nand_chip->ecc.bytes = 10;
        nand_chip->waitfunc = lpc32xx_waitfunc;
 
        nand_chip->options = NAND_NO_SUBPAGE_WRITE;
@@ -751,7 +766,7 @@ static int lpc32xx_nand_probe(struct platform_device *pdev)
 
        nand_chip->ecc.mode = NAND_ECC_HW;
        nand_chip->ecc.size = 512;
-       nand_chip->ecc.layout = &lpc32xx_nand_oob;
+       mtd_set_ooblayout(mtd, &lpc32xx_ooblayout_ops);
        host->mlcsubpages = mtd->writesize / 512;
 
        /* initially clear interrupt status */
index 3b8f373..8d3edc3 100644
@@ -35,7 +35,6 @@
 #include <linux/mtd/nand_ecc.h>
 #include <linux/gpio.h>
 #include <linux/of.h>
-#include <linux/of_mtd.h>
 #include <linux/of_gpio.h>
 #include <linux/mtd/lpc32xx_slc.h>
 
  * NAND ECC Layout for small page NAND devices
  * Note: For large and huge page devices, the default layouts are used
  */
-static struct nand_ecclayout lpc32xx_nand_oob_16 = {
-       .eccbytes = 6,
-       .eccpos = {10, 11, 12, 13, 14, 15},
-       .oobfree = {
-               { .offset = 0, .length = 4 },
-               { .offset = 6, .length = 4 },
-       },
+static int lpc32xx_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                struct mtd_oob_region *oobregion)
+{
+       if (section)
+               return -ERANGE;
+
+       oobregion->length = 6;
+       oobregion->offset = 10;
+
+       return 0;
+}
+
+static int lpc32xx_ooblayout_free(struct mtd_info *mtd, int section,
+                                 struct mtd_oob_region *oobregion)
+{
+       if (section > 1)
+               return -ERANGE;
+
+       if (!section) {
+               oobregion->offset = 0;
+               oobregion->length = 4;
+       } else {
+               oobregion->offset = 6;
+               oobregion->length = 4;
+       }
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops lpc32xx_ooblayout_ops = {
+       .ecc = lpc32xx_ooblayout_ecc,
+       .free = lpc32xx_ooblayout_free,
 };
 
 static u8 bbt_pattern[] = {'B', 'b', 't', '0' };
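
Illustrative aside: for the 16-byte OOB of a small-page device these callbacks describe the same placement as the removed lpc32xx_nand_oob_16 table, namely ECC at bytes 10-15 and free areas at 0-3 and 6-9. A throwaway debug helper (dump_slc_oob_layout is a made-up name) that would print the regions:

    static void dump_slc_oob_layout(struct mtd_info *mtd)
    {
            struct mtd_oob_region r;
            int i;

            for (i = 0; !mtd_ooblayout_ecc(mtd, i, &r); i++)
                    pr_info("ecc[%d]:  offset %d, length %d\n", i, r.offset, r.length);
            for (i = 0; !mtd_ooblayout_free(mtd, i, &r); i++)
                    pr_info("free[%d]: offset %d, length %d\n", i, r.offset, r.length);
    }
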
@@ -194,7 +218,6 @@ struct lpc32xx_nand_cfg_slc {
        uint32_t rwidth;
        uint32_t rhold;
        uint32_t rsetup;
-       bool use_bbt;
        int wp_gpio;
        struct mtd_partition *parts;
        unsigned num_parts;
@@ -604,7 +627,8 @@ static int lpc32xx_nand_read_page_syndrome(struct mtd_info *mtd,
                                           int oob_required, int page)
 {
        struct lpc32xx_nand_host *host = nand_get_controller_data(chip);
-       int stat, i, status;
+       struct mtd_oob_region oobregion = { };
+       int stat, i, status, error;
        uint8_t *oobecc, tmpecc[LPC32XX_ECC_SAVE_SIZE];
 
        /* Issue read command */
@@ -620,7 +644,11 @@ static int lpc32xx_nand_read_page_syndrome(struct mtd_info *mtd,
        lpc32xx_slc_ecc_copy(tmpecc, (uint32_t *) host->ecc_buf, chip->ecc.steps);
 
        /* Pointer to ECC data retrieved from NAND spare area */
-       oobecc = chip->oob_poi + chip->ecc.layout->eccpos[0];
+       error = mtd_ooblayout_ecc(mtd, 0, &oobregion);
+       if (error)
+               return error;
+
+       oobecc = chip->oob_poi + oobregion.offset;
 
        for (i = 0; i < chip->ecc.steps; i++) {
                stat = chip->ecc.correct(mtd, buf, oobecc,
@@ -666,7 +694,8 @@ static int lpc32xx_nand_write_page_syndrome(struct mtd_info *mtd,
                                            int oob_required, int page)
 {
        struct lpc32xx_nand_host *host = nand_get_controller_data(chip);
-       uint8_t *pb = chip->oob_poi + chip->ecc.layout->eccpos[0];
+       struct mtd_oob_region oobregion = { };
+       uint8_t *pb;
        int error;
 
        /* Write data, calculate ECC on outbound data */
@@ -678,6 +707,11 @@ static int lpc32xx_nand_write_page_syndrome(struct mtd_info *mtd,
         * The calculated ECC needs some manual work done to it before
         * committing it to NAND. Process the calculated ECC and place
         * the resultant values directly into the OOB buffer. */
+       error = mtd_ooblayout_ecc(mtd, 0, &oobregion);
+       if (error)
+               return error;
+
+       pb = chip->oob_poi + oobregion.offset;
        lpc32xx_slc_ecc_copy(pb, (uint32_t *)host->ecc_buf, chip->ecc.steps);
 
        /* Write ECC data to device */
@@ -747,7 +781,6 @@ static struct lpc32xx_nand_cfg_slc *lpc32xx_parse_dt(struct device *dev)
                return NULL;
        }
 
-       ncfg->use_bbt = of_get_nand_on_flash_bbt(np);
        ncfg->wp_gpio = of_get_named_gpio(np, "gpios", 0);
 
        return ncfg;
@@ -875,26 +908,22 @@ static int lpc32xx_nand_probe(struct platform_device *pdev)
         * custom BBT marker layout.
         */
        if (mtd->writesize <= 512)
-               chip->ecc.layout = &lpc32xx_nand_oob_16;
+               mtd_set_ooblayout(mtd, &lpc32xx_ooblayout_ops);
 
        /* These sizes remain the same regardless of page size */
        chip->ecc.size = 256;
        chip->ecc.bytes = LPC32XX_SLC_DEV_ECC_BYTES;
        chip->ecc.prepad = chip->ecc.postpad = 0;
 
-       /* Avoid extra scan if using BBT, setup BBT support */
-       if (host->ncfg->use_bbt) {
-               chip->bbt_options |= NAND_BBT_USE_FLASH;
-
-               /*
-                * Use a custom BBT marker setup for small page FLASH that
-                * won't interfere with the ECC layout. Large and huge page
-                * FLASH use the standard layout.
-                */
-               if (mtd->writesize <= 512) {
-                       chip->bbt_td = &bbt_smallpage_main_descr;
-                       chip->bbt_md = &bbt_smallpage_mirror_descr;
-               }
+       /*
+        * Use a custom BBT marker setup for small page FLASH that
+        * won't interfere with the ECC layout. Large and huge page
+        * FLASH use the standard layout.
+        */
+       if ((chip->bbt_options & NAND_BBT_USE_FLASH) &&
+           mtd->writesize <= 512) {
+               chip->bbt_td = &bbt_smallpage_main_descr;
+               chip->bbt_md = &bbt_smallpage_mirror_descr;
        }
 
        /*
index 5d7843f..7eacb2f 100644
@@ -710,6 +710,7 @@ static int mpc5121_nfc_probe(struct platform_device *op)
        chip->select_chip = mpc5121_nfc_select_chip;
        chip->bbt_options = NAND_BBT_USE_FLASH;
        chip->ecc.mode = NAND_ECC_SOFT;
+       chip->ecc.algo = NAND_ECC_HAMMING;
 
        /* Support external chip-select logic on ADS5121 board */
        if (of_machine_is_compatible("fsl,mpc5121ads")) {
index 854c832..5173fad 100644
@@ -34,7 +34,6 @@
 #include <linux/completion.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
-#include <linux/of_mtd.h>
 
 #include <asm/mach/flash.h>
 #include <linux/platform_data/mtd-mxc_nand.h>
@@ -149,7 +148,7 @@ struct mxc_nand_devtype_data {
        int (*check_int)(struct mxc_nand_host *);
        void (*irq_control)(struct mxc_nand_host *, int);
        u32 (*get_ecc_status)(struct mxc_nand_host *);
-       struct nand_ecclayout *ecclayout_512, *ecclayout_2k, *ecclayout_4k;
+       const struct mtd_ooblayout_ops *ooblayout;
        void (*select_chip)(struct mtd_info *mtd, int chip);
        int (*correct_data)(struct mtd_info *mtd, u_char *dat,
                        u_char *read_ecc, u_char *calc_ecc);
@@ -200,73 +199,6 @@ struct mxc_nand_host {
        struct mxc_nand_platform_data pdata;
 };
 
-/* OOB placement block for use with hardware ecc generation */
-static struct nand_ecclayout nandv1_hw_eccoob_smallpage = {
-       .eccbytes = 5,
-       .eccpos = {6, 7, 8, 9, 10},
-       .oobfree = {{0, 5}, {12, 4}, }
-};
-
-static struct nand_ecclayout nandv1_hw_eccoob_largepage = {
-       .eccbytes = 20,
-       .eccpos = {6, 7, 8, 9, 10, 22, 23, 24, 25, 26,
-                  38, 39, 40, 41, 42, 54, 55, 56, 57, 58},
-       .oobfree = {{2, 4}, {11, 10}, {27, 10}, {43, 10}, {59, 5}, }
-};
-
-/* OOB description for 512 byte pages with 16 byte OOB */
-static struct nand_ecclayout nandv2_hw_eccoob_smallpage = {
-       .eccbytes = 1 * 9,
-       .eccpos = {
-                7,  8,  9, 10, 11, 12, 13, 14, 15
-       },
-       .oobfree = {
-               {.offset = 0, .length = 5}
-       }
-};
-
-/* OOB description for 2048 byte pages with 64 byte OOB */
-static struct nand_ecclayout nandv2_hw_eccoob_largepage = {
-       .eccbytes = 4 * 9,
-       .eccpos = {
-                7,  8,  9, 10, 11, 12, 13, 14, 15,
-               23, 24, 25, 26, 27, 28, 29, 30, 31,
-               39, 40, 41, 42, 43, 44, 45, 46, 47,
-               55, 56, 57, 58, 59, 60, 61, 62, 63
-       },
-       .oobfree = {
-               {.offset = 2, .length = 4},
-               {.offset = 16, .length = 7},
-               {.offset = 32, .length = 7},
-               {.offset = 48, .length = 7}
-       }
-};
-
-/* OOB description for 4096 byte pages with 128 byte OOB */
-static struct nand_ecclayout nandv2_hw_eccoob_4k = {
-       .eccbytes = 8 * 9,
-       .eccpos = {
-               7,  8,  9, 10, 11, 12, 13, 14, 15,
-               23, 24, 25, 26, 27, 28, 29, 30, 31,
-               39, 40, 41, 42, 43, 44, 45, 46, 47,
-               55, 56, 57, 58, 59, 60, 61, 62, 63,
-               71, 72, 73, 74, 75, 76, 77, 78, 79,
-               87, 88, 89, 90, 91, 92, 93, 94, 95,
-               103, 104, 105, 106, 107, 108, 109, 110, 111,
-               119, 120, 121, 122, 123, 124, 125, 126, 127,
-       },
-       .oobfree = {
-               {.offset = 2, .length = 4},
-               {.offset = 16, .length = 7},
-               {.offset = 32, .length = 7},
-               {.offset = 48, .length = 7},
-               {.offset = 64, .length = 7},
-               {.offset = 80, .length = 7},
-               {.offset = 96, .length = 7},
-               {.offset = 112, .length = 7},
-       }
-};
-
 static const char * const part_probes[] = {
        "cmdlinepart", "RedBoot", "ofpart", NULL };
 
@@ -942,6 +874,99 @@ static void mxc_do_addr_cycle(struct mtd_info *mtd, int column, int page_addr)
        }
 }
 
+static int mxc_v1_ooblayout_ecc(struct mtd_info *mtd, int section,
+                               struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *nand_chip = mtd_to_nand(mtd);
+
+       if (section >= nand_chip->ecc.steps)
+               return -ERANGE;
+
+       oobregion->offset = (section * 16) + 6;
+       oobregion->length = nand_chip->ecc.bytes;
+
+       return 0;
+}
+
+static int mxc_v1_ooblayout_free(struct mtd_info *mtd, int section,
+                                struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *nand_chip = mtd_to_nand(mtd);
+
+       if (section > nand_chip->ecc.steps)
+               return -ERANGE;
+
+       if (!section) {
+               if (mtd->writesize <= 512) {
+                       oobregion->offset = 0;
+                       oobregion->length = 5;
+               } else {
+                       oobregion->offset = 2;
+                       oobregion->length = 4;
+               }
+       } else {
+               oobregion->offset = ((section - 1) * 16) +
+                                   nand_chip->ecc.bytes + 6;
+               if (section < nand_chip->ecc.steps)
+                       oobregion->length = (section * 16) + 6 -
+                                           oobregion->offset;
+               else
+                       oobregion->length = mtd->oobsize - oobregion->offset;
+       }
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops mxc_v1_ooblayout_ops = {
+       .ecc = mxc_v1_ooblayout_ecc,
+       .free = mxc_v1_ooblayout_free,
+};
+
+static int mxc_v2_ooblayout_ecc(struct mtd_info *mtd, int section,
+                               struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *nand_chip = mtd_to_nand(mtd);
+       int stepsize = nand_chip->ecc.bytes == 9 ? 16 : 26;
+
+       if (section >= nand_chip->ecc.steps)
+               return -ERANGE;
+
+       oobregion->offset = (section * stepsize) + 7;
+       oobregion->length = nand_chip->ecc.bytes;
+
+       return 0;
+}
+
+static int mxc_v2_ooblayout_free(struct mtd_info *mtd, int section,
+                                struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *nand_chip = mtd_to_nand(mtd);
+       int stepsize = nand_chip->ecc.bytes == 9 ? 16 : 26;
+
+       if (section > nand_chip->ecc.steps)
+               return -ERANGE;
+
+       if (!section) {
+               if (mtd->writesize <= 512) {
+                       oobregion->offset = 0;
+                       oobregion->length = 5;
+               } else {
+                       oobregion->offset = 2;
+                       oobregion->length = 4;
+               }
+       } else {
+               oobregion->offset = section * stepsize;
+               oobregion->length = 7;
+       }
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops mxc_v2_ooblayout_ops = {
+       .ecc = mxc_v2_ooblayout_ecc,
+       .free = mxc_v2_ooblayout_free,
+};
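
Illustrative aside: the v2 callbacks derive the per-section chunk size from ecc.bytes, with 9 ECC bytes per step mapping to 16-byte OOB chunks and 18 bytes to 26-byte chunks, and ECC always starting 7 bytes into a chunk. Worked example for section 1 of an 8-bit-ECC device (stepsize 26): ECC occupies bytes 33-50 and the free area bytes 26-32, which a sketch like this would report:

    static void mxc_v2_layout_example(struct nand_chip *chip)
    {
            int stepsize = chip->ecc.bytes == 9 ? 16 : 26;
            int section = 1;

            pr_info("section %d: ecc %d-%d, free %d-%d\n", section,
                    section * stepsize + 7,
                    section * stepsize + 7 + chip->ecc.bytes - 1,
                    section * stepsize,
                    section * stepsize + 6);
    }
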
+
 /*
  * v2 and v3 type controllers can do 4bit or 8bit ecc depending
  * on how much oob the nand chip has. For 8bit ecc we need at least
@@ -959,23 +984,6 @@ static int get_eccsize(struct mtd_info *mtd)
                return 8;
 }
 
-static void ecc_8bit_layout_4k(struct nand_ecclayout *layout)
-{
-       int i, j;
-
-       layout->eccbytes = 8*18;
-       for (i = 0; i < 8; i++)
-               for (j = 0; j < 18; j++)
-                       layout->eccpos[i*18 + j] = i*26 + j + 7;
-
-       layout->oobfree[0].offset = 2;
-       layout->oobfree[0].length = 4;
-       for (i = 1; i < 8; i++) {
-               layout->oobfree[i].offset = i*26;
-               layout->oobfree[i].length = 7;
-       }
-}
-
 static void preset_v1(struct mtd_info *mtd)
 {
        struct nand_chip *nand_chip = mtd_to_nand(mtd);
@@ -1269,9 +1277,7 @@ static const struct mxc_nand_devtype_data imx21_nand_devtype_data = {
        .check_int = check_int_v1_v2,
        .irq_control = irq_control_v1_v2,
        .get_ecc_status = get_ecc_status_v1,
-       .ecclayout_512 = &nandv1_hw_eccoob_smallpage,
-       .ecclayout_2k = &nandv1_hw_eccoob_largepage,
-       .ecclayout_4k = &nandv1_hw_eccoob_smallpage, /* XXX: needs fix */
+       .ooblayout = &mxc_v1_ooblayout_ops,
        .select_chip = mxc_nand_select_chip_v1_v3,
        .correct_data = mxc_nand_correct_data_v1,
        .irqpending_quirk = 1,
@@ -1294,9 +1300,7 @@ static const struct mxc_nand_devtype_data imx27_nand_devtype_data = {
        .check_int = check_int_v1_v2,
        .irq_control = irq_control_v1_v2,
        .get_ecc_status = get_ecc_status_v1,
-       .ecclayout_512 = &nandv1_hw_eccoob_smallpage,
-       .ecclayout_2k = &nandv1_hw_eccoob_largepage,
-       .ecclayout_4k = &nandv1_hw_eccoob_smallpage, /* XXX: needs fix */
+       .ooblayout = &mxc_v1_ooblayout_ops,
        .select_chip = mxc_nand_select_chip_v1_v3,
        .correct_data = mxc_nand_correct_data_v1,
        .irqpending_quirk = 0,
@@ -1320,9 +1324,7 @@ static const struct mxc_nand_devtype_data imx25_nand_devtype_data = {
        .check_int = check_int_v1_v2,
        .irq_control = irq_control_v1_v2,
        .get_ecc_status = get_ecc_status_v2,
-       .ecclayout_512 = &nandv2_hw_eccoob_smallpage,
-       .ecclayout_2k = &nandv2_hw_eccoob_largepage,
-       .ecclayout_4k = &nandv2_hw_eccoob_4k,
+       .ooblayout = &mxc_v2_ooblayout_ops,
        .select_chip = mxc_nand_select_chip_v2,
        .correct_data = mxc_nand_correct_data_v2_v3,
        .irqpending_quirk = 0,
@@ -1346,9 +1348,7 @@ static const struct mxc_nand_devtype_data imx51_nand_devtype_data = {
        .check_int = check_int_v3,
        .irq_control = irq_control_v3,
        .get_ecc_status = get_ecc_status_v3,
-       .ecclayout_512 = &nandv2_hw_eccoob_smallpage,
-       .ecclayout_2k = &nandv2_hw_eccoob_largepage,
-       .ecclayout_4k = &nandv2_hw_eccoob_smallpage, /* XXX: needs fix */
+       .ooblayout = &mxc_v2_ooblayout_ops,
        .select_chip = mxc_nand_select_chip_v1_v3,
        .correct_data = mxc_nand_correct_data_v2_v3,
        .irqpending_quirk = 0,
@@ -1373,9 +1373,7 @@ static const struct mxc_nand_devtype_data imx53_nand_devtype_data = {
        .check_int = check_int_v3,
        .irq_control = irq_control_v3,
        .get_ecc_status = get_ecc_status_v3,
-       .ecclayout_512 = &nandv2_hw_eccoob_smallpage,
-       .ecclayout_2k = &nandv2_hw_eccoob_largepage,
-       .ecclayout_4k = &nandv2_hw_eccoob_smallpage, /* XXX: needs fix */
+       .ooblayout = &mxc_v2_ooblayout_ops,
        .select_chip = mxc_nand_select_chip_v1_v3,
        .correct_data = mxc_nand_correct_data_v2_v3,
        .irqpending_quirk = 0,
@@ -1461,25 +1459,12 @@ MODULE_DEVICE_TABLE(of, mxcnd_dt_ids);
 static int __init mxcnd_probe_dt(struct mxc_nand_host *host)
 {
        struct device_node *np = host->dev->of_node;
-       struct mxc_nand_platform_data *pdata = &host->pdata;
        const struct of_device_id *of_id =
                of_match_device(mxcnd_dt_ids, host->dev);
-       int buswidth;
 
        if (!np)
                return 1;
 
-       if (of_get_nand_ecc_mode(np) >= 0)
-               pdata->hw_ecc = 1;
-
-       pdata->flash_bbt = of_get_nand_on_flash_bbt(np);
-
-       buswidth = of_get_nand_bus_width(np);
-       if (buswidth < 0)
-               return buswidth;
-
-       pdata->width = buswidth / 8;
-
        host->devtype_data = of_id->data;
 
        return 0;
@@ -1576,27 +1561,22 @@ static int mxcnd_probe(struct platform_device *pdev)
 
        this->select_chip = host->devtype_data->select_chip;
        this->ecc.size = 512;
-       this->ecc.layout = host->devtype_data->ecclayout_512;
+       mtd_set_ooblayout(mtd, host->devtype_data->ooblayout);
 
        if (host->pdata.hw_ecc) {
-               this->ecc.calculate = mxc_nand_calculate_ecc;
-               this->ecc.hwctl = mxc_nand_enable_hwecc;
-               this->ecc.correct = host->devtype_data->correct_data;
                this->ecc.mode = NAND_ECC_HW;
        } else {
                this->ecc.mode = NAND_ECC_SOFT;
+               this->ecc.algo = NAND_ECC_HAMMING;
        }
 
        /* NAND bus width determines access functions used by upper layer */
        if (host->pdata.width == 2)
                this->options |= NAND_BUSWIDTH_16;
 
-       if (host->pdata.flash_bbt) {
-               this->bbt_td = &bbt_main_descr;
-               this->bbt_md = &bbt_mirror_descr;
-               /* update flash based bbt */
+       /* update flash based bbt */
+       if (host->pdata.flash_bbt)
                this->bbt_options |= NAND_BBT_USE_FLASH;
-       }
 
        init_completion(&host->op_completion);
 
@@ -1637,6 +1617,26 @@ static int mxcnd_probe(struct platform_device *pdev)
                goto escan;
        }
 
+       switch (this->ecc.mode) {
+       case NAND_ECC_HW:
+               this->ecc.calculate = mxc_nand_calculate_ecc;
+               this->ecc.hwctl = mxc_nand_enable_hwecc;
+               this->ecc.correct = host->devtype_data->correct_data;
+               break;
+
+       case NAND_ECC_SOFT:
+               break;
+
+       default:
+               err = -EINVAL;
+               goto escan;
+       }
+
+       if (this->bbt_options & NAND_BBT_USE_FLASH) {
+               this->bbt_td = &bbt_main_descr;
+               this->bbt_md = &bbt_mirror_descr;
+       }
+
        /* allocate the right size buffer now */
        devm_kfree(&pdev->dev, (void *)host->data_buf);
        host->data_buf = devm_kzalloc(&pdev->dev, mtd->writesize + mtd->oobsize,
@@ -1649,12 +1649,11 @@ static int mxcnd_probe(struct platform_device *pdev)
        /* Call preset again, with correct writesize this time */
        host->devtype_data->preset(mtd);
 
-       if (mtd->writesize == 2048)
-               this->ecc.layout = host->devtype_data->ecclayout_2k;
-       else if (mtd->writesize == 4096) {
-               this->ecc.layout = host->devtype_data->ecclayout_4k;
-               if (get_eccsize(mtd) == 8)
-                       ecc_8bit_layout_4k(this->ecc.layout);
+       if (!this->ecc.bytes) {
+               if (host->eccsize == 8)
+                       this->ecc.bytes = 18;
+               else if (host->eccsize == 4)
+                       this->ecc.bytes = 9;
        }
 
        /*
index ba4f603..0b0dc29 100644
 #include <linux/bitops.h>
 #include <linux/io.h>
 #include <linux/mtd/partitions.h>
-#include <linux/of_mtd.h>
+#include <linux/of.h>
+
+static int nand_get_device(struct mtd_info *mtd, int new_state);
+
+static int nand_do_write_oob(struct mtd_info *mtd, loff_t to,
+                            struct mtd_oob_ops *ops);
 
 /* Define default oob placement schemes for large and small page devices */
-static struct nand_ecclayout nand_oob_8 = {
-       .eccbytes = 3,
-       .eccpos = {0, 1, 2},
-       .oobfree = {
-               {.offset = 3,
-                .length = 2},
-               {.offset = 6,
-                .length = 2} }
-};
+static int nand_ooblayout_ecc_sp(struct mtd_info *mtd, int section,
+                                struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct nand_ecc_ctrl *ecc = &chip->ecc;
 
-static struct nand_ecclayout nand_oob_16 = {
-       .eccbytes = 6,
-       .eccpos = {0, 1, 2, 3, 6, 7},
-       .oobfree = {
-               {.offset = 8,
-                . length = 8} }
-};
+       if (section > 1)
+               return -ERANGE;
 
-static struct nand_ecclayout nand_oob_64 = {
-       .eccbytes = 24,
-       .eccpos = {
-                  40, 41, 42, 43, 44, 45, 46, 47,
-                  48, 49, 50, 51, 52, 53, 54, 55,
-                  56, 57, 58, 59, 60, 61, 62, 63},
-       .oobfree = {
-               {.offset = 2,
-                .length = 38} }
-};
+       if (!section) {
+               oobregion->offset = 0;
+               oobregion->length = 4;
+       } else {
+               oobregion->offset = 6;
+               oobregion->length = ecc->total - 4;
+       }
+
+       return 0;
+}
+
+static int nand_ooblayout_free_sp(struct mtd_info *mtd, int section,
+                                 struct mtd_oob_region *oobregion)
+{
+       if (section > 1)
+               return -ERANGE;
+
+       if (mtd->oobsize == 16) {
+               if (section)
+                       return -ERANGE;
+
+               oobregion->length = 8;
+               oobregion->offset = 8;
+       } else {
+               oobregion->length = 2;
+               if (!section)
+                       oobregion->offset = 3;
+               else
+                       oobregion->offset = 6;
+       }
+
+       return 0;
+}
 
-static struct nand_ecclayout nand_oob_128 = {
-       .eccbytes = 48,
-       .eccpos = {
-                  80, 81, 82, 83, 84, 85, 86, 87,
-                  88, 89, 90, 91, 92, 93, 94, 95,
-                  96, 97, 98, 99, 100, 101, 102, 103,
-                  104, 105, 106, 107, 108, 109, 110, 111,
-                  112, 113, 114, 115, 116, 117, 118, 119,
-                  120, 121, 122, 123, 124, 125, 126, 127},
-       .oobfree = {
-               {.offset = 2,
-                .length = 78} }
+const struct mtd_ooblayout_ops nand_ooblayout_sp_ops = {
+       .ecc = nand_ooblayout_ecc_sp,
+       .free = nand_ooblayout_free_sp,
 };
+EXPORT_SYMBOL_GPL(nand_ooblayout_sp_ops);
 
-static int nand_get_device(struct mtd_info *mtd, int new_state);
+static int nand_ooblayout_ecc_lp(struct mtd_info *mtd, int section,
+                                struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct nand_ecc_ctrl *ecc = &chip->ecc;
 
-static int nand_do_write_oob(struct mtd_info *mtd, loff_t to,
-                            struct mtd_oob_ops *ops);
+       if (section)
+               return -ERANGE;
+
+       oobregion->length = ecc->total;
+       oobregion->offset = mtd->oobsize - oobregion->length;
+
+       return 0;
+}
+
+static int nand_ooblayout_free_lp(struct mtd_info *mtd, int section,
+                                 struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct nand_ecc_ctrl *ecc = &chip->ecc;
+
+       if (section)
+               return -ERANGE;
+
+       oobregion->length = mtd->oobsize - ecc->total - 2;
+       oobregion->offset = 2;
+
+       return 0;
+}
+
+const struct mtd_ooblayout_ops nand_ooblayout_lp_ops = {
+       .ecc = nand_ooblayout_ecc_lp,
+       .free = nand_ooblayout_free_lp,
+};
+EXPORT_SYMBOL_GPL(nand_ooblayout_lp_ops);
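
Illustrative aside: the default sp/lp ops above are consumed through core helpers such as mtd_ooblayout_get_eccbytes(), used by the read/write paths further down. A conceptual, simplified sketch of the assumed semantics (the real helper lives in the MTD core and handles partial regions more carefully):

    static int get_eccbytes_sketch(struct mtd_info *mtd, u8 *eccbuf,
                                   const u8 *oobbuf, int start, int nbytes)
    {
            struct mtd_oob_region region;
            int section = 0, cnt, ret;

            while (nbytes) {
                    ret = mtd_ooblayout_ecc(mtd, section++, &region);
                    if (ret)
                            return ret;

                    /* Skip regions fully covered by the start offset */
                    if (start >= region.length) {
                            start -= region.length;
                            continue;
                    }

                    cnt = min_t(int, nbytes, region.length - start);
                    memcpy(eccbuf, oobbuf + region.offset + start, cnt);
                    eccbuf += cnt;
                    nbytes -= cnt;
                    start = 0;
            }

            return 0;
    }
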
 
 static int check_offs_len(struct mtd_info *mtd,
                                        loff_t ofs, uint64_t len)
@@ -1279,13 +1321,12 @@ static int nand_read_page_raw_syndrome(struct mtd_info *mtd,
 static int nand_read_page_swecc(struct mtd_info *mtd, struct nand_chip *chip,
                                uint8_t *buf, int oob_required, int page)
 {
-       int i, eccsize = chip->ecc.size;
+       int i, eccsize = chip->ecc.size, ret;
        int eccbytes = chip->ecc.bytes;
        int eccsteps = chip->ecc.steps;
        uint8_t *p = buf;
        uint8_t *ecc_calc = chip->buffers->ecccalc;
        uint8_t *ecc_code = chip->buffers->ecccode;
-       uint32_t *eccpos = chip->ecc.layout->eccpos;
        unsigned int max_bitflips = 0;
 
        chip->ecc.read_page_raw(mtd, chip, buf, 1, page);
@@ -1293,8 +1334,10 @@ static int nand_read_page_swecc(struct mtd_info *mtd, struct nand_chip *chip,
        for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize)
                chip->ecc.calculate(mtd, p, &ecc_calc[i]);
 
-       for (i = 0; i < chip->ecc.total; i++)
-               ecc_code[i] = chip->oob_poi[eccpos[i]];
+       ret = mtd_ooblayout_get_eccbytes(mtd, ecc_code, chip->oob_poi, 0,
+                                        chip->ecc.total);
+       if (ret)
+               return ret;
 
        eccsteps = chip->ecc.steps;
        p = buf;
@@ -1326,14 +1369,14 @@ static int nand_read_subpage(struct mtd_info *mtd, struct nand_chip *chip,
                        uint32_t data_offs, uint32_t readlen, uint8_t *bufpoi,
                        int page)
 {
-       int start_step, end_step, num_steps;
-       uint32_t *eccpos = chip->ecc.layout->eccpos;
+       int start_step, end_step, num_steps, ret;
        uint8_t *p;
        int data_col_addr, i, gaps = 0;
        int datafrag_len, eccfrag_len, aligned_len, aligned_pos;
        int busw = (chip->options & NAND_BUSWIDTH_16) ? 2 : 1;
-       int index;
+       int index, section = 0;
        unsigned int max_bitflips = 0;
+       struct mtd_oob_region oobregion = { };
 
        /* Column address within the page aligned to ECC size (256bytes) */
        start_step = data_offs / chip->ecc.size;
@@ -1361,12 +1404,13 @@ static int nand_read_subpage(struct mtd_info *mtd, struct nand_chip *chip,
         * The performance is faster if we position offsets according to
         * ecc.pos. Let's make sure that there are no gaps in ECC positions.
         */
-       for (i = 0; i < eccfrag_len - 1; i++) {
-               if (eccpos[i + index] + 1 != eccpos[i + index + 1]) {
-                       gaps = 1;
-                       break;
-               }
-       }
+       ret = mtd_ooblayout_find_eccregion(mtd, index, &section, &oobregion);
+       if (ret)
+               return ret;
+
+       if (oobregion.length < eccfrag_len)
+               gaps = 1;
+
        if (gaps) {
                chip->cmdfunc(mtd, NAND_CMD_RNDOUT, mtd->writesize, -1);
                chip->read_buf(mtd, chip->oob_poi, mtd->oobsize);
@@ -1375,20 +1419,23 @@ static int nand_read_subpage(struct mtd_info *mtd, struct nand_chip *chip,
                 * Send the command to read the particular ECC bytes take care
                 * about buswidth alignment in read_buf.
                 */
-               aligned_pos = eccpos[index] & ~(busw - 1);
+               aligned_pos = oobregion.offset & ~(busw - 1);
                aligned_len = eccfrag_len;
-               if (eccpos[index] & (busw - 1))
+               if (oobregion.offset & (busw - 1))
                        aligned_len++;
-               if (eccpos[index + (num_steps * chip->ecc.bytes)] & (busw - 1))
+               if ((oobregion.offset + (num_steps * chip->ecc.bytes)) &
+                   (busw - 1))
                        aligned_len++;
 
                chip->cmdfunc(mtd, NAND_CMD_RNDOUT,
-                                       mtd->writesize + aligned_pos, -1);
+                             mtd->writesize + aligned_pos, -1);
                chip->read_buf(mtd, &chip->oob_poi[aligned_pos], aligned_len);
        }
 
-       for (i = 0; i < eccfrag_len; i++)
-               chip->buffers->ecccode[i] = chip->oob_poi[eccpos[i + index]];
+       ret = mtd_ooblayout_get_eccbytes(mtd, chip->buffers->ecccode,
+                                        chip->oob_poi, index, eccfrag_len);
+       if (ret)
+               return ret;
 
        p = bufpoi + data_col_addr;
        for (i = 0; i < eccfrag_len ; i += chip->ecc.bytes, p += chip->ecc.size) {
@@ -1429,13 +1476,12 @@ static int nand_read_subpage(struct mtd_info *mtd, struct nand_chip *chip,
 static int nand_read_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip,
                                uint8_t *buf, int oob_required, int page)
 {
-       int i, eccsize = chip->ecc.size;
+       int i, eccsize = chip->ecc.size, ret;
        int eccbytes = chip->ecc.bytes;
        int eccsteps = chip->ecc.steps;
        uint8_t *p = buf;
        uint8_t *ecc_calc = chip->buffers->ecccalc;
        uint8_t *ecc_code = chip->buffers->ecccode;
-       uint32_t *eccpos = chip->ecc.layout->eccpos;
        unsigned int max_bitflips = 0;
 
        for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize) {
@@ -1445,8 +1491,10 @@ static int nand_read_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip,
        }
        chip->read_buf(mtd, chip->oob_poi, mtd->oobsize);
 
-       for (i = 0; i < chip->ecc.total; i++)
-               ecc_code[i] = chip->oob_poi[eccpos[i]];
+       ret = mtd_ooblayout_get_eccbytes(mtd, ecc_code, chip->oob_poi, 0,
+                                        chip->ecc.total);
+       if (ret)
+               return ret;
 
        eccsteps = chip->ecc.steps;
        p = buf;
@@ -1491,12 +1539,11 @@ static int nand_read_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip,
 static int nand_read_page_hwecc_oob_first(struct mtd_info *mtd,
        struct nand_chip *chip, uint8_t *buf, int oob_required, int page)
 {
-       int i, eccsize = chip->ecc.size;
+       int i, eccsize = chip->ecc.size, ret;
        int eccbytes = chip->ecc.bytes;
        int eccsteps = chip->ecc.steps;
        uint8_t *p = buf;
        uint8_t *ecc_code = chip->buffers->ecccode;
-       uint32_t *eccpos = chip->ecc.layout->eccpos;
        uint8_t *ecc_calc = chip->buffers->ecccalc;
        unsigned int max_bitflips = 0;
 
@@ -1505,8 +1552,10 @@ static int nand_read_page_hwecc_oob_first(struct mtd_info *mtd,
        chip->read_buf(mtd, chip->oob_poi, mtd->oobsize);
        chip->cmdfunc(mtd, NAND_CMD_READ0, 0, page);
 
-       for (i = 0; i < chip->ecc.total; i++)
-               ecc_code[i] = chip->oob_poi[eccpos[i]];
+       ret = mtd_ooblayout_get_eccbytes(mtd, ecc_code, chip->oob_poi, 0,
+                                        chip->ecc.total);
+       if (ret)
+               return ret;
 
        for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize) {
                int stat;
@@ -1607,14 +1656,17 @@ static int nand_read_page_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
 
 /**
  * nand_transfer_oob - [INTERN] Transfer oob to client buffer
- * @chip: nand chip structure
+ * @mtd: mtd info structure
  * @oob: oob destination address
  * @ops: oob ops structure
  * @len: size of oob to transfer
  */
-static uint8_t *nand_transfer_oob(struct nand_chip *chip, uint8_t *oob,
+static uint8_t *nand_transfer_oob(struct mtd_info *mtd, uint8_t *oob,
                                  struct mtd_oob_ops *ops, size_t len)
 {
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       int ret;
+
        switch (ops->mode) {
 
        case MTD_OPS_PLACE_OOB:
@@ -1622,31 +1674,12 @@ static uint8_t *nand_transfer_oob(struct nand_chip *chip, uint8_t *oob,
                memcpy(oob, chip->oob_poi + ops->ooboffs, len);
                return oob + len;
 
-       case MTD_OPS_AUTO_OOB: {
-               struct nand_oobfree *free = chip->ecc.layout->oobfree;
-               uint32_t boffs = 0, roffs = ops->ooboffs;
-               size_t bytes = 0;
-
-               for (; free->length && len; free++, len -= bytes) {
-                       /* Read request not from offset 0? */
-                       if (unlikely(roffs)) {
-                               if (roffs >= free->length) {
-                                       roffs -= free->length;
-                                       continue;
-                               }
-                               boffs = free->offset + roffs;
-                               bytes = min_t(size_t, len,
-                                             (free->length - roffs));
-                               roffs = 0;
-                       } else {
-                               bytes = min_t(size_t, len, free->length);
-                               boffs = free->offset;
-                       }
-                       memcpy(oob, chip->oob_poi + boffs, bytes);
-                       oob += bytes;
-               }
-               return oob;
-       }
+       case MTD_OPS_AUTO_OOB:
+               ret = mtd_ooblayout_get_databytes(mtd, oob, chip->oob_poi,
+                                                 ops->ooboffs, len);
+               BUG_ON(ret);
+               return oob + len;
+
        default:
                BUG();
        }
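
Illustrative aside: with the open-coded free-region walk replaced by mtd_ooblayout_get_databytes(), the user-visible behaviour of MTD_OPS_AUTO_OOB stays the same: callers receive the free bytes packed together, wherever the layout places them. A minimal usage sketch (read_free_oob is a made-up wrapper):

    static int read_free_oob(struct mtd_info *mtd, loff_t from, u8 *buf, size_t len)
    {
            struct mtd_oob_ops ops = {
                    .mode   = MTD_OPS_AUTO_OOB,
                    .oobbuf = buf,
                    .ooblen = len,
            };

            return mtd_read_oob(mtd, from, &ops);
    }
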
@@ -1780,7 +1813,7 @@ read_retry:
                                int toread = min(oobreadlen, max_oobsize);
 
                                if (toread) {
-                                       oob = nand_transfer_oob(chip,
+                                       oob = nand_transfer_oob(mtd,
                                                oob, ops, toread);
                                        oobreadlen -= toread;
                                }
@@ -1893,13 +1926,13 @@ static int nand_read(struct mtd_info *mtd, loff_t from, size_t len,
  * @chip: nand chip info structure
  * @page: page number to read
  */
-static int nand_read_oob_std(struct mtd_info *mtd, struct nand_chip *chip,
-                            int page)
+int nand_read_oob_std(struct mtd_info *mtd, struct nand_chip *chip, int page)
 {
        chip->cmdfunc(mtd, NAND_CMD_READOOB, 0, page);
        chip->read_buf(mtd, chip->oob_poi, mtd->oobsize);
        return 0;
 }
+EXPORT_SYMBOL(nand_read_oob_std);
 
 /**
  * nand_read_oob_syndrome - [REPLACEABLE] OOB data read function for HW ECC
@@ -1908,8 +1941,8 @@ static int nand_read_oob_std(struct mtd_info *mtd, struct nand_chip *chip,
  * @chip: nand chip info structure
  * @page: page number to read
  */
-static int nand_read_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
-                                 int page)
+int nand_read_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
+                          int page)
 {
        int length = mtd->oobsize;
        int chunk = chip->ecc.bytes + chip->ecc.prepad + chip->ecc.postpad;
@@ -1937,6 +1970,7 @@ static int nand_read_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
 
        return 0;
 }
+EXPORT_SYMBOL(nand_read_oob_syndrome);
 
 /**
  * nand_write_oob_std - [REPLACEABLE] the most common OOB data write function
@@ -1944,8 +1978,7 @@ static int nand_read_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
  * @chip: nand chip info structure
  * @page: page number to write
  */
-static int nand_write_oob_std(struct mtd_info *mtd, struct nand_chip *chip,
-                             int page)
+int nand_write_oob_std(struct mtd_info *mtd, struct nand_chip *chip, int page)
 {
        int status = 0;
        const uint8_t *buf = chip->oob_poi;
@@ -1960,6 +1993,7 @@ static int nand_write_oob_std(struct mtd_info *mtd, struct nand_chip *chip,
 
        return status & NAND_STATUS_FAIL ? -EIO : 0;
 }
+EXPORT_SYMBOL(nand_write_oob_std);
 
 /**
  * nand_write_oob_syndrome - [REPLACEABLE] OOB data write function for HW ECC
@@ -1968,8 +2002,8 @@ static int nand_write_oob_std(struct mtd_info *mtd, struct nand_chip *chip,
  * @chip: nand chip info structure
  * @page: page number to write
  */
-static int nand_write_oob_syndrome(struct mtd_info *mtd,
-                                  struct nand_chip *chip, int page)
+int nand_write_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
+                           int page)
 {
        int chunk = chip->ecc.bytes + chip->ecc.prepad + chip->ecc.postpad;
        int eccsize = chip->ecc.size, length = mtd->oobsize;
@@ -2019,6 +2053,7 @@ static int nand_write_oob_syndrome(struct mtd_info *mtd,
 
        return status & NAND_STATUS_FAIL ? -EIO : 0;
 }
+EXPORT_SYMBOL(nand_write_oob_syndrome);
 
 /**
  * nand_do_read_oob - [INTERN] NAND read out-of-band
@@ -2078,7 +2113,7 @@ static int nand_do_read_oob(struct mtd_info *mtd, loff_t from,
                        break;
 
                len = min(len, readlen);
-               buf = nand_transfer_oob(chip, buf, ops, len);
+               buf = nand_transfer_oob(mtd, buf, ops, len);
 
                if (chip->options & NAND_NEED_READRDY) {
                        /* Apply delay or wait for ready/busy pin */
@@ -2237,19 +2272,20 @@ static int nand_write_page_swecc(struct mtd_info *mtd, struct nand_chip *chip,
                                 const uint8_t *buf, int oob_required,
                                 int page)
 {
-       int i, eccsize = chip->ecc.size;
+       int i, eccsize = chip->ecc.size, ret;
        int eccbytes = chip->ecc.bytes;
        int eccsteps = chip->ecc.steps;
        uint8_t *ecc_calc = chip->buffers->ecccalc;
        const uint8_t *p = buf;
-       uint32_t *eccpos = chip->ecc.layout->eccpos;
 
        /* Software ECC calculation */
        for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize)
                chip->ecc.calculate(mtd, p, &ecc_calc[i]);
 
-       for (i = 0; i < chip->ecc.total; i++)
-               chip->oob_poi[eccpos[i]] = ecc_calc[i];
+       ret = mtd_ooblayout_set_eccbytes(mtd, ecc_calc, chip->oob_poi, 0,
+                                        chip->ecc.total);
+       if (ret)
+               return ret;
 
        return chip->ecc.write_page_raw(mtd, chip, buf, 1, page);
 }
@@ -2266,12 +2302,11 @@ static int nand_write_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip,
                                  const uint8_t *buf, int oob_required,
                                  int page)
 {
-       int i, eccsize = chip->ecc.size;
+       int i, eccsize = chip->ecc.size, ret;
        int eccbytes = chip->ecc.bytes;
        int eccsteps = chip->ecc.steps;
        uint8_t *ecc_calc = chip->buffers->ecccalc;
        const uint8_t *p = buf;
-       uint32_t *eccpos = chip->ecc.layout->eccpos;
 
        for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize) {
                chip->ecc.hwctl(mtd, NAND_ECC_WRITE);
@@ -2279,8 +2314,10 @@ static int nand_write_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip,
                chip->ecc.calculate(mtd, p, &ecc_calc[i]);
        }
 
-       for (i = 0; i < chip->ecc.total; i++)
-               chip->oob_poi[eccpos[i]] = ecc_calc[i];
+       ret = mtd_ooblayout_set_eccbytes(mtd, ecc_calc, chip->oob_poi, 0,
+                                        chip->ecc.total);
+       if (ret)
+               return ret;
 
        chip->write_buf(mtd, chip->oob_poi, mtd->oobsize);
 
@@ -2308,11 +2345,10 @@ static int nand_write_subpage_hwecc(struct mtd_info *mtd,
        int ecc_size      = chip->ecc.size;
        int ecc_bytes     = chip->ecc.bytes;
        int ecc_steps     = chip->ecc.steps;
-       uint32_t *eccpos  = chip->ecc.layout->eccpos;
        uint32_t start_step = offset / ecc_size;
        uint32_t end_step   = (offset + data_len - 1) / ecc_size;
        int oob_bytes       = mtd->oobsize / ecc_steps;
-       int step, i;
+       int step, ret;
 
        for (step = 0; step < ecc_steps; step++) {
                /* configure controller for WRITE access */
@@ -2340,8 +2376,10 @@ static int nand_write_subpage_hwecc(struct mtd_info *mtd,
        /* copy calculated ECC for whole page to chip->buffer->oob */
        /* this include masked-value(0xFF) for unwritten subpages */
        ecc_calc = chip->buffers->ecccalc;
-       for (i = 0; i < chip->ecc.total; i++)
-               chip->oob_poi[eccpos[i]] = ecc_calc[i];
+       ret = mtd_ooblayout_set_eccbytes(mtd, ecc_calc, chip->oob_poi, 0,
+                                        chip->ecc.total);
+       if (ret)
+               return ret;
 
        /* write OOB buffer to NAND device */
        chip->write_buf(mtd, chip->oob_poi, mtd->oobsize);
@@ -2478,6 +2516,7 @@ static uint8_t *nand_fill_oob(struct mtd_info *mtd, uint8_t *oob, size_t len,
                              struct mtd_oob_ops *ops)
 {
        struct nand_chip *chip = mtd_to_nand(mtd);
+       int ret;
 
        /*
         * Initialise to all 0xFF, to avoid the possibility of left over OOB
@@ -2492,31 +2531,12 @@ static uint8_t *nand_fill_oob(struct mtd_info *mtd, uint8_t *oob, size_t len,
                memcpy(chip->oob_poi + ops->ooboffs, oob, len);
                return oob + len;
 
-       case MTD_OPS_AUTO_OOB: {
-               struct nand_oobfree *free = chip->ecc.layout->oobfree;
-               uint32_t boffs = 0, woffs = ops->ooboffs;
-               size_t bytes = 0;
-
-               for (; free->length && len; free++, len -= bytes) {
-                       /* Write request not from offset 0? */
-                       if (unlikely(woffs)) {
-                               if (woffs >= free->length) {
-                                       woffs -= free->length;
-                                       continue;
-                               }
-                               boffs = free->offset + woffs;
-                               bytes = min_t(size_t, len,
-                                             (free->length - woffs));
-                               woffs = 0;
-                       } else {
-                               bytes = min_t(size_t, len, free->length);
-                               boffs = free->offset;
-                       }
-                       memcpy(chip->oob_poi + boffs, oob, bytes);
-                       oob += bytes;
-               }
-               return oob;
-       }
+       case MTD_OPS_AUTO_OOB:
+               ret = mtd_ooblayout_set_databytes(mtd, oob, chip->oob_poi,
+                                                 ops->ooboffs, len);
+               BUG_ON(ret);
+               return oob + len;
+
        default:
                BUG();
        }
@@ -3951,10 +3971,115 @@ ident_done:
        return type;
 }
 
+static const char * const nand_ecc_modes[] = {
+       [NAND_ECC_NONE]         = "none",
+       [NAND_ECC_SOFT]         = "soft",
+       [NAND_ECC_HW]           = "hw",
+       [NAND_ECC_HW_SYNDROME]  = "hw_syndrome",
+       [NAND_ECC_HW_OOB_FIRST] = "hw_oob_first",
+};
+
+static int of_get_nand_ecc_mode(struct device_node *np)
+{
+       const char *pm;
+       int err, i;
+
+       err = of_property_read_string(np, "nand-ecc-mode", &pm);
+       if (err < 0)
+               return err;
+
+       for (i = 0; i < ARRAY_SIZE(nand_ecc_modes); i++)
+               if (!strcasecmp(pm, nand_ecc_modes[i]))
+                       return i;
+
+       /*
+        * For backward compatibility we support a few obsolete values that don't
+        * have their mappings into nand_ecc_modes_t anymore (they were merged
+        * with other enums).
+        */
+       if (!strcasecmp(pm, "soft_bch"))
+               return NAND_ECC_SOFT;
+
+       return -ENODEV;
+}
+
+static const char * const nand_ecc_algos[] = {
+       [NAND_ECC_HAMMING]      = "hamming",
+       [NAND_ECC_BCH]          = "bch",
+};
+
+static int of_get_nand_ecc_algo(struct device_node *np)
+{
+       const char *pm;
+       int err, i;
+
+       err = of_property_read_string(np, "nand-ecc-algo", &pm);
+       if (!err) {
+               for (i = NAND_ECC_HAMMING; i < ARRAY_SIZE(nand_ecc_algos); i++)
+                       if (!strcasecmp(pm, nand_ecc_algos[i]))
+                               return i;
+               return -ENODEV;
+       }
+
+       /*
+        * For backward compatibility we also read "nand-ecc-mode" checking
+        * for some obsoleted values that were specifying ECC algorithm.
+        */
+       err = of_property_read_string(np, "nand-ecc-mode", &pm);
+       if (err < 0)
+               return err;
+
+       if (!strcasecmp(pm, "soft"))
+               return NAND_ECC_HAMMING;
+       else if (!strcasecmp(pm, "soft_bch"))
+               return NAND_ECC_BCH;
+
+       return -ENODEV;
+}
+
+static int of_get_nand_ecc_step_size(struct device_node *np)
+{
+       int ret;
+       u32 val;
+
+       ret = of_property_read_u32(np, "nand-ecc-step-size", &val);
+       return ret ? ret : val;
+}
+
+static int of_get_nand_ecc_strength(struct device_node *np)
+{
+       int ret;
+       u32 val;
+
+       ret = of_property_read_u32(np, "nand-ecc-strength", &val);
+       return ret ? ret : val;
+}
+
+static int of_get_nand_bus_width(struct device_node *np)
+{
+       u32 val;
+
+       if (of_property_read_u32(np, "nand-bus-width", &val))
+               return 8;
+
+       switch (val) {
+       case 8:
+       case 16:
+               return val;
+       default:
+               return -EIO;
+       }
+}
+
+static bool of_get_nand_on_flash_bbt(struct device_node *np)
+{
+       return of_property_read_bool(np, "nand-on-flash-bbt");
+}
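
Illustrative aside: with these of_get_nand_* helpers now private to nand_base, controller drivers are expected to read the parsed results from the nand_chip after nand_scan_ident() rather than parsing the DT themselves, as the hisi and mxc hunks above already do. A hypothetical driver-side check (check_parsed_ecc is a made-up name; it assumes the corresponding properties are present in the DT):

    static int check_parsed_ecc(struct nand_chip *chip)
    {
            /* Filled in from nand-ecc-step-size / nand-ecc-strength */
            if (chip->ecc.size != 1024 || chip->ecc.strength < 8)
                    return -EINVAL;

            /* Set when nand-bus-width = <16> was present */
            if (chip->options & NAND_BUSWIDTH_16)
                    pr_info("16-bit NAND bus configured from DT\n");

            return 0;
    }
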
+
 static int nand_dt_init(struct nand_chip *chip)
 {
        struct device_node *dn = nand_get_flash_node(chip);
-       int ecc_mode, ecc_strength, ecc_step;
+       int ecc_mode, ecc_algo, ecc_strength, ecc_step;
 
        if (!dn)
                return 0;
@@ -3966,6 +4091,7 @@ static int nand_dt_init(struct nand_chip *chip)
                chip->bbt_options |= NAND_BBT_USE_FLASH;
 
        ecc_mode = of_get_nand_ecc_mode(dn);
+       ecc_algo = of_get_nand_ecc_algo(dn);
        ecc_strength = of_get_nand_ecc_strength(dn);
        ecc_step = of_get_nand_ecc_step_size(dn);
 
@@ -3978,6 +4104,9 @@ static int nand_dt_init(struct nand_chip *chip)
        if (ecc_mode >= 0)
                chip->ecc.mode = ecc_mode;
 
+       if (ecc_algo >= 0)
+               chip->ecc.algo = ecc_algo;
+
        if (ecc_strength >= 0)
                chip->ecc.strength = ecc_strength;
 
@@ -4054,6 +4183,82 @@ int nand_scan_ident(struct mtd_info *mtd, int maxchips,
 }
 EXPORT_SYMBOL(nand_scan_ident);
 
+static int nand_set_ecc_soft_ops(struct mtd_info *mtd)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct nand_ecc_ctrl *ecc = &chip->ecc;
+
+       if (WARN_ON(ecc->mode != NAND_ECC_SOFT))
+               return -EINVAL;
+
+       switch (ecc->algo) {
+       case NAND_ECC_HAMMING:
+               ecc->calculate = nand_calculate_ecc;
+               ecc->correct = nand_correct_data;
+               ecc->read_page = nand_read_page_swecc;
+               ecc->read_subpage = nand_read_subpage;
+               ecc->write_page = nand_write_page_swecc;
+               ecc->read_page_raw = nand_read_page_raw;
+               ecc->write_page_raw = nand_write_page_raw;
+               ecc->read_oob = nand_read_oob_std;
+               ecc->write_oob = nand_write_oob_std;
+               if (!ecc->size)
+                       ecc->size = 256;
+               ecc->bytes = 3;
+               ecc->strength = 1;
+               return 0;
+       case NAND_ECC_BCH:
+               if (!mtd_nand_has_bch()) {
+                       WARN(1, "CONFIG_MTD_NAND_ECC_BCH not enabled\n");
+                       return -EINVAL;
+               }
+               ecc->calculate = nand_bch_calculate_ecc;
+               ecc->correct = nand_bch_correct_data;
+               ecc->read_page = nand_read_page_swecc;
+               ecc->read_subpage = nand_read_subpage;
+               ecc->write_page = nand_write_page_swecc;
+               ecc->read_page_raw = nand_read_page_raw;
+               ecc->write_page_raw = nand_write_page_raw;
+               ecc->read_oob = nand_read_oob_std;
+               ecc->write_oob = nand_write_oob_std;
+       /*
+        * Board driver should supply ecc.size and ecc.strength
+        * values to select how many bits are correctable.
+        * Otherwise, default to 4 bits for large page devices.
+        */
+               if (!ecc->size && (mtd->oobsize >= 64)) {
+                       ecc->size = 512;
+                       ecc->strength = 4;
+               }
+
+               /*
+                * If no ECC placement scheme was provided, pick up the
+                * default large page one.
+                */
+               if (!mtd->ooblayout) {
+                       /* handle large page devices only */
+                       if (mtd->oobsize < 64) {
+                               WARN(1, "OOB layout is required when using software BCH on small pages\n");
+                               return -EINVAL;
+                       }
+
+                       mtd_set_ooblayout(mtd, &nand_ooblayout_lp_ops);
+               }
+
+               /* See nand_bch_init() for details. */
+               ecc->bytes = 0;
+               ecc->priv = nand_bch_init(mtd);
+               if (!ecc->priv) {
+                       WARN(1, "BCH ECC initialization failed!\n");
+                       return -EINVAL;
+               }
+               return 0;
+       default:
+               WARN(1, "Unsupported ECC algorithm!\n");
+               return -EINVAL;
+       }
+}
+
 /*
  * Check if the chip configuration meets the datasheet requirements.
 
@@ -4098,14 +4303,15 @@ static bool nand_ecc_strength_good(struct mtd_info *mtd)
  */
 int nand_scan_tail(struct mtd_info *mtd)
 {
-       int i;
        struct nand_chip *chip = mtd_to_nand(mtd);
        struct nand_ecc_ctrl *ecc = &chip->ecc;
        struct nand_buffers *nbuf;
+       int ret;
 
        /* New bad blocks should be marked in OOB, flash-based BBT, or both */
-       BUG_ON((chip->bbt_options & NAND_BBT_NO_OOB_BBM) &&
-                       !(chip->bbt_options & NAND_BBT_USE_FLASH));
+       if (WARN_ON((chip->bbt_options & NAND_BBT_NO_OOB_BBM) &&
+                  !(chip->bbt_options & NAND_BBT_USE_FLASH)))
+               return -EINVAL;
 
        if (!(chip->options & NAND_OWN_BUFFERS)) {
                nbuf = kzalloc(sizeof(*nbuf) + mtd->writesize
@@ -4128,24 +4334,22 @@ int nand_scan_tail(struct mtd_info *mtd)
        /*
         * If no default placement scheme is given, select an appropriate one.
         */
-       if (!ecc->layout && (ecc->mode != NAND_ECC_SOFT_BCH)) {
+       if (!mtd->ooblayout &&
+           !(ecc->mode == NAND_ECC_SOFT && ecc->algo == NAND_ECC_BCH)) {
                switch (mtd->oobsize) {
                case 8:
-                       ecc->layout = &nand_oob_8;
-                       break;
                case 16:
-                       ecc->layout = &nand_oob_16;
+                       mtd_set_ooblayout(mtd, &nand_ooblayout_sp_ops);
                        break;
                case 64:
-                       ecc->layout = &nand_oob_64;
-                       break;
                case 128:
-                       ecc->layout = &nand_oob_128;
+                       mtd_set_ooblayout(mtd, &nand_ooblayout_lp_ops);
                        break;
                default:
-                       pr_warn("No oob scheme defined for oobsize %d\n",
-                                  mtd->oobsize);
-                       BUG();
+                       WARN(1, "No oob scheme defined for oobsize %d\n",
+                               mtd->oobsize);
+                       ret = -EINVAL;
+                       goto err_free;
                }
        }
 
@@ -4161,8 +4365,9 @@ int nand_scan_tail(struct mtd_info *mtd)
        case NAND_ECC_HW_OOB_FIRST:
                /* Similar to NAND_ECC_HW, but a separate read_page handle */
                if (!ecc->calculate || !ecc->correct || !ecc->hwctl) {
-                       pr_warn("No ECC functions supplied; hardware ECC not possible\n");
-                       BUG();
+                       WARN(1, "No ECC functions supplied; hardware ECC not possible\n");
+                       ret = -EINVAL;
+                       goto err_free;
                }
                if (!ecc->read_page)
                        ecc->read_page = nand_read_page_hwecc_oob_first;
@@ -4192,8 +4397,9 @@ int nand_scan_tail(struct mtd_info *mtd)
                     ecc->read_page == nand_read_page_hwecc ||
                     !ecc->write_page ||
                     ecc->write_page == nand_write_page_hwecc)) {
-                       pr_warn("No ECC functions supplied; hardware ECC not possible\n");
-                       BUG();
+                       WARN(1, "No ECC functions supplied; hardware ECC not possible\n");
+                       ret = -EINVAL;
+                       goto err_free;
                }
                /* Use standard syndrome read/write page function? */
                if (!ecc->read_page)
@@ -4211,61 +4417,22 @@ int nand_scan_tail(struct mtd_info *mtd)
 
                if (mtd->writesize >= ecc->size) {
                        if (!ecc->strength) {
-                               pr_warn("Driver must set ecc.strength when using hardware ECC\n");
-                               BUG();
+                               WARN(1, "Driver must set ecc.strength when using hardware ECC\n");
+                               ret = -EINVAL;
+                               goto err_free;
                        }
                        break;
                }
                pr_warn("%d byte HW ECC not possible on %d byte page size, fallback to SW ECC\n",
                        ecc->size, mtd->writesize);
                ecc->mode = NAND_ECC_SOFT;
+               ecc->algo = NAND_ECC_HAMMING;
 
        case NAND_ECC_SOFT:
-               ecc->calculate = nand_calculate_ecc;
-               ecc->correct = nand_correct_data;
-               ecc->read_page = nand_read_page_swecc;
-               ecc->read_subpage = nand_read_subpage;
-               ecc->write_page = nand_write_page_swecc;
-               ecc->read_page_raw = nand_read_page_raw;
-               ecc->write_page_raw = nand_write_page_raw;
-               ecc->read_oob = nand_read_oob_std;
-               ecc->write_oob = nand_write_oob_std;
-               if (!ecc->size)
-                       ecc->size = 256;
-               ecc->bytes = 3;
-               ecc->strength = 1;
-               break;
-
-       case NAND_ECC_SOFT_BCH:
-               if (!mtd_nand_has_bch()) {
-                       pr_warn("CONFIG_MTD_NAND_ECC_BCH not enabled\n");
-                       BUG();
-               }
-               ecc->calculate = nand_bch_calculate_ecc;
-               ecc->correct = nand_bch_correct_data;
-               ecc->read_page = nand_read_page_swecc;
-               ecc->read_subpage = nand_read_subpage;
-               ecc->write_page = nand_write_page_swecc;
-               ecc->read_page_raw = nand_read_page_raw;
-               ecc->write_page_raw = nand_write_page_raw;
-               ecc->read_oob = nand_read_oob_std;
-               ecc->write_oob = nand_write_oob_std;
-               /*
-                * Board driver should supply ecc.size and ecc.strength values
-                * to select how many bits are correctable. Otherwise, default
-                * to 4 bits for large page devices.
-                */
-               if (!ecc->size && (mtd->oobsize >= 64)) {
-                       ecc->size = 512;
-                       ecc->strength = 4;
-               }
-
-               /* See nand_bch_init() for details. */
-               ecc->bytes = 0;
-               ecc->priv = nand_bch_init(mtd);
-               if (!ecc->priv) {
-                       pr_warn("BCH ECC initialization failed!\n");
-                       BUG();
+               ret = nand_set_ecc_soft_ops(mtd);
+               if (ret) {
+                       ret = -EINVAL;
+                       goto err_free;
                }
                break;
 
@@ -4283,8 +4450,9 @@ int nand_scan_tail(struct mtd_info *mtd)
                break;
 
        default:
-               pr_warn("Invalid NAND_ECC_MODE %d\n", ecc->mode);
-               BUG();
+               WARN(1, "Invalid NAND_ECC_MODE %d\n", ecc->mode);
+               ret = -EINVAL;
+               goto err_free;
        }
 
        /* For many systems, the standard OOB write also works for raw */
@@ -4293,20 +4461,9 @@ int nand_scan_tail(struct mtd_info *mtd)
        if (!ecc->write_oob_raw)
                ecc->write_oob_raw = ecc->write_oob;
 
-       /*
-        * The number of bytes available for a client to place data into
-        * the out of band area.
-        */
-       mtd->oobavail = 0;
-       if (ecc->layout) {
-               for (i = 0; ecc->layout->oobfree[i].length; i++)
-                       mtd->oobavail += ecc->layout->oobfree[i].length;
-       }
-
-       /* ECC sanity check: warn if it's too weak */
-       if (!nand_ecc_strength_good(mtd))
-               pr_warn("WARNING: %s: the ECC used on your system is too weak compared to the one required by the NAND chip\n",
-                       mtd->name);
+       /* propagate ecc info to mtd_info */
+       mtd->ecc_strength = ecc->strength;
+       mtd->ecc_step_size = ecc->size;
 
        /*
         * Set the number of read / write steps for one page depending on ECC
@@ -4314,11 +4471,27 @@ int nand_scan_tail(struct mtd_info *mtd)
         */
        ecc->steps = mtd->writesize / ecc->size;
        if (ecc->steps * ecc->size != mtd->writesize) {
-               pr_warn("Invalid ECC parameters\n");
-               BUG();
+               WARN(1, "Invalid ECC parameters\n");
+               ret = -EINVAL;
+               goto err_free;
        }
        ecc->total = ecc->steps * ecc->bytes;
 
+       /*
+        * The number of bytes available for a client to place data into
+        * the out of band area.
+        */
+       ret = mtd_ooblayout_count_freebytes(mtd);
+       if (ret < 0)
+               ret = 0;
+
+       mtd->oobavail = ret;
+
+       /* ECC sanity check: warn if it's too weak */
+       if (!nand_ecc_strength_good(mtd))
+               pr_warn("WARNING: %s: the ECC used on your system is too weak compared to the one required by the NAND chip\n",
+                       mtd->name);
+
        /* Allow subpage writes up to ecc.steps. Not possible for MLC flash */
        if (!(chip->options & NAND_NO_SUBPAGE_WRITE) && nand_is_slc(chip)) {
                switch (ecc->steps) {
@@ -4343,7 +4516,6 @@ int nand_scan_tail(struct mtd_info *mtd)
        /* Large page NAND with SOFT_ECC should support subpage reads */
        switch (ecc->mode) {
        case NAND_ECC_SOFT:
-       case NAND_ECC_SOFT_BCH:
                if (chip->page_shift > 9)
                        chip->options |= NAND_SUBPAGE_READ;
                break;
@@ -4375,10 +4547,6 @@ int nand_scan_tail(struct mtd_info *mtd)
        mtd->_block_markbad = nand_block_markbad;
        mtd->writebufsize = mtd->writesize;
 
-       /* propagate ecc info to mtd_info */
-       mtd->ecclayout = ecc->layout;
-       mtd->ecc_strength = ecc->strength;
-       mtd->ecc_step_size = ecc->size;
        /*
         * Initialize bitflip_threshold to its default prior scan_bbt() call.
         * scan_bbt() might invoke mtd_read(), thus bitflip_threshold must be
@@ -4393,6 +4561,10 @@ int nand_scan_tail(struct mtd_info *mtd)
 
        /* Build bad block table */
        return chip->scan_bbt(mtd);
+err_free:
+       if (!(chip->options & NAND_OWN_BUFFERS))
+               kfree(chip->buffers);
+       return ret;
 }
 EXPORT_SYMBOL(nand_scan_tail);
 
@@ -4436,7 +4608,8 @@ void nand_release(struct mtd_info *mtd)
 {
        struct nand_chip *chip = mtd_to_nand(mtd);
 
-       if (chip->ecc.mode == NAND_ECC_SOFT_BCH)
+       if (chip->ecc.mode == NAND_ECC_SOFT &&
+           chip->ecc.algo == NAND_ECC_BCH)
                nand_bch_free((struct nand_bch_control *)chip->ecc.priv);
 
        mtd_device_unregister(mtd);
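
The hunks above drop the BUG() calls from nand_scan_tail() and instead return -EINVAL through the new err_free label (which also frees the buffers allocated for chips without NAND_OWN_BUFFERS). A minimal, hypothetical probe-path sketch of how a caller is now expected to propagate that error; the helper name is illustrative and not part of this patch:

#include <linux/mtd/mtd.h>
#include <linux/mtd/nand.h>

/* Hypothetical helper: propagate errors now that nand_scan_tail() no longer BUG()s. */
static int example_nand_scan_and_register(struct mtd_info *mtd)
{
        int err;

        err = nand_scan_ident(mtd, 1, NULL);
        if (err)
                return err;

        /* driver-specific ecc.mode / ecc.algo / OOB layout setup would go here */

        err = nand_scan_tail(mtd);
        if (err)
                return err;

        return mtd_device_register(mtd, NULL, 0);
}

On failure, nand_scan_tail() has already released the buffers it allocated, so the caller only needs to bail out.
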
index b585bae..44763f8 100644 (file)
 /**
  * struct nand_bch_control - private NAND BCH control structure
  * @bch:       BCH control structure
- * @ecclayout: private ecc layout for this BCH configuration
  * @errloc:    error location array
  * @eccmask:   XOR ecc mask, allows erased pages to be decoded as valid
  */
 struct nand_bch_control {
        struct bch_control   *bch;
-       struct nand_ecclayout ecclayout;
        unsigned int         *errloc;
        unsigned char        *eccmask;
 };
@@ -124,7 +122,6 @@ struct nand_bch_control *nand_bch_init(struct mtd_info *mtd)
 {
        struct nand_chip *nand = mtd_to_nand(mtd);
        unsigned int m, t, eccsteps, i;
-       struct nand_ecclayout *layout = nand->ecc.layout;
        struct nand_bch_control *nbc = NULL;
        unsigned char *erased_page;
        unsigned int eccsize = nand->ecc.size;
@@ -161,34 +158,10 @@ struct nand_bch_control *nand_bch_init(struct mtd_info *mtd)
 
        eccsteps = mtd->writesize/eccsize;
 
-       /* if no ecc placement scheme was provided, build one */
-       if (!layout) {
-
-               /* handle large page devices only */
-               if (mtd->oobsize < 64) {
-                       printk(KERN_WARNING "must provide an oob scheme for "
-                              "oobsize %d\n", mtd->oobsize);
-                       goto fail;
-               }
-
-               layout = &nbc->ecclayout;
-               layout->eccbytes = eccsteps*eccbytes;
-
-               /* reserve 2 bytes for bad block marker */
-               if (layout->eccbytes+2 > mtd->oobsize) {
-                       printk(KERN_WARNING "no suitable oob scheme available "
-                              "for oobsize %d eccbytes %u\n", mtd->oobsize,
-                              eccbytes);
-                       goto fail;
-               }
-               /* put ecc bytes at oob tail */
-               for (i = 0; i < layout->eccbytes; i++)
-                       layout->eccpos[i] = mtd->oobsize-layout->eccbytes+i;
-
-               layout->oobfree[0].offset = 2;
-               layout->oobfree[0].length = mtd->oobsize-2-layout->eccbytes;
-
-               nand->ecc.layout = layout;
+       /* Check that we have an oob layout description. */
+       if (!mtd->ooblayout) {
+               pr_warn("missing oob scheme\n");
+               goto fail;
        }
 
        /* sanity checks */
@@ -196,7 +169,18 @@ struct nand_bch_control *nand_bch_init(struct mtd_info *mtd)
                printk(KERN_WARNING "eccsize %u is too large\n", eccsize);
                goto fail;
        }
-       if (layout->eccbytes != (eccsteps*eccbytes)) {
+
+       /*
+        * ecc->steps and ecc->total might be used by mtd->ooblayout->ecc(),
+        * which is called by mtd_ooblayout_count_eccbytes().
+        * Make sure they are properly initialized before calling
+        * mtd_ooblayout_count_eccbytes().
+        * FIXME: we should probably rework the sequencing in nand_scan_tail()
+        * to avoid setting those fields twice.
+        */
+       nand->ecc.steps = eccsteps;
+       nand->ecc.total = eccsteps * eccbytes;
+       if (mtd_ooblayout_count_eccbytes(mtd) != (eccsteps*eccbytes)) {
                printk(KERN_WARNING "invalid ecc layout\n");
                goto fail;
        }
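
With the private nand_ecclayout gone, nand_bch_init() only verifies that an OOB layout is registered and that mtd_ooblayout_count_eccbytes() matches eccsteps * eccbytes. Below is a minimal, hypothetical mtd_ooblayout_ops implementation mirroring the layout the removed code used to build (ECC bytes packed at the OOB tail, free bytes after the 2-byte bad block marker); all names are illustrative, not part of this patch:

#include <linux/mtd/mtd.h>
#include <linux/mtd/nand.h>

static int example_ooblayout_ecc(struct mtd_info *mtd, int section,
                                 struct mtd_oob_region *oobregion)
{
        struct nand_chip *chip = mtd_to_nand(mtd);

        if (section)
                return -ERANGE;

        /* ECC bytes packed at the tail of the OOB area. */
        oobregion->length = chip->ecc.total;
        oobregion->offset = mtd->oobsize - oobregion->length;

        return 0;
}

static int example_ooblayout_free(struct mtd_info *mtd, int section,
                                  struct mtd_oob_region *oobregion)
{
        struct nand_chip *chip = mtd_to_nand(mtd);

        if (section)
                return -ERANGE;

        /* Everything between the 2-byte bad block marker and the ECC area. */
        oobregion->offset = 2;
        oobregion->length = mtd->oobsize - chip->ecc.total - 2;

        return 0;
}

static const struct mtd_ooblayout_ops example_ooblayout_ops = {
        .ecc  = example_ooblayout_ecc,
        .free = example_ooblayout_free,
};

A driver would register this with mtd_set_ooblayout(mtd, &example_ooblayout_ops) before nand_scan_tail(), which then derives mtd->oobavail from mtd_ooblayout_count_freebytes().
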
index a58169a..1eb9344 100644 (file)
@@ -569,7 +569,7 @@ static void nandsim_debugfs_remove(struct nandsim *ns)
  *
  * RETURNS: 0 if success, -ENOMEM if memory alloc fails.
  */
-static int alloc_device(struct nandsim *ns)
+static int __init alloc_device(struct nandsim *ns)
 {
        struct file *cfile;
        int i, err;
@@ -654,7 +654,7 @@ static void free_device(struct nandsim *ns)
        }
 }
 
-static char *get_partition_name(int i)
+static char __init *get_partition_name(int i)
 {
        return kasprintf(GFP_KERNEL, "NAND simulator partition %d", i);
 }
@@ -664,7 +664,7 @@ static char *get_partition_name(int i)
  *
  * RETURNS: 0 if success, -ERRNO if failure.
  */
-static int init_nandsim(struct mtd_info *mtd)
+static int __init init_nandsim(struct mtd_info *mtd)
 {
        struct nand_chip *chip = mtd_to_nand(mtd);
        struct nandsim   *ns   = nand_get_controller_data(chip);
@@ -2261,6 +2261,7 @@ static int __init ns_init_module(void)
        chip->read_buf   = ns_nand_read_buf;
        chip->read_word  = ns_nand_read_word;
        chip->ecc.mode   = NAND_ECC_SOFT;
+       chip->ecc.algo   = NAND_ECC_HAMMING;
        /* The NAND_SKIP_BBTSCAN option is necessary for 'overridesize' */
        /* and 'badblocks' parameters to work */
        chip->options   |= NAND_SKIP_BBTSCAN;
@@ -2338,7 +2339,8 @@ static int __init ns_init_module(void)
                        retval = -EINVAL;
                        goto error;
                }
-               chip->ecc.mode = NAND_ECC_SOFT_BCH;
+               chip->ecc.mode = NAND_ECC_SOFT;
+               chip->ecc.algo = NAND_ECC_BCH;
                chip->ecc.size = 512;
                chip->ecc.strength = bch;
                chip->ecc.bytes = eccbytes;
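
As the nandsim conversion above shows, NAND_ECC_SOFT_BCH is gone: software ECC is selected with ecc.mode = NAND_ECC_SOFT and the algorithm is chosen through the new ecc.algo field. A short sketch of that selection in a hypothetical driver, done before calling nand_scan_tail(); the step size and strength values are illustrative:

#include <linux/mtd/nand.h>

static void example_select_soft_ecc(struct nand_chip *chip, bool want_bch)
{
        chip->ecc.mode = NAND_ECC_SOFT;

        if (want_bch) {
                chip->ecc.algo     = NAND_ECC_BCH;
                chip->ecc.size     = 512;       /* illustrative ECC step size */
                chip->ecc.strength = 4;         /* illustrative correction strength */
        } else {
                chip->ecc.algo = NAND_ECC_HAMMING;
        }
}

nand_scan_tail() then goes through nand_set_ecc_soft_ops(), which wires up the Hamming or BCH helpers and, for BCH, requires an OOB layout on small-page devices.
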
index dbc5b57..8f64011 100644 (file)
@@ -261,6 +261,7 @@ static int nuc900_nand_probe(struct platform_device *pdev)
        chip->chip_delay        = 50;
        chip->options           = 0;
        chip->ecc.mode          = NAND_ECC_SOFT;
+       chip->ecc.algo          = NAND_ECC_HAMMING;
 
        res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
        nuc900_nand->reg = devm_ioremap_resource(&pdev->dev, res);
index 0749ca1..08e1588 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/dmaengine.h>
 #include <linux/dma-mapping.h>
 #include <linux/delay.h>
+#include <linux/gpio/consumer.h>
 #include <linux/module.h>
 #include <linux/interrupt.h>
 #include <linux/jiffies.h>
@@ -28,6 +29,7 @@
 #include <linux/mtd/nand_bch.h>
 #include <linux/platform_data/elm.h>
 
+#include <linux/omap-gpmc.h>
 #include <linux/platform_data/mtd-nand-omap2.h>
 
 #define        DRIVER_NAME     "omap2-nand"
@@ -151,13 +153,17 @@ static struct nand_hw_control omap_gpmc_controller = {
 };
 
 struct omap_nand_info {
-       struct omap_nand_platform_data  *pdata;
        struct nand_chip                nand;
        struct platform_device          *pdev;
 
        int                             gpmc_cs;
-       unsigned long                   phys_base;
+       bool                            dev_ready;
+       enum nand_io                    xfer_type;
+       int                             devsize;
        enum omap_ecc                   ecc_opt;
+       struct device_node              *elm_of_node;
+
+       unsigned long                   phys_base;
        struct completion               comp;
        struct dma_chan                 *dma;
        int                             gpmc_irq_fifo;
@@ -168,12 +174,14 @@ struct omap_nand_info {
        } iomode;
        u_char                          *buf;
        int                                     buf_len;
+       /* Interface to GPMC */
        struct gpmc_nand_regs           reg;
-       /* generated at runtime depending on ECC algorithm and layout selected */
-       struct nand_ecclayout           oobinfo;
+       struct gpmc_nand_ops            *ops;
+       bool                            flash_bbt;
        /* fields specific for BCHx_HW ECC scheme */
        struct device                   *elm_dev;
-       struct device_node              *of_node;
+       /* NAND ready gpio */
+       struct gpio_desc                *ready_gpiod;
 };
 
 static inline struct omap_nand_info *mtd_to_omap(struct mtd_info *mtd)
@@ -208,7 +216,7 @@ static int omap_prefetch_enable(int cs, int fifo_th, int dma_mode,
         */
        val = ((cs << PREFETCH_CONFIG1_CS_SHIFT) |
                PREFETCH_FIFOTHRESHOLD(fifo_th) | ENABLE_PREFETCH |
-               (dma_mode << DMA_MPU_MODE_SHIFT) | (0x1 & is_write));
+               (dma_mode << DMA_MPU_MODE_SHIFT) | (is_write & 0x1));
        writel(val, info->reg.gpmc_prefetch_config1);
 
        /*  Start the prefetch engine */
@@ -288,14 +296,13 @@ static void omap_write_buf8(struct mtd_info *mtd, const u_char *buf, int len)
 {
        struct omap_nand_info *info = mtd_to_omap(mtd);
        u_char *p = (u_char *)buf;
-       u32     status = 0;
+       bool status;
 
        while (len--) {
                iowrite8(*p++, info->nand.IO_ADDR_W);
                /* wait until buffer is available for write */
                do {
-                       status = readl(info->reg.gpmc_status) &
-                                       STATUS_BUFF_EMPTY;
+                       status = info->ops->nand_writebuffer_empty();
                } while (!status);
        }
 }
@@ -323,7 +330,7 @@ static void omap_write_buf16(struct mtd_info *mtd, const u_char * buf, int len)
 {
        struct omap_nand_info *info = mtd_to_omap(mtd);
        u16 *p = (u16 *) buf;
-       u32     status = 0;
+       bool status;
        /* FIXME try bursts of writesw() or DMA ... */
        len >>= 1;
 
@@ -331,8 +338,7 @@ static void omap_write_buf16(struct mtd_info *mtd, const u_char * buf, int len)
                iowrite16(*p++, info->nand.IO_ADDR_W);
                /* wait until buffer is available for write */
                do {
-                       status = readl(info->reg.gpmc_status) &
-                                       STATUS_BUFF_EMPTY;
+                       status = info->ops->nand_writebuffer_empty();
                } while (!status);
        }
 }
@@ -467,17 +473,8 @@ static inline int omap_nand_dma_transfer(struct mtd_info *mtd, void *addr,
        int ret;
        u32 val;
 
-       if (addr >= high_memory) {
-               struct page *p1;
-
-               if (((size_t)addr & PAGE_MASK) !=
-                       ((size_t)(addr + len - 1) & PAGE_MASK))
-                       goto out_copy;
-               p1 = vmalloc_to_page(addr);
-               if (!p1)
-                       goto out_copy;
-               addr = page_address(p1) + ((size_t)addr & ~PAGE_MASK);
-       }
+       if (!virt_addr_valid(addr))
+               goto out_copy;
 
        sg_init_one(&sg, addr, len);
        n = dma_map_sg(info->dma->device->dev, &sg, 1, dir);
@@ -497,6 +494,11 @@ static inline int omap_nand_dma_transfer(struct mtd_info *mtd, void *addr,
        tx->callback_param = &info->comp;
        dmaengine_submit(tx);
 
+       init_completion(&info->comp);
+
+       /* setup and start DMA using dma_addr */
+       dma_async_issue_pending(info->dma);
+
        /*  configure and start prefetch transfer */
        ret = omap_prefetch_enable(info->gpmc_cs,
                PREFETCH_FIFOTHRESHOLD_MAX, 0x1, len, is_write, info);
@@ -504,10 +506,6 @@ static inline int omap_nand_dma_transfer(struct mtd_info *mtd, void *addr,
                /* PFPW engine is busy, use cpu copy method */
                goto out_copy_unmap;
 
-       init_completion(&info->comp);
-       dma_async_issue_pending(info->dma);
-
-       /* setup and start DMA using dma_addr */
        wait_for_completion(&info->comp);
        tim = 0;
        limit = (loops_per_jiffy * msecs_to_jiffies(OMAP_NAND_TIMEOUT_MS));
@@ -1017,21 +1015,16 @@ static int omap_wait(struct mtd_info *mtd, struct nand_chip *chip)
 }
 
 /**
- * omap_dev_ready - calls the platform specific dev_ready function
+ * omap_dev_ready - checks the NAND Ready GPIO line
  * @mtd: MTD device structure
+ *
+ * Returns true if ready and false if busy.
  */
 static int omap_dev_ready(struct mtd_info *mtd)
 {
-       unsigned int val = 0;
        struct omap_nand_info *info = mtd_to_omap(mtd);
 
-       val = readl(info->reg.gpmc_status);
-
-       if ((val & 0x100) == 0x100) {
-               return 1;
-       } else {
-               return 0;
-       }
+       return gpiod_get_value(info->ready_gpiod);
 }
 
 /**
@@ -1495,9 +1488,8 @@ static int omap_elm_correct_data(struct mtd_info *mtd, u_char *data,
 static int omap_write_page_bch(struct mtd_info *mtd, struct nand_chip *chip,
                               const uint8_t *buf, int oob_required, int page)
 {
-       int i;
+       int ret;
        uint8_t *ecc_calc = chip->buffers->ecccalc;
-       uint32_t *eccpos = chip->ecc.layout->eccpos;
 
        /* Enable GPMC ecc engine */
        chip->ecc.hwctl(mtd, NAND_ECC_WRITE);
@@ -1508,8 +1500,10 @@ static int omap_write_page_bch(struct mtd_info *mtd, struct nand_chip *chip,
        /* Update ecc vector from GPMC result registers */
        chip->ecc.calculate(mtd, buf, &ecc_calc[0]);
 
-       for (i = 0; i < chip->ecc.total; i++)
-               chip->oob_poi[eccpos[i]] = ecc_calc[i];
+       ret = mtd_ooblayout_set_eccbytes(mtd, ecc_calc, chip->oob_poi, 0,
+                                        chip->ecc.total);
+       if (ret)
+               return ret;
 
        /* Write ecc vector to OOB area */
        chip->write_buf(mtd, chip->oob_poi, mtd->oobsize);
@@ -1536,10 +1530,7 @@ static int omap_read_page_bch(struct mtd_info *mtd, struct nand_chip *chip,
 {
        uint8_t *ecc_calc = chip->buffers->ecccalc;
        uint8_t *ecc_code = chip->buffers->ecccode;
-       uint32_t *eccpos = chip->ecc.layout->eccpos;
-       uint8_t *oob = &chip->oob_poi[eccpos[0]];
-       uint32_t oob_pos = mtd->writesize + chip->ecc.layout->eccpos[0];
-       int stat;
+       int stat, ret;
        unsigned int max_bitflips = 0;
 
        /* Enable GPMC ecc engine */
@@ -1549,13 +1540,18 @@ static int omap_read_page_bch(struct mtd_info *mtd, struct nand_chip *chip,
        chip->read_buf(mtd, buf, mtd->writesize);
 
        /* Read oob bytes */
-       chip->cmdfunc(mtd, NAND_CMD_RNDOUT, oob_pos, -1);
-       chip->read_buf(mtd, oob, chip->ecc.total);
+       chip->cmdfunc(mtd, NAND_CMD_RNDOUT,
+                     mtd->writesize + BADBLOCK_MARKER_LENGTH, -1);
+       chip->read_buf(mtd, chip->oob_poi + BADBLOCK_MARKER_LENGTH,
+                      chip->ecc.total);
 
        /* Calculate ecc bytes */
        chip->ecc.calculate(mtd, buf, ecc_calc);
 
-       memcpy(ecc_code, &chip->oob_poi[eccpos[0]], chip->ecc.total);
+       ret = mtd_ooblayout_get_eccbytes(mtd, ecc_code, chip->oob_poi, 0,
+                                        chip->ecc.total);
+       if (ret)
+               return ret;
 
        stat = chip->ecc.correct(mtd, buf, ecc_code, ecc_calc);
 
@@ -1630,7 +1626,7 @@ static bool omap2_nand_ecc_check(struct omap_nand_info *info,
                        "CONFIG_MTD_NAND_OMAP_BCH not enabled\n");
                return false;
        }
-       if (ecc_needs_elm && !is_elm_present(info, pdata->elm_of_node)) {
+       if (ecc_needs_elm && !is_elm_present(info, info->elm_of_node)) {
                dev_err(&info->pdev->dev, "ELM not available\n");
                return false;
        }
@@ -1638,43 +1634,227 @@ static bool omap2_nand_ecc_check(struct omap_nand_info *info,
        return true;
 }
 
+static const char * const nand_xfer_types[] = {
+       [NAND_OMAP_PREFETCH_POLLED] = "prefetch-polled",
+       [NAND_OMAP_POLLED] = "polled",
+       [NAND_OMAP_PREFETCH_DMA] = "prefetch-dma",
+       [NAND_OMAP_PREFETCH_IRQ] = "prefetch-irq",
+};
+
+static int omap_get_dt_info(struct device *dev, struct omap_nand_info *info)
+{
+       struct device_node *child = dev->of_node;
+       int i;
+       const char *s;
+       u32 cs;
+
+       if (of_property_read_u32(child, "reg", &cs) < 0) {
+               dev_err(dev, "reg not found in DT\n");
+               return -EINVAL;
+       }
+
+       info->gpmc_cs = cs;
+
+       /* detect availability of ELM module. Won't be present pre-OMAP4 */
+       info->elm_of_node = of_parse_phandle(child, "ti,elm-id", 0);
+       if (!info->elm_of_node)
+               dev_dbg(dev, "ti,elm-id not in DT\n");
+
+       /* select ecc-scheme for NAND */
+       if (of_property_read_string(child, "ti,nand-ecc-opt", &s)) {
+               dev_err(dev, "ti,nand-ecc-opt not found\n");
+               return -EINVAL;
+       }
+
+       if (!strcmp(s, "sw")) {
+               info->ecc_opt = OMAP_ECC_HAM1_CODE_SW;
+       } else if (!strcmp(s, "ham1") ||
+                  !strcmp(s, "hw") || !strcmp(s, "hw-romcode")) {
+               info->ecc_opt = OMAP_ECC_HAM1_CODE_HW;
+       } else if (!strcmp(s, "bch4")) {
+               if (info->elm_of_node)
+                       info->ecc_opt = OMAP_ECC_BCH4_CODE_HW;
+               else
+                       info->ecc_opt = OMAP_ECC_BCH4_CODE_HW_DETECTION_SW;
+       } else if (!strcmp(s, "bch8")) {
+               if (info->elm_of_node)
+                       info->ecc_opt = OMAP_ECC_BCH8_CODE_HW;
+               else
+                       info->ecc_opt = OMAP_ECC_BCH8_CODE_HW_DETECTION_SW;
+       } else if (!strcmp(s, "bch16")) {
+               info->ecc_opt = OMAP_ECC_BCH16_CODE_HW;
+       } else {
+               dev_err(dev, "unrecognized value for ti,nand-ecc-opt\n");
+               return -EINVAL;
+       }
+
+       /* select data transfer mode */
+       if (!of_property_read_string(child, "ti,nand-xfer-type", &s)) {
+               for (i = 0; i < ARRAY_SIZE(nand_xfer_types); i++) {
+                       if (!strcasecmp(s, nand_xfer_types[i])) {
+                               info->xfer_type = i;
+                               return 0;
+                       }
+               }
+
+               dev_err(dev, "unrecognized value for ti,nand-xfer-type\n");
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+static int omap_ooblayout_ecc(struct mtd_info *mtd, int section,
+                             struct mtd_oob_region *oobregion)
+{
+       struct omap_nand_info *info = mtd_to_omap(mtd);
+       struct nand_chip *chip = &info->nand;
+       int off = BADBLOCK_MARKER_LENGTH;
+
+       if (info->ecc_opt == OMAP_ECC_HAM1_CODE_HW &&
+           !(chip->options & NAND_BUSWIDTH_16))
+               off = 1;
+
+       if (section)
+               return -ERANGE;
+
+       oobregion->offset = off;
+       oobregion->length = chip->ecc.total;
+
+       return 0;
+}
+
+static int omap_ooblayout_free(struct mtd_info *mtd, int section,
+                              struct mtd_oob_region *oobregion)
+{
+       struct omap_nand_info *info = mtd_to_omap(mtd);
+       struct nand_chip *chip = &info->nand;
+       int off = BADBLOCK_MARKER_LENGTH;
+
+       if (info->ecc_opt == OMAP_ECC_HAM1_CODE_HW &&
+           !(chip->options & NAND_BUSWIDTH_16))
+               off = 1;
+
+       if (section)
+               return -ERANGE;
+
+       off += chip->ecc.total;
+       if (off >= mtd->oobsize)
+               return -ERANGE;
+
+       oobregion->offset = off;
+       oobregion->length = mtd->oobsize - off;
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops omap_ooblayout_ops = {
+       .ecc = omap_ooblayout_ecc,
+       .free = omap_ooblayout_free,
+};
+
+static int omap_sw_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       int off = BADBLOCK_MARKER_LENGTH;
+
+       if (section >= chip->ecc.steps)
+               return -ERANGE;
+
+       /*
+        * When SW correction is employed, one OMAP specific marker byte is
+        * reserved after each ECC step.
+        */
+       oobregion->offset = off + (section * (chip->ecc.bytes + 1));
+       oobregion->length = chip->ecc.bytes;
+
+       return 0;
+}
+
+static int omap_sw_ooblayout_free(struct mtd_info *mtd, int section,
+                                 struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       int off = BADBLOCK_MARKER_LENGTH;
+
+       if (section)
+               return -ERANGE;
+
+       /*
+        * When SW correction is employed, one OMAP specific marker byte is
+        * reserved after each ECC step.
+        */
+       off += ((chip->ecc.bytes + 1) * chip->ecc.steps);
+       if (off >= mtd->oobsize)
+               return -ERANGE;
+
+       oobregion->offset = off;
+       oobregion->length = mtd->oobsize - off;
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops omap_sw_ooblayout_ops = {
+       .ecc = omap_sw_ooblayout_ecc,
+       .free = omap_sw_ooblayout_free,
+};
+
 static int omap_nand_probe(struct platform_device *pdev)
 {
        struct omap_nand_info           *info;
-       struct omap_nand_platform_data  *pdata;
+       struct omap_nand_platform_data  *pdata = NULL;
        struct mtd_info                 *mtd;
        struct nand_chip                *nand_chip;
-       struct nand_ecclayout           *ecclayout;
        int                             err;
-       int                             i;
        dma_cap_mask_t                  mask;
        unsigned                        sig;
-       unsigned                        oob_index;
        struct resource                 *res;
-
-       pdata = dev_get_platdata(&pdev->dev);
-       if (pdata == NULL) {
-               dev_err(&pdev->dev, "platform data missing\n");
-               return -ENODEV;
-       }
+       struct device                   *dev = &pdev->dev;
+       int                             min_oobbytes = BADBLOCK_MARKER_LENGTH;
+       int                             oobbytes_per_step;
 
        info = devm_kzalloc(&pdev->dev, sizeof(struct omap_nand_info),
                                GFP_KERNEL);
        if (!info)
                return -ENOMEM;
 
+       info->pdev = pdev;
+
+       if (dev->of_node) {
+               if (omap_get_dt_info(dev, info))
+                       return -EINVAL;
+       } else {
+               pdata = dev_get_platdata(&pdev->dev);
+               if (!pdata) {
+                       dev_err(&pdev->dev, "platform data missing\n");
+                       return -EINVAL;
+               }
+
+               info->gpmc_cs = pdata->cs;
+               info->reg = pdata->reg;
+               info->ecc_opt = pdata->ecc_opt;
+               if (pdata->dev_ready)
+                       dev_info(&pdev->dev, "pdata->dev_ready is deprecated\n");
+
+               info->xfer_type = pdata->xfer_type;
+               info->devsize = pdata->devsize;
+               info->elm_of_node = pdata->elm_of_node;
+               info->flash_bbt = pdata->flash_bbt;
+       }
+
        platform_set_drvdata(pdev, info);
+       info->ops = gpmc_omap_get_nand_ops(&info->reg, info->gpmc_cs);
+       if (!info->ops) {
+               dev_err(&pdev->dev, "Failed to get GPMC->NAND interface\n");
+               return -ENODEV;
+       }
 
-       info->pdev              = pdev;
-       info->gpmc_cs           = pdata->cs;
-       info->reg               = pdata->reg;
-       info->of_node           = pdata->of_node;
-       info->ecc_opt           = pdata->ecc_opt;
        nand_chip               = &info->nand;
        mtd                     = nand_to_mtd(nand_chip);
        mtd->dev.parent         = &pdev->dev;
        nand_chip->ecc.priv     = NULL;
-       nand_set_flash_node(nand_chip, pdata->of_node);
+       nand_set_flash_node(nand_chip, dev->of_node);
 
        res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
        nand_chip->IO_ADDR_R = devm_ioremap_resource(&pdev->dev, res);
@@ -1688,6 +1868,13 @@ static int omap_nand_probe(struct platform_device *pdev)
        nand_chip->IO_ADDR_W = nand_chip->IO_ADDR_R;
        nand_chip->cmd_ctrl  = omap_hwcontrol;
 
+       info->ready_gpiod = devm_gpiod_get_optional(&pdev->dev, "rb",
+                                                   GPIOD_IN);
+       if (IS_ERR(info->ready_gpiod)) {
+               dev_err(dev, "failed to get ready gpio\n");
+               return PTR_ERR(info->ready_gpiod);
+       }
+
        /*
         * If RDY/BSY line is connected to OMAP then use the omap ready
         * function and the generic nand_wait function which reads the status
@@ -1695,7 +1882,7 @@ static int omap_nand_probe(struct platform_device *pdev)
         * chip delay which is slightly more than tR (AC Timing) of the NAND
         * device and read status register until you get a failure or success
         */
-       if (pdata->dev_ready) {
+       if (info->ready_gpiod) {
                nand_chip->dev_ready = omap_dev_ready;
                nand_chip->chip_delay = 0;
        } else {
@@ -1703,21 +1890,25 @@ static int omap_nand_probe(struct platform_device *pdev)
                nand_chip->chip_delay = 50;
        }
 
-       if (pdata->flash_bbt)
-               nand_chip->bbt_options |= NAND_BBT_USE_FLASH | NAND_BBT_NO_OOB;
-       else
-               nand_chip->options |= NAND_SKIP_BBTSCAN;
+       if (info->flash_bbt)
+               nand_chip->bbt_options |= NAND_BBT_USE_FLASH;
 
        /* scan NAND device connected to chip controller */
-       nand_chip->options |= pdata->devsize & NAND_BUSWIDTH_16;
+       nand_chip->options |= info->devsize & NAND_BUSWIDTH_16;
        if (nand_scan_ident(mtd, 1, NULL)) {
-               dev_err(&info->pdev->dev, "scan failed, may be bus-width mismatch\n");
+               dev_err(&info->pdev->dev,
+                       "scan failed, may be bus-width mismatch\n");
                err = -ENXIO;
                goto return_error;
        }
 
+       if (nand_chip->bbt_options & NAND_BBT_USE_FLASH)
+               nand_chip->bbt_options |= NAND_BBT_NO_OOB;
+       else
+               nand_chip->options |= NAND_SKIP_BBTSCAN;
+
        /* re-populate low-level callbacks based on xfer modes */
-       switch (pdata->xfer_type) {
+       switch (info->xfer_type) {
        case NAND_OMAP_PREFETCH_POLLED:
                nand_chip->read_buf   = omap_read_buf_pref;
                nand_chip->write_buf  = omap_write_buf_pref;
@@ -1797,7 +1988,7 @@ static int omap_nand_probe(struct platform_device *pdev)
 
        default:
                dev_err(&pdev->dev,
-                       "xfer_type(%d) not supported!\n", pdata->xfer_type);
+                       "xfer_type(%d) not supported!\n", info->xfer_type);
                err = -EINVAL;
                goto return_error;
        }
@@ -1809,16 +2000,15 @@ static int omap_nand_probe(struct platform_device *pdev)
 
        /*
         * Bail out earlier to let NAND_ECC_SOFT code create its own
-        * ecclayout instead of using ours.
+        * ooblayout instead of using ours.
         */
        if (info->ecc_opt == OMAP_ECC_HAM1_CODE_SW) {
                nand_chip->ecc.mode = NAND_ECC_SOFT;
+               nand_chip->ecc.algo = NAND_ECC_HAMMING;
                goto scan_tail;
        }
 
        /* populate MTD interface based on ECC scheme */
-       ecclayout               = &info->oobinfo;
-       nand_chip->ecc.layout   = ecclayout;
        switch (info->ecc_opt) {
        case OMAP_ECC_HAM1_CODE_HW:
                pr_info("nand: using OMAP_ECC_HAM1_CODE_HW\n");
@@ -1829,19 +2019,12 @@ static int omap_nand_probe(struct platform_device *pdev)
                nand_chip->ecc.calculate        = omap_calculate_ecc;
                nand_chip->ecc.hwctl            = omap_enable_hwecc;
                nand_chip->ecc.correct          = omap_correct_data;
-               /* define ECC layout */
-               ecclayout->eccbytes             = nand_chip->ecc.bytes *
-                                                       (mtd->writesize /
-                                                       nand_chip->ecc.size);
-               if (nand_chip->options & NAND_BUSWIDTH_16)
-                       oob_index               = BADBLOCK_MARKER_LENGTH;
-               else
-                       oob_index               = 1;
-               for (i = 0; i < ecclayout->eccbytes; i++, oob_index++)
-                       ecclayout->eccpos[i]    = oob_index;
-               /* no reserved-marker in ecclayout for this ecc-scheme */
-               ecclayout->oobfree->offset      =
-                               ecclayout->eccpos[ecclayout->eccbytes - 1] + 1;
+               mtd_set_ooblayout(mtd, &omap_ooblayout_ops);
+               oobbytes_per_step               = nand_chip->ecc.bytes;
+
+               if (!(nand_chip->options & NAND_BUSWIDTH_16))
+                       min_oobbytes            = 1;
+
                break;
 
        case OMAP_ECC_BCH4_CODE_HW_DETECTION_SW:
@@ -1853,19 +2036,9 @@ static int omap_nand_probe(struct platform_device *pdev)
                nand_chip->ecc.hwctl            = omap_enable_hwecc_bch;
                nand_chip->ecc.correct          = nand_bch_correct_data;
                nand_chip->ecc.calculate        = omap_calculate_ecc_bch;
-               /* define ECC layout */
-               ecclayout->eccbytes             = nand_chip->ecc.bytes *
-                                                       (mtd->writesize /
-                                                       nand_chip->ecc.size);
-               oob_index                       = BADBLOCK_MARKER_LENGTH;
-               for (i = 0; i < ecclayout->eccbytes; i++, oob_index++) {
-                       ecclayout->eccpos[i] = oob_index;
-                       if (((i + 1) % nand_chip->ecc.bytes) == 0)
-                               oob_index++;
-               }
-               /* include reserved-marker in ecclayout->oobfree calculation */
-               ecclayout->oobfree->offset      = 1 +
-                               ecclayout->eccpos[ecclayout->eccbytes - 1] + 1;
+               mtd_set_ooblayout(mtd, &omap_sw_ooblayout_ops);
+               /* Reserve one byte for the OMAP marker */
+               oobbytes_per_step               = nand_chip->ecc.bytes + 1;
                /* software bch library is used for locating errors */
                nand_chip->ecc.priv             = nand_bch_init(mtd);
                if (!nand_chip->ecc.priv) {
@@ -1887,16 +2060,8 @@ static int omap_nand_probe(struct platform_device *pdev)
                nand_chip->ecc.calculate        = omap_calculate_ecc_bch;
                nand_chip->ecc.read_page        = omap_read_page_bch;
                nand_chip->ecc.write_page       = omap_write_page_bch;
-               /* define ECC layout */
-               ecclayout->eccbytes             = nand_chip->ecc.bytes *
-                                                       (mtd->writesize /
-                                                       nand_chip->ecc.size);
-               oob_index                       = BADBLOCK_MARKER_LENGTH;
-               for (i = 0; i < ecclayout->eccbytes; i++, oob_index++)
-                       ecclayout->eccpos[i]    = oob_index;
-               /* reserved marker already included in ecclayout->eccbytes */
-               ecclayout->oobfree->offset      =
-                               ecclayout->eccpos[ecclayout->eccbytes - 1] + 1;
+               mtd_set_ooblayout(mtd, &omap_ooblayout_ops);
+               oobbytes_per_step               = nand_chip->ecc.bytes;
 
                err = elm_config(info->elm_dev, BCH4_ECC,
                                 mtd->writesize / nand_chip->ecc.size,
@@ -1914,19 +2079,9 @@ static int omap_nand_probe(struct platform_device *pdev)
                nand_chip->ecc.hwctl            = omap_enable_hwecc_bch;
                nand_chip->ecc.correct          = nand_bch_correct_data;
                nand_chip->ecc.calculate        = omap_calculate_ecc_bch;
-               /* define ECC layout */
-               ecclayout->eccbytes             = nand_chip->ecc.bytes *
-                                                       (mtd->writesize /
-                                                       nand_chip->ecc.size);
-               oob_index                       = BADBLOCK_MARKER_LENGTH;
-               for (i = 0; i < ecclayout->eccbytes; i++, oob_index++) {
-                       ecclayout->eccpos[i] = oob_index;
-                       if (((i + 1) % nand_chip->ecc.bytes) == 0)
-                               oob_index++;
-               }
-               /* include reserved-marker in ecclayout->oobfree calculation */
-               ecclayout->oobfree->offset      = 1 +
-                               ecclayout->eccpos[ecclayout->eccbytes - 1] + 1;
+               mtd_set_ooblayout(mtd, &omap_sw_ooblayout_ops);
+               /* Reserve one byte for the OMAP marker */
+               oobbytes_per_step               = nand_chip->ecc.bytes + 1;
                /* software bch library is used for locating errors */
                nand_chip->ecc.priv             = nand_bch_init(mtd);
                if (!nand_chip->ecc.priv) {
@@ -1948,6 +2103,8 @@ static int omap_nand_probe(struct platform_device *pdev)
                nand_chip->ecc.calculate        = omap_calculate_ecc_bch;
                nand_chip->ecc.read_page        = omap_read_page_bch;
                nand_chip->ecc.write_page       = omap_write_page_bch;
+               mtd_set_ooblayout(mtd, &omap_ooblayout_ops);
+               oobbytes_per_step               = nand_chip->ecc.bytes;
 
                err = elm_config(info->elm_dev, BCH8_ECC,
                                 mtd->writesize / nand_chip->ecc.size,
@@ -1955,16 +2112,6 @@ static int omap_nand_probe(struct platform_device *pdev)
                if (err < 0)
                        goto return_error;
 
-               /* define ECC layout */
-               ecclayout->eccbytes             = nand_chip->ecc.bytes *
-                                                       (mtd->writesize /
-                                                       nand_chip->ecc.size);
-               oob_index                       = BADBLOCK_MARKER_LENGTH;
-               for (i = 0; i < ecclayout->eccbytes; i++, oob_index++)
-                       ecclayout->eccpos[i]    = oob_index;
-               /* reserved marker already included in ecclayout->eccbytes */
-               ecclayout->oobfree->offset      =
-                               ecclayout->eccpos[ecclayout->eccbytes - 1] + 1;
                break;
 
        case OMAP_ECC_BCH16_CODE_HW:
@@ -1978,6 +2125,8 @@ static int omap_nand_probe(struct platform_device *pdev)
                nand_chip->ecc.calculate        = omap_calculate_ecc_bch;
                nand_chip->ecc.read_page        = omap_read_page_bch;
                nand_chip->ecc.write_page       = omap_write_page_bch;
+               mtd_set_ooblayout(mtd, &omap_ooblayout_ops);
+               oobbytes_per_step               = nand_chip->ecc.bytes;
 
                err = elm_config(info->elm_dev, BCH16_ECC,
                                 mtd->writesize / nand_chip->ecc.size,
@@ -1985,16 +2134,6 @@ static int omap_nand_probe(struct platform_device *pdev)
                if (err < 0)
                        goto return_error;
 
-               /* define ECC layout */
-               ecclayout->eccbytes             = nand_chip->ecc.bytes *
-                                                       (mtd->writesize /
-                                                       nand_chip->ecc.size);
-               oob_index                       = BADBLOCK_MARKER_LENGTH;
-               for (i = 0; i < ecclayout->eccbytes; i++, oob_index++)
-                       ecclayout->eccpos[i]    = oob_index;
-               /* reserved marker already included in ecclayout->eccbytes */
-               ecclayout->oobfree->offset      =
-                               ecclayout->eccpos[ecclayout->eccbytes - 1] + 1;
                break;
        default:
                dev_err(&info->pdev->dev, "invalid or unsupported ECC scheme\n");
@@ -2002,13 +2141,13 @@ static int omap_nand_probe(struct platform_device *pdev)
                goto return_error;
        }
 
-       /* all OOB bytes from oobfree->offset till end off OOB are free */
-       ecclayout->oobfree->length = mtd->oobsize - ecclayout->oobfree->offset;
        /* check if NAND device's OOB is enough to store ECC signatures */
-       if (mtd->oobsize < (ecclayout->eccbytes + BADBLOCK_MARKER_LENGTH)) {
+       min_oobbytes += (oobbytes_per_step *
+                        (mtd->writesize / nand_chip->ecc.size));
+       if (mtd->oobsize < min_oobbytes) {
                dev_err(&info->pdev->dev,
                        "not enough OOB bytes required = %d, available=%d\n",
-                       ecclayout->eccbytes, mtd->oobsize);
+                       min_oobbytes, mtd->oobsize);
                err = -EINVAL;
                goto return_error;
        }
@@ -2020,7 +2159,10 @@ scan_tail:
                goto return_error;
        }
 
-       mtd_device_register(mtd, pdata->parts, pdata->nr_parts);
+       if (dev->of_node)
+               mtd_device_register(mtd, NULL, 0);
+       else
+               mtd_device_register(mtd, pdata->parts, pdata->nr_parts);
 
        platform_set_drvdata(pdev, mtd);
 
@@ -2051,11 +2193,17 @@ static int omap_nand_remove(struct platform_device *pdev)
        return 0;
 }
 
+static const struct of_device_id omap_nand_ids[] = {
+       { .compatible = "ti,omap2-nand", },
+       {},
+};
+
 static struct platform_driver omap_nand_driver = {
        .probe          = omap_nand_probe,
        .remove         = omap_nand_remove,
        .driver         = {
                .name   = DRIVER_NAME,
+               .of_match_table = of_match_ptr(omap_nand_ids),
        },
 };
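
The omap2 probe no longer sums ecclayout->eccbytes; it computes the required OOB space from oobbytes_per_step and bails out with -EINVAL when mtd->oobsize is too small. A hedged restatement of that check (the bad block marker length is assumed to be 2, matching BADBLOCK_MARKER_LENGTH in omap2.c; the helper is illustrative only):

#include <linux/mtd/mtd.h>
#include <linux/mtd/nand.h>

#define EXAMPLE_BBM_LEN 2       /* assumed value of BADBLOCK_MARKER_LENGTH */

static bool example_oob_is_large_enough(struct mtd_info *mtd, int oobbytes_per_step)
{
        struct nand_chip *chip = mtd_to_nand(mtd);
        int min_oobbytes = EXAMPLE_BBM_LEN;

        min_oobbytes += oobbytes_per_step * (mtd->writesize / chip->ecc.size);

        return mtd->oobsize >= min_oobbytes;
}

For instance, with a 2048-byte page, 512-byte ECC steps and 14 bytes reserved per step (numbers chosen only for illustration), 4 * 14 + 2 = 58 bytes are required, which still fits a 64-byte OOB area.
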
 
index d4614bf..40a7c4a 100644 (file)
@@ -130,6 +130,7 @@ static int __init orion_nand_probe(struct platform_device *pdev)
        nc->cmd_ctrl = orion_nand_cmd_ctrl;
        nc->read_buf = orion_nand_read_buf;
        nc->ecc.mode = NAND_ECC_SOFT;
+       nc->ecc.algo = NAND_ECC_HAMMING;
 
        if (board->chip_delay)
                nc->chip_delay = board->chip_delay;
index 3ab53ca..5de7591 100644 (file)
@@ -92,8 +92,9 @@ int pasemi_device_ready(struct mtd_info *mtd)
 
 static int pasemi_nand_probe(struct platform_device *ofdev)
 {
+       struct device *dev = &ofdev->dev;
        struct pci_dev *pdev;
-       struct device_node *np = ofdev->dev.of_node;
+       struct device_node *np = dev->of_node;
        struct resource res;
        struct nand_chip *chip;
        int err = 0;
@@ -107,13 +108,11 @@ static int pasemi_nand_probe(struct platform_device *ofdev)
        if (pasemi_nand_mtd)
                return -ENODEV;
 
-       pr_debug("pasemi_nand at %pR\n", &res);
+       dev_dbg(dev, "pasemi_nand at %pR\n", &res);
 
        /* Allocate memory for MTD device structure and private data */
        chip = kzalloc(sizeof(struct nand_chip), GFP_KERNEL);
        if (!chip) {
-               printk(KERN_WARNING
-                      "Unable to allocate PASEMI NAND MTD device structure\n");
                err = -ENOMEM;
                goto out;
        }
@@ -121,7 +120,7 @@ static int pasemi_nand_probe(struct platform_device *ofdev)
        pasemi_nand_mtd = nand_to_mtd(chip);
 
        /* Link the private data with the MTD structure */
-       pasemi_nand_mtd->dev.parent = &ofdev->dev;
+       pasemi_nand_mtd->dev.parent = dev;
 
        chip->IO_ADDR_R = of_iomap(np, 0);
        chip->IO_ADDR_W = chip->IO_ADDR_R;
@@ -151,6 +150,7 @@ static int pasemi_nand_probe(struct platform_device *ofdev)
        chip->write_buf = pasemi_write_buf;
        chip->chip_delay = 0;
        chip->ecc.mode = NAND_ECC_SOFT;
+       chip->ecc.algo = NAND_ECC_HAMMING;
 
        /* Enable the following for a flash based bad block table */
        chip->bbt_options = NAND_BBT_USE_FLASH;
@@ -162,13 +162,13 @@ static int pasemi_nand_probe(struct platform_device *ofdev)
        }
 
        if (mtd_device_register(pasemi_nand_mtd, NULL, 0)) {
-               printk(KERN_ERR "pasemi_nand: Unable to register MTD device\n");
+               dev_err(dev, "Unable to register MTD device\n");
                err = -ENODEV;
                goto out_lpc;
        }
 
-       printk(KERN_INFO "PA Semi NAND flash at %08llx, control at I/O %x\n",
-              res.start, lpcctl);
+       dev_info(dev, "PA Semi NAND flash at %pR, control at I/O %x\n", &res,
+                lpcctl);
 
        return 0;
 
index e4e50da..415a53a 100644 (file)
@@ -74,6 +74,7 @@ static int plat_nand_probe(struct platform_device *pdev)
 
        data->chip.ecc.hwctl = pdata->ctrl.hwcontrol;
        data->chip.ecc.mode = NAND_ECC_SOFT;
+       data->chip.ecc.algo = NAND_ECC_HAMMING;
 
        platform_set_drvdata(pdev, data);
 
index d650885..436dd6d 100644 (file)
@@ -29,7 +29,6 @@
 #include <linux/slab.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
-#include <linux/of_mtd.h>
 #include <linux/platform_data/mtd-nand-pxa3xx.h>
 
 #define        CHIP_DELAY_TIMEOUT      msecs_to_jiffies(200)
@@ -324,6 +323,62 @@ static struct pxa3xx_nand_flash builtin_flash_types[] = {
        { 0xba20, 16, 16, &timing[3] },
 };
 
+static int pxa3xx_ooblayout_ecc(struct mtd_info *mtd, int section,
+                               struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct pxa3xx_nand_host *host = nand_get_controller_data(chip);
+       struct pxa3xx_nand_info *info = host->info_data;
+       int nchunks = mtd->writesize / info->chunk_size;
+
+       if (section >= nchunks)
+               return -ERANGE;
+
+       oobregion->offset = ((info->ecc_size + info->spare_size) * section) +
+                           info->spare_size;
+       oobregion->length = info->ecc_size;
+
+       return 0;
+}
+
+static int pxa3xx_ooblayout_free(struct mtd_info *mtd, int section,
+                                struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct pxa3xx_nand_host *host = nand_get_controller_data(chip);
+       struct pxa3xx_nand_info *info = host->info_data;
+       int nchunks = mtd->writesize / info->chunk_size;
+
+       if (section >= nchunks)
+               return -ERANGE;
+
+       if (!info->spare_size)
+               return 0;
+
+       oobregion->offset = section * (info->ecc_size + info->spare_size);
+       oobregion->length = info->spare_size;
+       if (!section) {
+               /*
+                * Bootrom looks in bytes 0 & 5 for bad blocks for the
+                * 4KB page / 4bit BCH combination.
+                */
+               if (mtd->writesize == 4096 && info->chunk_size == 2048) {
+                       oobregion->offset += 6;
+                       oobregion->length -= 6;
+               } else {
+                       oobregion->offset += 2;
+                       oobregion->length -= 2;
+               }
+       }
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops pxa3xx_ooblayout_ops = {
+       .ecc = pxa3xx_ooblayout_ecc,
+       .free = pxa3xx_ooblayout_free,
+};
+
 static u8 bbt_pattern[] = {'M', 'V', 'B', 'b', 't', '0' };
 static u8 bbt_mirror_pattern[] = {'1', 't', 'b', 'B', 'V', 'M' };
 
@@ -347,41 +402,6 @@ static struct nand_bbt_descr bbt_mirror_descr = {
        .pattern = bbt_mirror_pattern
 };
 
-static struct nand_ecclayout ecc_layout_2KB_bch4bit = {
-       .eccbytes = 32,
-       .eccpos = {
-               32, 33, 34, 35, 36, 37, 38, 39,
-               40, 41, 42, 43, 44, 45, 46, 47,
-               48, 49, 50, 51, 52, 53, 54, 55,
-               56, 57, 58, 59, 60, 61, 62, 63},
-       .oobfree = { {2, 30} }
-};
-
-static struct nand_ecclayout ecc_layout_4KB_bch4bit = {
-       .eccbytes = 64,
-       .eccpos = {
-               32,  33,  34,  35,  36,  37,  38,  39,
-               40,  41,  42,  43,  44,  45,  46,  47,
-               48,  49,  50,  51,  52,  53,  54,  55,
-               56,  57,  58,  59,  60,  61,  62,  63,
-               96,  97,  98,  99,  100, 101, 102, 103,
-               104, 105, 106, 107, 108, 109, 110, 111,
-               112, 113, 114, 115, 116, 117, 118, 119,
-               120, 121, 122, 123, 124, 125, 126, 127},
-       /* Bootrom looks in bytes 0 & 5 for bad blocks */
-       .oobfree = { {6, 26}, { 64, 32} }
-};
-
-static struct nand_ecclayout ecc_layout_4KB_bch8bit = {
-       .eccbytes = 128,
-       .eccpos = {
-               32,  33,  34,  35,  36,  37,  38,  39,
-               40,  41,  42,  43,  44,  45,  46,  47,
-               48,  49,  50,  51,  52,  53,  54,  55,
-               56,  57,  58,  59,  60,  61,  62,  63},
-       .oobfree = { }
-};
-
 #define NDTR0_tCH(c)   (min((c), 7) << 19)
 #define NDTR0_tCS(c)   (min((c), 7) << 16)
 #define NDTR0_tWH(c)   (min((c), 7) << 11)
@@ -1546,9 +1566,12 @@ static void pxa3xx_nand_free_buff(struct pxa3xx_nand_info *info)
 }
 
 static int pxa_ecc_init(struct pxa3xx_nand_info *info,
-                       struct nand_ecc_ctrl *ecc,
+                       struct mtd_info *mtd,
                        int strength, int ecc_stepsize, int page_size)
 {
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct nand_ecc_ctrl *ecc = &chip->ecc;
+
        if (strength == 1 && ecc_stepsize == 512 && page_size == 2048) {
                info->nfullchunks = 1;
                info->ntotalchunks = 1;
@@ -1582,7 +1605,7 @@ static int pxa_ecc_init(struct pxa3xx_nand_info *info,
                info->ecc_size = 32;
                ecc->mode = NAND_ECC_HW;
                ecc->size = info->chunk_size;
-               ecc->layout = &ecc_layout_2KB_bch4bit;
+               mtd_set_ooblayout(mtd, &pxa3xx_ooblayout_ops);
                ecc->strength = 16;
 
        } else if (strength == 4 && ecc_stepsize == 512 && page_size == 4096) {
@@ -1594,7 +1617,7 @@ static int pxa_ecc_init(struct pxa3xx_nand_info *info,
                info->ecc_size = 32;
                ecc->mode = NAND_ECC_HW;
                ecc->size = info->chunk_size;
-               ecc->layout = &ecc_layout_4KB_bch4bit;
+               mtd_set_ooblayout(mtd, &pxa3xx_ooblayout_ops);
                ecc->strength = 16;
 
        /*
@@ -1612,7 +1635,7 @@ static int pxa_ecc_init(struct pxa3xx_nand_info *info,
                info->ecc_size = 32;
                ecc->mode = NAND_ECC_HW;
                ecc->size = info->chunk_size;
-               ecc->layout = &ecc_layout_4KB_bch8bit;
+               mtd_set_ooblayout(mtd, &pxa3xx_ooblayout_ops);
                ecc->strength = 16;
        } else {
                dev_err(&info->pdev->dev,
@@ -1651,6 +1674,12 @@ static int pxa3xx_nand_scan(struct mtd_info *mtd)
        if (info->variant == PXA3XX_NAND_VARIANT_ARMADA370)
                nand_writel(info, NDECCCTRL, 0x0);
 
+       if (pdata->flash_bbt)
+               chip->bbt_options |= NAND_BBT_USE_FLASH;
+
+       chip->ecc.strength = pdata->ecc_strength;
+       chip->ecc.size = pdata->ecc_step_size;
+
        if (nand_scan_ident(mtd, 1, NULL))
                return -ENODEV;
 
@@ -1663,13 +1692,12 @@ static int pxa3xx_nand_scan(struct mtd_info *mtd)
                }
        }
 
-       if (pdata->flash_bbt) {
+       if (chip->bbt_options & NAND_BBT_USE_FLASH) {
                /*
                 * We'll use a bad block table stored in-flash and don't
                 * allow writing the bad block marker to the flash.
                 */
-               chip->bbt_options |= NAND_BBT_USE_FLASH |
-                                    NAND_BBT_NO_OOB_BBM;
+               chip->bbt_options |= NAND_BBT_NO_OOB_BBM;
                chip->bbt_td = &bbt_main_descr;
                chip->bbt_md = &bbt_mirror_descr;
        }
@@ -1689,10 +1717,9 @@ static int pxa3xx_nand_scan(struct mtd_info *mtd)
                }
        }
 
-       if (pdata->ecc_strength && pdata->ecc_step_size) {
-               ecc_strength = pdata->ecc_strength;
-               ecc_step = pdata->ecc_step_size;
-       } else {
+       ecc_strength = chip->ecc.strength;
+       ecc_step = chip->ecc.size;
+       if (!ecc_strength || !ecc_step) {
                ecc_strength = chip->ecc_strength_ds;
                ecc_step = chip->ecc_step_ds;
        }
@@ -1703,7 +1730,7 @@ static int pxa3xx_nand_scan(struct mtd_info *mtd)
                ecc_step = 512;
        }
 
-       ret = pxa_ecc_init(info, &chip->ecc, ecc_strength,
+       ret = pxa_ecc_init(info, mtd, ecc_strength,
                           ecc_step, mtd->writesize);
        if (ret)
                return ret;
@@ -1903,15 +1930,6 @@ static int pxa3xx_nand_probe_dt(struct platform_device *pdev)
        if (of_get_property(np, "marvell,nand-keep-config", NULL))
                pdata->keep_config = 1;
        of_property_read_u32(np, "num-cs", &pdata->num_cs);
-       pdata->flash_bbt = of_get_nand_on_flash_bbt(np);
-
-       pdata->ecc_strength = of_get_nand_ecc_strength(np);
-       if (pdata->ecc_strength < 0)
-               pdata->ecc_strength = 0;
-
-       pdata->ecc_step_size = of_get_nand_ecc_step_size(np);
-       if (pdata->ecc_step_size < 0)
-               pdata->ecc_step_size = 0;
 
        pdev->dev.platform_data = pdata;
 
index f550a57..de7d28e 100644 (file)
@@ -21,7 +21,6 @@
 #include <linux/mtd/partitions.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
-#include <linux/of_mtd.h>
 #include <linux/delay.h>
 
 /* NANDc reg offsets */
@@ -1437,7 +1436,6 @@ static int qcom_nandc_write_oob(struct mtd_info *mtd, struct nand_chip *chip,
        struct qcom_nand_controller *nandc = get_qcom_nand_controller(chip);
        struct nand_ecc_ctrl *ecc = &chip->ecc;
        u8 *oob = chip->oob_poi;
-       int free_boff;
        int data_size, oob_size;
        int ret, status = 0;
 
@@ -1451,12 +1449,11 @@ static int qcom_nandc_write_oob(struct mtd_info *mtd, struct nand_chip *chip,
 
        /* calculate the data and oob size for the last codeword/step */
        data_size = ecc->size - ((ecc->steps - 1) << 2);
-       oob_size = ecc->steps << 2;
-
-       free_boff = ecc->layout->oobfree[0].offset;
+       oob_size = mtd->oobavail;
 
        /* override new oob content to last codeword */
-       memcpy(nandc->data_buffer + data_size, oob + free_boff, oob_size);
+       mtd_ooblayout_get_databytes(mtd, nandc->data_buffer + data_size, oob,
+                                   0, mtd->oobavail);
 
        set_address(host, host->cw_size * (ecc->steps - 1), page);
        update_rw_regs(host, 1, false);
@@ -1710,61 +1707,52 @@ static void qcom_nandc_select_chip(struct mtd_info *mtd, int chipnr)
  * This layout is read as is when ECC is disabled. When ECC is enabled, the
  * inaccessible Bad Block byte(s) are ignored when we write to a page/oob,
  * and assumed as 0xffs when we read a page/oob. The ECC, unused and
- * dummy/real bad block bytes are grouped as ecc bytes in nand_ecclayout (i.e,
- * ecc->bytes is the sum of the three).
+ * dummy/real bad block bytes are grouped as ecc bytes (i.e, ecc->bytes is
+ * the sum of the three).
  */
-
-static struct nand_ecclayout *
-qcom_nand_create_layout(struct qcom_nand_host *host)
+static int qcom_nand_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                  struct mtd_oob_region *oobregion)
 {
-       struct nand_chip *chip = &host->chip;
-       struct mtd_info *mtd = nand_to_mtd(chip);
-       struct qcom_nand_controller *nandc = get_qcom_nand_controller(chip);
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct qcom_nand_host *host = to_qcom_nand_host(chip);
        struct nand_ecc_ctrl *ecc = &chip->ecc;
-       struct nand_ecclayout *layout;
-       int i, j, steps, pos = 0, shift = 0;
 
-       layout = devm_kzalloc(nandc->dev, sizeof(*layout), GFP_KERNEL);
-       if (!layout)
-               return NULL;
-
-       steps = mtd->writesize / ecc->size;
-       layout->eccbytes = steps * ecc->bytes;
+       if (section > 1)
+               return -ERANGE;
 
-       layout->oobfree[0].offset = (steps - 1) * ecc->bytes + host->bbm_size;
-       layout->oobfree[0].length = steps << 2;
-
-       /*
-        * the oob bytes in the first n - 1 codewords are all grouped together
-        * in the format:
-        * DUMMY_BBM + UNUSED + ECC
-        */
-       for (i = 0; i < steps - 1; i++) {
-               for (j = 0; j < ecc->bytes; j++)
-                       layout->eccpos[pos++] = i * ecc->bytes + j;
+       if (!section) {
+               oobregion->length = (ecc->bytes * (ecc->steps - 1)) +
+                                   host->bbm_size;
+               oobregion->offset = 0;
+       } else {
+               oobregion->length = host->ecc_bytes_hw + host->spare_bytes;
+               oobregion->offset = mtd->oobsize - oobregion->length;
        }
 
-       /*
-        * the oob bytes in the last codeword are grouped in the format:
-        * BBM + FREE OOB + UNUSED + ECC
-        */
+       return 0;
+}
 
-       /* fill up the bbm positions */
-       for (j = 0; j < host->bbm_size; j++)
-               layout->eccpos[pos++] = i * ecc->bytes + j;
+static int qcom_nand_ooblayout_free(struct mtd_info *mtd, int section,
+                                    struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct qcom_nand_host *host = to_qcom_nand_host(chip);
+       struct nand_ecc_ctrl *ecc = &chip->ecc;
 
-       /*
-        * fill up the ecc and reserved positions, their indices are offseted
-        * by the free oob region
-        */
-       shift = layout->oobfree[0].length + host->bbm_size;
+       if (section)
+               return -ERANGE;
 
-       for (j = 0; j < (host->ecc_bytes_hw + host->spare_bytes); j++)
-               layout->eccpos[pos++] = i * ecc->bytes + shift + j;
+       oobregion->length = ecc->steps * 4;
+       oobregion->offset = ((ecc->steps - 1) * ecc->bytes) + host->bbm_size;
 
-       return layout;
+       return 0;
 }
 
+static const struct mtd_ooblayout_ops qcom_nand_ooblayout_ops = {
+       .ecc = qcom_nand_ooblayout_ecc,
+       .free = qcom_nand_ooblayout_free,
+};
+
 static int qcom_nand_host_setup(struct qcom_nand_host *host)
 {
        struct nand_chip *chip = &host->chip;
@@ -1851,9 +1839,7 @@ static int qcom_nand_host_setup(struct qcom_nand_host *host)
 
        ecc->mode = NAND_ECC_HW;
 
-       ecc->layout = qcom_nand_create_layout(host);
-       if (!ecc->layout)
-               return -ENOMEM;
+       mtd_set_ooblayout(mtd, &qcom_nand_ooblayout_ops);
 
        cwperpage = mtd->writesize / ecc->size;
 
index 9c9397b..d9309cf 100644 (file)
 
 /* new oob placement block for use with hardware ecc generation
  */
+static int s3c2410_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                struct mtd_oob_region *oobregion)
+{
+       if (section)
+               return -ERANGE;
+
+       oobregion->offset = 0;
+       oobregion->length = 3;
+
+       return 0;
+}
+
+static int s3c2410_ooblayout_free(struct mtd_info *mtd, int section,
+                                 struct mtd_oob_region *oobregion)
+{
+       if (section)
+               return -ERANGE;
+
+       oobregion->offset = 8;
+       oobregion->length = 8;
+
+       return 0;
+}
 
-static struct nand_ecclayout nand_hw_eccoob = {
-       .eccbytes = 3,
-       .eccpos = {0, 1, 2},
-       .oobfree = {{8, 8}}
+static const struct mtd_ooblayout_ops s3c2410_ooblayout_ops = {
+       .ecc = s3c2410_ooblayout_ecc,
+       .free = s3c2410_ooblayout_free,
 };
 
 /* controller and mtd information */
@@ -542,7 +564,8 @@ static int s3c2410_nand_correct_data(struct mtd_info *mtd, u_char *dat,
        diff0 |= (diff1 << 8);
        diff0 |= (diff2 << 16);
 
-       if ((diff0 & ~(1<<fls(diff0))) == 0)
+       /* equal to "(diff0 & ~(1 << __ffs(diff0)))" */
+       if ((diff0 & (diff0 - 1)) == 0)
                return 1;
 
        return -1;
@@ -859,6 +882,7 @@ static void s3c2410_nand_init_chip(struct s3c2410_nand_info *info,
        }
 #else
        chip->ecc.mode      = NAND_ECC_SOFT;
+       chip->ecc.algo  = NAND_ECC_HAMMING;
 #endif
 
        if (set->disable_ecc)
@@ -919,7 +943,7 @@ static void s3c2410_nand_update_chip(struct s3c2410_nand_info *info,
        } else {
                chip->ecc.size      = 512;
                chip->ecc.bytes     = 3;
-               chip->ecc.layout    = &nand_hw_eccoob;
+               mtd_set_ooblayout(nand_to_mtd(chip), &s3c2410_ooblayout_ops);
        }
 }
 
index 4814402..6fa3bcd 100644 (file)
@@ -31,7 +31,6 @@
 #include <linux/io.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
-#include <linux/of_mtd.h>
 #include <linux/platform_device.h>
 #include <linux/pm_runtime.h>
 #include <linux/sh_dma.h>
 #include <linux/mtd/partitions.h>
 #include <linux/mtd/sh_flctl.h>
 
-static struct nand_ecclayout flctl_4secc_oob_16 = {
-       .eccbytes = 10,
-       .eccpos = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
-       .oobfree = {
-               {.offset = 12,
-               . length = 4} },
+static int flctl_4secc_ooblayout_sp_ecc(struct mtd_info *mtd, int section,
+                                       struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+
+       if (section)
+               return -ERANGE;
+
+       oobregion->offset = 0;
+       oobregion->length = chip->ecc.bytes;
+
+       return 0;
+}
+
+static int flctl_4secc_ooblayout_sp_free(struct mtd_info *mtd, int section,
+                                        struct mtd_oob_region *oobregion)
+{
+       if (section)
+               return -ERANGE;
+
+       oobregion->offset = 12;
+       oobregion->length = 4;
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops flctl_4secc_oob_smallpage_ops = {
+       .ecc = flctl_4secc_ooblayout_sp_ecc,
+       .free = flctl_4secc_ooblayout_sp_free,
 };
 
-static struct nand_ecclayout flctl_4secc_oob_64 = {
-       .eccbytes = 4 * 10,
-       .eccpos = {
-                6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-               22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-               38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-               54, 55, 56, 57, 58, 59, 60, 61, 62, 63 },
-       .oobfree = {
-               {.offset =  2, .length = 4},
-               {.offset = 16, .length = 6},
-               {.offset = 32, .length = 6},
-               {.offset = 48, .length = 6} },
+static int flctl_4secc_ooblayout_lp_ecc(struct mtd_info *mtd, int section,
+                                       struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+
+       if (section >= chip->ecc.steps)
+               return -ERANGE;
+
+       oobregion->offset = (section * 16) + 6;
+       oobregion->length = chip->ecc.bytes;
+
+       return 0;
+}
+
+static int flctl_4secc_ooblayout_lp_free(struct mtd_info *mtd, int section,
+                                        struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+
+       if (section >= chip->ecc.steps)
+               return -ERANGE;
+
+       oobregion->offset = section * 16;
+       oobregion->length = 6;
+
+       if (!section) {
+               oobregion->offset += 2;
+               oobregion->length -= 2;
+       }
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops flctl_4secc_oob_largepage_ops = {
+       .ecc = flctl_4secc_ooblayout_lp_ecc,
+       .free = flctl_4secc_ooblayout_lp_free,
 };
 
 static uint8_t scan_ff_pattern[] = { 0xff, 0xff };
@@ -987,10 +1033,10 @@ static int flctl_chip_init_tail(struct mtd_info *mtd)
 
        if (flctl->hwecc) {
                if (mtd->writesize == 512) {
-                       chip->ecc.layout = &flctl_4secc_oob_16;
+                       mtd_set_ooblayout(mtd, &flctl_4secc_oob_smallpage_ops);
                        chip->badblock_pattern = &flctl_4secc_smallpage;
                } else {
-                       chip->ecc.layout = &flctl_4secc_oob_64;
+                       mtd_set_ooblayout(mtd, &flctl_4secc_oob_largepage_ops);
                        chip->badblock_pattern = &flctl_4secc_largepage;
                }
 
@@ -1005,6 +1051,7 @@ static int flctl_chip_init_tail(struct mtd_info *mtd)
                flctl->flcmncr_base |= _4ECCEN;
        } else {
                chip->ecc.mode = NAND_ECC_SOFT;
+               chip->ecc.algo = NAND_ECC_HAMMING;
        }
 
        return 0;
@@ -1044,8 +1091,6 @@ static struct sh_flctl_platform_data *flctl_parse_dt(struct device *dev)
        const struct of_device_id *match;
        struct flctl_soc_config *config;
        struct sh_flctl_platform_data *pdata;
-       struct device_node *dn = dev->of_node;
-       int ret;
 
        match = of_match_device(of_flctl_match, dev);
        if (match)
@@ -1065,15 +1110,6 @@ static struct sh_flctl_platform_data *flctl_parse_dt(struct device *dev)
        pdata->has_hwecc = config->has_hwecc;
        pdata->use_holden = config->use_holden;
 
-       /* parse user defined options */
-       ret = of_get_nand_bus_width(dn);
-       if (ret == 16)
-               pdata->flcmncr_val |= SEL_16BIT;
-       else if (ret != 8) {
-               dev_err(dev, "%s: invalid bus width\n", __func__);
-               return NULL;
-       }
-
        return pdata;
 }
 
@@ -1136,15 +1172,14 @@ static int flctl_probe(struct platform_device *pdev)
        nand->chip_delay = 20;
 
        nand->read_byte = flctl_read_byte;
+       nand->read_word = flctl_read_word;
        nand->write_buf = flctl_write_buf;
        nand->read_buf = flctl_read_buf;
        nand->select_chip = flctl_select_chip;
        nand->cmdfunc = flctl_cmdfunc;
 
-       if (pdata->flcmncr_val & SEL_16BIT) {
+       if (pdata->flcmncr_val & SEL_16BIT)
                nand->options |= NAND_BUSWIDTH_16;
-               nand->read_word = flctl_read_word;
-       }
 
        pm_runtime_enable(&pdev->dev);
        pm_runtime_resume(&pdev->dev);
@@ -1155,6 +1190,16 @@ static int flctl_probe(struct platform_device *pdev)
        if (ret)
                goto err_chip;
 
+       if (nand->options & NAND_BUSWIDTH_16) {
+               /*
+                * NAND_BUSWIDTH_16 may have been set by nand_scan_ident().
+                * Add the SEL_16BIT flag in pdata->flcmncr_val and re-assign
+                * flctl->flcmncr_base to pdata->flcmncr_val.
+                */
+               pdata->flcmncr_val |= SEL_16BIT;
+               flctl->flcmncr_base = pdata->flcmncr_val;
+       }
+
        ret = flctl_chip_init_tail(flctl_mtd);
        if (ret)
                goto err_chip;
index b7d1b55..064ca17 100644 (file)
@@ -148,6 +148,7 @@ static int sharpsl_nand_probe(struct platform_device *pdev)
        /* Link the private data with the MTD structure */
        mtd = nand_to_mtd(this);
        mtd->dev.parent = &pdev->dev;
+       mtd_set_ooblayout(mtd, data->ecc_layout);
 
        platform_set_drvdata(pdev, sharpsl);
 
@@ -170,7 +171,6 @@ static int sharpsl_nand_probe(struct platform_device *pdev)
        this->ecc.bytes = 3;
        this->ecc.strength = 1;
        this->badblock_pattern = data->badblock_pattern;
-       this->ecc.layout = data->ecc_layout;
        this->ecc.hwctl = sharpsl_nand_enable_hwecc;
        this->ecc.calculate = sharpsl_nand_calculate_ecc;
        this->ecc.correct = nand_correct_data;
index c514740..5939dff 100644 (file)
 #include <linux/sizes.h>
 #include "sm_common.h"
 
-static struct nand_ecclayout nand_oob_sm = {
-       .eccbytes = 6,
-       .eccpos = {8, 9, 10, 13, 14, 15},
-       .oobfree = {
-               {.offset = 0 , .length = 4}, /* reserved */
-               {.offset = 6 , .length = 2}, /* LBA1 */
-               {.offset = 11, .length = 2}  /* LBA2 */
+static int oob_sm_ooblayout_ecc(struct mtd_info *mtd, int section,
+                               struct mtd_oob_region *oobregion)
+{
+       if (section > 1)
+               return -ERANGE;
+
+       oobregion->length = 3;
+       oobregion->offset = ((section + 1) * 8) - 3;
+
+       return 0;
+}
+
+static int oob_sm_ooblayout_free(struct mtd_info *mtd, int section,
+                                struct mtd_oob_region *oobregion)
+{
+       switch (section) {
+       case 0:
+               /* reserved */
+               oobregion->offset = 0;
+               oobregion->length = 4;
+               break;
+       case 1:
+               /* LBA1 */
+               oobregion->offset = 6;
+               oobregion->length = 2;
+               break;
+       case 2:
+               /* LBA2 */
+               oobregion->offset = 11;
+               oobregion->length = 2;
+               break;
+       default:
+               return -ERANGE;
        }
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops oob_sm_ops = {
+       .ecc = oob_sm_ooblayout_ecc,
+       .free = oob_sm_ooblayout_free,
 };
 
 /* NOTE: This layout is not compatible with SmartMedia, */
@@ -28,15 +61,43 @@ static struct nand_ecclayout nand_oob_sm = {
 /* If you use smftl, it will bypass this and work correctly */
 /* If you do not, then you break SmartMedia compliance anyway */
 
-static struct nand_ecclayout nand_oob_sm_small = {
-       .eccbytes = 3,
-       .eccpos = {0, 1, 2},
-       .oobfree = {
-               {.offset = 3 , .length = 2}, /* reserved */
-               {.offset = 6 , .length = 2}, /* LBA1 */
+static int oob_sm_small_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                     struct mtd_oob_region *oobregion)
+{
+       if (section)
+               return -ERANGE;
+
+       oobregion->length = 3;
+       oobregion->offset = 0;
+
+       return 0;
+}
+
+static int oob_sm_small_ooblayout_free(struct mtd_info *mtd, int section,
+                                      struct mtd_oob_region *oobregion)
+{
+       switch (section) {
+       case 0:
+               /* reserved */
+               oobregion->offset = 3;
+               oobregion->length = 2;
+               break;
+       case 1:
+               /* LBA1 */
+               oobregion->offset = 6;
+               oobregion->length = 2;
+               break;
+       default:
+               return -ERANGE;
        }
-};
 
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops oob_sm_small_ops = {
+       .ecc = oob_sm_small_ooblayout_ecc,
+       .free = oob_sm_small_ooblayout_free,
+};
 
 static int sm_block_markbad(struct mtd_info *mtd, loff_t ofs)
 {
@@ -121,9 +182,9 @@ int sm_register_device(struct mtd_info *mtd, int smartmedia)
 
        /* ECC layout */
        if (mtd->writesize == SM_SECTOR_SIZE)
-               chip->ecc.layout = &nand_oob_sm;
+               mtd_set_ooblayout(mtd, &oob_sm_ops);
        else if (mtd->writesize == SM_SMALL_PAGE)
-               chip->ecc.layout = &nand_oob_sm_small;
+               mtd_set_ooblayout(mtd, &oob_sm_small_ops);
        else
                return -ENODEV;
 
index e3305f9..888fd31 100644 (file)
@@ -180,6 +180,7 @@ static int socrates_nand_probe(struct platform_device *ofdev)
        nand_chip->dev_ready = socrates_nand_device_ready;
 
        nand_chip->ecc.mode = NAND_ECC_SOFT;    /* enable ECC */
+       nand_chip->ecc.algo = NAND_ECC_HAMMING;
 
        /* TODO: I have no idea what real delay is. */
        nand_chip->chip_delay = 20;             /* 20us command delay time */
index 1c03eee..a83a690 100644 (file)
@@ -30,7 +30,6 @@
 #include <linux/of.h>
 #include <linux/of_device.h>
 #include <linux/of_gpio.h>
-#include <linux/of_mtd.h>
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/nand.h>
 #include <linux/mtd/partitions.h>
@@ -39,7 +38,7 @@
 #include <linux/dmaengine.h>
 #include <linux/gpio.h>
 #include <linux/interrupt.h>
-#include <linux/io.h>
+#include <linux/iopoll.h>
 
 #define NFC_REG_CTL            0x0000
 #define NFC_REG_ST             0x0004
 /* define bit use in NFC_ECC_ST */
 #define NFC_ECC_ERR(x)         BIT(x)
 #define NFC_ECC_PAT_FOUND(x)   BIT(x + 16)
-#define NFC_ECC_ERR_CNT(b, x)  (((x) >> ((b) * 8)) & 0xff)
+#define NFC_ECC_ERR_CNT(b, x)  (((x) >> (((b) % 4) * 8)) & 0xff)
 
 #define NFC_DEFAULT_TIMEOUT_MS 1000
 
@@ -212,12 +211,9 @@ struct sunxi_nand_chip_sel {
  * sunxi HW ECC infos: stores information related to HW ECC support
  *
  * @mode:      the sunxi ECC mode field deduced from ECC requirements
- * @layout:    the OOB layout depending on the ECC requirements and the
- *             selected ECC mode
  */
 struct sunxi_nand_hw_ecc {
        int mode;
-       struct nand_ecclayout layout;
 };
 
 /*
@@ -239,6 +235,10 @@ struct sunxi_nand_chip {
        u32 timing_cfg;
        u32 timing_ctl;
        int selected;
+       int addr_cycles;
+       u32 addr[2];
+       int cmd_cycles;
+       u8 cmd[2];
        int nsels;
        struct sunxi_nand_chip_sel sels[0];
 };
@@ -298,54 +298,71 @@ static irqreturn_t sunxi_nfc_interrupt(int irq, void *dev_id)
        return IRQ_HANDLED;
 }
 
-static int sunxi_nfc_wait_int(struct sunxi_nfc *nfc, u32 flags,
-                             unsigned int timeout_ms)
+static int sunxi_nfc_wait_events(struct sunxi_nfc *nfc, u32 events,
+                                bool use_polling, unsigned int timeout_ms)
 {
-       init_completion(&nfc->complete);
+       int ret;
 
-       writel(flags, nfc->regs + NFC_REG_INT);
+       if (events & ~NFC_INT_MASK)
+               return -EINVAL;
 
        if (!timeout_ms)
                timeout_ms = NFC_DEFAULT_TIMEOUT_MS;
 
-       if (!wait_for_completion_timeout(&nfc->complete,
-                                        msecs_to_jiffies(timeout_ms))) {
-               dev_err(nfc->dev, "wait interrupt timedout\n");
-               return -ETIMEDOUT;
+       if (!use_polling) {
+               init_completion(&nfc->complete);
+
+               writel(events, nfc->regs + NFC_REG_INT);
+
+               ret = wait_for_completion_timeout(&nfc->complete,
+                                               msecs_to_jiffies(timeout_ms));
+
+               writel(0, nfc->regs + NFC_REG_INT);
+       } else {
+               u32 status;
+
+               ret = readl_poll_timeout(nfc->regs + NFC_REG_ST, status,
+                                        (status & events) == events, 1,
+                                        timeout_ms * 1000);
        }
 
-       return 0;
+       writel(events & NFC_INT_MASK, nfc->regs + NFC_REG_ST);
+
+       if (ret)
+               dev_err(nfc->dev, "wait interrupt timedout\n");
+
+       return ret;
 }
 
 static int sunxi_nfc_wait_cmd_fifo_empty(struct sunxi_nfc *nfc)
 {
-       unsigned long timeout = jiffies +
-                               msecs_to_jiffies(NFC_DEFAULT_TIMEOUT_MS);
+       u32 status;
+       int ret;
 
-       do {
-               if (!(readl(nfc->regs + NFC_REG_ST) & NFC_CMD_FIFO_STATUS))
-                       return 0;
-       } while (time_before(jiffies, timeout));
+       ret = readl_poll_timeout(nfc->regs + NFC_REG_ST, status,
+                                !(status & NFC_CMD_FIFO_STATUS), 1,
+                                NFC_DEFAULT_TIMEOUT_MS * 1000);
+       if (ret)
+               dev_err(nfc->dev, "wait for empty cmd FIFO timedout\n");
 
-       dev_err(nfc->dev, "wait for empty cmd FIFO timedout\n");
-       return -ETIMEDOUT;
+       return ret;
 }
 
 static int sunxi_nfc_rst(struct sunxi_nfc *nfc)
 {
-       unsigned long timeout = jiffies +
-                               msecs_to_jiffies(NFC_DEFAULT_TIMEOUT_MS);
+       u32 ctl;
+       int ret;
 
        writel(0, nfc->regs + NFC_REG_ECC_CTL);
        writel(NFC_RESET, nfc->regs + NFC_REG_CTL);
 
-       do {
-               if (!(readl(nfc->regs + NFC_REG_CTL) & NFC_RESET))
-                       return 0;
-       } while (time_before(jiffies, timeout));
+       ret = readl_poll_timeout(nfc->regs + NFC_REG_CTL, ctl,
+                                !(ctl & NFC_RESET), 1,
+                                NFC_DEFAULT_TIMEOUT_MS * 1000);
+       if (ret)
+               dev_err(nfc->dev, "wait for NAND controller reset timedout\n");
 
-       dev_err(nfc->dev, "wait for NAND controller reset timedout\n");
-       return -ETIMEDOUT;
+       return ret;
 }
 
 static int sunxi_nfc_dev_ready(struct mtd_info *mtd)
@@ -354,7 +371,6 @@ static int sunxi_nfc_dev_ready(struct mtd_info *mtd)
        struct sunxi_nand_chip *sunxi_nand = to_sunxi_nand(nand);
        struct sunxi_nfc *nfc = to_sunxi_nfc(sunxi_nand->nand.controller);
        struct sunxi_nand_rb *rb;
-       unsigned long timeo = (sunxi_nand->nand.state == FL_ERASING ? 400 : 20);
        int ret;
 
        if (sunxi_nand->selected < 0)
@@ -364,12 +380,6 @@ static int sunxi_nfc_dev_ready(struct mtd_info *mtd)
 
        switch (rb->type) {
        case RB_NATIVE:
-               ret = !!(readl(nfc->regs + NFC_REG_ST) &
-                        NFC_RB_STATE(rb->info.nativeid));
-               if (ret)
-                       break;
-
-               sunxi_nfc_wait_int(nfc, NFC_RB_B2R, timeo);
                ret = !!(readl(nfc->regs + NFC_REG_ST) &
                         NFC_RB_STATE(rb->info.nativeid));
                break;
@@ -407,7 +417,7 @@ static void sunxi_nfc_select_chip(struct mtd_info *mtd, int chip)
                sel = &sunxi_nand->sels[chip];
 
                ctl |= NFC_CE_SEL(sel->cs) | NFC_EN |
-                      NFC_PAGE_SHIFT(nand->page_shift - 10);
+                      NFC_PAGE_SHIFT(nand->page_shift);
                if (sel->rb.type == RB_NONE) {
                        nand->dev_ready = NULL;
                } else {
@@ -452,7 +462,7 @@ static void sunxi_nfc_read_buf(struct mtd_info *mtd, uint8_t *buf, int len)
                tmp = NFC_DATA_TRANS | NFC_DATA_SWAP_METHOD;
                writel(tmp, nfc->regs + NFC_REG_CMD);
 
-               ret = sunxi_nfc_wait_int(nfc, NFC_CMD_INT_FLAG, 0);
+               ret = sunxi_nfc_wait_events(nfc, NFC_CMD_INT_FLAG, true, 0);
                if (ret)
                        break;
 
@@ -487,7 +497,7 @@ static void sunxi_nfc_write_buf(struct mtd_info *mtd, const uint8_t *buf,
                      NFC_ACCESS_DIR;
                writel(tmp, nfc->regs + NFC_REG_CMD);
 
-               ret = sunxi_nfc_wait_int(nfc, NFC_CMD_INT_FLAG, 0);
+               ret = sunxi_nfc_wait_events(nfc, NFC_CMD_INT_FLAG, true, 0);
                if (ret)
                        break;
 
@@ -511,32 +521,54 @@ static void sunxi_nfc_cmd_ctrl(struct mtd_info *mtd, int dat,
        struct sunxi_nand_chip *sunxi_nand = to_sunxi_nand(nand);
        struct sunxi_nfc *nfc = to_sunxi_nfc(sunxi_nand->nand.controller);
        int ret;
-       u32 tmp;
 
        ret = sunxi_nfc_wait_cmd_fifo_empty(nfc);
        if (ret)
                return;
 
-       if (ctrl & NAND_CTRL_CHANGE) {
-               tmp = readl(nfc->regs + NFC_REG_CTL);
-               if (ctrl & NAND_NCE)
-                       tmp |= NFC_CE_CTL;
-               else
-                       tmp &= ~NFC_CE_CTL;
-               writel(tmp, nfc->regs + NFC_REG_CTL);
-       }
+       if (dat == NAND_CMD_NONE && (ctrl & NAND_NCE) &&
+           !(ctrl & (NAND_CLE | NAND_ALE))) {
+               u32 cmd = 0;
 
-       if (dat == NAND_CMD_NONE)
-               return;
+               if (!sunxi_nand->addr_cycles && !sunxi_nand->cmd_cycles)
+                       return;
 
-       if (ctrl & NAND_CLE) {
-               writel(NFC_SEND_CMD1 | dat, nfc->regs + NFC_REG_CMD);
-       } else {
-               writel(dat, nfc->regs + NFC_REG_ADDR_LOW);
-               writel(NFC_SEND_ADR, nfc->regs + NFC_REG_CMD);
+               if (sunxi_nand->cmd_cycles--)
+                       cmd |= NFC_SEND_CMD1 | sunxi_nand->cmd[0];
+
+               if (sunxi_nand->cmd_cycles--) {
+                       cmd |= NFC_SEND_CMD2;
+                       writel(sunxi_nand->cmd[1],
+                              nfc->regs + NFC_REG_RCMD_SET);
+               }
+
+               sunxi_nand->cmd_cycles = 0;
+
+               if (sunxi_nand->addr_cycles) {
+                       cmd |= NFC_SEND_ADR |
+                              NFC_ADR_NUM(sunxi_nand->addr_cycles);
+                       writel(sunxi_nand->addr[0],
+                              nfc->regs + NFC_REG_ADDR_LOW);
+               }
+
+               if (sunxi_nand->addr_cycles > 4)
+                       writel(sunxi_nand->addr[1],
+                              nfc->regs + NFC_REG_ADDR_HIGH);
+
+               writel(cmd, nfc->regs + NFC_REG_CMD);
+               sunxi_nand->addr[0] = 0;
+               sunxi_nand->addr[1] = 0;
+               sunxi_nand->addr_cycles = 0;
+               sunxi_nfc_wait_events(nfc, NFC_CMD_INT_FLAG, true, 0);
        }
 
-       sunxi_nfc_wait_int(nfc, NFC_CMD_INT_FLAG, 0);
+       if (ctrl & NAND_CLE) {
+               sunxi_nand->cmd[sunxi_nand->cmd_cycles++] = dat;
+       } else if (ctrl & NAND_ALE) {
+               sunxi_nand->addr[sunxi_nand->addr_cycles / 4] |=
+                               dat << ((sunxi_nand->addr_cycles % 4) * 8);
+               sunxi_nand->addr_cycles++;
+       }
 }
 
 /* These seed values have been extracted from Allwinner's BSP */
@@ -717,7 +749,8 @@ static void sunxi_nfc_hw_ecc_enable(struct mtd_info *mtd)
        ecc_ctl = readl(nfc->regs + NFC_REG_ECC_CTL);
        ecc_ctl &= ~(NFC_ECC_MODE_MSK | NFC_ECC_PIPELINE |
                     NFC_ECC_BLOCK_SIZE_MSK);
-       ecc_ctl |= NFC_ECC_EN | NFC_ECC_MODE(data->mode) | NFC_ECC_EXCEPTION;
+       ecc_ctl |= NFC_ECC_EN | NFC_ECC_MODE(data->mode) | NFC_ECC_EXCEPTION |
+                  NFC_ECC_PIPELINE;
 
        writel(ecc_ctl, nfc->regs + NFC_REG_ECC_CTL);
 }
@@ -739,18 +772,106 @@ static inline void sunxi_nfc_user_data_to_buf(u32 user_data, u8 *buf)
        buf[3] = user_data >> 24;
 }
 
+static inline u32 sunxi_nfc_buf_to_user_data(const u8 *buf)
+{
+       return buf[0] | (buf[1] << 8) | (buf[2] << 16) | (buf[3] << 24);
+}
+
+static void sunxi_nfc_hw_ecc_get_prot_oob_bytes(struct mtd_info *mtd, u8 *oob,
+                                               int step, bool bbm, int page)
+{
+       struct nand_chip *nand = mtd_to_nand(mtd);
+       struct sunxi_nfc *nfc = to_sunxi_nfc(nand->controller);
+
+       sunxi_nfc_user_data_to_buf(readl(nfc->regs + NFC_REG_USER_DATA(step)),
+                                  oob);
+
+       /* De-randomize the Bad Block Marker. */
+       if (bbm && (nand->options & NAND_NEED_SCRAMBLING))
+               sunxi_nfc_randomize_bbm(mtd, page, oob);
+}
+
+static void sunxi_nfc_hw_ecc_set_prot_oob_bytes(struct mtd_info *mtd,
+                                               const u8 *oob, int step,
+                                               bool bbm, int page)
+{
+       struct nand_chip *nand = mtd_to_nand(mtd);
+       struct sunxi_nfc *nfc = to_sunxi_nfc(nand->controller);
+       u8 user_data[4];
+
+       /* Randomize the Bad Block Marker. */
+       if (bbm && (nand->options & NAND_NEED_SCRAMBLING)) {
+               memcpy(user_data, oob, sizeof(user_data));
+               sunxi_nfc_randomize_bbm(mtd, page, user_data);
+               oob = user_data;
+       }
+
+       writel(sunxi_nfc_buf_to_user_data(oob),
+              nfc->regs + NFC_REG_USER_DATA(step));
+}
+
+static void sunxi_nfc_hw_ecc_update_stats(struct mtd_info *mtd,
+                                         unsigned int *max_bitflips, int ret)
+{
+       if (ret < 0) {
+               mtd->ecc_stats.failed++;
+       } else {
+               mtd->ecc_stats.corrected += ret;
+               *max_bitflips = max_t(unsigned int, *max_bitflips, ret);
+       }
+}
+
+static int sunxi_nfc_hw_ecc_correct(struct mtd_info *mtd, u8 *data, u8 *oob,
+                                   int step, bool *erased)
+{
+       struct nand_chip *nand = mtd_to_nand(mtd);
+       struct sunxi_nfc *nfc = to_sunxi_nfc(nand->controller);
+       struct nand_ecc_ctrl *ecc = &nand->ecc;
+       u32 status, tmp;
+
+       *erased = false;
+
+       status = readl(nfc->regs + NFC_REG_ECC_ST);
+
+       if (status & NFC_ECC_ERR(step))
+               return -EBADMSG;
+
+       if (status & NFC_ECC_PAT_FOUND(step)) {
+               u8 pattern;
+
+               if (unlikely(!(readl(nfc->regs + NFC_REG_PAT_ID) & 0x1))) {
+                       pattern = 0x0;
+               } else {
+                       pattern = 0xff;
+                       *erased = true;
+               }
+
+               if (data)
+                       memset(data, pattern, ecc->size);
+
+               if (oob)
+                       memset(oob, pattern, ecc->bytes + 4);
+
+               return 0;
+       }
+
+       tmp = readl(nfc->regs + NFC_REG_ECC_ERR_CNT(step));
+
+       return NFC_ECC_ERR_CNT(step, tmp);
+}
+
 static int sunxi_nfc_hw_ecc_read_chunk(struct mtd_info *mtd,
                                       u8 *data, int data_off,
                                       u8 *oob, int oob_off,
                                       int *cur_off,
                                       unsigned int *max_bitflips,
-                                      bool bbm, int page)
+                                      bool bbm, bool oob_required, int page)
 {
        struct nand_chip *nand = mtd_to_nand(mtd);
        struct sunxi_nfc *nfc = to_sunxi_nfc(nand->controller);
        struct nand_ecc_ctrl *ecc = &nand->ecc;
        int raw_mode = 0;
-       u32 status;
+       bool erased;
        int ret;
 
        if (*cur_off != data_off)
@@ -769,34 +890,19 @@ static int sunxi_nfc_hw_ecc_read_chunk(struct mtd_info *mtd,
        writel(NFC_DATA_TRANS | NFC_DATA_SWAP_METHOD | NFC_ECC_OP,
               nfc->regs + NFC_REG_CMD);
 
-       ret = sunxi_nfc_wait_int(nfc, NFC_CMD_INT_FLAG, 0);
+       ret = sunxi_nfc_wait_events(nfc, NFC_CMD_INT_FLAG, true, 0);
        sunxi_nfc_randomizer_disable(mtd);
        if (ret)
                return ret;
 
        *cur_off = oob_off + ecc->bytes + 4;
 
-       status = readl(nfc->regs + NFC_REG_ECC_ST);
-       if (status & NFC_ECC_PAT_FOUND(0)) {
-               u8 pattern = 0xff;
-
-               if (unlikely(!(readl(nfc->regs + NFC_REG_PAT_ID) & 0x1)))
-                       pattern = 0x0;
-
-               memset(data, pattern, ecc->size);
-               memset(oob, pattern, ecc->bytes + 4);
-
+       ret = sunxi_nfc_hw_ecc_correct(mtd, data, oob_required ? oob : NULL, 0,
+                                      &erased);
+       if (erased)
                return 1;
-       }
-
-       ret = NFC_ECC_ERR_CNT(0, readl(nfc->regs + NFC_REG_ECC_ERR_CNT(0)));
-
-       memcpy_fromio(data, nfc->regs + NFC_RAM0_BASE, ecc->size);
-
-       nand->cmdfunc(mtd, NAND_CMD_RNDOUT, oob_off, -1);
-       sunxi_nfc_randomizer_read_buf(mtd, oob, ecc->bytes + 4, true, page);
 
-       if (status & NFC_ECC_ERR(0)) {
+       if (ret < 0) {
                /*
                 * Re-read the data with the randomizer disabled to identify
                 * bitflips in erased pages.
@@ -804,35 +910,34 @@ static int sunxi_nfc_hw_ecc_read_chunk(struct mtd_info *mtd,
                if (nand->options & NAND_NEED_SCRAMBLING) {
                        nand->cmdfunc(mtd, NAND_CMD_RNDOUT, data_off, -1);
                        nand->read_buf(mtd, data, ecc->size);
-                       nand->cmdfunc(mtd, NAND_CMD_RNDOUT, oob_off, -1);
-                       nand->read_buf(mtd, oob, ecc->bytes + 4);
+               } else {
+                       memcpy_fromio(data, nfc->regs + NFC_RAM0_BASE,
+                                     ecc->size);
                }
 
+               nand->cmdfunc(mtd, NAND_CMD_RNDOUT, oob_off, -1);
+               nand->read_buf(mtd, oob, ecc->bytes + 4);
+
                ret = nand_check_erased_ecc_chunk(data, ecc->size,
                                                  oob, ecc->bytes + 4,
                                                  NULL, 0, ecc->strength);
                if (ret >= 0)
                        raw_mode = 1;
        } else {
-               /*
-                * The engine protects 4 bytes of OOB data per chunk.
-                * Retrieve the corrected OOB bytes.
-                */
-               sunxi_nfc_user_data_to_buf(readl(nfc->regs + NFC_REG_USER_DATA(0)),
-                                          oob);
+               memcpy_fromio(data, nfc->regs + NFC_RAM0_BASE, ecc->size);
 
-               /* De-randomize the Bad Block Marker. */
-               if (bbm && nand->options & NAND_NEED_SCRAMBLING)
-                       sunxi_nfc_randomize_bbm(mtd, page, oob);
-       }
+               if (oob_required) {
+                       nand->cmdfunc(mtd, NAND_CMD_RNDOUT, oob_off, -1);
+                       sunxi_nfc_randomizer_read_buf(mtd, oob, ecc->bytes + 4,
+                                                     true, page);
 
-       if (ret < 0) {
-               mtd->ecc_stats.failed++;
-       } else {
-               mtd->ecc_stats.corrected += ret;
-               *max_bitflips = max_t(unsigned int, *max_bitflips, ret);
+                       sunxi_nfc_hw_ecc_get_prot_oob_bytes(mtd, oob, 0,
+                                                           bbm, page);
+               }
        }
 
+       sunxi_nfc_hw_ecc_update_stats(mtd, max_bitflips, ret);
+
        return raw_mode;
 }
 
@@ -848,7 +953,7 @@ static void sunxi_nfc_hw_ecc_read_extra_oob(struct mtd_info *mtd,
        if (len <= 0)
                return;
 
-       if (*cur_off != offset)
+       if (!cur_off || *cur_off != offset)
                nand->cmdfunc(mtd, NAND_CMD_RNDOUT,
                              offset + mtd->writesize, -1);
 
@@ -858,12 +963,8 @@ static void sunxi_nfc_hw_ecc_read_extra_oob(struct mtd_info *mtd,
                sunxi_nfc_randomizer_read_buf(mtd, oob + offset, len,
                                              false, page);
 
-       *cur_off = mtd->oobsize + mtd->writesize;
-}
-
-static inline u32 sunxi_nfc_buf_to_user_data(const u8 *buf)
-{
-       return buf[0] | (buf[1] << 8) | (buf[2] << 16) | (buf[3] << 24);
+       if (cur_off)
+               *cur_off = mtd->oobsize + mtd->writesize;
 }
 
 static int sunxi_nfc_hw_ecc_write_chunk(struct mtd_info *mtd,
@@ -882,19 +983,6 @@ static int sunxi_nfc_hw_ecc_write_chunk(struct mtd_info *mtd,
 
        sunxi_nfc_randomizer_write_buf(mtd, data, ecc->size, false, page);
 
-       /* Fill OOB data in */
-       if ((nand->options & NAND_NEED_SCRAMBLING) && bbm) {
-               u8 user_data[4];
-
-               memcpy(user_data, oob, 4);
-               sunxi_nfc_randomize_bbm(mtd, page, user_data);
-               writel(sunxi_nfc_buf_to_user_data(user_data),
-                      nfc->regs + NFC_REG_USER_DATA(0));
-       } else {
-               writel(sunxi_nfc_buf_to_user_data(oob),
-                      nfc->regs + NFC_REG_USER_DATA(0));
-       }
-
        if (data_off + ecc->size != oob_off)
                nand->cmdfunc(mtd, NAND_CMD_RNDIN, oob_off, -1);
 
@@ -903,11 +991,13 @@ static int sunxi_nfc_hw_ecc_write_chunk(struct mtd_info *mtd,
                return ret;
 
        sunxi_nfc_randomizer_enable(mtd);
+       sunxi_nfc_hw_ecc_set_prot_oob_bytes(mtd, oob, 0, bbm, page);
+
        writel(NFC_DATA_TRANS | NFC_DATA_SWAP_METHOD |
               NFC_ACCESS_DIR | NFC_ECC_OP,
               nfc->regs + NFC_REG_CMD);
 
-       ret = sunxi_nfc_wait_int(nfc, NFC_CMD_INT_FLAG, 0);
+       ret = sunxi_nfc_wait_events(nfc, NFC_CMD_INT_FLAG, true, 0);
        sunxi_nfc_randomizer_disable(mtd);
        if (ret)
                return ret;
@@ -929,13 +1019,14 @@ static void sunxi_nfc_hw_ecc_write_extra_oob(struct mtd_info *mtd,
        if (len <= 0)
                return;
 
-       if (*cur_off != offset)
+       if (!cur_off || *cur_off != offset)
                nand->cmdfunc(mtd, NAND_CMD_RNDIN,
                              offset + mtd->writesize, -1);
 
        sunxi_nfc_randomizer_write_buf(mtd, oob + offset, len, false, page);
 
-       *cur_off = mtd->oobsize + mtd->writesize;
+       if (cur_off)
+               *cur_off = mtd->oobsize + mtd->writesize;
 }
 
 static int sunxi_nfc_hw_ecc_read_page(struct mtd_info *mtd,
@@ -958,7 +1049,7 @@ static int sunxi_nfc_hw_ecc_read_page(struct mtd_info *mtd,
                ret = sunxi_nfc_hw_ecc_read_chunk(mtd, data, data_off, oob,
                                                  oob_off + mtd->writesize,
                                                  &cur_off, &max_bitflips,
-                                                 !i, page);
+                                                 !i, oob_required, page);
                if (ret < 0)
                        return ret;
                else if (ret)
@@ -974,6 +1065,39 @@ static int sunxi_nfc_hw_ecc_read_page(struct mtd_info *mtd,
        return max_bitflips;
 }
 
+static int sunxi_nfc_hw_ecc_read_subpage(struct mtd_info *mtd,
+                                        struct nand_chip *chip,
+                                        u32 data_offs, u32 readlen,
+                                        u8 *bufpoi, int page)
+{
+       struct nand_ecc_ctrl *ecc = &chip->ecc;
+       int ret, i, cur_off = 0;
+       unsigned int max_bitflips = 0;
+
+       sunxi_nfc_hw_ecc_enable(mtd);
+
+       chip->cmdfunc(mtd, NAND_CMD_READ0, 0, page);
+       for (i = data_offs / ecc->size;
+            i < DIV_ROUND_UP(data_offs + readlen, ecc->size); i++) {
+               int data_off = i * ecc->size;
+               int oob_off = i * (ecc->bytes + 4);
+               u8 *data = bufpoi + data_off;
+               u8 *oob = chip->oob_poi + oob_off;
+
+               ret = sunxi_nfc_hw_ecc_read_chunk(mtd, data, data_off,
+                                                 oob,
+                                                 oob_off + mtd->writesize,
+                                                 &cur_off, &max_bitflips, !i,
+                                                 false, page);
+               if (ret < 0)
+                       return ret;
+       }
+
+       sunxi_nfc_hw_ecc_disable(mtd);
+
+       return max_bitflips;
+}
+
 static int sunxi_nfc_hw_ecc_write_page(struct mtd_info *mtd,
                                       struct nand_chip *chip,
                                       const uint8_t *buf, int oob_required,
@@ -1026,7 +1150,9 @@ static int sunxi_nfc_hw_syndrome_ecc_read_page(struct mtd_info *mtd,
 
                ret = sunxi_nfc_hw_ecc_read_chunk(mtd, data, data_off, oob,
                                                  oob_off, &cur_off,
-                                                 &max_bitflips, !i, page);
+                                                 &max_bitflips, !i,
+                                                 oob_required,
+                                                 page);
                if (ret < 0)
                        return ret;
                else if (ret)
@@ -1074,6 +1200,40 @@ static int sunxi_nfc_hw_syndrome_ecc_write_page(struct mtd_info *mtd,
        return 0;
 }
 
+static int sunxi_nfc_hw_common_ecc_read_oob(struct mtd_info *mtd,
+                                           struct nand_chip *chip,
+                                           int page)
+{
+       chip->cmdfunc(mtd, NAND_CMD_READ0, 0, page);
+
+       chip->pagebuf = -1;
+
+       return chip->ecc.read_page(mtd, chip, chip->buffers->databuf, 1, page);
+}
+
+static int sunxi_nfc_hw_common_ecc_write_oob(struct mtd_info *mtd,
+                                            struct nand_chip *chip,
+                                            int page)
+{
+       int ret, status;
+
+       chip->cmdfunc(mtd, NAND_CMD_SEQIN, 0, page);
+
+       chip->pagebuf = -1;
+
+       memset(chip->buffers->databuf, 0xff, mtd->writesize);
+       ret = chip->ecc.write_page(mtd, chip, chip->buffers->databuf, 1, page);
+       if (ret)
+               return ret;
+
+       /* Send command to program the OOB data */
+       chip->cmdfunc(mtd, NAND_CMD_PAGEPROG, -1, -1);
+
+       status = chip->waitfunc(mtd, chip);
+
+       return status & NAND_STATUS_FAIL ? -EIO : 0;
+}
+
 static const s32 tWB_lut[] = {6, 12, 16, 20};
 static const s32 tRHW_lut[] = {4, 8, 12, 20};
 
@@ -1101,6 +1261,7 @@ static int sunxi_nand_chip_set_timings(struct sunxi_nand_chip *chip,
        struct sunxi_nfc *nfc = to_sunxi_nfc(chip->nand.controller);
        u32 min_clk_period = 0;
        s32 tWB, tADL, tWHR, tRHW, tCAD;
+       long real_clk_rate;
 
        /* T1 <=> tCLS */
        if (timings->tCLS_min > min_clk_period)
@@ -1163,6 +1324,18 @@ static int sunxi_nand_chip_set_timings(struct sunxi_nand_chip *chip,
                min_clk_period = DIV_ROUND_UP(timings->tWC_min, 2);
 
        /* T16 - T19 + tCAD */
+       if (timings->tWB_max > (min_clk_period * 20))
+               min_clk_period = DIV_ROUND_UP(timings->tWB_max, 20);
+
+       if (timings->tADL_min > (min_clk_period * 32))
+               min_clk_period = DIV_ROUND_UP(timings->tADL_min, 32);
+
+       if (timings->tWHR_min > (min_clk_period * 32))
+               min_clk_period = DIV_ROUND_UP(timings->tWHR_min, 32);
+
+       if (timings->tRHW_min > (min_clk_period * 20))
+               min_clk_period = DIV_ROUND_UP(timings->tRHW_min, 20);
+
        tWB  = sunxi_nand_lookup_timing(tWB_lut, timings->tWB_max,
                                        min_clk_period);
        if (tWB < 0) {
@@ -1198,23 +1371,26 @@ static int sunxi_nand_chip_set_timings(struct sunxi_nand_chip *chip,
        /* TODO: A83 has some more bits for CDQSS, CS, CLHZ, CCS, WC */
        chip->timing_cfg = NFC_TIMING_CFG(tWB, tADL, tWHR, tRHW, tCAD);
 
-       /*
-        * ONFI specification 3.1, paragraph 4.15.2 dictates that EDO data
-        * output cycle timings shall be used if the host drives tRC less than
-        * 30 ns.
-        */
-       chip->timing_ctl = (timings->tRC_min < 30000) ? NFC_TIMING_CTL_EDO : 0;
-
        /* Convert min_clk_period from picoseconds to nanoseconds */
        min_clk_period = DIV_ROUND_UP(min_clk_period, 1000);
 
        /*
-        * Convert min_clk_period into a clk frequency, then get the
-        * appropriate rate for the NAND controller IP given this formula
-        * (specified in the datasheet):
-        * nand clk_rate = 2 * min_clk_rate
+        * Unlike what is stated in Allwinner datasheet, the clk_rate should
+        * be set to (1 / min_clk_period), and not (2 / min_clk_period).
+        * This new formula was verified with a scope and validated by
+        * Allwinner engineers.
         */
-       chip->clk_rate = (2 * NSEC_PER_SEC) / min_clk_period;
+       chip->clk_rate = NSEC_PER_SEC / min_clk_period;
+       real_clk_rate = clk_round_rate(nfc->mod_clk, chip->clk_rate);
+
+       /*
+        * ONFI specification 3.1, paragraph 4.15.2 dictates that EDO data
+        * output cycle timings shall be used if the host drives tRC less than
+        * 30 ns.
+        */
+       min_clk_period = NSEC_PER_SEC / real_clk_rate;
+       chip->timing_ctl = ((min_clk_period * 2) < 30) ?
+                          NFC_TIMING_CTL_EDO : 0;
 
        return 0;
 }
@@ -1257,6 +1433,57 @@ static int sunxi_nand_chip_init_timings(struct sunxi_nand_chip *chip,
        return sunxi_nand_chip_set_timings(chip, timings);
 }
 
+static int sunxi_nand_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                   struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *nand = mtd_to_nand(mtd);
+       struct nand_ecc_ctrl *ecc = &nand->ecc;
+
+       if (section >= ecc->steps)
+               return -ERANGE;
+
+       oobregion->offset = section * (ecc->bytes + 4) + 4;
+       oobregion->length = ecc->bytes;
+
+       return 0;
+}
+
+static int sunxi_nand_ooblayout_free(struct mtd_info *mtd, int section,
+                                    struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *nand = mtd_to_nand(mtd);
+       struct nand_ecc_ctrl *ecc = &nand->ecc;
+
+       if (section > ecc->steps)
+               return -ERANGE;
+
+       /*
+        * The first 2 bytes are used for BB markers, hence we
+        * only have 2 bytes available in the first user data
+        * section.
+        */
+       if (!section && ecc->mode == NAND_ECC_HW) {
+               oobregion->offset = 2;
+               oobregion->length = 2;
+
+               return 0;
+       }
+
+       oobregion->offset = section * (ecc->bytes + 4);
+
+       if (section < ecc->steps)
+               oobregion->length = 4;
+       else
+               oobregion->offset = mtd->oobsize - oobregion->offset;
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops sunxi_nand_ooblayout_ops = {
+       .ecc = sunxi_nand_ooblayout_ecc,
+       .free = sunxi_nand_ooblayout_free,
+};
+
 static int sunxi_nand_hw_common_ecc_ctrl_init(struct mtd_info *mtd,
                                              struct nand_ecc_ctrl *ecc,
                                              struct device_node *np)
@@ -1266,7 +1493,6 @@ static int sunxi_nand_hw_common_ecc_ctrl_init(struct mtd_info *mtd,
        struct sunxi_nand_chip *sunxi_nand = to_sunxi_nand(nand);
        struct sunxi_nfc *nfc = to_sunxi_nfc(sunxi_nand->nand.controller);
        struct sunxi_nand_hw_ecc *data;
-       struct nand_ecclayout *layout;
        int nsectors;
        int ret;
        int i;
@@ -1295,7 +1521,6 @@ static int sunxi_nand_hw_common_ecc_ctrl_init(struct mtd_info *mtd,
        /* HW ECC always work with even numbers of ECC bytes */
        ecc->bytes = ALIGN(ecc->bytes, 2);
 
-       layout = &data->layout;
        nsectors = mtd->writesize / ecc->size;
 
        if (mtd->oobsize < ((ecc->bytes + 4) * nsectors)) {
@@ -1303,9 +1528,9 @@ static int sunxi_nand_hw_common_ecc_ctrl_init(struct mtd_info *mtd,
                goto err;
        }
 
-       layout->eccbytes = (ecc->bytes * nsectors);
-
-       ecc->layout = layout;
+       ecc->read_oob = sunxi_nfc_hw_common_ecc_read_oob;
+       ecc->write_oob = sunxi_nfc_hw_common_ecc_write_oob;
+       mtd_set_ooblayout(mtd, &sunxi_nand_ooblayout_ops);
        ecc->priv = data;
 
        return 0;
@@ -1325,9 +1550,6 @@ static int sunxi_nand_hw_ecc_ctrl_init(struct mtd_info *mtd,
                                       struct nand_ecc_ctrl *ecc,
                                       struct device_node *np)
 {
-       struct nand_ecclayout *layout;
-       int nsectors;
-       int i, j;
        int ret;
 
        ret = sunxi_nand_hw_common_ecc_ctrl_init(mtd, ecc, np);
@@ -1336,40 +1558,9 @@ static int sunxi_nand_hw_ecc_ctrl_init(struct mtd_info *mtd,
 
        ecc->read_page = sunxi_nfc_hw_ecc_read_page;
        ecc->write_page = sunxi_nfc_hw_ecc_write_page;
-       layout = ecc->layout;
-       nsectors = mtd->writesize / ecc->size;
-
-       for (i = 0; i < nsectors; i++) {
-               if (i) {
-                       layout->oobfree[i].offset =
-                               layout->oobfree[i - 1].offset +
-                               layout->oobfree[i - 1].length +
-                               ecc->bytes;
-                       layout->oobfree[i].length = 4;
-               } else {
-                       /*
-                        * The first 2 bytes are used for BB markers, hence we
-                        * only have 2 bytes available in the first user data
-                        * section.
-                        */
-                       layout->oobfree[i].length = 2;
-                       layout->oobfree[i].offset = 2;
-               }
-
-               for (j = 0; j < ecc->bytes; j++)
-                       layout->eccpos[(ecc->bytes * i) + j] =
-                                       layout->oobfree[i].offset +
-                                       layout->oobfree[i].length + j;
-       }
-
-       if (mtd->oobsize > (ecc->bytes + 4) * nsectors) {
-               layout->oobfree[nsectors].offset =
-                               layout->oobfree[nsectors - 1].offset +
-                               layout->oobfree[nsectors - 1].length +
-                               ecc->bytes;
-               layout->oobfree[nsectors].length = mtd->oobsize -
-                               ((ecc->bytes + 4) * nsectors);
-       }
+       ecc->read_oob_raw = nand_read_oob_std;
+       ecc->write_oob_raw = nand_write_oob_std;
+       ecc->read_subpage = sunxi_nfc_hw_ecc_read_subpage;
 
        return 0;
 }
@@ -1378,9 +1569,6 @@ static int sunxi_nand_hw_syndrome_ecc_ctrl_init(struct mtd_info *mtd,
                                                struct nand_ecc_ctrl *ecc,
                                                struct device_node *np)
 {
-       struct nand_ecclayout *layout;
-       int nsectors;
-       int i;
        int ret;
 
        ret = sunxi_nand_hw_common_ecc_ctrl_init(mtd, ecc, np);
@@ -1390,15 +1578,8 @@ static int sunxi_nand_hw_syndrome_ecc_ctrl_init(struct mtd_info *mtd,
        ecc->prepad = 4;
        ecc->read_page = sunxi_nfc_hw_syndrome_ecc_read_page;
        ecc->write_page = sunxi_nfc_hw_syndrome_ecc_write_page;
-
-       layout = ecc->layout;
-       nsectors = mtd->writesize / ecc->size;
-
-       for (i = 0; i < (ecc->bytes * nsectors); i++)
-               layout->eccpos[i] = i;
-
-       layout->oobfree[0].length = mtd->oobsize - i;
-       layout->oobfree[0].offset = i;
+       ecc->read_oob_raw = nand_read_oob_syndrome;
+       ecc->write_oob_raw = nand_write_oob_syndrome;
 
        return 0;
 }
@@ -1411,7 +1592,6 @@ static void sunxi_nand_ecc_cleanup(struct nand_ecc_ctrl *ecc)
                sunxi_nand_hw_common_ecc_ctrl_cleanup(ecc);
                break;
        case NAND_ECC_NONE:
-               kfree(ecc->layout);
        default:
                break;
        }
@@ -1432,8 +1612,6 @@ static int sunxi_nand_ecc_init(struct mtd_info *mtd, struct nand_ecc_ctrl *ecc,
                return -EINVAL;
 
        switch (ecc->mode) {
-       case NAND_ECC_SOFT_BCH:
-               break;
        case NAND_ECC_HW:
                ret = sunxi_nand_hw_ecc_ctrl_init(mtd, ecc, np);
                if (ret)
@@ -1445,10 +1623,6 @@ static int sunxi_nand_ecc_init(struct mtd_info *mtd, struct nand_ecc_ctrl *ecc,
                        return ret;
                break;
        case NAND_ECC_NONE:
-               ecc->layout = kzalloc(sizeof(*ecc->layout), GFP_KERNEL);
-               if (!ecc->layout)
-                       return -ENOMEM;
-               ecc->layout->oobfree[0].length = mtd->oobsize;
        case NAND_ECC_SOFT:
                break;
        default:
@@ -1536,21 +1710,6 @@ static int sunxi_nand_chip_init(struct device *dev, struct sunxi_nfc *nfc,
                }
        }
 
-       timings = onfi_async_timing_mode_to_sdr_timings(0);
-       if (IS_ERR(timings)) {
-               ret = PTR_ERR(timings);
-               dev_err(dev,
-                       "could not retrieve timings for ONFI mode 0: %d\n",
-                       ret);
-               return ret;
-       }
-
-       ret = sunxi_nand_chip_set_timings(chip, timings);
-       if (ret) {
-               dev_err(dev, "could not configure chip timings: %d\n", ret);
-               return ret;
-       }
-
        nand = &chip->nand;
        /* Default tR value specified in the ONFI spec (chapter 4.15.1) */
        nand->chip_delay = 200;
@@ -1570,6 +1729,21 @@ static int sunxi_nand_chip_init(struct device *dev, struct sunxi_nfc *nfc,
        mtd = nand_to_mtd(nand);
        mtd->dev.parent = dev;
 
+       timings = onfi_async_timing_mode_to_sdr_timings(0);
+       if (IS_ERR(timings)) {
+               ret = PTR_ERR(timings);
+               dev_err(dev,
+                       "could not retrieve timings for ONFI mode 0: %d\n",
+                       ret);
+               return ret;
+       }
+
+       ret = sunxi_nand_chip_set_timings(chip, timings);
+       if (ret) {
+               dev_err(dev, "could not configure chip timings: %d\n", ret);
+               return ret;
+       }
+
        ret = nand_scan_ident(mtd, nsels, NULL);
        if (ret)
                return ret;
@@ -1580,6 +1754,8 @@ static int sunxi_nand_chip_init(struct device *dev, struct sunxi_nfc *nfc,
        if (nand->options & NAND_NEED_SCRAMBLING)
                nand->options |= NAND_NO_SUBPAGE_WRITE;
 
+       nand->options |= NAND_SUBPAGE_READ;
+
        ret = sunxi_nand_chip_init_timings(chip, np);
        if (ret) {
                dev_err(dev, "could not configure chip timings: %d\n", ret);
@@ -1728,6 +1904,8 @@ static int sunxi_nfc_remove(struct platform_device *pdev)
        struct sunxi_nfc *nfc = platform_get_drvdata(pdev);
 
        sunxi_nand_chips_cleanup(nfc);
+       clk_disable_unprepare(nfc->mod_clk);
+       clk_disable_unprepare(nfc->ahb_clk);
 
        return 0;
 }
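
Note: the sunxi hunks above drop the driver's hand-built nand_ecclayout and let the NAND core's standard OOB helpers take over. As a rough illustration of the packing the deleted loop used to compute (per ECC step, a small free chunk followed by the ECC bytes, with the first chunk shortened for the bad-block marker), here is a standalone userspace sketch; the page, OOB and ECC sizes are made-up example values, not taken from the patch.

/* Standalone sketch: mirrors the arithmetic of the removed sunxi loop.
 * All geometry below is a hypothetical example. */
#include <stdio.h>

int main(void)
{
        const int writesize = 4096;   /* assumed page size */
        const int oobsize   = 256;    /* assumed OOB bytes per page */
        const int ecc_size  = 1024;   /* data bytes per ECC step */
        const int ecc_bytes = 26;     /* ECC bytes per step (example) */
        const int nsectors  = writesize / ecc_size;
        int free_off = 0, free_len = 0;

        for (int i = 0; i < nsectors; i++) {
                if (i) {
                        /* next free chunk starts after the previous
                         * free chunk and its ECC bytes */
                        free_off += free_len + ecc_bytes;
                        free_len = 4;
                } else {
                        /* first two OOB bytes hold the bad-block marker */
                        free_off = 2;
                        free_len = 2;
                }
                printf("step %d: free [%d..%d], ECC [%d..%d]\n", i,
                       free_off, free_off + free_len - 1,
                       free_off + free_len,
                       free_off + free_len + ecc_bytes - 1);
        }

        /* anything left at the end of the OOB area is also free */
        if (oobsize > (ecc_bytes + 4) * nsectors)
                printf("trailing free: [%d..%d]\n",
                       free_off + free_len + ecc_bytes, oobsize - 1);
        return 0;
}
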
index 293feb1..3ad514c 100644 (file)
@@ -33,7 +33,6 @@
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/nand.h>
 #include <linux/mtd/partitions.h>
-#include <linux/of_mtd.h>
 #include <linux/of_device.h>
 #include <linux/pinctrl/consumer.h>
 #include <linux/platform_device.h>
@@ -175,34 +174,6 @@ static inline struct vf610_nfc *mtd_to_nfc(struct mtd_info *mtd)
        return container_of(mtd_to_nand(mtd), struct vf610_nfc, chip);
 }
 
-static struct nand_ecclayout vf610_nfc_ecc45 = {
-       .eccbytes = 45,
-       .eccpos = {19, 20, 21, 22, 23,
-                  24, 25, 26, 27, 28, 29, 30, 31,
-                  32, 33, 34, 35, 36, 37, 38, 39,
-                  40, 41, 42, 43, 44, 45, 46, 47,
-                  48, 49, 50, 51, 52, 53, 54, 55,
-                  56, 57, 58, 59, 60, 61, 62, 63},
-       .oobfree = {
-               {.offset = 2,
-                .length = 17} }
-};
-
-static struct nand_ecclayout vf610_nfc_ecc60 = {
-       .eccbytes = 60,
-       .eccpos = { 4,  5,  6,  7,  8,  9, 10, 11,
-                  12, 13, 14, 15, 16, 17, 18, 19,
-                  20, 21, 22, 23, 24, 25, 26, 27,
-                  28, 29, 30, 31, 32, 33, 34, 35,
-                  36, 37, 38, 39, 40, 41, 42, 43,
-                  44, 45, 46, 47, 48, 49, 50, 51,
-                  52, 53, 54, 55, 56, 57, 58, 59,
-                  60, 61, 62, 63 },
-       .oobfree = {
-               {.offset = 2,
-                .length = 2} }
-};
-
 static inline u32 vf610_nfc_read(struct vf610_nfc *nfc, uint reg)
 {
        return readl(nfc->regs + reg);
@@ -781,14 +752,16 @@ static int vf610_nfc_probe(struct platform_device *pdev)
                if (mtd->oobsize > 64)
                        mtd->oobsize = 64;
 
+               /*
+                * mtd->ecclayout is not specified here because we're using the
+                * default large page ECC layout defined in NAND core.
+                */
                if (chip->ecc.strength == 32) {
                        nfc->ecc_mode = ECC_60_BYTE;
                        chip->ecc.bytes = 60;
-                       chip->ecc.layout = &vf610_nfc_ecc60;
                } else if (chip->ecc.strength == 24) {
                        nfc->ecc_mode = ECC_45_BYTE;
                        chip->ecc.bytes = 45;
-                       chip->ecc.layout = &vf610_nfc_ecc45;
                } else {
                        dev_err(nfc->dev, "Unsupported ECC strength\n");
                        err = -ENXIO;
index af28bb3..a4b029a 100644 (file)
@@ -68,21 +68,33 @@ MODULE_PARM_DESC(otp,       "Corresponding behaviour of OneNAND in OTP"
  * flexonenand_oob_128 - oob info for Flex-Onenand with 4KB page
  * For now, we expose only 64 out of 80 ecc bytes
  */
-static struct nand_ecclayout flexonenand_oob_128 = {
-       .eccbytes       = 64,
-       .eccpos         = {
-               6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
-               22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-               38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-               54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-               70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
-               86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
-               102, 103, 104, 105
-               },
-       .oobfree        = {
-               {2, 4}, {18, 4}, {34, 4}, {50, 4},
-               {66, 4}, {82, 4}, {98, 4}, {114, 4}
-       }
+static int flexonenand_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                    struct mtd_oob_region *oobregion)
+{
+       if (section > 7)
+               return -ERANGE;
+
+       oobregion->offset = (section * 16) + 6;
+       oobregion->length = 10;
+
+       return 0;
+}
+
+static int flexonenand_ooblayout_free(struct mtd_info *mtd, int section,
+                                     struct mtd_oob_region *oobregion)
+{
+       if (section > 7)
+               return -ERANGE;
+
+       oobregion->offset = (section * 16) + 2;
+       oobregion->length = 4;
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops flexonenand_ooblayout_ops = {
+       .ecc = flexonenand_ooblayout_ecc,
+       .free = flexonenand_ooblayout_free,
 };
 
 /*
@@ -91,56 +103,77 @@ static struct nand_ecclayout flexonenand_oob_128 = {
  * Based on specification:
  * 4Gb M-die OneNAND Flash (KFM4G16Q4M, KFN8G16Q4M). Rev. 1.3, Apr. 2010
  *
- * For eccpos we expose only 64 bytes out of 72 (see struct nand_ecclayout)
- *
- * oobfree uses the spare area fields marked as
- * "Managed by internal ECC logic for Logical Sector Number area"
  */
-static struct nand_ecclayout onenand_oob_128 = {
-       .eccbytes       = 64,
-       .eccpos         = {
-               7, 8, 9, 10, 11, 12, 13, 14, 15,
-               23, 24, 25, 26, 27, 28, 29, 30, 31,
-               39, 40, 41, 42, 43, 44, 45, 46, 47,
-               55, 56, 57, 58, 59, 60, 61, 62, 63,
-               71, 72, 73, 74, 75, 76, 77, 78, 79,
-               87, 88, 89, 90, 91, 92, 93, 94, 95,
-               103, 104, 105, 106, 107, 108, 109, 110, 111,
-               119
-       },
-       .oobfree        = {
-               {2, 3}, {18, 3}, {34, 3}, {50, 3},
-               {66, 3}, {82, 3}, {98, 3}, {114, 3}
-       }
+static int onenand_ooblayout_128_ecc(struct mtd_info *mtd, int section,
+                                    struct mtd_oob_region *oobregion)
+{
+       if (section > 7)
+               return -ERANGE;
+
+       oobregion->offset = (section * 16) + 7;
+       oobregion->length = 9;
+
+       return 0;
+}
+
+static int onenand_ooblayout_128_free(struct mtd_info *mtd, int section,
+                                     struct mtd_oob_region *oobregion)
+{
+       if (section >= 8)
+               return -ERANGE;
+
+       /*
+        * free bytes use the spare area fields marked as
+        * "Managed by internal ECC logic for Logical Sector Number area"
+        */
+       oobregion->offset = (section * 16) + 2;
+       oobregion->length = 3;
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops onenand_oob_128_ooblayout_ops = {
+       .ecc = onenand_ooblayout_128_ecc,
+       .free = onenand_ooblayout_128_free,
 };
 
 /**
- * onenand_oob_64 - oob info for large (2KB) page
+ * onenand_oob_32_64 - oob info for middle (1KB) and large (2KB) page
  */
-static struct nand_ecclayout onenand_oob_64 = {
-       .eccbytes       = 20,
-       .eccpos         = {
-               8, 9, 10, 11, 12,
-               24, 25, 26, 27, 28,
-               40, 41, 42, 43, 44,
-               56, 57, 58, 59, 60,
-               },
-       .oobfree        = {
-               {2, 3}, {14, 2}, {18, 3}, {30, 2},
-               {34, 3}, {46, 2}, {50, 3}, {62, 2}
+static int onenand_ooblayout_32_64_ecc(struct mtd_info *mtd, int section,
+                                      struct mtd_oob_region *oobregion)
+{
+       if (section > 3)
+               return -ERANGE;
+
+       oobregion->offset = (section * 16) + 8;
+       oobregion->length = 5;
+
+       return 0;
+}
+
+static int onenand_ooblayout_32_64_free(struct mtd_info *mtd, int section,
+                                       struct mtd_oob_region *oobregion)
+{
+       int sections = (mtd->oobsize / 32) * 2;
+
+       if (section >= sections)
+               return -ERANGE;
+
+       if (section & 1) {
+               oobregion->offset = ((section - 1) * 16) + 14;
+               oobregion->length = 2;
+       } else  {
+               oobregion->offset = (section * 16) + 2;
+               oobregion->length = 3;
        }
-};
 
-/**
- * onenand_oob_32 - oob info for middle (1KB) page
- */
-static struct nand_ecclayout onenand_oob_32 = {
-       .eccbytes       = 10,
-       .eccpos         = {
-               8, 9, 10, 11, 12,
-               24, 25, 26, 27, 28,
-               },
-       .oobfree        = { {2, 3}, {14, 2}, {18, 3}, {30, 2} }
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops onenand_oob_32_64_ooblayout_ops = {
+       .ecc = onenand_ooblayout_32_64_ecc,
+       .free = onenand_ooblayout_32_64_free,
 };
 
 static const unsigned char ffchars[] = {
@@ -1024,34 +1057,15 @@ static int onenand_transfer_auto_oob(struct mtd_info *mtd, uint8_t *buf, int col
                                int thislen)
 {
        struct onenand_chip *this = mtd->priv;
-       struct nand_oobfree *free;
-       int readcol = column;
-       int readend = column + thislen;
-       int lastgap = 0;
-       unsigned int i;
-       uint8_t *oob_buf = this->oob_buf;
-
-       free = this->ecclayout->oobfree;
-       for (i = 0; i < MTD_MAX_OOBFREE_ENTRIES && free->length; i++, free++) {
-               if (readcol >= lastgap)
-                       readcol += free->offset - lastgap;
-               if (readend >= lastgap)
-                       readend += free->offset - lastgap;
-               lastgap = free->offset + free->length;
-       }
-       this->read_bufferram(mtd, ONENAND_SPARERAM, oob_buf, 0, mtd->oobsize);
-       free = this->ecclayout->oobfree;
-       for (i = 0; i < MTD_MAX_OOBFREE_ENTRIES && free->length; i++, free++) {
-               int free_end = free->offset + free->length;
-               if (free->offset < readend && free_end > readcol) {
-                       int st = max_t(int,free->offset,readcol);
-                       int ed = min_t(int,free_end,readend);
-                       int n = ed - st;
-                       memcpy(buf, oob_buf + st, n);
-                       buf += n;
-               } else if (column == 0)
-                       break;
-       }
+       int ret;
+
+       this->read_bufferram(mtd, ONENAND_SPARERAM, this->oob_buf, 0,
+                            mtd->oobsize);
+       ret = mtd_ooblayout_get_databytes(mtd, buf, this->oob_buf,
+                                         column, thislen);
+       if (ret)
+               return ret;
+
        return 0;
 }
 
@@ -1808,34 +1822,7 @@ static int onenand_panic_write(struct mtd_info *mtd, loff_t to, size_t len,
 static int onenand_fill_auto_oob(struct mtd_info *mtd, u_char *oob_buf,
                                  const u_char *buf, int column, int thislen)
 {
-       struct onenand_chip *this = mtd->priv;
-       struct nand_oobfree *free;
-       int writecol = column;
-       int writeend = column + thislen;
-       int lastgap = 0;
-       unsigned int i;
-
-       free = this->ecclayout->oobfree;
-       for (i = 0; i < MTD_MAX_OOBFREE_ENTRIES && free->length; i++, free++) {
-               if (writecol >= lastgap)
-                       writecol += free->offset - lastgap;
-               if (writeend >= lastgap)
-                       writeend += free->offset - lastgap;
-               lastgap = free->offset + free->length;
-       }
-       free = this->ecclayout->oobfree;
-       for (i = 0; i < MTD_MAX_OOBFREE_ENTRIES && free->length; i++, free++) {
-               int free_end = free->offset + free->length;
-               if (free->offset < writeend && free_end > writecol) {
-                       int st = max_t(int,free->offset,writecol);
-                       int ed = min_t(int,free_end,writeend);
-                       int n = ed - st;
-                       memcpy(oob_buf + st, buf, n);
-                       buf += n;
-               } else if (column == 0)
-                       break;
-       }
-       return 0;
+       return mtd_ooblayout_set_databytes(mtd, buf, oob_buf, column, thislen);
 }
 
 /**
@@ -4003,22 +3990,22 @@ int onenand_scan(struct mtd_info *mtd, int maxchips)
        switch (mtd->oobsize) {
        case 128:
                if (FLEXONENAND(this)) {
-                       this->ecclayout = &flexonenand_oob_128;
+                       mtd_set_ooblayout(mtd, &flexonenand_ooblayout_ops);
                        mtd->subpage_sft = 0;
                } else {
-                       this->ecclayout = &onenand_oob_128;
+                       mtd_set_ooblayout(mtd, &onenand_oob_128_ooblayout_ops);
                        mtd->subpage_sft = 2;
                }
                if (ONENAND_IS_NOP_1(this))
                        mtd->subpage_sft = 0;
                break;
        case 64:
-               this->ecclayout = &onenand_oob_64;
+               mtd_set_ooblayout(mtd, &onenand_oob_32_64_ooblayout_ops);
                mtd->subpage_sft = 2;
                break;
 
        case 32:
-               this->ecclayout = &onenand_oob_32;
+               mtd_set_ooblayout(mtd, &onenand_oob_32_64_ooblayout_ops);
                mtd->subpage_sft = 1;
                break;
 
@@ -4027,7 +4014,7 @@ int onenand_scan(struct mtd_info *mtd, int maxchips)
                        __func__, mtd->oobsize);
                mtd->subpage_sft = 0;
                /* To prevent kernel oops */
-               this->ecclayout = &onenand_oob_32;
+               mtd_set_ooblayout(mtd, &onenand_oob_32_64_ooblayout_ops);
                break;
        }
 
@@ -4037,12 +4024,12 @@ int onenand_scan(struct mtd_info *mtd, int maxchips)
         * The number of bytes available for a client to place data into
         * the out of band area
         */
-       mtd->oobavail = 0;
-       for (i = 0; i < MTD_MAX_OOBFREE_ENTRIES &&
-           this->ecclayout->oobfree[i].length; i++)
-               mtd->oobavail += this->ecclayout->oobfree[i].length;
+       ret = mtd_ooblayout_count_freebytes(mtd);
+       if (ret < 0)
+               ret = 0;
+
+       mtd->oobavail = ret;
 
-       mtd->ecclayout = this->ecclayout;
        mtd->ecc_strength = 1;
 
        /* Fill in remaining MTD driver data */
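
Note: the OneNAND conversion above replaces static eccpos/oobfree tables with mtd_ooblayout_ops callbacks that are asked for one OOB section at a time and return -ERANGE when they run out; helpers such as mtd_ooblayout_count_freebytes() and mtd_ooblayout_get/set_databytes() derive everything else from those callbacks. A standalone sketch of that pattern follows; the struct and function names here are illustrative stand-ins, not the kernel API.

/* Standalone sketch of the per-section OOB callback pattern. */
#include <errno.h>
#include <stdio.h>

struct oob_region {
        int offset;
        int length;
};

/* mirrors onenand_ooblayout_128_free(): eight 16-byte OOB sections,
 * each with 3 free bytes starting at offset 2 of the section */
static int oob128_free(int section, struct oob_region *r)
{
        if (section >= 8)
                return -ERANGE;
        r->offset = section * 16 + 2;
        r->length = 3;
        return 0;
}

/* walk sections until -ERANGE, the way a count-freebytes helper can */
static int count_free_bytes(int (*iter)(int, struct oob_region *))
{
        struct oob_region r;
        int total = 0, section = 0;

        while (!iter(section++, &r))
                total += r.length;
        return total;
}

int main(void)
{
        printf("free OOB bytes: %d\n", count_free_bytes(oob128_free));
        return 0;
}
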
index 157841d..c52e455 100644 (file)
@@ -832,6 +832,7 @@ static const struct flash_info spi_nor_ids[] = {
        /* GigaDevice */
        { "gd25q32", INFO(0xc84016, 0, 64 * 1024,  64, SECT_4K) },
        { "gd25q64", INFO(0xc84017, 0, 64 * 1024, 128, SECT_4K) },
+       { "gd25lq64c", INFO(0xc86017, 0, 64 * 1024, 128, SECT_4K | SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ) },
        { "gd25q128", INFO(0xc84018, 0, 64 * 1024, 256, SECT_4K) },
 
        /* Intel/Numonyx -- xxxs33b */
index a7d1feb..16baeb5 100644 (file)
@@ -149,6 +149,8 @@ static struct device_attribute dev_bgt_enabled =
        __ATTR(bgt_enabled, S_IRUGO, dev_attribute_show, NULL);
 static struct device_attribute dev_mtd_num =
        __ATTR(mtd_num, S_IRUGO, dev_attribute_show, NULL);
+static struct device_attribute dev_ro_mode =
+       __ATTR(ro_mode, S_IRUGO, dev_attribute_show, NULL);
 
 /**
  * ubi_volume_notify - send a volume change notification.
@@ -385,6 +387,8 @@ static ssize_t dev_attribute_show(struct device *dev,
                ret = sprintf(buf, "%d\n", ubi->thread_enabled);
        else if (attr == &dev_mtd_num)
                ret = sprintf(buf, "%d\n", ubi->mtd->index);
+       else if (attr == &dev_ro_mode)
+               ret = sprintf(buf, "%d\n", ubi->ro_mode);
        else
                ret = -EINVAL;
 
@@ -404,6 +408,7 @@ static struct attribute *ubi_dev_attrs[] = {
        &dev_min_io_size.attr,
        &dev_bgt_enabled.attr,
        &dev_mtd_num.attr,
+       &dev_ro_mode.attr,
        NULL
 };
 ATTRIBUTE_GROUPS(ubi_dev);
index c4cb15a..f101a49 100644 (file)
@@ -352,7 +352,8 @@ static ssize_t dfs_file_write(struct file *file, const char __user *user_buf,
        } else if (dent == d->dfs_emulate_power_cut) {
                if (kstrtoint(buf, 0, &val) != 0)
                        count = -EINVAL;
-               d->emulate_power_cut = val;
+               else
+                       d->emulate_power_cut = val;
                goto out;
        }
 
index 5b9834c..5780dd1 100644 (file)
@@ -426,8 +426,25 @@ retry:
                                                 pnum, vol_id, lnum);
                                        err = -EBADMSG;
                                } else {
-                                       err = -EINVAL;
-                                       ubi_ro_mode(ubi);
+                                       /*
+                                        * Ending up here in the non-Fastmap case
+                                        * is a clear bug as the VID header had to
+                                        * be present at scan time to have it referenced.
+                                        * With fastmap the story is more complicated.
+                                        * Fastmap has the mapping info without the need
+                                        * of a full scan. So the LEB could have been
+                                        * unmapped, Fastmap cannot know this and keeps
+                                        * unmapped; Fastmap cannot know this and keeps
+                                        * This is valid and works as the layer above UBI
+                                        * has to do bookkeeping about used/referenced
+                                        * LEBs in any case.
+                                        */
+                                       if (ubi->fast_attach) {
+                                               err = -EBADMSG;
+                                       } else {
+                                               err = -EINVAL;
+                                               ubi_ro_mode(ubi);
+                                       }
                                }
                        }
                        goto out_free;
@@ -1202,32 +1219,6 @@ int ubi_eba_copy_leb(struct ubi_device *ubi, int from, int to,
                }
 
                cond_resched();
-
-               /*
-                * We've written the data and are going to read it back to make
-                * sure it was written correctly.
-                */
-               memset(ubi->peb_buf, 0xFF, aldata_size);
-               err = ubi_io_read_data(ubi, ubi->peb_buf, to, 0, aldata_size);
-               if (err) {
-                       if (err != UBI_IO_BITFLIPS) {
-                               ubi_warn(ubi, "error %d while reading data back from PEB %d",
-                                        err, to);
-                               if (is_error_sane(err))
-                                       err = MOVE_TARGET_RD_ERR;
-                       } else
-                               err = MOVE_TARGET_BITFLIPS;
-                       goto out_unlock_buf;
-               }
-
-               cond_resched();
-
-               if (crc != crc32(UBI_CRC32_INIT, ubi->peb_buf, data_size)) {
-                       ubi_warn(ubi, "read data back from PEB %d and it is different",
-                                to);
-                       err = -EINVAL;
-                       goto out_unlock_buf;
-               }
        }
 
        ubi_assert(vol->eba_tbl[lnum] == from);
index 263b439..990898b 100644 (file)
@@ -1058,6 +1058,7 @@ int ubi_scan_fastmap(struct ubi_device *ubi, struct ubi_attach_info *ai,
        ubi_msg(ubi, "fastmap WL pool size: %d",
                ubi->fm_wl_pool.max_size);
        ubi->fm_disabled = 0;
+       ubi->fast_attach = 1;
 
        ubi_free_vid_hdr(ubi, vh);
        kfree(ech);
index 437757c..348dbbc 100644 (file)
@@ -705,7 +705,7 @@ int ubi_leb_map(struct ubi_volume_desc *desc, int lnum)
        struct ubi_volume *vol = desc->vol;
        struct ubi_device *ubi = vol->ubi;
 
-       dbg_gen("unmap LEB %d:%d", vol->vol_id, lnum);
+       dbg_gen("map LEB %d:%d", vol->vol_id, lnum);
 
        if (desc->mode == UBI_READONLY || vol->vol_type == UBI_STATIC_VOLUME)
                return -EROFS;
index dadc6a9..61d4e99 100644 (file)
@@ -466,6 +466,7 @@ struct ubi_debug_info {
  * @fm_eba_sem: allows ubi_update_fastmap() to block EBA table changes
  * @fm_work: fastmap work queue
  * @fm_work_scheduled: non-zero if fastmap work was scheduled
+ * @fast_attach: non-zero if UBI was attached by fastmap
  *
  * @used: RB-tree of used physical eraseblocks
  * @erroneous: RB-tree of erroneous used physical eraseblocks
@@ -574,6 +575,7 @@ struct ubi_device {
        size_t fm_size;
        struct work_struct fm_work;
        int fm_work_scheduled;
+       int fast_attach;
 
        /* Wear-leveling sub-system's stuff */
        struct rb_root used;
index 1ae17bb..10059df 100644 (file)
@@ -405,7 +405,7 @@ int ubi_remove_volume(struct ubi_volume_desc *desc, int no_vtbl)
        if (!no_vtbl)
                self_check_volumes(ubi);
 
-       return err;
+       return 0;
 
 out_err:
        ubi_err(ubi, "cannot remove volume %d, error %d", vol_id, err);
index 17ec948..959c7b1 100644 (file)
@@ -1534,6 +1534,7 @@ int ubi_wl_init(struct ubi_device *ubi, struct ubi_attach_info *ai)
                INIT_LIST_HEAD(&ubi->pq[i]);
        ubi->pq_head = 0;
 
+       ubi->free_count = 0;
        list_for_each_entry_safe(aeb, tmp, &ai->erase, u.list) {
                cond_resched();
 
@@ -1552,7 +1553,6 @@ int ubi_wl_init(struct ubi_device *ubi, struct ubi_attach_info *ai)
                found_pebs++;
        }
 
-       ubi->free_count = 0;
        list_for_each_entry(aeb, &ai->free, u.list) {
                cond_resched();
 
index 16419f5..058460b 100644 (file)
@@ -141,7 +141,7 @@ int arc_mdio_probe(struct arc_emac_priv *priv)
        priv->bus = bus;
        bus->priv = priv;
        bus->parent = priv->dev;
-       bus->name = "Synopsys MII Bus",
+       bus->name = "Synopsys MII Bus";
        bus->read = &arc_mdio_read;
        bus->write = &arc_mdio_write;
        bus->reset = &arc_mdio_reset;
index 8fc93c5..d02c424 100644 (file)
@@ -96,6 +96,10 @@ struct alx_priv {
        unsigned int rx_ringsz;
        unsigned int rxbuf_size;
 
+       struct page  *rx_page;
+       unsigned int rx_page_offset;
+       unsigned int rx_frag_size;
+
        struct napi_struct napi;
        struct alx_tx_queue txq;
        struct alx_rx_queue rxq;
index 9fe8b5e..c98acdc 100644 (file)
@@ -70,6 +70,35 @@ static void alx_free_txbuf(struct alx_priv *alx, int entry)
        }
 }
 
+static struct sk_buff *alx_alloc_skb(struct alx_priv *alx, gfp_t gfp)
+{
+       struct sk_buff *skb;
+       struct page *page;
+
+       if (alx->rx_frag_size > PAGE_SIZE)
+               return __netdev_alloc_skb(alx->dev, alx->rxbuf_size, gfp);
+
+       page = alx->rx_page;
+       if (!page) {
+               alx->rx_page = page = alloc_page(gfp);
+               if (unlikely(!page))
+                       return NULL;
+               alx->rx_page_offset = 0;
+       }
+
+       skb = build_skb(page_address(page) + alx->rx_page_offset,
+                       alx->rx_frag_size);
+       if (likely(skb)) {
+               alx->rx_page_offset += alx->rx_frag_size;
+               if (alx->rx_page_offset >= PAGE_SIZE)
+                       alx->rx_page = NULL;
+               else
+                       get_page(page);
+       }
+       return skb;
+}
+
+
 static int alx_refill_rx_ring(struct alx_priv *alx, gfp_t gfp)
 {
        struct alx_rx_queue *rxq = &alx->rxq;
@@ -86,7 +115,7 @@ static int alx_refill_rx_ring(struct alx_priv *alx, gfp_t gfp)
        while (!cur_buf->skb && next != rxq->read_idx) {
                struct alx_rfd *rfd = &rxq->rfd[cur];
 
-               skb = __netdev_alloc_skb(alx->dev, alx->rxbuf_size, gfp);
+               skb = alx_alloc_skb(alx, gfp);
                if (!skb)
                        break;
                dma = dma_map_single(&alx->hw.pdev->dev,
@@ -124,6 +153,7 @@ static int alx_refill_rx_ring(struct alx_priv *alx, gfp_t gfp)
                alx_write_mem16(&alx->hw, ALX_RFD_PIDX, cur);
        }
 
+
        return count;
 }
 
@@ -592,6 +622,11 @@ static void alx_free_rings(struct alx_priv *alx)
        kfree(alx->txq.bufs);
        kfree(alx->rxq.bufs);
 
+       if (alx->rx_page) {
+               put_page(alx->rx_page);
+               alx->rx_page = NULL;
+       }
+
        dma_free_coherent(&alx->hw.pdev->dev,
                          alx->descmem.size,
                          alx->descmem.virt,
@@ -646,6 +681,7 @@ static int alx_request_irq(struct alx_priv *alx)
                                  alx->dev->name, alx);
                if (!err)
                        goto out;
+
                /* fall back to legacy interrupt */
                pci_disable_msi(alx->hw.pdev);
        }
@@ -689,6 +725,7 @@ static int alx_init_sw(struct alx_priv *alx)
        struct pci_dev *pdev = alx->hw.pdev;
        struct alx_hw *hw = &alx->hw;
        int err;
+       unsigned int head_size;
 
        err = alx_identify_hw(alx);
        if (err) {
@@ -704,7 +741,12 @@ static int alx_init_sw(struct alx_priv *alx)
 
        hw->smb_timer = 400;
        hw->mtu = alx->dev->mtu;
+
        alx->rxbuf_size = ALX_MAX_FRAME_LEN(hw->mtu);
+       head_size = SKB_DATA_ALIGN(alx->rxbuf_size + NET_SKB_PAD) +
+                   SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+       alx->rx_frag_size = roundup_pow_of_two(head_size);
+
        alx->tx_ringsz = 256;
        alx->rx_ringsz = 512;
        hw->imt = 200;
@@ -806,6 +848,7 @@ static int alx_change_mtu(struct net_device *netdev, int mtu)
 {
        struct alx_priv *alx = netdev_priv(netdev);
        int max_frame = ALX_MAX_FRAME_LEN(mtu);
+       unsigned int head_size;
 
        if ((max_frame < ALX_MIN_FRAME_SIZE) ||
            (max_frame > ALX_MAX_FRAME_SIZE))
@@ -817,6 +860,9 @@ static int alx_change_mtu(struct net_device *netdev, int mtu)
        netdev->mtu = mtu;
        alx->hw.mtu = mtu;
        alx->rxbuf_size = max(max_frame, ALX_DEF_RXBUF_SIZE);
+       head_size = SKB_DATA_ALIGN(alx->rxbuf_size + NET_SKB_PAD) +
+                   SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+       alx->rx_frag_size = roundup_pow_of_two(head_size);
        netdev_update_features(netdev);
        if (netif_running(netdev))
                alx_reinit(alx);
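
Note: the alx change above stops allocating one skb per RX buffer and instead carves power-of-two fragments out of a shared page with build_skb(), falling back to a plain netdev allocation when a fragment would not fit in a page. A standalone sketch of the sizing and carving logic follows; PAGE_SIZE, the padding and the skb_shared_info overhead below are stand-in example numbers, not the kernel's.

/* Standalone sketch of the alx RX fragment sizing and page carving. */
#include <stdio.h>

#define PAGE_SIZE 4096
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

static unsigned int roundup_pow_of_two(unsigned int x)
{
        unsigned int p = 1;

        while (p < x)
                p <<= 1;
        return p;
}

int main(void)
{
        unsigned int rxbuf_size = 1536;  /* assumed max frame buffer */
        unsigned int skb_pad = 64;       /* stand-in for NET_SKB_PAD */
        unsigned int shinfo = 320;       /* stand-in for skb_shared_info */
        unsigned int head_size, frag, offset = 0;

        head_size = ALIGN_UP(rxbuf_size + skb_pad, 64) + ALIGN_UP(shinfo, 64);
        frag = roundup_pow_of_two(head_size);

        if (frag > PAGE_SIZE) {
                printf("fragment too big, fall back to per-skb alloc\n");
                return 0;
        }

        printf("frag size %u -> %u buffers per page\n", frag, PAGE_SIZE / frag);
        /* hand out fragments from one page until it is used up */
        while (offset + frag <= PAGE_SIZE) {
                printf("buffer at page offset %u\n", offset);
                offset += frag;
        }
        return 0;
}
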
index 0a5b770..c5fe915 100644 (file)
@@ -13941,14 +13941,14 @@ static int bnx2x_init_one(struct pci_dev *pdev,
                bp->doorbells = bnx2x_vf_doorbells(bp);
                rc = bnx2x_vf_pci_alloc(bp);
                if (rc)
-                       goto init_one_exit;
+                       goto init_one_freemem;
        } else {
                doorbell_size = BNX2X_L2_MAX_CID(bp) * (1 << BNX2X_DB_SHIFT);
                if (doorbell_size > pci_resource_len(pdev, 2)) {
                        dev_err(&bp->pdev->dev,
                                "Cannot map doorbells, bar size too small, aborting\n");
                        rc = -ENOMEM;
-                       goto init_one_exit;
+                       goto init_one_freemem;
                }
                bp->doorbells = ioremap_nocache(pci_resource_start(pdev, 2),
                                                doorbell_size);
@@ -13957,19 +13957,19 @@ static int bnx2x_init_one(struct pci_dev *pdev,
                dev_err(&bp->pdev->dev,
                        "Cannot map doorbell space, aborting\n");
                rc = -ENOMEM;
-               goto init_one_exit;
+               goto init_one_freemem;
        }
 
        if (IS_VF(bp)) {
                rc = bnx2x_vfpf_acquire(bp, tx_count, rx_count);
                if (rc)
-                       goto init_one_exit;
+                       goto init_one_freemem;
        }
 
        /* Enable SRIOV if capability found in configuration space */
        rc = bnx2x_iov_init_one(bp, int_mode, BNX2X_MAX_NUM_OF_VFS);
        if (rc)
-               goto init_one_exit;
+               goto init_one_freemem;
 
        /* calc qm_cid_count */
        bp->qm_cid_count = bnx2x_set_qm_cid_count(bp);
@@ -13988,7 +13988,7 @@ static int bnx2x_init_one(struct pci_dev *pdev,
        rc = bnx2x_set_int_mode(bp);
        if (rc) {
                dev_err(&pdev->dev, "Cannot set interrupts\n");
-               goto init_one_exit;
+               goto init_one_freemem;
        }
        BNX2X_DEV_INFO("set interrupts successfully\n");
 
@@ -13996,7 +13996,7 @@ static int bnx2x_init_one(struct pci_dev *pdev,
        rc = register_netdev(dev);
        if (rc) {
                dev_err(&pdev->dev, "Cannot register net device\n");
-               goto init_one_exit;
+               goto init_one_freemem;
        }
        BNX2X_DEV_INFO("device name after netdev register %s\n", dev->name);
 
@@ -14029,6 +14029,9 @@ static int bnx2x_init_one(struct pci_dev *pdev,
 
        return 0;
 
+init_one_freemem:
+       bnx2x_free_mem_bp(bp);
+
 init_one_exit:
        bnx2x_disable_pcie_error_reporting(bp);
 
index 085f912..06f0317 100644 (file)
@@ -205,8 +205,10 @@ static int nps_enet_poll(struct napi_struct *napi, int budget)
                 * re-adding ourselves to the poll list.
                 */
 
-               if (priv->tx_skb && !tx_ctrl_ct)
+               if (priv->tx_skb && !tx_ctrl_ct) {
+                       nps_enet_reg_set(priv, NPS_ENET_REG_BUF_INT_ENABLE, 0);
                        napi_reschedule(napi);
+               }
        }
 
        return work_done;
index ca2cccc..3c0255e 100644 (file)
@@ -1197,10 +1197,8 @@ fec_enet_tx_queue(struct net_device *ndev, u16 queue_id)
                                         fec16_to_cpu(bdp->cbd_datlen),
                                         DMA_TO_DEVICE);
                bdp->cbd_bufaddr = cpu_to_fec32(0);
-               if (!skb) {
-                       bdp = fec_enet_get_nextdesc(bdp, &txq->bd);
-                       continue;
-               }
+               if (!skb)
+                       goto skb_done;
 
                /* Check for errors. */
                if (status & (BD_ENET_TX_HB | BD_ENET_TX_LC |
@@ -1239,7 +1237,7 @@ fec_enet_tx_queue(struct net_device *ndev, u16 queue_id)
 
                /* Free the sk buffer associated with this last transmit */
                dev_kfree_skb_any(skb);
-
+skb_done:
                /* Make sure the update to bdp and tx_skbuff are performed
                 * before dirty_tx
                 */
index bcb9dcc..1de2e1e 100644 (file)
@@ -615,7 +615,7 @@ struct fman {
        struct fman_cfg *cfg;
        struct muram_info *muram;
        /* cam section in muram */
-       int cam_offset;
+       unsigned long cam_offset;
        size_t cam_size;
        /* Fifo in MURAM */
        int fifo_offset;
index 4eb0e9a..47394c4 100644 (file)
@@ -129,7 +129,7 @@ unsigned long fman_muram_offset_to_vbase(struct muram_info *muram,
  *
  * Return: address of the allocated memory; NULL otherwise.
  */
-int fman_muram_alloc(struct muram_info *muram, size_t size)
+unsigned long fman_muram_alloc(struct muram_info *muram, size_t size)
 {
        unsigned long vaddr;
 
@@ -150,7 +150,7 @@ int fman_muram_alloc(struct muram_info *muram, size_t size)
  *
  * Free an allocated memory from FM-MURAM partition.
  */
-void fman_muram_free_mem(struct muram_info *muram, u32 offset, size_t size)
+void fman_muram_free_mem(struct muram_info *muram, unsigned long offset, size_t size)
 {
        unsigned long addr = fman_muram_offset_to_vbase(muram, offset);
 
index dbf0af9..889649a 100644 (file)
@@ -44,8 +44,8 @@ struct muram_info *fman_muram_init(phys_addr_t base, size_t size);
 unsigned long fman_muram_offset_to_vbase(struct muram_info *muram,
                                         unsigned long offset);
 
-int fman_muram_alloc(struct muram_info *muram, size_t size);
+unsigned long fman_muram_alloc(struct muram_info *muram, size_t size);
 
-void fman_muram_free_mem(struct muram_info *muram, u32 offset, size_t size);
+void fman_muram_free_mem(struct muram_info *muram, unsigned long offset, size_t size);
 
 #endif /* __FM_MURAM_EXT */
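
Note: the fman_muram changes above widen the allocation offset from int/u32 to unsigned long because the allocator hands back an address-sized value. A small standalone sketch of why the narrower type is unsafe on 64-bit follows; the address used is a made-up example.

/* Standalone sketch: an address-sized offset does not fit in an int. */
#include <stdio.h>

int main(void)
{
        unsigned long vaddr = 0xffff0000a0001000UL;  /* hypothetical address */
        int as_int = (int)vaddr;                     /* truncates / goes negative */

        printf("unsigned long: %#lx\n", vaddr);
        printf("as int:        %d\n", as_int);
        return 0;
}
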
index 3d746c8..67a648c 100644 (file)
@@ -46,7 +46,6 @@ static u32 hns_nic_get_link(struct net_device *net_dev)
        u32 link_stat = priv->link;
        struct hnae_handle *h;
 
-       assert(priv && priv->ae_handle);
        h = priv->ae_handle;
 
        if (priv->phy) {
@@ -646,8 +645,6 @@ static void hns_nic_get_drvinfo(struct net_device *net_dev,
 {
        struct hns_nic_priv *priv = netdev_priv(net_dev);
 
-       assert(priv);
-
        strncpy(drvinfo->version, HNAE_DRIVER_VERSION,
                sizeof(drvinfo->version));
        drvinfo->version[sizeof(drvinfo->version) - 1] = '\0';
@@ -720,8 +717,6 @@ static int hns_set_pauseparam(struct net_device *net_dev,
        struct hnae_handle *h;
        struct hnae_ae_ops *ops;
 
-       assert(priv || priv->ae_handle);
-
        h = priv->ae_handle;
        ops = h->dev->ops;
 
@@ -780,8 +775,6 @@ static int hns_set_coalesce(struct net_device *net_dev,
        struct hnae_ae_ops *ops;
        int ret;
 
-       assert(priv || priv->ae_handle);
-
        ops = priv->ae_handle->dev->ops;
 
        if (ec->tx_coalesce_usecs != ec->rx_coalesce_usecs)
@@ -1111,8 +1104,6 @@ void hns_get_regs(struct net_device *net_dev, struct ethtool_regs *cmd,
        struct hns_nic_priv *priv = netdev_priv(net_dev);
        struct hnae_ae_ops *ops;
 
-       assert(priv || priv->ae_handle);
-
        ops = priv->ae_handle->dev->ops;
 
        cmd->version = HNS_CHIP_VERSION;
@@ -1135,8 +1126,6 @@ static int hns_get_regs_len(struct net_device *net_dev)
        struct hns_nic_priv *priv = netdev_priv(net_dev);
        struct hnae_ae_ops *ops;
 
-       assert(priv || priv->ae_handle);
-
        ops = priv->ae_handle->dev->ops;
        if (!ops->get_regs_len) {
                netdev_err(net_dev, "ops->get_regs_len is null!\n");
index 01fccec..466939f 100644 (file)
@@ -189,6 +189,7 @@ struct mvneta_bm_pool *mvneta_bm_pool_use(struct mvneta_bm *priv, u8 pool_id,
                        SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
                hwbm_pool->construct = mvneta_bm_construct;
                hwbm_pool->priv = new_pool;
+               spin_lock_init(&hwbm_pool->lock);
 
                /* Create new pool */
                err = mvneta_bm_pool_create(priv, new_pool);
index c761194..fc95aff 100644 (file)
@@ -362,7 +362,7 @@ static void mlx4_en_get_ethtool_stats(struct net_device *dev,
 
        for (i = 0; i < NUM_MAIN_STATS; i++, bitmap_iterator_inc(&it))
                if (bitmap_iterator_test(&it))
-                       data[index++] = ((unsigned long *)&priv->stats)[i];
+                       data[index++] = ((unsigned long *)&dev->stats)[i];
 
        for (i = 0; i < NUM_PORT_STATS; i++, bitmap_iterator_inc(&it))
                if (bitmap_iterator_test(&it))
index 92e0624..19ceced 100644 (file)
@@ -1296,15 +1296,16 @@ static void mlx4_en_tx_timeout(struct net_device *dev)
 }
 
 
-static struct net_device_stats *mlx4_en_get_stats(struct net_device *dev)
+static struct rtnl_link_stats64 *
+mlx4_en_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
        struct mlx4_en_priv *priv = netdev_priv(dev);
 
        spin_lock_bh(&priv->stats_lock);
-       memcpy(&priv->ret_stats, &priv->stats, sizeof(priv->stats));
+       netdev_stats_to_stats64(stats, &dev->stats);
        spin_unlock_bh(&priv->stats_lock);
 
-       return &priv->ret_stats;
+       return stats;
 }
 
 static void mlx4_en_set_default_moderation(struct mlx4_en_priv *priv)
@@ -1876,7 +1877,6 @@ static void mlx4_en_clear_stats(struct net_device *dev)
        if (mlx4_en_DUMP_ETH_STATS(mdev, priv->port, 1))
                en_dbg(HW, priv, "Failed dumping statistics\n");
 
-       memset(&priv->stats, 0, sizeof(priv->stats));
        memset(&priv->pstats, 0, sizeof(priv->pstats));
        memset(&priv->pkstats, 0, sizeof(priv->pkstats));
        memset(&priv->port_stats, 0, sizeof(priv->port_stats));
@@ -1892,6 +1892,11 @@ static void mlx4_en_clear_stats(struct net_device *dev)
                priv->tx_ring[i]->bytes = 0;
                priv->tx_ring[i]->packets = 0;
                priv->tx_ring[i]->tx_csum = 0;
+               priv->tx_ring[i]->tx_dropped = 0;
+               priv->tx_ring[i]->queue_stopped = 0;
+               priv->tx_ring[i]->wake_queue = 0;
+               priv->tx_ring[i]->tso_packets = 0;
+               priv->tx_ring[i]->xmit_more = 0;
        }
        for (i = 0; i < priv->rx_ring_num; i++) {
                priv->rx_ring[i]->bytes = 0;
@@ -2482,7 +2487,7 @@ static const struct net_device_ops mlx4_netdev_ops = {
        .ndo_stop               = mlx4_en_close,
        .ndo_start_xmit         = mlx4_en_xmit,
        .ndo_select_queue       = mlx4_en_select_queue,
-       .ndo_get_stats          = mlx4_en_get_stats,
+       .ndo_get_stats64        = mlx4_en_get_stats64,
        .ndo_set_rx_mode        = mlx4_en_set_rx_mode,
        .ndo_set_mac_address    = mlx4_en_set_mac,
        .ndo_validate_addr      = eth_validate_addr,
@@ -2514,7 +2519,7 @@ static const struct net_device_ops mlx4_netdev_ops_master = {
        .ndo_stop               = mlx4_en_close,
        .ndo_start_xmit         = mlx4_en_xmit,
        .ndo_select_queue       = mlx4_en_select_queue,
-       .ndo_get_stats          = mlx4_en_get_stats,
+       .ndo_get_stats64        = mlx4_en_get_stats64,
        .ndo_set_rx_mode        = mlx4_en_set_rx_mode,
        .ndo_set_mac_address    = mlx4_en_set_mac,
        .ndo_validate_addr      = eth_validate_addr,
index 20b6c2e..5aa8b75 100644 (file)
@@ -152,8 +152,9 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
        struct mlx4_counter tmp_counter_stats;
        struct mlx4_en_stat_out_mbox *mlx4_en_stats;
        struct mlx4_en_stat_out_flow_control_mbox *flowstats;
-       struct mlx4_en_priv *priv = netdev_priv(mdev->pndev[port]);
-       struct net_device_stats *stats = &priv->stats;
+       struct net_device *dev = mdev->pndev[port];
+       struct mlx4_en_priv *priv = netdev_priv(dev);
+       struct net_device_stats *stats = &dev->stats;
        struct mlx4_cmd_mailbox *mailbox;
        u64 in_mod = reset << 8 | port;
        int err;
@@ -188,6 +189,7 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
        }
        stats->tx_packets = 0;
        stats->tx_bytes = 0;
+       stats->tx_dropped = 0;
        priv->port_stats.tx_chksum_offload = 0;
        priv->port_stats.queue_stopped = 0;
        priv->port_stats.wake_queue = 0;
@@ -199,6 +201,7 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
 
                stats->tx_packets += ring->packets;
                stats->tx_bytes += ring->bytes;
+               stats->tx_dropped += ring->tx_dropped;
                priv->port_stats.tx_chksum_offload += ring->tx_csum;
                priv->port_stats.queue_stopped     += ring->queue_stopped;
                priv->port_stats.wake_queue        += ring->wake_queue;
@@ -237,21 +240,12 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
        stats->multicast = en_stats_adder(&mlx4_en_stats->MCAST_prio_0,
                                          &mlx4_en_stats->MCAST_prio_1,
                                          NUM_PRIORITIES);
-       stats->collisions = 0;
        stats->rx_dropped = be32_to_cpu(mlx4_en_stats->RDROP) +
                            sw_rx_dropped;
        stats->rx_length_errors = be32_to_cpu(mlx4_en_stats->RdropLength);
-       stats->rx_over_errors = 0;
        stats->rx_crc_errors = be32_to_cpu(mlx4_en_stats->RCRC);
-       stats->rx_frame_errors = 0;
        stats->rx_fifo_errors = be32_to_cpu(mlx4_en_stats->RdropOvflw);
-       stats->rx_missed_errors = 0;
-       stats->tx_aborted_errors = 0;
-       stats->tx_carrier_errors = 0;
-       stats->tx_fifo_errors = 0;
-       stats->tx_heartbeat_errors = 0;
-       stats->tx_window_errors = 0;
-       stats->tx_dropped = be32_to_cpu(mlx4_en_stats->TDROP);
+       stats->tx_dropped += be32_to_cpu(mlx4_en_stats->TDROP);
 
        /* RX stats */
        priv->pkstats.rx_multicast_packets = stats->multicast;
index f6e6157..76aa4d2 100644 (file)
@@ -726,12 +726,12 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
        bool inline_ok;
        u32 ring_cons;
 
-       if (!priv->port_up)
-               goto tx_drop;
-
        tx_ind = skb_get_queue_mapping(skb);
        ring = priv->tx_ring[tx_ind];
 
+       if (!priv->port_up)
+               goto tx_drop;
+
        /* fetch ring->cons far ahead before needing it to avoid stall */
        ring_cons = ACCESS_ONCE(ring->cons);
 
@@ -1030,7 +1030,7 @@ tx_drop_unmap:
 
 tx_drop:
        dev_kfree_skb_any(skb);
-       priv->stats.tx_dropped++;
+       ring->tx_dropped++;
        return NETDEV_TX_OK;
 }
 
index cc84e09..467d47e 100644 (file)
@@ -270,6 +270,7 @@ struct mlx4_en_tx_ring {
        unsigned long           tx_csum;
        unsigned long           tso_packets;
        unsigned long           xmit_more;
+       unsigned int            tx_dropped;
        struct mlx4_bf          bf;
        unsigned long           queue_stopped;
 
@@ -482,8 +483,6 @@ struct mlx4_en_priv {
        struct mlx4_en_port_profile *prof;
        struct net_device *dev;
        unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)];
-       struct net_device_stats stats;
-       struct net_device_stats ret_stats;
        struct mlx4_en_port_state port_state;
        spinlock_t stats_lock;
        struct ethtool_flow_id ethtool_rules[MAX_NUM_OF_FS_RULES];
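
Note: the mlx4_en series above drops the driver's private net_device_stats copies and instead folds per-ring software counters (including the new per-ring tx_dropped) into the netdev's own stats, then adds the firmware drop counter on top. A standalone sketch of that aggregation pattern follows; the structures and numbers are illustrative only, not the driver's.

/* Standalone sketch of folding per-ring TX counters into device stats. */
#include <stdio.h>

struct tx_ring {
        unsigned long packets;
        unsigned long bytes;
        unsigned int  tx_dropped;
};

struct dev_stats {
        unsigned long tx_packets;
        unsigned long tx_bytes;
        unsigned long tx_dropped;
};

static void fold_tx_rings(const struct tx_ring *ring, int nrings,
                          unsigned long fw_tdrop, struct dev_stats *stats)
{
        stats->tx_packets = 0;
        stats->tx_bytes = 0;
        stats->tx_dropped = 0;

        for (int i = 0; i < nrings; i++) {
                stats->tx_packets += ring[i].packets;
                stats->tx_bytes   += ring[i].bytes;
                stats->tx_dropped += ring[i].tx_dropped;
        }
        /* firmware-reported drops are added on top of the software ones */
        stats->tx_dropped += fw_tdrop;
}

int main(void)
{
        struct tx_ring rings[2] = { { 10, 15000, 1 }, { 7, 9000, 0 } };
        struct dev_stats stats;

        fold_tx_rings(rings, 2, 3, &stats);
        printf("tx: %lu pkts, %lu bytes, %lu dropped\n",
               stats.tx_packets, stats.tx_bytes, stats.tx_dropped);
        return 0;
}
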
index cbf58e1..21ec1c2 100644 (file)
@@ -192,9 +192,10 @@ qed_dcbx_process_tlv(struct qed_hwfn *p_hwfn,
                     struct dcbx_app_priority_entry *p_tbl,
                     u32 pri_tc_tbl, int count, bool dcbx_enabled)
 {
-       u8 tc, priority, priority_map;
+       u8 tc, priority_map;
        enum dcbx_protocol_type type;
        u16 protocol_id;
+       int priority;
        bool enable;
        int i;
 
@@ -221,7 +222,7 @@ qed_dcbx_process_tlv(struct qed_hwfn *p_hwfn,
                         * indication, but we only got here if there was an
                         * app tlv for the protocol, so dcbx must be enabled.
                         */
-                       enable = !!(type == DCBX_PROTOCOL_ETH);
+                       enable = !(type == DCBX_PROTOCOL_ETH);
 
                        qed_dcbx_update_app_info(p_data, p_hwfn, enable, true,
                                                 priority, tc, type);
index 089016f..2d89e8c 100644 (file)
@@ -155,12 +155,14 @@ void qed_resc_free(struct qed_dev *cdev)
        }
 }
 
-static int qed_init_qm_info(struct qed_hwfn *p_hwfn)
+static int qed_init_qm_info(struct qed_hwfn *p_hwfn, bool b_sleepable)
 {
        u8 num_vports, vf_offset = 0, i, vport_id, num_ports, curr_queue = 0;
        struct qed_qm_info *qm_info = &p_hwfn->qm_info;
        struct init_qm_port_params *p_qm_port;
        u16 num_pqs, multi_cos_tcs = 1;
+       u8 pf_wfq = qm_info->pf_wfq;
+       u32 pf_rl = qm_info->pf_rl;
        u16 num_vfs = 0;
 
 #ifdef CONFIG_QED_SRIOV
@@ -182,23 +184,28 @@ static int qed_init_qm_info(struct qed_hwfn *p_hwfn)
 
        /* PQs will be arranged as follows: First per-TC PQ then pure-LB quete.
         */
-       qm_info->qm_pq_params = kzalloc(sizeof(*qm_info->qm_pq_params) *
-                                       num_pqs, GFP_KERNEL);
+       qm_info->qm_pq_params = kcalloc(num_pqs,
+                                       sizeof(struct init_qm_pq_params),
+                                       b_sleepable ? GFP_KERNEL : GFP_ATOMIC);
        if (!qm_info->qm_pq_params)
                goto alloc_err;
 
-       qm_info->qm_vport_params = kzalloc(sizeof(*qm_info->qm_vport_params) *
-                                          num_vports, GFP_KERNEL);
+       qm_info->qm_vport_params = kcalloc(num_vports,
+                                          sizeof(struct init_qm_vport_params),
+                                          b_sleepable ? GFP_KERNEL
+                                                      : GFP_ATOMIC);
        if (!qm_info->qm_vport_params)
                goto alloc_err;
 
-       qm_info->qm_port_params = kzalloc(sizeof(*qm_info->qm_port_params) *
-                                         MAX_NUM_PORTS, GFP_KERNEL);
+       qm_info->qm_port_params = kcalloc(MAX_NUM_PORTS,
+                                         sizeof(struct init_qm_port_params),
+                                         b_sleepable ? GFP_KERNEL
+                                                     : GFP_ATOMIC);
        if (!qm_info->qm_port_params)
                goto alloc_err;
 
-       qm_info->wfq_data = kcalloc(num_vports, sizeof(*qm_info->wfq_data),
-                                   GFP_KERNEL);
+       qm_info->wfq_data = kcalloc(num_vports, sizeof(struct qed_wfq_data),
+                                   b_sleepable ? GFP_KERNEL : GFP_ATOMIC);
        if (!qm_info->wfq_data)
                goto alloc_err;
 
@@ -264,10 +271,10 @@ static int qed_init_qm_info(struct qed_hwfn *p_hwfn)
        for (i = 0; i < qm_info->num_vports; i++)
                qm_info->qm_vport_params[i].vport_wfq = 1;
 
-       qm_info->pf_wfq = 0;
-       qm_info->pf_rl = 0;
        qm_info->vport_rl_en = 1;
        qm_info->vport_wfq_en = 1;
+       qm_info->pf_rl = pf_rl;
+       qm_info->pf_wfq = pf_wfq;
 
        return 0;
 
@@ -299,7 +306,7 @@ int qed_qm_reconf(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
        qed_qm_info_free(p_hwfn);
 
        /* initialize qed's qm data structure */
-       rc = qed_init_qm_info(p_hwfn);
+       rc = qed_init_qm_info(p_hwfn, false);
        if (rc)
                return rc;
 
@@ -388,7 +395,7 @@ int qed_resc_alloc(struct qed_dev *cdev)
                        goto alloc_err;
 
                /* Prepare and process QM requirements */
-               rc = qed_init_qm_info(p_hwfn);
+               rc = qed_init_qm_info(p_hwfn, true);
                if (rc)
                        goto alloc_err;
 
@@ -581,7 +588,14 @@ static void qed_calc_hw_mode(struct qed_hwfn *p_hwfn)
 
        hw_mode |= 1 << MODE_ASIC;
 
+       if (p_hwfn->cdev->num_hwfns > 1)
+               hw_mode |= 1 << MODE_100G;
+
        p_hwfn->hw_info.hw_mode = hw_mode;
+
+       DP_VERBOSE(p_hwfn, (NETIF_MSG_PROBE | NETIF_MSG_IFUP),
+                  "Configuring function for hw_mode: 0x%08x\n",
+                  p_hwfn->hw_info.hw_mode);
 }
 
 /* Init run time data for all PFs on an engine. */
@@ -821,6 +835,11 @@ int qed_hw_init(struct qed_dev *cdev,
        u32 load_code, param;
        int rc, mfw_rc, i;
 
+       if ((int_mode == QED_INT_MODE_MSI) && (cdev->num_hwfns > 1)) {
+               DP_NOTICE(cdev, "MSI mode is not supported for CMT devices\n");
+               return -EINVAL;
+       }
+
        if (IS_PF(cdev)) {
                rc = qed_init_fw_data(cdev, bin_fw_data);
                if (rc != 0)
@@ -2086,6 +2105,13 @@ void qed_configure_vp_wfq_on_link_change(struct qed_dev *cdev, u32 min_pf_rate)
 {
        int i;
 
+       if (cdev->num_hwfns > 1) {
+               DP_VERBOSE(cdev,
+                          NETIF_MSG_LINK,
+                          "WFQ configuration is not supported for this device\n");
+               return;
+       }
+
        for_each_hwfn(cdev, i) {
                struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
 
index 8b22f87..7530646 100644 (file)
@@ -413,15 +413,17 @@ static int qed_set_int_mode(struct qed_dev *cdev, bool force_mode)
                /* Fallthrough */
 
        case QED_INT_MODE_MSI:
-               rc = pci_enable_msi(cdev->pdev);
-               if (!rc) {
-                       int_params->out.int_mode = QED_INT_MODE_MSI;
-                       goto out;
-               }
+               if (cdev->num_hwfns == 1) {
+                       rc = pci_enable_msi(cdev->pdev);
+                       if (!rc) {
+                               int_params->out.int_mode = QED_INT_MODE_MSI;
+                               goto out;
+                       }
 
-               DP_NOTICE(cdev, "Failed to enable MSI\n");
-               if (force_mode)
-                       goto out;
+                       DP_NOTICE(cdev, "Failed to enable MSI\n");
+                       if (force_mode)
+                               goto out;
+               }
                /* Fallthrough */
 
        case QED_INT_MODE_INTA:
index 1bc7535..ad3cae3 100644 (file)
@@ -230,7 +230,10 @@ static int qede_get_sset_count(struct net_device *dev, int stringset)
        case ETH_SS_PRIV_FLAGS:
                return QEDE_PRI_FLAG_LEN;
        case ETH_SS_TEST:
-               return QEDE_ETHTOOL_TEST_MAX;
+               if (!IS_VF(edev))
+                       return QEDE_ETHTOOL_TEST_MAX;
+               else
+                       return 0;
        default:
                DP_VERBOSE(edev, QED_MSG_DEBUG,
                           "Unsupported stringset 0x%08x\n", stringset);
index 337e839..5d00d14 100644 (file)
@@ -1824,7 +1824,7 @@ static int qede_set_vf_rate(struct net_device *dev, int vfidx,
 {
        struct qede_dev *edev = netdev_priv(dev);
 
-       return edev->ops->iov->set_rate(edev->cdev, vfidx, max_tx_rate,
+       return edev->ops->iov->set_rate(edev->cdev, vfidx, min_tx_rate,
                                        max_tx_rate);
 }
 
@@ -2091,6 +2091,29 @@ static void qede_vlan_mark_nonconfigured(struct qede_dev *edev)
        edev->accept_any_vlan = false;
 }
 
+int qede_set_features(struct net_device *dev, netdev_features_t features)
+{
+       struct qede_dev *edev = netdev_priv(dev);
+       netdev_features_t changes = features ^ dev->features;
+       bool need_reload = false;
+
+       /* No action needed if hardware GRO is disabled during driver load */
+       if (changes & NETIF_F_GRO) {
+               if (dev->features & NETIF_F_GRO)
+                       need_reload = !edev->gro_disable;
+               else
+                       need_reload = edev->gro_disable;
+       }
+
+       if (need_reload && netif_running(edev->ndev)) {
+               dev->features = features;
+               qede_reload(edev, NULL, NULL);
+               return 1;
+       }
+
+       return 0;
+}
+
 #ifdef CONFIG_QEDE_VXLAN
 static void qede_add_vxlan_port(struct net_device *dev,
                                sa_family_t sa_family, __be16 port)
@@ -2175,6 +2198,7 @@ static const struct net_device_ops qede_netdev_ops = {
 #endif
        .ndo_vlan_rx_add_vid = qede_vlan_rx_add_vid,
        .ndo_vlan_rx_kill_vid = qede_vlan_rx_kill_vid,
+       .ndo_set_features = qede_set_features,
        .ndo_get_stats64 = qede_get_stats64,
 #ifdef CONFIG_QED_SRIOV
        .ndo_set_vf_link_state = qede_set_vf_link_state,
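
Note: qede_set_features() above only forces a reload when toggling NETIF_F_GRO would actually change the hardware-GRO state chosen at load time. A standalone sketch of that decision follows; the flag constant is a stand-in, not the real netdev feature bit.

/* Standalone sketch of the GRO-toggle reload decision. */
#include <stdbool.h>
#include <stdio.h>

#define F_GRO 0x1UL   /* stand-in feature bit */

static bool need_reload(unsigned long old_features,
                        unsigned long new_features, bool hw_gro_disabled)
{
        unsigned long changes = old_features ^ new_features;

        if (!(changes & F_GRO))
                return false;

        /* turning GRO off matters only if HW GRO is currently on;
         * turning it on matters only if HW GRO is currently off */
        return (old_features & F_GRO) ? !hw_gro_disabled : hw_gro_disabled;
}

int main(void)
{
        printf("off->on, hw gro disabled: %d\n", need_reload(0, F_GRO, true));
        printf("off->on, hw gro enabled:  %d\n", need_reload(0, F_GRO, false));
        printf("unchanged:                %d\n", need_reload(F_GRO, F_GRO, true));
        return 0;
}
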
index 83d7210..fd5d1c9 100644 (file)
@@ -4846,7 +4846,6 @@ static void ql_eeh_close(struct net_device *ndev)
        }
 
        /* Disabling the timer */
-       del_timer_sync(&qdev->timer);
        ql_cancel_all_work_sync(qdev);
 
        for (i = 0; i < qdev->rss_ring_count; i++)
@@ -4873,6 +4872,7 @@ static pci_ers_result_t qlge_io_error_detected(struct pci_dev *pdev,
                return PCI_ERS_RESULT_CAN_RECOVER;
        case pci_channel_io_frozen:
                netif_device_detach(ndev);
+               del_timer_sync(&qdev->timer);
                if (netif_running(ndev))
                        ql_eeh_close(ndev);
                pci_disable_device(pdev);
@@ -4880,6 +4880,7 @@ static pci_ers_result_t qlge_io_error_detected(struct pci_dev *pdev,
        case pci_channel_io_perm_failure:
                dev_err(&pdev->dev,
                        "%s: pci_channel_io_perm_failure.\n", __func__);
+               del_timer_sync(&qdev->timer);
                ql_eeh_close(ndev);
                set_bit(QL_EEH_FATAL, &qdev->flags);
                return PCI_ERS_RESULT_DISCONNECT;
index 1681084..1f30912 100644 (file)
@@ -619,6 +619,17 @@ fail:
        return rc;
 }
 
+static void efx_ef10_forget_old_piobufs(struct efx_nic *efx)
+{
+       struct efx_channel *channel;
+       struct efx_tx_queue *tx_queue;
+
+       /* All our existing PIO buffers went away */
+       efx_for_each_channel(channel, efx)
+               efx_for_each_channel_tx_queue(tx_queue, channel)
+                       tx_queue->piobuf = NULL;
+}
+
 #else /* !EFX_USE_PIO */
 
 static int efx_ef10_alloc_piobufs(struct efx_nic *efx, unsigned int n)
@@ -635,6 +646,10 @@ static void efx_ef10_free_piobufs(struct efx_nic *efx)
 {
 }
 
+static void efx_ef10_forget_old_piobufs(struct efx_nic *efx)
+{
+}
+
 #endif /* EFX_USE_PIO */
 
 static void efx_ef10_remove(struct efx_nic *efx)
@@ -1018,6 +1033,7 @@ static void efx_ef10_reset_mc_allocations(struct efx_nic *efx)
        nic_data->must_realloc_vis = true;
        nic_data->must_restore_filters = true;
        nic_data->must_restore_piobufs = true;
+       efx_ef10_forget_old_piobufs(efx);
        nic_data->rx_rss_context = EFX_EF10_RSS_CONTEXT_INVALID;
 
        /* Driver-created vswitches and vports must be re-created */
index 0705ec8..097f363 100644 (file)
@@ -1726,14 +1726,33 @@ static int efx_probe_filters(struct efx_nic *efx)
 
 #ifdef CONFIG_RFS_ACCEL
        if (efx->type->offload_features & NETIF_F_NTUPLE) {
-               efx->rps_flow_id = kcalloc(efx->type->max_rx_ip_filters,
-                                          sizeof(*efx->rps_flow_id),
-                                          GFP_KERNEL);
-               if (!efx->rps_flow_id) {
+               struct efx_channel *channel;
+               int i, success = 1;
+
+               efx_for_each_channel(channel, efx) {
+                       channel->rps_flow_id =
+                               kcalloc(efx->type->max_rx_ip_filters,
+                                       sizeof(*channel->rps_flow_id),
+                                       GFP_KERNEL);
+                       if (!channel->rps_flow_id)
+                               success = 0;
+                       else
+                               for (i = 0;
+                                    i < efx->type->max_rx_ip_filters;
+                                    ++i)
+                                       channel->rps_flow_id[i] =
+                                               RPS_FLOW_ID_INVALID;
+               }
+
+               if (!success) {
+                       efx_for_each_channel(channel, efx)
+                               kfree(channel->rps_flow_id);
                        efx->type->filter_table_remove(efx);
                        rc = -ENOMEM;
                        goto out_unlock;
                }
+
+               efx->rps_expire_index = efx->rps_expire_channel = 0;
        }
 #endif
 out_unlock:
@@ -1744,7 +1763,10 @@ out_unlock:
 static void efx_remove_filters(struct efx_nic *efx)
 {
 #ifdef CONFIG_RFS_ACCEL
-       kfree(efx->rps_flow_id);
+       struct efx_channel *channel;
+
+       efx_for_each_channel(channel, efx)
+               kfree(channel->rps_flow_id);
 #endif
        down_write(&efx->filter_sem);
        efx->type->filter_table_remove(efx);
index 38c4223..d13ddf9 100644 (file)
@@ -403,6 +403,8 @@ enum efx_sync_events_state {
  * @event_test_cpu: Last CPU to handle interrupt or test event for this channel
  * @irq_count: Number of IRQs since last adaptive moderation decision
  * @irq_mod_score: IRQ moderation score
+ * @rps_flow_id: Flow IDs of filters allocated for accelerated RFS,
+ *      indexed by filter ID
  * @n_rx_tobe_disc: Count of RX_TOBE_DISC errors
  * @n_rx_ip_hdr_chksum_err: Count of RX IP header checksum errors
  * @n_rx_tcp_udp_chksum_err: Count of RX TCP and UDP checksum errors
@@ -446,6 +448,8 @@ struct efx_channel {
        unsigned int irq_mod_score;
 #ifdef CONFIG_RFS_ACCEL
        unsigned int rfs_filters_added;
+#define RPS_FLOW_ID_INVALID 0xFFFFFFFF
+       u32 *rps_flow_id;
 #endif
 
        unsigned n_rx_tobe_disc;
@@ -889,9 +893,9 @@ struct vfdi_status;
  * @filter_sem: Filter table rw_semaphore, for freeing the table
  * @filter_lock: Filter table lock, for mere content changes
  * @filter_state: Architecture-dependent filter table state
- * @rps_flow_id: Flow IDs of filters allocated for accelerated RFS,
- *     indexed by filter ID
- * @rps_expire_index: Next index to check for expiry in @rps_flow_id
+ * @rps_expire_channel: Next channel to check for expiry
+ * @rps_expire_index: Next index to check for expiry in
+ *     @rps_expire_channel's @rps_flow_id
  * @active_queues: Count of RX and TX queues that haven't been flushed and drained.
  * @rxq_flush_pending: Count of number of receive queues that need to be flushed.
  *     Decremented when the efx_flush_rx_queue() is called.
@@ -1035,7 +1039,7 @@ struct efx_nic {
        spinlock_t filter_lock;
        void *filter_state;
 #ifdef CONFIG_RFS_ACCEL
-       u32 *rps_flow_id;
+       unsigned int rps_expire_channel;
        unsigned int rps_expire_index;
 #endif
 
index 8956995..02b0b52 100644 (file)
@@ -842,33 +842,18 @@ int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
        struct efx_nic *efx = netdev_priv(net_dev);
        struct efx_channel *channel;
        struct efx_filter_spec spec;
-       const __be16 *ports;
-       __be16 ether_type;
-       int nhoff;
+       struct flow_keys fk;
        int rc;
 
-       /* The core RPS/RFS code has already parsed and validated
-        * VLAN, IP and transport headers.  We assume they are in the
-        * header area.
-        */
-
-       if (skb->protocol == htons(ETH_P_8021Q)) {
-               const struct vlan_hdr *vh =
-                       (const struct vlan_hdr *)skb->data;
+       if (flow_id == RPS_FLOW_ID_INVALID)
+               return -EINVAL;
 
-               /* We can't filter on the IP 5-tuple and the vlan
-                * together, so just strip the vlan header and filter
-                * on the IP part.
-                */
-               EFX_BUG_ON_PARANOID(skb_headlen(skb) < sizeof(*vh));
-               ether_type = vh->h_vlan_encapsulated_proto;
-               nhoff = sizeof(struct vlan_hdr);
-       } else {
-               ether_type = skb->protocol;
-               nhoff = 0;
-       }
+       if (!skb_flow_dissect_flow_keys(skb, &fk, 0))
+               return -EPROTONOSUPPORT;
 
-       if (ether_type != htons(ETH_P_IP) && ether_type != htons(ETH_P_IPV6))
+       if (fk.basic.n_proto != htons(ETH_P_IP) && fk.basic.n_proto != htons(ETH_P_IPV6))
+               return -EPROTONOSUPPORT;
+       if (fk.control.flags & FLOW_DIS_IS_FRAGMENT)
                return -EPROTONOSUPPORT;
 
        efx_filter_init_rx(&spec, EFX_FILTER_PRI_HINT,
@@ -878,56 +863,41 @@ int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
                EFX_FILTER_MATCH_ETHER_TYPE | EFX_FILTER_MATCH_IP_PROTO |
                EFX_FILTER_MATCH_LOC_HOST | EFX_FILTER_MATCH_LOC_PORT |
                EFX_FILTER_MATCH_REM_HOST | EFX_FILTER_MATCH_REM_PORT;
-       spec.ether_type = ether_type;
-
-       if (ether_type == htons(ETH_P_IP)) {
-               const struct iphdr *ip =
-                       (const struct iphdr *)(skb->data + nhoff);
-
-               EFX_BUG_ON_PARANOID(skb_headlen(skb) < nhoff + sizeof(*ip));
-               if (ip_is_fragment(ip))
-                       return -EPROTONOSUPPORT;
-               spec.ip_proto = ip->protocol;
-               spec.rem_host[0] = ip->saddr;
-               spec.loc_host[0] = ip->daddr;
-               EFX_BUG_ON_PARANOID(skb_headlen(skb) < nhoff + 4 * ip->ihl + 4);
-               ports = (const __be16 *)(skb->data + nhoff + 4 * ip->ihl);
+       spec.ether_type = fk.basic.n_proto;
+       spec.ip_proto = fk.basic.ip_proto;
+
+       if (fk.basic.n_proto == htons(ETH_P_IP)) {
+               spec.rem_host[0] = fk.addrs.v4addrs.src;
+               spec.loc_host[0] = fk.addrs.v4addrs.dst;
        } else {
-               const struct ipv6hdr *ip6 =
-                       (const struct ipv6hdr *)(skb->data + nhoff);
-
-               EFX_BUG_ON_PARANOID(skb_headlen(skb) <
-                                   nhoff + sizeof(*ip6) + 4);
-               spec.ip_proto = ip6->nexthdr;
-               memcpy(spec.rem_host, &ip6->saddr, sizeof(ip6->saddr));
-               memcpy(spec.loc_host, &ip6->daddr, sizeof(ip6->daddr));
-               ports = (const __be16 *)(ip6 + 1);
+               memcpy(spec.rem_host, &fk.addrs.v6addrs.src, sizeof(struct in6_addr));
+               memcpy(spec.loc_host, &fk.addrs.v6addrs.dst, sizeof(struct in6_addr));
        }
 
-       spec.rem_port = ports[0];
-       spec.loc_port = ports[1];
+       spec.rem_port = fk.ports.src;
+       spec.loc_port = fk.ports.dst;
 
        rc = efx->type->filter_rfs_insert(efx, &spec);
        if (rc < 0)
                return rc;
 
        /* Remember this so we can check whether to expire the filter later */
-       efx->rps_flow_id[rc] = flow_id;
-       channel = efx_get_channel(efx, skb_get_rx_queue(skb));
+       channel = efx_get_channel(efx, rxq_index);
+       channel->rps_flow_id[rc] = flow_id;
        ++channel->rfs_filters_added;
 
-       if (ether_type == htons(ETH_P_IP))
+       if (spec.ether_type == htons(ETH_P_IP))
                netif_info(efx, rx_status, efx->net_dev,
                           "steering %s %pI4:%u:%pI4:%u to queue %u [flow %u filter %d]\n",
                           (spec.ip_proto == IPPROTO_TCP) ? "TCP" : "UDP",
-                          spec.rem_host, ntohs(ports[0]), spec.loc_host,
-                          ntohs(ports[1]), rxq_index, flow_id, rc);
+                          spec.rem_host, ntohs(spec.rem_port), spec.loc_host,
+                          ntohs(spec.loc_port), rxq_index, flow_id, rc);
        else
                netif_info(efx, rx_status, efx->net_dev,
                           "steering %s [%pI6]:%u:[%pI6]:%u to queue %u [flow %u filter %d]\n",
                           (spec.ip_proto == IPPROTO_TCP) ? "TCP" : "UDP",
-                          spec.rem_host, ntohs(ports[0]), spec.loc_host,
-                          ntohs(ports[1]), rxq_index, flow_id, rc);
+                          spec.rem_host, ntohs(spec.rem_port), spec.loc_host,
+                          ntohs(spec.loc_port), rxq_index, flow_id, rc);
 
        return rc;
 }
@@ -935,24 +905,34 @@ int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
 bool __efx_filter_rfs_expire(struct efx_nic *efx, unsigned int quota)
 {
        bool (*expire_one)(struct efx_nic *efx, u32 flow_id, unsigned int index);
-       unsigned int index, size;
+       unsigned int channel_idx, index, size;
        u32 flow_id;
 
        if (!spin_trylock_bh(&efx->filter_lock))
                return false;
 
        expire_one = efx->type->filter_rfs_expire_one;
+       channel_idx = efx->rps_expire_channel;
        index = efx->rps_expire_index;
        size = efx->type->max_rx_ip_filters;
        while (quota--) {
-               flow_id = efx->rps_flow_id[index];
-               if (expire_one(efx, flow_id, index))
+               struct efx_channel *channel = efx_get_channel(efx, channel_idx);
+               flow_id = channel->rps_flow_id[index];
+
+               if (flow_id != RPS_FLOW_ID_INVALID &&
+                   expire_one(efx, flow_id, index)) {
                        netif_info(efx, rx_status, efx->net_dev,
-                                  "expired filter %d [flow %u]\n",
-                                  index, flow_id);
-               if (++index == size)
+                                  "expired filter %d [queue %u flow %u]\n",
+                                  index, channel_idx, flow_id);
+                       channel->rps_flow_id[index] = RPS_FLOW_ID_INVALID;
+               }
+               if (++index == size) {
+                       if (++channel_idx == efx->n_channels)
+                               channel_idx = 0;
                        index = 0;
+               }
        }
+       efx->rps_expire_channel = channel_idx;
        efx->rps_expire_index = index;
 
        spin_unlock_bh(&efx->filter_lock);
index 3f83c36..ec29585 100644 (file)
@@ -297,7 +297,7 @@ int stmmac_mdio_register(struct net_device *ndev)
                return -ENOMEM;
 
        if (mdio_bus_data->irqs)
-               memcpy(new_bus->irq, mdio_bus_data, sizeof(new_bus->irq));
+               memcpy(new_bus->irq, mdio_bus_data->irqs, sizeof(new_bus->irq));
 
 #ifdef CONFIG_OF
        if (priv->device->of_node)
index a0f64cb..2ace126 100644 (file)
@@ -990,7 +990,7 @@ static void team_port_disable(struct team *team,
 #define TEAM_ENC_FEATURES      (NETIF_F_HW_CSUM | NETIF_F_SG | \
                                 NETIF_F_RXCSUM | NETIF_F_ALL_TSO)
 
-static void __team_compute_features(struct team *team)
+static void ___team_compute_features(struct team *team)
 {
        struct team_port *port;
        u32 vlan_features = TEAM_VLAN_FEATURES & NETIF_F_ALL_FOR_ALL;
@@ -1021,15 +1021,20 @@ static void __team_compute_features(struct team *team)
        team->dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
        if (dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM))
                team->dev->priv_flags |= IFF_XMIT_DST_RELEASE;
+}
 
+static void __team_compute_features(struct team *team)
+{
+       ___team_compute_features(team);
        netdev_change_features(team->dev);
 }
 
 static void team_compute_features(struct team *team)
 {
        mutex_lock(&team->lock);
-       __team_compute_features(team);
+       ___team_compute_features(team);
        mutex_unlock(&team->lock);
+       netdev_change_features(team->dev);
 }
 
 static int team_port_enter(struct team *team, struct team_port *port)
index 36cd7f0..9bbe016 100644 (file)
@@ -473,7 +473,7 @@ static void read_bulk_callback(struct urb *urb)
                goto goon;
        }
 
-       if (!count || count < 4)
+       if (count < 4)
                goto goon;
 
        rx_status = buf[count - 2];
index d9d2806..dc989a8 100644 (file)
@@ -61,6 +61,8 @@
 #define SUSPEND_ALLMODES               (SUSPEND_SUSPEND0 | SUSPEND_SUSPEND1 | \
                                         SUSPEND_SUSPEND2 | SUSPEND_SUSPEND3)
 
+#define CARRIER_CHECK_DELAY (2 * HZ)
+
 struct smsc95xx_priv {
        u32 mac_cr;
        u32 hash_hi;
@@ -69,6 +71,9 @@ struct smsc95xx_priv {
        spinlock_t mac_cr_lock;
        u8 features;
        u8 suspend_flags;
+       bool link_ok;
+       struct delayed_work carrier_check;
+       struct usbnet *dev;
 };
 
 static bool turbo_mode = true;
@@ -624,6 +629,44 @@ static void smsc95xx_status(struct usbnet *dev, struct urb *urb)
                            intdata);
 }
 
+static void set_carrier(struct usbnet *dev, bool link)
+{
+       struct smsc95xx_priv *pdata = (struct smsc95xx_priv *)(dev->data[0]);
+
+       if (pdata->link_ok == link)
+               return;
+
+       pdata->link_ok = link;
+
+       if (link)
+               usbnet_link_change(dev, 1, 0);
+       else
+               usbnet_link_change(dev, 0, 0);
+}
+
+static void check_carrier(struct work_struct *work)
+{
+       struct smsc95xx_priv *pdata = container_of(work, struct smsc95xx_priv,
+                                               carrier_check.work);
+       struct usbnet *dev = pdata->dev;
+       int ret;
+
+       if (pdata->suspend_flags != 0)
+               return;
+
+       ret = smsc95xx_mdio_read(dev->net, dev->mii.phy_id, MII_BMSR);
+       if (ret < 0) {
+               netdev_warn(dev->net, "Failed to read MII_BMSR\n");
+               return;
+       }
+       if (ret & BMSR_LSTATUS)
+               set_carrier(dev, 1);
+       else
+               set_carrier(dev, 0);
+
+       schedule_delayed_work(&pdata->carrier_check, CARRIER_CHECK_DELAY);
+}
+
 /* Enable or disable Tx & Rx checksum offload engines */
 static int smsc95xx_set_features(struct net_device *netdev,
        netdev_features_t features)
@@ -1165,13 +1208,20 @@ static int smsc95xx_bind(struct usbnet *dev, struct usb_interface *intf)
        dev->net->flags |= IFF_MULTICAST;
        dev->net->hard_header_len += SMSC95XX_TX_OVERHEAD_CSUM;
        dev->hard_mtu = dev->net->mtu + dev->net->hard_header_len;
+
+       pdata->dev = dev;
+       INIT_DELAYED_WORK(&pdata->carrier_check, check_carrier);
+       schedule_delayed_work(&pdata->carrier_check, CARRIER_CHECK_DELAY);
+
        return 0;
 }
 
 static void smsc95xx_unbind(struct usbnet *dev, struct usb_interface *intf)
 {
        struct smsc95xx_priv *pdata = (struct smsc95xx_priv *)(dev->data[0]);
+
        if (pdata) {
+               cancel_delayed_work(&pdata->carrier_check);
                netif_dbg(dev, ifdown, dev->net, "free pdata\n");
                kfree(pdata);
                pdata = NULL;
@@ -1695,6 +1745,7 @@ static int smsc95xx_resume(struct usb_interface *intf)
 
        /* do this first to ensure it's cleared even in error case */
        pdata->suspend_flags = 0;
+       schedule_delayed_work(&pdata->carrier_check, CARRIER_CHECK_DELAY);
 
        if (suspend_flags & SUSPEND_ALLMODES) {
                /* clear wake-up sources */
index 49d84e5..e0638e5 100644 (file)
@@ -1925,24 +1925,11 @@ static int virtnet_probe(struct virtio_device *vdev)
 
        virtio_device_ready(vdev);
 
-       /* Last of all, set up some receive buffers. */
-       for (i = 0; i < vi->curr_queue_pairs; i++) {
-               try_fill_recv(vi, &vi->rq[i], GFP_KERNEL);
-
-               /* If we didn't even get one input buffer, we're useless. */
-               if (vi->rq[i].vq->num_free ==
-                   virtqueue_get_vring_size(vi->rq[i].vq)) {
-                       free_unused_bufs(vi);
-                       err = -ENOMEM;
-                       goto free_recv_bufs;
-               }
-       }
-
        vi->nb.notifier_call = &virtnet_cpu_callback;
        err = register_hotcpu_notifier(&vi->nb);
        if (err) {
                pr_debug("virtio_net: registering cpu notifier failed\n");
-               goto free_recv_bufs;
+               goto free_unregister_netdev;
        }
 
        /* Assume link up if device can't report link status,
@@ -1960,10 +1947,9 @@ static int virtnet_probe(struct virtio_device *vdev)
 
        return 0;
 
-free_recv_bufs:
+free_unregister_netdev:
        vi->vdev->config->reset(vdev);
 
-       free_receive_bufs(vi);
        unregister_netdev(dev);
 free_vqs:
        cancel_delayed_work_sync(&vi->refill);
index 8ff30c3..f999db2 100644 (file)
@@ -3086,6 +3086,9 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev,
        if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL])
                conf.flags |= VXLAN_F_REMCSUM_NOPARTIAL;
 
+       if (tb[IFLA_MTU])
+               conf.mtu = nla_get_u32(tb[IFLA_MTU]);
+
        err = vxlan_dev_configure(src_net, dev, &conf);
        switch (err) {
        case -ENODEV:
index 020ac1a..cea9443 100644 (file)
@@ -382,7 +382,7 @@ static int wlcore_probe_of(struct spi_device *spi, struct wl12xx_spi_glue *glue,
 
        ret = of_property_read_u32(dt_node, "ref-clock-frequency",
                                   &pdev_data->ref_clock_freq);
-       if (IS_ERR_VALUE(ret)) {
+       if (ret) {
                dev_err(glue->dev,
                        "can't get reference clock frequency (%d)\n", ret);
                return ret;
@@ -425,7 +425,7 @@ static int wl1271_probe(struct spi_device *spi)
        }
 
        ret = wlcore_probe_of(spi, glue, &pdev_data);
-       if (IS_ERR_VALUE(ret)) {
+       if (ret) {
                dev_err(glue->dev,
                        "can't get device tree parameters (%d)\n", ret);
                return ret;
index 042baec..608fc44 100644 (file)
@@ -164,14 +164,22 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
 }
 
 static long pmem_direct_access(struct block_device *bdev, sector_t sector,
-                     void __pmem **kaddr, pfn_t *pfn)
+                     void __pmem **kaddr, pfn_t *pfn, long size)
 {
        struct pmem_device *pmem = bdev->bd_queue->queuedata;
        resource_size_t offset = sector * 512 + pmem->data_offset;
 
+       if (unlikely(is_bad_pmem(&pmem->bb, sector, size)))
+               return -EIO;
        *kaddr = pmem->virt_addr + offset;
        *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
 
+       /*
+        * If badblocks are present, limit known good range to the
+        * requested range.
+        */
+       if (unlikely(pmem->bb.count))
+               return size;
        return pmem->size - pmem->pfn_pad - offset;
 }
 
index 2de248b..1a51584 100644 (file)
@@ -95,6 +95,15 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
                        break;
                }
                break;
+       case NVME_CTRL_DEAD:
+               switch (old_state) {
+               case NVME_CTRL_DELETING:
+                       changed = true;
+                       /* FALLTHRU */
+               default:
+                       break;
+               }
+               break;
        default:
                break;
        }
@@ -720,10 +729,14 @@ static void nvme_init_integrity(struct nvme_ns *ns)
        switch (ns->pi_type) {
        case NVME_NS_DPS_PI_TYPE3:
                integrity.profile = &t10_pi_type3_crc;
+               integrity.tag_size = sizeof(u16) + sizeof(u32);
+               integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
                break;
        case NVME_NS_DPS_PI_TYPE1:
        case NVME_NS_DPS_PI_TYPE2:
                integrity.profile = &t10_pi_type1_crc;
+               integrity.tag_size = sizeof(u16);
+               integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
                break;
        default:
                integrity.profile = NULL;
@@ -1212,6 +1225,9 @@ static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
                return ctrl->ops->reset_ctrl(ctrl);
        case NVME_IOCTL_SUBSYS_RESET:
                return nvme_reset_subsystem(ctrl);
+       case NVME_IOCTL_RESCAN:
+               nvme_queue_scan(ctrl);
+               return 0;
        default:
                return -ENOTTY;
        }
@@ -1239,6 +1255,17 @@ static ssize_t nvme_sysfs_reset(struct device *dev,
 }
 static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);
 
+static ssize_t nvme_sysfs_rescan(struct device *dev,
+                               struct device_attribute *attr, const char *buf,
+                               size_t count)
+{
+       struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+       nvme_queue_scan(ctrl);
+       return count;
+}
+static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan);
+
 static ssize_t wwid_show(struct device *dev, struct device_attribute *attr,
                                                                char *buf)
 {
@@ -1342,6 +1369,7 @@ nvme_show_int_function(cntlid);
 
 static struct attribute *nvme_dev_attrs[] = {
        &dev_attr_reset_controller.attr,
+       &dev_attr_rescan_controller.attr,
        &dev_attr_model.attr,
        &dev_attr_serial.attr,
        &dev_attr_firmware_rev.attr,
@@ -1580,6 +1608,15 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
 {
        struct nvme_ns *ns, *next;
 
+       /*
+        * The dead states indicates the controller was not gracefully
+        * disconnected. In that case, we won't be able to flush any data while
+        * removing the namespaces' disks; fail all the queues now to avoid
+        * potentially having to clean up the failed sync later.
+        */
+       if (ctrl->state == NVME_CTRL_DEAD)
+               nvme_kill_queues(ctrl);
+
        mutex_lock(&ctrl->namespaces_mutex);
        list_for_each_entry_safe(ns, next, &ctrl->namespaces, list)
                nvme_ns_remove(ns);
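
A hedged userspace sketch of the rescan interface added above: both the new rescan_controller sysfs attribute and the NVME_IOCTL_RESCAN ioctl simply queue nvme_queue_scan(). The device paths and the ioctl number below are assumptions for illustration, not taken from this diff.

#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>

#ifndef NVME_IOCTL_RESCAN
#define NVME_IOCTL_RESCAN _IO('N', 0x46)	/* assumed to match linux/nvme_ioctl.h */
#endif

int main(void)
{
	/* Either write to the sysfs attribute (controller name assumed)... */
	int sfd = open("/sys/class/nvme/nvme0/rescan_controller", O_WRONLY);
	if (sfd >= 0) {
		write(sfd, "1", 1);
		close(sfd);
	}

	/* ...or issue the ioctl on the controller character device. */
	int cfd = open("/dev/nvme0", O_RDWR);
	if (cfd >= 0) {
		ioctl(cfd, NVME_IOCTL_RESCAN);
		close(cfd);
	}
	return 0;
}
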
index 114b928..1daa048 100644 (file)
@@ -72,6 +72,7 @@ enum nvme_ctrl_state {
        NVME_CTRL_LIVE,
        NVME_CTRL_RESETTING,
        NVME_CTRL_DELETING,
+       NVME_CTRL_DEAD,
 };
 
 struct nvme_ctrl {
index 0f093f1..78dca31 100644 (file)
@@ -1394,7 +1394,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
        struct pci_dev *pdev = to_pci_dev(dev->dev);
        int result, i, vecs, nr_io_queues, size;
 
-       nr_io_queues = num_possible_cpus();
+       nr_io_queues = num_online_cpus();
        result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
        if (result < 0)
                return result;
@@ -1551,12 +1551,12 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
 
 static void nvme_disable_io_queues(struct nvme_dev *dev)
 {
-       int pass;
+       int pass, queues = dev->online_queues - 1;
        unsigned long timeout;
        u8 opcode = nvme_admin_delete_sq;
 
        for (pass = 0; pass < 2; pass++) {
-               int sent = 0, i = dev->queue_count - 1;
+               int sent = 0, i = queues;
 
                reinit_completion(&dev->ioq_wait);
  retry:
@@ -1857,7 +1857,7 @@ static void nvme_remove_dead_ctrl_work(struct work_struct *work)
 
        nvme_kill_queues(&dev->ctrl);
        if (pci_get_drvdata(pdev))
-               pci_stop_and_remove_bus_device_locked(pdev);
+               device_release_driver(&pdev->dev);
        nvme_put_ctrl(&dev->ctrl);
 }
 
@@ -2017,6 +2017,10 @@ static void nvme_remove(struct pci_dev *pdev)
        nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
 
        pci_set_drvdata(pdev, NULL);
+
+       if (!pci_device_is_present(pdev))
+               nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
+
        flush_work(&dev->reset_work);
        nvme_uninit_ctrl(&dev->ctrl);
        nvme_dev_disable(dev, true);
@@ -2060,14 +2064,17 @@ static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
         * shutdown the controller to quiesce. The controller will be restarted
         * after the slot reset through driver's slot_reset callback.
         */
-       dev_warn(dev->ctrl.device, "error detected: state:%d\n", state);
        switch (state) {
        case pci_channel_io_normal:
                return PCI_ERS_RESULT_CAN_RECOVER;
        case pci_channel_io_frozen:
+               dev_warn(dev->ctrl.device,
+                       "frozen state error detected, reset controller\n");
                nvme_dev_disable(dev, false);
                return PCI_ERS_RESULT_NEED_RESET;
        case pci_channel_io_perm_failure:
+               dev_warn(dev->ctrl.device,
+                       "failure state error detected, request disconnect\n");
                return PCI_ERS_RESULT_DISCONNECT;
        }
        return PCI_ERS_RESULT_NEED_RESET;
@@ -2102,6 +2109,12 @@ static const struct pci_device_id nvme_id_table[] = {
        { PCI_VDEVICE(INTEL, 0x0953),
                .driver_data = NVME_QUIRK_STRIPE_SIZE |
                                NVME_QUIRK_DISCARD_ZEROES, },
+       { PCI_VDEVICE(INTEL, 0x0a53),
+               .driver_data = NVME_QUIRK_STRIPE_SIZE |
+                               NVME_QUIRK_DISCARD_ZEROES, },
+       { PCI_VDEVICE(INTEL, 0x0a54),
+               .driver_data = NVME_QUIRK_STRIPE_SIZE |
+                               NVME_QUIRK_DISCARD_ZEROES, },
        { PCI_VDEVICE(INTEL, 0x5845),   /* Qemu emulated controller */
                .driver_data = NVME_QUIRK_IDENTIFY_CNS, },
        { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
index bb4ea12..965911d 100644 (file)
@@ -113,7 +113,7 @@ static ssize_t bin_attr_nvmem_read(struct file *filp, struct kobject *kobj,
 
        rc = nvmem_reg_read(nvmem, pos, buf, count);
 
-       if (IS_ERR_VALUE(rc))
+       if (rc)
                return rc;
 
        return count;
@@ -147,7 +147,7 @@ static ssize_t bin_attr_nvmem_write(struct file *filp, struct kobject *kobj,
 
        rc = nvmem_reg_write(nvmem, pos, buf, count);
 
-       if (IS_ERR_VALUE(rc))
+       if (rc)
                return rc;
 
        return count;
@@ -366,7 +366,7 @@ static int nvmem_add_cells(struct nvmem_device *nvmem,
                }
 
                rval = nvmem_cell_info_to_nvmem_cell(nvmem, &info[i], cells[i]);
-               if (IS_ERR_VALUE(rval)) {
+               if (rval) {
                        kfree(cells[i]);
                        goto err;
                }
@@ -963,7 +963,7 @@ static int __nvmem_cell_read(struct nvmem_device *nvmem,
 
        rc = nvmem_reg_read(nvmem, cell->offset, buf, cell->bytes);
 
-       if (IS_ERR_VALUE(rc))
+       if (rc)
                return rc;
 
        /* shift bits in-place */
@@ -998,7 +998,7 @@ void *nvmem_cell_read(struct nvmem_cell *cell, size_t *len)
                return ERR_PTR(-ENOMEM);
 
        rc = __nvmem_cell_read(nvmem, cell, buf, len);
-       if (IS_ERR_VALUE(rc)) {
+       if (rc) {
                kfree(buf);
                return ERR_PTR(rc);
        }
@@ -1083,7 +1083,7 @@ int nvmem_cell_write(struct nvmem_cell *cell, void *buf, size_t len)
        if (cell->bit_offset || cell->nbits)
                kfree(buf);
 
-       if (IS_ERR_VALUE(rc))
+       if (rc)
                return rc;
 
        return len;
@@ -1111,11 +1111,11 @@ ssize_t nvmem_device_cell_read(struct nvmem_device *nvmem,
                return -EINVAL;
 
        rc = nvmem_cell_info_to_nvmem_cell(nvmem, info, &cell);
-       if (IS_ERR_VALUE(rc))
+       if (rc)
                return rc;
 
        rc = __nvmem_cell_read(nvmem, &cell, buf, &len);
-       if (IS_ERR_VALUE(rc))
+       if (rc)
                return rc;
 
        return len;
@@ -1141,7 +1141,7 @@ int nvmem_device_cell_write(struct nvmem_device *nvmem,
                return -EINVAL;
 
        rc = nvmem_cell_info_to_nvmem_cell(nvmem, info, &cell);
-       if (IS_ERR_VALUE(rc))
+       if (rc)
                return rc;
 
        return nvmem_cell_write(&cell, buf, cell.bytes);
@@ -1170,7 +1170,7 @@ int nvmem_device_read(struct nvmem_device *nvmem,
 
        rc = nvmem_reg_read(nvmem, offset, buf, bytes);
 
-       if (IS_ERR_VALUE(rc))
+       if (rc)
                return rc;
 
        return bytes;
@@ -1198,7 +1198,7 @@ int nvmem_device_write(struct nvmem_device *nvmem,
 
        rc = nvmem_reg_write(nvmem, offset, buf, bytes);
 
-       if (IS_ERR_VALUE(rc))
+       if (rc)
                return rc;
 
 
index bee3fa9..d7efd9d 100644 (file)
@@ -10,7 +10,6 @@ obj-$(CONFIG_OF_UNITTEST) += unittest.o
 obj-$(CONFIG_OF_MDIO)  += of_mdio.o
 obj-$(CONFIG_OF_PCI)   += of_pci.o
 obj-$(CONFIG_OF_PCI_IRQ)  += of_pci_irq.o
-obj-$(CONFIG_OF_MTD)   += of_mtd.o
 obj-$(CONFIG_OF_RESERVED_MEM) += of_reserved_mem.o
 obj-$(CONFIG_OF_RESOLVE)  += resolver.o
 obj-$(CONFIG_OF_OVERLAY) += overlay.o
diff --git a/drivers/of/of_mtd.c b/drivers/of/of_mtd.c
deleted file mode 100644 (file)
index b7361ed..0000000
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright 2012 Jean-Christophe PLAGNIOL-VILLARD <plagnioj@jcrosoft.com>
- *
- * OF helpers for mtd.
- *
- * This file is released under the GPLv2
- *
- */
-#include <linux/kernel.h>
-#include <linux/of_mtd.h>
-#include <linux/mtd/nand.h>
-#include <linux/export.h>
-
-/**
- * It maps 'enum nand_ecc_modes_t' found in include/linux/mtd/nand.h
- * into the device tree binding of 'nand-ecc', so that MTD
- * device driver can get nand ecc from device tree.
- */
-static const char *nand_ecc_modes[] = {
-       [NAND_ECC_NONE]         = "none",
-       [NAND_ECC_SOFT]         = "soft",
-       [NAND_ECC_HW]           = "hw",
-       [NAND_ECC_HW_SYNDROME]  = "hw_syndrome",
-       [NAND_ECC_HW_OOB_FIRST] = "hw_oob_first",
-       [NAND_ECC_SOFT_BCH]     = "soft_bch",
-};
-
-/**
- * of_get_nand_ecc_mode - Get nand ecc mode for given device_node
- * @np:        Pointer to the given device_node
- *
- * The function gets ecc mode string from property 'nand-ecc-mode',
- * and return its index in nand_ecc_modes table, or errno in error case.
- */
-int of_get_nand_ecc_mode(struct device_node *np)
-{
-       const char *pm;
-       int err, i;
-
-       err = of_property_read_string(np, "nand-ecc-mode", &pm);
-       if (err < 0)
-               return err;
-
-       for (i = 0; i < ARRAY_SIZE(nand_ecc_modes); i++)
-               if (!strcasecmp(pm, nand_ecc_modes[i]))
-                       return i;
-
-       return -ENODEV;
-}
-EXPORT_SYMBOL_GPL(of_get_nand_ecc_mode);
-
-/**
- * of_get_nand_ecc_step_size - Get ECC step size associated to
- * the required ECC strength (see below).
- * @np:        Pointer to the given device_node
- *
- * return the ECC step size, or errno in error case.
- */
-int of_get_nand_ecc_step_size(struct device_node *np)
-{
-       int ret;
-       u32 val;
-
-       ret = of_property_read_u32(np, "nand-ecc-step-size", &val);
-       return ret ? ret : val;
-}
-EXPORT_SYMBOL_GPL(of_get_nand_ecc_step_size);
-
-/**
- * of_get_nand_ecc_strength - Get required ECC strength over the
- * correspnding step size as defined by 'nand-ecc-size'
- * @np:        Pointer to the given device_node
- *
- * return the ECC strength, or errno in error case.
- */
-int of_get_nand_ecc_strength(struct device_node *np)
-{
-       int ret;
-       u32 val;
-
-       ret = of_property_read_u32(np, "nand-ecc-strength", &val);
-       return ret ? ret : val;
-}
-EXPORT_SYMBOL_GPL(of_get_nand_ecc_strength);
-
-/**
- * of_get_nand_bus_width - Get nand bus witdh for given device_node
- * @np:        Pointer to the given device_node
- *
- * return bus width option, or errno in error case.
- */
-int of_get_nand_bus_width(struct device_node *np)
-{
-       u32 val;
-
-       if (of_property_read_u32(np, "nand-bus-width", &val))
-               return 8;
-
-       switch(val) {
-       case 8:
-       case 16:
-               return val;
-       default:
-               return -EIO;
-       }
-}
-EXPORT_SYMBOL_GPL(of_get_nand_bus_width);
-
-/**
- * of_get_nand_on_flash_bbt - Get nand on flash bbt for given device_node
- * @np:        Pointer to the given device_node
- *
- * return true if present false other wise
- */
-bool of_get_nand_on_flash_bbt(struct device_node *np)
-{
-       return of_property_read_bool(np, "nand-on-flash-bbt");
-}
-EXPORT_SYMBOL_GPL(of_get_nand_on_flash_bbt);
index f2d01d4..1b8304e 100644 (file)
@@ -950,17 +950,14 @@ static int of_pmu_irq_cfg(struct arm_pmu *pmu)
 
                /* For SPIs, we need to track the affinity per IRQ */
                if (using_spi) {
-                       if (i >= pdev->num_resources) {
-                               of_node_put(dn);
+                       if (i >= pdev->num_resources)
                                break;
-                       }
 
                        irqs[i] = cpu;
                }
 
                /* Keep track of the CPUs containing this PMU type */
                cpumask_set_cpu(cpu, &pmu->supported_cpus);
-               of_node_put(dn);
                i++;
        } while (1);
 
@@ -995,9 +992,6 @@ int arm_pmu_device_probe(struct platform_device *pdev,
 
        armpmu_init(pmu);
 
-       if (!__oprofile_cpu_pmu)
-               __oprofile_cpu_pmu = pmu;
-
        pmu->plat_device = pdev;
 
        if (node && (of_id = of_match_node(of_table, pdev->dev.of_node))) {
@@ -1033,6 +1027,9 @@ int arm_pmu_device_probe(struct platform_device *pdev,
        if (ret)
                goto out_destroy;
 
+       if (!__oprofile_cpu_pmu)
+               __oprofile_cpu_pmu = pmu;
+
        pr_info("enabled with %s PMU driver, %d counters available\n",
                        pmu->name, pmu->num_events);
 
@@ -1043,6 +1040,7 @@ out_destroy:
 out_free:
        pr_info("%s: failed to register PMU devices!\n",
                of_node_full_name(node));
+       kfree(pmu->irq_affinity);
        kfree(pmu);
        return ret;
 }
index 55182fc..677a811 100644 (file)
@@ -153,8 +153,10 @@ struct byt_community {
                .name                   = (n),                  \
                .pins                   = (p),                  \
                .npins                  = ARRAY_SIZE((p)),      \
-               .has_simple_funcs       = 1,            \
-               .simple_funcs           = (f),                  \
+               .has_simple_funcs       = 1,                    \
+               {                                               \
+                       .simple_funcs           = (f),          \
+               },                                              \
                .nfuncs                 = ARRAY_SIZE((f)),      \
        }
 #define PIN_GROUP_MIXED(n, p, f)                               \
@@ -163,7 +165,9 @@ struct byt_community {
                .pins                   = (p),                  \
                .npins                  = ARRAY_SIZE((p)),      \
                .has_simple_funcs       = 0,                    \
-               .mixed_funcs            = (f),                  \
+               {                                               \
+                       .mixed_funcs            = (f),          \
+               },                                              \
                .nfuncs                 = ARRAY_SIZE((f)),      \
        }
 
index 207b13b..a607655 100644 (file)
@@ -1256,9 +1256,10 @@ static void mtk_eint_irq_handler(struct irq_desc *desc)
        const struct mtk_desc_pin *pin;
 
        chained_irq_enter(chip, desc);
-       for (eint_num = 0; eint_num < pctl->devdata->ap_num; eint_num += 32) {
+       for (eint_num = 0;
+            eint_num < pctl->devdata->ap_num;
+            eint_num += 32, reg += 4) {
                status = readl(reg);
-               reg += 4;
                while (status) {
                        offset = __ffs(status);
                        index = eint_num + offset;
index ccbfc32..38facef 100644 (file)
@@ -854,7 +854,7 @@ static int nmk_gpio_get_dir(struct gpio_chip *chip, unsigned offset)
 
        clk_enable(nmk_chip->clk);
 
-       dir = !!(readl(nmk_chip->addr + NMK_GPIO_DIR) & BIT(offset));
+       dir = !(readl(nmk_chip->addr + NMK_GPIO_DIR) & BIT(offset));
 
        clk_disable(nmk_chip->clk);
 
index d03df4a..76bdae1 100644 (file)
@@ -64,4 +64,14 @@ config CROS_EC_PROTO
         help
           ChromeOS EC communication protocol helpers.
 
+config CROS_KBD_LED_BACKLIGHT
+       tristate "Backlight LED support for Chrome OS keyboards"
+       depends on LEDS_CLASS && ACPI
+       help
+         This option enables support for the keyboard backlight LEDs on
+         select Chrome OS systems.
+
+         To compile this driver as a module, choose M here: the
+         module will be called cros_kbd_led_backlight.
+
 endif # CHROMEOS_PLATFORMS
index bc498bd..4f34627 100644 (file)
@@ -1,8 +1,9 @@
 
-obj-$(CONFIG_CHROMEOS_LAPTOP)  += chromeos_laptop.o
-obj-$(CONFIG_CHROMEOS_PSTORE)  += chromeos_pstore.o
-cros_ec_devs-objs              := cros_ec_dev.o cros_ec_sysfs.o \
-                                  cros_ec_lightbar.o cros_ec_vbc.o
-obj-$(CONFIG_CROS_EC_CHARDEV)   += cros_ec_devs.o
-obj-$(CONFIG_CROS_EC_LPC)       += cros_ec_lpc.o
-obj-$(CONFIG_CROS_EC_PROTO)    += cros_ec_proto.o
+obj-$(CONFIG_CHROMEOS_LAPTOP)          += chromeos_laptop.o
+obj-$(CONFIG_CHROMEOS_PSTORE)          += chromeos_pstore.o
+cros_ec_devs-objs                      := cros_ec_dev.o cros_ec_sysfs.o \
+                                          cros_ec_lightbar.o cros_ec_vbc.o
+obj-$(CONFIG_CROS_EC_CHARDEV)          += cros_ec_devs.o
+obj-$(CONFIG_CROS_EC_LPC)              += cros_ec_lpc.o
+obj-$(CONFIG_CROS_EC_PROTO)            += cros_ec_proto.o
+obj-$(CONFIG_CROS_KBD_LED_BACKLIGHT)   += cros_kbd_led_backlight.o
index 2b441e9..e8a44a9 100644 (file)
@@ -34,6 +34,7 @@
 #define ATMEL_TS_I2C_ADDR      0x4a
 #define ATMEL_TS_I2C_BL_ADDR   0x26
 #define CYAPA_TP_I2C_ADDR      0x67
+#define ELAN_TP_I2C_ADDR       0x15
 #define ISL_ALS_I2C_ADDR       0x44
 #define TAOS_ALS_I2C_ADDR      0x29
 
@@ -73,7 +74,7 @@ struct i2c_peripheral {
        int tries;
 };
 
-#define MAX_I2C_PERIPHERALS 3
+#define MAX_I2C_PERIPHERALS 4
 
 struct chromeos_laptop {
        struct i2c_peripheral i2c_peripherals[MAX_I2C_PERIPHERALS];
@@ -86,6 +87,11 @@ static struct i2c_board_info cyapa_device = {
        .flags          = I2C_CLIENT_WAKE,
 };
 
+static struct i2c_board_info elantech_device = {
+       I2C_BOARD_INFO("elan_i2c", ELAN_TP_I2C_ADDR),
+       .flags          = I2C_CLIENT_WAKE,
+};
+
 static struct i2c_board_info isl_als_device = {
        I2C_BOARD_INFO("isl29018", ISL_ALS_I2C_ADDR),
 };
@@ -306,6 +312,16 @@ static int setup_atmel_224s_tp(enum i2c_adapter_type type)
        return (!tp) ? -EAGAIN : 0;
 }
 
+static int setup_elantech_tp(enum i2c_adapter_type type)
+{
+       if (tp)
+               return 0;
+
+       /* add elantech touchpad */
+       tp = add_i2c_device("trackpad", type, &elantech_device);
+       return (!tp) ? -EAGAIN : 0;
+}
+
 static int setup_atmel_1664s_ts(enum i2c_adapter_type type)
 {
        const unsigned short addr_list[] = { ATMEL_TS_I2C_BL_ADDR,
@@ -445,6 +461,8 @@ static struct chromeos_laptop dell_chromebook_11 = {
        .i2c_peripherals = {
                /* Touchpad. */
                { .add = setup_cyapa_tp, I2C_ADAPTER_DESIGNWARE_0 },
+               /* Elan Touchpad option. */
+               { .add = setup_elantech_tp, I2C_ADAPTER_DESIGNWARE_0 },
        },
 };
 
@@ -475,6 +493,8 @@ static struct chromeos_laptop acer_c720 = {
                { .add = setup_atmel_1664s_ts, I2C_ADAPTER_DESIGNWARE_1 },
                /* Touchpad. */
                { .add = setup_cyapa_tp, I2C_ADAPTER_DESIGNWARE_0 },
+               /* Elan Touchpad option. */
+               { .add = setup_elantech_tp, I2C_ADAPTER_DESIGNWARE_0 },
                /* Light Sensor. */
                { .add = setup_isl29018_als, I2C_ADAPTER_DESIGNWARE_1 },
        },
index 3474920..308a853 100644 (file)
@@ -8,6 +8,7 @@
  *  the Free Software Foundation, version 2 of the License.
  */
 
+#include <linux/acpi.h>
 #include <linux/dmi.h>
 #include <linux/module.h>
 #include <linux/platform_device.h>
@@ -58,7 +59,7 @@ MODULE_DEVICE_TABLE(dmi, chromeos_pstore_dmi_table);
 static struct ramoops_platform_data chromeos_ramoops_data = {
        .mem_size       = 0x100000,
        .mem_address    = 0xf00000,
-       .record_size    = 0x20000,
+       .record_size    = 0x40000,
        .console_size   = 0x20000,
        .ftrace_size    = 0x20000,
        .dump_oops      = 1,
@@ -71,9 +72,59 @@ static struct platform_device chromeos_ramoops = {
        },
 };
 
+#ifdef CONFIG_ACPI
+static const struct acpi_device_id cros_ramoops_acpi_match[] = {
+       { "GOOG9999", 0 },
+       { }
+};
+MODULE_DEVICE_TABLE(acpi, cros_ramoops_acpi_match);
+
+static struct platform_driver chromeos_ramoops_acpi = {
+       .driver         = {
+               .name   = "chromeos_pstore",
+               .acpi_match_table = ACPI_PTR(cros_ramoops_acpi_match),
+       },
+};
+
+static int __init chromeos_probe_acpi(struct platform_device *pdev)
+{
+       struct resource *res;
+       resource_size_t len;
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       if (!res)
+               return -ENOMEM;
+
+       len = resource_size(res);
+       if (!res->start || !len)
+               return -ENOMEM;
+
+       pr_info("chromeos ramoops using acpi device.\n");
+
+       chromeos_ramoops_data.mem_size = len;
+       chromeos_ramoops_data.mem_address = res->start;
+
+       return 0;
+}
+
+static bool __init chromeos_check_acpi(void)
+{
+       if (!platform_driver_probe(&chromeos_ramoops_acpi, chromeos_probe_acpi))
+               return true;
+       return false;
+}
+#else
+static inline bool chromeos_check_acpi(void) { return false; }
+#endif
+
 static int __init chromeos_pstore_init(void)
 {
-       if (dmi_check_system(chromeos_pstore_dmi_table))
+       bool acpi_dev_found;
+
+       /* First check ACPI for non-hardcoded values from firmware. */
+       acpi_dev_found = chromeos_check_acpi();
+
+       if (acpi_dev_found || dmi_check_system(chromeos_pstore_dmi_table))
                return platform_device_register(&chromeos_ramoops);
 
        return -ENODEV;
index d45cd25..6d8ee3b 100644 (file)
@@ -137,6 +137,10 @@ static long ec_device_ioctl_xcmd(struct cros_ec_dev *ec, void __user *arg)
        if (copy_from_user(&u_cmd, arg, sizeof(u_cmd)))
                return -EFAULT;
 
+       if ((u_cmd.outsize > EC_MAX_MSG_BYTES) ||
+           (u_cmd.insize > EC_MAX_MSG_BYTES))
+               return -EINVAL;
+
        s_cmd = kmalloc(sizeof(*s_cmd) + max(u_cmd.outsize, u_cmd.insize),
                        GFP_KERNEL);
        if (!s_cmd)
@@ -208,6 +212,9 @@ static const struct file_operations fops = {
        .release = ec_device_release,
        .read = ec_device_read,
        .unlocked_ioctl = ec_device_ioctl,
+#ifdef CONFIG_COMPAT
+       .compat_ioctl = ec_device_ioctl,
+#endif
 };
 
 static void __remove(struct device *dev)
index ff76405..8df3d44 100644 (file)
@@ -412,9 +412,13 @@ static umode_t cros_ec_lightbar_attrs_are_visible(struct kobject *kobj,
        struct device *dev = container_of(kobj, struct device, kobj);
        struct cros_ec_dev *ec = container_of(dev,
                                              struct cros_ec_dev, class_dev);
-       struct platform_device *pdev = container_of(ec->dev,
-                                                  struct platform_device, dev);
-       if (pdev->id != 0)
+       struct platform_device *pdev = to_platform_device(ec->dev);
+       struct cros_ec_platform *pdata = pdev->dev.platform_data;
+       int is_cros_ec;
+
+       is_cros_ec = strcmp(pdata->ec_name, CROS_EC_DEV_NAME);
+
+       if (is_cros_ec != 0)
                return 0;
 
        /* Only instantiate this stuff if the EC has a lightbar */
index 990308c..b6e161f 100644 (file)
@@ -298,8 +298,8 @@ int cros_ec_query_all(struct cros_ec_device *ec_dev)
                        ec_dev->max_response = EC_PROTO2_MAX_PARAM_SIZE;
                        ec_dev->max_passthru = 0;
                        ec_dev->pkt_xfer = NULL;
-                       ec_dev->din_size = EC_MSG_BYTES;
-                       ec_dev->dout_size = EC_MSG_BYTES;
+                       ec_dev->din_size = EC_PROTO2_MSG_BYTES;
+                       ec_dev->dout_size = EC_PROTO2_MSG_BYTES;
                } else {
                        /*
                         * It's possible for a test to occur too early when
diff --git a/drivers/platform/chrome/cros_kbd_led_backlight.c b/drivers/platform/chrome/cros_kbd_led_backlight.c
new file mode 100644 (file)
index 0000000..ca3e4da
--- /dev/null
@@ -0,0 +1,122 @@
+/*
+ *  Keyboard backlight LED driver for Chrome OS.
+ *
+ *  Copyright (C) 2012 Google, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ */
+
+#include <linux/acpi.h>
+#include <linux/leds.h>
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+
+/* Keyboard LED ACPI Device must be defined in firmware */
+#define ACPI_KEYBOARD_BACKLIGHT_DEVICE "\\_SB.KBLT"
+#define ACPI_KEYBOARD_BACKLIGHT_READ   ACPI_KEYBOARD_BACKLIGHT_DEVICE ".KBQC"
+#define ACPI_KEYBOARD_BACKLIGHT_WRITE  ACPI_KEYBOARD_BACKLIGHT_DEVICE ".KBCM"
+
+#define ACPI_KEYBOARD_BACKLIGHT_MAX            100
+
+static void keyboard_led_set_brightness(struct led_classdev *cdev,
+                                       enum led_brightness brightness)
+{
+       union acpi_object param;
+       struct acpi_object_list input;
+       acpi_status status;
+
+       param.type = ACPI_TYPE_INTEGER;
+       param.integer.value = brightness;
+       input.count = 1;
+       input.pointer = &param;
+
+       status = acpi_evaluate_object(NULL, ACPI_KEYBOARD_BACKLIGHT_WRITE,
+                                     &input, NULL);
+       if (ACPI_FAILURE(status))
+               dev_err(cdev->dev, "Error setting keyboard LED value: %d\n",
+                       status);
+}
+
+static enum led_brightness
+keyboard_led_get_brightness(struct led_classdev *cdev)
+{
+       unsigned long long brightness;
+       acpi_status status;
+
+       status = acpi_evaluate_integer(NULL, ACPI_KEYBOARD_BACKLIGHT_READ,
+                                      NULL, &brightness);
+       if (ACPI_FAILURE(status)) {
+               dev_err(cdev->dev, "Error getting keyboard LED value: %d\n",
+                       status);
+               return -EIO;
+       }
+
+       return brightness;
+}
+
+static int keyboard_led_probe(struct platform_device *pdev)
+{
+       struct led_classdev *cdev;
+       acpi_handle handle;
+       acpi_status status;
+       int error;
+
+       /* Look for the keyboard LED ACPI Device */
+       status = acpi_get_handle(ACPI_ROOT_OBJECT,
+                                ACPI_KEYBOARD_BACKLIGHT_DEVICE,
+                                &handle);
+       if (ACPI_FAILURE(status)) {
+               dev_err(&pdev->dev, "Unable to find ACPI device %s: %d\n",
+                       ACPI_KEYBOARD_BACKLIGHT_DEVICE, status);
+               return -ENXIO;
+       }
+
+       cdev = devm_kzalloc(&pdev->dev, sizeof(*cdev), GFP_KERNEL);
+       if (!cdev)
+               return -ENOMEM;
+
+       cdev->name = "chromeos::kbd_backlight";
+       cdev->max_brightness = ACPI_KEYBOARD_BACKLIGHT_MAX;
+       cdev->flags |= LED_CORE_SUSPENDRESUME;
+       cdev->brightness_set = keyboard_led_set_brightness;
+       cdev->brightness_get = keyboard_led_get_brightness;
+
+       error = devm_led_classdev_register(&pdev->dev, cdev);
+       if (error)
+               return error;
+
+       return 0;
+}
+
+static const struct acpi_device_id keyboard_led_id[] = {
+       { "GOOG0002", 0 },
+       { }
+};
+MODULE_DEVICE_TABLE(acpi, keyboard_led_id);
+
+static struct platform_driver keyboard_led_driver = {
+       .driver         = {
+               .name   = "chromeos-keyboard-leds",
+               .acpi_match_table = ACPI_PTR(keyboard_led_id),
+       },
+       .probe          = keyboard_led_probe,
+};
+module_platform_driver(keyboard_led_driver);
+
+MODULE_AUTHOR("Simon Que <sque@chromium.org>");
+MODULE_DESCRIPTION("ChromeOS Keyboard backlight LED Driver");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:chromeos-keyboard-leds");
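
For context, a minimal userspace sketch of driving the LED class device registered by the new driver above; the sysfs path is inferred from cdev->name and the standard leds class layout, so treat it as an assumption rather than something stated in this diff.

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* Brightness ranges 0..100, per ACPI_KEYBOARD_BACKLIGHT_MAX above. */
	int fd = open("/sys/class/leds/chromeos::kbd_backlight/brightness",
		      O_WRONLY);
	if (fd < 0)
		return 1;
	write(fd, "50", 2);
	close(fd);
	return 0;
}
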
index ed2004b..c06bb85 100644 (file)
@@ -846,6 +846,18 @@ config INTEL_IMR
 
          If you are running on a Galileo/Quark say Y here.
 
+config INTEL_PMC_CORE
+       bool "Intel PMC Core driver"
+       depends on X86 && PCI
+       ---help---
+         The Intel Platform Controller Hub for Intel Core SoCs provides access
+         to Power Management Controller registers via a PCI interface. This
+         driver exposes the debugging capabilities and supported features of
+         the Power Management Controller.
+
+         Supported features:
+               - SLP_S0_RESIDENCY counter.
+
 config IBM_RTL
        tristate "Device driver to enable PRTL support"
        depends on X86 && PCI
index 448443c..9b11b40 100644 (file)
@@ -69,3 +69,4 @@ obj-$(CONFIG_INTEL_PUNIT_IPC)  += intel_punit_ipc.o
 obj-$(CONFIG_INTEL_TELEMETRY)  += intel_telemetry_core.o \
                                   intel_telemetry_pltdrv.o \
                                   intel_telemetry_debugfs.o
+obj-$(CONFIG_INTEL_PMC_CORE)    += intel_pmc_core.o
index f2b5d0a..15f1311 100644 (file)
@@ -771,12 +771,14 @@ static int asus_read_brightness(struct backlight_device *bd)
 {
        struct asus_laptop *asus = bl_get_data(bd);
        unsigned long long value;
-       acpi_status rv = AE_OK;
+       acpi_status rv;
 
        rv = acpi_evaluate_integer(asus->handle, METHOD_BRIGHTNESS_GET,
                                   NULL, &value);
-       if (ACPI_FAILURE(rv))
+       if (ACPI_FAILURE(rv)) {
                pr_warn("Error reading brightness\n");
+               return 0;
+       }
 
        return value;
 }
@@ -865,7 +867,7 @@ static ssize_t infos_show(struct device *dev, struct device_attribute *attr,
        int len = 0;
        unsigned long long temp;
        char buf[16];           /* enough for all info */
-       acpi_status rv = AE_OK;
+       acpi_status rv;
 
        /*
         * We use the easy way, we don't care of off and count,
@@ -946,11 +948,10 @@ static ssize_t sysfs_acpi_set(struct asus_laptop *asus,
                              const char *method)
 {
        int rv, value;
-       int out = 0;
 
        rv = parse_arg(buf, count, &value);
-       if (rv > 0)
-               out = value ? 1 : 0;
+       if (rv <= 0)
+               return rv;
 
        if (write_acpi_int(asus->handle, method, value))
                return -ENODEV;
@@ -1265,7 +1266,7 @@ static DEVICE_ATTR_RO(ls_value);
 static int asus_gps_status(struct asus_laptop *asus)
 {
        unsigned long long status;
-       acpi_status rv = AE_OK;
+       acpi_status rv;
 
        rv = acpi_evaluate_integer(asus->handle, METHOD_GPS_STATUS,
                                   NULL, &status);
index a96630d..a26dca3 100644 (file)
@@ -114,6 +114,7 @@ MODULE_LICENSE("GPL");
 #define ASUS_WMI_DEVID_LED6            0x00020016
 
 /* Backlight and Brightness */
+#define ASUS_WMI_DEVID_ALS_ENABLE      0x00050001 /* Ambient Light Sensor */
 #define ASUS_WMI_DEVID_BACKLIGHT       0x00050011
 #define ASUS_WMI_DEVID_BRIGHTNESS      0x00050012
 #define ASUS_WMI_DEVID_KBD_BACKLIGHT   0x00050021
@@ -1730,6 +1731,7 @@ ASUS_WMI_CREATE_DEVICE_ATTR(touchpad, 0644, ASUS_WMI_DEVID_TOUCHPAD);
 ASUS_WMI_CREATE_DEVICE_ATTR(camera, 0644, ASUS_WMI_DEVID_CAMERA);
 ASUS_WMI_CREATE_DEVICE_ATTR(cardr, 0644, ASUS_WMI_DEVID_CARDREADER);
 ASUS_WMI_CREATE_DEVICE_ATTR(lid_resume, 0644, ASUS_WMI_DEVID_LID_RESUME);
+ASUS_WMI_CREATE_DEVICE_ATTR(als_enable, 0644, ASUS_WMI_DEVID_ALS_ENABLE);
 
 static ssize_t store_cpufv(struct device *dev, struct device_attribute *attr,
                           const char *buf, size_t count)
@@ -1756,6 +1758,7 @@ static struct attribute *platform_attributes[] = {
        &dev_attr_cardr.attr,
        &dev_attr_touchpad.attr,
        &dev_attr_lid_resume.attr,
+       &dev_attr_als_enable.attr,
        NULL
 };
 
@@ -1776,6 +1779,8 @@ static umode_t asus_sysfs_is_visible(struct kobject *kobj,
                devid = ASUS_WMI_DEVID_TOUCHPAD;
        else if (attr == &dev_attr_lid_resume.attr)
                devid = ASUS_WMI_DEVID_LID_RESUME;
+       else if (attr == &dev_attr_als_enable.attr)
+               devid = ASUS_WMI_DEVID_ALS_ENABLE;
 
        if (devid != -1)
                ok = !(asus_wmi_get_devstate_simple(asus, devid) < 0);
index b51a200..dcd9f40 100644 (file)
@@ -28,6 +28,7 @@ struct rbtn_data {
        enum rbtn_type type;
        struct rfkill *rfkill;
        struct input_dev *input_dev;
+       bool suspended;
 };
 
 
@@ -235,9 +236,55 @@ static const struct acpi_device_id rbtn_ids[] = {
        { "", 0 },
 };
 
+#ifdef CONFIG_PM_SLEEP
+static void ACPI_SYSTEM_XFACE rbtn_clear_suspended_flag(void *context)
+{
+       struct rbtn_data *rbtn_data = context;
+
+       rbtn_data->suspended = false;
+}
+
+static int rbtn_suspend(struct device *dev)
+{
+       struct acpi_device *device = to_acpi_device(dev);
+       struct rbtn_data *rbtn_data = acpi_driver_data(device);
+
+       rbtn_data->suspended = true;
+
+       return 0;
+}
+
+static int rbtn_resume(struct device *dev)
+{
+       struct acpi_device *device = to_acpi_device(dev);
+       struct rbtn_data *rbtn_data = acpi_driver_data(device);
+       acpi_status status;
+
+       /*
+        * Upon resume, some BIOSes send an ACPI notification that triggers
+        * an unwanted input event. In order to ignore it, we use a flag
+        * that we set at suspend and clear once we have received the extra
+        * ACPI notification. Since ACPI notifications are delivered
+        * asynchronously to drivers, we clear the flag from the workqueue
+        * used to deliver the notifications. This should be enough
+        * to have the flag cleared only after we received the extra
+        * notification, if any.
+        */
+       status = acpi_os_execute(OSL_NOTIFY_HANDLER,
+                        rbtn_clear_suspended_flag, rbtn_data);
+       if (ACPI_FAILURE(status))
+               rbtn_clear_suspended_flag(rbtn_data);
+
+       return 0;
+}
+#endif
+
+static SIMPLE_DEV_PM_OPS(rbtn_pm_ops, rbtn_suspend, rbtn_resume);
+
 static struct acpi_driver rbtn_driver = {
        .name = "dell-rbtn",
        .ids = rbtn_ids,
+       .drv.pm = &rbtn_pm_ops,
        .ops = {
                .add = rbtn_add,
                .remove = rbtn_remove,
@@ -399,6 +446,15 @@ static void rbtn_notify(struct acpi_device *device, u32 event)
 {
        struct rbtn_data *rbtn_data = device->driver_data;
 
+       /*
+        * Some BIOSes send a notification at resume.
+        * Ignore it to prevent unwanted input events.
+        */
+       if (rbtn_data->suspended) {
+               dev_dbg(&device->dev, "ACPI notification ignored\n");
+               return;
+       }
+
        if (event != 0x80) {
                dev_info(&device->dev, "Received unknown event (0x%x)\n",
                         event);
index ffc84cc..ce41bc3 100644 (file)
@@ -69,7 +69,7 @@
 #include <linux/kfifo.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
-#if defined(CONFIG_LEDS_CLASS) || defined(CONFIG_LEDS_CLASS_MODULE)
+#if IS_ENABLED(CONFIG_LEDS_CLASS)
 #include <linux/leds.h>
 #endif
 #include <acpi/video.h>
 /* FUNC interface - responses */
 #define UNSUPPORTED_CMD 0x80000000
 
-#if defined(CONFIG_LEDS_CLASS) || defined(CONFIG_LEDS_CLASS_MODULE)
+#if IS_ENABLED(CONFIG_LEDS_CLASS)
 /* FUNC interface - LED control */
 #define FUNC_LED_OFF   0x1
 #define FUNC_LED_ON    0x30001
 #define KEYBOARD_LAMPS 0x100
 #define LOGOLAMP_POWERON 0x2000
 #define LOGOLAMP_ALWAYS  0x4000
+#define RADIO_LED_ON   0x20
 #endif
 
 /* Hotkey details */
@@ -174,13 +175,14 @@ struct fujitsu_hotkey_t {
        int rfkill_state;
        int logolamp_registered;
        int kblamps_registered;
+       int radio_led_registered;
 };
 
 static struct fujitsu_hotkey_t *fujitsu_hotkey;
 
 static void acpi_fujitsu_hotkey_notify(struct acpi_device *device, u32 event);
 
-#if defined(CONFIG_LEDS_CLASS) || defined(CONFIG_LEDS_CLASS_MODULE)
+#if IS_ENABLED(CONFIG_LEDS_CLASS)
 static enum led_brightness logolamp_get(struct led_classdev *cdev);
 static void logolamp_set(struct led_classdev *cdev,
                               enum led_brightness brightness);
@@ -200,6 +202,16 @@ static struct led_classdev kblamps_led = {
  .brightness_get = kblamps_get,
  .brightness_set = kblamps_set
 };
+
+static enum led_brightness radio_led_get(struct led_classdev *cdev);
+static void radio_led_set(struct led_classdev *cdev,
+                              enum led_brightness brightness);
+
+static struct led_classdev radio_led = {
+ .name = "fujitsu::radio_led",
+ .brightness_get = radio_led_get,
+ .brightness_set = radio_led_set
+};
 #endif
 
 #ifdef CONFIG_FUJITSU_LAPTOP_DEBUG
@@ -249,7 +261,7 @@ static int call_fext_func(int cmd, int arg0, int arg1, int arg2)
        return value;
 }
 
-#if defined(CONFIG_LEDS_CLASS) || defined(CONFIG_LEDS_CLASS_MODULE)
+#if IS_ENABLED(CONFIG_LEDS_CLASS)
 /* LED class callbacks */
 
 static void logolamp_set(struct led_classdev *cdev,
@@ -275,6 +287,15 @@ static void kblamps_set(struct led_classdev *cdev,
                call_fext_func(FUNC_LEDS, 0x1, KEYBOARD_LAMPS, FUNC_LED_OFF);
 }
 
+static void radio_led_set(struct led_classdev *cdev,
+                               enum led_brightness brightness)
+{
+       if (brightness >= LED_FULL)
+               call_fext_func(FUNC_RFKILL, 0x5, RADIO_LED_ON, RADIO_LED_ON);
+       else
+               call_fext_func(FUNC_RFKILL, 0x5, RADIO_LED_ON, 0x0);
+}
+
 static enum led_brightness logolamp_get(struct led_classdev *cdev)
 {
        enum led_brightness brightness = LED_OFF;
@@ -299,6 +320,16 @@ static enum led_brightness kblamps_get(struct led_classdev *cdev)
 
        return brightness;
 }
+
+static enum led_brightness radio_led_get(struct led_classdev *cdev)
+{
+       enum led_brightness brightness = LED_OFF;
+
+       if (call_fext_func(FUNC_RFKILL, 0x4, 0x0, 0x0) & RADIO_LED_ON)
+               brightness = LED_FULL;
+
+       return brightness;
+}
 #endif
 
 /* Hardware access for LCD brightness control */
@@ -872,7 +903,7 @@ static int acpi_fujitsu_hotkey_add(struct acpi_device *device)
        /* Suspect this is a keymap of the application panel, print it */
        pr_info("BTNI: [0x%x]\n", call_fext_func(FUNC_BUTTONS, 0x0, 0x0, 0x0));
 
-#if defined(CONFIG_LEDS_CLASS) || defined(CONFIG_LEDS_CLASS_MODULE)
+#if IS_ENABLED(CONFIG_LEDS_CLASS)
        if (call_fext_func(FUNC_LEDS, 0x0, 0x0, 0x0) & LOGOLAMP_POWERON) {
                result = led_classdev_register(&fujitsu->pf_device->dev,
                                                &logolamp_led);
@@ -895,6 +926,23 @@ static int acpi_fujitsu_hotkey_add(struct acpi_device *device)
                               result);
                }
        }
+
+       /*
+        * BTNI bit 24 seems to indicate the presence of a radio toggle
+        * button in place of a slide switch, and all such machines appear
+        * to also have an RF LED.  Therefore use bit 24 as an indicator
+        * that an RF LED is present.
+        */
+       if (call_fext_func(FUNC_BUTTONS, 0x0, 0x0, 0x0) & BIT(24)) {
+               result = led_classdev_register(&fujitsu->pf_device->dev,
+                                               &radio_led);
+               if (result == 0) {
+                       fujitsu_hotkey->radio_led_registered = 1;
+               } else {
+                       pr_err("Could not register LED handler for radio LED, error %i\n",
+                              result);
+               }
+       }
 #endif
 
        return result;
@@ -915,12 +963,15 @@ static int acpi_fujitsu_hotkey_remove(struct acpi_device *device)
        struct fujitsu_hotkey_t *fujitsu_hotkey = acpi_driver_data(device);
        struct input_dev *input = fujitsu_hotkey->input;
 
-#if defined(CONFIG_LEDS_CLASS) || defined(CONFIG_LEDS_CLASS_MODULE)
+#if IS_ENABLED(CONFIG_LEDS_CLASS)
        if (fujitsu_hotkey->logolamp_registered)
                led_classdev_unregister(&logolamp_led);
 
        if (fujitsu_hotkey->kblamps_registered)
                led_classdev_unregister(&kblamps_led);
+
+       if (fujitsu_hotkey->radio_led_registered)
+               led_classdev_unregister(&radio_led);
 #endif
 
        input_unregister_device(input);
index be3bc2f..4a23fbc 100644 (file)
 #define CFG_CAMERA_BIT (19)
 
 #if IS_ENABLED(CONFIG_ACPI_WMI)
-static const char ideapad_wmi_fnesc_event[] = "26CAB2E5-5CF1-46AE-AAC3-4A12B6BA50E6";
+static const char *const ideapad_wmi_fnesc_events[] = {
+       "26CAB2E5-5CF1-46AE-AAC3-4A12B6BA50E6", /* Yoga 3 */
+       "56322276-8493-4CE8-A783-98C991274F5E", /* Yoga 700 */
+};
 #endif
 
 enum {
@@ -93,6 +96,7 @@ struct ideapad_private {
        struct dentry *debug;
        unsigned long cfg;
        bool has_hw_rfkill_switch;
+       const char *fnesc_guid;
 };
 
 static bool no_bt_rfkill;
@@ -989,8 +993,16 @@ static int ideapad_acpi_add(struct platform_device *pdev)
                ACPI_DEVICE_NOTIFY, ideapad_acpi_notify, priv);
        if (ret)
                goto notification_failed;
+
 #if IS_ENABLED(CONFIG_ACPI_WMI)
-       ret = wmi_install_notify_handler(ideapad_wmi_fnesc_event, ideapad_wmi_notify, priv);
+       for (i = 0; i < ARRAY_SIZE(ideapad_wmi_fnesc_events); i++) {
+               ret = wmi_install_notify_handler(ideapad_wmi_fnesc_events[i],
+                                                ideapad_wmi_notify, priv);
+               if (ret == AE_OK) {
+                       priv->fnesc_guid = ideapad_wmi_fnesc_events[i];
+                       break;
+               }
+       }
        if (ret != AE_OK && ret != AE_NOT_EXIST)
                goto notification_failed_wmi;
 #endif
@@ -1020,7 +1032,8 @@ static int ideapad_acpi_remove(struct platform_device *pdev)
        int i;
 
 #if IS_ENABLED(CONFIG_ACPI_WMI)
-       wmi_remove_notify_handler(ideapad_wmi_fnesc_event);
+       if (priv->fnesc_guid)
+               wmi_remove_notify_handler(priv->fnesc_guid);
 #endif
        acpi_remove_notify_handler(priv->adev->handle,
                ACPI_DEVICE_NOTIFY, ideapad_acpi_notify);
index 0a919d8..cbe0102 100644 (file)
@@ -306,33 +306,32 @@ static int sensor_set_auxtrip(acpi_handle handle, int index, int value)
 #define to_intel_menlow_attr(_attr)    \
        container_of(_attr, struct intel_menlow_attribute, attr)
 
-static ssize_t aux0_show(struct device *dev,
-                        struct device_attribute *dev_attr, char *buf)
+static ssize_t aux_show(struct device *dev, struct device_attribute *dev_attr,
+                       char *buf, int idx)
 {
        struct intel_menlow_attribute *attr = to_intel_menlow_attr(dev_attr);
        unsigned long long value;
        int result;
 
-       result = sensor_get_auxtrip(attr->handle, 0, &value);
+       result = sensor_get_auxtrip(attr->handle, idx, &value);
 
        return result ? result : sprintf(buf, "%lu", DECI_KELVIN_TO_CELSIUS(value));
 }
 
-static ssize_t aux1_show(struct device *dev,
+static ssize_t aux0_show(struct device *dev,
                         struct device_attribute *dev_attr, char *buf)
 {
-       struct intel_menlow_attribute *attr = to_intel_menlow_attr(dev_attr);
-       unsigned long long value;
-       int result;
-
-       result = sensor_get_auxtrip(attr->handle, 1, &value);
+       return aux_show(dev, dev_attr, buf, 0);
+}
 
-       return result ? result : sprintf(buf, "%lu", DECI_KELVIN_TO_CELSIUS(value));
+static ssize_t aux1_show(struct device *dev,
+                        struct device_attribute *dev_attr, char *buf)
+{
+       return aux_show(dev, dev_attr, buf, 1);
 }
 
-static ssize_t aux0_store(struct device *dev,
-                         struct device_attribute *dev_attr,
-                         const char *buf, size_t count)
+static ssize_t aux_store(struct device *dev, struct device_attribute *dev_attr,
+                        const char *buf, size_t count, int idx)
 {
        struct intel_menlow_attribute *attr = to_intel_menlow_attr(dev_attr);
        int value;
@@ -345,27 +344,23 @@ static ssize_t aux0_store(struct device *dev,
        if (value < 0)
                return -EINVAL;
 
-       result = sensor_set_auxtrip(attr->handle, 0, CELSIUS_TO_DECI_KELVIN(value));
+       result = sensor_set_auxtrip(attr->handle, idx,
+                                   CELSIUS_TO_DECI_KELVIN(value));
        return result ? result : count;
 }
 
-static ssize_t aux1_store(struct device *dev,
+static ssize_t aux0_store(struct device *dev,
                          struct device_attribute *dev_attr,
                          const char *buf, size_t count)
 {
-       struct intel_menlow_attribute *attr = to_intel_menlow_attr(dev_attr);
-       int value;
-       int result;
-
-       /*Sanity check; should be a positive integer */
-       if (!sscanf(buf, "%d", &value))
-               return -EINVAL;
-
-       if (value < 0)
-               return -EINVAL;
+       return aux_store(dev, dev_attr, buf, count, 0);
+}
 
-       result = sensor_set_auxtrip(attr->handle, 1, CELSIUS_TO_DECI_KELVIN(value));
-       return result ? result : count;
+static ssize_t aux1_store(struct device *dev,
+                         struct device_attribute *dev_attr,
+                         const char *buf, size_t count)
+{
+       return aux_store(dev, dev_attr, buf, count, 1);
 }
 
 /* BIOS can enable/disable the thermal user application in dabney platform */
diff --git a/drivers/platform/x86/intel_pmc_core.c b/drivers/platform/x86/intel_pmc_core.c
new file mode 100644 (file)
index 0000000..2776bec
--- /dev/null
@@ -0,0 +1,200 @@
+/*
+ * Intel Core SoC Power Management Controller Driver
+ *
+ * Copyright (c) 2016, Intel Corporation.
+ * All Rights Reserved.
+ *
+ * Authors: Rajneesh Bhardwaj <rajneesh.bhardwaj@intel.com>
+ *          Vishwanath Somayaji <vishwanath.somayaji@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/pci.h>
+#include <linux/seq_file.h>
+
+#include <asm/cpu_device_id.h>
+#include <asm/pmc_core.h>
+
+#include "intel_pmc_core.h"
+
+static struct pmc_dev pmc;
+
+static const struct pci_device_id pmc_pci_ids[] = {
+       { PCI_VDEVICE(INTEL, SPT_PMC_PCI_DEVICE_ID), (kernel_ulong_t)NULL },
+       { 0, },
+};
+
+static inline u32 pmc_core_reg_read(struct pmc_dev *pmcdev, int reg_offset)
+{
+       return readl(pmcdev->regbase + reg_offset);
+}
+
+static inline u32 pmc_core_adjust_slp_s0_step(u32 value)
+{
+       return value * SPT_PMC_SLP_S0_RES_COUNTER_STEP;
+}
+
+/**
+ * intel_pmc_slp_s0_counter_read() - Read SLP_S0 residency.
+ * @data: Out param that contains current SLP_S0 count.
+ *
+ * This API currently supports Intel Skylake SoC and Sunrise
+ * Point Platform Controller Hub. Future platform support
+ * should be added for platforms that support low power modes
+ * beyond Package C10 state.
+ *
+ * The SLP_S0_RESIDENCY counter counts in steps of 100 us,
+ * hence this function populates the multiplied (microsecond)
+ * value in the out parameter @data.
+ *
+ * Return: an error code or 0 on success.
+ */
+int intel_pmc_slp_s0_counter_read(u32 *data)
+{
+       struct pmc_dev *pmcdev = &pmc;
+       u32 value;
+
+       if (!pmcdev->has_slp_s0_res)
+               return -EACCES;
+
+       value = pmc_core_reg_read(pmcdev, SPT_PMC_SLP_S0_RES_COUNTER_OFFSET);
+       *data = pmc_core_adjust_slp_s0_step(value);
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(intel_pmc_slp_s0_counter_read);
+
+#if IS_ENABLED(CONFIG_DEBUG_FS)
+static int pmc_core_dev_state_show(struct seq_file *s, void *unused)
+{
+       struct pmc_dev *pmcdev = s->private;
+       u32 counter_val;
+
+       counter_val = pmc_core_reg_read(pmcdev,
+                                       SPT_PMC_SLP_S0_RES_COUNTER_OFFSET);
+       seq_printf(s, "%u\n", pmc_core_adjust_slp_s0_step(counter_val));
+
+       return 0;
+}
+
+static int pmc_core_dev_state_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, pmc_core_dev_state_show, inode->i_private);
+}
+
+static const struct file_operations pmc_core_dev_state_ops = {
+       .open           = pmc_core_dev_state_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+static void pmc_core_dbgfs_unregister(struct pmc_dev *pmcdev)
+{
+       debugfs_remove_recursive(pmcdev->dbgfs_dir);
+}
+
+static int pmc_core_dbgfs_register(struct pmc_dev *pmcdev)
+{
+       struct dentry *dir, *file;
+
+       dir = debugfs_create_dir("pmc_core", NULL);
+       if (!dir)
+               return -ENOMEM;
+
+       pmcdev->dbgfs_dir = dir;
+       file = debugfs_create_file("slp_s0_residency_usec", S_IFREG | S_IRUGO,
+                                  dir, pmcdev, &pmc_core_dev_state_ops);
+
+       if (!file) {
+               pmc_core_dbgfs_unregister(pmcdev);
+               return -ENODEV;
+       }
+
+       return 0;
+}
+#else
+static inline int pmc_core_dbgfs_register(struct pmc_dev *pmcdev)
+{
+       return 0;
+}
+
+static inline void pmc_core_dbgfs_unregister(struct pmc_dev *pmcdev)
+{
+}
+#endif /* CONFIG_DEBUG_FS */
+
+static const struct x86_cpu_id intel_pmc_core_ids[] = {
+       { X86_VENDOR_INTEL, 6, 0x4e, X86_FEATURE_MWAIT,
+               (kernel_ulong_t)NULL}, /* Skylake CPUID Signature */
+       { X86_VENDOR_INTEL, 6, 0x5e, X86_FEATURE_MWAIT,
+               (kernel_ulong_t)NULL}, /* Skylake CPUID Signature */
+       {}
+};
+
+static int pmc_core_probe(struct pci_dev *dev, const struct pci_device_id *id)
+{
+       struct device *ptr_dev = &dev->dev;
+       struct pmc_dev *pmcdev = &pmc;
+       const struct x86_cpu_id *cpu_id;
+       int err;
+
+       cpu_id = x86_match_cpu(intel_pmc_core_ids);
+       if (!cpu_id) {
+               dev_dbg(&dev->dev, "PMC Core: cpuid mismatch.\n");
+               return -EINVAL;
+       }
+
+       err = pcim_enable_device(dev);
+       if (err < 0) {
+               dev_dbg(&dev->dev, "PMC Core: failed to enable Power Management Controller.\n");
+               return err;
+       }
+
+       err = pci_read_config_dword(dev,
+                                   SPT_PMC_BASE_ADDR_OFFSET,
+                                   &pmcdev->base_addr);
+       if (err < 0) {
+               dev_dbg(&dev->dev, "PMC Core: failed to read PCI config space.\n");
+               return err;
+       }
+       dev_dbg(&dev->dev, "PMC Core: PWRMBASE is %#x\n", pmcdev->base_addr);
+
+       pmcdev->regbase = devm_ioremap_nocache(ptr_dev,
+                                             pmcdev->base_addr,
+                                             SPT_PMC_MMIO_REG_LEN);
+       if (!pmcdev->regbase) {
+               dev_dbg(&dev->dev, "PMC Core: ioremap failed.\n");
+               return -ENOMEM;
+       }
+
+       err = pmc_core_dbgfs_register(pmcdev);
+       if (err < 0) {
+               dev_err(&dev->dev, "PMC Core: debugfs register failed.\n");
+               return err;
+       }
+
+       pmc.has_slp_s0_res = true;
+       return 0;
+}
+
+static struct pci_driver intel_pmc_core_driver = {
+       .name = "intel_pmc_core",
+       .id_table = pmc_pci_ids,
+       .probe = pmc_core_probe,
+};
+
+builtin_pci_driver(intel_pmc_core_driver);
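
A minimal sketch of how another kernel driver could consume the counter exported above (example_report_slp_s0 is an illustrative name, and <asm/pmc_core.h> is assumed to be included by the caller):

    static void example_report_slp_s0(void)
    {
            u32 residency_us;

            if (!intel_pmc_slp_s0_counter_read(&residency_us))
                    pr_info("SLP_S0 residency: %u us\n", residency_us);
            else
                    pr_debug("SLP_S0 residency counter not available\n");
    }
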
diff --git a/drivers/platform/x86/intel_pmc_core.h b/drivers/platform/x86/intel_pmc_core.h
new file mode 100644 (file)
index 0000000..a9dadaf
--- /dev/null
@@ -0,0 +1,51 @@
+/*
+ * Intel Core SoC Power Management Controller Header File
+ *
+ * Copyright (c) 2016, Intel Corporation.
+ * All Rights Reserved.
+ *
+ * Authors: Rajneesh Bhardwaj <rajneesh.bhardwaj@intel.com>
+ *          Vishwanath Somayaji <vishwanath.somayaji@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#ifndef PMC_CORE_H
+#define PMC_CORE_H
+
+/* Sunrise Point Power Management Controller PCI Device ID */
+#define SPT_PMC_PCI_DEVICE_ID                  0x9d21
+#define SPT_PMC_BASE_ADDR_OFFSET               0x48
+#define SPT_PMC_SLP_S0_RES_COUNTER_OFFSET      0x13c
+#define SPT_PMC_MMIO_REG_LEN                   0x100
+#define SPT_PMC_SLP_S0_RES_COUNTER_STEP                0x64
+
+/**
+ * struct pmc_dev - pmc device structure
+ * @base_addr:         contains pmc base address
+ * @regbase:           pointer to io-remapped memory location
+ * @dbgfs_dir:         path to debug fs interface
+ * @has_slp_s0_res:    flag to indicate whether the SLP_S0 residency
+ *                     counter is available on a particular platform.
+ *
+ * pmc_dev contains info about power management controller device.
+ */
+struct pmc_dev {
+       u32 base_addr;
+       void __iomem *regbase;
+#if IS_ENABLED(CONFIG_DEBUG_FS)
+       struct dentry *dbgfs_dir;
+#endif /* CONFIG_DEBUG_FS */
+       bool has_slp_s0_res;
+};
+
+#endif /* PMC_CORE_H */
index a695a43..0d4c380 100644 (file)
@@ -25,7 +25,7 @@
 
 struct telemetry_core_config {
        struct telemetry_plt_config *plt_config;
-       struct telemetry_core_ops *telem_ops;
+       const struct telemetry_core_ops *telem_ops;
 };
 
 static struct telemetry_core_config telm_core_conf;
@@ -95,7 +95,7 @@ static int telemetry_def_reset_events(void)
        return 0;
 }
 
-static struct telemetry_core_ops telm_defpltops = {
+static const struct telemetry_core_ops telm_defpltops = {
        .set_sampling_period = telemetry_def_set_sampling_period,
        .get_sampling_period = telemetry_def_get_sampling_period,
        .get_trace_verbosity = telemetry_def_get_trace_verbosity,
@@ -332,7 +332,7 @@ EXPORT_SYMBOL_GPL(telemetry_set_trace_verbosity);
  *
  * Return: 0 success, < 0 for failure
  */
-int telemetry_set_pltdata(struct telemetry_core_ops *ops,
+int telemetry_set_pltdata(const struct telemetry_core_ops *ops,
                          struct telemetry_plt_config *pltconfig)
 {
        if (ops)
index 781bd10..09c84a2 100644 (file)
@@ -1081,7 +1081,7 @@ out:
        return ret;
 }
 
-static struct telemetry_core_ops telm_pltops = {
+static const struct telemetry_core_ops telm_pltops = {
        .get_trace_verbosity = telemetry_plt_get_trace_verbosity,
        .set_trace_verbosity = telemetry_plt_set_trace_verbosity,
        .set_sampling_period = telemetry_plt_set_sampling_period,
index e9caa34..1dba359 100644 (file)
@@ -1446,6 +1446,9 @@ static void sony_nc_function_cleanup(struct platform_device *pd)
 {
        unsigned int i, result, bitmask, handle;
 
+       if (!handles)
+               return;
+
        /* get enabled events and disable them */
        sony_nc_int_call(sony_nc_acpi_handle, "SN01", NULL, &bitmask);
        sony_nc_int_call(sony_nc_acpi_handle, "SN03", &bitmask, &result);
index 700e0fa..6505c97 100644 (file)
@@ -24,6 +24,8 @@
 #define SURFACE_BUTTON_OBJ_NAME                "VGBI"
 #define SURFACE_BUTTON_DEVICE_NAME     "Surface Pro 3/4 Buttons"
 
+#define SURFACE_BUTTON_NOTIFY_TABLET_MODE      0xc8
+
 #define SURFACE_BUTTON_NOTIFY_PRESS_POWER      0xc6
 #define SURFACE_BUTTON_NOTIFY_RELEASE_POWER    0xc7
 
@@ -33,7 +35,7 @@
 #define SURFACE_BUTTON_NOTIFY_PRESS_VOLUME_UP  0xc0
 #define SURFACE_BUTTON_NOTIFY_RELEASE_VOLUME_UP        0xc1
 
-#define SURFACE_BUTTON_NOTIFY_PRESS_VOLUME_DOWN        0xc2
+#define SURFACE_BUTTON_NOTIFY_PRESS_VOLUME_DOWN                0xc2
 #define SURFACE_BUTTON_NOTIFY_RELEASE_VOLUME_DOWN      0xc3
 
 ACPI_MODULE_NAME("surface pro 3 button");
@@ -105,9 +107,12 @@ static void surface_button_notify(struct acpi_device *device, u32 event)
        case SURFACE_BUTTON_NOTIFY_RELEASE_VOLUME_DOWN:
                key_code = KEY_VOLUMEDOWN;
                break;
+       case SURFACE_BUTTON_NOTIFY_TABLET_MODE:
+               dev_warn_once(&device->dev, "Tablet mode is not supported\n");
+               break;
        default:
                dev_info_ratelimited(&device->dev,
-                                 "Unsupported event [0x%x]\n", event);
+                                    "Unsupported event [0x%x]\n", event);
                break;
        }
        input = button->input;
index 9255ff3..c3bfa1f 100644 (file)
@@ -5001,6 +5001,8 @@ static int kbdlight_set_level(int level)
        return 0;
 }
 
+static int kbdlight_set_level_and_update(int level);
+
 static int kbdlight_get_level(void)
 {
        int status = 0;
@@ -5068,7 +5070,7 @@ static void kbdlight_set_worker(struct work_struct *work)
                        container_of(work, struct tpacpi_led_classdev, work);
 
        if (likely(tpacpi_lifecycle == TPACPI_LIFE_RUNNING))
-               kbdlight_set_level(data->new_state);
+               kbdlight_set_level_and_update(data->new_state);
 }
 
 static void kbdlight_sysfs_set(struct led_classdev *led_cdev,
@@ -5099,7 +5101,6 @@ static struct tpacpi_led_classdev tpacpi_led_kbdlight = {
                .max_brightness = 2,
                .brightness_set = &kbdlight_sysfs_set,
                .brightness_get = &kbdlight_sysfs_get,
-               .flags          = LED_CORE_SUSPENDRESUME,
        }
 };
 
@@ -5137,6 +5138,20 @@ static void kbdlight_exit(void)
        flush_workqueue(tpacpi_wq);
 }
 
+static int kbdlight_set_level_and_update(int level)
+{
+       int ret;
+       struct led_classdev *led_cdev;
+
+       ret = kbdlight_set_level(level);
+       led_cdev = &tpacpi_led_kbdlight.led_classdev;
+
+       if (ret == 0 && !(led_cdev->flags & LED_SUSPENDED))
+               led_cdev->brightness = level;
+
+       return ret;
+}
+
 static int kbdlight_read(struct seq_file *m)
 {
        int level;
@@ -5177,13 +5192,35 @@ static int kbdlight_write(char *buf)
        if (level == -1)
                return -EINVAL;
 
-       return kbdlight_set_level(level);
+       return kbdlight_set_level_and_update(level);
+}
+
+static void kbdlight_suspend(void)
+{
+       struct led_classdev *led_cdev;
+
+       if (!tp_features.kbdlight)
+               return;
+
+       led_cdev = &tpacpi_led_kbdlight.led_classdev;
+       led_update_brightness(led_cdev);
+       led_classdev_suspend(led_cdev);
+}
+
+static void kbdlight_resume(void)
+{
+       if (!tp_features.kbdlight)
+               return;
+
+       led_classdev_resume(&tpacpi_led_kbdlight.led_classdev);
 }
 
 static struct ibm_struct kbdlight_driver_data = {
        .name = "kbdlight",
        .read = kbdlight_read,
        .write = kbdlight_write,
+       .suspend = kbdlight_suspend,
+       .resume = kbdlight_resume,
        .exit = kbdlight_exit,
 };
 
index 579fd65..d637c93 100644 (file)
@@ -208,14 +208,10 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
                break;
 
        case PTP_SYS_OFFSET:
-               sysoff = kmalloc(sizeof(*sysoff), GFP_KERNEL);
-               if (!sysoff) {
-                       err = -ENOMEM;
-                       break;
-               }
-               if (copy_from_user(sysoff, (void __user *)arg,
-                                  sizeof(*sysoff))) {
-                       err = -EFAULT;
+               sysoff = memdup_user((void __user *)arg, sizeof(*sysoff));
+               if (IS_ERR(sysoff)) {
+                       err = PTR_ERR(sysoff);
+                       sysoff = NULL;
                        break;
                }
                if (sysoff->n_samples > PTP_MAX_SAMPLES) {
index 680fbc7..dba3843 100644 (file)
@@ -75,6 +75,7 @@ static void free_pwms(struct pwm_chip *chip)
 
        for (i = 0; i < chip->npwm; i++) {
                struct pwm_device *pwm = &chip->pwms[i];
+
                radix_tree_delete(&pwm_tree, pwm->pwm);
        }
 
@@ -128,13 +129,6 @@ static int pwm_device_request(struct pwm_device *pwm, const char *label)
        set_bit(PWMF_REQUESTED, &pwm->flags);
        pwm->label = label;
 
-       /*
-        * FIXME: This should be removed once all PWM users properly make use
-        * of struct pwm_args to initialize the PWM device. As long as this is
-        * here, the PWM state and hardware state can get out of sync.
-        */
-       pwm_apply_args(pwm);
-
        return 0;
 }
 
@@ -233,6 +227,19 @@ void *pwm_get_chip_data(struct pwm_device *pwm)
 }
 EXPORT_SYMBOL_GPL(pwm_get_chip_data);
 
+static bool pwm_ops_check(const struct pwm_ops *ops)
+{
+       /* driver supports legacy, non-atomic operation */
+       if (ops->config && ops->enable && ops->disable)
+               return true;
+
+       /* driver supports atomic operation */
+       if (ops->apply)
+               return true;
+
+       return false;
+}
+
 /**
  * pwmchip_add_with_polarity() - register a new PWM chip
  * @chip: the PWM chip to add
@@ -251,8 +258,10 @@ int pwmchip_add_with_polarity(struct pwm_chip *chip,
        unsigned int i;
        int ret;
 
-       if (!chip || !chip->dev || !chip->ops || !chip->ops->config ||
-           !chip->ops->enable || !chip->ops->disable || !chip->npwm)
+       if (!chip || !chip->dev || !chip->ops || !chip->npwm)
+               return -EINVAL;
+
+       if (!pwm_ops_check(chip->ops))
                return -EINVAL;
 
        mutex_lock(&pwm_lock);
@@ -261,7 +270,7 @@ int pwmchip_add_with_polarity(struct pwm_chip *chip,
        if (ret < 0)
                goto out;
 
-       chip->pwms = kzalloc(chip->npwm * sizeof(*pwm), GFP_KERNEL);
+       chip->pwms = kcalloc(chip->npwm, sizeof(*pwm), GFP_KERNEL);
        if (!chip->pwms) {
                ret = -ENOMEM;
                goto out;
@@ -275,8 +284,10 @@ int pwmchip_add_with_polarity(struct pwm_chip *chip,
                pwm->chip = chip;
                pwm->pwm = chip->base + i;
                pwm->hwpwm = i;
-               pwm->polarity = polarity;
-               mutex_init(&pwm->lock);
+               pwm->state.polarity = polarity;
+
+               if (chip->ops->get_state)
+                       chip->ops->get_state(chip, pwm, &pwm->state);
 
                radix_tree_insert(&pwm_tree, pwm->pwm, pwm);
        }
@@ -436,107 +447,138 @@ void pwm_free(struct pwm_device *pwm)
 EXPORT_SYMBOL_GPL(pwm_free);
 
 /**
- * pwm_config() - change a PWM device configuration
+ * pwm_apply_state() - atomically apply a new state to a PWM device
  * @pwm: PWM device
- * @duty_ns: "on" time (in nanoseconds)
- * @period_ns: duration (in nanoseconds) of one cycle
- *
- * Returns: 0 on success or a negative error code on failure.
+ * @state: new state to apply. This can be adjusted by the PWM driver
+ *        if the requested config is not achievable, for example,
+ *        ->duty_cycle and ->period might be approximated.
  */
-int pwm_config(struct pwm_device *pwm, int duty_ns, int period_ns)
+int pwm_apply_state(struct pwm_device *pwm, struct pwm_state *state)
 {
        int err;
 
-       if (!pwm || duty_ns < 0 || period_ns <= 0 || duty_ns > period_ns)
+       if (!pwm)
                return -EINVAL;
 
-       err = pwm->chip->ops->config(pwm->chip, pwm, duty_ns, period_ns);
-       if (err)
-               return err;
-
-       pwm->duty_cycle = duty_ns;
-       pwm->period = period_ns;
+       if (!memcmp(state, &pwm->state, sizeof(*state)))
+               return 0;
 
-       return 0;
-}
-EXPORT_SYMBOL_GPL(pwm_config);
+       if (pwm->chip->ops->apply) {
+               err = pwm->chip->ops->apply(pwm->chip, pwm, state);
+               if (err)
+                       return err;
 
-/**
- * pwm_set_polarity() - configure the polarity of a PWM signal
- * @pwm: PWM device
- * @polarity: new polarity of the PWM signal
- *
- * Note that the polarity cannot be configured while the PWM device is
- * enabled.
- *
- * Returns: 0 on success or a negative error code on failure.
- */
-int pwm_set_polarity(struct pwm_device *pwm, enum pwm_polarity polarity)
-{
-       int err;
+               pwm->state = *state;
+       } else {
+               /*
+                * FIXME: restore the initial state in case of error.
+                */
+               if (state->polarity != pwm->state.polarity) {
+                       if (!pwm->chip->ops->set_polarity)
+                               return -ENOTSUPP;
+
+                       /*
+                        * Changing the polarity of a running PWM is
+                        * only allowed when the PWM driver implements
+                        * ->apply().
+                        */
+                       if (pwm->state.enabled) {
+                               pwm->chip->ops->disable(pwm->chip, pwm);
+                               pwm->state.enabled = false;
+                       }
+
+                       err = pwm->chip->ops->set_polarity(pwm->chip, pwm,
+                                                          state->polarity);
+                       if (err)
+                               return err;
+
+                       pwm->state.polarity = state->polarity;
+               }
 
-       if (!pwm || !pwm->chip->ops)
-               return -EINVAL;
+               if (state->period != pwm->state.period ||
+                   state->duty_cycle != pwm->state.duty_cycle) {
+                       err = pwm->chip->ops->config(pwm->chip, pwm,
+                                                    state->duty_cycle,
+                                                    state->period);
+                       if (err)
+                               return err;
 
-       if (!pwm->chip->ops->set_polarity)
-               return -ENOSYS;
+                       pwm->state.duty_cycle = state->duty_cycle;
+                       pwm->state.period = state->period;
+               }
 
-       mutex_lock(&pwm->lock);
+               if (state->enabled != pwm->state.enabled) {
+                       if (state->enabled) {
+                               err = pwm->chip->ops->enable(pwm->chip, pwm);
+                               if (err)
+                                       return err;
+                       } else {
+                               pwm->chip->ops->disable(pwm->chip, pwm);
+                       }
 
-       if (pwm_is_enabled(pwm)) {
-               err = -EBUSY;
-               goto unlock;
+                       pwm->state.enabled = state->enabled;
+               }
        }
 
-       err = pwm->chip->ops->set_polarity(pwm->chip, pwm, polarity);
-       if (err)
-               goto unlock;
-
-       pwm->polarity = polarity;
-
-unlock:
-       mutex_unlock(&pwm->lock);
-       return err;
+       return 0;
 }
-EXPORT_SYMBOL_GPL(pwm_set_polarity);
+EXPORT_SYMBOL_GPL(pwm_apply_state);
 
 /**
- * pwm_enable() - start a PWM output toggling
+ * pwm_adjust_config() - adjust the current PWM config to the PWM arguments
  * @pwm: PWM device
  *
- * Returns: 0 on success or a negative error code on failure.
+ * This function will adjust the PWM config to the PWM arguments provided
+ * by the DT or PWM lookup table. This is particularly useful to adapt
+ * the bootloader config to the Linux one.
  */
-int pwm_enable(struct pwm_device *pwm)
+int pwm_adjust_config(struct pwm_device *pwm)
 {
-       int err = 0;
+       struct pwm_state state;
+       struct pwm_args pargs;
 
-       if (!pwm)
-               return -EINVAL;
+       pwm_get_args(pwm, &pargs);
+       pwm_get_state(pwm, &state);
 
-       mutex_lock(&pwm->lock);
+       /*
+        * If the current period is zero it means that either the PWM driver
+        * does not support initial state retrieval or the PWM has not yet
+        * been configured.
+        *
+        * In either case, we set up the new period and polarity, and assign a
+        * duty cycle of 0.
+        */
+       if (!state.period) {
+               state.duty_cycle = 0;
+               state.period = pargs.period;
+               state.polarity = pargs.polarity;
 
-       if (!test_and_set_bit(PWMF_ENABLED, &pwm->flags)) {
-               err = pwm->chip->ops->enable(pwm->chip, pwm);
-               if (err)
-                       clear_bit(PWMF_ENABLED, &pwm->flags);
+               return pwm_apply_state(pwm, &state);
        }
 
-       mutex_unlock(&pwm->lock);
+       /*
+        * Adjust the PWM duty cycle/period based on the period value provided
+        * in PWM args.
+        */
+       if (pargs.period != state.period) {
+               u64 dutycycle = (u64)state.duty_cycle * pargs.period;
 
-       return err;
-}
-EXPORT_SYMBOL_GPL(pwm_enable);
+               do_div(dutycycle, state.period);
+               state.duty_cycle = dutycycle;
+               state.period = pargs.period;
+       }
 
-/**
- * pwm_disable() - stop a PWM output toggling
- * @pwm: PWM device
- */
-void pwm_disable(struct pwm_device *pwm)
-{
-       if (pwm && test_and_clear_bit(PWMF_ENABLED, &pwm->flags))
-               pwm->chip->ops->disable(pwm->chip, pwm);
+       /*
+        * If the polarity changed, we should also change the duty cycle.
+        */
+       if (pargs.polarity != state.polarity) {
+               state.polarity = pargs.polarity;
+               state.duty_cycle = state.period - state.duty_cycle;
+       }
+
+       return pwm_apply_state(pwm, &state);
 }
-EXPORT_SYMBOL_GPL(pwm_disable);
+EXPORT_SYMBOL_GPL(pwm_adjust_config);
 
 static struct pwm_chip *of_node_to_pwmchip(struct device_node *np)
 {
@@ -754,13 +796,13 @@ struct pwm_device *pwm_get(struct device *dev, const char *con_id)
        if (!chip)
                goto out;
 
-       pwm->args.period = chosen->period;
-       pwm->args.polarity = chosen->polarity;
-
        pwm = pwm_request_from_chip(chip, chosen->index, con_id ?: dev_id);
        if (IS_ERR(pwm))
                goto out;
 
+       pwm->args.period = chosen->period;
+       pwm->args.polarity = chosen->polarity;
+
 out:
        mutex_unlock(&pwm_lookup_lock);
        return pwm;
@@ -907,15 +949,23 @@ static void pwm_dbg_show(struct pwm_chip *chip, struct seq_file *s)
 
        for (i = 0; i < chip->npwm; i++) {
                struct pwm_device *pwm = &chip->pwms[i];
+               struct pwm_state state;
+
+               pwm_get_state(pwm, &state);
 
                seq_printf(s, " pwm-%-3d (%-20.20s):", i, pwm->label);
 
                if (test_bit(PWMF_REQUESTED, &pwm->flags))
                        seq_puts(s, " requested");
 
-               if (pwm_is_enabled(pwm))
+               if (state.enabled)
                        seq_puts(s, " enabled");
 
+               seq_printf(s, " period: %u ns", state.period);
+               seq_printf(s, " duty: %u ns", state.duty_cycle);
+               seq_printf(s, " polarity: %s",
+                          state.polarity ? "inverse" : "normal");
+
                seq_puts(s, "\n");
        }
 }
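
A minimal consumer-side sketch of the atomic interface introduced above (example_pwm_setup and the 1 ms period / 50% duty cycle are placeholders, not part of this patch):

    static int example_pwm_setup(struct pwm_device *pwm)
    {
            struct pwm_state state;

            pwm_get_state(pwm, &state);
            state.period = 1000000;         /* 1 ms, placeholder value */
            state.duty_cycle = 500000;      /* 50% duty, placeholder value */
            state.polarity = PWM_POLARITY_NORMAL;
            state.enabled = true;

            /* one call replaces the pwm_config/pwm_set_polarity/pwm_enable sequence */
            return pwm_apply_state(pwm, &state);
    }
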
index 7101c70..bd0ebd0 100644 (file)
@@ -75,7 +75,7 @@ static int crc_pwm_config(struct pwm_chip *c, struct pwm_device *pwm,
                return -EINVAL;
        }
 
-       if (pwm->period != period_ns) {
+       if (pwm_get_period(pwm) != period_ns) {
                int clk_div;
 
                /* changing the clk divisor, need to disable first */
index 9861fed..19dc64c 100644 (file)
@@ -249,7 +249,7 @@ static int lpc18xx_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm)
                           LPC18XX_PWM_EVSTATEMSK(lpc18xx_data->duty_event),
                           LPC18XX_PWM_EVSTATEMSK_ALL);
 
-       if (pwm->polarity == PWM_POLARITY_NORMAL) {
+       if (pwm_get_polarity(pwm) == PWM_POLARITY_NORMAL) {
                set_event = lpc18xx_pwm->period_event;
                clear_event = lpc18xx_data->duty_event;
                res_action = LPC18XX_PWM_RES_SET;
index b7e6ecb..3e95090 100644 (file)
@@ -192,7 +192,7 @@ static int pwm_omap_dmtimer_config(struct pwm_chip *chip,
                load_value, load_value, match_value, match_value);
 
        omap->pdata->set_pwm(omap->dm_timer,
-                             pwm->polarity == PWM_POLARITY_INVERSED,
+                             pwm_get_polarity(pwm) == PWM_POLARITY_INVERSED,
                              true,
                              PWM_OMAP_DMTIMER_TRIGGER_OVERFLOW_AND_COMPARE);
 
index 7b8ac06..1c85ecc 100644 (file)
@@ -157,7 +157,7 @@ static int rcar_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
                return div;
 
        /* Let the core driver set pwm->period if disabled and duty_ns == 0 */
-       if (!test_bit(PWMF_ENABLED, &pwm->flags) && !duty_ns)
+       if (!pwm_is_enabled(pwm) && !duty_ns)
                return 0;
 
        rcar_pwm_update(rp, RCAR_PWMCR_SYNC, RCAR_PWMCR_SYNC, RCAR_PWMCR);
index 67af9f6..03a99a5 100644 (file)
@@ -354,7 +354,8 @@ static int sun4i_pwm_probe(struct platform_device *pdev)
        val = sun4i_pwm_readl(pwm, PWM_CTRL_REG);
        for (i = 0; i < pwm->chip.npwm; i++)
                if (!(val & BIT_CH(PWM_ACT_STATE, i)))
-                       pwm->chip.pwms[i].polarity = PWM_POLARITY_INVERSED;
+                       pwm_set_polarity(&pwm->chip.pwms[i],
+                                        PWM_POLARITY_INVERSED);
        clk_disable_unprepare(pwm->clk);
 
        return 0;
index 9c90886..d985992 100644 (file)
@@ -26,6 +26,7 @@
 struct pwm_export {
        struct device child;
        struct pwm_device *pwm;
+       struct mutex lock;
 };
 
 static struct pwm_export *child_to_pwm_export(struct device *child)
@@ -45,15 +46,20 @@ static ssize_t period_show(struct device *child,
                           char *buf)
 {
        const struct pwm_device *pwm = child_to_pwm_device(child);
+       struct pwm_state state;
 
-       return sprintf(buf, "%u\n", pwm_get_period(pwm));
+       pwm_get_state(pwm, &state);
+
+       return sprintf(buf, "%u\n", state.period);
 }
 
 static ssize_t period_store(struct device *child,
                            struct device_attribute *attr,
                            const char *buf, size_t size)
 {
-       struct pwm_device *pwm = child_to_pwm_device(child);
+       struct pwm_export *export = child_to_pwm_export(child);
+       struct pwm_device *pwm = export->pwm;
+       struct pwm_state state;
        unsigned int val;
        int ret;
 
@@ -61,7 +67,11 @@ static ssize_t period_store(struct device *child,
        if (ret)
                return ret;
 
-       ret = pwm_config(pwm, pwm_get_duty_cycle(pwm), val);
+       mutex_lock(&export->lock);
+       pwm_get_state(pwm, &state);
+       state.period = val;
+       ret = pwm_apply_state(pwm, &state);
+       mutex_unlock(&export->lock);
 
        return ret ? : size;
 }
@@ -71,15 +81,20 @@ static ssize_t duty_cycle_show(struct device *child,
                               char *buf)
 {
        const struct pwm_device *pwm = child_to_pwm_device(child);
+       struct pwm_state state;
+
+       pwm_get_state(pwm, &state);
 
-       return sprintf(buf, "%u\n", pwm_get_duty_cycle(pwm));
+       return sprintf(buf, "%u\n", state.duty_cycle);
 }
 
 static ssize_t duty_cycle_store(struct device *child,
                                struct device_attribute *attr,
                                const char *buf, size_t size)
 {
-       struct pwm_device *pwm = child_to_pwm_device(child);
+       struct pwm_export *export = child_to_pwm_export(child);
+       struct pwm_device *pwm = export->pwm;
+       struct pwm_state state;
        unsigned int val;
        int ret;
 
@@ -87,7 +102,11 @@ static ssize_t duty_cycle_store(struct device *child,
        if (ret)
                return ret;
 
-       ret = pwm_config(pwm, val, pwm_get_period(pwm));
+       mutex_lock(&export->lock);
+       pwm_get_state(pwm, &state);
+       state.duty_cycle = val;
+       ret = pwm_apply_state(pwm, &state);
+       mutex_unlock(&export->lock);
 
        return ret ? : size;
 }
@@ -97,33 +116,46 @@ static ssize_t enable_show(struct device *child,
                           char *buf)
 {
        const struct pwm_device *pwm = child_to_pwm_device(child);
+       struct pwm_state state;
+
+       pwm_get_state(pwm, &state);
 
-       return sprintf(buf, "%d\n", pwm_is_enabled(pwm));
+       return sprintf(buf, "%d\n", state.enabled);
 }
 
 static ssize_t enable_store(struct device *child,
                            struct device_attribute *attr,
                            const char *buf, size_t size)
 {
-       struct pwm_device *pwm = child_to_pwm_device(child);
+       struct pwm_export *export = child_to_pwm_export(child);
+       struct pwm_device *pwm = export->pwm;
+       struct pwm_state state;
        int val, ret;
 
        ret = kstrtoint(buf, 0, &val);
        if (ret)
                return ret;
 
+       mutex_lock(&export->lock);
+
+       pwm_get_state(pwm, &state);
+
        switch (val) {
        case 0:
-               pwm_disable(pwm);
+               state.enabled = false;
                break;
        case 1:
-               ret = pwm_enable(pwm);
+               state.enabled = true;
                break;
        default:
                ret = -EINVAL;
-               break;
+               goto unlock;
        }
 
+       ret = pwm_apply_state(pwm, &state);
+
+unlock:
+       mutex_unlock(&export->lock);
        return ret ? : size;
 }
 
@@ -133,8 +165,11 @@ static ssize_t polarity_show(struct device *child,
 {
        const struct pwm_device *pwm = child_to_pwm_device(child);
        const char *polarity = "unknown";
+       struct pwm_state state;
+
+       pwm_get_state(pwm, &state);
 
-       switch (pwm_get_polarity(pwm)) {
+       switch (state.polarity) {
        case PWM_POLARITY_NORMAL:
                polarity = "normal";
                break;
@@ -151,8 +186,10 @@ static ssize_t polarity_store(struct device *child,
                              struct device_attribute *attr,
                              const char *buf, size_t size)
 {
-       struct pwm_device *pwm = child_to_pwm_device(child);
+       struct pwm_export *export = child_to_pwm_export(child);
+       struct pwm_device *pwm = export->pwm;
        enum pwm_polarity polarity;
+       struct pwm_state state;
        int ret;
 
        if (sysfs_streq(buf, "normal"))
@@ -162,7 +199,11 @@ static ssize_t polarity_store(struct device *child,
        else
                return -EINVAL;
 
-       ret = pwm_set_polarity(pwm, polarity);
+       mutex_lock(&export->lock);
+       pwm_get_state(pwm, &state);
+       state.polarity = polarity;
+       ret = pwm_apply_state(pwm, &state);
+       mutex_unlock(&export->lock);
 
        return ret ? : size;
 }
@@ -203,6 +244,7 @@ static int pwm_export_child(struct device *parent, struct pwm_device *pwm)
        }
 
        export->pwm = pwm;
+       mutex_init(&export->lock);
 
        export->child.release = pwm_export_release;
        export->child.parent = parent;
index b839086..bed53c4 100644 (file)
@@ -31,7 +31,7 @@ static void dcssblk_release(struct gendisk *disk, fmode_t mode);
 static blk_qc_t dcssblk_make_request(struct request_queue *q,
                                                struct bio *bio);
 static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum,
-                        void __pmem **kaddr, pfn_t *pfn);
+                        void __pmem **kaddr, pfn_t *pfn, long size);
 
 static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0";
 
@@ -884,7 +884,7 @@ fail:
 
 static long
 dcssblk_direct_access (struct block_device *bdev, sector_t secnum,
-                       void __pmem **kaddr, pfn_t *pfn)
+                       void __pmem **kaddr, pfn_t *pfn, long size)
 {
        struct dcssblk_dev_info *dev_info;
        unsigned long offset, dev_sz;
index 8f90d9e..969c312 100644 (file)
@@ -620,6 +620,11 @@ struct aac_driver_ident
  */
 #define AAC_QUIRK_SCSI_32      0x0020
 
+/*
+ * SRC based adapters support the AifReqEvent functions
+ */
+#define AAC_QUIRK_SRC 0x0040
+
 /*
  *     The adapter interface specs all queues to be located in the same
  *     physically contiguous block. The host structure that defines the
index a943bd2..79871f3 100644 (file)
@@ -236,10 +236,10 @@ static struct aac_driver_ident aac_drivers[] = {
        { aac_rx_init, "aacraid",  "ADAPTEC ", "RAID            ", 2 }, /* Adaptec Catch All */
        { aac_rkt_init, "aacraid", "ADAPTEC ", "RAID            ", 2 }, /* Adaptec Rocket Catch All */
        { aac_nark_init, "aacraid", "ADAPTEC ", "RAID           ", 2 }, /* Adaptec NEMER/ARK Catch All */
-       { aac_src_init, "aacraid", "ADAPTEC ", "RAID            ", 2 }, /* Adaptec PMC Series 6 (Tupelo) */
-       { aac_srcv_init, "aacraid", "ADAPTEC ", "RAID            ", 2 }, /* Adaptec PMC Series 7 (Denali) */
-       { aac_srcv_init, "aacraid", "ADAPTEC ", "RAID            ", 2 }, /* Adaptec PMC Series 8 */
-       { aac_srcv_init, "aacraid", "ADAPTEC ", "RAID            ", 2 } /* Adaptec PMC Series 9 */
+       { aac_src_init, "aacraid", "ADAPTEC ", "RAID            ", 2, AAC_QUIRK_SRC }, /* Adaptec PMC Series 6 (Tupelo) */
+       { aac_srcv_init, "aacraid", "ADAPTEC ", "RAID            ", 2, AAC_QUIRK_SRC }, /* Adaptec PMC Series 7 (Denali) */
+       { aac_srcv_init, "aacraid", "ADAPTEC ", "RAID            ", 2, AAC_QUIRK_SRC }, /* Adaptec PMC Series 8 */
+       { aac_srcv_init, "aacraid", "ADAPTEC ", "RAID            ", 2, AAC_QUIRK_SRC } /* Adaptec PMC Series 9 */
 };
 
 /**
@@ -1299,7 +1299,8 @@ static int aac_probe_one(struct pci_dev *pdev, const struct pci_device_id *id)
        else
                shost->this_id = shost->max_id;
 
-       aac_intr_normal(aac, 0, 2, 0, NULL);
+       if (aac_drivers[index].quirks & AAC_QUIRK_SRC)
+               aac_intr_normal(aac, 0, 2, 0, NULL);
 
        /*
         * dmb - we may need to move the setting of these parms somewhere else once
index 6a4df5a..6bff13e 100644 (file)
@@ -7975,13 +7975,14 @@ mpt3sas_scsih_event_callback(struct MPT3SAS_ADAPTER *ioc, u8 msix_index,
                ActiveCableEventData =
                    (Mpi26EventDataActiveCableExcept_t *) mpi_reply->EventData;
                if (ActiveCableEventData->ReasonCode ==
-                               MPI26_EVENT_ACTIVE_CABLE_INSUFFICIENT_POWER)
+                               MPI26_EVENT_ACTIVE_CABLE_INSUFFICIENT_POWER) {
                        pr_info(MPT3SAS_FMT "Currently an active cable with ReceptacleID %d",
                            ioc->name, ActiveCableEventData->ReceptacleID);
                        pr_info("cannot be powered and devices connected to this active cable");
                        pr_info("will not be seen. This active cable");
                        pr_info("requires %d mW of power",
                            ActiveCableEventData->ActiveCablePowerRequirement);
+               }
                break;
 
        default: /* ignore the rest */
index 10aa18b..67c0d5a 100644 (file)
@@ -36,3 +36,12 @@ config TCM_QLA2XXX
        default n
        ---help---
        Say Y here to enable the TCM_QLA2XXX fabric module for QLogic 24xx+ series target mode HBAs
+
+if TCM_QLA2XXX
+config TCM_QLA2XXX_DEBUG
+       bool "TCM_QLA2XXX fabric module DEBUG mode for QLogic 24xx+ series target mode HBAs"
+       default n
+       ---help---
+       Say Y here to enable the TCM_QLA2XXX fabric module DEBUG for QLogic 24xx+ series target mode HBAs
+       This will include code to enable the SCSI command jammer
+endif
index 8a44d15..ca39deb 100644 (file)
@@ -637,8 +637,10 @@ static void qlt_free_session_done(struct work_struct *work)
 }
 
 /* ha->tgt.sess_lock supposed to be held on entry */
-void qlt_unreg_sess(struct qla_tgt_sess *sess)
+static void qlt_release_session(struct kref *kref)
 {
+       struct qla_tgt_sess *sess =
+               container_of(kref, struct qla_tgt_sess, sess_kref);
        struct scsi_qla_host *vha = sess->vha;
 
        if (sess->se_sess)
@@ -651,8 +653,16 @@ void qlt_unreg_sess(struct qla_tgt_sess *sess)
        INIT_WORK(&sess->free_work, qlt_free_session_done);
        schedule_work(&sess->free_work);
 }
-EXPORT_SYMBOL(qlt_unreg_sess);
 
+void qlt_put_sess(struct qla_tgt_sess *sess)
+{
+       if (!sess)
+               return;
+
+       assert_spin_locked(&sess->vha->hw->tgt.sess_lock);
+       kref_put(&sess->sess_kref, qlt_release_session);
+}
+EXPORT_SYMBOL(qlt_put_sess);
 
 static int qlt_reset(struct scsi_qla_host *vha, void *iocb, int mcmd)
 {
@@ -857,12 +867,9 @@ static void qlt_del_sess_work_fn(struct delayed_work *work)
                        ql_dbg(ql_dbg_tgt_mgt, vha, 0xf004,
                            "Timeout: sess %p about to be deleted\n",
                            sess);
-                       if (sess->se_sess) {
+                       if (sess->se_sess)
                                ha->tgt.tgt_ops->shutdown_sess(sess);
-                               ha->tgt.tgt_ops->put_sess(sess);
-                       } else {
-                               qlt_unreg_sess(sess);
-                       }
+                       qlt_put_sess(sess);
                } else {
                        schedule_delayed_work(&tgt->sess_del_work,
                            sess->expires - elapsed);
@@ -917,7 +924,7 @@ static struct qla_tgt_sess *qlt_create_sess(
                                }
                        }
 
-                       kref_get(&sess->se_sess->sess_kref);
+                       kref_get(&sess->sess_kref);
                        ha->tgt.tgt_ops->update_sess(sess, fcport->d_id, fcport->loop_id,
                                                (fcport->flags & FCF_CONF_COMP_SUPPORTED));
 
@@ -947,6 +954,7 @@ static struct qla_tgt_sess *qlt_create_sess(
        sess->s_id = fcport->d_id;
        sess->loop_id = fcport->loop_id;
        sess->local = local;
+       kref_init(&sess->sess_kref);
        INIT_LIST_HEAD(&sess->del_list_entry);
 
        /* Under normal circumstances we want to logout from firmware when
@@ -991,7 +999,7 @@ static struct qla_tgt_sess *qlt_create_sess(
                 * Take an extra reference to ->sess_kref here to handle qla_tgt_sess
                 * access across ->tgt.sess_lock reaquire.
                 */
-               kref_get(&sess->se_sess->sess_kref);
+               kref_get(&sess->sess_kref);
        }
 
        return sess;
@@ -1035,7 +1043,7 @@ void qlt_fc_port_added(struct scsi_qla_host *vha, fc_port_t *fcport)
                spin_unlock_irqrestore(&ha->tgt.sess_lock, flags);
                return;
        } else {
-               kref_get(&sess->se_sess->sess_kref);
+               kref_get(&sess->sess_kref);
 
                if (sess->deleted) {
                        qlt_undelete_sess(sess);
@@ -1060,7 +1068,7 @@ void qlt_fc_port_added(struct scsi_qla_host *vha, fc_port_t *fcport)
                    fcport->port_name, sess->loop_id);
                sess->local = 0;
        }
-       ha->tgt.tgt_ops->put_sess(sess);
+       qlt_put_sess(sess);
        spin_unlock_irqrestore(&ha->tgt.sess_lock, flags);
 }
 
@@ -3817,7 +3825,7 @@ static void __qlt_do_work(struct qla_tgt_cmd *cmd)
         * Drop extra session reference from qla_tgt_handle_cmd_for_atio*(
         */
        spin_lock_irqsave(&ha->tgt.sess_lock, flags);
-       ha->tgt.tgt_ops->put_sess(sess);
+       qlt_put_sess(sess);
        spin_unlock_irqrestore(&ha->tgt.sess_lock, flags);
        return;
 
@@ -3836,7 +3844,7 @@ out_term:
        spin_unlock_irqrestore(&ha->hardware_lock, flags);
 
        spin_lock_irqsave(&ha->tgt.sess_lock, flags);
-       ha->tgt.tgt_ops->put_sess(sess);
+       qlt_put_sess(sess);
        spin_unlock_irqrestore(&ha->tgt.sess_lock, flags);
 }
 
@@ -3936,13 +3944,13 @@ static void qlt_create_sess_from_atio(struct work_struct *work)
        if (!cmd) {
                spin_lock_irqsave(&ha->hardware_lock, flags);
                qlt_send_busy(vha, &op->atio, SAM_STAT_BUSY);
-               ha->tgt.tgt_ops->put_sess(sess);
+               qlt_put_sess(sess);
                spin_unlock_irqrestore(&ha->hardware_lock, flags);
                kfree(op);
                return;
        }
        /*
-        * __qlt_do_work() will call ha->tgt.tgt_ops->put_sess() to release
+        * __qlt_do_work() will call qlt_put_sess() to release
         * the extra reference taken above by qlt_make_local_sess()
         */
        __qlt_do_work(cmd);
@@ -4003,13 +4011,13 @@ static int qlt_handle_cmd_for_atio(struct scsi_qla_host *vha,
        /*
         * Do kref_get() before returning + dropping qla_hw_data->hardware_lock.
         */
-       kref_get(&sess->se_sess->sess_kref);
+       kref_get(&sess->sess_kref);
 
        cmd = qlt_get_tag(vha, sess, atio);
        if (!cmd) {
                ql_dbg(ql_dbg_io, vha, 0x3062,
                    "qla_target(%d): Allocation of cmd failed\n", vha->vp_idx);
-               ha->tgt.tgt_ops->put_sess(sess);
+               qlt_put_sess(sess);
                return -ENOMEM;
        }
 
@@ -5911,7 +5919,7 @@ static void qlt_abort_work(struct qla_tgt *tgt,
                        goto out_term2;
                }
 
-               kref_get(&sess->se_sess->sess_kref);
+               kref_get(&sess->sess_kref);
        }
 
        spin_lock_irqsave(&ha->hardware_lock, flags);
@@ -5924,7 +5932,7 @@ static void qlt_abort_work(struct qla_tgt *tgt,
                goto out_term;
        spin_unlock_irqrestore(&ha->hardware_lock, flags);
 
-       ha->tgt.tgt_ops->put_sess(sess);
+       qlt_put_sess(sess);
        spin_unlock_irqrestore(&ha->tgt.sess_lock, flags2);
        return;
 
@@ -5935,8 +5943,7 @@ out_term:
        qlt_24xx_send_abts_resp(vha, &prm->abts, FCP_TMF_REJECTED, false);
        spin_unlock_irqrestore(&ha->hardware_lock, flags);
 
-       if (sess)
-               ha->tgt.tgt_ops->put_sess(sess);
+       qlt_put_sess(sess);
        spin_unlock_irqrestore(&ha->tgt.sess_lock, flags2);
 }
 
@@ -5976,7 +5983,7 @@ static void qlt_tmr_work(struct qla_tgt *tgt,
                        goto out_term;
                }
 
-               kref_get(&sess->se_sess->sess_kref);
+               kref_get(&sess->sess_kref);
        }
 
        iocb = a;
@@ -5988,14 +5995,13 @@ static void qlt_tmr_work(struct qla_tgt *tgt,
        if (rc != 0)
                goto out_term;
 
-       ha->tgt.tgt_ops->put_sess(sess);
+       qlt_put_sess(sess);
        spin_unlock_irqrestore(&ha->tgt.sess_lock, flags);
        return;
 
 out_term:
        qlt_send_term_exchange(vha, NULL, &prm->tm_iocb2, 1, 0);
-       if (sess)
-               ha->tgt.tgt_ops->put_sess(sess);
+       qlt_put_sess(sess);
        spin_unlock_irqrestore(&ha->tgt.sess_lock, flags);
 }
 
index d857fee..f26c5f6 100644 (file)
@@ -738,7 +738,6 @@ struct qla_tgt_func_tmpl {
        struct qla_tgt_sess *(*find_sess_by_s_id)(struct scsi_qla_host *,
                                                const uint8_t *);
        void (*clear_nacl_from_fcport_map)(struct qla_tgt_sess *);
-       void (*put_sess)(struct qla_tgt_sess *);
        void (*shutdown_sess)(struct qla_tgt_sess *);
 };
 
@@ -930,6 +929,7 @@ struct qla_tgt_sess {
        int generation;
 
        struct se_session *se_sess;
+       struct kref sess_kref;
        struct scsi_qla_host *vha;
        struct qla_tgt *tgt;
 
@@ -1101,7 +1101,7 @@ extern int qlt_remove_target(struct qla_hw_data *, struct scsi_qla_host *);
 extern int qlt_lport_register(void *, u64, u64, u64,
                        int (*callback)(struct scsi_qla_host *, void *, u64, u64));
 extern void qlt_lport_deregister(struct scsi_qla_host *);
-extern void qlt_unreg_sess(struct qla_tgt_sess *);
+void qlt_put_sess(struct qla_tgt_sess *sess);
 extern void qlt_fc_port_added(struct scsi_qla_host *, fc_port_t *);
 extern void qlt_fc_port_deleted(struct scsi_qla_host *, fc_port_t *, int);
 extern int __init qlt_init(void);
index c1461d2..6643f6f 100644 (file)
@@ -339,22 +339,6 @@ static void tcm_qla2xxx_release_cmd(struct se_cmd *se_cmd)
        qlt_free_cmd(cmd);
 }
 
-static int tcm_qla2xxx_shutdown_session(struct se_session *se_sess)
-{
-       struct qla_tgt_sess *sess = se_sess->fabric_sess_ptr;
-       struct scsi_qla_host *vha;
-       unsigned long flags;
-
-       BUG_ON(!sess);
-       vha = sess->vha;
-
-       spin_lock_irqsave(&vha->hw->tgt.sess_lock, flags);
-       target_sess_cmd_list_set_waiting(se_sess);
-       spin_unlock_irqrestore(&vha->hw->tgt.sess_lock, flags);
-
-       return 1;
-}
-
 static void tcm_qla2xxx_close_session(struct se_session *se_sess)
 {
        struct qla_tgt_sess *sess = se_sess->fabric_sess_ptr;
@@ -365,7 +349,8 @@ static void tcm_qla2xxx_close_session(struct se_session *se_sess)
        vha = sess->vha;
 
        spin_lock_irqsave(&vha->hw->tgt.sess_lock, flags);
-       qlt_unreg_sess(sess);
+       target_sess_cmd_list_set_waiting(se_sess);
+       qlt_put_sess(sess);
        spin_unlock_irqrestore(&vha->hw->tgt.sess_lock, flags);
 }
 
@@ -457,6 +442,10 @@ static int tcm_qla2xxx_handle_cmd(scsi_qla_host_t *vha, struct qla_tgt_cmd *cmd,
        struct se_cmd *se_cmd = &cmd->se_cmd;
        struct se_session *se_sess;
        struct qla_tgt_sess *sess;
+#ifdef CONFIG_TCM_QLA2XXX_DEBUG
+       struct se_portal_group *se_tpg;
+       struct tcm_qla2xxx_tpg *tpg;
+#endif
        int flags = TARGET_SCF_ACK_KREF;
 
        if (bidi)
@@ -477,6 +466,15 @@ static int tcm_qla2xxx_handle_cmd(scsi_qla_host_t *vha, struct qla_tgt_cmd *cmd,
                return -EINVAL;
        }
 
+#ifdef CONFIG_TCM_QLA2XXX_DEBUG
+       se_tpg = se_sess->se_tpg;
+       tpg = container_of(se_tpg, struct tcm_qla2xxx_tpg, se_tpg);
+       if (unlikely(tpg->tpg_attrib.jam_host)) {
+               /* return, and don't run target_submit_cmd, discarding command */
+               return 0;
+       }
+#endif
+
        cmd->vha->tgt_counters.qla_core_sbt_cmd++;
        return target_submit_cmd(se_cmd, se_sess, cdb, &cmd->sense_buffer[0],
                                cmd->unpacked_lun, data_length, fcp_task_attr,
@@ -758,23 +756,6 @@ static void tcm_qla2xxx_clear_nacl_from_fcport_map(struct qla_tgt_sess *sess)
        tcm_qla2xxx_clear_sess_lookup(lport, nacl, sess);
 }
 
-static void tcm_qla2xxx_release_session(struct kref *kref)
-{
-       struct se_session *se_sess = container_of(kref,
-                       struct se_session, sess_kref);
-
-       qlt_unreg_sess(se_sess->fabric_sess_ptr);
-}
-
-static void tcm_qla2xxx_put_sess(struct qla_tgt_sess *sess)
-{
-       if (!sess)
-               return;
-
-       assert_spin_locked(&sess->vha->hw->tgt.sess_lock);
-       kref_put(&sess->se_sess->sess_kref, tcm_qla2xxx_release_session);
-}
-
 static void tcm_qla2xxx_shutdown_sess(struct qla_tgt_sess *sess)
 {
        assert_spin_locked(&sess->vha->hw->tgt.sess_lock);
@@ -844,6 +825,9 @@ DEF_QLA_TPG_ATTRIB(cache_dynamic_acls);
 DEF_QLA_TPG_ATTRIB(demo_mode_write_protect);
 DEF_QLA_TPG_ATTRIB(prod_mode_write_protect);
 DEF_QLA_TPG_ATTRIB(demo_mode_login_only);
+#ifdef CONFIG_TCM_QLA2XXX_DEBUG
+DEF_QLA_TPG_ATTRIB(jam_host);
+#endif
 
 static struct configfs_attribute *tcm_qla2xxx_tpg_attrib_attrs[] = {
        &tcm_qla2xxx_tpg_attrib_attr_generate_node_acls,
@@ -851,6 +835,9 @@ static struct configfs_attribute *tcm_qla2xxx_tpg_attrib_attrs[] = {
        &tcm_qla2xxx_tpg_attrib_attr_demo_mode_write_protect,
        &tcm_qla2xxx_tpg_attrib_attr_prod_mode_write_protect,
        &tcm_qla2xxx_tpg_attrib_attr_demo_mode_login_only,
+#ifdef CONFIG_TCM_QLA2XXX_DEBUG
+       &tcm_qla2xxx_tpg_attrib_attr_jam_host,
+#endif
        NULL,
 };
 
@@ -1023,6 +1010,7 @@ static struct se_portal_group *tcm_qla2xxx_make_tpg(
        tpg->tpg_attrib.demo_mode_write_protect = 1;
        tpg->tpg_attrib.cache_dynamic_acls = 1;
        tpg->tpg_attrib.demo_mode_login_only = 1;
+       tpg->tpg_attrib.jam_host = 0;
 
        ret = core_tpg_register(wwn, &tpg->se_tpg, SCSI_PROTOCOL_FCP);
        if (ret < 0) {
@@ -1579,7 +1567,6 @@ static struct qla_tgt_func_tmpl tcm_qla2xxx_template = {
        .find_sess_by_s_id      = tcm_qla2xxx_find_sess_by_s_id,
        .find_sess_by_loop_id   = tcm_qla2xxx_find_sess_by_loop_id,
        .clear_nacl_from_fcport_map = tcm_qla2xxx_clear_nacl_from_fcport_map,
-       .put_sess               = tcm_qla2xxx_put_sess,
        .shutdown_sess          = tcm_qla2xxx_shutdown_sess,
 };
 
@@ -1847,7 +1834,6 @@ static const struct target_core_fabric_ops tcm_qla2xxx_ops = {
        .tpg_get_inst_index             = tcm_qla2xxx_tpg_get_inst_index,
        .check_stop_free                = tcm_qla2xxx_check_stop_free,
        .release_cmd                    = tcm_qla2xxx_release_cmd,
-       .shutdown_session               = tcm_qla2xxx_shutdown_session,
        .close_session                  = tcm_qla2xxx_close_session,
        .sess_get_index                 = tcm_qla2xxx_sess_get_index,
        .sess_get_initiator_sid         = NULL,
@@ -1890,7 +1876,6 @@ static const struct target_core_fabric_ops tcm_qla2xxx_npiv_ops = {
        .tpg_get_inst_index             = tcm_qla2xxx_tpg_get_inst_index,
        .check_stop_free                = tcm_qla2xxx_check_stop_free,
        .release_cmd                    = tcm_qla2xxx_release_cmd,
-       .shutdown_session               = tcm_qla2xxx_shutdown_session,
        .close_session                  = tcm_qla2xxx_close_session,
        .sess_get_index                 = tcm_qla2xxx_sess_get_index,
        .sess_get_initiator_sid         = NULL,
index 3bbf4cb..37e026a 100644 (file)
@@ -34,6 +34,7 @@ struct tcm_qla2xxx_tpg_attrib {
        int prod_mode_write_protect;
        int demo_mode_login_only;
        int fabric_prot_type;
+       int jam_host;
 };
 
 struct tcm_qla2xxx_tpg {
index b2e332a..c71344a 100644 (file)
@@ -821,9 +821,12 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
        }
 
        /*
-        * If we finished all bytes in the request we are done now.
+        * special case: failed zero length commands always need to
+        * drop down into the retry code. Otherwise, if we finished
+        * all bytes in the request we are done now.
         */
-       if (!scsi_end_request(req, error, good_bytes, 0))
+       if (!(blk_rq_bytes(req) == 0 && error) &&
+           !scsi_end_request(req, error, good_bytes, 0))
                return;
 
        /*
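A failed command that moved zero bytes must not be completed by scsi_end_request() at this point, or the retry/error handling further down is never reached. A minimal sketch of the new guard, pulled out into a hypothetical predicate for readability:

    /* Hypothetical helper mirroring the check added above. */
    static bool scsi_may_end_request_now(struct request *req, int error)
    {
            /* failed zero-length commands always drop into the retry code */
            if (blk_rq_bytes(req) == 0 && error)
                    return false;

            return true;
    }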
index 428c03e..f459dff 100644 (file)
@@ -1398,11 +1398,15 @@ static int media_not_present(struct scsi_disk *sdkp,
  **/
 static unsigned int sd_check_events(struct gendisk *disk, unsigned int clearing)
 {
-       struct scsi_disk *sdkp = scsi_disk(disk);
-       struct scsi_device *sdp = sdkp->device;
+       struct scsi_disk *sdkp = scsi_disk_get(disk);
+       struct scsi_device *sdp;
        struct scsi_sense_hdr *sshdr = NULL;
        int retval;
 
+       if (!sdkp)
+               return 0;
+
+       sdp = sdkp->device;
        SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_check_events\n"));
 
        /*
@@ -1459,6 +1463,7 @@ out:
        kfree(sshdr);
        retval = sdp->changed ? DISK_EVENT_MEDIA_CHANGE : 0;
        sdp->changed = 0;
+       scsi_disk_put(sdkp);
        return retval;
 }
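The event poll can race with the disk being torn down, so the callback now pins the scsi_disk for its whole lifetime. A stripped-down sketch of the get/put discipline, reusing the sd helpers named in the hunk above (the function itself is invented for illustration):

    static unsigned int check_events_sketch(struct gendisk *disk)
    {
            struct scsi_disk *sdkp = scsi_disk_get(disk);   /* take a reference */
            unsigned int events = 0;

            if (!sdkp)                      /* disk already going away */
                    return 0;

            /* ... sdkp->device may be dereferenced safely here ... */

            scsi_disk_put(sdkp);            /* drop the reference */
            return events;
    }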
 
index 3c3e56d..a003ba2 100644 (file)
@@ -1059,7 +1059,7 @@ static const struct pmic_wrapper_type pwrap_mt2701 = {
        .regs = mt2701_regs,
        .type = PWRAP_MT2701,
        .arb_en_all = 0x3f,
-       .int_en_all = ~(BIT(31) | BIT(2)),
+       .int_en_all = ~(u32)(BIT(31) | BIT(2)),
        .spi_w = PWRAP_MAN_CMD_SPI_WRITE_NEW,
        .wdt_src = PWRAP_WDT_SRC_MASK_ALL,
        .has_bridge = 0,
@@ -1071,7 +1071,7 @@ static struct pmic_wrapper_type pwrap_mt8135 = {
        .regs = mt8135_regs,
        .type = PWRAP_MT8135,
        .arb_en_all = 0x1ff,
-       .int_en_all = ~(BIT(31) | BIT(1)),
+       .int_en_all = ~(u32)(BIT(31) | BIT(1)),
        .spi_w = PWRAP_MAN_CMD_SPI_WRITE,
        .wdt_src = PWRAP_WDT_SRC_MASK_ALL,
        .has_bridge = 1,
@@ -1083,7 +1083,7 @@ static struct pmic_wrapper_type pwrap_mt8173 = {
        .regs = mt8173_regs,
        .type = PWRAP_MT8173,
        .arb_en_all = 0x3f,
-       .int_en_all = ~(BIT(31) | BIT(1)),
+       .int_en_all = ~(u32)(BIT(31) | BIT(1)),
        .spi_w = PWRAP_MAN_CMD_SPI_WRITE,
        .wdt_src = PWRAP_WDT_SRC_MASK_NO_STAUPD,
        .has_bridge = 0,
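BIT() expands to 1UL << n, so ~(BIT(31) | BIT(1)) is evaluated as an unsigned long; on 64-bit builds the upper 32 bits end up set and the constant no longer fits the u32 field, which compilers and static checkers flag even though the truncated value happens to be the one wanted. Casting to u32 before inverting keeps the whole mask 32-bit. A small userspace demonstration, with BIT() redefined locally to mirror the kernel macro:

    #include <stdint.h>
    #include <stdio.h>

    #define BIT(nr) (1UL << (nr))          /* unsigned long, as in the kernel */

    int main(void)
    {
            unsigned long wide   = ~(BIT(31) | BIT(1));           /* 0xffffffff7ffffffd on LP64 */
            uint32_t      narrow = ~(uint32_t)(BIT(31) | BIT(1)); /* 0x7ffffffd, fits a u32 */

            printf("%#lx vs %#x\n", wide, narrow);
            return 0;
    }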
index 9d8c84b..4b931ec 100644 (file)
@@ -410,7 +410,6 @@ config SPI_OMAP_UWIRE
 config SPI_OMAP24XX
        tristate "McSPI driver for OMAP"
        depends on HAS_DMA
-       depends on ARM || ARM64 || AVR32 || HEXAGON || MIPS || SUPERH
        depends on ARCH_OMAP2PLUS || COMPILE_TEST
        help
          SPI master controller for OMAP24XX and later Multichannel SPI
@@ -432,10 +431,23 @@ config SPI_OMAP_100K
 
 config SPI_ORION
        tristate "Orion SPI master"
-       depends on PLAT_ORION || COMPILE_TEST
+       depends on PLAT_ORION || ARCH_MVEBU || COMPILE_TEST
        help
          This enables using the SPI master controller on the Orion chips.
 
+config SPI_PIC32
+       tristate "Microchip PIC32 series SPI"
+       depends on MACH_PIC32 || COMPILE_TEST
+       help
+         SPI driver for Microchip PIC32 SPI master controller.
+
+config SPI_PIC32_SQI
+       tristate "Microchip PIC32 Quad SPI driver"
+       depends on MACH_PIC32 || COMPILE_TEST
+       depends on HAS_DMA
+       help
+         SPI driver for PIC32 Quad SPI controller.
+
 config SPI_PL022
        tristate "ARM AMBA PL022 SSP controller"
        depends on ARM_AMBA
@@ -469,7 +481,6 @@ config SPI_PXA2XX_PCI
 
 config SPI_ROCKCHIP
        tristate "Rockchip SPI controller driver"
-       depends on ARM || ARM64 || AVR32 || HEXAGON || MIPS || SUPERH
        help
          This selects a driver for Rockchip SPI controller.
 
@@ -569,7 +580,7 @@ config SPI_SIRF
 
 config SPI_ST_SSC4
        tristate "STMicroelectronics SPI SSC-based driver"
-       depends on ARCH_STI
+       depends on ARCH_STI || COMPILE_TEST
        help
          STMicroelectronics SoCs support for SPI. If you say yes to
          this option, support will be included for the SSC driven SPI.
@@ -656,7 +667,7 @@ config SPI_XILINX
 
 config SPI_XLP
        tristate "Netlogic XLP SPI controller driver"
-       depends on CPU_XLP || COMPILE_TEST
+       depends on CPU_XLP || ARCH_VULCAN || COMPILE_TEST
        help
          Enable support for the SPI controller on the Netlogic XLP SoCs.
          Currently supported XLP variants are XLP8XX, XLP3XX, XLP2XX, XLP9XX
index fbb255c..3c74d00 100644 (file)
@@ -62,6 +62,8 @@ obj-$(CONFIG_SPI_OMAP_100K)           += spi-omap-100k.o
 obj-$(CONFIG_SPI_OMAP24XX)             += spi-omap2-mcspi.o
 obj-$(CONFIG_SPI_TI_QSPI)              += spi-ti-qspi.o
 obj-$(CONFIG_SPI_ORION)                        += spi-orion.o
+obj-$(CONFIG_SPI_PIC32)                        += spi-pic32.o
+obj-$(CONFIG_SPI_PIC32_SQI)            += spi-pic32-sqi.o
 obj-$(CONFIG_SPI_PL022)                        += spi-pl022.o
 obj-$(CONFIG_SPI_PPC4xx)               += spi-ppc4xx.o
 spi-pxa2xx-platform-objs               := spi-pxa2xx.o spi-pxa2xx-dma.o
index c968ab2..2b1456e 100644 (file)
@@ -525,7 +525,6 @@ static int spi_engine_probe(struct platform_device *pdev)
        if (ret)
                goto err_ref_clk_disable;
 
-       master->dev.parent = &pdev->dev;
        master->dev.of_node = pdev->dev.of_node;
        master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_3WIRE;
        master->bits_per_word_mask = SPI_BPW_MASK(8);
index cc3f938..afb5169 100644 (file)
@@ -10,6 +10,7 @@
 #include "spi-bcm53xx.h"
 
 #define BCM53XXSPI_MAX_SPI_BAUD        13500000        /* 216 MHz? */
+#define BCM53XXSPI_FLASH_WINDOW        SZ_32M
 
 /* The longest observed required wait was 19 ms */
 #define BCM53XXSPI_SPE_TIMEOUT_MS      80
 struct bcm53xxspi {
        struct bcma_device *core;
        struct spi_master *master;
+       void __iomem *mmio_base;
 
        size_t read_offset;
+       bool bspi;                              /* Boot SPI mode with memory mapping */
 };
 
 static inline u32 bcm53xxspi_read(struct bcm53xxspi *b53spi, u16 offset)
@@ -32,6 +35,50 @@ static inline void bcm53xxspi_write(struct bcm53xxspi *b53spi, u16 offset,
        bcma_write32(b53spi->core, offset, value);
 }
 
+static void bcm53xxspi_disable_bspi(struct bcm53xxspi *b53spi)
+{
+       struct device *dev = &b53spi->core->dev;
+       unsigned long deadline;
+       u32 tmp;
+
+       if (!b53spi->bspi)
+               return;
+
+       tmp = bcm53xxspi_read(b53spi, B53SPI_BSPI_MAST_N_BOOT_CTRL);
+       if (tmp & 0x1)
+               return;
+
+       deadline = jiffies + usecs_to_jiffies(200);
+       do {
+               tmp = bcm53xxspi_read(b53spi, B53SPI_BSPI_BUSY_STATUS);
+               if (!(tmp & 0x1)) {
+                       bcm53xxspi_write(b53spi, B53SPI_BSPI_MAST_N_BOOT_CTRL,
+                                        0x1);
+                       ndelay(200);
+                       b53spi->bspi = false;
+                       return;
+               }
+               udelay(1);
+       } while (!time_after_eq(jiffies, deadline));
+
+       dev_warn(dev, "Timeout disabling BSPI\n");
+}
+
+static void bcm53xxspi_enable_bspi(struct bcm53xxspi *b53spi)
+{
+       u32 tmp;
+
+       if (b53spi->bspi)
+               return;
+
+       tmp = bcm53xxspi_read(b53spi, B53SPI_BSPI_MAST_N_BOOT_CTRL);
+       if (!(tmp & 0x1))
+               return;
+
+       bcm53xxspi_write(b53spi, B53SPI_BSPI_MAST_N_BOOT_CTRL, 0x0);
+       b53spi->bspi = true;
+}
+
 static inline unsigned int bcm53xxspi_calc_timeout(size_t len)
 {
        /* Do some magic calculation based on length and baud. Add 10% and 1. */
@@ -176,6 +223,8 @@ static int bcm53xxspi_transfer_one(struct spi_master *master,
        u8 *buf;
        size_t left;
 
+       bcm53xxspi_disable_bspi(b53spi);
+
        if (t->tx_buf) {
                buf = (u8 *)t->tx_buf;
                left = t->len;
@@ -206,6 +255,22 @@ static int bcm53xxspi_transfer_one(struct spi_master *master,
        return 0;
 }
 
+static int bcm53xxspi_flash_read(struct spi_device *spi,
+                                struct spi_flash_read_message *msg)
+{
+       struct bcm53xxspi *b53spi = spi_master_get_devdata(spi->master);
+       int ret = 0;
+
+       if (msg->from + msg->len > BCM53XXSPI_FLASH_WINDOW)
+               return -EINVAL;
+
+       bcm53xxspi_enable_bspi(b53spi);
+       memcpy_fromio(msg->buf, b53spi->mmio_base + msg->from, msg->len);
+       msg->retlen = msg->len;
+
+       return ret;
+}
+
 /**************************************************
  * BCMA
  **************************************************/
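The new ->spi_flash_read() path just reads out of the memory-mapped BSPI window, so the essential work is bounds-checking the request against the mapped size before memcpy_fromio(). A simplified sketch of that shape; the helper name is invented for illustration:

    /* Sketch: serve a flash read from an ioremapped window of win_len bytes. */
    static int mapped_flash_read(void __iomem *win, size_t win_len,
                                 struct spi_flash_read_message *msg)
    {
            if (msg->from + msg->len > win_len)     /* reject reads past the window */
                    return -EINVAL;

            memcpy_fromio(msg->buf, win + msg->from, msg->len);
            msg->retlen = msg->len;
            return 0;
    }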
@@ -222,6 +287,7 @@ MODULE_DEVICE_TABLE(bcma, bcm53xxspi_bcma_tbl);
 
 static int bcm53xxspi_bcma_probe(struct bcma_device *core)
 {
+       struct device *dev = &core->dev;
        struct bcm53xxspi *b53spi;
        struct spi_master *master;
        int err;
@@ -231,7 +297,7 @@ static int bcm53xxspi_bcma_probe(struct bcma_device *core)
                return -ENOTSUPP;
        }
 
-       master = spi_alloc_master(&core->dev, sizeof(*b53spi));
+       master = spi_alloc_master(dev, sizeof(*b53spi));
        if (!master)
                return -ENOMEM;
 
@@ -239,11 +305,19 @@ static int bcm53xxspi_bcma_probe(struct bcma_device *core)
        b53spi->master = master;
        b53spi->core = core;
 
+       if (core->addr_s[0])
+               b53spi->mmio_base = devm_ioremap(dev, core->addr_s[0],
+                                                BCM53XXSPI_FLASH_WINDOW);
+       b53spi->bspi = true;
+       bcm53xxspi_disable_bspi(b53spi);
+
        master->transfer_one = bcm53xxspi_transfer_one;
+       if (b53spi->mmio_base)
+               master->spi_flash_read = bcm53xxspi_flash_read;
 
        bcma_set_drvdata(core, b53spi);
 
-       err = devm_spi_register_master(&core->dev, master);
+       err = devm_spi_register_master(dev, master);
        if (err) {
                spi_master_put(master);
                bcma_set_drvdata(core, NULL);
index 121a413..1c57ce6 100644 (file)
 #include <linux/of_irq.h>
 #include <linux/of_address.h>
 #include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
 #include <linux/spi/spi.h>
 
 /* Name of this driver */
 #define CDNS_SPI_NAME          "cdns-spi"
 
 /* Register offset definitions */
-#define CDNS_SPI_CR_OFFSET     0x00 /* Configuration  Register, RW */
-#define CDNS_SPI_ISR_OFFSET    0x04 /* Interrupt Status Register, RO */
-#define CDNS_SPI_IER_OFFSET    0x08 /* Interrupt Enable Register, WO */
-#define CDNS_SPI_IDR_OFFSET    0x0c /* Interrupt Disable Register, WO */
-#define CDNS_SPI_IMR_OFFSET    0x10 /* Interrupt Enabled Mask Register, RO */
-#define CDNS_SPI_ER_OFFSET     0x14 /* Enable/Disable Register, RW */
-#define CDNS_SPI_DR_OFFSET     0x18 /* Delay Register, RW */
-#define CDNS_SPI_TXD_OFFSET    0x1C /* Data Transmit Register, WO */
-#define CDNS_SPI_RXD_OFFSET    0x20 /* Data Receive Register, RO */
-#define CDNS_SPI_SICR_OFFSET   0x24 /* Slave Idle Count Register, RW */
-#define CDNS_SPI_THLD_OFFSET   0x28 /* Transmit FIFO Watermark Register,RW */
-
+#define CDNS_SPI_CR    0x00 /* Configuration  Register, RW */
+#define CDNS_SPI_ISR   0x04 /* Interrupt Status Register, RO */
+#define CDNS_SPI_IER   0x08 /* Interrupt Enable Register, WO */
+#define CDNS_SPI_IDR   0x0c /* Interrupt Disable Register, WO */
+#define CDNS_SPI_IMR   0x10 /* Interrupt Enabled Mask Register, RO */
+#define CDNS_SPI_ER    0x14 /* Enable/Disable Register, RW */
+#define CDNS_SPI_DR    0x18 /* Delay Register, RW */
+#define CDNS_SPI_TXD   0x1C /* Data Transmit Register, WO */
+#define CDNS_SPI_RXD   0x20 /* Data Receive Register, RO */
+#define CDNS_SPI_SICR  0x24 /* Slave Idle Count Register, RW */
+#define CDNS_SPI_THLD  0x28 /* Transmit FIFO Watermark Register,RW */
+
+#define SPI_AUTOSUSPEND_TIMEOUT                3000
 /*
  * SPI Configuration Register bit Masks
  *
  * This register contains various control bits that affect the operation
  * of the SPI controller
  */
-#define CDNS_SPI_CR_MANSTRT_MASK       0x00010000 /* Manual TX Start */
-#define CDNS_SPI_CR_CPHA_MASK          0x00000004 /* Clock Phase Control */
-#define CDNS_SPI_CR_CPOL_MASK          0x00000002 /* Clock Polarity Control */
-#define CDNS_SPI_CR_SSCTRL_MASK                0x00003C00 /* Slave Select Mask */
-#define CDNS_SPI_CR_PERI_SEL_MASK      0x00000200 /* Peripheral Select Decode */
-#define CDNS_SPI_CR_BAUD_DIV_MASK      0x00000038 /* Baud Rate Divisor Mask */
-#define CDNS_SPI_CR_MSTREN_MASK                0x00000001 /* Master Enable Mask */
-#define CDNS_SPI_CR_MANSTRTEN_MASK     0x00008000 /* Manual TX Enable Mask */
-#define CDNS_SPI_CR_SSFORCE_MASK       0x00004000 /* Manual SS Enable Mask */
-#define CDNS_SPI_CR_BAUD_DIV_4_MASK    0x00000008 /* Default Baud Div Mask */
-#define CDNS_SPI_CR_DEFAULT_MASK       (CDNS_SPI_CR_MSTREN_MASK | \
-                                       CDNS_SPI_CR_SSCTRL_MASK | \
-                                       CDNS_SPI_CR_SSFORCE_MASK | \
-                                       CDNS_SPI_CR_BAUD_DIV_4_MASK)
+#define CDNS_SPI_CR_MANSTRT    0x00010000 /* Manual TX Start */
+#define CDNS_SPI_CR_CPHA               0x00000004 /* Clock Phase Control */
+#define CDNS_SPI_CR_CPOL               0x00000002 /* Clock Polarity Control */
+#define CDNS_SPI_CR_SSCTRL             0x00003C00 /* Slave Select Mask */
+#define CDNS_SPI_CR_PERI_SEL   0x00000200 /* Peripheral Select Decode */
+#define CDNS_SPI_CR_BAUD_DIV   0x00000038 /* Baud Rate Divisor Mask */
+#define CDNS_SPI_CR_MSTREN             0x00000001 /* Master Enable Mask */
+#define CDNS_SPI_CR_MANSTRTEN  0x00008000 /* Manual TX Enable Mask */
+#define CDNS_SPI_CR_SSFORCE    0x00004000 /* Manual SS Enable Mask */
+#define CDNS_SPI_CR_BAUD_DIV_4 0x00000008 /* Default Baud Div Mask */
+#define CDNS_SPI_CR_DEFAULT    (CDNS_SPI_CR_MSTREN | \
+                                       CDNS_SPI_CR_SSCTRL | \
+                                       CDNS_SPI_CR_SSFORCE | \
+                                       CDNS_SPI_CR_BAUD_DIV_4)
 
 /*
  * SPI Configuration Register - Baud rate and slave select
  * All the four interrupt registers (Status/Mask/Enable/Disable) have the same
  * bit definitions.
  */
-#define CDNS_SPI_IXR_TXOW_MASK 0x00000004 /* SPI TX FIFO Overwater */
-#define CDNS_SPI_IXR_MODF_MASK 0x00000002 /* SPI Mode Fault */
-#define CDNS_SPI_IXR_RXNEMTY_MASK 0x00000010 /* SPI RX FIFO Not Empty */
-#define CDNS_SPI_IXR_DEFAULT_MASK      (CDNS_SPI_IXR_TXOW_MASK | \
-                                       CDNS_SPI_IXR_MODF_MASK)
-#define CDNS_SPI_IXR_TXFULL_MASK       0x00000008 /* SPI TX Full */
-#define CDNS_SPI_IXR_ALL_MASK  0x0000007F /* SPI all interrupts */
+#define CDNS_SPI_IXR_TXOW      0x00000004 /* SPI TX FIFO Overwater */
+#define CDNS_SPI_IXR_MODF      0x00000002 /* SPI Mode Fault */
+#define CDNS_SPI_IXR_RXNEMTY 0x00000010 /* SPI RX FIFO Not Empty */
+#define CDNS_SPI_IXR_DEFAULT   (CDNS_SPI_IXR_TXOW | \
+                                       CDNS_SPI_IXR_MODF)
+#define CDNS_SPI_IXR_TXFULL    0x00000008 /* SPI TX Full */
+#define CDNS_SPI_IXR_ALL       0x0000007F /* SPI all interrupts */
 
 /*
  * SPI Enable Register bit Masks
  *
  * This register is used to enable or disable the SPI controller
  */
-#define CDNS_SPI_ER_ENABLE_MASK        0x00000001 /* SPI Enable Bit Mask */
-#define CDNS_SPI_ER_DISABLE_MASK       0x0 /* SPI Disable Bit Mask */
+#define CDNS_SPI_ER_ENABLE     0x00000001 /* SPI Enable Bit Mask */
+#define CDNS_SPI_ER_DISABLE    0x0 /* SPI Disable Bit Mask */
 
 /* SPI FIFO depth in bytes */
 #define CDNS_SPI_FIFO_DEPTH    128
@@ -149,56 +151,51 @@ static inline void cdns_spi_write(struct cdns_spi *xspi, u32 offset, u32 val)
  */
 static void cdns_spi_init_hw(struct cdns_spi *xspi)
 {
-       u32 ctrl_reg = CDNS_SPI_CR_DEFAULT_MASK;
+       u32 ctrl_reg = CDNS_SPI_CR_DEFAULT;
 
        if (xspi->is_decoded_cs)
-               ctrl_reg |= CDNS_SPI_CR_PERI_SEL_MASK;
+               ctrl_reg |= CDNS_SPI_CR_PERI_SEL;
 
-       cdns_spi_write(xspi, CDNS_SPI_ER_OFFSET,
-                      CDNS_SPI_ER_DISABLE_MASK);
-       cdns_spi_write(xspi, CDNS_SPI_IDR_OFFSET,
-                      CDNS_SPI_IXR_ALL_MASK);
+       cdns_spi_write(xspi, CDNS_SPI_ER, CDNS_SPI_ER_DISABLE);
+       cdns_spi_write(xspi, CDNS_SPI_IDR, CDNS_SPI_IXR_ALL);
 
        /* Clear the RX FIFO */
-       while (cdns_spi_read(xspi, CDNS_SPI_ISR_OFFSET) &
-              CDNS_SPI_IXR_RXNEMTY_MASK)
-               cdns_spi_read(xspi, CDNS_SPI_RXD_OFFSET);
-
-       cdns_spi_write(xspi, CDNS_SPI_ISR_OFFSET,
-                      CDNS_SPI_IXR_ALL_MASK);
-       cdns_spi_write(xspi, CDNS_SPI_CR_OFFSET, ctrl_reg);
-       cdns_spi_write(xspi, CDNS_SPI_ER_OFFSET,
-                      CDNS_SPI_ER_ENABLE_MASK);
+       while (cdns_spi_read(xspi, CDNS_SPI_ISR) & CDNS_SPI_IXR_RXNEMTY)
+               cdns_spi_read(xspi, CDNS_SPI_RXD);
+
+       cdns_spi_write(xspi, CDNS_SPI_ISR, CDNS_SPI_IXR_ALL);
+       cdns_spi_write(xspi, CDNS_SPI_CR, ctrl_reg);
+       cdns_spi_write(xspi, CDNS_SPI_ER, CDNS_SPI_ER_ENABLE);
 }
 
 /**
  * cdns_spi_chipselect - Select or deselect the chip select line
  * @spi:       Pointer to the spi_device structure
- * @is_on:     Select(0) or deselect (1) the chip select line
+ * @is_high:   Select(0) or deselect (1) the chip select line
  */
 static void cdns_spi_chipselect(struct spi_device *spi, bool is_high)
 {
        struct cdns_spi *xspi = spi_master_get_devdata(spi->master);
        u32 ctrl_reg;
 
-       ctrl_reg = cdns_spi_read(xspi, CDNS_SPI_CR_OFFSET);
+       ctrl_reg = cdns_spi_read(xspi, CDNS_SPI_CR);
 
        if (is_high) {
                /* Deselect the slave */
-               ctrl_reg |= CDNS_SPI_CR_SSCTRL_MASK;
+               ctrl_reg |= CDNS_SPI_CR_SSCTRL;
        } else {
                /* Select the slave */
-               ctrl_reg &= ~CDNS_SPI_CR_SSCTRL_MASK;
+               ctrl_reg &= ~CDNS_SPI_CR_SSCTRL;
                if (!(xspi->is_decoded_cs))
                        ctrl_reg |= ((~(CDNS_SPI_SS0 << spi->chip_select)) <<
                                     CDNS_SPI_SS_SHIFT) &
-                                    CDNS_SPI_CR_SSCTRL_MASK;
+                                    CDNS_SPI_CR_SSCTRL;
                else
                        ctrl_reg |= (spi->chip_select << CDNS_SPI_SS_SHIFT) &
-                                    CDNS_SPI_CR_SSCTRL_MASK;
+                                    CDNS_SPI_CR_SSCTRL;
        }
 
-       cdns_spi_write(xspi, CDNS_SPI_CR_OFFSET, ctrl_reg);
+       cdns_spi_write(xspi, CDNS_SPI_CR, ctrl_reg);
 }
 
 /**
@@ -212,14 +209,15 @@ static void cdns_spi_config_clock_mode(struct spi_device *spi)
        struct cdns_spi *xspi = spi_master_get_devdata(spi->master);
        u32 ctrl_reg, new_ctrl_reg;
 
-       new_ctrl_reg = ctrl_reg = cdns_spi_read(xspi, CDNS_SPI_CR_OFFSET);
+       new_ctrl_reg = cdns_spi_read(xspi, CDNS_SPI_CR);
+       ctrl_reg = new_ctrl_reg;
 
        /* Set the SPI clock phase and clock polarity */
-       new_ctrl_reg &= ~(CDNS_SPI_CR_CPHA_MASK | CDNS_SPI_CR_CPOL_MASK);
+       new_ctrl_reg &= ~(CDNS_SPI_CR_CPHA | CDNS_SPI_CR_CPOL);
        if (spi->mode & SPI_CPHA)
-               new_ctrl_reg |= CDNS_SPI_CR_CPHA_MASK;
+               new_ctrl_reg |= CDNS_SPI_CR_CPHA;
        if (spi->mode & SPI_CPOL)
-               new_ctrl_reg |= CDNS_SPI_CR_CPOL_MASK;
+               new_ctrl_reg |= CDNS_SPI_CR_CPOL;
 
        if (new_ctrl_reg != ctrl_reg) {
                /*
@@ -228,11 +226,9 @@ static void cdns_spi_config_clock_mode(struct spi_device *spi)
                 * polarity as it will cause the SPI slave to see spurious clock
                 * transitions. To workaround the issue toggle the ER register.
                 */
-               cdns_spi_write(xspi, CDNS_SPI_ER_OFFSET,
-                                  CDNS_SPI_ER_DISABLE_MASK);
-               cdns_spi_write(xspi, CDNS_SPI_CR_OFFSET, new_ctrl_reg);
-               cdns_spi_write(xspi, CDNS_SPI_ER_OFFSET,
-                                  CDNS_SPI_ER_ENABLE_MASK);
+               cdns_spi_write(xspi, CDNS_SPI_ER, CDNS_SPI_ER_DISABLE);
+               cdns_spi_write(xspi, CDNS_SPI_CR, new_ctrl_reg);
+               cdns_spi_write(xspi, CDNS_SPI_ER, CDNS_SPI_ER_ENABLE);
        }
 }
 
@@ -251,7 +247,7 @@ static void cdns_spi_config_clock_mode(struct spi_device *spi)
  * controller.
  */
 static void cdns_spi_config_clock_freq(struct spi_device *spi,
-                                 struct spi_transfer *transfer)
+                                      struct spi_transfer *transfer)
 {
        struct cdns_spi *xspi = spi_master_get_devdata(spi->master);
        u32 ctrl_reg, baud_rate_val;
@@ -259,7 +255,7 @@ static void cdns_spi_config_clock_freq(struct spi_device *spi,
 
        frequency = clk_get_rate(xspi->ref_clk);
 
-       ctrl_reg = cdns_spi_read(xspi, CDNS_SPI_CR_OFFSET);
+       ctrl_reg = cdns_spi_read(xspi, CDNS_SPI_CR);
 
        /* Set the clock frequency */
        if (xspi->speed_hz != transfer->speed_hz) {
@@ -269,12 +265,12 @@ static void cdns_spi_config_clock_freq(struct spi_device *spi,
                       (frequency / (2 << baud_rate_val)) > transfer->speed_hz)
                        baud_rate_val++;
 
-               ctrl_reg &= ~CDNS_SPI_CR_BAUD_DIV_MASK;
+               ctrl_reg &= ~CDNS_SPI_CR_BAUD_DIV;
                ctrl_reg |= baud_rate_val << CDNS_SPI_BAUD_DIV_SHIFT;
 
                xspi->speed_hz = frequency / (2 << baud_rate_val);
        }
-       cdns_spi_write(xspi, CDNS_SPI_CR_OFFSET, ctrl_reg);
+       cdns_spi_write(xspi, CDNS_SPI_CR, ctrl_reg);
 }
 
 /**
@@ -313,10 +309,9 @@ static void cdns_spi_fill_tx_fifo(struct cdns_spi *xspi)
        while ((trans_cnt < CDNS_SPI_FIFO_DEPTH) &&
               (xspi->tx_bytes > 0)) {
                if (xspi->txbuf)
-                       cdns_spi_write(xspi, CDNS_SPI_TXD_OFFSET,
-                                      *xspi->txbuf++);
+                       cdns_spi_write(xspi, CDNS_SPI_TXD, *xspi->txbuf++);
                else
-                       cdns_spi_write(xspi, CDNS_SPI_TXD_OFFSET, 0);
+                       cdns_spi_write(xspi, CDNS_SPI_TXD, 0);
 
                xspi->tx_bytes--;
                trans_cnt++;
@@ -344,19 +339,18 @@ static irqreturn_t cdns_spi_irq(int irq, void *dev_id)
        u32 intr_status, status;
 
        status = IRQ_NONE;
-       intr_status = cdns_spi_read(xspi, CDNS_SPI_ISR_OFFSET);
-       cdns_spi_write(xspi, CDNS_SPI_ISR_OFFSET, intr_status);
+       intr_status = cdns_spi_read(xspi, CDNS_SPI_ISR);
+       cdns_spi_write(xspi, CDNS_SPI_ISR, intr_status);
 
-       if (intr_status & CDNS_SPI_IXR_MODF_MASK) {
+       if (intr_status & CDNS_SPI_IXR_MODF) {
                /* Indicate that transfer is completed, the SPI subsystem will
                 * identify the error as the remaining bytes to be
                 * transferred is non-zero
                 */
-               cdns_spi_write(xspi, CDNS_SPI_IDR_OFFSET,
-                              CDNS_SPI_IXR_DEFAULT_MASK);
+               cdns_spi_write(xspi, CDNS_SPI_IDR, CDNS_SPI_IXR_DEFAULT);
                spi_finalize_current_transfer(master);
                status = IRQ_HANDLED;
-       } else if (intr_status & CDNS_SPI_IXR_TXOW_MASK) {
+       } else if (intr_status & CDNS_SPI_IXR_TXOW) {
                unsigned long trans_cnt;
 
                trans_cnt = xspi->rx_bytes - xspi->tx_bytes;
@@ -365,7 +359,7 @@ static irqreturn_t cdns_spi_irq(int irq, void *dev_id)
                while (trans_cnt) {
                        u8 data;
 
-                       data = cdns_spi_read(xspi, CDNS_SPI_RXD_OFFSET);
+                       data = cdns_spi_read(xspi, CDNS_SPI_RXD);
                        if (xspi->rxbuf)
                                *xspi->rxbuf++ = data;
 
@@ -378,8 +372,8 @@ static irqreturn_t cdns_spi_irq(int irq, void *dev_id)
                        cdns_spi_fill_tx_fifo(xspi);
                } else {
                        /* Transfer is completed */
-                       cdns_spi_write(xspi, CDNS_SPI_IDR_OFFSET,
-                                      CDNS_SPI_IXR_DEFAULT_MASK);
+                       cdns_spi_write(xspi, CDNS_SPI_IDR,
+                                      CDNS_SPI_IXR_DEFAULT);
                        spi_finalize_current_transfer(master);
                }
                status = IRQ_HANDLED;
@@ -387,6 +381,7 @@ static irqreturn_t cdns_spi_irq(int irq, void *dev_id)
 
        return status;
 }
+
 static int cdns_prepare_message(struct spi_master *master,
                                struct spi_message *msg)
 {
@@ -421,8 +416,7 @@ static int cdns_transfer_one(struct spi_master *master,
 
        cdns_spi_fill_tx_fifo(xspi);
 
-       cdns_spi_write(xspi, CDNS_SPI_IER_OFFSET,
-                      CDNS_SPI_IXR_DEFAULT_MASK);
+       cdns_spi_write(xspi, CDNS_SPI_IER, CDNS_SPI_IXR_DEFAULT);
        return transfer->len;
 }
 
@@ -439,8 +433,7 @@ static int cdns_prepare_transfer_hardware(struct spi_master *master)
 {
        struct cdns_spi *xspi = spi_master_get_devdata(master);
 
-       cdns_spi_write(xspi, CDNS_SPI_ER_OFFSET,
-                      CDNS_SPI_ER_ENABLE_MASK);
+       cdns_spi_write(xspi, CDNS_SPI_ER, CDNS_SPI_ER_ENABLE);
 
        return 0;
 }
@@ -458,8 +451,7 @@ static int cdns_unprepare_transfer_hardware(struct spi_master *master)
 {
        struct cdns_spi *xspi = spi_master_get_devdata(master);
 
-       cdns_spi_write(xspi, CDNS_SPI_ER_OFFSET,
-                      CDNS_SPI_ER_DISABLE_MASK);
+       cdns_spi_write(xspi, CDNS_SPI_ER, CDNS_SPI_ER_DISABLE);
 
        return 0;
 }
@@ -481,7 +473,7 @@ static int cdns_spi_probe(struct platform_device *pdev)
        u32 num_cs;
 
        master = spi_alloc_master(&pdev->dev, sizeof(*xspi));
-       if (master == NULL)
+       if (!master)
                return -ENOMEM;
 
        xspi = spi_master_get_devdata(master);
@@ -521,6 +513,11 @@ static int cdns_spi_probe(struct platform_device *pdev)
                goto clk_dis_apb;
        }
 
+       pm_runtime_enable(&pdev->dev);
+       pm_runtime_use_autosuspend(&pdev->dev);
+       pm_runtime_set_autosuspend_delay(&pdev->dev, SPI_AUTOSUSPEND_TIMEOUT);
+       pm_runtime_set_active(&pdev->dev);
+
        ret = of_property_read_u32(pdev->dev.of_node, "num-cs", &num_cs);
        if (ret < 0)
                master->num_chipselect = CDNS_SPI_DEFAULT_NUM_CS;
@@ -535,11 +532,14 @@ static int cdns_spi_probe(struct platform_device *pdev)
        /* SPI controller initializations */
        cdns_spi_init_hw(xspi);
 
+       pm_runtime_mark_last_busy(&pdev->dev);
+       pm_runtime_put_autosuspend(&pdev->dev);
+
        irq = platform_get_irq(pdev, 0);
        if (irq <= 0) {
                ret = -ENXIO;
                dev_err(&pdev->dev, "irq number is invalid\n");
-               goto remove_master;
+               goto clk_dis_all;
        }
 
        ret = devm_request_irq(&pdev->dev, irq, cdns_spi_irq,
@@ -547,7 +547,7 @@ static int cdns_spi_probe(struct platform_device *pdev)
        if (ret != 0) {
                ret = -ENXIO;
                dev_err(&pdev->dev, "request_irq failed\n");
-               goto remove_master;
+               goto clk_dis_all;
        }
 
        master->prepare_transfer_hardware = cdns_prepare_transfer_hardware;
@@ -555,6 +555,7 @@ static int cdns_spi_probe(struct platform_device *pdev)
        master->transfer_one = cdns_transfer_one;
        master->unprepare_transfer_hardware = cdns_unprepare_transfer_hardware;
        master->set_cs = cdns_spi_chipselect;
+       master->auto_runtime_pm = true;
        master->mode_bits = SPI_CPOL | SPI_CPHA;
 
        /* Set to default valid value */
@@ -572,6 +573,8 @@ static int cdns_spi_probe(struct platform_device *pdev)
        return ret;
 
 clk_dis_all:
+       pm_runtime_set_suspended(&pdev->dev);
+       pm_runtime_disable(&pdev->dev);
        clk_disable_unprepare(xspi->ref_clk);
 clk_dis_apb:
        clk_disable_unprepare(xspi->pclk);
@@ -595,11 +598,12 @@ static int cdns_spi_remove(struct platform_device *pdev)
        struct spi_master *master = platform_get_drvdata(pdev);
        struct cdns_spi *xspi = spi_master_get_devdata(master);
 
-       cdns_spi_write(xspi, CDNS_SPI_ER_OFFSET,
-                      CDNS_SPI_ER_DISABLE_MASK);
+       cdns_spi_write(xspi, CDNS_SPI_ER, CDNS_SPI_ER_DISABLE);
 
        clk_disable_unprepare(xspi->ref_clk);
        clk_disable_unprepare(xspi->pclk);
+       pm_runtime_set_suspended(&pdev->dev);
+       pm_runtime_disable(&pdev->dev);
 
        spi_unregister_master(master);
 
@@ -613,21 +617,14 @@ static int cdns_spi_remove(struct platform_device *pdev)
  * This function disables the SPI controller and
  * changes the driver state to "suspend"
  *
- * Return:     Always 0
+ * Return:     0 on success and error value on error
  */
 static int __maybe_unused cdns_spi_suspend(struct device *dev)
 {
        struct platform_device *pdev = to_platform_device(dev);
        struct spi_master *master = platform_get_drvdata(pdev);
-       struct cdns_spi *xspi = spi_master_get_devdata(master);
-
-       spi_master_suspend(master);
-
-       clk_disable_unprepare(xspi->ref_clk);
-
-       clk_disable_unprepare(xspi->pclk);
 
-       return 0;
+       return spi_master_suspend(master);
 }
 
 /**
@@ -642,8 +639,23 @@ static int __maybe_unused cdns_spi_resume(struct device *dev)
 {
        struct platform_device *pdev = to_platform_device(dev);
        struct spi_master *master = platform_get_drvdata(pdev);
+
+       return spi_master_resume(master);
+}
+
+/**
+ * cdns_spi_runtime_resume - Runtime resume method for the SPI driver
+ * @dev:       Address of the platform_device structure
+ *
+ * This function enables the clocks
+ *
+ * Return:     0 on success and error value on error
+ */
+static int __maybe_unused cnds_runtime_resume(struct device *dev)
+{
+       struct spi_master *master = dev_get_drvdata(dev);
        struct cdns_spi *xspi = spi_master_get_devdata(master);
-       int ret = 0;
+       int ret;
 
        ret = clk_prepare_enable(xspi->pclk);
        if (ret) {
@@ -657,13 +669,33 @@ static int __maybe_unused cdns_spi_resume(struct device *dev)
                clk_disable(xspi->pclk);
                return ret;
        }
-       spi_master_resume(master);
+       return 0;
+}
+
+/**
+ * cdns_spi_runtime_suspend - Runtime suspend method for the SPI driver
+ * @dev:       Address of the platform_device structure
+ *
+ * This function disables the clocks
+ *
+ * Return:     Always 0
+ */
+static int __maybe_unused cnds_runtime_suspend(struct device *dev)
+{
+       struct spi_master *master = dev_get_drvdata(dev);
+       struct cdns_spi *xspi = spi_master_get_devdata(master);
+
+       clk_disable_unprepare(xspi->ref_clk);
+       clk_disable_unprepare(xspi->pclk);
 
        return 0;
 }
 
-static SIMPLE_DEV_PM_OPS(cdns_spi_dev_pm_ops, cdns_spi_suspend,
-                        cdns_spi_resume);
+static const struct dev_pm_ops cdns_spi_dev_pm_ops = {
+       SET_RUNTIME_PM_OPS(cnds_runtime_suspend,
+                          cnds_runtime_resume, NULL)
+       SET_SYSTEM_SLEEP_PM_OPS(cdns_spi_suspend, cdns_spi_resume)
+};
 
 static const struct of_device_id cdns_spi_of_match[] = {
        { .compatible = "xlnx,zynq-spi-r1p6" },
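The probe/remove and dev_pm_ops changes above follow the usual runtime-PM autosuspend recipe. A condensed sketch of the probe-side ordering, in the same order as the hunks; this is an outline with error handling omitted, not the full function:

    /* after clocks are prepared and the controller is usable */
    pm_runtime_enable(&pdev->dev);
    pm_runtime_use_autosuspend(&pdev->dev);
    pm_runtime_set_autosuspend_delay(&pdev->dev, SPI_AUTOSUSPEND_TIMEOUT);
    pm_runtime_set_active(&pdev->dev);

    /* ... hardware init runs while the device is marked active ... */

    pm_runtime_mark_last_busy(&pdev->dev);
    pm_runtime_put_autosuspend(&pdev->dev);

    /* master->auto_runtime_pm = true lets the SPI core wrap transfers in
     * pm_runtime_get_sync()/pm_runtime_put_autosuspend() calls. */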
index fddb7a3..d36c11b 100644 (file)
@@ -23,7 +23,6 @@
 #include <linux/clk.h>
 #include <linux/dmaengine.h>
 #include <linux/dma-mapping.h>
-#include <linux/edma.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
 #include <linux/of_gpio.h>
@@ -33,8 +32,6 @@
 
 #include <linux/platform_data/spi-davinci.h>
 
-#define SPI_NO_RESOURCE                ((resource_size_t)-1)
-
 #define CS_DEFAULT     0xFF
 
 #define SPIFMT_PHASE_MASK      BIT(16)
@@ -130,8 +127,6 @@ struct davinci_spi {
 
        struct dma_chan         *dma_rx;
        struct dma_chan         *dma_tx;
-       int                     dma_rx_chnum;
-       int                     dma_tx_chnum;
 
        struct davinci_spi_platform_data pdata;
 
@@ -797,35 +792,19 @@ static irqreturn_t davinci_spi_irq(s32 irq, void *data)
 
 static int davinci_spi_request_dma(struct davinci_spi *dspi)
 {
-       dma_cap_mask_t mask;
        struct device *sdev = dspi->bitbang.master->dev.parent;
-       int r;
-
-       dma_cap_zero(mask);
-       dma_cap_set(DMA_SLAVE, mask);
 
-       dspi->dma_rx = dma_request_channel(mask, edma_filter_fn,
-                                          &dspi->dma_rx_chnum);
-       if (!dspi->dma_rx) {
-               dev_err(sdev, "request RX DMA channel failed\n");
-               r = -ENODEV;
-               goto rx_dma_failed;
-       }
+       dspi->dma_rx = dma_request_chan(sdev, "rx");
+       if (IS_ERR(dspi->dma_rx))
+               return PTR_ERR(dspi->dma_rx);
 
-       dspi->dma_tx = dma_request_channel(mask, edma_filter_fn,
-                                          &dspi->dma_tx_chnum);
-       if (!dspi->dma_tx) {
-               dev_err(sdev, "request TX DMA channel failed\n");
-               r = -ENODEV;
-               goto tx_dma_failed;
+       dspi->dma_tx = dma_request_chan(sdev, "tx");
+       if (IS_ERR(dspi->dma_tx)) {
+               dma_release_channel(dspi->dma_rx);
+               return PTR_ERR(dspi->dma_tx);
        }
 
        return 0;
-
-tx_dma_failed:
-       dma_release_channel(dspi->dma_rx);
-rx_dma_failed:
-       return r;
 }
 
 #if defined(CONFIG_OF)
@@ -936,8 +915,6 @@ static int davinci_spi_probe(struct platform_device *pdev)
        struct davinci_spi *dspi;
        struct davinci_spi_platform_data *pdata;
        struct resource *r;
-       resource_size_t dma_rx_chan = SPI_NO_RESOURCE;
-       resource_size_t dma_tx_chan = SPI_NO_RESOURCE;
        int ret = 0;
        u32 spipc0;
 
@@ -1044,27 +1021,15 @@ static int davinci_spi_probe(struct platform_device *pdev)
                }
        }
 
-       r = platform_get_resource(pdev, IORESOURCE_DMA, 0);
-       if (r)
-               dma_rx_chan = r->start;
-       r = platform_get_resource(pdev, IORESOURCE_DMA, 1);
-       if (r)
-               dma_tx_chan = r->start;
-
        dspi->bitbang.txrx_bufs = davinci_spi_bufs;
-       if (dma_rx_chan != SPI_NO_RESOURCE &&
-           dma_tx_chan != SPI_NO_RESOURCE) {
-               dspi->dma_rx_chnum = dma_rx_chan;
-               dspi->dma_tx_chnum = dma_tx_chan;
-
-               ret = davinci_spi_request_dma(dspi);
-               if (ret)
-                       goto free_clk;
-
-               dev_info(&pdev->dev, "DMA: supported\n");
-               dev_info(&pdev->dev, "DMA: RX channel: %pa, TX channel: %pa, event queue: %d\n",
-                               &dma_rx_chan, &dma_tx_chan,
-                               pdata->dma_event_q);
+
+       ret = davinci_spi_request_dma(dspi);
+       if (ret == -EPROBE_DEFER) {
+               goto free_clk;
+       } else if (ret) {
+               dev_info(&pdev->dev, "DMA is not supported (%d)\n", ret);
+               dspi->dma_rx = NULL;
+               dspi->dma_tx = NULL;
        }
 
        dspi->get_rx = davinci_spi_rx_buf_u8;
@@ -1102,8 +1067,10 @@ static int davinci_spi_probe(struct platform_device *pdev)
        return ret;
 
 free_dma:
-       dma_release_channel(dspi->dma_rx);
-       dma_release_channel(dspi->dma_tx);
+       if (dspi->dma_rx) {
+               dma_release_channel(dspi->dma_rx);
+               dma_release_channel(dspi->dma_tx);
+       }
 free_clk:
        clk_disable_unprepare(dspi->clk);
 free_master:
@@ -1134,6 +1101,11 @@ static int davinci_spi_remove(struct platform_device *pdev)
        clk_disable_unprepare(dspi->clk);
        spi_master_put(master);
 
+       if (dspi->dma_rx) {
+               dma_release_channel(dspi->dma_rx);
+               dma_release_channel(dspi->dma_tx);
+       }
+
        return 0;
 }
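dma_request_chan() returns an ERR_PTR instead of NULL, which is what lets the probe above tell "DMA provider not ready yet" (-EPROBE_DEFER) apart from "no DMA, fall back to PIO". A minimal sketch of the optional-DMA pattern, with the helper name invented for illustration:

    #include <linux/dmaengine.h>

    static int request_optional_dma(struct device *dev,
                                    struct dma_chan **rx, struct dma_chan **tx)
    {
            *rx = dma_request_chan(dev, "rx");
            if (IS_ERR(*rx))
                    return PTR_ERR(*rx);

            *tx = dma_request_chan(dev, "tx");
            if (IS_ERR(*tx)) {
                    dma_release_channel(*rx);
                    return PTR_ERR(*tx);
            }
            return 0;
    }

    /* caller: -EPROBE_DEFER aborts the probe; any other error means PIO only */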
 
index 3b7d91d..b62a99c 100644 (file)
@@ -683,6 +683,7 @@ static int dln2_spi_probe(struct platform_device *pdev)
        struct spi_master *master;
        struct dln2_spi *dln2;
        struct dln2_platform_data *pdata = dev_get_platdata(&pdev->dev);
+       struct device *dev = &pdev->dev;
        int ret;
 
        master = spi_alloc_master(&pdev->dev, sizeof(*dln2));
@@ -700,6 +701,7 @@ static int dln2_spi_probe(struct platform_device *pdev)
        }
 
        dln2->master = master;
+       dln2->master->dev.of_node = dev->of_node;
        dln2->pdev = pdev;
        dln2->port = pdata->port;
        /* cs/mode can never be 0xff, so the first transfer will set them */
index 332ccb0..ef7db75 100644 (file)
@@ -67,7 +67,7 @@ static int spi_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
        dws->irq = pdev->irq;
 
        /*
-        * Specific handling for paltforms, like dma setup,
+        * Specific handling for platforms, like dma setup,
         * clock rate, FIFO depth.
         */
        if (desc) {
index bb00be8..17a6387 100644 (file)
@@ -567,7 +567,7 @@ static void ep93xx_spi_dma_transfer(struct ep93xx_spi *espi)
        txd = ep93xx_spi_dma_prepare(espi, DMA_MEM_TO_DEV);
        if (IS_ERR(txd)) {
                ep93xx_spi_dma_finish(espi, DMA_DEV_TO_MEM);
-               dev_err(&espi->pdev->dev, "DMA TX failed: %ld\n", PTR_ERR(rxd));
+               dev_err(&espi->pdev->dev, "DMA TX failed: %ld\n", PTR_ERR(txd));
                msg->status = PTR_ERR(txd);
                return;
        }
index c1a2d74..9e9dadb 100644 (file)
@@ -121,18 +121,22 @@ enum dspi_trans_mode {
 
 struct fsl_dspi_devtype_data {
        enum dspi_trans_mode trans_mode;
+       u8 max_clock_factor;
 };
 
 static const struct fsl_dspi_devtype_data vf610_data = {
        .trans_mode = DSPI_EOQ_MODE,
+       .max_clock_factor = 2,
 };
 
 static const struct fsl_dspi_devtype_data ls1021a_v1_data = {
        .trans_mode = DSPI_TCFQ_MODE,
+       .max_clock_factor = 8,
 };
 
 static const struct fsl_dspi_devtype_data ls2085a_data = {
        .trans_mode = DSPI_TCFQ_MODE,
+       .max_clock_factor = 8,
 };
 
 struct fsl_dspi {
@@ -726,6 +730,9 @@ static int dspi_probe(struct platform_device *pdev)
        }
        clk_prepare_enable(dspi->clk);
 
+       master->max_speed_hz =
+               clk_get_rate(dspi->clk) / dspi->devtype_data->max_clock_factor;
+
        init_waitqueue_head(&dspi->waitq);
        platform_set_drvdata(pdev, master);
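With a per-variant divider recorded, the ceiling exposed to the SPI core is just the protocol clock divided by the smallest usable factor, and the core clamps transfer speeds to it. A worked example under an assumed 100 MHz input clock on a factor-8 part:

    unsigned long rate = clk_get_rate(dspi->clk);           /* e.g. 100000000 Hz */

    master->max_speed_hz = rate / dspi->devtype_data->max_clock_factor;
    /* 100000000 / 8 = 12500000 Hz; faster transfer requests get clamped */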
 
index 7cb0c19..8d85a3c 100644 (file)
@@ -245,7 +245,12 @@ static int fsl_espi_bufs(struct spi_device *spi, struct spi_transfer *t)
        if (ret)
                return ret;
 
-       wait_for_completion(&mpc8xxx_spi->done);
+       /* Don't wait forever; the SPI bus sometimes loses interrupts... */
+       ret = wait_for_completion_timeout(&mpc8xxx_spi->done, 2 * HZ);
+       if (ret == 0)
+               dev_err(mpc8xxx_spi->dev,
+                       "Transaction hanging up (left %d bytes)\n",
+                       mpc8xxx_spi->count);
 
        /* disable rx ints */
        mpc8xxx_spi_write_reg(&reg_base->mask, 0);
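wait_for_completion_timeout() returns 0 on timeout and the remaining jiffies otherwise, which is why only the zero case is treated as an error above. A minimal sketch of the pattern:

    unsigned long left;

    left = wait_for_completion_timeout(&mpc8xxx_spi->done, 2 * HZ);
    if (!left) {
            /* interrupt was lost: report and fall through instead of hanging */
            dev_err(mpc8xxx_spi->dev, "SPI transfer timed out\n");
    }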
@@ -539,16 +544,31 @@ void fsl_espi_cpu_irq(struct mpc8xxx_spi *mspi, u32 events)
        if (events & SPIE_NE) {
                u32 rx_data, tmp;
                u8 rx_data_8;
+               int rx_nr_bytes = 4;
+               int ret;
 
                /* Spin until RX is done */
-               while (SPIE_RXCNT(events) < min(4, mspi->len)) {
-                       cpu_relax();
-                       events = mpc8xxx_spi_read_reg(&reg_base->event);
+               if (SPIE_RXCNT(events) < min(4, mspi->len)) {
+                       ret = spin_event_timeout(
+                               !(SPIE_RXCNT(events =
+                               mpc8xxx_spi_read_reg(&reg_base->event)) <
+                                               min(4, mspi->len)),
+                                               10000, 0); /* 10 msec */
+                       if (!ret)
+                               dev_err(mspi->dev,
+                                        "tired waiting for SPIE_RXCNT\n");
                }
 
                if (mspi->len >= 4) {
                        rx_data = mpc8xxx_spi_read_reg(&reg_base->receive);
+               } else if (mspi->len <= 0) {
+                       dev_err(mspi->dev,
+                               "unexpected RX(SPIE_NE) interrupt occurred,\n"
+                               "(local rxlen %d bytes, reg rxlen %d bytes)\n",
+                               min(4, mspi->len), SPIE_RXCNT(events));
+                       rx_nr_bytes = 0;
                } else {
+                       rx_nr_bytes = mspi->len;
                        tmp = mspi->len;
                        rx_data = 0;
                        while (tmp--) {
@@ -559,7 +579,7 @@ void fsl_espi_cpu_irq(struct mpc8xxx_spi *mspi, u32 events)
                        rx_data <<= (4 - mspi->len) * 8;
                }
 
-               mspi->len -= 4;
+               mspi->len -= rx_nr_bytes;
 
                if (mspi->rx)
                        mspi->get_rx(rx_data, mspi);
index 07e4ce8..3b17009 100644 (file)
@@ -175,6 +175,7 @@ err:
 static int octeon_spi_probe(struct platform_device *pdev)
 {
        struct resource *res_mem;
+       void __iomem *reg_base;
        struct spi_master *master;
        struct octeon_spi *p;
        int err = -ENOENT;
@@ -186,19 +187,13 @@ static int octeon_spi_probe(struct platform_device *pdev)
        platform_set_drvdata(pdev, master);
 
        res_mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-
-       if (res_mem == NULL) {
-               dev_err(&pdev->dev, "found no memory resource\n");
-               err = -ENXIO;
-               goto fail;
-       }
-       if (!devm_request_mem_region(&pdev->dev, res_mem->start,
-                                    resource_size(res_mem), res_mem->name)) {
-               dev_err(&pdev->dev, "request_mem_region failed\n");
+       reg_base = devm_ioremap_resource(&pdev->dev, res_mem);
+       if (IS_ERR(reg_base)) {
+               err = PTR_ERR(reg_base);
                goto fail;
        }
-       p->register_base = (u64)devm_ioremap(&pdev->dev, res_mem->start,
-                                            resource_size(res_mem));
+
+       p->register_base = (u64)reg_base;
 
        master->num_chipselect = 4;
        master->mode_bits = SPI_CPHA |
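devm_ioremap_resource() folds the request_mem_region()/ioremap() pair and their error reporting into one managed call, which is what lets the octeon probe path above shrink. The replacement pattern in isolation:

    struct resource *res;
    void __iomem *base;

    res  = platform_get_resource(pdev, IORESOURCE_MEM, 0);
    base = devm_ioremap_resource(&pdev->dev, res);  /* also handles res == NULL */
    if (IS_ERR(base))
            return PTR_ERR(base);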
index 0caa3c8..1d237e9 100644 (file)
@@ -23,7 +23,6 @@
 #include <linux/delay.h>
 #include <linux/dma-mapping.h>
 #include <linux/dmaengine.h>
-#include <linux/omap-dma.h>
 #include <linux/pinctrl/consumer.h>
 #include <linux/platform_device.h>
 #include <linux/err.h>
@@ -103,9 +102,6 @@ struct omap2_mcspi_dma {
        struct dma_chan *dma_tx;
        struct dma_chan *dma_rx;
 
-       int dma_tx_sync_dev;
-       int dma_rx_sync_dev;
-
        struct completion dma_tx_completion;
        struct completion dma_rx_completion;
 
@@ -964,8 +960,7 @@ static int omap2_mcspi_request_dma(struct spi_device *spi)
        struct spi_master       *master = spi->master;
        struct omap2_mcspi      *mcspi;
        struct omap2_mcspi_dma  *mcspi_dma;
-       dma_cap_mask_t mask;
-       unsigned sig;
+       int ret = 0;
 
        mcspi = spi_master_get_devdata(master);
        mcspi_dma = mcspi->dma_channels + spi->chip_select;
@@ -973,34 +968,25 @@ static int omap2_mcspi_request_dma(struct spi_device *spi)
        init_completion(&mcspi_dma->dma_rx_completion);
        init_completion(&mcspi_dma->dma_tx_completion);
 
-       dma_cap_zero(mask);
-       dma_cap_set(DMA_SLAVE, mask);
-       sig = mcspi_dma->dma_rx_sync_dev;
-
-       mcspi_dma->dma_rx =
-               dma_request_slave_channel_compat(mask, omap_dma_filter_fn,
-                                                &sig, &master->dev,
-                                                mcspi_dma->dma_rx_ch_name);
-       if (!mcspi_dma->dma_rx)
+       mcspi_dma->dma_rx = dma_request_chan(&master->dev,
+                                            mcspi_dma->dma_rx_ch_name);
+       if (IS_ERR(mcspi_dma->dma_rx)) {
+               ret = PTR_ERR(mcspi_dma->dma_rx);
+               mcspi_dma->dma_rx = NULL;
                goto no_dma;
+       }
 
-       sig = mcspi_dma->dma_tx_sync_dev;
-       mcspi_dma->dma_tx =
-               dma_request_slave_channel_compat(mask, omap_dma_filter_fn,
-                                                &sig, &master->dev,
-                                                mcspi_dma->dma_tx_ch_name);
-
-       if (!mcspi_dma->dma_tx) {
+       mcspi_dma->dma_tx = dma_request_chan(&master->dev,
+                                            mcspi_dma->dma_tx_ch_name);
+       if (IS_ERR(mcspi_dma->dma_tx)) {
+               ret = PTR_ERR(mcspi_dma->dma_tx);
+               mcspi_dma->dma_tx = NULL;
                dma_release_channel(mcspi_dma->dma_rx);
                mcspi_dma->dma_rx = NULL;
-               goto no_dma;
        }
 
-       return 0;
-
 no_dma:
-       dev_warn(&spi->dev, "not using DMA for McSPI\n");
-       return -EAGAIN;
+       return ret;
 }
 
 static int omap2_mcspi_setup(struct spi_device *spi)
@@ -1039,8 +1025,9 @@ static int omap2_mcspi_setup(struct spi_device *spi)
 
        if (!mcspi_dma->dma_rx || !mcspi_dma->dma_tx) {
                ret = omap2_mcspi_request_dma(spi);
-               if (ret < 0 && ret != -EAGAIN)
-                       return ret;
+               if (ret)
+                       dev_warn(&spi->dev, "not using DMA for McSPI (%d)\n",
+                                ret);
        }
 
        ret = pm_runtime_get_sync(mcspi->dev);
@@ -1434,42 +1421,8 @@ static int omap2_mcspi_probe(struct platform_device *pdev)
        }
 
        for (i = 0; i < master->num_chipselect; i++) {
-               char *dma_rx_ch_name = mcspi->dma_channels[i].dma_rx_ch_name;
-               char *dma_tx_ch_name = mcspi->dma_channels[i].dma_tx_ch_name;
-               struct resource *dma_res;
-
-               sprintf(dma_rx_ch_name, "rx%d", i);
-               if (!pdev->dev.of_node) {
-                       dma_res =
-                               platform_get_resource_byname(pdev,
-                                                            IORESOURCE_DMA,
-                                                            dma_rx_ch_name);
-                       if (!dma_res) {
-                               dev_dbg(&pdev->dev,
-                                       "cannot get DMA RX channel\n");
-                               status = -ENODEV;
-                               break;
-                       }
-
-                       mcspi->dma_channels[i].dma_rx_sync_dev =
-                               dma_res->start;
-               }
-               sprintf(dma_tx_ch_name, "tx%d", i);
-               if (!pdev->dev.of_node) {
-                       dma_res =
-                               platform_get_resource_byname(pdev,
-                                                            IORESOURCE_DMA,
-                                                            dma_tx_ch_name);
-                       if (!dma_res) {
-                               dev_dbg(&pdev->dev,
-                                       "cannot get DMA TX channel\n");
-                               status = -ENODEV;
-                               break;
-                       }
-
-                       mcspi->dma_channels[i].dma_tx_sync_dev =
-                               dma_res->start;
-               }
+               sprintf(mcspi->dma_channels[i].dma_rx_ch_name, "rx%d", i);
+               sprintf(mcspi->dma_channels[i].dma_tx_ch_name, "tx%d", i);
        }
 
        if (status < 0)
diff --git a/drivers/spi/spi-pic32-sqi.c b/drivers/spi/spi-pic32-sqi.c
new file mode 100644 (file)
index 0000000..ca3c8d9
--- /dev/null
@@ -0,0 +1,727 @@
+/*
+ * PIC32 Quad SPI controller driver.
+ *
+ * Purna Chandra Mandal <purna.mandal@microchip.com>
+ * Copyright (c) 2016, Microchip Technology Inc.
+ *
+ * This program is free software; you can distribute it and/or modify it
+ * under the terms of the GNU General Public License (Version 2) as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+#include <linux/clk.h>
+#include <linux/dma-mapping.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/iopoll.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/spi/spi.h>
+
+/* SQI registers */
+#define PESQI_XIP_CONF1_REG    0x00
+#define PESQI_XIP_CONF2_REG    0x04
+#define PESQI_CONF_REG         0x08
+#define PESQI_CTRL_REG         0x0C
+#define PESQI_CLK_CTRL_REG     0x10
+#define PESQI_CMD_THRES_REG    0x14
+#define PESQI_INT_THRES_REG    0x18
+#define PESQI_INT_ENABLE_REG   0x1C
+#define PESQI_INT_STAT_REG     0x20
+#define PESQI_TX_DATA_REG      0x24
+#define PESQI_RX_DATA_REG      0x28
+#define PESQI_STAT1_REG                0x2C
+#define PESQI_STAT2_REG                0x30
+#define PESQI_BD_CTRL_REG      0x34
+#define PESQI_BD_CUR_ADDR_REG  0x38
+#define PESQI_BD_BASE_ADDR_REG 0x40
+#define PESQI_BD_STAT_REG      0x44
+#define PESQI_BD_POLL_CTRL_REG 0x48
+#define PESQI_BD_TX_DMA_STAT_REG       0x4C
+#define PESQI_BD_RX_DMA_STAT_REG       0x50
+#define PESQI_THRES_REG                0x54
+#define PESQI_INT_SIGEN_REG    0x58
+
+/* PESQI_CONF_REG fields */
+#define PESQI_MODE             0x7
+#define  PESQI_MODE_BOOT       0
+#define  PESQI_MODE_PIO                1
+#define  PESQI_MODE_DMA                2
+#define  PESQI_MODE_XIP                3
+#define PESQI_MODE_SHIFT       0
+#define PESQI_CPHA             BIT(3)
+#define PESQI_CPOL             BIT(4)
+#define PESQI_LSBF             BIT(5)
+#define PESQI_RXLATCH          BIT(7)
+#define PESQI_SERMODE          BIT(8)
+#define PESQI_WP_EN            BIT(9)
+#define PESQI_HOLD_EN          BIT(10)
+#define PESQI_BURST_EN         BIT(12)
+#define PESQI_CS_CTRL_HW       BIT(15)
+#define PESQI_SOFT_RESET       BIT(16)
+#define PESQI_LANES_SHIFT      20
+#define  PESQI_SINGLE_LANE     0
+#define  PESQI_DUAL_LANE       1
+#define  PESQI_QUAD_LANE       2
+#define PESQI_CSEN_SHIFT       24
+#define PESQI_EN               BIT(23)
+
+/* PESQI_CLK_CTRL_REG fields */
+#define PESQI_CLK_EN           BIT(0)
+#define PESQI_CLK_STABLE       BIT(1)
+#define PESQI_CLKDIV_SHIFT     8
+#define PESQI_CLKDIV           0xff
+
+/* PESQI_INT_THR/CMD_THR_REG */
+#define PESQI_TXTHR_MASK       0x1f
+#define PESQI_TXTHR_SHIFT      8
+#define PESQI_RXTHR_MASK       0x1f
+#define PESQI_RXTHR_SHIFT      0
+
+/* PESQI_INT_EN/INT_STAT/INT_SIG_EN_REG */
+#define PESQI_TXEMPTY          BIT(0)
+#define PESQI_TXFULL           BIT(1)
+#define PESQI_TXTHR            BIT(2)
+#define PESQI_RXEMPTY          BIT(3)
+#define PESQI_RXFULL           BIT(4)
+#define PESQI_RXTHR            BIT(5)
+#define PESQI_BDDONE           BIT(9)  /* BD processing complete */
+#define PESQI_PKTCOMP          BIT(10) /* packet processing complete */
+#define PESQI_DMAERR           BIT(11) /* error */
+
+/* PESQI_BD_CTRL_REG */
+#define PESQI_DMA_EN           BIT(0) /* enable DMA engine */
+#define PESQI_POLL_EN          BIT(1) /* enable polling */
+#define PESQI_BDP_START                BIT(2) /* start BD processor */
+
+/* PESQI controller buffer descriptor */
+struct buf_desc {
+       u32 bd_ctrl;    /* control */
+       u32 bd_status;  /* reserved */
+       u32 bd_addr;    /* DMA buffer addr */
+       u32 bd_nextp;   /* next item in chain */
+};
+
+/* bd_ctrl */
+#define BD_BUFLEN              0x1ff
+#define BD_CBD_INT_EN          BIT(16) /* Current BD is processed */
+#define BD_PKT_INT_EN          BIT(17) /* All BDs of PKT processed */
+#define BD_LIFM                        BIT(18) /* last data of pkt */
+#define BD_LAST                        BIT(19) /* end of list */
+#define BD_DATA_RECV           BIT(20) /* receive data */
+#define BD_DDR                 BIT(21) /* DDR mode */
+#define BD_DUAL                        BIT(22) /* Dual SPI */
+#define BD_QUAD                        BIT(23) /* Quad SPI */
+#define BD_LSBF                        BIT(25) /* LSB First */
+#define BD_STAT_CHECK          BIT(27) /* Status poll */
+#define BD_DEVSEL_SHIFT                28      /* CS */
+#define BD_CS_DEASSERT         BIT(30) /* de-assert CS after current BD */
+#define BD_EN                  BIT(31) /* BD owned by H/W */
+
+/**
+ * struct ring_desc - Representation of SQI ring descriptor
+ * @list:      list element to add to free or used list.
+ * @bd:                PESQI controller buffer descriptor
+ * @bd_dma:    DMA address of PESQI controller buffer descriptor
+ * @xfer_len:  transfer length
+ */
+struct ring_desc {
+       struct list_head list;
+       struct buf_desc *bd;
+       dma_addr_t bd_dma;
+       u32 xfer_len;
+};
+
+/* Global constants */
+#define PESQI_BD_BUF_LEN_MAX   256
+#define PESQI_BD_COUNT         256 /* max 64KB data per spi message */
+
+struct pic32_sqi {
+       void __iomem            *regs;
+       struct clk              *sys_clk;
+       struct clk              *base_clk; /* drives spi clock */
+       struct spi_master       *master;
+       int                     irq;
+       struct completion       xfer_done;
+       struct ring_desc        *ring;
+       void                    *bd;
+       dma_addr_t              bd_dma;
+       struct list_head        bd_list_free; /* free */
+       struct list_head        bd_list_used; /* allocated */
+       struct spi_device       *cur_spi;
+       u32                     cur_speed;
+       u8                      cur_mode;
+};
+
+static inline void pic32_setbits(void __iomem *reg, u32 set)
+{
+       writel(readl(reg) | set, reg);
+}
+
+static inline void pic32_clrbits(void __iomem *reg, u32 clr)
+{
+       writel(readl(reg) & ~clr, reg);
+}
+
+static int pic32_sqi_set_clk_rate(struct pic32_sqi *sqi, u32 sck)
+{
+       u32 val, div;
+
+       /* div = base_clk / (2 * spi_clk) */
+       div = clk_get_rate(sqi->base_clk) / (2 * sck);
+       div &= PESQI_CLKDIV;
+
+       val = readl(sqi->regs + PESQI_CLK_CTRL_REG);
+       /* apply new divider */
+       val &= ~(PESQI_CLK_STABLE | (PESQI_CLKDIV << PESQI_CLKDIV_SHIFT));
+       val |= div << PESQI_CLKDIV_SHIFT;
+       writel(val, sqi->regs + PESQI_CLK_CTRL_REG);
+
+       /* wait for stability */
+       return readl_poll_timeout(sqi->regs + PESQI_CLK_CTRL_REG, val,
+                                 val & PESQI_CLK_STABLE, 1, 5000);
+}
+
+static inline void pic32_sqi_enable_int(struct pic32_sqi *sqi)
+{
+       u32 mask = PESQI_DMAERR | PESQI_BDDONE | PESQI_PKTCOMP;
+
+       writel(mask, sqi->regs + PESQI_INT_ENABLE_REG);
+       /* INT_SIGEN works as interrupt-gate to INTR line */
+       writel(mask, sqi->regs + PESQI_INT_SIGEN_REG);
+}
+
+static inline void pic32_sqi_disable_int(struct pic32_sqi *sqi)
+{
+       writel(0, sqi->regs + PESQI_INT_ENABLE_REG);
+       writel(0, sqi->regs + PESQI_INT_SIGEN_REG);
+}
+
+static irqreturn_t pic32_sqi_isr(int irq, void *dev_id)
+{
+       struct pic32_sqi *sqi = dev_id;
+       u32 enable, status;
+
+       enable = readl(sqi->regs + PESQI_INT_ENABLE_REG);
+       status = readl(sqi->regs + PESQI_INT_STAT_REG);
+
+       /* check spurious interrupt */
+       if (!status)
+               return IRQ_NONE;
+
+       if (status & PESQI_DMAERR) {
+               enable = 0;
+               goto irq_done;
+       }
+
+       if (status & PESQI_TXTHR)
+               enable &= ~(PESQI_TXTHR | PESQI_TXFULL | PESQI_TXEMPTY);
+
+       if (status & PESQI_RXTHR)
+               enable &= ~(PESQI_RXTHR | PESQI_RXFULL | PESQI_RXEMPTY);
+
+       if (status & PESQI_BDDONE)
+               enable &= ~PESQI_BDDONE;
+
+       /* packet processing completed */
+       if (status & PESQI_PKTCOMP) {
+               /* mask all interrupts */
+               enable = 0;
+               /* complete transaction */
+               complete(&sqi->xfer_done);
+       }
+
+irq_done:
+       /* interrupts are sticky, so mask when handled */
+       writel(enable, sqi->regs + PESQI_INT_ENABLE_REG);
+
+       return IRQ_HANDLED;
+}
+
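+/* Take a descriptor from the free list and move it to the used list */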
+static struct ring_desc *ring_desc_get(struct pic32_sqi *sqi)
+{
+       struct ring_desc *rdesc;
+
+       if (list_empty(&sqi->bd_list_free))
+               return NULL;
+
+       rdesc = list_first_entry(&sqi->bd_list_free, struct ring_desc, list);
+       list_del(&rdesc->list);
+       list_add_tail(&rdesc->list, &sqi->bd_list_used);
+       return rdesc;
+}
+
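+/* Return a descriptor to the free list */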
+static void ring_desc_put(struct pic32_sqi *sqi, struct ring_desc *rdesc)
+{
+       list_del(&rdesc->list);
+       list_add(&rdesc->list, &sqi->bd_list_free);
+}
+
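+/*
+ * Map one spi_transfer onto hardware buffer descriptors, one per
+ * scatterlist entry of the active (rx or tx) buffer.
+ */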
+static int pic32_sqi_one_transfer(struct pic32_sqi *sqi,
+                                 struct spi_message *mesg,
+                                 struct spi_transfer *xfer)
+{
+       struct spi_device *spi = mesg->spi;
+       struct scatterlist *sg, *sgl;
+       struct ring_desc *rdesc;
+       struct buf_desc *bd;
+       int nents, i;
+       u32 bd_ctrl;
+       u32 nbits;
+
+       /* Device selection */
+       bd_ctrl = spi->chip_select << BD_DEVSEL_SHIFT;
+
+       /* half-duplex: select transfer buffer, direction and lane */
+       if (xfer->rx_buf) {
+               bd_ctrl |= BD_DATA_RECV;
+               nbits = xfer->rx_nbits;
+               sgl = xfer->rx_sg.sgl;
+               nents = xfer->rx_sg.nents;
+       } else {
+               nbits = xfer->tx_nbits;
+               sgl = xfer->tx_sg.sgl;
+               nents = xfer->tx_sg.nents;
+       }
+
+       if (nbits & SPI_NBITS_QUAD)
+               bd_ctrl |= BD_QUAD;
+       else if (nbits & SPI_NBITS_DUAL)
+               bd_ctrl |= BD_DUAL;
+
+       /* LSB first */
+       if (spi->mode & SPI_LSB_FIRST)
+               bd_ctrl |= BD_LSBF;
+
+       /* ownership to hardware */
+       bd_ctrl |= BD_EN;
+
+       for_each_sg(sgl, sg, nents, i) {
+               /* get ring descriptor */
+               rdesc = ring_desc_get(sqi);
+               if (!rdesc)
+                       break;
+
+               bd = rdesc->bd;
+
+               /* BD CTRL: length */
+               rdesc->xfer_len = sg_dma_len(sg);
+               bd->bd_ctrl = bd_ctrl;
+               bd->bd_ctrl |= rdesc->xfer_len;
+
+               /* BD STAT */
+               bd->bd_status = 0;
+
+               /* BD BUFFER ADDRESS */
+               bd->bd_addr = sg->dma_address;
+       }
+
+       return 0;
+}
+
+static int pic32_sqi_prepare_hardware(struct spi_master *master)
+{
+       struct pic32_sqi *sqi = spi_master_get_devdata(master);
+
+       /* enable spi interface */
+       pic32_setbits(sqi->regs + PESQI_CONF_REG, PESQI_EN);
+       /* enable spi clk */
+       pic32_setbits(sqi->regs + PESQI_CLK_CTRL_REG, PESQI_CLK_EN);
+
+       return 0;
+}
+
+static bool pic32_sqi_can_dma(struct spi_master *master,
+                             struct spi_device *spi,
+                             struct spi_transfer *x)
+{
+       /* Do DMA irrespective of transfer size */
+       return true;
+}
+
+static int pic32_sqi_one_message(struct spi_master *master,
+                                struct spi_message *msg)
+{
+       struct spi_device *spi = msg->spi;
+       struct ring_desc *rdesc, *next;
+       struct spi_transfer *xfer;
+       struct pic32_sqi *sqi;
+       int ret = 0, mode;
+       u32 val;
+
+       sqi = spi_master_get_devdata(master);
+
+       reinit_completion(&sqi->xfer_done);
+       msg->actual_length = 0;
+
+       /* We can't handle spi_transfer specific "speed_hz", "bits_per_word"
+        * and "delay_usecs". But spi_device specific speed and mode changes
+        * are best handled while switching chip-select.
+        */
+       if (sqi->cur_spi != spi) {
+               /* set spi speed */
+               if (sqi->cur_speed != spi->max_speed_hz) {
+                       sqi->cur_speed = spi->max_speed_hz;
+                       ret = pic32_sqi_set_clk_rate(sqi, spi->max_speed_hz);
+                       if (ret)
+                               dev_warn(&spi->dev, "set_clk, %d\n", ret);
+               }
+
+               /* set spi mode */
+               mode = spi->mode & (SPI_MODE_3 | SPI_LSB_FIRST);
+               if (sqi->cur_mode != mode) {
+                       val = readl(sqi->regs + PESQI_CONF_REG);
+                       val &= ~(PESQI_CPOL | PESQI_CPHA | PESQI_LSBF);
+                       if (mode & SPI_CPOL)
+                               val |= PESQI_CPOL;
+                       if (mode & SPI_LSB_FIRST)
+                               val |= PESQI_LSBF;
+                       val |= PESQI_CPHA;
+                       writel(val, sqi->regs + PESQI_CONF_REG);
+
+                       sqi->cur_mode = mode;
+               }
+               sqi->cur_spi = spi;
+       }
+
+       /* prepare hardware desc-list(BD) for transfer(s) */
+       list_for_each_entry(xfer, &msg->transfers, transfer_list) {
+               ret = pic32_sqi_one_transfer(sqi, msg, xfer);
+               if (ret) {
+                       dev_err(&spi->dev, "xfer %p err\n", xfer);
+                       goto xfer_out;
+               }
+       }
+
+       /* BDs are prepared and chained. Now mark LAST_BD, CS_DEASSERT at last
+        * element of the list.
+        */
+       rdesc = list_last_entry(&sqi->bd_list_used, struct ring_desc, list);
+       rdesc->bd->bd_ctrl |= BD_LAST | BD_CS_DEASSERT |
+                             BD_LIFM | BD_PKT_INT_EN;
+
+       /* set base address BD list for DMA engine */
+       rdesc = list_first_entry(&sqi->bd_list_used, struct ring_desc, list);
+       writel(rdesc->bd_dma, sqi->regs + PESQI_BD_BASE_ADDR_REG);
+
+       /* enable interrupt */
+       pic32_sqi_enable_int(sqi);
+
+       /* enable DMA engine */
+       val = PESQI_DMA_EN | PESQI_POLL_EN | PESQI_BDP_START;
+       writel(val, sqi->regs + PESQI_BD_CTRL_REG);
+
+       /* wait for xfer completion */
+       ret = wait_for_completion_timeout(&sqi->xfer_done, 5 * HZ);
+       if (ret <= 0) {
+               dev_err(&sqi->master->dev, "wait timed out/interrupted\n");
+               ret = -EIO;
+               msg->status = ret;
+       } else {
+               /* success */
+               msg->status = 0;
+               ret = 0;
+       }
+
+       /* disable DMA */
+       writel(0, sqi->regs + PESQI_BD_CTRL_REG);
+
+       pic32_sqi_disable_int(sqi);
+
+xfer_out:
+       list_for_each_entry_safe_reverse(rdesc, next,
+                                        &sqi->bd_list_used, list) {
+               /* Update total byte transferred */
+               msg->actual_length += rdesc->xfer_len;
+               /* release ring descr */
+               ring_desc_put(sqi, rdesc);
+       }
+       spi_finalize_current_message(spi->master);
+
+       return ret;
+}
+
+static int pic32_sqi_unprepare_hardware(struct spi_master *master)
+{
+       struct pic32_sqi *sqi = spi_master_get_devdata(master);
+
+       /* disable clk */
+       pic32_clrbits(sqi->regs + PESQI_CLK_CTRL_REG, PESQI_CLK_EN);
+       /* disable spi */
+       pic32_clrbits(sqi->regs + PESQI_CONF_REG, PESQI_EN);
+
+       return 0;
+}
+
+static int ring_desc_ring_alloc(struct pic32_sqi *sqi)
+{
+       struct ring_desc *rdesc;
+       struct buf_desc *bd;
+       int i;
+
+       /* allocate coherent DMAable memory for hardware buffer descriptors. */
+       sqi->bd = dma_zalloc_coherent(&sqi->master->dev,
+                                     sizeof(*bd) * PESQI_BD_COUNT,
+                                     &sqi->bd_dma, GFP_DMA32);
+       if (!sqi->bd) {
+               dev_err(&sqi->master->dev, "failed allocating dma buffer\n");
+               return -ENOMEM;
+       }
+
+       /* allocate software ring descriptors */
+       sqi->ring = kcalloc(PESQI_BD_COUNT, sizeof(*rdesc), GFP_KERNEL);
+       if (!sqi->ring) {
+               dma_free_coherent(&sqi->master->dev,
+                                 sizeof(*bd) * PESQI_BD_COUNT,
+                                 sqi->bd, sqi->bd_dma);
+               return -ENOMEM;
+       }
+
+       bd = (struct buf_desc *)sqi->bd;
+
+       INIT_LIST_HEAD(&sqi->bd_list_free);
+       INIT_LIST_HEAD(&sqi->bd_list_used);
+
+       /* initialize ring-desc */
+       for (i = 0, rdesc = sqi->ring; i < PESQI_BD_COUNT; i++, rdesc++) {
+               INIT_LIST_HEAD(&rdesc->list);
+               rdesc->bd = &bd[i];
+               rdesc->bd_dma = sqi->bd_dma + (void *)&bd[i] - (void *)bd;
+               list_add_tail(&rdesc->list, &sqi->bd_list_free);
+       }
+
+       /* Prepare BD: chain to next BD(s) */
+       for (i = 0, rdesc = sqi->ring; i < PESQI_BD_COUNT - 1; i++)
+               bd[i].bd_nextp = rdesc[i + 1].bd_dma;
+       bd[PESQI_BD_COUNT - 1].bd_nextp = 0;
+
+       return 0;
+}
+
+static void ring_desc_ring_free(struct pic32_sqi *sqi)
+{
+       dma_free_coherent(&sqi->master->dev,
+                         sizeof(struct buf_desc) * PESQI_BD_COUNT,
+                         sqi->bd, sqi->bd_dma);
+       kfree(sqi->ring);
+}
+
+static void pic32_sqi_hw_init(struct pic32_sqi *sqi)
+{
+       unsigned long flags;
+       u32 val;
+
+       /* A soft-reset of the PESQI controller triggers interrupts.
+        * We are not yet ready to handle them, so disable CPU
+        * interrupts for the time being.
+        */
+       local_irq_save(flags);
+
+       /* assert soft-reset */
+       writel(PESQI_SOFT_RESET, sqi->regs + PESQI_CONF_REG);
+
+       /* wait until clear */
+       readl_poll_timeout_atomic(sqi->regs + PESQI_CONF_REG, val,
+                                 !(val & PESQI_SOFT_RESET), 1, 5000);
+
+       /* disable all interrupts */
+       pic32_sqi_disable_int(sqi);
+
+       /* Now it is safe to enable back CPU interrupt */
+       local_irq_restore(flags);
+
+       /* tx and rx fifo interrupt threshold */
+       val = readl(sqi->regs + PESQI_CMD_THRES_REG);
+       val &= ~(PESQI_TXTHR_MASK << PESQI_TXTHR_SHIFT);
+       val &= ~(PESQI_RXTHR_MASK << PESQI_RXTHR_SHIFT);
+       val |= (1U << PESQI_TXTHR_SHIFT) | (1U << PESQI_RXTHR_SHIFT);
+       writel(val, sqi->regs + PESQI_CMD_THRES_REG);
+
+       val = readl(sqi->regs + PESQI_INT_THRES_REG);
+       val &= ~(PESQI_TXTHR_MASK << PESQI_TXTHR_SHIFT);
+       val &= ~(PESQI_RXTHR_MASK << PESQI_RXTHR_SHIFT);
+       val |= (1U << PESQI_TXTHR_SHIFT) | (1U << PESQI_RXTHR_SHIFT);
+       writel(val, sqi->regs + PESQI_INT_THRES_REG);
+
+       /* default configuration */
+       val = readl(sqi->regs + PESQI_CONF_REG);
+
+       /* set mode: DMA */
+       val &= ~PESQI_MODE;
+       val |= PESQI_MODE_DMA << PESQI_MODE_SHIFT;
+       writel(val, sqi->regs + PESQI_CONF_REG);
+
+       /* DATAEN - SQIID0-ID3 */
+       val |= PESQI_QUAD_LANE << PESQI_LANES_SHIFT;
+
+       /* burst/INCR4 enable */
+       val |= PESQI_BURST_EN;
+
+       /* CSEN - all CS */
+       val |= 3U << PESQI_CSEN_SHIFT;
+       writel(val, sqi->regs + PESQI_CONF_REG);
+
+       /* write poll count */
+       writel(0, sqi->regs + PESQI_BD_POLL_CTRL_REG);
+
+       sqi->cur_speed = 0;
+       sqi->cur_mode = -1;
+}
+
+static int pic32_sqi_probe(struct platform_device *pdev)
+{
+       struct spi_master *master;
+       struct pic32_sqi *sqi;
+       struct resource *reg;
+       int ret;
+
+       master = spi_alloc_master(&pdev->dev, sizeof(*sqi));
+       if (!master)
+               return -ENOMEM;
+
+       sqi = spi_master_get_devdata(master);
+       sqi->master = master;
+
+       reg = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       sqi->regs = devm_ioremap_resource(&pdev->dev, reg);
+       if (IS_ERR(sqi->regs)) {
+               ret = PTR_ERR(sqi->regs);
+               goto err_free_master;
+       }
+
+       /* irq */
+       sqi->irq = platform_get_irq(pdev, 0);
+       if (sqi->irq < 0) {
+               dev_err(&pdev->dev, "no irq found\n");
+               ret = sqi->irq;
+               goto err_free_master;
+       }
+
+       /* clocks */
+       sqi->sys_clk = devm_clk_get(&pdev->dev, "reg_ck");
+       if (IS_ERR(sqi->sys_clk)) {
+               ret = PTR_ERR(sqi->sys_clk);
+               dev_err(&pdev->dev, "no sys_clk ?\n");
+               goto err_free_master;
+       }
+
+       sqi->base_clk = devm_clk_get(&pdev->dev, "spi_ck");
+       if (IS_ERR(sqi->base_clk)) {
+               ret = PTR_ERR(sqi->base_clk);
+               dev_err(&pdev->dev, "no base clk ?\n");
+               goto err_free_master;
+       }
+
+       ret = clk_prepare_enable(sqi->sys_clk);
+       if (ret) {
+               dev_err(&pdev->dev, "sys clk enable failed\n");
+               goto err_free_master;
+       }
+
+       ret = clk_prepare_enable(sqi->base_clk);
+       if (ret) {
+               dev_err(&pdev->dev, "base clk enable failed\n");
+               clk_disable_unprepare(sqi->sys_clk);
+               goto err_free_master;
+       }
+
+       init_completion(&sqi->xfer_done);
+
+       /* initialize hardware */
+       pic32_sqi_hw_init(sqi);
+
+       /* allocate buffers & descriptors */
+       ret = ring_desc_ring_alloc(sqi);
+       if (ret) {
+               dev_err(&pdev->dev, "ring alloc failed\n");
+               goto err_disable_clk;
+       }
+
+       /* install irq handlers */
+       ret = request_irq(sqi->irq, pic32_sqi_isr, 0,
+                         dev_name(&pdev->dev), sqi);
+       if (ret < 0) {
+               dev_err(&pdev->dev, "request_irq(%d) failed\n", sqi->irq);
+               goto err_free_ring;
+       }
+
+       /* register master */
+       master->num_chipselect  = 2;
+       master->max_speed_hz    = clk_get_rate(sqi->base_clk);
+       master->dma_alignment   = 32;
+       master->max_dma_len     = PESQI_BD_BUF_LEN_MAX;
+       master->dev.of_node     = of_node_get(pdev->dev.of_node);
+       master->mode_bits       = SPI_MODE_3 | SPI_MODE_0 | SPI_TX_DUAL |
+                                 SPI_RX_DUAL | SPI_TX_QUAD | SPI_RX_QUAD;
+       master->flags           = SPI_MASTER_HALF_DUPLEX;
+       master->can_dma         = pic32_sqi_can_dma;
+       master->bits_per_word_mask      = SPI_BPW_RANGE_MASK(8, 32);
+       master->transfer_one_message    = pic32_sqi_one_message;
+       master->prepare_transfer_hardware       = pic32_sqi_prepare_hardware;
+       master->unprepare_transfer_hardware     = pic32_sqi_unprepare_hardware;
+
+       ret = devm_spi_register_master(&pdev->dev, master);
+       if (ret) {
+               dev_err(&master->dev, "failed registering spi master\n");
+               free_irq(sqi->irq, sqi);
+               goto err_free_ring;
+       }
+
+       platform_set_drvdata(pdev, sqi);
+
+       return 0;
+
+err_free_ring:
+       ring_desc_ring_free(sqi);
+
+err_disable_clk:
+       clk_disable_unprepare(sqi->base_clk);
+       clk_disable_unprepare(sqi->sys_clk);
+
+err_free_master:
+       spi_master_put(master);
+       return ret;
+}
+
+static int pic32_sqi_remove(struct platform_device *pdev)
+{
+       struct pic32_sqi *sqi = platform_get_drvdata(pdev);
+
+       /* release resources */
+       free_irq(sqi->irq, sqi);
+       ring_desc_ring_free(sqi);
+
+       /* disable clk */
+       clk_disable_unprepare(sqi->base_clk);
+       clk_disable_unprepare(sqi->sys_clk);
+
+       return 0;
+}
+
+static const struct of_device_id pic32_sqi_of_ids[] = {
+       {.compatible = "microchip,pic32mzda-sqi",},
+       {},
+};
+MODULE_DEVICE_TABLE(of, pic32_sqi_of_ids);
+
+static struct platform_driver pic32_sqi_driver = {
+       .driver = {
+               .name = "sqi-pic32",
+               .of_match_table = of_match_ptr(pic32_sqi_of_ids),
+       },
+       .probe = pic32_sqi_probe,
+       .remove = pic32_sqi_remove,
+};
+
+module_platform_driver(pic32_sqi_driver);
+
+MODULE_AUTHOR("Purna Chandra Mandal <purna.mandal@microchip.com>");
+MODULE_DESCRIPTION("Microchip SPI driver for PIC32 SQI controller.");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/spi/spi-pic32.c b/drivers/spi/spi-pic32.c
new file mode 100644 (file)
index 0000000..73db87f
--- /dev/null
@@ -0,0 +1,878 @@
+/*
+ * Microchip PIC32 SPI controller driver.
+ *
+ * Purna Chandra Mandal <purna.mandal@microchip.com>
+ * Copyright (c) 2016, Microchip Technology Inc.
+ *
+ * This program is free software; you can distribute it and/or modify it
+ * under the terms of the GNU General Public License (Version 2) as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+#include <linux/clk.h>
+#include <linux/clkdev.h>
+#include <linux/delay.h>
+#include <linux/dmaengine.h>
+#include <linux/dma-mapping.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/io.h>
+#include <linux/interrupt.h>
+#include <linux/of.h>
+#include <linux/of_irq.h>
+#include <linux/of_gpio.h>
+#include <linux/of_address.h>
+#include <linux/platform_device.h>
+#include <linux/spi/spi.h>
+
+/* SPI controller registers */
+struct pic32_spi_regs {
+       u32 ctrl;
+       u32 ctrl_clr;
+       u32 ctrl_set;
+       u32 ctrl_inv;
+       u32 status;
+       u32 status_clr;
+       u32 status_set;
+       u32 status_inv;
+       u32 buf;
+       u32 dontuse[3];
+       u32 baud;
+       u32 dontuse2[3];
+       u32 ctrl2;
+       u32 ctrl2_clr;
+       u32 ctrl2_set;
+       u32 ctrl2_inv;
+};
+
+/* Bit fields of SPI Control Register */
+#define CTRL_RX_INT_SHIFT      0  /* Rx interrupt generation */
+#define  RX_FIFO_EMPTY         0
+#define  RX_FIFO_NOT_EMPTY     1 /* not empty */
+#define  RX_FIFO_HALF_FULL     2 /* full by half or more */
+#define  RX_FIFO_FULL          3 /* completely full */
+
+#define CTRL_TX_INT_SHIFT      2  /* TX interrupt generation */
+#define  TX_FIFO_ALL_EMPTY     0 /* completely empty */
+#define  TX_FIFO_EMPTY         1 /* empty */
+#define  TX_FIFO_HALF_EMPTY    2 /* empty by half or more */
+#define  TX_FIFO_NOT_FULL      3 /* at least one empty slot */
+
+#define CTRL_MSTEN     BIT(5) /* enable master mode */
+#define CTRL_CKP       BIT(6) /* active low */
+#define CTRL_CKE       BIT(8) /* Tx on falling edge */
+#define CTRL_SMP       BIT(9) /* Rx at middle or end of tx */
+#define CTRL_BPW_MASK  0x03   /* bits per word/sample */
+#define CTRL_BPW_SHIFT 10
+#define  PIC32_BPW_8   0
+#define  PIC32_BPW_16  1
+#define  PIC32_BPW_32  2
+#define CTRL_SIDL      BIT(13) /* sleep when idle */
+#define CTRL_ON                BIT(15) /* enable macro */
+#define CTRL_ENHBUF    BIT(16) /* enable enhanced buffering */
+#define CTRL_MCLKSEL   BIT(23) /* select clock source */
+#define CTRL_MSSEN     BIT(28) /* macro driven /SS */
+#define CTRL_FRMEN     BIT(31) /* enable framing mode */
+
+/* Bit fields of SPI Status Register */
+#define STAT_RF_EMPTY  BIT(5) /* RX Fifo empty */
+#define STAT_RX_OV     BIT(6) /* err, s/w needs to clear */
+#define STAT_TX_UR     BIT(8) /* UR in Framed SPI modes */
+#define STAT_FRM_ERR   BIT(12) /* Multiple Frame Sync pulse */
+#define STAT_TF_LVL_MASK       0x1F
+#define STAT_TF_LVL_SHIFT      16
+#define STAT_RF_LVL_MASK       0x1F
+#define STAT_RF_LVL_SHIFT      24
+
+/* Bit fields of SPI Baud Register */
+#define BAUD_MASK              0x1ff
+
+/* Bit fields of SPI Control2 Register */
+#define CTRL2_TX_UR_EN         BIT(10) /* Enable int on Tx under-run */
+#define CTRL2_RX_OV_EN         BIT(11) /* Enable int on Rx over-run */
+#define CTRL2_FRM_ERR_EN       BIT(12) /* Enable frame err int */
+
+/* Minimum DMA transfer size */
+#define PIC32_DMA_LEN_MIN      64
+
+struct pic32_spi {
+       dma_addr_t              dma_base;
+       struct pic32_spi_regs __iomem *regs;
+       int                     fault_irq;
+       int                     rx_irq;
+       int                     tx_irq;
+       u32                     fifo_n_byte; /* FIFO depth in bytes */
+       struct clk              *clk;
+       struct spi_master       *master;
+       /* Current controller setting */
+       u32                     speed_hz; /* spi-clk rate */
+       u32                     mode;
+       u32                     bits_per_word;
+       u32                     fifo_n_elm; /* FIFO depth in words */
+#define PIC32F_DMA_PREP                0 /* DMA chnls configured */
+       unsigned long           flags;
+       /* Current transfer state */
+       struct completion       xfer_done;
+       /* PIO transfer specific */
+       const void              *tx;
+       const void              *tx_end;
+       const void              *rx;
+       const void              *rx_end;
+       int                     len;
+       void (*rx_fifo)(struct pic32_spi *);
+       void (*tx_fifo)(struct pic32_spi *);
+};
+
+static inline void pic32_spi_enable(struct pic32_spi *pic32s)
+{
+       writel(CTRL_ON | CTRL_SIDL, &pic32s->regs->ctrl_set);
+}
+
+static inline void pic32_spi_disable(struct pic32_spi *pic32s)
+{
+       writel(CTRL_ON | CTRL_SIDL, &pic32s->regs->ctrl_clr);
+
+       /* avoid SPI register read/write on the immediately following CPU clock */
+       ndelay(20);
+}
+
+static void pic32_spi_set_clk_rate(struct pic32_spi *pic32s, u32 spi_ck)
+{
+       u32 div;
+
+       /* div = (clk_in / (2 * spi_ck)) - 1 */
+       div = DIV_ROUND_CLOSEST(clk_get_rate(pic32s->clk), 2 * spi_ck) - 1;
+
+       writel(div & BAUD_MASK, &pic32s->regs->baud);
+}
+
+static inline u32 pic32_rx_fifo_level(struct pic32_spi *pic32s)
+{
+       u32 sr = readl(&pic32s->regs->status);
+
+       return (sr >> STAT_RF_LVL_SHIFT) & STAT_RF_LVL_MASK;
+}
+
+static inline u32 pic32_tx_fifo_level(struct pic32_spi *pic32s)
+{
+       u32 sr = readl(&pic32s->regs->status);
+
+       return (sr >> STAT_TF_LVL_SHIFT) & STAT_TF_LVL_MASK;
+}
+
+/* Return the max entries we can fill into tx fifo */
+static u32 pic32_tx_max(struct pic32_spi *pic32s, int n_bytes)
+{
+       u32 tx_left, tx_room, rxtx_gap;
+
+       tx_left = (pic32s->tx_end - pic32s->tx) / n_bytes;
+       tx_room = pic32s->fifo_n_elm - pic32_tx_fifo_level(pic32s);
+
+       /*
+        * Another concern is the tx/rx mismatch: we thought of using
+        * (pic32s->fifo_n_byte - rxfl - txfl) as the maximum for tx,
+        * but that doesn't cover data which has already left the tx/rx
+        * FIFOs and sits in the shift registers. So we bound it from
+        * the software point of view instead.
+        */
+       rxtx_gap = ((pic32s->rx_end - pic32s->rx) -
+                   (pic32s->tx_end - pic32s->tx)) / n_bytes;
+       return min3(tx_left, tx_room, (u32)(pic32s->fifo_n_elm - rxtx_gap));
+}
+
+/* Return the max entries we should read out of rx fifo */
+static u32 pic32_rx_max(struct pic32_spi *pic32s, int n_bytes)
+{
+       u32 rx_left = (pic32s->rx_end - pic32s->rx) / n_bytes;
+
+       return min_t(u32, rx_left, pic32_rx_fifo_level(pic32s));
+}
+
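+/*
+ * Generate FIFO drain/fill helpers for 8-, 16- and 32-bit word sizes.
+ * Rx data is dropped when no rx buffer was supplied; tx is padded with
+ * all-ones when no tx buffer was supplied.
+ */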
+#define BUILD_SPI_FIFO_RW(__name, __type, __bwl)               \
+static void pic32_spi_rx_##__name(struct pic32_spi *pic32s)    \
+{                                                              \
+       __type v;                                               \
+       u32 mx = pic32_rx_max(pic32s, sizeof(__type));          \
+       for (; mx; mx--) {                                      \
+               v = read##__bwl(&pic32s->regs->buf);            \
+               if (pic32s->rx_end - pic32s->len)               \
+                       *(__type *)(pic32s->rx) = v;            \
+               pic32s->rx += sizeof(__type);                   \
+       }                                                       \
+}                                                              \
+                                                               \
+static void pic32_spi_tx_##__name(struct pic32_spi *pic32s)    \
+{                                                              \
+       __type v;                                               \
+       u32 mx = pic32_tx_max(pic32s, sizeof(__type));          \
+       for (; mx ; mx--) {                                     \
+               v = (__type)~0U;                                \
+               if (pic32s->tx_end - pic32s->len)               \
+                       v = *(__type *)(pic32s->tx);            \
+               write##__bwl(v, &pic32s->regs->buf);            \
+               pic32s->tx += sizeof(__type);                   \
+       }                                                       \
+}
+
+BUILD_SPI_FIFO_RW(byte, u8, b);
+BUILD_SPI_FIFO_RW(word, u16, w);
+BUILD_SPI_FIFO_RW(dword, u32, l);
+
+static void pic32_err_stop(struct pic32_spi *pic32s, const char *msg)
+{
+       /* disable all interrupts */
+       disable_irq_nosync(pic32s->fault_irq);
+       disable_irq_nosync(pic32s->rx_irq);
+       disable_irq_nosync(pic32s->tx_irq);
+
+       /* Show err message and abort xfer with err */
+       dev_err(&pic32s->master->dev, "%s\n", msg);
+       if (pic32s->master->cur_msg)
+               pic32s->master->cur_msg->status = -EIO;
+       complete(&pic32s->xfer_done);
+}
+
+static irqreturn_t pic32_spi_fault_irq(int irq, void *dev_id)
+{
+       struct pic32_spi *pic32s = dev_id;
+       u32 status;
+
+       status = readl(&pic32s->regs->status);
+
+       /* Error handling */
+       if (status & (STAT_RX_OV | STAT_TX_UR)) {
+               writel(STAT_RX_OV, &pic32s->regs->status_clr);
+               writel(STAT_TX_UR, &pic32s->regs->status_clr);
+               pic32_err_stop(pic32s, "err_irq: fifo overrun/underrun");
+               return IRQ_HANDLED;
+       }
+
+       if (status & STAT_FRM_ERR) {
+               pic32_err_stop(pic32s, "err_irq: frame error");
+               return IRQ_HANDLED;
+       }
+
+       if (!pic32s->master->cur_msg) {
+               pic32_err_stop(pic32s, "err_irq: no mesg");
+               return IRQ_NONE;
+       }
+
+       return IRQ_NONE;
+}
+
+static irqreturn_t pic32_spi_rx_irq(int irq, void *dev_id)
+{
+       struct pic32_spi *pic32s = dev_id;
+
+       pic32s->rx_fifo(pic32s);
+
+       /* rx complete ? */
+       if (pic32s->rx_end == pic32s->rx) {
+               /* disable all interrupts */
+               disable_irq_nosync(pic32s->fault_irq);
+               disable_irq_nosync(pic32s->rx_irq);
+
+               /* complete current xfer */
+               complete(&pic32s->xfer_done);
+       }
+
+       return IRQ_HANDLED;
+}
+
+static irqreturn_t pic32_spi_tx_irq(int irq, void *dev_id)
+{
+       struct pic32_spi *pic32s = dev_id;
+
+       pic32s->tx_fifo(pic32s);
+
+       /* tx complete? disable tx interrupt */
+       if (pic32s->tx_end == pic32s->tx)
+               disable_irq_nosync(pic32s->tx_irq);
+
+       return IRQ_HANDLED;
+}
+
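+/*
+ * DMA completion callback hooked on the rx descriptor; rx finishing
+ * last marks the whole transfer as done.
+ */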
+static void pic32_spi_dma_rx_notify(void *data)
+{
+       struct pic32_spi *pic32s = data;
+
+       complete(&pic32s->xfer_done);
+}
+
+static int pic32_spi_dma_transfer(struct pic32_spi *pic32s,
+                                 struct spi_transfer *xfer)
+{
+       struct spi_master *master = pic32s->master;
+       struct dma_async_tx_descriptor *desc_rx;
+       struct dma_async_tx_descriptor *desc_tx;
+       dma_cookie_t cookie;
+       int ret;
+
+       if (!master->dma_rx || !master->dma_tx)
+               return -ENODEV;
+
+       desc_rx = dmaengine_prep_slave_sg(master->dma_rx,
+                                         xfer->rx_sg.sgl,
+                                         xfer->rx_sg.nents,
+                                         DMA_FROM_DEVICE,
+                                         DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+       if (!desc_rx) {
+               ret = -EINVAL;
+               goto err_dma;
+       }
+
+       desc_tx = dmaengine_prep_slave_sg(master->dma_tx,
+                                         xfer->tx_sg.sgl,
+                                         xfer->tx_sg.nents,
+                                         DMA_TO_DEVICE,
+                                         DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+       if (!desc_tx) {
+               ret = -EINVAL;
+               goto err_dma;
+       }
+
+       /* Put callback on the RX transfer, that should finish last */
+       desc_rx->callback = pic32_spi_dma_rx_notify;
+       desc_rx->callback_param = pic32s;
+
+       cookie = dmaengine_submit(desc_rx);
+       ret = dma_submit_error(cookie);
+       if (ret)
+               goto err_dma;
+
+       cookie = dmaengine_submit(desc_tx);
+       ret = dma_submit_error(cookie);
+       if (ret)
+               goto err_dma_tx;
+
+       dma_async_issue_pending(master->dma_rx);
+       dma_async_issue_pending(master->dma_tx);
+
+       return 0;
+
+err_dma_tx:
+       dmaengine_terminate_all(master->dma_rx);
+err_dma:
+       return ret;
+}
+
+static int pic32_spi_dma_config(struct pic32_spi *pic32s, u32 dma_width)
+{
+       int buf_offset = offsetof(struct pic32_spi_regs, buf);
+       struct spi_master *master = pic32s->master;
+       struct dma_slave_config cfg;
+       int ret;
+
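+       /* both rx and tx DMA target the controller's FIFO data register ("buf") */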
+       cfg.device_fc = true;
+       cfg.src_addr = pic32s->dma_base + buf_offset;
+       cfg.dst_addr = pic32s->dma_base + buf_offset;
+       cfg.src_maxburst = pic32s->fifo_n_elm / 2; /* fill one-half */
+       cfg.dst_maxburst = pic32s->fifo_n_elm / 2; /* drain one-half */
+       cfg.src_addr_width = dma_width;
+       cfg.dst_addr_width = dma_width;
+       /* tx channel */
+       cfg.slave_id = pic32s->tx_irq;
+       cfg.direction = DMA_MEM_TO_DEV;
+       ret = dmaengine_slave_config(master->dma_tx, &cfg);
+       if (ret) {
+               dev_err(&master->dev, "tx channel setup failed\n");
+               return ret;
+       }
+       /* rx channel */
+       cfg.slave_id = pic32s->rx_irq;
+       cfg.direction = DMA_DEV_TO_MEM;
+       ret = dmaengine_slave_config(master->dma_rx, &cfg);
+       if (ret)
+               dev_err(&master->dev, "rx channel setup failed\n");
+
+       return ret;
+}
+
+static int pic32_spi_set_word_size(struct pic32_spi *pic32s, u8 bits_per_word)
+{
+       enum dma_slave_buswidth dmawidth;
+       u32 buswidth, v;
+
+       switch (bits_per_word) {
+       case 8:
+               pic32s->rx_fifo = pic32_spi_rx_byte;
+               pic32s->tx_fifo = pic32_spi_tx_byte;
+               buswidth = PIC32_BPW_8;
+               dmawidth = DMA_SLAVE_BUSWIDTH_1_BYTE;
+               break;
+       case 16:
+               pic32s->rx_fifo = pic32_spi_rx_word;
+               pic32s->tx_fifo = pic32_spi_tx_word;
+               buswidth = PIC32_BPW_16;
+               dmawidth = DMA_SLAVE_BUSWIDTH_2_BYTES;
+               break;
+       case 32:
+               pic32s->rx_fifo = pic32_spi_rx_dword;
+               pic32s->tx_fifo = pic32_spi_tx_dword;
+               buswidth = PIC32_BPW_32;
+               dmawidth = DMA_SLAVE_BUSWIDTH_4_BYTES;
+               break;
+       default:
+               /* not supported */
+               return -EINVAL;
+       }
+
+       /* calculate maximum number of words fifos can hold */
+       pic32s->fifo_n_elm = DIV_ROUND_UP(pic32s->fifo_n_byte,
+                                         bits_per_word / 8);
+       /* set word size */
+       v = readl(&pic32s->regs->ctrl);
+       v &= ~(CTRL_BPW_MASK << CTRL_BPW_SHIFT);
+       v |= buswidth << CTRL_BPW_SHIFT;
+       writel(v, &pic32s->regs->ctrl);
+
+       /* re-configure dma width, if required */
+       if (test_bit(PIC32F_DMA_PREP, &pic32s->flags))
+               pic32_spi_dma_config(pic32s, dmawidth);
+
+       return 0;
+}
+
+static int pic32_spi_prepare_hardware(struct spi_master *master)
+{
+       struct pic32_spi *pic32s = spi_master_get_devdata(master);
+
+       pic32_spi_enable(pic32s);
+
+       return 0;
+}
+
+static int pic32_spi_prepare_message(struct spi_master *master,
+                                    struct spi_message *msg)
+{
+       struct pic32_spi *pic32s = spi_master_get_devdata(master);
+       struct spi_device *spi = msg->spi;
+       u32 val;
+
+       /* set device specific bits_per_word */
+       if (pic32s->bits_per_word != spi->bits_per_word) {
+               pic32_spi_set_word_size(pic32s, spi->bits_per_word);
+               pic32s->bits_per_word = spi->bits_per_word;
+       }
+
+       /* device specific speed change */
+       if (pic32s->speed_hz != spi->max_speed_hz) {
+               pic32_spi_set_clk_rate(pic32s, spi->max_speed_hz);
+               pic32s->speed_hz = spi->max_speed_hz;
+       }
+
+       /* device specific mode change */
+       if (pic32s->mode != spi->mode) {
+               val = readl(&pic32s->regs->ctrl);
+               /* active low */
+               if (spi->mode & SPI_CPOL)
+                       val |= CTRL_CKP;
+               else
+                       val &= ~CTRL_CKP;
+               /* tx on rising edge */
+               if (spi->mode & SPI_CPHA)
+                       val &= ~CTRL_CKE;
+               else
+                       val |= CTRL_CKE;
+
+               /* rx at end of tx */
+               val |= CTRL_SMP;
+               writel(val, &pic32s->regs->ctrl);
+               pic32s->mode = spi->mode;
+       }
+
+       return 0;
+}
+
+static bool pic32_spi_can_dma(struct spi_master *master,
+                             struct spi_device *spi,
+                             struct spi_transfer *xfer)
+{
+       struct pic32_spi *pic32s = spi_master_get_devdata(master);
+
+       /* skip DMA for small transfers to avoid the setup overhead */
+       return (xfer->len >= PIC32_DMA_LEN_MIN) &&
+              test_bit(PIC32F_DMA_PREP, &pic32s->flags);
+}
+
+static int pic32_spi_one_transfer(struct spi_master *master,
+                                 struct spi_device *spi,
+                                 struct spi_transfer *transfer)
+{
+       struct pic32_spi *pic32s;
+       bool dma_issued = false;
+       int ret;
+
+       pic32s = spi_master_get_devdata(master);
+
+       /* handle transfer specific word size change */
+       if (transfer->bits_per_word &&
+           (transfer->bits_per_word != pic32s->bits_per_word)) {
+               ret = pic32_spi_set_word_size(pic32s, transfer->bits_per_word);
+               if (ret)
+                       return ret;
+               pic32s->bits_per_word = transfer->bits_per_word;
+       }
+
+       /* handle transfer specific speed change */
+       if (transfer->speed_hz && (transfer->speed_hz != pic32s->speed_hz)) {
+               pic32_spi_set_clk_rate(pic32s, transfer->speed_hz);
+               pic32s->speed_hz = transfer->speed_hz;
+       }
+
+       reinit_completion(&pic32s->xfer_done);
+
+       /* transact by DMA mode */
+       if (transfer->rx_sg.nents && transfer->tx_sg.nents) {
+               ret = pic32_spi_dma_transfer(pic32s, transfer);
+               if (ret) {
+                       dev_err(&spi->dev, "dma submit error\n");
+                       return ret;
+               }
+
+               /* DMA issued */
+               dma_issued = true;
+       } else {
+               /* set current transfer information */
+               pic32s->tx = (const void *)transfer->tx_buf;
+               pic32s->rx = (const void *)transfer->rx_buf;
+               pic32s->tx_end = pic32s->tx + transfer->len;
+               pic32s->rx_end = pic32s->rx + transfer->len;
+               pic32s->len = transfer->len;
+
+               /* transact by interrupt driven PIO */
+               enable_irq(pic32s->fault_irq);
+               enable_irq(pic32s->rx_irq);
+               enable_irq(pic32s->tx_irq);
+       }
+
+       /* wait for completion */
+       ret = wait_for_completion_timeout(&pic32s->xfer_done, 2 * HZ);
+       if (ret <= 0) {
+               dev_err(&spi->dev, "wait error/timed out\n");
+               if (dma_issued) {
+                       dmaengine_terminate_all(master->dma_rx);
+                       dmaengine_terminate_all(master->dma_tx);
+               }
+               ret = -ETIMEDOUT;
+       } else {
+               ret = 0;
+       }
+
+       return ret;
+}
+
+static int pic32_spi_unprepare_message(struct spi_master *master,
+                                      struct spi_message *msg)
+{
+       /* nothing to do */
+       return 0;
+}
+
+static int pic32_spi_unprepare_hardware(struct spi_master *master)
+{
+       struct pic32_spi *pic32s = spi_master_get_devdata(master);
+
+       pic32_spi_disable(pic32s);
+
+       return 0;
+}
+
+/* This may be called multiple times by same spi dev */
+static int pic32_spi_setup(struct spi_device *spi)
+{
+       if (!spi->max_speed_hz) {
+               dev_err(&spi->dev, "No max speed HZ parameter\n");
+               return -EINVAL;
+       }
+
+       /* PIC32 spi controller can drive /CS during transfer depending
+        * on tx fifo fill-level. /CS will stay asserted as long as TX
+        * fifo is non-empty, else will be deasserted indicating
+        * completion of the ongoing transfer. This might result in
+        * unreliable/erroneous SPI transactions.
+        * To avoid that we will always handle /CS by toggling GPIO.
+        */
+       if (!gpio_is_valid(spi->cs_gpio))
+               return -EINVAL;
+
+       gpio_direction_output(spi->cs_gpio, !(spi->mode & SPI_CS_HIGH));
+
+       return 0;
+}
+
+static void pic32_spi_cleanup(struct spi_device *spi)
+{
+       /* de-activate cs-gpio */
+       gpio_direction_output(spi->cs_gpio, !(spi->mode & SPI_CS_HIGH));
+}
+
+static void pic32_spi_dma_prep(struct pic32_spi *pic32s, struct device *dev)
+{
+       struct spi_master *master = pic32s->master;
+       dma_cap_mask_t mask;
+
+       dma_cap_zero(mask);
+       dma_cap_set(DMA_SLAVE, mask);
+
+       master->dma_rx = dma_request_slave_channel_compat(mask, NULL, NULL,
+                                                         dev, "spi-rx");
+       if (!master->dma_rx) {
+               dev_warn(dev, "RX channel not found.\n");
+               goto out_err;
+       }
+
+       master->dma_tx = dma_request_slave_channel_compat(mask, NULL, NULL,
+                                                         dev, "spi-tx");
+       if (!master->dma_tx) {
+               dev_warn(dev, "TX channel not found.\n");
+               goto out_err;
+       }
+
+       if (pic32_spi_dma_config(pic32s, DMA_SLAVE_BUSWIDTH_1_BYTE))
+               goto out_err;
+
+       /* DMA chnls allocated and prepared */
+       set_bit(PIC32F_DMA_PREP, &pic32s->flags);
+
+       return;
+
+out_err:
+       if (master->dma_rx)
+               dma_release_channel(master->dma_rx);
+
+       if (master->dma_tx)
+               dma_release_channel(master->dma_tx);
+}
+
+static void pic32_spi_dma_unprep(struct pic32_spi *pic32s)
+{
+       if (!test_bit(PIC32F_DMA_PREP, &pic32s->flags))
+               return;
+
+       clear_bit(PIC32F_DMA_PREP, &pic32s->flags);
+       if (pic32s->master->dma_rx)
+               dma_release_channel(pic32s->master->dma_rx);
+
+       if (pic32s->master->dma_tx)
+               dma_release_channel(pic32s->master->dma_tx);
+}
+
+static void pic32_spi_hw_init(struct pic32_spi *pic32s)
+{
+       u32 ctrl;
+
+       /* disable hardware */
+       pic32_spi_disable(pic32s);
+
+       ctrl = readl(&pic32s->regs->ctrl);
+       /* enable enhanced fifo, 128 bits (16 bytes) deep */
+       ctrl |= CTRL_ENHBUF;
+       pic32s->fifo_n_byte = 16;
+
+       /* disable framing mode */
+       ctrl &= ~CTRL_FRMEN;
+
+       /* enable master mode while disabled */
+       ctrl |= CTRL_MSTEN;
+
+       /* set tx fifo threshold interrupt */
+       ctrl &= ~(0x3 << CTRL_TX_INT_SHIFT);
+       ctrl |= (TX_FIFO_HALF_EMPTY << CTRL_TX_INT_SHIFT);
+
+       /* set rx fifo threshold interrupt */
+       ctrl &= ~(0x3 << CTRL_RX_INT_SHIFT);
+       ctrl |= (RX_FIFO_NOT_EMPTY << CTRL_RX_INT_SHIFT);
+
+       /* select clk source */
+       ctrl &= ~CTRL_MCLKSEL;
+
+       /* set manual /CS mode */
+       ctrl &= ~CTRL_MSSEN;
+
+       writel(ctrl, &pic32s->regs->ctrl);
+
+       /* enable error reporting */
+       ctrl = CTRL2_TX_UR_EN | CTRL2_RX_OV_EN | CTRL2_FRM_ERR_EN;
+       writel(ctrl, &pic32s->regs->ctrl2_set);
+}
+
+static int pic32_spi_hw_probe(struct platform_device *pdev,
+                             struct pic32_spi *pic32s)
+{
+       struct resource *mem;
+       int ret;
+
+       mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       pic32s->regs = devm_ioremap_resource(&pdev->dev, mem);
+       if (IS_ERR(pic32s->regs))
+               return PTR_ERR(pic32s->regs);
+
+       pic32s->dma_base = mem->start;
+
+       /* get irq resources: err-irq, rx-irq, tx-irq */
+       pic32s->fault_irq = platform_get_irq_byname(pdev, "fault");
+       if (pic32s->fault_irq < 0) {
+               dev_err(&pdev->dev, "fault-irq not found\n");
+               return pic32s->fault_irq;
+       }
+
+       pic32s->rx_irq = platform_get_irq_byname(pdev, "rx");
+       if (pic32s->rx_irq < 0) {
+               dev_err(&pdev->dev, "rx-irq not found\n");
+               return pic32s->rx_irq;
+       }
+
+       pic32s->tx_irq = platform_get_irq_byname(pdev, "tx");
+       if (pic32s->tx_irq < 0) {
+               dev_err(&pdev->dev, "tx-irq not found\n");
+               return pic32s->tx_irq;
+       }
+
+       /* get clock */
+       pic32s->clk = devm_clk_get(&pdev->dev, "mck0");
+       if (IS_ERR(pic32s->clk)) {
+               dev_err(&pdev->dev, "clk not found\n");
+               ret = PTR_ERR(pic32s->clk);
+               goto err_unmap_mem;
+       }
+
+       ret = clk_prepare_enable(pic32s->clk);
+       if (ret)
+               goto err_unmap_mem;
+
+       pic32_spi_hw_init(pic32s);
+
+       return 0;
+
+err_unmap_mem:
+       dev_err(&pdev->dev, "%s failed, err %d\n", __func__, ret);
+       return ret;
+}
+
+static int pic32_spi_probe(struct platform_device *pdev)
+{
+       struct spi_master *master;
+       struct pic32_spi *pic32s;
+       int ret;
+
+       master = spi_alloc_master(&pdev->dev, sizeof(*pic32s));
+       if (!master)
+               return -ENOMEM;
+
+       pic32s = spi_master_get_devdata(master);
+       pic32s->master = master;
+
+       ret = pic32_spi_hw_probe(pdev, pic32s);
+       if (ret)
+               goto err_master;
+
+       master->dev.of_node     = of_node_get(pdev->dev.of_node);
+       master->mode_bits       = SPI_MODE_3 | SPI_MODE_0 | SPI_CS_HIGH;
+       master->num_chipselect  = 1; /* single chip-select */
+       master->max_speed_hz    = clk_get_rate(pic32s->clk);
+       master->setup           = pic32_spi_setup;
+       master->cleanup         = pic32_spi_cleanup;
+       master->flags           = SPI_MASTER_MUST_TX | SPI_MASTER_MUST_RX;
+       master->bits_per_word_mask      = SPI_BPW_MASK(8) | SPI_BPW_MASK(16) |
+                                         SPI_BPW_MASK(32);
+       master->transfer_one            = pic32_spi_one_transfer;
+       master->prepare_message         = pic32_spi_prepare_message;
+       master->unprepare_message       = pic32_spi_unprepare_message;
+       master->prepare_transfer_hardware       = pic32_spi_prepare_hardware;
+       master->unprepare_transfer_hardware     = pic32_spi_unprepare_hardware;
+
+       /* optional DMA support */
+       pic32_spi_dma_prep(pic32s, &pdev->dev);
+       if (test_bit(PIC32F_DMA_PREP, &pic32s->flags))
+               master->can_dma = pic32_spi_can_dma;
+
+       init_completion(&pic32s->xfer_done);
+       pic32s->mode = -1;
+
+       /* install irq handlers (with irq-disabled) */
+       irq_set_status_flags(pic32s->fault_irq, IRQ_NOAUTOEN);
+       ret = devm_request_irq(&pdev->dev, pic32s->fault_irq,
+                              pic32_spi_fault_irq, IRQF_NO_THREAD,
+                              dev_name(&pdev->dev), pic32s);
+       if (ret < 0) {
+               dev_err(&pdev->dev, "request fault-irq %d\n", pic32s->fault_irq);
+               goto err_bailout;
+       }
+
+       /* receive interrupt handler */
+       irq_set_status_flags(pic32s->rx_irq, IRQ_NOAUTOEN);
+       ret = devm_request_irq(&pdev->dev, pic32s->rx_irq,
+                              pic32_spi_rx_irq, IRQF_NO_THREAD,
+                              dev_name(&pdev->dev), pic32s);
+       if (ret < 0) {
+               dev_err(&pdev->dev, "request rx-irq %d\n", pic32s->rx_irq);
+               goto err_bailout;
+       }
+
+       /* transmit interrupt handler */
+       irq_set_status_flags(pic32s->tx_irq, IRQ_NOAUTOEN);
+       ret = devm_request_irq(&pdev->dev, pic32s->tx_irq,
+                              pic32_spi_tx_irq, IRQF_NO_THREAD,
+                              dev_name(&pdev->dev), pic32s);
+       if (ret < 0) {
+               dev_err(&pdev->dev, "request tx-irq %d\n", pic32s->tx_irq);
+               goto err_bailout;
+       }
+
+       /* register master */
+       ret = devm_spi_register_master(&pdev->dev, master);
+       if (ret) {
+               dev_err(&master->dev, "failed registering spi master\n");
+               goto err_bailout;
+       }
+
+       platform_set_drvdata(pdev, pic32s);
+
+       return 0;
+
+err_bailout:
+       clk_disable_unprepare(pic32s->clk);
+err_master:
+       spi_master_put(master);
+       return ret;
+}
+
+static int pic32_spi_remove(struct platform_device *pdev)
+{
+       struct pic32_spi *pic32s;
+
+       pic32s = platform_get_drvdata(pdev);
+       pic32_spi_disable(pic32s);
+       clk_disable_unprepare(pic32s->clk);
+       pic32_spi_dma_unprep(pic32s);
+
+       return 0;
+}
+
+static const struct of_device_id pic32_spi_of_match[] = {
+       {.compatible = "microchip,pic32mzda-spi",},
+       {},
+};
+MODULE_DEVICE_TABLE(of, pic32_spi_of_match);
+
+static struct platform_driver pic32_spi_driver = {
+       .driver = {
+               .name = "spi-pic32",
+               .of_match_table = of_match_ptr(pic32_spi_of_match),
+       },
+       .probe = pic32_spi_probe,
+       .remove = pic32_spi_remove,
+};
+
+module_platform_driver(pic32_spi_driver);
+
+MODULE_AUTHOR("Purna Chandra Mandal <purna.mandal@microchip.com>");
+MODULE_DESCRIPTION("Microchip SPI driver for PIC32 SPI controller.");
+MODULE_LICENSE("GPL v2");
index 365fc22..a18a03d 100644 (file)
@@ -33,12 +33,10 @@ static int pxa2xx_spi_map_dma_buffer(struct driver_data *drv_data,
                dmadev = drv_data->tx_chan->device->dev;
                sgt = &drv_data->tx_sgt;
                buf = drv_data->tx;
-               drv_data->tx_map_len = len;
        } else {
                dmadev = drv_data->rx_chan->device->dev;
                sgt = &drv_data->rx_sgt;
                buf = drv_data->rx;
-               drv_data->rx_map_len = len;
        }
 
        nents = DIV_ROUND_UP(len, SZ_2K);
@@ -55,11 +53,7 @@ static int pxa2xx_spi_map_dma_buffer(struct driver_data *drv_data,
        for_each_sg(sgt->sgl, sg, sgt->nents, i) {
                size_t bytes = min_t(size_t, len, SZ_2K);
 
-               if (buf)
-                       sg_set_buf(sg, pbuf, bytes);
-               else
-                       sg_set_buf(sg, drv_data->dummy, bytes);
-
+               sg_set_buf(sg, pbuf, bytes);
                pbuf += bytes;
                len -= bytes;
        }
@@ -133,9 +127,6 @@ static void pxa2xx_spi_dma_transfer_complete(struct driver_data *drv_data,
                if (!error) {
                        pxa2xx_spi_unmap_dma_buffers(drv_data);
 
-                       drv_data->tx += drv_data->tx_map_len;
-                       drv_data->rx += drv_data->rx_map_len;
-
                        msg->actual_length += drv_data->len;
                        msg->state = pxa2xx_spi_next_transfer(drv_data);
                } else {
@@ -267,19 +258,22 @@ irqreturn_t pxa2xx_spi_dma_transfer(struct driver_data *drv_data)
 int pxa2xx_spi_dma_prepare(struct driver_data *drv_data, u32 dma_burst)
 {
        struct dma_async_tx_descriptor *tx_desc, *rx_desc;
+       int err = 0;
 
        tx_desc = pxa2xx_spi_dma_prepare_one(drv_data, DMA_MEM_TO_DEV);
        if (!tx_desc) {
                dev_err(&drv_data->pdev->dev,
                        "failed to get DMA TX descriptor\n");
-               return -EBUSY;
+               err = -EBUSY;
+               goto err_tx;
        }
 
        rx_desc = pxa2xx_spi_dma_prepare_one(drv_data, DMA_DEV_TO_MEM);
        if (!rx_desc) {
                dev_err(&drv_data->pdev->dev,
                        "failed to get DMA RX descriptor\n");
-               return -EBUSY;
+               err = -EBUSY;
+               goto err_rx;
        }
 
        /* We are ready when RX completes */
@@ -289,6 +283,12 @@ int pxa2xx_spi_dma_prepare(struct driver_data *drv_data, u32 dma_burst)
        dmaengine_submit(rx_desc);
        dmaengine_submit(tx_desc);
        return 0;
+
+err_rx:
+       dmaengine_terminate_async(drv_data->tx_chan);
+err_tx:
+       pxa2xx_spi_unmap_dma_buffers(drv_data);
+       return err;
 }
 
 void pxa2xx_spi_dma_start(struct driver_data *drv_data)
@@ -308,10 +308,6 @@ int pxa2xx_spi_dma_setup(struct driver_data *drv_data)
        dma_cap_zero(mask);
        dma_cap_set(DMA_SLAVE, mask);
 
-       drv_data->dummy = devm_kzalloc(dev, SZ_2K, GFP_KERNEL);
-       if (!drv_data->dummy)
-               return -ENOMEM;
-
        drv_data->tx_chan = dma_request_slave_channel_compat(mask,
                                pdata->dma_filter, pdata->tx_param, dev, "tx");
        if (!drv_data->tx_chan)
index 4fd7f98..5202de9 100644 (file)
@@ -173,8 +173,8 @@ static int pxa2xx_spi_pci_probe(struct pci_dev *dev,
        ssp->type = c->type;
 
        snprintf(buf, sizeof(buf), "pxa2xx-spi.%d", ssp->port_id);
-       ssp->clk = clk_register_fixed_rate(&dev->dev, buf , NULL,
-                                       CLK_IS_ROOT, c->max_clk_rate);
+       ssp->clk = clk_register_fixed_rate(&dev->dev, buf , NULL, 0,
+                                          c->max_clk_rate);
         if (IS_ERR(ssp->clk))
                return PTR_ERR(ssp->clk);
 
index 86138e4..fe07c05 100644 (file)
@@ -570,9 +570,8 @@ static void giveback(struct driver_data *drv_data)
                /* see if the next and current messages point
                 * to the same chip
                 */
-               if (next_msg && next_msg->spi != msg->spi)
-                       next_msg = NULL;
-               if (!next_msg || msg->state == ERROR_STATE)
+               if ((next_msg && next_msg->spi != msg->spi) ||
+                   msg->state == ERROR_STATE)
                        cs_deassert(drv_data);
        }
 
@@ -928,6 +927,7 @@ static void pump_transfers(unsigned long data)
        u32 dma_thresh = drv_data->cur_chip->dma_threshold;
        u32 dma_burst = drv_data->cur_chip->dma_burst_size;
        u32 change_mask = pxa2xx_spi_get_ssrc1_change_mask(drv_data);
+       int err;
 
        /* Get current state information */
        message = drv_data->cur_msg;
@@ -1047,7 +1047,12 @@ static void pump_transfers(unsigned long data)
                /* Ensure we have the correct interrupt handler */
                drv_data->transfer_handler = pxa2xx_spi_dma_transfer;
 
-               pxa2xx_spi_dma_prepare(drv_data, dma_burst);
+               err = pxa2xx_spi_dma_prepare(drv_data, dma_burst);
+               if (err) {
+                       message->status = err;
+                       giveback(drv_data);
+                       return;
+               }
 
                /* Clear status and start DMA engine */
                cr1 = chip->cr1 | dma_thresh | drv_data->dma_cr1;
@@ -1543,7 +1548,6 @@ static int pxa2xx_spi_probe(struct platform_device *pdev)
        drv_data->pdev = pdev;
        drv_data->ssp = ssp;
 
-       master->dev.parent = &pdev->dev;
        master->dev.of_node = pdev->dev.of_node;
        /* the spi->mode bits understood by this driver: */
        master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH | SPI_LOOP;
@@ -1556,6 +1560,7 @@ static int pxa2xx_spi_probe(struct platform_device *pdev)
        master->unprepare_transfer_hardware = pxa2xx_spi_unprepare_transfer;
        master->fw_translate_cs = pxa2xx_spi_fw_translate_cs;
        master->auto_runtime_pm = true;
+       master->flags = SPI_MASTER_MUST_RX | SPI_MASTER_MUST_TX;
 
        drv_data->ssp_type = ssp->type;
 
index a1ef889..e6b0900 100644 (file)
@@ -56,7 +56,6 @@ struct driver_data {
        struct sg_table tx_sgt;
        int rx_nents;
        int tx_nents;
-       void *dummy;
        atomic_t dma_running;
 
        /* Current message transfer state info */
@@ -69,8 +68,6 @@ struct driver_data {
        void *rx;
        void *rx_end;
        int dma_mapped;
-       size_t rx_map_len;
-       size_t tx_map_len;
        u8 n_bytes;
        int (*write)(struct driver_data *drv_data);
        int (*read)(struct driver_data *drv_data);
index 810a7fa..c338ef1 100644 (file)
@@ -937,6 +937,10 @@ static int spi_qup_pm_suspend_runtime(struct device *device)
        config = readl(controller->base + QUP_CONFIG);
        config |= QUP_CONFIG_CLOCK_AUTO_GATE;
        writel_relaxed(config, controller->base + QUP_CONFIG);
+
+       clk_disable_unprepare(controller->cclk);
+       clk_disable_unprepare(controller->iclk);
+
        return 0;
 }
 
@@ -945,6 +949,15 @@ static int spi_qup_pm_resume_runtime(struct device *device)
        struct spi_master *master = dev_get_drvdata(device);
        struct spi_qup *controller = spi_master_get_devdata(master);
        u32 config;
+       int ret;
+
+       ret = clk_prepare_enable(controller->iclk);
+       if (ret)
+               return ret;
+
+       ret = clk_prepare_enable(controller->cclk);
+       if (ret)
+               return ret;
 
        /* Disable clocks auto gaiting */
        config = readl_relaxed(controller->base + QUP_CONFIG);
@@ -1017,6 +1030,8 @@ static int spi_qup_remove(struct platform_device *pdev)
 
        pm_runtime_put_noidle(&pdev->dev);
        pm_runtime_disable(&pdev->dev);
+       spi_master_put(master);
+
        return 0;
 }
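
The spi-qup hunks make the runtime-PM callbacks actually gate the controller clocks: suspend sets the auto-gate bit and then drops cclk and iclk, resume re-enables iclk then cclk before clearing it, and remove() now also drops the spi_master reference it holds. A small sketch of that enable/disable ordering, using stub clock helpers in place of the real clk API:

/* Illustrative only: runtime-PM style clock ordering, not the QUP driver. */
#include <stdio.h>

static int clk_on(const char *name)  { printf("enable  %s\n", name); return 0; }
static void clk_off(const char *name) { printf("disable %s\n", name); }

static int runtime_resume(void)
{
	int ret;

	ret = clk_on("iclk");	/* interface clock first */
	if (ret)
		return ret;

	ret = clk_on("cclk");	/* then the core clock */
	if (ret)
		return ret;

	printf("clear auto-gating, controller usable again\n");
	return 0;
}

static void runtime_suspend(void)
{
	printf("set auto-gating\n");
	clk_off("cclk");	/* reverse order of resume */
	clk_off("iclk");
}

int main(void)
{
	if (runtime_resume() == 0)
		runtime_suspend();
	return 0;
}
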
 
index 6c6c001..cd89682 100644
@@ -744,10 +744,8 @@ static int rockchip_spi_probe(struct platform_device *pdev)
        rs->dma_rx.ch = dma_request_chan(rs->dev, "rx");
        if (IS_ERR(rs->dma_rx.ch)) {
                if (PTR_ERR(rs->dma_rx.ch) == -EPROBE_DEFER) {
-                       dma_release_channel(rs->dma_tx.ch);
-                       rs->dma_tx.ch = NULL;
                        ret = -EPROBE_DEFER;
-                       goto err_get_fifo_len;
+                       goto err_free_dma_tx;
                }
                dev_warn(rs->dev, "Failed to request RX DMA channel\n");
                rs->dma_rx.ch = NULL;
@@ -775,10 +773,11 @@ static int rockchip_spi_probe(struct platform_device *pdev)
 
 err_register_master:
        pm_runtime_disable(&pdev->dev);
-       if (rs->dma_tx.ch)
-               dma_release_channel(rs->dma_tx.ch);
        if (rs->dma_rx.ch)
                dma_release_channel(rs->dma_rx.ch);
+err_free_dma_tx:
+       if (rs->dma_tx.ch)
+               dma_release_channel(rs->dma_tx.ch);
 err_get_fifo_len:
        clk_disable_unprepare(rs->spiclk);
 err_spiclk_enable:
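
The rockchip change replaces the open-coded release of the TX channel with a dedicated err_free_dma_tx label, so an -EPROBE_DEFER on the RX channel unwinds through it and releases the TX channel acquired earlier; the error labels then sit in reverse order of acquisition. A generic sketch of that goto ladder, with placeholder resources rather than DMA channels:

/* Illustrative only: goto-ladder unwinding in reverse acquisition order. */
#include <stdio.h>

static int acquire(const char *name, int fail)
{
	if (fail) {
		printf("acquire %s failed\n", name);
		return -1;
	}
	printf("acquired %s\n", name);
	return 0;
}

static void release(const char *name) { printf("released %s\n", name); }

static int probe(int fail_second)
{
	int ret;

	ret = acquire("tx-dma", 0);
	if (ret)
		goto err_out;

	ret = acquire("rx-dma", fail_second);
	if (ret)
		goto err_free_tx;	/* later failure releases the earlier resource */

	printf("probe ok\n");
	return 0;

err_free_tx:
	release("tx-dma");
err_out:
	return ret;
}

int main(void)
{
	probe(0);
	probe(1);
	return 0;
}
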
index f17c0ab..d5adf9f 100644
@@ -345,12 +345,13 @@ static int spi_st_probe(struct platform_device *pdev)
        spi_st->clk = devm_clk_get(&pdev->dev, "ssc");
        if (IS_ERR(spi_st->clk)) {
                dev_err(&pdev->dev, "Unable to request clock\n");
-               return PTR_ERR(spi_st->clk);
+               ret = PTR_ERR(spi_st->clk);
+               goto put_master;
        }
 
        ret = spi_st_clk_enable(spi_st);
        if (ret)
-               return ret;
+               goto put_master;
 
        init_completion(&spi_st->done);
 
@@ -408,7 +409,8 @@ static int spi_st_probe(struct platform_device *pdev)
 
 clk_disable:
        spi_st_clk_disable(spi_st);
-
+put_master:
+       spi_master_put(master);
        return ret;
 }
 
index aab9b49..18aeace 100644
@@ -360,7 +360,7 @@ static int zynqmp_prepare_transfer_hardware(struct spi_master *master)
 
        ret = clk_enable(xqspi->refclk);
        if (ret)
-               goto clk_err;
+               return ret;
 
        ret = clk_enable(xqspi->pclk);
        if (ret)
@@ -369,6 +369,7 @@ static int zynqmp_prepare_transfer_hardware(struct spi_master *master)
        zynqmp_gqspi_write(xqspi, GQSPI_EN_OFST, GQSPI_EN_MASK);
        return 0;
 clk_err:
+       clk_disable(xqspi->refclk);
        return ret;
 }
 
index 0239b45..77e6e45 100644
@@ -717,9 +717,11 @@ static int spi_map_buf(struct spi_master *master, struct device *dev,
        if (vmalloced_buf) {
                desc_len = min_t(int, max_seg_size, PAGE_SIZE);
                sgs = DIV_ROUND_UP(len + offset_in_page(buf), desc_len);
-       } else {
+       } else if (virt_addr_valid(buf)) {
                desc_len = min_t(int, max_seg_size, master->max_dma_len);
                sgs = DIV_ROUND_UP(len, desc_len);
+       } else {
+               return -EINVAL;
        }
 
        ret = sg_alloc_table(sgt, sgs, GFP_KERNEL);
@@ -933,7 +935,7 @@ static int spi_map_msg(struct spi_master *master, struct spi_message *msg)
  * spi_transfer_one_message - Default implementation of transfer_one_message()
  *
  * This is a standard implementation of transfer_one_message() for
- * drivers which impelment a transfer_one() operation.  It provides
+ * drivers which implement a transfer_one() operation.  It provides
  * standard handling of delays and chip select management.
  */
 static int spi_transfer_one_message(struct spi_master *master,
@@ -1764,6 +1766,7 @@ struct spi_master *spi_alloc_master(struct device *dev, unsigned size)
        master->num_chipselect = 1;
        master->dev.class = &spi_master_class;
        master->dev.parent = dev;
+       pm_suspend_ignore_children(&master->dev, true);
        spi_master_set_devdata(master, &master[1]);
 
        return master;
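
The spi core hunks tighten spi_map_buf(): vmalloc'ed buffers are still split into per-page segments (accounting for the offset inside the first page), linearly mapped buffers use the controller's max DMA length, and anything that fails virt_addr_valid() is rejected with -EINVAL instead of being mapped blindly; spi_alloc_master() additionally marks the master so runtime PM ignores its children. A standalone sketch of just the segment-count arithmetic, assuming a 4 KiB page and plain integers instead of kernel types:

/* Illustrative only: scatter-gather segment counting as in spi_map_buf(). */
#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE	4096UL
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

enum buf_kind { BUF_VMALLOC, BUF_LINEAR, BUF_OTHER };

/* Returns the number of sg entries needed, or -1 for unmappable buffers. */
static long sg_count(enum buf_kind kind, size_t len, size_t offset_in_page,
		     size_t max_seg, size_t max_dma_len)
{
	size_t desc_len;

	switch (kind) {
	case BUF_VMALLOC:
		/* vmalloc memory is only virtually contiguous: split per page. */
		desc_len = max_seg < PAGE_SIZE ? max_seg : PAGE_SIZE;
		return DIV_ROUND_UP(len + offset_in_page, desc_len);
	case BUF_LINEAR:
		desc_len = max_seg < max_dma_len ? max_seg : max_dma_len;
		return DIV_ROUND_UP(len, desc_len);
	default:
		return -1;	/* e.g. buffers that would fail virt_addr_valid() */
	}
}

int main(void)
{
	printf("vmalloc, 10000 bytes at offset 100: %ld segments\n",
	       sg_count(BUF_VMALLOC, 10000, 100, 65536, 65536));
	printf("linear,  10000 bytes:               %ld segments\n",
	       sg_count(BUF_LINEAR, 10000, 0, 65536, 65536));
	printf("other buffer:                        %ld\n",
	       sg_count(BUF_OTHER, 10000, 0, 65536, 65536));
	return 0;
}
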
index 5bac28a..7c197d1 100644
@@ -66,8 +66,6 @@ source "drivers/staging/nvec/Kconfig"
 
 source "drivers/staging/media/Kconfig"
 
-source "drivers/staging/rdma/Kconfig"
-
 source "drivers/staging/android/Kconfig"
 
 source "drivers/staging/board/Kconfig"
index a954242..a470c72 100644
@@ -23,7 +23,6 @@ obj-$(CONFIG_FB_XGI)          += xgifb/
 obj-$(CONFIG_USB_EMXX)         += emxx_udc/
 obj-$(CONFIG_SPEAKUP)          += speakup/
 obj-$(CONFIG_MFD_NVEC)         += nvec/
-obj-$(CONFIG_STAGING_RDMA)     += rdma/
 obj-$(CONFIG_ANDROID)          += android/
 obj-$(CONFIG_STAGING_BOARD)    += board/
 obj-$(CONFIG_LTE_GDM724X)      += gdm724x/
index ce1f949..3f2f30b 100644
@@ -976,8 +976,8 @@ static inline __u64 ll_file_maxbytes(struct inode *inode)
 }
 
 /* llite/xattr.c */
-int ll_setxattr(struct dentry *dentry, const char *name,
-               const void *value, size_t size, int flags);
+int ll_setxattr(struct dentry *dentry, struct inode *inode,
+               const char *name, const void *value, size_t size, int flags);
 ssize_t ll_getxattr(struct dentry *dentry, struct inode *inode,
                    const char *name, void *buffer, size_t size);
 ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size);
index ed4de04..608014b 100644
@@ -211,11 +211,9 @@ int ll_setxattr_common(struct inode *inode, const char *name,
        return 0;
 }
 
-int ll_setxattr(struct dentry *dentry, const char *name,
-               const void *value, size_t size, int flags)
+int ll_setxattr(struct dentry *dentry, struct inode *inode,
+               const char *name, const void *value, size_t size, int flags)
 {
-       struct inode *inode = d_inode(dentry);
-
        LASSERT(inode);
        LASSERT(name);
 
index 163f21a..e389009 100644
@@ -42,23 +42,33 @@ static inline struct spinand_state *mtd_to_state(struct mtd_info *mtd)
 static int enable_hw_ecc;
 static int enable_read_hw_ecc;
 
-static struct nand_ecclayout spinand_oob_64 = {
-       .eccbytes = 24,
-       .eccpos = {
-               1, 2, 3, 4, 5, 6,
-               17, 18, 19, 20, 21, 22,
-               33, 34, 35, 36, 37, 38,
-               49, 50, 51, 52, 53, 54, },
-       .oobfree = {
-               {.offset = 8,
-                       .length = 8},
-               {.offset = 24,
-                       .length = 8},
-               {.offset = 40,
-                       .length = 8},
-               {.offset = 56,
-                       .length = 8},
-       }
+static int spinand_ooblayout_64_ecc(struct mtd_info *mtd, int section,
+                                   struct mtd_oob_region *oobregion)
+{
+       if (section > 3)
+               return -ERANGE;
+
+       oobregion->offset = (section * 16) + 1;
+       oobregion->length = 6;
+
+       return 0;
+}
+
+static int spinand_ooblayout_64_free(struct mtd_info *mtd, int section,
+                                    struct mtd_oob_region *oobregion)
+{
+       if (section > 3)
+               return -ERANGE;
+
+       oobregion->offset = (section * 16) + 8;
+       oobregion->length = 8;
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops spinand_oob_64_ops = {
+       .ecc = spinand_ooblayout_64_ecc,
+       .free = spinand_ooblayout_64_free,
 };
 #endif
 
@@ -886,11 +896,11 @@ static int spinand_probe(struct spi_device *spi_nand)
 
        chip->ecc.strength = 1;
        chip->ecc.total = chip->ecc.steps * chip->ecc.bytes;
-       chip->ecc.layout = &spinand_oob_64;
        chip->ecc.read_page = spinand_read_page_hwecc;
        chip->ecc.write_page = spinand_write_page_hwecc;
 #else
        chip->ecc.mode  = NAND_ECC_SOFT;
+       chip->ecc.algo  = NAND_ECC_HAMMING;
        if (spinand_disable_ecc(spi_nand) < 0)
                dev_info(&spi_nand->dev, "%s: disable ecc failed!\n",
                         __func__);
@@ -912,6 +922,9 @@ static int spinand_probe(struct spi_device *spi_nand)
 
        mtd->dev.parent = &spi_nand->dev;
        mtd->oobsize = 64;
+#ifdef CONFIG_MTD_SPINAND_ONDIEECC
+       mtd_set_ooblayout(mtd, &spinand_oob_64_ops);
+#endif
 
        if (nand_scan(mtd, 1))
                return -ENXIO;
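
The spinand change drops the static nand_ecclayout table in favour of mtd_ooblayout_ops callbacks that describe the OOB area one section at a time: each of the four 16-byte sections carries 6 ECC bytes starting at offset 1 and 8 free bytes starting at offset 8, which reproduces the byte positions of the removed table. A userspace sketch that walks the same per-section arithmetic and prints the regions it describes; the region struct here is a simplified stand-in, not the mtd interface:

/* Illustrative only: per-section OOB layout matching the old static table. */
#include <stdio.h>
#include <errno.h>

struct oob_region { int offset; int length; };

static int ecc_region(int section, struct oob_region *r)
{
	if (section > 3)
		return -ERANGE;
	r->offset = section * 16 + 1;	/* 6 ECC bytes per 16-byte section */
	r->length = 6;
	return 0;
}

static int free_region(int section, struct oob_region *r)
{
	if (section > 3)
		return -ERANGE;
	r->offset = section * 16 + 8;	/* 8 free bytes per 16-byte section */
	r->length = 8;
	return 0;
}

int main(void)
{
	struct oob_region r;
	int s;

	for (s = 0; ecc_region(s, &r) == 0; s++)
		printf("ecc  section %d: bytes %2d..%2d\n", s, r.offset,
		       r.offset + r.length - 1);
	for (s = 0; free_region(s, &r) == 0; s++)
		printf("free section %d: bytes %2d..%2d\n", s, r.offset,
		       r.offset + r.length - 1);
	return 0;
}
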
diff --git a/drivers/staging/rdma/Kconfig b/drivers/staging/rdma/Kconfig
deleted file mode 100644
index f1f3eca..0000000
+++ /dev/null
@@ -1,27 +0,0 @@
-menuconfig STAGING_RDMA
-        tristate "RDMA staging drivers"
-       depends on INFINIBAND
-       depends on PCI || BROKEN
-       depends on HAS_IOMEM
-       depends on NET
-       depends on INET
-        default n
-        ---help---
-          This option allows you to select a number of RDMA drivers that
-         fall into one of two categories: deprecated drivers being held
-         here before finally being removed or new drivers that still need
-         some work before being moved to the normal RDMA driver area.
-
-          If you wish to work on these drivers, to help improve them, or
-          to report problems you have with them, please use the
-         linux-rdma@vger.kernel.org mailing list.
-
-          If in doubt, say N here.
-
-
-# Please keep entries in alphabetic order
-if STAGING_RDMA
-
-source "drivers/staging/rdma/hfi1/Kconfig"
-
-endif
diff --git a/drivers/staging/rdma/Makefile b/drivers/staging/rdma/Makefile
deleted file mode 100644
index 8c7fc1d..0000000
+++ /dev/null
@@ -1,2 +0,0 @@
-# Entries for RDMA_STAGING tree
-obj-$(CONFIG_INFINIBAND_HFI1)  += hfi1/
diff --git a/drivers/staging/rdma/hfi1/Kconfig b/drivers/staging/rdma/hfi1/Kconfig
deleted file mode 100644
index a925fb0..0000000
+++ /dev/null
@@ -1,29 +0,0 @@
-config INFINIBAND_HFI1
-       tristate "Intel OPA Gen1 support"
-       depends on X86_64 && INFINIBAND_RDMAVT
-       select MMU_NOTIFIER
-       select CRC32
-       default m
-       ---help---
-       This is a low-level driver for Intel OPA Gen1 adapter.
-config HFI1_DEBUG_SDMA_ORDER
-       bool "HFI1 SDMA Order debug"
-       depends on INFINIBAND_HFI1
-       default n
-       ---help---
-       This is a debug flag to test for out of order
-       sdma completions for unit testing
-config HFI1_VERBS_31BIT_PSN
-       bool "HFI1 enable 31 bit PSN"
-       depends on INFINIBAND_HFI1
-       default y
-       ---help---
-       Setting this enables 31 BIT PSN
-       For verbs RC/UC
-config SDMA_VERBOSITY
-       bool "Config SDMA Verbosity"
-       depends on INFINIBAND_HFI1
-       default n
-       ---help---
-       This is a configuration flag to enable verbose
-       SDMA debug
diff --git a/drivers/staging/rdma/hfi1/Makefile b/drivers/staging/rdma/hfi1/Makefile
deleted file mode 100644
index 8dc5938..0000000
+++ /dev/null
@@ -1,21 +0,0 @@
-#
-# HFI driver
-#
-#
-#
-# Called from the kernel module build system.
-#
-obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o
-
-hfi1-y := affinity.o chip.o device.o diag.o driver.o efivar.o \
-       eprom.o file_ops.o firmware.o \
-       init.o intr.o mad.o mmu_rb.o pcie.o pio.o pio_copy.o platform.o \
-       qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o twsi.o \
-       uc.o ud.o user_exp_rcv.o user_pages.o user_sdma.o verbs.o \
-       verbs_txreq.o
-hfi1-$(CONFIG_DEBUG_FS) += debugfs.o
-
-CFLAGS_trace.o = -I$(src)
-ifdef MVERSION
-CFLAGS_driver.o = -DHFI_DRIVER_VERSION_BASE=\"$(MVERSION)\"
-endif
diff --git a/drivers/staging/rdma/hfi1/TODO b/drivers/staging/rdma/hfi1/TODO
deleted file mode 100644
index 4c6f1d7..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-July, 2015
-
-- Remove unneeded file entries in sysfs
-- Remove software processing of IB protocol and place in library for use
-  by qib, ipath (if still present), hfi1, and eventually soft-roce
-- Replace incorrect uAPI
diff --git a/drivers/staging/rdma/hfi1/affinity.c b/drivers/staging/rdma/hfi1/affinity.c
deleted file mode 100644
index 6e7050a..0000000
+++ /dev/null
@@ -1,431 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#include <linux/topology.h>
-#include <linux/cpumask.h>
-#include <linux/module.h>
-
-#include "hfi.h"
-#include "affinity.h"
-#include "sdma.h"
-#include "trace.h"
-
-/* Name of IRQ types, indexed by enum irq_type */
-static const char * const irq_type_names[] = {
-       "SDMA",
-       "RCVCTXT",
-       "GENERAL",
-       "OTHER",
-};
-
-static inline void init_cpu_mask_set(struct cpu_mask_set *set)
-{
-       cpumask_clear(&set->mask);
-       cpumask_clear(&set->used);
-       set->gen = 0;
-}
-
-/* Initialize non-HT cpu cores mask */
-int init_real_cpu_mask(struct hfi1_devdata *dd)
-{
-       struct hfi1_affinity *info;
-       int possible, curr_cpu, i, ht;
-
-       info = kzalloc(sizeof(*info), GFP_KERNEL);
-       if (!info)
-               return -ENOMEM;
-
-       cpumask_clear(&info->real_cpu_mask);
-
-       /* Start with cpu online mask as the real cpu mask */
-       cpumask_copy(&info->real_cpu_mask, cpu_online_mask);
-
-       /*
-        * Remove HT cores from the real cpu mask.  Do this in two steps below.
-        */
-       possible = cpumask_weight(&info->real_cpu_mask);
-       ht = cpumask_weight(topology_sibling_cpumask(
-                                       cpumask_first(&info->real_cpu_mask)));
-       /*
-        * Step 1.  Skip over the first N HT siblings and use them as the
-        * "real" cores.  Assumes that HT cores are not enumerated in
-        * succession (except in the single core case).
-        */
-       curr_cpu = cpumask_first(&info->real_cpu_mask);
-       for (i = 0; i < possible / ht; i++)
-               curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask);
-       /*
-        * Step 2.  Remove the remaining HT siblings.  Use cpumask_next() to
-        * skip any gaps.
-        */
-       for (; i < possible; i++) {
-               cpumask_clear_cpu(curr_cpu, &info->real_cpu_mask);
-               curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask);
-       }
-
-       dd->affinity = info;
-       return 0;
-}
-
-/*
- * Interrupt affinity.
- *
- * non-rcv avail gets a default mask that
- * starts as possible cpus with threads reset
- * and each rcv avail reset.
- *
- * rcv avail gets node relative 1 wrapping back
- * to the node relative 1 as necessary.
- *
- */
-void hfi1_dev_affinity_init(struct hfi1_devdata *dd)
-{
-       int node = pcibus_to_node(dd->pcidev->bus);
-       struct hfi1_affinity *info = dd->affinity;
-       const struct cpumask *local_mask;
-       int curr_cpu, possible, i;
-
-       if (node < 0)
-               node = numa_node_id();
-       dd->node = node;
-
-       spin_lock_init(&info->lock);
-
-       init_cpu_mask_set(&info->def_intr);
-       init_cpu_mask_set(&info->rcv_intr);
-       init_cpu_mask_set(&info->proc);
-
-       local_mask = cpumask_of_node(dd->node);
-       if (cpumask_first(local_mask) >= nr_cpu_ids)
-               local_mask = topology_core_cpumask(0);
-       /* Use the "real" cpu mask of this node as the default */
-       cpumask_and(&info->def_intr.mask, &info->real_cpu_mask, local_mask);
-
-       /*  fill in the receive list */
-       possible = cpumask_weight(&info->def_intr.mask);
-       curr_cpu = cpumask_first(&info->def_intr.mask);
-       if (possible == 1) {
-               /*  only one CPU, everyone will use it */
-               cpumask_set_cpu(curr_cpu, &info->rcv_intr.mask);
-       } else {
-               /*
-                * Retain the first CPU in the default list for the control
-                * context.
-                */
-               curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask);
-               /*
-                * Remove the remaining kernel receive queues from
-                * the default list and add them to the receive list.
-                */
-               for (i = 0; i < dd->n_krcv_queues - 1; i++) {
-                       cpumask_clear_cpu(curr_cpu, &info->def_intr.mask);
-                       cpumask_set_cpu(curr_cpu, &info->rcv_intr.mask);
-                       curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask);
-                       if (curr_cpu >= nr_cpu_ids)
-                               break;
-               }
-       }
-
-       cpumask_copy(&info->proc.mask, cpu_online_mask);
-}
-
-void hfi1_dev_affinity_free(struct hfi1_devdata *dd)
-{
-       kfree(dd->affinity);
-}
-
-int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
-{
-       int ret;
-       cpumask_var_t diff;
-       struct cpu_mask_set *set;
-       struct sdma_engine *sde = NULL;
-       struct hfi1_ctxtdata *rcd = NULL;
-       char extra[64];
-       int cpu = -1;
-
-       extra[0] = '\0';
-       cpumask_clear(&msix->mask);
-
-       ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
-       if (!ret)
-               return -ENOMEM;
-
-       switch (msix->type) {
-       case IRQ_SDMA:
-               sde = (struct sdma_engine *)msix->arg;
-               scnprintf(extra, 64, "engine %u", sde->this_idx);
-               /* fall through */
-       case IRQ_GENERAL:
-               set = &dd->affinity->def_intr;
-               break;
-       case IRQ_RCVCTXT:
-               rcd = (struct hfi1_ctxtdata *)msix->arg;
-               if (rcd->ctxt == HFI1_CTRL_CTXT) {
-                       set = &dd->affinity->def_intr;
-                       cpu = cpumask_first(&set->mask);
-               } else {
-                       set = &dd->affinity->rcv_intr;
-               }
-               scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
-               break;
-       default:
-               dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type);
-               return -EINVAL;
-       }
-
-       /*
-        * The control receive context is placed on a particular CPU, which
-        * is set above.  Skip accounting for it.  Everything else finds its
-        * CPU here.
-        */
-       if (cpu == -1) {
-               spin_lock(&dd->affinity->lock);
-               if (cpumask_equal(&set->mask, &set->used)) {
-                       /*
-                        * We've used up all the CPUs, bump up the generation
-                        * and reset the 'used' map
-                        */
-                       set->gen++;
-                       cpumask_clear(&set->used);
-               }
-               cpumask_andnot(diff, &set->mask, &set->used);
-               cpu = cpumask_first(diff);
-               cpumask_set_cpu(cpu, &set->used);
-               spin_unlock(&dd->affinity->lock);
-       }
-
-       switch (msix->type) {
-       case IRQ_SDMA:
-               sde->cpu = cpu;
-               break;
-       case IRQ_GENERAL:
-       case IRQ_RCVCTXT:
-       case IRQ_OTHER:
-               break;
-       }
-
-       cpumask_set_cpu(cpu, &msix->mask);
-       dd_dev_info(dd, "IRQ vector: %u, type %s %s -> cpu: %d\n",
-                   msix->msix.vector, irq_type_names[msix->type],
-                   extra, cpu);
-       irq_set_affinity_hint(msix->msix.vector, &msix->mask);
-
-       free_cpumask_var(diff);
-       return 0;
-}
-
-void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
-                          struct hfi1_msix_entry *msix)
-{
-       struct cpu_mask_set *set = NULL;
-       struct hfi1_ctxtdata *rcd;
-
-       switch (msix->type) {
-       case IRQ_SDMA:
-       case IRQ_GENERAL:
-               set = &dd->affinity->def_intr;
-               break;
-       case IRQ_RCVCTXT:
-               rcd = (struct hfi1_ctxtdata *)msix->arg;
-               /* only do accounting for non control contexts */
-               if (rcd->ctxt != HFI1_CTRL_CTXT)
-                       set = &dd->affinity->rcv_intr;
-               break;
-       default:
-               return;
-       }
-
-       if (set) {
-               spin_lock(&dd->affinity->lock);
-               cpumask_andnot(&set->used, &set->used, &msix->mask);
-               if (cpumask_empty(&set->used) && set->gen) {
-                       set->gen--;
-                       cpumask_copy(&set->used, &set->mask);
-               }
-               spin_unlock(&dd->affinity->lock);
-       }
-
-       irq_set_affinity_hint(msix->msix.vector, NULL);
-       cpumask_clear(&msix->mask);
-}
-
-int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node)
-{
-       int cpu = -1, ret;
-       cpumask_var_t diff, mask, intrs;
-       const struct cpumask *node_mask,
-               *proc_mask = tsk_cpus_allowed(current);
-       struct cpu_mask_set *set = &dd->affinity->proc;
-       char buf[1024];
-
-       /*
-        * check whether process/context affinity has already
-        * been set
-        */
-       if (cpumask_weight(proc_mask) == 1) {
-               scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(proc_mask));
-               hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %s",
-                         current->pid, current->comm, buf);
-               /*
-                * Mark the pre-set CPU as used. This is atomic so we don't
-                * need the lock
-                */
-               cpu = cpumask_first(proc_mask);
-               cpumask_set_cpu(cpu, &set->used);
-               goto done;
-       } else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) {
-               scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(proc_mask));
-               hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %s",
-                         current->pid, current->comm, buf);
-               goto done;
-       }
-
-       /*
-        * The process does not have a preset CPU affinity so find one to
-        * recommend. We prefer CPUs on the same NUMA as the device.
-        */
-
-       ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
-       if (!ret)
-               goto done;
-       ret = zalloc_cpumask_var(&mask, GFP_KERNEL);
-       if (!ret)
-               goto free_diff;
-       ret = zalloc_cpumask_var(&intrs, GFP_KERNEL);
-       if (!ret)
-               goto free_mask;
-
-       spin_lock(&dd->affinity->lock);
-       /*
-        * If we've used all available CPUs, clear the mask and start
-        * overloading.
-        */
-       if (cpumask_equal(&set->mask, &set->used)) {
-               set->gen++;
-               cpumask_clear(&set->used);
-       }
-
-       /* CPUs used by interrupt handlers */
-       cpumask_copy(intrs, (dd->affinity->def_intr.gen ?
-                            &dd->affinity->def_intr.mask :
-                            &dd->affinity->def_intr.used));
-       cpumask_or(intrs, intrs, (dd->affinity->rcv_intr.gen ?
-                                 &dd->affinity->rcv_intr.mask :
-                                 &dd->affinity->rcv_intr.used));
-       scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(intrs));
-       hfi1_cdbg(PROC, "CPUs used by interrupts: %s", buf);
-
-       /*
-        * If we don't have a NUMA node requested, preference is towards
-        * device NUMA node
-        */
-       if (node == -1)
-               node = dd->node;
-       node_mask = cpumask_of_node(node);
-       scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(node_mask));
-       hfi1_cdbg(PROC, "device on NUMA %u, CPUs %s", node, buf);
-
-       /* diff will hold all unused cpus */
-       cpumask_andnot(diff, &set->mask, &set->used);
-       scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(diff));
-       hfi1_cdbg(PROC, "unused CPUs (all) %s", buf);
-
-       /* get cpumask of available CPUs on preferred NUMA */
-       cpumask_and(mask, diff, node_mask);
-       scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(mask));
-       hfi1_cdbg(PROC, "available cpus on NUMA %s", buf);
-
-       /*
-        * At first, we don't want to place processes on the same
-        * CPUs as interrupt handlers.
-        */
-       cpumask_andnot(diff, mask, intrs);
-       if (!cpumask_empty(diff))
-               cpumask_copy(mask, diff);
-
-       /*
-        * if we don't have a cpu on the preferred NUMA, get
-        * the list of the remaining available CPUs
-        */
-       if (cpumask_empty(mask)) {
-               cpumask_andnot(diff, &set->mask, &set->used);
-               cpumask_andnot(mask, diff, node_mask);
-       }
-       scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(mask));
-       hfi1_cdbg(PROC, "possible CPUs for process %s", buf);
-
-       cpu = cpumask_first(mask);
-       if (cpu >= nr_cpu_ids) /* empty */
-               cpu = -1;
-       else
-               cpumask_set_cpu(cpu, &set->used);
-       spin_unlock(&dd->affinity->lock);
-
-       free_cpumask_var(intrs);
-free_mask:
-       free_cpumask_var(mask);
-free_diff:
-       free_cpumask_var(diff);
-done:
-       return cpu;
-}
-
-void hfi1_put_proc_affinity(struct hfi1_devdata *dd, int cpu)
-{
-       struct cpu_mask_set *set = &dd->affinity->proc;
-
-       if (cpu < 0)
-               return;
-       spin_lock(&dd->affinity->lock);
-       cpumask_clear_cpu(cpu, &set->used);
-       if (cpumask_empty(&set->used) && set->gen) {
-               set->gen--;
-               cpumask_copy(&set->used, &set->mask);
-       }
-       spin_unlock(&dd->affinity->lock);
-}
-
diff --git a/drivers/staging/rdma/hfi1/affinity.h b/drivers/staging/rdma/hfi1/affinity.h
deleted file mode 100644
index 20f52fe..0000000
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#ifndef _HFI1_AFFINITY_H
-#define _HFI1_AFFINITY_H
-
-#include "hfi.h"
-
-enum irq_type {
-       IRQ_SDMA,
-       IRQ_RCVCTXT,
-       IRQ_GENERAL,
-       IRQ_OTHER
-};
-
-/* Can be used for both memory and cpu */
-enum affinity_flags {
-       AFF_AUTO,
-       AFF_NUMA_LOCAL,
-       AFF_DEV_LOCAL,
-       AFF_IRQ_LOCAL
-};
-
-struct cpu_mask_set {
-       struct cpumask mask;
-       struct cpumask used;
-       uint gen;
-};
-
-struct hfi1_affinity {
-       struct cpu_mask_set def_intr;
-       struct cpu_mask_set rcv_intr;
-       struct cpu_mask_set proc;
-       struct cpumask real_cpu_mask;
-       /* spin lock to protect affinity struct */
-       spinlock_t lock;
-};
-
-struct hfi1_msix_entry;
-
-/* Initialize non-HT cpu cores mask */
-int init_real_cpu_mask(struct hfi1_devdata *);
-/* Initialize driver affinity data */
-void hfi1_dev_affinity_init(struct hfi1_devdata *);
-/* Free driver affinity data */
-void hfi1_dev_affinity_free(struct hfi1_devdata *);
-/*
- * Set IRQ affinity to a CPU. The function will determine the
- * CPU and set the affinity to it.
- */
-int hfi1_get_irq_affinity(struct hfi1_devdata *, struct hfi1_msix_entry *);
-/*
- * Remove the IRQ's CPU affinity. This function also updates
- * any internal CPU tracking data
- */
-void hfi1_put_irq_affinity(struct hfi1_devdata *, struct hfi1_msix_entry *);
-/*
- * Determine a CPU affinity for a user process, if the process does not
- * have an affinity set yet.
- */
-int hfi1_get_proc_affinity(struct hfi1_devdata *, int);
-/* Release a CPU used by a user process. */
-void hfi1_put_proc_affinity(struct hfi1_devdata *, int);
-
-#endif /* _HFI1_AFFINITY_H */
diff --git a/drivers/staging/rdma/hfi1/aspm.h b/drivers/staging/rdma/hfi1/aspm.h
deleted file mode 100644
index 0d58fe3..0000000
+++ /dev/null
@@ -1,309 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#ifndef _ASPM_H
-#define _ASPM_H
-
-#include "hfi.h"
-
-extern uint aspm_mode;
-
-enum aspm_mode {
-       ASPM_MODE_DISABLED = 0, /* ASPM always disabled, performance mode */
-       ASPM_MODE_ENABLED = 1,  /* ASPM always enabled, power saving mode */
-       ASPM_MODE_DYNAMIC = 2,  /* ASPM enabled/disabled dynamically */
-};
-
-/* Time after which the timer interrupt will re-enable ASPM */
-#define ASPM_TIMER_MS 1000
-/* Time for which interrupts are ignored after a timer has been scheduled */
-#define ASPM_RESCHED_TIMER_MS (ASPM_TIMER_MS / 2)
-/* Two interrupts within this time trigger ASPM disable */
-#define ASPM_TRIGGER_MS 1
-#define ASPM_TRIGGER_NS (ASPM_TRIGGER_MS * 1000 * 1000ull)
-#define ASPM_L1_SUPPORTED(reg) \
-       (((reg & PCI_EXP_LNKCAP_ASPMS) >> 10) & 0x2)
-
-static inline bool aspm_hw_l1_supported(struct hfi1_devdata *dd)
-{
-       struct pci_dev *parent = dd->pcidev->bus->self;
-       u32 up, dn;
-
-       /*
-        * If the driver does not have access to the upstream component,
-        * it cannot support ASPM L1 at all.
-        */
-       if (!parent)
-               return false;
-
-       pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &dn);
-       dn = ASPM_L1_SUPPORTED(dn);
-
-       pcie_capability_read_dword(parent, PCI_EXP_LNKCAP, &up);
-       up = ASPM_L1_SUPPORTED(up);
-
-       /* ASPM works on A-step but is reported as not supported */
-       return (!!dn || is_ax(dd)) && !!up;
-}
-
-/* Set L1 entrance latency for slower entry to L1 */
-static inline void aspm_hw_set_l1_ent_latency(struct hfi1_devdata *dd)
-{
-       u32 l1_ent_lat = 0x4u;
-       u32 reg32;
-
-       pci_read_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, &reg32);
-       reg32 &= ~PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SMASK;
-       reg32 |= l1_ent_lat << PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SHIFT;
-       pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, reg32);
-}
-
-static inline void aspm_hw_enable_l1(struct hfi1_devdata *dd)
-{
-       struct pci_dev *parent = dd->pcidev->bus->self;
-
-       /*
-        * If the driver does not have access to the upstream component,
-        * it cannot support ASPM L1 at all.
-        */
-       if (!parent)
-               return;
-
-       /* Enable ASPM L1 first in upstream component and then downstream */
-       pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL,
-                                          PCI_EXP_LNKCTL_ASPMC,
-                                          PCI_EXP_LNKCTL_ASPM_L1);
-       pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL,
-                                          PCI_EXP_LNKCTL_ASPMC,
-                                          PCI_EXP_LNKCTL_ASPM_L1);
-}
-
-static inline void aspm_hw_disable_l1(struct hfi1_devdata *dd)
-{
-       struct pci_dev *parent = dd->pcidev->bus->self;
-
-       /* Disable ASPM L1 first in downstream component and then upstream */
-       pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL,
-                                          PCI_EXP_LNKCTL_ASPMC, 0x0);
-       if (parent)
-               pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL,
-                                                  PCI_EXP_LNKCTL_ASPMC, 0x0);
-}
-
-static inline void aspm_enable(struct hfi1_devdata *dd)
-{
-       if (dd->aspm_enabled || aspm_mode == ASPM_MODE_DISABLED ||
-           !dd->aspm_supported)
-               return;
-
-       aspm_hw_enable_l1(dd);
-       dd->aspm_enabled = true;
-}
-
-static inline void aspm_disable(struct hfi1_devdata *dd)
-{
-       if (!dd->aspm_enabled || aspm_mode == ASPM_MODE_ENABLED)
-               return;
-
-       aspm_hw_disable_l1(dd);
-       dd->aspm_enabled = false;
-}
-
-static inline void aspm_disable_inc(struct hfi1_devdata *dd)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&dd->aspm_lock, flags);
-       aspm_disable(dd);
-       atomic_inc(&dd->aspm_disabled_cnt);
-       spin_unlock_irqrestore(&dd->aspm_lock, flags);
-}
-
-static inline void aspm_enable_dec(struct hfi1_devdata *dd)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&dd->aspm_lock, flags);
-       if (atomic_dec_and_test(&dd->aspm_disabled_cnt))
-               aspm_enable(dd);
-       spin_unlock_irqrestore(&dd->aspm_lock, flags);
-}
-
-/* ASPM processing for each receive context interrupt */
-static inline void aspm_ctx_disable(struct hfi1_ctxtdata *rcd)
-{
-       bool restart_timer;
-       bool close_interrupts;
-       unsigned long flags;
-       ktime_t now, prev;
-
-       /* Quickest exit for minimum impact */
-       if (!rcd->aspm_intr_supported)
-               return;
-
-       spin_lock_irqsave(&rcd->aspm_lock, flags);
-       /* PSM contexts are open */
-       if (!rcd->aspm_intr_enable)
-               goto unlock;
-
-       prev = rcd->aspm_ts_last_intr;
-       now = ktime_get();
-       rcd->aspm_ts_last_intr = now;
-
-       /* An interrupt pair close together in time */
-       close_interrupts = ktime_to_ns(ktime_sub(now, prev)) < ASPM_TRIGGER_NS;
-
-       /* Don't push out our timer till this much time has elapsed */
-       restart_timer = ktime_to_ns(ktime_sub(now, rcd->aspm_ts_timer_sched)) >
-                                   ASPM_RESCHED_TIMER_MS * NSEC_PER_MSEC;
-       restart_timer = restart_timer && close_interrupts;
-
-       /* Disable ASPM and schedule timer */
-       if (rcd->aspm_enabled && close_interrupts) {
-               aspm_disable_inc(rcd->dd);
-               rcd->aspm_enabled = false;
-               restart_timer = true;
-       }
-
-       if (restart_timer) {
-               mod_timer(&rcd->aspm_timer,
-                         jiffies + msecs_to_jiffies(ASPM_TIMER_MS));
-               rcd->aspm_ts_timer_sched = now;
-       }
-unlock:
-       spin_unlock_irqrestore(&rcd->aspm_lock, flags);
-}
-
-/* Timer function for re-enabling ASPM in the absence of interrupt activity */
-static inline void aspm_ctx_timer_function(unsigned long data)
-{
-       struct hfi1_ctxtdata *rcd = (struct hfi1_ctxtdata *)data;
-       unsigned long flags;
-
-       spin_lock_irqsave(&rcd->aspm_lock, flags);
-       aspm_enable_dec(rcd->dd);
-       rcd->aspm_enabled = true;
-       spin_unlock_irqrestore(&rcd->aspm_lock, flags);
-}
-
-/* Disable interrupt processing for verbs contexts when PSM contexts are open */
-static inline void aspm_disable_all(struct hfi1_devdata *dd)
-{
-       struct hfi1_ctxtdata *rcd;
-       unsigned long flags;
-       unsigned i;
-
-       for (i = 0; i < dd->first_user_ctxt; i++) {
-               rcd = dd->rcd[i];
-               del_timer_sync(&rcd->aspm_timer);
-               spin_lock_irqsave(&rcd->aspm_lock, flags);
-               rcd->aspm_intr_enable = false;
-               spin_unlock_irqrestore(&rcd->aspm_lock, flags);
-       }
-
-       aspm_disable(dd);
-       atomic_set(&dd->aspm_disabled_cnt, 0);
-}
-
-/* Re-enable interrupt processing for verbs contexts */
-static inline void aspm_enable_all(struct hfi1_devdata *dd)
-{
-       struct hfi1_ctxtdata *rcd;
-       unsigned long flags;
-       unsigned i;
-
-       aspm_enable(dd);
-
-       if (aspm_mode != ASPM_MODE_DYNAMIC)
-               return;
-
-       for (i = 0; i < dd->first_user_ctxt; i++) {
-               rcd = dd->rcd[i];
-               spin_lock_irqsave(&rcd->aspm_lock, flags);
-               rcd->aspm_intr_enable = true;
-               rcd->aspm_enabled = true;
-               spin_unlock_irqrestore(&rcd->aspm_lock, flags);
-       }
-}
-
-static inline void aspm_ctx_init(struct hfi1_ctxtdata *rcd)
-{
-       spin_lock_init(&rcd->aspm_lock);
-       setup_timer(&rcd->aspm_timer, aspm_ctx_timer_function,
-                   (unsigned long)rcd);
-       rcd->aspm_intr_supported = rcd->dd->aspm_supported &&
-               aspm_mode == ASPM_MODE_DYNAMIC &&
-               rcd->ctxt < rcd->dd->first_user_ctxt;
-}
-
-static inline void aspm_init(struct hfi1_devdata *dd)
-{
-       unsigned i;
-
-       spin_lock_init(&dd->aspm_lock);
-       dd->aspm_supported = aspm_hw_l1_supported(dd);
-
-       for (i = 0; i < dd->first_user_ctxt; i++)
-               aspm_ctx_init(dd->rcd[i]);
-
-       /* Start with ASPM disabled */
-       aspm_hw_set_l1_ent_latency(dd);
-       dd->aspm_enabled = false;
-       aspm_hw_disable_l1(dd);
-
-       /* Now turn on ASPM if configured */
-       aspm_enable_all(dd);
-}
-
-static inline void aspm_exit(struct hfi1_devdata *dd)
-{
-       aspm_disable_all(dd);
-
-       /* Turn on ASPM on exit to conserve power */
-       aspm_enable(dd);
-}
-
-#endif /* _ASPM_H */
diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/staging/rdma/hfi1/chip.c
deleted file mode 100644
index dcae8e7..0000000
+++ /dev/null
@@ -1,14693 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-/*
- * This file contains all of the code that is specific to the HFI chip
- */
-
-#include <linux/pci.h>
-#include <linux/delay.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-
-#include "hfi.h"
-#include "trace.h"
-#include "mad.h"
-#include "pio.h"
-#include "sdma.h"
-#include "eprom.h"
-#include "efivar.h"
-#include "platform.h"
-#include "aspm.h"
-
-#define NUM_IB_PORTS 1
-
-uint kdeth_qp;
-module_param_named(kdeth_qp, kdeth_qp, uint, S_IRUGO);
-MODULE_PARM_DESC(kdeth_qp, "Set the KDETH queue pair prefix");
-
-uint num_vls = HFI1_MAX_VLS_SUPPORTED;
-module_param(num_vls, uint, S_IRUGO);
-MODULE_PARM_DESC(num_vls, "Set number of Virtual Lanes to use (1-8)");
-
-/*
- * Default time to aggregate two 10K packets from the idle state
- * (timer not running). The timer starts at the end of the first packet,
- * so only the time for one 10K packet and header plus a bit extra is needed.
- * 10 * 1024 + 64 header byte = 10304 byte
- * 10304 byte / 12.5 GB/s = 824.32ns
- */
-uint rcv_intr_timeout = (824 + 16); /* 16 is for coalescing interrupt */
-module_param(rcv_intr_timeout, uint, S_IRUGO);
-MODULE_PARM_DESC(rcv_intr_timeout, "Receive interrupt mitigation timeout in ns");
-
-uint rcv_intr_count = 16; /* same as qib */
-module_param(rcv_intr_count, uint, S_IRUGO);
-MODULE_PARM_DESC(rcv_intr_count, "Receive interrupt mitigation count");
-
-ushort link_crc_mask = SUPPORTED_CRCS;
-module_param(link_crc_mask, ushort, S_IRUGO);
-MODULE_PARM_DESC(link_crc_mask, "CRCs to use on the link");
-
-uint loopback;
-module_param_named(loopback, loopback, uint, S_IRUGO);
-MODULE_PARM_DESC(loopback, "Put into loopback mode (1 = serdes, 3 = external cable");
-
-/* Other driver tunables */
-uint rcv_intr_dynamic = 1; /* enable dynamic mode for rcv int mitigation*/
-static ushort crc_14b_sideband = 1;
-static uint use_flr = 1;
-uint quick_linkup; /* skip LNI */
-
-struct flag_table {
-       u64 flag;       /* the flag */
-       char *str;      /* description string */
-       u16 extra;      /* extra information */
-       u16 unused0;
-       u32 unused1;
-};
-
-/* str must be a string constant */
-#define FLAG_ENTRY(str, extra, flag) {flag, str, extra}
-#define FLAG_ENTRY0(str, flag) {flag, str, 0}
-
-/* Send Error Consequences */
-#define SEC_WRITE_DROPPED      0x1
-#define SEC_PACKET_DROPPED     0x2
-#define SEC_SC_HALTED          0x4     /* per-context only */
-#define SEC_SPC_FREEZE         0x8     /* per-HFI only */
-
-#define MIN_KERNEL_KCTXTS         2
-#define FIRST_KERNEL_KCTXT        1
-/* sizes for both the QP and RSM map tables */
-#define NUM_MAP_ENTRIES                256
-#define NUM_MAP_REGS             32
-
-/* Bit offset into the GUID which carries HFI id information */
-#define GUID_HFI_INDEX_SHIFT     39
-
-/* extract the emulation revision */
-#define emulator_rev(dd) ((dd)->irev >> 8)
-/* parallel and serial emulation versions are 3 and 4 respectively */
-#define is_emulator_p(dd) ((((dd)->irev) & 0xf) == 3)
-#define is_emulator_s(dd) ((((dd)->irev) & 0xf) == 4)
-
-/* RSM fields */
-
-/* packet type */
-#define IB_PACKET_TYPE         2ull
-#define QW_SHIFT               6ull
-/* QPN[7..1] */
-#define QPN_WIDTH              7ull
-
-/* LRH.BTH: QW 0, OFFSET 48 - for match */
-#define LRH_BTH_QW             0ull
-#define LRH_BTH_BIT_OFFSET     48ull
-#define LRH_BTH_OFFSET(off)    ((LRH_BTH_QW << QW_SHIFT) | (off))
-#define LRH_BTH_MATCH_OFFSET   LRH_BTH_OFFSET(LRH_BTH_BIT_OFFSET)
-#define LRH_BTH_SELECT
-#define LRH_BTH_MASK           3ull
-#define LRH_BTH_VALUE          2ull
-
-/* LRH.SC[3..0] QW 0, OFFSET 56 - for match */
-#define LRH_SC_QW              0ull
-#define LRH_SC_BIT_OFFSET      56ull
-#define LRH_SC_OFFSET(off)     ((LRH_SC_QW << QW_SHIFT) | (off))
-#define LRH_SC_MATCH_OFFSET    LRH_SC_OFFSET(LRH_SC_BIT_OFFSET)
-#define LRH_SC_MASK            128ull
-#define LRH_SC_VALUE           0ull
-
-/* SC[n..0] QW 0, OFFSET 60 - for select */
-#define LRH_SC_SELECT_OFFSET  ((LRH_SC_QW << QW_SHIFT) | (60ull))
-
-/* QPN[m+n:1] QW 1, OFFSET 1 */
-#define QPN_SELECT_OFFSET      ((1ull << QW_SHIFT) | (1ull))
-
-/* defines to build power on SC2VL table */
-#define SC2VL_VAL( \
-       num, \
-       sc0, sc0val, \
-       sc1, sc1val, \
-       sc2, sc2val, \
-       sc3, sc3val, \
-       sc4, sc4val, \
-       sc5, sc5val, \
-       sc6, sc6val, \
-       sc7, sc7val) \
-( \
-       ((u64)(sc0val) << SEND_SC2VLT##num##_SC##sc0##_SHIFT) | \
-       ((u64)(sc1val) << SEND_SC2VLT##num##_SC##sc1##_SHIFT) | \
-       ((u64)(sc2val) << SEND_SC2VLT##num##_SC##sc2##_SHIFT) | \
-       ((u64)(sc3val) << SEND_SC2VLT##num##_SC##sc3##_SHIFT) | \
-       ((u64)(sc4val) << SEND_SC2VLT##num##_SC##sc4##_SHIFT) | \
-       ((u64)(sc5val) << SEND_SC2VLT##num##_SC##sc5##_SHIFT) | \
-       ((u64)(sc6val) << SEND_SC2VLT##num##_SC##sc6##_SHIFT) | \
-       ((u64)(sc7val) << SEND_SC2VLT##num##_SC##sc7##_SHIFT)   \
-)
-
-#define DC_SC_VL_VAL( \
-       range, \
-       e0, e0val, \
-       e1, e1val, \
-       e2, e2val, \
-       e3, e3val, \
-       e4, e4val, \
-       e5, e5val, \
-       e6, e6val, \
-       e7, e7val, \
-       e8, e8val, \
-       e9, e9val, \
-       e10, e10val, \
-       e11, e11val, \
-       e12, e12val, \
-       e13, e13val, \
-       e14, e14val, \
-       e15, e15val) \
-( \
-       ((u64)(e0val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e0##_SHIFT) | \
-       ((u64)(e1val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e1##_SHIFT) | \
-       ((u64)(e2val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e2##_SHIFT) | \
-       ((u64)(e3val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e3##_SHIFT) | \
-       ((u64)(e4val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e4##_SHIFT) | \
-       ((u64)(e5val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e5##_SHIFT) | \
-       ((u64)(e6val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e6##_SHIFT) | \
-       ((u64)(e7val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e7##_SHIFT) | \
-       ((u64)(e8val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e8##_SHIFT) | \
-       ((u64)(e9val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e9##_SHIFT) | \
-       ((u64)(e10val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e10##_SHIFT) | \
-       ((u64)(e11val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e11##_SHIFT) | \
-       ((u64)(e12val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e12##_SHIFT) | \
-       ((u64)(e13val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e13##_SHIFT) | \
-       ((u64)(e14val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e14##_SHIFT) | \
-       ((u64)(e15val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e15##_SHIFT) \
-)
-
-/* all CceStatus sub-block freeze bits */
-#define ALL_FROZE (CCE_STATUS_SDMA_FROZE_SMASK \
-                       | CCE_STATUS_RXE_FROZE_SMASK \
-                       | CCE_STATUS_TXE_FROZE_SMASK \
-                       | CCE_STATUS_TXE_PIO_FROZE_SMASK)
-/* all CceStatus sub-block TXE pause bits */
-#define ALL_TXE_PAUSE (CCE_STATUS_TXE_PIO_PAUSED_SMASK \
-                       | CCE_STATUS_TXE_PAUSED_SMASK \
-                       | CCE_STATUS_SDMA_PAUSED_SMASK)
-/* all CceStatus sub-block RXE pause bits */
-#define ALL_RXE_PAUSE CCE_STATUS_RXE_PAUSED_SMASK
-
-/*
- * CCE Error flags.
- */
-static struct flag_table cce_err_status_flags[] = {
-/* 0*/ FLAG_ENTRY0("CceCsrParityErr",
-               CCE_ERR_STATUS_CCE_CSR_PARITY_ERR_SMASK),
-/* 1*/ FLAG_ENTRY0("CceCsrReadBadAddrErr",
-               CCE_ERR_STATUS_CCE_CSR_READ_BAD_ADDR_ERR_SMASK),
-/* 2*/ FLAG_ENTRY0("CceCsrWriteBadAddrErr",
-               CCE_ERR_STATUS_CCE_CSR_WRITE_BAD_ADDR_ERR_SMASK),
-/* 3*/ FLAG_ENTRY0("CceTrgtAsyncFifoParityErr",
-               CCE_ERR_STATUS_CCE_TRGT_ASYNC_FIFO_PARITY_ERR_SMASK),
-/* 4*/ FLAG_ENTRY0("CceTrgtAccessErr",
-               CCE_ERR_STATUS_CCE_TRGT_ACCESS_ERR_SMASK),
-/* 5*/ FLAG_ENTRY0("CceRspdDataParityErr",
-               CCE_ERR_STATUS_CCE_RSPD_DATA_PARITY_ERR_SMASK),
-/* 6*/ FLAG_ENTRY0("CceCli0AsyncFifoParityErr",
-               CCE_ERR_STATUS_CCE_CLI0_ASYNC_FIFO_PARITY_ERR_SMASK),
-/* 7*/ FLAG_ENTRY0("CceCsrCfgBusParityErr",
-               CCE_ERR_STATUS_CCE_CSR_CFG_BUS_PARITY_ERR_SMASK),
-/* 8*/ FLAG_ENTRY0("CceCli2AsyncFifoParityErr",
-               CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK),
-/* 9*/ FLAG_ENTRY0("CceCli1AsyncFifoPioCrdtParityErr",
-           CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR_SMASK),
-/*10*/ FLAG_ENTRY0("CceCli1AsyncFifoPioCrdtParityErr",
-           CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR_SMASK),
-/*11*/ FLAG_ENTRY0("CceCli1AsyncFifoRxdmaParityError",
-           CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERROR_SMASK),
-/*12*/ FLAG_ENTRY0("CceCli1AsyncFifoDbgParityError",
-               CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERROR_SMASK),
-/*13*/ FLAG_ENTRY0("PcicRetryMemCorErr",
-               CCE_ERR_STATUS_PCIC_RETRY_MEM_COR_ERR_SMASK),
-/*14*/ FLAG_ENTRY0("PcicRetryMemCorErr",
-               CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_COR_ERR_SMASK),
-/*15*/ FLAG_ENTRY0("PcicPostHdQCorErr",
-               CCE_ERR_STATUS_PCIC_POST_HD_QCOR_ERR_SMASK),
-/*16*/ FLAG_ENTRY0("PcicPostHdQCorErr",
-               CCE_ERR_STATUS_PCIC_POST_DAT_QCOR_ERR_SMASK),
-/*17*/ FLAG_ENTRY0("PcicPostHdQCorErr",
-               CCE_ERR_STATUS_PCIC_CPL_HD_QCOR_ERR_SMASK),
-/*18*/ FLAG_ENTRY0("PcicCplDatQCorErr",
-               CCE_ERR_STATUS_PCIC_CPL_DAT_QCOR_ERR_SMASK),
-/*19*/ FLAG_ENTRY0("PcicNPostHQParityErr",
-               CCE_ERR_STATUS_PCIC_NPOST_HQ_PARITY_ERR_SMASK),
-/*20*/ FLAG_ENTRY0("PcicNPostDatQParityErr",
-               CCE_ERR_STATUS_PCIC_NPOST_DAT_QPARITY_ERR_SMASK),
-/*21*/ FLAG_ENTRY0("PcicRetryMemUncErr",
-               CCE_ERR_STATUS_PCIC_RETRY_MEM_UNC_ERR_SMASK),
-/*22*/ FLAG_ENTRY0("PcicRetrySotMemUncErr",
-               CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_UNC_ERR_SMASK),
-/*23*/ FLAG_ENTRY0("PcicPostHdQUncErr",
-               CCE_ERR_STATUS_PCIC_POST_HD_QUNC_ERR_SMASK),
-/*24*/ FLAG_ENTRY0("PcicPostDatQUncErr",
-               CCE_ERR_STATUS_PCIC_POST_DAT_QUNC_ERR_SMASK),
-/*25*/ FLAG_ENTRY0("PcicCplHdQUncErr",
-               CCE_ERR_STATUS_PCIC_CPL_HD_QUNC_ERR_SMASK),
-/*26*/ FLAG_ENTRY0("PcicCplDatQUncErr",
-               CCE_ERR_STATUS_PCIC_CPL_DAT_QUNC_ERR_SMASK),
-/*27*/ FLAG_ENTRY0("PcicTransmitFrontParityErr",
-               CCE_ERR_STATUS_PCIC_TRANSMIT_FRONT_PARITY_ERR_SMASK),
-/*28*/ FLAG_ENTRY0("PcicTransmitBackParityErr",
-               CCE_ERR_STATUS_PCIC_TRANSMIT_BACK_PARITY_ERR_SMASK),
-/*29*/ FLAG_ENTRY0("PcicReceiveParityErr",
-               CCE_ERR_STATUS_PCIC_RECEIVE_PARITY_ERR_SMASK),
-/*30*/ FLAG_ENTRY0("CceTrgtCplTimeoutErr",
-               CCE_ERR_STATUS_CCE_TRGT_CPL_TIMEOUT_ERR_SMASK),
-/*31*/ FLAG_ENTRY0("LATriggered",
-               CCE_ERR_STATUS_LA_TRIGGERED_SMASK),
-/*32*/ FLAG_ENTRY0("CceSegReadBadAddrErr",
-               CCE_ERR_STATUS_CCE_SEG_READ_BAD_ADDR_ERR_SMASK),
-/*33*/ FLAG_ENTRY0("CceSegWriteBadAddrErr",
-               CCE_ERR_STATUS_CCE_SEG_WRITE_BAD_ADDR_ERR_SMASK),
-/*34*/ FLAG_ENTRY0("CceRcplAsyncFifoParityErr",
-               CCE_ERR_STATUS_CCE_RCPL_ASYNC_FIFO_PARITY_ERR_SMASK),
-/*35*/ FLAG_ENTRY0("CceRxdmaConvFifoParityErr",
-               CCE_ERR_STATUS_CCE_RXDMA_CONV_FIFO_PARITY_ERR_SMASK),
-/*36*/ FLAG_ENTRY0("CceMsixTableCorErr",
-               CCE_ERR_STATUS_CCE_MSIX_TABLE_COR_ERR_SMASK),
-/*37*/ FLAG_ENTRY0("CceMsixTableUncErr",
-               CCE_ERR_STATUS_CCE_MSIX_TABLE_UNC_ERR_SMASK),
-/*38*/ FLAG_ENTRY0("CceIntMapCorErr",
-               CCE_ERR_STATUS_CCE_INT_MAP_COR_ERR_SMASK),
-/*39*/ FLAG_ENTRY0("CceIntMapUncErr",
-               CCE_ERR_STATUS_CCE_INT_MAP_UNC_ERR_SMASK),
-/*40*/ FLAG_ENTRY0("CceMsixCsrParityErr",
-               CCE_ERR_STATUS_CCE_MSIX_CSR_PARITY_ERR_SMASK),
-/*41-63 reserved*/
-};
-
-/*
- * Misc Error flags
- */
-#define MES(text) MISC_ERR_STATUS_MISC_##text##_ERR_SMASK
-static struct flag_table misc_err_status_flags[] = {
-/* 0*/ FLAG_ENTRY0("CSR_PARITY", MES(CSR_PARITY)),
-/* 1*/ FLAG_ENTRY0("CSR_READ_BAD_ADDR", MES(CSR_READ_BAD_ADDR)),
-/* 2*/ FLAG_ENTRY0("CSR_WRITE_BAD_ADDR", MES(CSR_WRITE_BAD_ADDR)),
-/* 3*/ FLAG_ENTRY0("SBUS_WRITE_FAILED", MES(SBUS_WRITE_FAILED)),
-/* 4*/ FLAG_ENTRY0("KEY_MISMATCH", MES(KEY_MISMATCH)),
-/* 5*/ FLAG_ENTRY0("FW_AUTH_FAILED", MES(FW_AUTH_FAILED)),
-/* 6*/ FLAG_ENTRY0("EFUSE_CSR_PARITY", MES(EFUSE_CSR_PARITY)),
-/* 7*/ FLAG_ENTRY0("EFUSE_READ_BAD_ADDR", MES(EFUSE_READ_BAD_ADDR)),
-/* 8*/ FLAG_ENTRY0("EFUSE_WRITE", MES(EFUSE_WRITE)),
-/* 9*/ FLAG_ENTRY0("EFUSE_DONE_PARITY", MES(EFUSE_DONE_PARITY)),
-/*10*/ FLAG_ENTRY0("INVALID_EEP_CMD", MES(INVALID_EEP_CMD)),
-/*11*/ FLAG_ENTRY0("MBIST_FAIL", MES(MBIST_FAIL)),
-/*12*/ FLAG_ENTRY0("PLL_LOCK_FAIL", MES(PLL_LOCK_FAIL))
-};
-
-/*
- * TXE PIO Error flags and consequences
- */
-static struct flag_table pio_err_status_flags[] = {
-/* 0*/ FLAG_ENTRY("PioWriteBadCtxt",
-       SEC_WRITE_DROPPED,
-       SEND_PIO_ERR_STATUS_PIO_WRITE_BAD_CTXT_ERR_SMASK),
-/* 1*/ FLAG_ENTRY("PioWriteAddrParity",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK),
-/* 2*/ FLAG_ENTRY("PioCsrParity",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK),
-/* 3*/ FLAG_ENTRY("PioSbMemFifo0",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK),
-/* 4*/ FLAG_ENTRY("PioSbMemFifo1",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK),
-/* 5*/ FLAG_ENTRY("PioPccFifoParity",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK),
-/* 6*/ FLAG_ENTRY("PioPecFifoParity",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK),
-/* 7*/ FLAG_ENTRY("PioSbrdctlCrrelParity",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK),
-/* 8*/ FLAG_ENTRY("PioSbrdctrlCrrelFifoParity",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK),
-/* 9*/ FLAG_ENTRY("PioPktEvictFifoParityErr",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK),
-/*10*/ FLAG_ENTRY("PioSmPktResetParity",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK),
-/*11*/ FLAG_ENTRY("PioVlLenMemBank0Unc",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK),
-/*12*/ FLAG_ENTRY("PioVlLenMemBank1Unc",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK),
-/*13*/ FLAG_ENTRY("PioVlLenMemBank0Cor",
-       0,
-       SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_COR_ERR_SMASK),
-/*14*/ FLAG_ENTRY("PioVlLenMemBank1Cor",
-       0,
-       SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_COR_ERR_SMASK),
-/*15*/ FLAG_ENTRY("PioCreditRetFifoParity",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK),
-/*16*/ FLAG_ENTRY("PioPpmcPblFifo",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK),
-/*17*/ FLAG_ENTRY("PioInitSmIn",
-       0,
-       SEND_PIO_ERR_STATUS_PIO_INIT_SM_IN_ERR_SMASK),
-/*18*/ FLAG_ENTRY("PioPktEvictSmOrArbSm",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK),
-/*19*/ FLAG_ENTRY("PioHostAddrMemUnc",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK),
-/*20*/ FLAG_ENTRY("PioHostAddrMemCor",
-       0,
-       SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_COR_ERR_SMASK),
-/*21*/ FLAG_ENTRY("PioWriteDataParity",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK),
-/*22*/ FLAG_ENTRY("PioStateMachine",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK),
-/*23*/ FLAG_ENTRY("PioWriteQwValidParity",
-       SEC_WRITE_DROPPED | SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK),
-/*24*/ FLAG_ENTRY("PioBlockQwCountParity",
-       SEC_WRITE_DROPPED | SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK),
-/*25*/ FLAG_ENTRY("PioVlfVlLenParity",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK),
-/*26*/ FLAG_ENTRY("PioVlfSopParity",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK),
-/*27*/ FLAG_ENTRY("PioVlFifoParity",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK),
-/*28*/ FLAG_ENTRY("PioPpmcBqcMemParity",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK),
-/*29*/ FLAG_ENTRY("PioPpmcSopLen",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK),
-/*30-31 reserved*/
-/*32*/ FLAG_ENTRY("PioCurrentFreeCntParity",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK),
-/*33*/ FLAG_ENTRY("PioLastReturnedCntParity",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK),
-/*34*/ FLAG_ENTRY("PioPccSopHeadParity",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK),
-/*35*/ FLAG_ENTRY("PioPecSopHeadParityErr",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK),
-/*36-63 reserved*/
-};
-
-/* TXE PIO errors that cause an SPC freeze */
-#define ALL_PIO_FREEZE_ERR \
-       (SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK)
-
-/*
- * TXE SDMA Error flags
- */
-static struct flag_table sdma_err_status_flags[] = {
-/* 0*/ FLAG_ENTRY0("SDmaRpyTagErr",
-               SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK),
-/* 1*/ FLAG_ENTRY0("SDmaCsrParityErr",
-               SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK),
-/* 2*/ FLAG_ENTRY0("SDmaPcieReqTrackingUncErr",
-               SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK),
-/* 3*/ FLAG_ENTRY0("SDmaPcieReqTrackingCorErr",
-               SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_COR_ERR_SMASK),
-/*04-63 reserved*/
-};
-
-/* TXE SDMA errors that cause an SPC freeze */
-#define ALL_SDMA_FREEZE_ERR  \
-               (SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK \
-               | SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK \
-               | SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK)
-
-/* SendEgressErrInfo bits that correspond to a PortXmitDiscard counter */
-#define PORT_DISCARD_EGRESS_ERRS \
-       (SEND_EGRESS_ERR_INFO_TOO_LONG_IB_PACKET_ERR_SMASK \
-       | SEND_EGRESS_ERR_INFO_VL_MAPPING_ERR_SMASK \
-       | SEND_EGRESS_ERR_INFO_VL_ERR_SMASK)
-
-/*
- * TXE Egress Error flags
- */
-#define SEES(text) SEND_EGRESS_ERR_STATUS_##text##_ERR_SMASK
-static struct flag_table egress_err_status_flags[] = {
-/* 0*/ FLAG_ENTRY0("TxPktIntegrityMemCorErr", SEES(TX_PKT_INTEGRITY_MEM_COR)),
-/* 1*/ FLAG_ENTRY0("TxPktIntegrityMemUncErr", SEES(TX_PKT_INTEGRITY_MEM_UNC)),
-/* 2 reserved */
-/* 3*/ FLAG_ENTRY0("TxEgressFifoUnderrunOrParityErr",
-               SEES(TX_EGRESS_FIFO_UNDERRUN_OR_PARITY)),
-/* 4*/ FLAG_ENTRY0("TxLinkdownErr", SEES(TX_LINKDOWN)),
-/* 5*/ FLAG_ENTRY0("TxIncorrectLinkStateErr", SEES(TX_INCORRECT_LINK_STATE)),
-/* 6 reserved */
-/* 7*/ FLAG_ENTRY0("TxPioLaunchIntfParityErr",
-               SEES(TX_PIO_LAUNCH_INTF_PARITY)),
-/* 8*/ FLAG_ENTRY0("TxSdmaLaunchIntfParityErr",
-               SEES(TX_SDMA_LAUNCH_INTF_PARITY)),
-/* 9-10 reserved */
-/*11*/ FLAG_ENTRY0("TxSbrdCtlStateMachineParityErr",
-               SEES(TX_SBRD_CTL_STATE_MACHINE_PARITY)),
-/*12*/ FLAG_ENTRY0("TxIllegalVLErr", SEES(TX_ILLEGAL_VL)),
-/*13*/ FLAG_ENTRY0("TxLaunchCsrParityErr", SEES(TX_LAUNCH_CSR_PARITY)),
-/*14*/ FLAG_ENTRY0("TxSbrdCtlCsrParityErr", SEES(TX_SBRD_CTL_CSR_PARITY)),
-/*15*/ FLAG_ENTRY0("TxConfigParityErr", SEES(TX_CONFIG_PARITY)),
-/*16*/ FLAG_ENTRY0("TxSdma0DisallowedPacketErr",
-               SEES(TX_SDMA0_DISALLOWED_PACKET)),
-/*17*/ FLAG_ENTRY0("TxSdma1DisallowedPacketErr",
-               SEES(TX_SDMA1_DISALLOWED_PACKET)),
-/*18*/ FLAG_ENTRY0("TxSdma2DisallowedPacketErr",
-               SEES(TX_SDMA2_DISALLOWED_PACKET)),
-/*19*/ FLAG_ENTRY0("TxSdma3DisallowedPacketErr",
-               SEES(TX_SDMA3_DISALLOWED_PACKET)),
-/*20*/ FLAG_ENTRY0("TxSdma4DisallowedPacketErr",
-               SEES(TX_SDMA4_DISALLOWED_PACKET)),
-/*21*/ FLAG_ENTRY0("TxSdma5DisallowedPacketErr",
-               SEES(TX_SDMA5_DISALLOWED_PACKET)),
-/*22*/ FLAG_ENTRY0("TxSdma6DisallowedPacketErr",
-               SEES(TX_SDMA6_DISALLOWED_PACKET)),
-/*23*/ FLAG_ENTRY0("TxSdma7DisallowedPacketErr",
-               SEES(TX_SDMA7_DISALLOWED_PACKET)),
-/*24*/ FLAG_ENTRY0("TxSdma8DisallowedPacketErr",
-               SEES(TX_SDMA8_DISALLOWED_PACKET)),
-/*25*/ FLAG_ENTRY0("TxSdma9DisallowedPacketErr",
-               SEES(TX_SDMA9_DISALLOWED_PACKET)),
-/*26*/ FLAG_ENTRY0("TxSdma10DisallowedPacketErr",
-               SEES(TX_SDMA10_DISALLOWED_PACKET)),
-/*27*/ FLAG_ENTRY0("TxSdma11DisallowedPacketErr",
-               SEES(TX_SDMA11_DISALLOWED_PACKET)),
-/*28*/ FLAG_ENTRY0("TxSdma12DisallowedPacketErr",
-               SEES(TX_SDMA12_DISALLOWED_PACKET)),
-/*29*/ FLAG_ENTRY0("TxSdma13DisallowedPacketErr",
-               SEES(TX_SDMA13_DISALLOWED_PACKET)),
-/*30*/ FLAG_ENTRY0("TxSdma14DisallowedPacketErr",
-               SEES(TX_SDMA14_DISALLOWED_PACKET)),
-/*31*/ FLAG_ENTRY0("TxSdma15DisallowedPacketErr",
-               SEES(TX_SDMA15_DISALLOWED_PACKET)),
-/*32*/ FLAG_ENTRY0("TxLaunchFifo0UncOrParityErr",
-               SEES(TX_LAUNCH_FIFO0_UNC_OR_PARITY)),
-/*33*/ FLAG_ENTRY0("TxLaunchFifo1UncOrParityErr",
-               SEES(TX_LAUNCH_FIFO1_UNC_OR_PARITY)),
-/*34*/ FLAG_ENTRY0("TxLaunchFifo2UncOrParityErr",
-               SEES(TX_LAUNCH_FIFO2_UNC_OR_PARITY)),
-/*35*/ FLAG_ENTRY0("TxLaunchFifo3UncOrParityErr",
-               SEES(TX_LAUNCH_FIFO3_UNC_OR_PARITY)),
-/*36*/ FLAG_ENTRY0("TxLaunchFifo4UncOrParityErr",
-               SEES(TX_LAUNCH_FIFO4_UNC_OR_PARITY)),
-/*37*/ FLAG_ENTRY0("TxLaunchFifo5UncOrParityErr",
-               SEES(TX_LAUNCH_FIFO5_UNC_OR_PARITY)),
-/*38*/ FLAG_ENTRY0("TxLaunchFifo6UncOrParityErr",
-               SEES(TX_LAUNCH_FIFO6_UNC_OR_PARITY)),
-/*39*/ FLAG_ENTRY0("TxLaunchFifo7UncOrParityErr",
-               SEES(TX_LAUNCH_FIFO7_UNC_OR_PARITY)),
-/*40*/ FLAG_ENTRY0("TxLaunchFifo8UncOrParityErr",
-               SEES(TX_LAUNCH_FIFO8_UNC_OR_PARITY)),
-/*41*/ FLAG_ENTRY0("TxCreditReturnParityErr", SEES(TX_CREDIT_RETURN_PARITY)),
-/*42*/ FLAG_ENTRY0("TxSbHdrUncErr", SEES(TX_SB_HDR_UNC)),
-/*43*/ FLAG_ENTRY0("TxReadSdmaMemoryUncErr", SEES(TX_READ_SDMA_MEMORY_UNC)),
-/*44*/ FLAG_ENTRY0("TxReadPioMemoryUncErr", SEES(TX_READ_PIO_MEMORY_UNC)),
-/*45*/ FLAG_ENTRY0("TxEgressFifoUncErr", SEES(TX_EGRESS_FIFO_UNC)),
-/*46*/ FLAG_ENTRY0("TxHcrcInsertionErr", SEES(TX_HCRC_INSERTION)),
-/*47*/ FLAG_ENTRY0("TxCreditReturnVLErr", SEES(TX_CREDIT_RETURN_VL)),
-/*48*/ FLAG_ENTRY0("TxLaunchFifo0CorErr", SEES(TX_LAUNCH_FIFO0_COR)),
-/*49*/ FLAG_ENTRY0("TxLaunchFifo1CorErr", SEES(TX_LAUNCH_FIFO1_COR)),
-/*50*/ FLAG_ENTRY0("TxLaunchFifo2CorErr", SEES(TX_LAUNCH_FIFO2_COR)),
-/*51*/ FLAG_ENTRY0("TxLaunchFifo3CorErr", SEES(TX_LAUNCH_FIFO3_COR)),
-/*52*/ FLAG_ENTRY0("TxLaunchFifo4CorErr", SEES(TX_LAUNCH_FIFO4_COR)),
-/*53*/ FLAG_ENTRY0("TxLaunchFifo5CorErr", SEES(TX_LAUNCH_FIFO5_COR)),
-/*54*/ FLAG_ENTRY0("TxLaunchFifo6CorErr", SEES(TX_LAUNCH_FIFO6_COR)),
-/*55*/ FLAG_ENTRY0("TxLaunchFifo7CorErr", SEES(TX_LAUNCH_FIFO7_COR)),
-/*56*/ FLAG_ENTRY0("TxLaunchFifo8CorErr", SEES(TX_LAUNCH_FIFO8_COR)),
-/*57*/ FLAG_ENTRY0("TxCreditOverrunErr", SEES(TX_CREDIT_OVERRUN)),
-/*58*/ FLAG_ENTRY0("TxSbHdrCorErr", SEES(TX_SB_HDR_COR)),
-/*59*/ FLAG_ENTRY0("TxReadSdmaMemoryCorErr", SEES(TX_READ_SDMA_MEMORY_COR)),
-/*60*/ FLAG_ENTRY0("TxReadPioMemoryCorErr", SEES(TX_READ_PIO_MEMORY_COR)),
-/*61*/ FLAG_ENTRY0("TxEgressFifoCorErr", SEES(TX_EGRESS_FIFO_COR)),
-/*62*/ FLAG_ENTRY0("TxReadSdmaMemoryCsrUncErr",
-               SEES(TX_READ_SDMA_MEMORY_CSR_UNC)),
-/*63*/ FLAG_ENTRY0("TxReadPioMemoryCsrUncErr",
-               SEES(TX_READ_PIO_MEMORY_CSR_UNC)),
-};
-
-/*
- * TXE Egress Error Info flags
- */
-#define SEEI(text) SEND_EGRESS_ERR_INFO_##text##_ERR_SMASK
-static struct flag_table egress_err_info_flags[] = {
-/* 0*/ FLAG_ENTRY0("Reserved", 0ull),
-/* 1*/ FLAG_ENTRY0("VLErr", SEEI(VL)),
-/* 2*/ FLAG_ENTRY0("JobKeyErr", SEEI(JOB_KEY)),
-/* 3*/ FLAG_ENTRY0("JobKeyErr", SEEI(JOB_KEY)),
-/* 4*/ FLAG_ENTRY0("PartitionKeyErr", SEEI(PARTITION_KEY)),
-/* 5*/ FLAG_ENTRY0("SLIDErr", SEEI(SLID)),
-/* 6*/ FLAG_ENTRY0("OpcodeErr", SEEI(OPCODE)),
-/* 7*/ FLAG_ENTRY0("VLMappingErr", SEEI(VL_MAPPING)),
-/* 8*/ FLAG_ENTRY0("RawErr", SEEI(RAW)),
-/* 9*/ FLAG_ENTRY0("RawIPv6Err", SEEI(RAW_IPV6)),
-/*10*/ FLAG_ENTRY0("GRHErr", SEEI(GRH)),
-/*11*/ FLAG_ENTRY0("BypassErr", SEEI(BYPASS)),
-/*12*/ FLAG_ENTRY0("KDETHPacketsErr", SEEI(KDETH_PACKETS)),
-/*13*/ FLAG_ENTRY0("NonKDETHPacketsErr", SEEI(NON_KDETH_PACKETS)),
-/*14*/ FLAG_ENTRY0("TooSmallIBPacketsErr", SEEI(TOO_SMALL_IB_PACKETS)),
-/*15*/ FLAG_ENTRY0("TooSmallBypassPacketsErr", SEEI(TOO_SMALL_BYPASS_PACKETS)),
-/*16*/ FLAG_ENTRY0("PbcTestErr", SEEI(PBC_TEST)),
-/*17*/ FLAG_ENTRY0("BadPktLenErr", SEEI(BAD_PKT_LEN)),
-/*18*/ FLAG_ENTRY0("TooLongIBPacketErr", SEEI(TOO_LONG_IB_PACKET)),
-/*19*/ FLAG_ENTRY0("TooLongBypassPacketsErr", SEEI(TOO_LONG_BYPASS_PACKETS)),
-/*20*/ FLAG_ENTRY0("PbcStaticRateControlErr", SEEI(PBC_STATIC_RATE_CONTROL)),
-/*21*/ FLAG_ENTRY0("BypassBadPktLenErr", SEEI(BAD_PKT_LEN)),
-};
-
-/* TXE Egress errors that cause an SPC freeze */
-#define ALL_TXE_EGRESS_FREEZE_ERR \
-       (SEES(TX_EGRESS_FIFO_UNDERRUN_OR_PARITY) \
-       | SEES(TX_PIO_LAUNCH_INTF_PARITY) \
-       | SEES(TX_SDMA_LAUNCH_INTF_PARITY) \
-       | SEES(TX_SBRD_CTL_STATE_MACHINE_PARITY) \
-       | SEES(TX_LAUNCH_CSR_PARITY) \
-       | SEES(TX_SBRD_CTL_CSR_PARITY) \
-       | SEES(TX_CONFIG_PARITY) \
-       | SEES(TX_LAUNCH_FIFO0_UNC_OR_PARITY) \
-       | SEES(TX_LAUNCH_FIFO1_UNC_OR_PARITY) \
-       | SEES(TX_LAUNCH_FIFO2_UNC_OR_PARITY) \
-       | SEES(TX_LAUNCH_FIFO3_UNC_OR_PARITY) \
-       | SEES(TX_LAUNCH_FIFO4_UNC_OR_PARITY) \
-       | SEES(TX_LAUNCH_FIFO5_UNC_OR_PARITY) \
-       | SEES(TX_LAUNCH_FIFO6_UNC_OR_PARITY) \
-       | SEES(TX_LAUNCH_FIFO7_UNC_OR_PARITY) \
-       | SEES(TX_LAUNCH_FIFO8_UNC_OR_PARITY) \
-       | SEES(TX_CREDIT_RETURN_PARITY))
-
-/*
- * TXE Send error flags
- */
-#define SES(name) SEND_ERR_STATUS_SEND_##name##_ERR_SMASK
-static struct flag_table send_err_status_flags[] = {
-/* 0*/ FLAG_ENTRY0("SendCsrParityErr", SES(CSR_PARITY)),
-/* 1*/ FLAG_ENTRY0("SendCsrReadBadAddrErr", SES(CSR_READ_BAD_ADDR)),
-/* 2*/ FLAG_ENTRY0("SendCsrWriteBadAddrErr", SES(CSR_WRITE_BAD_ADDR))
-};
-
-/*
- * TXE Send Context Error flags and consequences
- */
-static struct flag_table sc_err_status_flags[] = {
-/* 0*/ FLAG_ENTRY("InconsistentSop",
-               SEC_PACKET_DROPPED | SEC_SC_HALTED,
-               SEND_CTXT_ERR_STATUS_PIO_INCONSISTENT_SOP_ERR_SMASK),
-/* 1*/ FLAG_ENTRY("DisallowedPacket",
-               SEC_PACKET_DROPPED | SEC_SC_HALTED,
-               SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK),
-/* 2*/ FLAG_ENTRY("WriteCrossesBoundary",
-               SEC_WRITE_DROPPED | SEC_SC_HALTED,
-               SEND_CTXT_ERR_STATUS_PIO_WRITE_CROSSES_BOUNDARY_ERR_SMASK),
-/* 3*/ FLAG_ENTRY("WriteOverflow",
-               SEC_WRITE_DROPPED | SEC_SC_HALTED,
-               SEND_CTXT_ERR_STATUS_PIO_WRITE_OVERFLOW_ERR_SMASK),
-/* 4*/ FLAG_ENTRY("WriteOutOfBounds",
-               SEC_WRITE_DROPPED | SEC_SC_HALTED,
-               SEND_CTXT_ERR_STATUS_PIO_WRITE_OUT_OF_BOUNDS_ERR_SMASK),
-/* 5-63 reserved*/
-};
-
-/*
- * RXE Receive Error flags
- */
-#define RXES(name) RCV_ERR_STATUS_RX_##name##_ERR_SMASK
-static struct flag_table rxe_err_status_flags[] = {
-/* 0*/ FLAG_ENTRY0("RxDmaCsrCorErr", RXES(DMA_CSR_COR)),
-/* 1*/ FLAG_ENTRY0("RxDcIntfParityErr", RXES(DC_INTF_PARITY)),
-/* 2*/ FLAG_ENTRY0("RxRcvHdrUncErr", RXES(RCV_HDR_UNC)),
-/* 3*/ FLAG_ENTRY0("RxRcvHdrCorErr", RXES(RCV_HDR_COR)),
-/* 4*/ FLAG_ENTRY0("RxRcvDataUncErr", RXES(RCV_DATA_UNC)),
-/* 5*/ FLAG_ENTRY0("RxRcvDataCorErr", RXES(RCV_DATA_COR)),
-/* 6*/ FLAG_ENTRY0("RxRcvQpMapTableUncErr", RXES(RCV_QP_MAP_TABLE_UNC)),
-/* 7*/ FLAG_ENTRY0("RxRcvQpMapTableCorErr", RXES(RCV_QP_MAP_TABLE_COR)),
-/* 8*/ FLAG_ENTRY0("RxRcvCsrParityErr", RXES(RCV_CSR_PARITY)),
-/* 9*/ FLAG_ENTRY0("RxDcSopEopParityErr", RXES(DC_SOP_EOP_PARITY)),
-/*10*/ FLAG_ENTRY0("RxDmaFlagUncErr", RXES(DMA_FLAG_UNC)),
-/*11*/ FLAG_ENTRY0("RxDmaFlagCorErr", RXES(DMA_FLAG_COR)),
-/*12*/ FLAG_ENTRY0("RxRcvFsmEncodingErr", RXES(RCV_FSM_ENCODING)),
-/*13*/ FLAG_ENTRY0("RxRbufFreeListUncErr", RXES(RBUF_FREE_LIST_UNC)),
-/*14*/ FLAG_ENTRY0("RxRbufFreeListCorErr", RXES(RBUF_FREE_LIST_COR)),
-/*15*/ FLAG_ENTRY0("RxRbufLookupDesRegUncErr", RXES(RBUF_LOOKUP_DES_REG_UNC)),
-/*16*/ FLAG_ENTRY0("RxRbufLookupDesRegUncCorErr",
-               RXES(RBUF_LOOKUP_DES_REG_UNC_COR)),
-/*17*/ FLAG_ENTRY0("RxRbufLookupDesUncErr", RXES(RBUF_LOOKUP_DES_UNC)),
-/*18*/ FLAG_ENTRY0("RxRbufLookupDesCorErr", RXES(RBUF_LOOKUP_DES_COR)),
-/*19*/ FLAG_ENTRY0("RxRbufBlockListReadUncErr",
-               RXES(RBUF_BLOCK_LIST_READ_UNC)),
-/*20*/ FLAG_ENTRY0("RxRbufBlockListReadCorErr",
-               RXES(RBUF_BLOCK_LIST_READ_COR)),
-/*21*/ FLAG_ENTRY0("RxRbufCsrQHeadBufNumParityErr",
-               RXES(RBUF_CSR_QHEAD_BUF_NUM_PARITY)),
-/*22*/ FLAG_ENTRY0("RxRbufCsrQEntCntParityErr",
-               RXES(RBUF_CSR_QENT_CNT_PARITY)),
-/*23*/ FLAG_ENTRY0("RxRbufCsrQNextBufParityErr",
-               RXES(RBUF_CSR_QNEXT_BUF_PARITY)),
-/*24*/ FLAG_ENTRY0("RxRbufCsrQVldBitParityErr",
-               RXES(RBUF_CSR_QVLD_BIT_PARITY)),
-/*25*/ FLAG_ENTRY0("RxRbufCsrQHdPtrParityErr", RXES(RBUF_CSR_QHD_PTR_PARITY)),
-/*26*/ FLAG_ENTRY0("RxRbufCsrQTlPtrParityErr", RXES(RBUF_CSR_QTL_PTR_PARITY)),
-/*27*/ FLAG_ENTRY0("RxRbufCsrQNumOfPktParityErr",
-               RXES(RBUF_CSR_QNUM_OF_PKT_PARITY)),
-/*28*/ FLAG_ENTRY0("RxRbufCsrQEOPDWParityErr", RXES(RBUF_CSR_QEOPDW_PARITY)),
-/*29*/ FLAG_ENTRY0("RxRbufCtxIdParityErr", RXES(RBUF_CTX_ID_PARITY)),
-/*30*/ FLAG_ENTRY0("RxRBufBadLookupErr", RXES(RBUF_BAD_LOOKUP)),
-/*31*/ FLAG_ENTRY0("RxRbufFullErr", RXES(RBUF_FULL)),
-/*32*/ FLAG_ENTRY0("RxRbufEmptyErr", RXES(RBUF_EMPTY)),
-/*33*/ FLAG_ENTRY0("RxRbufFlRdAddrParityErr", RXES(RBUF_FL_RD_ADDR_PARITY)),
-/*34*/ FLAG_ENTRY0("RxRbufFlWrAddrParityErr", RXES(RBUF_FL_WR_ADDR_PARITY)),
-/*35*/ FLAG_ENTRY0("RxRbufFlInitdoneParityErr",
-               RXES(RBUF_FL_INITDONE_PARITY)),
-/*36*/ FLAG_ENTRY0("RxRbufFlInitWrAddrParityErr",
-               RXES(RBUF_FL_INIT_WR_ADDR_PARITY)),
-/*37*/ FLAG_ENTRY0("RxRbufNextFreeBufUncErr", RXES(RBUF_NEXT_FREE_BUF_UNC)),
-/*38*/ FLAG_ENTRY0("RxRbufNextFreeBufCorErr", RXES(RBUF_NEXT_FREE_BUF_COR)),
-/*39*/ FLAG_ENTRY0("RxLookupDesPart1UncErr", RXES(LOOKUP_DES_PART1_UNC)),
-/*40*/ FLAG_ENTRY0("RxLookupDesPart1UncCorErr",
-               RXES(LOOKUP_DES_PART1_UNC_COR)),
-/*41*/ FLAG_ENTRY0("RxLookupDesPart2ParityErr",
-               RXES(LOOKUP_DES_PART2_PARITY)),
-/*42*/ FLAG_ENTRY0("RxLookupRcvArrayUncErr", RXES(LOOKUP_RCV_ARRAY_UNC)),
-/*43*/ FLAG_ENTRY0("RxLookupRcvArrayCorErr", RXES(LOOKUP_RCV_ARRAY_COR)),
-/*44*/ FLAG_ENTRY0("RxLookupCsrParityErr", RXES(LOOKUP_CSR_PARITY)),
-/*45*/ FLAG_ENTRY0("RxHqIntrCsrParityErr", RXES(HQ_INTR_CSR_PARITY)),
-/*46*/ FLAG_ENTRY0("RxHqIntrFsmErr", RXES(HQ_INTR_FSM)),
-/*47*/ FLAG_ENTRY0("RxRbufDescPart1UncErr", RXES(RBUF_DESC_PART1_UNC)),
-/*48*/ FLAG_ENTRY0("RxRbufDescPart1CorErr", RXES(RBUF_DESC_PART1_COR)),
-/*49*/ FLAG_ENTRY0("RxRbufDescPart2UncErr", RXES(RBUF_DESC_PART2_UNC)),
-/*50*/ FLAG_ENTRY0("RxRbufDescPart2CorErr", RXES(RBUF_DESC_PART2_COR)),
-/*51*/ FLAG_ENTRY0("RxDmaHdrFifoRdUncErr", RXES(DMA_HDR_FIFO_RD_UNC)),
-/*52*/ FLAG_ENTRY0("RxDmaHdrFifoRdCorErr", RXES(DMA_HDR_FIFO_RD_COR)),
-/*53*/ FLAG_ENTRY0("RxDmaDataFifoRdUncErr", RXES(DMA_DATA_FIFO_RD_UNC)),
-/*54*/ FLAG_ENTRY0("RxDmaDataFifoRdCorErr", RXES(DMA_DATA_FIFO_RD_COR)),
-/*55*/ FLAG_ENTRY0("RxRbufDataUncErr", RXES(RBUF_DATA_UNC)),
-/*56*/ FLAG_ENTRY0("RxRbufDataCorErr", RXES(RBUF_DATA_COR)),
-/*57*/ FLAG_ENTRY0("RxDmaCsrParityErr", RXES(DMA_CSR_PARITY)),
-/*58*/ FLAG_ENTRY0("RxDmaEqFsmEncodingErr", RXES(DMA_EQ_FSM_ENCODING)),
-/*59*/ FLAG_ENTRY0("RxDmaDqFsmEncodingErr", RXES(DMA_DQ_FSM_ENCODING)),
-/*60*/ FLAG_ENTRY0("RxDmaCsrUncErr", RXES(DMA_CSR_UNC)),
-/*61*/ FLAG_ENTRY0("RxCsrReadBadAddrErr", RXES(CSR_READ_BAD_ADDR)),
-/*62*/ FLAG_ENTRY0("RxCsrWriteBadAddrErr", RXES(CSR_WRITE_BAD_ADDR)),
-/*63*/ FLAG_ENTRY0("RxCsrParityErr", RXES(CSR_PARITY))
-};
-
-/* RXE errors that will trigger an SPC freeze */
-#define ALL_RXE_FREEZE_ERR  \
-       (RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_UNC_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RCV_CSR_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_DMA_FLAG_UNC_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RCV_FSM_ENCODING_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_FREE_LIST_UNC_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_UNC_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_UNC_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_CSR_QHEAD_BUF_NUM_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_CSR_QENT_CNT_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_CSR_QNEXT_BUF_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_CSR_QVLD_BIT_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_CSR_QHD_PTR_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_CSR_QTL_PTR_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_CSR_QNUM_OF_PKT_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_CSR_QEOPDW_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_CTX_ID_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_BAD_LOOKUP_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_FULL_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_EMPTY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_FL_RD_ADDR_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_FL_WR_ADDR_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_FL_INITDONE_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_UNC_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_COR_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_LOOKUP_DES_PART2_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_UNC_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_LOOKUP_CSR_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_HQ_INTR_CSR_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_HQ_INTR_FSM_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_DESC_PART1_UNC_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_DESC_PART1_COR_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_DESC_PART2_UNC_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_DATA_UNC_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_DMA_CSR_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_DMA_EQ_FSM_ENCODING_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_DMA_DQ_FSM_ENCODING_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_CSR_PARITY_ERR_SMASK)
-
-#define RXE_FREEZE_ABORT_MASK \
-       (RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK | \
-       RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK | \
-       RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK)
-
-/*
- * DCC Error Flags
- */
-#define DCCE(name) DCC_ERR_FLG_##name##_SMASK
-static struct flag_table dcc_err_flags[] = {
-       FLAG_ENTRY0("bad_l2_err", DCCE(BAD_L2_ERR)),
-       FLAG_ENTRY0("bad_sc_err", DCCE(BAD_SC_ERR)),
-       FLAG_ENTRY0("bad_mid_tail_err", DCCE(BAD_MID_TAIL_ERR)),
-       FLAG_ENTRY0("bad_preemption_err", DCCE(BAD_PREEMPTION_ERR)),
-       FLAG_ENTRY0("preemption_err", DCCE(PREEMPTION_ERR)),
-       FLAG_ENTRY0("preemptionvl15_err", DCCE(PREEMPTIONVL15_ERR)),
-       FLAG_ENTRY0("bad_vl_marker_err", DCCE(BAD_VL_MARKER_ERR)),
-       FLAG_ENTRY0("bad_dlid_target_err", DCCE(BAD_DLID_TARGET_ERR)),
-       FLAG_ENTRY0("bad_lver_err", DCCE(BAD_LVER_ERR)),
-       FLAG_ENTRY0("uncorrectable_err", DCCE(UNCORRECTABLE_ERR)),
-       FLAG_ENTRY0("bad_crdt_ack_err", DCCE(BAD_CRDT_ACK_ERR)),
-       FLAG_ENTRY0("unsup_pkt_type", DCCE(UNSUP_PKT_TYPE)),
-       FLAG_ENTRY0("bad_ctrl_flit_err", DCCE(BAD_CTRL_FLIT_ERR)),
-       FLAG_ENTRY0("event_cntr_parity_err", DCCE(EVENT_CNTR_PARITY_ERR)),
-       FLAG_ENTRY0("event_cntr_rollover_err", DCCE(EVENT_CNTR_ROLLOVER_ERR)),
-       FLAG_ENTRY0("link_err", DCCE(LINK_ERR)),
-       FLAG_ENTRY0("misc_cntr_rollover_err", DCCE(MISC_CNTR_ROLLOVER_ERR)),
-       FLAG_ENTRY0("bad_ctrl_dist_err", DCCE(BAD_CTRL_DIST_ERR)),
-       FLAG_ENTRY0("bad_tail_dist_err", DCCE(BAD_TAIL_DIST_ERR)),
-       FLAG_ENTRY0("bad_head_dist_err", DCCE(BAD_HEAD_DIST_ERR)),
-       FLAG_ENTRY0("nonvl15_state_err", DCCE(NONVL15_STATE_ERR)),
-       FLAG_ENTRY0("vl15_multi_err", DCCE(VL15_MULTI_ERR)),
-       FLAG_ENTRY0("bad_pkt_length_err", DCCE(BAD_PKT_LENGTH_ERR)),
-       FLAG_ENTRY0("unsup_vl_err", DCCE(UNSUP_VL_ERR)),
-       FLAG_ENTRY0("perm_nvl15_err", DCCE(PERM_NVL15_ERR)),
-       FLAG_ENTRY0("slid_zero_err", DCCE(SLID_ZERO_ERR)),
-       FLAG_ENTRY0("dlid_zero_err", DCCE(DLID_ZERO_ERR)),
-       FLAG_ENTRY0("length_mtu_err", DCCE(LENGTH_MTU_ERR)),
-       FLAG_ENTRY0("rx_early_drop_err", DCCE(RX_EARLY_DROP_ERR)),
-       FLAG_ENTRY0("late_short_err", DCCE(LATE_SHORT_ERR)),
-       FLAG_ENTRY0("late_long_err", DCCE(LATE_LONG_ERR)),
-       FLAG_ENTRY0("late_ebp_err", DCCE(LATE_EBP_ERR)),
-       FLAG_ENTRY0("fpe_tx_fifo_ovflw_err", DCCE(FPE_TX_FIFO_OVFLW_ERR)),
-       FLAG_ENTRY0("fpe_tx_fifo_unflw_err", DCCE(FPE_TX_FIFO_UNFLW_ERR)),
-       FLAG_ENTRY0("csr_access_blocked_host", DCCE(CSR_ACCESS_BLOCKED_HOST)),
-       FLAG_ENTRY0("csr_access_blocked_uc", DCCE(CSR_ACCESS_BLOCKED_UC)),
-       FLAG_ENTRY0("tx_ctrl_parity_err", DCCE(TX_CTRL_PARITY_ERR)),
-       FLAG_ENTRY0("tx_ctrl_parity_mbe_err", DCCE(TX_CTRL_PARITY_MBE_ERR)),
-       FLAG_ENTRY0("tx_sc_parity_err", DCCE(TX_SC_PARITY_ERR)),
-       FLAG_ENTRY0("rx_ctrl_parity_mbe_err", DCCE(RX_CTRL_PARITY_MBE_ERR)),
-       FLAG_ENTRY0("csr_parity_err", DCCE(CSR_PARITY_ERR)),
-       FLAG_ENTRY0("csr_inval_addr", DCCE(CSR_INVAL_ADDR)),
-       FLAG_ENTRY0("tx_byte_shft_parity_err", DCCE(TX_BYTE_SHFT_PARITY_ERR)),
-       FLAG_ENTRY0("rx_byte_shft_parity_err", DCCE(RX_BYTE_SHFT_PARITY_ERR)),
-       FLAG_ENTRY0("fmconfig_err", DCCE(FMCONFIG_ERR)),
-       FLAG_ENTRY0("rcvport_err", DCCE(RCVPORT_ERR)),
-};
-
-/*
- * LCB error flags
- */
-#define LCBE(name) DC_LCB_ERR_FLG_##name##_SMASK
-static struct flag_table lcb_err_flags[] = {
-/* 0*/ FLAG_ENTRY0("CSR_PARITY_ERR", LCBE(CSR_PARITY_ERR)),
-/* 1*/ FLAG_ENTRY0("INVALID_CSR_ADDR", LCBE(INVALID_CSR_ADDR)),
-/* 2*/ FLAG_ENTRY0("RST_FOR_FAILED_DESKEW", LCBE(RST_FOR_FAILED_DESKEW)),
-/* 3*/ FLAG_ENTRY0("ALL_LNS_FAILED_REINIT_TEST",
-               LCBE(ALL_LNS_FAILED_REINIT_TEST)),
-/* 4*/ FLAG_ENTRY0("LOST_REINIT_STALL_OR_TOS", LCBE(LOST_REINIT_STALL_OR_TOS)),
-/* 5*/ FLAG_ENTRY0("TX_LESS_THAN_FOUR_LNS", LCBE(TX_LESS_THAN_FOUR_LNS)),
-/* 6*/ FLAG_ENTRY0("RX_LESS_THAN_FOUR_LNS", LCBE(RX_LESS_THAN_FOUR_LNS)),
-/* 7*/ FLAG_ENTRY0("SEQ_CRC_ERR", LCBE(SEQ_CRC_ERR)),
-/* 8*/ FLAG_ENTRY0("REINIT_FROM_PEER", LCBE(REINIT_FROM_PEER)),
-/* 9*/ FLAG_ENTRY0("REINIT_FOR_LN_DEGRADE", LCBE(REINIT_FOR_LN_DEGRADE)),
-/*10*/ FLAG_ENTRY0("CRC_ERR_CNT_HIT_LIMIT", LCBE(CRC_ERR_CNT_HIT_LIMIT)),
-/*11*/ FLAG_ENTRY0("RCLK_STOPPED", LCBE(RCLK_STOPPED)),
-/*12*/ FLAG_ENTRY0("UNEXPECTED_REPLAY_MARKER", LCBE(UNEXPECTED_REPLAY_MARKER)),
-/*13*/ FLAG_ENTRY0("UNEXPECTED_ROUND_TRIP_MARKER",
-               LCBE(UNEXPECTED_ROUND_TRIP_MARKER)),
-/*14*/ FLAG_ENTRY0("ILLEGAL_NULL_LTP", LCBE(ILLEGAL_NULL_LTP)),
-/*15*/ FLAG_ENTRY0("ILLEGAL_FLIT_ENCODING", LCBE(ILLEGAL_FLIT_ENCODING)),
-/*16*/ FLAG_ENTRY0("FLIT_INPUT_BUF_OFLW", LCBE(FLIT_INPUT_BUF_OFLW)),
-/*17*/ FLAG_ENTRY0("VL_ACK_INPUT_BUF_OFLW", LCBE(VL_ACK_INPUT_BUF_OFLW)),
-/*18*/ FLAG_ENTRY0("VL_ACK_INPUT_PARITY_ERR", LCBE(VL_ACK_INPUT_PARITY_ERR)),
-/*19*/ FLAG_ENTRY0("VL_ACK_INPUT_WRONG_CRC_MODE",
-               LCBE(VL_ACK_INPUT_WRONG_CRC_MODE)),
-/*20*/ FLAG_ENTRY0("FLIT_INPUT_BUF_MBE", LCBE(FLIT_INPUT_BUF_MBE)),
-/*21*/ FLAG_ENTRY0("FLIT_INPUT_BUF_SBE", LCBE(FLIT_INPUT_BUF_SBE)),
-/*22*/ FLAG_ENTRY0("REPLAY_BUF_MBE", LCBE(REPLAY_BUF_MBE)),
-/*23*/ FLAG_ENTRY0("REPLAY_BUF_SBE", LCBE(REPLAY_BUF_SBE)),
-/*24*/ FLAG_ENTRY0("CREDIT_RETURN_FLIT_MBE", LCBE(CREDIT_RETURN_FLIT_MBE)),
-/*25*/ FLAG_ENTRY0("RST_FOR_LINK_TIMEOUT", LCBE(RST_FOR_LINK_TIMEOUT)),
-/*26*/ FLAG_ENTRY0("RST_FOR_INCOMPLT_RND_TRIP",
-               LCBE(RST_FOR_INCOMPLT_RND_TRIP)),
-/*27*/ FLAG_ENTRY0("HOLD_REINIT", LCBE(HOLD_REINIT)),
-/*28*/ FLAG_ENTRY0("NEG_EDGE_LINK_TRANSFER_ACTIVE",
-               LCBE(NEG_EDGE_LINK_TRANSFER_ACTIVE)),
-/*29*/ FLAG_ENTRY0("REDUNDANT_FLIT_PARITY_ERR",
-               LCBE(REDUNDANT_FLIT_PARITY_ERR))
-};
-
-/*
- * DC8051 Error Flags
- */
-#define D8E(name) DC_DC8051_ERR_FLG_##name##_SMASK
-static struct flag_table dc8051_err_flags[] = {
-       FLAG_ENTRY0("SET_BY_8051", D8E(SET_BY_8051)),
-       FLAG_ENTRY0("LOST_8051_HEART_BEAT", D8E(LOST_8051_HEART_BEAT)),
-       FLAG_ENTRY0("CRAM_MBE", D8E(CRAM_MBE)),
-       FLAG_ENTRY0("CRAM_SBE", D8E(CRAM_SBE)),
-       FLAG_ENTRY0("DRAM_MBE", D8E(DRAM_MBE)),
-       FLAG_ENTRY0("DRAM_SBE", D8E(DRAM_SBE)),
-       FLAG_ENTRY0("IRAM_MBE", D8E(IRAM_MBE)),
-       FLAG_ENTRY0("IRAM_SBE", D8E(IRAM_SBE)),
-       FLAG_ENTRY0("UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES",
-                   D8E(UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES)),
-       FLAG_ENTRY0("INVALID_CSR_ADDR", D8E(INVALID_CSR_ADDR)),
-};
-
-/*
- * DC8051 Information Error flags
- *
- * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.ERROR field.
- */
-static struct flag_table dc8051_info_err_flags[] = {
-       FLAG_ENTRY0("Spico ROM check failed",  SPICO_ROM_FAILED),
-       FLAG_ENTRY0("Unknown frame received",  UNKNOWN_FRAME),
-       FLAG_ENTRY0("Target BER not met",      TARGET_BER_NOT_MET),
-       FLAG_ENTRY0("Serdes internal loopback failure",
-                   FAILED_SERDES_INTERNAL_LOOPBACK),
-       FLAG_ENTRY0("Failed SerDes init",      FAILED_SERDES_INIT),
-       FLAG_ENTRY0("Failed LNI(Polling)",     FAILED_LNI_POLLING),
-       FLAG_ENTRY0("Failed LNI(Debounce)",    FAILED_LNI_DEBOUNCE),
-       FLAG_ENTRY0("Failed LNI(EstbComm)",    FAILED_LNI_ESTBCOMM),
-       FLAG_ENTRY0("Failed LNI(OptEq)",       FAILED_LNI_OPTEQ),
-       FLAG_ENTRY0("Failed LNI(VerifyCap_1)", FAILED_LNI_VERIFY_CAP1),
-       FLAG_ENTRY0("Failed LNI(VerifyCap_2)", FAILED_LNI_VERIFY_CAP2),
-       FLAG_ENTRY0("Failed LNI(ConfigLT)",    FAILED_LNI_CONFIGLT),
-       FLAG_ENTRY0("Host Handshake Timeout",  HOST_HANDSHAKE_TIMEOUT)
-};
-
-/*
- * DC8051 Information Host Information flags
- *
- * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.HOST_MSG field.
- */
-static struct flag_table dc8051_info_host_msg_flags[] = {
-       FLAG_ENTRY0("Host request done", 0x0001),
-       FLAG_ENTRY0("BC SMA message", 0x0002),
-       FLAG_ENTRY0("BC PWR_MGM message", 0x0004),
-       FLAG_ENTRY0("BC Unknown message (BCC)", 0x0008),
-       FLAG_ENTRY0("BC Unknown message (LCB)", 0x0010),
-       FLAG_ENTRY0("External device config request", 0x0020),
-       FLAG_ENTRY0("VerifyCap all frames received", 0x0040),
-       FLAG_ENTRY0("LinkUp achieved", 0x0080),
-       FLAG_ENTRY0("Link going down", 0x0100),
-};
-
-static u32 encoded_size(u32 size);
-static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate);
-static int set_physical_link_state(struct hfi1_devdata *dd, u64 state);
-static void read_vc_remote_phy(struct hfi1_devdata *dd, u8 *power_management,
-                              u8 *continuous);
-static void read_vc_remote_fabric(struct hfi1_devdata *dd, u8 *vau, u8 *z,
-                                 u8 *vcu, u16 *vl15buf, u8 *crc_sizes);
-static void read_vc_remote_link_width(struct hfi1_devdata *dd,
-                                     u8 *remote_tx_rate, u16 *link_widths);
-static void read_vc_local_link_width(struct hfi1_devdata *dd, u8 *misc_bits,
-                                    u8 *flag_bits, u16 *link_widths);
-static void read_remote_device_id(struct hfi1_devdata *dd, u16 *device_id,
-                                 u8 *device_rev);
-static void read_mgmt_allowed(struct hfi1_devdata *dd, u8 *mgmt_allowed);
-static void read_local_lni(struct hfi1_devdata *dd, u8 *enable_lane_rx);
-static int read_tx_settings(struct hfi1_devdata *dd, u8 *enable_lane_tx,
-                           u8 *tx_polarity_inversion,
-                           u8 *rx_polarity_inversion, u8 *max_rate);
-static void handle_sdma_eng_err(struct hfi1_devdata *dd,
-                               unsigned int context, u64 err_status);
-static void handle_qsfp_int(struct hfi1_devdata *dd, u32 source, u64 reg);
-static void handle_dcc_err(struct hfi1_devdata *dd,
-                          unsigned int context, u64 err_status);
-static void handle_lcb_err(struct hfi1_devdata *dd,
-                          unsigned int context, u64 err_status);
-static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg);
-static void handle_cce_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
-static void handle_rxe_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
-static void handle_misc_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
-static void handle_pio_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
-static void handle_sdma_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
-static void handle_egress_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
-static void handle_txe_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
-static void set_partition_keys(struct hfi1_pportdata *);
-static const char *link_state_name(u32 state);
-static const char *link_state_reason_name(struct hfi1_pportdata *ppd,
-                                         u32 state);
-static int do_8051_command(struct hfi1_devdata *dd, u32 type, u64 in_data,
-                          u64 *out_data);
-static int read_idle_sma(struct hfi1_devdata *dd, u64 *data);
-static int thermal_init(struct hfi1_devdata *dd);
-
-static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state,
-                                 int msecs);
-static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc);
-static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr);
-static void handle_temp_err(struct hfi1_devdata *);
-static void dc_shutdown(struct hfi1_devdata *);
-static void dc_start(struct hfi1_devdata *);
-static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp,
-                          unsigned int *np);
-
-/*
- * Error interrupt table entry.  This is used as input to the interrupt
- * "clear down" routine used for all second tier error interrupt register.
- * Second tier interrupt registers have a single bit representing them
- * in the top-level CceIntStatus.
- */
-struct err_reg_info {
-       u32 status;             /* status CSR offset */
-       u32 clear;              /* clear CSR offset */
-       u32 mask;               /* mask CSR offset */
-       void (*handler)(struct hfi1_devdata *dd, u32 source, u64 reg);
-       const char *desc;
-};
-
-#define NUM_MISC_ERRS (IS_GENERAL_ERR_END - IS_GENERAL_ERR_START)
-#define NUM_DC_ERRS (IS_DC_END - IS_DC_START)
-#define NUM_VARIOUS (IS_VARIOUS_END - IS_VARIOUS_START)
-
-/*
- * Helpers for building HFI and DC error interrupt table entries.  Different
- * helpers are needed because of inconsistent register names.
- */
-#define EE(reg, handler, desc) \
-       { reg##_STATUS, reg##_CLEAR, reg##_MASK, \
-               handler, desc }
-#define DC_EE1(reg, handler, desc) \
-       { reg##_FLG, reg##_FLG_CLR, reg##_FLG_EN, handler, desc }
-#define DC_EE2(reg, handler, desc) \
-       { reg##_FLG, reg##_CLR, reg##_EN, handler, desc }
-
-/*
- * Table of the "misc" grouping of error interrupts.  Each entry refers to
- * another register containing more information.
- */
-static const struct err_reg_info misc_errs[NUM_MISC_ERRS] = {
-/* 0*/ EE(CCE_ERR,             handle_cce_err,    "CceErr"),
-/* 1*/ EE(RCV_ERR,             handle_rxe_err,    "RxeErr"),
-/* 2*/ EE(MISC_ERR,    handle_misc_err,   "MiscErr"),
-/* 3*/ { 0, 0, 0, NULL }, /* reserved */
-/* 4*/ EE(SEND_PIO_ERR,    handle_pio_err,    "PioErr"),
-/* 5*/ EE(SEND_DMA_ERR,    handle_sdma_err,   "SDmaErr"),
-/* 6*/ EE(SEND_EGRESS_ERR, handle_egress_err, "EgressErr"),
-/* 7*/ EE(SEND_ERR,    handle_txe_err,    "TxeErr")
-       /* the rest are reserved */
-};
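The err_reg_info entries built by EE()/DC_EE1()/DC_EE2() give the clear-down path everything it needs: where to read the latched second-tier status, where to write the clear, and which handler to call. A minimal userspace-style sketch of that pattern is below; read_reg, write_reg, fake_csr and the demo offsets are stubs standing in for the driver's CSR accessors, not the driver's actual routine, which also manages the mask CSR.

#include <stdio.h>
#include <stdint.h>

struct demo_err_reg {
	uint32_t status;                 /* status CSR offset */
	uint32_t clear;                  /* write-1-to-clear CSR offset */
	void (*handler)(uint64_t bits);
	const char *desc;
};

static uint64_t fake_csr[256];

static uint64_t read_reg(uint32_t off)          { return fake_csr[off]; }
static void write_reg(uint32_t off, uint64_t v) { fake_csr[off] = v; }

static void demo_handler(uint64_t bits)
{
	printf("handling error bits 0x%llx\n", (unsigned long long)bits);
}

/* "clear down": read, dispatch, then clear exactly the bits that were seen */
static void clear_down(const struct demo_err_reg *eri)
{
	uint64_t bits = read_reg(eri->status);

	if (!bits)
		return;
	printf("%s:\n", eri->desc);
	eri->handler(bits);
	write_reg(eri->clear, bits);
}

int main(void)
{
	struct demo_err_reg cce = { 0x10, 0x11, demo_handler, "CceErr" };

	fake_csr[0x10] = 0x5;	/* pretend two error bits are latched */
	clear_down(&cce);
	return 0;
}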
-
-/*
- * Index into the Various section of the interrupt sources
- * corresponding to the Critical Temperature interrupt.
- */
-#define TCRIT_INT_SOURCE 4
-
-/*
- * SDMA error interrupt entry - refers to another register containing more
- * information.
- */
-static const struct err_reg_info sdma_eng_err =
-       EE(SEND_DMA_ENG_ERR, handle_sdma_eng_err, "SDmaEngErr");
-
-static const struct err_reg_info various_err[NUM_VARIOUS] = {
-/* 0*/ { 0, 0, 0, NULL }, /* PbcInt */
-/* 1*/ { 0, 0, 0, NULL }, /* GpioAssertInt */
-/* 2*/ EE(ASIC_QSFP1,  handle_qsfp_int,        "QSFP1"),
-/* 3*/ EE(ASIC_QSFP2,  handle_qsfp_int,        "QSFP2"),
-/* 4*/ { 0, 0, 0, NULL }, /* TCritInt */
-       /* rest are reserved */
-};
-
-/*
- * The DC encoding of mtu_cap for 10K MTU in the DCC_CFG_PORT_CONFIG
- * register cannot be derived from the MTU value because 10K is not
- * a power of 2. Therefore, we need a constant. Everything else can
- * be calculated.
- */
-#define DCC_CFG_PORT_MTU_CAP_10240 7
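The comment above is really about representability: the other supported MTUs are powers of two, so their encodings can be computed (for example from log2 of the size), while 10240 = 10 * 1024 is not a power of two and therefore needs the hard-coded value 7. The tiny check below only illustrates that arithmetic point; the log2-style derivation is an assumption about what "can be calculated" means, not necessarily the exact formula the hardware uses.

#include <stdio.h>

static int is_pow2(unsigned int x) { return x && !(x & (x - 1)); }

int main(void)
{
	unsigned int mtus[] = { 2048, 4096, 8192, 10240 };

	for (int i = 0; i < 4; i++)
		printf("%5u: %s\n", mtus[i],
		       is_pow2(mtus[i]) ? "power of two, encodable from log2"
					: "not a power of two, needs a constant");
	return 0;
}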
-
-/*
- * Table of the DC grouping of error interrupts.  Each entry refers to
- * another register containing more information.
- */
-static const struct err_reg_info dc_errs[NUM_DC_ERRS] = {
-/* 0*/ DC_EE1(DCC_ERR,         handle_dcc_err,        "DCC Err"),
-/* 1*/ DC_EE2(DC_LCB_ERR,      handle_lcb_err,        "LCB Err"),
-/* 2*/ DC_EE2(DC_DC8051_ERR,   handle_8051_interrupt, "DC8051 Interrupt"),
-/* 3*/ /* dc_lbm_int - special, see is_dc_int() */
-       /* the rest are reserved */
-};
-
-struct cntr_entry {
-       /*
-        * counter name
-        */
-       char *name;
-
-       /*
-        * csr to read for name (if applicable)
-        */
-       u64 csr;
-
-       /*
-        * offset into dd or ppd to store the counter's value
-        */
-       int offset;
-
-       /*
-        * flags
-        */
-       u8 flags;
-
-       /*
-        * accessor for stat element, context either dd or ppd
-        */
-       u64 (*rw_cntr)(const struct cntr_entry *, void *context, int vl,
-                      int mode, u64 data);
-};
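Each cntr_entry carries an rw_cntr function pointer, so the generic counter code never needs to know whether a value lives in a device CSR, a port CSR, an LCB register, or plain software state; it just calls the accessor with the right context. A hedged sketch of that dispatch pattern follows; struct demo_cntr, read_csr_shadow and read_sw_counter are illustrative names only, not driver symbols.

#include <stdio.h>
#include <stdint.h>

/* illustrative stand-ins, not driver types */
struct demo_cntr {
	const char *name;
	uint64_t (*read)(void *context);	/* analogue of rw_cntr */
};

static uint64_t read_csr_shadow(void *context)
{
	return *(const uint64_t *)context;	/* pretend this is a CSR read */
}

static uint64_t read_sw_counter(void *context)
{
	return *(const uint64_t *)context;	/* plain software state */
}

int main(void)
{
	uint64_t csr_shadow = 42, sw_state = 7;
	struct demo_cntr table[] = {
		{ "DemoHwCntr", read_csr_shadow },
		{ "DemoSwCntr", read_sw_counter },
	};
	void *ctx[] = { &csr_shadow, &sw_state };

	for (int i = 0; i < 2; i++)
		printf("%s = %llu\n", table[i].name,
		       (unsigned long long)table[i].read(ctx[i]));
	return 0;
}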
-
-#define C_RCV_HDR_OVF_FIRST C_RCV_HDR_OVF_0
-#define C_RCV_HDR_OVF_LAST C_RCV_HDR_OVF_159
-
-#define CNTR_ELEM(name, csr, offset, flags, accessor) \
-{ \
-       name, \
-       csr, \
-       offset, \
-       flags, \
-       accessor \
-}
-
-/* 32bit RXE */
-#define RXE32_PORT_CNTR_ELEM(name, counter, flags) \
-CNTR_ELEM(#name, \
-         (counter * 8 + RCV_COUNTER_ARRAY32), \
-         0, flags | CNTR_32BIT, \
-         port_access_u32_csr)
-
-#define RXE32_DEV_CNTR_ELEM(name, counter, flags) \
-CNTR_ELEM(#name, \
-         (counter * 8 + RCV_COUNTER_ARRAY32), \
-         0, flags | CNTR_32BIT, \
-         dev_access_u32_csr)
-
-/* 64bit RXE */
-#define RXE64_PORT_CNTR_ELEM(name, counter, flags) \
-CNTR_ELEM(#name, \
-         (counter * 8 + RCV_COUNTER_ARRAY64), \
-         0, flags, \
-         port_access_u64_csr)
-
-#define RXE64_DEV_CNTR_ELEM(name, counter, flags) \
-CNTR_ELEM(#name, \
-         (counter * 8 + RCV_COUNTER_ARRAY64), \
-         0, flags, \
-         dev_access_u64_csr)
-
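For readers tracing the stringize trick, here is roughly what one of the helpers above expands to; the counter name RxDemoCnt, the index 3 and the CNTR_NORMAL flag are made up purely for illustration. Each RXE counter slot is 8 bytes wide, hence the counter * 8 offset from the array base.

/*
 * RXE32_PORT_CNTR_ELEM(RxDemoCnt, 3, CNTR_NORMAL)
 *   -> CNTR_ELEM("RxDemoCnt", (3 * 8 + RCV_COUNTER_ARRAY32),
 *                0, CNTR_NORMAL | CNTR_32BIT, port_access_u32_csr)
 *   -> { "RxDemoCnt", (3 * 8 + RCV_COUNTER_ARRAY32), 0,
 *        CNTR_NORMAL | CNTR_32BIT, port_access_u32_csr }
 */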
-#define OVR_LBL(ctx) C_RCV_HDR_OVF_ ## ctx
-#define OVR_ELM(ctx) \
-CNTR_ELEM("RcvHdrOvr" #ctx, \
-         (RCV_HDR_OVFL_CNT + ctx * 0x100), \
-         0, CNTR_NORMAL, port_access_u64_csr)
-
-/* 32bit TXE */
-#define TXE32_PORT_CNTR_ELEM(name, counter, flags) \
-CNTR_ELEM(#name, \
-         (counter * 8 + SEND_COUNTER_ARRAY32), \
-         0, flags | CNTR_32BIT, \
-         port_access_u32_csr)
-
-/* 64bit TXE */
-#define TXE64_PORT_CNTR_ELEM(name, counter, flags) \
-CNTR_ELEM(#name, \
-         (counter * 8 + SEND_COUNTER_ARRAY64), \
-         0, flags, \
-         port_access_u64_csr)
-
-# define TX64_DEV_CNTR_ELEM(name, counter, flags) \
-CNTR_ELEM(#name,\
-         counter * 8 + SEND_COUNTER_ARRAY64, \
-         0, \
-         flags, \
-         dev_access_u64_csr)
-
-/* CCE */
-#define CCE_PERF_DEV_CNTR_ELEM(name, counter, flags) \
-CNTR_ELEM(#name, \
-         (counter * 8 + CCE_COUNTER_ARRAY32), \
-         0, flags | CNTR_32BIT, \
-         dev_access_u32_csr)
-
-#define CCE_INT_DEV_CNTR_ELEM(name, counter, flags) \
-CNTR_ELEM(#name, \
-         (counter * 8 + CCE_INT_COUNTER_ARRAY32), \
-         0, flags | CNTR_32BIT, \
-         dev_access_u32_csr)
-
-/* DC */
-#define DC_PERF_CNTR(name, counter, flags) \
-CNTR_ELEM(#name, \
-         counter, \
-         0, \
-         flags, \
-         dev_access_u64_csr)
-
-#define DC_PERF_CNTR_LCB(name, counter, flags) \
-CNTR_ELEM(#name, \
-         counter, \
-         0, \
-         flags, \
-         dc_access_lcb_cntr)
-
-/* ibp counters */
-#define SW_IBP_CNTR(name, cntr) \
-CNTR_ELEM(#name, \
-         0, \
-         0, \
-         CNTR_SYNTH, \
-         access_ibp_##cntr)
-
-u64 read_csr(const struct hfi1_devdata *dd, u32 offset)
-{
-       if (dd->flags & HFI1_PRESENT) {
-               return readq((void __iomem *)dd->kregbase + offset);
-       }
-       return -1;
-}
-
-void write_csr(const struct hfi1_devdata *dd, u32 offset, u64 value)
-{
-       if (dd->flags & HFI1_PRESENT)
-               writeq(value, (void __iomem *)dd->kregbase + offset);
-}
-
-void __iomem *get_csr_addr(
-       struct hfi1_devdata *dd,
-       u32 offset)
-{
-       return (void __iomem *)dd->kregbase + offset;
-}
-
-static inline u64 read_write_csr(const struct hfi1_devdata *dd, u32 csr,
-                                int mode, u64 value)
-{
-       u64 ret;
-
-       if (mode == CNTR_MODE_R) {
-               ret = read_csr(dd, csr);
-       } else if (mode == CNTR_MODE_W) {
-               write_csr(dd, csr, value);
-               ret = value;
-       } else {
-               dd_dev_err(dd, "Invalid cntr register access mode");
-               return 0;
-       }
-
-       hfi1_cdbg(CNTR, "csr 0x%x val 0x%llx mode %d", csr, ret, mode);
-       return ret;
-}
-
-/* Dev Access */
-static u64 dev_access_u32_csr(const struct cntr_entry *entry,
-                             void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = context;
-       u64 csr = entry->csr;
-
-       if (entry->flags & CNTR_SDMA) {
-               if (vl == CNTR_INVALID_VL)
-                       return 0;
-               csr += 0x100 * vl;
-       } else {
-               if (vl != CNTR_INVALID_VL)
-                       return 0;
-       }
-       return read_write_csr(dd, csr, mode, data);
-}
-
-static u64 access_sde_err_cnt(const struct cntr_entry *entry,
-                             void *context, int idx, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       if (dd->per_sdma && idx < dd->num_sdma)
-               return dd->per_sdma[idx].err_cnt;
-       return 0;
-}
-
-static u64 access_sde_int_cnt(const struct cntr_entry *entry,
-                             void *context, int idx, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       if (dd->per_sdma && idx < dd->num_sdma)
-               return dd->per_sdma[idx].sdma_int_cnt;
-       return 0;
-}
-
-static u64 access_sde_idle_int_cnt(const struct cntr_entry *entry,
-                                  void *context, int idx, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       if (dd->per_sdma && idx < dd->num_sdma)
-               return dd->per_sdma[idx].idle_int_cnt;
-       return 0;
-}
-
-static u64 access_sde_progress_int_cnt(const struct cntr_entry *entry,
-                                      void *context, int idx, int mode,
-                                      u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       if (dd->per_sdma && idx < dd->num_sdma)
-               return dd->per_sdma[idx].progress_int_cnt;
-       return 0;
-}
-
-static u64 dev_access_u64_csr(const struct cntr_entry *entry, void *context,
-                             int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = context;
-
-       u64 val = 0;
-       u64 csr = entry->csr;
-
-       if (entry->flags & CNTR_VL) {
-               if (vl == CNTR_INVALID_VL)
-                       return 0;
-               csr += 8 * vl;
-       } else {
-               if (vl != CNTR_INVALID_VL)
-                       return 0;
-       }
-
-       val = read_write_csr(dd, csr, mode, data);
-       return val;
-}
-
-static u64 dc_access_lcb_cntr(const struct cntr_entry *entry, void *context,
-                             int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = context;
-       u32 csr = entry->csr;
-       int ret = 0;
-
-       if (vl != CNTR_INVALID_VL)
-               return 0;
-       if (mode == CNTR_MODE_R)
-               ret = read_lcb_csr(dd, csr, &data);
-       else if (mode == CNTR_MODE_W)
-               ret = write_lcb_csr(dd, csr, data);
-
-       if (ret) {
-               dd_dev_err(dd, "Could not acquire LCB for counter 0x%x", csr);
-               return 0;
-       }
-
-       hfi1_cdbg(CNTR, "csr 0x%x val 0x%llx mode %d", csr, data, mode);
-       return data;
-}
-
-/* Port Access */
-static u64 port_access_u32_csr(const struct cntr_entry *entry, void *context,
-                              int vl, int mode, u64 data)
-{
-       struct hfi1_pportdata *ppd = context;
-
-       if (vl != CNTR_INVALID_VL)
-               return 0;
-       return read_write_csr(ppd->dd, entry->csr, mode, data);
-}
-
-static u64 port_access_u64_csr(const struct cntr_entry *entry,
-                              void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_pportdata *ppd = context;
-       u64 val;
-       u64 csr = entry->csr;
-
-       if (entry->flags & CNTR_VL) {
-               if (vl == CNTR_INVALID_VL)
-                       return 0;
-               csr += 8 * vl;
-       } else {
-               if (vl != CNTR_INVALID_VL)
-                       return 0;
-       }
-       val = read_write_csr(ppd->dd, csr, mode, data);
-       return val;
-}
-
-/* Software defined */
-static inline u64 read_write_sw(struct hfi1_devdata *dd, u64 *cntr, int mode,
-                               u64 data)
-{
-       u64 ret;
-
-       if (mode == CNTR_MODE_R) {
-               ret = *cntr;
-       } else if (mode == CNTR_MODE_W) {
-               *cntr = data;
-               ret = data;
-       } else {
-               dd_dev_err(dd, "Invalid cntr sw access mode");
-               return 0;
-       }
-
-       hfi1_cdbg(CNTR, "val 0x%llx mode %d", ret, mode);
-
-       return ret;
-}
-
-static u64 access_sw_link_dn_cnt(const struct cntr_entry *entry, void *context,
-                                int vl, int mode, u64 data)
-{
-       struct hfi1_pportdata *ppd = context;
-
-       if (vl != CNTR_INVALID_VL)
-               return 0;
-       return read_write_sw(ppd->dd, &ppd->link_downed, mode, data);
-}
-
-static u64 access_sw_link_up_cnt(const struct cntr_entry *entry, void *context,
-                                int vl, int mode, u64 data)
-{
-       struct hfi1_pportdata *ppd = context;
-
-       if (vl != CNTR_INVALID_VL)
-               return 0;
-       return read_write_sw(ppd->dd, &ppd->link_up, mode, data);
-}
-
-static u64 access_sw_unknown_frame_cnt(const struct cntr_entry *entry,
-                                      void *context, int vl, int mode,
-                                      u64 data)
-{
-       struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
-
-       if (vl != CNTR_INVALID_VL)
-               return 0;
-       return read_write_sw(ppd->dd, &ppd->unknown_frame_count, mode, data);
-}
-
-static u64 access_sw_xmit_discards(const struct cntr_entry *entry,
-                                  void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
-       u64 zero = 0;
-       u64 *counter;
-
-       if (vl == CNTR_INVALID_VL)
-               counter = &ppd->port_xmit_discards;
-       else if (vl >= 0 && vl < C_VL_COUNT)
-               counter = &ppd->port_xmit_discards_vl[vl];
-       else
-               counter = &zero;
-
-       return read_write_sw(ppd->dd, counter, mode, data);
-}
-
-static u64 access_xmit_constraint_errs(const struct cntr_entry *entry,
-                                      void *context, int vl, int mode,
-                                      u64 data)
-{
-       struct hfi1_pportdata *ppd = context;
-
-       if (vl != CNTR_INVALID_VL)
-               return 0;
-
-       return read_write_sw(ppd->dd, &ppd->port_xmit_constraint_errors,
-                            mode, data);
-}
-
-static u64 access_rcv_constraint_errs(const struct cntr_entry *entry,
-                                     void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_pportdata *ppd = context;
-
-       if (vl != CNTR_INVALID_VL)
-               return 0;
-
-       return read_write_sw(ppd->dd, &ppd->port_rcv_constraint_errors,
-                            mode, data);
-}
-
-u64 get_all_cpu_total(u64 __percpu *cntr)
-{
-       int cpu;
-       u64 counter = 0;
-
-       for_each_possible_cpu(cpu)
-               counter += *per_cpu_ptr(cntr, cpu);
-       return counter;
-}
-
-static u64 read_write_cpu(struct hfi1_devdata *dd, u64 *z_val,
-                         u64 __percpu *cntr,
-                         int vl, int mode, u64 data)
-{
-       u64 ret = 0;
-
-       if (vl != CNTR_INVALID_VL)
-               return 0;
-
-       if (mode == CNTR_MODE_R) {
-               ret = get_all_cpu_total(cntr) - *z_val;
-       } else if (mode == CNTR_MODE_W) {
-               /* A write can only zero the counter */
-               if (data == 0)
-                       *z_val = get_all_cpu_total(cntr);
-               else
-                       dd_dev_err(dd, "Per CPU cntrs can only be zeroed");
-       } else {
-               dd_dev_err(dd, "Invalid cntr sw cpu access mode");
-               return 0;
-       }
-
-       return ret;
-}
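read_write_cpu() never writes the per-CPU variables themselves: "zeroing" simply snapshots the current grand total into *z_val, and subsequent reads subtract that baseline. The self-contained analogue below shows the same idea using a plain array in place of per-CPU storage; percpu, z_val, total, read_cntr and zero_cntr are illustrative names, not driver symbols.

#include <stdio.h>
#include <stdint.h>

#define NCPUS 4

static uint64_t percpu[NCPUS];	/* stand-in for a __percpu counter */
static uint64_t z_val;		/* baseline captured at "zero" time */

static uint64_t total(void)
{
	uint64_t sum = 0;

	for (int i = 0; i < NCPUS; i++)
		sum += percpu[i];
	return sum;
}

static uint64_t read_cntr(void) { return total() - z_val; }
static void zero_cntr(void)     { z_val = total(); }

int main(void)
{
	percpu[0] = 5; percpu[2] = 7;
	printf("read: %llu\n", (unsigned long long)read_cntr()); /* 12 */
	zero_cntr();
	percpu[1] += 3;
	printf("read: %llu\n", (unsigned long long)read_cntr()); /* 3 */
	return 0;
}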
-
-static u64 access_sw_cpu_intr(const struct cntr_entry *entry,
-                             void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = context;
-
-       return read_write_cpu(dd, &dd->z_int_counter, dd->int_counter, vl,
-                             mode, data);
-}
-
-static u64 access_sw_cpu_rcv_limit(const struct cntr_entry *entry,
-                                  void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = context;
-
-       return read_write_cpu(dd, &dd->z_rcv_limit, dd->rcv_limit, vl,
-                             mode, data);
-}
-
-static u64 access_sw_pio_wait(const struct cntr_entry *entry,
-                             void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = context;
-
-       return dd->verbs_dev.n_piowait;
-}
-
-static u64 access_sw_pio_drain(const struct cntr_entry *entry,
-                              void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->verbs_dev.n_piodrain;
-}
-
-static u64 access_sw_vtx_wait(const struct cntr_entry *entry,
-                             void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = context;
-
-       return dd->verbs_dev.n_txwait;
-}
-
-static u64 access_sw_kmem_wait(const struct cntr_entry *entry,
-                              void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = context;
-
-       return dd->verbs_dev.n_kmem_wait;
-}
-
-static u64 access_sw_send_schedule(const struct cntr_entry *entry,
-                                  void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return read_write_cpu(dd, &dd->z_send_schedule, dd->send_schedule, vl,
-                             mode, data);
-}
-
-/* Software counters for the error status bits within MISC_ERR_STATUS */
-static u64 access_misc_pll_lock_fail_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->misc_err_status_cnt[12];
-}
-
-static u64 access_misc_mbist_fail_err_cnt(const struct cntr_entry *entry,
-                                         void *context, int vl, int mode,
-                                         u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->misc_err_status_cnt[11];
-}
-
-static u64 access_misc_invalid_eep_cmd_err_cnt(const struct cntr_entry *entry,
-                                              void *context, int vl, int mode,
-                                              u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->misc_err_status_cnt[10];
-}
-
-static u64 access_misc_efuse_done_parity_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->misc_err_status_cnt[9];
-}
-
-static u64 access_misc_efuse_write_err_cnt(const struct cntr_entry *entry,
-                                          void *context, int vl, int mode,
-                                          u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->misc_err_status_cnt[8];
-}
-
-static u64 access_misc_efuse_read_bad_addr_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->misc_err_status_cnt[7];
-}
-
-static u64 access_misc_efuse_csr_parity_err_cnt(const struct cntr_entry *entry,
-                                               void *context, int vl,
-                                               int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->misc_err_status_cnt[6];
-}
-
-static u64 access_misc_fw_auth_failed_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->misc_err_status_cnt[5];
-}
-
-static u64 access_misc_key_mismatch_err_cnt(const struct cntr_entry *entry,
-                                           void *context, int vl, int mode,
-                                           u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->misc_err_status_cnt[4];
-}
-
-static u64 access_misc_sbus_write_failed_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->misc_err_status_cnt[3];
-}
-
-static u64 access_misc_csr_write_bad_addr_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->misc_err_status_cnt[2];
-}
-
-static u64 access_misc_csr_read_bad_addr_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->misc_err_status_cnt[1];
-}
-
-static u64 access_misc_csr_parity_err_cnt(const struct cntr_entry *entry,
-                                         void *context, int vl, int mode,
-                                         u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->misc_err_status_cnt[0];
-}
-
-/*
- * Software counter for the aggregate of
- * individual CceErrStatus counters
- */
-static u64 access_sw_cce_err_status_aggregated_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_cce_err_status_aggregate;
-}
-
-/*
- * Software counters corresponding to each of the
- * error status bits within CceErrStatus
- */
-static u64 access_cce_msix_csr_parity_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[40];
-}
-
-static u64 access_cce_int_map_unc_err_cnt(const struct cntr_entry *entry,
-                                         void *context, int vl, int mode,
-                                         u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[39];
-}
-
-static u64 access_cce_int_map_cor_err_cnt(const struct cntr_entry *entry,
-                                         void *context, int vl, int mode,
-                                         u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[38];
-}
-
-static u64 access_cce_msix_table_unc_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[37];
-}
-
-static u64 access_cce_msix_table_cor_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[36];
-}
-
-static u64 access_cce_rxdma_conv_fifo_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[35];
-}
-
-static u64 access_cce_rcpl_async_fifo_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[34];
-}
-
-static u64 access_cce_seg_write_bad_addr_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[33];
-}
-
-static u64 access_cce_seg_read_bad_addr_err_cnt(const struct cntr_entry *entry,
-                                               void *context, int vl, int mode,
-                                               u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[32];
-}
-
-static u64 access_la_triggered_cnt(const struct cntr_entry *entry,
-                                  void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[31];
-}
-
-static u64 access_cce_trgt_cpl_timeout_err_cnt(const struct cntr_entry *entry,
-                                              void *context, int vl, int mode,
-                                              u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[30];
-}
-
-static u64 access_pcic_receive_parity_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[29];
-}
-
-static u64 access_pcic_transmit_back_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[28];
-}
-
-static u64 access_pcic_transmit_front_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[27];
-}
-
-static u64 access_pcic_cpl_dat_q_unc_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[26];
-}
-
-static u64 access_pcic_cpl_hd_q_unc_err_cnt(const struct cntr_entry *entry,
-                                           void *context, int vl, int mode,
-                                           u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[25];
-}
-
-static u64 access_pcic_post_dat_q_unc_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[24];
-}
-
-static u64 access_pcic_post_hd_q_unc_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[23];
-}
-
-static u64 access_pcic_retry_sot_mem_unc_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[22];
-}
-
-static u64 access_pcic_retry_mem_unc_err(const struct cntr_entry *entry,
-                                        void *context, int vl, int mode,
-                                        u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[21];
-}
-
-static u64 access_pcic_n_post_dat_q_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[20];
-}
-
-static u64 access_pcic_n_post_h_q_parity_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[19];
-}
-
-static u64 access_pcic_cpl_dat_q_cor_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[18];
-}
-
-static u64 access_pcic_cpl_hd_q_cor_err_cnt(const struct cntr_entry *entry,
-                                           void *context, int vl, int mode,
-                                           u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[17];
-}
-
-static u64 access_pcic_post_dat_q_cor_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[16];
-}
-
-static u64 access_pcic_post_hd_q_cor_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[15];
-}
-
-static u64 access_pcic_retry_sot_mem_cor_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[14];
-}
-
-static u64 access_pcic_retry_mem_cor_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[13];
-}
-
-static u64 access_cce_cli1_async_fifo_dbg_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[12];
-}
-
-static u64 access_cce_cli1_async_fifo_rxdma_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[11];
-}
-
-static u64 access_cce_cli1_async_fifo_sdma_hd_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[10];
-}
-
-static u64 access_cce_cl1_async_fifo_pio_crdt_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[9];
-}
-
-static u64 access_cce_cli2_async_fifo_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[8];
-}
-
-static u64 access_cce_csr_cfg_bus_parity_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[7];
-}
-
-static u64 access_cce_cli0_async_fifo_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[6];
-}
-
-static u64 access_cce_rspd_data_parity_err_cnt(const struct cntr_entry *entry,
-                                              void *context, int vl, int mode,
-                                              u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[5];
-}
-
-static u64 access_cce_trgt_access_err_cnt(const struct cntr_entry *entry,
-                                         void *context, int vl, int mode,
-                                         u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[4];
-}
-
-static u64 access_cce_trgt_async_fifo_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[3];
-}
-
-static u64 access_cce_csr_write_bad_addr_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[2];
-}
-
-static u64 access_cce_csr_read_bad_addr_err_cnt(const struct cntr_entry *entry,
-                                               void *context, int vl,
-                                               int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[1];
-}
-
-static u64 access_ccs_csr_parity_err_cnt(const struct cntr_entry *entry,
-                                        void *context, int vl, int mode,
-                                        u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[0];
-}
-
-/*
- * Software counters corresponding to each of the
- * error status bits within RcvErrStatus
- */
-static u64 access_rx_csr_parity_err_cnt(const struct cntr_entry *entry,
-                                       void *context, int vl, int mode,
-                                       u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[63];
-}
-
-static u64 access_rx_csr_write_bad_addr_err_cnt(const struct cntr_entry *entry,
-                                               void *context, int vl,
-                                               int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[62];
-}
-
-static u64 access_rx_csr_read_bad_addr_err_cnt(const struct cntr_entry *entry,
-                                              void *context, int vl, int mode,
-                                              u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[61];
-}
-
-static u64 access_rx_dma_csr_unc_err_cnt(const struct cntr_entry *entry,
-                                        void *context, int vl, int mode,
-                                        u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[60];
-}
-
-static u64 access_rx_dma_dq_fsm_encoding_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[59];
-}
-
-static u64 access_rx_dma_eq_fsm_encoding_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[58];
-}
-
-static u64 access_rx_dma_csr_parity_err_cnt(const struct cntr_entry *entry,
-                                           void *context, int vl, int mode,
-                                           u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[57];
-}
-
-static u64 access_rx_rbuf_data_cor_err_cnt(const struct cntr_entry *entry,
-                                          void *context, int vl, int mode,
-                                          u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[56];
-}
-
-static u64 access_rx_rbuf_data_unc_err_cnt(const struct cntr_entry *entry,
-                                          void *context, int vl, int mode,
-                                          u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[55];
-}
-
-static u64 access_rx_dma_data_fifo_rd_cor_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[54];
-}
-
-static u64 access_rx_dma_data_fifo_rd_unc_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[53];
-}
-
-static u64 access_rx_dma_hdr_fifo_rd_cor_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[52];
-}
-
-static u64 access_rx_dma_hdr_fifo_rd_unc_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[51];
-}
-
-static u64 access_rx_rbuf_desc_part2_cor_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[50];
-}
-
-static u64 access_rx_rbuf_desc_part2_unc_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[49];
-}
-
-static u64 access_rx_rbuf_desc_part1_cor_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[48];
-}
-
-static u64 access_rx_rbuf_desc_part1_unc_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[47];
-}
-
-static u64 access_rx_hq_intr_fsm_err_cnt(const struct cntr_entry *entry,
-                                        void *context, int vl, int mode,
-                                        u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[46];
-}
-
-static u64 access_rx_hq_intr_csr_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[45];
-}
-
-static u64 access_rx_lookup_csr_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[44];
-}
-
-static u64 access_rx_lookup_rcv_array_cor_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[43];
-}
-
-static u64 access_rx_lookup_rcv_array_unc_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[42];
-}
-
-static u64 access_rx_lookup_des_part2_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[41];
-}
-
-static u64 access_rx_lookup_des_part1_unc_cor_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[40];
-}
-
-static u64 access_rx_lookup_des_part1_unc_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[39];
-}
-
-static u64 access_rx_rbuf_next_free_buf_cor_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[38];
-}
-
-static u64 access_rx_rbuf_next_free_buf_unc_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[37];
-}
-
-static u64 access_rbuf_fl_init_wr_addr_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[36];
-}
-
-static u64 access_rx_rbuf_fl_initdone_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[35];
-}
-
-static u64 access_rx_rbuf_fl_write_addr_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[34];
-}
-
-static u64 access_rx_rbuf_fl_rd_addr_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[33];
-}
-
-static u64 access_rx_rbuf_empty_err_cnt(const struct cntr_entry *entry,
-                                       void *context, int vl, int mode,
-                                       u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[32];
-}
-
-static u64 access_rx_rbuf_full_err_cnt(const struct cntr_entry *entry,
-                                      void *context, int vl, int mode,
-                                      u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[31];
-}
-
-static u64 access_rbuf_bad_lookup_err_cnt(const struct cntr_entry *entry,
-                                         void *context, int vl, int mode,
-                                         u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[30];
-}
-
-static u64 access_rbuf_ctx_id_parity_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[29];
-}
-
-static u64 access_rbuf_csr_qeopdw_parity_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[28];
-}
-
-static u64 access_rx_rbuf_csr_q_num_of_pkt_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[27];
-}
-
-static u64 access_rx_rbuf_csr_q_t1_ptr_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[26];
-}
-
-static u64 access_rx_rbuf_csr_q_hd_ptr_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[25];
-}
-
-static u64 access_rx_rbuf_csr_q_vld_bit_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[24];
-}
-
-static u64 access_rx_rbuf_csr_q_next_buf_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[23];
-}
-
-static u64 access_rx_rbuf_csr_q_ent_cnt_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[22];
-}
-
-static u64 access_rx_rbuf_csr_q_head_buf_num_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[21];
-}
-
-static u64 access_rx_rbuf_block_list_read_cor_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[20];
-}
-
-static u64 access_rx_rbuf_block_list_read_unc_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[19];
-}
-
-static u64 access_rx_rbuf_lookup_des_cor_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[18];
-}
-
-static u64 access_rx_rbuf_lookup_des_unc_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[17];
-}
-
-static u64 access_rx_rbuf_lookup_des_reg_unc_cor_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[16];
-}
-
-static u64 access_rx_rbuf_lookup_des_reg_unc_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[15];
-}
-
-static u64 access_rx_rbuf_free_list_cor_err_cnt(const struct cntr_entry *entry,
-                                               void *context, int vl,
-                                               int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[14];
-}
-
-static u64 access_rx_rbuf_free_list_unc_err_cnt(const struct cntr_entry *entry,
-                                               void *context, int vl,
-                                               int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[13];
-}
-
-static u64 access_rx_rcv_fsm_encoding_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[12];
-}
-
-static u64 access_rx_dma_flag_cor_err_cnt(const struct cntr_entry *entry,
-                                         void *context, int vl, int mode,
-                                         u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[11];
-}
-
-static u64 access_rx_dma_flag_unc_err_cnt(const struct cntr_entry *entry,
-                                         void *context, int vl, int mode,
-                                         u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[10];
-}
-
-static u64 access_rx_dc_sop_eop_parity_err_cnt(const struct cntr_entry *entry,
-                                              void *context, int vl, int mode,
-                                              u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[9];
-}
-
-static u64 access_rx_rcv_csr_parity_err_cnt(const struct cntr_entry *entry,
-                                           void *context, int vl, int mode,
-                                           u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[8];
-}
-
-static u64 access_rx_rcv_qp_map_table_cor_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[7];
-}
-
-static u64 access_rx_rcv_qp_map_table_unc_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[6];
-}
-
-static u64 access_rx_rcv_data_cor_err_cnt(const struct cntr_entry *entry,
-                                         void *context, int vl, int mode,
-                                         u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[5];
-}
-
-static u64 access_rx_rcv_data_unc_err_cnt(const struct cntr_entry *entry,
-                                         void *context, int vl, int mode,
-                                         u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[4];
-}
-
-static u64 access_rx_rcv_hdr_cor_err_cnt(const struct cntr_entry *entry,
-                                        void *context, int vl, int mode,
-                                        u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[3];
-}
-
-static u64 access_rx_rcv_hdr_unc_err_cnt(const struct cntr_entry *entry,
-                                        void *context, int vl, int mode,
-                                        u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[2];
-}
-
-static u64 access_rx_dc_intf_parity_err_cnt(const struct cntr_entry *entry,
-                                           void *context, int vl, int mode,
-                                           u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[1];
-}
-
-static u64 access_rx_dma_csr_cor_err_cnt(const struct cntr_entry *entry,
-                                        void *context, int vl, int mode,
-                                        u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[0];
-}
-
-/*
- * Software counters corresponding to each of the
- * error status bits within SendPioErrStatus
- */
-static u64 access_pio_pec_sop_head_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[35];
-}
-
-static u64 access_pio_pcc_sop_head_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[34];
-}
-
-static u64 access_pio_last_returned_cnt_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[33];
-}
-
-static u64 access_pio_current_free_cnt_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[32];
-}
-
-static u64 access_pio_reserved_31_err_cnt(const struct cntr_entry *entry,
-                                         void *context, int vl, int mode,
-                                         u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[31];
-}
-
-static u64 access_pio_reserved_30_err_cnt(const struct cntr_entry *entry,
-                                         void *context, int vl, int mode,
-                                         u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[30];
-}
-
-static u64 access_pio_ppmc_sop_len_err_cnt(const struct cntr_entry *entry,
-                                          void *context, int vl, int mode,
-                                          u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[29];
-}
-
-static u64 access_pio_ppmc_bqc_mem_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[28];
-}
-
-static u64 access_pio_vl_fifo_parity_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[27];
-}
-
-static u64 access_pio_vlf_sop_parity_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[26];
-}
-
-static u64 access_pio_vlf_v1_len_parity_err_cnt(const struct cntr_entry *entry,
-                                               void *context, int vl,
-                                               int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[25];
-}
-
-static u64 access_pio_block_qw_count_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[24];
-}
-
-static u64 access_pio_write_qw_valid_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[23];
-}
-
-static u64 access_pio_state_machine_err_cnt(const struct cntr_entry *entry,
-                                           void *context, int vl, int mode,
-                                           u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[22];
-}
-
-static u64 access_pio_write_data_parity_err_cnt(const struct cntr_entry *entry,
-                                               void *context, int vl,
-                                               int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[21];
-}
-
-static u64 access_pio_host_addr_mem_cor_err_cnt(const struct cntr_entry *entry,
-                                               void *context, int vl,
-                                               int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[20];
-}
-
-static u64 access_pio_host_addr_mem_unc_err_cnt(const struct cntr_entry *entry,
-                                               void *context, int vl,
-                                               int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[19];
-}
-
-static u64 access_pio_pkt_evict_sm_or_arb_sm_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[18];
-}
-
-static u64 access_pio_init_sm_in_err_cnt(const struct cntr_entry *entry,
-                                        void *context, int vl, int mode,
-                                        u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[17];
-}
-
-static u64 access_pio_ppmc_pbl_fifo_err_cnt(const struct cntr_entry *entry,
-                                           void *context, int vl, int mode,
-                                           u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[16];
-}
-
-static u64 access_pio_credit_ret_fifo_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[15];
-}
-
-static u64 access_pio_v1_len_mem_bank1_cor_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[14];
-}
-
-static u64 access_pio_v1_len_mem_bank0_cor_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[13];
-}
-
-static u64 access_pio_v1_len_mem_bank1_unc_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[12];
-}
-
-static u64 access_pio_v1_len_mem_bank0_unc_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[11];
-}
-
-static u64 access_pio_sm_pkt_reset_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[10];
-}
-
-static u64 access_pio_pkt_evict_fifo_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[9];
-}
-
-static u64 access_pio_sbrdctrl_crrel_fifo_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[8];
-}
-
-static u64 access_pio_sbrdctl_crrel_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[7];
-}
-
-static u64 access_pio_pec_fifo_parity_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[6];
-}
-
-static u64 access_pio_pcc_fifo_parity_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[5];
-}
-
-static u64 access_pio_sb_mem_fifo1_err_cnt(const struct cntr_entry *entry,
-                                          void *context, int vl, int mode,
-                                          u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[4];
-}
-
-static u64 access_pio_sb_mem_fifo0_err_cnt(const struct cntr_entry *entry,
-                                          void *context, int vl, int mode,
-                                          u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[3];
-}
-
-static u64 access_pio_csr_parity_err_cnt(const struct cntr_entry *entry,
-                                        void *context, int vl, int mode,
-                                        u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[2];
-}
-
-static u64 access_pio_write_addr_parity_err_cnt(const struct cntr_entry *entry,
-                                               void *context, int vl,
-                                               int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[1];
-}
-
-static u64 access_pio_write_bad_ctxt_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[0];
-}
-
-/*
- * Software counters corresponding to each of the
- * error status bits within SendDmaErrStatus
- */
-static u64 access_sdma_pcie_req_tracking_cor_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_dma_err_status_cnt[3];
-}
-
-static u64 access_sdma_pcie_req_tracking_unc_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_dma_err_status_cnt[2];
-}
-
-static u64 access_sdma_csr_parity_err_cnt(const struct cntr_entry *entry,
-                                         void *context, int vl, int mode,
-                                         u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_dma_err_status_cnt[1];
-}
-
-static u64 access_sdma_rpy_tag_err_cnt(const struct cntr_entry *entry,
-                                      void *context, int vl, int mode,
-                                      u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_dma_err_status_cnt[0];
-}
-
-/*
- * Software counters corresponding to each of the
- * error status bits within SendEgressErrStatus
- */
-static u64 access_tx_read_pio_memory_csr_unc_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[63];
-}
-
-static u64 access_tx_read_sdma_memory_csr_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[62];
-}
-
-static u64 access_tx_egress_fifo_cor_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[61];
-}
-
-static u64 access_tx_read_pio_memory_cor_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[60];
-}
-
-static u64 access_tx_read_sdma_memory_cor_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[59];
-}
-
-static u64 access_tx_sb_hdr_cor_err_cnt(const struct cntr_entry *entry,
-                                       void *context, int vl, int mode,
-                                       u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[58];
-}
-
-static u64 access_tx_credit_overrun_err_cnt(const struct cntr_entry *entry,
-                                           void *context, int vl, int mode,
-                                           u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[57];
-}
-
-static u64 access_tx_launch_fifo8_cor_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[56];
-}
-
-static u64 access_tx_launch_fifo7_cor_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[55];
-}
-
-static u64 access_tx_launch_fifo6_cor_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[54];
-}
-
-static u64 access_tx_launch_fifo5_cor_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[53];
-}
-
-static u64 access_tx_launch_fifo4_cor_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[52];
-}
-
-static u64 access_tx_launch_fifo3_cor_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[51];
-}
-
-static u64 access_tx_launch_fifo2_cor_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[50];
-}
-
-static u64 access_tx_launch_fifo1_cor_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[49];
-}
-
-static u64 access_tx_launch_fifo0_cor_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[48];
-}
-
-static u64 access_tx_credit_return_vl_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[47];
-}
-
-static u64 access_tx_hcrc_insertion_err_cnt(const struct cntr_entry *entry,
-                                           void *context, int vl, int mode,
-                                           u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[46];
-}
-
-static u64 access_tx_egress_fifo_unc_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[45];
-}
-
-static u64 access_tx_read_pio_memory_unc_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[44];
-}
-
-static u64 access_tx_read_sdma_memory_unc_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[43];
-}
-
-static u64 access_tx_sb_hdr_unc_err_cnt(const struct cntr_entry *entry,
-                                       void *context, int vl, int mode,
-                                       u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[42];
-}
-
-static u64 access_tx_credit_return_partiy_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[41];
-}
-
-static u64 access_tx_launch_fifo8_unc_or_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[40];
-}
-
-static u64 access_tx_launch_fifo7_unc_or_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[39];
-}
-
-static u64 access_tx_launch_fifo6_unc_or_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[38];
-}
-
-static u64 access_tx_launch_fifo5_unc_or_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[37];
-}
-
-static u64 access_tx_launch_fifo4_unc_or_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[36];
-}
-
-static u64 access_tx_launch_fifo3_unc_or_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[35];
-}
-
-static u64 access_tx_launch_fifo2_unc_or_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[34];
-}
-
-static u64 access_tx_launch_fifo1_unc_or_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[33];
-}
-
-static u64 access_tx_launch_fifo0_unc_or_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[32];
-}
-
-static u64 access_tx_sdma15_disallowed_packet_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[31];
-}
-
-static u64 access_tx_sdma14_disallowed_packet_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[30];
-}
-
-static u64 access_tx_sdma13_disallowed_packet_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[29];
-}
-
-static u64 access_tx_sdma12_disallowed_packet_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[28];
-}
-
-static u64 access_tx_sdma11_disallowed_packet_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[27];
-}
-
-static u64 access_tx_sdma10_disallowed_packet_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[26];
-}
-
-static u64 access_tx_sdma9_disallowed_packet_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[25];
-}
-
-static u64 access_tx_sdma8_disallowed_packet_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[24];
-}
-
-static u64 access_tx_sdma7_disallowed_packet_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[23];
-}
-
-static u64 access_tx_sdma6_disallowed_packet_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[22];
-}
-
-static u64 access_tx_sdma5_disallowed_packet_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[21];
-}
-
-static u64 access_tx_sdma4_disallowed_packet_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[20];
-}
-
-static u64 access_tx_sdma3_disallowed_packet_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[19];
-}
-
-static u64 access_tx_sdma2_disallowed_packet_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[18];
-}
-
-static u64 access_tx_sdma1_disallowed_packet_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[17];
-}
-
-static u64 access_tx_sdma0_disallowed_packet_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[16];
-}
-
-static u64 access_tx_config_parity_err_cnt(const struct cntr_entry *entry,
-                                          void *context, int vl, int mode,
-                                          u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[15];
-}
-
-static u64 access_tx_sbrd_ctl_csr_parity_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[14];
-}
-
-static u64 access_tx_launch_csr_parity_err_cnt(const struct cntr_entry *entry,
-                                              void *context, int vl, int mode,
-                                              u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[13];
-}
-
-static u64 access_tx_illegal_vl_err_cnt(const struct cntr_entry *entry,
-                                       void *context, int vl, int mode,
-                                       u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[12];
-}
-
-static u64 access_tx_sbrd_ctl_state_machine_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[11];
-}
-
-static u64 access_egress_reserved_10_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[10];
-}
-
-static u64 access_egress_reserved_9_err_cnt(const struct cntr_entry *entry,
-                                           void *context, int vl, int mode,
-                                           u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[9];
-}
-
-static u64 access_tx_sdma_launch_intf_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[8];
-}
-
-static u64 access_tx_pio_launch_intf_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[7];
-}
-
-static u64 access_egress_reserved_6_err_cnt(const struct cntr_entry *entry,
-                                           void *context, int vl, int mode,
-                                           u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[6];
-}
-
-static u64 access_tx_incorrect_link_state_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[5];
-}
-
-static u64 access_tx_linkdown_err_cnt(const struct cntr_entry *entry,
-                                     void *context, int vl, int mode,
-                                     u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[4];
-}
-
-static u64 access_tx_egress_fifi_underrun_or_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[3];
-}
-
-static u64 access_egress_reserved_2_err_cnt(const struct cntr_entry *entry,
-                                           void *context, int vl, int mode,
-                                           u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[2];
-}
-
-static u64 access_tx_pkt_integrity_mem_unc_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[1];
-}
-
-static u64 access_tx_pkt_integrity_mem_cor_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[0];
-}
-
-/*
- * Software counters corresponding to each of the
- * error status bits within SendErrStatus
- */
-static u64 access_send_csr_write_bad_addr_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_err_status_cnt[2];
-}
-
-static u64 access_send_csr_read_bad_addr_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_err_status_cnt[1];
-}
-
-static u64 access_send_csr_parity_cnt(const struct cntr_entry *entry,
-                                     void *context, int vl, int mode,
-                                     u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_err_status_cnt[0];
-}
-
-/*
- * Software counters corresponding to each of the
- * error status bits within SendCtxtErrStatus
- */
-static u64 access_pio_write_out_of_bounds_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_ctxt_err_status_cnt[4];
-}
-
-static u64 access_pio_write_overflow_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_ctxt_err_status_cnt[3];
-}
-
-static u64 access_pio_write_crosses_boundary_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_ctxt_err_status_cnt[2];
-}
-
-static u64 access_pio_disallowed_packet_err_cnt(const struct cntr_entry *entry,
-                                               void *context, int vl,
-                                               int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_ctxt_err_status_cnt[1];
-}
-
-static u64 access_pio_inconsistent_sop_err_cnt(const struct cntr_entry *entry,
-                                              void *context, int vl, int mode,
-                                              u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_ctxt_err_status_cnt[0];
-}
-
-/*
- * Software counters corresponding to each of the
- * error status bits within SendDmaEngErrStatus
- */
-static u64 access_sdma_header_request_fifo_cor_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[23];
-}
-
-static u64 access_sdma_header_storage_cor_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[22];
-}
-
-static u64 access_sdma_packet_tracking_cor_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[21];
-}
-
-static u64 access_sdma_assembly_cor_err_cnt(const struct cntr_entry *entry,
-                                           void *context, int vl, int mode,
-                                           u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[20];
-}
-
-static u64 access_sdma_desc_table_cor_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[19];
-}
-
-static u64 access_sdma_header_request_fifo_unc_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[18];
-}
-
-static u64 access_sdma_header_storage_unc_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[17];
-}
-
-static u64 access_sdma_packet_tracking_unc_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[16];
-}
-
-static u64 access_sdma_assembly_unc_err_cnt(const struct cntr_entry *entry,
-                                           void *context, int vl, int mode,
-                                           u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[15];
-}
-
-static u64 access_sdma_desc_table_unc_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[14];
-}
-
-static u64 access_sdma_timeout_err_cnt(const struct cntr_entry *entry,
-                                      void *context, int vl, int mode,
-                                      u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[13];
-}
-
-static u64 access_sdma_header_length_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[12];
-}
-
-static u64 access_sdma_header_address_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[11];
-}
-
-static u64 access_sdma_header_select_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[10];
-}
-
-static u64 access_sdma_reserved_9_err_cnt(const struct cntr_entry *entry,
-                                         void *context, int vl, int mode,
-                                         u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[9];
-}
-
-static u64 access_sdma_packet_desc_overflow_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[8];
-}
-
-static u64 access_sdma_length_mismatch_err_cnt(const struct cntr_entry *entry,
-                                              void *context, int vl,
-                                              int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[7];
-}
-
-static u64 access_sdma_halt_err_cnt(const struct cntr_entry *entry,
-                                   void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[6];
-}
-
-static u64 access_sdma_mem_read_err_cnt(const struct cntr_entry *entry,
-                                       void *context, int vl, int mode,
-                                       u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[5];
-}
-
-static u64 access_sdma_first_desc_err_cnt(const struct cntr_entry *entry,
-                                         void *context, int vl, int mode,
-                                         u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[4];
-}
-
-static u64 access_sdma_tail_out_of_bounds_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[3];
-}
-
-static u64 access_sdma_too_long_err_cnt(const struct cntr_entry *entry,
-                                       void *context, int vl, int mode,
-                                       u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[2];
-}
-
-static u64 access_sdma_gen_mismatch_err_cnt(const struct cntr_entry *entry,
-                                           void *context, int vl, int mode,
-                                           u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[1];
-}
-
-static u64 access_sdma_wrong_dw_err_cnt(const struct cntr_entry *entry,
-                                       void *context, int vl, int mode,
-                                       u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[0];
-}
-
-#define def_access_sw_cpu(cntr) \
-static u64 access_sw_cpu_##cntr(const struct cntr_entry *entry,                      \
-                             void *context, int vl, int mode, u64 data)      \
-{                                                                            \
-       struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;        \
-       return read_write_cpu(ppd->dd, &ppd->ibport_data.rvp.z_ ##cntr,       \
-                             ppd->ibport_data.rvp.cntr, vl,                  \
-                             mode, data);                                    \
-}
-
-def_access_sw_cpu(rc_acks);
-def_access_sw_cpu(rc_qacks);
-def_access_sw_cpu(rc_delayed_comp);
-
-#define def_access_ibp_counter(cntr) \
-static u64 access_ibp_##cntr(const struct cntr_entry *entry,                 \
-                               void *context, int vl, int mode, u64 data)    \
-{                                                                            \
-       struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;        \
-                                                                             \
-       if (vl != CNTR_INVALID_VL)                                            \
-               return 0;                                                     \
-                                                                             \
-       return read_write_sw(ppd->dd, &ppd->ibport_data.rvp.n_ ##cntr,        \
-                            mode, data);                                     \
-}
-
-def_access_ibp_counter(loop_pkts);
-def_access_ibp_counter(rc_resends);
-def_access_ibp_counter(rnr_naks);
-def_access_ibp_counter(other_naks);
-def_access_ibp_counter(rc_timeouts);
-def_access_ibp_counter(pkt_drops);
-def_access_ibp_counter(dmawait);
-def_access_ibp_counter(rc_seqnak);
-def_access_ibp_counter(rc_dupreq);
-def_access_ibp_counter(rdma_seq);
-def_access_ibp_counter(unaligned);
-def_access_ibp_counter(seq_naks);
-
-static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = {
-[C_RCV_OVF] = RXE32_DEV_CNTR_ELEM(RcvOverflow, RCV_BUF_OVFL_CNT, CNTR_SYNTH),
-[C_RX_TID_FULL] = RXE32_DEV_CNTR_ELEM(RxTIDFullEr, RCV_TID_FULL_ERR_CNT,
-                       CNTR_NORMAL),
-[C_RX_TID_INVALID] = RXE32_DEV_CNTR_ELEM(RxTIDInvalid, RCV_TID_VALID_ERR_CNT,
-                       CNTR_NORMAL),
-[C_RX_TID_FLGMS] = RXE32_DEV_CNTR_ELEM(RxTidFLGMs,
-                       RCV_TID_FLOW_GEN_MISMATCH_CNT,
-                       CNTR_NORMAL),
-[C_RX_CTX_EGRS] = RXE32_DEV_CNTR_ELEM(RxCtxEgrS, RCV_CONTEXT_EGR_STALL,
-                       CNTR_NORMAL),
-[C_RCV_TID_FLSMS] = RXE32_DEV_CNTR_ELEM(RxTidFLSMs,
-                       RCV_TID_FLOW_SEQ_MISMATCH_CNT, CNTR_NORMAL),
-[C_CCE_PCI_CR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePciCrSt,
-                       CCE_PCIE_POSTED_CRDT_STALL_CNT, CNTR_NORMAL),
-[C_CCE_PCI_TR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePciTrSt, CCE_PCIE_TRGT_STALL_CNT,
-                       CNTR_NORMAL),
-[C_CCE_PIO_WR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePioWrSt, CCE_PIO_WR_STALL_CNT,
-                       CNTR_NORMAL),
-[C_CCE_ERR_INT] = CCE_INT_DEV_CNTR_ELEM(CceErrInt, CCE_ERR_INT_CNT,
-                       CNTR_NORMAL),
-[C_CCE_SDMA_INT] = CCE_INT_DEV_CNTR_ELEM(CceSdmaInt, CCE_SDMA_INT_CNT,
-                       CNTR_NORMAL),
-[C_CCE_MISC_INT] = CCE_INT_DEV_CNTR_ELEM(CceMiscInt, CCE_MISC_INT_CNT,
-                       CNTR_NORMAL),
-[C_CCE_RCV_AV_INT] = CCE_INT_DEV_CNTR_ELEM(CceRcvAvInt, CCE_RCV_AVAIL_INT_CNT,
-                       CNTR_NORMAL),
-[C_CCE_RCV_URG_INT] = CCE_INT_DEV_CNTR_ELEM(CceRcvUrgInt,
-                       CCE_RCV_URGENT_INT_CNT, CNTR_NORMAL),
-[C_CCE_SEND_CR_INT] = CCE_INT_DEV_CNTR_ELEM(CceSndCrInt,
-                       CCE_SEND_CREDIT_INT_CNT, CNTR_NORMAL),
-[C_DC_UNC_ERR] = DC_PERF_CNTR(DcUnctblErr, DCC_ERR_UNCORRECTABLE_CNT,
-                             CNTR_SYNTH),
-[C_DC_RCV_ERR] = DC_PERF_CNTR(DcRecvErr, DCC_ERR_PORTRCV_ERR_CNT, CNTR_SYNTH),
-[C_DC_FM_CFG_ERR] = DC_PERF_CNTR(DcFmCfgErr, DCC_ERR_FMCONFIG_ERR_CNT,
-                                CNTR_SYNTH),
-[C_DC_RMT_PHY_ERR] = DC_PERF_CNTR(DcRmtPhyErr, DCC_ERR_RCVREMOTE_PHY_ERR_CNT,
-                                 CNTR_SYNTH),
-[C_DC_DROPPED_PKT] = DC_PERF_CNTR(DcDroppedPkt, DCC_ERR_DROPPED_PKT_CNT,
-                                 CNTR_SYNTH),
-[C_DC_MC_XMIT_PKTS] = DC_PERF_CNTR(DcMcXmitPkts,
-                                  DCC_PRF_PORT_XMIT_MULTICAST_CNT, CNTR_SYNTH),
-[C_DC_MC_RCV_PKTS] = DC_PERF_CNTR(DcMcRcvPkts,
-                                 DCC_PRF_PORT_RCV_MULTICAST_PKT_CNT,
-                                 CNTR_SYNTH),
-[C_DC_XMIT_CERR] = DC_PERF_CNTR(DcXmitCorr,
-                               DCC_PRF_PORT_XMIT_CORRECTABLE_CNT, CNTR_SYNTH),
-[C_DC_RCV_CERR] = DC_PERF_CNTR(DcRcvCorrCnt, DCC_PRF_PORT_RCV_CORRECTABLE_CNT,
-                              CNTR_SYNTH),
-[C_DC_RCV_FCC] = DC_PERF_CNTR(DcRxFCntl, DCC_PRF_RX_FLOW_CRTL_CNT,
-                             CNTR_SYNTH),
-[C_DC_XMIT_FCC] = DC_PERF_CNTR(DcXmitFCntl, DCC_PRF_TX_FLOW_CRTL_CNT,
-                              CNTR_SYNTH),
-[C_DC_XMIT_FLITS] = DC_PERF_CNTR(DcXmitFlits, DCC_PRF_PORT_XMIT_DATA_CNT,
-                                CNTR_SYNTH),
-[C_DC_RCV_FLITS] = DC_PERF_CNTR(DcRcvFlits, DCC_PRF_PORT_RCV_DATA_CNT,
-                               CNTR_SYNTH),
-[C_DC_XMIT_PKTS] = DC_PERF_CNTR(DcXmitPkts, DCC_PRF_PORT_XMIT_PKTS_CNT,
-                               CNTR_SYNTH),
-[C_DC_RCV_PKTS] = DC_PERF_CNTR(DcRcvPkts, DCC_PRF_PORT_RCV_PKTS_CNT,
-                              CNTR_SYNTH),
-[C_DC_RX_FLIT_VL] = DC_PERF_CNTR(DcRxFlitVl, DCC_PRF_PORT_VL_RCV_DATA_CNT,
-                                CNTR_SYNTH | CNTR_VL),
-[C_DC_RX_PKT_VL] = DC_PERF_CNTR(DcRxPktVl, DCC_PRF_PORT_VL_RCV_PKTS_CNT,
-                               CNTR_SYNTH | CNTR_VL),
-[C_DC_RCV_FCN] = DC_PERF_CNTR(DcRcvFcn, DCC_PRF_PORT_RCV_FECN_CNT, CNTR_SYNTH),
-[C_DC_RCV_FCN_VL] = DC_PERF_CNTR(DcRcvFcnVl, DCC_PRF_PORT_VL_RCV_FECN_CNT,
-                                CNTR_SYNTH | CNTR_VL),
-[C_DC_RCV_BCN] = DC_PERF_CNTR(DcRcvBcn, DCC_PRF_PORT_RCV_BECN_CNT, CNTR_SYNTH),
-[C_DC_RCV_BCN_VL] = DC_PERF_CNTR(DcRcvBcnVl, DCC_PRF_PORT_VL_RCV_BECN_CNT,
-                                CNTR_SYNTH | CNTR_VL),
-[C_DC_RCV_BBL] = DC_PERF_CNTR(DcRcvBbl, DCC_PRF_PORT_RCV_BUBBLE_CNT,
-                             CNTR_SYNTH),
-[C_DC_RCV_BBL_VL] = DC_PERF_CNTR(DcRcvBblVl, DCC_PRF_PORT_VL_RCV_BUBBLE_CNT,
-                                CNTR_SYNTH | CNTR_VL),
-[C_DC_MARK_FECN] = DC_PERF_CNTR(DcMarkFcn, DCC_PRF_PORT_MARK_FECN_CNT,
-                               CNTR_SYNTH),
-[C_DC_MARK_FECN_VL] = DC_PERF_CNTR(DcMarkFcnVl, DCC_PRF_PORT_VL_MARK_FECN_CNT,
-                                  CNTR_SYNTH | CNTR_VL),
-[C_DC_TOTAL_CRC] =
-       DC_PERF_CNTR_LCB(DcTotCrc, DC_LCB_ERR_INFO_TOTAL_CRC_ERR,
-                        CNTR_SYNTH),
-[C_DC_CRC_LN0] = DC_PERF_CNTR_LCB(DcCrcLn0, DC_LCB_ERR_INFO_CRC_ERR_LN0,
-                                 CNTR_SYNTH),
-[C_DC_CRC_LN1] = DC_PERF_CNTR_LCB(DcCrcLn1, DC_LCB_ERR_INFO_CRC_ERR_LN1,
-                                 CNTR_SYNTH),
-[C_DC_CRC_LN2] = DC_PERF_CNTR_LCB(DcCrcLn2, DC_LCB_ERR_INFO_CRC_ERR_LN2,
-                                 CNTR_SYNTH),
-[C_DC_CRC_LN3] = DC_PERF_CNTR_LCB(DcCrcLn3, DC_LCB_ERR_INFO_CRC_ERR_LN3,
-                                 CNTR_SYNTH),
-[C_DC_CRC_MULT_LN] =
-       DC_PERF_CNTR_LCB(DcMultLn, DC_LCB_ERR_INFO_CRC_ERR_MULTI_LN,
-                        CNTR_SYNTH),
-[C_DC_TX_REPLAY] = DC_PERF_CNTR_LCB(DcTxReplay, DC_LCB_ERR_INFO_TX_REPLAY_CNT,
-                                   CNTR_SYNTH),
-[C_DC_RX_REPLAY] = DC_PERF_CNTR_LCB(DcRxReplay, DC_LCB_ERR_INFO_RX_REPLAY_CNT,
-                                   CNTR_SYNTH),
-[C_DC_SEQ_CRC_CNT] =
-       DC_PERF_CNTR_LCB(DcLinkSeqCrc, DC_LCB_ERR_INFO_SEQ_CRC_CNT,
-                        CNTR_SYNTH),
-[C_DC_ESC0_ONLY_CNT] =
-       DC_PERF_CNTR_LCB(DcEsc0, DC_LCB_ERR_INFO_ESCAPE_0_ONLY_CNT,
-                        CNTR_SYNTH),
-[C_DC_ESC0_PLUS1_CNT] =
-       DC_PERF_CNTR_LCB(DcEsc1, DC_LCB_ERR_INFO_ESCAPE_0_PLUS1_CNT,
-                        CNTR_SYNTH),
-[C_DC_ESC0_PLUS2_CNT] =
-       DC_PERF_CNTR_LCB(DcEsc0Plus2, DC_LCB_ERR_INFO_ESCAPE_0_PLUS2_CNT,
-                        CNTR_SYNTH),
-[C_DC_REINIT_FROM_PEER_CNT] =
-       DC_PERF_CNTR_LCB(DcReinitPeer, DC_LCB_ERR_INFO_REINIT_FROM_PEER_CNT,
-                        CNTR_SYNTH),
-[C_DC_SBE_CNT] = DC_PERF_CNTR_LCB(DcSbe, DC_LCB_ERR_INFO_SBE_CNT,
-                                 CNTR_SYNTH),
-[C_DC_MISC_FLG_CNT] =
-       DC_PERF_CNTR_LCB(DcMiscFlg, DC_LCB_ERR_INFO_MISC_FLG_CNT,
-                        CNTR_SYNTH),
-[C_DC_PRF_GOOD_LTP_CNT] =
-       DC_PERF_CNTR_LCB(DcGoodLTP, DC_LCB_PRF_GOOD_LTP_CNT, CNTR_SYNTH),
-[C_DC_PRF_ACCEPTED_LTP_CNT] =
-       DC_PERF_CNTR_LCB(DcAccLTP, DC_LCB_PRF_ACCEPTED_LTP_CNT,
-                        CNTR_SYNTH),
-[C_DC_PRF_RX_FLIT_CNT] =
-       DC_PERF_CNTR_LCB(DcPrfRxFlit, DC_LCB_PRF_RX_FLIT_CNT, CNTR_SYNTH),
-[C_DC_PRF_TX_FLIT_CNT] =
-       DC_PERF_CNTR_LCB(DcPrfTxFlit, DC_LCB_PRF_TX_FLIT_CNT, CNTR_SYNTH),
-[C_DC_PRF_CLK_CNTR] =
-       DC_PERF_CNTR_LCB(DcPrfClk, DC_LCB_PRF_CLK_CNTR, CNTR_SYNTH),
-[C_DC_PG_DBG_FLIT_CRDTS_CNT] =
-       DC_PERF_CNTR_LCB(DcFltCrdts, DC_LCB_PG_DBG_FLIT_CRDTS_CNT, CNTR_SYNTH),
-[C_DC_PG_STS_PAUSE_COMPLETE_CNT] =
-       DC_PERF_CNTR_LCB(DcPauseComp, DC_LCB_PG_STS_PAUSE_COMPLETE_CNT,
-                        CNTR_SYNTH),
-[C_DC_PG_STS_TX_SBE_CNT] =
-       DC_PERF_CNTR_LCB(DcStsTxSbe, DC_LCB_PG_STS_TX_SBE_CNT, CNTR_SYNTH),
-[C_DC_PG_STS_TX_MBE_CNT] =
-       DC_PERF_CNTR_LCB(DcStsTxMbe, DC_LCB_PG_STS_TX_MBE_CNT,
-                        CNTR_SYNTH),
-[C_SW_CPU_INTR] = CNTR_ELEM("Intr", 0, 0, CNTR_NORMAL,
-                           access_sw_cpu_intr),
-[C_SW_CPU_RCV_LIM] = CNTR_ELEM("RcvLimit", 0, 0, CNTR_NORMAL,
-                           access_sw_cpu_rcv_limit),
-[C_SW_VTX_WAIT] = CNTR_ELEM("vTxWait", 0, 0, CNTR_NORMAL,
-                           access_sw_vtx_wait),
-[C_SW_PIO_WAIT] = CNTR_ELEM("PioWait", 0, 0, CNTR_NORMAL,
-                           access_sw_pio_wait),
-[C_SW_PIO_DRAIN] = CNTR_ELEM("PioDrain", 0, 0, CNTR_NORMAL,
-                           access_sw_pio_drain),
-[C_SW_KMEM_WAIT] = CNTR_ELEM("KmemWait", 0, 0, CNTR_NORMAL,
-                           access_sw_kmem_wait),
-[C_SW_SEND_SCHED] = CNTR_ELEM("SendSched", 0, 0, CNTR_NORMAL,
-                           access_sw_send_schedule),
-[C_SDMA_DESC_FETCHED_CNT] = CNTR_ELEM("SDEDscFdCn",
-                                     SEND_DMA_DESC_FETCHED_CNT, 0,
-                                     CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA,
-                                     dev_access_u32_csr),
-[C_SDMA_INT_CNT] = CNTR_ELEM("SDMAInt", 0, 0,
-                            CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA,
-                            access_sde_int_cnt),
-[C_SDMA_ERR_CNT] = CNTR_ELEM("SDMAErrCt", 0, 0,
-                            CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA,
-                            access_sde_err_cnt),
-[C_SDMA_IDLE_INT_CNT] = CNTR_ELEM("SDMAIdInt", 0, 0,
-                                 CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA,
-                                 access_sde_idle_int_cnt),
-[C_SDMA_PROGRESS_INT_CNT] = CNTR_ELEM("SDMAPrIntCn", 0, 0,
-                                     CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA,
-                                     access_sde_progress_int_cnt),
-/* MISC_ERR_STATUS */
-[C_MISC_PLL_LOCK_FAIL_ERR] = CNTR_ELEM("MISC_PLL_LOCK_FAIL_ERR", 0, 0,
-                               CNTR_NORMAL,
-                               access_misc_pll_lock_fail_err_cnt),
-[C_MISC_MBIST_FAIL_ERR] = CNTR_ELEM("MISC_MBIST_FAIL_ERR", 0, 0,
-                               CNTR_NORMAL,
-                               access_misc_mbist_fail_err_cnt),
-[C_MISC_INVALID_EEP_CMD_ERR] = CNTR_ELEM("MISC_INVALID_EEP_CMD_ERR", 0, 0,
-                               CNTR_NORMAL,
-                               access_misc_invalid_eep_cmd_err_cnt),
-[C_MISC_EFUSE_DONE_PARITY_ERR] = CNTR_ELEM("MISC_EFUSE_DONE_PARITY_ERR", 0, 0,
-                               CNTR_NORMAL,
-                               access_misc_efuse_done_parity_err_cnt),
-[C_MISC_EFUSE_WRITE_ERR] = CNTR_ELEM("MISC_EFUSE_WRITE_ERR", 0, 0,
-                               CNTR_NORMAL,
-                               access_misc_efuse_write_err_cnt),
-[C_MISC_EFUSE_READ_BAD_ADDR_ERR] = CNTR_ELEM("MISC_EFUSE_READ_BAD_ADDR_ERR", 0,
-                               0, CNTR_NORMAL,
-                               access_misc_efuse_read_bad_addr_err_cnt),
-[C_MISC_EFUSE_CSR_PARITY_ERR] = CNTR_ELEM("MISC_EFUSE_CSR_PARITY_ERR", 0, 0,
-                               CNTR_NORMAL,
-                               access_misc_efuse_csr_parity_err_cnt),
-[C_MISC_FW_AUTH_FAILED_ERR] = CNTR_ELEM("MISC_FW_AUTH_FAILED_ERR", 0, 0,
-                               CNTR_NORMAL,
-                               access_misc_fw_auth_failed_err_cnt),
-[C_MISC_KEY_MISMATCH_ERR] = CNTR_ELEM("MISC_KEY_MISMATCH_ERR", 0, 0,
-                               CNTR_NORMAL,
-                               access_misc_key_mismatch_err_cnt),
-[C_MISC_SBUS_WRITE_FAILED_ERR] = CNTR_ELEM("MISC_SBUS_WRITE_FAILED_ERR", 0, 0,
-                               CNTR_NORMAL,
-                               access_misc_sbus_write_failed_err_cnt),
-[C_MISC_CSR_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("MISC_CSR_WRITE_BAD_ADDR_ERR", 0, 0,
-                               CNTR_NORMAL,
-                               access_misc_csr_write_bad_addr_err_cnt),
-[C_MISC_CSR_READ_BAD_ADDR_ERR] = CNTR_ELEM("MISC_CSR_READ_BAD_ADDR_ERR", 0, 0,
-                               CNTR_NORMAL,
-                               access_misc_csr_read_bad_addr_err_cnt),
-[C_MISC_CSR_PARITY_ERR] = CNTR_ELEM("MISC_CSR_PARITY_ERR", 0, 0,
-                               CNTR_NORMAL,
-                               access_misc_csr_parity_err_cnt),
-/* CceErrStatus */
-[C_CCE_ERR_STATUS_AGGREGATED_CNT] = CNTR_ELEM("CceErrStatusAggregatedCnt", 0, 0,
-                               CNTR_NORMAL,
-                               access_sw_cce_err_status_aggregated_cnt),
-[C_CCE_MSIX_CSR_PARITY_ERR] = CNTR_ELEM("CceMsixCsrParityErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_cce_msix_csr_parity_err_cnt),
-[C_CCE_INT_MAP_UNC_ERR] = CNTR_ELEM("CceIntMapUncErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_cce_int_map_unc_err_cnt),
-[C_CCE_INT_MAP_COR_ERR] = CNTR_ELEM("CceIntMapCorErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_cce_int_map_cor_err_cnt),
-[C_CCE_MSIX_TABLE_UNC_ERR] = CNTR_ELEM("CceMsixTableUncErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_cce_msix_table_unc_err_cnt),
-[C_CCE_MSIX_TABLE_COR_ERR] = CNTR_ELEM("CceMsixTableCorErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_cce_msix_table_cor_err_cnt),
-[C_CCE_RXDMA_CONV_FIFO_PARITY_ERR] = CNTR_ELEM("CceRxdmaConvFifoParityErr", 0,
-                               0, CNTR_NORMAL,
-                               access_cce_rxdma_conv_fifo_parity_err_cnt),
-[C_CCE_RCPL_ASYNC_FIFO_PARITY_ERR] = CNTR_ELEM("CceRcplAsyncFifoParityErr", 0,
-                               0, CNTR_NORMAL,
-                               access_cce_rcpl_async_fifo_parity_err_cnt),
-[C_CCE_SEG_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("CceSegWriteBadAddrErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_cce_seg_write_bad_addr_err_cnt),
-[C_CCE_SEG_READ_BAD_ADDR_ERR] = CNTR_ELEM("CceSegReadBadAddrErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_cce_seg_read_bad_addr_err_cnt),
-[C_LA_TRIGGERED] = CNTR_ELEM("Cce LATriggered", 0, 0,
-                               CNTR_NORMAL,
-                               access_la_triggered_cnt),
-[C_CCE_TRGT_CPL_TIMEOUT_ERR] = CNTR_ELEM("CceTrgtCplTimeoutErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_cce_trgt_cpl_timeout_err_cnt),
-[C_PCIC_RECEIVE_PARITY_ERR] = CNTR_ELEM("PcicReceiveParityErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_pcic_receive_parity_err_cnt),
-[C_PCIC_TRANSMIT_BACK_PARITY_ERR] = CNTR_ELEM("PcicTransmitBackParityErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_pcic_transmit_back_parity_err_cnt),
-[C_PCIC_TRANSMIT_FRONT_PARITY_ERR] = CNTR_ELEM("PcicTransmitFrontParityErr", 0,
-                               0, CNTR_NORMAL,
-                               access_pcic_transmit_front_parity_err_cnt),
-[C_PCIC_CPL_DAT_Q_UNC_ERR] = CNTR_ELEM("PcicCplDatQUncErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_pcic_cpl_dat_q_unc_err_cnt),
-[C_PCIC_CPL_HD_Q_UNC_ERR] = CNTR_ELEM("PcicCplHdQUncErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_pcic_cpl_hd_q_unc_err_cnt),
-[C_PCIC_POST_DAT_Q_UNC_ERR] = CNTR_ELEM("PcicPostDatQUncErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_pcic_post_dat_q_unc_err_cnt),
-[C_PCIC_POST_HD_Q_UNC_ERR] = CNTR_ELEM("PcicPostHdQUncErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_pcic_post_hd_q_unc_err_cnt),
-[C_PCIC_RETRY_SOT_MEM_UNC_ERR] = CNTR_ELEM("PcicRetrySotMemUncErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_pcic_retry_sot_mem_unc_err_cnt),
-[C_PCIC_RETRY_MEM_UNC_ERR] = CNTR_ELEM("PcicRetryMemUncErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_pcic_retry_mem_unc_err),
-[C_PCIC_N_POST_DAT_Q_PARITY_ERR] = CNTR_ELEM("PcicNPostDatQParityErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_pcic_n_post_dat_q_parity_err_cnt),
-[C_PCIC_N_POST_H_Q_PARITY_ERR] = CNTR_ELEM("PcicNPostHQParityErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_pcic_n_post_h_q_parity_err_cnt),
-[C_PCIC_CPL_DAT_Q_COR_ERR] = CNTR_ELEM("PcicCplDatQCorErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_pcic_cpl_dat_q_cor_err_cnt),
-[C_PCIC_CPL_HD_Q_COR_ERR] = CNTR_ELEM("PcicCplHdQCorErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_pcic_cpl_hd_q_cor_err_cnt),
-[C_PCIC_POST_DAT_Q_COR_ERR] = CNTR_ELEM("PcicPostDatQCorErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_pcic_post_dat_q_cor_err_cnt),
-[C_PCIC_POST_HD_Q_COR_ERR] = CNTR_ELEM("PcicPostHdQCorErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_pcic_post_hd_q_cor_err_cnt),
-[C_PCIC_RETRY_SOT_MEM_COR_ERR] = CNTR_ELEM("PcicRetrySotMemCorErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_pcic_retry_sot_mem_cor_err_cnt),
-[C_PCIC_RETRY_MEM_COR_ERR] = CNTR_ELEM("PcicRetryMemCorErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_pcic_retry_mem_cor_err_cnt),
-[C_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERR] = CNTR_ELEM(
-                               "CceCli1AsyncFifoDbgParityError", 0, 0,
-                               CNTR_NORMAL,
-                               access_cce_cli1_async_fifo_dbg_parity_err_cnt),
-[C_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERR] = CNTR_ELEM(
-                               "CceCli1AsyncFifoRxdmaParityError", 0, 0,
-                               CNTR_NORMAL,
-                               access_cce_cli1_async_fifo_rxdma_parity_err_cnt
-                               ),
-[C_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR] = CNTR_ELEM(
-                       "CceCli1AsyncFifoSdmaHdParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_cce_cli1_async_fifo_sdma_hd_parity_err_cnt),
-[C_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR] = CNTR_ELEM(
-                       "CceCli1AsyncFifoPioCrdtParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_cce_cl1_async_fifo_pio_crdt_parity_err_cnt),
-[C_CCE_CLI2_ASYNC_FIFO_PARITY_ERR] = CNTR_ELEM("CceCli2AsyncFifoParityErr", 0,
-                       0, CNTR_NORMAL,
-                       access_cce_cli2_async_fifo_parity_err_cnt),
-[C_CCE_CSR_CFG_BUS_PARITY_ERR] = CNTR_ELEM("CceCsrCfgBusParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_cce_csr_cfg_bus_parity_err_cnt),
-[C_CCE_CLI0_ASYNC_FIFO_PARTIY_ERR] = CNTR_ELEM("CceCli0AsyncFifoParityErr", 0,
-                       0, CNTR_NORMAL,
-                       access_cce_cli0_async_fifo_parity_err_cnt),
-[C_CCE_RSPD_DATA_PARITY_ERR] = CNTR_ELEM("CceRspdDataParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_cce_rspd_data_parity_err_cnt),
-[C_CCE_TRGT_ACCESS_ERR] = CNTR_ELEM("CceTrgtAccessErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_cce_trgt_access_err_cnt),
-[C_CCE_TRGT_ASYNC_FIFO_PARITY_ERR] = CNTR_ELEM("CceTrgtAsyncFifoParityErr", 0,
-                       0, CNTR_NORMAL,
-                       access_cce_trgt_async_fifo_parity_err_cnt),
-[C_CCE_CSR_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("CceCsrWriteBadAddrErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_cce_csr_write_bad_addr_err_cnt),
-[C_CCE_CSR_READ_BAD_ADDR_ERR] = CNTR_ELEM("CceCsrReadBadAddrErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_cce_csr_read_bad_addr_err_cnt),
-[C_CCE_CSR_PARITY_ERR] = CNTR_ELEM("CceCsrParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_ccs_csr_parity_err_cnt),
-
-/* RcvErrStatus */
-[C_RX_CSR_PARITY_ERR] = CNTR_ELEM("RxCsrParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_csr_parity_err_cnt),
-[C_RX_CSR_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("RxCsrWriteBadAddrErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_csr_write_bad_addr_err_cnt),
-[C_RX_CSR_READ_BAD_ADDR_ERR] = CNTR_ELEM("RxCsrReadBadAddrErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_csr_read_bad_addr_err_cnt),
-[C_RX_DMA_CSR_UNC_ERR] = CNTR_ELEM("RxDmaCsrUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_dma_csr_unc_err_cnt),
-[C_RX_DMA_DQ_FSM_ENCODING_ERR] = CNTR_ELEM("RxDmaDqFsmEncodingErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_dma_dq_fsm_encoding_err_cnt),
-[C_RX_DMA_EQ_FSM_ENCODING_ERR] = CNTR_ELEM("RxDmaEqFsmEncodingErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_dma_eq_fsm_encoding_err_cnt),
-[C_RX_DMA_CSR_PARITY_ERR] = CNTR_ELEM("RxDmaCsrParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_dma_csr_parity_err_cnt),
-[C_RX_RBUF_DATA_COR_ERR] = CNTR_ELEM("RxRbufDataCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_data_cor_err_cnt),
-[C_RX_RBUF_DATA_UNC_ERR] = CNTR_ELEM("RxRbufDataUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_data_unc_err_cnt),
-[C_RX_DMA_DATA_FIFO_RD_COR_ERR] = CNTR_ELEM("RxDmaDataFifoRdCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_dma_data_fifo_rd_cor_err_cnt),
-[C_RX_DMA_DATA_FIFO_RD_UNC_ERR] = CNTR_ELEM("RxDmaDataFifoRdUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_dma_data_fifo_rd_unc_err_cnt),
-[C_RX_DMA_HDR_FIFO_RD_COR_ERR] = CNTR_ELEM("RxDmaHdrFifoRdCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_dma_hdr_fifo_rd_cor_err_cnt),
-[C_RX_DMA_HDR_FIFO_RD_UNC_ERR] = CNTR_ELEM("RxDmaHdrFifoRdUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_dma_hdr_fifo_rd_unc_err_cnt),
-[C_RX_RBUF_DESC_PART2_COR_ERR] = CNTR_ELEM("RxRbufDescPart2CorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_desc_part2_cor_err_cnt),
-[C_RX_RBUF_DESC_PART2_UNC_ERR] = CNTR_ELEM("RxRbufDescPart2UncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_desc_part2_unc_err_cnt),
-[C_RX_RBUF_DESC_PART1_COR_ERR] = CNTR_ELEM("RxRbufDescPart1CorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_desc_part1_cor_err_cnt),
-[C_RX_RBUF_DESC_PART1_UNC_ERR] = CNTR_ELEM("RxRbufDescPart1UncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_desc_part1_unc_err_cnt),
-[C_RX_HQ_INTR_FSM_ERR] = CNTR_ELEM("RxHqIntrFsmErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_hq_intr_fsm_err_cnt),
-[C_RX_HQ_INTR_CSR_PARITY_ERR] = CNTR_ELEM("RxHqIntrCsrParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_hq_intr_csr_parity_err_cnt),
-[C_RX_LOOKUP_CSR_PARITY_ERR] = CNTR_ELEM("RxLookupCsrParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_lookup_csr_parity_err_cnt),
-[C_RX_LOOKUP_RCV_ARRAY_COR_ERR] = CNTR_ELEM("RxLookupRcvArrayCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_lookup_rcv_array_cor_err_cnt),
-[C_RX_LOOKUP_RCV_ARRAY_UNC_ERR] = CNTR_ELEM("RxLookupRcvArrayUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_lookup_rcv_array_unc_err_cnt),
-[C_RX_LOOKUP_DES_PART2_PARITY_ERR] = CNTR_ELEM("RxLookupDesPart2ParityErr", 0,
-                       0, CNTR_NORMAL,
-                       access_rx_lookup_des_part2_parity_err_cnt),
-[C_RX_LOOKUP_DES_PART1_UNC_COR_ERR] = CNTR_ELEM("RxLookupDesPart1UncCorErr", 0,
-                       0, CNTR_NORMAL,
-                       access_rx_lookup_des_part1_unc_cor_err_cnt),
-[C_RX_LOOKUP_DES_PART1_UNC_ERR] = CNTR_ELEM("RxLookupDesPart1UncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_lookup_des_part1_unc_err_cnt),
-[C_RX_RBUF_NEXT_FREE_BUF_COR_ERR] = CNTR_ELEM("RxRbufNextFreeBufCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_next_free_buf_cor_err_cnt),
-[C_RX_RBUF_NEXT_FREE_BUF_UNC_ERR] = CNTR_ELEM("RxRbufNextFreeBufUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_next_free_buf_unc_err_cnt),
-[C_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR] = CNTR_ELEM(
-                       "RxRbufFlInitWrAddrParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rbuf_fl_init_wr_addr_parity_err_cnt),
-[C_RX_RBUF_FL_INITDONE_PARITY_ERR] = CNTR_ELEM("RxRbufFlInitdoneParityErr", 0,
-                       0, CNTR_NORMAL,
-                       access_rx_rbuf_fl_initdone_parity_err_cnt),
-[C_RX_RBUF_FL_WRITE_ADDR_PARITY_ERR] = CNTR_ELEM("RxRbufFlWrAddrParityErr", 0,
-                       0, CNTR_NORMAL,
-                       access_rx_rbuf_fl_write_addr_parity_err_cnt),
-[C_RX_RBUF_FL_RD_ADDR_PARITY_ERR] = CNTR_ELEM("RxRbufFlRdAddrParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_fl_rd_addr_parity_err_cnt),
-[C_RX_RBUF_EMPTY_ERR] = CNTR_ELEM("RxRbufEmptyErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_empty_err_cnt),
-[C_RX_RBUF_FULL_ERR] = CNTR_ELEM("RxRbufFullErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_full_err_cnt),
-[C_RX_RBUF_BAD_LOOKUP_ERR] = CNTR_ELEM("RxRBufBadLookupErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rbuf_bad_lookup_err_cnt),
-[C_RX_RBUF_CTX_ID_PARITY_ERR] = CNTR_ELEM("RxRbufCtxIdParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rbuf_ctx_id_parity_err_cnt),
-[C_RX_RBUF_CSR_QEOPDW_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQEOPDWParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rbuf_csr_qeopdw_parity_err_cnt),
-[C_RX_RBUF_CSR_Q_NUM_OF_PKT_PARITY_ERR] = CNTR_ELEM(
-                       "RxRbufCsrQNumOfPktParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_csr_q_num_of_pkt_parity_err_cnt),
-[C_RX_RBUF_CSR_Q_T1_PTR_PARITY_ERR] = CNTR_ELEM(
-                       "RxRbufCsrQTlPtrParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_csr_q_t1_ptr_parity_err_cnt),
-[C_RX_RBUF_CSR_Q_HD_PTR_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQHdPtrParityErr", 0,
-                       0, CNTR_NORMAL,
-                       access_rx_rbuf_csr_q_hd_ptr_parity_err_cnt),
-[C_RX_RBUF_CSR_Q_VLD_BIT_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQVldBitParityErr", 0,
-                       0, CNTR_NORMAL,
-                       access_rx_rbuf_csr_q_vld_bit_parity_err_cnt),
-[C_RX_RBUF_CSR_Q_NEXT_BUF_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQNextBufParityErr",
-                       0, 0, CNTR_NORMAL,
-                       access_rx_rbuf_csr_q_next_buf_parity_err_cnt),
-[C_RX_RBUF_CSR_Q_ENT_CNT_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQEntCntParityErr", 0,
-                       0, CNTR_NORMAL,
-                       access_rx_rbuf_csr_q_ent_cnt_parity_err_cnt),
-[C_RX_RBUF_CSR_Q_HEAD_BUF_NUM_PARITY_ERR] = CNTR_ELEM(
-                       "RxRbufCsrQHeadBufNumParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_csr_q_head_buf_num_parity_err_cnt),
-[C_RX_RBUF_BLOCK_LIST_READ_COR_ERR] = CNTR_ELEM("RxRbufBlockListReadCorErr", 0,
-                       0, CNTR_NORMAL,
-                       access_rx_rbuf_block_list_read_cor_err_cnt),
-[C_RX_RBUF_BLOCK_LIST_READ_UNC_ERR] = CNTR_ELEM("RxRbufBlockListReadUncErr", 0,
-                       0, CNTR_NORMAL,
-                       access_rx_rbuf_block_list_read_unc_err_cnt),
-[C_RX_RBUF_LOOKUP_DES_COR_ERR] = CNTR_ELEM("RxRbufLookupDesCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_lookup_des_cor_err_cnt),
-[C_RX_RBUF_LOOKUP_DES_UNC_ERR] = CNTR_ELEM("RxRbufLookupDesUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_lookup_des_unc_err_cnt),
-[C_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR] = CNTR_ELEM(
-                       "RxRbufLookupDesRegUncCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_lookup_des_reg_unc_cor_err_cnt),
-[C_RX_RBUF_LOOKUP_DES_REG_UNC_ERR] = CNTR_ELEM("RxRbufLookupDesRegUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_lookup_des_reg_unc_err_cnt),
-[C_RX_RBUF_FREE_LIST_COR_ERR] = CNTR_ELEM("RxRbufFreeListCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_free_list_cor_err_cnt),
-[C_RX_RBUF_FREE_LIST_UNC_ERR] = CNTR_ELEM("RxRbufFreeListUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_free_list_unc_err_cnt),
-[C_RX_RCV_FSM_ENCODING_ERR] = CNTR_ELEM("RxRcvFsmEncodingErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rcv_fsm_encoding_err_cnt),
-[C_RX_DMA_FLAG_COR_ERR] = CNTR_ELEM("RxDmaFlagCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_dma_flag_cor_err_cnt),
-[C_RX_DMA_FLAG_UNC_ERR] = CNTR_ELEM("RxDmaFlagUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_dma_flag_unc_err_cnt),
-[C_RX_DC_SOP_EOP_PARITY_ERR] = CNTR_ELEM("RxDcSopEopParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_dc_sop_eop_parity_err_cnt),
-[C_RX_RCV_CSR_PARITY_ERR] = CNTR_ELEM("RxRcvCsrParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rcv_csr_parity_err_cnt),
-[C_RX_RCV_QP_MAP_TABLE_COR_ERR] = CNTR_ELEM("RxRcvQpMapTableCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rcv_qp_map_table_cor_err_cnt),
-[C_RX_RCV_QP_MAP_TABLE_UNC_ERR] = CNTR_ELEM("RxRcvQpMapTableUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rcv_qp_map_table_unc_err_cnt),
-[C_RX_RCV_DATA_COR_ERR] = CNTR_ELEM("RxRcvDataCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rcv_data_cor_err_cnt),
-[C_RX_RCV_DATA_UNC_ERR] = CNTR_ELEM("RxRcvDataUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rcv_data_unc_err_cnt),
-[C_RX_RCV_HDR_COR_ERR] = CNTR_ELEM("RxRcvHdrCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rcv_hdr_cor_err_cnt),
-[C_RX_RCV_HDR_UNC_ERR] = CNTR_ELEM("RxRcvHdrUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rcv_hdr_unc_err_cnt),
-[C_RX_DC_INTF_PARITY_ERR] = CNTR_ELEM("RxDcIntfParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_dc_intf_parity_err_cnt),
-[C_RX_DMA_CSR_COR_ERR] = CNTR_ELEM("RxDmaCsrCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_dma_csr_cor_err_cnt),
-/* SendPioErrStatus */
-[C_PIO_PEC_SOP_HEAD_PARITY_ERR] = CNTR_ELEM("PioPecSopHeadParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_pec_sop_head_parity_err_cnt),
-[C_PIO_PCC_SOP_HEAD_PARITY_ERR] = CNTR_ELEM("PioPccSopHeadParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_pcc_sop_head_parity_err_cnt),
-[C_PIO_LAST_RETURNED_CNT_PARITY_ERR] = CNTR_ELEM("PioLastReturnedCntParityErr",
-                       0, 0, CNTR_NORMAL,
-                       access_pio_last_returned_cnt_parity_err_cnt),
-[C_PIO_CURRENT_FREE_CNT_PARITY_ERR] = CNTR_ELEM("PioCurrentFreeCntParityErr", 0,
-                       0, CNTR_NORMAL,
-                       access_pio_current_free_cnt_parity_err_cnt),
-[C_PIO_RSVD_31_ERR] = CNTR_ELEM("Pio Reserved 31", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_reserved_31_err_cnt),
-[C_PIO_RSVD_30_ERR] = CNTR_ELEM("Pio Reserved 30", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_reserved_30_err_cnt),
-[C_PIO_PPMC_SOP_LEN_ERR] = CNTR_ELEM("PioPpmcSopLenErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_ppmc_sop_len_err_cnt),
-[C_PIO_PPMC_BQC_MEM_PARITY_ERR] = CNTR_ELEM("PioPpmcBqcMemParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_ppmc_bqc_mem_parity_err_cnt),
-[C_PIO_VL_FIFO_PARITY_ERR] = CNTR_ELEM("PioVlFifoParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_vl_fifo_parity_err_cnt),
-[C_PIO_VLF_SOP_PARITY_ERR] = CNTR_ELEM("PioVlfSopParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_vlf_sop_parity_err_cnt),
-[C_PIO_VLF_V1_LEN_PARITY_ERR] = CNTR_ELEM("PioVlfVlLenParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_vlf_v1_len_parity_err_cnt),
-[C_PIO_BLOCK_QW_COUNT_PARITY_ERR] = CNTR_ELEM("PioBlockQwCountParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_block_qw_count_parity_err_cnt),
-[C_PIO_WRITE_QW_VALID_PARITY_ERR] = CNTR_ELEM("PioWriteQwValidParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_write_qw_valid_parity_err_cnt),
-[C_PIO_STATE_MACHINE_ERR] = CNTR_ELEM("PioStateMachineErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_state_machine_err_cnt),
-[C_PIO_WRITE_DATA_PARITY_ERR] = CNTR_ELEM("PioWriteDataParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_write_data_parity_err_cnt),
-[C_PIO_HOST_ADDR_MEM_COR_ERR] = CNTR_ELEM("PioHostAddrMemCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_host_addr_mem_cor_err_cnt),
-[C_PIO_HOST_ADDR_MEM_UNC_ERR] = CNTR_ELEM("PioHostAddrMemUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_host_addr_mem_unc_err_cnt),
-[C_PIO_PKT_EVICT_SM_OR_ARM_SM_ERR] = CNTR_ELEM("PioPktEvictSmOrArbSmErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_pkt_evict_sm_or_arb_sm_err_cnt),
-[C_PIO_INIT_SM_IN_ERR] = CNTR_ELEM("PioInitSmInErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_init_sm_in_err_cnt),
-[C_PIO_PPMC_PBL_FIFO_ERR] = CNTR_ELEM("PioPpmcPblFifoErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_ppmc_pbl_fifo_err_cnt),
-[C_PIO_CREDIT_RET_FIFO_PARITY_ERR] = CNTR_ELEM("PioCreditRetFifoParityErr", 0,
-                       0, CNTR_NORMAL,
-                       access_pio_credit_ret_fifo_parity_err_cnt),
-[C_PIO_V1_LEN_MEM_BANK1_COR_ERR] = CNTR_ELEM("PioVlLenMemBank1CorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_v1_len_mem_bank1_cor_err_cnt),
-[C_PIO_V1_LEN_MEM_BANK0_COR_ERR] = CNTR_ELEM("PioVlLenMemBank0CorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_v1_len_mem_bank0_cor_err_cnt),
-[C_PIO_V1_LEN_MEM_BANK1_UNC_ERR] = CNTR_ELEM("PioVlLenMemBank1UncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_v1_len_mem_bank1_unc_err_cnt),
-[C_PIO_V1_LEN_MEM_BANK0_UNC_ERR] = CNTR_ELEM("PioVlLenMemBank0UncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_v1_len_mem_bank0_unc_err_cnt),
-[C_PIO_SM_PKT_RESET_PARITY_ERR] = CNTR_ELEM("PioSmPktResetParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_sm_pkt_reset_parity_err_cnt),
-[C_PIO_PKT_EVICT_FIFO_PARITY_ERR] = CNTR_ELEM("PioPktEvictFifoParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_pkt_evict_fifo_parity_err_cnt),
-[C_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR] = CNTR_ELEM(
-                       "PioSbrdctrlCrrelFifoParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_sbrdctrl_crrel_fifo_parity_err_cnt),
-[C_PIO_SBRDCTL_CRREL_PARITY_ERR] = CNTR_ELEM("PioSbrdctlCrrelParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_sbrdctl_crrel_parity_err_cnt),
-[C_PIO_PEC_FIFO_PARITY_ERR] = CNTR_ELEM("PioPecFifoParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_pec_fifo_parity_err_cnt),
-[C_PIO_PCC_FIFO_PARITY_ERR] = CNTR_ELEM("PioPccFifoParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_pcc_fifo_parity_err_cnt),
-[C_PIO_SB_MEM_FIFO1_ERR] = CNTR_ELEM("PioSbMemFifo1Err", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_sb_mem_fifo1_err_cnt),
-[C_PIO_SB_MEM_FIFO0_ERR] = CNTR_ELEM("PioSbMemFifo0Err", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_sb_mem_fifo0_err_cnt),
-[C_PIO_CSR_PARITY_ERR] = CNTR_ELEM("PioCsrParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_csr_parity_err_cnt),
-[C_PIO_WRITE_ADDR_PARITY_ERR] = CNTR_ELEM("PioWriteAddrParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_write_addr_parity_err_cnt),
-[C_PIO_WRITE_BAD_CTXT_ERR] = CNTR_ELEM("PioWriteBadCtxtErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_write_bad_ctxt_err_cnt),
-/* SendDmaErrStatus */
-[C_SDMA_PCIE_REQ_TRACKING_COR_ERR] = CNTR_ELEM("SDmaPcieReqTrackingCorErr", 0,
-                       0, CNTR_NORMAL,
-                       access_sdma_pcie_req_tracking_cor_err_cnt),
-[C_SDMA_PCIE_REQ_TRACKING_UNC_ERR] = CNTR_ELEM("SDmaPcieReqTrackingUncErr", 0,
-                       0, CNTR_NORMAL,
-                       access_sdma_pcie_req_tracking_unc_err_cnt),
-[C_SDMA_CSR_PARITY_ERR] = CNTR_ELEM("SDmaCsrParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_csr_parity_err_cnt),
-[C_SDMA_RPY_TAG_ERR] = CNTR_ELEM("SDmaRpyTagErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_rpy_tag_err_cnt),
-/* SendEgressErrStatus */
-[C_TX_READ_PIO_MEMORY_CSR_UNC_ERR] = CNTR_ELEM("TxReadPioMemoryCsrUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_read_pio_memory_csr_unc_err_cnt),
-[C_TX_READ_SDMA_MEMORY_CSR_UNC_ERR] = CNTR_ELEM("TxReadSdmaMemoryCsrUncErr", 0,
-                       0, CNTR_NORMAL,
-                       access_tx_read_sdma_memory_csr_err_cnt),
-[C_TX_EGRESS_FIFO_COR_ERR] = CNTR_ELEM("TxEgressFifoCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_egress_fifo_cor_err_cnt),
-[C_TX_READ_PIO_MEMORY_COR_ERR] = CNTR_ELEM("TxReadPioMemoryCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_read_pio_memory_cor_err_cnt),
-[C_TX_READ_SDMA_MEMORY_COR_ERR] = CNTR_ELEM("TxReadSdmaMemoryCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_read_sdma_memory_cor_err_cnt),
-[C_TX_SB_HDR_COR_ERR] = CNTR_ELEM("TxSbHdrCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_sb_hdr_cor_err_cnt),
-[C_TX_CREDIT_OVERRUN_ERR] = CNTR_ELEM("TxCreditOverrunErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_credit_overrun_err_cnt),
-[C_TX_LAUNCH_FIFO8_COR_ERR] = CNTR_ELEM("TxLaunchFifo8CorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_launch_fifo8_cor_err_cnt),
-[C_TX_LAUNCH_FIFO7_COR_ERR] = CNTR_ELEM("TxLaunchFifo7CorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_launch_fifo7_cor_err_cnt),
-[C_TX_LAUNCH_FIFO6_COR_ERR] = CNTR_ELEM("TxLaunchFifo6CorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_launch_fifo6_cor_err_cnt),
-[C_TX_LAUNCH_FIFO5_COR_ERR] = CNTR_ELEM("TxLaunchFifo5CorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_launch_fifo5_cor_err_cnt),
-[C_TX_LAUNCH_FIFO4_COR_ERR] = CNTR_ELEM("TxLaunchFifo4CorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_launch_fifo4_cor_err_cnt),
-[C_TX_LAUNCH_FIFO3_COR_ERR] = CNTR_ELEM("TxLaunchFifo3CorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_launch_fifo3_cor_err_cnt),
-[C_TX_LAUNCH_FIFO2_COR_ERR] = CNTR_ELEM("TxLaunchFifo2CorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_launch_fifo2_cor_err_cnt),
-[C_TX_LAUNCH_FIFO1_COR_ERR] = CNTR_ELEM("TxLaunchFifo1CorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_launch_fifo1_cor_err_cnt),
-[C_TX_LAUNCH_FIFO0_COR_ERR] = CNTR_ELEM("TxLaunchFifo0CorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_launch_fifo0_cor_err_cnt),
-[C_TX_CREDIT_RETURN_VL_ERR] = CNTR_ELEM("TxCreditReturnVLErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_credit_return_vl_err_cnt),
-[C_TX_HCRC_INSERTION_ERR] = CNTR_ELEM("TxHcrcInsertionErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_hcrc_insertion_err_cnt),
-[C_TX_EGRESS_FIFI_UNC_ERR] = CNTR_ELEM("TxEgressFifoUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_egress_fifo_unc_err_cnt),
-[C_TX_READ_PIO_MEMORY_UNC_ERR] = CNTR_ELEM("TxReadPioMemoryUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_read_pio_memory_unc_err_cnt),
-[C_TX_READ_SDMA_MEMORY_UNC_ERR] = CNTR_ELEM("TxReadSdmaMemoryUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_read_sdma_memory_unc_err_cnt),
-[C_TX_SB_HDR_UNC_ERR] = CNTR_ELEM("TxSbHdrUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_sb_hdr_unc_err_cnt),
-[C_TX_CREDIT_RETURN_PARITY_ERR] = CNTR_ELEM("TxCreditReturnParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_credit_return_partiy_err_cnt),
-[C_TX_LAUNCH_FIFO8_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo8UncOrParityErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_launch_fifo8_unc_or_parity_err_cnt),
-[C_TX_LAUNCH_FIFO7_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo7UncOrParityErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_launch_fifo7_unc_or_parity_err_cnt),
-[C_TX_LAUNCH_FIFO6_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo6UncOrParityErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_launch_fifo6_unc_or_parity_err_cnt),
-[C_TX_LAUNCH_FIFO5_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo5UncOrParityErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_launch_fifo5_unc_or_parity_err_cnt),
-[C_TX_LAUNCH_FIFO4_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo4UncOrParityErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_launch_fifo4_unc_or_parity_err_cnt),
-[C_TX_LAUNCH_FIFO3_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo3UncOrParityErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_launch_fifo3_unc_or_parity_err_cnt),
-[C_TX_LAUNCH_FIFO2_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo2UncOrParityErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_launch_fifo2_unc_or_parity_err_cnt),
-[C_TX_LAUNCH_FIFO1_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo1UncOrParityErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_launch_fifo1_unc_or_parity_err_cnt),
-[C_TX_LAUNCH_FIFO0_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo0UncOrParityErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_launch_fifo0_unc_or_parity_err_cnt),
-[C_TX_SDMA15_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma15DisallowedPacketErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma15_disallowed_packet_err_cnt),
-[C_TX_SDMA14_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma14DisallowedPacketErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma14_disallowed_packet_err_cnt),
-[C_TX_SDMA13_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma13DisallowedPacketErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma13_disallowed_packet_err_cnt),
-[C_TX_SDMA12_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma12DisallowedPacketErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma12_disallowed_packet_err_cnt),
-[C_TX_SDMA11_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma11DisallowedPacketErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma11_disallowed_packet_err_cnt),
-[C_TX_SDMA10_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma10DisallowedPacketErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma10_disallowed_packet_err_cnt),
-[C_TX_SDMA9_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma9DisallowedPacketErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma9_disallowed_packet_err_cnt),
-[C_TX_SDMA8_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma8DisallowedPacketErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma8_disallowed_packet_err_cnt),
-[C_TX_SDMA7_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma7DisallowedPacketErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma7_disallowed_packet_err_cnt),
-[C_TX_SDMA6_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma6DisallowedPacketErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma6_disallowed_packet_err_cnt),
-[C_TX_SDMA5_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma5DisallowedPacketErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma5_disallowed_packet_err_cnt),
-[C_TX_SDMA4_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma4DisallowedPacketErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma4_disallowed_packet_err_cnt),
-[C_TX_SDMA3_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma3DisallowedPacketErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma3_disallowed_packet_err_cnt),
-[C_TX_SDMA2_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma2DisallowedPacketErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma2_disallowed_packet_err_cnt),
-[C_TX_SDMA1_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma1DisallowedPacketErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma1_disallowed_packet_err_cnt),
-[C_TX_SDMA0_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma0DisallowedPacketErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma0_disallowed_packet_err_cnt),
-[C_TX_CONFIG_PARITY_ERR] = CNTR_ELEM("TxConfigParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_config_parity_err_cnt),
-[C_TX_SBRD_CTL_CSR_PARITY_ERR] = CNTR_ELEM("TxSbrdCtlCsrParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_sbrd_ctl_csr_parity_err_cnt),
-[C_TX_LAUNCH_CSR_PARITY_ERR] = CNTR_ELEM("TxLaunchCsrParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_launch_csr_parity_err_cnt),
-[C_TX_ILLEGAL_CL_ERR] = CNTR_ELEM("TxIllegalVLErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_illegal_vl_err_cnt),
-[C_TX_SBRD_CTL_STATE_MACHINE_PARITY_ERR] = CNTR_ELEM(
-                       "TxSbrdCtlStateMachineParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_sbrd_ctl_state_machine_parity_err_cnt),
-[C_TX_RESERVED_10] = CNTR_ELEM("Tx Egress Reserved 10", 0, 0,
-                       CNTR_NORMAL,
-                       access_egress_reserved_10_err_cnt),
-[C_TX_RESERVED_9] = CNTR_ELEM("Tx Egress Reserved 9", 0, 0,
-                       CNTR_NORMAL,
-                       access_egress_reserved_9_err_cnt),
-[C_TX_SDMA_LAUNCH_INTF_PARITY_ERR] = CNTR_ELEM("TxSdmaLaunchIntfParityErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma_launch_intf_parity_err_cnt),
-[C_TX_PIO_LAUNCH_INTF_PARITY_ERR] = CNTR_ELEM("TxPioLaunchIntfParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_pio_launch_intf_parity_err_cnt),
-[C_TX_RESERVED_6] = CNTR_ELEM("Tx Egress Reserved 6", 0, 0,
-                       CNTR_NORMAL,
-                       access_egress_reserved_6_err_cnt),
-[C_TX_INCORRECT_LINK_STATE_ERR] = CNTR_ELEM("TxIncorrectLinkStateErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_incorrect_link_state_err_cnt),
-[C_TX_LINK_DOWN_ERR] = CNTR_ELEM("TxLinkdownErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_linkdown_err_cnt),
-[C_TX_EGRESS_FIFO_UNDERRUN_OR_PARITY_ERR] = CNTR_ELEM(
-                       "EgressFifoUnderrunOrParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_egress_fifi_underrun_or_parity_err_cnt),
-[C_TX_RESERVED_2] = CNTR_ELEM("Tx Egress Reserved 2", 0, 0,
-                       CNTR_NORMAL,
-                       access_egress_reserved_2_err_cnt),
-[C_TX_PKT_INTEGRITY_MEM_UNC_ERR] = CNTR_ELEM("TxPktIntegrityMemUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_pkt_integrity_mem_unc_err_cnt),
-[C_TX_PKT_INTEGRITY_MEM_COR_ERR] = CNTR_ELEM("TxPktIntegrityMemCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_pkt_integrity_mem_cor_err_cnt),
-/* SendErrStatus */
-[C_SEND_CSR_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("SendCsrWriteBadAddrErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_send_csr_write_bad_addr_err_cnt),
-[C_SEND_CSR_READ_BAD_ADD_ERR] = CNTR_ELEM("SendCsrReadBadAddrErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_send_csr_read_bad_addr_err_cnt),
-[C_SEND_CSR_PARITY_ERR] = CNTR_ELEM("SendCsrParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_send_csr_parity_cnt),
-/* SendCtxtErrStatus */
-[C_PIO_WRITE_OUT_OF_BOUNDS_ERR] = CNTR_ELEM("PioWriteOutOfBoundsErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_write_out_of_bounds_err_cnt),
-[C_PIO_WRITE_OVERFLOW_ERR] = CNTR_ELEM("PioWriteOverflowErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_write_overflow_err_cnt),
-[C_PIO_WRITE_CROSSES_BOUNDARY_ERR] = CNTR_ELEM("PioWriteCrossesBoundaryErr",
-                       0, 0, CNTR_NORMAL,
-                       access_pio_write_crosses_boundary_err_cnt),
-[C_PIO_DISALLOWED_PACKET_ERR] = CNTR_ELEM("PioDisallowedPacketErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_disallowed_packet_err_cnt),
-[C_PIO_INCONSISTENT_SOP_ERR] = CNTR_ELEM("PioInconsistentSopErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_inconsistent_sop_err_cnt),
-/* SendDmaEngErrStatus */
-[C_SDMA_HEADER_REQUEST_FIFO_COR_ERR] = CNTR_ELEM("SDmaHeaderRequestFifoCorErr",
-                       0, 0, CNTR_NORMAL,
-                       access_sdma_header_request_fifo_cor_err_cnt),
-[C_SDMA_HEADER_STORAGE_COR_ERR] = CNTR_ELEM("SDmaHeaderStorageCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_header_storage_cor_err_cnt),
-[C_SDMA_PACKET_TRACKING_COR_ERR] = CNTR_ELEM("SDmaPacketTrackingCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_packet_tracking_cor_err_cnt),
-[C_SDMA_ASSEMBLY_COR_ERR] = CNTR_ELEM("SDmaAssemblyCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_assembly_cor_err_cnt),
-[C_SDMA_DESC_TABLE_COR_ERR] = CNTR_ELEM("SDmaDescTableCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_desc_table_cor_err_cnt),
-[C_SDMA_HEADER_REQUEST_FIFO_UNC_ERR] = CNTR_ELEM("SDmaHeaderRequestFifoUncErr",
-                       0, 0, CNTR_NORMAL,
-                       access_sdma_header_request_fifo_unc_err_cnt),
-[C_SDMA_HEADER_STORAGE_UNC_ERR] = CNTR_ELEM("SDmaHeaderStorageUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_header_storage_unc_err_cnt),
-[C_SDMA_PACKET_TRACKING_UNC_ERR] = CNTR_ELEM("SDmaPacketTrackingUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_packet_tracking_unc_err_cnt),
-[C_SDMA_ASSEMBLY_UNC_ERR] = CNTR_ELEM("SDmaAssemblyUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_assembly_unc_err_cnt),
-[C_SDMA_DESC_TABLE_UNC_ERR] = CNTR_ELEM("SDmaDescTableUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_desc_table_unc_err_cnt),
-[C_SDMA_TIMEOUT_ERR] = CNTR_ELEM("SDmaTimeoutErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_timeout_err_cnt),
-[C_SDMA_HEADER_LENGTH_ERR] = CNTR_ELEM("SDmaHeaderLengthErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_header_length_err_cnt),
-[C_SDMA_HEADER_ADDRESS_ERR] = CNTR_ELEM("SDmaHeaderAddressErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_header_address_err_cnt),
-[C_SDMA_HEADER_SELECT_ERR] = CNTR_ELEM("SDmaHeaderSelectErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_header_select_err_cnt),
-[C_SMDA_RESERVED_9] = CNTR_ELEM("SDma Reserved 9", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_reserved_9_err_cnt),
-[C_SDMA_PACKET_DESC_OVERFLOW_ERR] = CNTR_ELEM("SDmaPacketDescOverflowErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_packet_desc_overflow_err_cnt),
-[C_SDMA_LENGTH_MISMATCH_ERR] = CNTR_ELEM("SDmaLengthMismatchErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_length_mismatch_err_cnt),
-[C_SDMA_HALT_ERR] = CNTR_ELEM("SDmaHaltErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_halt_err_cnt),
-[C_SDMA_MEM_READ_ERR] = CNTR_ELEM("SDmaMemReadErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_mem_read_err_cnt),
-[C_SDMA_FIRST_DESC_ERR] = CNTR_ELEM("SDmaFirstDescErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_first_desc_err_cnt),
-[C_SDMA_TAIL_OUT_OF_BOUNDS_ERR] = CNTR_ELEM("SDmaTailOutOfBoundsErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_tail_out_of_bounds_err_cnt),
-[C_SDMA_TOO_LONG_ERR] = CNTR_ELEM("SDmaTooLongErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_too_long_err_cnt),
-[C_SDMA_GEN_MISMATCH_ERR] = CNTR_ELEM("SDmaGenMismatchErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_gen_mismatch_err_cnt),
-[C_SDMA_WRONG_DW_ERR] = CNTR_ELEM("SDmaWrongDwErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_wrong_dw_err_cnt),
-};
-
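-/*
- * Per-port counters.  Each CNTR_ELEM entry supplies a counter name, flags
- * and an access routine; the TXE32/TXE64/RXE64 PORT_CNTR_ELEM helpers
- * additionally name the hardware count CSR backing the counter.
- */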
-static struct cntr_entry port_cntrs[PORT_CNTR_LAST] = {
-[C_TX_UNSUP_VL] = TXE32_PORT_CNTR_ELEM(TxUnVLErr, SEND_UNSUP_VL_ERR_CNT,
-                       CNTR_NORMAL),
-[C_TX_INVAL_LEN] = TXE32_PORT_CNTR_ELEM(TxInvalLen, SEND_LEN_ERR_CNT,
-                       CNTR_NORMAL),
-[C_TX_MM_LEN_ERR] = TXE32_PORT_CNTR_ELEM(TxMMLenErr, SEND_MAX_MIN_LEN_ERR_CNT,
-                       CNTR_NORMAL),
-[C_TX_UNDERRUN] = TXE32_PORT_CNTR_ELEM(TxUnderrun, SEND_UNDERRUN_CNT,
-                       CNTR_NORMAL),
-[C_TX_FLOW_STALL] = TXE32_PORT_CNTR_ELEM(TxFlowStall, SEND_FLOW_STALL_CNT,
-                       CNTR_NORMAL),
-[C_TX_DROPPED] = TXE32_PORT_CNTR_ELEM(TxDropped, SEND_DROPPED_PKT_CNT,
-                       CNTR_NORMAL),
-[C_TX_HDR_ERR] = TXE32_PORT_CNTR_ELEM(TxHdrErr, SEND_HEADERS_ERR_CNT,
-                       CNTR_NORMAL),
-[C_TX_PKT] = TXE64_PORT_CNTR_ELEM(TxPkt, SEND_DATA_PKT_CNT, CNTR_NORMAL),
-[C_TX_WORDS] = TXE64_PORT_CNTR_ELEM(TxWords, SEND_DWORD_CNT, CNTR_NORMAL),
-[C_TX_WAIT] = TXE64_PORT_CNTR_ELEM(TxWait, SEND_WAIT_CNT, CNTR_SYNTH),
-[C_TX_FLIT_VL] = TXE64_PORT_CNTR_ELEM(TxFlitVL, SEND_DATA_VL0_CNT,
-                                     CNTR_SYNTH | CNTR_VL),
-[C_TX_PKT_VL] = TXE64_PORT_CNTR_ELEM(TxPktVL, SEND_DATA_PKT_VL0_CNT,
-                                    CNTR_SYNTH | CNTR_VL),
-[C_TX_WAIT_VL] = TXE64_PORT_CNTR_ELEM(TxWaitVL, SEND_WAIT_VL0_CNT,
-                                     CNTR_SYNTH | CNTR_VL),
-[C_RX_PKT] = RXE64_PORT_CNTR_ELEM(RxPkt, RCV_DATA_PKT_CNT, CNTR_NORMAL),
-[C_RX_WORDS] = RXE64_PORT_CNTR_ELEM(RxWords, RCV_DWORD_CNT, CNTR_NORMAL),
-[C_SW_LINK_DOWN] = CNTR_ELEM("SwLinkDown", 0, 0, CNTR_SYNTH | CNTR_32BIT,
-                            access_sw_link_dn_cnt),
-[C_SW_LINK_UP] = CNTR_ELEM("SwLinkUp", 0, 0, CNTR_SYNTH | CNTR_32BIT,
-                          access_sw_link_up_cnt),
-[C_SW_UNKNOWN_FRAME] = CNTR_ELEM("UnknownFrame", 0, 0, CNTR_NORMAL,
-                                access_sw_unknown_frame_cnt),
-[C_SW_XMIT_DSCD] = CNTR_ELEM("XmitDscd", 0, 0, CNTR_SYNTH | CNTR_32BIT,
-                            access_sw_xmit_discards),
-[C_SW_XMIT_DSCD_VL] = CNTR_ELEM("XmitDscdVl", 0, 0,
-                               CNTR_SYNTH | CNTR_32BIT | CNTR_VL,
-                               access_sw_xmit_discards),
-[C_SW_XMIT_CSTR_ERR] = CNTR_ELEM("XmitCstrErr", 0, 0, CNTR_SYNTH,
-                                access_xmit_constraint_errs),
-[C_SW_RCV_CSTR_ERR] = CNTR_ELEM("RcvCstrErr", 0, 0, CNTR_SYNTH,
-                               access_rcv_constraint_errs),
-[C_SW_IBP_LOOP_PKTS] = SW_IBP_CNTR(LoopPkts, loop_pkts),
-[C_SW_IBP_RC_RESENDS] = SW_IBP_CNTR(RcResend, rc_resends),
-[C_SW_IBP_RNR_NAKS] = SW_IBP_CNTR(RnrNak, rnr_naks),
-[C_SW_IBP_OTHER_NAKS] = SW_IBP_CNTR(OtherNak, other_naks),
-[C_SW_IBP_RC_TIMEOUTS] = SW_IBP_CNTR(RcTimeOut, rc_timeouts),
-[C_SW_IBP_PKT_DROPS] = SW_IBP_CNTR(PktDrop, pkt_drops),
-[C_SW_IBP_DMA_WAIT] = SW_IBP_CNTR(DmaWait, dmawait),
-[C_SW_IBP_RC_SEQNAK] = SW_IBP_CNTR(RcSeqNak, rc_seqnak),
-[C_SW_IBP_RC_DUPREQ] = SW_IBP_CNTR(RcDupRew, rc_dupreq),
-[C_SW_IBP_RDMA_SEQ] = SW_IBP_CNTR(RdmaSeq, rdma_seq),
-[C_SW_IBP_UNALIGNED] = SW_IBP_CNTR(Unaligned, unaligned),
-[C_SW_IBP_SEQ_NAK] = SW_IBP_CNTR(SeqNak, seq_naks),
-[C_SW_CPU_RC_ACKS] = CNTR_ELEM("RcAcks", 0, 0, CNTR_NORMAL,
-                              access_sw_cpu_rc_acks),
-[C_SW_CPU_RC_QACKS] = CNTR_ELEM("RcQacks", 0, 0, CNTR_NORMAL,
-                               access_sw_cpu_rc_qacks),
-[C_SW_CPU_RC_DELAYED_COMP] = CNTR_ELEM("RcDelayComp", 0, 0, CNTR_NORMAL,
-                                      access_sw_cpu_rc_delayed_comp),
-[OVR_LBL(0)] = OVR_ELM(0), [OVR_LBL(1)] = OVR_ELM(1),
-[OVR_LBL(2)] = OVR_ELM(2), [OVR_LBL(3)] = OVR_ELM(3),
-[OVR_LBL(4)] = OVR_ELM(4), [OVR_LBL(5)] = OVR_ELM(5),
-[OVR_LBL(6)] = OVR_ELM(6), [OVR_LBL(7)] = OVR_ELM(7),
-[OVR_LBL(8)] = OVR_ELM(8), [OVR_LBL(9)] = OVR_ELM(9),
-[OVR_LBL(10)] = OVR_ELM(10), [OVR_LBL(11)] = OVR_ELM(11),
-[OVR_LBL(12)] = OVR_ELM(12), [OVR_LBL(13)] = OVR_ELM(13),
-[OVR_LBL(14)] = OVR_ELM(14), [OVR_LBL(15)] = OVR_ELM(15),
-[OVR_LBL(16)] = OVR_ELM(16), [OVR_LBL(17)] = OVR_ELM(17),
-[OVR_LBL(18)] = OVR_ELM(18), [OVR_LBL(19)] = OVR_ELM(19),
-[OVR_LBL(20)] = OVR_ELM(20), [OVR_LBL(21)] = OVR_ELM(21),
-[OVR_LBL(22)] = OVR_ELM(22), [OVR_LBL(23)] = OVR_ELM(23),
-[OVR_LBL(24)] = OVR_ELM(24), [OVR_LBL(25)] = OVR_ELM(25),
-[OVR_LBL(26)] = OVR_ELM(26), [OVR_LBL(27)] = OVR_ELM(27),
-[OVR_LBL(28)] = OVR_ELM(28), [OVR_LBL(29)] = OVR_ELM(29),
-[OVR_LBL(30)] = OVR_ELM(30), [OVR_LBL(31)] = OVR_ELM(31),
-[OVR_LBL(32)] = OVR_ELM(32), [OVR_LBL(33)] = OVR_ELM(33),
-[OVR_LBL(34)] = OVR_ELM(34), [OVR_LBL(35)] = OVR_ELM(35),
-[OVR_LBL(36)] = OVR_ELM(36), [OVR_LBL(37)] = OVR_ELM(37),
-[OVR_LBL(38)] = OVR_ELM(38), [OVR_LBL(39)] = OVR_ELM(39),
-[OVR_LBL(40)] = OVR_ELM(40), [OVR_LBL(41)] = OVR_ELM(41),
-[OVR_LBL(42)] = OVR_ELM(42), [OVR_LBL(43)] = OVR_ELM(43),
-[OVR_LBL(44)] = OVR_ELM(44), [OVR_LBL(45)] = OVR_ELM(45),
-[OVR_LBL(46)] = OVR_ELM(46), [OVR_LBL(47)] = OVR_ELM(47),
-[OVR_LBL(48)] = OVR_ELM(48), [OVR_LBL(49)] = OVR_ELM(49),
-[OVR_LBL(50)] = OVR_ELM(50), [OVR_LBL(51)] = OVR_ELM(51),
-[OVR_LBL(52)] = OVR_ELM(52), [OVR_LBL(53)] = OVR_ELM(53),
-[OVR_LBL(54)] = OVR_ELM(54), [OVR_LBL(55)] = OVR_ELM(55),
-[OVR_LBL(56)] = OVR_ELM(56), [OVR_LBL(57)] = OVR_ELM(57),
-[OVR_LBL(58)] = OVR_ELM(58), [OVR_LBL(59)] = OVR_ELM(59),
-[OVR_LBL(60)] = OVR_ELM(60), [OVR_LBL(61)] = OVR_ELM(61),
-[OVR_LBL(62)] = OVR_ELM(62), [OVR_LBL(63)] = OVR_ELM(63),
-[OVR_LBL(64)] = OVR_ELM(64), [OVR_LBL(65)] = OVR_ELM(65),
-[OVR_LBL(66)] = OVR_ELM(66), [OVR_LBL(67)] = OVR_ELM(67),
-[OVR_LBL(68)] = OVR_ELM(68), [OVR_LBL(69)] = OVR_ELM(69),
-[OVR_LBL(70)] = OVR_ELM(70), [OVR_LBL(71)] = OVR_ELM(71),
-[OVR_LBL(72)] = OVR_ELM(72), [OVR_LBL(73)] = OVR_ELM(73),
-[OVR_LBL(74)] = OVR_ELM(74), [OVR_LBL(75)] = OVR_ELM(75),
-[OVR_LBL(76)] = OVR_ELM(76), [OVR_LBL(77)] = OVR_ELM(77),
-[OVR_LBL(78)] = OVR_ELM(78), [OVR_LBL(79)] = OVR_ELM(79),
-[OVR_LBL(80)] = OVR_ELM(80), [OVR_LBL(81)] = OVR_ELM(81),
-[OVR_LBL(82)] = OVR_ELM(82), [OVR_LBL(83)] = OVR_ELM(83),
-[OVR_LBL(84)] = OVR_ELM(84), [OVR_LBL(85)] = OVR_ELM(85),
-[OVR_LBL(86)] = OVR_ELM(86), [OVR_LBL(87)] = OVR_ELM(87),
-[OVR_LBL(88)] = OVR_ELM(88), [OVR_LBL(89)] = OVR_ELM(89),
-[OVR_LBL(90)] = OVR_ELM(90), [OVR_LBL(91)] = OVR_ELM(91),
-[OVR_LBL(92)] = OVR_ELM(92), [OVR_LBL(93)] = OVR_ELM(93),
-[OVR_LBL(94)] = OVR_ELM(94), [OVR_LBL(95)] = OVR_ELM(95),
-[OVR_LBL(96)] = OVR_ELM(96), [OVR_LBL(97)] = OVR_ELM(97),
-[OVR_LBL(98)] = OVR_ELM(98), [OVR_LBL(99)] = OVR_ELM(99),
-[OVR_LBL(100)] = OVR_ELM(100), [OVR_LBL(101)] = OVR_ELM(101),
-[OVR_LBL(102)] = OVR_ELM(102), [OVR_LBL(103)] = OVR_ELM(103),
-[OVR_LBL(104)] = OVR_ELM(104), [OVR_LBL(105)] = OVR_ELM(105),
-[OVR_LBL(106)] = OVR_ELM(106), [OVR_LBL(107)] = OVR_ELM(107),
-[OVR_LBL(108)] = OVR_ELM(108), [OVR_LBL(109)] = OVR_ELM(109),
-[OVR_LBL(110)] = OVR_ELM(110), [OVR_LBL(111)] = OVR_ELM(111),
-[OVR_LBL(112)] = OVR_ELM(112), [OVR_LBL(113)] = OVR_ELM(113),
-[OVR_LBL(114)] = OVR_ELM(114), [OVR_LBL(115)] = OVR_ELM(115),
-[OVR_LBL(116)] = OVR_ELM(116), [OVR_LBL(117)] = OVR_ELM(117),
-[OVR_LBL(118)] = OVR_ELM(118), [OVR_LBL(119)] = OVR_ELM(119),
-[OVR_LBL(120)] = OVR_ELM(120), [OVR_LBL(121)] = OVR_ELM(121),
-[OVR_LBL(122)] = OVR_ELM(122), [OVR_LBL(123)] = OVR_ELM(123),
-[OVR_LBL(124)] = OVR_ELM(124), [OVR_LBL(125)] = OVR_ELM(125),
-[OVR_LBL(126)] = OVR_ELM(126), [OVR_LBL(127)] = OVR_ELM(127),
-[OVR_LBL(128)] = OVR_ELM(128), [OVR_LBL(129)] = OVR_ELM(129),
-[OVR_LBL(130)] = OVR_ELM(130), [OVR_LBL(131)] = OVR_ELM(131),
-[OVR_LBL(132)] = OVR_ELM(132), [OVR_LBL(133)] = OVR_ELM(133),
-[OVR_LBL(134)] = OVR_ELM(134), [OVR_LBL(135)] = OVR_ELM(135),
-[OVR_LBL(136)] = OVR_ELM(136), [OVR_LBL(137)] = OVR_ELM(137),
-[OVR_LBL(138)] = OVR_ELM(138), [OVR_LBL(139)] = OVR_ELM(139),
-[OVR_LBL(140)] = OVR_ELM(140), [OVR_LBL(141)] = OVR_ELM(141),
-[OVR_LBL(142)] = OVR_ELM(142), [OVR_LBL(143)] = OVR_ELM(143),
-[OVR_LBL(144)] = OVR_ELM(144), [OVR_LBL(145)] = OVR_ELM(145),
-[OVR_LBL(146)] = OVR_ELM(146), [OVR_LBL(147)] = OVR_ELM(147),
-[OVR_LBL(148)] = OVR_ELM(148), [OVR_LBL(149)] = OVR_ELM(149),
-[OVR_LBL(150)] = OVR_ELM(150), [OVR_LBL(151)] = OVR_ELM(151),
-[OVR_LBL(152)] = OVR_ELM(152), [OVR_LBL(153)] = OVR_ELM(153),
-[OVR_LBL(154)] = OVR_ELM(154), [OVR_LBL(155)] = OVR_ELM(155),
-[OVR_LBL(156)] = OVR_ELM(156), [OVR_LBL(157)] = OVR_ELM(157),
-[OVR_LBL(158)] = OVR_ELM(158), [OVR_LBL(159)] = OVR_ELM(159),
-};
-
-/* ======================================================================== */
-
-/* return true if this is chip revision A */
-int is_ax(struct hfi1_devdata *dd)
-{
-       u8 chip_rev_minor =
-               dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT
-                       & CCE_REVISION_CHIP_REV_MINOR_MASK;
-       return (chip_rev_minor & 0xf0) == 0;
-}
-
-/* return true if this is chip revision B */
-int is_bx(struct hfi1_devdata *dd)
-{
-       u8 chip_rev_minor =
-               dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT
-                       & CCE_REVISION_CHIP_REV_MINOR_MASK;
-       return (chip_rev_minor & 0xF0) == 0x10;
-}
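-
-/*
- * Both checks above look only at the high nibble of the minor revision:
- * 0x0x reads as revision A and 0x1x as revision B; the low nibble
- * (presumably the step within a revision) is ignored.
- */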
-
-/*
- * Append string s to buffer buf.  Arguments curp and len are the current
- * position and remaining length, respectively.
- *
- * return 0 on success, 1 on out of room
- */
-static int append_str(char *buf, char **curp, int *lenp, const char *s)
-{
-       char *p = *curp;
-       int len = *lenp;
-       int result = 0; /* success */
-       char c;
-
-       /* add a comma if this is not the first string in the buffer */
-       if (p != buf) {
-               if (len == 0) {
-                       result = 1; /* out of room */
-                       goto done;
-               }
-               *p++ = ',';
-               len--;
-       }
-
-       /* copy the string */
-       while ((c = *s++) != 0) {
-               if (len == 0) {
-                       result = 1; /* out of room */
-                       goto done;
-               }
-               *p++ = c;
-               len--;
-       }
-
-done:
-       /* write return values */
-       *curp = p;
-       *lenp = len;
-
-       return result;
-}
-
-/*
- * Using the given flag table, print a comma separated string into
- * the buffer.  End in '*' if the buffer is too short.
- */
-static char *flag_string(char *buf, int buf_len, u64 flags,
-                        struct flag_table *table, int table_size)
-{
-       char extra[32];
-       char *p = buf;
-       int len = buf_len;
-       int no_room = 0;
-       int i;
-
-       /* make sure there are at least 2 bytes so we can form "*" */
-       if (len < 2)
-               return "";
-
-       len--;  /* leave room for a nul */
-       for (i = 0; i < table_size; i++) {
-               if (flags & table[i].flag) {
-                       no_room = append_str(buf, &p, &len, table[i].str);
-                       if (no_room)
-                               break;
-                       flags &= ~table[i].flag;
-               }
-       }
-
-       /* any undocumented bits left? */
-       if (!no_room && flags) {
-               snprintf(extra, sizeof(extra), "bits 0x%llx", flags);
-               no_room = append_str(buf, &p, &len, extra);
-       }
-
-       /* add a '*' if we ran out of room */
-       if (no_room) {
-               /* may need to back up to add space for a '*' */
-               if (len == 0)
-                       --p;
-               *p++ = '*';
-       }
-
-       /* add final nul - space already allocated above */
-       *p = 0;
-       return buf;
-}
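-
-/*
- * Illustrative example (flag names are hypothetical): with a table of
- * { 0x1, "CceCsrParityErr" } and { 0x2, "CceCsrReadBadAddrErr" }, flags of
- * 0x7 would be rendered as "CceCsrParityErr,CceCsrReadBadAddrErr,bits 0x4",
- * with a trailing '*' only if the buffer is too short.
- */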
-
-/* first 8 CCE error interrupt source names */
-static const char * const cce_misc_names[] = {
-       "CceErrInt",            /* 0 */
-       "RxeErrInt",            /* 1 */
-       "MiscErrInt",           /* 2 */
-       "Reserved3",            /* 3 */
-       "PioErrInt",            /* 4 */
-       "SDmaErrInt",           /* 5 */
-       "EgressErrInt",         /* 6 */
-       "TxeErrInt"             /* 7 */
-};
-
-/*
- * Return the miscellaneous error interrupt name.
- */
-static char *is_misc_err_name(char *buf, size_t bsize, unsigned int source)
-{
-       if (source < ARRAY_SIZE(cce_misc_names))
-               strncpy(buf, cce_misc_names[source], bsize);
-       else
-               snprintf(buf, bsize, "Reserved%u",
-                        source + IS_GENERAL_ERR_START);
-
-       return buf;
-}
-
-/*
- * Return the SDMA engine error interrupt name.
- */
-static char *is_sdma_eng_err_name(char *buf, size_t bsize, unsigned int source)
-{
-       snprintf(buf, bsize, "SDmaEngErrInt%u", source);
-       return buf;
-}
-
-/*
- * Return the send context error interrupt name.
- */
-static char *is_sendctxt_err_name(char *buf, size_t bsize, unsigned int source)
-{
-       snprintf(buf, bsize, "SendCtxtErrInt%u", source);
-       return buf;
-}
-
-static const char * const various_names[] = {
-       "PbcInt",
-       "GpioAssertInt",
-       "Qsfp1Int",
-       "Qsfp2Int",
-       "TCritInt"
-};
-
-/*
- * Return the various interrupt name.
- */
-static char *is_various_name(char *buf, size_t bsize, unsigned int source)
-{
-       if (source < ARRAY_SIZE(various_names))
-               strncpy(buf, various_names[source], bsize);
-       else
-               snprintf(buf, bsize, "Reserved%u", source + IS_VARIOUS_START);
-       return buf;
-}
-
-/*
- * Return the DC interrupt name.
- */
-static char *is_dc_name(char *buf, size_t bsize, unsigned int source)
-{
-       static const char * const dc_int_names[] = {
-               "common",
-               "lcb",
-               "8051",
-               "lbm"   /* local block merge */
-       };
-
-       if (source < ARRAY_SIZE(dc_int_names))
-               snprintf(buf, bsize, "dc_%s_int", dc_int_names[source]);
-       else
-               snprintf(buf, bsize, "DCInt%u", source);
-       return buf;
-}
-
-static const char * const sdma_int_names[] = {
-       "SDmaInt",
-       "SdmaIdleInt",
-       "SdmaProgressInt",
-};
-
-/*
- * Return the SDMA engine interrupt name.
- */
-static char *is_sdma_eng_name(char *buf, size_t bsize, unsigned int source)
-{
-       /* what interrupt */
-       unsigned int what  = source / TXE_NUM_SDMA_ENGINES;
-       /* which engine */
-       unsigned int which = source % TXE_NUM_SDMA_ENGINES;
-
-       if (likely(what < 3))
-               snprintf(buf, bsize, "%s%u", sdma_int_names[what], which);
-       else
-               snprintf(buf, bsize, "Invalid SDMA interrupt %u", source);
-       return buf;
-}
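-
-/*
- * For example, assuming TXE_NUM_SDMA_ENGINES is 16, source 17 decodes as
- * what = 1, which = 1, i.e. "SdmaIdleInt1".
- */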
-
-/*
- * Return the receive available interrupt name.
- */
-static char *is_rcv_avail_name(char *buf, size_t bsize, unsigned int source)
-{
-       snprintf(buf, bsize, "RcvAvailInt%u", source);
-       return buf;
-}
-
-/*
- * Return the receive urgent interrupt name.
- */
-static char *is_rcv_urgent_name(char *buf, size_t bsize, unsigned int source)
-{
-       snprintf(buf, bsize, "RcvUrgentInt%u", source);
-       return buf;
-}
-
-/*
- * Return the send credit interrupt name.
- */
-static char *is_send_credit_name(char *buf, size_t bsize, unsigned int source)
-{
-       snprintf(buf, bsize, "SendCreditInt%u", source);
-       return buf;
-}
-
-/*
- * Return the reserved interrupt name.
- */
-static char *is_reserved_name(char *buf, size_t bsize, unsigned int source)
-{
-       snprintf(buf, bsize, "Reserved%u", source + IS_RESERVED_START);
-       return buf;
-}
-
-static char *cce_err_status_string(char *buf, int buf_len, u64 flags)
-{
-       return flag_string(buf, buf_len, flags,
-                          cce_err_status_flags,
-                          ARRAY_SIZE(cce_err_status_flags));
-}
-
-static char *rxe_err_status_string(char *buf, int buf_len, u64 flags)
-{
-       return flag_string(buf, buf_len, flags,
-                          rxe_err_status_flags,
-                          ARRAY_SIZE(rxe_err_status_flags));
-}
-
-static char *misc_err_status_string(char *buf, int buf_len, u64 flags)
-{
-       return flag_string(buf, buf_len, flags, misc_err_status_flags,
-                          ARRAY_SIZE(misc_err_status_flags));
-}
-
-static char *pio_err_status_string(char *buf, int buf_len, u64 flags)
-{
-       return flag_string(buf, buf_len, flags,
-                          pio_err_status_flags,
-                          ARRAY_SIZE(pio_err_status_flags));
-}
-
-static char *sdma_err_status_string(char *buf, int buf_len, u64 flags)
-{
-       return flag_string(buf, buf_len, flags,
-                          sdma_err_status_flags,
-                          ARRAY_SIZE(sdma_err_status_flags));
-}
-
-static char *egress_err_status_string(char *buf, int buf_len, u64 flags)
-{
-       return flag_string(buf, buf_len, flags,
-                          egress_err_status_flags,
-                          ARRAY_SIZE(egress_err_status_flags));
-}
-
-static char *egress_err_info_string(char *buf, int buf_len, u64 flags)
-{
-       return flag_string(buf, buf_len, flags,
-                          egress_err_info_flags,
-                          ARRAY_SIZE(egress_err_info_flags));
-}
-
-static char *send_err_status_string(char *buf, int buf_len, u64 flags)
-{
-       return flag_string(buf, buf_len, flags,
-                          send_err_status_flags,
-                          ARRAY_SIZE(send_err_status_flags));
-}
-
-static void handle_cce_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
-{
-       char buf[96];
-       int i = 0;
-
-       /*
-        * For most of these errors, there is nothing that can be done except
-        * report or record it.
-        */
-       dd_dev_info(dd, "CCE Error: %s\n",
-                   cce_err_status_string(buf, sizeof(buf), reg));
-
-       if ((reg & CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK) &&
-           is_ax(dd) && (dd->icode != ICODE_FUNCTIONAL_SIMULATOR)) {
-               /*
-                * This error requires a manual drop into SPC freeze mode
-                * and then a fix up.
-                */
-               start_freeze_handling(dd->pport, FREEZE_SELF);
-       }
-
-       for (i = 0; i < NUM_CCE_ERR_STATUS_COUNTERS; i++) {
-               if (reg & (1ull << i)) {
-                       incr_cntr64(&dd->cce_err_status_cnt[i]);
-                       /* maintain a counter over all cce_err_status errors */
-                       incr_cntr64(&dd->sw_cce_err_status_aggregate);
-               }
-       }
-}
-
-/*
- * Check counters for receive errors that do not have an interrupt
- * associated with them.
- */
-#define RCVERR_CHECK_TIME 10
-static void update_rcverr_timer(unsigned long opaque)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)opaque;
-       struct hfi1_pportdata *ppd = dd->pport;
-       u32 cur_ovfl_cnt = read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL);
-
-       if (dd->rcv_ovfl_cnt < cur_ovfl_cnt &&
-           ppd->port_error_action & OPA_PI_MASK_EX_BUFFER_OVERRUN) {
-               dd_dev_info(dd, "%s: PortErrorAction bounce\n", __func__);
-               set_link_down_reason(
-                       ppd, OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN, 0,
-                       OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN);
-               queue_work(ppd->hfi1_wq, &ppd->link_bounce_work);
-       }
-       dd->rcv_ovfl_cnt = (u32)cur_ovfl_cnt;
-
-       mod_timer(&dd->rcverr_timer, jiffies + HZ * RCVERR_CHECK_TIME);
-}
-
-static int init_rcverr(struct hfi1_devdata *dd)
-{
-       setup_timer(&dd->rcverr_timer, update_rcverr_timer, (unsigned long)dd);
-       /* Assume the hardware counter has been reset */
-       dd->rcv_ovfl_cnt = 0;
-       return mod_timer(&dd->rcverr_timer, jiffies + HZ * RCVERR_CHECK_TIME);
-}
-
-static void free_rcverr(struct hfi1_devdata *dd)
-{
-       if (dd->rcverr_timer.data)
-               del_timer_sync(&dd->rcverr_timer);
-       dd->rcverr_timer.data = 0;
-}
-
-static void handle_rxe_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
-{
-       char buf[96];
-       int i = 0;
-
-       dd_dev_info(dd, "Receive Error: %s\n",
-                   rxe_err_status_string(buf, sizeof(buf), reg));
-
-       if (reg & ALL_RXE_FREEZE_ERR) {
-               int flags = 0;
-
-               /*
-                * Freeze mode recovery is disabled for the errors
-                * in RXE_FREEZE_ABORT_MASK
-                */
-               if (is_ax(dd) && (reg & RXE_FREEZE_ABORT_MASK))
-                       flags = FREEZE_ABORT;
-
-               start_freeze_handling(dd->pport, flags);
-       }
-
-       for (i = 0; i < NUM_RCV_ERR_STATUS_COUNTERS; i++) {
-               if (reg & (1ull << i))
-                       incr_cntr64(&dd->rcv_err_status_cnt[i]);
-       }
-}
-
-static void handle_misc_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
-{
-       char buf[96];
-       int i = 0;
-
-       dd_dev_info(dd, "Misc Error: %s",
-                   misc_err_status_string(buf, sizeof(buf), reg));
-       for (i = 0; i < NUM_MISC_ERR_STATUS_COUNTERS; i++) {
-               if (reg & (1ull << i))
-                       incr_cntr64(&dd->misc_err_status_cnt[i]);
-       }
-}
-
-static void handle_pio_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
-{
-       char buf[96];
-       int i = 0;
-
-       dd_dev_info(dd, "PIO Error: %s\n",
-                   pio_err_status_string(buf, sizeof(buf), reg));
-
-       if (reg & ALL_PIO_FREEZE_ERR)
-               start_freeze_handling(dd->pport, 0);
-
-       for (i = 0; i < NUM_SEND_PIO_ERR_STATUS_COUNTERS; i++) {
-               if (reg & (1ull << i))
-                       incr_cntr64(&dd->send_pio_err_status_cnt[i]);
-       }
-}
-
-static void handle_sdma_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
-{
-       char buf[96];
-       int i = 0;
-
-       dd_dev_info(dd, "SDMA Error: %s\n",
-                   sdma_err_status_string(buf, sizeof(buf), reg));
-
-       if (reg & ALL_SDMA_FREEZE_ERR)
-               start_freeze_handling(dd->pport, 0);
-
-       for (i = 0; i < NUM_SEND_DMA_ERR_STATUS_COUNTERS; i++) {
-               if (reg & (1ull << i))
-                       incr_cntr64(&dd->send_dma_err_status_cnt[i]);
-       }
-}
-
-static inline void __count_port_discards(struct hfi1_pportdata *ppd)
-{
-       incr_cntr64(&ppd->port_xmit_discards);
-}
-
-static void count_port_inactive(struct hfi1_devdata *dd)
-{
-       __count_port_discards(dd->pport);
-}
-
-/*
- * We have had a "disallowed packet" error during egress. Determine the
- * integrity check that failed, and update the relevant error counter, etc.
- *
- * Note that the SEND_EGRESS_ERR_INFO register has only a single
- * bit of state per integrity check, and so we can miss the reason for an
- * egress error if more than one packet fails the same integrity check
- * since we cleared the corresponding bit in SEND_EGRESS_ERR_INFO.
- */
-static void handle_send_egress_err_info(struct hfi1_devdata *dd,
-                                       int vl)
-{
-       struct hfi1_pportdata *ppd = dd->pport;
-       u64 src = read_csr(dd, SEND_EGRESS_ERR_SOURCE); /* read first */
-       u64 info = read_csr(dd, SEND_EGRESS_ERR_INFO);
-       char buf[96];
-
-       /* clear down all observed info as quickly as possible after read */
-       write_csr(dd, SEND_EGRESS_ERR_INFO, info);
-
-       dd_dev_info(dd,
-                   "Egress Error Info: 0x%llx, %s Egress Error Src 0x%llx\n",
-                   info, egress_err_info_string(buf, sizeof(buf), info), src);
-
-       /* Eventually add other counters for each bit */
-       if (info & PORT_DISCARD_EGRESS_ERRS) {
-               int weight, i;
-
-               /*
-                * Count all applicable bits as individual errors and
-                * attribute them to the packet that triggered this handler.
-                * This may not be completely accurate due to limitations
-                * on the available hardware error information.  There is
-                * a single information register and any number of error
-                * packets may have occurred and contributed to it before
-                * this routine is called.  This means that:
-                * a) If multiple packets with the same error occur before
-                *    this routine is called, earlier packets are missed.
-                *    There is only a single bit for each error type.
-                * b) Errors may not be attributed to the correct VL.
-                *    The driver is attributing all bits in the info register
-                *    to the packet that triggered this call, but bits
-                *    could be an accumulation of different packets with
-                *    different VLs.
-                * c) A single error packet may have multiple counts attached
-                *    to it.  There is no way for the driver to know if
-                *    multiple bits set in the info register are due to a
-                *    single packet or multiple packets.  The driver assumes
-                *    multiple packets.
-                */
-               weight = hweight64(info & PORT_DISCARD_EGRESS_ERRS);
-               for (i = 0; i < weight; i++) {
-                       __count_port_discards(ppd);
-                       if (vl >= 0 && vl < TXE_NUM_DATA_VL)
-                               incr_cntr64(&ppd->port_xmit_discards_vl[vl]);
-                       else if (vl == 15)
-                               incr_cntr64(&ppd->port_xmit_discards_vl
-                                           [C_VL_15]);
-               }
-       }
-}
-
-/*
- * Input value is a bit position within the SEND_EGRESS_ERR_STATUS
- * register. Does it represent a 'port inactive' error?
- */
-static inline int port_inactive_err(u64 posn)
-{
-       return (posn >= SEES(TX_LINKDOWN) &&
-               posn <= SEES(TX_INCORRECT_LINK_STATE));
-}
-
-/*
- * Input value is a bit position within the SEND_EGRESS_ERR_STATUS
- * register. Does it represent a 'disallowed packet' error?
- */
-static inline int disallowed_pkt_err(int posn)
-{
-       return (posn >= SEES(TX_SDMA0_DISALLOWED_PACKET) &&
-               posn <= SEES(TX_SDMA15_DISALLOWED_PACKET));
-}
-
-/*
- * Input value is a bit position of one of the SDMA engine disallowed
- * packet errors.  Return which engine.  Use of this must be guarded by
- * disallowed_pkt_err().
- */
-static inline int disallowed_pkt_engine(int posn)
-{
-       return posn - SEES(TX_SDMA0_DISALLOWED_PACKET);
-}
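-
-/*
- * For example, the bit position of TX_SDMA3_DISALLOWED_PACKET decodes to
- * engine 3.
- */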
-
-/*
- * Translate an SDMA engine to a VL.  Return -1 if the translation cannot
- * be done.
- */
-static int engine_to_vl(struct hfi1_devdata *dd, int engine)
-{
-       struct sdma_vl_map *m;
-       int vl;
-
-       /* range check */
-       if (engine < 0 || engine >= TXE_NUM_SDMA_ENGINES)
-               return -1;
-
-       rcu_read_lock();
-       m = rcu_dereference(dd->sdma_map);
-       vl = m->engine_to_vl[engine];
-       rcu_read_unlock();
-
-       return vl;
-}
-
-/*
- * Translate the send context (software index) into a VL.  Return -1 if the
- * translation cannot be done.
- */
-static int sc_to_vl(struct hfi1_devdata *dd, int sw_index)
-{
-       struct send_context_info *sci;
-       struct send_context *sc;
-       int i;
-
-       sci = &dd->send_contexts[sw_index];
-
-       /* there is no information for user (PSM) and ack contexts */
-       if ((sci->type != SC_KERNEL) && (sci->type != SC_VL15))
-               return -1;
-
-       sc = sci->sc;
-       if (!sc)
-               return -1;
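-       /* VL15 (the management VL) is tracked separately from the data VLs */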
-       if (dd->vld[15].sc == sc)
-               return 15;
-       for (i = 0; i < num_vls; i++)
-               if (dd->vld[i].sc == sc)
-                       return i;
-
-       return -1;
-}
-
-static void handle_egress_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
-{
-       u64 reg_copy = reg, handled = 0;
-       char buf[96];
-       int i = 0;
-
-       if (reg & ALL_TXE_EGRESS_FREEZE_ERR)
-               start_freeze_handling(dd->pport, 0);
-       else if (is_ax(dd) &&
-                (reg & SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_VL_ERR_SMASK) &&
-                (dd->icode != ICODE_FUNCTIONAL_SIMULATOR))
-               start_freeze_handling(dd->pport, 0);
-
-       while (reg_copy) {
-               int posn = fls64(reg_copy);
-               /* fls64() returns a 1-based offset, we want it zero based */
-               int shift = posn - 1;
-               u64 mask = 1ULL << shift;
-
-               if (port_inactive_err(shift)) {
-                       count_port_inactive(dd);
-                       handled |= mask;
-               } else if (disallowed_pkt_err(shift)) {
-                       int vl = engine_to_vl(dd, disallowed_pkt_engine(shift));
-
-                       handle_send_egress_err_info(dd, vl);
-                       handled |= mask;
-               }
-               reg_copy &= ~mask;
-       }
-
-       reg &= ~handled;
-
-       if (reg)
-               dd_dev_info(dd, "Egress Error: %s\n",
-                           egress_err_status_string(buf, sizeof(buf), reg));
-
-       for (i = 0; i < NUM_SEND_EGRESS_ERR_STATUS_COUNTERS; i++) {
-               if (reg & (1ull << i))
-                       incr_cntr64(&dd->send_egress_err_status_cnt[i]);
-       }
-}
-
-static void handle_txe_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
-{
-       char buf[96];
-       int i = 0;
-
-       dd_dev_info(dd, "Send Error: %s\n",
-                   send_err_status_string(buf, sizeof(buf), reg));
-
-       for (i = 0; i < NUM_SEND_ERR_STATUS_COUNTERS; i++) {
-               if (reg & (1ull << i))
-                       incr_cntr64(&dd->send_err_status_cnt[i]);
-       }
-}
-
-/*
- * The maximum number of times the error clear down will loop before
- * blocking a repeating error.  This value is arbitrary.
- */
-#define MAX_CLEAR_COUNT 20
-
-/*
- * Clear and handle an error register.  All error interrupts are funneled
- * through here to have a central location to correctly handle single-
- * or multi-shot errors.
- *
- * For non per-context registers, call this routine with a context value
- * of 0 so the per-context offset is zero.
- *
- * If the handler loops too many times, assume that something is wrong
- * and can't be fixed, so mask the error bits.
- */
-static void interrupt_clear_down(struct hfi1_devdata *dd,
-                                u32 context,
-                                const struct err_reg_info *eri)
-{
-       u64 reg;
-       u32 count;
-
-       /* read in a loop until no more errors are seen */
-       count = 0;
-       while (1) {
-               reg = read_kctxt_csr(dd, context, eri->status);
-               if (reg == 0)
-                       break;
-               write_kctxt_csr(dd, context, eri->clear, reg);
-               if (likely(eri->handler))
-                       eri->handler(dd, context, reg);
-               count++;
-               if (count > MAX_CLEAR_COUNT) {
-                       u64 mask;
-
-                       dd_dev_err(dd, "Repeating %s bits 0x%llx - masking\n",
-                                  eri->desc, reg);
-                       /*
-                        * Read-modify-write so any other masked bits
-                        * remain masked.
-                        */
-                       mask = read_kctxt_csr(dd, context, eri->mask);
-                       mask &= ~reg;
-                       write_kctxt_csr(dd, context, eri->mask, mask);
-                       break;
-               }
-       }
-}
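-
-/*
- * For example, is_misc_err_int() below funnels each CCE misc error source
- * through this routine with a context of 0, since those registers are not
- * per-context.
- */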
-
-/*
- * CCE block "misc" interrupt.  Source is < 16.
- */
-static void is_misc_err_int(struct hfi1_devdata *dd, unsigned int source)
-{
-       const struct err_reg_info *eri = &misc_errs[source];
-
-       if (eri->handler) {
-               interrupt_clear_down(dd, 0, eri);
-       } else {
-               dd_dev_err(dd, "Unexpected misc interrupt (%u) - reserved\n",
-                          source);
-       }
-}
-
-static char *send_context_err_status_string(char *buf, int buf_len, u64 flags)
-{
-       return flag_string(buf, buf_len, flags,
-                          sc_err_status_flags,
-                          ARRAY_SIZE(sc_err_status_flags));
-}
-
-/*
- * Send context error interrupt.  Source (hw_context) is < 160.
- *
- * All send context errors cause the send context to halt.  The normal
- * clear-down mechanism cannot be used because we cannot clear the
- * error bits until several other long-running items are done first.
- * This is OK because with the context halted, nothing else is going
- * to happen on it anyway.
- */
-static void is_sendctxt_err_int(struct hfi1_devdata *dd,
-                               unsigned int hw_context)
-{
-       struct send_context_info *sci;
-       struct send_context *sc;
-       char flags[96];
-       u64 status;
-       u32 sw_index;
-       int i = 0;
-
-       sw_index = dd->hw_to_sw[hw_context];
-       if (sw_index >= dd->num_send_contexts) {
-               dd_dev_err(dd,
-                          "out of range sw index %u for send context %u\n",
-                          sw_index, hw_context);
-               return;
-       }
-       sci = &dd->send_contexts[sw_index];
-       sc = sci->sc;
-       if (!sc) {
-               dd_dev_err(dd, "%s: context %u(%u): no sc?\n", __func__,
-                          sw_index, hw_context);
-               return;
-       }
-
-       /* tell the software that a halt has begun */
-       sc_stop(sc, SCF_HALTED);
-
-       status = read_kctxt_csr(dd, hw_context, SEND_CTXT_ERR_STATUS);
-
-       dd_dev_info(dd, "Send Context %u(%u) Error: %s\n", sw_index, hw_context,
-                   send_context_err_status_string(flags, sizeof(flags),
-                                                  status));
-
-       if (status & SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK)
-               handle_send_egress_err_info(dd, sc_to_vl(dd, sw_index));
-
-       /*
-        * Automatically restart halted kernel contexts out of interrupt
-        * context.  User contexts must ask the driver to restart the context.
-        */
-       if (sc->type != SC_USER)
-               queue_work(dd->pport->hfi1_wq, &sc->halt_work);
-
-       /*
-        * Update the counters for the corresponding status bits.
-        * Note that these particular counters are aggregated over all
-        * 160 contexts.
-        */
-       for (i = 0; i < NUM_SEND_CTXT_ERR_STATUS_COUNTERS; i++) {
-               if (status & (1ull << i))
-                       incr_cntr64(&dd->sw_ctxt_err_status_cnt[i]);
-       }
-}
-
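-/*
- * Handle an SDMA engine error: pass the status to the engine and update
- * the corresponding status counters.
- */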
-static void handle_sdma_eng_err(struct hfi1_devdata *dd,
-                               unsigned int source, u64 status)
-{
-       struct sdma_engine *sde;
-       int i = 0;
-
-       sde = &dd->per_sdma[source];
-#ifdef CONFIG_SDMA_VERBOSITY
-       dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
-                  slashstrip(__FILE__), __LINE__, __func__);
-       dd_dev_err(sde->dd, "CONFIG SDMA(%u) source: %u status 0x%llx\n",
-                  sde->this_idx, source, (unsigned long long)status);
-#endif
-       sde->err_cnt++;
-       sdma_engine_error(sde, status);
-
-       /*
-        * Update the counters for the corresponding status bits.
-        * Note that these particular counters are aggregated over
-        * all 16 DMA engines.
-        */
-       for (i = 0; i < NUM_SEND_DMA_ENG_ERR_STATUS_COUNTERS; i++) {
-               if (status & (1ull << i))
-                       incr_cntr64(&dd->sw_send_dma_eng_err_status_cnt[i]);
-       }
-}
-
-/*
- * CCE block SDMA error interrupt.  Source is < 16.
- */
-static void is_sdma_eng_err_int(struct hfi1_devdata *dd, unsigned int source)
-{
-#ifdef CONFIG_SDMA_VERBOSITY
-       struct sdma_engine *sde = &dd->per_sdma[source];
-
-       dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
-                  slashstrip(__FILE__), __LINE__, __func__);
-       dd_dev_err(dd, "CONFIG SDMA(%u) source: %u\n", sde->this_idx,
-                  source);
-       sdma_dumpstate(sde);
-#endif
-       interrupt_clear_down(dd, source, &sdma_eng_err);
-}
-
-/*
- * CCE block "various" interrupt.  Source is < 8.
- */
-static void is_various_int(struct hfi1_devdata *dd, unsigned int source)
-{
-       const struct err_reg_info *eri = &various_err[source];
-
-       /*
-        * TCritInt cannot go through interrupt_clear_down()
-        * because it is not a second tier interrupt. The handler
-        * should be called directly.
-        */
-       if (source == TCRIT_INT_SOURCE)
-               handle_temp_err(dd);
-       else if (eri->handler)
-               interrupt_clear_down(dd, 0, eri);
-       else
-               dd_dev_info(dd,
-                           "%s: Unimplemented/reserved interrupt %d\n",
-                           __func__, source);
-}
-
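-/*
- * Handle a QSFP interrupt: react to module insertion/removal and module
- * interrupts, then schedule the QSFP work if a cable is present.
- */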
-static void handle_qsfp_int(struct hfi1_devdata *dd, u32 src_ctx, u64 reg)
-{
-       /* src_ctx is always zero */
-       struct hfi1_pportdata *ppd = dd->pport;
-       unsigned long flags;
-       u64 qsfp_int_mgmt = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N);
-
-       if (reg & QSFP_HFI0_MODPRST_N) {
-               if (!qsfp_mod_present(ppd)) {
-                       dd_dev_info(dd, "%s: QSFP module removed\n",
-                                   __func__);
-
-                       ppd->driver_link_ready = 0;
-                       /*
-                        * Cable removed, reset all our information about the
-                        * cache and cable capabilities
-                        */
-
-                       spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
-                       /*
-                        * We don't set cache_refresh_required here as we expect
-                        * an interrupt when a cable is inserted
-                        */
-                       ppd->qsfp_info.cache_valid = 0;
-                       ppd->qsfp_info.reset_needed = 0;
-                       ppd->qsfp_info.limiting_active = 0;
-                       spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
-                                              flags);
-                       /* Invert the ModPresent pin now to detect plug-in */
-                       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_INVERT :
-                                 ASIC_QSFP1_INVERT, qsfp_int_mgmt);
-
-                       if ((ppd->offline_disabled_reason >
-                         HFI1_ODR_MASK(
-                         OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED)) ||
-                         (ppd->offline_disabled_reason ==
-                         HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE)))
-                               ppd->offline_disabled_reason =
-                               HFI1_ODR_MASK(
-                               OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED);
-
-                       if (ppd->host_link_state == HLS_DN_POLL) {
-                               /*
-                                * The link is still in POLL. This means
-                                * that the normal link down processing
-                                * will not happen. We have to do it here
-                                * before turning the DC off.
-                                */
-                               queue_work(ppd->hfi1_wq, &ppd->link_down_work);
-                       }
-               } else {
-                       dd_dev_info(dd, "%s: QSFP module inserted\n",
-                                   __func__);
-
-                       spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
-                       ppd->qsfp_info.cache_valid = 0;
-                       ppd->qsfp_info.cache_refresh_required = 1;
-                       spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
-                                              flags);
-
-                       /*
-                        * Stop inversion of ModPresent pin to detect
-                        * removal of the cable
-                        */
-                       qsfp_int_mgmt &= ~(u64)QSFP_HFI0_MODPRST_N;
-                       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_INVERT :
-                                 ASIC_QSFP1_INVERT, qsfp_int_mgmt);
-
-                       ppd->offline_disabled_reason =
-                               HFI1_ODR_MASK(OPA_LINKDOWN_REASON_TRANSIENT);
-               }
-       }
-
-       if (reg & QSFP_HFI0_INT_N) {
-               dd_dev_info(dd, "%s: Interrupt received from QSFP module\n",
-                           __func__);
-               spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
-               ppd->qsfp_info.check_interrupt_flags = 1;
-               spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags);
-       }
-
-       /* Schedule the QSFP work only if there is a cable attached. */
-       if (qsfp_mod_present(ppd))
-               queue_work(ppd->hfi1_wq, &ppd->qsfp_info.qsfp_work);
-}
-
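-/* Ask the 8051 to grant the host access to the LCB CSRs. */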
-static int request_host_lcb_access(struct hfi1_devdata *dd)
-{
-       int ret;
-
-       ret = do_8051_command(dd, HCMD_MISC,
-                             (u64)HCMD_MISC_REQUEST_LCB_ACCESS <<
-                             LOAD_DATA_FIELD_ID_SHIFT, NULL);
-       if (ret != HCMD_SUCCESS) {
-               dd_dev_err(dd, "%s: command failed with error %d\n",
-                          __func__, ret);
-       }
-       return ret == HCMD_SUCCESS ? 0 : -EBUSY;
-}
-
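-/* Return LCB CSR access to the 8051. */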
-static int request_8051_lcb_access(struct hfi1_devdata *dd)
-{
-       int ret;
-
-       ret = do_8051_command(dd, HCMD_MISC,
-                             (u64)HCMD_MISC_GRANT_LCB_ACCESS <<
-                             LOAD_DATA_FIELD_ID_SHIFT, NULL);
-       if (ret != HCMD_SUCCESS) {
-               dd_dev_err(dd, "%s: command failed with error %d\n",
-                          __func__, ret);
-       }
-       return ret == HCMD_SUCCESS ? 0 : -EBUSY;
-}
-
-/*
- * Set the LCB selector - allow host access.  The DCC selector always
- * points to the host.
- */
-static inline void set_host_lcb_access(struct hfi1_devdata *dd)
-{
-       write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL,
-                 DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK |
-                 DC_DC8051_CFG_CSR_ACCESS_SEL_LCB_SMASK);
-}
-
-/*
- * Clear the LCB selector - allow 8051 access.  The DCC selector always
- * points to the host.
- */
-static inline void set_8051_lcb_access(struct hfi1_devdata *dd)
-{
-       write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL,
-                 DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK);
-}
-
-/*
- * Acquire LCB access from the 8051.  If the host already has access,
- * just increment a counter.  Otherwise, inform the 8051 that the
- * host is taking access.
- *
- * Returns:
- *     0 on success
- *     -EBUSY if the 8051 has control and cannot be disturbed
- *     -errno if unable to acquire access from the 8051
- */
-int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok)
-{
-       struct hfi1_pportdata *ppd = dd->pport;
-       int ret = 0;
-
-       /*
-        * Use the host link state lock so the operation of this routine
-        * { link state check, selector change, count increment } can occur
-        * as a unit against a link state change.  Otherwise there is a
-        * race between the state change and the count increment.
-        */
-       if (sleep_ok) {
-               mutex_lock(&ppd->hls_lock);
-       } else {
-               while (!mutex_trylock(&ppd->hls_lock))
-                       udelay(1);
-       }
-
-       /* this access is valid only when the link is up */
-       if ((ppd->host_link_state & HLS_UP) == 0) {
-               dd_dev_info(dd, "%s: link state %s not up\n",
-                           __func__, link_state_name(ppd->host_link_state));
-               ret = -EBUSY;
-               goto done;
-       }
-
-       if (dd->lcb_access_count == 0) {
-               ret = request_host_lcb_access(dd);
-               if (ret) {
-                       dd_dev_err(dd,
-                                  "%s: unable to acquire LCB access, err %d\n",
-                                  __func__, ret);
-                       goto done;
-               }
-               set_host_lcb_access(dd);
-       }
-       dd->lcb_access_count++;
-done:
-       mutex_unlock(&ppd->hls_lock);
-       return ret;
-}
-
-/*
- * Release LCB access by decrementing the use count.  If the count is moving
- * from 1 to 0, inform the 8051 that it has control back.
- *
- * Returns:
- *     0 on success
- *     -errno if unable to release access to the 8051
- */
-int release_lcb_access(struct hfi1_devdata *dd, int sleep_ok)
-{
-       int ret = 0;
-
-       /*
-        * Use the host link state lock because the acquire needed it.
-        * Here, we only need to keep { selector change, count decrement }
-        * as a unit.
-        */
-       if (sleep_ok) {
-               mutex_lock(&dd->pport->hls_lock);
-       } else {
-               while (!mutex_trylock(&dd->pport->hls_lock))
-                       udelay(1);
-       }
-
-       if (dd->lcb_access_count == 0) {
-               dd_dev_err(dd, "%s: LCB access count is zero.  Skipping.\n",
-                          __func__);
-               goto done;
-       }
-
-       if (dd->lcb_access_count == 1) {
-               set_8051_lcb_access(dd);
-               ret = request_8051_lcb_access(dd);
-               if (ret) {
-                       dd_dev_err(dd,
-                                  "%s: unable to release LCB access, err %d\n",
-                                  __func__, ret);
-                       /* restore host access if the grant didn't work */
-                       set_host_lcb_access(dd);
-                       goto done;
-               }
-       }
-       dd->lcb_access_count--;
-done:
-       mutex_unlock(&dd->pport->hls_lock);
-       return ret;
-}
-
-/*
- * Initialize LCB access variables and state.  Called during driver load,
- * after most of the initialization is finished.
- *
- * The DC default is LCB access on for the host.  The driver defaults to
- * leaving access to the 8051.  Assign access now - this constrains the call
- * to this routine to be after all LCB set-up is done.  In particular, after
- * hfi1_init_dd() -> set_up_interrupts() -> clear_all_interrupts()
- */
-static void init_lcb_access(struct hfi1_devdata *dd)
-{
-       dd->lcb_access_count = 0;
-}
-
-/*
- * Write a response back to an 8051 request.
- */
-static void hreq_response(struct hfi1_devdata *dd, u8 return_code, u16 rsp_data)
-{
-       write_csr(dd, DC_DC8051_CFG_EXT_DEV_0,
-                 DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK |
-                 (u64)return_code <<
-                 DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT |
-                 (u64)rsp_data << DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT);
-}
-
-/*
- * Handle host requests from the 8051.
- */
-static void handle_8051_request(struct hfi1_pportdata *ppd)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       u64 reg;
-       u16 data = 0;
-       u8 type;
-
-       reg = read_csr(dd, DC_DC8051_CFG_EXT_DEV_1);
-       if ((reg & DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK) == 0)
-               return; /* no request */
-
-       /* zero out COMPLETED so the response is seen */
-       write_csr(dd, DC_DC8051_CFG_EXT_DEV_0, 0);
-
-       /* extract request details */
-       type = (reg >> DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_SHIFT)
-                       & DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_MASK;
-       data = (reg >> DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT)
-                       & DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_MASK;
-
-       switch (type) {
-       case HREQ_LOAD_CONFIG:
-       case HREQ_SAVE_CONFIG:
-       case HREQ_READ_CONFIG:
-       case HREQ_SET_TX_EQ_ABS:
-       case HREQ_SET_TX_EQ_REL:
-       case HREQ_ENABLE:
-               dd_dev_info(dd, "8051 request: request 0x%x not supported\n",
-                           type);
-               hreq_response(dd, HREQ_NOT_SUPPORTED, 0);
-               break;
-       case HREQ_CONFIG_DONE:
-               hreq_response(dd, HREQ_SUCCESS, 0);
-               break;
-
-       case HREQ_INTERFACE_TEST:
-               hreq_response(dd, HREQ_SUCCESS, data);
-               break;
-       default:
-               dd_dev_err(dd, "8051 request: unknown request 0x%x\n", type);
-               hreq_response(dd, HREQ_NOT_SUPPORTED, 0);
-               break;
-       }
-}
-
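-/* Write the total, shared, and AU limits into the global credit register. */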
-static void write_global_credit(struct hfi1_devdata *dd,
-                               u8 vau, u16 total, u16 shared)
-{
-       write_csr(dd, SEND_CM_GLOBAL_CREDIT,
-                 ((u64)total <<
-                  SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT) |
-                 ((u64)shared <<
-                  SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT) |
-                 ((u64)vau << SEND_CM_GLOBAL_CREDIT_AU_SHIFT));
-}
-
-/*
- * Set up initial VL15 credits of the remote.  Assumes the rest of
- * the CM credit registers are zero from a previous global or credit reset.
- */
-void set_up_vl15(struct hfi1_devdata *dd, u8 vau, u16 vl15buf)
-{
-       /* leave shared count at zero for both global and VL15 */
-       write_global_credit(dd, vau, vl15buf, 0);
-
-       /* We may need some credits for another VL when sending packets
-        * with the snoop interface. Dividing it down the middle for VL15
-        * and VL0 should suffice.
-        */
-       if (unlikely(dd->hfi1_snoop.mode_flag == HFI1_PORT_SNOOP_MODE)) {
-               write_csr(dd, SEND_CM_CREDIT_VL15, (u64)(vl15buf >> 1)
-                   << SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT);
-               write_csr(dd, SEND_CM_CREDIT_VL, (u64)(vl15buf >> 1)
-                   << SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT);
-       } else {
-               write_csr(dd, SEND_CM_CREDIT_VL15, (u64)vl15buf
-                       << SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT);
-       }
-}
-
-/*
- * Zero all credit details from the previous connection and
- * reset the CM manager's internal counters.
- */
-void reset_link_credits(struct hfi1_devdata *dd)
-{
-       int i;
-
-       /* remove all previous VL credit limits */
-       for (i = 0; i < TXE_NUM_DATA_VL; i++)
-               write_csr(dd, SEND_CM_CREDIT_VL + (8 * i), 0);
-       write_csr(dd, SEND_CM_CREDIT_VL15, 0);
-       write_global_credit(dd, 0, 0, 0);
-       /* reset the CM block */
-       pio_send_control(dd, PSC_CM_RESET);
-}
-
-/* convert a vCU to a CU */
-static u32 vcu_to_cu(u8 vcu)
-{
-       return 1 << vcu;
-}
-
-/* convert a CU to a vCU */
-static u8 cu_to_vcu(u32 cu)
-{
-       return ilog2(cu);
-}
-
-/* convert a vAU to an AU */
-static u32 vau_to_au(u8 vau)
-{
-       return 8 * (1 << vau);
-}
-
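-/* Set the default SM trap and SA QP values used at link up. */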
-static void set_linkup_defaults(struct hfi1_pportdata *ppd)
-{
-       ppd->sm_trap_qp = 0x0;
-       ppd->sa_qp = 0x1;
-}
-
-/*
- * Graceful LCB shutdown.  This leaves the LCB FIFOs in reset.
- */
-static void lcb_shutdown(struct hfi1_devdata *dd, int abort)
-{
-       u64 reg;
-
-       /* clear lcb run: LCB_CFG_RUN.EN = 0 */
-       write_csr(dd, DC_LCB_CFG_RUN, 0);
-       /* set tx fifo reset: LCB_CFG_TX_FIFOS_RESET.VAL = 1 */
-       write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET,
-                 1ull << DC_LCB_CFG_TX_FIFOS_RESET_VAL_SHIFT);
-       /* set dcc reset csr: DCC_CFG_RESET.{reset_lcb,reset_rx_fpe} = 1 */
-       dd->lcb_err_en = read_csr(dd, DC_LCB_ERR_EN);
-       reg = read_csr(dd, DCC_CFG_RESET);
-       write_csr(dd, DCC_CFG_RESET, reg |
-                 (1ull << DCC_CFG_RESET_RESET_LCB_SHIFT) |
-                 (1ull << DCC_CFG_RESET_RESET_RX_FPE_SHIFT));
-       (void)read_csr(dd, DCC_CFG_RESET); /* make sure the write completed */
-       if (!abort) {
-               udelay(1);    /* must hold for the longer of 16cclks or 20ns */
-               write_csr(dd, DCC_CFG_RESET, reg);
-               write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en);
-       }
-}
-
-/*
- * This routine should be called after the link has been transitioned to
- * OFFLINE (OFFLINE state has the side effect of putting the SerDes into
- * reset).
- *
- * The expectation is that the caller of this routine would have taken
- * care of properly transitioning the link into the correct state.
- */
-static void dc_shutdown(struct hfi1_devdata *dd)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&dd->dc8051_lock, flags);
-       if (dd->dc_shutdown) {
-               spin_unlock_irqrestore(&dd->dc8051_lock, flags);
-               return;
-       }
-       dd->dc_shutdown = 1;
-       spin_unlock_irqrestore(&dd->dc8051_lock, flags);
-       /* Shutdown the LCB */
-       lcb_shutdown(dd, 1);
-       /*
-        * Going to OFFLINE would have caused the 8051 to put the
-        * SerDes into reset already. Just need to shut down the 8051
-        * itself.
-        */
-       write_csr(dd, DC_DC8051_CFG_RST, 0x1);
-}
-
-/*
- * Calling this after the DC has been brought out of reset should not
- * do any damage.
- */
-static void dc_start(struct hfi1_devdata *dd)
-{
-       unsigned long flags;
-       int ret;
-
-       spin_lock_irqsave(&dd->dc8051_lock, flags);
-       if (!dd->dc_shutdown)
-               goto done;
-       spin_unlock_irqrestore(&dd->dc8051_lock, flags);
-       /* Take the 8051 out of reset */
-       write_csr(dd, DC_DC8051_CFG_RST, 0ull);
-       /* Wait until 8051 is ready */
-       ret = wait_fm_ready(dd, TIMEOUT_8051_START);
-       if (ret) {
-               dd_dev_err(dd, "%s: timeout starting 8051 firmware\n",
-                          __func__);
-       }
-       /* Take away reset for LCB and RX FPE (set in lcb_shutdown). */
-       write_csr(dd, DCC_CFG_RESET, 0x10);
-       /* lcb_shutdown() with abort=1 does not restore these */
-       write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en);
-       spin_lock_irqsave(&dd->dc8051_lock, flags);
-       dd->dc_shutdown = 0;
-done:
-       spin_unlock_irqrestore(&dd->dc8051_lock, flags);
-}
-
-/*
- * These LCB adjustments are for the Aurora SerDes core in the FPGA.
- */
-static void adjust_lcb_for_fpga_serdes(struct hfi1_devdata *dd)
-{
-       u64 rx_radr, tx_radr;
-       u32 version;
-
-       if (dd->icode != ICODE_FPGA_EMULATION)
-               return;
-
-       /*
-        * These LCB defaults on emulator _s are good; nothing to do here:
-        *      LCB_CFG_TX_FIFOS_RADR
-        *      LCB_CFG_RX_FIFOS_RADR
-        *      LCB_CFG_LN_DCLK
-        *      LCB_CFG_IGNORE_LOST_RCLK
-        */
-       if (is_emulator_s(dd))
-               return;
-       /* else this is _p */
-
-       version = emulator_rev(dd);
-       if (!is_ax(dd))
-               version = 0x2d; /* all B0 use 0x2d or higher settings */
-
-       if (version <= 0x12) {
-               /* release 0x12 and below */
-
-               /*
-                * LCB_CFG_RX_FIFOS_RADR.RST_VAL = 0x9
-                * LCB_CFG_RX_FIFOS_RADR.OK_TO_JUMP_VAL = 0x9
-                * LCB_CFG_RX_FIFOS_RADR.DO_NOT_JUMP_VAL = 0xa
-                */
-               rx_radr =
-                     0xaull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
-                   | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
-                   | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
-               /*
-                * LCB_CFG_TX_FIFOS_RADR.ON_REINIT = 0 (default)
-                * LCB_CFG_TX_FIFOS_RADR.RST_VAL = 6
-                */
-               tx_radr = 6ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
-       } else if (version <= 0x18) {
-               /* release 0x13 up to 0x18 */
-               /* LCB_CFG_RX_FIFOS_RADR = 0x988 */
-               rx_radr =
-                     0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
-                   | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
-                   | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
-               tx_radr = 7ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
-       } else if (version == 0x19) {
-               /* release 0x19 */
-               /* LCB_CFG_RX_FIFOS_RADR = 0xa99 */
-               rx_radr =
-                     0xAull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
-                   | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
-                   | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
-               tx_radr = 3ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
-       } else if (version == 0x1a) {
-               /* release 0x1a */
-               /* LCB_CFG_RX_FIFOS_RADR = 0x988 */
-               rx_radr =
-                     0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
-                   | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
-                   | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
-               tx_radr = 7ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
-               write_csr(dd, DC_LCB_CFG_LN_DCLK, 1ull);
-       } else {
-               /* release 0x1b and higher */
-               /* LCB_CFG_RX_FIFOS_RADR = 0x877 */
-               rx_radr =
-                     0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
-                   | 0x7ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
-                   | 0x7ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
-               tx_radr = 3ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
-       }
-
-       write_csr(dd, DC_LCB_CFG_RX_FIFOS_RADR, rx_radr);
-       /* LCB_CFG_IGNORE_LOST_RCLK.EN = 1 */
-       write_csr(dd, DC_LCB_CFG_IGNORE_LOST_RCLK,
-                 DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK);
-       write_csr(dd, DC_LCB_CFG_TX_FIFOS_RADR, tx_radr);
-}
-
-/*
- * Handle an SMA idle message
- *
- * This is a work-queue function outside of the interrupt.
- */
-void handle_sma_message(struct work_struct *work)
-{
-       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
-                                                       sma_message_work);
-       struct hfi1_devdata *dd = ppd->dd;
-       u64 msg;
-       int ret;
-
-       /*
-        * msg is bytes 1-4 of the 40-bit idle message - the command code
-        * is stripped off
-        */
-       ret = read_idle_sma(dd, &msg);
-       if (ret)
-               return;
-       dd_dev_info(dd, "%s: SMA message 0x%llx\n", __func__, msg);
-       /*
-        * React to the SMA message.  Byte[1] (0 for us) is the command.
-        */
-       switch (msg & 0xff) {
-       case SMA_IDLE_ARM:
-               /*
-                * See OPAv1 table 9-14 - HFI and External Switch Ports Key
-                * State Transitions
-                *
-                * Only expected in INIT or ARMED, discard otherwise.
-                */
-               if (ppd->host_link_state & (HLS_UP_INIT | HLS_UP_ARMED))
-                       ppd->neighbor_normal = 1;
-               break;
-       case SMA_IDLE_ACTIVE:
-               /*
-                * See OPAv1 table 9-14 - HFI and External Switch Ports Key
-                * State Transitions
-                *
-                * Can activate the node.  Discard otherwise.
-                */
-               if (ppd->host_link_state == HLS_UP_ARMED &&
-                   ppd->is_active_optimize_enabled) {
-                       ppd->neighbor_normal = 1;
-                       ret = set_link_state(ppd, HLS_UP_ACTIVE);
-                       if (ret)
-                               dd_dev_err(
-                                       dd,
-                                       "%s: received Active SMA idle message, couldn't set link to Active\n",
-                                       __func__);
-               }
-               break;
-       default:
-               dd_dev_err(dd,
-                          "%s: received unexpected SMA idle message 0x%llx\n",
-                          __func__, msg);
-               break;
-       }
-}
-
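-/* Read-modify-write the RCV_CTRL register under the rcvctrl lock. */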
-static void adjust_rcvctrl(struct hfi1_devdata *dd, u64 add, u64 clear)
-{
-       u64 rcvctrl;
-       unsigned long flags;
-
-       spin_lock_irqsave(&dd->rcvctrl_lock, flags);
-       rcvctrl = read_csr(dd, RCV_CTRL);
-       rcvctrl |= add;
-       rcvctrl &= ~clear;
-       write_csr(dd, RCV_CTRL, rcvctrl);
-       spin_unlock_irqrestore(&dd->rcvctrl_lock, flags);
-}
-
-static inline void add_rcvctrl(struct hfi1_devdata *dd, u64 add)
-{
-       adjust_rcvctrl(dd, add, 0);
-}
-
-static inline void clear_rcvctrl(struct hfi1_devdata *dd, u64 clear)
-{
-       adjust_rcvctrl(dd, 0, clear);
-}
-
-/*
- * Called from all interrupt handlers to start handling an SPC freeze.
- */
-void start_freeze_handling(struct hfi1_pportdata *ppd, int flags)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       struct send_context *sc;
-       int i;
-
-       if (flags & FREEZE_SELF)
-               write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK);
-
-       /* enter frozen mode */
-       dd->flags |= HFI1_FROZEN;
-
-       /* notify all SDMA engines that they are going into a freeze */
-       sdma_freeze_notify(dd, !!(flags & FREEZE_LINK_DOWN));
-
-       /* do halt pre-handling on all enabled send contexts */
-       for (i = 0; i < dd->num_send_contexts; i++) {
-               sc = dd->send_contexts[i].sc;
-               if (sc && (sc->flags & SCF_ENABLED))
-                       sc_stop(sc, SCF_FROZEN | SCF_HALTED);
-       }
-
-       /* Send context are frozen. Notify user space */
-       hfi1_set_uevent_bits(ppd, _HFI1_EVENT_FROZEN_BIT);
-
-       if (flags & FREEZE_ABORT) {
-               dd_dev_err(dd,
-                          "Aborted freeze recovery. Please REBOOT system\n");
-               return;
-       }
-       /* queue non-interrupt handler */
-       queue_work(ppd->hfi1_wq, &ppd->freeze_work);
-}
-
-/*
- * Wait until all 4 sub-blocks indicate that they have frozen or unfrozen,
- * depending on the "freeze" parameter.
- *
- * No need to return an error if it times out; our only option
- * is to proceed anyway.
- */
-static void wait_for_freeze_status(struct hfi1_devdata *dd, int freeze)
-{
-       unsigned long timeout;
-       u64 reg;
-
-       timeout = jiffies + msecs_to_jiffies(FREEZE_STATUS_TIMEOUT);
-       while (1) {
-               reg = read_csr(dd, CCE_STATUS);
-               if (freeze) {
-                       /* waiting until all indicators are set */
-                       if ((reg & ALL_FROZE) == ALL_FROZE)
-                               return; /* all done */
-               } else {
-                       /* waiting until all indicators are clear */
-                       if ((reg & ALL_FROZE) == 0)
-                               return; /* all done */
-               }
-
-               if (time_after(jiffies, timeout)) {
-                       dd_dev_err(dd,
-                                  "Time out waiting for SPC %sfreeze, bits 0x%llx, expecting 0x%llx, continuing",
-                                  freeze ? "" : "un", reg & ALL_FROZE,
-                                  freeze ? ALL_FROZE : 0ull);
-                       return;
-               }
-               usleep_range(80, 120);
-       }
-}
-
-/*
- * Do all freeze handling for the RXE block.
- */
-static void rxe_freeze(struct hfi1_devdata *dd)
-{
-       int i;
-
-       /* disable port */
-       clear_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
-
-       /* disable all receive contexts */
-       for (i = 0; i < dd->num_rcv_contexts; i++)
-               hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS, i);
-}
-
-/*
- * Unfreeze handling for the RXE block - kernel contexts only.
- * This will also enable the port.  User contexts will do unfreeze
- * handling on a per-context basis as they call into the driver.
- */
-static void rxe_kernel_unfreeze(struct hfi1_devdata *dd)
-{
-       u32 rcvmask;
-       int i;
-
-       /* enable all kernel contexts */
-       for (i = 0; i < dd->n_krcv_queues; i++) {
-               rcvmask = HFI1_RCVCTRL_CTXT_ENB;
-               /* HFI1_RCVCTRL_TAILUPD_[ENB|DIS] needs to be set explicitly */
-               rcvmask |= HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, DMA_RTAIL) ?
-                       HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS;
-               hfi1_rcvctrl(dd, rcvmask, i);
-       }
-
-       /* enable port */
-       add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
-}
-
-/*
- * Non-interrupt SPC freeze handling.
- *
- * This is a work-queue function outside of the triggering interrupt.
- */
-void handle_freeze(struct work_struct *work)
-{
-       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
-                                                               freeze_work);
-       struct hfi1_devdata *dd = ppd->dd;
-
-       /* wait for freeze indicators on all affected blocks */
-       wait_for_freeze_status(dd, 1);
-
-       /* SPC is now frozen */
-
-       /* do send PIO freeze steps */
-       pio_freeze(dd);
-
-       /* do send DMA freeze steps */
-       sdma_freeze(dd);
-
-       /* do send egress freeze steps - nothing to do */
-
-       /* do receive freeze steps */
-       rxe_freeze(dd);
-
-       /*
-        * Unfreeze the hardware - clear the freeze, wait for each
-        * block's frozen bit to clear, then clear the frozen flag.
-        */
-       write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_UNFREEZE_SMASK);
-       wait_for_freeze_status(dd, 0);
-
-       if (is_ax(dd)) {
-               write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK);
-               wait_for_freeze_status(dd, 1);
-               write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_UNFREEZE_SMASK);
-               wait_for_freeze_status(dd, 0);
-       }
-
-       /* do send PIO unfreeze steps for kernel contexts */
-       pio_kernel_unfreeze(dd);
-
-       /* do send DMA unfreeze steps */
-       sdma_unfreeze(dd);
-
-       /* do send egress unfreeze steps - nothing to do */
-
-       /* do receive unfreeze steps for kernel contexts */
-       rxe_kernel_unfreeze(dd);
-
-       /*
-        * The unfreeze procedure touches global device registers when
-        * it disables and re-enables RXE. Mark the device unfrozen
-        * after all that is done so other parts of the driver waiting
-        * for the device to unfreeze don't do things out of order.
-        *
-        * The above implies that the meaning of the HFI1_FROZEN flag is
-        * "Device has gone into freeze mode and freeze mode handling
-        * is still in progress."
-        *
-        * The flag will be removed when freeze mode processing has
-        * completed.
-        */
-       dd->flags &= ~HFI1_FROZEN;
-       wake_up(&dd->event_queue);
-
-       /* no longer frozen */
-}
-
-/*
- * Handle a link up interrupt from the 8051.
- *
- * This is a work-queue function outside of the interrupt.
- */
-void handle_link_up(struct work_struct *work)
-{
-       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
-                                                 link_up_work);
-       set_link_state(ppd, HLS_UP_INIT);
-
-       /* cache the read of DC_LCB_STS_ROUND_TRIP_LTP_CNT */
-       read_ltp_rtt(ppd->dd);
-       /*
-        * OPA specifies that certain counters are cleared on a transition
-        * to link up, so do that.
-        */
-       clear_linkup_counters(ppd->dd);
-       /*
-        * And (re)set link up default values.
-        */
-       set_linkup_defaults(ppd);
-
-       /* enforce link speed enabled */
-       if ((ppd->link_speed_active & ppd->link_speed_enabled) == 0) {
-               /* oops - current speed is not enabled, bounce */
-               dd_dev_err(ppd->dd,
-                          "Link speed active 0x%x is outside enabled 0x%x, downing link\n",
-                          ppd->link_speed_active, ppd->link_speed_enabled);
-               set_link_down_reason(ppd, OPA_LINKDOWN_REASON_SPEED_POLICY, 0,
-                                    OPA_LINKDOWN_REASON_SPEED_POLICY);
-               set_link_state(ppd, HLS_DN_OFFLINE);
-               tune_serdes(ppd);
-               start_link(ppd);
-       }
-}
-
-/*
- * Several pieces of LNI information were cached for SMA in ppd.
- * Reset these on link down
- */
-static void reset_neighbor_info(struct hfi1_pportdata *ppd)
-{
-       ppd->neighbor_guid = 0;
-       ppd->neighbor_port_number = 0;
-       ppd->neighbor_type = 0;
-       ppd->neighbor_fm_security = 0;
-}
-
-static const char * const link_down_reason_strs[] = {
-       [OPA_LINKDOWN_REASON_NONE] = "None",
-       [OPA_LINKDOWN_REASON_RCV_ERROR_0] = "Receive error 0",
-       [OPA_LINKDOWN_REASON_BAD_PKT_LEN] = "Bad packet length",
-       [OPA_LINKDOWN_REASON_PKT_TOO_LONG] = "Packet too long",
-       [OPA_LINKDOWN_REASON_PKT_TOO_SHORT] = "Packet too short",
-       [OPA_LINKDOWN_REASON_BAD_SLID] = "Bad SLID",
-       [OPA_LINKDOWN_REASON_BAD_DLID] = "Bad DLID",
-       [OPA_LINKDOWN_REASON_BAD_L2] = "Bad L2",
-       [OPA_LINKDOWN_REASON_BAD_SC] = "Bad SC",
-       [OPA_LINKDOWN_REASON_RCV_ERROR_8] = "Receive error 8",
-       [OPA_LINKDOWN_REASON_BAD_MID_TAIL] = "Bad mid tail",
-       [OPA_LINKDOWN_REASON_RCV_ERROR_10] = "Receive error 10",
-       [OPA_LINKDOWN_REASON_PREEMPT_ERROR] = "Preempt error",
-       [OPA_LINKDOWN_REASON_PREEMPT_VL15] = "Preempt vl15",
-       [OPA_LINKDOWN_REASON_BAD_VL_MARKER] = "Bad VL marker",
-       [OPA_LINKDOWN_REASON_RCV_ERROR_14] = "Receive error 14",
-       [OPA_LINKDOWN_REASON_RCV_ERROR_15] = "Receive error 15",
-       [OPA_LINKDOWN_REASON_BAD_HEAD_DIST] = "Bad head distance",
-       [OPA_LINKDOWN_REASON_BAD_TAIL_DIST] = "Bad tail distance",
-       [OPA_LINKDOWN_REASON_BAD_CTRL_DIST] = "Bad control distance",
-       [OPA_LINKDOWN_REASON_BAD_CREDIT_ACK] = "Bad credit ack",
-       [OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER] = "Unsupported VL marker",
-       [OPA_LINKDOWN_REASON_BAD_PREEMPT] = "Bad preempt",
-       [OPA_LINKDOWN_REASON_BAD_CONTROL_FLIT] = "Bad control flit",
-       [OPA_LINKDOWN_REASON_EXCEED_MULTICAST_LIMIT] = "Exceed multicast limit",
-       [OPA_LINKDOWN_REASON_RCV_ERROR_24] = "Receive error 24",
-       [OPA_LINKDOWN_REASON_RCV_ERROR_25] = "Receive error 25",
-       [OPA_LINKDOWN_REASON_RCV_ERROR_26] = "Receive error 26",
-       [OPA_LINKDOWN_REASON_RCV_ERROR_27] = "Receive error 27",
-       [OPA_LINKDOWN_REASON_RCV_ERROR_28] = "Receive error 28",
-       [OPA_LINKDOWN_REASON_RCV_ERROR_29] = "Receive error 29",
-       [OPA_LINKDOWN_REASON_RCV_ERROR_30] = "Receive error 30",
-       [OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN] =
-                                       "Excessive buffer overrun",
-       [OPA_LINKDOWN_REASON_UNKNOWN] = "Unknown",
-       [OPA_LINKDOWN_REASON_REBOOT] = "Reboot",
-       [OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN] = "Neighbor unknown",
-       [OPA_LINKDOWN_REASON_FM_BOUNCE] = "FM bounce",
-       [OPA_LINKDOWN_REASON_SPEED_POLICY] = "Speed policy",
-       [OPA_LINKDOWN_REASON_WIDTH_POLICY] = "Width policy",
-       [OPA_LINKDOWN_REASON_DISCONNECTED] = "Disconnected",
-       [OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED] =
-                                       "Local media not installed",
-       [OPA_LINKDOWN_REASON_NOT_INSTALLED] = "Not installed",
-       [OPA_LINKDOWN_REASON_CHASSIS_CONFIG] = "Chassis config",
-       [OPA_LINKDOWN_REASON_END_TO_END_NOT_INSTALLED] =
-                                       "End to end not installed",
-       [OPA_LINKDOWN_REASON_POWER_POLICY] = "Power policy",
-       [OPA_LINKDOWN_REASON_LINKSPEED_POLICY] = "Link speed policy",
-       [OPA_LINKDOWN_REASON_LINKWIDTH_POLICY] = "Link width policy",
-       [OPA_LINKDOWN_REASON_SWITCH_MGMT] = "Switch management",
-       [OPA_LINKDOWN_REASON_SMA_DISABLED] = "SMA disabled",
-       [OPA_LINKDOWN_REASON_TRANSIENT] = "Transient"
-};
-
-/* return the neighbor link down reason string */
-static const char *link_down_reason_str(u8 reason)
-{
-       const char *str = NULL;
-
-       if (reason < ARRAY_SIZE(link_down_reason_strs))
-               str = link_down_reason_strs[reason];
-       if (!str)
-               str = "(invalid)";
-
-       return str;
-}
-
-/*
- * Handle a link down interrupt from the 8051.
- *
- * This is a work-queue function outside of the interrupt.
- */
-void handle_link_down(struct work_struct *work)
-{
-       u8 lcl_reason, neigh_reason = 0;
-       u8 link_down_reason;
-       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
-                                                 link_down_work);
-       int was_up;
-       static const char ldr_str[] = "Link down reason: ";
-
-       if ((ppd->host_link_state &
-            (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) &&
-            ppd->port_type == PORT_TYPE_FIXED)
-               ppd->offline_disabled_reason =
-                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NOT_INSTALLED);
-
-       /* Go offline first, then deal with reading/writing through 8051 */
-       was_up = !!(ppd->host_link_state & HLS_UP);
-       set_link_state(ppd, HLS_DN_OFFLINE);
-
-       if (was_up) {
-               lcl_reason = 0;
-               /* link down reason is only valid if the link was up */
-               read_link_down_reason(ppd->dd, &link_down_reason);
-               switch (link_down_reason) {
-               case LDR_LINK_TRANSFER_ACTIVE_LOW:
-                       /* the link went down, no idle message reason */
-                       dd_dev_info(ppd->dd, "%sUnexpected link down\n",
-                                   ldr_str);
-                       break;
-               case LDR_RECEIVED_LINKDOWN_IDLE_MSG:
-                       /*
-                        * The neighbor reason is only valid if an idle message
-                        * was received for it.
-                        */
-                       read_planned_down_reason_code(ppd->dd, &neigh_reason);
-                       dd_dev_info(ppd->dd,
-                                   "%sNeighbor link down message %d, %s\n",
-                                   ldr_str, neigh_reason,
-                                   link_down_reason_str(neigh_reason));
-                       break;
-               case LDR_RECEIVED_HOST_OFFLINE_REQ:
-                       dd_dev_info(ppd->dd,
-                                   "%sHost requested link to go offline\n",
-                                   ldr_str);
-                       break;
-               default:
-                       dd_dev_info(ppd->dd, "%sUnknown reason 0x%x\n",
-                                   ldr_str, link_down_reason);
-                       break;
-               }
-
-               /*
-                * If no reason, assume peer-initiated but missed
-                * LinkGoingDown idle flits.
-                */
-               if (neigh_reason == 0)
-                       lcl_reason = OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN;
-       } else {
-               /* went down while polling or going up */
-               lcl_reason = OPA_LINKDOWN_REASON_TRANSIENT;
-       }
-
-       set_link_down_reason(ppd, lcl_reason, neigh_reason, 0);
-
-       /* inform the SMA when the link transitions from up to down */
-       if (was_up && ppd->local_link_down_reason.sma == 0 &&
-           ppd->neigh_link_down_reason.sma == 0) {
-               ppd->local_link_down_reason.sma =
-                                       ppd->local_link_down_reason.latest;
-               ppd->neigh_link_down_reason.sma =
-                                       ppd->neigh_link_down_reason.latest;
-       }
-
-       reset_neighbor_info(ppd);
-
-       /* disable the port */
-       clear_rcvctrl(ppd->dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
-
-       /*
-        * If there is no cable attached, turn the DC off. Otherwise,
-        * start the link bring up.
-        */
-       if (ppd->port_type == PORT_TYPE_QSFP && !qsfp_mod_present(ppd)) {
-               dc_shutdown(ppd->dd);
-       } else {
-               tune_serdes(ppd);
-               start_link(ppd);
-       }
-}
-
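-/*
- * Handle a link bounce request.
- *
- * This is a work-queue function outside of the interrupt.
- */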
-void handle_link_bounce(struct work_struct *work)
-{
-       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
-                                                       link_bounce_work);
-
-       /*
-        * Only do something if the link is currently up.
-        */
-       if (ppd->host_link_state & HLS_UP) {
-               set_link_state(ppd, HLS_DN_OFFLINE);
-               tune_serdes(ppd);
-               start_link(ppd);
-       } else {
-               dd_dev_info(ppd->dd, "%s: link not up (%s), nothing to do\n",
-                           __func__, link_state_name(ppd->host_link_state));
-       }
-}
-
-/*
- * Mask conversion: Capability exchange to Port LTP.  The capability
- * exchange has an implicit 16b CRC that is mandatory.
- */
-static int cap_to_port_ltp(int cap)
-{
-       int port_ltp = PORT_LTP_CRC_MODE_16; /* this mode is mandatory */
-
-       if (cap & CAP_CRC_14B)
-               port_ltp |= PORT_LTP_CRC_MODE_14;
-       if (cap & CAP_CRC_48B)
-               port_ltp |= PORT_LTP_CRC_MODE_48;
-       if (cap & CAP_CRC_12B_16B_PER_LANE)
-               port_ltp |= PORT_LTP_CRC_MODE_PER_LANE;
-
-       return port_ltp;
-}
-
-/*
- * Convert an OPA Port LTP mask to a capability mask
- */
-int port_ltp_to_cap(int port_ltp)
-{
-       int cap_mask = 0;
-
-       if (port_ltp & PORT_LTP_CRC_MODE_14)
-               cap_mask |= CAP_CRC_14B;
-       if (port_ltp & PORT_LTP_CRC_MODE_48)
-               cap_mask |= CAP_CRC_48B;
-       if (port_ltp & PORT_LTP_CRC_MODE_PER_LANE)
-               cap_mask |= CAP_CRC_12B_16B_PER_LANE;
-
-       return cap_mask;
-}
-
-/*
- * Convert a single DC LCB CRC mode to an OPA Port LTP mask.
- */
-static int lcb_to_port_ltp(int lcb_crc)
-{
-       int port_ltp = 0;
-
-       if (lcb_crc == LCB_CRC_12B_16B_PER_LANE)
-               port_ltp = PORT_LTP_CRC_MODE_PER_LANE;
-       else if (lcb_crc == LCB_CRC_48B)
-               port_ltp = PORT_LTP_CRC_MODE_48;
-       else if (lcb_crc == LCB_CRC_14B)
-               port_ltp = PORT_LTP_CRC_MODE_14;
-       else
-               port_ltp = PORT_LTP_CRC_MODE_16;
-
-       return port_ltp;
-}
-
-/*
- * Our neighbor has indicated that we are allowed to act as a fabric
- * manager, so place the full management partition key in the second
- * (0-based) pkey array position (see OPAv1, section 20.2.2.6.8). Note
- * that we should already have the limited management partition key in
- * array element 1, and also that the port is not yet up when
- * add_full_mgmt_pkey() is invoked.
- */
-static void add_full_mgmt_pkey(struct hfi1_pportdata *ppd)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-
-       /* Sanity check - ppd->pkeys[2] should be 0, or already initialized */
-       if (!((ppd->pkeys[2] == 0) || (ppd->pkeys[2] == FULL_MGMT_P_KEY)))
-               dd_dev_warn(dd, "%s pkey[2] already set to 0x%x, resetting it to 0x%x\n",
-                           __func__, ppd->pkeys[2], FULL_MGMT_P_KEY);
-       ppd->pkeys[2] = FULL_MGMT_P_KEY;
-       (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0);
-}
-
-/*
- * Convert the given link width to the OPA link width bitmask.
- */
-static u16 link_width_to_bits(struct hfi1_devdata *dd, u16 width)
-{
-       switch (width) {
-       case 0:
-               /*
-                * Simulator and quick linkup do not set the width.
-                * Just set it to 4x without complaint.
-                */
-               if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR || quick_linkup)
-                       return OPA_LINK_WIDTH_4X;
-               return 0; /* no lanes up */
-       case 1: return OPA_LINK_WIDTH_1X;
-       case 2: return OPA_LINK_WIDTH_2X;
-       case 3: return OPA_LINK_WIDTH_3X;
-       default:
-               dd_dev_info(dd, "%s: invalid width %d, using 4\n",
-                           __func__, width);
-               /* fall through */
-       case 4: return OPA_LINK_WIDTH_4X;
-       }
-}
-
-/*
- * Do a population count on the bottom nibble.
- */
-static const u8 bit_counts[16] = {
-       0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4
-};
-
-static inline u8 nibble_to_count(u8 nibble)
-{
-       return bit_counts[nibble & 0xf];
-}
-
-/*
- * Read the active lane information from the 8051 registers and return
- * their widths.
- *
- * Active lane information is found in these 8051 registers:
- *     enable_lane_tx
- *     enable_lane_rx
- */
-static void get_link_widths(struct hfi1_devdata *dd, u16 *tx_width,
-                           u16 *rx_width)
-{
-       u16 tx, rx;
-       u8 enable_lane_rx;
-       u8 enable_lane_tx;
-       u8 tx_polarity_inversion;
-       u8 rx_polarity_inversion;
-       u8 max_rate;
-
-       /* read the active lanes */
-       read_tx_settings(dd, &enable_lane_tx, &tx_polarity_inversion,
-                        &rx_polarity_inversion, &max_rate);
-       read_local_lni(dd, &enable_lane_rx);
-
-       /* convert to counts */
-       tx = nibble_to_count(enable_lane_tx);
-       rx = nibble_to_count(enable_lane_rx);
-
-       /*
-        * Set link_speed_active here, overriding what was set in
-        * handle_verify_cap().  The ASIC 8051 firmware does not correctly
-        * set the max_rate field in handle_verify_cap until v0.19.
-        */
-       if ((dd->icode == ICODE_RTL_SILICON) &&
-           (dd->dc8051_ver < dc8051_ver(0, 19))) {
-               /* max_rate: 0 = 12.5G, 1 = 25G */
-               switch (max_rate) {
-               case 0:
-                       dd->pport[0].link_speed_active = OPA_LINK_SPEED_12_5G;
-                       break;
-               default:
-                       dd_dev_err(dd,
-                                  "%s: unexpected max rate %d, using 25Gb\n",
-                                  __func__, (int)max_rate);
-                       /* fall through */
-               case 1:
-                       dd->pport[0].link_speed_active = OPA_LINK_SPEED_25G;
-                       break;
-               }
-       }
-
-       dd_dev_info(dd,
-                   "Fabric active lanes (width): tx 0x%x (%d), rx 0x%x (%d)\n",
-                   enable_lane_tx, tx, enable_lane_rx, rx);
-       *tx_width = link_width_to_bits(dd, tx);
-       *rx_width = link_width_to_bits(dd, rx);
-}
-
-/*
- * Read verify_cap_local_fm_link_width[1] to obtain the link widths.
- * Valid after the end of VerifyCap and during LinkUp.  Does not change
- * after link up.  I.e. look elsewhere for downgrade information.
- *
- * Bits are:
- *     + bits [7:4] contain the number of active transmitters
- *     + bits [3:0] contain the number of active receivers
- * These are numbers 1 through 4 and can be different values if the
- * link is asymmetric.
- *
- * verify_cap_local_fm_link_width[0] retains its original value.
- */
-static void get_linkup_widths(struct hfi1_devdata *dd, u16 *tx_width,
-                             u16 *rx_width)
-{
-       u16 widths, tx, rx;
-       u8 misc_bits, local_flags;
-       u16 active_tx, active_rx;
-
-       read_vc_local_link_width(dd, &misc_bits, &local_flags, &widths);
-       tx = widths >> 12;
-       rx = (widths >> 8) & 0xf;
-
-       *tx_width = link_width_to_bits(dd, tx);
-       *rx_width = link_width_to_bits(dd, rx);
-
-       /* print the active widths */
-       get_link_widths(dd, &active_tx, &active_rx);
-}
-
-/*
- * Set ppd->link_width_active and ppd->link_width_downgrade_active using
- * hardware information when the link first comes up.
- *
- * The link width is not available until after VerifyCap.AllFramesReceived
- * (the trigger for handle_verify_cap), so this is outside that routine
- * and should be called when the 8051 signals linkup.
- */
-void get_linkup_link_widths(struct hfi1_pportdata *ppd)
-{
-       u16 tx_width, rx_width;
-
-       /* get end-of-LNI link widths */
-       get_linkup_widths(ppd->dd, &tx_width, &rx_width);
-
-       /* use tx_width as the link is supposed to be symmetric on link up */
-       ppd->link_width_active = tx_width;
-       /* link width downgrade active (LWD.A) starts out matching LW.A */
-       ppd->link_width_downgrade_tx_active = ppd->link_width_active;
-       ppd->link_width_downgrade_rx_active = ppd->link_width_active;
-       /* per OPA spec, on link up LWD.E resets to LWD.S */
-       ppd->link_width_downgrade_enabled = ppd->link_width_downgrade_supported;
-       /* cache the active egress rate (units [10^6 bits/sec]) */
-       ppd->current_egress_rate = active_egress_rate(ppd);
-}
-
-/*
- * Handle a verify capabilities interrupt from the 8051.
- *
- * This is a work-queue function outside of the interrupt.
- */
-void handle_verify_cap(struct work_struct *work)
-{
-       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
-                                                               link_vc_work);
-       struct hfi1_devdata *dd = ppd->dd;
-       u64 reg;
-       u8 power_management;
-       u8 continuous;
-       u8 vcu;
-       u8 vau;
-       u8 z;
-       u16 vl15buf;
-       u16 link_widths;
-       u16 crc_mask;
-       u16 crc_val;
-       u16 device_id;
-       u16 active_tx, active_rx;
-       u8 partner_supported_crc;
-       u8 remote_tx_rate;
-       u8 device_rev;
-
-       set_link_state(ppd, HLS_VERIFY_CAP);
-
-       lcb_shutdown(dd, 0);
-       adjust_lcb_for_fpga_serdes(dd);
-
-       /*
-        * These are now valid:
-        *      remote VerifyCap fields in the general LNI config
-        *      CSR DC8051_STS_REMOTE_GUID
-        *      CSR DC8051_STS_REMOTE_NODE_TYPE
-        *      CSR DC8051_STS_REMOTE_FM_SECURITY
-        *      CSR DC8051_STS_REMOTE_PORT_NO
-        */
-
-       read_vc_remote_phy(dd, &power_management, &continuous);
-       read_vc_remote_fabric(dd, &vau, &z, &vcu, &vl15buf,
-                             &partner_supported_crc);
-       read_vc_remote_link_width(dd, &remote_tx_rate, &link_widths);
-       read_remote_device_id(dd, &device_id, &device_rev);
-       /*
-        * And the 'MgmtAllowed' information, which is exchanged during
- * LNI, is also available at this point.
-        */
-       read_mgmt_allowed(dd, &ppd->mgmt_allowed);
-       /* print the active widths */
-       get_link_widths(dd, &active_tx, &active_rx);
-       dd_dev_info(dd,
-                   "Peer PHY: power management 0x%x, continuous updates 0x%x\n",
-                   (int)power_management, (int)continuous);
-       dd_dev_info(dd,
-                   "Peer Fabric: vAU %d, Z %d, vCU %d, vl15 credits 0x%x, CRC sizes 0x%x\n",
-                   (int)vau, (int)z, (int)vcu, (int)vl15buf,
-                   (int)partner_supported_crc);
-       dd_dev_info(dd, "Peer Link Width: tx rate 0x%x, widths 0x%x\n",
-                   (u32)remote_tx_rate, (u32)link_widths);
-       dd_dev_info(dd, "Peer Device ID: 0x%04x, Revision 0x%02x\n",
-                   (u32)device_id, (u32)device_rev);
-       /*
-        * The peer vAU value just read is the peer receiver value.  HFI does
-        * not support a transmit vAU of 0 (AU == 8).  We advertised that
-        * with Z=1 in the fabric capabilities sent to the peer.  The peer
-        * will see our Z=1, and, if it advertised a vAU of 0, will move its
-        * receive to vAU of 1 (AU == 16).  Do the same here.  We do not care
-        * about the peer Z value - our sent vAU is 3 (hardwired) and is not
-        * subject to the Z value exception.
-        */
-       if (vau == 0)
-               vau = 1;
-       set_up_vl15(dd, vau, vl15buf);
-
-       /* set up the LCB CRC mode */
-       crc_mask = ppd->port_crc_mode_enabled & partner_supported_crc;
-
-       /* order is important: use the lowest bit in common */
-       if (crc_mask & CAP_CRC_14B)
-               crc_val = LCB_CRC_14B;
-       else if (crc_mask & CAP_CRC_48B)
-               crc_val = LCB_CRC_48B;
-       else if (crc_mask & CAP_CRC_12B_16B_PER_LANE)
-               crc_val = LCB_CRC_12B_16B_PER_LANE;
-       else
-               crc_val = LCB_CRC_16B;
-
-       dd_dev_info(dd, "Final LCB CRC mode: %d\n", (int)crc_val);
-       write_csr(dd, DC_LCB_CFG_CRC_MODE,
-                 (u64)crc_val << DC_LCB_CFG_CRC_MODE_TX_VAL_SHIFT);
-
-       /* set (14b only) or clear sideband credit */
-       reg = read_csr(dd, SEND_CM_CTRL);
-       if (crc_val == LCB_CRC_14B && crc_14b_sideband) {
-               write_csr(dd, SEND_CM_CTRL,
-                         reg | SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK);
-       } else {
-               write_csr(dd, SEND_CM_CTRL,
-                         reg & ~SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK);
-       }
-
-       ppd->link_speed_active = 0;     /* invalid value */
-       if (dd->dc8051_ver < dc8051_ver(0, 20)) {
-               /* remote_tx_rate: 0 = 12.5G, 1 = 25G */
-               switch (remote_tx_rate) {
-               case 0:
-                       ppd->link_speed_active = OPA_LINK_SPEED_12_5G;
-                       break;
-               case 1:
-                       ppd->link_speed_active = OPA_LINK_SPEED_25G;
-                       break;
-               }
-       } else {
-               /* actual rate is highest bit of the ANDed rates */
-               u8 rate = remote_tx_rate & ppd->local_tx_rate;
-
-               if (rate & 2)
-                       ppd->link_speed_active = OPA_LINK_SPEED_25G;
-               else if (rate & 1)
-                       ppd->link_speed_active = OPA_LINK_SPEED_12_5G;
-       }
-       if (ppd->link_speed_active == 0) {
-               dd_dev_err(dd, "%s: unexpected remote tx rate %d, using 25Gb\n",
-                          __func__, (int)remote_tx_rate);
-               ppd->link_speed_active = OPA_LINK_SPEED_25G;
-       }
-
-       /*
-        * Cache the values of the supported, enabled, and active
-        * LTP CRC modes to return in 'portinfo' queries. But the bit
-        * flags that are returned in the portinfo query differ from
-        * what's in the link_crc_mask, crc_sizes, and crc_val
-        * variables. Convert these here.
-        */
-       ppd->port_ltp_crc_mode = cap_to_port_ltp(link_crc_mask) << 8;
-               /* supported crc modes */
-       ppd->port_ltp_crc_mode |=
-               cap_to_port_ltp(ppd->port_crc_mode_enabled) << 4;
-               /* enabled crc modes */
-       ppd->port_ltp_crc_mode |= lcb_to_port_ltp(crc_val);
-               /* active crc mode */
-
-       /* set up the remote credit return table */
-       assign_remote_cm_au_table(dd, vcu);
-
-       /*
-        * The LCB is reset on entry to handle_verify_cap(), so this must
-        * be applied on every link up.
-        *
-        * Adjust LCB error kill enable to kill the link if
-        * these RBUF errors are seen:
-        *      REPLAY_BUF_MBE_SMASK
-        *      FLIT_INPUT_BUF_MBE_SMASK
-        */
-       if (is_ax(dd)) {                        /* fixed in B0 */
-               reg = read_csr(dd, DC_LCB_CFG_LINK_KILL_EN);
-               reg |= DC_LCB_CFG_LINK_KILL_EN_REPLAY_BUF_MBE_SMASK
-                       | DC_LCB_CFG_LINK_KILL_EN_FLIT_INPUT_BUF_MBE_SMASK;
-               write_csr(dd, DC_LCB_CFG_LINK_KILL_EN, reg);
-       }
-
-       /* pull LCB fifos out of reset - all fifo clocks must be stable */
-       write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0);
-
-       /* give 8051 access to the LCB CSRs */
-       write_csr(dd, DC_LCB_ERR_EN, 0); /* mask LCB errors */
-       set_8051_lcb_access(dd);
-
-       ppd->neighbor_guid =
-               read_csr(dd, DC_DC8051_STS_REMOTE_GUID);
-       ppd->neighbor_port_number = read_csr(dd, DC_DC8051_STS_REMOTE_PORT_NO) &
-                                       DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK;
-       ppd->neighbor_type =
-               read_csr(dd, DC_DC8051_STS_REMOTE_NODE_TYPE) &
-               DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK;
-       ppd->neighbor_fm_security =
-               read_csr(dd, DC_DC8051_STS_REMOTE_FM_SECURITY) &
-               DC_DC8051_STS_LOCAL_FM_SECURITY_DISABLED_MASK;
-       dd_dev_info(dd,
-                   "Neighbor Guid: %llx Neighbor type %d MgmtAllowed %d FM security bypass %d\n",
-                   ppd->neighbor_guid, ppd->neighbor_type,
-                   ppd->mgmt_allowed, ppd->neighbor_fm_security);
-       if (ppd->mgmt_allowed)
-               add_full_mgmt_pkey(ppd);
-
-       /* tell the 8051 to go to LinkUp */
-       set_link_state(ppd, HLS_GOING_UP);
-}
-
-/*
- * Apply the link width downgrade enabled policy against the current active
- * link widths.
- *
- * Called when the enabled policy changes or the active link widths change.
- */
-void apply_link_downgrade_policy(struct hfi1_pportdata *ppd, int refresh_widths)
-{
-       int do_bounce = 0;
-       int tries;
-       u16 lwde;
-       u16 tx, rx;
-
-       /* use the hls lock to avoid a race with actual link up */
-       tries = 0;
-retry:
-       mutex_lock(&ppd->hls_lock);
-       /* only apply if the link is up */
-       if (!(ppd->host_link_state & HLS_UP)) {
-               /* still going up; wait and retry */
-               if (ppd->host_link_state & HLS_GOING_UP) {
-                       if (++tries < 1000) {
-                               mutex_unlock(&ppd->hls_lock);
-                               usleep_range(100, 120); /* arbitrary */
-                               goto retry;
-                       }
-                       dd_dev_err(ppd->dd,
-                                  "%s: giving up waiting for link state change\n",
-                                  __func__);
-               }
-               goto done;
-       }
-
-       lwde = ppd->link_width_downgrade_enabled;
-
-       if (refresh_widths) {
-               get_link_widths(ppd->dd, &tx, &rx);
-               ppd->link_width_downgrade_tx_active = tx;
-               ppd->link_width_downgrade_rx_active = rx;
-       }
-
-       if (ppd->link_width_downgrade_tx_active == 0 ||
-           ppd->link_width_downgrade_rx_active == 0) {
-               /* the 8051 reported a dead link as a downgrade */
-               dd_dev_err(ppd->dd, "Link downgrade is really a link down, ignoring\n");
-       } else if (lwde == 0) {
-               /* downgrade is disabled */
-
-               /* bounce if not at starting active width */
-               if ((ppd->link_width_active !=
-                    ppd->link_width_downgrade_tx_active) ||
-                   (ppd->link_width_active !=
-                    ppd->link_width_downgrade_rx_active)) {
-                       dd_dev_err(ppd->dd,
-                                  "Link downgrade is disabled and link has downgraded, downing link\n");
-                       dd_dev_err(ppd->dd,
-                                  "  original 0x%x, tx active 0x%x, rx active 0x%x\n",
-                                  ppd->link_width_active,
-                                  ppd->link_width_downgrade_tx_active,
-                                  ppd->link_width_downgrade_rx_active);
-                       do_bounce = 1;
-               }
-       } else if ((lwde & ppd->link_width_downgrade_tx_active) == 0 ||
-                  (lwde & ppd->link_width_downgrade_rx_active) == 0) {
-               /* Tx or Rx is outside the enabled policy */
-               dd_dev_err(ppd->dd,
-                          "Link is outside of downgrade allowed, downing link\n");
-               dd_dev_err(ppd->dd,
-                          "  enabled 0x%x, tx active 0x%x, rx active 0x%x\n",
-                          lwde, ppd->link_width_downgrade_tx_active,
-                          ppd->link_width_downgrade_rx_active);
-               do_bounce = 1;
-       }
-
-done:
-       mutex_unlock(&ppd->hls_lock);
-
-       if (do_bounce) {
-               set_link_down_reason(ppd, OPA_LINKDOWN_REASON_WIDTH_POLICY, 0,
-                                    OPA_LINKDOWN_REASON_WIDTH_POLICY);
-               set_link_state(ppd, HLS_DN_OFFLINE);
-               tune_serdes(ppd);
-               start_link(ppd);
-       }
-}
-
-/*
- * Handle a link downgrade interrupt from the 8051.
- *
- * This is a work-queue function outside of the interrupt.
- */
-void handle_link_downgrade(struct work_struct *work)
-{
-       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
-                                                       link_downgrade_work);
-
-       dd_dev_info(ppd->dd, "8051: Link width downgrade\n");
-       apply_link_downgrade_policy(ppd, 1);
-}
-
-static char *dcc_err_string(char *buf, int buf_len, u64 flags)
-{
-       return flag_string(buf, buf_len, flags, dcc_err_flags,
-               ARRAY_SIZE(dcc_err_flags));
-}
-
-static char *lcb_err_string(char *buf, int buf_len, u64 flags)
-{
-       return flag_string(buf, buf_len, flags, lcb_err_flags,
-               ARRAY_SIZE(lcb_err_flags));
-}
-
-static char *dc8051_err_string(char *buf, int buf_len, u64 flags)
-{
-       return flag_string(buf, buf_len, flags, dc8051_err_flags,
-               ARRAY_SIZE(dc8051_err_flags));
-}
-
-static char *dc8051_info_err_string(char *buf, int buf_len, u64 flags)
-{
-       return flag_string(buf, buf_len, flags, dc8051_info_err_flags,
-               ARRAY_SIZE(dc8051_info_err_flags));
-}
-
-static char *dc8051_info_host_msg_string(char *buf, int buf_len, u64 flags)
-{
-       return flag_string(buf, buf_len, flags, dc8051_info_host_msg_flags,
-               ARRAY_SIZE(dc8051_info_host_msg_flags));
-}
-
-static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg)
-{
-       struct hfi1_pportdata *ppd = dd->pport;
-       u64 info, err, host_msg;
-       int queue_link_down = 0;
-       char buf[96];
-
-       /* look at the flags */
-       if (reg & DC_DC8051_ERR_FLG_SET_BY_8051_SMASK) {
-               /* 8051 information set by firmware */
-               /* read DC8051_DBG_ERR_INFO_SET_BY_8051 for details */
-               info = read_csr(dd, DC_DC8051_DBG_ERR_INFO_SET_BY_8051);
-               err = (info >> DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_SHIFT)
-                       & DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_MASK;
-               host_msg = (info >>
-                       DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_SHIFT)
-                       & DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_MASK;
-
-               /*
-                * Handle error flags.
-                */
-               if (err & FAILED_LNI) {
-                       /*
-                        * LNI error indications are cleared by the 8051
-                        * only when starting polling.  Only pay attention
-                        * to them when in the states that occur during
-                        * LNI.
-                        */
-                       if (ppd->host_link_state
-                           & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) {
-                               queue_link_down = 1;
-                               dd_dev_info(dd, "Link error: %s\n",
-                                           dc8051_info_err_string(buf,
-                                                                  sizeof(buf),
-                                                                  err &
-                                                                  FAILED_LNI));
-                       }
-                       err &= ~(u64)FAILED_LNI;
-               }
-               /* unknown frames can happen during LNI, just count */
-               if (err & UNKNOWN_FRAME) {
-                       ppd->unknown_frame_count++;
-                       err &= ~(u64)UNKNOWN_FRAME;
-               }
-               if (err) {
-                       /* report remaining errors, but do not do anything */
-                       dd_dev_err(dd, "8051 info error: %s\n",
-                                  dc8051_info_err_string(buf, sizeof(buf),
-                                                         err));
-               }
-
-               /*
-                * Handle host message flags.
-                */
-               if (host_msg & HOST_REQ_DONE) {
-                       /*
-                        * Presently, the driver does a busy wait for
-                        * host requests to complete.  This is only an
-                        * informational message.
-                        * NOTE: The 8051 clears the host message
-                        * information *on the next 8051 command*.
-                        * Therefore, when linkup is achieved,
-                        * this flag will still be set.
-                        */
-                       host_msg &= ~(u64)HOST_REQ_DONE;
-               }
-               if (host_msg & BC_SMA_MSG) {
-                       queue_work(ppd->hfi1_wq, &ppd->sma_message_work);
-                       host_msg &= ~(u64)BC_SMA_MSG;
-               }
-               if (host_msg & LINKUP_ACHIEVED) {
-                       dd_dev_info(dd, "8051: Link up\n");
-                       queue_work(ppd->hfi1_wq, &ppd->link_up_work);
-                       host_msg &= ~(u64)LINKUP_ACHIEVED;
-               }
-               if (host_msg & EXT_DEVICE_CFG_REQ) {
-                       handle_8051_request(ppd);
-                       host_msg &= ~(u64)EXT_DEVICE_CFG_REQ;
-               }
-               if (host_msg & VERIFY_CAP_FRAME) {
-                       queue_work(ppd->hfi1_wq, &ppd->link_vc_work);
-                       host_msg &= ~(u64)VERIFY_CAP_FRAME;
-               }
-               if (host_msg & LINK_GOING_DOWN) {
-                       const char *extra = "";
-                       /* no downgrade action needed if going down */
-                       if (host_msg & LINK_WIDTH_DOWNGRADED) {
-                               host_msg &= ~(u64)LINK_WIDTH_DOWNGRADED;
-                               extra = " (ignoring downgrade)";
-                       }
-                       dd_dev_info(dd, "8051: Link down%s\n", extra);
-                       queue_link_down = 1;
-                       host_msg &= ~(u64)LINK_GOING_DOWN;
-               }
-               if (host_msg & LINK_WIDTH_DOWNGRADED) {
-                       queue_work(ppd->hfi1_wq, &ppd->link_downgrade_work);
-                       host_msg &= ~(u64)LINK_WIDTH_DOWNGRADED;
-               }
-               if (host_msg) {
-                       /* report remaining messages, but do not do anything */
-                       dd_dev_info(dd, "8051 info host message: %s\n",
-                                   dc8051_info_host_msg_string(buf,
-                                                               sizeof(buf),
-                                                               host_msg));
-               }
-
-               reg &= ~DC_DC8051_ERR_FLG_SET_BY_8051_SMASK;
-       }
-       if (reg & DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK) {
-               /*
-                * Lost the 8051 heartbeat.  If this happens, we
-                * receive constant interrupts about it.  Disable
-                * the interrupt after the first.
-                */
-               dd_dev_err(dd, "Lost 8051 heartbeat\n");
-               write_csr(dd, DC_DC8051_ERR_EN,
-                         read_csr(dd, DC_DC8051_ERR_EN) &
-                         ~DC_DC8051_ERR_EN_LOST_8051_HEART_BEAT_SMASK);
-
-               reg &= ~DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK;
-       }
-       if (reg) {
-               /* report the error, but do not do anything */
-               dd_dev_err(dd, "8051 error: %s\n",
-                          dc8051_err_string(buf, sizeof(buf), reg));
-       }
-
-       if (queue_link_down) {
-               /*
-                * if the link is already going down or disabled, do not
-                * queue another
-                */
-               if ((ppd->host_link_state &
-                   (HLS_GOING_OFFLINE | HLS_LINK_COOLDOWN)) ||
-                   ppd->link_enabled == 0) {
-                       dd_dev_info(dd, "%s: not queuing link down\n",
-                                   __func__);
-               } else {
-                       queue_work(ppd->hfi1_wq, &ppd->link_down_work);
-               }
-       }
-}
-
-static const char * const fm_config_txt[] = {
-[0] =
-       "BadHeadDist: Distance violation between two head flits",
-[1] =
-       "BadTailDist: Distance violation between two tail flits",
-[2] =
-       "BadCtrlDist: Distance violation between two credit control flits",
-[3] =
-       "BadCrdAck: Credits return for unsupported VL",
-[4] =
-       "UnsupportedVLMarker: Received VL Marker",
-[5] =
-       "BadPreempt: Exceeded the preemption nesting level",
-[6] =
-       "BadControlFlit: Received unsupported control flit",
-/* no 7 */
-[8] =
-       "UnsupportedVLMarker: Received VL Marker for unconfigured or disabled VL",
-};
-
-static const char * const port_rcv_txt[] = {
-[1] =
-       "BadPktLen: Illegal PktLen",
-[2] =
-       "PktLenTooLong: Packet longer than PktLen",
-[3] =
-       "PktLenTooShort: Packet shorter than PktLen",
-[4] =
-       "BadSLID: Illegal SLID (0, using multicast as SLID, does not include security validation of SLID)",
-[5] =
-       "BadDLID: Illegal DLID (0, doesn't match HFI)",
-[6] =
-       "BadL2: Illegal L2 opcode",
-[7] =
-       "BadSC: Unsupported SC",
-[9] =
-       "BadRC: Illegal RC",
-[11] =
-       "PreemptError: Preempting with same VL",
-[12] =
-       "PreemptVL15: Preempting a VL15 packet",
-};
-
-#define OPA_LDR_FMCONFIG_OFFSET 16
-#define OPA_LDR_PORTRCV_OFFSET 0
-static void handle_dcc_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
-{
-       u64 info, hdr0, hdr1;
-       const char *extra;
-       char buf[96];
-       struct hfi1_pportdata *ppd = dd->pport;
-       u8 lcl_reason = 0;
-       int do_bounce = 0;
-
-       if (reg & DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK) {
-               if (!(dd->err_info_uncorrectable & OPA_EI_STATUS_SMASK)) {
-                       info = read_csr(dd, DCC_ERR_INFO_UNCORRECTABLE);
-                       dd->err_info_uncorrectable = info & OPA_EI_CODE_SMASK;
-                       /* set status bit */
-                       dd->err_info_uncorrectable |= OPA_EI_STATUS_SMASK;
-               }
-               reg &= ~DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK;
-       }
-
-       if (reg & DCC_ERR_FLG_LINK_ERR_SMASK) {
-               struct hfi1_pportdata *ppd = dd->pport;
-               /* this counter saturates at (2^32) - 1 */
-               if (ppd->link_downed < (u32)UINT_MAX)
-                       ppd->link_downed++;
-               reg &= ~DCC_ERR_FLG_LINK_ERR_SMASK;
-       }
-
-       if (reg & DCC_ERR_FLG_FMCONFIG_ERR_SMASK) {
-               u8 reason_valid = 1;
-
-               info = read_csr(dd, DCC_ERR_INFO_FMCONFIG);
-               if (!(dd->err_info_fmconfig & OPA_EI_STATUS_SMASK)) {
-                       dd->err_info_fmconfig = info & OPA_EI_CODE_SMASK;
-                       /* set status bit */
-                       dd->err_info_fmconfig |= OPA_EI_STATUS_SMASK;
-               }
-               switch (info) {
-               case 0:
-               case 1:
-               case 2:
-               case 3:
-               case 4:
-               case 5:
-               case 6:
-                       extra = fm_config_txt[info];
-                       break;
-               case 8:
-                       extra = fm_config_txt[info];
-                       if (ppd->port_error_action &
-                           OPA_PI_MASK_FM_CFG_UNSUPPORTED_VL_MARKER) {
-                               do_bounce = 1;
-                               /*
-                                * lcl_reason cannot be derived from info
-                                * for this error
-                                */
-                               lcl_reason =
-                                 OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER;
-                       }
-                       break;
-               default:
-                       reason_valid = 0;
-                       snprintf(buf, sizeof(buf), "reserved%lld", info);
-                       extra = buf;
-                       break;
-               }
-
-               if (reason_valid && !do_bounce) {
-                       do_bounce = ppd->port_error_action &
-                                       (1 << (OPA_LDR_FMCONFIG_OFFSET + info));
-                       lcl_reason = info + OPA_LINKDOWN_REASON_BAD_HEAD_DIST;
-               }
-
-               /* just report this */
-               dd_dev_info(dd, "DCC Error: fmconfig error: %s\n", extra);
-               reg &= ~DCC_ERR_FLG_FMCONFIG_ERR_SMASK;
-       }
-
-       if (reg & DCC_ERR_FLG_RCVPORT_ERR_SMASK) {
-               u8 reason_valid = 1;
-
-               info = read_csr(dd, DCC_ERR_INFO_PORTRCV);
-               hdr0 = read_csr(dd, DCC_ERR_INFO_PORTRCV_HDR0);
-               hdr1 = read_csr(dd, DCC_ERR_INFO_PORTRCV_HDR1);
-               if (!(dd->err_info_rcvport.status_and_code &
-                     OPA_EI_STATUS_SMASK)) {
-                       dd->err_info_rcvport.status_and_code =
-                               info & OPA_EI_CODE_SMASK;
-                       /* set status bit */
-                       dd->err_info_rcvport.status_and_code |=
-                               OPA_EI_STATUS_SMASK;
-                       /*
-                        * save first 2 flits in the packet that caused
-                        * the error
-                        */
-                        dd->err_info_rcvport.packet_flit1 = hdr0;
-                        dd->err_info_rcvport.packet_flit2 = hdr1;
-               }
-               switch (info) {
-               case 1:
-               case 2:
-               case 3:
-               case 4:
-               case 5:
-               case 6:
-               case 7:
-               case 9:
-               case 11:
-               case 12:
-                       extra = port_rcv_txt[info];
-                       break;
-               default:
-                       reason_valid = 0;
-                       snprintf(buf, sizeof(buf), "reserved%lld", info);
-                       extra = buf;
-                       break;
-               }
-
-               if (reason_valid && !do_bounce) {
-                       do_bounce = ppd->port_error_action &
-                                       (1 << (OPA_LDR_PORTRCV_OFFSET + info));
-                       lcl_reason = info + OPA_LINKDOWN_REASON_RCV_ERROR_0;
-               }
-
-               /* just report this */
-               dd_dev_info(dd, "DCC Error: PortRcv error: %s\n", extra);
-               dd_dev_info(dd, "           hdr0 0x%llx, hdr1 0x%llx\n",
-                           hdr0, hdr1);
-
-               reg &= ~DCC_ERR_FLG_RCVPORT_ERR_SMASK;
-       }
-
-       if (reg & DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK) {
-               /* informative only */
-               dd_dev_info(dd, "8051 access to LCB blocked\n");
-               reg &= ~DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK;
-       }
-       if (reg & DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK) {
-               /* informative only */
-               dd_dev_info(dd, "host access to LCB blocked\n");
-               reg &= ~DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK;
-       }
-
-       /* report any remaining errors */
-       if (reg)
-               dd_dev_info(dd, "DCC Error: %s\n",
-                           dcc_err_string(buf, sizeof(buf), reg));
-
-       if (lcl_reason == 0)
-               lcl_reason = OPA_LINKDOWN_REASON_UNKNOWN;
-
-       if (do_bounce) {
-               dd_dev_info(dd, "%s: PortErrorAction bounce\n", __func__);
-               set_link_down_reason(ppd, lcl_reason, 0, lcl_reason);
-               queue_work(ppd->hfi1_wq, &ppd->link_bounce_work);
-       }
-}
-
-static void handle_lcb_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
-{
-       char buf[96];
-
-       dd_dev_info(dd, "LCB Error: %s\n",
-                   lcb_err_string(buf, sizeof(buf), reg));
-}
-
-/*
- * CCE block DC interrupt.  Source is < 8.
- */
-static void is_dc_int(struct hfi1_devdata *dd, unsigned int source)
-{
-       const struct err_reg_info *eri = &dc_errs[source];
-
-       if (eri->handler) {
-               interrupt_clear_down(dd, 0, eri);
-       } else if (source == 3 /* dc_lbm_int */) {
-               /*
-                * This indicates that a parity error has occurred on the
-                * address/control lines presented to the LBM.  The error
-                * is a single pulse, there is no associated error flag,
-                * and it is non-maskable.  This is because if a parity
-                * error occurs on the request, the request is dropped.
-                * This should never occur, but it is nice to know if it
-                * ever does.
-                */
-               dd_dev_err(dd, "Parity error in DC LBM block\n");
-       } else {
-               dd_dev_err(dd, "Invalid DC interrupt %u\n", source);
-       }
-}
-
-/*
- * TX block send credit interrupt.  Source is < 160.
- */
-static void is_send_credit_int(struct hfi1_devdata *dd, unsigned int source)
-{
-       sc_group_release_update(dd, source);
-}
-
-/*
- * TX block SDMA interrupt.  Source is < 48.
- *
- * SDMA interrupts are grouped by type:
- *
- *      0 -  N-1 = SDma
- *      N - 2N-1 = SDmaProgress
- *     2N - 3N-1 = SDmaIdle
- */
-static void is_sdma_eng_int(struct hfi1_devdata *dd, unsigned int source)
-{
-       /* what interrupt */
-       unsigned int what  = source / TXE_NUM_SDMA_ENGINES;
-       /* which engine */
-       unsigned int which = source % TXE_NUM_SDMA_ENGINES;
-
-#ifdef CONFIG_SDMA_VERBOSITY
-       dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", which,
-                  slashstrip(__FILE__), __LINE__, __func__);
-       sdma_dumpstate(&dd->per_sdma[which]);
-#endif
-
-       if (likely(what < 3 && which < dd->num_sdma)) {
-               sdma_engine_interrupt(&dd->per_sdma[which], 1ull << source);
-       } else {
-               /* should not happen */
-               dd_dev_err(dd, "Invalid SDMA interrupt 0x%x\n", source);
-       }
-}
-
-/*
- * RX block receive available interrupt.  Source is < 160.
- */
-static void is_rcv_avail_int(struct hfi1_devdata *dd, unsigned int source)
-{
-       struct hfi1_ctxtdata *rcd;
-       char *err_detail;
-
-       if (likely(source < dd->num_rcv_contexts)) {
-               rcd = dd->rcd[source];
-               if (rcd) {
-                       if (source < dd->first_user_ctxt)
-                               rcd->do_interrupt(rcd, 0);
-                       else
-                               handle_user_interrupt(rcd);
-                       return; /* OK */
-               }
-               /* received an interrupt, but no rcd */
-               err_detail = "dataless";
-       } else {
-               /* received an interrupt, but are not using that context */
-               err_detail = "out of range";
-       }
-       dd_dev_err(dd, "unexpected %s receive available context interrupt %u\n",
-                  err_detail, source);
-}
-
-/*
- * RX block receive urgent interrupt.  Source is < 160.
- */
-static void is_rcv_urgent_int(struct hfi1_devdata *dd, unsigned int source)
-{
-       struct hfi1_ctxtdata *rcd;
-       char *err_detail;
-
-       if (likely(source < dd->num_rcv_contexts)) {
-               rcd = dd->rcd[source];
-               if (rcd) {
-                       /* only pay attention to user urgent interrupts */
-                       if (source >= dd->first_user_ctxt)
-                               handle_user_interrupt(rcd);
-                       return; /* OK */
-               }
-               /* received an interrupt, but no rcd */
-               err_detail = "dataless";
-       } else {
-               /* received an interrupt, but are not using that context */
-               err_detail = "out of range";
-       }
-       dd_dev_err(dd, "unexpected %s receive urgent context interrupt %u\n",
-                  err_detail, source);
-}
-
-/*
- * Reserved range interrupt.  Should not be called in normal operation.
- */
-static void is_reserved_int(struct hfi1_devdata *dd, unsigned int source)
-{
-       char name[64];
-
-       dd_dev_err(dd, "unexpected %s interrupt\n",
-                  is_reserved_name(name, sizeof(name), source));
-}
-
-static const struct is_table is_table[] = {
-/*
- * start                end
- *                             name func               interrupt func
- */
-{ IS_GENERAL_ERR_START,  IS_GENERAL_ERR_END,
-                               is_misc_err_name,       is_misc_err_int },
-{ IS_SDMAENG_ERR_START,  IS_SDMAENG_ERR_END,
-                               is_sdma_eng_err_name,   is_sdma_eng_err_int },
-{ IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END,
-                               is_sendctxt_err_name,   is_sendctxt_err_int },
-{ IS_SDMA_START,            IS_SDMA_END,
-                               is_sdma_eng_name,       is_sdma_eng_int },
-{ IS_VARIOUS_START,         IS_VARIOUS_END,
-                               is_various_name,        is_various_int },
-{ IS_DC_START,      IS_DC_END,
-                               is_dc_name,             is_dc_int },
-{ IS_RCVAVAIL_START,     IS_RCVAVAIL_END,
-                               is_rcv_avail_name,      is_rcv_avail_int },
-{ IS_RCVURGENT_START,    IS_RCVURGENT_END,
-                               is_rcv_urgent_name,     is_rcv_urgent_int },
-{ IS_SENDCREDIT_START,   IS_SENDCREDIT_END,
-                               is_send_credit_name,    is_send_credit_int},
-{ IS_RESERVED_START,     IS_RESERVED_END,
-                               is_reserved_name,       is_reserved_int},
-};
-
-/*
- * Interrupt source interrupt - called when the given source has an interrupt.
- * Source is a bit index into an array of 64-bit integers.
- */
-static void is_interrupt(struct hfi1_devdata *dd, unsigned int source)
-{
-       const struct is_table *entry;
-
-       /* avoids a double compare by walking the table in-order */
-       for (entry = &is_table[0]; entry->is_name; entry++) {
-               if (source < entry->end) {
-                       trace_hfi1_interrupt(dd, entry, source);
-                       entry->is_int(dd, source - entry->start);
-                       return;
-               }
-       }
-       /* fell off the end */
-       dd_dev_err(dd, "invalid interrupt source %u\n", source);
-}
-
-/*
- * General interrupt handler.  This is able to correctly handle
- * all interrupts in case INTx is used.
- */
-static irqreturn_t general_interrupt(int irq, void *data)
-{
-       struct hfi1_devdata *dd = data;
-       u64 regs[CCE_NUM_INT_CSRS];
-       u32 bit;
-       int i;
-
-       this_cpu_inc(*dd->int_counter);
-
-       /* phase 1: scan and clear all handled interrupts */
-       for (i = 0; i < CCE_NUM_INT_CSRS; i++) {
-               if (dd->gi_mask[i] == 0) {
-                       regs[i] = 0;    /* used later */
-                       continue;
-               }
-               regs[i] = read_csr(dd, CCE_INT_STATUS + (8 * i)) &
-                               dd->gi_mask[i];
-               /* only clear if anything is set */
-               if (regs[i])
-                       write_csr(dd, CCE_INT_CLEAR + (8 * i), regs[i]);
-       }
-
-       /* phase 2: call the appropriate handler */
-       for_each_set_bit(bit, (unsigned long *)&regs[0],
-                        CCE_NUM_INT_CSRS * 64) {
-               is_interrupt(dd, bit);
-       }
-
-       return IRQ_HANDLED;
-}
-
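-/*
- * SDMA engine interrupt handler.  Read this engine's pending interrupt
- * status bits, clear them, then dispatch to the SDMA engine code.
- */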
-static irqreturn_t sdma_interrupt(int irq, void *data)
-{
-       struct sdma_engine *sde = data;
-       struct hfi1_devdata *dd = sde->dd;
-       u64 status;
-
-#ifdef CONFIG_SDMA_VERBOSITY
-       dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
-                  slashstrip(__FILE__), __LINE__, __func__);
-       sdma_dumpstate(sde);
-#endif
-
-       this_cpu_inc(*dd->int_counter);
-
-       /* This read_csr is really bad in the hot path */
-       status = read_csr(dd,
-                         CCE_INT_STATUS + (8 * (IS_SDMA_START / 64)))
-                         & sde->imask;
-       if (likely(status)) {
-               /* clear the interrupt(s) */
-               write_csr(dd,
-                         CCE_INT_CLEAR + (8 * (IS_SDMA_START / 64)),
-                         status);
-
-               /* handle the interrupt(s) */
-               sdma_engine_interrupt(sde, status);
-       } else {
-               dd_dev_err(dd, "SDMA engine %u interrupt, but no status bits set\n",
-                          sde->this_idx);
-       }
-
-       return IRQ_HANDLED;
-}
-
-/*
- * Clear the receive interrupt.  Use a read of the interrupt clear CSR
- * to ensure that the write completed.  This does NOT guarantee that
- * queued DMA writes to memory from the chip are pushed.
- */
-static inline void clear_recv_intr(struct hfi1_ctxtdata *rcd)
-{
-       struct hfi1_devdata *dd = rcd->dd;
-       u32 addr = CCE_INT_CLEAR + (8 * rcd->ireg);
-
-       mmiowb();       /* make sure everything before is written */
-       write_csr(dd, addr, rcd->imask);
-       /* force the above write on the chip and get a value back */
-       (void)read_csr(dd, addr);
-}
-
-/* force the receive interrupt */
-void force_recv_intr(struct hfi1_ctxtdata *rcd)
-{
-       write_csr(rcd->dd, CCE_INT_FORCE + (8 * rcd->ireg), rcd->imask);
-}
-
-/*
- * Return non-zero if a packet is present.
- *
- * This routine is called when rechecking for packets after the RcvAvail
- * interrupt has been cleared down.  First, do a quick check of memory for
- * a packet present.  If not found, use an expensive CSR read of the context
- * tail to determine the actual tail.  The CSR read is necessary because there
- * is no method to push pending DMAs to memory other than an interrupt and we
- * are trying to determine if we need to force an interrupt.
- */
-static inline int check_packet_present(struct hfi1_ctxtdata *rcd)
-{
-       u32 tail;
-       int present;
-
-       if (!HFI1_CAP_IS_KSET(DMA_RTAIL))
-               present = (rcd->seq_cnt ==
-                               rhf_rcv_seq(rhf_to_cpu(get_rhf_addr(rcd))));
-       else /* is RDMA rtail */
-               present = (rcd->head != get_rcvhdrtail(rcd));
-
-       if (present)
-               return 1;
-
-       /* fall back to a CSR read, correct independent of DMA_RTAIL */
-       tail = (u32)read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_TAIL);
-       return rcd->head != tail;
-}
-
-/*
- * Receive packet IRQ handler.  This routine expects to be on its own IRQ.
- * This routine will try to handle packets immediately (latency), but if
- * it finds too many, it will invoke the thread handler (bandwidth).  The
- * chip receive interrupt is *not* cleared down until this or the thread (if
- * invoked) is finished.  The intent is to avoid extra interrupts while we
- * are processing packets anyway.
- */
-static irqreturn_t receive_context_interrupt(int irq, void *data)
-{
-       struct hfi1_ctxtdata *rcd = data;
-       struct hfi1_devdata *dd = rcd->dd;
-       int disposition;
-       int present;
-
-       trace_hfi1_receive_interrupt(dd, rcd->ctxt);
-       this_cpu_inc(*dd->int_counter);
-       aspm_ctx_disable(rcd);
-
-       /* receive interrupt remains blocked while processing packets */
-       disposition = rcd->do_interrupt(rcd, 0);
-
-       /*
-        * Too many packets were seen while processing packets in this
-        * IRQ handler.  Invoke the handler thread.  The receive interrupt
-        * remains blocked.
-        */
-       if (disposition == RCV_PKT_LIMIT)
-               return IRQ_WAKE_THREAD;
-
-       /*
-        * The packet processor detected no more packets.  Clear the receive
-        * interrupt and recheck for a packet that may have arrived
-        * after the previous check and interrupt clear.  If a packet arrived,
-        * force another interrupt.
-        */
-       clear_recv_intr(rcd);
-       present = check_packet_present(rcd);
-       if (present)
-               force_recv_intr(rcd);
-
-       return IRQ_HANDLED;
-}
-
-/*
- * Receive packet thread handler.  This expects to be invoked with the
- * receive interrupt still blocked.
- */
-static irqreturn_t receive_context_thread(int irq, void *data)
-{
-       struct hfi1_ctxtdata *rcd = data;
-       int present;
-
-       /* receive interrupt is still blocked from the IRQ handler */
-       (void)rcd->do_interrupt(rcd, 1);
-
-       /*
-        * The packet processor will only return if it detected no more
-        * packets.  Hold IRQs here so we can safely clear the interrupt and
-        * recheck for a packet that may have arrived after the previous
-        * check and the interrupt clear.  If a packet arrived, force another
-        * interrupt.
-        */
-       local_irq_disable();
-       clear_recv_intr(rcd);
-       present = check_packet_present(rcd);
-       if (present)
-               force_recv_intr(rcd);
-       local_irq_enable();
-
-       return IRQ_HANDLED;
-}
-
-/* ========================================================================= */
-
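-/* read the physical link state currently reported by the 8051 */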
-u32 read_physical_state(struct hfi1_devdata *dd)
-{
-       u64 reg;
-
-       reg = read_csr(dd, DC_DC8051_STS_CUR_STATE);
-       return (reg >> DC_DC8051_STS_CUR_STATE_PORT_SHIFT)
-                               & DC_DC8051_STS_CUR_STATE_PORT_MASK;
-}
-
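-/* read the logical link state from the DCC port configuration CSR */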
-u32 read_logical_state(struct hfi1_devdata *dd)
-{
-       u64 reg;
-
-       reg = read_csr(dd, DCC_CFG_PORT_CONFIG);
-       return (reg >> DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT)
-                               & DCC_CFG_PORT_CONFIG_LINK_STATE_MASK;
-}
-
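-/* update only the link state field of the DCC port configuration CSR */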
-static void set_logical_state(struct hfi1_devdata *dd, u32 chip_lstate)
-{
-       u64 reg;
-
-       reg = read_csr(dd, DCC_CFG_PORT_CONFIG);
-       /* clear current state, set new state */
-       reg &= ~DCC_CFG_PORT_CONFIG_LINK_STATE_SMASK;
-       reg |= (u64)chip_lstate << DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT;
-       write_csr(dd, DCC_CFG_PORT_CONFIG, reg);
-}
-
-/*
- * Use the 8051 to read an LCB CSR.
- */
-static int read_lcb_via_8051(struct hfi1_devdata *dd, u32 addr, u64 *data)
-{
-       u32 regno;
-       int ret;
-
-       if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
-               if (acquire_lcb_access(dd, 0) == 0) {
-                       *data = read_csr(dd, addr);
-                       release_lcb_access(dd, 0);
-                       return 0;
-               }
-               return -EBUSY;
-       }
-
-       /* register is an index of LCB registers: (offset - base) / 8 */
-       regno = (addr - DC_LCB_CFG_RUN) >> 3;
-       ret = do_8051_command(dd, HCMD_READ_LCB_CSR, regno, data);
-       if (ret != HCMD_SUCCESS)
-               return -EBUSY;
-       return 0;
-}
-
-/*
- * Read an LCB CSR.  Access may not be in host control, so check.
- * Return 0 on success, -EBUSY on failure.
- */
-int read_lcb_csr(struct hfi1_devdata *dd, u32 addr, u64 *data)
-{
-       struct hfi1_pportdata *ppd = dd->pport;
-
-       /* if up, go through the 8051 for the value */
-       if (ppd->host_link_state & HLS_UP)
-               return read_lcb_via_8051(dd, addr, data);
-       /* if going up or down, no access */
-       if (ppd->host_link_state & (HLS_GOING_UP | HLS_GOING_OFFLINE))
-               return -EBUSY;
-       /* otherwise, host has access */
-       *data = read_csr(dd, addr);
-       return 0;
-}
-
-/*
- * Use the 8051 to write an LCB CSR.
- */
-static int write_lcb_via_8051(struct hfi1_devdata *dd, u32 addr, u64 data)
-{
-       u32 regno;
-       int ret;
-
-       if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR ||
-           (dd->dc8051_ver < dc8051_ver(0, 20))) {
-               if (acquire_lcb_access(dd, 0) == 0) {
-                       write_csr(dd, addr, data);
-                       release_lcb_access(dd, 0);
-                       return 0;
-               }
-               return -EBUSY;
-       }
-
-       /* register is an index of LCB registers: (offset - base) / 8 */
-       regno = (addr - DC_LCB_CFG_RUN) >> 3;
-       ret = do_8051_command(dd, HCMD_WRITE_LCB_CSR, regno, &data);
-       if (ret != HCMD_SUCCESS)
-               return -EBUSY;
-       return 0;
-}
-
-/*
- * Write an LCB CSR.  Access may not be in host control, so check.
- * Return 0 on success, -EBUSY on failure.
- */
-int write_lcb_csr(struct hfi1_devdata *dd, u32 addr, u64 data)
-{
-       struct hfi1_pportdata *ppd = dd->pport;
-
-       /* if up, go through the 8051 for the value */
-       if (ppd->host_link_state & HLS_UP)
-               return write_lcb_via_8051(dd, addr, data);
-       /* if going up or down, no access */
-       if (ppd->host_link_state & (HLS_GOING_UP | HLS_GOING_OFFLINE))
-               return -EBUSY;
-       /* otherwise, host has access */
-       write_csr(dd, addr, data);
-       return 0;
-}
-
-/*
- * Returns:
- *     < 0 = Linux error, not able to get access
- *     > 0 = 8051 command RETURN_CODE
- */
-static int do_8051_command(
-       struct hfi1_devdata *dd,
-       u32 type,
-       u64 in_data,
-       u64 *out_data)
-{
-       u64 reg, completed;
-       int return_code;
-       unsigned long flags;
-       unsigned long timeout;
-
-       hfi1_cdbg(DC8051, "type %d, data 0x%012llx", type, in_data);
-
-       /*
-        * Alternative to holding the lock for a long time: keep a
-        * busy wait and have other users bounce off.
-        */
-       spin_lock_irqsave(&dd->dc8051_lock, flags);
-
-       /* We can't send any commands to the 8051 if it's in reset */
-       if (dd->dc_shutdown) {
-               return_code = -ENODEV;
-               goto fail;
-       }
-
-       /*
-        * If an 8051 host command timed out previously, then the 8051 is
-        * stuck.
-        *
-        * On first timeout, attempt to reset and restart the entire DC
-        * block (including 8051). (Is this too big of a hammer?)
-        *
-        * If the 8051 times out a second time, the reset did not bring it
-        * back to healthy life. In that case, fail any subsequent commands.
-        */
-       if (dd->dc8051_timed_out) {
-               if (dd->dc8051_timed_out > 1) {
-                       dd_dev_err(dd,
-                                  "Previous 8051 host command timed out, skipping command %u\n",
-                                  type);
-                       return_code = -ENXIO;
-                       goto fail;
-               }
-               spin_unlock_irqrestore(&dd->dc8051_lock, flags);
-               dc_shutdown(dd);
-               dc_start(dd);
-               spin_lock_irqsave(&dd->dc8051_lock, flags);
-       }
-
-       /*
-        * If there is no timeout, then the 8051 command interface is
-        * waiting for a command.
-        */
-
-       /*
-        * When writing an LCB CSR, out_data contains the full value to
-        * be written, while in_data contains the relative LCB
-        * address in 7:0.  Do the work here, rather than in the caller,
-        * of distributing the write data to where it needs to go:
-        *
-        * Write data
-        *   39:00 -> in_data[47:8]
-        *   47:40 -> DC8051_CFG_EXT_DEV_0.RETURN_CODE
-        *   63:48 -> DC8051_CFG_EXT_DEV_0.RSP_DATA
-        */
-       if (type == HCMD_WRITE_LCB_CSR) {
-               in_data |= ((*out_data) & 0xffffffffffull) << 8;
-               reg = ((((*out_data) >> 40) & 0xff) <<
-                               DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT)
-                     | ((((*out_data) >> 48) & 0xffff) <<
-                               DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT);
-               write_csr(dd, DC_DC8051_CFG_EXT_DEV_0, reg);
-       }
-
-       /*
-        * Do two writes: the first to stabilize the type and req_data, the
-        * second to activate.
-        */
-       reg = ((u64)type & DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_MASK)
-                       << DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_SHIFT
-               | (in_data & DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_MASK)
-                       << DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_SHIFT;
-       write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, reg);
-       reg |= DC_DC8051_CFG_HOST_CMD_0_REQ_NEW_SMASK;
-       write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, reg);
-
-       /* wait for completion, alternate: interrupt */
-       timeout = jiffies + msecs_to_jiffies(DC8051_COMMAND_TIMEOUT);
-       while (1) {
-               reg = read_csr(dd, DC_DC8051_CFG_HOST_CMD_1);
-               completed = reg & DC_DC8051_CFG_HOST_CMD_1_COMPLETED_SMASK;
-               if (completed)
-                       break;
-               if (time_after(jiffies, timeout)) {
-                       dd->dc8051_timed_out++;
-                       dd_dev_err(dd, "8051 host command %u timeout\n", type);
-                       if (out_data)
-                               *out_data = 0;
-                       return_code = -ETIMEDOUT;
-                       goto fail;
-               }
-               udelay(2);
-       }
-
-       if (out_data) {
-               *out_data = (reg >> DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_SHIFT)
-                               & DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_MASK;
-               if (type == HCMD_READ_LCB_CSR) {
-                       /* top 16 bits are in a different register */
-                       *out_data |= (read_csr(dd, DC_DC8051_CFG_EXT_DEV_1)
-                               & DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SMASK)
-                               << (48
-                                   - DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT);
-               }
-       }
-       return_code = (reg >> DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_SHIFT)
-                               & DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_MASK;
-       dd->dc8051_timed_out = 0;
-       /*
-        * Clear command for next user.
-        */
-       write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, 0);
-
-fail:
-       spin_unlock_irqrestore(&dd->dc8051_lock, flags);
-
-       return return_code;
-}
-
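-/* ask the 8051 to change the physical link state */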
-static int set_physical_link_state(struct hfi1_devdata *dd, u64 state)
-{
-       return do_8051_command(dd, HCMD_CHANGE_PHY_STATE, state, NULL);
-}
-
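-/*
- * Write a 32-bit value into the given 8051 configuration field for the
- * given lane.  An error is logged if the 8051 command does not succeed.
- */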
-int load_8051_config(struct hfi1_devdata *dd, u8 field_id,
-                    u8 lane_id, u32 config_data)
-{
-       u64 data;
-       int ret;
-
-       data = (u64)field_id << LOAD_DATA_FIELD_ID_SHIFT
-               | (u64)lane_id << LOAD_DATA_LANE_ID_SHIFT
-               | (u64)config_data << LOAD_DATA_DATA_SHIFT;
-       ret = do_8051_command(dd, HCMD_LOAD_CONFIG_DATA, data, NULL);
-       if (ret != HCMD_SUCCESS) {
-               dd_dev_err(dd,
-                          "load 8051 config: field id %d, lane %d, err %d\n",
-                          (int)field_id, (int)lane_id, ret);
-       }
-       return ret;
-}
-
-/*
- * Read the 8051 firmware "registers".  Use the RAM directly.  Always
- * set the result, even on error.
- * Return 0 on success, -errno on failure
- */
-int read_8051_config(struct hfi1_devdata *dd, u8 field_id, u8 lane_id,
-                    u32 *result)
-{
-       u64 big_data;
-       u32 addr;
-       int ret;
-
-       /* address start depends on the lane_id */
-       if (lane_id < 4)
-               addr = (4 * NUM_GENERAL_FIELDS)
-                       + (lane_id * 4 * NUM_LANE_FIELDS);
-       else
-               addr = 0;
-       addr += field_id * 4;
-
-       /* read is in 8-byte chunks, hardware will truncate the address down */
-       ret = read_8051_data(dd, addr, 8, &big_data);
-
-       if (ret == 0) {
-               /* extract the 4 bytes we want */
-               if (addr & 0x4)
-                       *result = (u32)(big_data >> 32);
-               else
-                       *result = (u32)big_data;
-       } else {
-               *result = 0;
-               dd_dev_err(dd, "%s: direct read failed, lane %d, field %d!\n",
-                          __func__, lane_id, field_id);
-       }
-
-       return ret;
-}
-
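-/*
- * Pack the local PHY verify capability fields (power management and
- * continuous remote update support) and load them into the 8051.
- */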
-static int write_vc_local_phy(struct hfi1_devdata *dd, u8 power_management,
-                             u8 continuous)
-{
-       u32 frame;
-
-       frame = continuous << CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT
-               | power_management << POWER_MANAGEMENT_SHIFT;
-       return load_8051_config(dd, VERIFY_CAP_LOCAL_PHY,
-                               GENERAL_CONFIG, frame);
-}
-
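-/*
- * Pack the local fabric verify capability fields (vAU, Z, vCU, VL15
- * credits, CRC sizes) and load them into the 8051.
- */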
-static int write_vc_local_fabric(struct hfi1_devdata *dd, u8 vau, u8 z, u8 vcu,
-                                u16 vl15buf, u8 crc_sizes)
-{
-       u32 frame;
-
-       frame = (u32)vau << VAU_SHIFT
-               | (u32)z << Z_SHIFT
-               | (u32)vcu << VCU_SHIFT
-               | (u32)vl15buf << VL15BUF_SHIFT
-               | (u32)crc_sizes << CRC_SIZES_SHIFT;
-       return load_8051_config(dd, VERIFY_CAP_LOCAL_FABRIC,
-                               GENERAL_CONFIG, frame);
-}
-
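-/*
- * Unpack the local link width verify capability frame into its misc
- * bits, flag bits, and link width fields.
- */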
-static void read_vc_local_link_width(struct hfi1_devdata *dd, u8 *misc_bits,
-                                    u8 *flag_bits, u16 *link_widths)
-{
-       u32 frame;
-
-       read_8051_config(dd, VERIFY_CAP_LOCAL_LINK_WIDTH, GENERAL_CONFIG,
-                        &frame);
-       *misc_bits = (frame >> MISC_CONFIG_BITS_SHIFT) & MISC_CONFIG_BITS_MASK;
-       *flag_bits = (frame >> LOCAL_FLAG_BITS_SHIFT) & LOCAL_FLAG_BITS_MASK;
-       *link_widths = (frame >> LINK_WIDTH_SHIFT) & LINK_WIDTH_MASK;
-}
-
-static int write_vc_local_link_width(struct hfi1_devdata *dd,
-                                    u8 misc_bits,
-                                    u8 flag_bits,
-                                    u16 link_widths)
-{
-       u32 frame;
-
-       frame = (u32)misc_bits << MISC_CONFIG_BITS_SHIFT
-               | (u32)flag_bits << LOCAL_FLAG_BITS_SHIFT
-               | (u32)link_widths << LINK_WIDTH_SHIFT;
-       return load_8051_config(dd, VERIFY_CAP_LOCAL_LINK_WIDTH, GENERAL_CONFIG,
-                    frame);
-}
-
-static int write_local_device_id(struct hfi1_devdata *dd, u16 device_id,
-                                u8 device_rev)
-{
-       u32 frame;
-
-       frame = ((u32)device_id << LOCAL_DEVICE_ID_SHIFT)
-               | ((u32)device_rev << LOCAL_DEVICE_REV_SHIFT);
-       return load_8051_config(dd, LOCAL_DEVICE_ID, GENERAL_CONFIG, frame);
-}
-
-static void read_remote_device_id(struct hfi1_devdata *dd, u16 *device_id,
-                                 u8 *device_rev)
-{
-       u32 frame;
-
-       read_8051_config(dd, REMOTE_DEVICE_ID, GENERAL_CONFIG, &frame);
-       *device_id = (frame >> REMOTE_DEVICE_ID_SHIFT) & REMOTE_DEVICE_ID_MASK;
-       *device_rev = (frame >> REMOTE_DEVICE_REV_SHIFT)
-                       & REMOTE_DEVICE_REV_MASK;
-}
-
-void read_misc_status(struct hfi1_devdata *dd, u8 *ver_a, u8 *ver_b)
-{
-       u32 frame;
-
-       read_8051_config(dd, MISC_STATUS, GENERAL_CONFIG, &frame);
-       *ver_a = (frame >> STS_FM_VERSION_A_SHIFT) & STS_FM_VERSION_A_MASK;
-       *ver_b = (frame >> STS_FM_VERSION_B_SHIFT) & STS_FM_VERSION_B_MASK;
-}
-
-static void read_vc_remote_phy(struct hfi1_devdata *dd, u8 *power_management,
-                              u8 *continuous)
-{
-       u32 frame;
-
-       read_8051_config(dd, VERIFY_CAP_REMOTE_PHY, GENERAL_CONFIG, &frame);
-       *power_management = (frame >> POWER_MANAGEMENT_SHIFT)
-                                       & POWER_MANAGEMENT_MASK;
-       *continuous = (frame >> CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT)
-                                       & CONTINIOUS_REMOTE_UPDATE_SUPPORT_MASK;
-}
-
-static void read_vc_remote_fabric(struct hfi1_devdata *dd, u8 *vau, u8 *z,
-                                 u8 *vcu, u16 *vl15buf, u8 *crc_sizes)
-{
-       u32 frame;
-
-       read_8051_config(dd, VERIFY_CAP_REMOTE_FABRIC, GENERAL_CONFIG, &frame);
-       *vau = (frame >> VAU_SHIFT) & VAU_MASK;
-       *z = (frame >> Z_SHIFT) & Z_MASK;
-       *vcu = (frame >> VCU_SHIFT) & VCU_MASK;
-       *vl15buf = (frame >> VL15BUF_SHIFT) & VL15BUF_MASK;
-       *crc_sizes = (frame >> CRC_SIZES_SHIFT) & CRC_SIZES_MASK;
-}
-
-static void read_vc_remote_link_width(struct hfi1_devdata *dd,
-                                     u8 *remote_tx_rate,
-                                     u16 *link_widths)
-{
-       u32 frame;
-
-       read_8051_config(dd, VERIFY_CAP_REMOTE_LINK_WIDTH, GENERAL_CONFIG,
-                        &frame);
-       *remote_tx_rate = (frame >> REMOTE_TX_RATE_SHIFT)
-                               & REMOTE_TX_RATE_MASK;
-       *link_widths = (frame >> LINK_WIDTH_SHIFT) & LINK_WIDTH_MASK;
-}
-
-static void read_local_lni(struct hfi1_devdata *dd, u8 *enable_lane_rx)
-{
-       u32 frame;
-
-       read_8051_config(dd, LOCAL_LNI_INFO, GENERAL_CONFIG, &frame);
-       *enable_lane_rx = (frame >> ENABLE_LANE_RX_SHIFT) & ENABLE_LANE_RX_MASK;
-}
-
-static void read_mgmt_allowed(struct hfi1_devdata *dd, u8 *mgmt_allowed)
-{
-       u32 frame;
-
-       read_8051_config(dd, REMOTE_LNI_INFO, GENERAL_CONFIG, &frame);
-       *mgmt_allowed = (frame >> MGMT_ALLOWED_SHIFT) & MGMT_ALLOWED_MASK;
-}
-
-static void read_last_local_state(struct hfi1_devdata *dd, u32 *lls)
-{
-       read_8051_config(dd, LAST_LOCAL_STATE_COMPLETE, GENERAL_CONFIG, lls);
-}
-
-static void read_last_remote_state(struct hfi1_devdata *dd, u32 *lrs)
-{
-       read_8051_config(dd, LAST_REMOTE_STATE_COMPLETE, GENERAL_CONFIG, lrs);
-}
-
-void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality)
-{
-       u32 frame;
-       int ret;
-
-       *link_quality = 0;
-       if (dd->pport->host_link_state & HLS_UP) {
-               ret = read_8051_config(dd, LINK_QUALITY_INFO, GENERAL_CONFIG,
-                                      &frame);
-               if (ret == 0)
-                       *link_quality = (frame >> LINK_QUALITY_SHIFT)
-                                               & LINK_QUALITY_MASK;
-       }
-}
-
-static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc)
-{
-       u32 frame;
-
-       read_8051_config(dd, LINK_QUALITY_INFO, GENERAL_CONFIG, &frame);
-       *pdrrc = (frame >> DOWN_REMOTE_REASON_SHIFT) & DOWN_REMOTE_REASON_MASK;
-}
-
-static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr)
-{
-       u32 frame;
-
-       read_8051_config(dd, LINK_DOWN_REASON, GENERAL_CONFIG, &frame);
-       *ldr = (frame & 0xff);
-}
-
-static int read_tx_settings(struct hfi1_devdata *dd,
-                           u8 *enable_lane_tx,
-                           u8 *tx_polarity_inversion,
-                           u8 *rx_polarity_inversion,
-                           u8 *max_rate)
-{
-       u32 frame;
-       int ret;
-
-       ret = read_8051_config(dd, TX_SETTINGS, GENERAL_CONFIG, &frame);
-       *enable_lane_tx = (frame >> ENABLE_LANE_TX_SHIFT)
-                               & ENABLE_LANE_TX_MASK;
-       *tx_polarity_inversion = (frame >> TX_POLARITY_INVERSION_SHIFT)
-                               & TX_POLARITY_INVERSION_MASK;
-       *rx_polarity_inversion = (frame >> RX_POLARITY_INVERSION_SHIFT)
-                               & RX_POLARITY_INVERSION_MASK;
-       *max_rate = (frame >> MAX_RATE_SHIFT) & MAX_RATE_MASK;
-       return ret;
-}
-
-static int write_tx_settings(struct hfi1_devdata *dd,
-                            u8 enable_lane_tx,
-                            u8 tx_polarity_inversion,
-                            u8 rx_polarity_inversion,
-                            u8 max_rate)
-{
-       u32 frame;
-
-       /* no need to mask, all variable sizes match field widths */
-       frame = enable_lane_tx << ENABLE_LANE_TX_SHIFT
-               | tx_polarity_inversion << TX_POLARITY_INVERSION_SHIFT
-               | rx_polarity_inversion << RX_POLARITY_INVERSION_SHIFT
-               | max_rate << MAX_RATE_SHIFT;
-       return load_8051_config(dd, TX_SETTINGS, GENERAL_CONFIG, frame);
-}
-
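-/*
- * Log the firmware version and product ID reported for each of the
- * four fabric lanes.
- */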
-static void check_fabric_firmware_versions(struct hfi1_devdata *dd)
-{
-       u32 frame, version, prod_id;
-       int ret, lane;
-
-       /* 4 lanes */
-       for (lane = 0; lane < 4; lane++) {
-               ret = read_8051_config(dd, SPICO_FW_VERSION, lane, &frame);
-               if (ret) {
-                       dd_dev_err(dd,
-                                  "Unable to read lane %d firmware details\n",
-                                  lane);
-                       continue;
-               }
-               version = (frame >> SPICO_ROM_VERSION_SHIFT)
-                                       & SPICO_ROM_VERSION_MASK;
-               prod_id = (frame >> SPICO_ROM_PROD_ID_SHIFT)
-                                       & SPICO_ROM_PROD_ID_MASK;
-               dd_dev_info(dd,
-                           "Lane %d firmware: version 0x%04x, prod_id 0x%04x\n",
-                           lane, version, prod_id);
-       }
-}
-
-/*
- * Read an idle LCB message.
- *
- * Returns 0 on success, -EINVAL on error
- */
-static int read_idle_message(struct hfi1_devdata *dd, u64 type, u64 *data_out)
-{
-       int ret;
-
-       ret = do_8051_command(dd, HCMD_READ_LCB_IDLE_MSG, type, data_out);
-       if (ret != HCMD_SUCCESS) {
-               dd_dev_err(dd, "read idle message: type %d, err %d\n",
-                          (u32)type, ret);
-               return -EINVAL;
-       }
-       dd_dev_info(dd, "%s: read idle message 0x%llx\n", __func__, *data_out);
-       /* return only the payload as we already know the type */
-       *data_out >>= IDLE_PAYLOAD_SHIFT;
-       return 0;
-}
-
-/*
- * Read an idle SMA message.  To be done in response to a notification from
- * the 8051.
- *
- * Returns 0 on success, -EINVAL on error
- */
-static int read_idle_sma(struct hfi1_devdata *dd, u64 *data)
-{
-       return read_idle_message(dd, (u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT,
-                                data);
-}
-
-/*
- * Send an idle LCB message.
- *
- * Returns 0 on success, -EINVAL on error
- */
-static int send_idle_message(struct hfi1_devdata *dd, u64 data)
-{
-       int ret;
-
-       dd_dev_info(dd, "%s: sending idle message 0x%llx\n", __func__, data);
-       ret = do_8051_command(dd, HCMD_SEND_LCB_IDLE_MSG, data, NULL);
-       if (ret != HCMD_SUCCESS) {
-               dd_dev_err(dd, "send idle message: data 0x%llx, err %d\n",
-                          data, ret);
-               return -EINVAL;
-       }
-       return 0;
-}
-
-/*
- * Send an idle SMA message.
- *
- * Returns 0 on success, -EINVAL on error
- */
-int send_idle_sma(struct hfi1_devdata *dd, u64 message)
-{
-       u64 data;
-
-       data = ((message & IDLE_PAYLOAD_MASK) << IDLE_PAYLOAD_SHIFT) |
-               ((u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT);
-       return send_idle_message(dd, data);
-}
-
-/*
- * Initialize the LCB then do a quick link up.  This may or may not be
- * in loopback.
- *
- * return 0 on success, -errno on error
- */
-static int do_quick_linkup(struct hfi1_devdata *dd)
-{
-       u64 reg;
-       unsigned long timeout;
-       int ret;
-
-       lcb_shutdown(dd, 0);
-
-       if (loopback) {
-               /* LCB_CFG_LOOPBACK.VAL = 2 */
-               /* LCB_CFG_LANE_WIDTH.VAL = 0 */
-               write_csr(dd, DC_LCB_CFG_LOOPBACK,
-                         IB_PACKET_TYPE << DC_LCB_CFG_LOOPBACK_VAL_SHIFT);
-               write_csr(dd, DC_LCB_CFG_LANE_WIDTH, 0);
-       }
-
-       /* start the LCBs */
-       /* LCB_CFG_TX_FIFOS_RESET.VAL = 0 */
-       write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0);
-
-       /* simulator only loopback steps */
-       if (loopback && dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
-               /* LCB_CFG_RUN.EN = 1 */
-               write_csr(dd, DC_LCB_CFG_RUN,
-                         1ull << DC_LCB_CFG_RUN_EN_SHIFT);
-
-               /* watch LCB_STS_LINK_TRANSFER_ACTIVE */
-               timeout = jiffies + msecs_to_jiffies(10);
-               while (1) {
-                       reg = read_csr(dd, DC_LCB_STS_LINK_TRANSFER_ACTIVE);
-                       if (reg)
-                               break;
-                       if (time_after(jiffies, timeout)) {
-                               dd_dev_err(dd,
-                                          "timeout waiting for LINK_TRANSFER_ACTIVE\n");
-                               return -ETIMEDOUT;
-                       }
-                       udelay(2);
-               }
-
-               write_csr(dd, DC_LCB_CFG_ALLOW_LINK_UP,
-                         1ull << DC_LCB_CFG_ALLOW_LINK_UP_VAL_SHIFT);
-       }
-
-       if (!loopback) {
-               /*
-                * When doing quick linkup and not in loopback, both
-                * sides must be done with LCB set-up before either
-                * starts the quick linkup.  Put a delay here so that
-                * both sides can be started and have a chance to be
-                * done with LCB set up before resuming.
-                */
-               dd_dev_err(dd,
-                          "Pausing for peer to be finished with LCB set up\n");
-               msleep(5000);
-               dd_dev_err(dd, "Continuing with quick linkup\n");
-       }
-
-       write_csr(dd, DC_LCB_ERR_EN, 0); /* mask LCB errors */
-       set_8051_lcb_access(dd);
-
-       /*
-        * State "quick" LinkUp request sets the physical link state to
-        * LinkUp without a verify capability sequence.
-        * This state is in simulator v37 and later.
-        */
-       ret = set_physical_link_state(dd, PLS_QUICK_LINKUP);
-       if (ret != HCMD_SUCCESS) {
-               dd_dev_err(dd,
-                          "%s: set physical link state to quick LinkUp failed with return %d\n",
-                          __func__, ret);
-
-               set_host_lcb_access(dd);
-               write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */
-
-               if (ret >= 0)
-                       ret = -EINVAL;
-               return ret;
-       }
-
-       return 0; /* success */
-}
-
-/*
- * Set the SerDes to internal loopback mode.
- * Returns 0 on success, -errno on error.
- */
-static int set_serdes_loopback_mode(struct hfi1_devdata *dd)
-{
-       int ret;
-
-       ret = set_physical_link_state(dd, PLS_INTERNAL_SERDES_LOOPBACK);
-       if (ret == HCMD_SUCCESS)
-               return 0;
-       dd_dev_err(dd,
-                  "Set physical link state to SerDes Loopback failed with return %d\n",
-                  ret);
-       if (ret >= 0)
-               ret = -EINVAL;
-       return ret;
-}
-
-/*
- * Do all special steps to set up loopback.
- */
-static int init_loopback(struct hfi1_devdata *dd)
-{
-       dd_dev_info(dd, "Entering loopback mode\n");
-
-       /* all loopbacks should disable self GUID check */
-       write_csr(dd, DC_DC8051_CFG_MODE,
-                 (read_csr(dd, DC_DC8051_CFG_MODE) | DISABLE_SELF_GUID_CHECK));
-
-       /*
-        * The simulator has only one loopback option - LCB.  Switch
-        * to that option, which includes quick link up.
-        *
-        * Accept all valid loopback values.
-        */
-       if ((dd->icode == ICODE_FUNCTIONAL_SIMULATOR) &&
-           (loopback == LOOPBACK_SERDES || loopback == LOOPBACK_LCB ||
-            loopback == LOOPBACK_CABLE)) {
-               loopback = LOOPBACK_LCB;
-               quick_linkup = 1;
-               return 0;
-       }
-
-       /* handle serdes loopback */
-       if (loopback == LOOPBACK_SERDES) {
-               /* internal serdes loopback needs quick linkup on RTL */
-               if (dd->icode == ICODE_RTL_SILICON)
-                       quick_linkup = 1;
-               return set_serdes_loopback_mode(dd);
-       }
-
-       /* LCB loopback - handled at poll time */
-       if (loopback == LOOPBACK_LCB) {
-               quick_linkup = 1; /* LCB is always quick linkup */
-
-               /* not supported in emulation due to emulation RTL changes */
-               if (dd->icode == ICODE_FPGA_EMULATION) {
-                       dd_dev_err(dd,
-                                  "LCB loopback not supported in emulation\n");
-                       return -EINVAL;
-               }
-               return 0;
-       }
-
-       /* external cable loopback requires no extra steps */
-       if (loopback == LOOPBACK_CABLE)
-               return 0;
-
-       dd_dev_err(dd, "Invalid loopback mode %d\n", loopback);
-       return -EINVAL;
-}
-
-/*
- * Translate from the OPA_LINK_WIDTH handed to us by the FM to bits
- * used in the Verify Capability link width attribute.
- */
-static u16 opa_to_vc_link_widths(u16 opa_widths)
-{
-       int i;
-       u16 result = 0;
-
-       static const struct link_bits {
-               u16 from;
-               u16 to;
-       } opa_link_xlate[] = {
-               { OPA_LINK_WIDTH_1X, 1 << (1 - 1)  },
-               { OPA_LINK_WIDTH_2X, 1 << (2 - 1)  },
-               { OPA_LINK_WIDTH_3X, 1 << (3 - 1)  },
-               { OPA_LINK_WIDTH_4X, 1 << (4 - 1)  },
-       };
-
-       for (i = 0; i < ARRAY_SIZE(opa_link_xlate); i++) {
-               if (opa_widths & opa_link_xlate[i].from)
-                       result |= opa_link_xlate[i].to;
-       }
-       return result;
-}
-
-/*
- * Set link attributes before moving to polling.
- */
-static int set_local_link_attributes(struct hfi1_pportdata *ppd)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       u8 enable_lane_tx;
-       u8 tx_polarity_inversion;
-       u8 rx_polarity_inversion;
-       int ret;
-
-       /* reset our fabric serdes to clear any lingering problems */
-       fabric_serdes_reset(dd);
-
-       /* set the local tx rate - need to read-modify-write */
-       ret = read_tx_settings(dd, &enable_lane_tx, &tx_polarity_inversion,
-                              &rx_polarity_inversion, &ppd->local_tx_rate);
-       if (ret)
-               goto set_local_link_attributes_fail;
-
-       if (dd->dc8051_ver < dc8051_ver(0, 20)) {
-               /* set the tx rate to the fastest enabled */
-               if (ppd->link_speed_enabled & OPA_LINK_SPEED_25G)
-                       ppd->local_tx_rate = 1;
-               else
-                       ppd->local_tx_rate = 0;
-       } else {
-               /* set the tx rate to all enabled */
-               ppd->local_tx_rate = 0;
-               if (ppd->link_speed_enabled & OPA_LINK_SPEED_25G)
-                       ppd->local_tx_rate |= 2;
-               if (ppd->link_speed_enabled & OPA_LINK_SPEED_12_5G)
-                       ppd->local_tx_rate |= 1;
-       }
-
-       enable_lane_tx = 0xF; /* enable all four lanes */
-       ret = write_tx_settings(dd, enable_lane_tx, tx_polarity_inversion,
-                               rx_polarity_inversion, ppd->local_tx_rate);
-       if (ret != HCMD_SUCCESS)
-               goto set_local_link_attributes_fail;
-
-       /*
-        * DC supports continuous updates.
-        */
-       ret = write_vc_local_phy(dd,
-                                0 /* no power management */,
-                                1 /* continuous updates */);
-       if (ret != HCMD_SUCCESS)
-               goto set_local_link_attributes_fail;
-
-       /* z=1 in the next call: AU of 0 is not supported by the hardware */
-       ret = write_vc_local_fabric(dd, dd->vau, 1, dd->vcu, dd->vl15_init,
-                                   ppd->port_crc_mode_enabled);
-       if (ret != HCMD_SUCCESS)
-               goto set_local_link_attributes_fail;
-
-       ret = write_vc_local_link_width(dd, 0, 0,
-                                       opa_to_vc_link_widths(
-                                               ppd->link_width_enabled));
-       if (ret != HCMD_SUCCESS)
-               goto set_local_link_attributes_fail;
-
-       /* let peer know who we are */
-       ret = write_local_device_id(dd, dd->pcidev->device, dd->minrev);
-       if (ret == HCMD_SUCCESS)
-               return 0;
-
-set_local_link_attributes_fail:
-       dd_dev_err(dd,
-                  "Failed to set local link attributes, return 0x%x\n",
-                  ret);
-       return ret;
-}
-
-/*
- * Call this to start the link.
- * Do not do anything if the link is disabled.
- * Returns 0 if link is disabled, moved to polling, or the driver is not ready.
- */
-int start_link(struct hfi1_pportdata *ppd)
-{
-       if (!ppd->link_enabled) {
-               dd_dev_info(ppd->dd,
-                           "%s: stopping link start because link is disabled\n",
-                           __func__);
-               return 0;
-       }
-       if (!ppd->driver_link_ready) {
-               dd_dev_info(ppd->dd,
-                           "%s: stopping link start because driver is not ready\n",
-                           __func__);
-               return 0;
-       }
-
-       return set_link_state(ppd, HLS_DN_POLL);
-}
-
-static void wait_for_qsfp_init(struct hfi1_pportdata *ppd)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       u64 mask;
-       unsigned long timeout;
-
-       /*
-        * Check for QSFP interrupt for t_init (SFF 8679)
-        */
-       timeout = jiffies + msecs_to_jiffies(2000);
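-       /* INT_N is active low: a cleared bit means the module asserted IntL, i.e. init is done */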
-       while (1) {
-               mask = read_csr(dd, dd->hfi1_id ?
-                               ASIC_QSFP2_IN : ASIC_QSFP1_IN);
-               if (!(mask & QSFP_HFI0_INT_N)) {
-                       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_CLEAR :
-                                 ASIC_QSFP1_CLEAR, QSFP_HFI0_INT_N);
-                       break;
-               }
-               if (time_after(jiffies, timeout)) {
-                       dd_dev_info(dd, "%s: No IntN detected, reset complete\n",
-                                   __func__);
-                       break;
-               }
-               udelay(2);
-       }
-}
-
-static void set_qsfp_int_n(struct hfi1_pportdata *ppd, u8 enable)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       u64 mask;
-
-       mask = read_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK);
-       if (enable)
-               mask |= (u64)QSFP_HFI0_INT_N;
-       else
-               mask &= ~(u64)QSFP_HFI0_INT_N;
-       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK, mask);
-}
-
-void reset_qsfp(struct hfi1_pportdata *ppd)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       u64 mask, qsfp_mask;
-
-       /* Disable INT_N from triggering QSFP interrupts */
-       set_qsfp_int_n(ppd, 0);
-
-       /* Reset the QSFP */
-       mask = (u64)QSFP_HFI0_RESET_N;
-       qsfp_mask = read_csr(dd, dd->hfi1_id ? ASIC_QSFP2_OE : ASIC_QSFP1_OE);
-       qsfp_mask |= mask;
-       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_OE : ASIC_QSFP1_OE, qsfp_mask);
-
-       qsfp_mask = read_csr(dd,
-                            dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT);
-       qsfp_mask &= ~mask;
-       write_csr(dd,
-                 dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT, qsfp_mask);
-
-       udelay(10);
-
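-       /* release RESET_N (active low) to bring the module out of reset */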
-       qsfp_mask |= mask;
-       write_csr(dd,
-                 dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT, qsfp_mask);
-
-       wait_for_qsfp_init(ppd);
-
-       /*
-        * Allow INT_N to trigger the QSFP interrupt to watch
-        * for alarms and warnings
-        */
-       set_qsfp_int_n(ppd, 1);
-}
-
-static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd,
-                                       u8 *qsfp_interrupt_status)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-
-       if ((qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_ALARM) ||
-           (qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_WARNING))
-               dd_dev_info(dd, "%s: QSFP cable temperature too high\n",
-                           __func__);
-
-       if ((qsfp_interrupt_status[0] & QSFP_LOW_TEMP_ALARM) ||
-           (qsfp_interrupt_status[0] & QSFP_LOW_TEMP_WARNING))
-               dd_dev_info(dd, "%s: QSFP cable temperature too low\n",
-                           __func__);
-
-       if ((qsfp_interrupt_status[1] & QSFP_HIGH_VCC_ALARM) ||
-           (qsfp_interrupt_status[1] & QSFP_HIGH_VCC_WARNING))
-               dd_dev_info(dd, "%s: QSFP supply voltage too high\n",
-                           __func__);
-
-       if ((qsfp_interrupt_status[1] & QSFP_LOW_VCC_ALARM) ||
-           (qsfp_interrupt_status[1] & QSFP_LOW_VCC_WARNING))
-               dd_dev_info(dd, "%s: QSFP supply voltage too low\n",
-                           __func__);
-
-       /* Byte 2 is vendor specific */
-
-       if ((qsfp_interrupt_status[3] & QSFP_HIGH_POWER_ALARM) ||
-           (qsfp_interrupt_status[3] & QSFP_HIGH_POWER_WARNING))
-               dd_dev_info(dd, "%s: Cable RX channel 1/2 power too high\n",
-                           __func__);
-
-       if ((qsfp_interrupt_status[3] & QSFP_LOW_POWER_ALARM) ||
-           (qsfp_interrupt_status[3] & QSFP_LOW_POWER_WARNING))
-               dd_dev_info(dd, "%s: Cable RX channel 1/2 power too low\n",
-                           __func__);
-
-       if ((qsfp_interrupt_status[4] & QSFP_HIGH_POWER_ALARM) ||
-           (qsfp_interrupt_status[4] & QSFP_HIGH_POWER_WARNING))
-               dd_dev_info(dd, "%s: Cable RX channel 3/4 power too high\n",
-                           __func__);
-
-       if ((qsfp_interrupt_status[4] & QSFP_LOW_POWER_ALARM) ||
-           (qsfp_interrupt_status[4] & QSFP_LOW_POWER_WARNING))
-               dd_dev_info(dd, "%s: Cable RX channel 3/4 power too low\n",
-                           __func__);
-
-       if ((qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_ALARM) ||
-           (qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_WARNING))
-               dd_dev_info(dd, "%s: Cable TX channel 1/2 bias too high\n",
-                           __func__);
-
-       if ((qsfp_interrupt_status[5] & QSFP_LOW_BIAS_ALARM) ||
-           (qsfp_interrupt_status[5] & QSFP_LOW_BIAS_WARNING))
-               dd_dev_info(dd, "%s: Cable TX channel 1/2 bias too low\n",
-                           __func__);
-
-       if ((qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_ALARM) ||
-           (qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_WARNING))
-               dd_dev_info(dd, "%s: Cable TX channel 3/4 bias too high\n",
-                           __func__);
-
-       if ((qsfp_interrupt_status[6] & QSFP_LOW_BIAS_ALARM) ||
-           (qsfp_interrupt_status[6] & QSFP_LOW_BIAS_WARNING))
-               dd_dev_info(dd, "%s: Cable TX channel 3/4 bias too low\n",
-                           __func__);
-
-       if ((qsfp_interrupt_status[7] & QSFP_HIGH_POWER_ALARM) ||
-           (qsfp_interrupt_status[7] & QSFP_HIGH_POWER_WARNING))
-               dd_dev_info(dd, "%s: Cable TX channel 1/2 power too high\n",
-                           __func__);
-
-       if ((qsfp_interrupt_status[7] & QSFP_LOW_POWER_ALARM) ||
-           (qsfp_interrupt_status[7] & QSFP_LOW_POWER_WARNING))
-               dd_dev_info(dd, "%s: Cable TX channel 1/2 power too low\n",
-                           __func__);
-
-       if ((qsfp_interrupt_status[8] & QSFP_HIGH_POWER_ALARM) ||
-           (qsfp_interrupt_status[8] & QSFP_HIGH_POWER_WARNING))
-               dd_dev_info(dd, "%s: Cable TX channel 3/4 power too high\n",
-                           __func__);
-
-       if ((qsfp_interrupt_status[8] & QSFP_LOW_POWER_ALARM) ||
-           (qsfp_interrupt_status[8] & QSFP_LOW_POWER_WARNING))
-               dd_dev_info(dd, "%s: Cable TX channel 3/4 power too low\n",
-                           __func__);
-
-       /* Bytes 9-10 and 11-12 are reserved */
-       /* Bytes 13-15 are vendor specific */
-
-       return 0;
-}
-
-/* This routine will only be scheduled if the QSFP module present signal is asserted */
-void qsfp_event(struct work_struct *work)
-{
-       struct qsfp_data *qd;
-       struct hfi1_pportdata *ppd;
-       struct hfi1_devdata *dd;
-
-       qd = container_of(work, struct qsfp_data, qsfp_work);
-       ppd = qd->ppd;
-       dd = ppd->dd;
-
-       /* Sanity check */
-       if (!qsfp_mod_present(ppd))
-               return;
-
-       /*
-        * Turn the DC back on after the cable has been
-        * re-inserted. Up until now, the DC has been in
-        * reset to save power.
-        */
-       dc_start(dd);
-
-       if (qd->cache_refresh_required) {
-               set_qsfp_int_n(ppd, 0);
-
-               wait_for_qsfp_init(ppd);
-
-               /*
-                * Allow INT_N to trigger the QSFP interrupt to watch
-                * for alarms and warnings
-                */
-               set_qsfp_int_n(ppd, 1);
-
-               tune_serdes(ppd);
-
-               start_link(ppd);
-       }
-
-       if (qd->check_interrupt_flags) {
-               u8 qsfp_interrupt_status[16] = {0,};
-
-               if (one_qsfp_read(ppd, dd->hfi1_id, 6,
-                                 &qsfp_interrupt_status[0], 16) != 16) {
-                       dd_dev_info(dd,
-                                   "%s: Failed to read status of QSFP module\n",
-                                   __func__);
-               } else {
-                       unsigned long flags;
-
-                       handle_qsfp_error_conditions(
-                                       ppd, qsfp_interrupt_status);
-                       spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
-                       ppd->qsfp_info.check_interrupt_flags = 0;
-                       spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
-                                              flags);
-               }
-       }
-}
-
-static void init_qsfp_int(struct hfi1_devdata *dd)
-{
-       struct hfi1_pportdata *ppd = dd->pport;
-       u64 qsfp_mask, cce_int_mask;
-       const int qsfp1_int_smask = QSFP1_INT % 64;
-       const int qsfp2_int_smask = QSFP2_INT % 64;
-
-       /*
-        * disable QSFP1 interrupts for HFI1, QSFP2 interrupts for HFI0
-        * Qsfp1Int and Qsfp2Int are adjacent bits in the same CSR,
-        * therefore just one of QSFP1_INT/QSFP2_INT can be used to find
-        * the index of the appropriate CSR in the CCEIntMask CSR array
-        */
-       cce_int_mask = read_csr(dd, CCE_INT_MASK +
-                               (8 * (QSFP1_INT / 64)));
-       if (dd->hfi1_id) {
-               cce_int_mask &= ~((u64)1 << qsfp1_int_smask);
-               write_csr(dd, CCE_INT_MASK + (8 * (QSFP1_INT / 64)),
-                         cce_int_mask);
-       } else {
-               cce_int_mask &= ~((u64)1 << qsfp2_int_smask);
-               write_csr(dd, CCE_INT_MASK + (8 * (QSFP2_INT / 64)),
-                         cce_int_mask);
-       }
-
-       qsfp_mask = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N);
-       /* Clear current status to avoid spurious interrupts */
-       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_CLEAR : ASIC_QSFP1_CLEAR,
-                 qsfp_mask);
-       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK,
-                 qsfp_mask);
-
-       set_qsfp_int_n(ppd, 0);
-
-       /* Handle active low nature of INT_N and MODPRST_N pins */
-       if (qsfp_mod_present(ppd))
-               qsfp_mask &= ~(u64)QSFP_HFI0_MODPRST_N;
-       write_csr(dd,
-                 dd->hfi1_id ? ASIC_QSFP2_INVERT : ASIC_QSFP1_INVERT,
-                 qsfp_mask);
-}
-
-/*
- * Do a one-time initialize of the LCB block.
- */
-static void init_lcb(struct hfi1_devdata *dd)
-{
-       /* simulator does not correctly handle LCB cclk loopback, skip */
-       if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
-               return;
-
-       /* the DC has been reset earlier in the driver load */
-
-       /* set LCB for cclk loopback on the port */
-       write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0x01);
-       write_csr(dd, DC_LCB_CFG_LANE_WIDTH, 0x00);
-       write_csr(dd, DC_LCB_CFG_REINIT_AS_SLAVE, 0x00);
-       write_csr(dd, DC_LCB_CFG_CNT_FOR_SKIP_STALL, 0x110);
-       write_csr(dd, DC_LCB_CFG_CLK_CNTR, 0x08);
-       write_csr(dd, DC_LCB_CFG_LOOPBACK, 0x02);
-       write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0x00);
-}
-
-int bringup_serdes(struct hfi1_pportdata *ppd)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       u64 guid;
-       int ret;
-
-       if (HFI1_CAP_IS_KSET(EXTENDED_PSN))
-               add_rcvctrl(dd, RCV_CTRL_RCV_EXTENDED_PSN_ENABLE_SMASK);
-
-       guid = ppd->guid;
-       if (!guid) {
-               if (dd->base_guid)
-                       guid = dd->base_guid + ppd->port - 1;
-               ppd->guid = guid;
-       }
-
-       /* Set linkinit_reason on power up per OPA spec */
-       ppd->linkinit_reason = OPA_LINKINIT_REASON_LINKUP;
-
-       /* one-time init of the LCB */
-       init_lcb(dd);
-
-       if (loopback) {
-               ret = init_loopback(dd);
-               if (ret < 0)
-                       return ret;
-       }
-
-       /*
-        * Tune the SerDes to a ballpark setting for
-        * optimal signal and bit error rate.
-        * Needs to be done before starting the link.
-        */
-       tune_serdes(ppd);
-
-       return start_link(ppd);
-}
-
-void hfi1_quiet_serdes(struct hfi1_pportdata *ppd)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-
-       /*
-        * Shut down the link and keep it down.  First clear the flag that
-        * says the driver wants to allow the link to be up
-        * (driver_link_ready).  Then make sure the link is not
-        * automatically restarted (link_enabled), cancel any pending
-        * restart, and finally go offline.
-        */
-       ppd->driver_link_ready = 0;
-       ppd->link_enabled = 0;
-
-       ppd->offline_disabled_reason =
-                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED);
-       set_link_down_reason(ppd, OPA_LINKDOWN_REASON_SMA_DISABLED, 0,
-                            OPA_LINKDOWN_REASON_SMA_DISABLED);
-       set_link_state(ppd, HLS_DN_OFFLINE);
-
-       /* disable the port */
-       clear_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
-}
-
-static inline int init_cpu_counters(struct hfi1_devdata *dd)
-{
-       struct hfi1_pportdata *ppd;
-       int i;
-
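-       /* per-port data is laid out immediately after the devdata structure */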
-       ppd = (struct hfi1_pportdata *)(dd + 1);
-       for (i = 0; i < dd->num_pports; i++, ppd++) {
-               ppd->ibport_data.rvp.rc_acks = NULL;
-               ppd->ibport_data.rvp.rc_qacks = NULL;
-               ppd->ibport_data.rvp.rc_acks = alloc_percpu(u64);
-               ppd->ibport_data.rvp.rc_qacks = alloc_percpu(u64);
-               ppd->ibport_data.rvp.rc_delayed_comp = alloc_percpu(u64);
-               if (!ppd->ibport_data.rvp.rc_acks ||
-                   !ppd->ibport_data.rvp.rc_delayed_comp ||
-                   !ppd->ibport_data.rvp.rc_qacks)
-                       return -ENOMEM;
-       }
-
-       return 0;
-}
-
-static const char * const pt_names[] = {
-       "expected",
-       "eager",
-       "invalid"
-};
-
-static const char *pt_name(u32 type)
-{
-       return type >= ARRAY_SIZE(pt_names) ? "unknown" : pt_names[type];
-}
-
-/*
- * index is the index into the receive array
- */
-void hfi1_put_tid(struct hfi1_devdata *dd, u32 index,
-                 u32 type, unsigned long pa, u16 order)
-{
-       u64 reg;
-       void __iomem *base = (dd->rcvarray_wc ? dd->rcvarray_wc :
-                             (dd->kregbase + RCV_ARRAY));
-
-       if (!(dd->flags & HFI1_PRESENT))
-               goto done;
-
-       if (type == PT_INVALID) {
-               pa = 0;
-       } else if (type > PT_INVALID) {
-               dd_dev_err(dd,
-                          "unexpected receive array type %u for index %u, not handled\n",
-                          type, index);
-               goto done;
-       }
-
-       hfi1_cdbg(TID, "type %s, index 0x%x, pa 0x%lx, bsize 0x%lx",
-                 pt_name(type), index, pa, (unsigned long)order);
-
-#define RT_ADDR_SHIFT 12       /* 4KB kernel address boundary */
-       reg = RCV_ARRAY_RT_WRITE_ENABLE_SMASK
-               | (u64)order << RCV_ARRAY_RT_BUF_SIZE_SHIFT
-               | ((pa >> RT_ADDR_SHIFT) & RCV_ARRAY_RT_ADDR_MASK)
-                                       << RCV_ARRAY_RT_ADDR_SHIFT;
-       writeq(reg, base + (index * 8));
-
-       if (type == PT_EAGER)
-               /*
-                * Eager entries are written one-by-one so we have to push them
-                * after we write the entry.
-                */
-               flush_wc();
-done:
-       return;
-}
-
-void hfi1_clear_tids(struct hfi1_ctxtdata *rcd)
-{
-       struct hfi1_devdata *dd = rcd->dd;
-       u32 i;
-
-       /* this could be optimized */
-       for (i = rcd->eager_base; i < rcd->eager_base +
-                    rcd->egrbufs.alloced; i++)
-               hfi1_put_tid(dd, i, PT_INVALID, 0, 0);
-
-       for (i = rcd->expected_base;
-                       i < rcd->expected_base + rcd->expected_count; i++)
-               hfi1_put_tid(dd, i, PT_INVALID, 0, 0);
-}
-
-int hfi1_get_base_kinfo(struct hfi1_ctxtdata *rcd,
-                       struct hfi1_ctxt_info *kinfo)
-{
-       kinfo->runtime_flags = (HFI1_MISC_GET() << HFI1_CAP_USER_SHIFT) |
-               HFI1_CAP_UGET(MASK) | HFI1_CAP_KGET(K2U);
-       return 0;
-}
-
-struct hfi1_message_header *hfi1_get_msgheader(
-                               struct hfi1_devdata *dd, __le32 *rhf_addr)
-{
-       u32 offset = rhf_hdrq_offset(rhf_to_cpu(rhf_addr));
-
-       return (struct hfi1_message_header *)
-               (rhf_addr - dd->rhf_offset + offset);
-}
-
-static const char * const ib_cfg_name_strings[] = {
-       "HFI1_IB_CFG_LIDLMC",
-       "HFI1_IB_CFG_LWID_DG_ENB",
-       "HFI1_IB_CFG_LWID_ENB",
-       "HFI1_IB_CFG_LWID",
-       "HFI1_IB_CFG_SPD_ENB",
-       "HFI1_IB_CFG_SPD",
-       "HFI1_IB_CFG_RXPOL_ENB",
-       "HFI1_IB_CFG_LREV_ENB",
-       "HFI1_IB_CFG_LINKLATENCY",
-       "HFI1_IB_CFG_HRTBT",
-       "HFI1_IB_CFG_OP_VLS",
-       "HFI1_IB_CFG_VL_HIGH_CAP",
-       "HFI1_IB_CFG_VL_LOW_CAP",
-       "HFI1_IB_CFG_OVERRUN_THRESH",
-       "HFI1_IB_CFG_PHYERR_THRESH",
-       "HFI1_IB_CFG_LINKDEFAULT",
-       "HFI1_IB_CFG_PKEYS",
-       "HFI1_IB_CFG_MTU",
-       "HFI1_IB_CFG_LSTATE",
-       "HFI1_IB_CFG_VL_HIGH_LIMIT",
-       "HFI1_IB_CFG_PMA_TICKS",
-       "HFI1_IB_CFG_PORT"
-};
-
-static const char *ib_cfg_name(int which)
-{
-       if (which < 0 || which >= ARRAY_SIZE(ib_cfg_name_strings))
-               return "invalid";
-       return ib_cfg_name_strings[which];
-}
-
-int hfi1_get_ib_cfg(struct hfi1_pportdata *ppd, int which)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       int val = 0;
-
-       switch (which) {
-       case HFI1_IB_CFG_LWID_ENB: /* allowed Link-width */
-               val = ppd->link_width_enabled;
-               break;
-       case HFI1_IB_CFG_LWID: /* currently active Link-width */
-               val = ppd->link_width_active;
-               break;
-       case HFI1_IB_CFG_SPD_ENB: /* allowed Link speeds */
-               val = ppd->link_speed_enabled;
-               break;
-       case HFI1_IB_CFG_SPD: /* current Link speed */
-               val = ppd->link_speed_active;
-               break;
-
-       case HFI1_IB_CFG_RXPOL_ENB: /* Auto-RX-polarity enable */
-       case HFI1_IB_CFG_LREV_ENB: /* Auto-Lane-reversal enable */
-       case HFI1_IB_CFG_LINKLATENCY:
-               goto unimplemented;
-
-       case HFI1_IB_CFG_OP_VLS:
-               val = ppd->vls_operational;
-               break;
-       case HFI1_IB_CFG_VL_HIGH_CAP: /* VL arb high priority table size */
-               val = VL_ARB_HIGH_PRIO_TABLE_SIZE;
-               break;
-       case HFI1_IB_CFG_VL_LOW_CAP: /* VL arb low priority table size */
-               val = VL_ARB_LOW_PRIO_TABLE_SIZE;
-               break;
-       case HFI1_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */
-               val = ppd->overrun_threshold;
-               break;
-       case HFI1_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */
-               val = ppd->phy_error_threshold;
-               break;
-       case HFI1_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */
-               val = dd->link_default;
-               break;
-
-       case HFI1_IB_CFG_HRTBT: /* Heartbeat off/enable/auto */
-       case HFI1_IB_CFG_PMA_TICKS:
-       default:
-unimplemented:
-               if (HFI1_CAP_IS_KSET(PRINT_UNIMPL))
-                       dd_dev_info(
-                               dd,
-                               "%s: which %s: not implemented\n",
-                               __func__,
-                               ib_cfg_name(which));
-               break;
-       }
-
-       return val;
-}
-
-/*
- * The largest MAD packet size.
- */
-#define MAX_MAD_PACKET 2048
-
-/*
- * Return the maximum header bytes that can go on the _wire_
- * for this device. This count includes the ICRC which is
- * not part of the packet held in memory but it is appended
- * by the HW.
- * This is dependent on the device's receive header entry size.
- * HFI allows this to be set per-receive context, but the
- * driver presently enforces a global value.
- */
-u32 lrh_max_header_bytes(struct hfi1_devdata *dd)
-{
-       /*
-        * The maximum non-payload (i.e. non-MTU) bytes in LRH.PktLen are
-        * the Receive Header Entry Size minus the PBC (or RHF) size
-        * plus one DW for the ICRC appended by HW.
-        *
-        * dd->rcd[0]->rcvhdrqentsize is in DW.
-        * We use rcd[0] as all contexts will have the same value. Also,
-        * the first kernel context would have been allocated by now so
-        * we are guaranteed a valid value.
-        */
-       return (dd->rcd[0]->rcvhdrqentsize - 2/*PBC/RHF*/ + 1/*ICRC*/) << 2;
-}
-
-/*
- * Set Send Length
- * @ppd - per port data
- *
- * Set the MTU by limiting how many DWs may be sent.  The SendLenCheck*
- * registers compare against LRH.PktLen, so use the max bytes included
- * in the LRH.
- *
- * This routine changes all VL values except VL15, which it maintains at
- * the same value.
- */
-static void set_send_length(struct hfi1_pportdata *ppd)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       u32 max_hb = lrh_max_header_bytes(dd), dcmtu;
-       u32 maxvlmtu = dd->vld[15].mtu;
-       u64 len1 = 0, len2 = (((dd->vld[15].mtu + max_hb) >> 2)
-                             & SEND_LEN_CHECK1_LEN_VL15_MASK) <<
-               SEND_LEN_CHECK1_LEN_VL15_SHIFT;
-       int i;
-       u32 thres;
-
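-       /* VLs 0-3 are packed into SendLenCheck0; higher VLs and VL15 into SendLenCheck1 */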
-       for (i = 0; i < ppd->vls_supported; i++) {
-               if (dd->vld[i].mtu > maxvlmtu)
-                       maxvlmtu = dd->vld[i].mtu;
-               if (i <= 3)
-                       len1 |= (((dd->vld[i].mtu + max_hb) >> 2)
-                                & SEND_LEN_CHECK0_LEN_VL0_MASK) <<
-                               ((i % 4) * SEND_LEN_CHECK0_LEN_VL1_SHIFT);
-               else
-                       len2 |= (((dd->vld[i].mtu + max_hb) >> 2)
-                                & SEND_LEN_CHECK1_LEN_VL4_MASK) <<
-                               ((i % 4) * SEND_LEN_CHECK1_LEN_VL5_SHIFT);
-       }
-       write_csr(dd, SEND_LEN_CHECK0, len1);
-       write_csr(dd, SEND_LEN_CHECK1, len2);
-       /* adjust kernel credit return thresholds based on new MTUs */
-       /* all kernel receive contexts have the same hdrqentsize */
-       for (i = 0; i < ppd->vls_supported; i++) {
-               thres = min(sc_percent_to_threshold(dd->vld[i].sc, 50),
-                           sc_mtu_to_threshold(dd->vld[i].sc,
-                                               dd->vld[i].mtu,
-                                               dd->rcd[0]->rcvhdrqentsize));
-               sc_set_cr_threshold(dd->vld[i].sc, thres);
-       }
-       thres = min(sc_percent_to_threshold(dd->vld[15].sc, 50),
-                   sc_mtu_to_threshold(dd->vld[15].sc,
-                                       dd->vld[15].mtu,
-                                       dd->rcd[0]->rcvhdrqentsize));
-       sc_set_cr_threshold(dd->vld[15].sc, thres);
-
-       /* Adjust maximum MTU for the port in DC */
-       dcmtu = maxvlmtu == 10240 ? DCC_CFG_PORT_MTU_CAP_10240 :
-               (ilog2(maxvlmtu >> 8) + 1);
-       len1 = read_csr(ppd->dd, DCC_CFG_PORT_CONFIG);
-       len1 &= ~DCC_CFG_PORT_CONFIG_MTU_CAP_SMASK;
-       len1 |= ((u64)dcmtu & DCC_CFG_PORT_CONFIG_MTU_CAP_MASK) <<
-               DCC_CFG_PORT_CONFIG_MTU_CAP_SHIFT;
-       write_csr(ppd->dd, DCC_CFG_PORT_CONFIG, len1);
-}
-
-static void set_lidlmc(struct hfi1_pportdata *ppd)
-{
-       int i;
-       u64 sreg = 0;
-       struct hfi1_devdata *dd = ppd->dd;
-       u32 mask = ~((1U << ppd->lmc) - 1);
-       u64 c1 = read_csr(ppd->dd, DCC_CFG_PORT_CONFIG1);
-
-       if (dd->hfi1_snoop.mode_flag)
-               dd_dev_info(dd, "Set lid/lmc while snooping");
-
-       c1 &= ~(DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK
-               | DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK);
-       c1 |= ((ppd->lid & DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK)
-                       << DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT) |
-             ((mask & DCC_CFG_PORT_CONFIG1_DLID_MASK_MASK)
-                       << DCC_CFG_PORT_CONFIG1_DLID_MASK_SHIFT);
-       write_csr(ppd->dd, DCC_CFG_PORT_CONFIG1, c1);
-
-       /*
-        * Iterate over all the send contexts and set their SLID check
-        */
-       sreg = ((mask & SEND_CTXT_CHECK_SLID_MASK_MASK) <<
-                       SEND_CTXT_CHECK_SLID_MASK_SHIFT) |
-              (((ppd->lid & mask) & SEND_CTXT_CHECK_SLID_VALUE_MASK) <<
-                       SEND_CTXT_CHECK_SLID_VALUE_SHIFT);
-
-       for (i = 0; i < dd->chip_send_contexts; i++) {
-               hfi1_cdbg(LINKVERB, "SendContext[%d].SLID_CHECK = 0x%x",
-                         i, (u32)sreg);
-               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_SLID, sreg);
-       }
-
-       /* Now we have to do the same thing for the sdma engines */
-       sdma_update_lmc(dd, mask, ppd->lid);
-}
-
-static int wait_phy_linkstate(struct hfi1_devdata *dd, u32 state, u32 msecs)
-{
-       unsigned long timeout;
-       u32 curr_state;
-
-       timeout = jiffies + msecs_to_jiffies(msecs);
-       while (1) {
-               curr_state = read_physical_state(dd);
-               if (curr_state == state)
-                       break;
-               if (time_after(jiffies, timeout)) {
-                       dd_dev_err(dd,
-                                  "timeout waiting for phy link state 0x%x, current state is 0x%x\n",
-                                  state, curr_state);
-                       return -ETIMEDOUT;
-               }
-               usleep_range(1950, 2050); /* sleep 2ms-ish */
-       }
-
-       return 0;
-}
-
-/*
- * Helper for set_link_state().  Do not call except from that routine.
- * Expects ppd->hls_mutex to be held.
- *
- * @rem_reason value to be sent to the neighbor
- *
- * LinkDownReasons only set if transition succeeds.
- */
-static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       u32 pstate, previous_state;
-       u32 last_local_state;
-       u32 last_remote_state;
-       int ret;
-       int do_transition;
-       int do_wait;
-
-       previous_state = ppd->host_link_state;
-       ppd->host_link_state = HLS_GOING_OFFLINE;
-       pstate = read_physical_state(dd);
-       if (pstate == PLS_OFFLINE) {
-               do_transition = 0;      /* in right state */
-               do_wait = 0;            /* ...no need to wait */
-       } else if ((pstate & 0xff) == PLS_OFFLINE) {
-               do_transition = 0;      /* in an offline transient state */
-               do_wait = 1;            /* ...wait for it to settle */
-       } else {
-               do_transition = 1;      /* need to move to offline */
-               do_wait = 1;            /* ...will need to wait */
-       }
-
-       if (do_transition) {
-               ret = set_physical_link_state(dd,
-                                             (rem_reason << 8) | PLS_OFFLINE);
-
-               if (ret != HCMD_SUCCESS) {
-                       dd_dev_err(dd,
-                                  "Failed to transition to Offline link state, return %d\n",
-                                  ret);
-                       return -EINVAL;
-               }
-               if (ppd->offline_disabled_reason ==
-                               HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE))
-                       ppd->offline_disabled_reason =
-                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_TRANSIENT);
-       }
-
-       if (do_wait) {
-               /* it can take a while for the link to go down */
-               ret = wait_phy_linkstate(dd, PLS_OFFLINE, 10000);
-               if (ret < 0)
-                       return ret;
-       }
-
-       /* make sure the logical state is also down */
-       wait_logical_linkstate(ppd, IB_PORT_DOWN, 1000);
-
-       /*
-        * Now in charge of LCB - must be after the physical state is
-        * offline.quiet and before host_link_state is changed.
-        */
-       set_host_lcb_access(dd);
-       write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */
-       ppd->host_link_state = HLS_LINK_COOLDOWN; /* LCB access allowed */
-
-       if (ppd->port_type == PORT_TYPE_QSFP &&
-           ppd->qsfp_info.limiting_active &&
-           qsfp_mod_present(ppd)) {
-               int ret;
-
-               ret = acquire_chip_resource(dd, qsfp_resource(dd), QSFP_WAIT);
-               if (ret == 0) {
-                       set_qsfp_tx(ppd, 0);
-                       release_chip_resource(dd, qsfp_resource(dd));
-               } else {
-                       /* not fatal, but should warn */
-                       dd_dev_err(dd,
-                                  "Unable to acquire lock to turn off QSFP TX\n");
-               }
-       }
-
-       /*
-        * The LNI has a mandatory wait time after the physical state
-        * moves to Offline.Quiet.  The wait time may be different
-        * depending on how the link went down.  The 8051 firmware
-        * will observe the needed wait time and only move to ready
-        * when that is completed.  The largest of the quiet timeouts
-        * is 6s, so wait that long and then at least 0.5s more for
-        * other transitions, and another 0.5s for a buffer.
-        */
-       ret = wait_fm_ready(dd, 7000);
-       if (ret) {
-               dd_dev_err(dd,
-                          "After going offline, timed out waiting for the 8051 to become ready to accept host requests\n");
-               /* state is really offline, so make it so */
-               ppd->host_link_state = HLS_DN_OFFLINE;
-               return ret;
-       }
-
-       /*
-        * The state is now offline and the 8051 is ready to accept host
-        * requests.
-        *      - change our state
-        *      - notify others if we were previously in a linkup state
-        */
-       ppd->host_link_state = HLS_DN_OFFLINE;
-       if (previous_state & HLS_UP) {
-               /* went down while link was up */
-               handle_linkup_change(dd, 0);
-       } else if (previous_state
-                       & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) {
-               /* went down while attempting link up */
-               /* byte 1 of last_*_state is the failure reason */
-               read_last_local_state(dd, &last_local_state);
-               read_last_remote_state(dd, &last_remote_state);
-               dd_dev_err(dd,
-                          "LNI failure last states: local 0x%08x, remote 0x%08x\n",
-                          last_local_state, last_remote_state);
-       }
-
-       /* the active link width (downgrade) is 0 on link down */
-       ppd->link_width_active = 0;
-       ppd->link_width_downgrade_tx_active = 0;
-       ppd->link_width_downgrade_rx_active = 0;
-       ppd->current_egress_rate = 0;
-       return 0;
-}
-
-/* return the link state name */
-static const char *link_state_name(u32 state)
-{
-       const char *name;
-       int n = ilog2(state);
-       static const char * const names[] = {
-               [__HLS_UP_INIT_BP]       = "INIT",
-               [__HLS_UP_ARMED_BP]      = "ARMED",
-               [__HLS_UP_ACTIVE_BP]     = "ACTIVE",
-               [__HLS_DN_DOWNDEF_BP]    = "DOWNDEF",
-               [__HLS_DN_POLL_BP]       = "POLL",
-               [__HLS_DN_DISABLE_BP]    = "DISABLE",
-               [__HLS_DN_OFFLINE_BP]    = "OFFLINE",
-               [__HLS_VERIFY_CAP_BP]    = "VERIFY_CAP",
-               [__HLS_GOING_UP_BP]      = "GOING_UP",
-               [__HLS_GOING_OFFLINE_BP] = "GOING_OFFLINE",
-               [__HLS_LINK_COOLDOWN_BP] = "LINK_COOLDOWN"
-       };
-
-       name = n < ARRAY_SIZE(names) ? names[n] : NULL;
-       return name ? name : "unknown";
-}
-
-/* return the link state reason name */
-static const char *link_state_reason_name(struct hfi1_pportdata *ppd, u32 state)
-{
-       if (state == HLS_UP_INIT) {
-               switch (ppd->linkinit_reason) {
-               case OPA_LINKINIT_REASON_LINKUP:
-                       return "(LINKUP)";
-               case OPA_LINKINIT_REASON_FLAPPING:
-                       return "(FLAPPING)";
-               case OPA_LINKINIT_OUTSIDE_POLICY:
-                       return "(OUTSIDE_POLICY)";
-               case OPA_LINKINIT_QUARANTINED:
-                       return "(QUARANTINED)";
-               case OPA_LINKINIT_INSUFIC_CAPABILITY:
-                       return "(INSUFIC_CAPABILITY)";
-               default:
-                       break;
-               }
-       }
-       return "";
-}
-
-/*
- * driver_physical_state - convert the driver's notion of a port's
- * state (an HLS_*) into a physical state (a {IB,OPA}_PORTPHYSSTATE_*).
- * Return -1 (converted to a u32) to indicate error.
- */
-u32 driver_physical_state(struct hfi1_pportdata *ppd)
-{
-       switch (ppd->host_link_state) {
-       case HLS_UP_INIT:
-       case HLS_UP_ARMED:
-       case HLS_UP_ACTIVE:
-               return IB_PORTPHYSSTATE_LINKUP;
-       case HLS_DN_POLL:
-               return IB_PORTPHYSSTATE_POLLING;
-       case HLS_DN_DISABLE:
-               return IB_PORTPHYSSTATE_DISABLED;
-       case HLS_DN_OFFLINE:
-               return OPA_PORTPHYSSTATE_OFFLINE;
-       case HLS_VERIFY_CAP:
-               return IB_PORTPHYSSTATE_POLLING;
-       case HLS_GOING_UP:
-               return IB_PORTPHYSSTATE_POLLING;
-       case HLS_GOING_OFFLINE:
-               return OPA_PORTPHYSSTATE_OFFLINE;
-       case HLS_LINK_COOLDOWN:
-               return OPA_PORTPHYSSTATE_OFFLINE;
-       case HLS_DN_DOWNDEF:
-       default:
-               dd_dev_err(ppd->dd, "invalid host_link_state 0x%x\n",
-                          ppd->host_link_state);
-               return  -1;
-       }
-}
-
-/*
- * driver_logical_state - convert the driver's notion of a port's
- * state (an HLS_*) into a logical state (a IB_PORT_*). Return -1
- * (converted to a u32) to indicate error.
- */
-u32 driver_logical_state(struct hfi1_pportdata *ppd)
-{
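-       /* a nonzero state that is not an UP state maps to DOWN */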
-       if (ppd->host_link_state && !(ppd->host_link_state & HLS_UP))
-               return IB_PORT_DOWN;
-
-       switch (ppd->host_link_state & HLS_UP) {
-       case HLS_UP_INIT:
-               return IB_PORT_INIT;
-       case HLS_UP_ARMED:
-               return IB_PORT_ARMED;
-       case HLS_UP_ACTIVE:
-               return IB_PORT_ACTIVE;
-       default:
-               dd_dev_err(ppd->dd, "invalid host_link_state 0x%x\n",
-                          ppd->host_link_state);
-               return -1;
-       }
-}
-
-void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason,
-                         u8 neigh_reason, u8 rem_reason)
-{
-       if (ppd->local_link_down_reason.latest == 0 &&
-           ppd->neigh_link_down_reason.latest == 0) {
-               ppd->local_link_down_reason.latest = lcl_reason;
-               ppd->neigh_link_down_reason.latest = neigh_reason;
-               ppd->remote_link_down_reason = rem_reason;
-       }
-}
-
-/*
- * Change the physical and/or logical link state.
- *
- * Do not call this routine while inside an interrupt.  It contains
- * calls to routines that can take multiple seconds to finish.
- *
- * Returns 0 on success, -errno on failure.
- */
-int set_link_state(struct hfi1_pportdata *ppd, u32 state)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       struct ib_event event = {.device = NULL};
-       int ret1, ret = 0;
-       int orig_new_state, poll_bounce;
-
-       mutex_lock(&ppd->hls_lock);
-
-       orig_new_state = state;
-       if (state == HLS_DN_DOWNDEF)
-               state = dd->link_default;
-
-       /* interpret poll -> poll as a link bounce */
-       poll_bounce = ppd->host_link_state == HLS_DN_POLL &&
-                     state == HLS_DN_POLL;
-
-       dd_dev_info(dd, "%s: current %s, new %s %s%s\n", __func__,
-                   link_state_name(ppd->host_link_state),
-                   link_state_name(orig_new_state),
-                   poll_bounce ? "(bounce) " : "",
-                   link_state_reason_name(ppd, state));
-
-       /*
-        * If we're going to a (HLS_*) link state that implies the logical
-        * link state is neither of (IB_PORT_ARMED, IB_PORT_ACTIVE), then
-        * reset is_sm_config_started to 0.
-        */
-       if (!(state & (HLS_UP_ARMED | HLS_UP_ACTIVE)))
-               ppd->is_sm_config_started = 0;
-
-       /*
-        * Do nothing if the states match.  Let a poll to poll link bounce
-        * go through.
-        */
-       if (ppd->host_link_state == state && !poll_bounce)
-               goto done;
-
-       switch (state) {
-       case HLS_UP_INIT:
-               if (ppd->host_link_state == HLS_DN_POLL &&
-                   (quick_linkup || dd->icode == ICODE_FUNCTIONAL_SIMULATOR)) {
-                       /*
-                        * Quick link up jumps from polling to here.
-                        *
-                        * Whether in normal or loopback mode, the
-                        * simulator jumps from polling to link up.
-                        * Accept that here.
-                        */
-                       /* OK */
-               } else if (ppd->host_link_state != HLS_GOING_UP) {
-                       goto unexpected;
-               }
-
-               ppd->host_link_state = HLS_UP_INIT;
-               ret = wait_logical_linkstate(ppd, IB_PORT_INIT, 1000);
-               if (ret) {
-                       /* logical state didn't change, stay at going_up */
-                       ppd->host_link_state = HLS_GOING_UP;
-                       dd_dev_err(dd,
-                                  "%s: logical state did not change to INIT\n",
-                                  __func__);
-               } else {
-                       /* clear old transient LINKINIT_REASON code */
-                       if (ppd->linkinit_reason >= OPA_LINKINIT_REASON_CLEAR)
-                               ppd->linkinit_reason =
-                                       OPA_LINKINIT_REASON_LINKUP;
-
-                       /* enable the port */
-                       add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
-
-                       handle_linkup_change(dd, 1);
-               }
-               break;
-       case HLS_UP_ARMED:
-               if (ppd->host_link_state != HLS_UP_INIT)
-                       goto unexpected;
-
-               ppd->host_link_state = HLS_UP_ARMED;
-               set_logical_state(dd, LSTATE_ARMED);
-               ret = wait_logical_linkstate(ppd, IB_PORT_ARMED, 1000);
-               if (ret) {
-                       /* logical state didn't change, stay at init */
-                       ppd->host_link_state = HLS_UP_INIT;
-                       dd_dev_err(dd,
-                                  "%s: logical state did not change to ARMED\n",
-                                  __func__);
-               }
-               /*
-                * The simulator does not currently implement SMA messages,
-                * so neighbor_normal is not set.  Set it here when we first
-                * move to Armed.
-                */
-               if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
-                       ppd->neighbor_normal = 1;
-               break;
-       case HLS_UP_ACTIVE:
-               if (ppd->host_link_state != HLS_UP_ARMED)
-                       goto unexpected;
-
-               ppd->host_link_state = HLS_UP_ACTIVE;
-               set_logical_state(dd, LSTATE_ACTIVE);
-               ret = wait_logical_linkstate(ppd, IB_PORT_ACTIVE, 1000);
-               if (ret) {
-                       /* logical state didn't change, stay at armed */
-                       ppd->host_link_state = HLS_UP_ARMED;
-                       dd_dev_err(dd,
-                                  "%s: logical state did not change to ACTIVE\n",
-                                  __func__);
-               } else {
-                       /* tell all engines to go running */
-                       sdma_all_running(dd);
-
-                       /* Signal the IB layer that the port has gone active */
-                       event.device = &dd->verbs_dev.rdi.ibdev;
-                       event.element.port_num = ppd->port;
-                       event.event = IB_EVENT_PORT_ACTIVE;
-               }
-               break;
-       case HLS_DN_POLL:
-               if ((ppd->host_link_state == HLS_DN_DISABLE ||
-                    ppd->host_link_state == HLS_DN_OFFLINE) &&
-                   dd->dc_shutdown)
-                       dc_start(dd);
-               /* Hand LED control to the DC */
-               write_csr(dd, DCC_CFG_LED_CNTRL, 0);
-
-               if (ppd->host_link_state != HLS_DN_OFFLINE) {
-                       u8 tmp = ppd->link_enabled;
-
-                       ret = goto_offline(ppd, ppd->remote_link_down_reason);
-                       if (ret) {
-                               ppd->link_enabled = tmp;
-                               break;
-                       }
-                       ppd->remote_link_down_reason = 0;
-
-                       if (ppd->driver_link_ready)
-                               ppd->link_enabled = 1;
-               }
-
-               set_all_slowpath(ppd->dd);
-               ret = set_local_link_attributes(ppd);
-               if (ret)
-                       break;
-
-               ppd->port_error_action = 0;
-               ppd->host_link_state = HLS_DN_POLL;
-
-               if (quick_linkup) {
-                       /* quick linkup does not go into polling */
-                       ret = do_quick_linkup(dd);
-               } else {
-                       ret1 = set_physical_link_state(dd, PLS_POLLING);
-                       if (ret1 != HCMD_SUCCESS) {
-                               dd_dev_err(dd,
-                                          "Failed to transition to Polling link state, return 0x%x\n",
-                                          ret1);
-                               ret = -EINVAL;
-                       }
-               }
-               ppd->offline_disabled_reason =
-                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE);
-               /*
-                * If an error occurred above, go back to offline.  The
-                * caller may reschedule another attempt.
-                */
-               if (ret)
-                       goto_offline(ppd, 0);
-               break;
-       case HLS_DN_DISABLE:
-               /* link is disabled */
-               ppd->link_enabled = 0;
-
-               /* allow any state to transition to disabled */
-
-               /* must transition to offline first */
-               if (ppd->host_link_state != HLS_DN_OFFLINE) {
-                       ret = goto_offline(ppd, ppd->remote_link_down_reason);
-                       if (ret)
-                               break;
-                       ppd->remote_link_down_reason = 0;
-               }
-
-               ret1 = set_physical_link_state(dd, PLS_DISABLED);
-               if (ret1 != HCMD_SUCCESS) {
-                       dd_dev_err(dd,
-                                  "Failed to transition to Disabled link state, return 0x%x\n",
-                                  ret1);
-                       ret = -EINVAL;
-                       break;
-               }
-               ppd->host_link_state = HLS_DN_DISABLE;
-               dc_shutdown(dd);
-               break;
-       case HLS_DN_OFFLINE:
-               if (ppd->host_link_state == HLS_DN_DISABLE)
-                       dc_start(dd);
-
-               /* allow any state to transition to offline */
-               ret = goto_offline(ppd, ppd->remote_link_down_reason);
-               if (!ret)
-                       ppd->remote_link_down_reason = 0;
-               break;
-       case HLS_VERIFY_CAP:
-               if (ppd->host_link_state != HLS_DN_POLL)
-                       goto unexpected;
-               ppd->host_link_state = HLS_VERIFY_CAP;
-               break;
-       case HLS_GOING_UP:
-               if (ppd->host_link_state != HLS_VERIFY_CAP)
-                       goto unexpected;
-
-               ret1 = set_physical_link_state(dd, PLS_LINKUP);
-               if (ret1 != HCMD_SUCCESS) {
-                       dd_dev_err(dd,
-                                  "Failed to transition to link up state, return 0x%x\n",
-                                  ret1);
-                       ret = -EINVAL;
-                       break;
-               }
-               ppd->host_link_state = HLS_GOING_UP;
-               break;
-
-       case HLS_GOING_OFFLINE:         /* transient within goto_offline() */
-       case HLS_LINK_COOLDOWN:         /* transient within goto_offline() */
-       default:
-               dd_dev_info(dd, "%s: state 0x%x: not supported\n",
-                           __func__, state);
-               ret = -EINVAL;
-               break;
-       }
-
-       goto done;
-
-unexpected:
-       dd_dev_err(dd, "%s: unexpected state transition from %s to %s\n",
-                  __func__, link_state_name(ppd->host_link_state),
-                  link_state_name(state));
-       ret = -EINVAL;
-
-done:
-       mutex_unlock(&ppd->hls_lock);
-
-       if (event.device)
-               ib_dispatch_event(&event);
-
-       return ret;
-}
-
-int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val)
-{
-       u64 reg;
-       int ret = 0;
-
-       switch (which) {
-       case HFI1_IB_CFG_LIDLMC:
-               set_lidlmc(ppd);
-               break;
-       case HFI1_IB_CFG_VL_HIGH_LIMIT:
-               /*
-                * The VL Arbitrator high limit is sent in units of 4k
-                * bytes, while HFI stores it in units of 64 bytes.
-                */
-               val *= 4096 / 64;
-               reg = ((u64)val & SEND_HIGH_PRIORITY_LIMIT_LIMIT_MASK)
-                       << SEND_HIGH_PRIORITY_LIMIT_LIMIT_SHIFT;
-               write_csr(ppd->dd, SEND_HIGH_PRIORITY_LIMIT, reg);
-               break;
-       case HFI1_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */
-               /* HFI only supports POLL as the default link down state */
-               if (val != HLS_DN_POLL)
-                       ret = -EINVAL;
-               break;
-       case HFI1_IB_CFG_OP_VLS:
-               if (ppd->vls_operational != val) {
-                       ppd->vls_operational = val;
-                       if (!ppd->port)
-                               ret = -EINVAL;
-               }
-               break;
-       /*
-        * For link width, link width downgrade, and speed enable, always AND
-        * the setting with what is actually supported.  This has two benefits.
-        * First, enabled can't have unsupported values, no matter what the
-        * SM or FM might want.  Second, the ALL_SUPPORTED wildcards that mean
-        * "fill in with your supported value" have all the bits in the
-        * field set, so simply ANDing with supported has the desired result.
-        */
-       case HFI1_IB_CFG_LWID_ENB: /* set allowed Link-width */
-               ppd->link_width_enabled = val & ppd->link_width_supported;
-               break;
-       case HFI1_IB_CFG_LWID_DG_ENB: /* set allowed link width downgrade */
-               ppd->link_width_downgrade_enabled =
-                               val & ppd->link_width_downgrade_supported;
-               break;
-       case HFI1_IB_CFG_SPD_ENB: /* allowed Link speeds */
-               ppd->link_speed_enabled = val & ppd->link_speed_supported;
-               break;
-       case HFI1_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */
-               /*
-                * HFI does not follow IB specs; save this value
-                * so we can report it if asked.
-                */
-               ppd->overrun_threshold = val;
-               break;
-       case HFI1_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */
-               /*
-                * HFI does not follow IB specs; save this value
-                * so we can report it if asked.
-                */
-               ppd->phy_error_threshold = val;
-               break;
-
-       case HFI1_IB_CFG_MTU:
-               set_send_length(ppd);
-               break;
-
-       case HFI1_IB_CFG_PKEYS:
-               if (HFI1_CAP_IS_KSET(PKEY_CHECK))
-                       set_partition_keys(ppd);
-               break;
-
-       default:
-               if (HFI1_CAP_IS_KSET(PRINT_UNIMPL))
-                       dd_dev_info(ppd->dd,
-                                   "%s: which %s, val 0x%x: not implemented\n",
-                                   __func__, ib_cfg_name(which), val);
-               break;
-       }
-       return ret;
-}
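
For the HFI1_IB_CFG_VL_HIGH_LIMIT case above, the only subtle step is the
unit conversion: the limit arrives in units of 4 KB while the CSR field is
in units of 64 bytes, hence the scale factor 4096 / 64 = 64.  A minimal,
standalone sketch of that arithmetic; the mask/shift below are placeholders
rather than the real SEND_HIGH_PRIORITY_LIMIT definitions, and the input
value is arbitrary:

#include <stdint.h>
#include <stdio.h>

/* Placeholder field definition; the real values come from the chip
 * register headers, not from this sketch. */
#define LIMIT_MASK      0x3fffull
#define LIMIT_SHIFT     0

int main(void)
{
        uint32_t val = 3;               /* limit in 4 KB units from the FM */
        uint64_t reg;

        val *= 4096 / 64;               /* convert to 64-byte units -> 192 */
        reg = ((uint64_t)val & LIMIT_MASK) << LIMIT_SHIFT;

        printf("limit = %u (64-byte units), reg = 0x%llx\n",
               val, (unsigned long long)reg);
        return 0;
}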
-
-/* begin functions related to vl arbitration table caching */
-static void init_vl_arb_caches(struct hfi1_pportdata *ppd)
-{
-       int i;
-
-       BUILD_BUG_ON(VL_ARB_TABLE_SIZE !=
-                       VL_ARB_LOW_PRIO_TABLE_SIZE);
-       BUILD_BUG_ON(VL_ARB_TABLE_SIZE !=
-                       VL_ARB_HIGH_PRIO_TABLE_SIZE);
-
-       /*
-        * Note that we always return values directly from the
-        * 'vl_arb_cache' (and do no CSR reads) in response to a
-        * 'Get(VLArbTable)'. This is obviously correct after a
-        * 'Set(VLArbTable)', since the cache will then be up to
-        * date. But it's also correct prior to any 'Set(VLArbTable)'
-        * since then both the cache and the relevant h/w registers
-        * will be zeroed.
-        */
-
-       for (i = 0; i < MAX_PRIO_TABLE; i++)
-               spin_lock_init(&ppd->vl_arb_cache[i].lock);
-}
-
-/*
- * vl_arb_lock_cache
- *
- * All other vl_arb_* functions should be called only after locking
- * the cache.
- */
-static inline struct vl_arb_cache *
-vl_arb_lock_cache(struct hfi1_pportdata *ppd, int idx)
-{
-       if (idx != LO_PRIO_TABLE && idx != HI_PRIO_TABLE)
-               return NULL;
-       spin_lock(&ppd->vl_arb_cache[idx].lock);
-       return &ppd->vl_arb_cache[idx];
-}
-
-static inline void vl_arb_unlock_cache(struct hfi1_pportdata *ppd, int idx)
-{
-       spin_unlock(&ppd->vl_arb_cache[idx].lock);
-}
-
-static void vl_arb_get_cache(struct vl_arb_cache *cache,
-                            struct ib_vl_weight_elem *vl)
-{
-       memcpy(vl, cache->table, VL_ARB_TABLE_SIZE * sizeof(*vl));
-}
-
-static void vl_arb_set_cache(struct vl_arb_cache *cache,
-                            struct ib_vl_weight_elem *vl)
-{
-       memcpy(cache->table, vl, VL_ARB_TABLE_SIZE * sizeof(*vl));
-}
-
-static int vl_arb_match_cache(struct vl_arb_cache *cache,
-                             struct ib_vl_weight_elem *vl)
-{
-       return !memcmp(cache->table, vl, VL_ARB_TABLE_SIZE * sizeof(*vl));
-}
-
-/* end functions related to vl arbitration table caching */
-
-static int set_vl_weights(struct hfi1_pportdata *ppd, u32 target,
-                         u32 size, struct ib_vl_weight_elem *vl)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       u64 reg;
-       unsigned int i, is_up = 0;
-       int drain, ret = 0;
-
-       mutex_lock(&ppd->hls_lock);
-
-       if (ppd->host_link_state & HLS_UP)
-               is_up = 1;
-
-       drain = !is_ax(dd) && is_up;
-
-       if (drain)
-               /*
-                * Before adjusting VL arbitration weights, empty the per-VL
-                * FIFOs; otherwise a packet whose VL weight is being
-                * set to 0 could get stuck in a FIFO with no chance to
-                * egress.
-                */
-               ret = stop_drain_data_vls(dd);
-
-       if (ret) {
-               dd_dev_err(
-                       dd,
-                       "%s: cannot stop/drain VLs - refusing to change VL arbitration weights\n",
-                       __func__);
-               goto err;
-       }
-
-       for (i = 0; i < size; i++, vl++) {
-               /*
-                * NOTE: The low priority shift and mask are used here, but
-                * they are the same for both the low and high registers.
-                */
-               reg = (((u64)vl->vl & SEND_LOW_PRIORITY_LIST_VL_MASK)
-                               << SEND_LOW_PRIORITY_LIST_VL_SHIFT)
-                     | (((u64)vl->weight
-                               & SEND_LOW_PRIORITY_LIST_WEIGHT_MASK)
-                               << SEND_LOW_PRIORITY_LIST_WEIGHT_SHIFT);
-               write_csr(dd, target + (i * 8), reg);
-       }
-       pio_send_control(dd, PSC_GLOBAL_VLARB_ENABLE);
-
-       if (drain)
-               open_fill_data_vls(dd); /* reopen all VLs */
-
-err:
-       mutex_unlock(&ppd->hls_lock);
-
-       return ret;
-}
-
-/*
- * Read one credit merge VL register.
- */
-static void read_one_cm_vl(struct hfi1_devdata *dd, u32 csr,
-                          struct vl_limit *vll)
-{
-       u64 reg = read_csr(dd, csr);
-
-       vll->dedicated = cpu_to_be16(
-               (reg >> SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT)
-               & SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_MASK);
-       vll->shared = cpu_to_be16(
-               (reg >> SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT)
-               & SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_MASK);
-}
-
-/*
- * Read the current credit merge limits.
- */
-static int get_buffer_control(struct hfi1_devdata *dd,
-                             struct buffer_control *bc, u16 *overall_limit)
-{
-       u64 reg;
-       int i;
-
-       /* not all entries are filled in */
-       memset(bc, 0, sizeof(*bc));
-
-       /* OPA and HFI have a 1-1 mapping */
-       for (i = 0; i < TXE_NUM_DATA_VL; i++)
-               read_one_cm_vl(dd, SEND_CM_CREDIT_VL + (8 * i), &bc->vl[i]);
-
-       /* NOTE: assumes that VL* and VL15 CSRs are bit-wise identical */
-       read_one_cm_vl(dd, SEND_CM_CREDIT_VL15, &bc->vl[15]);
-
-       reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT);
-       bc->overall_shared_limit = cpu_to_be16(
-               (reg >> SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT)
-               & SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_MASK);
-       if (overall_limit)
-               *overall_limit = (reg
-                       >> SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT)
-                       & SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_MASK;
-       return sizeof(struct buffer_control);
-}
-
-static int get_sc2vlnt(struct hfi1_devdata *dd, struct sc2vlnt *dp)
-{
-       u64 reg;
-       int i;
-
-       /* each register contains 16 SC->VLnt mappings, 4 bits each */
-       reg = read_csr(dd, DCC_CFG_SC_VL_TABLE_15_0);
-       for (i = 0; i < sizeof(u64); i++) {
-               u8 byte = *(((u8 *)&reg) + i);
-
-               dp->vlnt[2 * i] = byte & 0xf;
-               dp->vlnt[(2 * i) + 1] = (byte & 0xf0) >> 4;
-       }
-
-       reg = read_csr(dd, DCC_CFG_SC_VL_TABLE_31_16);
-       for (i = 0; i < sizeof(u64); i++) {
-               u8 byte = *(((u8 *)&reg) + i);
-
-               dp->vlnt[16 + (2 * i)] = byte & 0xf;
-               dp->vlnt[16 + (2 * i) + 1] = (byte & 0xf0) >> 4;
-       }
-       return sizeof(struct sc2vlnt);
-}
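
Each of the two table CSRs read above packs sixteen 4-bit SC->VLnt entries,
two per byte with the low nibble first; the driver walks the bytes through a
pointer cast, which on a little-endian host matches the shifting done here.
A standalone sketch of that unpacking with a made-up register value:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t reg = 0x0123456789abcdefull;   /* made-up table register */
        uint8_t vlnt[16];
        int i;

        for (i = 0; i < 8; i++) {
                uint8_t byte = (reg >> (8 * i)) & 0xff;

                vlnt[2 * i] = byte & 0xf;               /* low nibble  */
                vlnt[2 * i + 1] = (byte >> 4) & 0xf;    /* high nibble */
        }

        for (i = 0; i < 16; i++)
                printf("SC%-2d -> VLnt %u\n", i, vlnt[i]);
        return 0;
}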
-
-static void get_vlarb_preempt(struct hfi1_devdata *dd, u32 nelems,
-                             struct ib_vl_weight_elem *vl)
-{
-       unsigned int i;
-
-       for (i = 0; i < nelems; i++, vl++) {
-               vl->vl = 0xf;
-               vl->weight = 0;
-       }
-}
-
-static void set_sc2vlnt(struct hfi1_devdata *dd, struct sc2vlnt *dp)
-{
-       write_csr(dd, DCC_CFG_SC_VL_TABLE_15_0,
-                 DC_SC_VL_VAL(15_0,
-                              0, dp->vlnt[0] & 0xf,
-                              1, dp->vlnt[1] & 0xf,
-                              2, dp->vlnt[2] & 0xf,
-                              3, dp->vlnt[3] & 0xf,
-                              4, dp->vlnt[4] & 0xf,
-                              5, dp->vlnt[5] & 0xf,
-                              6, dp->vlnt[6] & 0xf,
-                              7, dp->vlnt[7] & 0xf,
-                              8, dp->vlnt[8] & 0xf,
-                              9, dp->vlnt[9] & 0xf,
-                              10, dp->vlnt[10] & 0xf,
-                              11, dp->vlnt[11] & 0xf,
-                              12, dp->vlnt[12] & 0xf,
-                              13, dp->vlnt[13] & 0xf,
-                              14, dp->vlnt[14] & 0xf,
-                              15, dp->vlnt[15] & 0xf));
-       write_csr(dd, DCC_CFG_SC_VL_TABLE_31_16,
-                 DC_SC_VL_VAL(31_16,
-                              16, dp->vlnt[16] & 0xf,
-                              17, dp->vlnt[17] & 0xf,
-                              18, dp->vlnt[18] & 0xf,
-                              19, dp->vlnt[19] & 0xf,
-                              20, dp->vlnt[20] & 0xf,
-                              21, dp->vlnt[21] & 0xf,
-                              22, dp->vlnt[22] & 0xf,
-                              23, dp->vlnt[23] & 0xf,
-                              24, dp->vlnt[24] & 0xf,
-                              25, dp->vlnt[25] & 0xf,
-                              26, dp->vlnt[26] & 0xf,
-                              27, dp->vlnt[27] & 0xf,
-                              28, dp->vlnt[28] & 0xf,
-                              29, dp->vlnt[29] & 0xf,
-                              30, dp->vlnt[30] & 0xf,
-                              31, dp->vlnt[31] & 0xf));
-}
-
-static void nonzero_msg(struct hfi1_devdata *dd, int idx, const char *what,
-                       u16 limit)
-{
-       if (limit != 0)
-               dd_dev_info(dd, "Invalid %s limit %d on VL %d, ignoring\n",
-                           what, (int)limit, idx);
-}
-
-/* change only the shared limit portion of SendCmGlobalCredit */
-static void set_global_shared(struct hfi1_devdata *dd, u16 limit)
-{
-       u64 reg;
-
-       reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT);
-       reg &= ~SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SMASK;
-       reg |= (u64)limit << SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT;
-       write_csr(dd, SEND_CM_GLOBAL_CREDIT, reg);
-}
-
-/* change only the total credit limit portion of SendCmGlobalCredit */
-static void set_global_limit(struct hfi1_devdata *dd, u16 limit)
-{
-       u64 reg;
-
-       reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT);
-       reg &= ~SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SMASK;
-       reg |= (u64)limit << SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT;
-       write_csr(dd, SEND_CM_GLOBAL_CREDIT, reg);
-}
-
-/* set the given per-VL shared limit */
-static void set_vl_shared(struct hfi1_devdata *dd, int vl, u16 limit)
-{
-       u64 reg;
-       u32 addr;
-
-       if (vl < TXE_NUM_DATA_VL)
-               addr = SEND_CM_CREDIT_VL + (8 * vl);
-       else
-               addr = SEND_CM_CREDIT_VL15;
-
-       reg = read_csr(dd, addr);
-       reg &= ~SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SMASK;
-       reg |= (u64)limit << SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT;
-       write_csr(dd, addr, reg);
-}
-
-/* set the given per-VL dedicated limit */
-static void set_vl_dedicated(struct hfi1_devdata *dd, int vl, u16 limit)
-{
-       u64 reg;
-       u32 addr;
-
-       if (vl < TXE_NUM_DATA_VL)
-               addr = SEND_CM_CREDIT_VL + (8 * vl);
-       else
-               addr = SEND_CM_CREDIT_VL15;
-
-       reg = read_csr(dd, addr);
-       reg &= ~SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SMASK;
-       reg |= (u64)limit << SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT;
-       write_csr(dd, addr, reg);
-}
-
-/* spin until the given per-VL status mask bits clear */
-static void wait_for_vl_status_clear(struct hfi1_devdata *dd, u64 mask,
-                                    const char *which)
-{
-       unsigned long timeout;
-       u64 reg;
-
-       timeout = jiffies + msecs_to_jiffies(VL_STATUS_CLEAR_TIMEOUT);
-       while (1) {
-               reg = read_csr(dd, SEND_CM_CREDIT_USED_STATUS) & mask;
-
-               if (reg == 0)
-                       return; /* success */
-               if (time_after(jiffies, timeout))
-                       break;          /* timed out */
-               udelay(1);
-       }
-
-       dd_dev_err(dd,
-                  "%s credit change status not clearing after %dms, mask 0x%llx, not clear 0x%llx\n",
-                  which, VL_STATUS_CLEAR_TIMEOUT, mask, reg);
-       /*
-        * If this occurs, it is likely there was a credit loss on the link.
-        * The only recovery from that is a link bounce.
-        */
-       dd_dev_err(dd,
-                  "Continuing anyway.  A credit loss may occur.  Suggest a link bounce\n");
-}
-
-/*
- * The number of credits on the VLs may be changed while everything
- * is "live", but the following algorithm must be followed due to
- * how the hardware is actually implemented.  In particular,
- * Return_Credit_Status[] is the only correct status check.
- *
- * if (reducing Global_Shared_Credit_Limit or any shared limit changing)
- *     set Global_Shared_Credit_Limit = 0
- *     use_all_vl = 1
- * mask0 = all VLs that are changing either dedicated or shared limits
- * set Shared_Limit[mask0] = 0
- * spin until Return_Credit_Status[use_all_vl ? all VL : mask0] == 0
- * if (changing any dedicated limit)
- *     mask1 = all VLs that are lowering dedicated limits
- *     lower Dedicated_Limit[mask1]
- *     spin until Return_Credit_Status[mask1] == 0
- *     raise Dedicated_Limits
- * raise Shared_Limits
- * raise Global_Shared_Credit_Limit
- *
- * lower = if the new limit is lower, set the limit to the new value
- * raise = if the new limit is higher than the current value (which may have
- *     been changed earlier in the algorithm), set the limit to the new value
- */
-int set_buffer_control(struct hfi1_pportdata *ppd,
-                      struct buffer_control *new_bc)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       u64 changing_mask, ld_mask, stat_mask;
-       int change_count;
-       int i, use_all_mask;
-       int this_shared_changing;
-       int vl_count = 0, ret;
-       /*
-        * A0: the variable any_shared_limit_changing below (and its use in
-        * the algorithm above) exists only for A0 hardware.  If A0 support
-        * is removed, it can be removed as well.
-        */
-       int any_shared_limit_changing;
-       struct buffer_control cur_bc;
-       u8 changing[OPA_MAX_VLS];
-       u8 lowering_dedicated[OPA_MAX_VLS];
-       u16 cur_total;
-       u32 new_total = 0;
-       const u64 all_mask =
-       SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK
-        | SEND_CM_CREDIT_USED_STATUS_VL1_RETURN_CREDIT_STATUS_SMASK
-        | SEND_CM_CREDIT_USED_STATUS_VL2_RETURN_CREDIT_STATUS_SMASK
-        | SEND_CM_CREDIT_USED_STATUS_VL3_RETURN_CREDIT_STATUS_SMASK
-        | SEND_CM_CREDIT_USED_STATUS_VL4_RETURN_CREDIT_STATUS_SMASK
-        | SEND_CM_CREDIT_USED_STATUS_VL5_RETURN_CREDIT_STATUS_SMASK
-        | SEND_CM_CREDIT_USED_STATUS_VL6_RETURN_CREDIT_STATUS_SMASK
-        | SEND_CM_CREDIT_USED_STATUS_VL7_RETURN_CREDIT_STATUS_SMASK
-        | SEND_CM_CREDIT_USED_STATUS_VL15_RETURN_CREDIT_STATUS_SMASK;
-
-#define valid_vl(idx) ((idx) < TXE_NUM_DATA_VL || (idx) == 15)
-#define NUM_USABLE_VLS 16      /* look at VL15 and less */
-
-       /* find the new total credits, do sanity check on unused VLs */
-       for (i = 0; i < OPA_MAX_VLS; i++) {
-               if (valid_vl(i)) {
-                       new_total += be16_to_cpu(new_bc->vl[i].dedicated);
-                       continue;
-               }
-               nonzero_msg(dd, i, "dedicated",
-                           be16_to_cpu(new_bc->vl[i].dedicated));
-               nonzero_msg(dd, i, "shared",
-                           be16_to_cpu(new_bc->vl[i].shared));
-               new_bc->vl[i].dedicated = 0;
-               new_bc->vl[i].shared = 0;
-       }
-       new_total += be16_to_cpu(new_bc->overall_shared_limit);
-
-       /* fetch the current values */
-       get_buffer_control(dd, &cur_bc, &cur_total);
-
-       /*
-        * Create the masks we will use.
-        */
-       memset(changing, 0, sizeof(changing));
-       memset(lowering_dedicated, 0, sizeof(lowering_dedicated));
-       /*
-        * NOTE: Assumes that the individual VL bits are adjacent and in
-        * increasing order
-        */
-       stat_mask =
-               SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK;
-       changing_mask = 0;
-       ld_mask = 0;
-       change_count = 0;
-       any_shared_limit_changing = 0;
-       for (i = 0; i < NUM_USABLE_VLS; i++, stat_mask <<= 1) {
-               if (!valid_vl(i))
-                       continue;
-               this_shared_changing = new_bc->vl[i].shared
-                                               != cur_bc.vl[i].shared;
-               if (this_shared_changing)
-                       any_shared_limit_changing = 1;
-               if (new_bc->vl[i].dedicated != cur_bc.vl[i].dedicated ||
-                   this_shared_changing) {
-                       changing[i] = 1;
-                       changing_mask |= stat_mask;
-                       change_count++;
-               }
-               if (be16_to_cpu(new_bc->vl[i].dedicated) <
-                                       be16_to_cpu(cur_bc.vl[i].dedicated)) {
-                       lowering_dedicated[i] = 1;
-                       ld_mask |= stat_mask;
-               }
-       }
-
-       /* bracket the credit change with a total adjustment */
-       if (new_total > cur_total)
-               set_global_limit(dd, new_total);
-
-       /*
-        * Start the credit change algorithm.
-        */
-       use_all_mask = 0;
-       if ((be16_to_cpu(new_bc->overall_shared_limit) <
-            be16_to_cpu(cur_bc.overall_shared_limit)) ||
-           (is_ax(dd) && any_shared_limit_changing)) {
-               set_global_shared(dd, 0);
-               cur_bc.overall_shared_limit = 0;
-               use_all_mask = 1;
-       }
-
-       for (i = 0; i < NUM_USABLE_VLS; i++) {
-               if (!valid_vl(i))
-                       continue;
-
-               if (changing[i]) {
-                       set_vl_shared(dd, i, 0);
-                       cur_bc.vl[i].shared = 0;
-               }
-       }
-
-       wait_for_vl_status_clear(dd, use_all_mask ? all_mask : changing_mask,
-                                "shared");
-
-       if (change_count > 0) {
-               for (i = 0; i < NUM_USABLE_VLS; i++) {
-                       if (!valid_vl(i))
-                               continue;
-
-                       if (lowering_dedicated[i]) {
-                               set_vl_dedicated(dd, i,
-                                                be16_to_cpu(new_bc->
-                                                            vl[i].dedicated));
-                               cur_bc.vl[i].dedicated =
-                                               new_bc->vl[i].dedicated;
-                       }
-               }
-
-               wait_for_vl_status_clear(dd, ld_mask, "dedicated");
-
-               /* now raise all dedicated that are going up */
-               for (i = 0; i < NUM_USABLE_VLS; i++) {
-                       if (!valid_vl(i))
-                               continue;
-
-                       if (be16_to_cpu(new_bc->vl[i].dedicated) >
-                                       be16_to_cpu(cur_bc.vl[i].dedicated))
-                               set_vl_dedicated(dd, i,
-                                                be16_to_cpu(new_bc->
-                                                            vl[i].dedicated));
-               }
-       }
-
-       /* next raise all shared that are going up */
-       for (i = 0; i < NUM_USABLE_VLS; i++) {
-               if (!valid_vl(i))
-                       continue;
-
-               if (be16_to_cpu(new_bc->vl[i].shared) >
-                               be16_to_cpu(cur_bc.vl[i].shared))
-                       set_vl_shared(dd, i, be16_to_cpu(new_bc->vl[i].shared));
-       }
-
-       /* finally raise the global shared */
-       if (be16_to_cpu(new_bc->overall_shared_limit) >
-           be16_to_cpu(cur_bc.overall_shared_limit))
-               set_global_shared(dd,
-                                 be16_to_cpu(new_bc->overall_shared_limit));
-
-       /* bracket the credit change with a total adjustment */
-       if (new_total < cur_total)
-               set_global_limit(dd, new_total);
-
-       /*
-        * Determine the actual number of operational VLs using the number of
-        * dedicated and shared credits for each VL.
-        */
-       if (change_count > 0) {
-               for (i = 0; i < TXE_NUM_DATA_VL; i++)
-                       if (be16_to_cpu(new_bc->vl[i].dedicated) > 0 ||
-                           be16_to_cpu(new_bc->vl[i].shared) > 0)
-                               vl_count++;
-               ppd->actual_vls_operational = vl_count;
-               ret = sdma_map_init(dd, ppd->port - 1, vl_count ?
-                                   ppd->actual_vls_operational :
-                                   ppd->vls_operational,
-                                   NULL);
-               if (ret == 0)
-                       ret = pio_map_init(dd, ppd->port - 1, vl_count ?
-                                          ppd->actual_vls_operational :
-                                          ppd->vls_operational, NULL);
-               if (ret)
-                       return ret;
-       }
-       return 0;
-}
-
-/*
- * Read the given fabric manager table. Return the size of the
- * table (in bytes) on success, and a negative error code on
- * failure.
- */
-int fm_get_table(struct hfi1_pportdata *ppd, int which, void *t)
-{
-       int size;
-       struct vl_arb_cache *vlc;
-
-       switch (which) {
-       case FM_TBL_VL_HIGH_ARB:
-               size = 256;
-               /*
-                * OPA specifies 128 elements (of 2 bytes each), though
-                * HFI supports only 16 elements in h/w.
-                */
-               vlc = vl_arb_lock_cache(ppd, HI_PRIO_TABLE);
-               vl_arb_get_cache(vlc, t);
-               vl_arb_unlock_cache(ppd, HI_PRIO_TABLE);
-               break;
-       case FM_TBL_VL_LOW_ARB:
-               size = 256;
-               /*
-                * OPA specifies 128 elements (of 2 bytes each), though
-                * HFI supports only 16 elements in h/w.
-                */
-               vlc = vl_arb_lock_cache(ppd, LO_PRIO_TABLE);
-               vl_arb_get_cache(vlc, t);
-               vl_arb_unlock_cache(ppd, LO_PRIO_TABLE);
-               break;
-       case FM_TBL_BUFFER_CONTROL:
-               size = get_buffer_control(ppd->dd, t, NULL);
-               break;
-       case FM_TBL_SC2VLNT:
-               size = get_sc2vlnt(ppd->dd, t);
-               break;
-       case FM_TBL_VL_PREEMPT_ELEMS:
-               size = 256;
-               /* OPA specifies 128 elements, of 2 bytes each */
-               get_vlarb_preempt(ppd->dd, OPA_MAX_VLS, t);
-               break;
-       case FM_TBL_VL_PREEMPT_MATRIX:
-               size = 256;
-               /*
-                * OPA specifies that this is the same size as the VL
-                * arbitration tables (i.e., 256 bytes).
-                */
-               break;
-       default:
-               return -EINVAL;
-       }
-       return size;
-}
-
-/*
- * Write the given fabric manager table.
- */
-int fm_set_table(struct hfi1_pportdata *ppd, int which, void *t)
-{
-       int ret = 0;
-       struct vl_arb_cache *vlc;
-
-       switch (which) {
-       case FM_TBL_VL_HIGH_ARB:
-               vlc = vl_arb_lock_cache(ppd, HI_PRIO_TABLE);
-               if (vl_arb_match_cache(vlc, t)) {
-                       vl_arb_unlock_cache(ppd, HI_PRIO_TABLE);
-                       break;
-               }
-               vl_arb_set_cache(vlc, t);
-               vl_arb_unlock_cache(ppd, HI_PRIO_TABLE);
-               ret = set_vl_weights(ppd, SEND_HIGH_PRIORITY_LIST,
-                                    VL_ARB_HIGH_PRIO_TABLE_SIZE, t);
-               break;
-       case FM_TBL_VL_LOW_ARB:
-               vlc = vl_arb_lock_cache(ppd, LO_PRIO_TABLE);
-               if (vl_arb_match_cache(vlc, t)) {
-                       vl_arb_unlock_cache(ppd, LO_PRIO_TABLE);
-                       break;
-               }
-               vl_arb_set_cache(vlc, t);
-               vl_arb_unlock_cache(ppd, LO_PRIO_TABLE);
-               ret = set_vl_weights(ppd, SEND_LOW_PRIORITY_LIST,
-                                    VL_ARB_LOW_PRIO_TABLE_SIZE, t);
-               break;
-       case FM_TBL_BUFFER_CONTROL:
-               ret = set_buffer_control(ppd, t);
-               break;
-       case FM_TBL_SC2VLNT:
-               set_sc2vlnt(ppd->dd, t);
-               break;
-       default:
-               ret = -EINVAL;
-       }
-       return ret;
-}
-
-/*
- * Disable all data VLs.
- *
- * Return 0 if disabled, non-zero if the VLs cannot be disabled.
- */
-static int disable_data_vls(struct hfi1_devdata *dd)
-{
-       if (is_ax(dd))
-               return 1;
-
-       pio_send_control(dd, PSC_DATA_VL_DISABLE);
-
-       return 0;
-}
-
-/*
- * open_fill_data_vls() - the counterpart to stop_drain_data_vls().
- * Just re-enables all data VLs (the "fill" part happens
- * automatically - the name was chosen for symmetry with
- * stop_drain_data_vls()).
- *
- * Return 0 if successful, non-zero if the VLs cannot be enabled.
- */
-int open_fill_data_vls(struct hfi1_devdata *dd)
-{
-       if (is_ax(dd))
-               return 1;
-
-       pio_send_control(dd, PSC_DATA_VL_ENABLE);
-
-       return 0;
-}
-
-/*
- * drain_data_vls() - assumes that disable_data_vls() has been called,
- * wait for occupancy (of per-VL FIFOs) for all contexts, and SDMA
- * engines to drop to 0.
- */
-static void drain_data_vls(struct hfi1_devdata *dd)
-{
-       sc_wait(dd);
-       sdma_wait(dd);
-       pause_for_credit_return(dd);
-}
-
-/*
- * stop_drain_data_vls() - disable, then drain all per-VL fifos.
- *
- * Use open_fill_data_vls() to resume using data VLs.  This pair is
- * meant to be used like this:
- *
- * stop_drain_data_vls(dd);
- * // do things with per-VL resources
- * open_fill_data_vls(dd);
- */
-int stop_drain_data_vls(struct hfi1_devdata *dd)
-{
-       int ret;
-
-       ret = disable_data_vls(dd);
-       if (ret == 0)
-               drain_data_vls(dd);
-
-       return ret;
-}
-
-/*
- * Convert a nanosecond time to a cclock count.  No matter how slow
- * the cclock, a non-zero ns will always have a non-zero result.
- */
-u32 ns_to_cclock(struct hfi1_devdata *dd, u32 ns)
-{
-       u32 cclocks;
-
-       if (dd->icode == ICODE_FPGA_EMULATION)
-               cclocks = (ns * 1000) / FPGA_CCLOCK_PS;
-       else  /* simulation pretends to be ASIC */
-               cclocks = (ns * 1000) / ASIC_CCLOCK_PS;
-       if (ns && !cclocks)     /* if ns nonzero, must be at least 1 */
-               cclocks = 1;
-       return cclocks;
-}
-
-/*
- * Convert a cclock count to nanoseconds. No matter how slow
- * the cclock, a non-zero cclocks will always have a non-zero result.
- */
-u32 cclock_to_ns(struct hfi1_devdata *dd, u32 cclocks)
-{
-       u32 ns;
-
-       if (dd->icode == ICODE_FPGA_EMULATION)
-               ns = (cclocks * FPGA_CCLOCK_PS) / 1000;
-       else  /* simulation pretends to be ASIC */
-               ns = (cclocks * ASIC_CCLOCK_PS) / 1000;
-       if (cclocks && !ns)
-               ns = 1;
-       return ns;
-}
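
Both conversion helpers above are scaled divisions plus a clamp so a
non-zero input never rounds down to zero.  A standalone sketch with an
assumed clock period of 1242 ps (the real FPGA/ASIC periods come from the
chip headers, not this sketch):

#include <stdint.h>
#include <stdio.h>

#define CCLOCK_PS 1242u         /* assumed clock period, in picoseconds */

static uint32_t ns_to_cclock(uint32_t ns)
{
        uint32_t cclocks = (ns * 1000) / CCLOCK_PS;

        if (ns && !cclocks)     /* non-zero input: at least one clock */
                cclocks = 1;
        return cclocks;
}

static uint32_t cclock_to_ns(uint32_t cclocks)
{
        uint32_t ns = (cclocks * CCLOCK_PS) / 1000;

        if (cclocks && !ns)     /* non-zero input: at least one ns */
                ns = 1;
        return ns;
}

int main(void)
{
        printf("1 ns     -> %u cclocks\n", ns_to_cclock(1)); /* clamped to 1 */
        printf("1 cclock -> %u ns\n", cclock_to_ns(1));      /* 1242/1000 = 1 */
        return 0;
}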
-
-/*
- * Dynamically adjust the receive interrupt timeout for a context based on
- * incoming packet rate.
- *
- * NOTE: Dynamic adjustment does not allow rcv_intr_count to be zero.
- */
-static void adjust_rcv_timeout(struct hfi1_ctxtdata *rcd, u32 npkts)
-{
-       struct hfi1_devdata *dd = rcd->dd;
-       u32 timeout = rcd->rcvavail_timeout;
-
-       /*
-        * This algorithm doubles or halves the timeout depending on whether
-        * the number of packets received in this interrupt was less than, or
-        * greater than or equal to, the interrupt count.
-        *
-        * The calculations below do not allow a steady state to be achieved.
-        * Only at the endpoints is it possible to have an unchanging
-        * timeout.
-        */
-       if (npkts < rcv_intr_count) {
-               /*
-                * Not enough packets arrived before the timeout, adjust
-                * timeout downward.
-                */
-               if (timeout < 2) /* already at minimum? */
-                       return;
-               timeout >>= 1;
-       } else {
-               /*
-                * More than enough packets arrived before the timeout, adjust
-                * timeout upward.
-                */
-               if (timeout >= dd->rcv_intr_timeout_csr) /* already at max? */
-                       return;
-               timeout = min(timeout << 1, dd->rcv_intr_timeout_csr);
-       }
-
-       rcd->rcvavail_timeout = timeout;
-       /*
-        * timeout cannot be larger than rcv_intr_timeout_csr which has already
-        * been verified to be in range
-        */
-       write_kctxt_csr(dd, rcd->ctxt, RCV_AVAIL_TIME_OUT,
-                       (u64)timeout <<
-                       RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT);
-}
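
The adjustment above is a plain multiplicative-increase/decrease scheme on
the interrupt timeout.  A standalone sketch of the same policy; the packet
threshold and CSR maximum below are assumed placeholder values standing in
for rcv_intr_count and dd->rcv_intr_timeout_csr:

#include <stdint.h>
#include <stdio.h>

#define RCV_INTR_COUNT   16u    /* assumed packet threshold */
#define TIMEOUT_CSR_MAX  840u   /* assumed CSR maximum */

static uint32_t adjust_timeout(uint32_t timeout, uint32_t npkts)
{
        if (npkts < RCV_INTR_COUNT) {
                if (timeout < 2)        /* already at minimum */
                        return timeout;
                return timeout >> 1;    /* too few packets: halve */
        }
        if (timeout >= TIMEOUT_CSR_MAX) /* already at maximum */
                return timeout;
        timeout <<= 1;                  /* enough packets: double, capped */
        return timeout < TIMEOUT_CSR_MAX ? timeout : TIMEOUT_CSR_MAX;
}

int main(void)
{
        uint32_t t = 64;

        t = adjust_timeout(t, 4);       /* light traffic -> 32 */
        t = adjust_timeout(t, 100);     /* heavy traffic -> 64 */
        printf("timeout = %u\n", t);
        return 0;
}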
-
-void update_usrhead(struct hfi1_ctxtdata *rcd, u32 hd, u32 updegr, u32 egrhd,
-                   u32 intr_adjust, u32 npkts)
-{
-       struct hfi1_devdata *dd = rcd->dd;
-       u64 reg;
-       u32 ctxt = rcd->ctxt;
-
-       /*
-        * Need to write timeout register before updating RcvHdrHead to ensure
-        * that a new value is used when the HW decides to restart counting.
-        */
-       if (intr_adjust)
-               adjust_rcv_timeout(rcd, npkts);
-       if (updegr) {
-               reg = (egrhd & RCV_EGR_INDEX_HEAD_HEAD_MASK)
-                       << RCV_EGR_INDEX_HEAD_HEAD_SHIFT;
-               write_uctxt_csr(dd, ctxt, RCV_EGR_INDEX_HEAD, reg);
-       }
-       mmiowb();
-       reg = ((u64)rcv_intr_count << RCV_HDR_HEAD_COUNTER_SHIFT) |
-               (((u64)hd & RCV_HDR_HEAD_HEAD_MASK)
-                       << RCV_HDR_HEAD_HEAD_SHIFT);
-       write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, reg);
-       mmiowb();
-}
-
-u32 hdrqempty(struct hfi1_ctxtdata *rcd)
-{
-       u32 head, tail;
-
-       head = (read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_HEAD)
-               & RCV_HDR_HEAD_HEAD_SMASK) >> RCV_HDR_HEAD_HEAD_SHIFT;
-
-       if (rcd->rcvhdrtail_kvaddr)
-               tail = get_rcvhdrtail(rcd);
-       else
-               tail = read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_TAIL);
-
-       return head == tail;
-}
-
-/*
- * Context Control and Receive Array encoding for buffer size:
- *     0x0 invalid
- *     0x1   4 KB
- *     0x2   8 KB
- *     0x3  16 KB
- *     0x4  32 KB
- *     0x5  64 KB
- *     0x6 128 KB
- *     0x7 256 KB
- *     0x8 512 KB (Receive Array only)
- *     0x9   1 MB (Receive Array only)
- *     0xa   2 MB (Receive Array only)
- *
- *     0xB-0xF - reserved (Receive Array only)
- *
- *
- * This routine assumes that the value has already been sanity checked.
- */
-static u32 encoded_size(u32 size)
-{
-       switch (size) {
-       case   4 * 1024: return 0x1;
-       case   8 * 1024: return 0x2;
-       case  16 * 1024: return 0x3;
-       case  32 * 1024: return 0x4;
-       case  64 * 1024: return 0x5;
-       case 128 * 1024: return 0x6;
-       case 256 * 1024: return 0x7;
-       case 512 * 1024: return 0x8;
-       case   1 * 1024 * 1024: return 0x9;
-       case   2 * 1024 * 1024: return 0xa;
-       }
-       return 0x1;     /* if invalid, go with the minimum size */
-}
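
For the valid power-of-two sizes, the table above is equivalent to computing
log2(size / 4 KB) + 1, which is a quick way to sanity-check the encodings.
A standalone sketch (this encoded_size is a re-implementation for
illustration only, not the driver function, and it only mirrors the table
for the sizes listed above):

#include <stdint.h>
#include <stdio.h>

/* Illustrative equivalent of the encoding table for the valid sizes. */
static uint32_t encoded_size(uint32_t size)
{
        uint32_t code = 0;

        if (size < 4 * 1024 || (size & (size - 1)))
                return 0x1;             /* invalid: fall back to minimum */
        for (size >>= 12; size > 1; size >>= 1) /* log2(size / 4 KB) */
                code++;
        return code + 1;
}

int main(void)
{
        printf("4 KB  -> 0x%x\n", encoded_size(4 * 1024));          /* 0x1 */
        printf("64 KB -> 0x%x\n", encoded_size(64 * 1024));         /* 0x5 */
        printf("2 MB  -> 0x%x\n", encoded_size(2 * 1024 * 1024));   /* 0xa */
        return 0;
}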
-
-void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt)
-{
-       struct hfi1_ctxtdata *rcd;
-       u64 rcvctrl, reg;
-       int did_enable = 0;
-
-       rcd = dd->rcd[ctxt];
-       if (!rcd)
-               return;
-
-       hfi1_cdbg(RCVCTRL, "ctxt %d op 0x%x", ctxt, op);
-
-       rcvctrl = read_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL);
-       /* if the context is already enabled, don't do the extra steps */
-       if ((op & HFI1_RCVCTRL_CTXT_ENB) &&
-           !(rcvctrl & RCV_CTXT_CTRL_ENABLE_SMASK)) {
-               /* reset the tail and hdr addresses, and sequence count */
-               write_kctxt_csr(dd, ctxt, RCV_HDR_ADDR,
-                               rcd->rcvhdrq_phys);
-               if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL))
-                       write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR,
-                                       rcd->rcvhdrqtailaddr_phys);
-               rcd->seq_cnt = 1;
-
-               /* reset the cached receive header queue head value */
-               rcd->head = 0;
-
-               /*
-                * Zero the receive header queue so we don't get false
-                * positives when checking the sequence number.  The
-                * sequence numbers could land exactly on the same spot.
-                * E.g. an rcd restart before the receive header queue wrapped.
-                */
-               memset(rcd->rcvhdrq, 0, rcd->rcvhdrq_size);
-
-               /* starting timeout */
-               rcd->rcvavail_timeout = dd->rcv_intr_timeout_csr;
-
-               /* enable the context */
-               rcvctrl |= RCV_CTXT_CTRL_ENABLE_SMASK;
-
-               /* clean the egr buffer size first */
-               rcvctrl &= ~RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK;
-               rcvctrl |= ((u64)encoded_size(rcd->egrbufs.rcvtid_size)
-                               & RCV_CTXT_CTRL_EGR_BUF_SIZE_MASK)
-                                       << RCV_CTXT_CTRL_EGR_BUF_SIZE_SHIFT;
-
-               /* zero RcvHdrHead - set RcvHdrHead.Counter after enable */
-               write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0);
-               did_enable = 1;
-
-               /* zero RcvEgrIndexHead */
-               write_uctxt_csr(dd, ctxt, RCV_EGR_INDEX_HEAD, 0);
-
-               /* set eager count and base index */
-               reg = (((u64)(rcd->egrbufs.alloced >> RCV_SHIFT)
-                       & RCV_EGR_CTRL_EGR_CNT_MASK)
-                      << RCV_EGR_CTRL_EGR_CNT_SHIFT) |
-                       (((rcd->eager_base >> RCV_SHIFT)
-                         & RCV_EGR_CTRL_EGR_BASE_INDEX_MASK)
-                        << RCV_EGR_CTRL_EGR_BASE_INDEX_SHIFT);
-               write_kctxt_csr(dd, ctxt, RCV_EGR_CTRL, reg);
-
-               /*
-                * Set TID (expected) count and base index.
-                * rcd->expected_count is set to individual RcvArray entries,
-                * not pairs, and the CSR takes a pair-count in groups of
-                * four, so divide by 8.
-                */
-               reg = (((rcd->expected_count >> RCV_SHIFT)
-                                       & RCV_TID_CTRL_TID_PAIR_CNT_MASK)
-                               << RCV_TID_CTRL_TID_PAIR_CNT_SHIFT) |
-                     (((rcd->expected_base >> RCV_SHIFT)
-                                       & RCV_TID_CTRL_TID_BASE_INDEX_MASK)
-                               << RCV_TID_CTRL_TID_BASE_INDEX_SHIFT);
-               write_kctxt_csr(dd, ctxt, RCV_TID_CTRL, reg);
-               if (ctxt == HFI1_CTRL_CTXT)
-                       write_csr(dd, RCV_VL15, HFI1_CTRL_CTXT);
-       }
-       if (op & HFI1_RCVCTRL_CTXT_DIS) {
-               write_csr(dd, RCV_VL15, 0);
-               /*
-                * When the receive context is being disabled, turn on tail
-                * update with a dummy tail address and then disable the
-                * receive context.
-                */
-               if (dd->rcvhdrtail_dummy_physaddr) {
-                       write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR,
-                                       dd->rcvhdrtail_dummy_physaddr);
-                       /* Enabling RcvCtxtCtrl.TailUpd is intentional. */
-                       rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK;
-               }
-
-               rcvctrl &= ~RCV_CTXT_CTRL_ENABLE_SMASK;
-       }
-       if (op & HFI1_RCVCTRL_INTRAVAIL_ENB)
-               rcvctrl |= RCV_CTXT_CTRL_INTR_AVAIL_SMASK;
-       if (op & HFI1_RCVCTRL_INTRAVAIL_DIS)
-               rcvctrl &= ~RCV_CTXT_CTRL_INTR_AVAIL_SMASK;
-       if (op & HFI1_RCVCTRL_TAILUPD_ENB && rcd->rcvhdrqtailaddr_phys)
-               rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK;
-       if (op & HFI1_RCVCTRL_TAILUPD_DIS) {
-               /* See comment on RcvCtxtCtrl.TailUpd above */
-               if (!(op & HFI1_RCVCTRL_CTXT_DIS))
-                       rcvctrl &= ~RCV_CTXT_CTRL_TAIL_UPD_SMASK;
-       }
-       if (op & HFI1_RCVCTRL_TIDFLOW_ENB)
-               rcvctrl |= RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK;
-       if (op & HFI1_RCVCTRL_TIDFLOW_DIS)
-               rcvctrl &= ~RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK;
-       if (op & HFI1_RCVCTRL_ONE_PKT_EGR_ENB) {
-               /*
-                * In one-packet-per-eager mode, the size comes from
-                * the RcvArray entry.
-                */
-               rcvctrl &= ~RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK;
-               rcvctrl |= RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK;
-       }
-       if (op & HFI1_RCVCTRL_ONE_PKT_EGR_DIS)
-               rcvctrl &= ~RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK;
-       if (op & HFI1_RCVCTRL_NO_RHQ_DROP_ENB)
-               rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK;
-       if (op & HFI1_RCVCTRL_NO_RHQ_DROP_DIS)
-               rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK;
-       if (op & HFI1_RCVCTRL_NO_EGR_DROP_ENB)
-               rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK;
-       if (op & HFI1_RCVCTRL_NO_EGR_DROP_DIS)
-               rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK;
-       rcd->rcvctrl = rcvctrl;
-       hfi1_cdbg(RCVCTRL, "ctxt %d rcvctrl 0x%llx\n", ctxt, rcvctrl);
-       write_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL, rcd->rcvctrl);
-
-       /* work around sticky RcvCtxtStatus.BlockedRHQFull */
-       if (did_enable &&
-           (rcvctrl & RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK)) {
-               reg = read_kctxt_csr(dd, ctxt, RCV_CTXT_STATUS);
-               if (reg != 0) {
-                       dd_dev_info(dd, "ctxt %d status %lld (blocked)\n",
-                                   ctxt, reg);
-                       read_uctxt_csr(dd, ctxt, RCV_HDR_HEAD);
-                       write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0x10);
-                       write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0x00);
-                       read_uctxt_csr(dd, ctxt, RCV_HDR_HEAD);
-                       reg = read_kctxt_csr(dd, ctxt, RCV_CTXT_STATUS);
-                       dd_dev_info(dd, "ctxt %d status %lld (%s blocked)\n",
-                                   ctxt, reg, reg == 0 ? "not" : "still");
-               }
-       }
-
-       if (did_enable) {
-               /*
-                * The interrupt timeout and count must be set after
-                * the context is enabled to take effect.
-                */
-               /* set interrupt timeout */
-               write_kctxt_csr(dd, ctxt, RCV_AVAIL_TIME_OUT,
-                               (u64)rcd->rcvavail_timeout <<
-                               RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT);
-
-               /* set RcvHdrHead.Counter, zero RcvHdrHead.Head (again) */
-               reg = (u64)rcv_intr_count << RCV_HDR_HEAD_COUNTER_SHIFT;
-               write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, reg);
-       }
-
-       if (op & (HFI1_RCVCTRL_TAILUPD_DIS | HFI1_RCVCTRL_CTXT_DIS))
-               /*
-                * If the context has been disabled and the Tail Update has
-                * been cleared, set the RCV_HDR_TAIL_ADDR CSR to the dummy
-                * address so it doesn't contain an invalid address.
-                */
-               write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR,
-                               dd->rcvhdrtail_dummy_physaddr);
-}
-
-u32 hfi1_read_cntrs(struct hfi1_devdata *dd, char **namep, u64 **cntrp)
-{
-       int ret;
-       u64 val = 0;
-
-       if (namep) {
-               ret = dd->cntrnameslen;
-               *namep = dd->cntrnames;
-       } else {
-               const struct cntr_entry *entry;
-               int i, j;
-
-               ret = (dd->ndevcntrs) * sizeof(u64);
-
-               /* Get the start of the block of counters */
-               *cntrp = dd->cntrs;
-
-               /*
-                * Now go and fill in each counter in the block.
-                */
-               for (i = 0; i < DEV_CNTR_LAST; i++) {
-                       entry = &dev_cntrs[i];
-                       hfi1_cdbg(CNTR, "reading %s", entry->name);
-                       if (entry->flags & CNTR_DISABLED) {
-                               /* Nothing */
-                               hfi1_cdbg(CNTR, "\tDisabled\n");
-                       } else {
-                               if (entry->flags & CNTR_VL) {
-                                       hfi1_cdbg(CNTR, "\tPer VL\n");
-                                       for (j = 0; j < C_VL_COUNT; j++) {
-                                               val = entry->rw_cntr(entry,
-                                                                 dd, j,
-                                                                 CNTR_MODE_R,
-                                                                 0);
-                                               hfi1_cdbg(
-                                                  CNTR,
-                                                  "\t\tRead 0x%llx for %d\n",
-                                                  val, j);
-                                               dd->cntrs[entry->offset + j] =
-                                                                           val;
-                                       }
-                               } else if (entry->flags & CNTR_SDMA) {
-                                       hfi1_cdbg(CNTR,
-                                                 "\t Per SDMA Engine\n");
-                                       for (j = 0; j < dd->chip_sdma_engines;
-                                            j++) {
-                                               val =
-                                               entry->rw_cntr(entry, dd, j,
-                                                              CNTR_MODE_R, 0);
-                                               hfi1_cdbg(CNTR,
-                                                         "\t\tRead 0x%llx for %d\n",
-                                                         val, j);
-                                               dd->cntrs[entry->offset + j] =
-                                                                       val;
-                                       }
-                               } else {
-                                       val = entry->rw_cntr(entry, dd,
-                                                       CNTR_INVALID_VL,
-                                                       CNTR_MODE_R, 0);
-                                       dd->cntrs[entry->offset] = val;
-                                       hfi1_cdbg(CNTR, "\tRead 0x%llx", val);
-                               }
-                       }
-               }
-       }
-       return ret;
-}
-
-/*
- * Used by sysfs to create the files through which hfi1 stats are read
- */
-u32 hfi1_read_portcntrs(struct hfi1_pportdata *ppd, char **namep, u64 **cntrp)
-{
-       int ret;
-       u64 val = 0;
-
-       if (namep) {
-               ret = ppd->dd->portcntrnameslen;
-               *namep = ppd->dd->portcntrnames;
-       } else {
-               const struct cntr_entry *entry;
-               int i, j;
-
-               ret = ppd->dd->nportcntrs * sizeof(u64);
-               *cntrp = ppd->cntrs;
-
-               for (i = 0; i < PORT_CNTR_LAST; i++) {
-                       entry = &port_cntrs[i];
-                       hfi1_cdbg(CNTR, "reading %s", entry->name);
-                       if (entry->flags & CNTR_DISABLED) {
-                               /* Nothing */
-                               hfi1_cdbg(CNTR, "\tDisabled\n");
-                               continue;
-                       }
-
-                       if (entry->flags & CNTR_VL) {
-                               hfi1_cdbg(CNTR, "\tPer VL");
-                               for (j = 0; j < C_VL_COUNT; j++) {
-                                       val = entry->rw_cntr(entry, ppd, j,
-                                                              CNTR_MODE_R,
-                                                              0);
-                                       hfi1_cdbg(
-                                          CNTR,
-                                          "\t\tRead 0x%llx for %d",
-                                          val, j);
-                                       ppd->cntrs[entry->offset + j] = val;
-                               }
-                       } else {
-                               val = entry->rw_cntr(entry, ppd,
-                                                      CNTR_INVALID_VL,
-                                                      CNTR_MODE_R,
-                                                      0);
-                               ppd->cntrs[entry->offset] = val;
-                               hfi1_cdbg(CNTR, "\tRead 0x%llx", val);
-                       }
-               }
-       }
-       return ret;
-}
-
-static void free_cntrs(struct hfi1_devdata *dd)
-{
-       struct hfi1_pportdata *ppd;
-       int i;
-
-       if (dd->synth_stats_timer.data)
-               del_timer_sync(&dd->synth_stats_timer);
-       dd->synth_stats_timer.data = 0;
-       ppd = (struct hfi1_pportdata *)(dd + 1);
-       for (i = 0; i < dd->num_pports; i++, ppd++) {
-               kfree(ppd->cntrs);
-               kfree(ppd->scntrs);
-               free_percpu(ppd->ibport_data.rvp.rc_acks);
-               free_percpu(ppd->ibport_data.rvp.rc_qacks);
-               free_percpu(ppd->ibport_data.rvp.rc_delayed_comp);
-               ppd->cntrs = NULL;
-               ppd->scntrs = NULL;
-               ppd->ibport_data.rvp.rc_acks = NULL;
-               ppd->ibport_data.rvp.rc_qacks = NULL;
-               ppd->ibport_data.rvp.rc_delayed_comp = NULL;
-       }
-       kfree(dd->portcntrnames);
-       dd->portcntrnames = NULL;
-       kfree(dd->cntrs);
-       dd->cntrs = NULL;
-       kfree(dd->scntrs);
-       dd->scntrs = NULL;
-       kfree(dd->cntrnames);
-       dd->cntrnames = NULL;
-}
-
-#define CNTR_MAX 0xFFFFFFFFFFFFFFFFULL
-#define CNTR_32BIT_MAX 0x00000000FFFFFFFF
-
-static u64 read_dev_port_cntr(struct hfi1_devdata *dd, struct cntr_entry *entry,
-                             u64 *psval, void *context, int vl)
-{
-       u64 val;
-       u64 sval = *psval;
-
-       if (entry->flags & CNTR_DISABLED) {
-               dd_dev_err(dd, "Counter %s not enabled", entry->name);
-               return 0;
-       }
-
-       hfi1_cdbg(CNTR, "cntr: %s vl %d psval 0x%llx", entry->name, vl, *psval);
-
-       val = entry->rw_cntr(entry, context, vl, CNTR_MODE_R, 0);
-
-       /* If it's a synthetic counter there is more work we need to do */
-       if (entry->flags & CNTR_SYNTH) {
-               if (sval == CNTR_MAX) {
-                       /* No need to read already saturated */
-                       return CNTR_MAX;
-               }
-
-               if (entry->flags & CNTR_32BIT) {
-                       /* 32bit counters can wrap multiple times */
-                       u64 upper = sval >> 32;
-                       u64 lower = (sval << 32) >> 32;
-
-                       if (lower > val) { /* hw wrapped */
-                               if (upper == CNTR_32BIT_MAX)
-                                       val = CNTR_MAX;
-                               else
-                                       upper++;
-                       }
-
-                       if (val != CNTR_MAX)
-                               val = (upper << 32) | val;
-
-               } else {
-                       /* If we rolled we are saturated */
-                       if ((val < sval) || (val > CNTR_MAX))
-                               val = CNTR_MAX;
-               }
-       }
-
-       *psval = val;
-
-       hfi1_cdbg(CNTR, "\tNew val=0x%llx", val);
-
-       return val;
-}
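
The CNTR_32BIT branch above is the usual trick for widening a wrapping
32-bit hardware counter into a 64-bit software value: remember the previous
64-bit count, and when the new low 32 bits are smaller than the old ones,
assume exactly one wrap and bump the upper half.  A standalone sketch of
just that widening step (the saturation at CNTR_MAX is omitted for brevity):

#include <stdint.h>
#include <stdio.h>

/* Widen a wrapping 32-bit hardware reading into a running 64-bit count. */
static uint64_t widen32(uint64_t prev64, uint32_t hw_now)
{
        uint64_t upper = prev64 >> 32;
        uint64_t lower = prev64 & 0xffffffffull;

        if (hw_now < lower)     /* the hardware counter wrapped once */
                upper++;
        return (upper << 32) | hw_now;
}

int main(void)
{
        uint64_t val = 0xfffffff0ull;           /* just below a 32-bit wrap */

        val = widen32(val, 0x00000010);         /* wrapped -> 0x1_00000010 */
        printf("widened counter = 0x%llx\n", (unsigned long long)val);
        return 0;
}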
-
-static u64 write_dev_port_cntr(struct hfi1_devdata *dd,
-                              struct cntr_entry *entry,
-                              u64 *psval, void *context, int vl, u64 data)
-{
-       u64 val;
-
-       if (entry->flags & CNTR_DISABLED) {
-               dd_dev_err(dd, "Counter %s not enabled", entry->name);
-               return 0;
-       }
-
-       hfi1_cdbg(CNTR, "cntr: %s vl %d psval 0x%llx", entry->name, vl, *psval);
-
-       if (entry->flags & CNTR_SYNTH) {
-               *psval = data;
-               if (entry->flags & CNTR_32BIT) {
-                       val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W,
-                                            (data << 32) >> 32);
-                       val = data; /* return the full 64bit value */
-               } else {
-                       val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W,
-                                            data);
-               }
-       } else {
-               val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W, data);
-       }
-
-       *psval = val;
-
-       hfi1_cdbg(CNTR, "\tNew val=0x%llx", val);
-
-       return val;
-}
-
-u64 read_dev_cntr(struct hfi1_devdata *dd, int index, int vl)
-{
-       struct cntr_entry *entry;
-       u64 *sval;
-
-       entry = &dev_cntrs[index];
-       sval = dd->scntrs + entry->offset;
-
-       if (vl != CNTR_INVALID_VL)
-               sval += vl;
-
-       return read_dev_port_cntr(dd, entry, sval, dd, vl);
-}
-
-u64 write_dev_cntr(struct hfi1_devdata *dd, int index, int vl, u64 data)
-{
-       struct cntr_entry *entry;
-       u64 *sval;
-
-       entry = &dev_cntrs[index];
-       sval = dd->scntrs + entry->offset;
-
-       if (vl != CNTR_INVALID_VL)
-               sval += vl;
-
-       return write_dev_port_cntr(dd, entry, sval, dd, vl, data);
-}
-
-u64 read_port_cntr(struct hfi1_pportdata *ppd, int index, int vl)
-{
-       struct cntr_entry *entry;
-       u64 *sval;
-
-       entry = &port_cntrs[index];
-       sval = ppd->scntrs + entry->offset;
-
-       if (vl != CNTR_INVALID_VL)
-               sval += vl;
-
-       if ((index >= C_RCV_HDR_OVF_FIRST + ppd->dd->num_rcv_contexts) &&
-           (index <= C_RCV_HDR_OVF_LAST)) {
-               /* We do not want to bother for disabled contexts */
-               return 0;
-       }
-
-       return read_dev_port_cntr(ppd->dd, entry, sval, ppd, vl);
-}
-
-u64 write_port_cntr(struct hfi1_pportdata *ppd, int index, int vl, u64 data)
-{
-       struct cntr_entry *entry;
-       u64 *sval;
-
-       entry = &port_cntrs[index];
-       sval = ppd->scntrs + entry->offset;
-
-       if (vl != CNTR_INVALID_VL)
-               sval += vl;
-
-       if ((index >= C_RCV_HDR_OVF_FIRST + ppd->dd->num_rcv_contexts) &&
-           (index <= C_RCV_HDR_OVF_LAST)) {
-               /* We do not want to bother for disabled contexts */
-               return 0;
-       }
-
-       return write_dev_port_cntr(ppd->dd, entry, sval, ppd, vl, data);
-}
-
-static void update_synth_timer(unsigned long opaque)
-{
-       u64 cur_tx;
-       u64 cur_rx;
-       u64 total_flits;
-       u8 update = 0;
-       int i, j, vl;
-       struct hfi1_pportdata *ppd;
-       struct cntr_entry *entry;
-
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)opaque;
-
-       /*
-        * Rather than keep beating on the CSRs, pick a minimal set that we can
-        * check to watch for a potential rollover. We do this by looking at
-        * the number of flits sent/received. If the total flits exceed 32 bits
-        * then we have to iterate over all the counters and update them.
-        */
-       entry = &dev_cntrs[C_DC_RCV_FLITS];
-       cur_rx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, CNTR_MODE_R, 0);
-
-       entry = &dev_cntrs[C_DC_XMIT_FLITS];
-       cur_tx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, CNTR_MODE_R, 0);
-
-       hfi1_cdbg(
-           CNTR,
-           "[%d] curr tx=0x%llx rx=0x%llx :: last tx=0x%llx rx=0x%llx\n",
-           dd->unit, cur_tx, cur_rx, dd->last_tx, dd->last_rx);
-
-       if ((cur_tx < dd->last_tx) || (cur_rx < dd->last_rx)) {
-               /*
-                * May not be strictly necessary to update but it won't hurt and
-                * simplifies the logic here.
-                */
-               update = 1;
-               hfi1_cdbg(CNTR, "[%d] Tripwire counter rolled, updating",
-                         dd->unit);
-       } else {
-               total_flits = (cur_tx - dd->last_tx) + (cur_rx - dd->last_rx);
-               hfi1_cdbg(CNTR,
-                         "[%d] total flits 0x%llx limit 0x%llx\n", dd->unit,
-                         total_flits, (u64)CNTR_32BIT_MAX);
-               if (total_flits >= CNTR_32BIT_MAX) {
-                       hfi1_cdbg(CNTR, "[%d] 32bit limit hit, updating",
-                                 dd->unit);
-                       update = 1;
-               }
-       }
-
-       if (update) {
-               hfi1_cdbg(CNTR, "[%d] Updating dd and ppd counters", dd->unit);
-               for (i = 0; i < DEV_CNTR_LAST; i++) {
-                       entry = &dev_cntrs[i];
-                       if (entry->flags & CNTR_VL) {
-                               for (vl = 0; vl < C_VL_COUNT; vl++)
-                                       read_dev_cntr(dd, i, vl);
-                       } else {
-                               read_dev_cntr(dd, i, CNTR_INVALID_VL);
-                       }
-               }
-               ppd = (struct hfi1_pportdata *)(dd + 1);
-               for (i = 0; i < dd->num_pports; i++, ppd++) {
-                       for (j = 0; j < PORT_CNTR_LAST; j++) {
-                               entry = &port_cntrs[j];
-                               if (entry->flags & CNTR_VL) {
-                                       for (vl = 0; vl < C_VL_COUNT; vl++)
-                                               read_port_cntr(ppd, j, vl);
-                               } else {
-                                       read_port_cntr(ppd, j, CNTR_INVALID_VL);
-                               }
-                       }
-               }
-
-               /*
-                * We want the value in the register. The goal is to keep track
-                * of the number of "ticks", not the counter value. In other
-                * words, if the register rolls we want to notice it and go
-                * ahead and force an update.
-                */
-               entry = &dev_cntrs[C_DC_XMIT_FLITS];
-               dd->last_tx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL,
-                                               CNTR_MODE_R, 0);
-
-               entry = &dev_cntrs[C_DC_RCV_FLITS];
-               dd->last_rx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL,
-                                               CNTR_MODE_R, 0);
-
-               hfi1_cdbg(CNTR, "[%d] setting last tx/rx to 0x%llx 0x%llx",
-                         dd->unit, dd->last_tx, dd->last_rx);
-
-       } else {
-               hfi1_cdbg(CNTR, "[%d] No update necessary", dd->unit);
-       }
-
-       mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME);
-}
-
-#define C_MAX_NAME 13 /* 12 chars + one for \0 */
-static int init_cntrs(struct hfi1_devdata *dd)
-{
-       int i, rcv_ctxts, j;
-       size_t sz;
-       char *p;
-       char name[C_MAX_NAME];
-       struct hfi1_pportdata *ppd;
-       const char *bit_type_32 = ",32";
-       const int bit_type_32_sz = strlen(bit_type_32);
-
-       /* set up the stats timer; the add_timer is done at the end */
-       setup_timer(&dd->synth_stats_timer, update_synth_timer,
-                   (unsigned long)dd);
-
-       /***********************/
-       /* per device counters */
-       /***********************/
-
-       /* size names and determine how many we have */
-       dd->ndevcntrs = 0;
-       sz = 0;
-
-       for (i = 0; i < DEV_CNTR_LAST; i++) {
-               if (dev_cntrs[i].flags & CNTR_DISABLED) {
-                       hfi1_dbg_early("\tSkipping %s\n", dev_cntrs[i].name);
-                       continue;
-               }
-
-               if (dev_cntrs[i].flags & CNTR_VL) {
-                       dev_cntrs[i].offset = dd->ndevcntrs;
-                       for (j = 0; j < C_VL_COUNT; j++) {
-                               snprintf(name, C_MAX_NAME, "%s%d",
-                                        dev_cntrs[i].name, vl_from_idx(j));
-                               sz += strlen(name);
-                               /* Add ",32" for 32-bit counters */
-                               if (dev_cntrs[i].flags & CNTR_32BIT)
-                                       sz += bit_type_32_sz;
-                               sz++;
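-                               /* +1 for the '\n' appended when the name is written out */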
-                               dd->ndevcntrs++;
-                       }
-               } else if (dev_cntrs[i].flags & CNTR_SDMA) {
-                       dev_cntrs[i].offset = dd->ndevcntrs;
-                       for (j = 0; j < dd->chip_sdma_engines; j++) {
-                               snprintf(name, C_MAX_NAME, "%s%d",
-                                        dev_cntrs[i].name, j);
-                               sz += strlen(name);
-                               /* Add ",32" for 32-bit counters */
-                               if (dev_cntrs[i].flags & CNTR_32BIT)
-                                       sz += bit_type_32_sz;
-                               sz++;
-                               dd->ndevcntrs++;
-                       }
-               } else {
-                       /* +1 for newline. */
-                       sz += strlen(dev_cntrs[i].name) + 1;
-                       /* Add ",32" for 32-bit counters */
-                       if (dev_cntrs[i].flags & CNTR_32BIT)
-                               sz += bit_type_32_sz;
-                       dev_cntrs[i].offset = dd->ndevcntrs;
-                       dd->ndevcntrs++;
-               }
-       }
-
-       /* allocate space for the counter values */
-       dd->cntrs = kcalloc(dd->ndevcntrs, sizeof(u64), GFP_KERNEL);
-       if (!dd->cntrs)
-               goto bail;
-
-       dd->scntrs = kcalloc(dd->ndevcntrs, sizeof(u64), GFP_KERNEL);
-       if (!dd->scntrs)
-               goto bail;
-
-       /* allocate space for the counter names */
-       dd->cntrnameslen = sz;
-       dd->cntrnames = kmalloc(sz, GFP_KERNEL);
-       if (!dd->cntrnames)
-               goto bail;
-
-       /* fill in the names */
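-       /* the writes below must total exactly sz bytes (mirrors the sizing above) */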
-       for (p = dd->cntrnames, i = 0; i < DEV_CNTR_LAST; i++) {
-               if (dev_cntrs[i].flags & CNTR_DISABLED) {
-                       /* Nothing */
-               } else if (dev_cntrs[i].flags & CNTR_VL) {
-                       for (j = 0; j < C_VL_COUNT; j++) {
-                               snprintf(name, C_MAX_NAME, "%s%d",
-                                        dev_cntrs[i].name,
-                                        vl_from_idx(j));
-                               memcpy(p, name, strlen(name));
-                               p += strlen(name);
-
-                               /* Counter is 32 bits */
-                               if (dev_cntrs[i].flags & CNTR_32BIT) {
-                                       memcpy(p, bit_type_32, bit_type_32_sz);
-                                       p += bit_type_32_sz;
-                               }
-
-                               *p++ = '\n';
-                       }
-               } else if (dev_cntrs[i].flags & CNTR_SDMA) {
-                       for (j = 0; j < dd->chip_sdma_engines; j++) {
-                               snprintf(name, C_MAX_NAME, "%s%d",
-                                        dev_cntrs[i].name, j);
-                               memcpy(p, name, strlen(name));
-                               p += strlen(name);
-
-                               /* Counter is 32 bits */
-                               if (dev_cntrs[i].flags & CNTR_32BIT) {
-                                       memcpy(p, bit_type_32, bit_type_32_sz);
-                                       p += bit_type_32_sz;
-                               }
-
-                               *p++ = '\n';
-                       }
-               } else {
-                       memcpy(p, dev_cntrs[i].name, strlen(dev_cntrs[i].name));
-                       p += strlen(dev_cntrs[i].name);
-
-                       /* Counter is 32 bits */
-                       if (dev_cntrs[i].flags & CNTR_32BIT) {
-                               memcpy(p, bit_type_32, bit_type_32_sz);
-                               p += bit_type_32_sz;
-                       }
-
-                       *p++ = '\n';
-               }
-       }
-
-       /*********************/
-       /* per port counters */
-       /*********************/
-
-       /*
-        * Go through the counters for the overflows and disable the ones we
-        * don't need. This varies based on platform so we need to do it
-        * dynamically here.
-        */
-       rcv_ctxts = dd->num_rcv_contexts;
-       for (i = C_RCV_HDR_OVF_FIRST + rcv_ctxts;
-            i <= C_RCV_HDR_OVF_LAST; i++) {
-               port_cntrs[i].flags |= CNTR_DISABLED;
-       }
-
-       /* size port counter names and determine how many we have */
-       sz = 0;
-       dd->nportcntrs = 0;
-       for (i = 0; i < PORT_CNTR_LAST; i++) {
-               if (port_cntrs[i].flags & CNTR_DISABLED) {
-                       hfi1_dbg_early("\tSkipping %s\n", port_cntrs[i].name);
-                       continue;
-               }
-
-               if (port_cntrs[i].flags & CNTR_VL) {
-                       port_cntrs[i].offset = dd->nportcntrs;
-                       for (j = 0; j < C_VL_COUNT; j++) {
-                               snprintf(name, C_MAX_NAME, "%s%d",
-                                        port_cntrs[i].name, vl_from_idx(j));
-                               sz += strlen(name);
-                               /* Add ",32" for 32-bit counters */
-                               if (port_cntrs[i].flags & CNTR_32BIT)
-                                       sz += bit_type_32_sz;
-                               sz++;
-                               dd->nportcntrs++;
-                       }
-               } else {
-                       /* +1 for newline */
-                       sz += strlen(port_cntrs[i].name) + 1;
-                       /* Add ",32" for 32-bit counters */
-                       if (port_cntrs[i].flags & CNTR_32BIT)
-                               sz += bit_type_32_sz;
-                       port_cntrs[i].offset = dd->nportcntrs;
-                       dd->nportcntrs++;
-               }
-       }
-
-       /* allocate space for the counter names */
-       dd->portcntrnameslen = sz;
-       dd->portcntrnames = kmalloc(sz, GFP_KERNEL);
-       if (!dd->portcntrnames)
-               goto bail;
-
-       /* fill in port cntr names */
-       for (p = dd->portcntrnames, i = 0; i < PORT_CNTR_LAST; i++) {
-               if (port_cntrs[i].flags & CNTR_DISABLED)
-                       continue;
-
-               if (port_cntrs[i].flags & CNTR_VL) {
-                       for (j = 0; j < C_VL_COUNT; j++) {
-                               snprintf(name, C_MAX_NAME, "%s%d",
-                                        port_cntrs[i].name, vl_from_idx(j));
-                               memcpy(p, name, strlen(name));
-                               p += strlen(name);
-
-                               /* Counter is 32 bits */
-                               if (port_cntrs[i].flags & CNTR_32BIT) {
-                                       memcpy(p, bit_type_32, bit_type_32_sz);
-                                       p += bit_type_32_sz;
-                               }
-
-                               *p++ = '\n';
-                       }
-               } else {
-                       memcpy(p, port_cntrs[i].name,
-                              strlen(port_cntrs[i].name));
-                       p += strlen(port_cntrs[i].name);
-
-                       /* Counter is 32 bits */
-                       if (port_cntrs[i].flags & CNTR_32BIT) {
-                               memcpy(p, bit_type_32, bit_type_32_sz);
-                               p += bit_type_32_sz;
-                       }
-
-                       *p++ = '\n';
-               }
-       }
-
-       /* allocate per port storage for counter values */
-       ppd = (struct hfi1_pportdata *)(dd + 1);
-       for (i = 0; i < dd->num_pports; i++, ppd++) {
-               ppd->cntrs = kcalloc(dd->nportcntrs, sizeof(u64), GFP_KERNEL);
-               if (!ppd->cntrs)
-                       goto bail;
-
-               ppd->scntrs = kcalloc(dd->nportcntrs, sizeof(u64), GFP_KERNEL);
-               if (!ppd->scntrs)
-                       goto bail;
-       }
-
-       /* CPU counters need to be allocated and zeroed */
-       if (init_cpu_counters(dd))
-               goto bail;
-
-       mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME);
-       return 0;
-bail:
-       free_cntrs(dd);
-       return -ENOMEM;
-}
-
-static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate)
-{
-       switch (chip_lstate) {
-       default:
-               dd_dev_err(dd,
-                          "Unknown logical state 0x%x, reporting IB_PORT_DOWN\n",
-                          chip_lstate);
-               /* fall through */
-       case LSTATE_DOWN:
-               return IB_PORT_DOWN;
-       case LSTATE_INIT:
-               return IB_PORT_INIT;
-       case LSTATE_ARMED:
-               return IB_PORT_ARMED;
-       case LSTATE_ACTIVE:
-               return IB_PORT_ACTIVE;
-       }
-}
-
-u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate)
-{
-       /* look at the HFI meta-states only */
-       switch (chip_pstate & 0xf0) {
-       default:
-               dd_dev_err(dd, "Unexpected chip physical state of 0x%x\n",
-                          chip_pstate);
-               /* fall through */
-       case PLS_DISABLED:
-               return IB_PORTPHYSSTATE_DISABLED;
-       case PLS_OFFLINE:
-               return OPA_PORTPHYSSTATE_OFFLINE;
-       case PLS_POLLING:
-               return IB_PORTPHYSSTATE_POLLING;
-       case PLS_CONFIGPHY:
-               return IB_PORTPHYSSTATE_TRAINING;
-       case PLS_LINKUP:
-               return IB_PORTPHYSSTATE_LINKUP;
-       case PLS_PHYTEST:
-               return IB_PORTPHYSSTATE_PHY_TEST;
-       }
-}
-
-/* return the OPA port logical state name */
-const char *opa_lstate_name(u32 lstate)
-{
-       static const char * const port_logical_names[] = {
-               "PORT_NOP",
-               "PORT_DOWN",
-               "PORT_INIT",
-               "PORT_ARMED",
-               "PORT_ACTIVE",
-               "PORT_ACTIVE_DEFER",
-       };
-       if (lstate < ARRAY_SIZE(port_logical_names))
-               return port_logical_names[lstate];
-       return "unknown";
-}
-
-/* return the OPA port physical state name */
-const char *opa_pstate_name(u32 pstate)
-{
-       static const char * const port_physical_names[] = {
-               "PHYS_NOP",
-               "reserved1",
-               "PHYS_POLL",
-               "PHYS_DISABLED",
-               "PHYS_TRAINING",
-               "PHYS_LINKUP",
-               "PHYS_LINK_ERR_RECOVER",
-               "PHYS_PHY_TEST",
-               "reserved8",
-               "PHYS_OFFLINE",
-               "PHYS_GANGED",
-               "PHYS_TEST",
-       };
-       if (pstate < ARRAY_SIZE(port_physical_names))
-               return port_physical_names[pstate];
-       return "unknown";
-}
-
-/*
- * Read the hardware link state and set the driver's cached value of it.
- * Return the (new) current value.
- */
-u32 get_logical_state(struct hfi1_pportdata *ppd)
-{
-       u32 new_state;
-
-       new_state = chip_to_opa_lstate(ppd->dd, read_logical_state(ppd->dd));
-       if (new_state != ppd->lstate) {
-               dd_dev_info(ppd->dd, "logical state changed to %s (0x%x)\n",
-                           opa_lstate_name(new_state), new_state);
-               ppd->lstate = new_state;
-       }
-       /*
-        * Set port status flags in the page mapped into userspace
-        * memory. Do it here to ensure a reliable state - this is
-        * the only function called by all state handling code.
-        * Always set the flags because the cached value might have
-        * been changed explicitly outside of this function.
-        */
-       if (ppd->statusp) {
-               switch (ppd->lstate) {
-               case IB_PORT_DOWN:
-               case IB_PORT_INIT:
-                       *ppd->statusp &= ~(HFI1_STATUS_IB_CONF |
-                                          HFI1_STATUS_IB_READY);
-                       break;
-               case IB_PORT_ARMED:
-                       *ppd->statusp |= HFI1_STATUS_IB_CONF;
-                       break;
-               case IB_PORT_ACTIVE:
-                       *ppd->statusp |= HFI1_STATUS_IB_READY;
-                       break;
-               }
-       }
-       return ppd->lstate;
-}
-
-/**
- * wait_logical_linkstate - wait for an IB link state change to occur
- * @ppd: port device
- * @state: the state to wait for
- * @msecs: the number of milliseconds to wait
- *
- * Wait up to msecs milliseconds for IB link state change to occur.
- * For now, take the easy polling route.
- * Returns 0 if state reached, otherwise -ETIMEDOUT.
- */
-static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state,
-                                 int msecs)
-{
-       unsigned long timeout;
-
-       timeout = jiffies + msecs_to_jiffies(msecs);
-       while (1) {
-               if (get_logical_state(ppd) == state)
-                       return 0;
-               if (time_after(jiffies, timeout))
-                       break;
-               msleep(20);
-       }
-       dd_dev_err(ppd->dd, "timeout waiting for link state 0x%x\n", state);
-
-       return -ETIMEDOUT;
-}
-
-u8 hfi1_ibphys_portstate(struct hfi1_pportdata *ppd)
-{
-       u32 pstate;
-       u32 ib_pstate;
-
-       pstate = read_physical_state(ppd->dd);
-       ib_pstate = chip_to_opa_pstate(ppd->dd, pstate);
-       if (ppd->last_pstate != ib_pstate) {
-               dd_dev_info(ppd->dd,
-                           "%s: physical state changed to %s (0x%x), phy 0x%x\n",
-                           __func__, opa_pstate_name(ib_pstate), ib_pstate,
-                           pstate);
-               ppd->last_pstate = ib_pstate;
-       }
-       return ib_pstate;
-}
-
-/*
- * Read/modify/write ASIC_QSFP register bits as selected by mask
- * data: 0 or 1 in the positions depending on what needs to be written
- * dir: 0 for read, 1 for write
- * mask: select by setting
- *      I2CCLK  (bit 0)
- *      I2CDATA (bit 1)
- */
-u64 hfi1_gpio_mod(struct hfi1_devdata *dd, u32 target, u32 data, u32 dir,
-                 u32 mask)
-{
-       u64 qsfp_oe, target_oe;
-
-       target_oe = target ? ASIC_QSFP2_OE : ASIC_QSFP1_OE;
-       if (mask) {
-               /* We are writing register bits, so lock access */
-               dir &= mask;
-               data &= mask;
-
-               qsfp_oe = read_csr(dd, target_oe);
-               qsfp_oe = (qsfp_oe & ~(u64)mask) | (u64)dir;
-               write_csr(dd, target_oe, qsfp_oe);
-       }
-       /* We are exclusively reading bits here, but it is unlikely
-        * we'll get valid data when we set the direction of the pin
-        * in the same call, so callers should invoke this function
-        * again to read valid data.
-        */
-       return read_csr(dd, target ? ASIC_QSFP2_IN : ASIC_QSFP1_IN);
-}
-
-#define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \
-(r &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
-
-#define SET_STATIC_RATE_CONTROL_SMASK(r) \
-(r |= SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
-
-int hfi1_init_ctxt(struct send_context *sc)
-{
-       if (sc) {
-               struct hfi1_devdata *dd = sc->dd;
-               u64 reg;
-               u8 set = (sc->type == SC_USER ?
-                         HFI1_CAP_IS_USET(STATIC_RATE_CTRL) :
-                         HFI1_CAP_IS_KSET(STATIC_RATE_CTRL));
-               reg = read_kctxt_csr(dd, sc->hw_context,
-                                    SEND_CTXT_CHECK_ENABLE);
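-               /*
-                * The register bit disallows PBC static rate control, so a
-                * set capability translates to clearing the bit, and vice
-                * versa.
-                */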
-               if (set)
-                       CLEAR_STATIC_RATE_CONTROL_SMASK(reg);
-               else
-                       SET_STATIC_RATE_CONTROL_SMASK(reg);
-               write_kctxt_csr(dd, sc->hw_context,
-                               SEND_CTXT_CHECK_ENABLE, reg);
-       }
-       return 0;
-}
-
-int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp)
-{
-       int ret = 0;
-       u64 reg;
-
-       if (dd->icode != ICODE_RTL_SILICON) {
-               if (HFI1_CAP_IS_KSET(PRINT_UNIMPL))
-                       dd_dev_info(dd, "%s: tempsense not supported by HW\n",
-                                   __func__);
-               return -EINVAL;
-       }
-       reg = read_csr(dd, ASIC_STS_THERM);
-       temp->curr = ((reg >> ASIC_STS_THERM_CURR_TEMP_SHIFT) &
-                     ASIC_STS_THERM_CURR_TEMP_MASK);
-       temp->lo_lim = ((reg >> ASIC_STS_THERM_LO_TEMP_SHIFT) &
-                       ASIC_STS_THERM_LO_TEMP_MASK);
-       temp->hi_lim = ((reg >> ASIC_STS_THERM_HI_TEMP_SHIFT) &
-                       ASIC_STS_THERM_HI_TEMP_MASK);
-       temp->crit_lim = ((reg >> ASIC_STS_THERM_CRIT_TEMP_SHIFT) &
-                         ASIC_STS_THERM_CRIT_TEMP_MASK);
-       /* triggers is a 3-bit value - 1 bit per trigger. */
-       temp->triggers = (u8)((reg >> ASIC_STS_THERM_LOW_SHIFT) & 0x7);
-
-       return ret;
-}
-
-/* ========================================================================= */
-
-/*
- * Enable/disable chip from delivering interrupts.
- */
-void set_intr_state(struct hfi1_devdata *dd, u32 enable)
-{
-       int i;
-
-       /*
-        * In HFI, the mask needs to be 1 to allow interrupts.
-        */
-       if (enable) {
-               /* enable all interrupts */
-               for (i = 0; i < CCE_NUM_INT_CSRS; i++)
-                       write_csr(dd, CCE_INT_MASK + (8 * i), ~(u64)0);
-
-               init_qsfp_int(dd);
-       } else {
-               for (i = 0; i < CCE_NUM_INT_CSRS; i++)
-                       write_csr(dd, CCE_INT_MASK + (8 * i), 0ull);
-       }
-}
-
-/*
- * Clear all interrupt sources on the chip.
- */
-static void clear_all_interrupts(struct hfi1_devdata *dd)
-{
-       int i;
-
-       for (i = 0; i < CCE_NUM_INT_CSRS; i++)
-               write_csr(dd, CCE_INT_CLEAR + (8 * i), ~(u64)0);
-
-       write_csr(dd, CCE_ERR_CLEAR, ~(u64)0);
-       write_csr(dd, MISC_ERR_CLEAR, ~(u64)0);
-       write_csr(dd, RCV_ERR_CLEAR, ~(u64)0);
-       write_csr(dd, SEND_ERR_CLEAR, ~(u64)0);
-       write_csr(dd, SEND_PIO_ERR_CLEAR, ~(u64)0);
-       write_csr(dd, SEND_DMA_ERR_CLEAR, ~(u64)0);
-       write_csr(dd, SEND_EGRESS_ERR_CLEAR, ~(u64)0);
-       for (i = 0; i < dd->chip_send_contexts; i++)
-               write_kctxt_csr(dd, i, SEND_CTXT_ERR_CLEAR, ~(u64)0);
-       for (i = 0; i < dd->chip_sdma_engines; i++)
-               write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_CLEAR, ~(u64)0);
-
-       write_csr(dd, DCC_ERR_FLG_CLR, ~(u64)0);
-       write_csr(dd, DC_LCB_ERR_CLR, ~(u64)0);
-       write_csr(dd, DC_DC8051_ERR_CLR, ~(u64)0);
-}
-
-/* Move to pcie.c? */
-static void disable_intx(struct pci_dev *pdev)
-{
-       pci_intx(pdev, 0);
-}
-
-static void clean_up_interrupts(struct hfi1_devdata *dd)
-{
-       int i;
-
-       /* remove irqs - must happen before disabling/turning off */
-       if (dd->num_msix_entries) {
-               /* MSI-X */
-               struct hfi1_msix_entry *me = dd->msix_entries;
-
-               for (i = 0; i < dd->num_msix_entries; i++, me++) {
-                       if (!me->arg) /* => no irq, no affinity */
-                               continue;
-                       hfi1_put_irq_affinity(dd, &dd->msix_entries[i]);
-                       free_irq(me->msix.vector, me->arg);
-               }
-       } else {
-               /* INTx */
-               if (dd->requested_intx_irq) {
-                       free_irq(dd->pcidev->irq, dd);
-                       dd->requested_intx_irq = 0;
-               }
-       }
-
-       /* turn off interrupts */
-       if (dd->num_msix_entries) {
-               /* MSI-X */
-               pci_disable_msix(dd->pcidev);
-       } else {
-               /* INTx */
-               disable_intx(dd->pcidev);
-       }
-
-       /* clean structures */
-       kfree(dd->msix_entries);
-       dd->msix_entries = NULL;
-       dd->num_msix_entries = 0;
-}
-
-/*
- * Remap the interrupt source from the general handler to the given MSI-X
- * interrupt.
- */
-static void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr)
-{
-       u64 reg;
-       int m, n;
-
-       /* clear from the handled mask of the general interrupt */
-       m = isrc / 64;
-       n = isrc % 64;
-       dd->gi_mask[m] &= ~((u64)1 << n);
-
-       /* direct the chip source to the given MSI-X interrupt */
-       m = isrc / 8;
-       n = isrc % 8;
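-       /* each 64-bit CCE_INT_MAP CSR holds eight 8-bit map entries */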
-       reg = read_csr(dd, CCE_INT_MAP + (8 * m));
-       reg &= ~((u64)0xff << (8 * n));
-       reg |= ((u64)msix_intr & 0xff) << (8 * n);
-       write_csr(dd, CCE_INT_MAP + (8 * m), reg);
-}
-
-static void remap_sdma_interrupts(struct hfi1_devdata *dd,
-                                 int engine, int msix_intr)
-{
-       /*
-        * SDMA engine interrupt sources are grouped by type rather
-        * than by engine.  Per-engine interrupts are as follows:
-        *      SDMA
-        *      SDMAProgress
-        *      SDMAIdle
-        */
-       remap_intr(dd, IS_SDMA_START + 0 * TXE_NUM_SDMA_ENGINES + engine,
-                  msix_intr);
-       remap_intr(dd, IS_SDMA_START + 1 * TXE_NUM_SDMA_ENGINES + engine,
-                  msix_intr);
-       remap_intr(dd, IS_SDMA_START + 2 * TXE_NUM_SDMA_ENGINES + engine,
-                  msix_intr);
-}
-
-static int request_intx_irq(struct hfi1_devdata *dd)
-{
-       int ret;
-
-       snprintf(dd->intx_name, sizeof(dd->intx_name), DRIVER_NAME "_%d",
-                dd->unit);
-       ret = request_irq(dd->pcidev->irq, general_interrupt,
-                         IRQF_SHARED, dd->intx_name, dd);
-       if (ret)
-               dd_dev_err(dd, "unable to request INTx interrupt, err %d\n",
-                          ret);
-       else
-               dd->requested_intx_irq = 1;
-       return ret;
-}
-
-static int request_msix_irqs(struct hfi1_devdata *dd)
-{
-       int first_general, last_general;
-       int first_sdma, last_sdma;
-       int first_rx, last_rx;
-       int i, ret = 0;
-
-       /* calculate the ranges we are going to use */
-       first_general = 0;
-       last_general = first_general + 1;
-       first_sdma = last_general;
-       last_sdma = first_sdma + dd->num_sdma;
-       first_rx = last_sdma;
-       last_rx = first_rx + dd->n_krcv_queues;
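-       /*
-        * Vector layout: [0] general, then one per SDMA engine, then
-        * one per kernel receive context.
-        */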
-
-       /*
-        * Sanity check - the code expects all SDMA chip source
-        * interrupts to be in the same CSR, starting at bit 0.  Verify
-        * that this is true by checking the bit location of the start.
-        */
-       BUILD_BUG_ON(IS_SDMA_START % 64);
-
-       for (i = 0; i < dd->num_msix_entries; i++) {
-               struct hfi1_msix_entry *me = &dd->msix_entries[i];
-               const char *err_info;
-               irq_handler_t handler;
-               irq_handler_t thread = NULL;
-               void *arg;
-               int idx;
-               struct hfi1_ctxtdata *rcd = NULL;
-               struct sdma_engine *sde = NULL;
-
-               /* obtain the arguments to request_irq */
-               if (first_general <= i && i < last_general) {
-                       idx = i - first_general;
-                       handler = general_interrupt;
-                       arg = dd;
-                       snprintf(me->name, sizeof(me->name),
-                                DRIVER_NAME "_%d", dd->unit);
-                       err_info = "general";
-                       me->type = IRQ_GENERAL;
-               } else if (first_sdma <= i && i < last_sdma) {
-                       idx = i - first_sdma;
-                       sde = &dd->per_sdma[idx];
-                       handler = sdma_interrupt;
-                       arg = sde;
-                       snprintf(me->name, sizeof(me->name),
-                                DRIVER_NAME "_%d sdma%d", dd->unit, idx);
-                       err_info = "sdma";
-                       remap_sdma_interrupts(dd, idx, i);
-                       me->type = IRQ_SDMA;
-               } else if (first_rx <= i && i < last_rx) {
-                       idx = i - first_rx;
-                       rcd = dd->rcd[idx];
-                       /* no interrupt if no rcd */
-                       if (!rcd)
-                               continue;
-                       /*
-                        * Set the interrupt register and mask for this
-                        * context's interrupt.
-                        */
-                       rcd->ireg = (IS_RCVAVAIL_START + idx) / 64;
-                       rcd->imask = ((u64)1) <<
-                                       ((IS_RCVAVAIL_START + idx) % 64);
-                       handler = receive_context_interrupt;
-                       thread = receive_context_thread;
-                       arg = rcd;
-                       snprintf(me->name, sizeof(me->name),
-                                DRIVER_NAME "_%d kctxt%d", dd->unit, idx);
-                       err_info = "receive context";
-                       remap_intr(dd, IS_RCVAVAIL_START + idx, i);
-                       me->type = IRQ_RCVCTXT;
-               } else {
-                       /* not in our expected range - complain, then
-                        * ignore it
-                        */
-                       dd_dev_err(dd,
-                                  "Unexpected extra MSI-X interrupt %d\n", i);
-                       continue;
-               }
-               /* no argument, no interrupt */
-               if (!arg)
-                       continue;
-               /* make sure the name is terminated */
-               me->name[sizeof(me->name) - 1] = 0;
-
-               ret = request_threaded_irq(me->msix.vector, handler, thread, 0,
-                                          me->name, arg);
-               if (ret) {
-                       dd_dev_err(dd,
-                                  "unable to allocate %s interrupt, vector %d, index %d, err %d\n",
-                                  err_info, me->msix.vector, idx, ret);
-                       return ret;
-               }
-               /*
-                * assign arg after request_irq call, so it will be
-                * cleaned up
-                */
-               me->arg = arg;
-
-               ret = hfi1_get_irq_affinity(dd, me);
-               if (ret)
-                       dd_dev_err(dd,
-                                  "unable to pin IRQ %d\n", ret);
-       }
-
-       return ret;
-}
-
-/*
- * Set the general handler to accept all interrupts, remap all
- * chip interrupts back to MSI-X 0.
- */
-static void reset_interrupts(struct hfi1_devdata *dd)
-{
-       int i;
-
-       /* all interrupts handled by the general handler */
-       for (i = 0; i < CCE_NUM_INT_CSRS; i++)
-               dd->gi_mask[i] = ~(u64)0;
-
-       /* all chip interrupts map to MSI-X 0 */
-       for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++)
-               write_csr(dd, CCE_INT_MAP + (8 * i), 0);
-}
-
-static int set_up_interrupts(struct hfi1_devdata *dd)
-{
-       struct hfi1_msix_entry *entries;
-       u32 total, request;
-       int i, ret;
-       int single_interrupt = 0; /* we expect to have all the interrupts */
-
-       /*
-        * Interrupt count:
-        *      1 general, "slow path" interrupt (includes the SDMA engines'
-        *              slow source, SDMACleanupDone)
-        *      N interrupts - one per used SDMA engine
-        *      M interrupts - one per kernel receive context
-        */
-       total = 1 + dd->num_sdma + dd->n_krcv_queues;
-
-       entries = kcalloc(total, sizeof(*entries), GFP_KERNEL);
-       if (!entries) {
-               ret = -ENOMEM;
-               goto fail;
-       }
-       /* 1-1 MSI-X entry assignment */
-       for (i = 0; i < total; i++)
-               entries[i].msix.entry = i;
-
-       /* ask for MSI-X interrupts */
-       request = total;
-       request_msix(dd, &request, entries);
-
-       if (request == 0) {
-               /* using INTx */
-               /* dd->num_msix_entries already zero */
-               kfree(entries);
-               single_interrupt = 1;
-               dd_dev_err(dd, "MSI-X failed, using INTx interrupts\n");
-       } else {
-               /* using MSI-X */
-               dd->num_msix_entries = request;
-               dd->msix_entries = entries;
-
-               if (request != total) {
-                       /* using MSI-X, with reduced interrupts */
-                       dd_dev_err(
-                               dd,
-                               "cannot handle reduced interrupt case, want %u, got %u\n",
-                               total, request);
-                       ret = -EINVAL;
-                       goto fail;
-               }
-               dd_dev_info(dd, "%u MSI-X interrupts allocated\n", total);
-       }
-
-       /* mask all interrupts */
-       set_intr_state(dd, 0);
-       /* clear all pending interrupts */
-       clear_all_interrupts(dd);
-
-       /* reset general handler mask, chip MSI-X mappings */
-       reset_interrupts(dd);
-
-       if (single_interrupt)
-               ret = request_intx_irq(dd);
-       else
-               ret = request_msix_irqs(dd);
-       if (ret)
-               goto fail;
-
-       return 0;
-
-fail:
-       clean_up_interrupts(dd);
-       return ret;
-}
-
-/*
- * Set up context values in dd.  Sets:
- *
- *     num_rcv_contexts - number of contexts being used
- *     n_krcv_queues - number of kernel contexts
- *     first_user_ctxt - first non-kernel context in array of contexts
- *     freectxts  - number of free user contexts
- *     num_send_contexts - number of PIO send contexts being used
- */
-static int set_up_context_variables(struct hfi1_devdata *dd)
-{
-       int num_kernel_contexts;
-       int total_contexts;
-       int ret;
-       unsigned ngroups;
-       int qos_rmt_count;
-       int user_rmt_reduced;
-
-       /*
-        * Kernel receive contexts:
-        * - min of 2 or 1 context/numa (excluding control context)
-        * - Context 0 - control context (VL15/multicast/error)
-        * - Context 1 - first kernel context
-        * - Context 2 - second kernel context
-        * ...
-        */
-       if (n_krcvqs)
-               /*
-                * n_krcvqs is the sum of module parameter kernel receive
-                * contexts, krcvqs[].  It does not include the control
-                * context, so add that.
-                */
-               num_kernel_contexts = n_krcvqs + 1;
-       else
-               num_kernel_contexts = num_online_nodes() + 1;
-       num_kernel_contexts =
-               max_t(int, MIN_KERNEL_KCTXTS, num_kernel_contexts);
-       /*
-        * Every kernel receive context needs an ACK send context.
-        * One send context is allocated for each VL{0-7} and VL15.
-        */
-       if (num_kernel_contexts > (dd->chip_send_contexts - num_vls - 1)) {
-               dd_dev_err(dd,
-                          "Reducing # kernel rcv contexts to: %d, from %d\n",
-                          (int)(dd->chip_send_contexts - num_vls - 1),
-                          (int)num_kernel_contexts);
-               num_kernel_contexts = dd->chip_send_contexts - num_vls - 1;
-       }
-       /*
-        * User contexts:
-        *      - default to 1 user context per real (non-HT) CPU core if
-        *        num_user_contexts is negative
-        */
-       if (num_user_contexts < 0)
-               num_user_contexts =
-                       cpumask_weight(&dd->affinity->real_cpu_mask);
-
-       total_contexts = num_kernel_contexts + num_user_contexts;
-
-       /*
-        * Adjust the counts given a global max.
-        */
-       if (total_contexts > dd->chip_rcv_contexts) {
-               dd_dev_err(dd,
-                          "Reducing # user receive contexts to: %d, from %d\n",
-                          (int)(dd->chip_rcv_contexts - num_kernel_contexts),
-                          (int)num_user_contexts);
-               num_user_contexts = dd->chip_rcv_contexts - num_kernel_contexts;
-               /* recalculate */
-               total_contexts = num_kernel_contexts + num_user_contexts;
-       }
-
-       /* each user context requires an entry in the RMT */
-       qos_rmt_count = qos_rmt_entries(dd, NULL, NULL);
-       if (qos_rmt_count + num_user_contexts > NUM_MAP_ENTRIES) {
-               user_rmt_reduced = NUM_MAP_ENTRIES - qos_rmt_count;
-               dd_dev_err(dd,
-                          "RMT size is reducing the number of user receive contexts from %d to %d\n",
-                          (int)num_user_contexts,
-                          user_rmt_reduced);
-               /* recalculate */
-               num_user_contexts = user_rmt_reduced;
-               total_contexts = num_kernel_contexts + num_user_contexts;
-       }
-
-       /* the first N are kernel contexts, the rest are user contexts */
-       dd->num_rcv_contexts = total_contexts;
-       dd->n_krcv_queues = num_kernel_contexts;
-       dd->first_user_ctxt = num_kernel_contexts;
-       dd->num_user_contexts = num_user_contexts;
-       dd->freectxts = num_user_contexts;
-       dd_dev_info(dd,
-                   "rcv contexts: chip %d, used %d (kernel %d, user %d)\n",
-                   (int)dd->chip_rcv_contexts,
-                   (int)dd->num_rcv_contexts,
-                   (int)dd->n_krcv_queues,
-                   (int)dd->num_rcv_contexts - dd->n_krcv_queues);
-
-       /*
-        * Receive array allocation:
-        *   All RcvArray entries are divided into groups of 8. This
-        *   is required by the hardware and will speed up writes to
-        *   consecutive entries by using write-combining of the entire
-        *   cacheline.
-        *
-        *   The groups are divided evenly among all contexts.
-        *   Any leftover groups are given to the first N user
-        *   contexts.
-        */
-       dd->rcv_entries.group_size = RCV_INCREMENT;
-       ngroups = dd->chip_rcv_array_count / dd->rcv_entries.group_size;
-       dd->rcv_entries.ngroups = ngroups / dd->num_rcv_contexts;
-       dd->rcv_entries.nctxt_extra = ngroups -
-               (dd->num_rcv_contexts * dd->rcv_entries.ngroups);
-       dd_dev_info(dd, "RcvArray groups %u, ctxts extra %u\n",
-                   dd->rcv_entries.ngroups,
-                   dd->rcv_entries.nctxt_extra);
-       if (dd->rcv_entries.ngroups * dd->rcv_entries.group_size >
-           MAX_EAGER_ENTRIES * 2) {
-               dd->rcv_entries.ngroups = (MAX_EAGER_ENTRIES * 2) /
-                       dd->rcv_entries.group_size;
-               dd_dev_info(dd,
-                           "RcvArray group count too high, change to %u\n",
-                           dd->rcv_entries.ngroups);
-               dd->rcv_entries.nctxt_extra = 0;
-       }
-       /*
-        * PIO send contexts
-        */
-       ret = init_sc_pools_and_sizes(dd);
-       if (ret >= 0) { /* success */
-               dd->num_send_contexts = ret;
-               dd_dev_info(
-                       dd,
-                       "send contexts: chip %d, used %d (kernel %d, ack %d, user %d, vl15 %d)\n",
-                       dd->chip_send_contexts,
-                       dd->num_send_contexts,
-                       dd->sc_sizes[SC_KERNEL].count,
-                       dd->sc_sizes[SC_ACK].count,
-                       dd->sc_sizes[SC_USER].count,
-                       dd->sc_sizes[SC_VL15].count);
-               ret = 0;        /* success */
-       }
-
-       return ret;
-}
-
-/*
- * Set the device/port partition key table. The MAD code
- * will ensure that, at least, the partial management
- * partition key is present in the table.
- */
-static void set_partition_keys(struct hfi1_pportdata *ppd)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       u64 reg = 0;
-       int i;
-
-       dd_dev_info(dd, "Setting partition keys\n");
-       for (i = 0; i < hfi1_get_npkeys(dd); i++) {
-               reg |= (ppd->pkeys[i] &
-                       RCV_PARTITION_KEY_PARTITION_KEY_A_MASK) <<
-                       ((i % 4) *
-                        RCV_PARTITION_KEY_PARTITION_KEY_B_SHIFT);
-               /* Each register holds 4 PKey values. */
-               if ((i % 4) == 3) {
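-                       /*
-                        * CSRs are 8 bytes apart and (i - 3) is a multiple
-                        * of 4, so the byte offset ((i - 3) / 4) * 8 reduces
-                        * to (i - 3) * 2.
-                        */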
-                       write_csr(dd, RCV_PARTITION_KEY +
-                                 ((i - 3) * 2), reg);
-                       reg = 0;
-               }
-       }
-
-       /* Always enable HW pkeys check when pkeys table is set */
-       add_rcvctrl(dd, RCV_CTRL_RCV_PARTITION_KEY_ENABLE_SMASK);
-}
-
-/*
- * These CSRs and memories are uninitialized on reset and must be
- * written before reading to set the ECC/parity bits.
- *
- * NOTE: All user context CSRs that are not mmapped write-only
- * (e.g. the TID flows) must be initialized even if the driver never
- * reads them.
- */
-static void write_uninitialized_csrs_and_memories(struct hfi1_devdata *dd)
-{
-       int i, j;
-
-       /* CceIntMap */
-       for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++)
-               write_csr(dd, CCE_INT_MAP + (8 * i), 0);
-
-       /* SendCtxtCreditReturnAddr */
-       for (i = 0; i < dd->chip_send_contexts; i++)
-               write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_RETURN_ADDR, 0);
-
-       /* PIO Send buffers */
-       /* SDMA Send buffers */
-       /*
-        * These are not normally read, and (presently) have no method
-        * to be read, so are not pre-initialized
-        */
-
-       /* RcvHdrAddr */
-       /* RcvHdrTailAddr */
-       /* RcvTidFlowTable */
-       for (i = 0; i < dd->chip_rcv_contexts; i++) {
-               write_kctxt_csr(dd, i, RCV_HDR_ADDR, 0);
-               write_kctxt_csr(dd, i, RCV_HDR_TAIL_ADDR, 0);
-               for (j = 0; j < RXE_NUM_TID_FLOWS; j++)
-                       write_uctxt_csr(dd, i, RCV_TID_FLOW_TABLE + (8 * j), 0);
-       }
-
-       /* RcvArray */
-       for (i = 0; i < dd->chip_rcv_array_count; i++)
-               write_csr(dd, RCV_ARRAY + (8 * i),
-                         RCV_ARRAY_RT_WRITE_ENABLE_SMASK);
-
-       /* RcvQPMapTable */
-       for (i = 0; i < 32; i++)
-               write_csr(dd, RCV_QP_MAP_TABLE + (8 * i), 0);
-}
-
-/*
- * Use the ctrl_bits in CceCtrl to clear the status_bits in CceStatus.
- */
-static void clear_cce_status(struct hfi1_devdata *dd, u64 status_bits,
-                            u64 ctrl_bits)
-{
-       unsigned long timeout;
-       u64 reg;
-
-       /* is the condition present? */
-       reg = read_csr(dd, CCE_STATUS);
-       if ((reg & status_bits) == 0)
-               return;
-
-       /* clear the condition */
-       write_csr(dd, CCE_CTRL, ctrl_bits);
-
-       /* wait for the condition to clear */
-       timeout = jiffies + msecs_to_jiffies(CCE_STATUS_TIMEOUT);
-       while (1) {
-               reg = read_csr(dd, CCE_STATUS);
-               if ((reg & status_bits) == 0)
-                       return;
-               if (time_after(jiffies, timeout)) {
-                       dd_dev_err(dd,
-                                  "Timeout waiting for CceStatus to clear bits 0x%llx, remaining 0x%llx\n",
-                                  status_bits, reg & status_bits);
-                       return;
-               }
-               udelay(1);
-       }
-}
-
-/* set CCE CSRs to chip reset defaults */
-static void reset_cce_csrs(struct hfi1_devdata *dd)
-{
-       int i;
-
-       /* CCE_REVISION read-only */
-       /* CCE_REVISION2 read-only */
-       /* CCE_CTRL - bits clear automatically */
-       /* CCE_STATUS read-only, use CceCtrl to clear */
-       clear_cce_status(dd, ALL_FROZE, CCE_CTRL_SPC_UNFREEZE_SMASK);
-       clear_cce_status(dd, ALL_TXE_PAUSE, CCE_CTRL_TXE_RESUME_SMASK);
-       clear_cce_status(dd, ALL_RXE_PAUSE, CCE_CTRL_RXE_RESUME_SMASK);
-       for (i = 0; i < CCE_NUM_SCRATCH; i++)
-               write_csr(dd, CCE_SCRATCH + (8 * i), 0);
-       /* CCE_ERR_STATUS read-only */
-       write_csr(dd, CCE_ERR_MASK, 0);
-       write_csr(dd, CCE_ERR_CLEAR, ~0ull);
-       /* CCE_ERR_FORCE leave alone */
-       for (i = 0; i < CCE_NUM_32_BIT_COUNTERS; i++)
-               write_csr(dd, CCE_COUNTER_ARRAY32 + (8 * i), 0);
-       write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_RESETCSR);
-       /* CCE_PCIE_CTRL leave alone */
-       for (i = 0; i < CCE_NUM_MSIX_VECTORS; i++) {
-               write_csr(dd, CCE_MSIX_TABLE_LOWER + (8 * i), 0);
-               write_csr(dd, CCE_MSIX_TABLE_UPPER + (8 * i),
-                         CCE_MSIX_TABLE_UPPER_RESETCSR);
-       }
-       for (i = 0; i < CCE_NUM_MSIX_PBAS; i++) {
-               /* CCE_MSIX_PBA read-only */
-               write_csr(dd, CCE_MSIX_INT_GRANTED, ~0ull);
-               write_csr(dd, CCE_MSIX_VEC_CLR_WITHOUT_INT, ~0ull);
-       }
-       for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++)
-               write_csr(dd, CCE_INT_MAP, 0);
-       for (i = 0; i < CCE_NUM_INT_CSRS; i++) {
-               /* CCE_INT_STATUS read-only */
-               write_csr(dd, CCE_INT_MASK + (8 * i), 0);
-               write_csr(dd, CCE_INT_CLEAR + (8 * i), ~0ull);
-               /* CCE_INT_FORCE leave alone */
-               /* CCE_INT_BLOCKED read-only */
-       }
-       for (i = 0; i < CCE_NUM_32_BIT_INT_COUNTERS; i++)
-               write_csr(dd, CCE_INT_COUNTER_ARRAY32 + (8 * i), 0);
-}
-
-/* set MISC CSRs to chip reset defaults */
-static void reset_misc_csrs(struct hfi1_devdata *dd)
-{
-       int i;
-
-       for (i = 0; i < 32; i++) {
-               write_csr(dd, MISC_CFG_RSA_R2 + (8 * i), 0);
-               write_csr(dd, MISC_CFG_RSA_SIGNATURE + (8 * i), 0);
-               write_csr(dd, MISC_CFG_RSA_MODULUS + (8 * i), 0);
-       }
-       /*
-        * MISC_CFG_SHA_PRELOAD leave alone - always reads 0 and can
-        * only be written in 128-byte chunks
-        */
-       /* init RSA engine to clear lingering errors */
-       write_csr(dd, MISC_CFG_RSA_CMD, 1);
-       write_csr(dd, MISC_CFG_RSA_MU, 0);
-       write_csr(dd, MISC_CFG_FW_CTRL, 0);
-       /* MISC_STS_8051_DIGEST read-only */
-       /* MISC_STS_SBM_DIGEST read-only */
-       /* MISC_STS_PCIE_DIGEST read-only */
-       /* MISC_STS_FAB_DIGEST read-only */
-       /* MISC_ERR_STATUS read-only */
-       write_csr(dd, MISC_ERR_MASK, 0);
-       write_csr(dd, MISC_ERR_CLEAR, ~0ull);
-       /* MISC_ERR_FORCE leave alone */
-}
-
-/* set TXE CSRs to chip reset defaults */
-static void reset_txe_csrs(struct hfi1_devdata *dd)
-{
-       int i;
-
-       /*
-        * TXE Kernel CSRs
-        */
-       write_csr(dd, SEND_CTRL, 0);
-       __cm_reset(dd, 0);      /* reset CM internal state */
-       /* SEND_CONTEXTS read-only */
-       /* SEND_DMA_ENGINES read-only */
-       /* SEND_PIO_MEM_SIZE read-only */
-       /* SEND_DMA_MEM_SIZE read-only */
-       write_csr(dd, SEND_HIGH_PRIORITY_LIMIT, 0);
-       pio_reset_all(dd);      /* SEND_PIO_INIT_CTXT */
-       /* SEND_PIO_ERR_STATUS read-only */
-       write_csr(dd, SEND_PIO_ERR_MASK, 0);
-       write_csr(dd, SEND_PIO_ERR_CLEAR, ~0ull);
-       /* SEND_PIO_ERR_FORCE leave alone */
-       /* SEND_DMA_ERR_STATUS read-only */
-       write_csr(dd, SEND_DMA_ERR_MASK, 0);
-       write_csr(dd, SEND_DMA_ERR_CLEAR, ~0ull);
-       /* SEND_DMA_ERR_FORCE leave alone */
-       /* SEND_EGRESS_ERR_STATUS read-only */
-       write_csr(dd, SEND_EGRESS_ERR_MASK, 0);
-       write_csr(dd, SEND_EGRESS_ERR_CLEAR, ~0ull);
-       /* SEND_EGRESS_ERR_FORCE leave alone */
-       write_csr(dd, SEND_BTH_QP, 0);
-       write_csr(dd, SEND_STATIC_RATE_CONTROL, 0);
-       write_csr(dd, SEND_SC2VLT0, 0);
-       write_csr(dd, SEND_SC2VLT1, 0);
-       write_csr(dd, SEND_SC2VLT2, 0);
-       write_csr(dd, SEND_SC2VLT3, 0);
-       write_csr(dd, SEND_LEN_CHECK0, 0);
-       write_csr(dd, SEND_LEN_CHECK1, 0);
-       /* SEND_ERR_STATUS read-only */
-       write_csr(dd, SEND_ERR_MASK, 0);
-       write_csr(dd, SEND_ERR_CLEAR, ~0ull);
-       /* SEND_ERR_FORCE read-only */
-       for (i = 0; i < VL_ARB_LOW_PRIO_TABLE_SIZE; i++)
-               write_csr(dd, SEND_LOW_PRIORITY_LIST + (8 * i), 0);
-       for (i = 0; i < VL_ARB_HIGH_PRIO_TABLE_SIZE; i++)
-               write_csr(dd, SEND_HIGH_PRIORITY_LIST + (8 * i), 0);
-       for (i = 0; i < dd->chip_send_contexts / NUM_CONTEXTS_PER_SET; i++)
-               write_csr(dd, SEND_CONTEXT_SET_CTRL + (8 * i), 0);
-       for (i = 0; i < TXE_NUM_32_BIT_COUNTER; i++)
-               write_csr(dd, SEND_COUNTER_ARRAY32 + (8 * i), 0);
-       for (i = 0; i < TXE_NUM_64_BIT_COUNTER; i++)
-               write_csr(dd, SEND_COUNTER_ARRAY64 + (8 * i), 0);
-       write_csr(dd, SEND_CM_CTRL, SEND_CM_CTRL_RESETCSR);
-       write_csr(dd, SEND_CM_GLOBAL_CREDIT, SEND_CM_GLOBAL_CREDIT_RESETCSR);
-       /* SEND_CM_CREDIT_USED_STATUS read-only */
-       write_csr(dd, SEND_CM_TIMER_CTRL, 0);
-       write_csr(dd, SEND_CM_LOCAL_AU_TABLE0_TO3, 0);
-       write_csr(dd, SEND_CM_LOCAL_AU_TABLE4_TO7, 0);
-       write_csr(dd, SEND_CM_REMOTE_AU_TABLE0_TO3, 0);
-       write_csr(dd, SEND_CM_REMOTE_AU_TABLE4_TO7, 0);
-       for (i = 0; i < TXE_NUM_DATA_VL; i++)
-               write_csr(dd, SEND_CM_CREDIT_VL + (8 * i), 0);
-       write_csr(dd, SEND_CM_CREDIT_VL15, 0);
-       /* SEND_CM_CREDIT_USED_VL read-only */
-       /* SEND_CM_CREDIT_USED_VL15 read-only */
-       /* SEND_EGRESS_CTXT_STATUS read-only */
-       /* SEND_EGRESS_SEND_DMA_STATUS read-only */
-       write_csr(dd, SEND_EGRESS_ERR_INFO, ~0ull);
-       /* SEND_EGRESS_ERR_INFO read-only */
-       /* SEND_EGRESS_ERR_SOURCE read-only */
-
-       /*
-        * TXE Per-Context CSRs
-        */
-       for (i = 0; i < dd->chip_send_contexts; i++) {
-               write_kctxt_csr(dd, i, SEND_CTXT_CTRL, 0);
-               write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_CTRL, 0);
-               write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_RETURN_ADDR, 0);
-               write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_FORCE, 0);
-               write_kctxt_csr(dd, i, SEND_CTXT_ERR_MASK, 0);
-               write_kctxt_csr(dd, i, SEND_CTXT_ERR_CLEAR, ~0ull);
-               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_ENABLE, 0);
-               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_VL, 0);
-               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_JOB_KEY, 0);
-               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_PARTITION_KEY, 0);
-               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_SLID, 0);
-               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_OPCODE, 0);
-       }
-
-       /*
-        * TXE Per-SDMA CSRs
-        */
-       for (i = 0; i < dd->chip_sdma_engines; i++) {
-               write_kctxt_csr(dd, i, SEND_DMA_CTRL, 0);
-               /* SEND_DMA_STATUS read-only */
-               write_kctxt_csr(dd, i, SEND_DMA_BASE_ADDR, 0);
-               write_kctxt_csr(dd, i, SEND_DMA_LEN_GEN, 0);
-               write_kctxt_csr(dd, i, SEND_DMA_TAIL, 0);
-               /* SEND_DMA_HEAD read-only */
-               write_kctxt_csr(dd, i, SEND_DMA_HEAD_ADDR, 0);
-               write_kctxt_csr(dd, i, SEND_DMA_PRIORITY_THLD, 0);
-               /* SEND_DMA_IDLE_CNT read-only */
-               write_kctxt_csr(dd, i, SEND_DMA_RELOAD_CNT, 0);
-               write_kctxt_csr(dd, i, SEND_DMA_DESC_CNT, 0);
-               /* SEND_DMA_DESC_FETCHED_CNT read-only */
-               /* SEND_DMA_ENG_ERR_STATUS read-only */
-               write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_MASK, 0);
-               write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_CLEAR, ~0ull);
-               /* SEND_DMA_ENG_ERR_FORCE leave alone */
-               write_kctxt_csr(dd, i, SEND_DMA_CHECK_ENABLE, 0);
-               write_kctxt_csr(dd, i, SEND_DMA_CHECK_VL, 0);
-               write_kctxt_csr(dd, i, SEND_DMA_CHECK_JOB_KEY, 0);
-               write_kctxt_csr(dd, i, SEND_DMA_CHECK_PARTITION_KEY, 0);
-               write_kctxt_csr(dd, i, SEND_DMA_CHECK_SLID, 0);
-               write_kctxt_csr(dd, i, SEND_DMA_CHECK_OPCODE, 0);
-               write_kctxt_csr(dd, i, SEND_DMA_MEMORY, 0);
-       }
-}
-
-/*
- * Expect on entry:
- * o Packet ingress is disabled, i.e. RcvCtrl.RcvPortEnable == 0
- */
-static void init_rbufs(struct hfi1_devdata *dd)
-{
-       u64 reg;
-       int count;
-
-       /*
-        * Wait for DMA to stop: RxRbufPktPending and RxPktInProgress are
-        * clear.
-        */
-       count = 0;
-       while (1) {
-               reg = read_csr(dd, RCV_STATUS);
-               if ((reg & (RCV_STATUS_RX_RBUF_PKT_PENDING_SMASK
-                           | RCV_STATUS_RX_PKT_IN_PROGRESS_SMASK)) == 0)
-                       break;
-               /*
-                * Give up after 1ms - maximum wait time.
-                *
-                * RBuf size is 148KiB.  Slowest possible is PCIe Gen1 x1 at
-                * 250MB/s bandwidth.  Lower rate to 66% for overhead to get:
-                *      148 KB / (66% * 250MB/s) = 920us
-                */
-               if (count++ > 500) {
-                       dd_dev_err(dd,
-                                  "%s: in-progress DMA not clearing: RcvStatus 0x%llx, continuing\n",
-                                  __func__, reg);
-                       break;
-               }
-               udelay(2); /* do not busy-wait the CSR */
-       }
-
-       /* start the init - expect RcvCtrl to be 0 */
-       write_csr(dd, RCV_CTRL, RCV_CTRL_RX_RBUF_INIT_SMASK);
-
-       /*
-        * Read to force the write of RcvCtrl.RxRbufInit.  There is a brief
-        * period after the write before RcvStatus.RxRbufInitDone is valid.
-        * The delay in the first run through the loop below is sufficient and
-        * required before the first read of RcvStatus.RxRbufInitDone.
-        */
-       read_csr(dd, RCV_CTRL);
-
-       /* wait for the init to finish */
-       count = 0;
-       while (1) {
-               /* delay is required first time through - see above */
-               udelay(2); /* do not busy-wait the CSR */
-               reg = read_csr(dd, RCV_STATUS);
-               if (reg & (RCV_STATUS_RX_RBUF_INIT_DONE_SMASK))
-                       break;
-
-               /* give up after 100us - slowest possible at 33MHz is 73us */
-               if (count++ > 50) {
-                       dd_dev_err(dd,
-                                  "%s: RcvStatus.RxRbufInit not set, continuing\n",
-                                  __func__);
-                       break;
-               }
-       }
-}
-
-/* set RXE CSRs to chip reset defaults */
-static void reset_rxe_csrs(struct hfi1_devdata *dd)
-{
-       int i, j;
-
-       /*
-        * RXE Kernel CSRs
-        */
-       write_csr(dd, RCV_CTRL, 0);
-       init_rbufs(dd);
-       /* RCV_STATUS read-only */
-       /* RCV_CONTEXTS read-only */
-       /* RCV_ARRAY_CNT read-only */
-       /* RCV_BUF_SIZE read-only */
-       write_csr(dd, RCV_BTH_QP, 0);
-       write_csr(dd, RCV_MULTICAST, 0);
-       write_csr(dd, RCV_BYPASS, 0);
-       write_csr(dd, RCV_VL15, 0);
-       /* this is a clear-down */
-       write_csr(dd, RCV_ERR_INFO,
-                 RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK);
-       /* RCV_ERR_STATUS read-only */
-       write_csr(dd, RCV_ERR_MASK, 0);
-       write_csr(dd, RCV_ERR_CLEAR, ~0ull);
-       /* RCV_ERR_FORCE leave alone */
-       for (i = 0; i < 32; i++)
-               write_csr(dd, RCV_QP_MAP_TABLE + (8 * i), 0);
-       for (i = 0; i < 4; i++)
-               write_csr(dd, RCV_PARTITION_KEY + (8 * i), 0);
-       for (i = 0; i < RXE_NUM_32_BIT_COUNTERS; i++)
-               write_csr(dd, RCV_COUNTER_ARRAY32 + (8 * i), 0);
-       for (i = 0; i < RXE_NUM_64_BIT_COUNTERS; i++)
-               write_csr(dd, RCV_COUNTER_ARRAY64 + (8 * i), 0);
-       for (i = 0; i < RXE_NUM_RSM_INSTANCES; i++) {
-               write_csr(dd, RCV_RSM_CFG + (8 * i), 0);
-               write_csr(dd, RCV_RSM_SELECT + (8 * i), 0);
-               write_csr(dd, RCV_RSM_MATCH + (8 * i), 0);
-       }
-       for (i = 0; i < 32; i++)
-               write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), 0);
-
-       /*
-        * RXE Kernel and User Per-Context CSRs
-        */
-       for (i = 0; i < dd->chip_rcv_contexts; i++) {
-               /* kernel */
-               write_kctxt_csr(dd, i, RCV_CTXT_CTRL, 0);
-               /* RCV_CTXT_STATUS read-only */
-               write_kctxt_csr(dd, i, RCV_EGR_CTRL, 0);
-               write_kctxt_csr(dd, i, RCV_TID_CTRL, 0);
-               write_kctxt_csr(dd, i, RCV_KEY_CTRL, 0);
-               write_kctxt_csr(dd, i, RCV_HDR_ADDR, 0);
-               write_kctxt_csr(dd, i, RCV_HDR_CNT, 0);
-               write_kctxt_csr(dd, i, RCV_HDR_ENT_SIZE, 0);
-               write_kctxt_csr(dd, i, RCV_HDR_SIZE, 0);
-               write_kctxt_csr(dd, i, RCV_HDR_TAIL_ADDR, 0);
-               write_kctxt_csr(dd, i, RCV_AVAIL_TIME_OUT, 0);
-               write_kctxt_csr(dd, i, RCV_HDR_OVFL_CNT, 0);
-
-               /* user */
-               /* RCV_HDR_TAIL read-only */
-               write_uctxt_csr(dd, i, RCV_HDR_HEAD, 0);
-               /* RCV_EGR_INDEX_TAIL read-only */
-               write_uctxt_csr(dd, i, RCV_EGR_INDEX_HEAD, 0);
-               /* RCV_EGR_OFFSET_TAIL read-only */
-               for (j = 0; j < RXE_NUM_TID_FLOWS; j++) {
-                       write_uctxt_csr(dd, i,
-                                       RCV_TID_FLOW_TABLE + (8 * j), 0);
-               }
-       }
-}
-
-/*
- * Set sc2vl tables.
- *
- * They power on to zeros, so to avoid send context errors
- * they need to be set:
- *
- * SC 0-7 -> VL 0-7 (respectively)
- * SC 15  -> VL 15
- * otherwise
- *        -> VL 0
- */
-static void init_sc2vl_tables(struct hfi1_devdata *dd)
-{
-       int i;
-       /* init per architecture spec, constrained by hardware capability */
-
-       /* HFI maps sent packets */
-       write_csr(dd, SEND_SC2VLT0, SC2VL_VAL(
-               0,
-               0, 0, 1, 1,
-               2, 2, 3, 3,
-               4, 4, 5, 5,
-               6, 6, 7, 7));
-       write_csr(dd, SEND_SC2VLT1, SC2VL_VAL(
-               1,
-               8, 0, 9, 0,
-               10, 0, 11, 0,
-               12, 0, 13, 0,
-               14, 0, 15, 15));
-       write_csr(dd, SEND_SC2VLT2, SC2VL_VAL(
-               2,
-               16, 0, 17, 0,
-               18, 0, 19, 0,
-               20, 0, 21, 0,
-               22, 0, 23, 0));
-       write_csr(dd, SEND_SC2VLT3, SC2VL_VAL(
-               3,
-               24, 0, 25, 0,
-               26, 0, 27, 0,
-               28, 0, 29, 0,
-               30, 0, 31, 0));
-
-       /* DC maps received packets */
-       write_csr(dd, DCC_CFG_SC_VL_TABLE_15_0, DC_SC_VL_VAL(
-               15_0,
-               0, 0, 1, 1,  2, 2,  3, 3,  4, 4,  5, 5,  6, 6,  7,  7,
-               8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 15));
-       write_csr(dd, DCC_CFG_SC_VL_TABLE_31_16, DC_SC_VL_VAL(
-               31_16,
-               16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23, 0,
-               24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31, 0));
-
-       /* initialize the cached sc2vl values consistently with h/w */
-       for (i = 0; i < 32; i++) {
-               if (i < 8 || i == 15)
-                       *((u8 *)(dd->sc2vl) + i) = (u8)i;
-               else
-                       *((u8 *)(dd->sc2vl) + i) = 0;
-       }
-}
-
-/*
- * Read chip sizes and then reset parts to sane, disabled values.  We cannot
- * depend on the chip going through a power-on reset - a driver may be loaded
- * and unloaded many times.
- *
- * Do not write any CSR values to the chip in this routine - there may be
- * a reset following the (possible) FLR in this routine.
- *
- */
-static void init_chip(struct hfi1_devdata *dd)
-{
-       int i;
-
-       /*
-        * Put the HFI CSRs in a known state.
-        * Combine this with a DC reset.
-        *
-        * Stop the device from doing anything while we do a
-        * reset.  We know there are no other active users of
-        * the device since we are now in charge.  Turn off
-        * all outbound and inbound traffic and make sure
-        * the device does not generate any interrupts.
-        */
-
-       /* disable send contexts and SDMA engines */
-       write_csr(dd, SEND_CTRL, 0);
-       for (i = 0; i < dd->chip_send_contexts; i++)
-               write_kctxt_csr(dd, i, SEND_CTXT_CTRL, 0);
-       for (i = 0; i < dd->chip_sdma_engines; i++)
-               write_kctxt_csr(dd, i, SEND_DMA_CTRL, 0);
-       /* disable port (turn off RXE inbound traffic) and contexts */
-       write_csr(dd, RCV_CTRL, 0);
-       for (i = 0; i < dd->chip_rcv_contexts; i++)
-               write_csr(dd, RCV_CTXT_CTRL, 0);
-       /* mask all interrupt sources */
-       for (i = 0; i < CCE_NUM_INT_CSRS; i++)
-               write_csr(dd, CCE_INT_MASK + (8 * i), 0ull);
-
-       /*
-        * DC Reset: do a full DC reset before the register clear.
-        * A recommended length of time to hold is one CSR read,
-        * so reread the CceDcCtrl.  Then, hold the DC in reset
-        * across the clear.
-        */
-       write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_DC_RESET_SMASK);
-       (void)read_csr(dd, CCE_DC_CTRL);
-
-       if (use_flr) {
-               /*
-                * A FLR will reset the SPC core and part of the PCIe.
-                * The parts that need to be restored have already been
-                * saved.
-                */
-               dd_dev_info(dd, "Resetting CSRs with FLR\n");
-
-               /* do the FLR, the DC reset will remain */
-               hfi1_pcie_flr(dd);
-
-               /* restore command and BARs */
-               restore_pci_variables(dd);
-
-               if (is_ax(dd)) {
-                       dd_dev_info(dd, "Resetting CSRs with FLR\n");
-                       hfi1_pcie_flr(dd);
-                       restore_pci_variables(dd);
-               }
-       } else {
-               dd_dev_info(dd, "Resetting CSRs with writes\n");
-               reset_cce_csrs(dd);
-               reset_txe_csrs(dd);
-               reset_rxe_csrs(dd);
-               reset_misc_csrs(dd);
-       }
-       /* clear the DC reset */
-       write_csr(dd, CCE_DC_CTRL, 0);
-
-       /* Set the LED off */
-       setextled(dd, 0);
-
-       /*
-        * Clear the QSFP reset.
-        * An FLR enforces a 0 on all out pins. The driver does not touch
-        * ASIC_QSFPn_OUT otherwise.  This leaves RESET_N low, holding
-        * anything plugged in constantly in reset if it pays attention
-        * to RESET_N.  Prime examples of this are optical cables.
-        * Set all pins high.
-        * I2CCLK and I2CDAT will change per direction, and INT_N and
-        * MODPRS_N are input only and their value is ignored.
-        */
-       write_csr(dd, ASIC_QSFP1_OUT, 0x1f);
-       write_csr(dd, ASIC_QSFP2_OUT, 0x1f);
-       init_chip_resources(dd);
-}
-
-static void init_early_variables(struct hfi1_devdata *dd)
-{
-       int i;
-
-       /* assign link credit variables */
-       dd->vau = CM_VAU;
-       dd->link_credits = CM_GLOBAL_CREDITS;
-       if (is_ax(dd))
-               dd->link_credits--;
-       dd->vcu = cu_to_vcu(hfi1_cu);
-       /* enough room for 8 MAD packets plus header - 17K */
-       dd->vl15_init = (8 * (2048 + 128)) / vau_to_au(dd->vau);
-       if (dd->vl15_init > dd->link_credits)
-               dd->vl15_init = dd->link_credits;
-
-       write_uninitialized_csrs_and_memories(dd);
-
-       if (HFI1_CAP_IS_KSET(PKEY_CHECK))
-               for (i = 0; i < dd->num_pports; i++) {
-                       struct hfi1_pportdata *ppd = &dd->pport[i];
-
-                       set_partition_keys(ppd);
-               }
-       init_sc2vl_tables(dd);
-}
-
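The vl15_init sizing above reserves room for 8 MAD-sized packets (2048 bytes plus a 128-byte header, per the comment). Assuming an allocation unit of 8 * 2^vAU bytes, as the chip.h comment further down describes, vAU = CM_VAU = 3 gives a 64-byte AU and 272 credits. A standalone sketch of that arithmetic (illustration only):

#include <stdio.h>

int main(void)
{
        unsigned int vau = 3;                   /* CM_VAU */
        unsigned int au = 8u << vau;            /* AU = 8 * 2^vAU = 64 bytes */
        unsigned int vl15_init = (8 * (2048 + 128)) / au;

        printf("AU = %u bytes, vl15_init = %u credits\n", au, vl15_init);
        return 0;
}
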
-static void init_kdeth_qp(struct hfi1_devdata *dd)
-{
-       /* user changed the KDETH_QP */
-       if (kdeth_qp != 0 && kdeth_qp >= 0xff) {
-               /* out of range or illegal value */
-               dd_dev_err(dd, "Invalid KDETH queue pair prefix, ignoring");
-               kdeth_qp = 0;
-       }
-       if (kdeth_qp == 0)      /* not set, or failed range check */
-               kdeth_qp = DEFAULT_KDETH_QP;
-
-       write_csr(dd, SEND_BTH_QP,
-                 (kdeth_qp & SEND_BTH_QP_KDETH_QP_MASK) <<
-                 SEND_BTH_QP_KDETH_QP_SHIFT);
-
-       write_csr(dd, RCV_BTH_QP,
-                 (kdeth_qp & RCV_BTH_QP_KDETH_QP_MASK) <<
-                 RCV_BTH_QP_KDETH_QP_SHIFT);
-}
-
-/**
- * init_qpmap_table
- * @dd - device data
- * @first_ctxt - first context
- * @last_ctxt - last context
- *
- * This routine sets the qpn mapping table that
- * is indexed by qpn[8:1].
- *
- * The routine will round robin the 256 settings
- * from first_ctxt to last_ctxt.
- *
- * The first/last looks ahead to having specialized
- * receive contexts for mgmt and bypass.  Normal
- * verbs traffic is assumed to be on a range
- * of receive contexts.
- */
-static void init_qpmap_table(struct hfi1_devdata *dd,
-                            u32 first_ctxt,
-                            u32 last_ctxt)
-{
-       u64 reg = 0;
-       u64 regno = RCV_QP_MAP_TABLE;
-       int i;
-       u64 ctxt = first_ctxt;
-
-       for (i = 0; i < 256; i++) {
-               reg |= ctxt << (8 * (i % 8));
-               ctxt++;
-               if (ctxt > last_ctxt)
-                       ctxt = first_ctxt;
-               if (i % 8 == 7) {
-                       write_csr(dd, regno, reg);
-                       reg = 0;
-                       regno += 8;
-               }
-       }
-
-       add_rcvctrl(dd, RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK
-                       | RCV_CTRL_RCV_BYPASS_ENABLE_SMASK);
-}
-
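The map built above holds 256 one-byte entries, eight packed per 64-bit CSR, filled round-robin from first_ctxt to last_ctxt and later indexed by qpn[8:1]. A standalone sketch of the same packing and lookup, using an in-memory array and hypothetical context numbers instead of CSR writes:

#include <stdint.h>
#include <stdio.h>

#define QPMAP_WORDS 32  /* 256 entries / 8 entries per 64-bit word */

static void fill_qpmap(uint64_t map[QPMAP_WORDS],
                       uint64_t first_ctxt, uint64_t last_ctxt)
{
        uint64_t ctxt = first_ctxt;
        int i;

        for (i = 0; i < 256; i++) {
                map[i / 8] |= ctxt << (8 * (i % 8));
                if (++ctxt > last_ctxt)
                        ctxt = first_ctxt;      /* round robin */
        }
}

static unsigned int qpmap_lookup(const uint64_t map[QPMAP_WORDS], uint32_t qpn)
{
        unsigned int idx = (qpn >> 1) & 0xff;   /* table index is qpn[8:1] */

        return (map[idx / 8] >> (8 * (idx % 8))) & 0xff;
}

int main(void)
{
        uint64_t map[QPMAP_WORDS] = { 0 };

        fill_qpmap(map, 1, 4);                  /* hypothetical contexts 1..4 */
        printf("qpn 0x10 -> context %u\n", qpmap_lookup(map, 0x10));
        return 0;
}
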
-struct rsm_map_table {
-       u64 map[NUM_MAP_REGS];
-       unsigned int used;
-};
-
-struct rsm_rule_data {
-       u8 offset;
-       u8 pkt_type;
-       u32 field1_off;
-       u32 field2_off;
-       u32 index1_off;
-       u32 index1_width;
-       u32 index2_off;
-       u32 index2_width;
-       u32 mask1;
-       u32 value1;
-       u32 mask2;
-       u32 value2;
-};
-
-/*
- * Return an initialized RMT map table for users to fill in.  OK if it
- * returns NULL, indicating no table.
- */
-static struct rsm_map_table *alloc_rsm_map_table(struct hfi1_devdata *dd)
-{
-       struct rsm_map_table *rmt;
-       u8 rxcontext = is_ax(dd) ? 0 : 0xff;  /* 0 is default if a0 ver. */
-
-       rmt = kmalloc(sizeof(*rmt), GFP_KERNEL);
-       if (rmt) {
-               memset(rmt->map, rxcontext, sizeof(rmt->map));
-               rmt->used = 0;
-       }
-
-       return rmt;
-}
-
-/*
- * Write the final RMT map table to the chip and free the table.  OK if
- * table is NULL.
- */
-static void complete_rsm_map_table(struct hfi1_devdata *dd,
-                                  struct rsm_map_table *rmt)
-{
-       int i;
-
-       if (rmt) {
-               /* write table to chip */
-               for (i = 0; i < NUM_MAP_REGS; i++)
-                       write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), rmt->map[i]);
-
-               /* enable RSM */
-               add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK);
-       }
-}
-
-/*
- * Add a receive side mapping rule.
- */
-static void add_rsm_rule(struct hfi1_devdata *dd, u8 rule_index,
-                        struct rsm_rule_data *rrd)
-{
-       write_csr(dd, RCV_RSM_CFG + (8 * rule_index),
-                 (u64)rrd->offset << RCV_RSM_CFG_OFFSET_SHIFT |
-                 1ull << rule_index | /* enable bit */
-                 (u64)rrd->pkt_type << RCV_RSM_CFG_PACKET_TYPE_SHIFT);
-       write_csr(dd, RCV_RSM_SELECT + (8 * rule_index),
-                 (u64)rrd->field1_off << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT |
-                 (u64)rrd->field2_off << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT |
-                 (u64)rrd->index1_off << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT |
-                 (u64)rrd->index1_width << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT |
-                 (u64)rrd->index2_off << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT |
-                 (u64)rrd->index2_width << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT);
-       write_csr(dd, RCV_RSM_MATCH + (8 * rule_index),
-                 (u64)rrd->mask1 << RCV_RSM_MATCH_MASK1_SHIFT |
-                 (u64)rrd->value1 << RCV_RSM_MATCH_VALUE1_SHIFT |
-                 (u64)rrd->mask2 << RCV_RSM_MATCH_MASK2_SHIFT |
-                 (u64)rrd->value2 << RCV_RSM_MATCH_VALUE2_SHIFT);
-}
-
-/* return the number of RSM map table entries that will be used for QOS */
-static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp,
-                          unsigned int *np)
-{
-       int i;
-       unsigned int m, n;
-       u8 max_by_vl = 0;
-
-       /* is QOS active at all? */
-       if (dd->n_krcv_queues <= MIN_KERNEL_KCTXTS ||
-           num_vls == 1 ||
-           krcvqsset <= 1)
-               goto no_qos;
-
-       /* determine bits for qpn */
-       for (i = 0; i < min_t(unsigned int, num_vls, krcvqsset); i++)
-               if (krcvqs[i] > max_by_vl)
-                       max_by_vl = krcvqs[i];
-       if (max_by_vl > 32)
-               goto no_qos;
-       m = ilog2(__roundup_pow_of_two(max_by_vl));
-
-       /* determine bits for vl */
-       n = ilog2(__roundup_pow_of_two(num_vls));
-
-       /* reject if too much is used */
-       if ((m + n) > 7)
-               goto no_qos;
-
-       if (mp)
-               *mp = m;
-       if (np)
-               *np = n;
-
-       return 1 << (m + n);
-
-no_qos:
-       if (mp)
-               *mp = 0;
-       if (np)
-               *np = 0;
-       return 0;
-}
-
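The sizing above takes m from the largest per-VL queue count and n from num_vls, each rounded up to a power of two, and gives up if m + n exceeds the 7 bits available. For hypothetical krcvqs = {4, 4, 2, 2} over 4 VLs this yields m = 2, n = 2 and 16 RMT entries; a standalone sketch of the arithmetic:

#include <stdio.h>

/* ilog2 of the next power of two >= x, i.e. ilog2(roundup_pow_of_two(x)) */
static unsigned int ceil_log2(unsigned int x)
{
        unsigned int bits = 0;

        while ((1u << bits) < x)
                bits++;
        return bits;
}

int main(void)
{
        unsigned int krcvqs[] = { 4, 4, 2, 2 }; /* hypothetical per-VL queues */
        unsigned int num_vls = 4, max_by_vl = 0, i, m, n;

        for (i = 0; i < num_vls; i++)
                if (krcvqs[i] > max_by_vl)
                        max_by_vl = krcvqs[i];
        m = ceil_log2(max_by_vl);               /* bits for qpn */
        n = ceil_log2(num_vls);                 /* bits for vl */
        printf("m = %u, n = %u, RMT entries = %u\n", m, n, 1u << (m + n));
        return 0;
}
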
-/**
- * init_qos - init RX qos
- * @dd - device data
- * @rmt - RSM map table
- *
- * This routine initializes Rule 0 and the RSM map table to implement
- * quality of service (qos).
- *
- * If all of the limit tests succeed, qos is applied based on the array
- * interpretation of krcvqs where entry 0 is VL0.
- *
- * The number of vl bits (n) and the number of qpn bits (m) are computed to
- * feed both the RSM map table and the single rule.
- */
-static void init_qos(struct hfi1_devdata *dd, struct rsm_map_table *rmt)
-{
-       struct rsm_rule_data rrd;
-       unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m;
-       unsigned int rmt_entries;
-       u64 reg;
-
-       if (!rmt)
-               goto bail;
-       rmt_entries = qos_rmt_entries(dd, &m, &n);
-       if (rmt_entries == 0)
-               goto bail;
-       qpns_per_vl = 1 << m;
-
-       /* enough room in the map table? */
-       rmt_entries = 1 << (m + n);
-       if (rmt->used + rmt_entries >= NUM_MAP_ENTRIES)
-               goto bail;
-
-       /* add qos entries to the RSM map table */
-       for (i = 0, ctxt = FIRST_KERNEL_KCTXT; i < num_vls; i++) {
-               unsigned tctxt;
-
-               for (qpn = 0, tctxt = ctxt;
-                    krcvqs[i] && qpn < qpns_per_vl; qpn++) {
-                       unsigned idx, regoff, regidx;
-
-                       /* generate the index the hardware will produce */
-                       idx = rmt->used + ((qpn << n) ^ i);
-                       regoff = (idx % 8) * 8;
-                       regidx = idx / 8;
-                       /* replace default with context number */
-                       reg = rmt->map[regidx];
-                       reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK
-                               << regoff);
-                       reg |= (u64)(tctxt++) << regoff;
-                       rmt->map[regidx] = reg;
-                       if (tctxt == ctxt + krcvqs[i])
-                               tctxt = ctxt;
-               }
-               ctxt += krcvqs[i];
-       }
-
-       rrd.offset = rmt->used;
-       rrd.pkt_type = 2;
-       rrd.field1_off = LRH_BTH_MATCH_OFFSET;
-       rrd.field2_off = LRH_SC_MATCH_OFFSET;
-       rrd.index1_off = LRH_SC_SELECT_OFFSET;
-       rrd.index1_width = n;
-       rrd.index2_off = QPN_SELECT_OFFSET;
-       rrd.index2_width = m + n;
-       rrd.mask1 = LRH_BTH_MASK;
-       rrd.value1 = LRH_BTH_VALUE;
-       rrd.mask2 = LRH_SC_MASK;
-       rrd.value2 = LRH_SC_VALUE;
-
-       /* add rule 0 */
-       add_rsm_rule(dd, 0, &rrd);
-
-       /* mark RSM map entries as used */
-       rmt->used += rmt_entries;
-       /* map everything else to the mcast/err/vl15 context */
-       init_qpmap_table(dd, HFI1_CTRL_CTXT, HFI1_CTRL_CTXT);
-       dd->qos_shift = n + 1;
-       return;
-bail:
-       dd->qos_shift = 1;
-       init_qpmap_table(dd, FIRST_KERNEL_KCTXT, dd->n_krcv_queues - 1);
-}
-
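Each QOS map entry above is chosen as rmt->used + ((qpn << n) ^ vl), so the low n bits carry the VL and the upper bits come from the QPN. A standalone sketch of that index computation for a hypothetical n = 2 and base = 0:

#include <stdio.h>

int main(void)
{
        unsigned int base = 0, n = 2, vl, qpn;  /* hypothetical values */

        for (vl = 0; vl < 4; vl++)
                for (qpn = 0; qpn < 2; qpn++)
                        printf("vl %u qpn %u -> map entry %u\n",
                               vl, qpn, base + ((qpn << n) ^ vl));
        return 0;
}
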
-static void init_user_fecn_handling(struct hfi1_devdata *dd,
-                                   struct rsm_map_table *rmt)
-{
-       struct rsm_rule_data rrd;
-       u64 reg;
-       int i, idx, regoff, regidx;
-       u8 offset;
-
-       /* there needs to be enough room in the map table */
-       if (rmt->used + dd->num_user_contexts >= NUM_MAP_ENTRIES) {
-               dd_dev_err(dd, "User FECN handling disabled - too many user contexts allocated\n");
-               return;
-       }
-
-       /*
-        * RSM will extract the destination context as an index into the
-        * map table.  The destination contexts are a sequential block
-        * in the range first_user_ctxt...num_rcv_contexts-1 (inclusive).
-        * Map entries are accessed as offset + extracted value.  Adjust
-        * the added offset so this sequence can be placed anywhere in
-        * the table - as long as the entries themselves do not wrap.
-        * There are only enough bits in offset for the table size, so
-        * start with that to allow for a "negative" offset.
-        */
-       offset = (u8)(NUM_MAP_ENTRIES + (int)rmt->used -
-                                               (int)dd->first_user_ctxt);
-
-       for (i = dd->first_user_ctxt, idx = rmt->used;
-                               i < dd->num_rcv_contexts; i++, idx++) {
-               /* replace with identity mapping */
-               regoff = (idx % 8) * 8;
-               regidx = idx / 8;
-               reg = rmt->map[regidx];
-               reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK << regoff);
-               reg |= (u64)i << regoff;
-               rmt->map[regidx] = reg;
-       }
-
-       /*
-        * For RSM intercept of Expected FECN packets:
-        * o packet type 0 - expected
-        * o match on F (bit 95), using select/match 1, and
-        * o match on SH (bit 133), using select/match 2.
-        *
-        * Use index 1 to extract the 8-bit receive context from DestQP
-        * (start at bit 64).  Use that as the RSM map table index.
-        */
-       rrd.offset = offset;
-       rrd.pkt_type = 0;
-       rrd.field1_off = 95;
-       rrd.field2_off = 133;
-       rrd.index1_off = 64;
-       rrd.index1_width = 8;
-       rrd.index2_off = 0;
-       rrd.index2_width = 0;
-       rrd.mask1 = 1;
-       rrd.value1 = 1;
-       rrd.mask2 = 1;
-       rrd.value2 = 1;
-
-       /* add rule 1 */
-       add_rsm_rule(dd, 1, &rrd);
-
-       rmt->used += dd->num_user_contexts;
-}
-
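The "negative" offset above is ordinary mod-256 arithmetic: offset = (table size + rmt->used - first_user_ctxt) truncated to 8 bits, so an extracted context number c lands on map entry rmt->used + (c - first_user_ctxt). A standalone sketch with hypothetical values, assuming the 256-entry table implied by the 8-bit offset field:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        unsigned int used = 20, first_user_ctxt = 8, num_rcv_contexts = 12;
        uint8_t offset = (uint8_t)(256 + used - first_user_ctxt);
        unsigned int ctxt;

        for (ctxt = first_user_ctxt; ctxt < num_rcv_contexts; ctxt++)
                printf("context %u -> map entry %u\n",
                       ctxt, (unsigned int)(uint8_t)(offset + ctxt));
        return 0;
}
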
-static void init_rxe(struct hfi1_devdata *dd)
-{
-       struct rsm_map_table *rmt;
-
-       /* enable all receive errors */
-       write_csr(dd, RCV_ERR_MASK, ~0ull);
-
-       rmt = alloc_rsm_map_table(dd);
-       /* set up QOS, including the QPN map table */
-       init_qos(dd, rmt);
-       init_user_fecn_handling(dd, rmt);
-       complete_rsm_map_table(dd, rmt);
-       kfree(rmt);
-
-       /*
-        * make sure RcvCtrl.RcvWcb <= PCIe Device Control
-        * Register Max_Payload_Size (PCI_EXP_DEVCTL in Linux PCIe config
-        * space, PciCfgCap2.MaxPayloadSize in HFI).  There is only one
-        * invalid configuration: RcvCtrl.RcvWcb set to its max of 256 and
-        * Max_Payload_Size set to its minimum of 128.
-        *
-        * Presently, RcvCtrl.RcvWcb is not modified from its default of 0
-        * (64 bytes).  Max_Payload_Size is possibly modified upward in
-        * tune_pcie_caps() which is called after this routine.
-        */
-}
-
-static void init_other(struct hfi1_devdata *dd)
-{
-       /* enable all CCE errors */
-       write_csr(dd, CCE_ERR_MASK, ~0ull);
-       /* enable *some* Misc errors */
-       write_csr(dd, MISC_ERR_MASK, DRIVER_MISC_MASK);
-       /* enable all DC errors, except LCB */
-       write_csr(dd, DCC_ERR_FLG_EN, ~0ull);
-       write_csr(dd, DC_DC8051_ERR_EN, ~0ull);
-}
-
-/*
 - * Fill out the given AU table using the given CU.  A CU is defined in terms
 - * of AUs.  The table is an encoding: given the index, how many AUs does that
- * represent?
- *
- * NOTE: Assumes that the register layout is the same for the
- * local and remote tables.
- */
-static void assign_cm_au_table(struct hfi1_devdata *dd, u32 cu,
-                              u32 csr0to3, u32 csr4to7)
-{
-       write_csr(dd, csr0to3,
-                 0ull << SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE0_SHIFT |
-                 1ull << SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE1_SHIFT |
-                 2ull * cu <<
-                 SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE2_SHIFT |
-                 4ull * cu <<
-                 SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE3_SHIFT);
-       write_csr(dd, csr4to7,
-                 8ull * cu <<
-                 SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE4_SHIFT |
-                 16ull * cu <<
-                 SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE5_SHIFT |
-                 32ull * cu <<
-                 SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE6_SHIFT |
-                 64ull * cu <<
-                 SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE7_SHIFT);
-}
-
-static void assign_local_cm_au_table(struct hfi1_devdata *dd, u8 vcu)
-{
-       assign_cm_au_table(dd, vcu_to_cu(vcu), SEND_CM_LOCAL_AU_TABLE0_TO3,
-                          SEND_CM_LOCAL_AU_TABLE4_TO7);
-}
-
-void assign_remote_cm_au_table(struct hfi1_devdata *dd, u8 vcu)
-{
-       assign_cm_au_table(dd, vcu_to_cu(vcu), SEND_CM_REMOTE_AU_TABLE0_TO3,
-                          SEND_CM_REMOTE_AU_TABLE4_TO7);
-}
-
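The AU table written above is an encoding from index to AU count: entry 0 is 0 AUs, entry 1 is 1 AU, and entry i for i >= 2 is 2^(i-1) * cu AUs. A standalone sketch of that encoding with a hypothetical cu of 1:

#include <stdio.h>

int main(void)
{
        unsigned int cu = 1, i;                 /* hypothetical credit unit */

        for (i = 0; i < 8; i++) {
                unsigned int aus = (i == 0) ? 0 :
                                   (i == 1) ? 1 : (1u << (i - 1)) * cu;
                printf("AU table entry %u -> %u AUs\n", i, aus);
        }
        return 0;
}
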
-static void init_txe(struct hfi1_devdata *dd)
-{
-       int i;
-
-       /* enable all PIO, SDMA, general, and Egress errors */
-       write_csr(dd, SEND_PIO_ERR_MASK, ~0ull);
-       write_csr(dd, SEND_DMA_ERR_MASK, ~0ull);
-       write_csr(dd, SEND_ERR_MASK, ~0ull);
-       write_csr(dd, SEND_EGRESS_ERR_MASK, ~0ull);
-
-       /* enable all per-context and per-SDMA engine errors */
-       for (i = 0; i < dd->chip_send_contexts; i++)
-               write_kctxt_csr(dd, i, SEND_CTXT_ERR_MASK, ~0ull);
-       for (i = 0; i < dd->chip_sdma_engines; i++)
-               write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_MASK, ~0ull);
-
-       /* set the local CU to AU mapping */
-       assign_local_cm_au_table(dd, dd->vcu);
-
-       /*
-        * Set reasonable default for Credit Return Timer
-        * Don't set on Simulator - causes it to choke.
-        */
-       if (dd->icode != ICODE_FUNCTIONAL_SIMULATOR)
-               write_csr(dd, SEND_CM_TIMER_CTRL, HFI1_CREDIT_RETURN_RATE);
-}
-
-int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt, u16 jkey)
-{
-       struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
-       unsigned sctxt;
-       int ret = 0;
-       u64 reg;
-
-       if (!rcd || !rcd->sc) {
-               ret = -EINVAL;
-               goto done;
-       }
-       sctxt = rcd->sc->hw_context;
-       reg = SEND_CTXT_CHECK_JOB_KEY_MASK_SMASK | /* mask is always 1's */
-               ((jkey & SEND_CTXT_CHECK_JOB_KEY_VALUE_MASK) <<
-                SEND_CTXT_CHECK_JOB_KEY_VALUE_SHIFT);
-       /* JOB_KEY_ALLOW_PERMISSIVE is not allowed by default */
-       if (HFI1_CAP_KGET_MASK(rcd->flags, ALLOW_PERM_JKEY))
-               reg |= SEND_CTXT_CHECK_JOB_KEY_ALLOW_PERMISSIVE_SMASK;
-       write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_JOB_KEY, reg);
-       /*
-        * Enable send-side J_KEY integrity check, unless this is A0 h/w
-        */
-       if (!is_ax(dd)) {
-               reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
-               reg |= SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
-               write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
-       }
-
-       /* Enable J_KEY check on receive context. */
-       reg = RCV_KEY_CTRL_JOB_KEY_ENABLE_SMASK |
-               ((jkey & RCV_KEY_CTRL_JOB_KEY_VALUE_MASK) <<
-                RCV_KEY_CTRL_JOB_KEY_VALUE_SHIFT);
-       write_kctxt_csr(dd, ctxt, RCV_KEY_CTRL, reg);
-done:
-       return ret;
-}
-
-int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt)
-{
-       struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
-       unsigned sctxt;
-       int ret = 0;
-       u64 reg;
-
-       if (!rcd || !rcd->sc) {
-               ret = -EINVAL;
-               goto done;
-       }
-       sctxt = rcd->sc->hw_context;
-       write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_JOB_KEY, 0);
-       /*
-        * Disable send-side J_KEY integrity check, unless this is A0 h/w.
-        * This check would not have been enabled for A0 h/w, see
-        * set_ctxt_jkey().
-        */
-       if (!is_ax(dd)) {
-               reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
-               reg &= ~SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
-               write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
-       }
-       /* Turn off the J_KEY on the receive side */
-       write_kctxt_csr(dd, ctxt, RCV_KEY_CTRL, 0);
-done:
-       return ret;
-}
-
-int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey)
-{
-       struct hfi1_ctxtdata *rcd;
-       unsigned sctxt;
-       int ret = 0;
-       u64 reg;
-
-       if (ctxt < dd->num_rcv_contexts) {
-               rcd = dd->rcd[ctxt];
-       } else {
-               ret = -EINVAL;
-               goto done;
-       }
-       if (!rcd || !rcd->sc) {
-               ret = -EINVAL;
-               goto done;
-       }
-       sctxt = rcd->sc->hw_context;
-       reg = ((u64)pkey & SEND_CTXT_CHECK_PARTITION_KEY_VALUE_MASK) <<
-               SEND_CTXT_CHECK_PARTITION_KEY_VALUE_SHIFT;
-       write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_PARTITION_KEY, reg);
-       reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
-       reg |= SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK;
-       reg &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK;
-       write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
-done:
-       return ret;
-}
-
-int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt)
-{
-       struct hfi1_ctxtdata *rcd;
-       unsigned sctxt;
-       int ret = 0;
-       u64 reg;
-
-       if (ctxt < dd->num_rcv_contexts) {
-               rcd = dd->rcd[ctxt];
-       } else {
-               ret = -EINVAL;
-               goto done;
-       }
-       if (!rcd || !rcd->sc) {
-               ret = -EINVAL;
-               goto done;
-       }
-       sctxt = rcd->sc->hw_context;
-       reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
-       reg &= ~SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK;
-       write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
-       write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_PARTITION_KEY, 0);
-done:
-       return ret;
-}
-
-/*
- * Start doing the clean up of the chip. Our clean up happens in multiple
- * stages and this is just the first.
- */
-void hfi1_start_cleanup(struct hfi1_devdata *dd)
-{
-       aspm_exit(dd);
-       free_cntrs(dd);
-       free_rcverr(dd);
-       clean_up_interrupts(dd);
-       finish_chip_resources(dd);
-}
-
-#define HFI_BASE_GUID(dev) \
-       ((dev)->base_guid & ~(1ULL << GUID_HFI_INDEX_SHIFT))
-
-/*
- * Information can be shared between the two HFIs on the same ASIC
- * in the same OS.  This function finds the peer device and sets
- * up a shared structure.
- */
-static int init_asic_data(struct hfi1_devdata *dd)
-{
-       unsigned long flags;
-       struct hfi1_devdata *tmp, *peer = NULL;
-       int ret = 0;
-
-       spin_lock_irqsave(&hfi1_devs_lock, flags);
-       /* Find our peer device */
-       list_for_each_entry(tmp, &hfi1_dev_list, list) {
-               if ((HFI_BASE_GUID(dd) == HFI_BASE_GUID(tmp)) &&
-                   dd->unit != tmp->unit) {
-                       peer = tmp;
-                       break;
-               }
-       }
-
-       if (peer) {
-               dd->asic_data = peer->asic_data;
-       } else {
-               dd->asic_data = kzalloc(sizeof(*dd->asic_data), GFP_KERNEL);
-               if (!dd->asic_data) {
-                       ret = -ENOMEM;
-                       goto done;
-               }
-               mutex_init(&dd->asic_data->asic_resource_mutex);
-       }
-       dd->asic_data->dds[dd->hfi1_id] = dd; /* self back-pointer */
-
-done:
-       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
-       return ret;
-}
-
-/*
- * Set dd->boardname.  Use a generic name if a name is not returned from
- * EFI variable space.
- *
- * Return 0 on success, -ENOMEM if space could not be allocated.
- */
-static int obtain_boardname(struct hfi1_devdata *dd)
-{
-       /* generic board description */
-       const char generic[] =
-               "Intel Omni-Path Host Fabric Interface Adapter 100 Series";
-       unsigned long size;
-       int ret;
-
-       ret = read_hfi1_efi_var(dd, "description", &size,
-                               (void **)&dd->boardname);
-       if (ret) {
-               dd_dev_info(dd, "Board description not found\n");
-               /* use generic description */
-               dd->boardname = kstrdup(generic, GFP_KERNEL);
-               if (!dd->boardname)
-                       return -ENOMEM;
-       }
-       return 0;
-}
-
-/*
- * Check the interrupt registers to make sure that they are mapped correctly.
- * It is intended to help the user identify any mismapping by the VMM when
- * the driver is running in a VM. This function should only be called before
- * interrupts are set up properly.
- *
- * Return 0 on success, -EINVAL on failure.
- */
-static int check_int_registers(struct hfi1_devdata *dd)
-{
-       u64 reg;
-       u64 all_bits = ~(u64)0;
-       u64 mask;
-
-       /* Clear CceIntMask[0] to avoid raising any interrupts */
-       mask = read_csr(dd, CCE_INT_MASK);
-       write_csr(dd, CCE_INT_MASK, 0ull);
-       reg = read_csr(dd, CCE_INT_MASK);
-       if (reg)
-               goto err_exit;
-
-       /* Clear all interrupt status bits */
-       write_csr(dd, CCE_INT_CLEAR, all_bits);
-       reg = read_csr(dd, CCE_INT_STATUS);
-       if (reg)
-               goto err_exit;
-
-       /* Set all interrupt status bits */
-       write_csr(dd, CCE_INT_FORCE, all_bits);
-       reg = read_csr(dd, CCE_INT_STATUS);
-       if (reg != all_bits)
-               goto err_exit;
-
-       /* Restore the interrupt mask */
-       write_csr(dd, CCE_INT_CLEAR, all_bits);
-       write_csr(dd, CCE_INT_MASK, mask);
-
-       return 0;
-err_exit:
-       write_csr(dd, CCE_INT_MASK, mask);
-       dd_dev_err(dd, "Interrupt registers not properly mapped by VMM\n");
-       return -EINVAL;
-}
-
-/**
- * Allocate and initialize the device structure for the hfi.
- * @pdev: the pci_dev for hfi1_ib device
- * @ent: pci_device_id struct for this dev
- *
- * Also allocates, initializes, and returns the devdata struct for this
- * device instance
- *
- * This is global, and is called directly at init to set up the
- * chip-specific function pointers for later use.
- */
-struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
-                                 const struct pci_device_id *ent)
-{
-       struct hfi1_devdata *dd;
-       struct hfi1_pportdata *ppd;
-       u64 reg;
-       int i, ret;
-       static const char * const inames[] = { /* implementation names */
-               "RTL silicon",
-               "RTL VCS simulation",
-               "RTL FPGA emulation",
-               "Functional simulator"
-       };
-       struct pci_dev *parent = pdev->bus->self;
-
-       dd = hfi1_alloc_devdata(pdev, NUM_IB_PORTS *
-                               sizeof(struct hfi1_pportdata));
-       if (IS_ERR(dd))
-               goto bail;
-       ppd = dd->pport;
-       for (i = 0; i < dd->num_pports; i++, ppd++) {
-               int vl;
-               /* init common fields */
-               hfi1_init_pportdata(pdev, ppd, dd, 0, 1);
-               /* DC supports 4 link widths */
-               ppd->link_width_supported =
-                       OPA_LINK_WIDTH_1X | OPA_LINK_WIDTH_2X |
-                       OPA_LINK_WIDTH_3X | OPA_LINK_WIDTH_4X;
-               ppd->link_width_downgrade_supported =
-                       ppd->link_width_supported;
-               /* start out enabling only 4X */
-               ppd->link_width_enabled = OPA_LINK_WIDTH_4X;
-               ppd->link_width_downgrade_enabled =
-                                       ppd->link_width_downgrade_supported;
-               /* link width active is 0 when link is down */
-               /* link width downgrade active is 0 when link is down */
-
-               if (num_vls < HFI1_MIN_VLS_SUPPORTED ||
-                   num_vls > HFI1_MAX_VLS_SUPPORTED) {
-                       hfi1_early_err(&pdev->dev,
-                                      "Invalid num_vls %u, using %u VLs\n",
-                                   num_vls, HFI1_MAX_VLS_SUPPORTED);
-                       num_vls = HFI1_MAX_VLS_SUPPORTED;
-               }
-               ppd->vls_supported = num_vls;
-               ppd->vls_operational = ppd->vls_supported;
-               ppd->actual_vls_operational = ppd->vls_supported;
-               /* Set the default MTU. */
-               for (vl = 0; vl < num_vls; vl++)
-                       dd->vld[vl].mtu = hfi1_max_mtu;
-               dd->vld[15].mtu = MAX_MAD_PACKET;
-               /*
-                * Set the initial values to reasonable default, will be set
-                * for real when link is up.
-                */
-               ppd->lstate = IB_PORT_DOWN;
-               ppd->overrun_threshold = 0x4;
-               ppd->phy_error_threshold = 0xf;
-               ppd->port_crc_mode_enabled = link_crc_mask;
-               /* initialize supported LTP CRC mode */
-               ppd->port_ltp_crc_mode = cap_to_port_ltp(link_crc_mask) << 8;
-               /* initialize enabled LTP CRC mode */
-               ppd->port_ltp_crc_mode |= cap_to_port_ltp(link_crc_mask) << 4;
-               /* start in offline */
-               ppd->host_link_state = HLS_DN_OFFLINE;
-               init_vl_arb_caches(ppd);
-               ppd->last_pstate = 0xff; /* invalid value */
-       }
-
-       dd->link_default = HLS_DN_POLL;
-
-       /*
-        * Do remaining PCIe setup and save PCIe values in dd.
-        * Any error printing is already done by the init code.
-        * On return, we have the chip mapped.
-        */
-       ret = hfi1_pcie_ddinit(dd, pdev, ent);
-       if (ret < 0)
-               goto bail_free;
-
-       /* verify that reads actually work, save revision for reset check */
-       dd->revision = read_csr(dd, CCE_REVISION);
-       if (dd->revision == ~(u64)0) {
-               dd_dev_err(dd, "cannot read chip CSRs\n");
-               ret = -EINVAL;
-               goto bail_cleanup;
-       }
-       dd->majrev = (dd->revision >> CCE_REVISION_CHIP_REV_MAJOR_SHIFT)
-                       & CCE_REVISION_CHIP_REV_MAJOR_MASK;
-       dd->minrev = (dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT)
-                       & CCE_REVISION_CHIP_REV_MINOR_MASK;
-
-       /*
-        * Check interrupt registers mapping if the driver has no access to
-        * the upstream component. In this case, it is likely that the driver
-        * is running in a VM.
-        */
-       if (!parent) {
-               ret = check_int_registers(dd);
-               if (ret)
-                       goto bail_cleanup;
-       }
-
-       /*
-        * obtain the hardware ID - NOT related to unit, which is a
-        * software enumeration
-        */
-       reg = read_csr(dd, CCE_REVISION2);
-       dd->hfi1_id = (reg >> CCE_REVISION2_HFI_ID_SHIFT)
-                                       & CCE_REVISION2_HFI_ID_MASK;
-       /* the variable size will remove unwanted bits */
-       dd->icode = reg >> CCE_REVISION2_IMPL_CODE_SHIFT;
-       dd->irev = reg >> CCE_REVISION2_IMPL_REVISION_SHIFT;
-       dd_dev_info(dd, "Implementation: %s, revision 0x%x\n",
-                   dd->icode < ARRAY_SIZE(inames) ?
-                   inames[dd->icode] : "unknown", (int)dd->irev);
-
-       /* speeds the hardware can support */
-       dd->pport->link_speed_supported = OPA_LINK_SPEED_25G;
-       /* speeds allowed to run at */
-       dd->pport->link_speed_enabled = dd->pport->link_speed_supported;
-       /* give a reasonable active value, will be set on link up */
-       dd->pport->link_speed_active = OPA_LINK_SPEED_25G;
-
-       dd->chip_rcv_contexts = read_csr(dd, RCV_CONTEXTS);
-       dd->chip_send_contexts = read_csr(dd, SEND_CONTEXTS);
-       dd->chip_sdma_engines = read_csr(dd, SEND_DMA_ENGINES);
-       dd->chip_pio_mem_size = read_csr(dd, SEND_PIO_MEM_SIZE);
-       dd->chip_sdma_mem_size = read_csr(dd, SEND_DMA_MEM_SIZE);
-       /* fix up link widths for emulation _p */
-       ppd = dd->pport;
-       if (dd->icode == ICODE_FPGA_EMULATION && is_emulator_p(dd)) {
-               ppd->link_width_supported =
-                       ppd->link_width_enabled =
-                       ppd->link_width_downgrade_supported =
-                       ppd->link_width_downgrade_enabled =
-                               OPA_LINK_WIDTH_1X;
-       }
-       /* ensure num_vls isn't larger than number of sdma engines */
-       if (HFI1_CAP_IS_KSET(SDMA) && num_vls > dd->chip_sdma_engines) {
-               dd_dev_err(dd, "num_vls %u too large, using %u VLs\n",
-                          num_vls, dd->chip_sdma_engines);
-               num_vls = dd->chip_sdma_engines;
-               ppd->vls_supported = dd->chip_sdma_engines;
-               ppd->vls_operational = ppd->vls_supported;
-       }
-
-       /*
-        * Convert the ns parameter to the 64 * cclocks used in the CSR.
-        * Limit the max if larger than the field holds.  If timeout is
-        * non-zero, then the calculated field will be at least 1.
-        *
-        * Must be after icode is set up - the cclock rate depends
-        * on knowing the hardware being used.
-        */
-       dd->rcv_intr_timeout_csr = ns_to_cclock(dd, rcv_intr_timeout) / 64;
-       if (dd->rcv_intr_timeout_csr >
-                       RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK)
-               dd->rcv_intr_timeout_csr =
-                       RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK;
-       else if (dd->rcv_intr_timeout_csr == 0 && rcv_intr_timeout)
-               dd->rcv_intr_timeout_csr = 1;
-
-       /* needs to be done before we look for the peer device */
-       read_guid(dd);
-
-       /* set up shared ASIC data with peer device */
-       ret = init_asic_data(dd);
-       if (ret)
-               goto bail_cleanup;
-
-       /* obtain chip sizes, reset chip CSRs */
-       init_chip(dd);
-
-       /* read in the PCIe link speed information */
-       ret = pcie_speeds(dd);
-       if (ret)
-               goto bail_cleanup;
-
-       /* Needs to be called before hfi1_firmware_init */
-       get_platform_config(dd);
-
-       /* read in firmware */
-       ret = hfi1_firmware_init(dd);
-       if (ret)
-               goto bail_cleanup;
-
-       /*
-        * In general, the PCIe Gen3 transition must occur after the
-        * chip has been idled (so it won't initiate any PCIe transactions
-        * e.g. an interrupt) and before the driver changes any registers
-        * (the transition will reset the registers).
-        *
-        * In particular, place this call after:
-        * - init_chip()     - the chip will not initiate any PCIe transactions
-        * - pcie_speeds()   - reads the current link speed
-        * - hfi1_firmware_init() - the needed firmware is ready to be
-        *                          downloaded
-        */
-       ret = do_pcie_gen3_transition(dd);
-       if (ret)
-               goto bail_cleanup;
-
-       /* start setting dd values and adjusting CSRs */
-       init_early_variables(dd);
-
-       parse_platform_config(dd);
-
-       ret = obtain_boardname(dd);
-       if (ret)
-               goto bail_cleanup;
-
-       snprintf(dd->boardversion, BOARD_VERS_MAX,
-                "ChipABI %u.%u, ChipRev %u.%u, SW Compat %llu\n",
-                HFI1_CHIP_VERS_MAJ, HFI1_CHIP_VERS_MIN,
-                (u32)dd->majrev,
-                (u32)dd->minrev,
-                (dd->revision >> CCE_REVISION_SW_SHIFT)
-                   & CCE_REVISION_SW_MASK);
-
-       /*
-        * The real cpu mask is part of the affinity struct but has to be
-        * initialized earlier than the rest of the affinity struct because it
-        * is needed to calculate the number of user contexts in
-        * set_up_context_variables(). However, hfi1_dev_affinity_init(),
-        * which initializes the rest of the affinity struct members,
-        * depends on set_up_context_variables() for the number of kernel
-        * contexts, so it cannot be called before set_up_context_variables().
-        */
-       ret = init_real_cpu_mask(dd);
-       if (ret)
-               goto bail_cleanup;
-
-       ret = set_up_context_variables(dd);
-       if (ret)
-               goto bail_cleanup;
-
-       /* set initial RXE CSRs */
-       init_rxe(dd);
-       /* set initial TXE CSRs */
-       init_txe(dd);
-       /* set initial non-RXE, non-TXE CSRs */
-       init_other(dd);
-       /* set up KDETH QP prefix in both RX and TX CSRs */
-       init_kdeth_qp(dd);
-
-       hfi1_dev_affinity_init(dd);
-
-       /* send contexts must be set up before receive contexts */
-       ret = init_send_contexts(dd);
-       if (ret)
-               goto bail_cleanup;
-
-       ret = hfi1_create_ctxts(dd);
-       if (ret)
-               goto bail_cleanup;
-
-       dd->rcvhdrsize = DEFAULT_RCVHDRSIZE;
-       /*
-        * rcd[0] is guaranteed to be valid by this point. Also, all
-        * contexts are using the same value, as per the module parameter.
-        */
-       dd->rhf_offset = dd->rcd[0]->rcvhdrqentsize - sizeof(u64) / sizeof(u32);
-
-       ret = init_pervl_scs(dd);
-       if (ret)
-               goto bail_cleanup;
-
-       /* sdma init */
-       for (i = 0; i < dd->num_pports; ++i) {
-               ret = sdma_init(dd, i);
-               if (ret)
-                       goto bail_cleanup;
-       }
-
-       /* use contexts created by hfi1_create_ctxts */
-       ret = set_up_interrupts(dd);
-       if (ret)
-               goto bail_cleanup;
-
-       /* set up LCB access - must be after set_up_interrupts() */
-       init_lcb_access(dd);
-
-       snprintf(dd->serial, SERIAL_MAX, "0x%08llx\n",
-                dd->base_guid & 0xFFFFFF);
-
-       dd->oui1 = dd->base_guid >> 56 & 0xFF;
-       dd->oui2 = dd->base_guid >> 48 & 0xFF;
-       dd->oui3 = dd->base_guid >> 40 & 0xFF;
-
-       ret = load_firmware(dd); /* asymmetric with dispose_firmware() */
-       if (ret)
-               goto bail_clear_intr;
-       check_fabric_firmware_versions(dd);
-
-       thermal_init(dd);
-
-       ret = init_cntrs(dd);
-       if (ret)
-               goto bail_clear_intr;
-
-       ret = init_rcverr(dd);
-       if (ret)
-               goto bail_free_cntrs;
-
-       ret = eprom_init(dd);
-       if (ret)
-               goto bail_free_rcverr;
-
-       goto bail;
-
-bail_free_rcverr:
-       free_rcverr(dd);
-bail_free_cntrs:
-       free_cntrs(dd);
-bail_clear_intr:
-       clean_up_interrupts(dd);
-bail_cleanup:
-       hfi1_pcie_ddcleanup(dd);
-bail_free:
-       hfi1_free_devdata(dd);
-       dd = ERR_PTR(ret);
-bail:
-       return dd;
-}
-
-static u16 delay_cycles(struct hfi1_pportdata *ppd, u32 desired_egress_rate,
-                       u32 dw_len)
-{
-       u32 delta_cycles;
-       u32 current_egress_rate = ppd->current_egress_rate;
-       /* rates here are in units of 10^6 bits/sec */
-
-       if (desired_egress_rate == -1)
-               return 0; /* shouldn't happen */
-
-       if (desired_egress_rate >= current_egress_rate)
-               return 0; /* we can't help go faster, only slower */
-
-       delta_cycles = egress_cycles(dw_len * 4, desired_egress_rate) -
-                       egress_cycles(dw_len * 4, current_egress_rate);
-
-       return (u16)delta_cycles;
-}
-
-/**
- * create_pbc - build a pbc for transmission
- * @flags: special case flags or-ed in built pbc
- * @srate: static rate
- * @vl: vl
- * @dwlen: dword length (header words + data words + pbc words)
- *
- * Create a PBC with the given flags, rate, VL, and length.
- *
- * NOTE: The PBC created will not insert any HCRC - all callers but one are
- * for verbs, which does not use this PSM feature.  The lone other caller
- * is for the diagnostic interface which calls this if the user does not
- * supply their own PBC.
- */
-u64 create_pbc(struct hfi1_pportdata *ppd, u64 flags, int srate_mbs, u32 vl,
-              u32 dw_len)
-{
-       u64 pbc, delay = 0;
-
-       if (unlikely(srate_mbs))
-               delay = delay_cycles(ppd, srate_mbs, dw_len);
-
-       pbc = flags
-               | (delay << PBC_STATIC_RATE_CONTROL_COUNT_SHIFT)
-               | ((u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT)
-               | (vl & PBC_VL_MASK) << PBC_VL_SHIFT
-               | (dw_len & PBC_LENGTH_DWS_MASK)
-                       << PBC_LENGTH_DWS_SHIFT;
-
-       return pbc;
-}
-
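create_pbc() is a straight field pack: the static-rate delay goes at bit 32, the HCRC select at bit 26 (hard-coded to PBC_IHCRC_NONE), the VL at bit 12 and the dword length at bit 0, per the PBC_* definitions in chip.h below. A standalone sketch of the same packing with the shifts and masks written out:

#include <stdint.h>
#include <stdio.h>

static uint64_t pack_pbc(uint64_t flags, uint64_t delay,
                         uint32_t vl, uint32_t dw_len)
{
        return flags
                | (delay << 32)                 /* static rate control count */
                | ((uint64_t)0x2 << 26)         /* PBC_IHCRC_NONE */
                | ((uint64_t)(vl & 0xf) << 12)  /* VL */
                | (uint64_t)(dw_len & 0xfff);   /* length in dwords */
}

int main(void)
{
        /* hypothetical packet: no flags, no rate delay, VL 0, 16 dwords */
        printf("pbc = 0x%016llx\n", (unsigned long long)pack_pbc(0, 0, 0, 16));
        return 0;
}
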
-#define SBUS_THERMAL    0x4f
-#define SBUS_THERM_MONITOR_MODE 0x1
-
-#define THERM_FAILURE(dev, ret, reason) \
-       dd_dev_err((dd),                                                \
-                  "Thermal sensor initialization failed: %s (%d)\n",   \
-                  (reason), (ret))
-
-/*
- * Initialize the Avago Thermal sensor.
- *
- * After initialization, enable polling of the thermal sensor through the
- * SBus interface. In order for this to work, the SBus Master firmware has
- * to be loaded, because the HW polling logic uses SBus interrupts, which
- * are not supported with the default firmware. Otherwise, no data will be
- * returned through
- * the ASIC_STS_THERM CSR.
- */
-static int thermal_init(struct hfi1_devdata *dd)
-{
-       int ret = 0;
-
-       if (dd->icode != ICODE_RTL_SILICON ||
-           check_chip_resource(dd, CR_THERM_INIT, NULL))
-               return ret;
-
-       ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT);
-       if (ret) {
-               THERM_FAILURE(dd, ret, "Acquire SBus");
-               return ret;
-       }
-
-       dd_dev_info(dd, "Initializing thermal sensor\n");
-       /* Disable polling of thermal readings */
-       write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x0);
-       msleep(100);
-       /* Thermal Sensor Initialization */
-       /*    Step 1: Reset the Thermal SBus Receiver */
-       ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0,
-                               RESET_SBUS_RECEIVER, 0);
-       if (ret) {
-               THERM_FAILURE(dd, ret, "Bus Reset");
-               goto done;
-       }
-       /*    Step 2: Set Reset bit in Thermal block */
-       ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0,
-                               WRITE_SBUS_RECEIVER, 0x1);
-       if (ret) {
-               THERM_FAILURE(dd, ret, "Therm Block Reset");
-               goto done;
-       }
-       /*    Step 3: Write clock divider value (100MHz -> 2MHz) */
-       ret = sbus_request_slow(dd, SBUS_THERMAL, 0x1,
-                               WRITE_SBUS_RECEIVER, 0x32);
-       if (ret) {
-               THERM_FAILURE(dd, ret, "Write Clock Div");
-               goto done;
-       }
-       /*    Step 4: Select temperature mode */
-       ret = sbus_request_slow(dd, SBUS_THERMAL, 0x3,
-                               WRITE_SBUS_RECEIVER,
-                               SBUS_THERM_MONITOR_MODE);
-       if (ret) {
-               THERM_FAILURE(dd, ret, "Write Mode Sel");
-               goto done;
-       }
-       /*    Step 5: De-assert block reset and start conversion */
-       ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0,
-                               WRITE_SBUS_RECEIVER, 0x2);
-       if (ret) {
-               THERM_FAILURE(dd, ret, "Write Reset Deassert");
-               goto done;
-       }
-       /*    Step 5.1: Wait for first conversion (21.5ms per spec) */
-       msleep(22);
-
-       /* Enable polling of thermal readings */
-       write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x1);
-
-       /* Set initialized flag */
-       ret = acquire_chip_resource(dd, CR_THERM_INIT, 0);
-       if (ret)
-               THERM_FAILURE(dd, ret, "Unable to set thermal init flag");
-
-done:
-       release_chip_resource(dd, CR_SBUS);
-       return ret;
-}
-
-static void handle_temp_err(struct hfi1_devdata *dd)
-{
-       struct hfi1_pportdata *ppd = &dd->pport[0];
-       /*
-        * Thermal Critical Interrupt
-        * Put the device into forced freeze mode, take link down to
-        * offline, and put DC into reset.
-        */
-       dd_dev_emerg(dd,
-                    "Critical temperature reached! Forcing device into freeze mode!\n");
-       dd->flags |= HFI1_FORCED_FREEZE;
-       start_freeze_handling(ppd, FREEZE_SELF | FREEZE_ABORT);
-       /*
-        * Shut DC down as much and as quickly as possible.
-        *
-        * Step 1: Take the link down to OFFLINE. This will cause the
-        *         8051 to put the Serdes in reset. However, we don't want to
-        *         go through the entire link state machine since we want to
-        *         shutdown ASAP. Furthermore, this is not a graceful shutdown
-        *         but rather an attempt to save the chip.
-        *         Code below is almost the same as quiet_serdes() but avoids
-        *         all the extra work and the sleeps.
-        */
-       ppd->driver_link_ready = 0;
-       ppd->link_enabled = 0;
-       set_physical_link_state(dd, (OPA_LINKDOWN_REASON_SMA_DISABLED << 8) |
-                               PLS_OFFLINE);
-       /*
-        * Step 2: Shutdown LCB and 8051
-        *         After shutdown, do not restore DC_CFG_RESET value.
-        */
-       dc_shutdown(dd);
-}
diff --git a/drivers/staging/rdma/hfi1/chip.h b/drivers/staging/rdma/hfi1/chip.h
deleted file mode 100644 (file)
index 1948706..0000000
+++ /dev/null
@@ -1,1368 +0,0 @@
-#ifndef _CHIP_H
-#define _CHIP_H
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-/*
- * This file contains all of the defines that are specific to the HFI chip
- */
-
-/* sizes */
-#define CCE_NUM_MSIX_VECTORS 256
-#define CCE_NUM_INT_CSRS 12
-#define CCE_NUM_INT_MAP_CSRS 96
-#define NUM_INTERRUPT_SOURCES 768
-#define RXE_NUM_CONTEXTS 160
-#define RXE_PER_CONTEXT_SIZE 0x1000    /* 4k */
-#define RXE_NUM_TID_FLOWS 32
-#define RXE_NUM_DATA_VL 8
-#define TXE_NUM_CONTEXTS 160
-#define TXE_NUM_SDMA_ENGINES 16
-#define NUM_CONTEXTS_PER_SET 8
-#define VL_ARB_HIGH_PRIO_TABLE_SIZE 16
-#define VL_ARB_LOW_PRIO_TABLE_SIZE 16
-#define VL_ARB_TABLE_SIZE 16
-#define TXE_NUM_32_BIT_COUNTER 7
-#define TXE_NUM_64_BIT_COUNTER 30
-#define TXE_NUM_DATA_VL 8
-#define TXE_PIO_SIZE (32 * 0x100000)   /* 32 MB */
-#define PIO_BLOCK_SIZE 64                      /* bytes */
-#define SDMA_BLOCK_SIZE 64                     /* bytes */
-#define RCV_BUF_BLOCK_SIZE 64               /* bytes */
-#define PIO_CMASK 0x7ff        /* counter mask for free and fill counters */
-#define MAX_EAGER_ENTRIES    2048      /* max receive eager entries */
-#define MAX_TID_PAIR_ENTRIES 1024      /* max receive expected pairs */
-/*
- * Virtual Allocation Unit encoding: AU = 8 * 2^vAU bytes, so with vAU = 3
- * the AU is 64 bytes, fixed for all generation one devices
- */
-#define CM_VAU 3
-/* HFI link credit count, AKA receive buffer depth (RBUF_DEPTH) */
-#define CM_GLOBAL_CREDITS 0x940
-/* Number of PKey entries in the HW */
-#define MAX_PKEY_VALUES 16
-
-#include "chip_registers.h"
-
-#define RXE_PER_CONTEXT_USER   (RXE + RXE_PER_CONTEXT_OFFSET)
-#define TXE_PIO_SEND (TXE + TXE_PIO_SEND_OFFSET)
-
-/* PBC flags */
-#define PBC_INTR               BIT_ULL(31)
-#define PBC_DC_INFO_SHIFT      (30)
-#define PBC_DC_INFO            BIT_ULL(PBC_DC_INFO_SHIFT)
-#define PBC_TEST_EBP           BIT_ULL(29)
-#define PBC_PACKET_BYPASS      BIT_ULL(28)
-#define PBC_CREDIT_RETURN      BIT_ULL(25)
-#define PBC_INSERT_BYPASS_ICRC BIT_ULL(24)
-#define PBC_TEST_BAD_ICRC      BIT_ULL(23)
-#define PBC_FECN               BIT_ULL(22)
-
-/* PbcInsertHcrc field settings */
-#define PBC_IHCRC_LKDETH 0x0   /* insert @ local KDETH offset */
-#define PBC_IHCRC_GKDETH 0x1   /* insert @ global KDETH offset */
-#define PBC_IHCRC_NONE   0x2   /* no HCRC inserted */
-
-/* PBC fields */
-#define PBC_STATIC_RATE_CONTROL_COUNT_SHIFT 32
-#define PBC_STATIC_RATE_CONTROL_COUNT_MASK 0xffffull
-#define PBC_STATIC_RATE_CONTROL_COUNT_SMASK \
-       (PBC_STATIC_RATE_CONTROL_COUNT_MASK << \
-       PBC_STATIC_RATE_CONTROL_COUNT_SHIFT)
-
-#define PBC_INSERT_HCRC_SHIFT 26
-#define PBC_INSERT_HCRC_MASK 0x3ull
-#define PBC_INSERT_HCRC_SMASK \
-       (PBC_INSERT_HCRC_MASK << PBC_INSERT_HCRC_SHIFT)
-
-#define PBC_VL_SHIFT 12
-#define PBC_VL_MASK 0xfull
-#define PBC_VL_SMASK (PBC_VL_MASK << PBC_VL_SHIFT)
-
-#define PBC_LENGTH_DWS_SHIFT 0
-#define PBC_LENGTH_DWS_MASK 0xfffull
-#define PBC_LENGTH_DWS_SMASK \
-       (PBC_LENGTH_DWS_MASK << PBC_LENGTH_DWS_SHIFT)
-
-/* Credit Return Fields */
-#define CR_COUNTER_SHIFT 0
-#define CR_COUNTER_MASK 0x7ffull
-#define CR_COUNTER_SMASK (CR_COUNTER_MASK << CR_COUNTER_SHIFT)
-
-#define CR_STATUS_SHIFT 11
-#define CR_STATUS_MASK 0x1ull
-#define CR_STATUS_SMASK (CR_STATUS_MASK << CR_STATUS_SHIFT)
-
-#define CR_CREDIT_RETURN_DUE_TO_PBC_SHIFT 12
-#define CR_CREDIT_RETURN_DUE_TO_PBC_MASK 0x1ull
-#define CR_CREDIT_RETURN_DUE_TO_PBC_SMASK \
-       (CR_CREDIT_RETURN_DUE_TO_PBC_MASK << \
-       CR_CREDIT_RETURN_DUE_TO_PBC_SHIFT)
-
-#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SHIFT 13
-#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_MASK 0x1ull
-#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SMASK \
-       (CR_CREDIT_RETURN_DUE_TO_THRESHOLD_MASK << \
-       CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SHIFT)
-
-#define CR_CREDIT_RETURN_DUE_TO_ERR_SHIFT 14
-#define CR_CREDIT_RETURN_DUE_TO_ERR_MASK 0x1ull
-#define CR_CREDIT_RETURN_DUE_TO_ERR_SMASK \
-       (CR_CREDIT_RETURN_DUE_TO_ERR_MASK << \
-       CR_CREDIT_RETURN_DUE_TO_ERR_SHIFT)
-
-#define CR_CREDIT_RETURN_DUE_TO_FORCE_SHIFT 15
-#define CR_CREDIT_RETURN_DUE_TO_FORCE_MASK 0x1ull
-#define CR_CREDIT_RETURN_DUE_TO_FORCE_SMASK \
-       (CR_CREDIT_RETURN_DUE_TO_FORCE_MASK << \
-       CR_CREDIT_RETURN_DUE_TO_FORCE_SHIFT)
-
-/* interrupt source numbers */
-#define IS_GENERAL_ERR_START     0
-#define IS_SDMAENG_ERR_START    16
-#define IS_SENDCTXT_ERR_START   32
-#define IS_SDMA_START          192 /* includes SDmaProgress,SDmaIdle */
-#define IS_VARIOUS_START               240
-#define IS_DC_START                    248
-#define IS_RCVAVAIL_START              256
-#define IS_RCVURGENT_START             416
-#define IS_SENDCREDIT_START            576
-#define IS_RESERVED_START              736
-#define IS_MAX_SOURCES         768
-
-/* derived interrupt source values */
-#define IS_GENERAL_ERR_END             IS_SDMAENG_ERR_START
-#define IS_SDMAENG_ERR_END             IS_SENDCTXT_ERR_START
-#define IS_SENDCTXT_ERR_END            IS_SDMA_START
-#define IS_SDMA_END                    IS_VARIOUS_START
-#define IS_VARIOUS_END         IS_DC_START
-#define IS_DC_END                      IS_RCVAVAIL_START
-#define IS_RCVAVAIL_END                IS_RCVURGENT_START
-#define IS_RCVURGENT_END               IS_SENDCREDIT_START
-#define IS_SENDCREDIT_END              IS_RESERVED_START
-#define IS_RESERVED_END                IS_MAX_SOURCES
-
-/* absolute interrupt numbers for QSFP1Int and QSFP2Int */
-#define QSFP1_INT              242
-#define QSFP2_INT              243
-
-/* DCC_CFG_PORT_CONFIG logical link states */
-#define LSTATE_DOWN    0x1
-#define LSTATE_INIT    0x2
-#define LSTATE_ARMED   0x3
-#define LSTATE_ACTIVE  0x4
-
-/* DC8051_STS_CUR_STATE port values (physical link states) */
-#define PLS_DISABLED                      0x30
-#define PLS_OFFLINE                               0x90
-#define PLS_OFFLINE_QUIET                         0x90
-#define PLS_OFFLINE_PLANNED_DOWN_INFORM           0x91
-#define PLS_OFFLINE_READY_TO_QUIET_LT     0x92
-#define PLS_OFFLINE_REPORT_FAILURE                0x93
-#define PLS_OFFLINE_READY_TO_QUIET_BCC    0x94
-#define PLS_POLLING                               0x20
-#define PLS_POLLING_QUIET                         0x20
-#define PLS_POLLING_ACTIVE                        0x21
-#define PLS_CONFIGPHY                     0x40
-#define PLS_CONFIGPHY_DEBOUCE             0x40
-#define PLS_CONFIGPHY_ESTCOMM             0x41
-#define PLS_CONFIGPHY_ESTCOMM_TXRX_HUNT           0x42
-#define PLS_CONFIGPHY_ESTCOMM_LOCAL_COMPLETE   0x43
-#define PLS_CONFIGPHY_OPTEQ                       0x44
-#define PLS_CONFIGPHY_OPTEQ_OPTIMIZING    0x44
-#define PLS_CONFIGPHY_OPTEQ_LOCAL_COMPLETE        0x45
-#define PLS_CONFIGPHY_VERIFYCAP                   0x46
-#define PLS_CONFIGPHY_VERIFYCAP_EXCHANGE          0x46
-#define PLS_CONFIGPHY_VERIFYCAP_LOCAL_COMPLETE 0x47
-#define PLS_CONFIGLT                      0x48
-#define PLS_CONFIGLT_CONFIGURE            0x48
-#define PLS_CONFIGLT_LINK_TRANSFER_ACTIVE         0x49
-#define PLS_LINKUP                                0x50
-#define PLS_PHYTEST                               0xB0
-#define PLS_INTERNAL_SERDES_LOOPBACK      0xe1
-#define PLS_QUICK_LINKUP                          0xe2
-
-/* DC_DC8051_CFG_HOST_CMD_0.REQ_TYPE - 8051 host commands */
-#define HCMD_LOAD_CONFIG_DATA  0x01
-#define HCMD_READ_CONFIG_DATA  0x02
-#define HCMD_CHANGE_PHY_STATE  0x03
-#define HCMD_SEND_LCB_IDLE_MSG 0x04
-#define HCMD_MISC                 0x05
-#define HCMD_READ_LCB_IDLE_MSG 0x06
-#define HCMD_READ_LCB_CSR      0x07
-#define HCMD_WRITE_LCB_CSR     0x08
-#define HCMD_INTERFACE_TEST       0xff
-
-/* DC_DC8051_CFG_HOST_CMD_1.RETURN_CODE - 8051 host command return */
-#define HCMD_SUCCESS 2
-
-/* DC_DC8051_DBG_ERR_INFO_SET_BY_8051.ERROR - error flags */
-#define SPICO_ROM_FAILED               BIT(0)
-#define UNKNOWN_FRAME                  BIT(1)
-#define TARGET_BER_NOT_MET             BIT(2)
-#define FAILED_SERDES_INTERNAL_LOOPBACK        BIT(3)
-#define FAILED_SERDES_INIT             BIT(4)
-#define FAILED_LNI_POLLING             BIT(5)
-#define FAILED_LNI_DEBOUNCE            BIT(6)
-#define FAILED_LNI_ESTBCOMM            BIT(7)
-#define FAILED_LNI_OPTEQ               BIT(8)
-#define FAILED_LNI_VERIFY_CAP1         BIT(9)
-#define FAILED_LNI_VERIFY_CAP2         BIT(10)
-#define FAILED_LNI_CONFIGLT            BIT(11)
-#define HOST_HANDSHAKE_TIMEOUT         BIT(12)
-
-#define FAILED_LNI (FAILED_LNI_POLLING | FAILED_LNI_DEBOUNCE \
-                       | FAILED_LNI_ESTBCOMM | FAILED_LNI_OPTEQ \
-                       | FAILED_LNI_VERIFY_CAP1 \
-                       | FAILED_LNI_VERIFY_CAP2 \
-                       | FAILED_LNI_CONFIGLT | HOST_HANDSHAKE_TIMEOUT)
-
-/* DC_DC8051_DBG_ERR_INFO_SET_BY_8051.HOST_MSG - host message flags */
-#define HOST_REQ_DONE          BIT(0)
-#define BC_PWR_MGM_MSG         BIT(1)
-#define BC_SMA_MSG             BIT(2)
-#define BC_BCC_UNKNOWN_MSG     BIT(3)
-#define BC_IDLE_UNKNOWN_MSG    BIT(4)
-#define EXT_DEVICE_CFG_REQ     BIT(5)
-#define VERIFY_CAP_FRAME       BIT(6)
-#define LINKUP_ACHIEVED                BIT(7)
-#define LINK_GOING_DOWN                BIT(8)
-#define LINK_WIDTH_DOWNGRADED  BIT(9)
-
-/* DC_DC8051_CFG_EXT_DEV_1.REQ_TYPE - 8051 host requests */
-#define HREQ_LOAD_CONFIG       0x01
-#define HREQ_SAVE_CONFIG       0x02
-#define HREQ_READ_CONFIG       0x03
-#define HREQ_SET_TX_EQ_ABS     0x04
-#define HREQ_SET_TX_EQ_REL     0x05
-#define HREQ_ENABLE            0x06
-#define HREQ_CONFIG_DONE       0xfe
-#define HREQ_INTERFACE_TEST    0xff
-
-/* DC_DC8051_CFG_EXT_DEV_0.RETURN_CODE - 8051 host request return codes */
-#define HREQ_INVALID           0x01
-#define HREQ_SUCCESS           0x02
-#define HREQ_NOT_SUPPORTED             0x03
-#define HREQ_FEATURE_NOT_SUPPORTED     0x04 /* request specific feature */
-#define HREQ_REQUEST_REJECTED  0xfe
-#define HREQ_EXECUTION_ONGOING 0xff
-
-/* MISC host command functions */
-#define HCMD_MISC_REQUEST_LCB_ACCESS 0x1
-#define HCMD_MISC_GRANT_LCB_ACCESS   0x2
-
-/* idle flit message types */
-#define IDLE_PHYSICAL_LINK_MGMT 0x1
-#define IDLE_CRU                   0x2
-#define IDLE_SMA                   0x3
-#define IDLE_POWER_MGMT            0x4
-
-/* idle flit message send fields (both send and read) */
-#define IDLE_PAYLOAD_MASK 0xffffffffffull /* 40 bits */
-#define IDLE_PAYLOAD_SHIFT 8
-#define IDLE_MSG_TYPE_MASK 0xf
-#define IDLE_MSG_TYPE_SHIFT 0
-
-/* idle flit message read fields */
-#define READ_IDLE_MSG_TYPE_MASK 0xf
-#define READ_IDLE_MSG_TYPE_SHIFT 0
-
-/* SMA idle flit payload commands */
-#define SMA_IDLE_ARM   1
-#define SMA_IDLE_ACTIVE 2
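/*
 * Illustrative sketch only (not the driver's actual helper): an idle flit
 * message is a single 64-bit value composed from the payload and
 * message-type fields defined above, e.g. an SMA "arm" request that could
 * then be handed to send_idle_sma().
 */
static inline u64 example_sma_idle_message(u64 payload)
{
        return ((payload & IDLE_PAYLOAD_MASK) << IDLE_PAYLOAD_SHIFT) |
               (((u64)IDLE_SMA & IDLE_MSG_TYPE_MASK) << IDLE_MSG_TYPE_SHIFT);
}
/* e.g. send_idle_sma(dd, example_sma_idle_message(SMA_IDLE_ARM)) */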
-
-/* DC_DC8051_CFG_MODE.GENERAL bits */
-#define DISABLE_SELF_GUID_CHECK 0x2
-
-/*
- * Eager buffer minimum and maximum sizes supported by the hardware.
- * All power-of-two sizes in between are supported as well.
- * MAX_EAGER_BUFFER_TOTAL is the maximum size of memory
- * allocatable for Eager buffers to a single context. All others
- * are limits for the RcvArray entries.
- */
-#define MIN_EAGER_BUFFER       (4 * 1024)
-#define MAX_EAGER_BUFFER       (256 * 1024)
-#define MAX_EAGER_BUFFER_TOTAL (64 * (1 << 20)) /* max per ctxt 64MB */
-#define MAX_EXPECTED_BUFFER    (2048 * 1024)
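/*
 * Worked example (illustrative): eager buffers may be 4 KB, 8 KB, ...,
 * 256 KB (any power of two between MIN_EAGER_BUFFER and MAX_EAGER_BUFFER).
 * With the largest 256 KB buffers, a single context can be given at most
 * 64 MB / 256 KB = 256 of them before hitting MAX_EAGER_BUFFER_TOTAL.
 */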
-
-/*
- * Receive expected base and count and eager base and count increment -
- * the CSR fields hold multiples of this value.
- */
-#define RCV_SHIFT 3
-#define RCV_INCREMENT BIT(RCV_SHIFT)
-
-/*
- * Receive header queue entry increment - the CSR holds multiples of
- * this value.
- */
-#define HDRQ_SIZE_SHIFT 5
-#define HDRQ_INCREMENT BIT(HDRQ_SIZE_SHIFT)
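/*
 * Worked example (illustrative values): since the CSR fields hold multiples
 * of the increments above, the programmed value is the count divided by the
 * increment, i.e. a right shift by the corresponding shift:
 *	2048 eager entries -> 2048 >> RCV_SHIFT       = 2048 / 8  = 256
 *	8192 hdrq entries  -> 8192 >> HDRQ_SIZE_SHIFT = 8192 / 32 = 256
 */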
-
-/*
- * Freeze handling flags
- */
-#define FREEZE_ABORT     0x01  /* do not do recovery */
-#define FREEZE_SELF         0x02       /* initiate the freeze */
-#define FREEZE_LINK_DOWN 0x04  /* link is down */
-
-/*
- * Chip implementation codes.
- */
-#define ICODE_RTL_SILICON              0x00
-#define ICODE_RTL_VCS_SIMULATION       0x01
-#define ICODE_FPGA_EMULATION   0x02
-#define ICODE_FUNCTIONAL_SIMULATOR     0x03
-
-/*
- * 8051 data memory size.
- */
-#define DC8051_DATA_MEM_SIZE 0x1000
-
-/*
- * 8051 firmware registers
- */
-#define NUM_GENERAL_FIELDS 0x17
-#define NUM_LANE_FIELDS    0x8
-
-/* 8051 general register Field IDs */
-#define LINK_OPTIMIZATION_SETTINGS   0x00
-#define LINK_TUNING_PARAMETERS      0x02
-#define DC_HOST_COMM_SETTINGS       0x03
-#define TX_SETTINGS                 0x06
-#define VERIFY_CAP_LOCAL_PHY        0x07
-#define VERIFY_CAP_LOCAL_FABRIC             0x08
-#define VERIFY_CAP_LOCAL_LINK_WIDTH  0x09
-#define LOCAL_DEVICE_ID                     0x0a
-#define LOCAL_LNI_INFO              0x0c
-#define REMOTE_LNI_INFO              0x0d
-#define MISC_STATUS                 0x0e
-#define VERIFY_CAP_REMOTE_PHY       0x0f
-#define VERIFY_CAP_REMOTE_FABRIC     0x10
-#define VERIFY_CAP_REMOTE_LINK_WIDTH 0x11
-#define LAST_LOCAL_STATE_COMPLETE    0x12
-#define LAST_REMOTE_STATE_COMPLETE   0x13
-#define LINK_QUALITY_INFO            0x14
-#define REMOTE_DEVICE_ID            0x15
-#define LINK_DOWN_REASON            0x16
-
-/* 8051 lane specific register field IDs */
-#define TX_EQ_SETTINGS         0x00
-#define CHANNEL_LOSS_SETTINGS  0x05
-
-/* Lane ID for general configuration registers */
-#define GENERAL_CONFIG 4
-
-/* LOAD_DATA 8051 command shifts and fields */
-#define LOAD_DATA_FIELD_ID_SHIFT 40
-#define LOAD_DATA_FIELD_ID_MASK 0xfull
-#define LOAD_DATA_LANE_ID_SHIFT 32
-#define LOAD_DATA_LANE_ID_MASK 0xfull
-#define LOAD_DATA_DATA_SHIFT   0x0
-#define LOAD_DATA_DATA_MASK   0xffffffffull
-
-/* READ_DATA 8051 command shifts and fields */
-#define READ_DATA_FIELD_ID_SHIFT 40
-#define READ_DATA_FIELD_ID_MASK 0xffull
-#define READ_DATA_LANE_ID_SHIFT 32
-#define READ_DATA_LANE_ID_MASK 0xffull
-#define READ_DATA_DATA_SHIFT   0x0
-#define READ_DATA_DATA_MASK   0xffffffffull
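/*
 * Illustrative sketch only (the driver builds this in chip.c; the helper
 * name is hypothetical): compose the data word for the LOAD_DATA command
 * from the field/lane/data shifts and masks above.
 */
static inline u64 example_load_data_word(u8 field_id, u8 lane_id, u32 data)
{
        return (((u64)field_id & LOAD_DATA_FIELD_ID_MASK)
                                        << LOAD_DATA_FIELD_ID_SHIFT) |
               (((u64)lane_id & LOAD_DATA_LANE_ID_MASK)
                                        << LOAD_DATA_LANE_ID_SHIFT) |
               (((u64)data & LOAD_DATA_DATA_MASK) << LOAD_DATA_DATA_SHIFT);
}
/*
 * e.g. a TX settings write to the general "lane":
 *	example_load_data_word(TX_SETTINGS, GENERAL_CONFIG, tx_value)
 */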
-
-/* TX settings fields */
-#define ENABLE_LANE_TX_SHIFT           0
-#define ENABLE_LANE_TX_MASK            0xff
-#define TX_POLARITY_INVERSION_SHIFT    8
-#define TX_POLARITY_INVERSION_MASK     0xff
-#define RX_POLARITY_INVERSION_SHIFT    16
-#define RX_POLARITY_INVERSION_MASK     0xff
-#define MAX_RATE_SHIFT                 24
-#define MAX_RATE_MASK                  0xff
-
-/* verify capability PHY fields */
-#define CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT 0x4
-#define CONTINIOUS_REMOTE_UPDATE_SUPPORT_MASK  0x1
-#define POWER_MANAGEMENT_SHIFT                 0x0
-#define POWER_MANAGEMENT_MASK                  0xf
-
-/* 8051 lane register Field IDs */
-#define SPICO_FW_VERSION 0x7   /* SPICO firmware version */
-
-/* SPICO firmware version fields */
-#define SPICO_ROM_VERSION_SHIFT 0
-#define SPICO_ROM_VERSION_MASK 0xffff
-#define SPICO_ROM_PROD_ID_SHIFT 16
-#define SPICO_ROM_PROD_ID_MASK 0xffff
-
-/* verify capability fabric fields */
-#define VAU_SHIFT      0
-#define VAU_MASK       0x0007
-#define Z_SHIFT                3
-#define Z_MASK         0x0001
-#define VCU_SHIFT      4
-#define VCU_MASK       0x0007
-#define VL15BUF_SHIFT  8
-#define VL15BUF_MASK   0x0fff
-#define CRC_SIZES_SHIFT 20
-#define CRC_SIZES_MASK 0x7
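/*
 * Illustrative sketch only: unpacking the fabric verify-capability fields
 * from a raw frame value using the shift/mask pairs above (function and
 * variable names are hypothetical).
 */
static inline void example_unpack_vc_fabric(u32 frame, u8 *vau, u8 *z,
                                            u8 *vcu, u16 *vl15buf,
                                            u8 *crc_sizes)
{
        *vau       = (frame >> VAU_SHIFT) & VAU_MASK;
        *z         = (frame >> Z_SHIFT) & Z_MASK;
        *vcu       = (frame >> VCU_SHIFT) & VCU_MASK;
        *vl15buf   = (frame >> VL15BUF_SHIFT) & VL15BUF_MASK;
        *crc_sizes = (frame >> CRC_SIZES_SHIFT) & CRC_SIZES_MASK;
}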
-
-/* verify capability local link width fields */
-#define LINK_WIDTH_SHIFT 0             /* also for remote link width */
-#define LINK_WIDTH_MASK 0xffff         /* also for remote link width */
-#define LOCAL_FLAG_BITS_SHIFT 16
-#define LOCAL_FLAG_BITS_MASK 0xff
-#define MISC_CONFIG_BITS_SHIFT 24
-#define MISC_CONFIG_BITS_MASK 0xff
-
-/* verify capability remote link width fields */
-#define REMOTE_TX_RATE_SHIFT 16
-#define REMOTE_TX_RATE_MASK 0xff
-
-/* LOCAL_DEVICE_ID fields */
-#define LOCAL_DEVICE_REV_SHIFT 0
-#define LOCAL_DEVICE_REV_MASK 0xff
-#define LOCAL_DEVICE_ID_SHIFT 8
-#define LOCAL_DEVICE_ID_MASK 0xffff
-
-/* REMOTE_DEVICE_ID fields */
-#define REMOTE_DEVICE_REV_SHIFT 0
-#define REMOTE_DEVICE_REV_MASK 0xff
-#define REMOTE_DEVICE_ID_SHIFT 8
-#define REMOTE_DEVICE_ID_MASK 0xffff
-
-/* local LNI link width fields */
-#define ENABLE_LANE_RX_SHIFT 16
-#define ENABLE_LANE_RX_MASK  0xff
-
-/* mask, shift for reading 'mgmt_enabled' value from REMOTE_LNI_INFO field */
-#define MGMT_ALLOWED_SHIFT 23
-#define MGMT_ALLOWED_MASK 0x1
-
-/* mask, shift for 'link_quality' within LINK_QUALITY_INFO field */
-#define LINK_QUALITY_SHIFT 24
-#define LINK_QUALITY_MASK  0x7
-
-/*
- * mask, shift for reading 'planned_down_remote_reason_code'
- * from LINK_QUALITY_INFO field
- */
-#define DOWN_REMOTE_REASON_SHIFT 16
-#define DOWN_REMOTE_REASON_MASK  0xff
-
-/* verify capability PHY power management bits */
-#define PWRM_BER_CONTROL       0x1
-#define PWRM_BANDWIDTH_CONTROL 0x2
-
-/* 8051 link down reasons */
-#define LDR_LINK_TRANSFER_ACTIVE_LOW   0xa
-#define LDR_RECEIVED_LINKDOWN_IDLE_MSG 0xb
-#define LDR_RECEIVED_HOST_OFFLINE_REQ  0xc
-
-/* verify capability fabric CRC size bits */
-enum {
-       CAP_CRC_14B = (1 << 0), /* 14b CRC */
-       CAP_CRC_48B = (1 << 1), /* 48b CRC */
-       CAP_CRC_12B_16B_PER_LANE = (1 << 2) /* 12b-16b per lane CRC */
-};
-
-#define SUPPORTED_CRCS (CAP_CRC_14B | CAP_CRC_48B)
-
-/* misc status version fields */
-#define STS_FM_VERSION_A_SHIFT 16
-#define STS_FM_VERSION_A_MASK  0xff
-#define STS_FM_VERSION_B_SHIFT 24
-#define STS_FM_VERSION_B_MASK  0xff
-
-/* LCB_CFG_CRC_MODE TX_VAL and RX_VAL CRC mode values */
-#define LCB_CRC_16B                    0x0     /* 16b CRC */
-#define LCB_CRC_14B                    0x1     /* 14b CRC */
-#define LCB_CRC_48B                    0x2     /* 48b CRC */
-#define LCB_CRC_12B_16B_PER_LANE       0x3     /* 12b-16b per lane CRC */
-
-/*
- * the following enum is (almost) a copy/paste of the definition
- * in the OPA spec, section 20.2.2.6.8 (PortInfo)
- */
-enum {
-       PORT_LTP_CRC_MODE_NONE = 0,
-       PORT_LTP_CRC_MODE_14 = 1, /* 14-bit LTP CRC mode (optional) */
-       PORT_LTP_CRC_MODE_16 = 2, /* 16-bit LTP CRC mode */
-       PORT_LTP_CRC_MODE_48 = 4,
-               /* 48-bit overlapping LTP CRC mode (optional) */
-       PORT_LTP_CRC_MODE_PER_LANE = 8
-               /* 12 to 16 bit per lane LTP CRC mode (optional) */
-};
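/*
 * Illustrative sketch only of the PORT_LTP -> CAP translation suggested by
 * the comments above; port_ltp_to_cap() is declared later in this header
 * and the real implementation lives in chip.c.
 */
static inline int example_port_ltp_to_cap(int port_ltp)
{
        int cap = 0;

        if (port_ltp & PORT_LTP_CRC_MODE_14)
                cap |= CAP_CRC_14B;
        if (port_ltp & PORT_LTP_CRC_MODE_48)
                cap |= CAP_CRC_48B;
        if (port_ltp & PORT_LTP_CRC_MODE_PER_LANE)
                cap |= CAP_CRC_12B_16B_PER_LANE;
        return cap;
}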
-
-/* timeouts */
-#define LINK_RESTART_DELAY 1000                /* link restart delay, in ms */
-#define TIMEOUT_8051_START 5000         /* 8051 start timeout, in ms */
-#define DC8051_COMMAND_TIMEOUT 20000   /* DC8051 command timeout, in ms */
-#define FREEZE_STATUS_TIMEOUT 20       /* wait for freeze indicators, in ms */
-#define VL_STATUS_CLEAR_TIMEOUT 5000   /* per-VL status clear, in ms */
-#define CCE_STATUS_TIMEOUT 10          /* time to clear CCE Status, in ms */
-
-/* cclock tick time, in picoseconds per tick: 1/speed * 10^12  */
-#define ASIC_CCLOCK_PS  1242   /* 805 MHz */
-#define FPGA_CCLOCK_PS 30300   /*  33 MHz */
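/*
 * Worked example (illustrative): at 805 MHz one tick is
 * 10^12 / 805e6 ~= 1242 ps, so a 1000 ns delay corresponds to roughly
 * 1,000,000 ps / 1242 ps ~= 805 cclock ticks.  ns_to_cclock() and
 * cclock_to_ns(), declared later in this header, perform these
 * conversions; the FPGA emulation value works the same way with
 * 30300 ps per tick.
 */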
-
-/*
- * Mask of enabled MISC errors.  Do not enable the two RSA engine errors -
- * see firmware.c:run_rsa() for details.
- */
-#define DRIVER_MISC_MASK \
-       (~(MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK \
-               | MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK))
-
-/* valid values for the loopback module parameter */
-#define LOOPBACK_NONE  0       /* no loopback - default */
-#define LOOPBACK_SERDES 1
-#define LOOPBACK_LCB   2
-#define LOOPBACK_CABLE 3       /* external cable */
-
-/* read and write hardware registers */
-u64 read_csr(const struct hfi1_devdata *dd, u32 offset);
-void write_csr(const struct hfi1_devdata *dd, u32 offset, u64 value);
-
-/*
- * The *_kctxt_* flavor of the CSR read/write functions is for
- * per-context or per-SDMA CSRs that are not mappable to user-space.
- * Their spacing is not a PAGE_SIZE multiple.
- */
-static inline u64 read_kctxt_csr(const struct hfi1_devdata *dd, int ctxt,
-                                u32 offset0)
-{
-       /* kernel per-context CSRs are separated by 0x100 */
-       return read_csr(dd, offset0 + (0x100 * ctxt));
-}
-
-static inline void write_kctxt_csr(struct hfi1_devdata *dd, int ctxt,
-                                  u32 offset0, u64 value)
-{
-       /* kernel per-context CSRs are separated by 0x100 */
-       write_csr(dd, offset0 + (0x100 * ctxt), value);
-}
-
-int read_lcb_csr(struct hfi1_devdata *dd, u32 offset, u64 *data);
-int write_lcb_csr(struct hfi1_devdata *dd, u32 offset, u64 data);
-
-void __iomem *get_csr_addr(
-       struct hfi1_devdata *dd,
-       u32 offset);
-
-static inline void __iomem *get_kctxt_csr_addr(
-       struct hfi1_devdata *dd,
-       int ctxt,
-       u32 offset0)
-{
-       return get_csr_addr(dd, offset0 + (0x100 * ctxt));
-}
-
-/*
- * The *_uctxt_* flavor of the CSR read/write functions is for
- * per-context CSRs that are mappable to user space. All these CSRs
- * are spaced by a PAGE_SIZE multiple in order to be mappable to
- * different processes without exposing other contexts' CSRs
- */
-static inline u64 read_uctxt_csr(const struct hfi1_devdata *dd, int ctxt,
-                                u32 offset0)
-{
-       /* user per-context CSRs are separated by 0x1000 */
-       return read_csr(dd, offset0 + (0x1000 * ctxt));
-}
-
-static inline void write_uctxt_csr(struct hfi1_devdata *dd, int ctxt,
-                                  u32 offset0, u64 value)
-{
-       /* user per-context CSRs are separated by 0x1000 */
-       write_csr(dd, offset0 + (0x1000 * ctxt), value);
-}
-
-u64 create_pbc(struct hfi1_pportdata *ppd, u64, int, u32, u32);
-
-/* firmware.c */
-#define SBUS_MASTER_BROADCAST 0xfd
-#define NUM_PCIE_SERDES 16     /* number of PCIe serdes on the SBus */
-extern const u8 pcie_serdes_broadcast[];
-extern const u8 pcie_pcs_addrs[2][NUM_PCIE_SERDES];
-extern uint platform_config_load;
-
-/* SBus commands */
-#define RESET_SBUS_RECEIVER 0x20
-#define WRITE_SBUS_RECEIVER 0x21
-void sbus_request(struct hfi1_devdata *dd,
-                 u8 receiver_addr, u8 data_addr, u8 command, u32 data_in);
-int sbus_request_slow(struct hfi1_devdata *dd,
-                     u8 receiver_addr, u8 data_addr, u8 command, u32 data_in);
-void set_sbus_fast_mode(struct hfi1_devdata *dd);
-void clear_sbus_fast_mode(struct hfi1_devdata *dd);
-int hfi1_firmware_init(struct hfi1_devdata *dd);
-int load_pcie_firmware(struct hfi1_devdata *dd);
-int load_firmware(struct hfi1_devdata *dd);
-void dispose_firmware(void);
-int acquire_hw_mutex(struct hfi1_devdata *dd);
-void release_hw_mutex(struct hfi1_devdata *dd);
-
-/*
- * Bitmask of dynamic access for ASIC block chip resources.  Each HFI has its
- * own range of bits for the resource so it can clear its own bits on
- * starting and exiting.  If either HFI has the resource bit set, the
- * resource is in use.  The separate bit ranges are:
- *     HFI0 bits  7:0
- *     HFI1 bits 15:8
- */
-#define CR_SBUS  0x01  /* SBUS, THERM, and PCIE registers */
-#define CR_EPROM 0x02  /* EEP, GPIO registers */
-#define CR_I2C1  0x04  /* QSFP1_OE register */
-#define CR_I2C2  0x08  /* QSFP2_OE register */
-#define CR_DYN_SHIFT 8 /* dynamic flag shift */
-#define CR_DYN_MASK  ((1ull << CR_DYN_SHIFT) - 1)
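/*
 * Illustrative sketch only (helper name is hypothetical): each HFI shifts
 * its dynamic resource bits into its own byte, per the layout above, so
 * the same resource is tracked independently for HFI0 and HFI1.
 */
static inline u64 example_resource_mask(u32 resource, int hfi_index)
{
        return (u64)(resource & CR_DYN_MASK) << (hfi_index * CR_DYN_SHIFT);
}
/* e.g. CR_SBUS on HFI1 -> 0x01 << 8 = 0x100 */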
-
-/*
- * Bitmask of static ASIC states; these are outside of the dynamic ASIC
- * block chip resources above.  These are to be set once and never cleared.
- * Must be holding the SBus dynamic flag when setting.
- */
-#define CR_THERM_INIT  0x010000
-
-int acquire_chip_resource(struct hfi1_devdata *dd, u32 resource, u32 mswait);
-void release_chip_resource(struct hfi1_devdata *dd, u32 resource);
-bool check_chip_resource(struct hfi1_devdata *dd, u32 resource,
-                        const char *func);
-void init_chip_resources(struct hfi1_devdata *dd);
-void finish_chip_resources(struct hfi1_devdata *dd);
-
-/* ms wait time for access to an SBus resource */
-#define SBUS_TIMEOUT 4000 /* long enough for a FW download and SBR */
-
-/* ms wait time for a qsfp (i2c) chain to become available */
-#define QSFP_WAIT 20000 /* long enough for FW update to the F4 uc */
-
-void fabric_serdes_reset(struct hfi1_devdata *dd);
-int read_8051_data(struct hfi1_devdata *dd, u32 addr, u32 len, u64 *result);
-
-/* chip.c */
-void read_misc_status(struct hfi1_devdata *dd, u8 *ver_a, u8 *ver_b);
-void read_guid(struct hfi1_devdata *dd);
-int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout);
-void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason,
-                         u8 neigh_reason, u8 rem_reason);
-int set_link_state(struct hfi1_pportdata *, u32 state);
-int port_ltp_to_cap(int port_ltp);
-void handle_verify_cap(struct work_struct *work);
-void handle_freeze(struct work_struct *work);
-void handle_link_up(struct work_struct *work);
-void handle_link_down(struct work_struct *work);
-void handle_link_downgrade(struct work_struct *work);
-void handle_link_bounce(struct work_struct *work);
-void handle_sma_message(struct work_struct *work);
-void reset_qsfp(struct hfi1_pportdata *ppd);
-void qsfp_event(struct work_struct *work);
-void start_freeze_handling(struct hfi1_pportdata *ppd, int flags);
-int send_idle_sma(struct hfi1_devdata *dd, u64 message);
-int load_8051_config(struct hfi1_devdata *, u8, u8, u32);
-int read_8051_config(struct hfi1_devdata *, u8, u8, u32 *);
-int start_link(struct hfi1_pportdata *ppd);
-int bringup_serdes(struct hfi1_pportdata *ppd);
-void set_intr_state(struct hfi1_devdata *dd, u32 enable);
-void apply_link_downgrade_policy(struct hfi1_pportdata *ppd,
-                                int refresh_widths);
-void update_usrhead(struct hfi1_ctxtdata *, u32, u32, u32, u32, u32);
-int stop_drain_data_vls(struct hfi1_devdata *dd);
-int open_fill_data_vls(struct hfi1_devdata *dd);
-u32 ns_to_cclock(struct hfi1_devdata *dd, u32 ns);
-u32 cclock_to_ns(struct hfi1_devdata *dd, u32 cclock);
-void get_linkup_link_widths(struct hfi1_pportdata *ppd);
-void read_ltp_rtt(struct hfi1_devdata *dd);
-void clear_linkup_counters(struct hfi1_devdata *dd);
-u32 hdrqempty(struct hfi1_ctxtdata *rcd);
-int is_ax(struct hfi1_devdata *dd);
-int is_bx(struct hfi1_devdata *dd);
-u32 read_physical_state(struct hfi1_devdata *dd);
-u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate);
-u32 get_logical_state(struct hfi1_pportdata *ppd);
-const char *opa_lstate_name(u32 lstate);
-const char *opa_pstate_name(u32 pstate);
-u32 driver_physical_state(struct hfi1_pportdata *ppd);
-u32 driver_logical_state(struct hfi1_pportdata *ppd);
-
-int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok);
-int release_lcb_access(struct hfi1_devdata *dd, int sleep_ok);
-#define LCB_START DC_LCB_CSRS
-#define LCB_END   DC_8051_CSRS /* next block is 8051 */
-static inline int is_lcb_offset(u32 offset)
-{
-       return (offset >= LCB_START && offset < LCB_END);
-}
-
-extern uint num_vls;
-
-extern uint disable_integrity;
-u64 read_dev_cntr(struct hfi1_devdata *dd, int index, int vl);
-u64 write_dev_cntr(struct hfi1_devdata *dd, int index, int vl, u64 data);
-u64 read_port_cntr(struct hfi1_pportdata *ppd, int index, int vl);
-u64 write_port_cntr(struct hfi1_pportdata *ppd, int index, int vl, u64 data);
-u32 read_logical_state(struct hfi1_devdata *dd);
-void force_recv_intr(struct hfi1_ctxtdata *rcd);
-
-/* Per VL indexes */
-enum {
-       C_VL_0 = 0,
-       C_VL_1,
-       C_VL_2,
-       C_VL_3,
-       C_VL_4,
-       C_VL_5,
-       C_VL_6,
-       C_VL_7,
-       C_VL_15,
-       C_VL_COUNT
-};
-
-static inline int vl_from_idx(int idx)
-{
-       return (idx == C_VL_15 ? 15 : idx);
-}
-
-static inline int idx_from_vl(int vl)
-{
-       return (vl == 15 ? C_VL_15 : vl);
-}
-
-/* Per device counter indexes */
-enum {
-       C_RCV_OVF = 0,
-       C_RX_TID_FULL,
-       C_RX_TID_INVALID,
-       C_RX_TID_FLGMS,
-       C_RX_CTX_EGRS,
-       C_RCV_TID_FLSMS,
-       C_CCE_PCI_CR_ST,
-       C_CCE_PCI_TR_ST,
-       C_CCE_PIO_WR_ST,
-       C_CCE_ERR_INT,
-       C_CCE_SDMA_INT,
-       C_CCE_MISC_INT,
-       C_CCE_RCV_AV_INT,
-       C_CCE_RCV_URG_INT,
-       C_CCE_SEND_CR_INT,
-       C_DC_UNC_ERR,
-       C_DC_RCV_ERR,
-       C_DC_FM_CFG_ERR,
-       C_DC_RMT_PHY_ERR,
-       C_DC_DROPPED_PKT,
-       C_DC_MC_XMIT_PKTS,
-       C_DC_MC_RCV_PKTS,
-       C_DC_XMIT_CERR,
-       C_DC_RCV_CERR,
-       C_DC_RCV_FCC,
-       C_DC_XMIT_FCC,
-       C_DC_XMIT_FLITS,
-       C_DC_RCV_FLITS,
-       C_DC_XMIT_PKTS,
-       C_DC_RCV_PKTS,
-       C_DC_RX_FLIT_VL,
-       C_DC_RX_PKT_VL,
-       C_DC_RCV_FCN,
-       C_DC_RCV_FCN_VL,
-       C_DC_RCV_BCN,
-       C_DC_RCV_BCN_VL,
-       C_DC_RCV_BBL,
-       C_DC_RCV_BBL_VL,
-       C_DC_MARK_FECN,
-       C_DC_MARK_FECN_VL,
-       C_DC_TOTAL_CRC,
-       C_DC_CRC_LN0,
-       C_DC_CRC_LN1,
-       C_DC_CRC_LN2,
-       C_DC_CRC_LN3,
-       C_DC_CRC_MULT_LN,
-       C_DC_TX_REPLAY,
-       C_DC_RX_REPLAY,
-       C_DC_SEQ_CRC_CNT,
-       C_DC_ESC0_ONLY_CNT,
-       C_DC_ESC0_PLUS1_CNT,
-       C_DC_ESC0_PLUS2_CNT,
-       C_DC_REINIT_FROM_PEER_CNT,
-       C_DC_SBE_CNT,
-       C_DC_MISC_FLG_CNT,
-       C_DC_PRF_GOOD_LTP_CNT,
-       C_DC_PRF_ACCEPTED_LTP_CNT,
-       C_DC_PRF_RX_FLIT_CNT,
-       C_DC_PRF_TX_FLIT_CNT,
-       C_DC_PRF_CLK_CNTR,
-       C_DC_PG_DBG_FLIT_CRDTS_CNT,
-       C_DC_PG_STS_PAUSE_COMPLETE_CNT,
-       C_DC_PG_STS_TX_SBE_CNT,
-       C_DC_PG_STS_TX_MBE_CNT,
-       C_SW_CPU_INTR,
-       C_SW_CPU_RCV_LIM,
-       C_SW_VTX_WAIT,
-       C_SW_PIO_WAIT,
-       C_SW_PIO_DRAIN,
-       C_SW_KMEM_WAIT,
-       C_SW_SEND_SCHED,
-       C_SDMA_DESC_FETCHED_CNT,
-       C_SDMA_INT_CNT,
-       C_SDMA_ERR_CNT,
-       C_SDMA_IDLE_INT_CNT,
-       C_SDMA_PROGRESS_INT_CNT,
-/* MISC_ERR_STATUS */
-       C_MISC_PLL_LOCK_FAIL_ERR,
-       C_MISC_MBIST_FAIL_ERR,
-       C_MISC_INVALID_EEP_CMD_ERR,
-       C_MISC_EFUSE_DONE_PARITY_ERR,
-       C_MISC_EFUSE_WRITE_ERR,
-       C_MISC_EFUSE_READ_BAD_ADDR_ERR,
-       C_MISC_EFUSE_CSR_PARITY_ERR,
-       C_MISC_FW_AUTH_FAILED_ERR,
-       C_MISC_KEY_MISMATCH_ERR,
-       C_MISC_SBUS_WRITE_FAILED_ERR,
-       C_MISC_CSR_WRITE_BAD_ADDR_ERR,
-       C_MISC_CSR_READ_BAD_ADDR_ERR,
-       C_MISC_CSR_PARITY_ERR,
-/* CceErrStatus */
-       /*
-       * A special counter that is the aggregate count
-       * of all the cce_err_status errors.  The remainder
-       * are actual bits in the CceErrStatus register.
-       */
-       C_CCE_ERR_STATUS_AGGREGATED_CNT,
-       C_CCE_MSIX_CSR_PARITY_ERR,
-       C_CCE_INT_MAP_UNC_ERR,
-       C_CCE_INT_MAP_COR_ERR,
-       C_CCE_MSIX_TABLE_UNC_ERR,
-       C_CCE_MSIX_TABLE_COR_ERR,
-       C_CCE_RXDMA_CONV_FIFO_PARITY_ERR,
-       C_CCE_RCPL_ASYNC_FIFO_PARITY_ERR,
-       C_CCE_SEG_WRITE_BAD_ADDR_ERR,
-       C_CCE_SEG_READ_BAD_ADDR_ERR,
-       C_LA_TRIGGERED,
-       C_CCE_TRGT_CPL_TIMEOUT_ERR,
-       C_PCIC_RECEIVE_PARITY_ERR,
-       C_PCIC_TRANSMIT_BACK_PARITY_ERR,
-       C_PCIC_TRANSMIT_FRONT_PARITY_ERR,
-       C_PCIC_CPL_DAT_Q_UNC_ERR,
-       C_PCIC_CPL_HD_Q_UNC_ERR,
-       C_PCIC_POST_DAT_Q_UNC_ERR,
-       C_PCIC_POST_HD_Q_UNC_ERR,
-       C_PCIC_RETRY_SOT_MEM_UNC_ERR,
-       C_PCIC_RETRY_MEM_UNC_ERR,
-       C_PCIC_N_POST_DAT_Q_PARITY_ERR,
-       C_PCIC_N_POST_H_Q_PARITY_ERR,
-       C_PCIC_CPL_DAT_Q_COR_ERR,
-       C_PCIC_CPL_HD_Q_COR_ERR,
-       C_PCIC_POST_DAT_Q_COR_ERR,
-       C_PCIC_POST_HD_Q_COR_ERR,
-       C_PCIC_RETRY_SOT_MEM_COR_ERR,
-       C_PCIC_RETRY_MEM_COR_ERR,
-       C_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERR,
-       C_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERR,
-       C_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR,
-       C_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR,
-       C_CCE_CLI2_ASYNC_FIFO_PARITY_ERR,
-       C_CCE_CSR_CFG_BUS_PARITY_ERR,
-       C_CCE_CLI0_ASYNC_FIFO_PARTIY_ERR,
-       C_CCE_RSPD_DATA_PARITY_ERR,
-       C_CCE_TRGT_ACCESS_ERR,
-       C_CCE_TRGT_ASYNC_FIFO_PARITY_ERR,
-       C_CCE_CSR_WRITE_BAD_ADDR_ERR,
-       C_CCE_CSR_READ_BAD_ADDR_ERR,
-       C_CCE_CSR_PARITY_ERR,
-/* RcvErrStatus */
-       C_RX_CSR_PARITY_ERR,
-       C_RX_CSR_WRITE_BAD_ADDR_ERR,
-       C_RX_CSR_READ_BAD_ADDR_ERR,
-       C_RX_DMA_CSR_UNC_ERR,
-       C_RX_DMA_DQ_FSM_ENCODING_ERR,
-       C_RX_DMA_EQ_FSM_ENCODING_ERR,
-       C_RX_DMA_CSR_PARITY_ERR,
-       C_RX_RBUF_DATA_COR_ERR,
-       C_RX_RBUF_DATA_UNC_ERR,
-       C_RX_DMA_DATA_FIFO_RD_COR_ERR,
-       C_RX_DMA_DATA_FIFO_RD_UNC_ERR,
-       C_RX_DMA_HDR_FIFO_RD_COR_ERR,
-       C_RX_DMA_HDR_FIFO_RD_UNC_ERR,
-       C_RX_RBUF_DESC_PART2_COR_ERR,
-       C_RX_RBUF_DESC_PART2_UNC_ERR,
-       C_RX_RBUF_DESC_PART1_COR_ERR,
-       C_RX_RBUF_DESC_PART1_UNC_ERR,
-       C_RX_HQ_INTR_FSM_ERR,
-       C_RX_HQ_INTR_CSR_PARITY_ERR,
-       C_RX_LOOKUP_CSR_PARITY_ERR,
-       C_RX_LOOKUP_RCV_ARRAY_COR_ERR,
-       C_RX_LOOKUP_RCV_ARRAY_UNC_ERR,
-       C_RX_LOOKUP_DES_PART2_PARITY_ERR,
-       C_RX_LOOKUP_DES_PART1_UNC_COR_ERR,
-       C_RX_LOOKUP_DES_PART1_UNC_ERR,
-       C_RX_RBUF_NEXT_FREE_BUF_COR_ERR,
-       C_RX_RBUF_NEXT_FREE_BUF_UNC_ERR,
-       C_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR,
-       C_RX_RBUF_FL_INITDONE_PARITY_ERR,
-       C_RX_RBUF_FL_WRITE_ADDR_PARITY_ERR,
-       C_RX_RBUF_FL_RD_ADDR_PARITY_ERR,
-       C_RX_RBUF_EMPTY_ERR,
-       C_RX_RBUF_FULL_ERR,
-       C_RX_RBUF_BAD_LOOKUP_ERR,
-       C_RX_RBUF_CTX_ID_PARITY_ERR,
-       C_RX_RBUF_CSR_QEOPDW_PARITY_ERR,
-       C_RX_RBUF_CSR_Q_NUM_OF_PKT_PARITY_ERR,
-       C_RX_RBUF_CSR_Q_T1_PTR_PARITY_ERR,
-       C_RX_RBUF_CSR_Q_HD_PTR_PARITY_ERR,
-       C_RX_RBUF_CSR_Q_VLD_BIT_PARITY_ERR,
-       C_RX_RBUF_CSR_Q_NEXT_BUF_PARITY_ERR,
-       C_RX_RBUF_CSR_Q_ENT_CNT_PARITY_ERR,
-       C_RX_RBUF_CSR_Q_HEAD_BUF_NUM_PARITY_ERR,
-       C_RX_RBUF_BLOCK_LIST_READ_COR_ERR,
-       C_RX_RBUF_BLOCK_LIST_READ_UNC_ERR,
-       C_RX_RBUF_LOOKUP_DES_COR_ERR,
-       C_RX_RBUF_LOOKUP_DES_UNC_ERR,
-       C_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR,
-       C_RX_RBUF_LOOKUP_DES_REG_UNC_ERR,
-       C_RX_RBUF_FREE_LIST_COR_ERR,
-       C_RX_RBUF_FREE_LIST_UNC_ERR,
-       C_RX_RCV_FSM_ENCODING_ERR,
-       C_RX_DMA_FLAG_COR_ERR,
-       C_RX_DMA_FLAG_UNC_ERR,
-       C_RX_DC_SOP_EOP_PARITY_ERR,
-       C_RX_RCV_CSR_PARITY_ERR,
-       C_RX_RCV_QP_MAP_TABLE_COR_ERR,
-       C_RX_RCV_QP_MAP_TABLE_UNC_ERR,
-       C_RX_RCV_DATA_COR_ERR,
-       C_RX_RCV_DATA_UNC_ERR,
-       C_RX_RCV_HDR_COR_ERR,
-       C_RX_RCV_HDR_UNC_ERR,
-       C_RX_DC_INTF_PARITY_ERR,
-       C_RX_DMA_CSR_COR_ERR,
-/* SendPioErrStatus */
-       C_PIO_PEC_SOP_HEAD_PARITY_ERR,
-       C_PIO_PCC_SOP_HEAD_PARITY_ERR,
-       C_PIO_LAST_RETURNED_CNT_PARITY_ERR,
-       C_PIO_CURRENT_FREE_CNT_PARITY_ERR,
-       C_PIO_RSVD_31_ERR,
-       C_PIO_RSVD_30_ERR,
-       C_PIO_PPMC_SOP_LEN_ERR,
-       C_PIO_PPMC_BQC_MEM_PARITY_ERR,
-       C_PIO_VL_FIFO_PARITY_ERR,
-       C_PIO_VLF_SOP_PARITY_ERR,
-       C_PIO_VLF_V1_LEN_PARITY_ERR,
-       C_PIO_BLOCK_QW_COUNT_PARITY_ERR,
-       C_PIO_WRITE_QW_VALID_PARITY_ERR,
-       C_PIO_STATE_MACHINE_ERR,
-       C_PIO_WRITE_DATA_PARITY_ERR,
-       C_PIO_HOST_ADDR_MEM_COR_ERR,
-       C_PIO_HOST_ADDR_MEM_UNC_ERR,
-       C_PIO_PKT_EVICT_SM_OR_ARM_SM_ERR,
-       C_PIO_INIT_SM_IN_ERR,
-       C_PIO_PPMC_PBL_FIFO_ERR,
-       C_PIO_CREDIT_RET_FIFO_PARITY_ERR,
-       C_PIO_V1_LEN_MEM_BANK1_COR_ERR,
-       C_PIO_V1_LEN_MEM_BANK0_COR_ERR,
-       C_PIO_V1_LEN_MEM_BANK1_UNC_ERR,
-       C_PIO_V1_LEN_MEM_BANK0_UNC_ERR,
-       C_PIO_SM_PKT_RESET_PARITY_ERR,
-       C_PIO_PKT_EVICT_FIFO_PARITY_ERR,
-       C_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR,
-       C_PIO_SBRDCTL_CRREL_PARITY_ERR,
-       C_PIO_PEC_FIFO_PARITY_ERR,
-       C_PIO_PCC_FIFO_PARITY_ERR,
-       C_PIO_SB_MEM_FIFO1_ERR,
-       C_PIO_SB_MEM_FIFO0_ERR,
-       C_PIO_CSR_PARITY_ERR,
-       C_PIO_WRITE_ADDR_PARITY_ERR,
-       C_PIO_WRITE_BAD_CTXT_ERR,
-/* SendDmaErrStatus */
-       C_SDMA_PCIE_REQ_TRACKING_COR_ERR,
-       C_SDMA_PCIE_REQ_TRACKING_UNC_ERR,
-       C_SDMA_CSR_PARITY_ERR,
-       C_SDMA_RPY_TAG_ERR,
-/* SendEgressErrStatus */
-       C_TX_READ_PIO_MEMORY_CSR_UNC_ERR,
-       C_TX_READ_SDMA_MEMORY_CSR_UNC_ERR,
-       C_TX_EGRESS_FIFO_COR_ERR,
-       C_TX_READ_PIO_MEMORY_COR_ERR,
-       C_TX_READ_SDMA_MEMORY_COR_ERR,
-       C_TX_SB_HDR_COR_ERR,
-       C_TX_CREDIT_OVERRUN_ERR,
-       C_TX_LAUNCH_FIFO8_COR_ERR,
-       C_TX_LAUNCH_FIFO7_COR_ERR,
-       C_TX_LAUNCH_FIFO6_COR_ERR,
-       C_TX_LAUNCH_FIFO5_COR_ERR,
-       C_TX_LAUNCH_FIFO4_COR_ERR,
-       C_TX_LAUNCH_FIFO3_COR_ERR,
-       C_TX_LAUNCH_FIFO2_COR_ERR,
-       C_TX_LAUNCH_FIFO1_COR_ERR,
-       C_TX_LAUNCH_FIFO0_COR_ERR,
-       C_TX_CREDIT_RETURN_VL_ERR,
-       C_TX_HCRC_INSERTION_ERR,
-       C_TX_EGRESS_FIFI_UNC_ERR,
-       C_TX_READ_PIO_MEMORY_UNC_ERR,
-       C_TX_READ_SDMA_MEMORY_UNC_ERR,
-       C_TX_SB_HDR_UNC_ERR,
-       C_TX_CREDIT_RETURN_PARITY_ERR,
-       C_TX_LAUNCH_FIFO8_UNC_OR_PARITY_ERR,
-       C_TX_LAUNCH_FIFO7_UNC_OR_PARITY_ERR,
-       C_TX_LAUNCH_FIFO6_UNC_OR_PARITY_ERR,
-       C_TX_LAUNCH_FIFO5_UNC_OR_PARITY_ERR,
-       C_TX_LAUNCH_FIFO4_UNC_OR_PARITY_ERR,
-       C_TX_LAUNCH_FIFO3_UNC_OR_PARITY_ERR,
-       C_TX_LAUNCH_FIFO2_UNC_OR_PARITY_ERR,
-       C_TX_LAUNCH_FIFO1_UNC_OR_PARITY_ERR,
-       C_TX_LAUNCH_FIFO0_UNC_OR_PARITY_ERR,
-       C_TX_SDMA15_DISALLOWED_PACKET_ERR,
-       C_TX_SDMA14_DISALLOWED_PACKET_ERR,
-       C_TX_SDMA13_DISALLOWED_PACKET_ERR,
-       C_TX_SDMA12_DISALLOWED_PACKET_ERR,
-       C_TX_SDMA11_DISALLOWED_PACKET_ERR,
-       C_TX_SDMA10_DISALLOWED_PACKET_ERR,
-       C_TX_SDMA9_DISALLOWED_PACKET_ERR,
-       C_TX_SDMA8_DISALLOWED_PACKET_ERR,
-       C_TX_SDMA7_DISALLOWED_PACKET_ERR,
-       C_TX_SDMA6_DISALLOWED_PACKET_ERR,
-       C_TX_SDMA5_DISALLOWED_PACKET_ERR,
-       C_TX_SDMA4_DISALLOWED_PACKET_ERR,
-       C_TX_SDMA3_DISALLOWED_PACKET_ERR,
-       C_TX_SDMA2_DISALLOWED_PACKET_ERR,
-       C_TX_SDMA1_DISALLOWED_PACKET_ERR,
-       C_TX_SDMA0_DISALLOWED_PACKET_ERR,
-       C_TX_CONFIG_PARITY_ERR,
-       C_TX_SBRD_CTL_CSR_PARITY_ERR,
-       C_TX_LAUNCH_CSR_PARITY_ERR,
-       C_TX_ILLEGAL_CL_ERR,
-       C_TX_SBRD_CTL_STATE_MACHINE_PARITY_ERR,
-       C_TX_RESERVED_10,
-       C_TX_RESERVED_9,
-       C_TX_SDMA_LAUNCH_INTF_PARITY_ERR,
-       C_TX_PIO_LAUNCH_INTF_PARITY_ERR,
-       C_TX_RESERVED_6,
-       C_TX_INCORRECT_LINK_STATE_ERR,
-       C_TX_LINK_DOWN_ERR,
-       C_TX_EGRESS_FIFO_UNDERRUN_OR_PARITY_ERR,
-       C_TX_RESERVED_2,
-       C_TX_PKT_INTEGRITY_MEM_UNC_ERR,
-       C_TX_PKT_INTEGRITY_MEM_COR_ERR,
-/* SendErrStatus */
-       C_SEND_CSR_WRITE_BAD_ADDR_ERR,
-       C_SEND_CSR_READ_BAD_ADD_ERR,
-       C_SEND_CSR_PARITY_ERR,
-/* SendCtxtErrStatus */
-       C_PIO_WRITE_OUT_OF_BOUNDS_ERR,
-       C_PIO_WRITE_OVERFLOW_ERR,
-       C_PIO_WRITE_CROSSES_BOUNDARY_ERR,
-       C_PIO_DISALLOWED_PACKET_ERR,
-       C_PIO_INCONSISTENT_SOP_ERR,
-/*SendDmaEngErrStatus */
-       C_SDMA_HEADER_REQUEST_FIFO_COR_ERR,
-       C_SDMA_HEADER_STORAGE_COR_ERR,
-       C_SDMA_PACKET_TRACKING_COR_ERR,
-       C_SDMA_ASSEMBLY_COR_ERR,
-       C_SDMA_DESC_TABLE_COR_ERR,
-       C_SDMA_HEADER_REQUEST_FIFO_UNC_ERR,
-       C_SDMA_HEADER_STORAGE_UNC_ERR,
-       C_SDMA_PACKET_TRACKING_UNC_ERR,
-       C_SDMA_ASSEMBLY_UNC_ERR,
-       C_SDMA_DESC_TABLE_UNC_ERR,
-       C_SDMA_TIMEOUT_ERR,
-       C_SDMA_HEADER_LENGTH_ERR,
-       C_SDMA_HEADER_ADDRESS_ERR,
-       C_SDMA_HEADER_SELECT_ERR,
-       C_SMDA_RESERVED_9,
-       C_SDMA_PACKET_DESC_OVERFLOW_ERR,
-       C_SDMA_LENGTH_MISMATCH_ERR,
-       C_SDMA_HALT_ERR,
-       C_SDMA_MEM_READ_ERR,
-       C_SDMA_FIRST_DESC_ERR,
-       C_SDMA_TAIL_OUT_OF_BOUNDS_ERR,
-       C_SDMA_TOO_LONG_ERR,
-       C_SDMA_GEN_MISMATCH_ERR,
-       C_SDMA_WRONG_DW_ERR,
-       DEV_CNTR_LAST  /* Must be kept last */
-};
-
-/* Per port counter indexes */
-enum {
-       C_TX_UNSUP_VL = 0,
-       C_TX_INVAL_LEN,
-       C_TX_MM_LEN_ERR,
-       C_TX_UNDERRUN,
-       C_TX_FLOW_STALL,
-       C_TX_DROPPED,
-       C_TX_HDR_ERR,
-       C_TX_PKT,
-       C_TX_WORDS,
-       C_TX_WAIT,
-       C_TX_FLIT_VL,
-       C_TX_PKT_VL,
-       C_TX_WAIT_VL,
-       C_RX_PKT,
-       C_RX_WORDS,
-       C_SW_LINK_DOWN,
-       C_SW_LINK_UP,
-       C_SW_UNKNOWN_FRAME,
-       C_SW_XMIT_DSCD,
-       C_SW_XMIT_DSCD_VL,
-       C_SW_XMIT_CSTR_ERR,
-       C_SW_RCV_CSTR_ERR,
-       C_SW_IBP_LOOP_PKTS,
-       C_SW_IBP_RC_RESENDS,
-       C_SW_IBP_RNR_NAKS,
-       C_SW_IBP_OTHER_NAKS,
-       C_SW_IBP_RC_TIMEOUTS,
-       C_SW_IBP_PKT_DROPS,
-       C_SW_IBP_DMA_WAIT,
-       C_SW_IBP_RC_SEQNAK,
-       C_SW_IBP_RC_DUPREQ,
-       C_SW_IBP_RDMA_SEQ,
-       C_SW_IBP_UNALIGNED,
-       C_SW_IBP_SEQ_NAK,
-       C_SW_CPU_RC_ACKS,
-       C_SW_CPU_RC_QACKS,
-       C_SW_CPU_RC_DELAYED_COMP,
-       C_RCV_HDR_OVF_0,
-       C_RCV_HDR_OVF_1,
-       C_RCV_HDR_OVF_2,
-       C_RCV_HDR_OVF_3,
-       C_RCV_HDR_OVF_4,
-       C_RCV_HDR_OVF_5,
-       C_RCV_HDR_OVF_6,
-       C_RCV_HDR_OVF_7,
-       C_RCV_HDR_OVF_8,
-       C_RCV_HDR_OVF_9,
-       C_RCV_HDR_OVF_10,
-       C_RCV_HDR_OVF_11,
-       C_RCV_HDR_OVF_12,
-       C_RCV_HDR_OVF_13,
-       C_RCV_HDR_OVF_14,
-       C_RCV_HDR_OVF_15,
-       C_RCV_HDR_OVF_16,
-       C_RCV_HDR_OVF_17,
-       C_RCV_HDR_OVF_18,
-       C_RCV_HDR_OVF_19,
-       C_RCV_HDR_OVF_20,
-       C_RCV_HDR_OVF_21,
-       C_RCV_HDR_OVF_22,
-       C_RCV_HDR_OVF_23,
-       C_RCV_HDR_OVF_24,
-       C_RCV_HDR_OVF_25,
-       C_RCV_HDR_OVF_26,
-       C_RCV_HDR_OVF_27,
-       C_RCV_HDR_OVF_28,
-       C_RCV_HDR_OVF_29,
-       C_RCV_HDR_OVF_30,
-       C_RCV_HDR_OVF_31,
-       C_RCV_HDR_OVF_32,
-       C_RCV_HDR_OVF_33,
-       C_RCV_HDR_OVF_34,
-       C_RCV_HDR_OVF_35,
-       C_RCV_HDR_OVF_36,
-       C_RCV_HDR_OVF_37,
-       C_RCV_HDR_OVF_38,
-       C_RCV_HDR_OVF_39,
-       C_RCV_HDR_OVF_40,
-       C_RCV_HDR_OVF_41,
-       C_RCV_HDR_OVF_42,
-       C_RCV_HDR_OVF_43,
-       C_RCV_HDR_OVF_44,
-       C_RCV_HDR_OVF_45,
-       C_RCV_HDR_OVF_46,
-       C_RCV_HDR_OVF_47,
-       C_RCV_HDR_OVF_48,
-       C_RCV_HDR_OVF_49,
-       C_RCV_HDR_OVF_50,
-       C_RCV_HDR_OVF_51,
-       C_RCV_HDR_OVF_52,
-       C_RCV_HDR_OVF_53,
-       C_RCV_HDR_OVF_54,
-       C_RCV_HDR_OVF_55,
-       C_RCV_HDR_OVF_56,
-       C_RCV_HDR_OVF_57,
-       C_RCV_HDR_OVF_58,
-       C_RCV_HDR_OVF_59,
-       C_RCV_HDR_OVF_60,
-       C_RCV_HDR_OVF_61,
-       C_RCV_HDR_OVF_62,
-       C_RCV_HDR_OVF_63,
-       C_RCV_HDR_OVF_64,
-       C_RCV_HDR_OVF_65,
-       C_RCV_HDR_OVF_66,
-       C_RCV_HDR_OVF_67,
-       C_RCV_HDR_OVF_68,
-       C_RCV_HDR_OVF_69,
-       C_RCV_HDR_OVF_70,
-       C_RCV_HDR_OVF_71,
-       C_RCV_HDR_OVF_72,
-       C_RCV_HDR_OVF_73,
-       C_RCV_HDR_OVF_74,
-       C_RCV_HDR_OVF_75,
-       C_RCV_HDR_OVF_76,
-       C_RCV_HDR_OVF_77,
-       C_RCV_HDR_OVF_78,
-       C_RCV_HDR_OVF_79,
-       C_RCV_HDR_OVF_80,
-       C_RCV_HDR_OVF_81,
-       C_RCV_HDR_OVF_82,
-       C_RCV_HDR_OVF_83,
-       C_RCV_HDR_OVF_84,
-       C_RCV_HDR_OVF_85,
-       C_RCV_HDR_OVF_86,
-       C_RCV_HDR_OVF_87,
-       C_RCV_HDR_OVF_88,
-       C_RCV_HDR_OVF_89,
-       C_RCV_HDR_OVF_90,
-       C_RCV_HDR_OVF_91,
-       C_RCV_HDR_OVF_92,
-       C_RCV_HDR_OVF_93,
-       C_RCV_HDR_OVF_94,
-       C_RCV_HDR_OVF_95,
-       C_RCV_HDR_OVF_96,
-       C_RCV_HDR_OVF_97,
-       C_RCV_HDR_OVF_98,
-       C_RCV_HDR_OVF_99,
-       C_RCV_HDR_OVF_100,
-       C_RCV_HDR_OVF_101,
-       C_RCV_HDR_OVF_102,
-       C_RCV_HDR_OVF_103,
-       C_RCV_HDR_OVF_104,
-       C_RCV_HDR_OVF_105,
-       C_RCV_HDR_OVF_106,
-       C_RCV_HDR_OVF_107,
-       C_RCV_HDR_OVF_108,
-       C_RCV_HDR_OVF_109,
-       C_RCV_HDR_OVF_110,
-       C_RCV_HDR_OVF_111,
-       C_RCV_HDR_OVF_112,
-       C_RCV_HDR_OVF_113,
-       C_RCV_HDR_OVF_114,
-       C_RCV_HDR_OVF_115,
-       C_RCV_HDR_OVF_116,
-       C_RCV_HDR_OVF_117,
-       C_RCV_HDR_OVF_118,
-       C_RCV_HDR_OVF_119,
-       C_RCV_HDR_OVF_120,
-       C_RCV_HDR_OVF_121,
-       C_RCV_HDR_OVF_122,
-       C_RCV_HDR_OVF_123,
-       C_RCV_HDR_OVF_124,
-       C_RCV_HDR_OVF_125,
-       C_RCV_HDR_OVF_126,
-       C_RCV_HDR_OVF_127,
-       C_RCV_HDR_OVF_128,
-       C_RCV_HDR_OVF_129,
-       C_RCV_HDR_OVF_130,
-       C_RCV_HDR_OVF_131,
-       C_RCV_HDR_OVF_132,
-       C_RCV_HDR_OVF_133,
-       C_RCV_HDR_OVF_134,
-       C_RCV_HDR_OVF_135,
-       C_RCV_HDR_OVF_136,
-       C_RCV_HDR_OVF_137,
-       C_RCV_HDR_OVF_138,
-       C_RCV_HDR_OVF_139,
-       C_RCV_HDR_OVF_140,
-       C_RCV_HDR_OVF_141,
-       C_RCV_HDR_OVF_142,
-       C_RCV_HDR_OVF_143,
-       C_RCV_HDR_OVF_144,
-       C_RCV_HDR_OVF_145,
-       C_RCV_HDR_OVF_146,
-       C_RCV_HDR_OVF_147,
-       C_RCV_HDR_OVF_148,
-       C_RCV_HDR_OVF_149,
-       C_RCV_HDR_OVF_150,
-       C_RCV_HDR_OVF_151,
-       C_RCV_HDR_OVF_152,
-       C_RCV_HDR_OVF_153,
-       C_RCV_HDR_OVF_154,
-       C_RCV_HDR_OVF_155,
-       C_RCV_HDR_OVF_156,
-       C_RCV_HDR_OVF_157,
-       C_RCV_HDR_OVF_158,
-       C_RCV_HDR_OVF_159,
-       PORT_CNTR_LAST /* Must be kept last */
-};
-
-u64 get_all_cpu_total(u64 __percpu *cntr);
-void hfi1_start_cleanup(struct hfi1_devdata *dd);
-void hfi1_clear_tids(struct hfi1_ctxtdata *rcd);
-struct hfi1_message_header *hfi1_get_msgheader(
-                               struct hfi1_devdata *dd, __le32 *rhf_addr);
-int hfi1_get_base_kinfo(struct hfi1_ctxtdata *rcd,
-                       struct hfi1_ctxt_info *kinfo);
-u64 hfi1_gpio_mod(struct hfi1_devdata *dd, u32 target, u32 data, u32 dir,
-                 u32 mask);
-int hfi1_init_ctxt(struct send_context *sc);
-void hfi1_put_tid(struct hfi1_devdata *dd, u32 index,
-                 u32 type, unsigned long pa, u16 order);
-void hfi1_quiet_serdes(struct hfi1_pportdata *ppd);
-void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt);
-u32 hfi1_read_cntrs(struct hfi1_devdata *dd, char **namep, u64 **cntrp);
-u32 hfi1_read_portcntrs(struct hfi1_pportdata *ppd, char **namep, u64 **cntrp);
-u8 hfi1_ibphys_portstate(struct hfi1_pportdata *ppd);
-int hfi1_get_ib_cfg(struct hfi1_pportdata *ppd, int which);
-int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val);
-int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt, u16 jkey);
-int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt);
-int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey);
-int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt);
-void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality);
-
-/*
- * Interrupt source table.
- *
- * Each entry is an interrupt source "type".  The table is ordered by
- * increasing source number.
- */
-struct is_table {
-       int start;       /* interrupt source type start */
-       int end;         /* interrupt source type end */
-       /* routine that returns the name of the interrupt source */
-       char *(*is_name)(char *name, size_t size, unsigned int source);
-       /* routine to call when receiving an interrupt */
-       void (*is_int)(struct hfi1_devdata *dd, unsigned int source);
-};
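/*
 * Illustrative sketch only: a single table entry covers one contiguous
 * range of interrupt source numbers.  The handler names and the name
 * format below are hypothetical stand-ins for the real routines in chip.c.
 */
static char *example_sdma_int_name(char *name, size_t size, unsigned int src)
{
        snprintf(name, size, "SDmaInt%u", src - IS_SDMA_START);
        return name;
}

static void example_sdma_interrupt(struct hfi1_devdata *dd, unsigned int src)
{
        /* dispatch to whichever SDMA engine raised the interrupt */
}

static const struct is_table example_is_entry = {
        .start   = IS_SDMA_START,	/* defined earlier in this header */
        .end     = IS_SDMA_END,
        .is_name = example_sdma_int_name,
        .is_int  = example_sdma_interrupt,
};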
-
-#endif /* _CHIP_H */
diff --git a/drivers/staging/rdma/hfi1/chip_registers.h b/drivers/staging/rdma/hfi1/chip_registers.h
deleted file mode 100644 (file)
index 8744de6..0000000
+++ /dev/null
@@ -1,1307 +0,0 @@
-#ifndef DEF_CHIP_REG
-#define DEF_CHIP_REG
-
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#define CORE           0x000000000000
-#define CCE                    (CORE + 0x000000000000)
-#define ASIC           (CORE + 0x000000400000)
-#define MISC           (CORE + 0x000000500000)
-#define DC_TOP_CSRS            (CORE + 0x000000600000)
-#define CHIP_DEBUG             (CORE + 0x000000700000)
-#define RXE                    (CORE + 0x000001000000)
-#define TXE                    (CORE + 0x000001800000)
-#define DCC_CSRS               (DC_TOP_CSRS + 0x000000000000)
-#define DC_LCB_CSRS            (DC_TOP_CSRS + 0x000000001000)
-#define DC_8051_CSRS           (DC_TOP_CSRS + 0x000000002000)
-#define PCIE           0
-
-#define ASIC_NUM_SCRATCH 4
-#define CCE_ERR_INT_CNT 0
-#define CCE_MISC_INT_CNT 2
-#define CCE_NUM_32_BIT_COUNTERS 3
-#define CCE_NUM_32_BIT_INT_COUNTERS 6
-#define CCE_NUM_INT_CSRS 12
-#define CCE_NUM_INT_MAP_CSRS 96
-#define CCE_NUM_MSIX_PBAS 4
-#define CCE_NUM_MSIX_VECTORS 256
-#define CCE_NUM_SCRATCH 4
-#define CCE_PCIE_POSTED_CRDT_STALL_CNT 2
-#define CCE_PCIE_TRGT_STALL_CNT 0
-#define CCE_PIO_WR_STALL_CNT 1
-#define CCE_RCV_AVAIL_INT_CNT 3
-#define CCE_RCV_URGENT_INT_CNT 4
-#define CCE_SDMA_INT_CNT 1
-#define CCE_SEND_CREDIT_INT_CNT 5
-#define DCC_CFG_LED_CNTRL (DCC_CSRS + 0x000000000040)
-#define DCC_CFG_LED_CNTRL_LED_CNTRL_SMASK 0x10ull
-#define DCC_CFG_LED_CNTRL_LED_SW_BLINK_RATE_SHIFT 0
-#define DCC_CFG_LED_CNTRL_LED_SW_BLINK_RATE_SMASK 0xFull
-#define DCC_CFG_PORT_CONFIG (DCC_CSRS + 0x000000000008)
-#define DCC_CFG_PORT_CONFIG1 (DCC_CSRS + 0x000000000010)
-#define DCC_CFG_PORT_CONFIG1_DLID_MASK_MASK 0xFFFFull
-#define DCC_CFG_PORT_CONFIG1_DLID_MASK_SHIFT 16
-#define DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK 0xFFFF0000ull
-#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK 0xFFFFull
-#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT 0
-#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK 0xFFFFull
-#define DCC_CFG_PORT_CONFIG_LINK_STATE_MASK 0x7ull
-#define DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT 48
-#define DCC_CFG_PORT_CONFIG_LINK_STATE_SMASK 0x7000000000000ull
-#define DCC_CFG_PORT_CONFIG_MTU_CAP_MASK 0x7ull
-#define DCC_CFG_PORT_CONFIG_MTU_CAP_SHIFT 32
-#define DCC_CFG_PORT_CONFIG_MTU_CAP_SMASK 0x700000000ull
-#define DCC_CFG_RESET (DCC_CSRS + 0x000000000000)
-#define DCC_CFG_RESET_RESET_LCB_SHIFT 0
-#define DCC_CFG_RESET_RESET_RX_FPE_SHIFT 2
-#define DCC_CFG_SC_VL_TABLE_15_0 (DCC_CSRS + 0x000000000028)
-#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY0_SHIFT 0
-#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY10_SHIFT 40
-#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY11_SHIFT 44
-#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY12_SHIFT 48
-#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY13_SHIFT 52
-#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY14_SHIFT 56
-#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY15_SHIFT 60
-#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY1_SHIFT 4
-#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY2_SHIFT 8
-#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY3_SHIFT 12
-#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY4_SHIFT 16
-#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY5_SHIFT 20
-#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY6_SHIFT 24
-#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY7_SHIFT 28
-#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY8_SHIFT 32
-#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY9_SHIFT 36
-#define DCC_CFG_SC_VL_TABLE_31_16 (DCC_CSRS + 0x000000000030)
-#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY16_SHIFT 0
-#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY17_SHIFT 4
-#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY18_SHIFT 8
-#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY19_SHIFT 12
-#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY20_SHIFT 16
-#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY21_SHIFT 20
-#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY22_SHIFT 24
-#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY23_SHIFT 28
-#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY24_SHIFT 32
-#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY25_SHIFT 36
-#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY26_SHIFT 40
-#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY27_SHIFT 44
-#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY28_SHIFT 48
-#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY29_SHIFT 52
-#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY30_SHIFT 56
-#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY31_SHIFT 60
-#define DCC_ERR_DROPPED_PKT_CNT (DCC_CSRS + 0x000000000120)
-#define DCC_ERR_FLG (DCC_CSRS + 0x000000000050)
-#define DCC_ERR_FLG_BAD_CRDT_ACK_ERR_SMASK 0x4000ull
-#define DCC_ERR_FLG_BAD_CTRL_DIST_ERR_SMASK 0x200000ull
-#define DCC_ERR_FLG_BAD_CTRL_FLIT_ERR_SMASK 0x10000ull
-#define DCC_ERR_FLG_BAD_DLID_TARGET_ERR_SMASK 0x200ull
-#define DCC_ERR_FLG_BAD_HEAD_DIST_ERR_SMASK 0x800000ull
-#define DCC_ERR_FLG_BAD_L2_ERR_SMASK 0x2ull
-#define DCC_ERR_FLG_BAD_LVER_ERR_SMASK 0x400ull
-#define DCC_ERR_FLG_BAD_MID_TAIL_ERR_SMASK 0x8ull
-#define DCC_ERR_FLG_BAD_PKT_LENGTH_ERR_SMASK 0x4000000ull
-#define DCC_ERR_FLG_BAD_PREEMPTION_ERR_SMASK 0x10ull
-#define DCC_ERR_FLG_BAD_SC_ERR_SMASK 0x4ull
-#define DCC_ERR_FLG_BAD_TAIL_DIST_ERR_SMASK 0x400000ull
-#define DCC_ERR_FLG_BAD_VL_MARKER_ERR_SMASK 0x80ull
-#define DCC_ERR_FLG_CLR (DCC_CSRS + 0x000000000060)
-#define DCC_ERR_FLG_CSR_ACCESS_BLOCKED_HOST_SMASK 0x8000000000ull
-#define DCC_ERR_FLG_CSR_ACCESS_BLOCKED_UC_SMASK 0x10000000000ull
-#define DCC_ERR_FLG_CSR_INVAL_ADDR_SMASK 0x400000000000ull
-#define DCC_ERR_FLG_CSR_PARITY_ERR_SMASK 0x200000000000ull
-#define DCC_ERR_FLG_DLID_ZERO_ERR_SMASK 0x40000000ull
-#define DCC_ERR_FLG_EN (DCC_CSRS + 0x000000000058)
-#define DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK 0x8000000000ull
-#define DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK 0x10000000000ull
-#define DCC_ERR_FLG_EVENT_CNTR_PARITY_ERR_SMASK 0x20000ull
-#define DCC_ERR_FLG_EVENT_CNTR_ROLLOVER_ERR_SMASK 0x40000ull
-#define DCC_ERR_FLG_FMCONFIG_ERR_SMASK 0x40000000000000ull
-#define DCC_ERR_FLG_FPE_TX_FIFO_OVFLW_ERR_SMASK 0x2000000000ull
-#define DCC_ERR_FLG_FPE_TX_FIFO_UNFLW_ERR_SMASK 0x4000000000ull
-#define DCC_ERR_FLG_LATE_EBP_ERR_SMASK 0x1000000000ull
-#define DCC_ERR_FLG_LATE_LONG_ERR_SMASK 0x800000000ull
-#define DCC_ERR_FLG_LATE_SHORT_ERR_SMASK 0x400000000ull
-#define DCC_ERR_FLG_LENGTH_MTU_ERR_SMASK 0x80000000ull
-#define DCC_ERR_FLG_LINK_ERR_SMASK 0x80000ull
-#define DCC_ERR_FLG_MISC_CNTR_ROLLOVER_ERR_SMASK 0x100000ull
-#define DCC_ERR_FLG_NONVL15_STATE_ERR_SMASK 0x1000000ull
-#define DCC_ERR_FLG_PERM_NVL15_ERR_SMASK 0x10000000ull
-#define DCC_ERR_FLG_PREEMPTION_ERR_SMASK 0x20ull
-#define DCC_ERR_FLG_PREEMPTIONVL15_ERR_SMASK 0x40ull
-#define DCC_ERR_FLG_RCVPORT_ERR_SMASK 0x80000000000000ull
-#define DCC_ERR_FLG_RX_BYTE_SHFT_PARITY_ERR_SMASK 0x1000000000000ull
-#define DCC_ERR_FLG_RX_CTRL_PARITY_MBE_ERR_SMASK 0x100000000000ull
-#define DCC_ERR_FLG_RX_EARLY_DROP_ERR_SMASK 0x200000000ull
-#define DCC_ERR_FLG_SLID_ZERO_ERR_SMASK 0x20000000ull
-#define DCC_ERR_FLG_TX_BYTE_SHFT_PARITY_ERR_SMASK 0x800000000000ull
-#define DCC_ERR_FLG_TX_CTRL_PARITY_ERR_SMASK 0x20000000000ull
-#define DCC_ERR_FLG_TX_CTRL_PARITY_MBE_ERR_SMASK 0x40000000000ull
-#define DCC_ERR_FLG_TX_SC_PARITY_ERR_SMASK 0x80000000000ull
-#define DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK 0x2000ull
-#define DCC_ERR_FLG_UNSUP_PKT_TYPE_SMASK 0x8000ull
-#define DCC_ERR_FLG_UNSUP_VL_ERR_SMASK 0x8000000ull
-#define DCC_ERR_FLG_VL15_MULTI_ERR_SMASK 0x2000000ull
-#define DCC_ERR_FMCONFIG_ERR_CNT (DCC_CSRS + 0x000000000110)
-#define DCC_ERR_INFO_FMCONFIG (DCC_CSRS + 0x000000000090)
-#define DCC_ERR_INFO_PORTRCV (DCC_CSRS + 0x000000000078)
-#define DCC_ERR_INFO_PORTRCV_HDR0 (DCC_CSRS + 0x000000000080)
-#define DCC_ERR_INFO_PORTRCV_HDR1 (DCC_CSRS + 0x000000000088)
-#define DCC_ERR_INFO_UNCORRECTABLE (DCC_CSRS + 0x000000000098)
-#define DCC_ERR_PORTRCV_ERR_CNT (DCC_CSRS + 0x000000000108)
-#define DCC_ERR_RCVREMOTE_PHY_ERR_CNT (DCC_CSRS + 0x000000000118)
-#define DCC_ERR_UNCORRECTABLE_CNT (DCC_CSRS + 0x000000000100)
-#define DCC_PRF_PORT_MARK_FECN_CNT (DCC_CSRS + 0x000000000330)
-#define DCC_PRF_PORT_RCV_BECN_CNT (DCC_CSRS + 0x000000000290)
-#define DCC_PRF_PORT_RCV_BUBBLE_CNT (DCC_CSRS + 0x0000000002E0)
-#define DCC_PRF_PORT_RCV_CORRECTABLE_CNT (DCC_CSRS + 0x000000000140)
-#define DCC_PRF_PORT_RCV_DATA_CNT (DCC_CSRS + 0x000000000198)
-#define DCC_PRF_PORT_RCV_FECN_CNT (DCC_CSRS + 0x000000000240)
-#define DCC_PRF_PORT_RCV_MULTICAST_PKT_CNT (DCC_CSRS + 0x000000000130)
-#define DCC_PRF_PORT_RCV_PKTS_CNT (DCC_CSRS + 0x0000000001A8)
-#define DCC_PRF_PORT_VL_MARK_FECN_CNT (DCC_CSRS + 0x000000000338)
-#define DCC_PRF_PORT_VL_RCV_BECN_CNT (DCC_CSRS + 0x000000000298)
-#define DCC_PRF_PORT_VL_RCV_BUBBLE_CNT (DCC_CSRS + 0x0000000002E8)
-#define DCC_PRF_PORT_VL_RCV_DATA_CNT (DCC_CSRS + 0x0000000001B0)
-#define DCC_PRF_PORT_VL_RCV_FECN_CNT (DCC_CSRS + 0x000000000248)
-#define DCC_PRF_PORT_VL_RCV_PKTS_CNT (DCC_CSRS + 0x0000000001F8)
-#define DCC_PRF_PORT_XMIT_CORRECTABLE_CNT (DCC_CSRS + 0x000000000138)
-#define DCC_PRF_PORT_XMIT_DATA_CNT (DCC_CSRS + 0x000000000190)
-#define DCC_PRF_PORT_XMIT_MULTICAST_CNT (DCC_CSRS + 0x000000000128)
-#define DCC_PRF_PORT_XMIT_PKTS_CNT (DCC_CSRS + 0x0000000001A0)
-#define DCC_PRF_RX_FLOW_CRTL_CNT (DCC_CSRS + 0x000000000180)
-#define DCC_PRF_TX_FLOW_CRTL_CNT (DCC_CSRS + 0x000000000188)
-#define DC_DC8051_CFG_CSR_ACCESS_SEL (DC_8051_CSRS + 0x000000000110)
-#define DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK 0x2ull
-#define DC_DC8051_CFG_CSR_ACCESS_SEL_LCB_SMASK 0x1ull
-#define DC_DC8051_CFG_EXT_DEV_0 (DC_8051_CSRS + 0x000000000118)
-#define DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK 0x1ull
-#define DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT 8
-#define DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT 16
-#define DC_DC8051_CFG_EXT_DEV_1 (DC_8051_CSRS + 0x000000000120)
-#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_MASK 0xFFFFull
-#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT 16
-#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SMASK 0xFFFF0000ull
-#define DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK 0x1ull
-#define DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_MASK 0xFFull
-#define DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_SHIFT 8
-#define DC_DC8051_CFG_HOST_CMD_0 (DC_8051_CSRS + 0x000000000028)
-#define DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_MASK 0xFFFFFFFFFFFFull
-#define DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_SHIFT 16
-#define DC_DC8051_CFG_HOST_CMD_0_REQ_NEW_SMASK 0x1ull
-#define DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_MASK 0xFFull
-#define DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_SHIFT 8
-#define DC_DC8051_CFG_HOST_CMD_1 (DC_8051_CSRS + 0x000000000030)
-#define DC_DC8051_CFG_HOST_CMD_1_COMPLETED_SMASK 0x1ull
-#define DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_MASK 0xFFull
-#define DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_SHIFT 8
-#define DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_MASK 0xFFFFFFFFFFFFull
-#define DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_SHIFT 16
-#define DC_DC8051_CFG_LOCAL_GUID (DC_8051_CSRS + 0x000000000038)
-#define DC_DC8051_CFG_MODE (DC_8051_CSRS + 0x000000000070)
-#define DC_DC8051_CFG_RAM_ACCESS_CTRL (DC_8051_CSRS + 0x000000000008)
-#define DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK 0x7FFFull
-#define DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT 0
-#define DC_DC8051_CFG_RAM_ACCESS_CTRL_WRITE_ENA_SMASK 0x1000000ull
-#define DC_DC8051_CFG_RAM_ACCESS_CTRL_READ_ENA_SMASK 0x10000ull
-#define DC_DC8051_CFG_RAM_ACCESS_SETUP (DC_8051_CSRS + 0x000000000000)
-#define DC_DC8051_CFG_RAM_ACCESS_SETUP_AUTO_INCR_ADDR_SMASK 0x100ull
-#define DC_DC8051_CFG_RAM_ACCESS_SETUP_RAM_SEL_SMASK 0x1ull
-#define DC_DC8051_CFG_RAM_ACCESS_STATUS (DC_8051_CSRS + 0x000000000018)
-#define DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK 0x10000ull
-#define DC_DC8051_CFG_RAM_ACCESS_WR_DATA (DC_8051_CSRS + 0x000000000010)
-#define DC_DC8051_CFG_RAM_ACCESS_RD_DATA (DC_8051_CSRS + 0x000000000020)
-#define DC_DC8051_CFG_RST (DC_8051_CSRS + 0x000000000068)
-#define DC_DC8051_CFG_RST_CRAM_SMASK 0x2ull
-#define DC_DC8051_CFG_RST_DRAM_SMASK 0x4ull
-#define DC_DC8051_CFG_RST_IRAM_SMASK 0x8ull
-#define DC_DC8051_CFG_RST_M8051W_SMASK 0x1ull
-#define DC_DC8051_CFG_RST_SFR_SMASK 0x10ull
-#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051 (DC_8051_CSRS + 0x0000000000D8)
-#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_MASK 0xFFFFFFFFull
-#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_SHIFT 16
-#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_MASK 0xFFFFull
-#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_SHIFT 0
-#define DC_DC8051_ERR_CLR (DC_8051_CSRS + 0x0000000000E8)
-#define DC_DC8051_ERR_EN (DC_8051_CSRS + 0x0000000000F0)
-#define DC_DC8051_ERR_EN_LOST_8051_HEART_BEAT_SMASK 0x2ull
-#define DC_DC8051_ERR_FLG (DC_8051_CSRS + 0x0000000000E0)
-#define DC_DC8051_ERR_FLG_CRAM_MBE_SMASK 0x4ull
-#define DC_DC8051_ERR_FLG_CRAM_SBE_SMASK 0x8ull
-#define DC_DC8051_ERR_FLG_DRAM_MBE_SMASK 0x10ull
-#define DC_DC8051_ERR_FLG_DRAM_SBE_SMASK 0x20ull
-#define DC_DC8051_ERR_FLG_INVALID_CSR_ADDR_SMASK 0x400ull
-#define DC_DC8051_ERR_FLG_IRAM_MBE_SMASK 0x40ull
-#define DC_DC8051_ERR_FLG_IRAM_SBE_SMASK 0x80ull
-#define DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK 0x2ull
-#define DC_DC8051_ERR_FLG_SET_BY_8051_SMASK 0x1ull
-#define DC_DC8051_ERR_FLG_UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES_SMASK 0x100ull
-#define DC_DC8051_STS_CUR_STATE (DC_8051_CSRS + 0x000000000060)
-#define DC_DC8051_STS_CUR_STATE_FIRMWARE_MASK 0xFFull
-#define DC_DC8051_STS_CUR_STATE_FIRMWARE_SHIFT 16
-#define DC_DC8051_STS_CUR_STATE_PORT_MASK 0xFFull
-#define DC_DC8051_STS_CUR_STATE_PORT_SHIFT 0
-#define DC_DC8051_STS_LOCAL_FM_SECURITY (DC_8051_CSRS + 0x000000000050)
-#define DC_DC8051_STS_LOCAL_FM_SECURITY_DISABLED_MASK 0x1ull
-#define DC_DC8051_STS_REMOTE_FM_SECURITY (DC_8051_CSRS + 0x000000000058)
-#define DC_DC8051_STS_REMOTE_GUID (DC_8051_CSRS + 0x000000000040)
-#define DC_DC8051_STS_REMOTE_NODE_TYPE (DC_8051_CSRS + 0x000000000048)
-#define DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK 0x3ull
-#define DC_DC8051_STS_REMOTE_PORT_NO (DC_8051_CSRS + 0x000000000130)
-#define DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK 0xFFull
-#define DC_LCB_CFG_ALLOW_LINK_UP (DC_LCB_CSRS + 0x000000000128)
-#define DC_LCB_CFG_ALLOW_LINK_UP_VAL_SHIFT 0
-#define DC_LCB_CFG_CRC_MODE (DC_LCB_CSRS + 0x000000000058)
-#define DC_LCB_CFG_CRC_MODE_TX_VAL_SHIFT 0
-#define DC_LCB_CFG_IGNORE_LOST_RCLK (DC_LCB_CSRS + 0x000000000020)
-#define DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK 0x1ull
-#define DC_LCB_CFG_LANE_WIDTH (DC_LCB_CSRS + 0x000000000100)
-#define DC_LCB_CFG_LINK_KILL_EN (DC_LCB_CSRS + 0x000000000120)
-#define DC_LCB_CFG_LINK_KILL_EN_FLIT_INPUT_BUF_MBE_SMASK 0x100000ull
-#define DC_LCB_CFG_LINK_KILL_EN_REPLAY_BUF_MBE_SMASK 0x400000ull
-#define DC_LCB_CFG_LN_DCLK (DC_LCB_CSRS + 0x000000000060)
-#define DC_LCB_CFG_LOOPBACK (DC_LCB_CSRS + 0x0000000000F8)
-#define DC_LCB_CFG_LOOPBACK_VAL_SHIFT 0
-#define DC_LCB_CFG_RUN (DC_LCB_CSRS + 0x000000000000)
-#define DC_LCB_CFG_RUN_EN_SHIFT 0
-#define DC_LCB_CFG_RX_FIFOS_RADR (DC_LCB_CSRS + 0x000000000018)
-#define DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT 8
-#define DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT 4
-#define DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT 0
-#define DC_LCB_CFG_TX_FIFOS_RADR (DC_LCB_CSRS + 0x000000000010)
-#define DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT 0
-#define DC_LCB_CFG_TX_FIFOS_RESET (DC_LCB_CSRS + 0x000000000008)
-#define DC_LCB_CFG_TX_FIFOS_RESET_VAL_SHIFT 0
-#define DC_LCB_CFG_REINIT_AS_SLAVE (DC_LCB_CSRS + 0x000000000030)
-#define DC_LCB_CFG_CNT_FOR_SKIP_STALL (DC_LCB_CSRS + 0x000000000040)
-#define DC_LCB_CFG_CLK_CNTR (DC_LCB_CSRS + 0x000000000110)
-#define DC_LCB_ERR_CLR (DC_LCB_CSRS + 0x000000000308)
-#define DC_LCB_ERR_EN (DC_LCB_CSRS + 0x000000000310)
-#define DC_LCB_ERR_FLG (DC_LCB_CSRS + 0x000000000300)
-#define DC_LCB_ERR_FLG_REDUNDANT_FLIT_PARITY_ERR_SMASK 0x20000000ull
-#define DC_LCB_ERR_FLG_NEG_EDGE_LINK_TRANSFER_ACTIVE_SMASK 0x10000000ull
-#define DC_LCB_ERR_FLG_HOLD_REINIT_SMASK 0x8000000ull
-#define DC_LCB_ERR_FLG_RST_FOR_INCOMPLT_RND_TRIP_SMASK 0x4000000ull
-#define DC_LCB_ERR_FLG_RST_FOR_LINK_TIMEOUT_SMASK 0x2000000ull
-#define DC_LCB_ERR_FLG_CREDIT_RETURN_FLIT_MBE_SMASK 0x1000000ull
-#define DC_LCB_ERR_FLG_REPLAY_BUF_SBE_SMASK 0x800000ull
-#define DC_LCB_ERR_FLG_REPLAY_BUF_MBE_SMASK 0x400000ull
-#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_SBE_SMASK 0x200000ull
-#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_MBE_SMASK 0x100000ull
-#define DC_LCB_ERR_FLG_VL_ACK_INPUT_WRONG_CRC_MODE_SMASK 0x80000ull
-#define DC_LCB_ERR_FLG_VL_ACK_INPUT_PARITY_ERR_SMASK 0x40000ull
-#define DC_LCB_ERR_FLG_VL_ACK_INPUT_BUF_OFLW_SMASK 0x20000ull
-#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_OFLW_SMASK 0x10000ull
-#define DC_LCB_ERR_FLG_ILLEGAL_FLIT_ENCODING_SMASK 0x8000ull
-#define DC_LCB_ERR_FLG_ILLEGAL_NULL_LTP_SMASK 0x4000ull
-#define DC_LCB_ERR_FLG_UNEXPECTED_ROUND_TRIP_MARKER_SMASK 0x2000ull
-#define DC_LCB_ERR_FLG_UNEXPECTED_REPLAY_MARKER_SMASK 0x1000ull
-#define DC_LCB_ERR_FLG_RCLK_STOPPED_SMASK 0x800ull
-#define DC_LCB_ERR_FLG_CRC_ERR_CNT_HIT_LIMIT_SMASK 0x400ull
-#define DC_LCB_ERR_FLG_REINIT_FOR_LN_DEGRADE_SMASK 0x200ull
-#define DC_LCB_ERR_FLG_REINIT_FROM_PEER_SMASK 0x100ull
-#define DC_LCB_ERR_FLG_SEQ_CRC_ERR_SMASK 0x80ull
-#define DC_LCB_ERR_FLG_RX_LESS_THAN_FOUR_LNS_SMASK 0x40ull
-#define DC_LCB_ERR_FLG_TX_LESS_THAN_FOUR_LNS_SMASK 0x20ull
-#define DC_LCB_ERR_FLG_LOST_REINIT_STALL_OR_TOS_SMASK 0x10ull
-#define DC_LCB_ERR_FLG_ALL_LNS_FAILED_REINIT_TEST_SMASK 0x8ull
-#define DC_LCB_ERR_FLG_RST_FOR_FAILED_DESKEW_SMASK 0x4ull
-#define DC_LCB_ERR_FLG_INVALID_CSR_ADDR_SMASK 0x2ull
-#define DC_LCB_ERR_FLG_CSR_PARITY_ERR_SMASK 0x1ull
-#define DC_LCB_ERR_INFO_CRC_ERR_LN0 (DC_LCB_CSRS + 0x000000000328)
-#define DC_LCB_ERR_INFO_CRC_ERR_LN1 (DC_LCB_CSRS + 0x000000000330)
-#define DC_LCB_ERR_INFO_CRC_ERR_LN2 (DC_LCB_CSRS + 0x000000000338)
-#define DC_LCB_ERR_INFO_CRC_ERR_LN3 (DC_LCB_CSRS + 0x000000000340)
-#define DC_LCB_ERR_INFO_CRC_ERR_MULTI_LN (DC_LCB_CSRS + 0x000000000348)
-#define DC_LCB_ERR_INFO_ESCAPE_0_ONLY_CNT (DC_LCB_CSRS + 0x000000000368)
-#define DC_LCB_ERR_INFO_ESCAPE_0_PLUS1_CNT (DC_LCB_CSRS + 0x000000000370)
-#define DC_LCB_ERR_INFO_ESCAPE_0_PLUS2_CNT (DC_LCB_CSRS + 0x000000000378)
-#define DC_LCB_ERR_INFO_MISC_FLG_CNT (DC_LCB_CSRS + 0x000000000390)
-#define DC_LCB_ERR_INFO_REINIT_FROM_PEER_CNT (DC_LCB_CSRS + 0x000000000380)
-#define DC_LCB_ERR_INFO_RX_REPLAY_CNT (DC_LCB_CSRS + 0x000000000358)
-#define DC_LCB_ERR_INFO_SBE_CNT (DC_LCB_CSRS + 0x000000000388)
-#define DC_LCB_ERR_INFO_SEQ_CRC_CNT (DC_LCB_CSRS + 0x000000000360)
-#define DC_LCB_ERR_INFO_TOTAL_CRC_ERR (DC_LCB_CSRS + 0x000000000320)
-#define DC_LCB_ERR_INFO_TX_REPLAY_CNT (DC_LCB_CSRS + 0x000000000350)
-#define DC_LCB_PG_DBG_FLIT_CRDTS_CNT (DC_LCB_CSRS + 0x000000000580)
-#define DC_LCB_PG_STS_PAUSE_COMPLETE_CNT (DC_LCB_CSRS + 0x0000000005F8)
-#define DC_LCB_PG_STS_TX_MBE_CNT (DC_LCB_CSRS + 0x000000000608)
-#define DC_LCB_PG_STS_TX_SBE_CNT (DC_LCB_CSRS + 0x000000000600)
-#define DC_LCB_PRF_ACCEPTED_LTP_CNT (DC_LCB_CSRS + 0x000000000408)
-#define DC_LCB_PRF_CLK_CNTR (DC_LCB_CSRS + 0x000000000420)
-#define DC_LCB_PRF_GOOD_LTP_CNT (DC_LCB_CSRS + 0x000000000400)
-#define DC_LCB_PRF_RX_FLIT_CNT (DC_LCB_CSRS + 0x000000000410)
-#define DC_LCB_PRF_TX_FLIT_CNT (DC_LCB_CSRS + 0x000000000418)
-#define DC_LCB_STS_LINK_TRANSFER_ACTIVE (DC_LCB_CSRS + 0x000000000468)
-#define DC_LCB_STS_ROUND_TRIP_LTP_CNT (DC_LCB_CSRS + 0x0000000004B0)
-#define RCV_BUF_OVFL_CNT 10
-#define RCV_CONTEXT_EGR_STALL 22
-#define RCV_DATA_PKT_CNT 0
-#define RCV_DWORD_CNT 1
-#define RCV_TID_FLOW_GEN_MISMATCH_CNT 20
-#define RCV_TID_FLOW_SEQ_MISMATCH_CNT 23
-#define RCV_TID_FULL_ERR_CNT 18
-#define RCV_TID_VALID_ERR_CNT 19
-#define RXE_NUM_32_BIT_COUNTERS 24
-#define RXE_NUM_64_BIT_COUNTERS 2
-#define RXE_NUM_RSM_INSTANCES 4
-#define RXE_NUM_TID_FLOWS 32
-#define RXE_PER_CONTEXT_OFFSET 0x0300000
-#define SEND_DATA_PKT_CNT 0
-#define SEND_DATA_PKT_VL0_CNT 12
-#define SEND_DATA_VL0_CNT 3
-#define SEND_DROPPED_PKT_CNT 5
-#define SEND_DWORD_CNT 1
-#define SEND_FLOW_STALL_CNT 4
-#define SEND_HEADERS_ERR_CNT 6
-#define SEND_LEN_ERR_CNT 1
-#define SEND_MAX_MIN_LEN_ERR_CNT 2
-#define SEND_UNDERRUN_CNT 3
-#define SEND_UNSUP_VL_ERR_CNT 0
-#define SEND_WAIT_CNT 2
-#define SEND_WAIT_VL0_CNT 21
-#define TXE_PIO_SEND_OFFSET 0x0800000
-#define ASIC_CFG_DRV_STR (ASIC + 0x000000000048)
-#define ASIC_CFG_MUTEX (ASIC + 0x000000000040)
-#define ASIC_CFG_SBUS_EXECUTE (ASIC + 0x000000000008)
-#define ASIC_CFG_SBUS_EXECUTE_EXECUTE_SMASK 0x1ull
-#define ASIC_CFG_SBUS_EXECUTE_FAST_MODE_SMASK 0x2ull
-#define ASIC_CFG_SBUS_REQUEST (ASIC + 0x000000000000)
-#define ASIC_CFG_SBUS_REQUEST_COMMAND_SHIFT 16
-#define ASIC_CFG_SBUS_REQUEST_DATA_ADDR_SHIFT 8
-#define ASIC_CFG_SBUS_REQUEST_DATA_IN_SHIFT 32
-#define ASIC_CFG_SBUS_REQUEST_RECEIVER_ADDR_SHIFT 0
-#define ASIC_CFG_SCRATCH (ASIC + 0x000000000020)
-#define ASIC_CFG_THERM_POLL_EN (ASIC + 0x000000000050)
-#define ASIC_EEP_ADDR_CMD (ASIC + 0x000000000308)
-#define ASIC_EEP_ADDR_CMD_EP_ADDR_MASK 0xFFFFFFull
-#define ASIC_EEP_CTL_STAT (ASIC + 0x000000000300)
-#define ASIC_EEP_CTL_STAT_EP_RESET_SMASK 0x4ull
-#define ASIC_EEP_CTL_STAT_RATE_SPI_SHIFT 8
-#define ASIC_EEP_CTL_STAT_RESETCSR 0x0000000083818000ull
-#define ASIC_EEP_DATA (ASIC + 0x000000000310)
-#define ASIC_GPIO_CLEAR (ASIC + 0x000000000230)
-#define ASIC_GPIO_FORCE (ASIC + 0x000000000238)
-#define ASIC_GPIO_IN (ASIC + 0x000000000200)
-#define ASIC_GPIO_INVERT (ASIC + 0x000000000210)
-#define ASIC_GPIO_MASK (ASIC + 0x000000000220)
-#define ASIC_GPIO_OE (ASIC + 0x000000000208)
-#define ASIC_GPIO_OUT (ASIC + 0x000000000218)
-#define ASIC_PCIE_SD_HOST_CMD (ASIC + 0x000000000100)
-#define ASIC_PCIE_SD_HOST_CMD_INTRPT_CMD_SHIFT 0
-#define ASIC_PCIE_SD_HOST_CMD_SBR_MODE_SMASK 0x400ull
-#define ASIC_PCIE_SD_HOST_CMD_SBUS_RCVR_ADDR_SHIFT 2
-#define ASIC_PCIE_SD_HOST_CMD_TIMER_MASK 0xFFFFFull
-#define ASIC_PCIE_SD_HOST_CMD_TIMER_SHIFT 12
-#define ASIC_PCIE_SD_HOST_STATUS (ASIC + 0x000000000108)
-#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_MASK 0x7ull
-#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_SHIFT 2
-#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_MASK 0x3ull
-#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_SHIFT 0
-#define ASIC_PCIE_SD_INTRPT_DATA_CODE (ASIC + 0x000000000110)
-#define ASIC_PCIE_SD_INTRPT_ENABLE (ASIC + 0x000000000118)
-#define ASIC_PCIE_SD_INTRPT_LIST (ASIC + 0x000000000180)
-#define ASIC_PCIE_SD_INTRPT_LIST_INTRPT_CODE_SHIFT 16
-#define ASIC_PCIE_SD_INTRPT_LIST_INTRPT_DATA_SHIFT 0
-#define ASIC_PCIE_SD_INTRPT_STATUS (ASIC + 0x000000000128)
-#define ASIC_QSFP1_CLEAR (ASIC + 0x000000000270)
-#define ASIC_QSFP1_FORCE (ASIC + 0x000000000278)
-#define ASIC_QSFP1_IN (ASIC + 0x000000000240)
-#define ASIC_QSFP1_INVERT (ASIC + 0x000000000250)
-#define ASIC_QSFP1_MASK (ASIC + 0x000000000260)
-#define ASIC_QSFP1_OE (ASIC + 0x000000000248)
-#define ASIC_QSFP1_OUT (ASIC + 0x000000000258)
-#define ASIC_QSFP1_STATUS (ASIC + 0x000000000268)
-#define ASIC_QSFP2_CLEAR (ASIC + 0x0000000002B0)
-#define ASIC_QSFP2_FORCE (ASIC + 0x0000000002B8)
-#define ASIC_QSFP2_IN (ASIC + 0x000000000280)
-#define ASIC_QSFP2_INVERT (ASIC + 0x000000000290)
-#define ASIC_QSFP2_MASK (ASIC + 0x0000000002A0)
-#define ASIC_QSFP2_OE (ASIC + 0x000000000288)
-#define ASIC_QSFP2_OUT (ASIC + 0x000000000298)
-#define ASIC_QSFP2_STATUS (ASIC + 0x0000000002A8)
-#define ASIC_STS_SBUS_COUNTERS (ASIC + 0x000000000018)
-#define ASIC_STS_SBUS_COUNTERS_EXECUTE_CNT_MASK 0xFFFFull
-#define ASIC_STS_SBUS_COUNTERS_EXECUTE_CNT_SHIFT 0
-#define ASIC_STS_SBUS_COUNTERS_RCV_DATA_VALID_CNT_MASK 0xFFFFull
-#define ASIC_STS_SBUS_COUNTERS_RCV_DATA_VALID_CNT_SHIFT 16
-#define ASIC_STS_SBUS_RESULT (ASIC + 0x000000000010)
-#define ASIC_STS_SBUS_RESULT_DONE_SMASK 0x1ull
-#define ASIC_STS_SBUS_RESULT_RCV_DATA_VALID_SMASK 0x2ull
-#define ASIC_STS_THERM (ASIC + 0x000000000058)
-#define ASIC_STS_THERM_CRIT_TEMP_MASK 0x7FFull
-#define ASIC_STS_THERM_CRIT_TEMP_SHIFT 18
-#define ASIC_STS_THERM_CURR_TEMP_MASK 0x7FFull
-#define ASIC_STS_THERM_CURR_TEMP_SHIFT 2
-#define ASIC_STS_THERM_HI_TEMP_MASK 0x7FFull
-#define ASIC_STS_THERM_HI_TEMP_SHIFT 50
-#define ASIC_STS_THERM_LO_TEMP_MASK 0x7FFull
-#define ASIC_STS_THERM_LO_TEMP_SHIFT 34
-#define ASIC_STS_THERM_LOW_SHIFT 13
-#define CCE_COUNTER_ARRAY32 (CCE + 0x000000000060)
-#define CCE_CTRL (CCE + 0x000000000010)
-#define CCE_CTRL_RXE_RESUME_SMASK 0x800ull
-#define CCE_CTRL_SPC_FREEZE_SMASK 0x100ull
-#define CCE_CTRL_SPC_UNFREEZE_SMASK 0x200ull
-#define CCE_CTRL_TXE_RESUME_SMASK 0x2000ull
-#define CCE_DC_CTRL (CCE + 0x0000000000B8)
-#define CCE_DC_CTRL_DC_RESET_SMASK 0x1ull
-#define CCE_DC_CTRL_RESETCSR 0x0000000000000001ull
-#define CCE_ERR_CLEAR (CCE + 0x000000000050)
-#define CCE_ERR_MASK (CCE + 0x000000000048)
-#define CCE_ERR_STATUS (CCE + 0x000000000040)
-#define CCE_ERR_STATUS_CCE_CLI0_ASYNC_FIFO_PARITY_ERR_SMASK 0x40ull
-#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERROR_SMASK 0x1000ull
-#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR_SMASK \
-               0x200ull
-#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERROR_SMASK \
-               0x800ull
-#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR_SMASK \
-               0x400ull
-#define CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK 0x100ull
-#define CCE_ERR_STATUS_CCE_CSR_CFG_BUS_PARITY_ERR_SMASK 0x80ull
-#define CCE_ERR_STATUS_CCE_CSR_PARITY_ERR_SMASK 0x1ull
-#define CCE_ERR_STATUS_CCE_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull
-#define CCE_ERR_STATUS_CCE_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull
-#define CCE_ERR_STATUS_CCE_INT_MAP_COR_ERR_SMASK 0x4000000000ull
-#define CCE_ERR_STATUS_CCE_INT_MAP_UNC_ERR_SMASK 0x8000000000ull
-#define CCE_ERR_STATUS_CCE_MSIX_CSR_PARITY_ERR_SMASK 0x10000000000ull
-#define CCE_ERR_STATUS_CCE_MSIX_TABLE_COR_ERR_SMASK 0x1000000000ull
-#define CCE_ERR_STATUS_CCE_MSIX_TABLE_UNC_ERR_SMASK 0x2000000000ull
-#define CCE_ERR_STATUS_CCE_RCPL_ASYNC_FIFO_PARITY_ERR_SMASK 0x400000000ull
-#define CCE_ERR_STATUS_CCE_RSPD_DATA_PARITY_ERR_SMASK 0x20ull
-#define CCE_ERR_STATUS_CCE_RXDMA_CONV_FIFO_PARITY_ERR_SMASK 0x800000000ull
-#define CCE_ERR_STATUS_CCE_SEG_READ_BAD_ADDR_ERR_SMASK 0x100000000ull
-#define CCE_ERR_STATUS_CCE_SEG_WRITE_BAD_ADDR_ERR_SMASK 0x200000000ull
-#define CCE_ERR_STATUS_CCE_TRGT_ACCESS_ERR_SMASK 0x10ull
-#define CCE_ERR_STATUS_CCE_TRGT_ASYNC_FIFO_PARITY_ERR_SMASK 0x8ull
-#define CCE_ERR_STATUS_CCE_TRGT_CPL_TIMEOUT_ERR_SMASK 0x40000000ull
-#define CCE_ERR_STATUS_LA_TRIGGERED_SMASK 0x80000000ull
-#define CCE_ERR_STATUS_PCIC_CPL_DAT_QCOR_ERR_SMASK 0x40000ull
-#define CCE_ERR_STATUS_PCIC_CPL_DAT_QUNC_ERR_SMASK 0x4000000ull
-#define CCE_ERR_STATUS_PCIC_CPL_HD_QCOR_ERR_SMASK 0x20000ull
-#define CCE_ERR_STATUS_PCIC_CPL_HD_QUNC_ERR_SMASK 0x2000000ull
-#define CCE_ERR_STATUS_PCIC_NPOST_DAT_QPARITY_ERR_SMASK 0x100000ull
-#define CCE_ERR_STATUS_PCIC_NPOST_HQ_PARITY_ERR_SMASK 0x80000ull
-#define CCE_ERR_STATUS_PCIC_POST_DAT_QCOR_ERR_SMASK 0x10000ull
-#define CCE_ERR_STATUS_PCIC_POST_DAT_QUNC_ERR_SMASK 0x1000000ull
-#define CCE_ERR_STATUS_PCIC_POST_HD_QCOR_ERR_SMASK 0x8000ull
-#define CCE_ERR_STATUS_PCIC_POST_HD_QUNC_ERR_SMASK 0x800000ull
-#define CCE_ERR_STATUS_PCIC_RECEIVE_PARITY_ERR_SMASK 0x20000000ull
-#define CCE_ERR_STATUS_PCIC_RETRY_MEM_COR_ERR_SMASK 0x2000ull
-#define CCE_ERR_STATUS_PCIC_RETRY_MEM_UNC_ERR_SMASK 0x200000ull
-#define CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_COR_ERR_SMASK 0x4000ull
-#define CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_UNC_ERR_SMASK 0x400000ull
-#define CCE_ERR_STATUS_PCIC_TRANSMIT_BACK_PARITY_ERR_SMASK 0x10000000ull
-#define CCE_ERR_STATUS_PCIC_TRANSMIT_FRONT_PARITY_ERR_SMASK 0x8000000ull
-#define CCE_INT_CLEAR (CCE + 0x000000110A00)
-#define CCE_INT_COUNTER_ARRAY32 (CCE + 0x000000110D00)
-#define CCE_INT_FORCE (CCE + 0x000000110B00)
-#define CCE_INT_MAP (CCE + 0x000000110500)
-#define CCE_INT_MASK (CCE + 0x000000110900)
-#define CCE_INT_STATUS (CCE + 0x000000110800)
-#define CCE_MSIX_INT_GRANTED (CCE + 0x000000110200)
-#define CCE_MSIX_TABLE_LOWER (CCE + 0x000000100000)
-#define CCE_MSIX_TABLE_UPPER (CCE + 0x000000100008)
-#define CCE_MSIX_TABLE_UPPER_RESETCSR 0x0000000100000000ull
-#define CCE_MSIX_VEC_CLR_WITHOUT_INT (CCE + 0x000000110400)
-#define CCE_PCIE_CTRL (CCE + 0x0000000000C0)
-#define CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_MASK 0x3ull
-#define CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_SHIFT 0
-#define CCE_PCIE_CTRL_PCIE_LANE_DELAY_MASK 0xFull
-#define CCE_PCIE_CTRL_PCIE_LANE_DELAY_SHIFT 2
-#define CCE_PCIE_CTRL_XMT_MARGIN_OVERWRITE_ENABLE_SHIFT 8
-#define CCE_PCIE_CTRL_XMT_MARGIN_SHIFT 9
-#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_MASK 0x1ull
-#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_SHIFT 12
-#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_MASK 0x7ull
-#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_SHIFT 13
-#define CCE_REVISION (CCE + 0x000000000000)
-#define CCE_REVISION2 (CCE + 0x000000000008)
-#define CCE_REVISION2_HFI_ID_MASK 0x1ull
-#define CCE_REVISION2_HFI_ID_SHIFT 0
-#define CCE_REVISION2_IMPL_CODE_SHIFT 8
-#define CCE_REVISION2_IMPL_REVISION_SHIFT 16
-#define CCE_REVISION_BOARD_ID_LOWER_NIBBLE_MASK 0xFull
-#define CCE_REVISION_BOARD_ID_LOWER_NIBBLE_SHIFT 32
-#define CCE_REVISION_CHIP_REV_MAJOR_MASK 0xFFull
-#define CCE_REVISION_CHIP_REV_MAJOR_SHIFT 8
-#define CCE_REVISION_CHIP_REV_MINOR_MASK 0xFFull
-#define CCE_REVISION_CHIP_REV_MINOR_SHIFT 0
-#define CCE_REVISION_SW_MASK 0xFFull
-#define CCE_REVISION_SW_SHIFT 24
-#define CCE_SCRATCH (CCE + 0x000000000020)
-#define CCE_STATUS (CCE + 0x000000000018)
-#define CCE_STATUS_RXE_FROZE_SMASK 0x2ull
-#define CCE_STATUS_RXE_PAUSED_SMASK 0x20ull
-#define CCE_STATUS_SDMA_FROZE_SMASK 0x1ull
-#define CCE_STATUS_SDMA_PAUSED_SMASK 0x10ull
-#define CCE_STATUS_TXE_FROZE_SMASK 0x4ull
-#define CCE_STATUS_TXE_PAUSED_SMASK 0x40ull
-#define CCE_STATUS_TXE_PIO_FROZE_SMASK 0x8ull
-#define CCE_STATUS_TXE_PIO_PAUSED_SMASK 0x80ull
-#define MISC_CFG_FW_CTRL (MISC + 0x000000001000)
-#define MISC_CFG_FW_CTRL_FW_8051_LOADED_SMASK 0x2ull
-#define MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT 2
-#define MISC_CFG_FW_CTRL_RSA_STATUS_SMASK 0xCull
-#define MISC_CFG_RSA_CMD (MISC + 0x000000000A08)
-#define MISC_CFG_RSA_MODULUS (MISC + 0x000000000400)
-#define MISC_CFG_RSA_MU (MISC + 0x000000000A10)
-#define MISC_CFG_RSA_R2 (MISC + 0x000000000000)
-#define MISC_CFG_RSA_SIGNATURE (MISC + 0x000000000200)
-#define MISC_CFG_SHA_PRELOAD (MISC + 0x000000000A00)
-#define MISC_ERR_CLEAR (MISC + 0x000000002010)
-#define MISC_ERR_MASK (MISC + 0x000000002008)
-#define MISC_ERR_STATUS (MISC + 0x000000002000)
-#define MISC_ERR_STATUS_MISC_PLL_LOCK_FAIL_ERR_SMASK 0x1000ull
-#define MISC_ERR_STATUS_MISC_MBIST_FAIL_ERR_SMASK 0x800ull
-#define MISC_ERR_STATUS_MISC_INVALID_EEP_CMD_ERR_SMASK 0x400ull
-#define MISC_ERR_STATUS_MISC_EFUSE_DONE_PARITY_ERR_SMASK 0x200ull
-#define MISC_ERR_STATUS_MISC_EFUSE_WRITE_ERR_SMASK 0x100ull
-#define MISC_ERR_STATUS_MISC_EFUSE_READ_BAD_ADDR_ERR_SMASK 0x80ull
-#define MISC_ERR_STATUS_MISC_EFUSE_CSR_PARITY_ERR_SMASK 0x40ull
-#define MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK 0x20ull
-#define MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK 0x10ull
-#define MISC_ERR_STATUS_MISC_SBUS_WRITE_FAILED_ERR_SMASK 0x8ull
-#define MISC_ERR_STATUS_MISC_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull
-#define MISC_ERR_STATUS_MISC_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull
-#define MISC_ERR_STATUS_MISC_CSR_PARITY_ERR_SMASK 0x1ull
-#define PCI_CFG_MSIX0 (PCIE + 0x0000000000B0)
-#define PCI_CFG_REG1 (PCIE + 0x000000000004)
-#define PCI_CFG_REG11 (PCIE + 0x00000000002C)
-#define PCIE_CFG_SPCIE1 (PCIE + 0x00000000014C)
-#define PCIE_CFG_SPCIE2 (PCIE + 0x000000000150)
-#define PCIE_CFG_TPH2 (PCIE + 0x000000000180)
-#define RCV_ARRAY (RXE + 0x000000200000)
-#define RCV_ARRAY_CNT (RXE + 0x000000000018)
-#define RCV_ARRAY_RT_ADDR_MASK 0xFFFFFFFFFull
-#define RCV_ARRAY_RT_ADDR_SHIFT 0
-#define RCV_ARRAY_RT_BUF_SIZE_SHIFT 36
-#define RCV_ARRAY_RT_WRITE_ENABLE_SMASK 0x8000000000000000ull
-#define RCV_AVAIL_TIME_OUT (RXE + 0x000000100050)
-#define RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK 0xFFull
-#define RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT 0
-#define RCV_BTH_QP (RXE + 0x000000000028)
-#define RCV_BTH_QP_KDETH_QP_MASK 0xFFull
-#define RCV_BTH_QP_KDETH_QP_SHIFT 16
-#define RCV_BYPASS (RXE + 0x000000000038)
-#define RCV_CONTEXTS (RXE + 0x000000000010)
-#define RCV_COUNTER_ARRAY32 (RXE + 0x000000000400)
-#define RCV_COUNTER_ARRAY64 (RXE + 0x000000000500)
-#define RCV_CTRL (RXE + 0x000000000000)
-#define RCV_CTRL_RCV_BYPASS_ENABLE_SMASK 0x10ull
-#define RCV_CTRL_RCV_EXTENDED_PSN_ENABLE_SMASK 0x40ull
-#define RCV_CTRL_RCV_PARTITION_KEY_ENABLE_SMASK 0x4ull
-#define RCV_CTRL_RCV_PORT_ENABLE_SMASK 0x1ull
-#define RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK 0x2ull
-#define RCV_CTRL_RCV_RSM_ENABLE_SMASK 0x20ull
-#define RCV_CTRL_RX_RBUF_INIT_SMASK 0x200ull
-#define RCV_CTXT_CTRL (RXE + 0x000000100000)
-#define RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK 0x4ull
-#define RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK 0x8ull
-#define RCV_CTXT_CTRL_EGR_BUF_SIZE_MASK 0x7ull
-#define RCV_CTXT_CTRL_EGR_BUF_SIZE_SHIFT 8
-#define RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK 0x700ull
-#define RCV_CTXT_CTRL_ENABLE_SMASK 0x1ull
-#define RCV_CTXT_CTRL_INTR_AVAIL_SMASK 0x20ull
-#define RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK 0x2ull
-#define RCV_CTXT_CTRL_TAIL_UPD_SMASK 0x40ull
-#define RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK 0x10ull
-#define RCV_CTXT_STATUS (RXE + 0x000000100008)
-#define RCV_EGR_CTRL (RXE + 0x000000100010)
-#define RCV_EGR_CTRL_EGR_BASE_INDEX_MASK 0x1FFFull
-#define RCV_EGR_CTRL_EGR_BASE_INDEX_SHIFT 0
-#define RCV_EGR_CTRL_EGR_CNT_MASK 0x1FFull
-#define RCV_EGR_CTRL_EGR_CNT_SHIFT 32
-#define RCV_EGR_INDEX_HEAD (RXE + 0x000000300018)
-#define RCV_EGR_INDEX_HEAD_HEAD_MASK 0x7FFull
-#define RCV_EGR_INDEX_HEAD_HEAD_SHIFT 0
-#define RCV_ERR_CLEAR (RXE + 0x000000000070)
-#define RCV_ERR_INFO (RXE + 0x000000000050)
-#define RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SC_SMASK 0x1Full
-#define RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK 0x20ull
-#define RCV_ERR_MASK (RXE + 0x000000000068)
-#define RCV_ERR_STATUS (RXE + 0x000000000060)
-#define RCV_ERR_STATUS_RX_CSR_PARITY_ERR_SMASK 0x8000000000000000ull
-#define RCV_ERR_STATUS_RX_CSR_READ_BAD_ADDR_ERR_SMASK 0x2000000000000000ull
-#define RCV_ERR_STATUS_RX_CSR_WRITE_BAD_ADDR_ERR_SMASK \
-               0x4000000000000000ull
-#define RCV_ERR_STATUS_RX_DC_INTF_PARITY_ERR_SMASK 0x2ull
-#define RCV_ERR_STATUS_RX_DC_SOP_EOP_PARITY_ERR_SMASK 0x200ull
-#define RCV_ERR_STATUS_RX_DMA_CSR_COR_ERR_SMASK 0x1ull
-#define RCV_ERR_STATUS_RX_DMA_CSR_PARITY_ERR_SMASK 0x200000000000000ull
-#define RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK 0x1000000000000000ull
-#define RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_COR_ERR_SMASK \
-               0x40000000000000ull
-#define RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK \
-               0x20000000000000ull
-#define RCV_ERR_STATUS_RX_DMA_DQ_FSM_ENCODING_ERR_SMASK \
-               0x800000000000000ull
-#define RCV_ERR_STATUS_RX_DMA_EQ_FSM_ENCODING_ERR_SMASK \
-               0x400000000000000ull
-#define RCV_ERR_STATUS_RX_DMA_FLAG_COR_ERR_SMASK 0x800ull
-#define RCV_ERR_STATUS_RX_DMA_FLAG_UNC_ERR_SMASK 0x400ull
-#define RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_COR_ERR_SMASK 0x10000000000000ull
-#define RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK 0x8000000000000ull
-#define RCV_ERR_STATUS_RX_HQ_INTR_CSR_PARITY_ERR_SMASK 0x200000000000ull
-#define RCV_ERR_STATUS_RX_HQ_INTR_FSM_ERR_SMASK 0x400000000000ull
-#define RCV_ERR_STATUS_RX_LOOKUP_CSR_PARITY_ERR_SMASK 0x100000000000ull
-#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_COR_ERR_SMASK \
-               0x10000000000ull
-#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_ERR_SMASK 0x8000000000ull
-#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART2_PARITY_ERR_SMASK \
-               0x20000000000ull
-#define RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_COR_ERR_SMASK 0x80000000000ull
-#define RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_UNC_ERR_SMASK 0x40000000000ull
-#define RCV_ERR_STATUS_RX_RBUF_BAD_LOOKUP_ERR_SMASK 0x40000000ull
-#define RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_COR_ERR_SMASK 0x100000ull
-#define RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_UNC_ERR_SMASK 0x80000ull
-#define RCV_ERR_STATUS_RX_RBUF_CSR_QENT_CNT_PARITY_ERR_SMASK 0x400000ull
-#define RCV_ERR_STATUS_RX_RBUF_CSR_QEOPDW_PARITY_ERR_SMASK 0x10000000ull
-#define RCV_ERR_STATUS_RX_RBUF_CSR_QHD_PTR_PARITY_ERR_SMASK 0x2000000ull
-#define RCV_ERR_STATUS_RX_RBUF_CSR_QHEAD_BUF_NUM_PARITY_ERR_SMASK \
-               0x200000ull
-#define RCV_ERR_STATUS_RX_RBUF_CSR_QNEXT_BUF_PARITY_ERR_SMASK 0x800000ull
-#define RCV_ERR_STATUS_RX_RBUF_CSR_QNUM_OF_PKT_PARITY_ERR_SMASK \
-               0x8000000ull
-#define RCV_ERR_STATUS_RX_RBUF_CSR_QTL_PTR_PARITY_ERR_SMASK 0x4000000ull
-#define RCV_ERR_STATUS_RX_RBUF_CSR_QVLD_BIT_PARITY_ERR_SMASK 0x1000000ull
-#define RCV_ERR_STATUS_RX_RBUF_CTX_ID_PARITY_ERR_SMASK 0x20000000ull
-#define RCV_ERR_STATUS_RX_RBUF_DATA_COR_ERR_SMASK 0x100000000000000ull
-#define RCV_ERR_STATUS_RX_RBUF_DATA_UNC_ERR_SMASK 0x80000000000000ull
-#define RCV_ERR_STATUS_RX_RBUF_DESC_PART1_COR_ERR_SMASK 0x1000000000000ull
-#define RCV_ERR_STATUS_RX_RBUF_DESC_PART1_UNC_ERR_SMASK 0x800000000000ull
-#define RCV_ERR_STATUS_RX_RBUF_DESC_PART2_COR_ERR_SMASK 0x4000000000000ull
-#define RCV_ERR_STATUS_RX_RBUF_DESC_PART2_UNC_ERR_SMASK 0x2000000000000ull
-#define RCV_ERR_STATUS_RX_RBUF_EMPTY_ERR_SMASK 0x100000000ull
-#define RCV_ERR_STATUS_RX_RBUF_FL_INITDONE_PARITY_ERR_SMASK 0x800000000ull
-#define RCV_ERR_STATUS_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR_SMASK \
-               0x1000000000ull
-#define RCV_ERR_STATUS_RX_RBUF_FL_RD_ADDR_PARITY_ERR_SMASK 0x200000000ull
-#define RCV_ERR_STATUS_RX_RBUF_FL_WR_ADDR_PARITY_ERR_SMASK 0x400000000ull
-#define RCV_ERR_STATUS_RX_RBUF_FREE_LIST_COR_ERR_SMASK 0x4000ull
-#define RCV_ERR_STATUS_RX_RBUF_FREE_LIST_UNC_ERR_SMASK 0x2000ull
-#define RCV_ERR_STATUS_RX_RBUF_FULL_ERR_SMASK 0x80000000ull
-#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_COR_ERR_SMASK 0x40000ull
-#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR_SMASK 0x10000ull
-#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_ERR_SMASK 0x8000ull
-#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_UNC_ERR_SMASK 0x20000ull
-#define RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_COR_ERR_SMASK 0x4000000000ull
-#define RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_UNC_ERR_SMASK 0x2000000000ull
-#define RCV_ERR_STATUS_RX_RCV_CSR_PARITY_ERR_SMASK 0x100ull
-#define RCV_ERR_STATUS_RX_RCV_DATA_COR_ERR_SMASK 0x20ull
-#define RCV_ERR_STATUS_RX_RCV_DATA_UNC_ERR_SMASK 0x10ull
-#define RCV_ERR_STATUS_RX_RCV_FSM_ENCODING_ERR_SMASK 0x1000ull
-#define RCV_ERR_STATUS_RX_RCV_HDR_COR_ERR_SMASK 0x8ull
-#define RCV_ERR_STATUS_RX_RCV_HDR_UNC_ERR_SMASK 0x4ull
-#define RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_COR_ERR_SMASK 0x80ull
-#define RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_UNC_ERR_SMASK 0x40ull
-#define RCV_HDR_ADDR (RXE + 0x000000100028)
-#define RCV_HDR_CNT (RXE + 0x000000100030)
-#define RCV_HDR_CNT_CNT_MASK 0x1FFull
-#define RCV_HDR_CNT_CNT_SHIFT 0
-#define RCV_HDR_ENT_SIZE (RXE + 0x000000100038)
-#define RCV_HDR_ENT_SIZE_ENT_SIZE_MASK 0x7ull
-#define RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT 0
-#define RCV_HDR_HEAD (RXE + 0x000000300008)
-#define RCV_HDR_HEAD_COUNTER_MASK 0xFFull
-#define RCV_HDR_HEAD_COUNTER_SHIFT 32
-#define RCV_HDR_HEAD_HEAD_MASK 0x7FFFFull
-#define RCV_HDR_HEAD_HEAD_SHIFT 0
-#define RCV_HDR_HEAD_HEAD_SMASK 0x7FFFFull
-#define RCV_HDR_OVFL_CNT (RXE + 0x000000100058)
-#define RCV_HDR_SIZE (RXE + 0x000000100040)
-#define RCV_HDR_SIZE_HDR_SIZE_MASK 0x1Full
-#define RCV_HDR_SIZE_HDR_SIZE_SHIFT 0
-#define RCV_HDR_TAIL (RXE + 0x000000300000)
-#define RCV_HDR_TAIL_ADDR (RXE + 0x000000100048)
-#define RCV_KEY_CTRL (RXE + 0x000000100020)
-#define RCV_KEY_CTRL_JOB_KEY_ENABLE_SMASK 0x200000000ull
-#define RCV_KEY_CTRL_JOB_KEY_VALUE_MASK 0xFFFFull
-#define RCV_KEY_CTRL_JOB_KEY_VALUE_SHIFT 0
-#define RCV_MULTICAST (RXE + 0x000000000030)
-#define RCV_PARTITION_KEY (RXE + 0x000000000200)
-#define RCV_PARTITION_KEY_PARTITION_KEY_A_MASK 0xFFFFull
-#define RCV_PARTITION_KEY_PARTITION_KEY_B_SHIFT 16
-#define RCV_QP_MAP_TABLE (RXE + 0x000000000100)
-#define RCV_RSM_CFG (RXE + 0x000000000600)
-#define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK 0x1ull
-#define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT 0
-#define RCV_RSM_CFG_PACKET_TYPE_SHIFT 60
-#define RCV_RSM_CFG_OFFSET_SHIFT 32
-#define RCV_RSM_MAP_TABLE (RXE + 0x000000000900)
-#define RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK 0xFFull
-#define RCV_RSM_MATCH (RXE + 0x000000000800)
-#define RCV_RSM_MATCH_MASK1_SHIFT 0
-#define RCV_RSM_MATCH_MASK2_SHIFT 16
-#define RCV_RSM_MATCH_VALUE1_SHIFT 8
-#define RCV_RSM_MATCH_VALUE2_SHIFT 24
-#define RCV_RSM_SELECT (RXE + 0x000000000700)
-#define RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT 0
-#define RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT 16
-#define RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT 32
-#define RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT 44
-#define RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT 48
-#define RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT 60
-#define RCV_STATUS (RXE + 0x000000000008)
-#define RCV_STATUS_RX_PKT_IN_PROGRESS_SMASK 0x1ull
-#define RCV_STATUS_RX_RBUF_INIT_DONE_SMASK 0x200ull
-#define RCV_STATUS_RX_RBUF_PKT_PENDING_SMASK 0x40ull
-#define RCV_TID_CTRL (RXE + 0x000000100018)
-#define RCV_TID_CTRL_TID_BASE_INDEX_MASK 0x1FFFull
-#define RCV_TID_CTRL_TID_BASE_INDEX_SHIFT 0
-#define RCV_TID_CTRL_TID_PAIR_CNT_MASK 0x1FFull
-#define RCV_TID_CTRL_TID_PAIR_CNT_SHIFT 32
-#define RCV_TID_FLOW_TABLE (RXE + 0x000000300800)
-#define RCV_VL15 (RXE + 0x000000000048)
-#define SEND_BTH_QP (TXE + 0x0000000000A0)
-#define SEND_BTH_QP_KDETH_QP_MASK 0xFFull
-#define SEND_BTH_QP_KDETH_QP_SHIFT 16
-#define SEND_CM_CREDIT_USED_STATUS (TXE + 0x000000000510)
-#define SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK \
-               0x1000000000000ull
-#define SEND_CM_CREDIT_USED_STATUS_VL15_RETURN_CREDIT_STATUS_SMASK \
-               0x8000000000000000ull
-#define SEND_CM_CREDIT_USED_STATUS_VL1_RETURN_CREDIT_STATUS_SMASK \
-               0x2000000000000ull
-#define SEND_CM_CREDIT_USED_STATUS_VL2_RETURN_CREDIT_STATUS_SMASK \
-               0x4000000000000ull
-#define SEND_CM_CREDIT_USED_STATUS_VL3_RETURN_CREDIT_STATUS_SMASK \
-               0x8000000000000ull
-#define SEND_CM_CREDIT_USED_STATUS_VL4_RETURN_CREDIT_STATUS_SMASK \
-               0x10000000000000ull
-#define SEND_CM_CREDIT_USED_STATUS_VL5_RETURN_CREDIT_STATUS_SMASK \
-               0x20000000000000ull
-#define SEND_CM_CREDIT_USED_STATUS_VL6_RETURN_CREDIT_STATUS_SMASK \
-               0x40000000000000ull
-#define SEND_CM_CREDIT_USED_STATUS_VL7_RETURN_CREDIT_STATUS_SMASK \
-               0x80000000000000ull
-#define SEND_CM_CREDIT_VL (TXE + 0x000000000600)
-#define SEND_CM_CREDIT_VL15 (TXE + 0x000000000678)
-#define SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT 0
-#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_MASK 0xFFFFull
-#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT 0
-#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SMASK 0xFFFFull
-#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_MASK 0xFFFFull
-#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT 16
-#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SMASK 0xFFFF0000ull
-#define SEND_CM_CTRL (TXE + 0x000000000500)
-#define SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK 0x8ull
-#define SEND_CM_CTRL_RESETCSR 0x0000000000000020ull
-#define SEND_CM_GLOBAL_CREDIT (TXE + 0x000000000508)
-#define SEND_CM_GLOBAL_CREDIT_AU_SHIFT 16
-#define SEND_CM_GLOBAL_CREDIT_RESETCSR 0x0000094000030000ull
-#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_MASK 0xFFFFull
-#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT 0
-#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SMASK 0xFFFFull
-#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_MASK 0xFFFFull
-#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT 32
-#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SMASK 0xFFFF00000000ull
-#define SEND_CM_LOCAL_AU_TABLE0_TO3 (TXE + 0x000000000520)
-#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE0_SHIFT 0
-#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE1_SHIFT 16
-#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE2_SHIFT 32
-#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE3_SHIFT 48
-#define SEND_CM_LOCAL_AU_TABLE4_TO7 (TXE + 0x000000000528)
-#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE4_SHIFT 0
-#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE5_SHIFT 16
-#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE6_SHIFT 32
-#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE7_SHIFT 48
-#define SEND_CM_REMOTE_AU_TABLE0_TO3 (TXE + 0x000000000530)
-#define SEND_CM_REMOTE_AU_TABLE4_TO7 (TXE + 0x000000000538)
-#define SEND_CM_TIMER_CTRL (TXE + 0x000000000518)
-#define SEND_CONTEXTS (TXE + 0x000000000010)
-#define SEND_CONTEXT_SET_CTRL (TXE + 0x000000000200)
-#define SEND_COUNTER_ARRAY32 (TXE + 0x000000000300)
-#define SEND_COUNTER_ARRAY64 (TXE + 0x000000000400)
-#define SEND_CTRL (TXE + 0x000000000000)
-#define SEND_CTRL_CM_RESET_SMASK 0x4ull
-#define SEND_CTRL_SEND_ENABLE_SMASK 0x1ull
-#define SEND_CTRL_VL_ARBITER_ENABLE_SMASK 0x2ull
-#define SEND_CTXT_CHECK_ENABLE (TXE + 0x000000100080)
-#define SEND_CTXT_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK 0x80ull
-#define SEND_CTXT_CHECK_ENABLE_CHECK_ENABLE_SMASK 0x1ull
-#define SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK 0x4ull
-#define SEND_CTXT_CHECK_ENABLE_CHECK_OPCODE_SMASK 0x20ull
-#define SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK 0x8ull
-#define SEND_CTXT_CHECK_ENABLE_CHECK_SLID_SMASK 0x10ull
-#define SEND_CTXT_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK 0x40ull
-#define SEND_CTXT_CHECK_ENABLE_CHECK_VL_SMASK 0x2ull
-#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK 0x20000ull
-#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK \
-               0x200000ull
-#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_SMASK 0x800ull
-#define SEND_CTXT_CHECK_ENABLE_DISALLOW_GRH_SMASK 0x400ull
-#define SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK 0x1000ull
-#define SEND_CTXT_CHECK_ENABLE_DISALLOW_NON_KDETH_PACKETS_SMASK 0x2000ull
-#define SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK \
-               0x100000ull
-#define SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK 0x10000ull
-#define SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK 0x200ull
-#define SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_SMASK 0x100ull
-#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK \
-               0x80000ull
-#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK \
-               0x40000ull
-#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK \
-               0x8000ull
-#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK \
-               0x4000ull
-#define SEND_CTXT_CHECK_JOB_KEY (TXE + 0x000000100090)
-#define SEND_CTXT_CHECK_JOB_KEY_ALLOW_PERMISSIVE_SMASK 0x100000000ull
-#define SEND_CTXT_CHECK_JOB_KEY_MASK_SMASK 0xFFFF0000ull
-#define SEND_CTXT_CHECK_JOB_KEY_VALUE_MASK 0xFFFFull
-#define SEND_CTXT_CHECK_JOB_KEY_VALUE_SHIFT 0
-#define SEND_CTXT_CHECK_OPCODE (TXE + 0x0000001000A8)
-#define SEND_CTXT_CHECK_OPCODE_MASK_SHIFT 8
-#define SEND_CTXT_CHECK_OPCODE_VALUE_SHIFT 0
-#define SEND_CTXT_CHECK_PARTITION_KEY (TXE + 0x000000100098)
-#define SEND_CTXT_CHECK_PARTITION_KEY_VALUE_MASK 0xFFFFull
-#define SEND_CTXT_CHECK_PARTITION_KEY_VALUE_SHIFT 0
-#define SEND_CTXT_CHECK_SLID (TXE + 0x0000001000A0)
-#define SEND_CTXT_CHECK_SLID_MASK_MASK 0xFFFFull
-#define SEND_CTXT_CHECK_SLID_MASK_SHIFT 16
-#define SEND_CTXT_CHECK_SLID_VALUE_MASK 0xFFFFull
-#define SEND_CTXT_CHECK_SLID_VALUE_SHIFT 0
-#define SEND_CTXT_CHECK_VL (TXE + 0x000000100088)
-#define SEND_CTXT_CREDIT_CTRL (TXE + 0x000000100010)
-#define SEND_CTXT_CREDIT_CTRL_CREDIT_INTR_SMASK 0x20000ull
-#define SEND_CTXT_CREDIT_CTRL_EARLY_RETURN_SMASK 0x10000ull
-#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_MASK 0x7FFull
-#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_SHIFT 0
-#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_SMASK 0x7FFull
-#define SEND_CTXT_CREDIT_FORCE (TXE + 0x000000100028)
-#define SEND_CTXT_CREDIT_FORCE_FORCE_RETURN_SMASK 0x1ull
-#define SEND_CTXT_CREDIT_RETURN_ADDR (TXE + 0x000000100020)
-#define SEND_CTXT_CREDIT_RETURN_ADDR_ADDRESS_SMASK 0xFFFFFFFFFFC0ull
-#define SEND_CTXT_CTRL (TXE + 0x000000100000)
-#define SEND_CTXT_CTRL_CTXT_BASE_MASK 0x3FFFull
-#define SEND_CTXT_CTRL_CTXT_BASE_SHIFT 32
-#define SEND_CTXT_CTRL_CTXT_DEPTH_MASK 0x7FFull
-#define SEND_CTXT_CTRL_CTXT_DEPTH_SHIFT 48
-#define SEND_CTXT_CTRL_CTXT_ENABLE_SMASK 0x1ull
-#define SEND_CTXT_ERR_CLEAR (TXE + 0x000000100050)
-#define SEND_CTXT_ERR_MASK (TXE + 0x000000100048)
-#define SEND_CTXT_ERR_STATUS (TXE + 0x000000100040)
-#define SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK 0x2ull
-#define SEND_CTXT_ERR_STATUS_PIO_INCONSISTENT_SOP_ERR_SMASK 0x1ull
-#define SEND_CTXT_ERR_STATUS_PIO_WRITE_CROSSES_BOUNDARY_ERR_SMASK 0x4ull
-#define SEND_CTXT_ERR_STATUS_PIO_WRITE_OUT_OF_BOUNDS_ERR_SMASK 0x10ull
-#define SEND_CTXT_ERR_STATUS_PIO_WRITE_OVERFLOW_ERR_SMASK 0x8ull
-#define SEND_CTXT_STATUS (TXE + 0x000000100008)
-#define SEND_CTXT_STATUS_CTXT_HALTED_SMASK 0x1ull
-#define SEND_DMA_BASE_ADDR (TXE + 0x000000200010)
-#define SEND_DMA_CHECK_ENABLE (TXE + 0x000000200080)
-#define SEND_DMA_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK 0x80ull
-#define SEND_DMA_CHECK_ENABLE_CHECK_ENABLE_SMASK 0x1ull
-#define SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK 0x4ull
-#define SEND_DMA_CHECK_ENABLE_CHECK_OPCODE_SMASK 0x20ull
-#define SEND_DMA_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK 0x8ull
-#define SEND_DMA_CHECK_ENABLE_CHECK_SLID_SMASK 0x10ull
-#define SEND_DMA_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK 0x40ull
-#define SEND_DMA_CHECK_ENABLE_CHECK_VL_SMASK 0x2ull
-#define SEND_DMA_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK 0x20000ull
-#define SEND_DMA_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK 0x200000ull
-#define SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK \
-               0x100000ull
-#define SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK 0x200ull
-#define SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_SMASK 0x100ull
-#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK \
-               0x80000ull
-#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK 0x40000ull
-#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK \
-               0x8000ull
-#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK 0x4000ull
-#define SEND_DMA_CHECK_JOB_KEY (TXE + 0x000000200090)
-#define SEND_DMA_CHECK_OPCODE (TXE + 0x0000002000A8)
-#define SEND_DMA_CHECK_PARTITION_KEY (TXE + 0x000000200098)
-#define SEND_DMA_CHECK_SLID (TXE + 0x0000002000A0)
-#define SEND_DMA_CHECK_SLID_MASK_MASK 0xFFFFull
-#define SEND_DMA_CHECK_SLID_MASK_SHIFT 16
-#define SEND_DMA_CHECK_SLID_VALUE_MASK 0xFFFFull
-#define SEND_DMA_CHECK_SLID_VALUE_SHIFT 0
-#define SEND_DMA_CHECK_VL (TXE + 0x000000200088)
-#define SEND_DMA_CTRL (TXE + 0x000000200000)
-#define SEND_DMA_CTRL_SDMA_CLEANUP_SMASK 0x4ull
-#define SEND_DMA_CTRL_SDMA_ENABLE_SMASK 0x1ull
-#define SEND_DMA_CTRL_SDMA_HALT_SMASK 0x2ull
-#define SEND_DMA_CTRL_SDMA_INT_ENABLE_SMASK 0x8ull
-#define SEND_DMA_DESC_CNT (TXE + 0x000000200050)
-#define SEND_DMA_DESC_CNT_CNT_MASK 0xFFFFull
-#define SEND_DMA_DESC_CNT_CNT_SHIFT 0
-#define SEND_DMA_ENG_ERR_CLEAR (TXE + 0x000000200070)
-#define SEND_DMA_ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_MASK 0x1ull
-#define SEND_DMA_ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SHIFT 18
-#define SEND_DMA_ENG_ERR_MASK (TXE + 0x000000200068)
-#define SEND_DMA_ENG_ERR_STATUS (TXE + 0x000000200060)
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_ASSEMBLY_UNC_ERR_SMASK 0x8000ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_DESC_TABLE_UNC_ERR_SMASK 0x4000ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_FIRST_DESC_ERR_SMASK 0x10ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_GEN_MISMATCH_ERR_SMASK 0x2ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK 0x40ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_ADDRESS_ERR_SMASK 0x800ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_LENGTH_ERR_SMASK 0x1000ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SMASK \
-               0x40000ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_SELECT_ERR_SMASK 0x400ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_STORAGE_UNC_ERR_SMASK \
-               0x20000ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_LENGTH_MISMATCH_ERR_SMASK 0x80ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_MEM_READ_ERR_SMASK 0x20ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_PACKET_DESC_OVERFLOW_ERR_SMASK \
-               0x100ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_PACKET_TRACKING_UNC_ERR_SMASK \
-               0x10000ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_TAIL_OUT_OF_BOUNDS_ERR_SMASK 0x8ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_TIMEOUT_ERR_SMASK 0x2000ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_TOO_LONG_ERR_SMASK 0x4ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_WRONG_DW_ERR_SMASK 0x1ull
-#define SEND_DMA_ENGINES (TXE + 0x000000000018)
-#define SEND_DMA_ERR_CLEAR (TXE + 0x000000000070)
-#define SEND_DMA_ERR_MASK (TXE + 0x000000000068)
-#define SEND_DMA_ERR_STATUS (TXE + 0x000000000060)
-#define SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK 0x2ull
-#define SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_COR_ERR_SMASK 0x8ull
-#define SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK 0x4ull
-#define SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK 0x1ull
-#define SEND_DMA_HEAD (TXE + 0x000000200028)
-#define SEND_DMA_HEAD_ADDR (TXE + 0x000000200030)
-#define SEND_DMA_LEN_GEN (TXE + 0x000000200018)
-#define SEND_DMA_LEN_GEN_GENERATION_SHIFT 16
-#define SEND_DMA_LEN_GEN_LENGTH_SHIFT 6
-#define SEND_DMA_MEMORY (TXE + 0x0000002000B0)
-#define SEND_DMA_MEMORY_SDMA_MEMORY_CNT_SHIFT 16
-#define SEND_DMA_MEMORY_SDMA_MEMORY_INDEX_SHIFT 0
-#define SEND_DMA_MEM_SIZE (TXE + 0x000000000028)
-#define SEND_DMA_PRIORITY_THLD (TXE + 0x000000200038)
-#define SEND_DMA_RELOAD_CNT (TXE + 0x000000200048)
-#define SEND_DMA_STATUS (TXE + 0x000000200008)
-#define SEND_DMA_STATUS_ENG_CLEANED_UP_SMASK 0x200000000000000ull
-#define SEND_DMA_STATUS_ENG_HALTED_SMASK 0x100000000000000ull
-#define SEND_DMA_TAIL (TXE + 0x000000200020)
-#define SEND_EGRESS_CTXT_STATUS (TXE + 0x000000000800)
-#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_HALT_STATUS_SMASK 0x10000ull
-#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SHIFT 0
-#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SMASK \
-               0x3FFFull
-#define SEND_EGRESS_ERR_CLEAR (TXE + 0x000000000090)
-#define SEND_EGRESS_ERR_INFO (TXE + 0x000000000F00)
-#define SEND_EGRESS_ERR_INFO_BAD_PKT_LEN_ERR_SMASK 0x20000ull
-#define SEND_EGRESS_ERR_INFO_BYPASS_ERR_SMASK 0x800ull
-#define SEND_EGRESS_ERR_INFO_GRH_ERR_SMASK 0x400ull
-#define SEND_EGRESS_ERR_INFO_JOB_KEY_ERR_SMASK 0x4ull
-#define SEND_EGRESS_ERR_INFO_KDETH_PACKETS_ERR_SMASK 0x1000ull
-#define SEND_EGRESS_ERR_INFO_NON_KDETH_PACKETS_ERR_SMASK 0x2000ull
-#define SEND_EGRESS_ERR_INFO_OPCODE_ERR_SMASK 0x20ull
-#define SEND_EGRESS_ERR_INFO_PARTITION_KEY_ERR_SMASK 0x8ull
-#define SEND_EGRESS_ERR_INFO_PBC_STATIC_RATE_CONTROL_ERR_SMASK 0x100000ull
-#define SEND_EGRESS_ERR_INFO_PBC_TEST_ERR_SMASK 0x10000ull
-#define SEND_EGRESS_ERR_INFO_RAW_ERR_SMASK 0x100ull
-#define SEND_EGRESS_ERR_INFO_RAW_IPV6_ERR_SMASK 0x200ull
-#define SEND_EGRESS_ERR_INFO_SLID_ERR_SMASK 0x10ull
-#define SEND_EGRESS_ERR_INFO_TOO_LONG_BYPASS_PACKETS_ERR_SMASK 0x80000ull
-#define SEND_EGRESS_ERR_INFO_TOO_LONG_IB_PACKET_ERR_SMASK 0x40000ull
-#define SEND_EGRESS_ERR_INFO_TOO_SMALL_BYPASS_PACKETS_ERR_SMASK 0x8000ull
-#define SEND_EGRESS_ERR_INFO_TOO_SMALL_IB_PACKETS_ERR_SMASK 0x4000ull
-#define SEND_EGRESS_ERR_INFO_VL_ERR_SMASK 0x2ull
-#define SEND_EGRESS_ERR_INFO_VL_MAPPING_ERR_SMASK 0x40ull
-#define SEND_EGRESS_ERR_MASK (TXE + 0x000000000088)
-#define SEND_EGRESS_ERR_SOURCE (TXE + 0x000000000F08)
-#define SEND_EGRESS_ERR_STATUS (TXE + 0x000000000080)
-#define SEND_EGRESS_ERR_STATUS_TX_CONFIG_PARITY_ERR_SMASK 0x8000ull
-#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_OVERRUN_ERR_SMASK \
-               0x200000000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_PARITY_ERR_SMASK \
-               0x20000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_VL_ERR_SMASK \
-               0x800000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_COR_ERR_SMASK \
-               0x2000000000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_UNC_ERR_SMASK \
-               0x200000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_UNDERRUN_OR_PARITY_ERR_SMASK \
-               0x8ull
-#define SEND_EGRESS_ERR_STATUS_TX_HCRC_INSERTION_ERR_SMASK \
-               0x400000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_ILLEGAL_VL_ERR_SMASK 0x1000ull
-#define SEND_EGRESS_ERR_STATUS_TX_INCORRECT_LINK_STATE_ERR_SMASK 0x20ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_CSR_PARITY_ERR_SMASK 0x2000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO0_COR_ERR_SMASK \
-               0x1000000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO0_UNC_OR_PARITY_ERR_SMASK \
-               0x100000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO1_COR_ERR_SMASK \
-               0x2000000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO1_UNC_OR_PARITY_ERR_SMASK \
-               0x200000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO2_COR_ERR_SMASK \
-               0x4000000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO2_UNC_OR_PARITY_ERR_SMASK \
-               0x400000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO3_COR_ERR_SMASK \
-               0x8000000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO3_UNC_OR_PARITY_ERR_SMASK \
-               0x800000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO4_COR_ERR_SMASK \
-               0x10000000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO4_UNC_OR_PARITY_ERR_SMASK \
-               0x1000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO5_COR_ERR_SMASK \
-               0x20000000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO5_UNC_OR_PARITY_ERR_SMASK \
-               0x2000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO6_COR_ERR_SMASK \
-               0x40000000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO6_UNC_OR_PARITY_ERR_SMASK \
-               0x4000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO7_COR_ERR_SMASK \
-               0x80000000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO7_UNC_OR_PARITY_ERR_SMASK \
-               0x8000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO8_COR_ERR_SMASK \
-               0x100000000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO8_UNC_OR_PARITY_ERR_SMASK \
-               0x10000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LINKDOWN_ERR_SMASK 0x10ull
-#define SEND_EGRESS_ERR_STATUS_TX_PIO_LAUNCH_INTF_PARITY_ERR_SMASK 0x80ull
-#define SEND_EGRESS_ERR_STATUS_TX_PKT_INTEGRITY_MEM_COR_ERR_SMASK 0x1ull
-#define SEND_EGRESS_ERR_STATUS_TX_PKT_INTEGRITY_MEM_UNC_ERR_SMASK 0x2ull
-#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_COR_ERR_SMASK \
-               0x1000000000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_CSR_UNC_ERR_SMASK \
-               0x8000000000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_UNC_ERR_SMASK \
-               0x100000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_COR_ERR_SMASK \
-               0x800000000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_CSR_UNC_ERR_SMASK \
-               0x4000000000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_UNC_ERR_SMASK \
-               0x80000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SB_HDR_COR_ERR_SMASK 0x400000000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SB_HDR_UNC_ERR_SMASK 0x40000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SBRD_CTL_CSR_PARITY_ERR_SMASK 0x4000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SBRD_CTL_STATE_MACHINE_PARITY_ERR_SMASK \
-               0x800ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA0_DISALLOWED_PACKET_ERR_SMASK \
-               0x10000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA10_DISALLOWED_PACKET_ERR_SMASK \
-               0x4000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA11_DISALLOWED_PACKET_ERR_SMASK \
-               0x8000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA12_DISALLOWED_PACKET_ERR_SMASK \
-               0x10000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA13_DISALLOWED_PACKET_ERR_SMASK \
-               0x20000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA14_DISALLOWED_PACKET_ERR_SMASK \
-               0x40000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA15_DISALLOWED_PACKET_ERR_SMASK \
-               0x80000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA1_DISALLOWED_PACKET_ERR_SMASK \
-               0x20000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA2_DISALLOWED_PACKET_ERR_SMASK \
-               0x40000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA3_DISALLOWED_PACKET_ERR_SMASK \
-               0x80000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA4_DISALLOWED_PACKET_ERR_SMASK \
-               0x100000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA5_DISALLOWED_PACKET_ERR_SMASK \
-               0x200000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA6_DISALLOWED_PACKET_ERR_SMASK \
-               0x400000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA7_DISALLOWED_PACKET_ERR_SMASK \
-               0x800000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA8_DISALLOWED_PACKET_ERR_SMASK \
-               0x1000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA9_DISALLOWED_PACKET_ERR_SMASK \
-               0x2000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA_LAUNCH_INTF_PARITY_ERR_SMASK \
-               0x100ull
-#define SEND_EGRESS_SEND_DMA_STATUS (TXE + 0x000000000E00)
-#define SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT 0
-#define SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SMASK \
-               0x3FFFull
-#define SEND_ERR_CLEAR (TXE + 0x0000000000F0)
-#define SEND_ERR_MASK (TXE + 0x0000000000E8)
-#define SEND_ERR_STATUS (TXE + 0x0000000000E0)
-#define SEND_ERR_STATUS_SEND_CSR_PARITY_ERR_SMASK 0x1ull
-#define SEND_ERR_STATUS_SEND_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull
-#define SEND_ERR_STATUS_SEND_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull
-#define SEND_HIGH_PRIORITY_LIMIT (TXE + 0x000000000030)
-#define SEND_HIGH_PRIORITY_LIMIT_LIMIT_MASK 0x3FFFull
-#define SEND_HIGH_PRIORITY_LIMIT_LIMIT_SHIFT 0
-#define SEND_HIGH_PRIORITY_LIST (TXE + 0x000000000180)
-#define SEND_LEN_CHECK0 (TXE + 0x0000000000D0)
-#define SEND_LEN_CHECK0_LEN_VL0_MASK 0xFFFull
-#define SEND_LEN_CHECK0_LEN_VL1_SHIFT 12
-#define SEND_LEN_CHECK1 (TXE + 0x0000000000D8)
-#define SEND_LEN_CHECK1_LEN_VL15_MASK 0xFFFull
-#define SEND_LEN_CHECK1_LEN_VL15_SHIFT 48
-#define SEND_LEN_CHECK1_LEN_VL4_MASK 0xFFFull
-#define SEND_LEN_CHECK1_LEN_VL5_SHIFT 12
-#define SEND_LOW_PRIORITY_LIST (TXE + 0x000000000100)
-#define SEND_LOW_PRIORITY_LIST_VL_MASK 0x7ull
-#define SEND_LOW_PRIORITY_LIST_VL_SHIFT 16
-#define SEND_LOW_PRIORITY_LIST_WEIGHT_MASK 0xFFull
-#define SEND_LOW_PRIORITY_LIST_WEIGHT_SHIFT 0
-#define SEND_PIO_ERR_CLEAR (TXE + 0x000000000050)
-#define SEND_PIO_ERR_CLEAR_PIO_INIT_SM_IN_ERR_SMASK 0x20000ull
-#define SEND_PIO_ERR_MASK (TXE + 0x000000000048)
-#define SEND_PIO_ERR_STATUS (TXE + 0x000000000040)
-#define SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK \
-               0x1000000ull
-#define SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK 0x8000ull
-#define SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK 0x4ull
-#define SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK \
-               0x100000000ull
-#define SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_COR_ERR_SMASK 0x100000ull
-#define SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK 0x80000ull
-#define SEND_PIO_ERR_STATUS_PIO_INIT_SM_IN_ERR_SMASK 0x20000ull
-#define SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK \
-               0x200000000ull
-#define SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK 0x20ull
-#define SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK \
-               0x400000000ull
-#define SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK 0x40ull
-#define SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK \
-               0x800000000ull
-#define SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK 0x200ull
-#define SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK 0x40000ull
-#define SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK 0x10000000ull
-#define SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK 0x10000ull
-#define SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK 0x20000000ull
-#define SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK 0x8ull
-#define SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK 0x10ull
-#define SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK 0x80ull
-#define SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK \
-               0x100ull
-#define SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK 0x400ull
-#define SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK 0x400000ull
-#define SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK 0x8000000ull
-#define SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK 0x4000000ull
-#define SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK 0x2000000ull
-#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_COR_ERR_SMASK 0x2000ull
-#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK 0x800ull
-#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_COR_ERR_SMASK 0x4000ull
-#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK 0x1000ull
-#define SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK 0x2ull
-#define SEND_PIO_ERR_STATUS_PIO_WRITE_BAD_CTXT_ERR_SMASK 0x1ull
-#define SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK 0x200000ull
-#define SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK 0x800000ull
-#define SEND_PIO_INIT_CTXT (TXE + 0x000000000038)
-#define SEND_PIO_INIT_CTXT_PIO_ALL_CTXT_INIT_SMASK 0x1ull
-#define SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_MASK 0xFFull
-#define SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_SHIFT 8
-#define SEND_PIO_INIT_CTXT_PIO_INIT_ERR_SMASK 0x8ull
-#define SEND_PIO_INIT_CTXT_PIO_INIT_IN_PROGRESS_SMASK 0x4ull
-#define SEND_PIO_INIT_CTXT_PIO_SINGLE_CTXT_INIT_SMASK 0x2ull
-#define SEND_PIO_MEM_SIZE (TXE + 0x000000000020)
-#define SEND_SC2VLT0 (TXE + 0x0000000000B0)
-#define SEND_SC2VLT0_SC0_SHIFT 0
-#define SEND_SC2VLT0_SC1_SHIFT 8
-#define SEND_SC2VLT0_SC2_SHIFT 16
-#define SEND_SC2VLT0_SC3_SHIFT 24
-#define SEND_SC2VLT0_SC4_SHIFT 32
-#define SEND_SC2VLT0_SC5_SHIFT 40
-#define SEND_SC2VLT0_SC6_SHIFT 48
-#define SEND_SC2VLT0_SC7_SHIFT 56
-#define SEND_SC2VLT1 (TXE + 0x0000000000B8)
-#define SEND_SC2VLT1_SC10_SHIFT 16
-#define SEND_SC2VLT1_SC11_SHIFT 24
-#define SEND_SC2VLT1_SC12_SHIFT 32
-#define SEND_SC2VLT1_SC13_SHIFT 40
-#define SEND_SC2VLT1_SC14_SHIFT 48
-#define SEND_SC2VLT1_SC15_SHIFT 56
-#define SEND_SC2VLT1_SC8_SHIFT 0
-#define SEND_SC2VLT1_SC9_SHIFT 8
-#define SEND_SC2VLT2 (TXE + 0x0000000000C0)
-#define SEND_SC2VLT2_SC16_SHIFT 0
-#define SEND_SC2VLT2_SC17_SHIFT 8
-#define SEND_SC2VLT2_SC18_SHIFT 16
-#define SEND_SC2VLT2_SC19_SHIFT 24
-#define SEND_SC2VLT2_SC20_SHIFT 32
-#define SEND_SC2VLT2_SC21_SHIFT 40
-#define SEND_SC2VLT2_SC22_SHIFT 48
-#define SEND_SC2VLT2_SC23_SHIFT 56
-#define SEND_SC2VLT3 (TXE + 0x0000000000C8)
-#define SEND_SC2VLT3_SC24_SHIFT 0
-#define SEND_SC2VLT3_SC25_SHIFT 8
-#define SEND_SC2VLT3_SC26_SHIFT 16
-#define SEND_SC2VLT3_SC27_SHIFT 24
-#define SEND_SC2VLT3_SC28_SHIFT 32
-#define SEND_SC2VLT3_SC29_SHIFT 40
-#define SEND_SC2VLT3_SC30_SHIFT 48
-#define SEND_SC2VLT3_SC31_SHIFT 56
-#define SEND_STATIC_RATE_CONTROL (TXE + 0x0000000000A8)
-#define SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT 0
-#define SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK 0xFFFFull
-#define PCIE_CFG_REG_PL2 (PCIE + 0x000000000708)
-#define PCIE_CFG_REG_PL3 (PCIE + 0x00000000070C)
-#define PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SHIFT 27
-#define PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SMASK 0x38000000
-#define PCIE_CFG_REG_PL102 (PCIE + 0x000000000898)
-#define PCIE_CFG_REG_PL102_GEN3_EQ_POST_CURSOR_PSET_SHIFT 12
-#define PCIE_CFG_REG_PL102_GEN3_EQ_CURSOR_PSET_SHIFT 6
-#define PCIE_CFG_REG_PL102_GEN3_EQ_PRE_CURSOR_PSET_SHIFT 0
-#define PCIE_CFG_REG_PL103 (PCIE + 0x00000000089C)
-#define PCIE_CFG_REG_PL105 (PCIE + 0x0000000008A4)
-#define PCIE_CFG_REG_PL105_GEN3_EQ_VIOLATE_COEF_RULES_SMASK 0x1ull
-#define PCIE_CFG_REG_PL2_LOW_PWR_ENT_CNT_SHIFT 24
-#define PCIE_CFG_REG_PL100 (PCIE + 0x000000000890)
-#define PCIE_CFG_REG_PL100_EQ_EIEOS_CNT_SMASK 0x400ull
-#define PCIE_CFG_REG_PL101 (PCIE + 0x000000000894)
-#define PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_FS_SHIFT 6
-#define PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_LF_SHIFT 0
-#define PCIE_CFG_REG_PL106 (PCIE + 0x0000000008A8)
-#define PCIE_CFG_REG_PL106_GEN3_EQ_PSET_REQ_VEC_SHIFT 8
-#define PCIE_CFG_REG_PL106_GEN3_EQ_EVAL2MS_DISABLE_SMASK 0x20ull
-#define PCIE_CFG_REG_PL106_GEN3_EQ_PHASE23_EXIT_MODE_SMASK 0x10ull
-#define CCE_INT_BLOCKED (CCE + 0x000000110C00)
-#define SEND_DMA_IDLE_CNT (TXE + 0x000000200040)
-#define SEND_DMA_DESC_FETCHED_CNT (TXE + 0x000000200058)
-#define CCE_MSIX_PBA_OFFSET 0X0110000
-
-#endif          /* DEF_CHIP_REG */
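
Note on the register defines removed in the hunk above: each CSR field is typically described three ways — a FIELD_MASK (the value mask), a FIELD_SHIFT (the bit offset), and a FIELD_SMASK (the mask already shifted into position). The sketch below shows the conventional way such defines are consumed, using the ASIC_STS_THERM temperature fields listed above; this is only an illustration under that assumption — the csr_field() helper, the sample CSR value, and main() are hypothetical and not part of the driver.

    #include <stdint.h>
    #include <stdio.h>

    /* Representative values copied from the defines deleted above. */
    #define ASIC_STS_THERM_CURR_TEMP_MASK  0x7FFull
    #define ASIC_STS_THERM_CURR_TEMP_SHIFT 2
    #define ASIC_STS_THERM_CRIT_TEMP_MASK  0x7FFull
    #define ASIC_STS_THERM_CRIT_TEMP_SHIFT 18

    /* Hypothetical helper: extract a field from a 64-bit CSR value by
     * shifting it down and applying the value mask. An SMASK, by contrast,
     * would be applied directly to the unshifted CSR value. */
    static inline uint64_t csr_field(uint64_t csr, uint64_t mask, int shift)
    {
            return (csr >> shift) & mask;
    }

    int main(void)
    {
            uint64_t therm = 0x0000000000012345ull; /* made-up raw CSR read */

            printf("curr temp field: 0x%llx\n",
                   (unsigned long long)csr_field(therm,
                                                 ASIC_STS_THERM_CURR_TEMP_MASK,
                                                 ASIC_STS_THERM_CURR_TEMP_SHIFT));
            printf("crit temp field: 0x%llx\n",
                   (unsigned long long)csr_field(therm,
                                                 ASIC_STS_THERM_CRIT_TEMP_MASK,
                                                 ASIC_STS_THERM_CRIT_TEMP_SHIFT));
            return 0;
    }
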
diff --git a/drivers/staging/rdma/hfi1/common.h b/drivers/staging/rdma/hfi1/common.h
deleted file mode 100644 (file)
index e9b6bb3..0000000
--- a/drivers/staging/rdma/hfi1/common.h
+++ /dev/null
@@ -1,408 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifndef _COMMON_H
-#define _COMMON_H
-
-#include <rdma/hfi/hfi1_user.h>
-
-/*
- * This file contains defines, structures, etc. that are used
- * to communicate between kernel and user code.
- */
-
-/* version of protocol header (known to chip also). In the long run,
- * we should be able to generate and accept a range of version numbers;
- * for now we only accept one, and it's compiled in.
- */
-#define IPS_PROTO_VERSION 2
-
-/*
- * These are compile time constants that you may want to enable or disable
- * if you are trying to debug problems with code or performance.
- * HFI1_VERBOSE_TRACING define as 1 if you want additional tracing in
- * fast path code
- * HFI1_TRACE_REGWRITES define as 1 if you want register writes to be
- * traced in fast path code
- * _HFI1_TRACING define as 0 if you want to remove all tracing in a
- * compilation unit
- */
-
-/*
- * If a packet's QP[23:16] bits match this value, then it is
- * a PSM packet and the hardware will expect a KDETH header
- * following the BTH.
- */
-#define DEFAULT_KDETH_QP 0x80
-
-/* driver/hw feature set bitmask */
-#define HFI1_CAP_USER_SHIFT      24
-#define HFI1_CAP_MASK            ((1UL << HFI1_CAP_USER_SHIFT) - 1)
-/* locked flag - if set, only HFI1_CAP_WRITABLE_MASK bits can be set */
-#define HFI1_CAP_LOCKED_SHIFT    63
-#define HFI1_CAP_LOCKED_MASK     0x1ULL
-#define HFI1_CAP_LOCKED_SMASK    (HFI1_CAP_LOCKED_MASK << HFI1_CAP_LOCKED_SHIFT)
-/* extra bits used between kernel and user processes */
-#define HFI1_CAP_MISC_SHIFT      (HFI1_CAP_USER_SHIFT * 2)
-#define HFI1_CAP_MISC_MASK       ((1ULL << (HFI1_CAP_LOCKED_SHIFT - \
-                                          HFI1_CAP_MISC_SHIFT)) - 1)
-
-#define HFI1_CAP_KSET(cap) ({ hfi1_cap_mask |= HFI1_CAP_##cap; hfi1_cap_mask; })
-#define HFI1_CAP_KCLEAR(cap)                                           \
-       ({                                                              \
-               hfi1_cap_mask &= ~HFI1_CAP_##cap;                       \
-               hfi1_cap_mask;                                          \
-       })
-#define HFI1_CAP_USET(cap)                                             \
-       ({                                                              \
-               hfi1_cap_mask |= (HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT); \
-               hfi1_cap_mask;                                          \
-               })
-#define HFI1_CAP_UCLEAR(cap)                                           \
-       ({                                                              \
-               hfi1_cap_mask &= ~(HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT); \
-               hfi1_cap_mask;                                          \
-       })
-#define HFI1_CAP_SET(cap)                                              \
-       ({                                                              \
-               hfi1_cap_mask |= (HFI1_CAP_##cap | (HFI1_CAP_##cap <<   \
-                                                 HFI1_CAP_USER_SHIFT)); \
-               hfi1_cap_mask;                                          \
-       })
-#define HFI1_CAP_CLEAR(cap)                                            \
-       ({                                                              \
-               hfi1_cap_mask &= ~(HFI1_CAP_##cap |                     \
-                                 (HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT)); \
-               hfi1_cap_mask;                                          \
-       })
-#define HFI1_CAP_LOCK()                                                        \
-       ({ hfi1_cap_mask |= HFI1_CAP_LOCKED_SMASK; hfi1_cap_mask; })
-#define HFI1_CAP_LOCKED() (!!(hfi1_cap_mask & HFI1_CAP_LOCKED_SMASK))
-/*
- * The set of capability bits that can be changed after initial load
- * This set is the same for kernel and user contexts. However, for
- * user contexts, the set can be further filtered by using the
- * HFI1_CAP_RESERVED_MASK bits.
- */
-#define HFI1_CAP_WRITABLE_MASK   (HFI1_CAP_SDMA_AHG |                  \
-                                 HFI1_CAP_HDRSUPP |                    \
-                                 HFI1_CAP_MULTI_PKT_EGR |              \
-                                 HFI1_CAP_NODROP_RHQ_FULL |            \
-                                 HFI1_CAP_NODROP_EGR_FULL |            \
-                                 HFI1_CAP_ALLOW_PERM_JKEY |            \
-                                 HFI1_CAP_STATIC_RATE_CTRL |           \
-                                 HFI1_CAP_PRINT_UNIMPL |               \
-                                 HFI1_CAP_TID_UNMAP)
-/*
- * A set of capability bits that are "global" and are not allowed to be
- * set in the user bitmask.
- */
-#define HFI1_CAP_RESERVED_MASK   ((HFI1_CAP_SDMA |                     \
-                                 HFI1_CAP_USE_SDMA_HEAD |              \
-                                 HFI1_CAP_EXTENDED_PSN |               \
-                                 HFI1_CAP_PRINT_UNIMPL |               \
-                                 HFI1_CAP_NO_INTEGRITY |               \
-                                 HFI1_CAP_PKEY_CHECK) <<               \
-                                HFI1_CAP_USER_SHIFT)
-/*
- * Set of capabilities that need to be enabled for kernel context in
- * order to be allowed for user contexts, as well.
- */
-#define HFI1_CAP_MUST_HAVE_KERN (HFI1_CAP_STATIC_RATE_CTRL)
-/* Default enabled capabilities (both kernel and user) */
-#define HFI1_CAP_MASK_DEFAULT    (HFI1_CAP_HDRSUPP |                   \
-                                HFI1_CAP_NODROP_RHQ_FULL |             \
-                                HFI1_CAP_NODROP_EGR_FULL |             \
-                                HFI1_CAP_SDMA |                        \
-                                HFI1_CAP_PRINT_UNIMPL |                \
-                                HFI1_CAP_STATIC_RATE_CTRL |            \
-                                HFI1_CAP_PKEY_CHECK |                  \
-                                HFI1_CAP_MULTI_PKT_EGR |               \
-                                HFI1_CAP_EXTENDED_PSN |                \
-                                ((HFI1_CAP_HDRSUPP |                   \
-                                  HFI1_CAP_MULTI_PKT_EGR |             \
-                                  HFI1_CAP_STATIC_RATE_CTRL |          \
-                                  HFI1_CAP_PKEY_CHECK |                \
-                                  HFI1_CAP_EARLY_CREDIT_RETURN) <<     \
-                                 HFI1_CAP_USER_SHIFT))
-/*
- * A bitmask of kernel/global capabilities that should be communicated
- * to user level processes.
- */
-#define HFI1_CAP_K2U (HFI1_CAP_SDMA |                  \
-                    HFI1_CAP_EXTENDED_PSN |            \
-                    HFI1_CAP_PKEY_CHECK |              \
-                    HFI1_CAP_NO_INTEGRITY)
-
-#define HFI1_USER_SWVERSION ((HFI1_USER_SWMAJOR << 16) | HFI1_USER_SWMINOR)
-
-#ifndef HFI1_KERN_TYPE
-#define HFI1_KERN_TYPE 0
-#endif
-
-/*
- * Similarly, this is the kernel version going back to the user.  It's
- * slightly different, in that we want to tell if the driver was built as
- * part of an Intel release, or from the driver from openfabrics.org,
- * kernel.org, or a standard distribution, for support reasons.
- * The high bit is 0 for non-Intel and 1 for Intel-built/supplied.
- *
- * It's returned by the driver to the user code during initialization in the
- * spi_sw_version field of hfi1_base_info, so the user code can in turn
- * check for compatibility with the kernel.
-*/
-#define HFI1_KERN_SWVERSION ((HFI1_KERN_TYPE << 31) | HFI1_USER_SWVERSION)
-
-/*
- * Define the driver version number.  This is something that refers only
- * to the driver itself, not the software interfaces it supports.
- */
-#ifndef HFI1_DRIVER_VERSION_BASE
-#define HFI1_DRIVER_VERSION_BASE "0.9-294"
-#endif
-
-/* create the final driver version string */
-#ifdef HFI1_IDSTR
-#define HFI1_DRIVER_VERSION HFI1_DRIVER_VERSION_BASE " " HFI1_IDSTR
-#else
-#define HFI1_DRIVER_VERSION HFI1_DRIVER_VERSION_BASE
-#endif
-
-/*
- * Diagnostics can send a packet by writing the following
- * struct to the diag packet special file.
- *
- * This allows a custom PBC qword, so that special modes and deliberate
- * changes to CRCs can be used.
- */
-#define _DIAG_PKT_VERS 1
-struct diag_pkt {
-       __u16 version;          /* structure version */
-       __u16 unit;             /* which device */
-       __u16 sw_index;         /* send sw index to use */
-       __u16 len;              /* data length, in bytes */
-       __u16 port;             /* port number */
-       __u16 unused;
-       __u32 flags;            /* call flags */
-       __u64 data;             /* user data pointer */
-       __u64 pbc;              /* PBC for the packet */
-};
-
-/* diag_pkt flags */
-#define F_DIAGPKT_WAIT 0x1     /* wait until packet is sent */
-
-/*
- * The next set of defines are for packet headers, and chip register
- * and memory bits that are visible to and/or used by user-mode software.
- */
-
-/*
- * Receive Header Flags
- */
-#define RHF_PKT_LEN_SHIFT      0
-#define RHF_PKT_LEN_MASK       0xfffull
-#define RHF_PKT_LEN_SMASK (RHF_PKT_LEN_MASK << RHF_PKT_LEN_SHIFT)
-
-#define RHF_RCV_TYPE_SHIFT     12
-#define RHF_RCV_TYPE_MASK      0x7ull
-#define RHF_RCV_TYPE_SMASK (RHF_RCV_TYPE_MASK << RHF_RCV_TYPE_SHIFT)
-
-#define RHF_USE_EGR_BFR_SHIFT  15
-#define RHF_USE_EGR_BFR_MASK   0x1ull
-#define RHF_USE_EGR_BFR_SMASK (RHF_USE_EGR_BFR_MASK << RHF_USE_EGR_BFR_SHIFT)
-
-#define RHF_EGR_INDEX_SHIFT    16
-#define RHF_EGR_INDEX_MASK     0x7ffull
-#define RHF_EGR_INDEX_SMASK (RHF_EGR_INDEX_MASK << RHF_EGR_INDEX_SHIFT)
-
-#define RHF_DC_INFO_SHIFT      27
-#define RHF_DC_INFO_MASK       0x1ull
-#define RHF_DC_INFO_SMASK (RHF_DC_INFO_MASK << RHF_DC_INFO_SHIFT)
-
-#define RHF_RCV_SEQ_SHIFT      28
-#define RHF_RCV_SEQ_MASK       0xfull
-#define RHF_RCV_SEQ_SMASK (RHF_RCV_SEQ_MASK << RHF_RCV_SEQ_SHIFT)
-
-#define RHF_EGR_OFFSET_SHIFT   32
-#define RHF_EGR_OFFSET_MASK    0xfffull
-#define RHF_EGR_OFFSET_SMASK (RHF_EGR_OFFSET_MASK << RHF_EGR_OFFSET_SHIFT)
-#define RHF_HDRQ_OFFSET_SHIFT  44
-#define RHF_HDRQ_OFFSET_MASK   0x1ffull
-#define RHF_HDRQ_OFFSET_SMASK (RHF_HDRQ_OFFSET_MASK << RHF_HDRQ_OFFSET_SHIFT)
-#define RHF_K_HDR_LEN_ERR      (0x1ull << 53)
-#define RHF_DC_UNC_ERR         (0x1ull << 54)
-#define RHF_DC_ERR             (0x1ull << 55)
-#define RHF_RCV_TYPE_ERR_SHIFT 56
-#define RHF_RCV_TYPE_ERR_MASK  0x7ul
-#define RHF_RCV_TYPE_ERR_SMASK (RHF_RCV_TYPE_ERR_MASK << RHF_RCV_TYPE_ERR_SHIFT)
-#define RHF_TID_ERR            (0x1ull << 59)
-#define RHF_LEN_ERR            (0x1ull << 60)
-#define RHF_ECC_ERR            (0x1ull << 61)
-#define RHF_VCRC_ERR           (0x1ull << 62)
-#define RHF_ICRC_ERR           (0x1ull << 63)
-
-#define RHF_ERROR_SMASK 0xffe0000000000000ull          /* bits 63:53 */
-
-/* RHF receive types */
-#define RHF_RCV_TYPE_EXPECTED 0
-#define RHF_RCV_TYPE_EAGER    1
-#define RHF_RCV_TYPE_IB       2 /* normal IB, IB Raw, or IPv6 */
-#define RHF_RCV_TYPE_ERROR    3
-#define RHF_RCV_TYPE_BYPASS   4
-#define RHF_RCV_TYPE_INVALID5 5
-#define RHF_RCV_TYPE_INVALID6 6
-#define RHF_RCV_TYPE_INVALID7 7
-
-/* RHF receive type error - expected packet errors */
-#define RHF_RTE_EXPECTED_FLOW_SEQ_ERR  0x2
-#define RHF_RTE_EXPECTED_FLOW_GEN_ERR  0x4
-
-/* RHF receive type error - eager packet errors */
-#define RHF_RTE_EAGER_NO_ERR           0x0
-
-/* RHF receive type error - IB packet errors */
-#define RHF_RTE_IB_NO_ERR              0x0
-
-/* RHF receive type error - error packet errors */
-#define RHF_RTE_ERROR_NO_ERR           0x0
-#define RHF_RTE_ERROR_OP_CODE_ERR      0x1
-#define RHF_RTE_ERROR_KHDR_MIN_LEN_ERR 0x2
-#define RHF_RTE_ERROR_KHDR_HCRC_ERR    0x3
-#define RHF_RTE_ERROR_KHDR_KVER_ERR    0x4
-#define RHF_RTE_ERROR_CONTEXT_ERR      0x5
-#define RHF_RTE_ERROR_KHDR_TID_ERR     0x6
-
-/* RHF receive type error - bypass packet errors */
-#define RHF_RTE_BYPASS_NO_ERR          0x0
-
-/*
- * This structure contains the first field common to all protocols
- * that employ this chip.
- */
-struct hfi1_message_header {
-       __be16 lrh[4];
-};
-
-/* IB - LRH header constants */
-#define HFI1_LRH_GRH 0x0003      /* first word of IB LRH - next header: GRH */
-#define HFI1_LRH_BTH 0x0002      /* first word of IB LRH - next header: BTH */
-
-/* misc. */
-#define SIZE_OF_CRC 1
-
-#define LIM_MGMT_P_KEY       0x7FFF
-#define FULL_MGMT_P_KEY      0xFFFF
-
-#define DEFAULT_P_KEY LIM_MGMT_P_KEY
-#define HFI1_AETH_CREDIT_SHIFT 24
-#define HFI1_AETH_CREDIT_MASK 0x1F
-#define HFI1_AETH_CREDIT_INVAL 0x1F
-#define HFI1_MSN_MASK 0xFFFFFF
-#define HFI1_FECN_SHIFT 31
-#define HFI1_FECN_MASK 1
-#define HFI1_FECN_SMASK BIT(HFI1_FECN_SHIFT)
-#define HFI1_BECN_SHIFT 30
-#define HFI1_BECN_MASK 1
-#define HFI1_BECN_SMASK BIT(HFI1_BECN_SHIFT)
-
-static inline __u64 rhf_to_cpu(const __le32 *rbuf)
-{
-       return __le64_to_cpu(*((__le64 *)rbuf));
-}
-
-static inline u64 rhf_err_flags(u64 rhf)
-{
-       return rhf & RHF_ERROR_SMASK;
-}
-
-static inline u32 rhf_rcv_type(u64 rhf)
-{
-       return (rhf >> RHF_RCV_TYPE_SHIFT) & RHF_RCV_TYPE_MASK;
-}
-
-static inline u32 rhf_rcv_type_err(u64 rhf)
-{
-       return (rhf >> RHF_RCV_TYPE_ERR_SHIFT) & RHF_RCV_TYPE_ERR_MASK;
-}
-
-/* return size is in bytes, not DWORDs */
-static inline u32 rhf_pkt_len(u64 rhf)
-{
-       return ((rhf & RHF_PKT_LEN_SMASK) >> RHF_PKT_LEN_SHIFT) << 2;
-}
-
-static inline u32 rhf_egr_index(u64 rhf)
-{
-       return (rhf >> RHF_EGR_INDEX_SHIFT) & RHF_EGR_INDEX_MASK;
-}
-
-static inline u32 rhf_rcv_seq(u64 rhf)
-{
-       return (rhf >> RHF_RCV_SEQ_SHIFT) & RHF_RCV_SEQ_MASK;
-}
-
-/* returned offset is in DWORDS */
-static inline u32 rhf_hdrq_offset(u64 rhf)
-{
-       return (rhf >> RHF_HDRQ_OFFSET_SHIFT) & RHF_HDRQ_OFFSET_MASK;
-}
-
-static inline u64 rhf_use_egr_bfr(u64 rhf)
-{
-       return rhf & RHF_USE_EGR_BFR_SMASK;
-}
-
-static inline u64 rhf_dc_info(u64 rhf)
-{
-       return rhf & RHF_DC_INFO_SMASK;
-}
-
-static inline u32 rhf_egr_buf_offset(u64 rhf)
-{
-       return (rhf >> RHF_EGR_OFFSET_SHIFT) & RHF_EGR_OFFSET_MASK;
-}
-#endif /* _COMMON_H */
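
The RHF shift/mask definitions above fully describe how a 64-bit receive-header-flags
word is unpacked. Below is a minimal user-space sketch of that decoding; only the field
layout is taken from the header above, while the stdint re-declarations, the sample
value and the printing are illustrative assumptions.

/* rhf_decode_sketch.c - illustrative decode of an RHF qword (layout as above) */
#include <stdint.h>
#include <stdio.h>

#define RHF_PKT_LEN_SHIFT      0
#define RHF_PKT_LEN_MASK       0xfffull
#define RHF_RCV_TYPE_SHIFT     12
#define RHF_RCV_TYPE_MASK      0x7ull
#define RHF_EGR_INDEX_SHIFT    16
#define RHF_EGR_INDEX_MASK     0x7ffull
#define RHF_ERROR_SMASK        0xffe0000000000000ull   /* bits 63:53 */

int main(void)
{
        uint64_t rhf = 0x0000000000012345ull;   /* assumed sample value */

        /* packet length is stored in DWORDs; convert to bytes (<< 2) */
        uint32_t len_bytes = (uint32_t)(((rhf >> RHF_PKT_LEN_SHIFT) & RHF_PKT_LEN_MASK) << 2);
        uint32_t rcv_type  = (uint32_t)((rhf >> RHF_RCV_TYPE_SHIFT) & RHF_RCV_TYPE_MASK);
        uint32_t egr_index = (uint32_t)((rhf >> RHF_EGR_INDEX_SHIFT) & RHF_EGR_INDEX_MASK);

        printf("len=%u bytes type=%u egr=%u err=%s\n",
               len_bytes, rcv_type, egr_index,
               (rhf & RHF_ERROR_SMASK) ? "yes" : "no");
        return 0;
}
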
diff --git a/drivers/staging/rdma/hfi1/debugfs.c b/drivers/staging/rdma/hfi1/debugfs.c
deleted file mode 100644 (file)
index dbab9d9..0000000
+++ /dev/null
@@ -1,1145 +0,0 @@
-#ifdef CONFIG_DEBUG_FS
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#include <linux/debugfs.h>
-#include <linux/seq_file.h>
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/module.h>
-
-#include "hfi.h"
-#include "debugfs.h"
-#include "device.h"
-#include "qp.h"
-#include "sdma.h"
-
-static struct dentry *hfi1_dbg_root;
-
-#define private2dd(file) (file_inode(file)->i_private)
-#define private2ppd(file) (file_inode(file)->i_private)
-
-#define DEBUGFS_SEQ_FILE_OPS(name) \
-static const struct seq_operations _##name##_seq_ops = { \
-       .start = _##name##_seq_start, \
-       .next  = _##name##_seq_next, \
-       .stop  = _##name##_seq_stop, \
-       .show  = _##name##_seq_show \
-}
-
-#define DEBUGFS_SEQ_FILE_OPEN(name) \
-static int _##name##_open(struct inode *inode, struct file *s) \
-{ \
-       struct seq_file *seq; \
-       int ret; \
-       ret =  seq_open(s, &_##name##_seq_ops); \
-       if (ret) \
-               return ret; \
-       seq = s->private_data; \
-       seq->private = inode->i_private; \
-       return 0; \
-}
-
-#define DEBUGFS_FILE_OPS(name) \
-static const struct file_operations _##name##_file_ops = { \
-       .owner   = THIS_MODULE, \
-       .open    = _##name##_open, \
-       .read    = seq_read, \
-       .llseek  = seq_lseek, \
-       .release = seq_release \
-}
-
-#define DEBUGFS_FILE_CREATE(name, parent, data, ops, mode)     \
-do { \
-       struct dentry *ent; \
-       ent = debugfs_create_file(name, mode, parent, \
-               data, ops); \
-       if (!ent) \
-               pr_warn("create of %s failed\n", name); \
-} while (0)
-
-#define DEBUGFS_SEQ_FILE_CREATE(name, parent, data) \
-       DEBUGFS_FILE_CREATE(#name, parent, data, &_##name##_file_ops, S_IRUGO)
-
-static void *_opcode_stats_seq_start(struct seq_file *s, loff_t *pos)
-__acquires(RCU)
-{
-       struct hfi1_opcode_stats_perctx *opstats;
-
-       rcu_read_lock();
-       if (*pos >= ARRAY_SIZE(opstats->stats))
-               return NULL;
-       return pos;
-}
-
-static void *_opcode_stats_seq_next(struct seq_file *s, void *v, loff_t *pos)
-{
-       struct hfi1_opcode_stats_perctx *opstats;
-
-       ++*pos;
-       if (*pos >= ARRAY_SIZE(opstats->stats))
-               return NULL;
-       return pos;
-}
-
-static void _opcode_stats_seq_stop(struct seq_file *s, void *v)
-__releases(RCU)
-{
-       rcu_read_unlock();
-}
-
-static int _opcode_stats_seq_show(struct seq_file *s, void *v)
-{
-       loff_t *spos = v;
-       loff_t i = *spos, j;
-       u64 n_packets = 0, n_bytes = 0;
-       struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
-       struct hfi1_devdata *dd = dd_from_dev(ibd);
-
-       for (j = 0; j < dd->first_user_ctxt; j++) {
-               if (!dd->rcd[j])
-                       continue;
-               n_packets += dd->rcd[j]->opstats->stats[i].n_packets;
-               n_bytes += dd->rcd[j]->opstats->stats[i].n_bytes;
-       }
-       if (!n_packets && !n_bytes)
-               return SEQ_SKIP;
-       seq_printf(s, "%02llx %llu/%llu\n", i,
-                  (unsigned long long)n_packets,
-                  (unsigned long long)n_bytes);
-
-       return 0;
-}
-
-DEBUGFS_SEQ_FILE_OPS(opcode_stats);
-DEBUGFS_SEQ_FILE_OPEN(opcode_stats)
-DEBUGFS_FILE_OPS(opcode_stats);
-
-static void *_ctx_stats_seq_start(struct seq_file *s, loff_t *pos)
-{
-       struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
-       struct hfi1_devdata *dd = dd_from_dev(ibd);
-
-       if (!*pos)
-               return SEQ_START_TOKEN;
-       if (*pos >= dd->first_user_ctxt)
-               return NULL;
-       return pos;
-}
-
-static void *_ctx_stats_seq_next(struct seq_file *s, void *v, loff_t *pos)
-{
-       struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
-       struct hfi1_devdata *dd = dd_from_dev(ibd);
-
-       if (v == SEQ_START_TOKEN)
-               return pos;
-
-       ++*pos;
-       if (*pos >= dd->first_user_ctxt)
-               return NULL;
-       return pos;
-}
-
-static void _ctx_stats_seq_stop(struct seq_file *s, void *v)
-{
-       /* nothing allocated */
-}
-
-static int _ctx_stats_seq_show(struct seq_file *s, void *v)
-{
-       loff_t *spos;
-       loff_t i, j;
-       u64 n_packets = 0;
-       struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
-       struct hfi1_devdata *dd = dd_from_dev(ibd);
-
-       if (v == SEQ_START_TOKEN) {
-               seq_puts(s, "Ctx:npkts\n");
-               return 0;
-       }
-
-       spos = v;
-       i = *spos;
-
-       if (!dd->rcd[i])
-               return SEQ_SKIP;
-
-       for (j = 0; j < ARRAY_SIZE(dd->rcd[i]->opstats->stats); j++)
-               n_packets += dd->rcd[i]->opstats->stats[j].n_packets;
-
-       if (!n_packets)
-               return SEQ_SKIP;
-
-       seq_printf(s, "  %llu:%llu\n", i, n_packets);
-       return 0;
-}
-
-DEBUGFS_SEQ_FILE_OPS(ctx_stats);
-DEBUGFS_SEQ_FILE_OPEN(ctx_stats)
-DEBUGFS_FILE_OPS(ctx_stats);
-
-static void *_qp_stats_seq_start(struct seq_file *s, loff_t *pos)
-__acquires(RCU)
-{
-       struct qp_iter *iter;
-       loff_t n = *pos;
-
-       rcu_read_lock();
-       iter = qp_iter_init(s->private);
-       if (!iter)
-               return NULL;
-
-       while (n--) {
-               if (qp_iter_next(iter)) {
-                       kfree(iter);
-                       return NULL;
-               }
-       }
-
-       return iter;
-}
-
-static void *_qp_stats_seq_next(struct seq_file *s, void *iter_ptr,
-                               loff_t *pos)
-{
-       struct qp_iter *iter = iter_ptr;
-
-       (*pos)++;
-
-       if (qp_iter_next(iter)) {
-               kfree(iter);
-               return NULL;
-       }
-
-       return iter;
-}
-
-static void _qp_stats_seq_stop(struct seq_file *s, void *iter_ptr)
-__releases(RCU)
-{
-       rcu_read_unlock();
-}
-
-static int _qp_stats_seq_show(struct seq_file *s, void *iter_ptr)
-{
-       struct qp_iter *iter = iter_ptr;
-
-       if (!iter)
-               return 0;
-
-       qp_iter_print(s, iter);
-
-       return 0;
-}
-
-DEBUGFS_SEQ_FILE_OPS(qp_stats);
-DEBUGFS_SEQ_FILE_OPEN(qp_stats)
-DEBUGFS_FILE_OPS(qp_stats);
-
-static void *_sdes_seq_start(struct seq_file *s, loff_t *pos)
-__acquires(RCU)
-{
-       struct hfi1_ibdev *ibd;
-       struct hfi1_devdata *dd;
-
-       rcu_read_lock();
-       ibd = (struct hfi1_ibdev *)s->private;
-       dd = dd_from_dev(ibd);
-       if (!dd->per_sdma || *pos >= dd->num_sdma)
-               return NULL;
-       return pos;
-}
-
-static void *_sdes_seq_next(struct seq_file *s, void *v, loff_t *pos)
-{
-       struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
-       struct hfi1_devdata *dd = dd_from_dev(ibd);
-
-       ++*pos;
-       if (!dd->per_sdma || *pos >= dd->num_sdma)
-               return NULL;
-       return pos;
-}
-
-static void _sdes_seq_stop(struct seq_file *s, void *v)
-__releases(RCU)
-{
-       rcu_read_unlock();
-}
-
-static int _sdes_seq_show(struct seq_file *s, void *v)
-{
-       struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
-       struct hfi1_devdata *dd = dd_from_dev(ibd);
-       loff_t *spos = v;
-       loff_t i = *spos;
-
-       sdma_seqfile_dump_sde(s, &dd->per_sdma[i]);
-       return 0;
-}
-
-DEBUGFS_SEQ_FILE_OPS(sdes);
-DEBUGFS_SEQ_FILE_OPEN(sdes)
-DEBUGFS_FILE_OPS(sdes);
-
-/* read the per-device counters */
-static ssize_t dev_counters_read(struct file *file, char __user *buf,
-                                size_t count, loff_t *ppos)
-{
-       u64 *counters;
-       size_t avail;
-       struct hfi1_devdata *dd;
-       ssize_t rval;
-
-       rcu_read_lock();
-       dd = private2dd(file);
-       avail = hfi1_read_cntrs(dd, NULL, &counters);
-       rval =  simple_read_from_buffer(buf, count, ppos, counters, avail);
-       rcu_read_unlock();
-       return rval;
-}
-
-/* read the per-device counter names */
-static ssize_t dev_names_read(struct file *file, char __user *buf,
-                             size_t count, loff_t *ppos)
-{
-       char *names;
-       size_t avail;
-       struct hfi1_devdata *dd;
-       ssize_t rval;
-
-       rcu_read_lock();
-       dd = private2dd(file);
-       avail = hfi1_read_cntrs(dd, &names, NULL);
-       rval =  simple_read_from_buffer(buf, count, ppos, names, avail);
-       rcu_read_unlock();
-       return rval;
-}
-
-struct counter_info {
-       char *name;
-       const struct file_operations ops;
-};
-
-/*
- * Could use file_inode(file)->i_ino to figure out which file,
- * instead of a separate routine for each, but for now, this works...
- */
-
-/* read the per-port names (same for each port) */
-static ssize_t portnames_read(struct file *file, char __user *buf,
-                             size_t count, loff_t *ppos)
-{
-       char *names;
-       size_t avail;
-       struct hfi1_devdata *dd;
-       ssize_t rval;
-
-       rcu_read_lock();
-       dd = private2dd(file);
-       avail = hfi1_read_portcntrs(dd->pport, &names, NULL);
-       rval = simple_read_from_buffer(buf, count, ppos, names, avail);
-       rcu_read_unlock();
-       return rval;
-}
-
-/* read the per-port counters */
-static ssize_t portcntrs_debugfs_read(struct file *file, char __user *buf,
-                                     size_t count, loff_t *ppos)
-{
-       u64 *counters;
-       size_t avail;
-       struct hfi1_pportdata *ppd;
-       ssize_t rval;
-
-       rcu_read_lock();
-       ppd = private2ppd(file);
-       avail = hfi1_read_portcntrs(ppd, NULL, &counters);
-       rval = simple_read_from_buffer(buf, count, ppos, counters, avail);
-       rcu_read_unlock();
-       return rval;
-}
-
-static void check_dyn_flag(u64 scratch0, char *p, int size, int *used,
-                          int this_hfi, int hfi, u32 flag, const char *what)
-{
-       u32 mask;
-
-       mask = flag << (hfi ? CR_DYN_SHIFT : 0);
-       if (scratch0 & mask) {
-               *used += scnprintf(p + *used, size - *used,
-                                  "  0x%08x - HFI%d %s in use, %s device\n",
-                                  mask, hfi, what,
-                                  this_hfi == hfi ? "this" : "other");
-       }
-}
-
-static ssize_t asic_flags_read(struct file *file, char __user *buf,
-                              size_t count, loff_t *ppos)
-{
-       struct hfi1_pportdata *ppd;
-       struct hfi1_devdata *dd;
-       u64 scratch0;
-       char *tmp;
-       int ret = 0;
-       int size;
-       int used;
-       int i;
-
-       rcu_read_lock();
-       ppd = private2ppd(file);
-       dd = ppd->dd;
-       size = PAGE_SIZE;
-       used = 0;
-       tmp = kmalloc(size, GFP_KERNEL);
-       if (!tmp) {
-               rcu_read_unlock();
-               return -ENOMEM;
-       }
-
-       scratch0 = read_csr(dd, ASIC_CFG_SCRATCH);
-       used += scnprintf(tmp + used, size - used,
-                         "Resource flags: 0x%016llx\n", scratch0);
-
-       /* check permanent flag */
-       if (scratch0 & CR_THERM_INIT) {
-               used += scnprintf(tmp + used, size - used,
-                                 "  0x%08x - thermal monitoring initialized\n",
-                                 (u32)CR_THERM_INIT);
-       }
-
-       /* check each dynamic flag on each HFI */
-       for (i = 0; i < 2; i++) {
-               check_dyn_flag(scratch0, tmp, size, &used, dd->hfi1_id, i,
-                              CR_SBUS, "SBus");
-               check_dyn_flag(scratch0, tmp, size, &used, dd->hfi1_id, i,
-                              CR_EPROM, "EPROM");
-               check_dyn_flag(scratch0, tmp, size, &used, dd->hfi1_id, i,
-                              CR_I2C1, "i2c chain 1");
-               check_dyn_flag(scratch0, tmp, size, &used, dd->hfi1_id, i,
-                              CR_I2C2, "i2c chain 2");
-       }
-       used += scnprintf(tmp + used, size - used, "Write bits to clear\n");
-
-       ret = simple_read_from_buffer(buf, count, ppos, tmp, used);
-       rcu_read_unlock();
-       kfree(tmp);
-       return ret;
-}
-
-static ssize_t asic_flags_write(struct file *file, const char __user *buf,
-                               size_t count, loff_t *ppos)
-{
-       struct hfi1_pportdata *ppd;
-       struct hfi1_devdata *dd;
-       char *buff;
-       int ret;
-       unsigned long long value;
-       u64 scratch0;
-       u64 clear;
-
-       rcu_read_lock();
-       ppd = private2ppd(file);
-       dd = ppd->dd;
-
-       buff = kmalloc(count + 1, GFP_KERNEL);
-       if (!buff) {
-               ret = -ENOMEM;
-               goto do_return;
-       }
-
-       ret = copy_from_user(buff, buf, count);
-       if (ret > 0) {
-               ret = -EFAULT;
-               goto do_free;
-       }
-
-       /* zero terminate and read the expected integer */
-       buff[count] = 0;
-       ret = kstrtoull(buff, 0, &value);
-       if (ret)
-               goto do_free;
-       clear = value;
-
-       /* obtain exclusive access */
-       mutex_lock(&dd->asic_data->asic_resource_mutex);
-       acquire_hw_mutex(dd);
-
-       scratch0 = read_csr(dd, ASIC_CFG_SCRATCH);
-       scratch0 &= ~clear;
-       write_csr(dd, ASIC_CFG_SCRATCH, scratch0);
-       /* force write to be visible to other HFI on another OS */
-       (void)read_csr(dd, ASIC_CFG_SCRATCH);
-
-       release_hw_mutex(dd);
-       mutex_unlock(&dd->asic_data->asic_resource_mutex);
-
-       /* return the number of bytes written */
-       ret = count;
-
- do_free:
-       kfree(buff);
- do_return:
-       rcu_read_unlock();
-       return ret;
-}
-
-/*
- * read the per-port QSFP data for ppd
- */
-static ssize_t qsfp_debugfs_dump(struct file *file, char __user *buf,
-                                size_t count, loff_t *ppos)
-{
-       struct hfi1_pportdata *ppd;
-       char *tmp;
-       int ret;
-
-       rcu_read_lock();
-       ppd = private2ppd(file);
-       tmp = kmalloc(PAGE_SIZE, GFP_KERNEL);
-       if (!tmp) {
-               rcu_read_unlock();
-               return -ENOMEM;
-       }
-
-       ret = qsfp_dump(ppd, tmp, PAGE_SIZE);
-       if (ret > 0)
-               ret = simple_read_from_buffer(buf, count, ppos, tmp, ret);
-       rcu_read_unlock();
-       kfree(tmp);
-       return ret;
-}
-
-/* Do an i2c write operation on the chain for the given HFI. */
-static ssize_t __i2c_debugfs_write(struct file *file, const char __user *buf,
-                                  size_t count, loff_t *ppos, u32 target)
-{
-       struct hfi1_pportdata *ppd;
-       char *buff;
-       int ret;
-       int i2c_addr;
-       int offset;
-       int total_written;
-
-       rcu_read_lock();
-       ppd = private2ppd(file);
-
-       /* byte offset format: [offsetSize][i2cAddr][offsetHigh][offsetLow] */
-       i2c_addr = (*ppos >> 16) & 0xffff;
-       offset = *ppos & 0xffff;
-
-       /* explicitly reject invalid address 0 to catch cp and cat */
-       if (i2c_addr == 0) {
-               ret = -EINVAL;
-               goto _return;
-       }
-
-       buff = kmalloc(count, GFP_KERNEL);
-       if (!buff) {
-               ret = -ENOMEM;
-               goto _return;
-       }
-
-       ret = copy_from_user(buff, buf, count);
-       if (ret > 0) {
-               ret = -EFAULT;
-               goto _free;
-       }
-
-       total_written = i2c_write(ppd, target, i2c_addr, offset, buff, count);
-       if (total_written < 0) {
-               ret = total_written;
-               goto _free;
-       }
-
-       *ppos += total_written;
-
-       ret = total_written;
-
- _free:
-       kfree(buff);
- _return:
-       rcu_read_unlock();
-       return ret;
-}
-
-/* Do an i2c write operation on chain for HFI 0. */
-static ssize_t i2c1_debugfs_write(struct file *file, const char __user *buf,
-                                 size_t count, loff_t *ppos)
-{
-       return __i2c_debugfs_write(file, buf, count, ppos, 0);
-}
-
-/* Do an i2c write operation on chain for HFI 1. */
-static ssize_t i2c2_debugfs_write(struct file *file, const char __user *buf,
-                                 size_t count, loff_t *ppos)
-{
-       return __i2c_debugfs_write(file, buf, count, ppos, 1);
-}
-
-/* Do an i2c read operation on the chain for the given HFI. */
-static ssize_t __i2c_debugfs_read(struct file *file, char __user *buf,
-                                 size_t count, loff_t *ppos, u32 target)
-{
-       struct hfi1_pportdata *ppd;
-       char *buff;
-       int ret;
-       int i2c_addr;
-       int offset;
-       int total_read;
-
-       rcu_read_lock();
-       ppd = private2ppd(file);
-
-       /* byte offset format: [offsetSize][i2cAddr][offsetHigh][offsetLow] */
-       i2c_addr = (*ppos >> 16) & 0xffff;
-       offset = *ppos & 0xffff;
-
-       /* explicitly reject invalid address 0 to catch cp and cat */
-       if (i2c_addr == 0) {
-               ret = -EINVAL;
-               goto _return;
-       }
-
-       buff = kmalloc(count, GFP_KERNEL);
-       if (!buff) {
-               ret = -ENOMEM;
-               goto _return;
-       }
-
-       total_read = i2c_read(ppd, target, i2c_addr, offset, buff, count);
-       if (total_read < 0) {
-               ret = total_read;
-               goto _free;
-       }
-
-       *ppos += total_read;
-
-       ret = copy_to_user(buf, buff, total_read);
-       if (ret > 0) {
-               ret = -EFAULT;
-               goto _free;
-       }
-
-       ret = total_read;
-
- _free:
-       kfree(buff);
- _return:
-       rcu_read_unlock();
-       return ret;
-}
-
-/* Do an i2c read operation on chain for HFI 0. */
-static ssize_t i2c1_debugfs_read(struct file *file, char __user *buf,
-                                size_t count, loff_t *ppos)
-{
-       return __i2c_debugfs_read(file, buf, count, ppos, 0);
-}
-
-/* Do an i2c read operation on chain for HFI 1. */
-static ssize_t i2c2_debugfs_read(struct file *file, char __user *buf,
-                                size_t count, loff_t *ppos)
-{
-       return __i2c_debugfs_read(file, buf, count, ppos, 1);
-}
-
-/* Do a QSFP write operation on the i2c chain for the given HFI. */
-static ssize_t __qsfp_debugfs_write(struct file *file, const char __user *buf,
-                                   size_t count, loff_t *ppos, u32 target)
-{
-       struct hfi1_pportdata *ppd;
-       char *buff;
-       int ret;
-       int total_written;
-
-       rcu_read_lock();
-       if (*ppos + count > QSFP_PAGESIZE * 4) { /* base page + page00-page03 */
-               ret = -EINVAL;
-               goto _return;
-       }
-
-       ppd = private2ppd(file);
-
-       buff = kmalloc(count, GFP_KERNEL);
-       if (!buff) {
-               ret = -ENOMEM;
-               goto _return;
-       }
-
-       ret = copy_from_user(buff, buf, count);
-       if (ret > 0) {
-               ret = -EFAULT;
-               goto _free;
-       }
-
-       total_written = qsfp_write(ppd, target, *ppos, buff, count);
-       if (total_written < 0) {
-               ret = total_written;
-               goto _free;
-       }
-
-       *ppos += total_written;
-
-       ret = total_written;
-
- _free:
-       kfree(buff);
- _return:
-       rcu_read_unlock();
-       return ret;
-}
-
-/* Do a QSFP write operation on i2c chain for HFI 0. */
-static ssize_t qsfp1_debugfs_write(struct file *file, const char __user *buf,
-                                  size_t count, loff_t *ppos)
-{
-       return __qsfp_debugfs_write(file, buf, count, ppos, 0);
-}
-
-/* Do a QSFP write operation on i2c chain for HFI 1. */
-static ssize_t qsfp2_debugfs_write(struct file *file, const char __user *buf,
-                                  size_t count, loff_t *ppos)
-{
-       return __qsfp_debugfs_write(file, buf, count, ppos, 1);
-}
-
-/* Do a QSFP read operation on the i2c chain for the given HFI. */
-static ssize_t __qsfp_debugfs_read(struct file *file, char __user *buf,
-                                  size_t count, loff_t *ppos, u32 target)
-{
-       struct hfi1_pportdata *ppd;
-       char *buff;
-       int ret;
-       int total_read;
-
-       rcu_read_lock();
-       if (*ppos + count > QSFP_PAGESIZE * 4) { /* base page + page00-page03 */
-               ret = -EINVAL;
-               goto _return;
-       }
-
-       ppd = private2ppd(file);
-
-       buff = kmalloc(count, GFP_KERNEL);
-       if (!buff) {
-               ret = -ENOMEM;
-               goto _return;
-       }
-
-       total_read = qsfp_read(ppd, target, *ppos, buff, count);
-       if (total_read < 0) {
-               ret = total_read;
-               goto _free;
-       }
-
-       *ppos += total_read;
-
-       ret = copy_to_user(buf, buff, total_read);
-       if (ret > 0) {
-               ret = -EFAULT;
-               goto _free;
-       }
-
-       ret = total_read;
-
- _free:
-       kfree(buff);
- _return:
-       rcu_read_unlock();
-       return ret;
-}
-
-/* Do a QSFP read operation on i2c chain for HFI 0. */
-static ssize_t qsfp1_debugfs_read(struct file *file, char __user *buf,
-                                 size_t count, loff_t *ppos)
-{
-       return __qsfp_debugfs_read(file, buf, count, ppos, 0);
-}
-
-/* Do a QSFP read operation on i2c chain for HFI 1. */
-static ssize_t qsfp2_debugfs_read(struct file *file, char __user *buf,
-                                 size_t count, loff_t *ppos)
-{
-       return __qsfp_debugfs_read(file, buf, count, ppos, 1);
-}
-
-static int __i2c_debugfs_open(struct inode *in, struct file *fp, u32 target)
-{
-       struct hfi1_pportdata *ppd;
-       int ret;
-
-       if (!try_module_get(THIS_MODULE))
-               return -ENODEV;
-
-       ppd = private2ppd(fp);
-
-       ret = acquire_chip_resource(ppd->dd, i2c_target(target), 0);
-       if (ret) /* failed - release the module */
-               module_put(THIS_MODULE);
-
-       return ret;
-}
-
-static int i2c1_debugfs_open(struct inode *in, struct file *fp)
-{
-       return __i2c_debugfs_open(in, fp, 0);
-}
-
-static int i2c2_debugfs_open(struct inode *in, struct file *fp)
-{
-       return __i2c_debugfs_open(in, fp, 1);
-}
-
-static int __i2c_debugfs_release(struct inode *in, struct file *fp, u32 target)
-{
-       struct hfi1_pportdata *ppd;
-
-       ppd = private2ppd(fp);
-
-       release_chip_resource(ppd->dd, i2c_target(target));
-       module_put(THIS_MODULE);
-
-       return 0;
-}
-
-static int i2c1_debugfs_release(struct inode *in, struct file *fp)
-{
-       return __i2c_debugfs_release(in, fp, 0);
-}
-
-static int i2c2_debugfs_release(struct inode *in, struct file *fp)
-{
-       return __i2c_debugfs_release(in, fp, 1);
-}
-
-static int __qsfp_debugfs_open(struct inode *in, struct file *fp, u32 target)
-{
-       struct hfi1_pportdata *ppd;
-       int ret;
-
-       if (!try_module_get(THIS_MODULE))
-               return -ENODEV;
-
-       ppd = private2ppd(fp);
-
-       ret = acquire_chip_resource(ppd->dd, i2c_target(target), 0);
-       if (ret) /* failed - release the module */
-               module_put(THIS_MODULE);
-
-       return ret;
-}
-
-static int qsfp1_debugfs_open(struct inode *in, struct file *fp)
-{
-       return __qsfp_debugfs_open(in, fp, 0);
-}
-
-static int qsfp2_debugfs_open(struct inode *in, struct file *fp)
-{
-       return __qsfp_debugfs_open(in, fp, 1);
-}
-
-static int __qsfp_debugfs_release(struct inode *in, struct file *fp, u32 target)
-{
-       struct hfi1_pportdata *ppd;
-
-       ppd = private2ppd(fp);
-
-       release_chip_resource(ppd->dd, i2c_target(target));
-       module_put(THIS_MODULE);
-
-       return 0;
-}
-
-static int qsfp1_debugfs_release(struct inode *in, struct file *fp)
-{
-       return __qsfp_debugfs_release(in, fp, 0);
-}
-
-static int qsfp2_debugfs_release(struct inode *in, struct file *fp)
-{
-       return __qsfp_debugfs_release(in, fp, 1);
-}
-
-#define DEBUGFS_OPS(nm, readroutine, writeroutine)     \
-{ \
-       .name = nm, \
-       .ops = { \
-               .read = readroutine, \
-               .write = writeroutine, \
-               .llseek = generic_file_llseek, \
-       }, \
-}
-
-#define DEBUGFS_XOPS(nm, readf, writef, openf, releasef) \
-{ \
-       .name = nm, \
-       .ops = { \
-               .read = readf, \
-               .write = writef, \
-               .llseek = generic_file_llseek, \
-               .open = openf, \
-               .release = releasef \
-       }, \
-}
-
-static const struct counter_info cntr_ops[] = {
-       DEBUGFS_OPS("counter_names", dev_names_read, NULL),
-       DEBUGFS_OPS("counters", dev_counters_read, NULL),
-       DEBUGFS_OPS("portcounter_names", portnames_read, NULL),
-};
-
-static const struct counter_info port_cntr_ops[] = {
-       DEBUGFS_OPS("port%dcounters", portcntrs_debugfs_read, NULL),
-       DEBUGFS_XOPS("i2c1", i2c1_debugfs_read, i2c1_debugfs_write,
-                    i2c1_debugfs_open, i2c1_debugfs_release),
-       DEBUGFS_XOPS("i2c2", i2c2_debugfs_read, i2c2_debugfs_write,
-                    i2c2_debugfs_open, i2c2_debugfs_release),
-       DEBUGFS_OPS("qsfp_dump%d", qsfp_debugfs_dump, NULL),
-       DEBUGFS_XOPS("qsfp1", qsfp1_debugfs_read, qsfp1_debugfs_write,
-                    qsfp1_debugfs_open, qsfp1_debugfs_release),
-       DEBUGFS_XOPS("qsfp2", qsfp2_debugfs_read, qsfp2_debugfs_write,
-                    qsfp2_debugfs_open, qsfp2_debugfs_release),
-       DEBUGFS_OPS("asic_flags", asic_flags_read, asic_flags_write),
-};
-
-void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd)
-{
-       char name[sizeof("port0counters") + 1];
-       char link[10];
-       struct hfi1_devdata *dd = dd_from_dev(ibd);
-       struct hfi1_pportdata *ppd;
-       int unit = dd->unit;
-       int i, j;
-
-       if (!hfi1_dbg_root)
-               return;
-       snprintf(name, sizeof(name), "%s_%d", class_name(), unit);
-       snprintf(link, sizeof(link), "%d", unit);
-       ibd->hfi1_ibdev_dbg = debugfs_create_dir(name, hfi1_dbg_root);
-       if (!ibd->hfi1_ibdev_dbg) {
-               pr_warn("create of %s failed\n", name);
-               return;
-       }
-       ibd->hfi1_ibdev_link =
-               debugfs_create_symlink(link, hfi1_dbg_root, name);
-       if (!ibd->hfi1_ibdev_link) {
-               pr_warn("create of %s symlink failed\n", name);
-               return;
-       }
-       DEBUGFS_SEQ_FILE_CREATE(opcode_stats, ibd->hfi1_ibdev_dbg, ibd);
-       DEBUGFS_SEQ_FILE_CREATE(ctx_stats, ibd->hfi1_ibdev_dbg, ibd);
-       DEBUGFS_SEQ_FILE_CREATE(qp_stats, ibd->hfi1_ibdev_dbg, ibd);
-       DEBUGFS_SEQ_FILE_CREATE(sdes, ibd->hfi1_ibdev_dbg, ibd);
-       /* dev counter files */
-       for (i = 0; i < ARRAY_SIZE(cntr_ops); i++)
-               DEBUGFS_FILE_CREATE(cntr_ops[i].name,
-                                   ibd->hfi1_ibdev_dbg,
-                                   dd,
-                                   &cntr_ops[i].ops, S_IRUGO);
-       /* per port files */
-       for (ppd = dd->pport, j = 0; j < dd->num_pports; j++, ppd++)
-               for (i = 0; i < ARRAY_SIZE(port_cntr_ops); i++) {
-                       snprintf(name,
-                                sizeof(name),
-                                port_cntr_ops[i].name,
-                                j + 1);
-                       DEBUGFS_FILE_CREATE(name,
-                                           ibd->hfi1_ibdev_dbg,
-                                           ppd,
-                                           &port_cntr_ops[i].ops,
-                                           !port_cntr_ops[i].ops.write ?
-                                           S_IRUGO : S_IRUGO | S_IWUSR);
-               }
-}
-
-void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd)
-{
-       if (!hfi1_dbg_root)
-               goto out;
-       debugfs_remove(ibd->hfi1_ibdev_link);
-       debugfs_remove_recursive(ibd->hfi1_ibdev_dbg);
-out:
-       ibd->hfi1_ibdev_dbg = NULL;
-       synchronize_rcu();
-}
-
-/*
- * driver stats field names, one line per stat, single string.  Used by
- * programs like hfistats to print the stats in a way which works for
- * different versions of drivers, without changing program source.
- * if hfi1_ib_stats changes, this needs to change.  Names need to be
- * 12 chars or less (w/o newline), for proper display by hfistats utility.
- */
-static const char * const hfi1_statnames[] = {
-       /* must be element 0*/
-       "KernIntr",
-       "ErrorIntr",
-       "Tx_Errs",
-       "Rcv_Errs",
-       "H/W_Errs",
-       "NoPIOBufs",
-       "CtxtsOpen",
-       "RcvLen_Errs",
-       "EgrBufFull",
-       "EgrHdrFull"
-};
-
-static void *_driver_stats_names_seq_start(struct seq_file *s, loff_t *pos)
-__acquires(RCU)
-{
-       rcu_read_lock();
-       if (*pos >= ARRAY_SIZE(hfi1_statnames))
-               return NULL;
-       return pos;
-}
-
-static void *_driver_stats_names_seq_next(
-       struct seq_file *s,
-       void *v,
-       loff_t *pos)
-{
-       ++*pos;
-       if (*pos >= ARRAY_SIZE(hfi1_statnames))
-               return NULL;
-       return pos;
-}
-
-static void _driver_stats_names_seq_stop(struct seq_file *s, void *v)
-__releases(RCU)
-{
-       rcu_read_unlock();
-}
-
-static int _driver_stats_names_seq_show(struct seq_file *s, void *v)
-{
-       loff_t *spos = v;
-
-       seq_printf(s, "%s\n", hfi1_statnames[*spos]);
-       return 0;
-}
-
-DEBUGFS_SEQ_FILE_OPS(driver_stats_names);
-DEBUGFS_SEQ_FILE_OPEN(driver_stats_names)
-DEBUGFS_FILE_OPS(driver_stats_names);
-
-static void *_driver_stats_seq_start(struct seq_file *s, loff_t *pos)
-__acquires(RCU)
-{
-       rcu_read_lock();
-       if (*pos >= ARRAY_SIZE(hfi1_statnames))
-               return NULL;
-       return pos;
-}
-
-static void *_driver_stats_seq_next(struct seq_file *s, void *v, loff_t *pos)
-{
-       ++*pos;
-       if (*pos >= ARRAY_SIZE(hfi1_statnames))
-               return NULL;
-       return pos;
-}
-
-static void _driver_stats_seq_stop(struct seq_file *s, void *v)
-__releases(RCU)
-{
-       rcu_read_unlock();
-}
-
-static u64 hfi1_sps_ints(void)
-{
-       unsigned long flags;
-       struct hfi1_devdata *dd;
-       u64 sps_ints = 0;
-
-       spin_lock_irqsave(&hfi1_devs_lock, flags);
-       list_for_each_entry(dd, &hfi1_dev_list, list) {
-               sps_ints += get_all_cpu_total(dd->int_counter);
-       }
-       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
-       return sps_ints;
-}
-
-static int _driver_stats_seq_show(struct seq_file *s, void *v)
-{
-       loff_t *spos = v;
-       char *buffer;
-       u64 *stats = (u64 *)&hfi1_stats;
-       size_t sz = seq_get_buf(s, &buffer);
-
-       if (sz < sizeof(u64))
-               return SEQ_SKIP;
-       /* special case for interrupts */
-       if (*spos == 0)
-               *(u64 *)buffer = hfi1_sps_ints();
-       else
-               *(u64 *)buffer = stats[*spos];
-       seq_commit(s,  sizeof(u64));
-       return 0;
-}
-
-DEBUGFS_SEQ_FILE_OPS(driver_stats);
-DEBUGFS_SEQ_FILE_OPEN(driver_stats)
-DEBUGFS_FILE_OPS(driver_stats);
-
-void hfi1_dbg_init(void)
-{
-       hfi1_dbg_root  = debugfs_create_dir(DRIVER_NAME, NULL);
-       if (!hfi1_dbg_root)
-               pr_warn("init of debugfs failed\n");
-       DEBUGFS_SEQ_FILE_CREATE(driver_stats_names, hfi1_dbg_root, NULL);
-       DEBUGFS_SEQ_FILE_CREATE(driver_stats, hfi1_dbg_root, NULL);
-}
-
-void hfi1_dbg_exit(void)
-{
-       debugfs_remove_recursive(hfi1_dbg_root);
-       hfi1_dbg_root = NULL;
-}
-
-#endif
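
The __i2c_debugfs_read()/__i2c_debugfs_write() handlers above encode their target in
the file position: bits 31:16 of *ppos carry the i2c address and bits 15:0 the register
offset, with address 0 rejected. A user-space sketch of building such an offset and
reading through one of the per-port i2c files follows; the debugfs mount point, the
exact device path and the addresses are assumptions for illustration only.

/* i2c_debugfs_read_sketch.c - illustrative only */
#define _XOPEN_SOURCE 700
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* path and addresses are assumptions, not taken from the driver */
        const char *path = "/sys/kernel/debug/hfi1/hfi1_0/i2c1";
        int i2c_addr = 0x50;            /* example device address field */
        int reg_off  = 0x00;            /* register offset within the device */
        uint8_t buf[16];

        /* offset format used by __i2c_debugfs_read/write above:
         * bits 31:16 = i2c address, bits 15:0 = offset
         */
        off_t pos = ((off_t)i2c_addr << 16) | reg_off;

        int fd = open(path, O_RDONLY);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        ssize_t n = pread(fd, buf, sizeof(buf), pos);
        if (n < 0)
                perror("pread");
        else
                printf("read %zd bytes from i2c addr 0x%x\n", n, i2c_addr);

        close(fd);
        return 0;
}
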
diff --git a/drivers/staging/rdma/hfi1/debugfs.h b/drivers/staging/rdma/hfi1/debugfs.h
deleted file mode 100644 (file)
index b6fb681..0000000
+++ /dev/null
@@ -1,75 +0,0 @@
-#ifndef _HFI1_DEBUGFS_H
-#define _HFI1_DEBUGFS_H
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-struct hfi1_ibdev;
-#ifdef CONFIG_DEBUG_FS
-void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd);
-void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd);
-void hfi1_dbg_init(void);
-void hfi1_dbg_exit(void);
-#else
-static inline void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd)
-{
-}
-
-static inline void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd)
-{
-}
-
-static inline void hfi1_dbg_init(void)
-{
-}
-
-static inline void hfi1_dbg_exit(void)
-{
-}
-
-#endif
-
-#endif                          /* _HFI1_DEBUGFS_H */
diff --git a/drivers/staging/rdma/hfi1/device.c b/drivers/staging/rdma/hfi1/device.c
deleted file mode 100644 (file)
index c05c39d..0000000
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/cdev.h>
-#include <linux/module.h>
-#include <linux/device.h>
-#include <linux/fs.h>
-
-#include "hfi.h"
-#include "device.h"
-
-static struct class *class;
-static struct class *user_class;
-static dev_t hfi1_dev;
-
-int hfi1_cdev_init(int minor, const char *name,
-                  const struct file_operations *fops,
-                  struct cdev *cdev, struct device **devp,
-                  bool user_accessible)
-{
-       const dev_t dev = MKDEV(MAJOR(hfi1_dev), minor);
-       struct device *device = NULL;
-       int ret;
-
-       cdev_init(cdev, fops);
-       cdev->owner = THIS_MODULE;
-       kobject_set_name(&cdev->kobj, name);
-
-       ret = cdev_add(cdev, dev, 1);
-       if (ret < 0) {
-               pr_err("Could not add cdev for minor %d, %s (err %d)\n",
-                      minor, name, -ret);
-               goto done;
-       }
-
-       if (user_accessible)
-               device = device_create(user_class, NULL, dev, NULL, "%s", name);
-       else
-               device = device_create(class, NULL, dev, NULL, "%s", name);
-
-       if (!IS_ERR(device))
-               goto done;
-       ret = PTR_ERR(device);
-       device = NULL;
-       pr_err("Could not create device for minor %d, %s (err %d)\n",
-              minor, name, -ret);
-       cdev_del(cdev);
-done:
-       *devp = device;
-       return ret;
-}
-
-void hfi1_cdev_cleanup(struct cdev *cdev, struct device **devp)
-{
-       struct device *device = *devp;
-
-       if (device) {
-               device_unregister(device);
-               *devp = NULL;
-
-               cdev_del(cdev);
-       }
-}
-
-static const char *hfi1_class_name = "hfi1";
-
-const char *class_name(void)
-{
-       return hfi1_class_name;
-}
-
-static char *hfi1_devnode(struct device *dev, umode_t *mode)
-{
-       if (mode)
-               *mode = 0600;
-       return kasprintf(GFP_KERNEL, "%s", dev_name(dev));
-}
-
-static const char *hfi1_class_name_user = "hfi1_user";
-static const char *class_name_user(void)
-{
-       return hfi1_class_name_user;
-}
-
-static char *hfi1_user_devnode(struct device *dev, umode_t *mode)
-{
-       if (mode)
-               *mode = 0666;
-       return kasprintf(GFP_KERNEL, "%s", dev_name(dev));
-}
-
-int __init dev_init(void)
-{
-       int ret;
-
-       ret = alloc_chrdev_region(&hfi1_dev, 0, HFI1_NMINORS, DRIVER_NAME);
-       if (ret < 0) {
-               pr_err("Could not allocate chrdev region (err %d)\n", -ret);
-               goto done;
-       }
-
-       class = class_create(THIS_MODULE, class_name());
-       if (IS_ERR(class)) {
-               ret = PTR_ERR(class);
-               pr_err("Could not create device class (err %d)\n", -ret);
-               unregister_chrdev_region(hfi1_dev, HFI1_NMINORS);
-               goto done;
-       }
-       class->devnode = hfi1_devnode;
-
-       user_class = class_create(THIS_MODULE, class_name_user());
-       if (IS_ERR(user_class)) {
-               ret = PTR_ERR(user_class);
-               pr_err("Could not create device class for user accessible files (err %d)\n",
-                      -ret);
-               class_destroy(class);
-               class = NULL;
-               user_class = NULL;
-               unregister_chrdev_region(hfi1_dev, HFI1_NMINORS);
-               goto done;
-       }
-       user_class->devnode = hfi1_user_devnode;
-
-done:
-       return ret;
-}
-
-void dev_cleanup(void)
-{
-       class_destroy(class);
-       class = NULL;
-
-       class_destroy(user_class);
-       user_class = NULL;
-
-       unregister_chrdev_region(hfi1_dev, HFI1_NMINORS);
-}
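
hfi1_cdev_init() and hfi1_cdev_cleanup() above form a small pairing API around
cdev_add()/device_create(); a sketch of how a caller might use them follows. The
minor number, device name and file_operations are hypothetical placeholders, and only
the two function signatures come from the code above.

/* Illustrative caller sketch only; not taken from the driver itself. */
#include <linux/cdev.h>
#include <linux/fs.h>
#include <linux/module.h>
#include "device.h"

static const struct file_operations example_fops = {
        .owner = THIS_MODULE,
};
static struct cdev example_cdev;
static struct device *example_device;

static int example_setup(void)
{
        /* minor 0, not exposed through the user-accessible class */
        return hfi1_cdev_init(0, "hfi1_example", &example_fops,
                              &example_cdev, &example_device, false);
}

static void example_teardown(void)
{
        hfi1_cdev_cleanup(&example_cdev, &example_device);
}
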
diff --git a/drivers/staging/rdma/hfi1/device.h b/drivers/staging/rdma/hfi1/device.h
deleted file mode 100644 (file)
index 5bb3e83..0000000
+++ /dev/null
@@ -1,59 +0,0 @@
-#ifndef _HFI1_DEVICE_H
-#define _HFI1_DEVICE_H
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-int hfi1_cdev_init(int minor, const char *name,
-                  const struct file_operations *fops,
-                  struct cdev *cdev, struct device **devp,
-                  bool user_accessible);
-void hfi1_cdev_cleanup(struct cdev *cdev, struct device **devp);
-const char *class_name(void);
-int __init dev_init(void);
-void dev_cleanup(void);
-
-#endif                          /* _HFI1_DEVICE_H */
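(For context on the interface removed above: hfi1_cdev_init()/hfi1_cdev_cleanup() are the char-device helpers that the removed diag.c below relies on to create its per-unit snoop/capture and diagpkt nodes. The sketch that follows shows that usage pattern as it appears in the removed code; the wrapper name register_diagpkt_dev and its error message are illustrative additions, not part of the driver.)

    /*
     * Illustrative sketch only: how diag.c registers the diagpkt char device
     * through the helpers declared in device.h above.  HFI1_DIAGPKT_MINOR,
     * diagpkt_file_ops, diagpkt_cdev and diagpkt_device are names from the
     * removed code; the wrapper itself is hypothetical.
     */
    static int register_diagpkt_dev(const char *name)
    {
            int ret;

            ret = hfi1_cdev_init(HFI1_DIAGPKT_MINOR, name, &diagpkt_file_ops,
                                 &diagpkt_cdev, &diagpkt_device,
                                 false /* not user accessible */);
            if (ret)
                    pr_err("Could not create %s device: %d\n", name, ret);
            return ret;
    }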
diff --git a/drivers/staging/rdma/hfi1/diag.c b/drivers/staging/rdma/hfi1/diag.c
deleted file mode 100644 (file)
index bb2409a..0000000
+++ /dev/null
@@ -1,1925 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-/*
- * This file contains support for diagnostic functions.  It is accessed by
- * opening the hfi1_diag device, normally minor number 129.  Diagnostic use
- * of the chip may render the chip or board unusable until the driver
- * is unloaded, or in some cases, until the system is rebooted.
- *
- * Accesses to the chip through this interface are not similar to going
- * through the /sys/bus/pci resource mmap interface.
- */
-
-#include <linux/io.h>
-#include <linux/pci.h>
-#include <linux/poll.h>
-#include <linux/vmalloc.h>
-#include <linux/export.h>
-#include <linux/fs.h>
-#include <linux/uaccess.h>
-#include <linux/module.h>
-#include <rdma/ib_smi.h>
-#include "hfi.h"
-#include "device.h"
-#include "common.h"
-#include "verbs_txreq.h"
-#include "trace.h"
-
-#undef pr_fmt
-#define pr_fmt(fmt) DRIVER_NAME ": " fmt
-#define snoop_dbg(fmt, ...) \
-       hfi1_cdbg(SNOOP, fmt, ##__VA_ARGS__)
-
-/* Snoop option mask */
-#define SNOOP_DROP_SEND                BIT(0)
-#define SNOOP_USE_METADATA     BIT(1)
-#define SNOOP_SET_VL0TOVL15     BIT(2)
-
-static u8 snoop_flags;
-
-/*
- * Extract packet length from LRH header.
- * This is in Dwords so multiply by 4 to get size in bytes
- */
-#define HFI1_GET_PKT_LEN(x)      (((be16_to_cpu((x)->lrh[2]) & 0xFFF)) << 2)
-
-enum hfi1_filter_status {
-       HFI1_FILTER_HIT,
-       HFI1_FILTER_ERR,
-       HFI1_FILTER_MISS
-};
-
-/* snoop processing functions */
-rhf_rcv_function_ptr snoop_rhf_rcv_functions[8] = {
-       [RHF_RCV_TYPE_EXPECTED] = snoop_recv_handler,
-       [RHF_RCV_TYPE_EAGER]    = snoop_recv_handler,
-       [RHF_RCV_TYPE_IB]       = snoop_recv_handler,
-       [RHF_RCV_TYPE_ERROR]    = snoop_recv_handler,
-       [RHF_RCV_TYPE_BYPASS]   = snoop_recv_handler,
-       [RHF_RCV_TYPE_INVALID5] = process_receive_invalid,
-       [RHF_RCV_TYPE_INVALID6] = process_receive_invalid,
-       [RHF_RCV_TYPE_INVALID7] = process_receive_invalid
-};
-
-/* Snoop packet structure */
-struct snoop_packet {
-       struct list_head list;
-       u32 total_len;
-       u8 data[];
-};
-
-/* Do not make these an enum or it will blow up the capture_md */
-#define PKT_DIR_EGRESS 0x0
-#define PKT_DIR_INGRESS 0x1
-
-/* Packet capture metadata returned to the user with the packet. */
-struct capture_md {
-       u8 port;
-       u8 dir;
-       u8 reserved[6];
-       union {
-               u64 pbc;
-               u64 rhf;
-       } u;
-};
-
-static atomic_t diagpkt_count = ATOMIC_INIT(0);
-static struct cdev diagpkt_cdev;
-static struct device *diagpkt_device;
-
-static ssize_t diagpkt_write(struct file *fp, const char __user *data,
-                            size_t count, loff_t *off);
-
-static const struct file_operations diagpkt_file_ops = {
-       .owner = THIS_MODULE,
-       .write = diagpkt_write,
-       .llseek = noop_llseek,
-};
-
-/*
- * This is used for communication with user space for snoop extended IOCTLs
- */
-struct hfi1_link_info {
-       __be64 node_guid;
-       u8 port_mode;
-       u8 port_state;
-       u16 link_speed_active;
-       u16 link_width_active;
-       u16 vl15_init;
-       u8 port_number;
-       /*
-        * Add padding to make this a full IB SMP payload. Note: changing the
-        * size of this structure will make the IOCTLs created with _IOWR
-        * change.
-        * Be sure to run tests on all IOCTLs when making changes to this
-        * structure.
-        */
-       u8 res[47];
-};
-
-/*
- * This starts our ioctl sequence numbers *way* off from the ones
- * defined in ib_core.
- */
-#define SNOOP_CAPTURE_VERSION 0x1
-
-#define IB_IOCTL_MAGIC          0x1b /* See Documentation/ioctl-number.txt */
-#define HFI1_SNOOP_IOC_MAGIC IB_IOCTL_MAGIC
-#define HFI1_SNOOP_IOC_BASE_SEQ 0x80
-
-#define HFI1_SNOOP_IOCGETLINKSTATE \
-       _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ)
-#define HFI1_SNOOP_IOCSETLINKSTATE \
-       _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ + 1)
-#define HFI1_SNOOP_IOCCLEARQUEUE \
-       _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ + 2)
-#define HFI1_SNOOP_IOCCLEARFILTER \
-       _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ + 3)
-#define HFI1_SNOOP_IOCSETFILTER \
-       _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ + 4)
-#define HFI1_SNOOP_IOCGETVERSION \
-       _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ + 5)
-#define HFI1_SNOOP_IOCSET_OPTS \
-       _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ + 6)
-
-/*
- * These offsets +6/+7 could change, but these are already known and used
- * IOCTL numbers so don't change them without a good reason.
- */
-#define HFI1_SNOOP_IOCGETLINKSTATE_EXTRA \
-       _IOWR(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ + 6, \
-               struct hfi1_link_info)
-#define HFI1_SNOOP_IOCSETLINKSTATE_EXTRA \
-       _IOWR(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ + 7, \
-               struct hfi1_link_info)
-
-static int hfi1_snoop_open(struct inode *in, struct file *fp);
-static ssize_t hfi1_snoop_read(struct file *fp, char __user *data,
-                              size_t pkt_len, loff_t *off);
-static ssize_t hfi1_snoop_write(struct file *fp, const char __user *data,
-                               size_t count, loff_t *off);
-static long hfi1_ioctl(struct file *fp, unsigned int cmd, unsigned long arg);
-static unsigned int hfi1_snoop_poll(struct file *fp,
-                                   struct poll_table_struct *wait);
-static int hfi1_snoop_release(struct inode *in, struct file *fp);
-
-struct hfi1_packet_filter_command {
-       int opcode;
-       int length;
-       void *value_ptr;
-};
-
-/* Can't re-use PKT_DIR_*GRESS here because 0 means no packets for this */
-#define HFI1_SNOOP_INGRESS 0x1
-#define HFI1_SNOOP_EGRESS  0x2
-
-enum hfi1_packet_filter_opcodes {
-       FILTER_BY_LID,
-       FILTER_BY_DLID,
-       FILTER_BY_MAD_MGMT_CLASS,
-       FILTER_BY_QP_NUMBER,
-       FILTER_BY_PKT_TYPE,
-       FILTER_BY_SERVICE_LEVEL,
-       FILTER_BY_PKEY,
-       FILTER_BY_DIRECTION,
-};
-
-static const struct file_operations snoop_file_ops = {
-       .owner = THIS_MODULE,
-       .open = hfi1_snoop_open,
-       .read = hfi1_snoop_read,
-       .unlocked_ioctl = hfi1_ioctl,
-       .poll = hfi1_snoop_poll,
-       .write = hfi1_snoop_write,
-       .release = hfi1_snoop_release
-};
-
-struct hfi1_filter_array {
-       int (*filter)(void *, void *, void *);
-};
-
-static int hfi1_filter_lid(void *ibhdr, void *packet_data, void *value);
-static int hfi1_filter_dlid(void *ibhdr, void *packet_data, void *value);
-static int hfi1_filter_mad_mgmt_class(void *ibhdr, void *packet_data,
-                                     void *value);
-static int hfi1_filter_qp_number(void *ibhdr, void *packet_data, void *value);
-static int hfi1_filter_ibpacket_type(void *ibhdr, void *packet_data,
-                                    void *value);
-static int hfi1_filter_ib_service_level(void *ibhdr, void *packet_data,
-                                       void *value);
-static int hfi1_filter_ib_pkey(void *ibhdr, void *packet_data, void *value);
-static int hfi1_filter_direction(void *ibhdr, void *packet_data, void *value);
-
-static const struct hfi1_filter_array hfi1_filters[] = {
-       { hfi1_filter_lid },
-       { hfi1_filter_dlid },
-       { hfi1_filter_mad_mgmt_class },
-       { hfi1_filter_qp_number },
-       { hfi1_filter_ibpacket_type },
-       { hfi1_filter_ib_service_level },
-       { hfi1_filter_ib_pkey },
-       { hfi1_filter_direction },
-};
-
-#define HFI1_MAX_FILTERS       ARRAY_SIZE(hfi1_filters)
-#define HFI1_DIAG_MINOR_BASE   129
-
-static int hfi1_snoop_add(struct hfi1_devdata *dd, const char *name);
-
-int hfi1_diag_add(struct hfi1_devdata *dd)
-{
-       char name[16];
-       int ret = 0;
-
-       snprintf(name, sizeof(name), "%s_diagpkt%d", class_name(),
-                dd->unit);
-       /*
-        * Do this for each device as opposed to the normal diagpkt
-        * interface which is one per host
-        */
-       ret = hfi1_snoop_add(dd, name);
-       if (ret)
-               dd_dev_err(dd, "Unable to init snoop/capture device");
-
-       snprintf(name, sizeof(name), "%s_diagpkt", class_name());
-       if (atomic_inc_return(&diagpkt_count) == 1) {
-               ret = hfi1_cdev_init(HFI1_DIAGPKT_MINOR, name,
-                                    &diagpkt_file_ops, &diagpkt_cdev,
-                                    &diagpkt_device, false);
-       }
-
-       return ret;
-}
-
-/* this must be called with dd->hfi1_snoop.snoop_lock held */
-static void drain_snoop_list(struct list_head *queue)
-{
-       struct list_head *pos, *q;
-       struct snoop_packet *packet;
-
-       list_for_each_safe(pos, q, queue) {
-               packet = list_entry(pos, struct snoop_packet, list);
-               list_del(pos);
-               kfree(packet);
-       }
-}
-
-static void hfi1_snoop_remove(struct hfi1_devdata *dd)
-{
-       unsigned long flags = 0;
-
-       spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
-       drain_snoop_list(&dd->hfi1_snoop.queue);
-       hfi1_cdev_cleanup(&dd->hfi1_snoop.cdev, &dd->hfi1_snoop.class_dev);
-       spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
-}
-
-void hfi1_diag_remove(struct hfi1_devdata *dd)
-{
-       hfi1_snoop_remove(dd);
-       if (atomic_dec_and_test(&diagpkt_count))
-               hfi1_cdev_cleanup(&diagpkt_cdev, &diagpkt_device);
-       hfi1_cdev_cleanup(&dd->diag_cdev, &dd->diag_device);
-}
-
-/*
- * Allocated structure shared between the credit return mechanism and
- * diagpkt_send().
- */
-struct diagpkt_wait {
-       struct completion credits_returned;
-       int code;
-       atomic_t count;
-};
-
-/*
- * When each side is finished with the structure, they call this.
- * The last user frees the structure.
- */
-static void put_diagpkt_wait(struct diagpkt_wait *wait)
-{
-       if (atomic_dec_and_test(&wait->count))
-               kfree(wait);
-}
-
-/*
- * Callback from the credit return code.  Set the completion, which
- * will let diagpkt_send() continue.
- */
-static void diagpkt_complete(void *arg, int code)
-{
-       struct diagpkt_wait *wait = (struct diagpkt_wait *)arg;
-
-       wait->code = code;
-       complete(&wait->credits_returned);
-       put_diagpkt_wait(wait); /* finished with the structure */
-}
-
-/**
- * diagpkt_send - send a packet
- * @dp: diag packet descriptor
- */
-static ssize_t diagpkt_send(struct diag_pkt *dp)
-{
-       struct hfi1_devdata *dd;
-       struct send_context *sc;
-       struct pio_buf *pbuf;
-       u32 *tmpbuf = NULL;
-       ssize_t ret = 0;
-       u32 pkt_len, total_len;
-       pio_release_cb credit_cb = NULL;
-       void *credit_arg = NULL;
-       struct diagpkt_wait *wait = NULL;
-       int trycount = 0;
-
-       dd = hfi1_lookup(dp->unit);
-       if (!dd || !(dd->flags & HFI1_PRESENT) || !dd->kregbase) {
-               ret = -ENODEV;
-               goto bail;
-       }
-       if (!(dd->flags & HFI1_INITTED)) {
-               /* no hardware, freeze, etc. */
-               ret = -ENODEV;
-               goto bail;
-       }
-
-       if (dp->version != _DIAG_PKT_VERS) {
-               dd_dev_err(dd, "Invalid version %u for diagpkt_write\n",
-                          dp->version);
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       /* send count must be an exact number of dwords */
-       if (dp->len & 3) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       /* there is only port 1 */
-       if (dp->port != 1) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       /* need a valid context */
-       if (dp->sw_index >= dd->num_send_contexts) {
-               ret = -EINVAL;
-               goto bail;
-       }
-       /* can only use kernel contexts */
-       if (dd->send_contexts[dp->sw_index].type != SC_KERNEL &&
-           dd->send_contexts[dp->sw_index].type != SC_VL15) {
-               ret = -EINVAL;
-               goto bail;
-       }
-       /* must be allocated */
-       sc = dd->send_contexts[dp->sw_index].sc;
-       if (!sc) {
-               ret = -EINVAL;
-               goto bail;
-       }
-       /* must be enabled */
-       if (!(sc->flags & SCF_ENABLED)) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       /* allocate a buffer and copy the data in */
-       tmpbuf = vmalloc(dp->len);
-       if (!tmpbuf) {
-               ret = -ENOMEM;
-               goto bail;
-       }
-
-       if (copy_from_user(tmpbuf,
-                          (const void __user *)(unsigned long)dp->data,
-                          dp->len)) {
-               ret = -EFAULT;
-               goto bail;
-       }
-
-       /*
-        * pkt_len is how much data we have to write; it includes the header
-        * and data.
-        * total_len is the length of the packet in Dwords plus the PBC; it
-        * should not include the CRC.
-        */
-       pkt_len = dp->len >> 2;
-       total_len = pkt_len + 2; /* PBC + packet */
-
-       /* if 0, fill in a default */
-       if (dp->pbc == 0) {
-               struct hfi1_pportdata *ppd = dd->pport;
-
-               hfi1_cdbg(PKT, "Generating PBC");
-               dp->pbc = create_pbc(ppd, 0, 0, 0, total_len);
-       } else {
-               hfi1_cdbg(PKT, "Using passed in PBC");
-       }
-
-       hfi1_cdbg(PKT, "Egress PBC content is 0x%llx", dp->pbc);
-
-       /*
-        * The caller wants to wait until the packet is sent and to
-        * check for errors.  The best we can do is wait until
-        * the buffer credits are returned and check if any packet
-        * error has occurred.  If there are any late errors, this
-        * could miss it.  If there are other senders who generate
-        * an error, this may find it.  However, in general, it
-        * should catch most.
-        */
-       if (dp->flags & F_DIAGPKT_WAIT) {
-               /* always force a credit return */
-               dp->pbc |= PBC_CREDIT_RETURN;
-               /* turn on credit return interrupts */
-               sc_add_credit_return_intr(sc);
-               wait = kmalloc(sizeof(*wait), GFP_KERNEL);
-               if (!wait) {
-                       ret = -ENOMEM;
-                       goto bail;
-               }
-               init_completion(&wait->credits_returned);
-               atomic_set(&wait->count, 2);
-               wait->code = PRC_OK;
-
-               credit_cb = diagpkt_complete;
-               credit_arg = wait;
-       }
-
-retry:
-       pbuf = sc_buffer_alloc(sc, total_len, credit_cb, credit_arg);
-       if (!pbuf) {
-               if (trycount == 0) {
-                       /* force a credit return and try again */
-                       sc_return_credits(sc);
-                       trycount = 1;
-                       goto retry;
-               }
-               /*
-                * No send buffer means no credit callback.  Undo
-                * the wait set-up that was done above.  We free wait
-                * because the callback will never be called.
-                */
-               if (dp->flags & F_DIAGPKT_WAIT) {
-                       sc_del_credit_return_intr(sc);
-                       kfree(wait);
-                       wait = NULL;
-               }
-               ret = -ENOSPC;
-               goto bail;
-       }
-
-       pio_copy(dd, pbuf, dp->pbc, tmpbuf, pkt_len);
-       /* no flush needed as the HW knows the packet size */
-
-       ret = sizeof(*dp);
-
-       if (dp->flags & F_DIAGPKT_WAIT) {
-               /* wait for credit return */
-               ret = wait_for_completion_interruptible(
-                                               &wait->credits_returned);
-               /*
-                * If the wait returns an error, the wait was interrupted,
-                * e.g. with a ^C in the user program.  The callback is
-                * still pending.  This is OK as the wait structure is
-                * kmalloc'ed and the structure will free itself when
-                * all users are done with it.
-                *
-                * A context disable occurs on a send context restart, so
-                * include that in the list of errors below to check for.
-                * NOTE: PRC_FILL_ERR is at best informational and cannot
-                * be depended on.
-                */
-               if (!ret && (((wait->code & PRC_STATUS_ERR) ||
-                             (wait->code & PRC_FILL_ERR) ||
-                             (wait->code & PRC_SC_DISABLE))))
-                       ret = -EIO;
-
-               put_diagpkt_wait(wait); /* finished with the structure */
-               sc_del_credit_return_intr(sc);
-       }
-
-bail:
-       vfree(tmpbuf);
-       return ret;
-}
-
-static ssize_t diagpkt_write(struct file *fp, const char __user *data,
-                            size_t count, loff_t *off)
-{
-       struct hfi1_devdata *dd;
-       struct send_context *sc;
-       u8 vl;
-
-       struct diag_pkt dp;
-
-       if (count != sizeof(dp))
-               return -EINVAL;
-
-       if (copy_from_user(&dp, data, sizeof(dp)))
-               return -EFAULT;
-
-       /*
-        * The Send Context is derived from the PbcVL value
-        * if the PBC is populated.
-        */
-       if (dp.pbc) {
-               dd = hfi1_lookup(dp.unit);
-               if (!dd)
-                       return -ENODEV;
-               vl = (dp.pbc >> PBC_VL_SHIFT) & PBC_VL_MASK;
-               sc = dd->vld[vl].sc;
-               if (sc) {
-                       dp.sw_index = sc->sw_index;
-                       hfi1_cdbg(
-                              PKT,
-                              "Packet sent over VL %d via Send Context %u(%u)",
-                              vl, sc->sw_index, sc->hw_context);
-               }
-       }
-
-       return diagpkt_send(&dp);
-}
-
-static int hfi1_snoop_add(struct hfi1_devdata *dd, const char *name)
-{
-       int ret = 0;
-
-       dd->hfi1_snoop.mode_flag = 0;
-       spin_lock_init(&dd->hfi1_snoop.snoop_lock);
-       INIT_LIST_HEAD(&dd->hfi1_snoop.queue);
-       init_waitqueue_head(&dd->hfi1_snoop.waitq);
-
-       ret = hfi1_cdev_init(HFI1_SNOOP_CAPTURE_BASE + dd->unit, name,
-                            &snoop_file_ops,
-                            &dd->hfi1_snoop.cdev, &dd->hfi1_snoop.class_dev,
-                            false);
-
-       if (ret) {
-               dd_dev_err(dd, "Couldn't create %s device: %d", name, ret);
-               hfi1_cdev_cleanup(&dd->hfi1_snoop.cdev,
-                                 &dd->hfi1_snoop.class_dev);
-       }
-
-       return ret;
-}
-
-static struct hfi1_devdata *hfi1_dd_from_sc_inode(struct inode *in)
-{
-       int unit = iminor(in) - HFI1_SNOOP_CAPTURE_BASE;
-       struct hfi1_devdata *dd;
-
-       dd = hfi1_lookup(unit);
-       return dd;
-}
-
-/* clear or restore send context integrity checks */
-static void adjust_integrity_checks(struct hfi1_devdata *dd)
-{
-       struct send_context *sc;
-       unsigned long sc_flags;
-       int i;
-
-       spin_lock_irqsave(&dd->sc_lock, sc_flags);
-       for (i = 0; i < dd->num_send_contexts; i++) {
-               int enable;
-
-               sc = dd->send_contexts[i].sc;
-
-               if (!sc)
-                       continue;       /* not allocated */
-
-               enable = likely(!HFI1_CAP_IS_KSET(NO_INTEGRITY)) &&
-                        dd->hfi1_snoop.mode_flag != HFI1_PORT_SNOOP_MODE;
-
-               set_pio_integrity(sc);
-
-               if (enable) /* take HFI1_CAP_* flags into account */
-                       hfi1_init_ctxt(sc);
-       }
-       spin_unlock_irqrestore(&dd->sc_lock, sc_flags);
-}
-
-static int hfi1_snoop_open(struct inode *in, struct file *fp)
-{
-       int ret;
-       int mode_flag = 0;
-       unsigned long flags = 0;
-       struct hfi1_devdata *dd;
-       struct list_head *queue;
-
-       mutex_lock(&hfi1_mutex);
-
-       dd = hfi1_dd_from_sc_inode(in);
-       if (!dd) {
-               ret = -ENODEV;
-               goto bail;
-       }
-
-       /*
-        * File mode determines snoop or capture. Some existing user
-        * applications expect the capture device to be able to be opened RDWR
-        * because they expect a dedicated capture device. For this reason we
-        * support a module param to force capture mode even if the file open
-        * mode matches snoop.
-        */
-       if ((fp->f_flags & O_ACCMODE) == O_RDONLY) {
-               snoop_dbg("Capture Enabled");
-               mode_flag = HFI1_PORT_CAPTURE_MODE;
-       } else if ((fp->f_flags & O_ACCMODE) == O_RDWR) {
-               snoop_dbg("Snoop Enabled");
-               mode_flag = HFI1_PORT_SNOOP_MODE;
-       } else {
-               snoop_dbg("Invalid");
-               ret =  -EINVAL;
-               goto bail;
-       }
-       queue = &dd->hfi1_snoop.queue;
-
-       /*
-        * We are not supporting snoop and capture at the same time.
-        */
-       spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
-       if (dd->hfi1_snoop.mode_flag) {
-               ret = -EBUSY;
-               spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
-               goto bail;
-       }
-
-       dd->hfi1_snoop.mode_flag = mode_flag;
-       drain_snoop_list(queue);
-
-       dd->hfi1_snoop.filter_callback = NULL;
-       dd->hfi1_snoop.filter_value = NULL;
-
-       /*
-        * Send side packet integrity checks are not helpful when snooping so
-        * disable and re-enable when we stop snooping.
-        */
-       if (mode_flag == HFI1_PORT_SNOOP_MODE) {
-               /* clear after snoop mode is on */
-               adjust_integrity_checks(dd); /* clear */
-
-               /*
-                * We also do not want to be doing the DLID LMC check for
-                * ingressed packets.
-                */
-               dd->hfi1_snoop.dcc_cfg = read_csr(dd, DCC_CFG_PORT_CONFIG1);
-               write_csr(dd, DCC_CFG_PORT_CONFIG1,
-                         (dd->hfi1_snoop.dcc_cfg >> 32) << 32);
-       }
-
-       /*
-        * As soon as we set these function pointers the recv and send handlers
-        * are active. This is a race condition so we must make sure to drain
-        * the queue and init filter values above. Technically we should add
-        * locking here, but all that will happen is that on recv a packet will
-        * get allocated and get stuck on the snoop_lock before being added to
-        * the queue. The same goes for send.
-        */
-       dd->rhf_rcv_function_map = snoop_rhf_rcv_functions;
-       dd->process_pio_send = snoop_send_pio_handler;
-       dd->process_dma_send = snoop_send_pio_handler;
-       dd->pio_inline_send = snoop_inline_pio_send;
-
-       spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
-       ret = 0;
-
-bail:
-       mutex_unlock(&hfi1_mutex);
-
-       return ret;
-}
-
-static int hfi1_snoop_release(struct inode *in, struct file *fp)
-{
-       unsigned long flags = 0;
-       struct hfi1_devdata *dd;
-       int mode_flag;
-
-       dd = hfi1_dd_from_sc_inode(in);
-       if (!dd)
-               return -ENODEV;
-
-       spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
-
-       /* clear the snoop mode before re-adjusting send context CSRs */
-       mode_flag = dd->hfi1_snoop.mode_flag;
-       dd->hfi1_snoop.mode_flag = 0;
-
-       /*
-        * Drain the queue and clear the filters; we are done with them.  Don't
-        * forget to restore the packet integrity checks.
-        */
-       drain_snoop_list(&dd->hfi1_snoop.queue);
-       if (mode_flag == HFI1_PORT_SNOOP_MODE) {
-               /* restore after snoop mode is clear */
-               adjust_integrity_checks(dd); /* restore */
-
-               /*
-                * Also should probably reset the DCC_CONFIG1 register for DLID
-                * checking on incoming packets again. Use the value saved when
-                * opening the snoop device.
-                */
-               write_csr(dd, DCC_CFG_PORT_CONFIG1, dd->hfi1_snoop.dcc_cfg);
-       }
-
-       dd->hfi1_snoop.filter_callback = NULL;
-       kfree(dd->hfi1_snoop.filter_value);
-       dd->hfi1_snoop.filter_value = NULL;
-
-       /*
-        * The user is done snooping and capturing; return control to the normal
-        * handler. Re-enable SDMA handling.
-        */
-       dd->rhf_rcv_function_map = dd->normal_rhf_rcv_functions;
-       dd->process_pio_send = hfi1_verbs_send_pio;
-       dd->process_dma_send = hfi1_verbs_send_dma;
-       dd->pio_inline_send = pio_copy;
-
-       spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
-
-       snoop_dbg("snoop/capture device released");
-
-       return 0;
-}
-
-static unsigned int hfi1_snoop_poll(struct file *fp,
-                                   struct poll_table_struct *wait)
-{
-       int ret = 0;
-       unsigned long flags = 0;
-
-       struct hfi1_devdata *dd;
-
-       dd = hfi1_dd_from_sc_inode(fp->f_inode);
-       if (!dd)
-               return -ENODEV;
-
-       spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
-
-       poll_wait(fp, &dd->hfi1_snoop.waitq, wait);
-       if (!list_empty(&dd->hfi1_snoop.queue))
-               ret |= POLLIN | POLLRDNORM;
-
-       spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
-       return ret;
-}
-
-static ssize_t hfi1_snoop_write(struct file *fp, const char __user *data,
-                               size_t count, loff_t *off)
-{
-       struct diag_pkt dpkt;
-       struct hfi1_devdata *dd;
-       size_t ret;
-       u8 byte_two, sl, sc5, sc4, vl, byte_one;
-       struct send_context *sc;
-       u32 len;
-       u64 pbc;
-       struct hfi1_ibport *ibp;
-       struct hfi1_pportdata *ppd;
-
-       dd = hfi1_dd_from_sc_inode(fp->f_inode);
-       if (!dd)
-               return -ENODEV;
-
-       ppd = dd->pport;
-       snoop_dbg("received %lu bytes from user", count);
-
-       memset(&dpkt, 0, sizeof(struct diag_pkt));
-       dpkt.version = _DIAG_PKT_VERS;
-       dpkt.unit = dd->unit;
-       dpkt.port = 1;
-
-       if (likely(!(snoop_flags & SNOOP_USE_METADATA))) {
-               /*
-               * We need to generate the PBC and not let diagpkt_send do it;
-               * to do this we need the VL and the length in dwords.
-               * The VL can be determined by using the SL and looking up the
-               * SC. Then the SC can be converted into a VL. The exception to
-               * this is those packets which are from an SMI queue pair.
-               * Since we can't detect anything about the QP here we have to
-               * rely on the SC. If it's 0xF then we assume it's SMI and
-               * do not look at the SL.
-               */
-               if (copy_from_user(&byte_one, data, 1))
-                       return -EINVAL;
-
-               if (copy_from_user(&byte_two, data + 1, 1))
-                       return -EINVAL;
-
-               sc4 = (byte_one >> 4) & 0xf;
-               if (sc4 == 0xF) {
-                       snoop_dbg("Detected VL15 packet ignoring SL in packet");
-                       vl = sc4;
-               } else {
-                       sl = (byte_two >> 4) & 0xf;
-                       ibp = to_iport(&dd->verbs_dev.rdi.ibdev, 1);
-                       sc5 = ibp->sl_to_sc[sl];
-                       vl = sc_to_vlt(dd, sc5);
-                       if (vl != sc4) {
-                               snoop_dbg("VL %d does not match SC %d of packet",
-                                         vl, sc4);
-                               return -EINVAL;
-                       }
-               }
-
-               sc = dd->vld[vl].sc; /* Look up the context based on VL */
-               if (sc) {
-                       dpkt.sw_index = sc->sw_index;
-                       snoop_dbg("Sending on context %u(%u)", sc->sw_index,
-                                 sc->hw_context);
-               } else {
-                       snoop_dbg("Could not find context for vl %d", vl);
-                       return -EINVAL;
-               }
-
-               len = (count >> 2) + 2; /* Add in PBC */
-               pbc = create_pbc(ppd, 0, 0, vl, len);
-       } else {
-               if (copy_from_user(&pbc, data, sizeof(pbc)))
-                       return -EINVAL;
-               vl = (pbc >> PBC_VL_SHIFT) & PBC_VL_MASK;
-               sc = dd->vld[vl].sc; /* Look up the context based on VL */
-               if (sc) {
-                       dpkt.sw_index = sc->sw_index;
-               } else {
-                       snoop_dbg("Could not find context for vl %d", vl);
-                       return -EINVAL;
-               }
-               data += sizeof(pbc);
-               count -= sizeof(pbc);
-       }
-       dpkt.len = count;
-       dpkt.data = (unsigned long)data;
-
-       snoop_dbg("PBC: vl=0x%llx Length=0x%llx",
-                 (pbc >> 12) & 0xf,
-                 (pbc & 0xfff));
-
-       dpkt.pbc = pbc;
-       ret = diagpkt_send(&dpkt);
-       /*
-        * diagpkt_send only returns the number of bytes in the diagpkt, so
-        * patch that up here before returning.
-        */
-       if (ret == sizeof(dpkt))
-               return count;
-
-       return ret;
-}
-
-static ssize_t hfi1_snoop_read(struct file *fp, char __user *data,
-                              size_t pkt_len, loff_t *off)
-{
-       ssize_t ret = 0;
-       unsigned long flags = 0;
-       struct snoop_packet *packet = NULL;
-       struct hfi1_devdata *dd;
-
-       dd = hfi1_dd_from_sc_inode(fp->f_inode);
-       if (!dd)
-               return -ENODEV;
-
-       spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
-
-       while (list_empty(&dd->hfi1_snoop.queue)) {
-               spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
-
-               if (fp->f_flags & O_NONBLOCK)
-                       return -EAGAIN;
-
-               if (wait_event_interruptible(
-                               dd->hfi1_snoop.waitq,
-                               !list_empty(&dd->hfi1_snoop.queue)))
-                       return -EINTR;
-
-               spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
-       }
-
-       if (!list_empty(&dd->hfi1_snoop.queue)) {
-               packet = list_entry(dd->hfi1_snoop.queue.next,
-                                   struct snoop_packet, list);
-               list_del(&packet->list);
-               spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
-               if (pkt_len >= packet->total_len) {
-                       if (copy_to_user(data, packet->data,
-                                        packet->total_len))
-                               ret = -EFAULT;
-                       else
-                               ret = packet->total_len;
-               } else {
-                       ret = -EINVAL;
-               }
-
-               kfree(packet);
-       } else {
-               spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
-       }
-
-       return ret;
-}
-
-/**
- * hfi1_assign_snoop_link_credits -- Set up credits for VL15 and others
- * @ppd: ptr to hfi1 port data
- * @value: options from user space
- *
- * Assumes the rest of the CM credit registers are zero from a
- * previous global or credit reset.
- * Leave the shared count at zero for both global and all VLs; in snoop
- * mode we ideally don't use shared credits.
- * Reserve 8.5k for VL15.  If the total credits are less than 8.5 kbytes,
- * return an error.
- * Divide the rest of the credits across VL0 to VL7; if each of these
- * levels ends up with fewer than 34 credits (at least 2048 + 128 bytes),
- * return an error.
- * The credit registers will be reset to zero on link negotiation or link up
- * so this function should be activated from user space only if the port has
- * gone past link negotiation and link up.
- *
- * Return -- 0 if successful else error condition
- *
- */
-static long hfi1_assign_snoop_link_credits(struct hfi1_pportdata *ppd,
-                                          int value)
-{
-#define  OPA_MIN_PER_VL_CREDITS  34  /* 2048 + 128 bytes */
-       struct buffer_control t;
-       int i;
-       struct hfi1_devdata *dd = ppd->dd;
-       u16  total_credits = (value >> 16) & 0xffff;
-       u16  vl15_credits = dd->vl15_init / 2;
-       u16  per_vl_credits;
-       __be16 be_per_vl_credits;
-
-       if (!(ppd->host_link_state & HLS_UP))
-               goto err_exit;
-       if (total_credits  <  vl15_credits)
-               goto err_exit;
-
-       per_vl_credits = (total_credits - vl15_credits) / TXE_NUM_DATA_VL;
-
-       if (per_vl_credits < OPA_MIN_PER_VL_CREDITS)
-               goto err_exit;
-
-       memset(&t, 0, sizeof(t));
-       be_per_vl_credits = cpu_to_be16(per_vl_credits);
-
-       for (i = 0; i < TXE_NUM_DATA_VL; i++)
-               t.vl[i].dedicated = be_per_vl_credits;
-
-       t.vl[15].dedicated  = cpu_to_be16(vl15_credits);
-       return set_buffer_control(ppd, &t);
-
-err_exit:
-       snoop_dbg("port_state = 0x%x, total_credits = %d, vl15_credits = %d",
-                 ppd->host_link_state, total_credits, vl15_credits);
-
-       return -EINVAL;
-}
-
-static long hfi1_ioctl(struct file *fp, unsigned int cmd, unsigned long arg)
-{
-       struct hfi1_devdata *dd;
-       void *filter_value = NULL;
-       long ret = 0;
-       int value = 0;
-       u8 phys_state = 0;
-       u8 link_state = 0;
-       u16 dev_state = 0;
-       unsigned long flags = 0;
-       unsigned long *argp = NULL;
-       struct hfi1_packet_filter_command filter_cmd = {0};
-       int mode_flag = 0;
-       struct hfi1_pportdata *ppd = NULL;
-       unsigned int index;
-       struct hfi1_link_info link_info;
-       int read_cmd, write_cmd, read_ok, write_ok;
-
-       dd = hfi1_dd_from_sc_inode(fp->f_inode);
-       if (!dd)
-               return -ENODEV;
-
-       mode_flag = dd->hfi1_snoop.mode_flag;
-       read_cmd = _IOC_DIR(cmd) & _IOC_READ;
-       write_cmd = _IOC_DIR(cmd) & _IOC_WRITE;
-       write_ok = access_ok(VERIFY_WRITE, (void __user *)arg, _IOC_SIZE(cmd));
-       read_ok = access_ok(VERIFY_READ, (void __user *)arg, _IOC_SIZE(cmd));
-
-       if ((read_cmd && !write_ok) || (write_cmd && !read_ok))
-               return -EFAULT;
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
-
-       if ((mode_flag & HFI1_PORT_CAPTURE_MODE) &&
-           (cmd != HFI1_SNOOP_IOCCLEARQUEUE) &&
-           (cmd != HFI1_SNOOP_IOCCLEARFILTER) &&
-           (cmd != HFI1_SNOOP_IOCSETFILTER))
-               /* Capture devices are allowed only 3 operations:
-                * 1. Clear capture queue
-                * 2. Clear capture filter
-                * 3. Set capture filter
-                * Others are invalid.
-                */
-               return -EINVAL;
-
-       switch (cmd) {
-       case HFI1_SNOOP_IOCSETLINKSTATE_EXTRA:
-               memset(&link_info, 0, sizeof(link_info));
-
-               if (copy_from_user(&link_info,
-                                  (struct hfi1_link_info __user *)arg,
-                                  sizeof(link_info)))
-                       return -EFAULT;
-
-               value = link_info.port_state;
-               index = link_info.port_number;
-               if (index > dd->num_pports - 1)
-                       return -EINVAL;
-
-               ppd = &dd->pport[index];
-               if (!ppd)
-                       return -EINVAL;
-
-               /* What we want to transition to */
-               phys_state = (value >> 4) & 0xF;
-               link_state = value & 0xF;
-               snoop_dbg("Setting link state 0x%x", value);
-
-               switch (link_state) {
-               case IB_PORT_NOP:
-                       if (phys_state == 0)
-                               break;
-                               /* fall through */
-               case IB_PORT_DOWN:
-                       switch (phys_state) {
-                       case 0:
-                               dev_state = HLS_DN_DOWNDEF;
-                               break;
-                       case 2:
-                               dev_state = HLS_DN_POLL;
-                               break;
-                       case 3:
-                               dev_state = HLS_DN_DISABLE;
-                               break;
-                       default:
-                               return -EINVAL;
-                       }
-                       ret = set_link_state(ppd, dev_state);
-                       break;
-               case IB_PORT_ARMED:
-                       ret = set_link_state(ppd, HLS_UP_ARMED);
-                       if (!ret)
-                               send_idle_sma(dd, SMA_IDLE_ARM);
-                       break;
-               case IB_PORT_ACTIVE:
-                       ret = set_link_state(ppd, HLS_UP_ACTIVE);
-                       if (!ret)
-                               send_idle_sma(dd, SMA_IDLE_ACTIVE);
-                       break;
-               default:
-                       return -EINVAL;
-               }
-
-               if (ret)
-                       break;
-               /* fall through */
-       case HFI1_SNOOP_IOCGETLINKSTATE:
-       case HFI1_SNOOP_IOCGETLINKSTATE_EXTRA:
-               if (cmd == HFI1_SNOOP_IOCGETLINKSTATE_EXTRA) {
-                       memset(&link_info, 0, sizeof(link_info));
-                       if (copy_from_user(&link_info,
-                                          (struct hfi1_link_info __user *)arg,
-                                          sizeof(link_info)))
-                               return -EFAULT;
-                       index = link_info.port_number;
-               } else {
-                       ret = __get_user(index, (int __user *)arg);
-                       if (ret !=  0)
-                               break;
-               }
-
-               if (index > dd->num_pports - 1)
-                       return -EINVAL;
-
-               ppd = &dd->pport[index];
-               if (!ppd)
-                       return -EINVAL;
-
-               value = hfi1_ibphys_portstate(ppd);
-               value <<= 4;
-               value |= driver_lstate(ppd);
-
-               snoop_dbg("Link port | Link State: %d", value);
-
-               if ((cmd == HFI1_SNOOP_IOCGETLINKSTATE_EXTRA) ||
-                   (cmd == HFI1_SNOOP_IOCSETLINKSTATE_EXTRA)) {
-                       link_info.port_state = value;
-                       link_info.node_guid = cpu_to_be64(ppd->guid);
-                       link_info.link_speed_active =
-                                               ppd->link_speed_active;
-                       link_info.link_width_active =
-                                               ppd->link_width_active;
-                       if (copy_to_user((struct hfi1_link_info __user *)arg,
-                                        &link_info, sizeof(link_info)))
-                               return -EFAULT;
-               } else {
-                       ret = __put_user(value, (int __user *)arg);
-               }
-               break;
-
-       case HFI1_SNOOP_IOCCLEARQUEUE:
-               snoop_dbg("Clearing snoop queue");
-               spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
-               drain_snoop_list(&dd->hfi1_snoop.queue);
-               spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
-               break;
-
-       case HFI1_SNOOP_IOCCLEARFILTER:
-               snoop_dbg("Clearing filter");
-               spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
-               if (dd->hfi1_snoop.filter_callback) {
-                       /* Drain packets first */
-                       drain_snoop_list(&dd->hfi1_snoop.queue);
-                       dd->hfi1_snoop.filter_callback = NULL;
-               }
-               kfree(dd->hfi1_snoop.filter_value);
-               dd->hfi1_snoop.filter_value = NULL;
-               spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
-               break;
-
-       case HFI1_SNOOP_IOCSETFILTER:
-               snoop_dbg("Setting filter");
-               /* just copy command structure */
-               argp = (unsigned long *)arg;
-               if (copy_from_user(&filter_cmd, (void __user *)argp,
-                                  sizeof(filter_cmd)))
-                       return -EFAULT;
-
-               if (filter_cmd.opcode >= HFI1_MAX_FILTERS) {
-                       pr_alert("Invalid opcode in request\n");
-                       return -EINVAL;
-               }
-
-               snoop_dbg("Opcode %d Len %d Ptr %p",
-                         filter_cmd.opcode, filter_cmd.length,
-                         filter_cmd.value_ptr);
-
-               filter_value = kcalloc(filter_cmd.length, sizeof(u8),
-                                      GFP_KERNEL);
-               if (!filter_value)
-                       return -ENOMEM;
-
-               /* copy remaining data from userspace */
-               if (copy_from_user((u8 *)filter_value,
-                                  (void __user *)filter_cmd.value_ptr,
-                                  filter_cmd.length)) {
-                       kfree(filter_value);
-                       return -EFAULT;
-               }
-               /* Drain packets first */
-               spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
-               drain_snoop_list(&dd->hfi1_snoop.queue);
-               dd->hfi1_snoop.filter_callback =
-                       hfi1_filters[filter_cmd.opcode].filter;
-               /* just in case we see back to back sets */
-               kfree(dd->hfi1_snoop.filter_value);
-               dd->hfi1_snoop.filter_value = filter_value;
-               spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
-               break;
-       case HFI1_SNOOP_IOCGETVERSION:
-               value = SNOOP_CAPTURE_VERSION;
-               snoop_dbg("Getting version: %d", value);
-               ret = __put_user(value, (int __user *)arg);
-               break;
-       case HFI1_SNOOP_IOCSET_OPTS:
-               snoop_flags = 0;
-               ret = __get_user(value, (int __user *)arg);
-               if (ret != 0)
-                       break;
-
-               snoop_dbg("Setting snoop option %d", value);
-               if (value & SNOOP_DROP_SEND)
-                       snoop_flags |= SNOOP_DROP_SEND;
-               if (value & SNOOP_USE_METADATA)
-                       snoop_flags |= SNOOP_USE_METADATA;
-               if (value & (SNOOP_SET_VL0TOVL15)) {
-                       ppd = &dd->pport[0];  /* first port will do */
-                       ret = hfi1_assign_snoop_link_credits(ppd, value);
-               }
-               break;
-       default:
-               return -ENOTTY;
-       }
-
-       return ret;
-}
-
-static void snoop_list_add_tail(struct snoop_packet *packet,
-                               struct hfi1_devdata *dd)
-{
-       unsigned long flags = 0;
-
-       spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
-       if (likely((dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE) ||
-                  (dd->hfi1_snoop.mode_flag & HFI1_PORT_CAPTURE_MODE))) {
-               list_add_tail(&packet->list, &dd->hfi1_snoop.queue);
-               snoop_dbg("Added packet to list");
-       }
-
-       /*
-        * Technically we could have closed the snoop device while waiting
-        * on the above lock and it is gone now. The snoop mode_flag will
-        * prevent us from adding the packet to the queue though.
-        */
-
-       spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
-       wake_up_interruptible(&dd->hfi1_snoop.waitq);
-}
-
-static inline int hfi1_filter_check(void *val, const char *msg)
-{
-       if (!val) {
-               snoop_dbg("Error invalid %s value for filter", msg);
-               return HFI1_FILTER_ERR;
-       }
-       return 0;
-}
-
-static int hfi1_filter_lid(void *ibhdr, void *packet_data, void *value)
-{
-       struct hfi1_ib_header *hdr;
-       int ret;
-
-       ret = hfi1_filter_check(ibhdr, "header");
-       if (ret)
-               return ret;
-       ret = hfi1_filter_check(value, "user");
-       if (ret)
-               return ret;
-       hdr = (struct hfi1_ib_header *)ibhdr;
-
-       if (*((u16 *)value) == be16_to_cpu(hdr->lrh[3])) /* matches slid */
-               return HFI1_FILTER_HIT; /* matched */
-
-       return HFI1_FILTER_MISS; /* Not matched */
-}
-
-static int hfi1_filter_dlid(void *ibhdr, void *packet_data, void *value)
-{
-       struct hfi1_ib_header *hdr;
-       int ret;
-
-       ret = hfi1_filter_check(ibhdr, "header");
-       if (ret)
-               return ret;
-       ret = hfi1_filter_check(value, "user");
-       if (ret)
-               return ret;
-
-       hdr = (struct hfi1_ib_header *)ibhdr;
-
-       if (*((u16 *)value) == be16_to_cpu(hdr->lrh[1]))
-               return HFI1_FILTER_HIT;
-
-       return HFI1_FILTER_MISS;
-}
-
-/* Not valid for outgoing packets; the send handler passes NULL for data */
-static int hfi1_filter_mad_mgmt_class(void *ibhdr, void *packet_data,
-                                     void *value)
-{
-       struct hfi1_ib_header *hdr;
-       struct hfi1_other_headers *ohdr = NULL;
-       struct ib_smp *smp = NULL;
-       u32 qpn = 0;
-       int ret;
-
-       ret = hfi1_filter_check(ibhdr, "header");
-       if (ret)
-               return ret;
-       ret = hfi1_filter_check(packet_data, "packet_data");
-       if (ret)
-               return ret;
-       ret = hfi1_filter_check(value, "user");
-       if (ret)
-               return ret;
-
-       hdr = (struct hfi1_ib_header *)ibhdr;
-
-       /* Check for GRH */
-       if ((be16_to_cpu(hdr->lrh[0]) & 3) == HFI1_LRH_BTH)
-               ohdr = &hdr->u.oth; /* LRH + BTH + DETH */
-       else
-               ohdr = &hdr->u.l.oth; /* LRH + GRH + BTH + DETH */
-
-       qpn = be32_to_cpu(ohdr->bth[1]) & 0x00FFFFFF;
-       if (qpn <= 1) {
-               smp = (struct ib_smp *)packet_data;
-               if (*((u8 *)value) == smp->mgmt_class)
-                       return HFI1_FILTER_HIT;
-               else
-                       return HFI1_FILTER_MISS;
-       }
-       return HFI1_FILTER_ERR;
-}
-
-static int hfi1_filter_qp_number(void *ibhdr, void *packet_data, void *value)
-{
-       struct hfi1_ib_header *hdr;
-       struct hfi1_other_headers *ohdr = NULL;
-       int ret;
-
-       ret = hfi1_filter_check(ibhdr, "header");
-       if (ret)
-               return ret;
-       ret = hfi1_filter_check(value, "user");
-       if (ret)
-               return ret;
-
-       hdr = (struct hfi1_ib_header *)ibhdr;
-
-       /* Check for GRH */
-       if ((be16_to_cpu(hdr->lrh[0]) & 3) == HFI1_LRH_BTH)
-               ohdr = &hdr->u.oth; /* LRH + BTH + DETH */
-       else
-               ohdr = &hdr->u.l.oth; /* LRH + GRH + BTH + DETH */
-       if (*((u32 *)value) == (be32_to_cpu(ohdr->bth[1]) & 0x00FFFFFF))
-               return HFI1_FILTER_HIT;
-
-       return HFI1_FILTER_MISS;
-}
-
-static int hfi1_filter_ibpacket_type(void *ibhdr, void *packet_data,
-                                    void *value)
-{
-       u32 lnh = 0;
-       u8 opcode = 0;
-       struct hfi1_ib_header *hdr;
-       struct hfi1_other_headers *ohdr = NULL;
-       int ret;
-
-       ret = hfi1_filter_check(ibhdr, "header");
-       if (ret)
-               return ret;
-       ret = hfi1_filter_check(value, "user");
-       if (ret)
-               return ret;
-
-       hdr = (struct hfi1_ib_header *)ibhdr;
-
-       lnh = (be16_to_cpu(hdr->lrh[0]) & 3);
-
-       if (lnh == HFI1_LRH_BTH)
-               ohdr = &hdr->u.oth;
-       else if (lnh == HFI1_LRH_GRH)
-               ohdr = &hdr->u.l.oth;
-       else
-               return HFI1_FILTER_ERR;
-
-       opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
-
-       if (*((u8 *)value) == ((opcode >> 5) & 0x7))
-               return HFI1_FILTER_HIT;
-
-       return HFI1_FILTER_MISS;
-}
-
-static int hfi1_filter_ib_service_level(void *ibhdr, void *packet_data,
-                                       void *value)
-{
-       struct hfi1_ib_header *hdr;
-       int ret;
-
-       ret = hfi1_filter_check(ibhdr, "header");
-       if (ret)
-               return ret;
-       ret = hfi1_filter_check(value, "user");
-       if (ret)
-               return ret;
-
-       hdr = (struct hfi1_ib_header *)ibhdr;
-
-       if ((*((u8 *)value)) == ((be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF))
-               return HFI1_FILTER_HIT;
-
-       return HFI1_FILTER_MISS;
-}
-
-static int hfi1_filter_ib_pkey(void *ibhdr, void *packet_data, void *value)
-{
-       u32 lnh = 0;
-       struct hfi1_ib_header *hdr;
-       struct hfi1_other_headers *ohdr = NULL;
-       int ret;
-
-       ret = hfi1_filter_check(ibhdr, "header");
-       if (ret)
-               return ret;
-       ret = hfi1_filter_check(value, "user");
-       if (ret)
-               return ret;
-
-       hdr = (struct hfi1_ib_header *)ibhdr;
-
-       lnh = (be16_to_cpu(hdr->lrh[0]) & 3);
-       if (lnh == HFI1_LRH_BTH)
-               ohdr = &hdr->u.oth;
-       else if (lnh == HFI1_LRH_GRH)
-               ohdr = &hdr->u.l.oth;
-       else
-               return HFI1_FILTER_ERR;
-
-       /* The P_Key is a 16-bit entity; however, the topmost bit indicates
-        * the type of membership: 0 for limited and 1 for full.
-        * Limited members cannot accept information from other
-        * limited members, but communication is allowed between
-        * every other combination of membership.
-        * Hence we omit comparing the topmost bit while filtering.
-        */
-
-       if ((*(u16 *)value & 0x7FFF) ==
-               ((be32_to_cpu(ohdr->bth[0])) & 0x7FFF))
-               return HFI1_FILTER_HIT;
-
-       return HFI1_FILTER_MISS;
-}
-
-/*
- * If packet_data is NULL then this is coming from one of the send functions.
- * Thus we know whether it is an ingress or an egress packet.
- */
-static int hfi1_filter_direction(void *ibhdr, void *packet_data, void *value)
-{
-       u8 user_dir;
-       int ret;
-
-       ret = hfi1_filter_check(value, "user");
-       if (ret)
-               return ret;
-
-       user_dir = *(u8 *)value;
-
-       if (packet_data) {
-               /* Incoming packet */
-               if (user_dir & HFI1_SNOOP_INGRESS)
-                       return HFI1_FILTER_HIT;
-       } else {
-               /* Outgoing packet */
-               if (user_dir & HFI1_SNOOP_EGRESS)
-                       return HFI1_FILTER_HIT;
-       }
-
-       return HFI1_FILTER_MISS;
-}
-
-/*
- * Allocate a snoop packet: the structure that is stored in the ring buffer,
- * not to be confused with an hfi1 packet type.
- */
-static struct snoop_packet *allocate_snoop_packet(u32 hdr_len,
-                                                 u32 data_len,
-                                                 u32 md_len)
-{
-       struct snoop_packet *packet;
-
-       packet = kzalloc(sizeof(*packet) + hdr_len + data_len
-                        + md_len,
-                        GFP_ATOMIC | __GFP_NOWARN);
-       if (likely(packet))
-               INIT_LIST_HEAD(&packet->list);
-
-       return packet;
-}
-
-/*
- * Instead of having snoop and capture code intermixed with the recv
- * functions (both the interrupt handler and hfi1_ib_rcv()), we hijack the
- * call and land in here for snoop/capture; if not enabled, the call goes
- * through as before. This gives us a single point to constrain all of the
- * snoop recv logic. There is nothing special that needs to happen for
- * bypass packets. This routine should not try to look into the packet; it
- * just copies it. There is no guarantee for filters when it comes to bypass
- * packets as there is no specific support. Bottom line: this routine does
- * not even know what a bypass packet is.
- */
-int snoop_recv_handler(struct hfi1_packet *packet)
-{
-       struct hfi1_pportdata *ppd = packet->rcd->ppd;
-       struct hfi1_ib_header *hdr = packet->hdr;
-       int header_size = packet->hlen;
-       void *data = packet->ebuf;
-       u32 tlen = packet->tlen;
-       struct snoop_packet *s_packet = NULL;
-       int ret;
-       int snoop_mode = 0;
-       u32 md_len = 0;
-       struct capture_md md;
-
-       snoop_dbg("PACKET IN: hdr size %d tlen %d data %p", header_size, tlen,
-                 data);
-
-       trace_snoop_capture(ppd->dd, header_size, hdr, tlen - header_size,
-                           data);
-
-       if (!ppd->dd->hfi1_snoop.filter_callback) {
-               snoop_dbg("filter not set");
-               ret = HFI1_FILTER_HIT;
-       } else {
-               ret = ppd->dd->hfi1_snoop.filter_callback(hdr, data,
-                                       ppd->dd->hfi1_snoop.filter_value);
-       }
-
-       switch (ret) {
-       case HFI1_FILTER_ERR:
-               snoop_dbg("Error in filter call");
-               break;
-       case HFI1_FILTER_MISS:
-               snoop_dbg("Filter Miss");
-               break;
-       case HFI1_FILTER_HIT:
-
-               if (ppd->dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE)
-                       snoop_mode = 1;
-               if ((snoop_mode == 0) ||
-                   unlikely(snoop_flags & SNOOP_USE_METADATA))
-                       md_len = sizeof(struct capture_md);
-
-               s_packet = allocate_snoop_packet(header_size,
-                                                tlen - header_size,
-                                                md_len);
-
-               if (unlikely(!s_packet)) {
-                       dd_dev_warn_ratelimited(ppd->dd, "Unable to allocate snoop/capture packet\n");
-                       break;
-               }
-
-               if (md_len > 0) {
-                       memset(&md, 0, sizeof(struct capture_md));
-                       md.port = 1;
-                       md.dir = PKT_DIR_INGRESS;
-                       md.u.rhf = packet->rhf;
-                       memcpy(s_packet->data, &md, md_len);
-               }
-
-               /* We should always have a header */
-               if (hdr) {
-                       memcpy(s_packet->data + md_len, hdr, header_size);
-               } else {
-                       dd_dev_err(ppd->dd, "Unable to copy header to snoop/capture packet\n");
-                       kfree(s_packet);
-                       break;
-               }
-
-               /*
- * Packets with no data are possible. If there is no data, nothing
- * needs to be done for the last 4 bytes, which are normally
- * included with data buffers and are counted in tlen.  Since we
- * kzalloc the buffer we do not need to set any values, but if we
- * decide not to use kzalloc we should zero them.
-                */
-               if (data)
-                       memcpy(s_packet->data + header_size + md_len, data,
-                              tlen - header_size);
-
-               s_packet->total_len = tlen + md_len;
-               snoop_list_add_tail(s_packet, ppd->dd);
-
-               /*
- * If we are snooping the packet, not capturing it, then throw it
- * away after adding it to the list.
-                */
-               snoop_dbg("Capturing packet");
-               if (ppd->dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE) {
-                       snoop_dbg("Throwing packet away");
-                       /*
- * If we are dropping the packet we still may need to
- * handle the case where error flags are set; this is
- * normally done by the type specific handler, but that
- * won't be called in this case.
-                        */
-                       if (unlikely(rhf_err_flags(packet->rhf)))
-                               handle_eflags(packet);
-
-                       /* throw the packet on the floor */
-                       return RHF_RCV_CONTINUE;
-               }
-               break;
-       default:
-               break;
-       }
-
-       /*
-        * We do not care what type of packet came in here - just pass it off
-        * to the normal handler.
-        */
-       return ppd->dd->normal_rhf_rcv_functions[rhf_rcv_type(packet->rhf)]
-                       (packet);
-}
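For reference, each entry queued above is laid out as [capture_md | header | data].
A minimal sketch of that copy ordering, using a hypothetical helper name that is
not part of the driver:

	/*
	 * Hypothetical sketch of the copy ordering used by
	 * snoop_recv_handler(): optional capture metadata first, then the
	 * packet header, then any eager-buffer data; total_len is the
	 * on-wire length (tlen) plus the metadata length.
	 */
	static void sketch_fill_capture(struct snoop_packet *s_packet,
					const struct capture_md *md, u32 md_len,
					const void *hdr, u32 hdr_len,
					const void *data, u32 tlen)
	{
		if (md_len)
			memcpy(s_packet->data, md, md_len);
		memcpy(s_packet->data + md_len, hdr, hdr_len);
		if (data)
			memcpy(s_packet->data + md_len + hdr_len, data,
			       tlen - hdr_len);
		s_packet->total_len = tlen + md_len;
	}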
-
-/*
- * Handle snooping and capturing packets when sdma is being used.
- */
-int snoop_send_dma_handler(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
-                          u64 pbc)
-{
-       pr_alert("Snooping/Capture of Send DMA Packets Is Not Supported!\n");
-       snoop_dbg("Unsupported Operation");
-       return hfi1_verbs_send_dma(qp, ps, 0);
-}
-
-/*
- * Handle snooping and capturing packets when pio is being used. Does not handle
- * bypass packets. The only way to send a bypass packet currently is to use the
- * diagpkt interface. When that interface is enabled, snoop/capture is not.
- */
-int snoop_send_pio_handler(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
-                          u64 pbc)
-{
-       u32 hdrwords = qp->s_hdrwords;
-       struct rvt_sge_state *ss = qp->s_cur_sge;
-       u32 len = qp->s_cur_size;
-       u32 dwords = (len + 3) >> 2;
-       u32 plen = hdrwords + dwords + 2; /* includes pbc */
-       struct hfi1_pportdata *ppd = ps->ppd;
-       struct snoop_packet *s_packet = NULL;
-       u32 *hdr = (u32 *)&ps->s_txreq->phdr.hdr;
-       u32 length = 0;
-       struct rvt_sge_state temp_ss;
-       void *data = NULL;
-       void *data_start = NULL;
-       int ret;
-       int snoop_mode = 0;
-       int md_len = 0;
-       struct capture_md md;
-       u32 vl;
-       u32 hdr_len = hdrwords << 2;
-       u32 tlen = HFI1_GET_PKT_LEN(&ps->s_txreq->phdr.hdr);
-
-       md.u.pbc = 0;
-
-       snoop_dbg("PACKET OUT: hdrword %u len %u plen %u dwords %u tlen %u",
-                 hdrwords, len, plen, dwords, tlen);
-       if (ppd->dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE)
-               snoop_mode = 1;
-       if ((snoop_mode == 0) ||
-           unlikely(snoop_flags & SNOOP_USE_METADATA))
-               md_len = sizeof(struct capture_md);
-
-       /* not using ss->total_len as arg 2 b/c that does not count CRC */
-       s_packet = allocate_snoop_packet(hdr_len, tlen - hdr_len, md_len);
-
-       if (unlikely(!s_packet)) {
-               dd_dev_warn_ratelimited(ppd->dd, "Unable to allocate snoop/capture packet\n");
-               goto out;
-       }
-
-       s_packet->total_len = tlen + md_len;
-
-       if (md_len > 0) {
-               memset(&md, 0, sizeof(struct capture_md));
-               md.port = 1;
-               md.dir = PKT_DIR_EGRESS;
-               if (likely(pbc == 0)) {
-                       vl = be16_to_cpu(ps->s_txreq->phdr.hdr.lrh[0]) >> 12;
-                       md.u.pbc = create_pbc(ppd, 0, qp->s_srate, vl, plen);
-               } else {
-                       md.u.pbc = 0;
-               }
-               memcpy(s_packet->data, &md, md_len);
-       } else {
-               md.u.pbc = pbc;
-       }
-
-       /* Copy header */
-       if (likely(hdr)) {
-               memcpy(s_packet->data + md_len, hdr, hdr_len);
-       } else {
-               dd_dev_err(ppd->dd,
-                          "Unable to copy header to snoop/capture packet\n");
-               kfree(s_packet);
-               goto out;
-       }
-
-       if (ss) {
-               data = s_packet->data + hdr_len + md_len;
-               data_start = data;
-
-               /*
-                * Copy SGE State
-                * The update_sge() function below will not modify the
-                * individual SGEs in the array. It will make a copy each time
-                * and operate on that. So we only need to copy this instance
-                * and it won't impact PIO.
-                */
-               temp_ss = *ss;
-               length = len;
-
-               snoop_dbg("Need to copy %d bytes", length);
-               while (length) {
-                       void *addr = temp_ss.sge.vaddr;
-                       u32 slen = temp_ss.sge.length;
-
-                       if (slen > length) {
-                               slen = length;
-                               snoop_dbg("slen %d > len %d", slen, length);
-                       }
-                       snoop_dbg("copy %d to %p", slen, addr);
-                       memcpy(data, addr, slen);
-                       update_sge(&temp_ss, slen);
-                       length -= slen;
-                       data += slen;
-                       snoop_dbg("data is now %p bytes left %d", data, length);
-               }
-               snoop_dbg("Completed SGE copy");
-       }
-
-       /*
-        * Why do the filter check down here? Because the event tracing has its
- * own filtering and we need to have walked the SGE list first.
-        */
-       if (!ppd->dd->hfi1_snoop.filter_callback) {
-               snoop_dbg("filter not set\n");
-               ret = HFI1_FILTER_HIT;
-       } else {
-               ret = ppd->dd->hfi1_snoop.filter_callback(
-                                       &ps->s_txreq->phdr.hdr,
-                                       NULL,
-                                       ppd->dd->hfi1_snoop.filter_value);
-       }
-
-       switch (ret) {
-       case HFI1_FILTER_ERR:
-               snoop_dbg("Error in filter call");
-               /* fall through */
-       case HFI1_FILTER_MISS:
-               snoop_dbg("Filter Miss");
-               kfree(s_packet);
-               break;
-       case HFI1_FILTER_HIT:
-               snoop_dbg("Capturing packet");
-               snoop_list_add_tail(s_packet, ppd->dd);
-
-               if (unlikely((snoop_flags & SNOOP_DROP_SEND) &&
-                            (ppd->dd->hfi1_snoop.mode_flag &
-                             HFI1_PORT_SNOOP_MODE))) {
-                       unsigned long flags;
-
-                       snoop_dbg("Dropping packet");
-                       if (qp->s_wqe) {
-                               spin_lock_irqsave(&qp->s_lock, flags);
-                               hfi1_send_complete(
-                                       qp,
-                                       qp->s_wqe,
-                                       IB_WC_SUCCESS);
-                               spin_unlock_irqrestore(&qp->s_lock, flags);
-                       } else if (qp->ibqp.qp_type == IB_QPT_RC) {
-                               spin_lock_irqsave(&qp->s_lock, flags);
-                               hfi1_rc_send_complete(qp,
-                                                     &ps->s_txreq->phdr.hdr);
-                               spin_unlock_irqrestore(&qp->s_lock, flags);
-                       }
-
-                       /*
-                        * If snoop is dropping the packet we need to put the
-                        * txreq back because no one else will.
-                        */
-                       hfi1_put_txreq(ps->s_txreq);
-                       return 0;
-               }
-               break;
-       default:
-               kfree(s_packet);
-               break;
-       }
-out:
-       return hfi1_verbs_send_pio(qp, ps, md.u.pbc);
-}
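As a quick sanity check on the length math above (assumed example values): a
24-byte payload with 14 header words gives dwords = (24 + 3) >> 2 = 6 and
plen = 14 + 6 + 2 = 22 dwords, the extra 2 dwords covering the PBC.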
-
-/*
- * Callers of this must pass an hfi1_ib_header type for the from ptr. Currently
- * this can be used anywhere, but the intention is for inline ACKs for RC and
- * CCA packets. We don't restrict this usage though.
- */
-void snoop_inline_pio_send(struct hfi1_devdata *dd, struct pio_buf *pbuf,
-                          u64 pbc, const void *from, size_t count)
-{
-       int snoop_mode = 0;
-       int md_len = 0;
-       struct capture_md md;
-       struct snoop_packet *s_packet = NULL;
-
-       /*
-        * count is in dwords so we need to convert to bytes.
-        * We also need to account for CRC which would be tacked on by hardware.
-        */
-       int packet_len = (count << 2) + 4;
-       int ret;
-
-       snoop_dbg("ACK OUT: len %d", packet_len);
-
-       if (!dd->hfi1_snoop.filter_callback) {
-               snoop_dbg("filter not set");
-               ret = HFI1_FILTER_HIT;
-       } else {
-               ret = dd->hfi1_snoop.filter_callback(
-                               (struct hfi1_ib_header *)from,
-                               NULL,
-                               dd->hfi1_snoop.filter_value);
-       }
-
-       switch (ret) {
-       case HFI1_FILTER_ERR:
-               snoop_dbg("Error in filter call");
-               /* fall through */
-       case HFI1_FILTER_MISS:
-               snoop_dbg("Filter Miss");
-               break;
-       case HFI1_FILTER_HIT:
-               snoop_dbg("Capturing packet");
-               if (dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE)
-                       snoop_mode = 1;
-               if ((snoop_mode == 0) ||
-                   unlikely(snoop_flags & SNOOP_USE_METADATA))
-                       md_len = sizeof(struct capture_md);
-
-               s_packet = allocate_snoop_packet(packet_len, 0, md_len);
-
-               if (unlikely(!s_packet)) {
-                       dd_dev_warn_ratelimited(dd, "Unable to allocate snoop/capture packet\n");
-                       goto inline_pio_out;
-               }
-
-               s_packet->total_len = packet_len + md_len;
-
-               /* Fill in the metadata for the packet */
-               if (md_len > 0) {
-                       memset(&md, 0, sizeof(struct capture_md));
-                       md.port = 1;
-                       md.dir = PKT_DIR_EGRESS;
-                       md.u.pbc = pbc;
-                       memcpy(s_packet->data, &md, md_len);
-               }
-
-               /* Add the packet data which is a single buffer */
-               memcpy(s_packet->data + md_len, from, packet_len);
-
-               snoop_list_add_tail(s_packet, dd);
-
-               if (unlikely((snoop_flags & SNOOP_DROP_SEND) && snoop_mode)) {
-                       snoop_dbg("Dropping packet");
-                       return;
-               }
-               break;
-       default:
-               break;
-       }
-
-inline_pio_out:
-       pio_copy(dd, pbuf, pbc, from, count);
-}
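As a worked example of the length conversion above (assumed count): an inline
ACK of 16 dwords yields packet_len = (16 << 2) + 4 = 68 bytes, the trailing
4 bytes accounting for the CRC appended by hardware.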
diff --git a/drivers/staging/rdma/hfi1/dma.c b/drivers/staging/rdma/hfi1/dma.c
deleted file mode 100644 (file)
index 7e8dab8..0000000
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#include <linux/types.h>
-#include <linux/scatterlist.h>
-
-#include "verbs.h"
-
-#define BAD_DMA_ADDRESS ((u64)0)
-
-/*
- * The following functions implement driver specific replacements
- * for the ib_dma_*() functions.
- *
- * These functions return kernel virtual addresses instead of
- * device bus addresses since the driver uses the CPU to copy
- * data instead of using hardware DMA.
- */
-
-static int hfi1_mapping_error(struct ib_device *dev, u64 dma_addr)
-{
-       return dma_addr == BAD_DMA_ADDRESS;
-}
-
-static u64 hfi1_dma_map_single(struct ib_device *dev, void *cpu_addr,
-                              size_t size, enum dma_data_direction direction)
-{
-       if (WARN_ON(!valid_dma_direction(direction)))
-               return BAD_DMA_ADDRESS;
-
-       return (u64)cpu_addr;
-}
-
-static void hfi1_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size,
-                                 enum dma_data_direction direction)
-{
-       /* This is a stub, nothing to be done here */
-}
-
-static u64 hfi1_dma_map_page(struct ib_device *dev, struct page *page,
-                            unsigned long offset, size_t size,
-                           enum dma_data_direction direction)
-{
-       u64 addr;
-
-       if (WARN_ON(!valid_dma_direction(direction)))
-               return BAD_DMA_ADDRESS;
-
-       if (offset + size > PAGE_SIZE)
-               return BAD_DMA_ADDRESS;
-
-       addr = (u64)page_address(page);
-       if (addr)
-               addr += offset;
-
-       return addr;
-}
-
-static void hfi1_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size,
-                               enum dma_data_direction direction)
-{
-       /* This is a stub, nothing to be done here */
-}
-
-static int hfi1_map_sg(struct ib_device *dev, struct scatterlist *sgl,
-                      int nents, enum dma_data_direction direction)
-{
-       struct scatterlist *sg;
-       u64 addr;
-       int i;
-       int ret = nents;
-
-       if (WARN_ON(!valid_dma_direction(direction)))
-               return BAD_DMA_ADDRESS;
-
-       for_each_sg(sgl, sg, nents, i) {
-               addr = (u64)page_address(sg_page(sg));
-               if (!addr) {
-                       ret = 0;
-                       break;
-               }
-               sg->dma_address = addr + sg->offset;
-#ifdef CONFIG_NEED_SG_DMA_LENGTH
-               sg->dma_length = sg->length;
-#endif
-       }
-       return ret;
-}
-
-static void hfi1_unmap_sg(struct ib_device *dev,
-                         struct scatterlist *sg, int nents,
-                        enum dma_data_direction direction)
-{
-       /* This is a stub, nothing to be done here */
-}
-
-static void hfi1_sync_single_for_cpu(struct ib_device *dev, u64 addr,
-                                    size_t size, enum dma_data_direction dir)
-{
-}
-
-static void hfi1_sync_single_for_device(struct ib_device *dev, u64 addr,
-                                       size_t size,
-                                       enum dma_data_direction dir)
-{
-}
-
-static void *hfi1_dma_alloc_coherent(struct ib_device *dev, size_t size,
-                                    u64 *dma_handle, gfp_t flag)
-{
-       struct page *p;
-       void *addr = NULL;
-
-       p = alloc_pages(flag, get_order(size));
-       if (p)
-               addr = page_address(p);
-       if (dma_handle)
-               *dma_handle = (u64)addr;
-       return addr;
-}
-
-static void hfi1_dma_free_coherent(struct ib_device *dev, size_t size,
-                                  void *cpu_addr, u64 dma_handle)
-{
-       free_pages((unsigned long)cpu_addr, get_order(size));
-}
-
-struct ib_dma_mapping_ops hfi1_dma_mapping_ops = {
-       .mapping_error = hfi1_mapping_error,
-       .map_single = hfi1_dma_map_single,
-       .unmap_single = hfi1_dma_unmap_single,
-       .map_page = hfi1_dma_map_page,
-       .unmap_page = hfi1_dma_unmap_page,
-       .map_sg = hfi1_map_sg,
-       .unmap_sg = hfi1_unmap_sg,
-       .sync_single_for_cpu = hfi1_sync_single_for_cpu,
-       .sync_single_for_device = hfi1_sync_single_for_device,
-       .alloc_coherent = hfi1_dma_alloc_coherent,
-       .free_coherent = hfi1_dma_free_coherent
-};
diff --git a/drivers/staging/rdma/hfi1/driver.c b/drivers/staging/rdma/hfi1/driver.c
deleted file mode 100644 (file)
index 700c6fa..0000000
+++ /dev/null
@@ -1,1404 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/spinlock.h>
-#include <linux/pci.h>
-#include <linux/io.h>
-#include <linux/delay.h>
-#include <linux/netdevice.h>
-#include <linux/vmalloc.h>
-#include <linux/module.h>
-#include <linux/prefetch.h>
-#include <rdma/ib_verbs.h>
-
-#include "hfi.h"
-#include "trace.h"
-#include "qp.h"
-#include "sdma.h"
-
-#undef pr_fmt
-#define pr_fmt(fmt) DRIVER_NAME ": " fmt
-
-/*
- * The size has to be longer than this string, so we can append
- * board/chip information to it in the initialization code.
- */
-const char ib_hfi1_version[] = HFI1_DRIVER_VERSION "\n";
-
-DEFINE_SPINLOCK(hfi1_devs_lock);
-LIST_HEAD(hfi1_dev_list);
-DEFINE_MUTEX(hfi1_mutex);      /* general driver use */
-
-unsigned int hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
-module_param_named(max_mtu, hfi1_max_mtu, uint, S_IRUGO);
-MODULE_PARM_DESC(max_mtu, "Set max MTU bytes, default is " __stringify(
-                HFI1_DEFAULT_MAX_MTU));
-
-unsigned int hfi1_cu = 1;
-module_param_named(cu, hfi1_cu, uint, S_IRUGO);
-MODULE_PARM_DESC(cu, "Credit return units");
-
-unsigned long hfi1_cap_mask = HFI1_CAP_MASK_DEFAULT;
-static int hfi1_caps_set(const char *, const struct kernel_param *);
-static int hfi1_caps_get(char *, const struct kernel_param *);
-static const struct kernel_param_ops cap_ops = {
-       .set = hfi1_caps_set,
-       .get = hfi1_caps_get
-};
-module_param_cb(cap_mask, &cap_ops, &hfi1_cap_mask, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(cap_mask, "Bit mask of enabled/disabled HW features");
-
-MODULE_LICENSE("Dual BSD/GPL");
-MODULE_DESCRIPTION("Intel Omni-Path Architecture driver");
-MODULE_VERSION(HFI1_DRIVER_VERSION);
-
-/*
- * MAX_PKT_RECV is the max # of packets processed per receive interrupt.
- */
-#define MAX_PKT_RECV 64
-#define EGR_HEAD_UPDATE_THRESHOLD 16
-
-struct hfi1_ib_stats hfi1_stats;
-
-static int hfi1_caps_set(const char *val, const struct kernel_param *kp)
-{
-       int ret = 0;
-       unsigned long *cap_mask_ptr = (unsigned long *)kp->arg,
-               cap_mask = *cap_mask_ptr, value, diff,
-               write_mask = ((HFI1_CAP_WRITABLE_MASK << HFI1_CAP_USER_SHIFT) |
-                             HFI1_CAP_WRITABLE_MASK);
-
-       ret = kstrtoul(val, 0, &value);
-       if (ret) {
-               pr_warn("Invalid module parameter value for 'cap_mask'\n");
-               goto done;
-       }
-       /* Get the changed bits (except the locked bit) */
-       diff = value ^ (cap_mask & ~HFI1_CAP_LOCKED_SMASK);
-
-       /* Remove any bits that are not allowed to change after driver load */
-       if (HFI1_CAP_LOCKED() && (diff & ~write_mask)) {
-               pr_warn("Ignoring non-writable capability bits %#lx\n",
-                       diff & ~write_mask);
-               diff &= write_mask;
-       }
-
-       /* Mask off any reserved bits */
-       diff &= ~HFI1_CAP_RESERVED_MASK;
-       /* Clear any previously set and changing bits */
-       cap_mask &= ~diff;
-       /* Update the bits with the new capability */
-       cap_mask |= (value & diff);
-       /* Check for any kernel/user restrictions */
-       diff = (cap_mask & (HFI1_CAP_MUST_HAVE_KERN << HFI1_CAP_USER_SHIFT)) ^
-               ((cap_mask & HFI1_CAP_MUST_HAVE_KERN) << HFI1_CAP_USER_SHIFT);
-       cap_mask &= ~diff;
-       /* Set the bitmask to the final set */
-       *cap_mask_ptr = cap_mask;
-done:
-       return ret;
-}
-
-static int hfi1_caps_get(char *buffer, const struct kernel_param *kp)
-{
-       unsigned long cap_mask = *(unsigned long *)kp->arg;
-
-       cap_mask &= ~HFI1_CAP_LOCKED_SMASK;
-       cap_mask |= ((cap_mask & HFI1_CAP_K2U) << HFI1_CAP_USER_SHIFT);
-
-       return scnprintf(buffer, PAGE_SIZE, "0x%lx", cap_mask);
-}
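A standalone sketch of the masking idea in hfi1_caps_set() above, with assumed
example values and the locked/reserved/kernel-must-have handling omitted:

	/*
	 * Simplified illustration (assumed values, not driver constants):
	 * only bits that both changed and are writable may be flipped;
	 * everything else keeps its previous state.
	 */
	#include <stdio.h>

	int main(void)
	{
		unsigned long cap_mask   = 0x0f; /* currently enabled bits */
		unsigned long value      = 0xf0; /* value written by the admin */
		unsigned long write_mask = 0x3c; /* bits allowed to change */
		unsigned long diff;

		diff = value ^ cap_mask;  /* bits the write tries to change */
		diff &= write_mask;       /* drop the non-writable changes */
		cap_mask = (cap_mask & ~diff) | (value & diff);

		printf("new cap_mask = %#lx\n", cap_mask); /* prints 0x33 */
		return 0;
	}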
-
-const char *get_unit_name(int unit)
-{
-       static char iname[16];
-
-       snprintf(iname, sizeof(iname), DRIVER_NAME "_%u", unit);
-       return iname;
-}
-
-const char *get_card_name(struct rvt_dev_info *rdi)
-{
-       struct hfi1_ibdev *ibdev = container_of(rdi, struct hfi1_ibdev, rdi);
-       struct hfi1_devdata *dd = container_of(ibdev,
-                                              struct hfi1_devdata, verbs_dev);
-       return get_unit_name(dd->unit);
-}
-
-struct pci_dev *get_pci_dev(struct rvt_dev_info *rdi)
-{
-       struct hfi1_ibdev *ibdev = container_of(rdi, struct hfi1_ibdev, rdi);
-       struct hfi1_devdata *dd = container_of(ibdev,
-                                              struct hfi1_devdata, verbs_dev);
-       return dd->pcidev;
-}
-
-/*
- * Return count of units with at least one port ACTIVE.
- */
-int hfi1_count_active_units(void)
-{
-       struct hfi1_devdata *dd;
-       struct hfi1_pportdata *ppd;
-       unsigned long flags;
-       int pidx, nunits_active = 0;
-
-       spin_lock_irqsave(&hfi1_devs_lock, flags);
-       list_for_each_entry(dd, &hfi1_dev_list, list) {
-               if (!(dd->flags & HFI1_PRESENT) || !dd->kregbase)
-                       continue;
-               for (pidx = 0; pidx < dd->num_pports; ++pidx) {
-                       ppd = dd->pport + pidx;
-                       if (ppd->lid && ppd->linkup) {
-                               nunits_active++;
-                               break;
-                       }
-               }
-       }
-       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
-       return nunits_active;
-}
-
-/*
- * Return count of all units, optionally return in arguments
- * the number of usable (present) units, and the number of
- * ports that are up.
- */
-int hfi1_count_units(int *npresentp, int *nupp)
-{
-       int nunits = 0, npresent = 0, nup = 0;
-       struct hfi1_devdata *dd;
-       unsigned long flags;
-       int pidx;
-       struct hfi1_pportdata *ppd;
-
-       spin_lock_irqsave(&hfi1_devs_lock, flags);
-
-       list_for_each_entry(dd, &hfi1_dev_list, list) {
-               nunits++;
-               if ((dd->flags & HFI1_PRESENT) && dd->kregbase)
-                       npresent++;
-               for (pidx = 0; pidx < dd->num_pports; ++pidx) {
-                       ppd = dd->pport + pidx;
-                       if (ppd->lid && ppd->linkup)
-                               nup++;
-               }
-       }
-
-       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
-
-       if (npresentp)
-               *npresentp = npresent;
-       if (nupp)
-               *nupp = nup;
-
-       return nunits;
-}
-
-/*
- * Get address of eager buffer from its index (allocated in chunks, not
- * contiguous).
- */
-static inline void *get_egrbuf(const struct hfi1_ctxtdata *rcd, u64 rhf,
-                              u8 *update)
-{
-       u32 idx = rhf_egr_index(rhf), offset = rhf_egr_buf_offset(rhf);
-
-       *update |= !(idx & (rcd->egrbufs.threshold - 1)) && !offset;
-       return (void *)(((u64)(rcd->egrbufs.rcvtids[idx].addr)) +
-                       (offset * RCV_BUF_BLOCK_SIZE));
-}
-
-/*
- * Validate and encode a given RcvArray Buffer size.
- * The function will check whether the given size falls within
- * allowed size ranges for the respective type and, optionally,
- * return the proper encoding.
- */
-inline int hfi1_rcvbuf_validate(u32 size, u8 type, u16 *encoded)
-{
-       if (unlikely(!PAGE_ALIGNED(size)))
-               return 0;
-       if (unlikely(size < MIN_EAGER_BUFFER))
-               return 0;
-       if (size >
-           (type == PT_EAGER ? MAX_EAGER_BUFFER : MAX_EXPECTED_BUFFER))
-               return 0;
-       if (encoded)
-               *encoded = ilog2(size / PAGE_SIZE) + 1;
-       return 1;
-}
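A quick worked example of the encoding above, assuming a 4 KiB PAGE_SIZE: a
64 KiB eager buffer is page aligned, within range, and encodes as
ilog2(65536 / 4096) + 1 = ilog2(16) + 1 = 5.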
-
-static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd,
-                      struct hfi1_packet *packet)
-{
-       struct hfi1_message_header *rhdr = packet->hdr;
-       u32 rte = rhf_rcv_type_err(packet->rhf);
-       int lnh = be16_to_cpu(rhdr->lrh[0]) & 3;
-       struct hfi1_ibport *ibp = &ppd->ibport_data;
-       struct hfi1_devdata *dd = ppd->dd;
-       struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
-
-       if (packet->rhf & (RHF_VCRC_ERR | RHF_ICRC_ERR))
-               return;
-
-       if (packet->rhf & RHF_TID_ERR) {
-               /* For TIDERR and RC QPs preemptively schedule a NAK */
-               struct hfi1_ib_header *hdr = (struct hfi1_ib_header *)rhdr;
-               struct hfi1_other_headers *ohdr = NULL;
-               u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */
-               u16 lid  = be16_to_cpu(hdr->lrh[1]);
-               u32 qp_num;
-               u32 rcv_flags = 0;
-
-               /* Sanity check packet */
-               if (tlen < 24)
-                       goto drop;
-
-               /* Check for GRH */
-               if (lnh == HFI1_LRH_BTH) {
-                       ohdr = &hdr->u.oth;
-               } else if (lnh == HFI1_LRH_GRH) {
-                       u32 vtf;
-
-                       ohdr = &hdr->u.l.oth;
-                       if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR)
-                               goto drop;
-                       vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow);
-                       if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION)
-                               goto drop;
-                       rcv_flags |= HFI1_HAS_GRH;
-               } else {
-                       goto drop;
-               }
-               /* Get the destination QP number. */
-               qp_num = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
-               if (lid < be16_to_cpu(IB_MULTICAST_LID_BASE)) {
-                       struct rvt_qp *qp;
-                       unsigned long flags;
-
-                       rcu_read_lock();
-                       qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
-                       if (!qp) {
-                               rcu_read_unlock();
-                               goto drop;
-                       }
-
-                       /*
-                        * Handle only RC QPs - for other QP types drop error
-                        * packet.
-                        */
-                       spin_lock_irqsave(&qp->r_lock, flags);
-
-                       /* Check for valid receive state. */
-                       if (!(ib_rvt_state_ops[qp->state] &
-                             RVT_PROCESS_RECV_OK)) {
-                               ibp->rvp.n_pkt_drops++;
-                       }
-
-                       switch (qp->ibqp.qp_type) {
-                       case IB_QPT_RC:
-                               hfi1_rc_hdrerr(
-                                       rcd,
-                                       hdr,
-                                       rcv_flags,
-                                       qp);
-                               break;
-                       default:
-                               /* For now don't handle any other QP types */
-                               break;
-                       }
-
-                       spin_unlock_irqrestore(&qp->r_lock, flags);
-                       rcu_read_unlock();
-               } /* Unicast QP */
-       } /* Valid packet with TIDErr */
-
-       /* handle "RcvTypeErr" flags */
-       switch (rte) {
-       case RHF_RTE_ERROR_OP_CODE_ERR:
-       {
-               u32 opcode;
-               void *ebuf = NULL;
-               __be32 *bth = NULL;
-
-               if (rhf_use_egr_bfr(packet->rhf))
-                       ebuf = packet->ebuf;
-
-               if (!ebuf)
-                       goto drop; /* this should never happen */
-
-               if (lnh == HFI1_LRH_BTH)
-                       bth = (__be32 *)ebuf;
-               else if (lnh == HFI1_LRH_GRH)
-                       bth = (__be32 *)((char *)ebuf + sizeof(struct ib_grh));
-               else
-                       goto drop;
-
-               opcode = be32_to_cpu(bth[0]) >> 24;
-               opcode &= 0xff;
-
-               if (opcode == IB_OPCODE_CNP) {
-                       /*
-                        * Only in pre-B0 h/w is the CNP_OPCODE handled
-                        * via this code path.
-                        */
-                       struct rvt_qp *qp = NULL;
-                       u32 lqpn, rqpn;
-                       u16 rlid;
-                       u8 svc_type, sl, sc5;
-
-                       sc5  = (be16_to_cpu(rhdr->lrh[0]) >> 12) & 0xf;
-                       if (rhf_dc_info(packet->rhf))
-                               sc5 |= 0x10;
-                       sl = ibp->sc_to_sl[sc5];
-
-                       lqpn = be32_to_cpu(bth[1]) & RVT_QPN_MASK;
-                       rcu_read_lock();
-                       qp = rvt_lookup_qpn(rdi, &ibp->rvp, lqpn);
-                       if (!qp) {
-                               rcu_read_unlock();
-                               goto drop;
-                       }
-
-                       switch (qp->ibqp.qp_type) {
-                       case IB_QPT_UD:
-                               rlid = 0;
-                               rqpn = 0;
-                               svc_type = IB_CC_SVCTYPE_UD;
-                               break;
-                       case IB_QPT_UC:
-                               rlid = be16_to_cpu(rhdr->lrh[3]);
-                               rqpn = qp->remote_qpn;
-                               svc_type = IB_CC_SVCTYPE_UC;
-                               break;
-                       default:
-                               goto drop;
-                       }
-
-                       process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type);
-                       rcu_read_unlock();
-               }
-
-               packet->rhf &= ~RHF_RCV_TYPE_ERR_SMASK;
-               break;
-       }
-       default:
-               break;
-       }
-
-drop:
-       return;
-}
-
-static inline void init_packet(struct hfi1_ctxtdata *rcd,
-                              struct hfi1_packet *packet)
-{
-       packet->rsize = rcd->rcvhdrqentsize; /* words */
-       packet->maxcnt = rcd->rcvhdrq_cnt * packet->rsize; /* words */
-       packet->rcd = rcd;
-       packet->updegr = 0;
-       packet->etail = -1;
-       packet->rhf_addr = get_rhf_addr(rcd);
-       packet->rhf = rhf_to_cpu(packet->rhf_addr);
-       packet->rhqoff = rcd->head;
-       packet->numpkt = 0;
-       packet->rcv_flags = 0;
-}
-
-static void process_ecn(struct rvt_qp *qp, struct hfi1_ib_header *hdr,
-                       struct hfi1_other_headers *ohdr,
-                       u64 rhf, u32 bth1, struct ib_grh *grh)
-{
-       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
-       u32 rqpn = 0;
-       u16 rlid;
-       u8 sc5, svc_type;
-
-       switch (qp->ibqp.qp_type) {
-       case IB_QPT_SMI:
-       case IB_QPT_GSI:
-       case IB_QPT_UD:
-               rlid = be16_to_cpu(hdr->lrh[3]);
-               rqpn = be32_to_cpu(ohdr->u.ud.deth[1]) & RVT_QPN_MASK;
-               svc_type = IB_CC_SVCTYPE_UD;
-               break;
-       case IB_QPT_UC:
-               rlid = qp->remote_ah_attr.dlid;
-               rqpn = qp->remote_qpn;
-               svc_type = IB_CC_SVCTYPE_UC;
-               break;
-       case IB_QPT_RC:
-               rlid = qp->remote_ah_attr.dlid;
-               rqpn = qp->remote_qpn;
-               svc_type = IB_CC_SVCTYPE_RC;
-               break;
-       default:
-               return;
-       }
-
-       sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
-       if (rhf_dc_info(rhf))
-               sc5 |= 0x10;
-
-       if (bth1 & HFI1_FECN_SMASK) {
-               u16 pkey = (u16)be32_to_cpu(ohdr->bth[0]);
-               u16 dlid = be16_to_cpu(hdr->lrh[1]);
-
-               return_cnp(ibp, qp, rqpn, pkey, dlid, rlid, sc5, grh);
-       }
-
-       if (bth1 & HFI1_BECN_SMASK) {
-               struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-               u32 lqpn = bth1 & RVT_QPN_MASK;
-               u8 sl = ibp->sc_to_sl[sc5];
-
-               process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type);
-       }
-}
-
-struct ps_mdata {
-       struct hfi1_ctxtdata *rcd;
-       u32 rsize;
-       u32 maxcnt;
-       u32 ps_head;
-       u32 ps_tail;
-       u32 ps_seq;
-};
-
-static inline void init_ps_mdata(struct ps_mdata *mdata,
-                                struct hfi1_packet *packet)
-{
-       struct hfi1_ctxtdata *rcd = packet->rcd;
-
-       mdata->rcd = rcd;
-       mdata->rsize = packet->rsize;
-       mdata->maxcnt = packet->maxcnt;
-       mdata->ps_head = packet->rhqoff;
-
-       if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) {
-               mdata->ps_tail = get_rcvhdrtail(rcd);
-               if (rcd->ctxt == HFI1_CTRL_CTXT)
-                       mdata->ps_seq = rcd->seq_cnt;
-               else
-                       mdata->ps_seq = 0; /* not used with DMA_RTAIL */
-       } else {
-               mdata->ps_tail = 0; /* used only with DMA_RTAIL*/
-               mdata->ps_seq = rcd->seq_cnt;
-       }
-}
-
-static inline int ps_done(struct ps_mdata *mdata, u64 rhf,
-                         struct hfi1_ctxtdata *rcd)
-{
-       if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL))
-               return mdata->ps_head == mdata->ps_tail;
-       return mdata->ps_seq != rhf_rcv_seq(rhf);
-}
-
-static inline int ps_skip(struct ps_mdata *mdata, u64 rhf,
-                         struct hfi1_ctxtdata *rcd)
-{
-       /*
-        * Control context can potentially receive an invalid rhf.
-        * Drop such packets.
-        */
-       if ((rcd->ctxt == HFI1_CTRL_CTXT) && (mdata->ps_head != mdata->ps_tail))
-               return mdata->ps_seq != rhf_rcv_seq(rhf);
-
-       return 0;
-}
-
-static inline void update_ps_mdata(struct ps_mdata *mdata,
-                                  struct hfi1_ctxtdata *rcd)
-{
-       mdata->ps_head += mdata->rsize;
-       if (mdata->ps_head >= mdata->maxcnt)
-               mdata->ps_head = 0;
-
-       /* Control context must do seq counting */
-       if (!HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ||
-           (rcd->ctxt == HFI1_CTRL_CTXT)) {
-               if (++mdata->ps_seq > 13)
-                       mdata->ps_seq = 1;
-       }
-}
-
-/*
- * prescan_rxq - search through the receive queue looking for packets
- * containing Explicit Congestion Notifications (FECNs or BECNs).
- * When an ECN is found, process the Congestion Notification, and toggle
- * it off.
- * This is declared as a macro to allow quick checking of the port to avoid
- * the overhead of a function call if not enabled.
- */
-#define prescan_rxq(rcd, packet) \
-       do { \
-               if (rcd->ppd->cc_prescan) \
-                       __prescan_rxq(packet); \
-       } while (0)
-static void __prescan_rxq(struct hfi1_packet *packet)
-{
-       struct hfi1_ctxtdata *rcd = packet->rcd;
-       struct ps_mdata mdata;
-
-       init_ps_mdata(&mdata, packet);
-
-       while (1) {
-               struct hfi1_devdata *dd = rcd->dd;
-               struct hfi1_ibport *ibp = &rcd->ppd->ibport_data;
-               __le32 *rhf_addr = (__le32 *)rcd->rcvhdrq + mdata.ps_head +
-                                        dd->rhf_offset;
-               struct rvt_qp *qp;
-               struct hfi1_ib_header *hdr;
-               struct hfi1_other_headers *ohdr;
-               struct ib_grh *grh = NULL;
-               struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
-               u64 rhf = rhf_to_cpu(rhf_addr);
-               u32 etype = rhf_rcv_type(rhf), qpn, bth1;
-               int is_ecn = 0;
-               u8 lnh;
-
-               if (ps_done(&mdata, rhf, rcd))
-                       break;
-
-               if (ps_skip(&mdata, rhf, rcd))
-                       goto next;
-
-               if (etype != RHF_RCV_TYPE_IB)
-                       goto next;
-
-               hdr = (struct hfi1_ib_header *)
-                       hfi1_get_msgheader(dd, rhf_addr);
-               lnh = be16_to_cpu(hdr->lrh[0]) & 3;
-
-               if (lnh == HFI1_LRH_BTH) {
-                       ohdr = &hdr->u.oth;
-               } else if (lnh == HFI1_LRH_GRH) {
-                       ohdr = &hdr->u.l.oth;
-                       grh = &hdr->u.l.grh;
-               } else {
-                       goto next; /* just in case */
-               }
-               bth1 = be32_to_cpu(ohdr->bth[1]);
-               is_ecn = !!(bth1 & (HFI1_FECN_SMASK | HFI1_BECN_SMASK));
-
-               if (!is_ecn)
-                       goto next;
-
-               qpn = bth1 & RVT_QPN_MASK;
-               rcu_read_lock();
-               qp = rvt_lookup_qpn(rdi, &ibp->rvp, qpn);
-
-               if (!qp) {
-                       rcu_read_unlock();
-                       goto next;
-               }
-
-               process_ecn(qp, hdr, ohdr, rhf, bth1, grh);
-               rcu_read_unlock();
-
-               /* turn off BECN, FECN */
-               bth1 &= ~(HFI1_FECN_SMASK | HFI1_BECN_SMASK);
-               ohdr->bth[1] = cpu_to_be32(bth1);
-next:
-               update_ps_mdata(&mdata, rcd);
-       }
-}
-
-static inline int skip_rcv_packet(struct hfi1_packet *packet, int thread)
-{
-       int ret = RCV_PKT_OK;
-
-       /* Set up for the next packet */
-       packet->rhqoff += packet->rsize;
-       if (packet->rhqoff >= packet->maxcnt)
-               packet->rhqoff = 0;
-
-       packet->numpkt++;
-       if (unlikely((packet->numpkt & (MAX_PKT_RECV - 1)) == 0)) {
-               if (thread) {
-                       cond_resched();
-               } else {
-                       ret = RCV_PKT_LIMIT;
-                       this_cpu_inc(*packet->rcd->dd->rcv_limit);
-               }
-       }
-
-       packet->rhf_addr = (__le32 *)packet->rcd->rcvhdrq + packet->rhqoff +
-                                    packet->rcd->dd->rhf_offset;
-       packet->rhf = rhf_to_cpu(packet->rhf_addr);
-
-       return ret;
-}
-
-static inline int process_rcv_packet(struct hfi1_packet *packet, int thread)
-{
-       int ret = RCV_PKT_OK;
-
-       packet->hdr = hfi1_get_msgheader(packet->rcd->dd,
-                                        packet->rhf_addr);
-       packet->hlen = (u8 *)packet->rhf_addr - (u8 *)packet->hdr;
-       packet->etype = rhf_rcv_type(packet->rhf);
-       /* total length */
-       packet->tlen = rhf_pkt_len(packet->rhf); /* in bytes */
-       /* retrieve eager buffer details */
-       packet->ebuf = NULL;
-       if (rhf_use_egr_bfr(packet->rhf)) {
-               packet->etail = rhf_egr_index(packet->rhf);
-               packet->ebuf = get_egrbuf(packet->rcd, packet->rhf,
-                                &packet->updegr);
-               /*
-                * Prefetch the contents of the eager buffer.  It is
-                * OK to send a negative length to prefetch_range().
-                * The +2 is the size of the RHF.
-                */
-               prefetch_range(packet->ebuf,
-                              packet->tlen - ((packet->rcd->rcvhdrqentsize -
-                                              (rhf_hdrq_offset(packet->rhf)
-                                               + 2)) * 4));
-       }
-
-       /*
-        * Call a type specific handler for the packet. We
-        * should be able to trust that etype won't be beyond
- * the range of valid indexes. If it is, something is really
-        * wrong and we can probably just let things come
-        * crashing down. There is no need to eat another
-        * comparison in this performance critical code.
-        */
-       packet->rcd->dd->rhf_rcv_function_map[packet->etype](packet);
-       packet->numpkt++;
-
-       /* Set up for the next packet */
-       packet->rhqoff += packet->rsize;
-       if (packet->rhqoff >= packet->maxcnt)
-               packet->rhqoff = 0;
-
-       if (unlikely((packet->numpkt & (MAX_PKT_RECV - 1)) == 0)) {
-               if (thread) {
-                       cond_resched();
-               } else {
-                       ret = RCV_PKT_LIMIT;
-                       this_cpu_inc(*packet->rcd->dd->rcv_limit);
-               }
-       }
-
-       packet->rhf_addr = (__le32 *)packet->rcd->rcvhdrq + packet->rhqoff +
-                                     packet->rcd->dd->rhf_offset;
-       packet->rhf = rhf_to_cpu(packet->rhf_addr);
-
-       return ret;
-}
-
-static inline void process_rcv_update(int last, struct hfi1_packet *packet)
-{
-       /*
-        * Update head regs etc., every 16 packets, if not last pkt,
-        * to help prevent rcvhdrq overflows, when many packets
-        * are processed and queue is nearly full.
-        * Don't request an interrupt for intermediate updates.
-        */
-       if (!last && !(packet->numpkt & 0xf)) {
-               update_usrhead(packet->rcd, packet->rhqoff, packet->updegr,
-                              packet->etail, 0, 0);
-               packet->updegr = 0;
-       }
-       packet->rcv_flags = 0;
-}
-
-static inline void finish_packet(struct hfi1_packet *packet)
-{
-       /*
-        * Nothing we need to free for the packet.
-        *
-        * The only thing we need to do is a final update and call for an
-        * interrupt
-        */
-       update_usrhead(packet->rcd, packet->rcd->head, packet->updegr,
-                      packet->etail, rcv_intr_dynamic, packet->numpkt);
-}
-
-static inline void process_rcv_qp_work(struct hfi1_packet *packet)
-{
-       struct hfi1_ctxtdata *rcd;
-       struct rvt_qp *qp, *nqp;
-
-       rcd = packet->rcd;
-       rcd->head = packet->rhqoff;
-
-       /*
-        * Iterate over all QPs waiting to respond.
-        * The list won't change since the IRQ is only run on one CPU.
-        */
-       list_for_each_entry_safe(qp, nqp, &rcd->qp_wait_list, rspwait) {
-               list_del_init(&qp->rspwait);
-               if (qp->r_flags & RVT_R_RSP_NAK) {
-                       qp->r_flags &= ~RVT_R_RSP_NAK;
-                       hfi1_send_rc_ack(rcd, qp, 0);
-               }
-               if (qp->r_flags & RVT_R_RSP_SEND) {
-                       unsigned long flags;
-
-                       qp->r_flags &= ~RVT_R_RSP_SEND;
-                       spin_lock_irqsave(&qp->s_lock, flags);
-                       if (ib_rvt_state_ops[qp->state] &
-                                       RVT_PROCESS_OR_FLUSH_SEND)
-                               hfi1_schedule_send(qp);
-                       spin_unlock_irqrestore(&qp->s_lock, flags);
-               }
-               if (atomic_dec_and_test(&qp->refcount))
-                       wake_up(&qp->wait);
-       }
-}
-
-/*
- * Handle receive interrupts when using the no dma rtail option.
- */
-int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd, int thread)
-{
-       u32 seq;
-       int last = RCV_PKT_OK;
-       struct hfi1_packet packet;
-
-       init_packet(rcd, &packet);
-       seq = rhf_rcv_seq(packet.rhf);
-       if (seq != rcd->seq_cnt) {
-               last = RCV_PKT_DONE;
-               goto bail;
-       }
-
-       prescan_rxq(rcd, &packet);
-
-       while (last == RCV_PKT_OK) {
-               last = process_rcv_packet(&packet, thread);
-               seq = rhf_rcv_seq(packet.rhf);
-               if (++rcd->seq_cnt > 13)
-                       rcd->seq_cnt = 1;
-               if (seq != rcd->seq_cnt)
-                       last = RCV_PKT_DONE;
-               process_rcv_update(last, &packet);
-       }
-       process_rcv_qp_work(&packet);
-bail:
-       finish_packet(&packet);
-       return last;
-}
-
-int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd, int thread)
-{
-       u32 hdrqtail;
-       int last = RCV_PKT_OK;
-       struct hfi1_packet packet;
-
-       init_packet(rcd, &packet);
-       hdrqtail = get_rcvhdrtail(rcd);
-       if (packet.rhqoff == hdrqtail) {
-               last = RCV_PKT_DONE;
-               goto bail;
-       }
-       smp_rmb();  /* prevent speculative reads of dma'ed hdrq */
-
-       prescan_rxq(rcd, &packet);
-
-       while (last == RCV_PKT_OK) {
-               last = process_rcv_packet(&packet, thread);
-               if (packet.rhqoff == hdrqtail)
-                       last = RCV_PKT_DONE;
-               process_rcv_update(last, &packet);
-       }
-       process_rcv_qp_work(&packet);
-bail:
-       finish_packet(&packet);
-       return last;
-}
-
-static inline void set_all_nodma_rtail(struct hfi1_devdata *dd)
-{
-       int i;
-
-       for (i = HFI1_CTRL_CTXT + 1; i < dd->first_user_ctxt; i++)
-               dd->rcd[i]->do_interrupt =
-                       &handle_receive_interrupt_nodma_rtail;
-}
-
-static inline void set_all_dma_rtail(struct hfi1_devdata *dd)
-{
-       int i;
-
-       for (i = HFI1_CTRL_CTXT + 1; i < dd->first_user_ctxt; i++)
-               dd->rcd[i]->do_interrupt =
-                       &handle_receive_interrupt_dma_rtail;
-}
-
-void set_all_slowpath(struct hfi1_devdata *dd)
-{
-       int i;
-
-       /* HFI1_CTRL_CTXT must always use the slow path interrupt handler */
-       for (i = HFI1_CTRL_CTXT + 1; i < dd->first_user_ctxt; i++)
-               dd->rcd[i]->do_interrupt = &handle_receive_interrupt;
-}
-
-static inline int set_armed_to_active(struct hfi1_ctxtdata *rcd,
-                                     struct hfi1_packet packet,
-                                     struct hfi1_devdata *dd)
-{
-       struct work_struct *lsaw = &rcd->ppd->linkstate_active_work;
-       struct hfi1_message_header *hdr = hfi1_get_msgheader(packet.rcd->dd,
-                                                            packet.rhf_addr);
-
-       if (hdr2sc(hdr, packet.rhf) != 0xf) {
-               int hwstate = read_logical_state(dd);
-
-               if (hwstate != LSTATE_ACTIVE) {
-                       dd_dev_info(dd, "Unexpected link state %d\n", hwstate);
-                       return 0;
-               }
-
-               queue_work(rcd->ppd->hfi1_wq, lsaw);
-               return 1;
-       }
-       return 0;
-}
-
-/*
- * handle_receive_interrupt - receive a packet
- * @rcd: the context
- *
- * Called from interrupt handler for errors or receive interrupt.
- * This is the slow path interrupt handler.
- */
-int handle_receive_interrupt(struct hfi1_ctxtdata *rcd, int thread)
-{
-       struct hfi1_devdata *dd = rcd->dd;
-       u32 hdrqtail;
-       int needset, last = RCV_PKT_OK;
-       struct hfi1_packet packet;
-       int skip_pkt = 0;
-
-       /* Control context will always use the slow path interrupt handler */
-       needset = (rcd->ctxt == HFI1_CTRL_CTXT) ? 0 : 1;
-
-       init_packet(rcd, &packet);
-
-       if (!HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) {
-               u32 seq = rhf_rcv_seq(packet.rhf);
-
-               if (seq != rcd->seq_cnt) {
-                       last = RCV_PKT_DONE;
-                       goto bail;
-               }
-               hdrqtail = 0;
-       } else {
-               hdrqtail = get_rcvhdrtail(rcd);
-               if (packet.rhqoff == hdrqtail) {
-                       last = RCV_PKT_DONE;
-                       goto bail;
-               }
-               smp_rmb();  /* prevent speculative reads of dma'ed hdrq */
-
-               /*
-                * Control context can potentially receive an invalid
-                * rhf. Drop such packets.
-                */
-               if (rcd->ctxt == HFI1_CTRL_CTXT) {
-                       u32 seq = rhf_rcv_seq(packet.rhf);
-
-                       if (seq != rcd->seq_cnt)
-                               skip_pkt = 1;
-               }
-       }
-
-       prescan_rxq(rcd, &packet);
-
-       while (last == RCV_PKT_OK) {
-               if (unlikely(dd->do_drop &&
-                            atomic_xchg(&dd->drop_packet, DROP_PACKET_OFF) ==
-                            DROP_PACKET_ON)) {
-                       dd->do_drop = 0;
-
-                       /* On to the next packet */
-                       packet.rhqoff += packet.rsize;
-                       packet.rhf_addr = (__le32 *)rcd->rcvhdrq +
-                                         packet.rhqoff +
-                                         dd->rhf_offset;
-                       packet.rhf = rhf_to_cpu(packet.rhf_addr);
-
-               } else if (skip_pkt) {
-                       last = skip_rcv_packet(&packet, thread);
-                       skip_pkt = 0;
-               } else {
-                       /* Auto activate link on non-SC15 packet receive */
-                       if (unlikely(rcd->ppd->host_link_state ==
-                                    HLS_UP_ARMED) &&
-                           set_armed_to_active(rcd, packet, dd))
-                               goto bail;
-                       last = process_rcv_packet(&packet, thread);
-               }
-
-               if (!HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) {
-                       u32 seq = rhf_rcv_seq(packet.rhf);
-
-                       if (++rcd->seq_cnt > 13)
-                               rcd->seq_cnt = 1;
-                       if (seq != rcd->seq_cnt)
-                               last = RCV_PKT_DONE;
-                       if (needset) {
-                               dd_dev_info(dd, "Switching to NO_DMA_RTAIL\n");
-                               set_all_nodma_rtail(dd);
-                               needset = 0;
-                       }
-               } else {
-                       if (packet.rhqoff == hdrqtail)
-                               last = RCV_PKT_DONE;
-                       /*
-                        * Control context can potentially receive an invalid
-                        * rhf. Drop such packets.
-                        */
-                       if (rcd->ctxt == HFI1_CTRL_CTXT) {
-                               u32 seq = rhf_rcv_seq(packet.rhf);
-
-                               if (++rcd->seq_cnt > 13)
-                                       rcd->seq_cnt = 1;
-                               if (!last && (seq != rcd->seq_cnt))
-                                       skip_pkt = 1;
-                       }
-
-                       if (needset) {
-                               dd_dev_info(dd,
-                                           "Switching to DMA_RTAIL\n");
-                               set_all_dma_rtail(dd);
-                               needset = 0;
-                       }
-               }
-
-               process_rcv_update(last, &packet);
-       }
-
-       process_rcv_qp_work(&packet);
-
-bail:
-       /*
-        * Always write head at end, and setup rcv interrupt, even
-        * if no packets were processed.
-        */
-       finish_packet(&packet);
-       return last;
-}
-
-/*
- * We may discover in the interrupt that the hardware link state has
- * changed from ARMED to ACTIVE (due to the arrival of a non-SC15 packet),
- * and we need to update the driver's notion of the link state.  We cannot
- * run set_link_state from interrupt context, so we queue this function on
- * a workqueue.
- *
- * We delay the regular interrupt processing until after the state changes
- * so that the link will be in the correct state by the time any application
- * we wake up attempts to send a reply to any message it received.
- * (Subsequent receive interrupts may possibly force the wakeup before we
- * update the link state.)
- *
- * The rcd is freed in hfi1_free_ctxtdata after hfi1_postinit_cleanup invokes
- * dd->f_cleanup(dd) to disable the interrupt handler and flush workqueues,
- * so we're safe from use-after-free of the rcd.
- */
-void receive_interrupt_work(struct work_struct *work)
-{
-       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
-                                                 linkstate_active_work);
-       struct hfi1_devdata *dd = ppd->dd;
-       int i;
-
-       /* Received non-SC15 packet implies neighbor_normal */
-       ppd->neighbor_normal = 1;
-       set_link_state(ppd, HLS_UP_ACTIVE);
-
-       /*
-        * Interrupt all kernel contexts that could have had an
-        * interrupt during auto activation.
-        */
-       for (i = HFI1_CTRL_CTXT; i < dd->first_user_ctxt; i++)
-               force_recv_intr(dd->rcd[i]);
-}
-
-/*
- * Convert a given MTU size to the on-wire MAD packet enumeration.
- * Return default_if_bad if the size is invalid.
- */
-int mtu_to_enum(u32 mtu, int default_if_bad)
-{
-       switch (mtu) {
-       case     0: return OPA_MTU_0;
-       case   256: return OPA_MTU_256;
-       case   512: return OPA_MTU_512;
-       case  1024: return OPA_MTU_1024;
-       case  2048: return OPA_MTU_2048;
-       case  4096: return OPA_MTU_4096;
-       case  8192: return OPA_MTU_8192;
-       case 10240: return OPA_MTU_10240;
-       }
-       return default_if_bad;
-}
-
-u16 enum_to_mtu(int mtu)
-{
-       switch (mtu) {
-       case OPA_MTU_0:     return 0;
-       case OPA_MTU_256:   return 256;
-       case OPA_MTU_512:   return 512;
-       case OPA_MTU_1024:  return 1024;
-       case OPA_MTU_2048:  return 2048;
-       case OPA_MTU_4096:  return 4096;
-       case OPA_MTU_8192:  return 8192;
-       case OPA_MTU_10240: return 10240;
-       default: return 0xffff;
-       }
-}
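/*
 * Illustrative sketch, not part of the original file: the two helpers above
 * are inverses for the sizes OPA defines, and mtu_to_enum() falls back to
 * the caller-supplied default for anything else.  Assumes the OPA_MTU_*
 * values already used above.
 */
static void __maybe_unused mtu_enum_example(void)
{
        int e = mtu_to_enum(4096, OPA_MTU_2048);   /* valid size -> OPA_MTU_4096 */
        u16 m = enum_to_mtu(e);                    /* back to 4096 */
        int bad = mtu_to_enum(3000, OPA_MTU_2048); /* invalid -> default (OPA_MTU_2048) */

        (void)m;
        (void)bad;
}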
-
-/*
- * set_mtu - set the MTU
- * @ppd: the per port data
- *
- * We can handle "any" incoming size; the issue here is whether we
- * need to restrict our outgoing size.  We do not deal with what happens
- * to programs that are already running when the size changes.
- */
-int set_mtu(struct hfi1_pportdata *ppd)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       int i, drain, ret = 0, is_up = 0;
-
-       ppd->ibmtu = 0;
-       for (i = 0; i < ppd->vls_supported; i++)
-               if (ppd->ibmtu < dd->vld[i].mtu)
-                       ppd->ibmtu = dd->vld[i].mtu;
-       ppd->ibmaxlen = ppd->ibmtu + lrh_max_header_bytes(ppd->dd);
-
-       mutex_lock(&ppd->hls_lock);
-       if (ppd->host_link_state == HLS_UP_INIT ||
-           ppd->host_link_state == HLS_UP_ARMED ||
-           ppd->host_link_state == HLS_UP_ACTIVE)
-               is_up = 1;
-
-       drain = !is_ax(dd) && is_up;
-
-       if (drain)
-               /*
-                * MTU is specified per-VL. To ensure that no packet gets
-                * stuck (due, e.g., to the MTU for the packet's VL being
-                * reduced), empty the per-VL FIFOs before adjusting MTU.
-                */
-               ret = stop_drain_data_vls(dd);
-
-       if (ret) {
-               dd_dev_err(dd, "%s: cannot stop/drain VLs - refusing to change per-VL MTUs\n",
-                          __func__);
-               goto err;
-       }
-
-       hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_MTU, 0);
-
-       if (drain)
-               open_fill_data_vls(dd); /* reopen all VLs */
-
-err:
-       mutex_unlock(&ppd->hls_lock);
-
-       return ret;
-}
-
-int hfi1_set_lid(struct hfi1_pportdata *ppd, u32 lid, u8 lmc)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-
-       ppd->lid = lid;
-       ppd->lmc = lmc;
-       hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LIDLMC, 0);
-
-       dd_dev_info(dd, "IB%u:%u got a lid: 0x%x\n", dd->unit, ppd->port, lid);
-
-       return 0;
-}
-
-void shutdown_led_override(struct hfi1_pportdata *ppd)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-
-       /*
-        * This pairs with the memory barrier in hfi1_start_led_override to
-        * ensure that we read the correct state of LED beaconing represented
-        * by led_override_timer_active
-        */
-       smp_rmb();
-       if (atomic_read(&ppd->led_override_timer_active)) {
-               del_timer_sync(&ppd->led_override_timer);
-               atomic_set(&ppd->led_override_timer_active, 0);
-               /* Ensure the atomic_set is visible to all CPUs */
-               smp_wmb();
-       }
-
-       /* Hand control of the LED to the DC for normal operation */
-       write_csr(dd, DCC_CFG_LED_CNTRL, 0);
-}
-
-static void run_led_override(unsigned long opaque)
-{
-       struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)opaque;
-       struct hfi1_devdata *dd = ppd->dd;
-       unsigned long timeout;
-       int phase_idx;
-
-       if (!(dd->flags & HFI1_INITTED))
-               return;
-
-       phase_idx = ppd->led_override_phase & 1;
-
-       setextled(dd, phase_idx);
-
-       timeout = ppd->led_override_vals[phase_idx];
-
-       /* Set up for next phase */
-       ppd->led_override_phase = !ppd->led_override_phase;
-
-       mod_timer(&ppd->led_override_timer, jiffies + timeout);
-}
-
-/*
- * To have the LED blink in a particular pattern, provide timeon and timeoff
- * in milliseconds.
- * To turn off custom blinking and return to normal operation, use
- * shutdown_led_override()
- */
-void hfi1_start_led_override(struct hfi1_pportdata *ppd, unsigned int timeon,
-                            unsigned int timeoff)
-{
-       if (!(ppd->dd->flags & HFI1_INITTED))
-               return;
-
-       /* Convert to jiffies for direct use in timer */
-       ppd->led_override_vals[0] = msecs_to_jiffies(timeoff);
-       ppd->led_override_vals[1] = msecs_to_jiffies(timeon);
-
-       /* Arbitrarily start from LED on phase */
-       ppd->led_override_phase = 1;
-
-       /*
-        * If the timer has not already been started, do so. Use a "quick"
-        * timeout so the handler will be called soon to look at our request.
-        */
-       if (!timer_pending(&ppd->led_override_timer)) {
-               setup_timer(&ppd->led_override_timer, run_led_override,
-                           (unsigned long)ppd);
-               ppd->led_override_timer.expires = jiffies + 1;
-               add_timer(&ppd->led_override_timer);
-               atomic_set(&ppd->led_override_timer_active, 1);
-               /* Ensure the atomic_set is visible to all CPUs */
-               smp_wmb();
-       }
-}
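/*
 * Illustrative usage sketch, not part of the original file: a caller that
 * wants the port LED to beacon at roughly 1 Hz with a 50% duty cycle starts
 * the override and later hands the LED back to the DC:
 */
static void __maybe_unused led_beacon_example(struct hfi1_pportdata *ppd)
{
        hfi1_start_led_override(ppd, 500, 500); /* 500 ms on, 500 ms off */
        /* ... beaconing runs from the override timer ... */
        shutdown_led_override(ppd);             /* restore normal LED control */
}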
-
-/**
- * hfi1_reset_device - reset the chip if possible
- * @unit: the device to reset
- *
- * Whether or not reset is successful, we attempt to re-initialize the chip
- * (that is, much like a driver unload/reload).  We clear the INITTED flag
- * so that the various entry points will fail until we reinitialize.  For
- * now, we only allow this if no user contexts are open that use chip resources
- */
-int hfi1_reset_device(int unit)
-{
-       int ret, i;
-       struct hfi1_devdata *dd = hfi1_lookup(unit);
-       struct hfi1_pportdata *ppd;
-       unsigned long flags;
-       int pidx;
-
-       if (!dd) {
-               ret = -ENODEV;
-               goto bail;
-       }
-
-       dd_dev_info(dd, "Reset on unit %u requested\n", unit);
-
-       if (!dd->kregbase || !(dd->flags & HFI1_PRESENT)) {
-               dd_dev_info(dd,
-                           "Invalid unit number %u or not initialized or not present\n",
-                           unit);
-               ret = -ENXIO;
-               goto bail;
-       }
-
-       spin_lock_irqsave(&dd->uctxt_lock, flags);
-       if (dd->rcd)
-               for (i = dd->first_user_ctxt; i < dd->num_rcv_contexts; i++) {
-                       if (!dd->rcd[i] || !dd->rcd[i]->cnt)
-                               continue;
-                       spin_unlock_irqrestore(&dd->uctxt_lock, flags);
-                       ret = -EBUSY;
-                       goto bail;
-               }
-       spin_unlock_irqrestore(&dd->uctxt_lock, flags);
-
-       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
-               ppd = dd->pport + pidx;
-
-               shutdown_led_override(ppd);
-       }
-       if (dd->flags & HFI1_HAS_SEND_DMA)
-               sdma_exit(dd);
-
-       hfi1_reset_cpu_counters(dd);
-
-       ret = hfi1_init(dd, 1);
-
-       if (ret)
-               dd_dev_err(dd,
-                          "Reinitialize unit %u after reset failed with %d\n",
-                          unit, ret);
-       else
-               dd_dev_info(dd, "Reinitialized unit %u after resetting\n",
-                           unit);
-
-bail:
-       return ret;
-}
-
-void handle_eflags(struct hfi1_packet *packet)
-{
-       struct hfi1_ctxtdata *rcd = packet->rcd;
-       u32 rte = rhf_rcv_type_err(packet->rhf);
-
-       rcv_hdrerr(rcd, rcd->ppd, packet);
-       if (rhf_err_flags(packet->rhf))
-               dd_dev_err(rcd->dd,
-                          "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s%s] rte 0x%x\n",
-                          rcd->ctxt, packet->rhf,
-                          packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "",
-                          packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "",
-                          packet->rhf & RHF_DC_ERR ? "dc " : "",
-                          packet->rhf & RHF_TID_ERR ? "tid " : "",
-                          packet->rhf & RHF_LEN_ERR ? "len " : "",
-                          packet->rhf & RHF_ECC_ERR ? "ecc " : "",
-                          packet->rhf & RHF_VCRC_ERR ? "vcrc " : "",
-                          packet->rhf & RHF_ICRC_ERR ? "icrc " : "",
-                          rte);
-}
-
-/*
- * The following functions are called by the interrupt handler. They are type
- * specific handlers for each packet type.
- */
-int process_receive_ib(struct hfi1_packet *packet)
-{
-       trace_hfi1_rcvhdr(packet->rcd->ppd->dd,
-                         packet->rcd->ctxt,
-                         rhf_err_flags(packet->rhf),
-                         RHF_RCV_TYPE_IB,
-                         packet->hlen,
-                         packet->tlen,
-                         packet->updegr,
-                         rhf_egr_index(packet->rhf));
-
-       if (unlikely(rhf_err_flags(packet->rhf))) {
-               handle_eflags(packet);
-               return RHF_RCV_CONTINUE;
-       }
-
-       hfi1_ib_rcv(packet);
-       return RHF_RCV_CONTINUE;
-}
-
-int process_receive_bypass(struct hfi1_packet *packet)
-{
-       if (unlikely(rhf_err_flags(packet->rhf)))
-               handle_eflags(packet);
-
-       dd_dev_err(packet->rcd->dd,
-                  "Bypass packets are not supported in normal operation. Dropping\n");
-       return RHF_RCV_CONTINUE;
-}
-
-int process_receive_error(struct hfi1_packet *packet)
-{
-       handle_eflags(packet);
-
-       if (unlikely(rhf_err_flags(packet->rhf)))
-               dd_dev_err(packet->rcd->dd,
-                          "Unhandled error packet received. Dropping.\n");
-
-       return RHF_RCV_CONTINUE;
-}
-
-int kdeth_process_expected(struct hfi1_packet *packet)
-{
-       if (unlikely(rhf_err_flags(packet->rhf)))
-               handle_eflags(packet);
-
-       dd_dev_err(packet->rcd->dd,
-                  "Unhandled expected packet received. Dropping.\n");
-       return RHF_RCV_CONTINUE;
-}
-
-int kdeth_process_eager(struct hfi1_packet *packet)
-{
-       if (unlikely(rhf_err_flags(packet->rhf)))
-               handle_eflags(packet);
-
-       dd_dev_err(packet->rcd->dd,
-                  "Unhandled eager packet received. Dropping.\n");
-       return RHF_RCV_CONTINUE;
-}
-
-int process_receive_invalid(struct hfi1_packet *packet)
-{
-       dd_dev_err(packet->rcd->dd, "Invalid packet type %d. Dropping\n",
-                  rhf_rcv_type(packet->rhf));
-       return RHF_RCV_CONTINUE;
-}
diff --git a/drivers/staging/rdma/hfi1/efivar.c b/drivers/staging/rdma/hfi1/efivar.c
deleted file mode 100644 (file)
index 106349f..0000000
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "efivar.h"
-
-/* GUID for HFI1 variables in EFI */
-#define HFI1_EFIVAR_GUID EFI_GUID(0xc50a953e, 0xa8b2, 0x42a6, \
-               0xbf, 0x89, 0xd3, 0x33, 0xa6, 0xe9, 0xe6, 0xd4)
-/* largest EFI data size we expect */
-#define EFI_DATA_SIZE 4096
-
-/*
- * Read the named EFI variable.  Return the size of the actual data in *size
- * and a kmalloc'ed buffer in *return_data.  The caller must free the
- * data.  It is guaranteed that *return_data will be NULL and *size = 0
- * if this routine fails.
- *
- * Return 0 on success, -errno on failure.
- */
-static int read_efi_var(const char *name, unsigned long *size,
-                       void **return_data)
-{
-       efi_status_t status;
-       efi_char16_t *uni_name;
-       efi_guid_t guid;
-       unsigned long temp_size;
-       void *temp_buffer;
-       void *data;
-       int i;
-       int ret;
-
-       /* set failure return values */
-       *size = 0;
-       *return_data = NULL;
-
-       if (!efi_enabled(EFI_RUNTIME_SERVICES))
-               return -EOPNOTSUPP;
-
-       uni_name = kcalloc(strlen(name) + 1, sizeof(efi_char16_t), GFP_KERNEL);
-       temp_buffer = kzalloc(EFI_DATA_SIZE, GFP_KERNEL);
-
-       if (!uni_name || !temp_buffer) {
-               ret = -ENOMEM;
-               goto fail;
-       }
-
-       /* input: the size of the buffer */
-       temp_size = EFI_DATA_SIZE;
-
-       /* convert ASCII to unicode - it is a 1:1 mapping */
-       for (i = 0; name[i]; i++)
-               uni_name[i] = name[i];
-
-       /* need a variable for our GUID */
-       guid = HFI1_EFIVAR_GUID;
-
-       /* call into EFI runtime services */
-       status = efi.get_variable(
-                       uni_name,
-                       &guid,
-                       NULL,
-                       &temp_size,
-                       temp_buffer);
-
-       /*
-        * It would be nice to call efi_status_to_err() here, but that
-        * is in the EFIVAR_FS code and may not be compiled in.
-        * However, even that is insufficient since it does not cover
-        * EFI_BUFFER_TOO_SMALL which could be an important return.
-        * For now, just split out success or not found.
-        */
-       ret = status == EFI_SUCCESS   ? 0 :
-             status == EFI_NOT_FOUND ? -ENOENT :
-                                       -EINVAL;
-       if (ret)
-               goto fail;
-
-       /*
-        * We have successfully read the EFI variable into our
-        * temporary buffer.  Now allocate a correctly sized
-        * buffer.
-        */
-       data = kmemdup(temp_buffer, temp_size, GFP_KERNEL);
-       if (!data) {
-               ret = -ENOMEM;
-               goto fail;
-       }
-
-       *size = temp_size;
-       *return_data = data;
-
-fail:
-       kfree(uni_name);
-       kfree(temp_buffer);
-
-       return ret;
-}
-
-/*
- * Read an HFI1 EFI variable of the form:
- *     <PCIe address>-<kind>
- * Return a kmalloc'ed array and the size of the data.
- *
- * Returns 0 on success, -errno on failure.
- */
-int read_hfi1_efi_var(struct hfi1_devdata *dd, const char *kind,
-                     unsigned long *size, void **return_data)
-{
-       char name[64];
-
-       /* create a common prefix */
-       snprintf(name, sizeof(name), "%04x:%02x:%02x.%x-%s",
-                pci_domain_nr(dd->pcidev->bus),
-                dd->pcidev->bus->number,
-                PCI_SLOT(dd->pcidev->devfn),
-                PCI_FUNC(dd->pcidev->devfn),
-                kind);
-
-       return read_efi_var(name, size, return_data);
-}
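/*
 * Illustrative sketch, not part of the original file: for a device at PCI
 * address 0000:05:00.0, a kind string of "platform_config" (the kind name
 * here is only an example) resolves to the EFI variable
 * "0000:05:00.0-platform_config" under HFI1_EFIVAR_GUID.  The caller owns
 * the returned buffer and must free it:
 */
static int __maybe_unused efi_var_example(struct hfi1_devdata *dd)
{
        unsigned long size;
        void *data;
        int ret;

        ret = read_hfi1_efi_var(dd, "platform_config", &size, &data);
        if (ret)
                return ret;     /* on failure data is NULL and size is 0 */
        /* ... use the size bytes at data ... */
        kfree(data);
        return 0;
}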
diff --git a/drivers/staging/rdma/hfi1/efivar.h b/drivers/staging/rdma/hfi1/efivar.h
deleted file mode 100644 (file)
index 94e9e70..0000000
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#ifndef _HFI1_EFIVAR_H
-#define _HFI1_EFIVAR_H
-
-#include <linux/efi.h>
-
-#include "hfi.h"
-
-int read_hfi1_efi_var(struct hfi1_devdata *dd, const char *kind,
-                     unsigned long *size, void **return_data);
-
-#endif /* _HFI1_EFIVAR_H */
diff --git a/drivers/staging/rdma/hfi1/eprom.c b/drivers/staging/rdma/hfi1/eprom.c
deleted file mode 100644 (file)
index bd87715..0000000
+++ /dev/null
@@ -1,471 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#include <linux/delay.h>
-#include "hfi.h"
-#include "common.h"
-#include "eprom.h"
-
-/*
- * The EPROM is logically divided into three partitions:
- *     partition 0: the first 128K, visible from PCI ROM BAR
- *     partition 1: 4K config file (sector size)
- *     partition 2: the rest
- */
-#define P0_SIZE (128 * 1024)
-#define P1_SIZE   (4 * 1024)
-#define P1_START P0_SIZE
-#define P2_START (P0_SIZE + P1_SIZE)
-
-/* erase sizes supported by the controller */
-#define SIZE_4KB (4 * 1024)
-#define MASK_4KB (SIZE_4KB - 1)
-
-#define SIZE_32KB (32 * 1024)
-#define MASK_32KB (SIZE_32KB - 1)
-
-#define SIZE_64KB (64 * 1024)
-#define MASK_64KB (SIZE_64KB - 1)
-
-/* controller page size, in bytes */
-#define EP_PAGE_SIZE 256
-#define EEP_PAGE_MASK (EP_PAGE_SIZE - 1)
-
-/* controller commands */
-#define CMD_SHIFT 24
-#define CMD_NOP                            (0)
-#define CMD_PAGE_PROGRAM(addr)     ((0x02 << CMD_SHIFT) | addr)
-#define CMD_READ_DATA(addr)        ((0x03 << CMD_SHIFT) | addr)
-#define CMD_READ_SR1               ((0x05 << CMD_SHIFT))
-#define CMD_WRITE_ENABLE           ((0x06 << CMD_SHIFT))
-#define CMD_SECTOR_ERASE_4KB(addr)  ((0x20 << CMD_SHIFT) | addr)
-#define CMD_SECTOR_ERASE_32KB(addr) ((0x52 << CMD_SHIFT) | addr)
-#define CMD_CHIP_ERASE             ((0x60 << CMD_SHIFT))
-#define CMD_READ_MANUF_DEV_ID      ((0x90 << CMD_SHIFT))
-#define CMD_RELEASE_POWERDOWN_NOID  ((0xab << CMD_SHIFT))
-#define CMD_SECTOR_ERASE_64KB(addr) ((0xd8 << CMD_SHIFT) | addr)
-
-/* controller interface speeds */
-#define EP_SPEED_FULL 0x2      /* full speed */
-
-/* controller status register 1 bits */
-#define SR1_BUSY 0x1ull                /* the BUSY bit in SR1 */
-
-/* sleep length while waiting for controller */
-#define WAIT_SLEEP_US 100      /* must be larger than 5 (see usage) */
-#define COUNT_DELAY_SEC(n) ((n) * (1000000 / WAIT_SLEEP_US))
-
-/* GPIO pins */
-#define EPROM_WP_N BIT_ULL(14) /* EPROM write line */
-
-/*
- * How long to wait for the EPROM to become available, in ms.
- * The spec 32 Mb EPROM takes around 40s to erase then write.
- * Double it for safety.
- */
-#define EPROM_TIMEOUT 80000 /* ms */
-
-/*
- * Turn on external enable line that allows writing on the flash.
- */
-static void write_enable(struct hfi1_devdata *dd)
-{
-       /* raise signal */
-       write_csr(dd, ASIC_GPIO_OUT, read_csr(dd, ASIC_GPIO_OUT) | EPROM_WP_N);
-       /* raise enable */
-       write_csr(dd, ASIC_GPIO_OE, read_csr(dd, ASIC_GPIO_OE) | EPROM_WP_N);
-}
-
-/*
- * Turn off external enable line that allows writing on the flash.
- */
-static void write_disable(struct hfi1_devdata *dd)
-{
-       /* lower signal */
-       write_csr(dd, ASIC_GPIO_OUT, read_csr(dd, ASIC_GPIO_OUT) & ~EPROM_WP_N);
-       /* lower enable */
-       write_csr(dd, ASIC_GPIO_OE, read_csr(dd, ASIC_GPIO_OE) & ~EPROM_WP_N);
-}
-
-/*
- * Wait for the device to become not busy.  Must be called after all
- * write or erase operations.
- */
-static int wait_for_not_busy(struct hfi1_devdata *dd)
-{
-       unsigned long count = 0;
-       u64 reg;
-       int ret = 0;
-
-       /* starts page mode */
-       write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_READ_SR1);
-       while (1) {
-               usleep_range(WAIT_SLEEP_US - 5, WAIT_SLEEP_US + 5);
-               count++;
-               reg = read_csr(dd, ASIC_EEP_DATA);
-               if ((reg & SR1_BUSY) == 0)
-                       break;
-               /* 200s is the largest time for a 128Mb device */
-               if (count > COUNT_DELAY_SEC(200)) {
-                       dd_dev_err(dd, "waited too long for SPI FLASH busy to clear - failing\n");
-                       ret = -ETIMEDOUT;
-                       break; /* break, not goto - must stop page mode */
-               }
-       }
-
-       /* stop page mode with a NOP */
-       write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_NOP);
-
-       return ret;
-}
-
-/*
- * Read the device ID from the SPI controller.
- */
-static u32 read_device_id(struct hfi1_devdata *dd)
-{
-       /* read the Manufacture Device ID */
-       write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_READ_MANUF_DEV_ID);
-       return (u32)read_csr(dd, ASIC_EEP_DATA);
-}
-
-/*
- * Erase the whole flash.
- */
-static int erase_chip(struct hfi1_devdata *dd)
-{
-       int ret;
-
-       write_enable(dd);
-
-       write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_WRITE_ENABLE);
-       write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_CHIP_ERASE);
-       ret = wait_for_not_busy(dd);
-
-       write_disable(dd);
-
-       return ret;
-}
-
-/*
- * Erase a range.
- */
-static int erase_range(struct hfi1_devdata *dd, u32 start, u32 len)
-{
-       u32 end = start + len;
-       int ret = 0;
-
-       if (end < start)
-               return -EINVAL;
-
-       /* check the end points for the minimum erase */
-       if ((start & MASK_4KB) || (end & MASK_4KB)) {
-               dd_dev_err(dd,
-                          "%s: non-aligned range (0x%x,0x%x) for a 4KB erase\n",
-                          __func__, start, end);
-               return -EINVAL;
-       }
-
-       write_enable(dd);
-
-       while (start < end) {
-               write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_WRITE_ENABLE);
-               /* check in order of largest to smallest */
-               if (((start & MASK_64KB) == 0) && (start + SIZE_64KB <= end)) {
-                       write_csr(dd, ASIC_EEP_ADDR_CMD,
-                                 CMD_SECTOR_ERASE_64KB(start));
-                       start += SIZE_64KB;
-               } else if (((start & MASK_32KB) == 0) &&
-                          (start + SIZE_32KB <= end)) {
-                       write_csr(dd, ASIC_EEP_ADDR_CMD,
-                                 CMD_SECTOR_ERASE_32KB(start));
-                       start += SIZE_32KB;
-               } else {        /* 4KB will work */
-                       write_csr(dd, ASIC_EEP_ADDR_CMD,
-                                 CMD_SECTOR_ERASE_4KB(start));
-                       start += SIZE_4KB;
-               }
-               ret = wait_for_not_busy(dd);
-               if (ret)
-                       goto done;
-       }
-
-done:
-       write_disable(dd);
-
-       return ret;
-}
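/*
 * Illustrative note, not part of the original file: both end points must be
 * 4KB aligned, and the loop above then picks the largest erase command
 * (64KB, 32KB or 4KB) that still fits.  Erasing the 4KB config partition,
 * for instance, would be erase_range(dd, P1_START, P1_SIZE).
 */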
-
-/*
- * Read a 256 byte (64 dword) EPROM page.
- * All callers have verified the offset is at a page boundary.
- */
-static void read_page(struct hfi1_devdata *dd, u32 offset, u32 *result)
-{
-       int i;
-
-       write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_READ_DATA(offset));
-       for (i = 0; i < EP_PAGE_SIZE / sizeof(u32); i++)
-               result[i] = (u32)read_csr(dd, ASIC_EEP_DATA);
-       write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_NOP); /* close open page */
-}
-
-/*
- * Read length bytes starting at offset.  Copy to user address addr.
- */
-static int read_length(struct hfi1_devdata *dd, u32 start, u32 len, u64 addr)
-{
-       u32 offset;
-       u32 buffer[EP_PAGE_SIZE / sizeof(u32)];
-       int ret = 0;
-
-       /* reject anything not on an EPROM page boundary */
-       if ((start & EEP_PAGE_MASK) || (len & EEP_PAGE_MASK))
-               return -EINVAL;
-
-       for (offset = 0; offset < len; offset += EP_PAGE_SIZE) {
-               read_page(dd, start + offset, buffer);
-               if (copy_to_user((void __user *)(addr + offset),
-                                buffer, EP_PAGE_SIZE)) {
-                       ret = -EFAULT;
-                       goto done;
-               }
-       }
-
-done:
-       return ret;
-}
-
-/*
- * Write a 256 byte (64 dword) EPROM page.
- * All callers have verified the offset is at a page boundary.
- */
-static int write_page(struct hfi1_devdata *dd, u32 offset, u32 *data)
-{
-       int i;
-
-       write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_WRITE_ENABLE);
-       write_csr(dd, ASIC_EEP_DATA, data[0]);
-       write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_PAGE_PROGRAM(offset));
-       for (i = 1; i < EP_PAGE_SIZE / sizeof(u32); i++)
-               write_csr(dd, ASIC_EEP_DATA, data[i]);
-       /* will close the open page */
-       return wait_for_not_busy(dd);
-}
-
-/*
- * Write length bytes starting at offset.  Read from user address addr.
- */
-static int write_length(struct hfi1_devdata *dd, u32 start, u32 len, u64 addr)
-{
-       u32 offset;
-       u32 buffer[EP_PAGE_SIZE / sizeof(u32)];
-       int ret = 0;
-
-       /* reject anything not on an EPROM page boundary */
-       if ((start & EEP_PAGE_MASK) || (len & EEP_PAGE_MASK))
-               return -EINVAL;
-
-       write_enable(dd);
-
-       for (offset = 0; offset < len; offset += EP_PAGE_SIZE) {
-               if (copy_from_user(buffer, (void __user *)(addr + offset),
-                                  EP_PAGE_SIZE)) {
-                       ret = -EFAULT;
-                       goto done;
-               }
-               ret = write_page(dd, start + offset, buffer);
-               if (ret)
-                       goto done;
-       }
-
-done:
-       write_disable(dd);
-       return ret;
-}
-
-/* convert a range composite to a length, in bytes */
-static inline u32 extract_rlen(u32 composite)
-{
-       return (composite & 0xffff) * EP_PAGE_SIZE;
-}
-
-/* convert a range composite to a start, in bytes */
-static inline u32 extract_rstart(u32 composite)
-{
-       return (composite >> 16) * EP_PAGE_SIZE;
-}
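/*
 * Illustrative sketch, not part of the original file: the "composite" passed
 * in cmd->len for the range commands carries the start page in the upper 16
 * bits and the page count in the lower 16 bits, both in units of the 256
 * byte EPROM page.  Describing 4KB starting at the 128KB mark:
 */
static u32 __maybe_unused make_range_composite_example(void)
{
        u32 start_pages = (128 * 1024) / EP_PAGE_SIZE;  /* 512 pages in */
        u32 num_pages = (4 * 1024) / EP_PAGE_SIZE;      /* 16 pages long */
        u32 composite = (start_pages << 16) | num_pages;

        /* extract_rstart(composite) == 128 * 1024, extract_rlen(composite) == 4096 */
        return composite;
}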
-
-/*
- * Perform the given operation on the EPROM.  Called from user space.  The
- * user credentials have already been checked.
- *
- * Return 0 on success, -ERRNO on error
- */
-int handle_eprom_command(struct file *fp, const struct hfi1_cmd *cmd)
-{
-       struct hfi1_devdata *dd;
-       u32 dev_id;
-       u32 rlen;       /* range length */
-       u32 rstart;     /* range start */
-       int i_minor;
-       int ret = 0;
-
-       /*
-        * Map the device file to device data using the relative minor.
-        * The device file minor number is the unit number + 1.  0 is
-        * the generic device file - reject it.
-        */
-       i_minor = iminor(file_inode(fp)) - HFI1_USER_MINOR_BASE;
-       if (i_minor <= 0)
-               return -EINVAL;
-       dd = hfi1_lookup(i_minor - 1);
-       if (!dd) {
-               pr_err("%s: cannot find unit %d!\n", __func__, i_minor);
-               return -EINVAL;
-       }
-
-       /* some devices do not have an EPROM */
-       if (!dd->eprom_available)
-               return -EOPNOTSUPP;
-
-       ret = acquire_chip_resource(dd, CR_EPROM, EPROM_TIMEOUT);
-       if (ret) {
-               dd_dev_err(dd, "%s: unable to acquire EPROM resource\n",
-                          __func__);
-               goto done_asic;
-       }
-
-       dd_dev_info(dd, "%s: cmd: type %d, len 0x%x, addr 0x%016llx\n",
-                   __func__, cmd->type, cmd->len, cmd->addr);
-
-       switch (cmd->type) {
-       case HFI1_CMD_EP_INFO:
-               if (cmd->len != sizeof(u32)) {
-                       ret = -ERANGE;
-                       break;
-               }
-               dev_id = read_device_id(dd);
-               /* addr points to a u32 user buffer */
-               if (copy_to_user((void __user *)cmd->addr, &dev_id,
-                                sizeof(u32)))
-                       ret = -EFAULT;
-               break;
-
-       case HFI1_CMD_EP_ERASE_CHIP:
-               ret = erase_chip(dd);
-               break;
-
-       case HFI1_CMD_EP_ERASE_RANGE:
-               rlen = extract_rlen(cmd->len);
-               rstart = extract_rstart(cmd->len);
-               ret = erase_range(dd, rstart, rlen);
-               break;
-
-       case HFI1_CMD_EP_READ_RANGE:
-               rlen = extract_rlen(cmd->len);
-               rstart = extract_rstart(cmd->len);
-               ret = read_length(dd, rstart, rlen, cmd->addr);
-               break;
-
-       case HFI1_CMD_EP_WRITE_RANGE:
-               rlen = extract_rlen(cmd->len);
-               rstart = extract_rstart(cmd->len);
-               ret = write_length(dd, rstart, rlen, cmd->addr);
-               break;
-
-       default:
-               dd_dev_err(dd, "%s: unexpected command %d\n",
-                          __func__, cmd->type);
-               ret = -EINVAL;
-               break;
-       }
-
-       release_chip_resource(dd, CR_EPROM);
-done_asic:
-       return ret;
-}
-
-/*
- * Initialize the EPROM handler.
- */
-int eprom_init(struct hfi1_devdata *dd)
-{
-       int ret = 0;
-
-       /* only the discrete chip has an EPROM */
-       if (dd->pcidev->device != PCI_DEVICE_ID_INTEL0)
-               return 0;
-
-       /*
-        * It is OK if both HFIs reset the EPROM as long as they don't
-        * do it at the same time.
-        */
-       ret = acquire_chip_resource(dd, CR_EPROM, EPROM_TIMEOUT);
-       if (ret) {
-               dd_dev_err(dd,
-                          "%s: unable to acquire EPROM resource, no EPROM support\n",
-                          __func__);
-               goto done_asic;
-       }
-
-       /* reset EPROM to be sure it is in a good state */
-
-       /* set reset */
-       write_csr(dd, ASIC_EEP_CTL_STAT, ASIC_EEP_CTL_STAT_EP_RESET_SMASK);
-       /* clear reset, set speed */
-       write_csr(dd, ASIC_EEP_CTL_STAT,
-                 EP_SPEED_FULL << ASIC_EEP_CTL_STAT_RATE_SPI_SHIFT);
-
-       /* wake the device with command "release powerdown NoID" */
-       write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_RELEASE_POWERDOWN_NOID);
-
-       dd->eprom_available = true;
-       release_chip_resource(dd, CR_EPROM);
-done_asic:
-       return ret;
-}
diff --git a/drivers/staging/rdma/hfi1/eprom.h b/drivers/staging/rdma/hfi1/eprom.h
deleted file mode 100644 (file)
index d41f0b1..0000000
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-struct hfi1_cmd;
-struct hfi1_devdata;
-
-int eprom_init(struct hfi1_devdata *dd);
-int handle_eprom_command(struct file *fp, const struct hfi1_cmd *cmd);
diff --git a/drivers/staging/rdma/hfi1/file_ops.c b/drivers/staging/rdma/hfi1/file_ops.c
deleted file mode 100644 (file)
index c1c5bf8..0000000
+++ /dev/null
@@ -1,1773 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#include <linux/poll.h>
-#include <linux/cdev.h>
-#include <linux/vmalloc.h>
-#include <linux/io.h>
-
-#include <rdma/ib.h>
-
-#include "hfi.h"
-#include "pio.h"
-#include "device.h"
-#include "common.h"
-#include "trace.h"
-#include "user_sdma.h"
-#include "user_exp_rcv.h"
-#include "eprom.h"
-#include "aspm.h"
-#include "mmu_rb.h"
-
-#undef pr_fmt
-#define pr_fmt(fmt) DRIVER_NAME ": " fmt
-
-#define SEND_CTXT_HALT_TIMEOUT 1000 /* msecs */
-
-/*
- * File operation functions
- */
-static int hfi1_file_open(struct inode *, struct file *);
-static int hfi1_file_close(struct inode *, struct file *);
-static ssize_t hfi1_file_write(struct file *, const char __user *,
-                              size_t, loff_t *);
-static ssize_t hfi1_write_iter(struct kiocb *, struct iov_iter *);
-static unsigned int hfi1_poll(struct file *, struct poll_table_struct *);
-static int hfi1_file_mmap(struct file *, struct vm_area_struct *);
-
-static u64 kvirt_to_phys(void *);
-static int assign_ctxt(struct file *, struct hfi1_user_info *);
-static int init_subctxts(struct hfi1_ctxtdata *, const struct hfi1_user_info *);
-static int user_init(struct file *);
-static int get_ctxt_info(struct file *, void __user *, __u32);
-static int get_base_info(struct file *, void __user *, __u32);
-static int setup_ctxt(struct file *);
-static int setup_subctxt(struct hfi1_ctxtdata *);
-static int get_user_context(struct file *, struct hfi1_user_info *,
-                           int, unsigned);
-static int find_shared_ctxt(struct file *, const struct hfi1_user_info *);
-static int allocate_ctxt(struct file *, struct hfi1_devdata *,
-                        struct hfi1_user_info *);
-static unsigned int poll_urgent(struct file *, struct poll_table_struct *);
-static unsigned int poll_next(struct file *, struct poll_table_struct *);
-static int user_event_ack(struct hfi1_ctxtdata *, int, unsigned long);
-static int set_ctxt_pkey(struct hfi1_ctxtdata *, unsigned, u16);
-static int manage_rcvq(struct hfi1_ctxtdata *, unsigned, int);
-static int vma_fault(struct vm_area_struct *, struct vm_fault *);
-
-static const struct file_operations hfi1_file_ops = {
-       .owner = THIS_MODULE,
-       .write = hfi1_file_write,
-       .write_iter = hfi1_write_iter,
-       .open = hfi1_file_open,
-       .release = hfi1_file_close,
-       .poll = hfi1_poll,
-       .mmap = hfi1_file_mmap,
-       .llseek = noop_llseek,
-};
-
-static struct vm_operations_struct vm_ops = {
-       .fault = vma_fault,
-};
-
-/*
- * Types of memories mapped into user processes' space
- */
-enum mmap_types {
-       PIO_BUFS = 1,
-       PIO_BUFS_SOP,
-       PIO_CRED,
-       RCV_HDRQ,
-       RCV_EGRBUF,
-       UREGS,
-       EVENTS,
-       STATUS,
-       RTAIL,
-       SUBCTXT_UREGS,
-       SUBCTXT_RCV_HDRQ,
-       SUBCTXT_EGRBUF,
-       SDMA_COMP
-};
-
-/*
- * Masks and offsets defining the mmap tokens
- */
-#define HFI1_MMAP_OFFSET_MASK   0xfffULL
-#define HFI1_MMAP_OFFSET_SHIFT  0
-#define HFI1_MMAP_SUBCTXT_MASK  0xfULL
-#define HFI1_MMAP_SUBCTXT_SHIFT 12
-#define HFI1_MMAP_CTXT_MASK     0xffULL
-#define HFI1_MMAP_CTXT_SHIFT    16
-#define HFI1_MMAP_TYPE_MASK     0xfULL
-#define HFI1_MMAP_TYPE_SHIFT    24
-#define HFI1_MMAP_MAGIC_MASK    0xffffffffULL
-#define HFI1_MMAP_MAGIC_SHIFT   32
-
-#define HFI1_MMAP_MAGIC         0xdabbad00
-
-#define HFI1_MMAP_TOKEN_SET(field, val)        \
-       (((val) & HFI1_MMAP_##field##_MASK) << HFI1_MMAP_##field##_SHIFT)
-#define HFI1_MMAP_TOKEN_GET(field, token) \
-       (((token) >> HFI1_MMAP_##field##_SHIFT) & HFI1_MMAP_##field##_MASK)
-#define HFI1_MMAP_TOKEN(type, ctxt, subctxt, addr)   \
-       (HFI1_MMAP_TOKEN_SET(MAGIC, HFI1_MMAP_MAGIC) | \
-       HFI1_MMAP_TOKEN_SET(TYPE, type) | \
-       HFI1_MMAP_TOKEN_SET(CTXT, ctxt) | \
-       HFI1_MMAP_TOKEN_SET(SUBCTXT, subctxt) | \
-       HFI1_MMAP_TOKEN_SET(OFFSET, (offset_in_page(addr))))
-
-#define dbg(fmt, ...)                          \
-       pr_info(fmt, ##__VA_ARGS__)
-
-static inline int is_valid_mmap(u64 token)
-{
-       return (HFI1_MMAP_TOKEN_GET(MAGIC, token) == HFI1_MMAP_MAGIC);
-}
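/*
 * Illustrative sketch, not part of the original file: the token handed to
 * user space packs the magic, mapping type, context, sub-context and page
 * offset into one 64-bit value that comes back as the mmap() offset, and the
 * GET macros reverse the packing:
 */
static void __maybe_unused mmap_token_example(struct hfi1_ctxtdata *uctxt,
                                              struct hfi1_filedata *fd,
                                              void *addr)
{
        u64 token = HFI1_MMAP_TOKEN(RCV_HDRQ, uctxt->ctxt, fd->subctxt, addr);

        WARN_ON(!is_valid_mmap(token));
        WARN_ON(HFI1_MMAP_TOKEN_GET(TYPE, token) != RCV_HDRQ);
        WARN_ON(HFI1_MMAP_TOKEN_GET(SUBCTXT, token) != fd->subctxt);
}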
-
-static int hfi1_file_open(struct inode *inode, struct file *fp)
-{
-       /* The real work is performed later in assign_ctxt() */
-       fp->private_data = kzalloc(sizeof(struct hfi1_filedata), GFP_KERNEL);
-       if (fp->private_data) /* no cpu affinity by default */
-               ((struct hfi1_filedata *)fp->private_data)->rec_cpu_num = -1;
-       return fp->private_data ? 0 : -ENOMEM;
-}
-
-static ssize_t hfi1_file_write(struct file *fp, const char __user *data,
-                              size_t count, loff_t *offset)
-{
-       const struct hfi1_cmd __user *ucmd;
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       struct hfi1_cmd cmd;
-       struct hfi1_user_info uinfo;
-       struct hfi1_tid_info tinfo;
-       unsigned long addr;
-       ssize_t consumed = 0, copy = 0, ret = 0;
-       void *dest = NULL;
-       __u64 user_val = 0;
-       int uctxt_required = 1;
-       int must_be_root = 0;
-
-       /* FIXME: This interface cannot continue out of staging */
-       if (WARN_ON_ONCE(!ib_safe_file_access(fp)))
-               return -EACCES;
-
-       if (count < sizeof(cmd)) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       ucmd = (const struct hfi1_cmd __user *)data;
-       if (copy_from_user(&cmd, ucmd, sizeof(cmd))) {
-               ret = -EFAULT;
-               goto bail;
-       }
-
-       consumed = sizeof(cmd);
-
-       switch (cmd.type) {
-       case HFI1_CMD_ASSIGN_CTXT:
-               uctxt_required = 0;     /* assigned user context not required */
-               copy = sizeof(uinfo);
-               dest = &uinfo;
-               break;
-       case HFI1_CMD_SDMA_STATUS_UPD:
-       case HFI1_CMD_CREDIT_UPD:
-               copy = 0;
-               break;
-       case HFI1_CMD_TID_UPDATE:
-       case HFI1_CMD_TID_FREE:
-       case HFI1_CMD_TID_INVAL_READ:
-               copy = sizeof(tinfo);
-               dest = &tinfo;
-               break;
-       case HFI1_CMD_USER_INFO:
-       case HFI1_CMD_RECV_CTRL:
-       case HFI1_CMD_POLL_TYPE:
-       case HFI1_CMD_ACK_EVENT:
-       case HFI1_CMD_CTXT_INFO:
-       case HFI1_CMD_SET_PKEY:
-       case HFI1_CMD_CTXT_RESET:
-               copy = 0;
-               user_val = cmd.addr;
-               break;
-       case HFI1_CMD_EP_INFO:
-       case HFI1_CMD_EP_ERASE_CHIP:
-       case HFI1_CMD_EP_ERASE_RANGE:
-       case HFI1_CMD_EP_READ_RANGE:
-       case HFI1_CMD_EP_WRITE_RANGE:
-               uctxt_required = 0;     /* assigned user context not required */
-               must_be_root = 1;       /* validate user */
-               copy = 0;
-               break;
-       default:
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       /* If the command comes with user data, copy it. */
-       if (copy) {
-               if (copy_from_user(dest, (void __user *)cmd.addr, copy)) {
-                       ret = -EFAULT;
-                       goto bail;
-               }
-               consumed += copy;
-       }
-
-       /*
-        * Make sure there is a uctxt when needed.
-        */
-       if (uctxt_required && !uctxt) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       /* only root can do these operations */
-       if (must_be_root && !capable(CAP_SYS_ADMIN)) {
-               ret = -EPERM;
-               goto bail;
-       }
-
-       switch (cmd.type) {
-       case HFI1_CMD_ASSIGN_CTXT:
-               ret = assign_ctxt(fp, &uinfo);
-               if (ret < 0)
-                       goto bail;
-               ret = setup_ctxt(fp);
-               if (ret)
-                       goto bail;
-               ret = user_init(fp);
-               break;
-       case HFI1_CMD_CTXT_INFO:
-               ret = get_ctxt_info(fp, (void __user *)(unsigned long)
-                                   user_val, cmd.len);
-               break;
-       case HFI1_CMD_USER_INFO:
-               ret = get_base_info(fp, (void __user *)(unsigned long)
-                                   user_val, cmd.len);
-               break;
-       case HFI1_CMD_SDMA_STATUS_UPD:
-               break;
-       case HFI1_CMD_CREDIT_UPD:
-               if (uctxt && uctxt->sc)
-                       sc_return_credits(uctxt->sc);
-               break;
-       case HFI1_CMD_TID_UPDATE:
-               ret = hfi1_user_exp_rcv_setup(fp, &tinfo);
-               if (!ret) {
-                       /*
-                        * Copy the number of tidlist entries we used
-                        * and the length of the buffer we registered.
-                        * These fields are adjacent in the structure so
-                        * we can copy them at the same time.
-                        */
-                       addr = (unsigned long)cmd.addr +
-                               offsetof(struct hfi1_tid_info, tidcnt);
-                       if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
-                                        sizeof(tinfo.tidcnt) +
-                                        sizeof(tinfo.length)))
-                               ret = -EFAULT;
-               }
-               break;
-       case HFI1_CMD_TID_INVAL_READ:
-               ret = hfi1_user_exp_rcv_invalid(fp, &tinfo);
-               if (ret)
-                       break;
-               addr = (unsigned long)cmd.addr +
-                       offsetof(struct hfi1_tid_info, tidcnt);
-               if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
-                                sizeof(tinfo.tidcnt)))
-                       ret = -EFAULT;
-               break;
-       case HFI1_CMD_TID_FREE:
-               ret = hfi1_user_exp_rcv_clear(fp, &tinfo);
-               if (ret)
-                       break;
-               addr = (unsigned long)cmd.addr +
-                       offsetof(struct hfi1_tid_info, tidcnt);
-               if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
-                                sizeof(tinfo.tidcnt)))
-                       ret = -EFAULT;
-               break;
-       case HFI1_CMD_RECV_CTRL:
-               ret = manage_rcvq(uctxt, fd->subctxt, (int)user_val);
-               break;
-       case HFI1_CMD_POLL_TYPE:
-               uctxt->poll_type = (typeof(uctxt->poll_type))user_val;
-               break;
-       case HFI1_CMD_ACK_EVENT:
-               ret = user_event_ack(uctxt, fd->subctxt, user_val);
-               break;
-       case HFI1_CMD_SET_PKEY:
-               if (HFI1_CAP_IS_USET(PKEY_CHECK))
-                       ret = set_ctxt_pkey(uctxt, fd->subctxt, user_val);
-               else
-                       ret = -EPERM;
-               break;
-       case HFI1_CMD_CTXT_RESET: {
-               struct send_context *sc;
-               struct hfi1_devdata *dd;
-
-               if (!uctxt || !uctxt->dd || !uctxt->sc) {
-                       ret = -EINVAL;
-                       break;
-               }
-               /*
-                * There is no protection here. User level has to
-                * guarantee that no one will be writing to the send
-                * context while it is being re-initialized.
-                * If user level breaks that guarantee, it will break
-                * its own context and no one else's.
-                */
-               dd = uctxt->dd;
-               sc = uctxt->sc;
-               /*
-                * Wait until the interrupt handler has marked the
-                * context as halted or frozen. Report error if we time
-                * out.
-                */
-               wait_event_interruptible_timeout(
-                       sc->halt_wait, (sc->flags & SCF_HALTED),
-                       msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT));
-               if (!(sc->flags & SCF_HALTED)) {
-                       ret = -ENOLCK;
-                       break;
-               }
-               /*
-                * If the send context was halted due to a Freeze,
-                * wait until the device has been "unfrozen" before
-                * resetting the context.
-                */
-               if (sc->flags & SCF_FROZEN) {
-                       wait_event_interruptible_timeout(
-                               dd->event_queue,
-                               !(ACCESS_ONCE(dd->flags) & HFI1_FROZEN),
-                               msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT));
-                       if (dd->flags & HFI1_FROZEN) {
-                               ret = -ENOLCK;
-                               break;
-                       }
-                       if (dd->flags & HFI1_FORCED_FREEZE) {
-                               /*
-                                * Don't allow context reset if we are into
-                                * forced freeze
-                                */
-                               ret = -ENODEV;
-                               break;
-                       }
-                       sc_disable(sc);
-                       ret = sc_enable(sc);
-                       hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB,
-                                    uctxt->ctxt);
-               } else {
-                       ret = sc_restart(sc);
-               }
-               if (!ret)
-                       sc_return_credits(sc);
-               break;
-       }
-       case HFI1_CMD_EP_INFO:
-       case HFI1_CMD_EP_ERASE_CHIP:
-       case HFI1_CMD_EP_ERASE_RANGE:
-       case HFI1_CMD_EP_READ_RANGE:
-       case HFI1_CMD_EP_WRITE_RANGE:
-               ret = handle_eprom_command(fp, &cmd);
-               break;
-       }
-
-       if (ret >= 0)
-               ret = consumed;
-bail:
-       return ret;
-}
-
-static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from)
-{
-       struct hfi1_filedata *fd = kiocb->ki_filp->private_data;
-       struct hfi1_user_sdma_pkt_q *pq = fd->pq;
-       struct hfi1_user_sdma_comp_q *cq = fd->cq;
-       int ret = 0, done = 0, reqs = 0;
-       unsigned long dim = from->nr_segs;
-
-       if (!cq || !pq) {
-               ret = -EIO;
-               goto done;
-       }
-
-       if (!iter_is_iovec(from) || !dim) {
-               ret = -EINVAL;
-               goto done;
-       }
-
-       hfi1_cdbg(SDMA, "SDMA request from %u:%u (%lu)",
-                 fd->uctxt->ctxt, fd->subctxt, dim);
-
-       if (atomic_read(&pq->n_reqs) == pq->n_max_reqs) {
-               ret = -ENOSPC;
-               goto done;
-       }
-
-       while (dim) {
-               unsigned long count = 0;
-
-               ret = hfi1_user_sdma_process_request(
-                       kiocb->ki_filp, (struct iovec *)(from->iov + done),
-                       dim, &count);
-               if (ret)
-                       goto done;
-               dim -= count;
-               done += count;
-               reqs++;
-       }
-done:
-       return ret ? ret : reqs;
-}
-
-static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
-{
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       struct hfi1_devdata *dd;
-       unsigned long flags, pfn;
-       u64 token = vma->vm_pgoff << PAGE_SHIFT,
-               memaddr = 0;
-       u8 subctxt, mapio = 0, vmf = 0, type;
-       ssize_t memlen = 0;
-       int ret = 0;
-       u16 ctxt;
-
-       if (!is_valid_mmap(token) || !uctxt ||
-           !(vma->vm_flags & VM_SHARED)) {
-               ret = -EINVAL;
-               goto done;
-       }
-       dd = uctxt->dd;
-       ctxt = HFI1_MMAP_TOKEN_GET(CTXT, token);
-       subctxt = HFI1_MMAP_TOKEN_GET(SUBCTXT, token);
-       type = HFI1_MMAP_TOKEN_GET(TYPE, token);
-       if (ctxt != uctxt->ctxt || subctxt != fd->subctxt) {
-               ret = -EINVAL;
-               goto done;
-       }
-
-       flags = vma->vm_flags;
-
-       switch (type) {
-       case PIO_BUFS:
-       case PIO_BUFS_SOP:
-               memaddr = ((dd->physaddr + TXE_PIO_SEND) +
-                               /* chip pio base */
-                          (uctxt->sc->hw_context * BIT(16))) +
-                               /* 64K PIO space / ctxt */
-                       (type == PIO_BUFS_SOP ?
-                               (TXE_PIO_SIZE / 2) : 0); /* sop? */
-               /*
-                * Map only the amount allocated to the context, not the
-                * context's entire available PIO space.
-                */
-               memlen = PAGE_ALIGN(uctxt->sc->credits * PIO_BLOCK_SIZE);
-               flags &= ~VM_MAYREAD;
-               flags |= VM_DONTCOPY | VM_DONTEXPAND;
-               vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
-               mapio = 1;
-               break;
-       case PIO_CRED:
-               if (flags & VM_WRITE) {
-                       ret = -EPERM;
-                       goto done;
-               }
-               /*
-                * The credit return location for this context could be on the
-                * second or third page allocated for credit returns (if the
-                * number of enabled contexts is > 64 or > 128, respectively).
-                */
-               memaddr = dd->cr_base[uctxt->numa_id].pa +
-                       (((u64)uctxt->sc->hw_free -
-                         (u64)dd->cr_base[uctxt->numa_id].va) & PAGE_MASK);
-               memlen = PAGE_SIZE;
-               flags &= ~VM_MAYWRITE;
-               flags |= VM_DONTCOPY | VM_DONTEXPAND;
-               /*
-                * The driver has already allocated memory for credit
-                * returns and programmed it into the chip. Has that
-                * memory been flagged as non-cached?
-                */
-               /* vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); */
-               mapio = 1;
-               break;
-       case RCV_HDRQ:
-               memaddr = uctxt->rcvhdrq_phys;
-               memlen = uctxt->rcvhdrq_size;
-               break;
-       case RCV_EGRBUF: {
-               unsigned long addr;
-               int i;
-               /*
-                * The RcvEgr buffer needs to be handled differently
-                * as multiple non-contiguous pages need to be mapped
-                * into the user process.
-                */
-               memlen = uctxt->egrbufs.size;
-               if ((vma->vm_end - vma->vm_start) != memlen) {
-                       dd_dev_err(dd, "Eager buffer map size invalid (%lu != %lu)\n",
-                                  (vma->vm_end - vma->vm_start), memlen);
-                       ret = -EINVAL;
-                       goto done;
-               }
-               if (vma->vm_flags & VM_WRITE) {
-                       ret = -EPERM;
-                       goto done;
-               }
-               vma->vm_flags &= ~VM_MAYWRITE;
-               addr = vma->vm_start;
-               for (i = 0 ; i < uctxt->egrbufs.numbufs; i++) {
-                       ret = remap_pfn_range(
-                               vma, addr,
-                               uctxt->egrbufs.buffers[i].phys >> PAGE_SHIFT,
-                               uctxt->egrbufs.buffers[i].len,
-                               vma->vm_page_prot);
-                       if (ret < 0)
-                               goto done;
-                       addr += uctxt->egrbufs.buffers[i].len;
-               }
-               ret = 0;
-               goto done;
-       }
-       case UREGS:
-               /*
-                * Map only the page that contains this context's user
-                * registers.
-                */
-               memaddr = (unsigned long)
-                       (dd->physaddr + RXE_PER_CONTEXT_USER)
-                       + (uctxt->ctxt * RXE_PER_CONTEXT_SIZE);
-               /*
-                * TidFlow table is on the same page as the rest of the
-                * user registers.
-                */
-               memlen = PAGE_SIZE;
-               flags |= VM_DONTCOPY | VM_DONTEXPAND;
-               vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-               mapio = 1;
-               break;
-       case EVENTS:
-               /*
-                * Use the page where this context's flags are. User level
-                * knows where its own bitmap is within the page.
-                */
-               memaddr = (unsigned long)(dd->events +
-                                         ((uctxt->ctxt - dd->first_user_ctxt) *
-                                          HFI1_MAX_SHARED_CTXTS)) & PAGE_MASK;
-               memlen = PAGE_SIZE;
-               /*
-                * v3.7 removes VM_RESERVED but the effect is kept by
-                * using VM_IO.
-                */
-               flags |= VM_IO | VM_DONTEXPAND;
-               vmf = 1;
-               break;
-       case STATUS:
-               memaddr = kvirt_to_phys((void *)dd->status);
-               memlen = PAGE_SIZE;
-               flags |= VM_IO | VM_DONTEXPAND;
-               break;
-       case RTAIL:
-               if (!HFI1_CAP_IS_USET(DMA_RTAIL)) {
-                       /*
-                        * If the memory allocation failed, the context alloc
-                        * also would have failed, so we would never get here
-                        */
-                       ret = -EINVAL;
-                       goto done;
-               }
-               if (flags & VM_WRITE) {
-                       ret = -EPERM;
-                       goto done;
-               }
-               memaddr = uctxt->rcvhdrqtailaddr_phys;
-               memlen = PAGE_SIZE;
-               flags &= ~VM_MAYWRITE;
-               break;
-       case SUBCTXT_UREGS:
-               memaddr = (u64)uctxt->subctxt_uregbase;
-               memlen = PAGE_SIZE;
-               flags |= VM_IO | VM_DONTEXPAND;
-               vmf = 1;
-               break;
-       case SUBCTXT_RCV_HDRQ:
-               memaddr = (u64)uctxt->subctxt_rcvhdr_base;
-               memlen = uctxt->rcvhdrq_size * uctxt->subctxt_cnt;
-               flags |= VM_IO | VM_DONTEXPAND;
-               vmf = 1;
-               break;
-       case SUBCTXT_EGRBUF:
-               memaddr = (u64)uctxt->subctxt_rcvegrbuf;
-               memlen = uctxt->egrbufs.size * uctxt->subctxt_cnt;
-               flags |= VM_IO | VM_DONTEXPAND;
-               flags &= ~VM_MAYWRITE;
-               vmf = 1;
-               break;
-       case SDMA_COMP: {
-               struct hfi1_user_sdma_comp_q *cq = fd->cq;
-
-               if (!cq) {
-                       ret = -EFAULT;
-                       goto done;
-               }
-               memaddr = (u64)cq->comps;
-               memlen = PAGE_ALIGN(sizeof(*cq->comps) * cq->nentries);
-               flags |= VM_IO | VM_DONTEXPAND;
-               vmf = 1;
-               break;
-       }
-       default:
-               ret = -EINVAL;
-               break;
-       }
-
-       if ((vma->vm_end - vma->vm_start) != memlen) {
-               hfi1_cdbg(PROC, "%u:%u Memory size mismatch %lu:%lu",
-                         uctxt->ctxt, fd->subctxt,
-                         (vma->vm_end - vma->vm_start), memlen);
-               ret = -EINVAL;
-               goto done;
-       }
-
-       vma->vm_flags = flags;
-       hfi1_cdbg(PROC,
-                 "%u:%u type:%u io/vf:%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx\n",
-                   ctxt, subctxt, type, mapio, vmf, memaddr, memlen,
-                   vma->vm_end - vma->vm_start, vma->vm_flags);
-       pfn = (unsigned long)(memaddr >> PAGE_SHIFT);
-       if (vmf) {
-               vma->vm_pgoff = pfn;
-               vma->vm_ops = &vm_ops;
-               ret = 0;
-       } else if (mapio) {
-               ret = io_remap_pfn_range(vma, vma->vm_start, pfn, memlen,
-                                        vma->vm_page_prot);
-       } else {
-               ret = remap_pfn_range(vma, vma->vm_start, pfn, memlen,
-                                     vma->vm_page_prot);
-       }
-done:
-       return ret;
-}
-
-/*
- * Local (non-chip) user memory is not mapped right away, but on demand
- * as it is accessed by the user-level code.
- */
-static int vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-       struct page *page;
-
-       page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT));
-       if (!page)
-               return VM_FAULT_SIGBUS;
-
-       get_page(page);
-       vmf->page = page;
-
-       return 0;
-}
-
-static unsigned int hfi1_poll(struct file *fp, struct poll_table_struct *pt)
-{
-       struct hfi1_ctxtdata *uctxt;
-       unsigned pollflag;
-
-       uctxt = ((struct hfi1_filedata *)fp->private_data)->uctxt;
-       if (!uctxt)
-               pollflag = POLLERR;
-       else if (uctxt->poll_type == HFI1_POLL_TYPE_URGENT)
-               pollflag = poll_urgent(fp, pt);
-       else  if (uctxt->poll_type == HFI1_POLL_TYPE_ANYRCV)
-               pollflag = poll_next(fp, pt);
-       else /* invalid */
-               pollflag = POLLERR;
-
-       return pollflag;
-}
-
-static int hfi1_file_close(struct inode *inode, struct file *fp)
-{
-       struct hfi1_filedata *fdata = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fdata->uctxt;
-       struct hfi1_devdata *dd;
-       unsigned long flags, *ev;
-
-       fp->private_data = NULL;
-
-       if (!uctxt)
-               goto done;
-
-       hfi1_cdbg(PROC, "freeing ctxt %u:%u", uctxt->ctxt, fdata->subctxt);
-       dd = uctxt->dd;
-       mutex_lock(&hfi1_mutex);
-
-       flush_wc();
-       /* drain user sdma queue */
-       hfi1_user_sdma_free_queues(fdata);
-
-       /* release the cpu */
-       hfi1_put_proc_affinity(dd, fdata->rec_cpu_num);
-
-       /*
-        * Clear any left over, unhandled events so the next process that
-        * gets this context doesn't get confused.
-        */
-       ev = dd->events + ((uctxt->ctxt - dd->first_user_ctxt) *
-                          HFI1_MAX_SHARED_CTXTS) + fdata->subctxt;
-       *ev = 0;
-
-       if (--uctxt->cnt) {
-               uctxt->active_slaves &= ~(1 << fdata->subctxt);
-               uctxt->subpid[fdata->subctxt] = 0;
-               mutex_unlock(&hfi1_mutex);
-               goto done;
-       }
-
-       spin_lock_irqsave(&dd->uctxt_lock, flags);
-       /*
-        * Disable receive context and interrupt available, reset all
-        * RcvCtxtCtrl bits to default values.
-        */
-       hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
-                    HFI1_RCVCTRL_TIDFLOW_DIS |
-                    HFI1_RCVCTRL_INTRAVAIL_DIS |
-                    HFI1_RCVCTRL_TAILUPD_DIS |
-                    HFI1_RCVCTRL_ONE_PKT_EGR_DIS |
-                    HFI1_RCVCTRL_NO_RHQ_DROP_DIS |
-                    HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt->ctxt);
-       /* Clear the context's J_KEY */
-       hfi1_clear_ctxt_jkey(dd, uctxt->ctxt);
-       /*
-        * Reset context integrity checks to default.
-        * (writes to CSRs probably belong in chip.c)
-        */
-       write_kctxt_csr(dd, uctxt->sc->hw_context, SEND_CTXT_CHECK_ENABLE,
-                       hfi1_pkt_default_send_ctxt_mask(dd, uctxt->sc->type));
-       sc_disable(uctxt->sc);
-       uctxt->pid = 0;
-       spin_unlock_irqrestore(&dd->uctxt_lock, flags);
-
-       dd->rcd[uctxt->ctxt] = NULL;
-
-       hfi1_user_exp_rcv_free(fdata);
-       hfi1_clear_ctxt_pkey(dd, uctxt->ctxt);
-
-       uctxt->rcvwait_to = 0;
-       uctxt->piowait_to = 0;
-       uctxt->rcvnowait = 0;
-       uctxt->pionowait = 0;
-       uctxt->event_flags = 0;
-
-       hfi1_stats.sps_ctxts--;
-       if (++dd->freectxts == dd->num_user_contexts)
-               aspm_enable_all(dd);
-       mutex_unlock(&hfi1_mutex);
-       hfi1_free_ctxtdata(dd, uctxt);
-done:
-       kfree(fdata);
-       return 0;
-}
-
-/*
- * Convert kernel *virtual* addresses to physical addresses.
- * This is used for vmalloc'ed addresses.
- */
-static u64 kvirt_to_phys(void *addr)
-{
-       struct page *page;
-       u64 paddr = 0;
-
-       page = vmalloc_to_page(addr);
-       if (page)
-               paddr = page_to_pfn(page) << PAGE_SHIFT;
-
-       return paddr;
-}
-
-static int assign_ctxt(struct file *fp, struct hfi1_user_info *uinfo)
-{
-       int i_minor, ret = 0;
-       unsigned swmajor, swminor, alg = HFI1_ALG_ACROSS;
-
-       swmajor = uinfo->userversion >> 16;
-       if (swmajor != HFI1_USER_SWMAJOR) {
-               ret = -ENODEV;
-               goto done;
-       }
-
-       swminor = uinfo->userversion & 0xffff;
-
-       if (uinfo->hfi1_alg < HFI1_ALG_COUNT)
-               alg = uinfo->hfi1_alg;
-
-       mutex_lock(&hfi1_mutex);
-       /* First, let's check whether we need to set up a shared context. */
-       if (uinfo->subctxt_cnt) {
-               struct hfi1_filedata *fd = fp->private_data;
-
-               ret = find_shared_ctxt(fp, uinfo);
-               if (ret < 0)
-                       goto done_unlock;
-               if (ret)
-                       fd->rec_cpu_num = hfi1_get_proc_affinity(
-                               fd->uctxt->dd, fd->uctxt->numa_id);
-       }
-
-       /*
-        * We execute the following block if we couldn't find a
-        * shared context or if context sharing is not required.
-        */
-       if (!ret) {
-               i_minor = iminor(file_inode(fp)) - HFI1_USER_MINOR_BASE;
-               ret = get_user_context(fp, uinfo, i_minor - 1, alg);
-       }
-done_unlock:
-       mutex_unlock(&hfi1_mutex);
-done:
-       return ret;
-}
-
-/* return true if the device is available for general use */
-static int usable_device(struct hfi1_devdata *dd)
-{
-       struct hfi1_pportdata *ppd = dd->pport;
-
-       return driver_lstate(ppd) == IB_PORT_ACTIVE;
-}
-
-static int get_user_context(struct file *fp, struct hfi1_user_info *uinfo,
-                           int devno, unsigned alg)
-{
-       struct hfi1_devdata *dd = NULL;
-       int ret = 0, devmax, npresent, nup, dev;
-
-       devmax = hfi1_count_units(&npresent, &nup);
-       if (!npresent) {
-               ret = -ENXIO;
-               goto done;
-       }
-       if (!nup) {
-               ret = -ENETDOWN;
-               goto done;
-       }
-       if (devno >= 0) {
-               dd = hfi1_lookup(devno);
-               if (!dd)
-                       ret = -ENODEV;
-               else if (!dd->freectxts)
-                       ret = -EBUSY;
-       } else {
-               struct hfi1_devdata *pdd;
-
-               if (alg == HFI1_ALG_ACROSS) {
-                       unsigned free = 0U;
-
-                       for (dev = 0; dev < devmax; dev++) {
-                               pdd = hfi1_lookup(dev);
-                               if (!pdd)
-                                       continue;
-                               if (!usable_device(pdd))
-                                       continue;
-                               if (pdd->freectxts &&
-                                   pdd->freectxts > free) {
-                                       dd = pdd;
-                                       free = pdd->freectxts;
-                               }
-                       }
-               } else {
-                       for (dev = 0; dev < devmax; dev++) {
-                               pdd = hfi1_lookup(dev);
-                               if (!pdd)
-                                       continue;
-                               if (!usable_device(pdd))
-                                       continue;
-                               if (pdd->freectxts) {
-                                       dd = pdd;
-                                       break;
-                               }
-                       }
-               }
-               if (!dd)
-                       ret = -EBUSY;
-       }
-done:
-       return ret ? ret : allocate_ctxt(fp, dd, uinfo);
-}
-
-static int find_shared_ctxt(struct file *fp,
-                           const struct hfi1_user_info *uinfo)
-{
-       int devmax, ndev, i;
-       int ret = 0;
-       struct hfi1_filedata *fd = fp->private_data;
-
-       devmax = hfi1_count_units(NULL, NULL);
-
-       for (ndev = 0; ndev < devmax; ndev++) {
-               struct hfi1_devdata *dd = hfi1_lookup(ndev);
-
-               if (!(dd && (dd->flags & HFI1_PRESENT) && dd->kregbase))
-                       continue;
-               for (i = dd->first_user_ctxt; i < dd->num_rcv_contexts; i++) {
-                       struct hfi1_ctxtdata *uctxt = dd->rcd[i];
-
-                       /* Skip ctxts which are not yet open */
-                       if (!uctxt || !uctxt->cnt)
-                               continue;
-                       /* Skip ctxt if it doesn't match the requested one */
-                       if (memcmp(uctxt->uuid, uinfo->uuid,
-                                  sizeof(uctxt->uuid)) ||
-                           uctxt->jkey != generate_jkey(current_uid()) ||
-                           uctxt->subctxt_id != uinfo->subctxt_id ||
-                           uctxt->subctxt_cnt != uinfo->subctxt_cnt)
-                               continue;
-
-                       /* Verify the sharing process matches the master */
-                       if (uctxt->userversion != uinfo->userversion ||
-                           uctxt->cnt >= uctxt->subctxt_cnt) {
-                               ret = -EINVAL;
-                               goto done;
-                       }
-                       fd->uctxt = uctxt;
-                       fd->subctxt  = uctxt->cnt++;
-                       uctxt->subpid[fd->subctxt] = current->pid;
-                       uctxt->active_slaves |= 1 << fd->subctxt;
-                       ret = 1;
-                       goto done;
-               }
-       }
-
-done:
-       return ret;
-}
-
-static int allocate_ctxt(struct file *fp, struct hfi1_devdata *dd,
-                        struct hfi1_user_info *uinfo)
-{
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt;
-       unsigned ctxt;
-       int ret, numa;
-
-       if (dd->flags & HFI1_FROZEN) {
-               /*
-                * Pick an error that is distinct from all other errors
-                * that are returned so the user process knows that
-                * it tried to allocate while the SPC was frozen.  It
-                * should be able to retry with success in a short
-                * while.
-                */
-               return -EIO;
-       }
-
-       for (ctxt = dd->first_user_ctxt; ctxt < dd->num_rcv_contexts; ctxt++)
-               if (!dd->rcd[ctxt])
-                       break;
-
-       if (ctxt == dd->num_rcv_contexts)
-               return -EBUSY;
-
-       fd->rec_cpu_num = hfi1_get_proc_affinity(dd, -1);
-       if (fd->rec_cpu_num != -1)
-               numa = cpu_to_node(fd->rec_cpu_num);
-       else
-               numa = numa_node_id();
-       uctxt = hfi1_create_ctxtdata(dd->pport, ctxt, numa);
-       if (!uctxt) {
-               dd_dev_err(dd,
-                          "Unable to allocate ctxtdata memory, failing open\n");
-               return -ENOMEM;
-       }
-       hfi1_cdbg(PROC, "[%u:%u] pid %u assigned to CPU %d (NUMA %u)",
-                 uctxt->ctxt, fd->subctxt, current->pid, fd->rec_cpu_num,
-                 uctxt->numa_id);
-
-       /*
-        * Allocate and enable a PIO send context.
-        */
-       uctxt->sc = sc_alloc(dd, SC_USER, uctxt->rcvhdrqentsize,
-                            uctxt->dd->node);
-       if (!uctxt->sc)
-               return -ENOMEM;
-
-       hfi1_cdbg(PROC, "allocated send context %u(%u)\n", uctxt->sc->sw_index,
-                 uctxt->sc->hw_context);
-       ret = sc_enable(uctxt->sc);
-       if (ret)
-               return ret;
-       /*
-        * Set up shared context resources if the user level has requested
-        * shared contexts and this is the 'master' process.
-        * This has to be done here so the rest of the sub-contexts find the
-        * proper master.
-        */
-       if (uinfo->subctxt_cnt && !fd->subctxt) {
-               ret = init_subctxts(uctxt, uinfo);
-               /*
-                * On error, we don't need to disable and de-allocate the
-                * send context because it will be done during file close
-                */
-               if (ret)
-                       return ret;
-       }
-       uctxt->userversion = uinfo->userversion;
-       uctxt->pid = current->pid;
-       uctxt->flags = HFI1_CAP_UGET(MASK);
-       init_waitqueue_head(&uctxt->wait);
-       strlcpy(uctxt->comm, current->comm, sizeof(uctxt->comm));
-       memcpy(uctxt->uuid, uinfo->uuid, sizeof(uctxt->uuid));
-       uctxt->jkey = generate_jkey(current_uid());
-       INIT_LIST_HEAD(&uctxt->sdma_queues);
-       spin_lock_init(&uctxt->sdma_qlock);
-       hfi1_stats.sps_ctxts++;
-       /*
-        * Disable ASPM when there are open user/PSM contexts to avoid
-        * issues with ASPM L1 exit latency
-        */
-       if (dd->freectxts-- == dd->num_user_contexts)
-               aspm_disable_all(dd);
-       fd->uctxt = uctxt;
-
-       return 0;
-}
-
-static int init_subctxts(struct hfi1_ctxtdata *uctxt,
-                        const struct hfi1_user_info *uinfo)
-{
-       unsigned num_subctxts;
-
-       num_subctxts = uinfo->subctxt_cnt;
-       if (num_subctxts > HFI1_MAX_SHARED_CTXTS)
-               return -EINVAL;
-
-       uctxt->subctxt_cnt = uinfo->subctxt_cnt;
-       uctxt->subctxt_id = uinfo->subctxt_id;
-       uctxt->active_slaves = 1;
-       uctxt->redirect_seq_cnt = 1;
-       set_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags);
-
-       return 0;
-}
-
-static int setup_subctxt(struct hfi1_ctxtdata *uctxt)
-{
-       int ret = 0;
-       unsigned num_subctxts = uctxt->subctxt_cnt;
-
-       uctxt->subctxt_uregbase = vmalloc_user(PAGE_SIZE);
-       if (!uctxt->subctxt_uregbase) {
-               ret = -ENOMEM;
-               goto bail;
-       }
-       /* We can take the size of the RcvHdr Queue from the master */
-       uctxt->subctxt_rcvhdr_base = vmalloc_user(uctxt->rcvhdrq_size *
-                                                 num_subctxts);
-       if (!uctxt->subctxt_rcvhdr_base) {
-               ret = -ENOMEM;
-               goto bail_ureg;
-       }
-
-       uctxt->subctxt_rcvegrbuf = vmalloc_user(uctxt->egrbufs.size *
-                                               num_subctxts);
-       if (!uctxt->subctxt_rcvegrbuf) {
-               ret = -ENOMEM;
-               goto bail_rhdr;
-       }
-       goto bail;
-bail_rhdr:
-       vfree(uctxt->subctxt_rcvhdr_base);
-bail_ureg:
-       vfree(uctxt->subctxt_uregbase);
-       uctxt->subctxt_uregbase = NULL;
-bail:
-       return ret;
-}
-
-static int user_init(struct file *fp)
-{
-       unsigned int rcvctrl_ops = 0;
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-
-       /* make sure that the context has already been setup */
-       if (!test_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags))
-               return -EFAULT;
-
-       /* initialize poll variables... */
-       uctxt->urgent = 0;
-       uctxt->urgent_poll = 0;
-
-       /*
-        * Now enable the ctxt for receive.
-        * For chips that are set to DMA the tail register to memory
-        * when it changes (and when the update bit transitions from
-        * 0 to 1), we turn it off and then back on.
-        * This will (very briefly) affect any other open ctxts, but the
-        * duration is very short, and therefore isn't an issue.  We
-        * explicitly set the in-memory tail copy to 0 beforehand, so we
-        * don't have to wait to be sure the DMA update has happened
-        * (the chip resets head/tail to 0 on the transition to enable).
-        */
-       if (uctxt->rcvhdrtail_kvaddr)
-               clear_rcvhdrtail(uctxt);
-
-       /* Setup J_KEY before enabling the context */
-       hfi1_set_ctxt_jkey(uctxt->dd, uctxt->ctxt, uctxt->jkey);
-
-       rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB;
-       if (HFI1_CAP_KGET_MASK(uctxt->flags, HDRSUPP))
-               rcvctrl_ops |= HFI1_RCVCTRL_TIDFLOW_ENB;
-       /*
-        * Ignore the bit in the flags for now until proper
-        * support for multiple packets per rcv array entry is
-        * added.
-        */
-       if (!HFI1_CAP_KGET_MASK(uctxt->flags, MULTI_PKT_EGR))
-               rcvctrl_ops |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
-       if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_EGR_FULL))
-               rcvctrl_ops |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
-       if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_RHQ_FULL))
-               rcvctrl_ops |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
-       /*
-        * The RcvCtxtCtrl.TailUpd bit has to be explicitly written.
-        * We can't rely on the correct value to be set from prior
-        * uses of the chip or ctxt. Therefore, add the rcvctrl op
-        * for both cases.
-        */
-       if (HFI1_CAP_KGET_MASK(uctxt->flags, DMA_RTAIL))
-               rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB;
-       else
-               rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_DIS;
-       hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt->ctxt);
-
-       /* Notify any waiting slaves */
-       if (uctxt->subctxt_cnt) {
-               clear_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags);
-               wake_up(&uctxt->wait);
-       }
-
-       return 0;
-}
-
-static int get_ctxt_info(struct file *fp, void __user *ubase, __u32 len)
-{
-       struct hfi1_ctxt_info cinfo;
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       int ret = 0;
-
-       memset(&cinfo, 0, sizeof(cinfo));
-       ret = hfi1_get_base_kinfo(uctxt, &cinfo);
-       if (ret < 0)
-               goto done;
-       cinfo.num_active = hfi1_count_active_units();
-       cinfo.unit = uctxt->dd->unit;
-       cinfo.ctxt = uctxt->ctxt;
-       cinfo.subctxt = fd->subctxt;
-       cinfo.rcvtids = roundup(uctxt->egrbufs.alloced,
-                               uctxt->dd->rcv_entries.group_size) +
-               uctxt->expected_count;
-       cinfo.credits = uctxt->sc->credits;
-       cinfo.numa_node = uctxt->numa_id;
-       cinfo.rec_cpu = fd->rec_cpu_num;
-       cinfo.send_ctxt = uctxt->sc->hw_context;
-
-       cinfo.egrtids = uctxt->egrbufs.alloced;
-       cinfo.rcvhdrq_cnt = uctxt->rcvhdrq_cnt;
-       cinfo.rcvhdrq_entsize = uctxt->rcvhdrqentsize << 2;
-       cinfo.sdma_ring_size = fd->cq->nentries;
-       cinfo.rcvegr_size = uctxt->egrbufs.rcvtid_size;
-
-       trace_hfi1_ctxt_info(uctxt->dd, uctxt->ctxt, fd->subctxt, cinfo);
-       if (copy_to_user(ubase, &cinfo, sizeof(cinfo)))
-               ret = -EFAULT;
-done:
-       return ret;
-}
-
-static int setup_ctxt(struct file *fp)
-{
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       struct hfi1_devdata *dd = uctxt->dd;
-       int ret = 0;
-
-       /*
-        * Context should be set up only once, including allocation and
-        * programming of eager buffers. This is done when context sharing
-        * is not requested, or by the master process when it is.
-        */
-       if (!uctxt->subctxt_cnt || !fd->subctxt) {
-               ret = hfi1_init_ctxt(uctxt->sc);
-               if (ret)
-                       goto done;
-
-               /* Now allocate the RcvHdr queue and eager buffers. */
-               ret = hfi1_create_rcvhdrq(dd, uctxt);
-               if (ret)
-                       goto done;
-               ret = hfi1_setup_eagerbufs(uctxt);
-               if (ret)
-                       goto done;
-               if (uctxt->subctxt_cnt && !fd->subctxt) {
-                       ret = setup_subctxt(uctxt);
-                       if (ret)
-                               goto done;
-               }
-       } else {
-               ret = wait_event_interruptible(uctxt->wait, !test_bit(
-                                              HFI1_CTXT_MASTER_UNINIT,
-                                              &uctxt->event_flags));
-               if (ret)
-                       goto done;
-       }
-
-       ret = hfi1_user_sdma_alloc_queues(uctxt, fp);
-       if (ret)
-               goto done;
-       /*
-        * Expected receive has to be set up for all processes (including
-        * shared contexts). However, it has to be done after the master
-        * context has been fully configured as it depends on the
-        * eager/expected split of the RcvArray entries.
-        * Setting it up here ensures that the subcontexts will be waiting
-        * (due to the above wait_event_interruptible()) until the master
-        * is set up.
-        */
-       ret = hfi1_user_exp_rcv_init(fp);
-       if (ret)
-               goto done;
-
-       set_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags);
-done:
-       return ret;
-}
-
-static int get_base_info(struct file *fp, void __user *ubase, __u32 len)
-{
-       struct hfi1_base_info binfo;
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       struct hfi1_devdata *dd = uctxt->dd;
-       ssize_t sz;
-       unsigned offset;
-       int ret = 0;
-
-       trace_hfi1_uctxtdata(uctxt->dd, uctxt);
-
-       memset(&binfo, 0, sizeof(binfo));
-       binfo.hw_version = dd->revision;
-       binfo.sw_version = HFI1_KERN_SWVERSION;
-       binfo.bthqp = kdeth_qp;
-       binfo.jkey = uctxt->jkey;
-       /*
-        * If more than 64 contexts are enabled, the allocated credit
-        * return will span two or three contiguous pages. Since we only
-        * map the page containing the context's credit return address,
-        * we need to calculate the offset in the proper page.
-        */
-       offset = ((u64)uctxt->sc->hw_free -
-                 (u64)dd->cr_base[uctxt->numa_id].va) % PAGE_SIZE;
-       binfo.sc_credits_addr = HFI1_MMAP_TOKEN(PIO_CRED, uctxt->ctxt,
-                                               fd->subctxt, offset);
-       binfo.pio_bufbase = HFI1_MMAP_TOKEN(PIO_BUFS, uctxt->ctxt,
-                                           fd->subctxt,
-                                           uctxt->sc->base_addr);
-       binfo.pio_bufbase_sop = HFI1_MMAP_TOKEN(PIO_BUFS_SOP,
-                                               uctxt->ctxt,
-                                               fd->subctxt,
-                                               uctxt->sc->base_addr);
-       binfo.rcvhdr_bufbase = HFI1_MMAP_TOKEN(RCV_HDRQ, uctxt->ctxt,
-                                              fd->subctxt,
-                                              uctxt->rcvhdrq);
-       binfo.rcvegr_bufbase = HFI1_MMAP_TOKEN(RCV_EGRBUF, uctxt->ctxt,
-                                              fd->subctxt,
-                                              uctxt->egrbufs.rcvtids[0].phys);
-       binfo.sdma_comp_bufbase = HFI1_MMAP_TOKEN(SDMA_COMP, uctxt->ctxt,
-                                                fd->subctxt, 0);
-       /*
-        * user regs are at
-        * (RXE_PER_CONTEXT_USER + (ctxt * RXE_PER_CONTEXT_SIZE))
-        */
-       binfo.user_regbase = HFI1_MMAP_TOKEN(UREGS, uctxt->ctxt,
-                                           fd->subctxt, 0);
-       offset = offset_in_page((((uctxt->ctxt - dd->first_user_ctxt) *
-                   HFI1_MAX_SHARED_CTXTS) + fd->subctxt) *
-                 sizeof(*dd->events));
-       binfo.events_bufbase = HFI1_MMAP_TOKEN(EVENTS, uctxt->ctxt,
-                                             fd->subctxt,
-                                             offset);
-       binfo.status_bufbase = HFI1_MMAP_TOKEN(STATUS, uctxt->ctxt,
-                                             fd->subctxt,
-                                             dd->status);
-       if (HFI1_CAP_IS_USET(DMA_RTAIL))
-               binfo.rcvhdrtail_base = HFI1_MMAP_TOKEN(RTAIL, uctxt->ctxt,
-                                                      fd->subctxt, 0);
-       if (uctxt->subctxt_cnt) {
-               binfo.subctxt_uregbase = HFI1_MMAP_TOKEN(SUBCTXT_UREGS,
-                                                       uctxt->ctxt,
-                                                       fd->subctxt, 0);
-               binfo.subctxt_rcvhdrbuf = HFI1_MMAP_TOKEN(SUBCTXT_RCV_HDRQ,
-                                                        uctxt->ctxt,
-                                                        fd->subctxt, 0);
-               binfo.subctxt_rcvegrbuf = HFI1_MMAP_TOKEN(SUBCTXT_EGRBUF,
-                                                        uctxt->ctxt,
-                                                        fd->subctxt, 0);
-       }
-       sz = (len < sizeof(binfo)) ? len : sizeof(binfo);
-       if (copy_to_user(ubase, &binfo, sz))
-               ret = -EFAULT;
-       return ret;
-}
-
-static unsigned int poll_urgent(struct file *fp,
-                               struct poll_table_struct *pt)
-{
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       struct hfi1_devdata *dd = uctxt->dd;
-       unsigned pollflag;
-
-       poll_wait(fp, &uctxt->wait, pt);
-
-       spin_lock_irq(&dd->uctxt_lock);
-       if (uctxt->urgent != uctxt->urgent_poll) {
-               pollflag = POLLIN | POLLRDNORM;
-               uctxt->urgent_poll = uctxt->urgent;
-       } else {
-               pollflag = 0;
-               set_bit(HFI1_CTXT_WAITING_URG, &uctxt->event_flags);
-       }
-       spin_unlock_irq(&dd->uctxt_lock);
-
-       return pollflag;
-}
-
-static unsigned int poll_next(struct file *fp,
-                             struct poll_table_struct *pt)
-{
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       struct hfi1_devdata *dd = uctxt->dd;
-       unsigned pollflag;
-
-       poll_wait(fp, &uctxt->wait, pt);
-
-       spin_lock_irq(&dd->uctxt_lock);
-       if (hdrqempty(uctxt)) {
-               set_bit(HFI1_CTXT_WAITING_RCV, &uctxt->event_flags);
-               hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_ENB, uctxt->ctxt);
-               pollflag = 0;
-       } else {
-               pollflag = POLLIN | POLLRDNORM;
-       }
-       spin_unlock_irq(&dd->uctxt_lock);
-
-       return pollflag;
-}
-
-/*
- * Find all user contexts in use, and set the specified bit in their
- * event mask.
- * See also find_ctxt() for a similar use that is specific to send buffers.
- */
-int hfi1_set_uevent_bits(struct hfi1_pportdata *ppd, const int evtbit)
-{
-       struct hfi1_ctxtdata *uctxt;
-       struct hfi1_devdata *dd = ppd->dd;
-       unsigned ctxt;
-       int ret = 0;
-       unsigned long flags;
-
-       if (!dd->events) {
-               ret = -EINVAL;
-               goto done;
-       }
-
-       spin_lock_irqsave(&dd->uctxt_lock, flags);
-       for (ctxt = dd->first_user_ctxt; ctxt < dd->num_rcv_contexts;
-            ctxt++) {
-               uctxt = dd->rcd[ctxt];
-               if (uctxt) {
-                       unsigned long *evs = dd->events +
-                               (uctxt->ctxt - dd->first_user_ctxt) *
-                               HFI1_MAX_SHARED_CTXTS;
-                       int i;
-                       /*
-                        * subctxt_cnt is 0 if not shared, so handle the base
-                        * context first, then the remaining subctxts, if any
-                        */
-                       set_bit(evtbit, evs);
-                       for (i = 1; i < uctxt->subctxt_cnt; i++)
-                               set_bit(evtbit, evs + i);
-               }
-       }
-       spin_unlock_irqrestore(&dd->uctxt_lock, flags);
-done:
-       return ret;
-}
-
-/**
- * manage_rcvq - manage a context's receive queue
- * @uctxt: the context
- * @subctxt: the sub-context
- * @start_stop: action to carry out
- *
- * start_stop == 0 disables receive on the context, for use in queue
- * overflow conditions.  start_stop == 1 re-enables, to be used to
- * re-init the software copy of the head register.
- */
-static int manage_rcvq(struct hfi1_ctxtdata *uctxt, unsigned subctxt,
-                      int start_stop)
-{
-       struct hfi1_devdata *dd = uctxt->dd;
-       unsigned int rcvctrl_op;
-
-       if (subctxt)
-               goto bail;
-       /* atomically clear receive enable ctxt. */
-       if (start_stop) {
-               /*
-                * On enable, force in-memory copy of the tail register to
-                * 0, so that protocol code doesn't have to worry about
-                * whether or not the chip has yet updated the in-memory
-                * copy on return from the system call. The chip
-                * always resets its tail register back to 0 on a
-                * transition from disabled to enabled.
-                */
-               if (uctxt->rcvhdrtail_kvaddr)
-                       clear_rcvhdrtail(uctxt);
-               rcvctrl_op = HFI1_RCVCTRL_CTXT_ENB;
-       } else {
-               rcvctrl_op = HFI1_RCVCTRL_CTXT_DIS;
-       }
-       hfi1_rcvctrl(dd, rcvctrl_op, uctxt->ctxt);
-       /* always; new head should be equal to new tail; see above */
-bail:
-       return 0;
-}
-
-/*
- * Clear the event notifier events for this context.
- * The user process then performs actions appropriate to the bits having
- * been set, if desired, and checks again in the future.
- */
-static int user_event_ack(struct hfi1_ctxtdata *uctxt, int subctxt,
-                         unsigned long events)
-{
-       int i;
-       struct hfi1_devdata *dd = uctxt->dd;
-       unsigned long *evs;
-
-       if (!dd->events)
-               return 0;
-
-       evs = dd->events + ((uctxt->ctxt - dd->first_user_ctxt) *
-                           HFI1_MAX_SHARED_CTXTS) + subctxt;
-
-       for (i = 0; i <= _HFI1_MAX_EVENT_BIT; i++) {
-               if (!test_bit(i, &events))
-                       continue;
-               clear_bit(i, evs);
-       }
-       return 0;
-}
-
-static int set_ctxt_pkey(struct hfi1_ctxtdata *uctxt, unsigned subctxt,
-                        u16 pkey)
-{
-       int ret = -ENOENT, i, intable = 0;
-       struct hfi1_pportdata *ppd = uctxt->ppd;
-       struct hfi1_devdata *dd = uctxt->dd;
-
-       if (pkey == LIM_MGMT_P_KEY || pkey == FULL_MGMT_P_KEY) {
-               ret = -EINVAL;
-               goto done;
-       }
-
-       for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++)
-               if (pkey == ppd->pkeys[i]) {
-                       intable = 1;
-                       break;
-               }
-
-       if (intable)
-               ret = hfi1_set_ctxt_pkey(dd, uctxt->ctxt, pkey);
-done:
-       return ret;
-}
-
-static int ui_open(struct inode *inode, struct file *filp)
-{
-       struct hfi1_devdata *dd;
-
-       dd = container_of(inode->i_cdev, struct hfi1_devdata, ui_cdev);
-       filp->private_data = dd; /* for other methods */
-       return 0;
-}
-
-static int ui_release(struct inode *inode, struct file *filp)
-{
-       /* nothing to do */
-       return 0;
-}
-
-static loff_t ui_lseek(struct file *filp, loff_t offset, int whence)
-{
-       struct hfi1_devdata *dd = filp->private_data;
-
-       return fixed_size_llseek(filp, offset, whence,
-               (dd->kregend - dd->kregbase) + DC8051_DATA_MEM_SIZE);
-}
-
-/* NOTE: assumes unsigned long is 8 bytes */
-static ssize_t ui_read(struct file *filp, char __user *buf, size_t count,
-                      loff_t *f_pos)
-{
-       struct hfi1_devdata *dd = filp->private_data;
-       void __iomem *base = dd->kregbase;
-       unsigned long total, csr_off,
-               barlen = (dd->kregend - dd->kregbase);
-       u64 data;
-
-       /* only read 8 byte quantities */
-       if ((count % 8) != 0)
-               return -EINVAL;
-       /* offset must be 8-byte aligned */
-       if ((*f_pos % 8) != 0)
-               return -EINVAL;
-       /* destination buffer must be 8-byte aligned */
-       if ((unsigned long)buf % 8 != 0)
-               return -EINVAL;
-       /* must be in range */
-       if (*f_pos + count > (barlen + DC8051_DATA_MEM_SIZE))
-               return -EINVAL;
-       /* only set the base if we are not starting past the BAR */
-       if (*f_pos < barlen)
-               base += *f_pos;
-       csr_off = *f_pos;
-       for (total = 0; total < count; total += 8, csr_off += 8) {
-               /* accessing LCB CSRs requires more checks */
-               if (is_lcb_offset(csr_off)) {
-                       if (read_lcb_csr(dd, csr_off, (u64 *)&data))
-                               break; /* failed */
-               }
-               /*
-                * Cannot read ASIC GPIO/QSFP* clear and force CSRs without a
-                * false parity error.  Avoid the whole issue by not reading
-                * them.  These registers are defined as having a read value
-                * of 0.
-                */
-               else if (csr_off == ASIC_GPIO_CLEAR ||
-                        csr_off == ASIC_GPIO_FORCE ||
-                        csr_off == ASIC_QSFP1_CLEAR ||
-                        csr_off == ASIC_QSFP1_FORCE ||
-                        csr_off == ASIC_QSFP2_CLEAR ||
-                        csr_off == ASIC_QSFP2_FORCE)
-                       data = 0;
-               else if (csr_off >= barlen) {
-                       /*
-                        * read_8051_data can read more than just 8 bytes at
-                        * a time. However, folding this into the loop and
-                        * handling the reads in 8 byte increments allows us
-                        * to smoothly transition from chip memory to 8051
-                        * memory.
-                        */
-                       if (read_8051_data(dd,
-                                          (u32)(csr_off - barlen),
-                                          sizeof(data), &data))
-                               break; /* failed */
-               } else
-                       data = readq(base + total);
-               if (put_user(data, (unsigned long __user *)(buf + total)))
-                       break;
-       }
-       *f_pos += total;
-       return total;
-}
-
-/* NOTE: assumes unsigned long is 8 bytes */
-static ssize_t ui_write(struct file *filp, const char __user *buf,
-                       size_t count, loff_t *f_pos)
-{
-       struct hfi1_devdata *dd = filp->private_data;
-       void __iomem *base;
-       unsigned long total, data, csr_off;
-       int in_lcb;
-
-       /* only write 8 byte quantities */
-       if ((count % 8) != 0)
-               return -EINVAL;
-       /* offset must be 8-byte aligned */
-       if ((*f_pos % 8) != 0)
-               return -EINVAL;
-       /* source buffer must be 8-byte aligned */
-       if ((unsigned long)buf % 8 != 0)
-               return -EINVAL;
-       /* must be in range */
-       if (*f_pos + count > dd->kregend - dd->kregbase)
-               return -EINVAL;
-
-       base = (void __iomem *)dd->kregbase + *f_pos;
-       csr_off = *f_pos;
-       in_lcb = 0;
-       for (total = 0; total < count; total += 8, csr_off += 8) {
-               if (get_user(data, (unsigned long __user *)(buf + total)))
-                       break;
-               /* accessing LCB CSRs requires a special procedure */
-               if (is_lcb_offset(csr_off)) {
-                       if (!in_lcb) {
-                               int ret = acquire_lcb_access(dd, 1);
-
-                               if (ret)
-                                       break;
-                               in_lcb = 1;
-                       }
-               } else {
-                       if (in_lcb) {
-                               release_lcb_access(dd, 1);
-                               in_lcb = 0;
-                       }
-               }
-               writeq(data, base + total);
-       }
-       if (in_lcb)
-               release_lcb_access(dd, 1);
-       *f_pos += total;
-       return total;
-}
-
-static const struct file_operations ui_file_ops = {
-       .owner = THIS_MODULE,
-       .llseek = ui_lseek,
-       .read = ui_read,
-       .write = ui_write,
-       .open = ui_open,
-       .release = ui_release,
-};
-
-#define UI_OFFSET 192  /* device minor offset for UI devices */
-static int create_ui = 1;
-
-static struct cdev wildcard_cdev;
-static struct device *wildcard_device;
-
-static atomic_t user_count = ATOMIC_INIT(0);
-
-static void user_remove(struct hfi1_devdata *dd)
-{
-       if (atomic_dec_return(&user_count) == 0)
-               hfi1_cdev_cleanup(&wildcard_cdev, &wildcard_device);
-
-       hfi1_cdev_cleanup(&dd->user_cdev, &dd->user_device);
-       hfi1_cdev_cleanup(&dd->ui_cdev, &dd->ui_device);
-}
-
-static int user_add(struct hfi1_devdata *dd)
-{
-       char name[10];
-       int ret;
-
-       if (atomic_inc_return(&user_count) == 1) {
-               ret = hfi1_cdev_init(0, class_name(), &hfi1_file_ops,
-                                    &wildcard_cdev, &wildcard_device,
-                                    true);
-               if (ret)
-                       goto done;
-       }
-
-       snprintf(name, sizeof(name), "%s_%d", class_name(), dd->unit);
-       ret = hfi1_cdev_init(dd->unit + 1, name, &hfi1_file_ops,
-                            &dd->user_cdev, &dd->user_device,
-                            true);
-       if (ret)
-               goto done;
-
-       if (create_ui) {
-               snprintf(name, sizeof(name),
-                        "%s_ui%d", class_name(), dd->unit);
-               ret = hfi1_cdev_init(dd->unit + UI_OFFSET, name, &ui_file_ops,
-                                    &dd->ui_cdev, &dd->ui_device,
-                                    false);
-               if (ret)
-                       goto done;
-       }
-
-       return 0;
-done:
-       user_remove(dd);
-       return ret;
-}
-
-/*
- * Create per-unit files in /dev
- */
-int hfi1_device_create(struct hfi1_devdata *dd)
-{
-       int r, ret;
-
-       r = user_add(dd);
-       ret = hfi1_diag_add(dd);
-       if (r && !ret)
-               ret = r;
-       return ret;
-}
-
-/*
- * Remove per-unit files in /dev
- * void; the core kernel returns no errors for these operations
- */
-void hfi1_device_remove(struct hfi1_devdata *dd)
-{
-       user_remove(dd);
-       hfi1_diag_remove(dd);
-}
diff --git a/drivers/staging/rdma/hfi1/firmware.c b/drivers/staging/rdma/hfi1/firmware.c
deleted file mode 100644 (file)
index ed680fd..0000000
+++ /dev/null
@@ -1,2056 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/firmware.h>
-#include <linux/mutex.h>
-#include <linux/module.h>
-#include <linux/delay.h>
-#include <linux/crc32.h>
-
-#include "hfi.h"
-#include "trace.h"
-
-/*
- * Make it easy to toggle the firmware file name, and whether it gets
- * loaded, by editing the following. This may be something we do while
- * in development
- * but not necessarily something a user would ever need to use.
- */
-#define DEFAULT_FW_8051_NAME_FPGA "hfi_dc8051.bin"
-#define DEFAULT_FW_8051_NAME_ASIC "hfi1_dc8051.fw"
-#define DEFAULT_FW_FABRIC_NAME "hfi1_fabric.fw"
-#define DEFAULT_FW_SBUS_NAME "hfi1_sbus.fw"
-#define DEFAULT_FW_PCIE_NAME "hfi1_pcie.fw"
-#define DEFAULT_PLATFORM_CONFIG_NAME "hfi1_platform.dat"
-#define ALT_FW_8051_NAME_ASIC "hfi1_dc8051_d.fw"
-#define ALT_FW_FABRIC_NAME "hfi1_fabric_d.fw"
-#define ALT_FW_SBUS_NAME "hfi1_sbus_d.fw"
-#define ALT_FW_PCIE_NAME "hfi1_pcie_d.fw"
-
-static uint fw_8051_load = 1;
-static uint fw_fabric_serdes_load = 1;
-static uint fw_pcie_serdes_load = 1;
-static uint fw_sbus_load = 1;
-
-/*
- * Access required in platform.c
- * Maintains state of whether the platform config was fetched via the
- * fallback option
- */
-uint platform_config_load;
-
-/* Firmware file names get set in hfi1_firmware_init() based on the above */
-static char *fw_8051_name;
-static char *fw_fabric_serdes_name;
-static char *fw_sbus_name;
-static char *fw_pcie_serdes_name;
-static char *platform_config_name;
-
-#define SBUS_MAX_POLL_COUNT 100
-#define SBUS_COUNTER(reg, name) \
-       (((reg) >> ASIC_STS_SBUS_COUNTERS_##name##_CNT_SHIFT) & \
-        ASIC_STS_SBUS_COUNTERS_##name##_CNT_MASK)
-
-/*
- * Firmware security header.
- */
-struct css_header {
-       u32 module_type;
-       u32 header_len;
-       u32 header_version;
-       u32 module_id;
-       u32 module_vendor;
-       u32 date;               /* BCD yyyymmdd */
-       u32 size;               /* in DWORDs */
-       u32 key_size;           /* in DWORDs */
-       u32 modulus_size;       /* in DWORDs */
-       u32 exponent_size;      /* in DWORDs */
-       u32 reserved[22];
-};
-
-/* expected field values */
-#define CSS_MODULE_TYPE           0x00000006
-#define CSS_HEADER_LEN    0x000000a1
-#define CSS_HEADER_VERSION 0x00010000
-#define CSS_MODULE_VENDOR  0x00008086
-
-#define KEY_SIZE      256
-#define MU_SIZE                8
-#define EXPONENT_SIZE  4
-
-/* the file itself */
-struct firmware_file {
-       struct css_header css_header;
-       u8 modulus[KEY_SIZE];
-       u8 exponent[EXPONENT_SIZE];
-       u8 signature[KEY_SIZE];
-       u8 firmware[];
-};
-
-struct augmented_firmware_file {
-       struct css_header css_header;
-       u8 modulus[KEY_SIZE];
-       u8 exponent[EXPONENT_SIZE];
-       u8 signature[KEY_SIZE];
-       u8 r2[KEY_SIZE];
-       u8 mu[MU_SIZE];
-       u8 firmware[];
-};
-
-/* augmented file size difference */
-#define AUGMENT_SIZE (sizeof(struct augmented_firmware_file) - \
-                                               sizeof(struct firmware_file))
-
-struct firmware_details {
-       /* Linux core piece */
-       const struct firmware *fw;
-
-       struct css_header *css_header;
-       u8 *firmware_ptr;               /* pointer to binary data */
-       u32 firmware_len;               /* length in bytes */
-       u8 *modulus;                    /* pointer to the modulus */
-       u8 *exponent;                   /* pointer to the exponent */
-       u8 *signature;                  /* pointer to the signature */
-       u8 *r2;                         /* pointer to r2 */
-       u8 *mu;                         /* pointer to mu */
-       struct augmented_firmware_file dummy_header;
-};
-
-/*
- * The mutex protects fw_state, fw_err, and all of the firmware_details
- * variables.
- */
-static DEFINE_MUTEX(fw_mutex);
-enum fw_state {
-       FW_EMPTY,
-       FW_TRY,
-       FW_FINAL,
-       FW_ERR
-};
-
-static enum fw_state fw_state = FW_EMPTY;
-static int fw_err;
-static struct firmware_details fw_8051;
-static struct firmware_details fw_fabric;
-static struct firmware_details fw_pcie;
-static struct firmware_details fw_sbus;
-static const struct firmware *platform_config;
-
-/* flags for turn_off_spicos() */
-#define SPICO_SBUS   0x1
-#define SPICO_FABRIC 0x2
-#define ENABLE_SPICO_SMASK 0x1
-
-/* security block commands */
-#define RSA_CMD_INIT  0x1
-#define RSA_CMD_START 0x2
-
-/* security block status */
-#define RSA_STATUS_IDLE   0x0
-#define RSA_STATUS_ACTIVE 0x1
-#define RSA_STATUS_DONE   0x2
-#define RSA_STATUS_FAILED 0x3
-
-/* RSA engine timeout, in ms */
-#define RSA_ENGINE_TIMEOUT 100 /* ms */
-
-/* hardware mutex timeout, in ms */
-#define HM_TIMEOUT 10 /* ms */
-
-/* 8051 memory access timeout, in us */
-#define DC8051_ACCESS_TIMEOUT 100 /* us */
-
-/* the number of fabric SerDes on the SBus */
-#define NUM_FABRIC_SERDES 4
-
-/* SBus fabric SerDes addresses, one set per HFI */
-static const u8 fabric_serdes_addrs[2][NUM_FABRIC_SERDES] = {
-       { 0x01, 0x02, 0x03, 0x04 },
-       { 0x28, 0x29, 0x2a, 0x2b }
-};
-
-/* SBus PCIe SerDes addresses, one set per HFI */
-static const u8 pcie_serdes_addrs[2][NUM_PCIE_SERDES] = {
-       { 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16,
-         0x18, 0x1a, 0x1c, 0x1e, 0x20, 0x22, 0x24, 0x26 },
-       { 0x2f, 0x31, 0x33, 0x35, 0x37, 0x39, 0x3b, 0x3d,
-         0x3f, 0x41, 0x43, 0x45, 0x47, 0x49, 0x4b, 0x4d }
-};
-
-/* SBus PCIe PCS addresses, one set per HFI */
-const u8 pcie_pcs_addrs[2][NUM_PCIE_SERDES] = {
-       { 0x09, 0x0b, 0x0d, 0x0f, 0x11, 0x13, 0x15, 0x17,
-         0x19, 0x1b, 0x1d, 0x1f, 0x21, 0x23, 0x25, 0x27 },
-       { 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e,
-         0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e }
-};
-
-/* SBus fabric SerDes broadcast addresses, one per HFI */
-static const u8 fabric_serdes_broadcast[2] = { 0xe4, 0xe5 };
-static const u8 all_fabric_serdes_broadcast = 0xe1;
-
-/* SBus PCIe SerDes broadcast addresses, one per HFI */
-const u8 pcie_serdes_broadcast[2] = { 0xe2, 0xe3 };
-static const u8 all_pcie_serdes_broadcast = 0xe0;
-
-/* forwards */
-static void dispose_one_firmware(struct firmware_details *fdet);
-static int load_fabric_serdes_firmware(struct hfi1_devdata *dd,
-                                      struct firmware_details *fdet);
-
-/*
- * Read a single 64-bit value from 8051 data memory.
- *
- * Expects:
- * o caller to have already set up data read, no auto increment
- * o caller to turn off read enable when finished
- *
- * The address argument is a byte offset.  Bits 0:2 in the address are
- * ignored - i.e. the hardware will always do aligned 8-byte reads as if
- * the lower bits are zero.
- *
- * Return 0 on success, -ENXIO on a read error (timeout).
- */
-static int __read_8051_data(struct hfi1_devdata *dd, u32 addr, u64 *result)
-{
-       u64 reg;
-       int count;
-
-       /* start the read at the given address */
-       reg = ((addr & DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK)
-                       << DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT)
-               | DC_DC8051_CFG_RAM_ACCESS_CTRL_READ_ENA_SMASK;
-       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, reg);
-
-       /* wait until ACCESS_COMPLETED is set */
-       count = 0;
-       while ((read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_STATUS)
-                   & DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK)
-                   == 0) {
-               count++;
-               if (count > DC8051_ACCESS_TIMEOUT) {
-                       dd_dev_err(dd, "timeout reading 8051 data\n");
-                       return -ENXIO;
-               }
-               ndelay(10);
-       }
-
-       /* gather the data */
-       *result = read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_RD_DATA);
-
-       return 0;
-}
-
-/*
- * Read 8051 data starting at addr, for len bytes.  Will read in 8-byte chunks.
- * Return 0 on success, -errno on error.
- */
-int read_8051_data(struct hfi1_devdata *dd, u32 addr, u32 len, u64 *result)
-{
-       unsigned long flags;
-       u32 done;
-       int ret = 0;
-
-       spin_lock_irqsave(&dd->dc8051_memlock, flags);
-
-       /* data read set-up, no auto-increment */
-       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, 0);
-
-       for (done = 0; done < len; addr += 8, done += 8, result++) {
-               ret = __read_8051_data(dd, addr, result);
-               if (ret)
-                       break;
-       }
-
-       /* turn off read enable */
-       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, 0);
-
-       spin_unlock_irqrestore(&dd->dc8051_memlock, flags);
-
-       return ret;
-}
-
-/*
- * Write data or code to the 8051 code or data RAM.
- */
-static int write_8051(struct hfi1_devdata *dd, int code, u32 start,
-                     const u8 *data, u32 len)
-{
-       u64 reg;
-       u32 offset;
-       int aligned, count;
-
-       /* check alignment */
-       aligned = ((unsigned long)data & 0x7) == 0;
-
-       /* write set-up */
-       reg = (code ? DC_DC8051_CFG_RAM_ACCESS_SETUP_RAM_SEL_SMASK : 0ull)
-               | DC_DC8051_CFG_RAM_ACCESS_SETUP_AUTO_INCR_ADDR_SMASK;
-       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, reg);
-
-       reg = ((start & DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK)
-                       << DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT)
-               | DC_DC8051_CFG_RAM_ACCESS_CTRL_WRITE_ENA_SMASK;
-       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, reg);
-
-       /* write */
-       for (offset = 0; offset < len; offset += 8) {
-               int bytes = len - offset;
-
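-               /*
-                * Assemble the next quadword to write: zero-pad a short
-                * tail, load directly when the buffer is 8-byte aligned,
-                * otherwise memcpy to avoid an unaligned access.
-                */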
-               if (bytes < 8) {
-                       reg = 0;
-                       memcpy(&reg, &data[offset], bytes);
-               } else if (aligned) {
-                       reg = *(u64 *)&data[offset];
-               } else {
-                       memcpy(&reg, &data[offset], 8);
-               }
-               write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_WR_DATA, reg);
-
-               /* wait until ACCESS_COMPLETED is set */
-               count = 0;
-               while ((read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_STATUS)
-                   & DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK)
-                   == 0) {
-                       count++;
-                       if (count > DC8051_ACCESS_TIMEOUT) {
-                               dd_dev_err(dd, "timeout writing 8051 data\n");
-                               return -ENXIO;
-                       }
-                       udelay(1);
-               }
-       }
-
-       /* turn off write access, auto increment (also sets to data access) */
-       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, 0);
-       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, 0);
-
-       return 0;
-}
-
-/* return 0 if values match, non-zero and complain otherwise */
-static int invalid_header(struct hfi1_devdata *dd, const char *what,
-                         u32 actual, u32 expected)
-{
-       if (actual == expected)
-               return 0;
-
-       dd_dev_err(dd,
-                  "invalid firmware header field %s: expected 0x%x, actual 0x%x\n",
-                  what, expected, actual);
-       return 1;
-}
-
-/*
- * Verify that the static fields in the CSS header match.
- */
-static int verify_css_header(struct hfi1_devdata *dd, struct css_header *css)
-{
-       /* verify CSS header fields (most sizes are in DW, hence the /4) */
-       if (invalid_header(dd, "module_type", css->module_type,
-                          CSS_MODULE_TYPE) ||
-           invalid_header(dd, "header_len", css->header_len,
-                          (sizeof(struct firmware_file) / 4)) ||
-           invalid_header(dd, "header_version", css->header_version,
-                          CSS_HEADER_VERSION) ||
-           invalid_header(dd, "module_vendor", css->module_vendor,
-                          CSS_MODULE_VENDOR) ||
-           invalid_header(dd, "key_size", css->key_size, KEY_SIZE / 4) ||
-           invalid_header(dd, "modulus_size", css->modulus_size,
-                          KEY_SIZE / 4) ||
-           invalid_header(dd, "exponent_size", css->exponent_size,
-                          EXPONENT_SIZE / 4)) {
-               return -EINVAL;
-       }
-       return 0;
-}
-
-/*
- * Make sure there are at least some bytes after the prefix.
- */
-static int payload_check(struct hfi1_devdata *dd, const char *name,
-                        long file_size, long prefix_size)
-{
-       /* make sure we have some payload */
-       if (prefix_size >= file_size) {
-               dd_dev_err(dd,
-                          "firmware \"%s\", size %ld, must be larger than %ld bytes\n",
-                          name, file_size, prefix_size);
-               return -EINVAL;
-       }
-
-       return 0;
-}
-
-/*
- * Request the firmware from the system.  Extract the pieces and fill in
- * fdet.  If successful, the caller will need to call dispose_one_firmware().
- * Returns 0 on success, -ERRNO on error.
- */
-static int obtain_one_firmware(struct hfi1_devdata *dd, const char *name,
-                              struct firmware_details *fdet)
-{
-       struct css_header *css;
-       int ret;
-
-       memset(fdet, 0, sizeof(*fdet));
-
-       ret = request_firmware(&fdet->fw, name, &dd->pcidev->dev);
-       if (ret) {
-               dd_dev_warn(dd, "cannot find firmware \"%s\", err %d\n",
-                           name, ret);
-               return ret;
-       }
-
-       /* verify the firmware */
-       if (fdet->fw->size < sizeof(struct css_header)) {
-               dd_dev_err(dd, "firmware \"%s\" is too small\n", name);
-               ret = -EINVAL;
-               goto done;
-       }
-       css = (struct css_header *)fdet->fw->data;
-
-       hfi1_cdbg(FIRMWARE, "Firmware %s details:", name);
-       hfi1_cdbg(FIRMWARE, "file size: 0x%lx bytes", fdet->fw->size);
-       hfi1_cdbg(FIRMWARE, "CSS structure:");
-       hfi1_cdbg(FIRMWARE, "  module_type    0x%x", css->module_type);
-       hfi1_cdbg(FIRMWARE, "  header_len     0x%03x (0x%03x bytes)",
-                 css->header_len, 4 * css->header_len);
-       hfi1_cdbg(FIRMWARE, "  header_version 0x%x", css->header_version);
-       hfi1_cdbg(FIRMWARE, "  module_id      0x%x", css->module_id);
-       hfi1_cdbg(FIRMWARE, "  module_vendor  0x%x", css->module_vendor);
-       hfi1_cdbg(FIRMWARE, "  date           0x%x", css->date);
-       hfi1_cdbg(FIRMWARE, "  size           0x%03x (0x%03x bytes)",
-                 css->size, 4 * css->size);
-       hfi1_cdbg(FIRMWARE, "  key_size       0x%03x (0x%03x bytes)",
-                 css->key_size, 4 * css->key_size);
-       hfi1_cdbg(FIRMWARE, "  modulus_size   0x%03x (0x%03x bytes)",
-                 css->modulus_size, 4 * css->modulus_size);
-       hfi1_cdbg(FIRMWARE, "  exponent_size  0x%03x (0x%03x bytes)",
-                 css->exponent_size, 4 * css->exponent_size);
-       hfi1_cdbg(FIRMWARE, "firmware size: 0x%lx bytes",
-                 fdet->fw->size - sizeof(struct firmware_file));
-
-       /*
-        * If the file does not have a valid CSS header, fail.
-        * Otherwise, check the CSS size field for an expected size.
-        * The augmented file has r2 and mu inserted after the header
-        * was generated, so there will be a known difference between
-        * the CSS header size and the actual file size.  Use this
-        * difference to identify an augmented file.
-        *
-        * Note: css->size is in DWORDs, multiply by 4 to get bytes.
-        */
-       ret = verify_css_header(dd, css);
-       if (ret) {
-               dd_dev_info(dd, "Invalid CSS header for \"%s\"\n", name);
-       } else if ((css->size * 4) == fdet->fw->size) {
-               /* non-augmented firmware file */
-               struct firmware_file *ff = (struct firmware_file *)
-                                                       fdet->fw->data;
-
-               /* make sure there are bytes in the payload */
-               ret = payload_check(dd, name, fdet->fw->size,
-                                   sizeof(struct firmware_file));
-               if (ret == 0) {
-                       fdet->css_header = css;
-                       fdet->modulus = ff->modulus;
-                       fdet->exponent = ff->exponent;
-                       fdet->signature = ff->signature;
-                       fdet->r2 = fdet->dummy_header.r2; /* use dummy space */
-                       fdet->mu = fdet->dummy_header.mu; /* use dummy space */
-                       fdet->firmware_ptr = ff->firmware;
-                       fdet->firmware_len = fdet->fw->size -
-                                               sizeof(struct firmware_file);
-                       /*
-                        * Header does not include r2 and mu - generate here.
-                        * For now, fail.
-                        */
-                       dd_dev_err(dd, "driver is unable to validate firmware without r2 and mu (not in firmware file)\n");
-                       ret = -EINVAL;
-               }
-       } else if ((css->size * 4) + AUGMENT_SIZE == fdet->fw->size) {
-               /* augmented firmware file */
-               struct augmented_firmware_file *aff =
-                       (struct augmented_firmware_file *)fdet->fw->data;
-
-               /* make sure there are bytes in the payload */
-               ret = payload_check(dd, name, fdet->fw->size,
-                                   sizeof(struct augmented_firmware_file));
-               if (ret == 0) {
-                       fdet->css_header = css;
-                       fdet->modulus = aff->modulus;
-                       fdet->exponent = aff->exponent;
-                       fdet->signature = aff->signature;
-                       fdet->r2 = aff->r2;
-                       fdet->mu = aff->mu;
-                       fdet->firmware_ptr = aff->firmware;
-                       fdet->firmware_len = fdet->fw->size -
-                                       sizeof(struct augmented_firmware_file);
-               }
-       } else {
-               /* css->size check failed */
-               dd_dev_err(dd,
-                          "invalid firmware header field size: expected 0x%lx or 0x%lx, actual 0x%x\n",
-                          fdet->fw->size / 4,
-                          (fdet->fw->size - AUGMENT_SIZE) / 4,
-                          css->size);
-
-               ret = -EINVAL;
-       }
-
-done:
-       /* if returning an error, clean up after ourselves */
-       if (ret)
-               dispose_one_firmware(fdet);
-       return ret;
-}
-
-static void dispose_one_firmware(struct firmware_details *fdet)
-{
-       release_firmware(fdet->fw);
-       /* erase all previous information */
-       memset(fdet, 0, sizeof(*fdet));
-}
-
-/*
- * Obtain the 4 firmware images from the OS.  All must be obtained at once or not
- * at all.  If called with the firmware state in FW_TRY, use alternate names.
- * On exit, this routine will have set the firmware state to one of FW_TRY,
- * FW_FINAL, or FW_ERR.
- *
- * Must be holding fw_mutex.
- */
-static void __obtain_firmware(struct hfi1_devdata *dd)
-{
-       int err = 0;
-
-       if (fw_state == FW_FINAL)       /* nothing more to obtain */
-               return;
-       if (fw_state == FW_ERR)         /* already in error */
-               return;
-
-       /* fw_state is FW_EMPTY or FW_TRY */
-retry:
-       if (fw_state == FW_TRY) {
-               /*
-                * We tried the original and it failed.  Move to the
-                * alternate.
-                */
-               dd_dev_warn(dd, "using alternate firmware names\n");
-               /*
-                * Let others run.  Some systems, when firmware is missing,
-                * do something that blocks for 30 seconds.  If we do that
-                * twice in a row it triggers a task-blocked warning.
-                */
-               cond_resched();
-               if (fw_8051_load)
-                       dispose_one_firmware(&fw_8051);
-               if (fw_fabric_serdes_load)
-                       dispose_one_firmware(&fw_fabric);
-               if (fw_sbus_load)
-                       dispose_one_firmware(&fw_sbus);
-               if (fw_pcie_serdes_load)
-                       dispose_one_firmware(&fw_pcie);
-               fw_8051_name = ALT_FW_8051_NAME_ASIC;
-               fw_fabric_serdes_name = ALT_FW_FABRIC_NAME;
-               fw_sbus_name = ALT_FW_SBUS_NAME;
-               fw_pcie_serdes_name = ALT_FW_PCIE_NAME;
-       }
-
-       if (fw_sbus_load) {
-               err = obtain_one_firmware(dd, fw_sbus_name, &fw_sbus);
-               if (err)
-                       goto done;
-       }
-
-       if (fw_pcie_serdes_load) {
-               err = obtain_one_firmware(dd, fw_pcie_serdes_name, &fw_pcie);
-               if (err)
-                       goto done;
-       }
-
-       if (fw_fabric_serdes_load) {
-               err = obtain_one_firmware(dd, fw_fabric_serdes_name,
-                                         &fw_fabric);
-               if (err)
-                       goto done;
-       }
-
-       if (fw_8051_load) {
-               err = obtain_one_firmware(dd, fw_8051_name, &fw_8051);
-               if (err)
-                       goto done;
-       }
-
-done:
-       if (err) {
-               /* oops, had problems obtaining a firmware */
-               if (fw_state == FW_EMPTY && dd->icode == ICODE_RTL_SILICON) {
-                       /* retry with alternate (RTL only) */
-                       fw_state = FW_TRY;
-                       goto retry;
-               }
-               dd_dev_err(dd, "unable to obtain working firmware\n");
-               fw_state = FW_ERR;
-               fw_err = -ENOENT;
-       } else {
-               /* success */
-               if (fw_state == FW_EMPTY &&
-                   dd->icode != ICODE_FUNCTIONAL_SIMULATOR)
-                       fw_state = FW_TRY;      /* may retry later */
-               else
-                       fw_state = FW_FINAL;    /* cannot try again */
-       }
-}
-
-/*
- * Called by all HFIs when loading their firmware - i.e. device probe time.
- * The first one will do the actual firmware load.  Use a mutex to resolve
- * any possible race condition.
- *
- * The call to this routine cannot be moved to driver load because the kernel
- * call request_firmware() requires a device which is only available after
- * the first device probe.
- */
-static int obtain_firmware(struct hfi1_devdata *dd)
-{
-       unsigned long timeout;
-       int err = 0;
-
-       mutex_lock(&fw_mutex);
-
-       /* 40s delay due to long delay on missing firmware on some systems */
-       timeout = jiffies + msecs_to_jiffies(40000);
-       while (fw_state == FW_TRY) {
-               /*
-                * Another device is trying the firmware.  Wait until it
-                * decides what works (or not).
-                */
-               if (time_after(jiffies, timeout)) {
-                       /* waited too long */
-                       dd_dev_err(dd, "Timeout waiting for firmware try");
-                       fw_state = FW_ERR;
-                       fw_err = -ETIMEDOUT;
-                       break;
-               }
-               mutex_unlock(&fw_mutex);
-               msleep(20);     /* arbitrary delay */
-               mutex_lock(&fw_mutex);
-       }
-       /* not in FW_TRY state */
-
-       if (fw_state == FW_FINAL) {
-               if (platform_config) {
-                       dd->platform_config.data = platform_config->data;
-                       dd->platform_config.size = platform_config->size;
-               }
-               goto done;      /* already acquired */
-       } else if (fw_state == FW_ERR) {
-               goto done;      /* already tried and failed */
-       }
-       /* fw_state is FW_EMPTY */
-
-       /* set fw_state to FW_TRY, FW_FINAL, or FW_ERR, and fw_err */
-       __obtain_firmware(dd);
-
-       if (platform_config_load) {
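-               /*
-                * The platform configuration file is optional here - a
-                * failure to obtain it is not returned to the caller.
-                */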
-               platform_config = NULL;
-               err = request_firmware(&platform_config, platform_config_name,
-                                      &dd->pcidev->dev);
-               if (err) {
-                       platform_config = NULL;
-                       goto done;
-               }
-               dd->platform_config.data = platform_config->data;
-               dd->platform_config.size = platform_config->size;
-       }
-
-done:
-       mutex_unlock(&fw_mutex);
-
-       return fw_err;
-}
-
-/*
- * Called when the driver unloads.  The timing is asymmetric with its
- * counterpart, obtain_firmware().  If called at device remove time,
- * then it is conceivable that another device could probe while the
- * firmware is being disposed.  The mutexes can be moved to do that
- * safely, but then the firmware would be requested from the OS multiple
- * times.
- *
- * No mutex is needed as the driver is unloading and there cannot be any
- * other callers.
- */
-void dispose_firmware(void)
-{
-       dispose_one_firmware(&fw_8051);
-       dispose_one_firmware(&fw_fabric);
-       dispose_one_firmware(&fw_pcie);
-       dispose_one_firmware(&fw_sbus);
-
-       release_firmware(platform_config);
-       platform_config = NULL;
-
-       /* retain the error state, otherwise revert to empty */
-       if (fw_state != FW_ERR)
-               fw_state = FW_EMPTY;
-}
-
-/*
- * Called with the result of a firmware download.
- *
- * Return 1 to retry loading the firmware, 0 to stop.
- */
-static int retry_firmware(struct hfi1_devdata *dd, int load_result)
-{
-       int retry;
-
-       mutex_lock(&fw_mutex);
-
-       if (load_result == 0) {
-               /*
-                * The load succeeded, so expect all others to do the same.
-                * Do not retry again.
-                */
-               if (fw_state == FW_TRY)
-                       fw_state = FW_FINAL;
-               retry = 0;      /* do NOT retry */
-       } else if (fw_state == FW_TRY) {
-               /* load failed, obtain alternate firmware */
-               __obtain_firmware(dd);
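-               /* retry only if the alternate firmware was obtained */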
-               retry = (fw_state == FW_FINAL);
-       } else {
-               /* else in FW_FINAL or FW_ERR, no retry in either case */
-               retry = 0;
-       }
-
-       mutex_unlock(&fw_mutex);
-       return retry;
-}
-
-/*
- * Write a block of data to a given array CSR.  All calls will be in
- * multiples of 8 bytes.
- */
-static void write_rsa_data(struct hfi1_devdata *dd, int what,
-                          const u8 *data, int nbytes)
-{
-       int qw_size = nbytes / 8;
-       int i;
-
-       if (((unsigned long)data & 0x7) == 0) {
-               /* aligned */
-               u64 *ptr = (u64 *)data;
-
-               for (i = 0; i < qw_size; i++, ptr++)
-                       write_csr(dd, what + (8 * i), *ptr);
-       } else {
-               /* not aligned */
-               for (i = 0; i < qw_size; i++, data += 8) {
-                       u64 value;
-
-                       memcpy(&value, data, 8);
-                       write_csr(dd, what + (8 * i), value);
-               }
-       }
-}
-
-/*
- * Write a block of data to a given CSR as a stream of writes.  All calls will
- * be in multiples of 8 bytes.
- */
-static void write_streamed_rsa_data(struct hfi1_devdata *dd, int what,
-                                   const u8 *data, int nbytes)
-{
-       u64 *ptr = (u64 *)data;
-       int qw_size = nbytes / 8;
-
-       for (; qw_size > 0; qw_size--, ptr++)
-               write_csr(dd, what, *ptr);
-}
-
-/*
- * Download the signature and start the RSA mechanism.  Wait for
- * RSA_ENGINE_TIMEOUT before giving up.
- */
-static int run_rsa(struct hfi1_devdata *dd, const char *who,
-                  const u8 *signature)
-{
-       unsigned long timeout;
-       u64 reg;
-       u32 status;
-       int ret = 0;
-
-       /* write the signature */
-       write_rsa_data(dd, MISC_CFG_RSA_SIGNATURE, signature, KEY_SIZE);
-
-       /* initialize RSA */
-       write_csr(dd, MISC_CFG_RSA_CMD, RSA_CMD_INIT);
-
-       /*
-        * Make sure the engine is idle and insert a delay between the two
-        * writes to MISC_CFG_RSA_CMD.
-        */
-       status = (read_csr(dd, MISC_CFG_FW_CTRL)
-                          & MISC_CFG_FW_CTRL_RSA_STATUS_SMASK)
-                            >> MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT;
-       if (status != RSA_STATUS_IDLE) {
-               dd_dev_err(dd, "%s security engine not idle - giving up\n",
-                          who);
-               return -EBUSY;
-       }
-
-       /* start RSA */
-       write_csr(dd, MISC_CFG_RSA_CMD, RSA_CMD_START);
-
-       /*
-        * Look for the result.
-        *
-        * The RSA engine is hooked up to two MISC errors.  The driver
-        * masks these errors as they do not respond to the standard
-        * error "clear down" mechanism.  Look for these errors here and
-        * clear them when possible.  This routine will exit with the
-        * errors of the current run still set.
-        *
-        * MISC_FW_AUTH_FAILED_ERR
-        *      Firmware authorization failed.  This can be cleared by
-        *      re-initializing the RSA engine, then clearing the status bit.
-        *      Do not re-init the RSA engine immediately after a successful
-        *      run - this will reset the current authorization.
-        *
-        * MISC_KEY_MISMATCH_ERR
-        *      Key does not match.  The only way to clear this is to load
-        *      a matching key then clear the status bit.  If this error
-        *      is raised, it will persist outside of this routine until a
-        *      matching key is loaded.
-        */
-       timeout = msecs_to_jiffies(RSA_ENGINE_TIMEOUT) + jiffies;
-       while (1) {
-               status = (read_csr(dd, MISC_CFG_FW_CTRL)
-                          & MISC_CFG_FW_CTRL_RSA_STATUS_SMASK)
-                            >> MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT;
-
-               if (status == RSA_STATUS_IDLE) {
-                       /* should not happen */
-                       dd_dev_err(dd, "%s firmware security bad idle state\n",
-                                  who);
-                       ret = -EINVAL;
-                       break;
-               } else if (status == RSA_STATUS_DONE) {
-                       /* finished successfully */
-                       break;
-               } else if (status == RSA_STATUS_FAILED) {
-                       /* finished unsuccessfully */
-                       ret = -EINVAL;
-                       break;
-               }
-               /* else still active */
-
-               if (time_after(jiffies, timeout)) {
-                       /*
-                        * Timed out while active.  We can't reset the engine
-                        * if it is stuck active, but run through the
-                        * error code to see what error bits are set.
-                        */
-                       dd_dev_err(dd, "%s firmware security time out\n", who);
-                       ret = -ETIMEDOUT;
-                       break;
-               }
-
-               msleep(20);
-       }
-
-       /*
-        * Arrive here on success or failure.  Clear all RSA engine
-        * errors.  Errors from the current run will stick - the RSA logic
-        * is still holding them high.  Errors from previous runs will
-        * clear - the RSA logic is no longer holding them high.
-        */
-       write_csr(dd, MISC_ERR_CLEAR,
-                 MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK |
-                 MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK);
-       /*
-        * All that is left are the current errors.  Print warnings on
-        * authorization failure details, if any.  Firmware authorization
-        * can be retried, so these are only warnings.
-        */
-       reg = read_csr(dd, MISC_ERR_STATUS);
-       if (ret) {
-               if (reg & MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK)
-                       dd_dev_warn(dd, "%s firmware authorization failed\n",
-                                   who);
-               if (reg & MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK)
-                       dd_dev_warn(dd, "%s firmware key mismatch\n", who);
-       }
-
-       return ret;
-}
-
-static void load_security_variables(struct hfi1_devdata *dd,
-                                   struct firmware_details *fdet)
-{
-       /* Security variables a.  Write the modulus */
-       write_rsa_data(dd, MISC_CFG_RSA_MODULUS, fdet->modulus, KEY_SIZE);
-       /* Security variables b.  Write the r2 */
-       write_rsa_data(dd, MISC_CFG_RSA_R2, fdet->r2, KEY_SIZE);
-       /* Security variables c.  Write the mu */
-       write_rsa_data(dd, MISC_CFG_RSA_MU, fdet->mu, MU_SIZE);
-       /* Security variables d.  Write the header */
-       write_streamed_rsa_data(dd, MISC_CFG_SHA_PRELOAD,
-                               (u8 *)fdet->css_header,
-                               sizeof(struct css_header));
-}
-
-/* return the 8051 firmware state */
-static inline u32 get_firmware_state(struct hfi1_devdata *dd)
-{
-       u64 reg = read_csr(dd, DC_DC8051_STS_CUR_STATE);
-
-       return (reg >> DC_DC8051_STS_CUR_STATE_FIRMWARE_SHIFT)
-                               & DC_DC8051_STS_CUR_STATE_FIRMWARE_MASK;
-}
-
-/*
- * Wait until the firmware is up and ready to take host requests.
- * Return 0 on success, -ETIMEDOUT on timeout.
- */
-int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout)
-{
-       unsigned long timeout;
-
-       /* in the simulator, the fake 8051 is always ready */
-       if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
-               return 0;
-
-       timeout = msecs_to_jiffies(mstimeout) + jiffies;
-       while (1) {
-               if (get_firmware_state(dd) == 0xa0)     /* ready */
-                       return 0;
-               if (time_after(jiffies, timeout))       /* timed out */
-                       return -ETIMEDOUT;
-               usleep_range(1950, 2050); /* sleep 2ms-ish */
-       }
-}
-
-/*
- * Load the 8051 firmware.
- */
-static int load_8051_firmware(struct hfi1_devdata *dd,
-                             struct firmware_details *fdet)
-{
-       u64 reg;
-       int ret;
-       u8 ver_a, ver_b;
-
-       /*
-        * DC Reset sequence
-        * Load DC 8051 firmware
-        */
-       /*
-        * DC reset step 1: Reset DC8051
-        */
-       reg = DC_DC8051_CFG_RST_M8051W_SMASK
-               | DC_DC8051_CFG_RST_CRAM_SMASK
-               | DC_DC8051_CFG_RST_DRAM_SMASK
-               | DC_DC8051_CFG_RST_IRAM_SMASK
-               | DC_DC8051_CFG_RST_SFR_SMASK;
-       write_csr(dd, DC_DC8051_CFG_RST, reg);
-
-       /*
-        * DC reset step 2 (optional): Load 8051 data memory with link
-        * configuration
-        */
-
-       /*
-        * DC reset step 3: Load DC8051 firmware
-        */
-       /* release all but the core reset */
-       reg = DC_DC8051_CFG_RST_M8051W_SMASK;
-       write_csr(dd, DC_DC8051_CFG_RST, reg);
-
-       /* Firmware load step 1 */
-       load_security_variables(dd, fdet);
-
-       /*
-        * Firmware load step 2.  Clear MISC_CFG_FW_CTRL.FW_8051_LOADED
-        */
-       write_csr(dd, MISC_CFG_FW_CTRL, 0);
-
-       /* Firmware load steps 3-5 */
-       ret = write_8051(dd, 1/*code*/, 0, fdet->firmware_ptr,
-                        fdet->firmware_len);
-       if (ret)
-               return ret;
-
-       /*
-        * DC reset step 4. Host starts the DC8051 firmware
-        */
-       /*
-        * Firmware load step 6.  Set MISC_CFG_FW_CTRL.FW_8051_LOADED
-        */
-       write_csr(dd, MISC_CFG_FW_CTRL, MISC_CFG_FW_CTRL_FW_8051_LOADED_SMASK);
-
-       /* Firmware load steps 7-10 */
-       ret = run_rsa(dd, "8051", fdet->signature);
-       if (ret)
-               return ret;
-
-       /* clear all reset bits, releasing the 8051 */
-       write_csr(dd, DC_DC8051_CFG_RST, 0ull);
-
-       /*
-        * DC reset step 5. Wait for firmware to be ready to accept host
-        * requests.
-        */
-       ret = wait_fm_ready(dd, TIMEOUT_8051_START);
-       if (ret) { /* timed out */
-               dd_dev_err(dd, "8051 start timeout, current state 0x%x\n",
-                          get_firmware_state(dd));
-               return -ETIMEDOUT;
-       }
-
-       read_misc_status(dd, &ver_a, &ver_b);
-       dd_dev_info(dd, "8051 firmware version %d.%d\n",
-                   (int)ver_b, (int)ver_a);
-       dd->dc8051_ver = dc8051_ver(ver_b, ver_a);
-
-       return 0;
-}
-
-/*
- * Write the SBus request register
- *
- * No need for masking - the arguments are sized exactly.
- */
-void sbus_request(struct hfi1_devdata *dd,
-                 u8 receiver_addr, u8 data_addr, u8 command, u32 data_in)
-{
-       write_csr(dd, ASIC_CFG_SBUS_REQUEST,
-                 ((u64)data_in << ASIC_CFG_SBUS_REQUEST_DATA_IN_SHIFT) |
-                 ((u64)command << ASIC_CFG_SBUS_REQUEST_COMMAND_SHIFT) |
-                 ((u64)data_addr << ASIC_CFG_SBUS_REQUEST_DATA_ADDR_SHIFT) |
-                 ((u64)receiver_addr <<
-                  ASIC_CFG_SBUS_REQUEST_RECEIVER_ADDR_SHIFT));
-}
-
-/*
- * Turn off the SBus and fabric serdes spicos.
- *
- * + Must be called with SBus fast mode turned on.
- * + Must be called after fabric serdes broadcast is set up.
- * + Must be called before the 8051 is loaded - assumes 8051 is not loaded
- *   when using MISC_CFG_FW_CTRL.
- */
-static void turn_off_spicos(struct hfi1_devdata *dd, int flags)
-{
-       /* only needed on A0 */
-       if (!is_ax(dd))
-               return;
-
-       dd_dev_info(dd, "Turning off spicos:%s%s\n",
-                   flags & SPICO_SBUS ? " SBus" : "",
-                   flags & SPICO_FABRIC ? " fabric" : "");
-
-       write_csr(dd, MISC_CFG_FW_CTRL, ENABLE_SPICO_SMASK);
-       /* disable SBus spico */
-       if (flags & SPICO_SBUS)
-               sbus_request(dd, SBUS_MASTER_BROADCAST, 0x01,
-                            WRITE_SBUS_RECEIVER, 0x00000040);
-
-       /* disable the fabric serdes spicos */
-       if (flags & SPICO_FABRIC)
-               sbus_request(dd, fabric_serdes_broadcast[dd->hfi1_id],
-                            0x07, WRITE_SBUS_RECEIVER, 0x00000000);
-       write_csr(dd, MISC_CFG_FW_CTRL, 0);
-}
-
-/*
- * Reset all of the fabric serdes for this HFI in preparation to take the
- * link to Polling.
- *
- * To do a reset, we need to write to the serdes registers.  Unfortunately,
- * the fabric serdes download to the other HFI on the ASIC will have turned
- * off the firmware validation on this HFI.  This means we can't write to the
- * registers to reset the serdes.  Work around this by performing a complete
- * re-download and validation of the fabric serdes firmware.  This, as a
- * by-product, will reset the serdes.  NOTE: the re-download requires that
- * the 8051 be in the Offline state.  I.e. not actively trying to use the
- * serdes.  This routine is called at the point where the link is Offline and
- * is getting ready to go to Polling.
- */
-void fabric_serdes_reset(struct hfi1_devdata *dd)
-{
-       int ret;
-
-       if (!fw_fabric_serdes_load)
-               return;
-
-       ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT);
-       if (ret) {
-               dd_dev_err(dd,
-                          "Cannot acquire SBus resource to reset fabric SerDes - perhaps you should reboot\n");
-               return;
-       }
-       set_sbus_fast_mode(dd);
-
-       if (is_ax(dd)) {
-               /* A0 serdes do not work with a re-download */
-               u8 ra = fabric_serdes_broadcast[dd->hfi1_id];
-
-               /* place SerDes in reset and disable SPICO */
-               sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000011);
-               /* wait 100 refclk cycles @ 156.25MHz => 640ns */
-               udelay(1);
-               /* remove SerDes reset */
-               sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000010);
-               /* turn SPICO enable on */
-               sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000002);
-       } else {
-               turn_off_spicos(dd, SPICO_FABRIC);
-               /*
-                * No need for firmware retry - what to download has already
-                * been decided.
-                * No need to pay attention to the load return - the only
-                * failure is a validation failure, which has already been
-                * checked by the initial download.
-                */
-               (void)load_fabric_serdes_firmware(dd, &fw_fabric);
-       }
-
-       clear_sbus_fast_mode(dd);
-       release_chip_resource(dd, CR_SBUS);
-}
-
-/* Access to the SBus in this routine should probably be serialized */
-int sbus_request_slow(struct hfi1_devdata *dd,
-                     u8 receiver_addr, u8 data_addr, u8 command, u32 data_in)
-{
-       u64 reg, count = 0;
-
-       /* make sure fast mode is clear */
-       clear_sbus_fast_mode(dd);
-
-       sbus_request(dd, receiver_addr, data_addr, command, data_in);
-       write_csr(dd, ASIC_CFG_SBUS_EXECUTE,
-                 ASIC_CFG_SBUS_EXECUTE_EXECUTE_SMASK);
-       /* Wait for both DONE and RCV_DATA_VALID to go high */
-       reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
-       while (!((reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) &&
-                (reg & ASIC_STS_SBUS_RESULT_RCV_DATA_VALID_SMASK))) {
-               if (count++ >= SBUS_MAX_POLL_COUNT) {
-                       u64 counts = read_csr(dd, ASIC_STS_SBUS_COUNTERS);
-                       /*
-                        * If the loop has timed out, we are OK if the DONE
-                        * bit is set and the RCV_DATA_VALID and EXECUTE
-                        * counters are the same.  If not, we cannot proceed.
-                        */
-                       if ((reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) &&
-                           (SBUS_COUNTER(counts, RCV_DATA_VALID) ==
-                            SBUS_COUNTER(counts, EXECUTE)))
-                               break;
-                       return -ETIMEDOUT;
-               }
-               udelay(1);
-               reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
-       }
-       count = 0;
-       write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0);
-       /* Wait for DONE to clear after EXECUTE is cleared */
-       reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
-       while (reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) {
-               if (count++ >= SBUS_MAX_POLL_COUNT)
-                       return -ETIME;
-               udelay(1);
-               reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
-       }
-       return 0;
-}
-
-static int load_fabric_serdes_firmware(struct hfi1_devdata *dd,
-                                      struct firmware_details *fdet)
-{
-       int i, err;
-       const u8 ra = fabric_serdes_broadcast[dd->hfi1_id]; /* receiver addr */
-
-       dd_dev_info(dd, "Downloading fabric firmware\n");
-
-       /* step 1: load security variables */
-       load_security_variables(dd, fdet);
-       /* step 2: place SerDes in reset and disable SPICO */
-       sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000011);
-       /* wait 100 refclk cycles @ 156.25MHz => 640ns */
-       udelay(1);
-       /* step 3:  remove SerDes reset */
-       sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000010);
-       /* step 4: assert IMEM override */
-       sbus_request(dd, ra, 0x00, WRITE_SBUS_RECEIVER, 0x40000000);
-       /* step 5: download SerDes machine code */
-       for (i = 0; i < fdet->firmware_len; i += 4) {
-               sbus_request(dd, ra, 0x0a, WRITE_SBUS_RECEIVER,
-                            *(u32 *)&fdet->firmware_ptr[i]);
-       }
-       /* step 6: IMEM override off */
-       sbus_request(dd, ra, 0x00, WRITE_SBUS_RECEIVER, 0x00000000);
-       /* step 7: turn ECC on */
-       sbus_request(dd, ra, 0x0b, WRITE_SBUS_RECEIVER, 0x000c0000);
-
-       /* steps 8-11: run the RSA engine */
-       err = run_rsa(dd, "fabric serdes", fdet->signature);
-       if (err)
-               return err;
-
-       /* step 12: turn SPICO enable on */
-       sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000002);
-       /* step 13: enable core hardware interrupts */
-       sbus_request(dd, ra, 0x08, WRITE_SBUS_RECEIVER, 0x00000000);
-
-       return 0;
-}
-
-static int load_sbus_firmware(struct hfi1_devdata *dd,
-                             struct firmware_details *fdet)
-{
-       int i, err;
-       const u8 ra = SBUS_MASTER_BROADCAST; /* receiver address */
-
-       dd_dev_info(dd, "Downloading SBus firmware\n");
-
-       /* step 1: load security variables */
-       load_security_variables(dd, fdet);
-       /* step 2: place SPICO into reset and enable off */
-       sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x000000c0);
-       /* step 3: remove reset, enable off, IMEM_CNTRL_EN on */
-       sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000240);
-       /* step 4: set starting IMEM address for burst download */
-       sbus_request(dd, ra, 0x03, WRITE_SBUS_RECEIVER, 0x80000000);
-       /* step 5: download the SBus Master machine code */
-       for (i = 0; i < fdet->firmware_len; i += 4) {
-               sbus_request(dd, ra, 0x14, WRITE_SBUS_RECEIVER,
-                            *(u32 *)&fdet->firmware_ptr[i]);
-       }
-       /* step 6: set IMEM_CNTL_EN off */
-       sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000040);
-       /* step 7: turn ECC on */
-       sbus_request(dd, ra, 0x16, WRITE_SBUS_RECEIVER, 0x000c0000);
-
-       /* steps 8-11: run the RSA engine */
-       err = run_rsa(dd, "SBus", fdet->signature);
-       if (err)
-               return err;
-
-       /* step 12: set SPICO_ENABLE on */
-       sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000140);
-
-       return 0;
-}
-
-static int load_pcie_serdes_firmware(struct hfi1_devdata *dd,
-                                    struct firmware_details *fdet)
-{
-       int i;
-       const u8 ra = SBUS_MASTER_BROADCAST; /* receiver address */
-
-       dd_dev_info(dd, "Downloading PCIe firmware\n");
-
-       /* step 1: load security variables */
-       load_security_variables(dd, fdet);
-       /* step 2: assert single step (halts the SBus Master spico) */
-       sbus_request(dd, ra, 0x05, WRITE_SBUS_RECEIVER, 0x00000001);
-       /* step 3: enable XDMEM access */
-       sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000d40);
-       /* step 4: load firmware into SBus Master XDMEM */
-       /*
-        * NOTE: the dmem address, write_en, and wdata are all pre-packed,
-        * we only need to pick up the bytes and write them
-        */
-       for (i = 0; i < fdet->firmware_len; i += 4) {
-               sbus_request(dd, ra, 0x04, WRITE_SBUS_RECEIVER,
-                            *(u32 *)&fdet->firmware_ptr[i]);
-       }
-       /* step 5: disable XDMEM access */
-       sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000140);
-       /* step 6: allow SBus Spico to run */
-       sbus_request(dd, ra, 0x05, WRITE_SBUS_RECEIVER, 0x00000000);
-
-       /*
-        * steps 7-11: run RSA, if it succeeds, firmware is available to
-        * be swapped
-        */
-       return run_rsa(dd, "PCIe serdes", fdet->signature);
-}
-
-/*
- * Set the given broadcast values on the given list of devices.
- */
-static void set_serdes_broadcast(struct hfi1_devdata *dd, u8 bg1, u8 bg2,
-                                const u8 *addrs, int count)
-{
-       while (--count >= 0) {
-               /*
-                * Set BROADCAST_GROUP_1 and BROADCAST_GROUP_2, leave
-                * defaults for everything else.  Do not read-modify-write,
-                * per instruction from the manufacturer.
-                *
-                * Register 0xfd:
-                *      bits    what
-                *      -----   ---------------------------------
-                *        0     IGNORE_BROADCAST  (default 0)
-                *      11:4    BROADCAST_GROUP_1 (default 0xff)
-                *      23:16   BROADCAST_GROUP_2 (default 0xff)
-                */
-               sbus_request(dd, addrs[count], 0xfd, WRITE_SBUS_RECEIVER,
-                            (u32)bg1 << 4 | (u32)bg2 << 16);
-       }
-}
-
-int acquire_hw_mutex(struct hfi1_devdata *dd)
-{
-       unsigned long timeout;
-       int try = 0;
-       u8 mask = 1 << dd->hfi1_id;
-       u8 user;
-
-retry:
-       timeout = msecs_to_jiffies(HM_TIMEOUT) + jiffies;
-       while (1) {
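-               /*
-                * Write our mask, then read back: we own the mutex only
-                * if the readback is exactly our mask.
-                */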
-               write_csr(dd, ASIC_CFG_MUTEX, mask);
-               user = (u8)read_csr(dd, ASIC_CFG_MUTEX);
-               if (user == mask)
-                       return 0; /* success */
-               if (time_after(jiffies, timeout))
-                       break; /* timed out */
-               msleep(20);
-       }
-
-       /* timed out */
-       dd_dev_err(dd,
-                  "Unable to acquire hardware mutex, mutex mask %u, my mask %u (%s)\n",
-                  (u32)user, (u32)mask, (try == 0) ? "retrying" : "giving up");
-
-       if (try == 0) {
-               /* break mutex and retry */
-               write_csr(dd, ASIC_CFG_MUTEX, 0);
-               try++;
-               goto retry;
-       }
-
-       return -EBUSY;
-}
-
-void release_hw_mutex(struct hfi1_devdata *dd)
-{
-       write_csr(dd, ASIC_CFG_MUTEX, 0);
-}
-
-/* return the given resource bit(s) as a mask for the given HFI */
-static inline u64 resource_mask(u32 hfi1_id, u32 resource)
-{
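-       /* HFI 1's copy of the dynamic bits is shifted up by CR_DYN_SHIFT */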
-       return ((u64)resource) << (hfi1_id ? CR_DYN_SHIFT : 0);
-}
-
-static void fail_mutex_acquire_message(struct hfi1_devdata *dd,
-                                      const char *func)
-{
-       dd_dev_err(dd,
-                  "%s: hardware mutex stuck - suggest rebooting the machine\n",
-                  func);
-}
-
-/*
- * Acquire access to a chip resource.
- *
- * Return 0 on success, -EBUSY if resource busy, -EIO if mutex acquire failed.
- */
-static int __acquire_chip_resource(struct hfi1_devdata *dd, u32 resource)
-{
-       u64 scratch0, all_bits, my_bit;
-       int ret;
-
-       if (resource & CR_DYN_MASK) {
-               /* a dynamic resource is in use if either HFI has set the bit */
-               if (dd->pcidev->device == PCI_DEVICE_ID_INTEL0 &&
-                   (resource & (CR_I2C1 | CR_I2C2))) {
-                       /* discrete devices must serialize across both chains */
-                       all_bits = resource_mask(0, CR_I2C1 | CR_I2C2) |
-                                       resource_mask(1, CR_I2C1 | CR_I2C2);
-               } else {
-                       all_bits = resource_mask(0, resource) |
-                                               resource_mask(1, resource);
-               }
-               my_bit = resource_mask(dd->hfi1_id, resource);
-       } else {
-               /* non-dynamic resources are not split between HFIs */
-               all_bits = resource;
-               my_bit = resource;
-       }
-
-       /* lock against other callers within the driver wanting a resource */
-       mutex_lock(&dd->asic_data->asic_resource_mutex);
-
-       ret = acquire_hw_mutex(dd);
-       if (ret) {
-               fail_mutex_acquire_message(dd, __func__);
-               ret = -EIO;
-               goto done;
-       }
-
-       scratch0 = read_csr(dd, ASIC_CFG_SCRATCH);
-       if (scratch0 & all_bits) {
-               ret = -EBUSY;
-       } else {
-               write_csr(dd, ASIC_CFG_SCRATCH, scratch0 | my_bit);
-               /* force write to be visible to other HFI on another OS */
-               (void)read_csr(dd, ASIC_CFG_SCRATCH);
-       }
-
-       release_hw_mutex(dd);
-
-done:
-       mutex_unlock(&dd->asic_data->asic_resource_mutex);
-       return ret;
-}
-
-/*
- * Acquire access to a chip resource, wait up to mswait milliseconds for
- * the resource to become available.
- *
- * Return 0 on success, -EBUSY if busy (even after wait), -EIO if mutex
- * acquire failed.
- */
-int acquire_chip_resource(struct hfi1_devdata *dd, u32 resource, u32 mswait)
-{
-       unsigned long timeout;
-       int ret;
-
-       timeout = jiffies + msecs_to_jiffies(mswait);
-       while (1) {
-               ret = __acquire_chip_resource(dd, resource);
-               if (ret != -EBUSY)
-                       return ret;
-               /* resource is busy, check our timeout */
-               if (time_after_eq(jiffies, timeout))
-                       return -EBUSY;
-               usleep_range(80, 120);  /* arbitrary delay */
-       }
-}
-
-/*
- * Release access to a chip resource
- */
-void release_chip_resource(struct hfi1_devdata *dd, u32 resource)
-{
-       u64 scratch0, bit;
-
-       /* only dynamic resources should ever be cleared */
-       if (!(resource & CR_DYN_MASK)) {
-               dd_dev_err(dd, "%s: invalid resource 0x%x\n", __func__,
-                          resource);
-               return;
-       }
-       bit = resource_mask(dd->hfi1_id, resource);
-
-       /* lock against other callers within the driver wanting a resource */
-       mutex_lock(&dd->asic_data->asic_resource_mutex);
-
-       if (acquire_hw_mutex(dd)) {
-               fail_mutex_acquire_message(dd, __func__);
-               goto done;
-       }
-
-       scratch0 = read_csr(dd, ASIC_CFG_SCRATCH);
-       if ((scratch0 & bit) != 0) {
-               scratch0 &= ~bit;
-               write_csr(dd, ASIC_CFG_SCRATCH, scratch0);
-               /* force write to be visible to other HFI on another OS */
-               (void)read_csr(dd, ASIC_CFG_SCRATCH);
-       } else {
-               dd_dev_warn(dd, "%s: id %d, resource 0x%x: bit not set\n",
-                           __func__, dd->hfi1_id, resource);
-       }
-
-       release_hw_mutex(dd);
-
-done:
-       mutex_unlock(&dd->asic_data->asic_resource_mutex);
-}
-
-/*
- * Return true if resource is set, false otherwise.  Print a warning
- * if not set and a function is supplied.
- */
-bool check_chip_resource(struct hfi1_devdata *dd, u32 resource,
-                        const char *func)
-{
-       u64 scratch0, bit;
-
-       if (resource & CR_DYN_MASK)
-               bit = resource_mask(dd->hfi1_id, resource);
-       else
-               bit = resource;
-
-       scratch0 = read_csr(dd, ASIC_CFG_SCRATCH);
-       if ((scratch0 & bit) == 0) {
-               if (func)
-                       dd_dev_warn(dd,
-                                   "%s: id %d, resource 0x%x, not acquired!\n",
-                                   func, dd->hfi1_id, resource);
-               return false;
-       }
-       return true;
-}
-
-static void clear_chip_resources(struct hfi1_devdata *dd, const char *func)
-{
-       u64 scratch0;
-
-       /* lock against other callers within the driver wanting a resource */
-       mutex_lock(&dd->asic_data->asic_resource_mutex);
-
-       if (acquire_hw_mutex(dd)) {
-               fail_mutex_acquire_message(dd, func);
-               goto done;
-       }
-
-       /* clear all dynamic access bits for this HFI */
-       scratch0 = read_csr(dd, ASIC_CFG_SCRATCH);
-       scratch0 &= ~resource_mask(dd->hfi1_id, CR_DYN_MASK);
-       write_csr(dd, ASIC_CFG_SCRATCH, scratch0);
-       /* force write to be visible to other HFI on another OS */
-       (void)read_csr(dd, ASIC_CFG_SCRATCH);
-
-       release_hw_mutex(dd);
-
-done:
-       mutex_unlock(&dd->asic_data->asic_resource_mutex);
-}
-
-void init_chip_resources(struct hfi1_devdata *dd)
-{
-       /* clear any holds left by us */
-       clear_chip_resources(dd, __func__);
-}
-
-void finish_chip_resources(struct hfi1_devdata *dd)
-{
-       /* clear any holds left by us */
-       clear_chip_resources(dd, __func__);
-}
-
-void set_sbus_fast_mode(struct hfi1_devdata *dd)
-{
-       write_csr(dd, ASIC_CFG_SBUS_EXECUTE,
-                 ASIC_CFG_SBUS_EXECUTE_FAST_MODE_SMASK);
-}
-
-void clear_sbus_fast_mode(struct hfi1_devdata *dd)
-{
-       u64 reg, count = 0;
-
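-       /*
-        * Wait for any outstanding fast-mode requests to complete (the
-        * EXECUTE and RCV_DATA_VALID counters match) before turning
-        * fast mode off.
-        */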
-       reg = read_csr(dd, ASIC_STS_SBUS_COUNTERS);
-       while (SBUS_COUNTER(reg, EXECUTE) !=
-              SBUS_COUNTER(reg, RCV_DATA_VALID)) {
-               if (count++ >= SBUS_MAX_POLL_COUNT)
-                       break;
-               udelay(1);
-               reg = read_csr(dd, ASIC_STS_SBUS_COUNTERS);
-       }
-       write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0);
-}
-
-int load_firmware(struct hfi1_devdata *dd)
-{
-       int ret;
-
-       if (fw_fabric_serdes_load) {
-               ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT);
-               if (ret)
-                       return ret;
-
-               set_sbus_fast_mode(dd);
-
-               set_serdes_broadcast(dd, all_fabric_serdes_broadcast,
-                                    fabric_serdes_broadcast[dd->hfi1_id],
-                                    fabric_serdes_addrs[dd->hfi1_id],
-                                    NUM_FABRIC_SERDES);
-               turn_off_spicos(dd, SPICO_FABRIC);
-               do {
-                       ret = load_fabric_serdes_firmware(dd, &fw_fabric);
-               } while (retry_firmware(dd, ret));
-
-               clear_sbus_fast_mode(dd);
-               release_chip_resource(dd, CR_SBUS);
-               if (ret)
-                       return ret;
-       }
-
-       if (fw_8051_load) {
-               do {
-                       ret = load_8051_firmware(dd, &fw_8051);
-               } while (retry_firmware(dd, ret));
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
-}
-
-int hfi1_firmware_init(struct hfi1_devdata *dd)
-{
-       /* only RTL can use these */
-       if (dd->icode != ICODE_RTL_SILICON) {
-               fw_fabric_serdes_load = 0;
-               fw_pcie_serdes_load = 0;
-               fw_sbus_load = 0;
-       }
-
-       /* no 8051 or QSFP on simulator */
-       if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
-               fw_8051_load = 0;
-               platform_config_load = 0;
-       }
-
-       if (!fw_8051_name) {
-               if (dd->icode == ICODE_RTL_SILICON)
-                       fw_8051_name = DEFAULT_FW_8051_NAME_ASIC;
-               else
-                       fw_8051_name = DEFAULT_FW_8051_NAME_FPGA;
-       }
-       if (!fw_fabric_serdes_name)
-               fw_fabric_serdes_name = DEFAULT_FW_FABRIC_NAME;
-       if (!fw_sbus_name)
-               fw_sbus_name = DEFAULT_FW_SBUS_NAME;
-       if (!fw_pcie_serdes_name)
-               fw_pcie_serdes_name = DEFAULT_FW_PCIE_NAME;
-       if (!platform_config_name)
-               platform_config_name = DEFAULT_PLATFORM_CONFIG_NAME;
-
-       return obtain_firmware(dd);
-}
-
-/*
- * This is a helper for parse_platform_config() and does not check the
- * validity of the platform configuration cache (we know it is invalid
- * while the cache is still being built).  As such, it should not be
- * called from anywhere other than parse_platform_config().
- */
-static int check_meta_version(struct hfi1_devdata *dd, u32 *system_table)
-{
-       u32 meta_ver, meta_ver_meta, ver_start, ver_len, mask;
-       struct platform_config_cache *pcfgcache = &dd->pcfg_cache;
-
-       if (!system_table)
-               return -EINVAL;
-
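-       /*
-        * The system table metadata word gives the start bit and bit
-        * length of the version field; use them to extract the version
-        * from the system table itself.
-        */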
-       meta_ver_meta =
-       *(pcfgcache->config_tables[PLATFORM_CONFIG_SYSTEM_TABLE].table_metadata
-       + SYSTEM_TABLE_META_VERSION);
-
-       mask = ((1 << METADATA_TABLE_FIELD_START_LEN_BITS) - 1);
-       ver_start = meta_ver_meta & mask;
-
-       meta_ver_meta >>= METADATA_TABLE_FIELD_LEN_SHIFT;
-
-       mask = ((1 << METADATA_TABLE_FIELD_LEN_LEN_BITS) - 1);
-       ver_len = meta_ver_meta & mask;
-
-       ver_start /= 8;
-       meta_ver = *((u8 *)system_table + ver_start) & ((1 << ver_len) - 1);
-
-       if (meta_ver < 5) {
-               dd_dev_info(dd, "%s: Please update platform config\n",
-                           __func__);
-               return -EINVAL;
-       }
-       return 0;
-}
-
-int parse_platform_config(struct hfi1_devdata *dd)
-{
-       struct platform_config_cache *pcfgcache = &dd->pcfg_cache;
-       u32 *ptr = NULL;
-       u32 header1 = 0, header2 = 0, magic_num = 0, crc = 0, file_length = 0;
-       u32 record_idx = 0, table_type = 0, table_length_dwords = 0;
-       int ret = -EINVAL; /* assume failure */
-
-       if (!dd->platform_config.data) {
-               dd_dev_info(dd, "%s: Missing config file\n", __func__);
-               goto bail;
-       }
-       ptr = (u32 *)dd->platform_config.data;
-
-       magic_num = *ptr;
-       ptr++;
-       if (magic_num != PLATFORM_CONFIG_MAGIC_NUM) {
-               dd_dev_info(dd, "%s: Bad config file\n", __func__);
-               goto bail;
-       }
-
-       /* Field is file size in DWORDs */
-       file_length = (*ptr) * 4;
-       ptr++;
-
-       if (file_length > dd->platform_config.size) {
-               dd_dev_info(dd, "%s: File claims to be larger than read size\n",
-                           __func__);
-               goto bail;
-       } else if (file_length < dd->platform_config.size) {
-               dd_dev_info(dd,
-                           "%s: File claims to be smaller than read size, continuing\n",
-                           __func__);
-       }
-       /* exactly equal, perfection */
-
-       /*
-        * In both cases where we proceed, using the self-reported file length
-        * is the safer option
-        */
-       while (ptr < (u32 *)(dd->platform_config.data + file_length)) {
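-               /*
-                * Each table starts with two header DWORDs that are the
-                * bitwise complement of each other; a mismatch fails
-                * validation.
-                */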
-               header1 = *ptr;
-               header2 = *(ptr + 1);
-               if (header1 != ~header2) {
-                       dd_dev_info(dd, "%s: Failed validation at offset %ld\n",
-                                   __func__, (ptr - (u32 *)
-                                              dd->platform_config.data));
-                       goto bail;
-               }
-
-               record_idx = *ptr &
-                       ((1 << PLATFORM_CONFIG_HEADER_RECORD_IDX_LEN_BITS) - 1);
-
-               table_length_dwords = (*ptr >>
-                               PLATFORM_CONFIG_HEADER_TABLE_LENGTH_SHIFT) &
-                     ((1 << PLATFORM_CONFIG_HEADER_TABLE_LENGTH_LEN_BITS) - 1);
-
-               table_type = (*ptr >> PLATFORM_CONFIG_HEADER_TABLE_TYPE_SHIFT) &
-                       ((1 << PLATFORM_CONFIG_HEADER_TABLE_TYPE_LEN_BITS) - 1);
-
-               /* Done with this set of headers */
-               ptr += 2;
-
-               if (record_idx) {
-                       /* data table */
-                       switch (table_type) {
-                       case PLATFORM_CONFIG_SYSTEM_TABLE:
-                               pcfgcache->config_tables[table_type].num_table =
-                                                                       1;
-                               ret = check_meta_version(dd, ptr);
-                               if (ret)
-                                       goto bail;
-                               break;
-                       case PLATFORM_CONFIG_PORT_TABLE:
-                               pcfgcache->config_tables[table_type].num_table =
-                                                                       2;
-                               break;
-                       case PLATFORM_CONFIG_RX_PRESET_TABLE:
-                               /* fall through */
-                       case PLATFORM_CONFIG_TX_PRESET_TABLE:
-                               /* fall through */
-                       case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
-                               /* fall through */
-                       case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
-                               pcfgcache->config_tables[table_type].num_table =
-                                                       table_length_dwords;
-                               break;
-                       default:
-                               dd_dev_info(dd,
-                                           "%s: Unknown data table %d, offset %ld\n",
-                                           __func__, table_type,
-                                           (ptr - (u32 *)
-                                            dd->platform_config.data));
-                               goto bail; /* We don't trust this file now */
-                       }
-                       pcfgcache->config_tables[table_type].table = ptr;
-               } else {
-                       /* metadata table */
-                       switch (table_type) {
-                       case PLATFORM_CONFIG_SYSTEM_TABLE:
-                               /* fall through */
-                       case PLATFORM_CONFIG_PORT_TABLE:
-                               /* fall through */
-                       case PLATFORM_CONFIG_RX_PRESET_TABLE:
-                               /* fall through */
-                       case PLATFORM_CONFIG_TX_PRESET_TABLE:
-                               /* fall through */
-                       case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
-                               /* fall through */
-                       case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
-                               break;
-                       default:
-                               dd_dev_info(dd,
-                                           "%s: Unknown meta table %d, offset %ld\n",
-                                           __func__, table_type,
-                                           (ptr -
-                                            (u32 *)dd->platform_config.data));
-                               goto bail; /* We don't trust this file now */
-                       }
-                       pcfgcache->config_tables[table_type].table_metadata =
-                                                                       ptr;
-               }
-
-               /* Calculate and check table crc */
-               crc = crc32_le(~(u32)0, (unsigned char const *)ptr,
-                              (table_length_dwords * 4));
-               crc ^= ~(u32)0;
-
-               /* Jump the table */
-               ptr += table_length_dwords;
-               if (crc != *ptr) {
-                       dd_dev_info(dd, "%s: Failed CRC check at offset %ld\n",
-                                   __func__, (ptr -
-                                              (u32 *)
-                                              dd->platform_config.data));
-                       goto bail;
-               }
-               /* Jump the CRC DWORD */
-               ptr++;
-       }
-
-       pcfgcache->cache_valid = 1;
-       return 0;
-bail:
-       memset(pcfgcache, 0, sizeof(struct platform_config_cache));
-       return ret;
-}
-
-static int get_platform_fw_field_metadata(struct hfi1_devdata *dd, int table,
-                                         int field, u32 *field_len_bits,
-                                         u32 *field_start_bits)
-{
-       struct platform_config_cache *pcfgcache = &dd->pcfg_cache;
-       u32 *src_ptr = NULL;
-
-       if (!pcfgcache->cache_valid)
-               return -EINVAL;
-
-       switch (table) {
-       case PLATFORM_CONFIG_SYSTEM_TABLE:
-               /* fall through */
-       case PLATFORM_CONFIG_PORT_TABLE:
-               /* fall through */
-       case PLATFORM_CONFIG_RX_PRESET_TABLE:
-               /* fall through */
-       case PLATFORM_CONFIG_TX_PRESET_TABLE:
-               /* fall through */
-       case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
-               /* fall through */
-       case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
-               if (field && field < platform_config_table_limits[table])
-                       src_ptr =
-                       pcfgcache->config_tables[table].table_metadata + field;
-               break;
-       default:
-               dd_dev_info(dd, "%s: Unknown table\n", __func__);
-               break;
-       }
-
-       if (!src_ptr)
-               return -EINVAL;
-
-       if (field_start_bits)
-               *field_start_bits = *src_ptr &
-                     ((1 << METADATA_TABLE_FIELD_START_LEN_BITS) - 1);
-
-       if (field_len_bits)
-               *field_len_bits = (*src_ptr >> METADATA_TABLE_FIELD_LEN_SHIFT)
-                      & ((1 << METADATA_TABLE_FIELD_LEN_LEN_BITS) - 1);
-
-       return 0;
-}
-
-/* This is the central interface to getting data out of the platform config
- * file. It depends on parse_platform_config() having populated the
- * platform_config_cache in hfi1_devdata, and checks the cache_valid member to
- * validate the sanity of the cache.
- *
- * The non-obvious parameters:
- * @table_index: Selects which instance of the table the relevant field is
- * fetched from.
- *
- * This applies to the data tables that have multiple instances. The port table
- * is an exception to this rule as each HFI only has one port and thus the
- * relevant table can be distinguished by hfi_id.
- *
- * @data: pointer to memory that will be populated with the field requested.
- * @len: length of the memory pointed to by @data, in bytes.
- */
-int get_platform_config_field(struct hfi1_devdata *dd,
-                             enum platform_config_table_type_encoding
-                             table_type, int table_index, int field_index,
-                             u32 *data, u32 len)
-{
-       int ret = 0, wlen = 0, seek = 0;
-       u32 field_len_bits = 0, field_start_bits = 0, *src_ptr = NULL;
-       struct platform_config_cache *pcfgcache = &dd->pcfg_cache;
-
-       if (data)
-               memset(data, 0, len);
-       else
-               return -EINVAL;
-
-       ret = get_platform_fw_field_metadata(dd, table_type, field_index,
-                                            &field_len_bits,
-                                            &field_start_bits);
-       if (ret)
-               return -EINVAL;
-
-       /* Convert length to bits */
-       len *= 8;
-
-       /* Our metadata function checked cache_valid and field_index for us */
-       switch (table_type) {
-       case PLATFORM_CONFIG_SYSTEM_TABLE:
-               src_ptr = pcfgcache->config_tables[table_type].table;
-
-               if (field_index != SYSTEM_TABLE_QSFP_POWER_CLASS_MAX) {
-                       if (len < field_len_bits)
-                               return -EINVAL;
-
-                       seek = field_start_bits / 8;
-                       wlen = field_len_bits / 8;
-
-                       src_ptr = (u32 *)((u8 *)src_ptr + seek);
-
-                       /*
-                        * We expect the field to be byte aligned and whole byte
-                        * lengths if we are here
-                        */
-                       memcpy(data, src_ptr, wlen);
-                       return 0;
-               }
-               break;
-       case PLATFORM_CONFIG_PORT_TABLE:
-               /* Port table is 4 DWORDS */
-               src_ptr = dd->hfi1_id ?
-                       pcfgcache->config_tables[table_type].table + 4 :
-                       pcfgcache->config_tables[table_type].table;
-               break;
-       case PLATFORM_CONFIG_RX_PRESET_TABLE:
-               /* fall through */
-       case PLATFORM_CONFIG_TX_PRESET_TABLE:
-               /* fall through */
-       case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
-               /* fall through */
-       case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
-               src_ptr = pcfgcache->config_tables[table_type].table;
-
-               if (table_index <
-                       pcfgcache->config_tables[table_type].num_table)
-                       src_ptr += table_index;
-               else
-                       src_ptr = NULL;
-               break;
-       default:
-               dd_dev_info(dd, "%s: Unknown table\n", __func__);
-               break;
-       }
-
-       if (!src_ptr || len < field_len_bits)
-               return -EINVAL;
-
-       src_ptr += (field_start_bits / 32);
-       *data = (*src_ptr >> (field_start_bits % 32)) &
-                       ((1 << field_len_bits) - 1);
-
-       return 0;
-}
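For illustration only (this sketch is not part of the deleted file), a caller
might fetch a single system-table field as below; the table and field names are
taken from the code above, while the error handling is hypothetical:

	u32 power_class_max = 0;

	if (get_platform_config_field(dd, PLATFORM_CONFIG_SYSTEM_TABLE, 0,
				      SYSTEM_TABLE_QSFP_POWER_CLASS_MAX,
				      &power_class_max,
				      sizeof(power_class_max)))
		dd_dev_info(dd, "QSFP max power class not available\n");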
-
-/*
- * Download the firmware needed for the Gen3 PCIe SerDes.  An update
- * to the SBus firmware is needed before updating the PCIe firmware.
- *
- * Note: caller must be holding the SBus resource.
- */
-int load_pcie_firmware(struct hfi1_devdata *dd)
-{
-       int ret = 0;
-
-       /* both firmware loads below use the SBus */
-       set_sbus_fast_mode(dd);
-
-       if (fw_sbus_load) {
-               turn_off_spicos(dd, SPICO_SBUS);
-               do {
-                       ret = load_sbus_firmware(dd, &fw_sbus);
-               } while (retry_firmware(dd, ret));
-               if (ret)
-                       goto done;
-       }
-
-       if (fw_pcie_serdes_load) {
-               dd_dev_info(dd, "Setting PCIe SerDes broadcast\n");
-               set_serdes_broadcast(dd, all_pcie_serdes_broadcast,
-                                    pcie_serdes_broadcast[dd->hfi1_id],
-                                    pcie_serdes_addrs[dd->hfi1_id],
-                                    NUM_PCIE_SERDES);
-               do {
-                       ret = load_pcie_serdes_firmware(dd, &fw_pcie);
-               } while (retry_firmware(dd, ret));
-               if (ret)
-                       goto done;
-       }
-
-done:
-       clear_sbus_fast_mode(dd);
-
-       return ret;
-}
-
-/*
- * Read the GUID from the hardware, store it in dd.
- */
-void read_guid(struct hfi1_devdata *dd)
-{
-       /* Take the DC out of reset to get a valid GUID value */
-       write_csr(dd, CCE_DC_CTRL, 0);
-       (void)read_csr(dd, CCE_DC_CTRL);
-
-       dd->base_guid = read_csr(dd, DC_DC8051_CFG_LOCAL_GUID);
-       dd_dev_info(dd, "GUID %llx",
-                   (unsigned long long)dd->base_guid);
-}
diff --git a/drivers/staging/rdma/hfi1/hfi.h b/drivers/staging/rdma/hfi1/hfi.h
deleted file mode 100644 (file)
index 7b78d56..0000000
+++ /dev/null
@@ -1,1949 +0,0 @@
-#ifndef _HFI1_KERNEL_H
-#define _HFI1_KERNEL_H
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/interrupt.h>
-#include <linux/pci.h>
-#include <linux/dma-mapping.h>
-#include <linux/mutex.h>
-#include <linux/list.h>
-#include <linux/scatterlist.h>
-#include <linux/slab.h>
-#include <linux/io.h>
-#include <linux/fs.h>
-#include <linux/completion.h>
-#include <linux/kref.h>
-#include <linux/sched.h>
-#include <linux/cdev.h>
-#include <linux/delay.h>
-#include <linux/kthread.h>
-#include <rdma/rdma_vt.h>
-
-#include "chip_registers.h"
-#include "common.h"
-#include "verbs.h"
-#include "pio.h"
-#include "chip.h"
-#include "mad.h"
-#include "qsfp.h"
-#include "platform.h"
-#include "affinity.h"
-
-/* bumped 1 from s/w major version of TrueScale */
-#define HFI1_CHIP_VERS_MAJ 3U
-
-/* don't care about this except printing */
-#define HFI1_CHIP_VERS_MIN 0U
-
-/* The Organization Unique Identifier (Mfg code), and its position in GUID */
-#define HFI1_OUI 0x001175
-#define HFI1_OUI_LSB 40
-
-#define DROP_PACKET_OFF                0
-#define DROP_PACKET_ON         1
-
-extern unsigned long hfi1_cap_mask;
-#define HFI1_CAP_KGET_MASK(mask, cap) ((mask) & HFI1_CAP_##cap)
-#define HFI1_CAP_UGET_MASK(mask, cap) \
-       (((mask) >> HFI1_CAP_USER_SHIFT) & HFI1_CAP_##cap)
-#define HFI1_CAP_KGET(cap) (HFI1_CAP_KGET_MASK(hfi1_cap_mask, cap))
-#define HFI1_CAP_UGET(cap) (HFI1_CAP_UGET_MASK(hfi1_cap_mask, cap))
-#define HFI1_CAP_IS_KSET(cap) (!!HFI1_CAP_KGET(cap))
-#define HFI1_CAP_IS_USET(cap) (!!HFI1_CAP_UGET(cap))
-#define HFI1_MISC_GET() ((hfi1_cap_mask >> HFI1_CAP_MISC_SHIFT) & \
-                       HFI1_CAP_MISC_MASK)
-/* Offline Disabled Reason is 4-bits */
-#define HFI1_ODR_MASK(rsn) ((rsn) & OPA_PI_MASK_OFFLINE_REASON)
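For illustration only (not from the deleted file), the macros above split
hfi1_cap_mask into a kernel half and a user half, so the same capability bit
can be tested per consumer; HFI1_CAP_SDMA is assumed to be defined in common.h:

	/* hypothetical check, assuming HFI1_CAP_SDMA from common.h */
	if (HFI1_CAP_IS_KSET(SDMA) && !HFI1_CAP_IS_USET(SDMA))
		pr_info("SDMA enabled for kernel contexts only\n");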
-
-/*
- * Control context is always 0 and handles the error packets.
- * It also handles the VL15 and multicast packets.
- */
-#define HFI1_CTRL_CTXT    0
-
-/*
- * Driver context will store software counters for each of the events
- * associated with these status registers
- */
-#define NUM_CCE_ERR_STATUS_COUNTERS 41
-#define NUM_RCV_ERR_STATUS_COUNTERS 64
-#define NUM_MISC_ERR_STATUS_COUNTERS 13
-#define NUM_SEND_PIO_ERR_STATUS_COUNTERS 36
-#define NUM_SEND_DMA_ERR_STATUS_COUNTERS 4
-#define NUM_SEND_EGRESS_ERR_STATUS_COUNTERS 64
-#define NUM_SEND_ERR_STATUS_COUNTERS 3
-#define NUM_SEND_CTXT_ERR_STATUS_COUNTERS 5
-#define NUM_SEND_DMA_ENG_ERR_STATUS_COUNTERS 24
-
-/*
- * per-driver stats, either not device- nor port-specific, or
- * summed over all of the devices and ports.
- * They are described by name via the ipathfs filesystem, so the layout
- * and number of elements can change without breaking compatibility.
- * If members are added or deleted, hfi1_statnames[] in debugfs.c must
- * change to match.
- */
-struct hfi1_ib_stats {
-       __u64 sps_ints; /* number of interrupts handled */
-       __u64 sps_errints; /* number of error interrupts */
-       __u64 sps_txerrs; /* tx-related packet errors */
-       __u64 sps_rcverrs; /* non-crc rcv packet errors */
-       __u64 sps_hwerrs; /* hardware errors reported (parity, etc.) */
-       __u64 sps_nopiobufs; /* no pio bufs avail from kernel */
-       __u64 sps_ctxts; /* number of contexts currently open */
-       __u64 sps_lenerrs; /* number of kernel packets where RHF != LRH len */
-       __u64 sps_buffull;
-       __u64 sps_hdrfull;
-};
-
-extern struct hfi1_ib_stats hfi1_stats;
-extern const struct pci_error_handlers hfi1_pci_err_handler;
-
-/*
- * First-cut criterion for "device is active" is
- * two thousand dwords combined Tx, Rx traffic per
- * 5-second interval. SMA packets are 64 dwords,
- * and occur "a few per second", presumably each way.
- */
-#define HFI1_TRAFFIC_ACTIVE_THRESHOLD (2000)
-
-/*
- * Below contains all data related to a single context (formerly called port).
- */
-
-#ifdef CONFIG_DEBUG_FS
-struct hfi1_opcode_stats_perctx;
-#endif
-
-struct ctxt_eager_bufs {
-       ssize_t size;            /* total size of eager buffers */
-       u32 count;               /* size of buffers array */
-       u32 numbufs;             /* number of buffers allocated */
-       u32 alloced;             /* number of rcvarray entries used */
-       u32 rcvtid_size;         /* size of each eager rcv tid */
-       u32 threshold;           /* head update threshold */
-       struct eager_buffer {
-               void *addr;
-               dma_addr_t phys;
-               ssize_t len;
-       } *buffers;
-       struct {
-               void *addr;
-               dma_addr_t phys;
-       } *rcvtids;
-};
-
-struct exp_tid_set {
-       struct list_head list;
-       u32 count;
-};
-
-struct hfi1_ctxtdata {
-       /* shadow the ctxt's RcvCtrl register */
-       u64 rcvctrl;
-       /* rcvhdrq base, needs mmap before useful */
-       void *rcvhdrq;
-       /* kernel virtual address where hdrqtail is updated */
-       volatile __le64 *rcvhdrtail_kvaddr;
-       /*
-        * Shared page for kernel to signal user processes that send buffers
-        * need disarming.  The process should call HFI1_CMD_DISARM_BUFS
-        * or HFI1_CMD_ACK_EVENT with IPATH_EVENT_DISARM_BUFS set.
-        */
-       unsigned long *user_event_mask;
-       /* when waiting for rcv or pioavail */
-       wait_queue_head_t wait;
-       /* rcvhdrq size (for freeing) */
-       size_t rcvhdrq_size;
-       /* number of rcvhdrq entries */
-       u16 rcvhdrq_cnt;
-       /* size of each of the rcvhdrq entries */
-       u16 rcvhdrqentsize;
-       /* mmap of hdrq, must fit in 44 bits */
-       dma_addr_t rcvhdrq_phys;
-       dma_addr_t rcvhdrqtailaddr_phys;
-       struct ctxt_eager_bufs egrbufs;
-       /* this receive context's assigned PIO ACK send context */
-       struct send_context *sc;
-
-       /* dynamic receive available interrupt timeout */
-       u32 rcvavail_timeout;
-       /*
-        * number of opens (including slave sub-contexts) on this instance
-        * (ignoring forks, dup, etc. for now)
-        */
-       int cnt;
-       /*
-        * how much space to leave at start of eager TID entries for
-        * protocol use, on each TID
-        */
-       /* instead of calculating it */
-       unsigned ctxt;
-       /* non-zero if ctxt is being shared. */
-       u16 subctxt_cnt;
-       /* non-zero if ctxt is being shared. */
-       u16 subctxt_id;
-       u8 uuid[16];
-       /* job key */
-       u16 jkey;
-       /* number of RcvArray groups for this context. */
-       u32 rcv_array_groups;
-       /* index of first eager TID entry. */
-       u32 eager_base;
-       /* number of expected TID entries */
-       u32 expected_count;
-       /* index of first expected TID entry. */
-       u32 expected_base;
-
-       struct exp_tid_set tid_group_list;
-       struct exp_tid_set tid_used_list;
-       struct exp_tid_set tid_full_list;
-
-       /* lock protecting all Expected TID data */
-       struct mutex exp_lock;
-       /* number of pio bufs for this ctxt (all procs, if shared) */
-       u32 piocnt;
-       /* first pio buffer for this ctxt */
-       u32 pio_base;
-       /* chip offset of PIO buffers for this ctxt */
-       u32 piobufs;
-       /* per-context configuration flags */
-       u32 flags;
-       /* per-context event flags for fileops/intr communication */
-       unsigned long event_flags;
-       /* WAIT_RCV that timed out, no interrupt */
-       u32 rcvwait_to;
-       /* WAIT_PIO that timed out, no interrupt */
-       u32 piowait_to;
-       /* WAIT_RCV already happened, no wait */
-       u32 rcvnowait;
-       /* WAIT_PIO already happened, no wait */
-       u32 pionowait;
-       /* total number of polled urgent packets */
-       u32 urgent;
-       /* saved total number of polled urgent packets for poll edge trigger */
-       u32 urgent_poll;
-       /* pid of process using this ctxt */
-       pid_t pid;
-       pid_t subpid[HFI1_MAX_SHARED_CTXTS];
-       /* same size as task_struct .comm[], command that opened context */
-       char comm[TASK_COMM_LEN];
-       /* so file ops can get at unit */
-       struct hfi1_devdata *dd;
-       /* so functions that need physical port can get it easily */
-       struct hfi1_pportdata *ppd;
-       /* A page of memory for rcvhdrhead, rcvegrhead, rcvegrtail * N */
-       void *subctxt_uregbase;
-       /* An array of pages for the eager receive buffers * N */
-       void *subctxt_rcvegrbuf;
-       /* An array of pages for the eager header queue entries * N */
-       void *subctxt_rcvhdr_base;
-       /* The version of the library which opened this ctxt */
-       u32 userversion;
-       /* Bitmask of active slaves */
-       u32 active_slaves;
-       /* Type of packets or conditions we want to poll for */
-       u16 poll_type;
-       /* receive packet sequence counter */
-       u8 seq_cnt;
-       u8 redirect_seq_cnt;
-       /* ctxt rcvhdrq head offset */
-       u32 head;
-       u32 pkt_count;
-       /* QPs waiting for context processing */
-       struct list_head qp_wait_list;
-       /* interrupt handling */
-       u64 imask;      /* clear interrupt mask */
-       int ireg;       /* clear interrupt register */
-       unsigned numa_id; /* numa node of this context */
-       /* verbs stats per CTX */
-       struct hfi1_opcode_stats_perctx *opstats;
-       /*
-        * This is the kernel thread that will keep making
-        * progress on the user sdma requests behind the scenes.
-        * There is one per context (shared contexts use the master's).
-        */
-       struct task_struct *progress;
-       struct list_head sdma_queues;
-       /* protect sdma queues */
-       spinlock_t sdma_qlock;
-
-       /* Is ASPM interrupt supported for this context */
-       bool aspm_intr_supported;
-       /* ASPM state (enabled/disabled) for this context */
-       bool aspm_enabled;
-       /* Timer for re-enabling ASPM if interrupt activity quietens down */
-       struct timer_list aspm_timer;
-       /* Lock to serialize between intr, timer intr and user threads */
-       spinlock_t aspm_lock;
-       /* Is ASPM processing enabled for this context (in intr context) */
-       bool aspm_intr_enable;
-       /* Last interrupt timestamp */
-       ktime_t aspm_ts_last_intr;
-       /* Last timestamp at which we scheduled a timer for this context */
-       ktime_t aspm_ts_timer_sched;
-
-       /*
-        * The interrupt handler for a particular receive context can vary
-        * throughout its lifetime. This is not a lock-protected data member,
-        * so it must be updated atomically, and both the previous and new
-        * values must always be valid. Worst case, we process an extra
-        * interrupt and up to 64 packets with the wrong interrupt handler.
-        */
-       int (*do_interrupt)(struct hfi1_ctxtdata *rcd, int threaded);
-};
-
-/*
- * Represents a single packet at a high level. Put commonly computed things in
- * here so we do not have to keep doing them over and over. The rule of thumb is
- * if something is used one time to derive some value, store that something in
- * here. If it is used multiple times, then store the result of that derivation
- * in here.
- */
-struct hfi1_packet {
-       void *ebuf;
-       void *hdr;
-       struct hfi1_ctxtdata *rcd;
-       __le32 *rhf_addr;
-       struct rvt_qp *qp;
-       struct hfi1_other_headers *ohdr;
-       u64 rhf;
-       u32 maxcnt;
-       u32 rhqoff;
-       u32 hdrqtail;
-       int numpkt;
-       u16 tlen;
-       u16 hlen;
-       s16 etail;
-       u16 rsize;
-       u8 updegr;
-       u8 rcv_flags;
-       u8 etype;
-};
-
-static inline bool has_sc4_bit(struct hfi1_packet *p)
-{
-       return !!rhf_dc_info(p->rhf);
-}
-
-/*
- * Private data for snoop/capture support.
- */
-struct hfi1_snoop_data {
-       int mode_flag;
-       struct cdev cdev;
-       struct device *class_dev;
-       /* protect snoop data */
-       spinlock_t snoop_lock;
-       struct list_head queue;
-       wait_queue_head_t waitq;
-       void *filter_value;
-       int (*filter_callback)(void *hdr, void *data, void *value);
-       u64 dcc_cfg; /* saved value of DCC Cfg register */
-};
-
-/* snoop mode_flag values */
-#define HFI1_PORT_SNOOP_MODE     1U
-#define HFI1_PORT_CAPTURE_MODE   2U
-
-struct rvt_sge_state;
-
-/*
- * Get/Set IB link-level config parameters for f_get/set_ib_cfg()
- * Mostly for MADs that set or query link parameters, also ipath
- * config interfaces
- */
-#define HFI1_IB_CFG_LIDLMC 0 /* LID (LS16b) and Mask (MS16b) */
-#define HFI1_IB_CFG_LWID_DG_ENB 1 /* allowed Link-width downgrade */
-#define HFI1_IB_CFG_LWID_ENB 2 /* allowed Link-width */
-#define HFI1_IB_CFG_LWID 3 /* currently active Link-width */
-#define HFI1_IB_CFG_SPD_ENB 4 /* allowed Link speeds */
-#define HFI1_IB_CFG_SPD 5 /* current Link spd */
-#define HFI1_IB_CFG_RXPOL_ENB 6 /* Auto-RX-polarity enable */
-#define HFI1_IB_CFG_LREV_ENB 7 /* Auto-Lane-reversal enable */
-#define HFI1_IB_CFG_LINKLATENCY 8 /* Link Latency (IB1.2 only) */
-#define HFI1_IB_CFG_HRTBT 9 /* IB heartbeat off/enable/auto; DDR/QDR only */
-#define HFI1_IB_CFG_OP_VLS 10 /* operational VLs */
-#define HFI1_IB_CFG_VL_HIGH_CAP 11 /* num of VL high priority weights */
-#define HFI1_IB_CFG_VL_LOW_CAP 12 /* num of VL low priority weights */
-#define HFI1_IB_CFG_OVERRUN_THRESH 13 /* IB overrun threshold */
-#define HFI1_IB_CFG_PHYERR_THRESH 14 /* IB PHY error threshold */
-#define HFI1_IB_CFG_LINKDEFAULT 15 /* IB link default (sleep/poll) */
-#define HFI1_IB_CFG_PKEYS 16 /* update partition keys */
-#define HFI1_IB_CFG_MTU 17 /* update MTU in IBC */
-#define HFI1_IB_CFG_VL_HIGH_LIMIT 19
-#define HFI1_IB_CFG_PMA_TICKS 20 /* PMA sample tick resolution */
-#define HFI1_IB_CFG_PORT 21 /* switch port we are connected to */
-
-/*
- * HFI or Host Link States
- *
- * These describe the states the driver thinks the logical and physical
- * states are in.  Used as an argument to set_link_state().  Implemented
- * as bits for easy multi-state checking.  The actual state can only be
- * one.
- */
-#define __HLS_UP_INIT_BP       0
-#define __HLS_UP_ARMED_BP      1
-#define __HLS_UP_ACTIVE_BP     2
-#define __HLS_DN_DOWNDEF_BP    3       /* link down default */
-#define __HLS_DN_POLL_BP       4
-#define __HLS_DN_DISABLE_BP    5
-#define __HLS_DN_OFFLINE_BP    6
-#define __HLS_VERIFY_CAP_BP    7
-#define __HLS_GOING_UP_BP      8
-#define __HLS_GOING_OFFLINE_BP  9
-#define __HLS_LINK_COOLDOWN_BP 10
-
-#define HLS_UP_INIT      BIT(__HLS_UP_INIT_BP)
-#define HLS_UP_ARMED     BIT(__HLS_UP_ARMED_BP)
-#define HLS_UP_ACTIVE    BIT(__HLS_UP_ACTIVE_BP)
-#define HLS_DN_DOWNDEF   BIT(__HLS_DN_DOWNDEF_BP) /* link down default */
-#define HLS_DN_POLL      BIT(__HLS_DN_POLL_BP)
-#define HLS_DN_DISABLE   BIT(__HLS_DN_DISABLE_BP)
-#define HLS_DN_OFFLINE   BIT(__HLS_DN_OFFLINE_BP)
-#define HLS_VERIFY_CAP   BIT(__HLS_VERIFY_CAP_BP)
-#define HLS_GOING_UP     BIT(__HLS_GOING_UP_BP)
-#define HLS_GOING_OFFLINE BIT(__HLS_GOING_OFFLINE_BP)
-#define HLS_LINK_COOLDOWN BIT(__HLS_LINK_COOLDOWN_BP)
-
-#define HLS_UP (HLS_UP_INIT | HLS_UP_ARMED | HLS_UP_ACTIVE)
-
-/* use this MTU size if none other is given */
-#define HFI1_DEFAULT_ACTIVE_MTU 10240
-/* use this MTU size as the default maximum */
-#define HFI1_DEFAULT_MAX_MTU 10240
-/* default partition key */
-#define DEFAULT_PKEY 0xffff
-
-/*
- * Possible fabric manager config parameters for fm_{get,set}_table()
- */
-#define FM_TBL_VL_HIGH_ARB             1 /* Get/set VL high prio weights */
-#define FM_TBL_VL_LOW_ARB              2 /* Get/set VL low prio weights */
-#define FM_TBL_BUFFER_CONTROL          3 /* Get/set Buffer Control */
-#define FM_TBL_SC2VLNT                 4 /* Get/set SC->VLnt */
-#define FM_TBL_VL_PREEMPT_ELEMS                5 /* Get (no set) VL preempt elems */
-#define FM_TBL_VL_PREEMPT_MATRIX       6 /* Get (no set) VL preempt matrix */
-
-/*
- * Possible "operations" for f_rcvctrl(ppd, op, ctxt)
- * these are bits so they can be combined, e.g.
- * HFI1_RCVCTRL_INTRAVAIL_ENB | HFI1_RCVCTRL_CTXT_ENB
- */
-#define HFI1_RCVCTRL_TAILUPD_ENB 0x01
-#define HFI1_RCVCTRL_TAILUPD_DIS 0x02
-#define HFI1_RCVCTRL_CTXT_ENB 0x04
-#define HFI1_RCVCTRL_CTXT_DIS 0x08
-#define HFI1_RCVCTRL_INTRAVAIL_ENB 0x10
-#define HFI1_RCVCTRL_INTRAVAIL_DIS 0x20
-#define HFI1_RCVCTRL_PKEY_ENB 0x40  /* Note, default is enabled */
-#define HFI1_RCVCTRL_PKEY_DIS 0x80
-#define HFI1_RCVCTRL_TIDFLOW_ENB 0x0400
-#define HFI1_RCVCTRL_TIDFLOW_DIS 0x0800
-#define HFI1_RCVCTRL_ONE_PKT_EGR_ENB 0x1000
-#define HFI1_RCVCTRL_ONE_PKT_EGR_DIS 0x2000
-#define HFI1_RCVCTRL_NO_RHQ_DROP_ENB 0x4000
-#define HFI1_RCVCTRL_NO_RHQ_DROP_DIS 0x8000
-#define HFI1_RCVCTRL_NO_EGR_DROP_ENB 0x10000
-#define HFI1_RCVCTRL_NO_EGR_DROP_DIS 0x20000
-
-/* partition enforcement flags */
-#define HFI1_PART_ENFORCE_IN   0x1
-#define HFI1_PART_ENFORCE_OUT  0x2
-
-/* how often we check for synthetic counter wrap around */
-#define SYNTH_CNT_TIME 2
-
-/* Counter flags */
-#define CNTR_NORMAL            0x0 /* Normal counters, just read register */
-#define CNTR_SYNTH             0x1 /* Synthetic counters, saturate at all 1s */
-#define CNTR_DISABLED          0x2 /* Disable this counter */
-#define CNTR_32BIT             0x4 /* Simulate 64 bits for this counter */
-#define CNTR_VL                        0x8 /* Per VL counter */
-#define CNTR_SDMA              0x10
-#define CNTR_INVALID_VL                -1  /* Specifies invalid VL */
-#define CNTR_MODE_W            0x0
-#define CNTR_MODE_R            0x1
-
-/* VLs Supported/Operational */
-#define HFI1_MIN_VLS_SUPPORTED 1
-#define HFI1_MAX_VLS_SUPPORTED 8
-
-static inline void incr_cntr64(u64 *cntr)
-{
-       if (*cntr < (u64)-1LL)
-               (*cntr)++;
-}
-
-static inline void incr_cntr32(u32 *cntr)
-{
-       if (*cntr < (u32)-1LL)
-               (*cntr)++;
-}
-
-#define MAX_NAME_SIZE 64
-struct hfi1_msix_entry {
-       enum irq_type type;
-       struct msix_entry msix;
-       void *arg;
-       char name[MAX_NAME_SIZE];
-       cpumask_t mask;
-};
-
-/* per-SL CCA information */
-struct cca_timer {
-       struct hrtimer hrtimer;
-       struct hfi1_pportdata *ppd; /* read-only */
-       int sl; /* read-only */
-       u16 ccti; /* read/write - current value of CCTI */
-};
-
-struct link_down_reason {
-       /*
-        * SMA-facing value.  Should be set from .latest when
-        * HLS_UP_* -> HLS_DN_* transition actually occurs.
-        */
-       u8 sma;
-       u8 latest;
-};
-
-enum {
-       LO_PRIO_TABLE,
-       HI_PRIO_TABLE,
-       MAX_PRIO_TABLE
-};
-
-struct vl_arb_cache {
-       /* protect vl arb cache */
-       spinlock_t lock;
-       struct ib_vl_weight_elem table[VL_ARB_TABLE_SIZE];
-};
-
-/*
- * The structure below encapsulates data relevant to a physical IB Port.
- * Current chips support only one such port, but the separation
- * clarifies things a bit. Note that to conform to IB conventions,
- * port-numbers are one-based. The first or only port is port1.
- */
-struct hfi1_pportdata {
-       struct hfi1_ibport ibport_data;
-
-       struct hfi1_devdata *dd;
-       struct kobject pport_cc_kobj;
-       struct kobject sc2vl_kobj;
-       struct kobject sl2sc_kobj;
-       struct kobject vl2mtu_kobj;
-
-       /* PHY support */
-       u32 port_type;
-       struct qsfp_data qsfp_info;
-
-       /* GUID for this interface, in host order */
-       u64 guid;
-       /* GUID for peer interface, in host order */
-       u64 neighbor_guid;
-
-       /* up or down physical link state */
-       u32 linkup;
-
-       /*
-        * this address is mapped read-only into user processes so they can
-        * get status cheaply, whenever they want.  One qword of status per port
-        */
-       u64 *statusp;
-
-       /* SendDMA related entries */
-
-       struct workqueue_struct *hfi1_wq;
-
-       /* move out of interrupt context */
-       struct work_struct link_vc_work;
-       struct work_struct link_up_work;
-       struct work_struct link_down_work;
-       struct work_struct sma_message_work;
-       struct work_struct freeze_work;
-       struct work_struct link_downgrade_work;
-       struct work_struct link_bounce_work;
-       /* host link state variables */
-       struct mutex hls_lock;
-       u32 host_link_state;
-
-       spinlock_t            sdma_alllock ____cacheline_aligned_in_smp;
-
-       u32 lstate;     /* logical link state */
-
-       /* these are the "32 bit" regs */
-
-       u32 ibmtu; /* The MTU programmed for this unit */
-       /*
-        * Current max size IB packet (in bytes) including IB headers, that
-        * we can send. Changes when ibmtu changes.
-        */
-       u32 ibmaxlen;
-       u32 current_egress_rate; /* units [10^6 bits/sec] */
-       /* LID programmed for this instance */
-       u16 lid;
-       /* list of pkeys programmed; 0 if not set */
-       u16 pkeys[MAX_PKEY_VALUES];
-       u16 link_width_supported;
-       u16 link_width_downgrade_supported;
-       u16 link_speed_supported;
-       u16 link_width_enabled;
-       u16 link_width_downgrade_enabled;
-       u16 link_speed_enabled;
-       u16 link_width_active;
-       u16 link_width_downgrade_tx_active;
-       u16 link_width_downgrade_rx_active;
-       u16 link_speed_active;
-       u8 vls_supported;
-       u8 vls_operational;
-       u8 actual_vls_operational;
-       /* LID mask control */
-       u8 lmc;
-       /* Rx Polarity inversion (compensate for ~tx on partner) */
-       u8 rx_pol_inv;
-
-       u8 hw_pidx;     /* physical port index */
-       u8 port;        /* IB port number and index into dd->pports - 1 */
-       /* type of neighbor node */
-       u8 neighbor_type;
-       u8 neighbor_normal;
-       u8 neighbor_fm_security; /* 1 if firmware checking is disabled */
-       u8 neighbor_port_number;
-       u8 is_sm_config_started;
-       u8 offline_disabled_reason;
-       u8 is_active_optimize_enabled;
-       u8 driver_link_ready;   /* driver ready for active link */
-       u8 link_enabled;        /* link enabled? */
-       u8 linkinit_reason;
-       u8 local_tx_rate;       /* rate given to 8051 firmware */
-       u8 last_pstate;         /* info only */
-
-       /* placeholders for IB MAD packet settings */
-       u8 overrun_threshold;
-       u8 phy_error_threshold;
-
-       /* Used to override LED behavior for things like maintenance beaconing */
-       /*
-        * Alternates per phase of blink
-        * [0] holds LED off duration, [1] holds LED on duration
-        */
-       unsigned long led_override_vals[2];
-       u8 led_override_phase; /* LSB picks from vals[] */
-       atomic_t led_override_timer_active;
-       /* Used to flash LEDs in override mode */
-       struct timer_list led_override_timer;
-
-       u32 sm_trap_qp;
-       u32 sa_qp;
-
-       /*
-        * cca_timer_lock protects access to the per-SL cca_timer
-        * structures (specifically the ccti member).
-        */
-       spinlock_t cca_timer_lock ____cacheline_aligned_in_smp;
-       struct cca_timer cca_timer[OPA_MAX_SLS];
-
-       /* List of congestion control table entries */
-       struct ib_cc_table_entry_shadow ccti_entries[CC_TABLE_SHADOW_MAX];
-
-       /* congestion entries, each entry corresponding to a SL */
-       struct opa_congestion_setting_entry_shadow
-               congestion_entries[OPA_MAX_SLS];
-
-       /*
-        * cc_state_lock protects (write) access to the per-port
-        * struct cc_state.
-        */
-       spinlock_t cc_state_lock ____cacheline_aligned_in_smp;
-
-       struct cc_state __rcu *cc_state;
-
-       /* Total number of congestion control table entries */
-       u16 total_cct_entry;
-
-       /* Bit map identifying service level */
-       u32 cc_sl_control_map;
-
-       /* CA's max number of 64 entry units in the congestion control table */
-       u8 cc_max_table_entries;
-
-       /*
-        * begin congestion log related entries
-        * cc_log_lock protects all congestion log related data
-        */
-       spinlock_t cc_log_lock ____cacheline_aligned_in_smp;
-       u8 threshold_cong_event_map[OPA_MAX_SLS / 8];
-       u16 threshold_event_counter;
-       struct opa_hfi1_cong_log_event_internal cc_events[OPA_CONG_LOG_ELEMS];
-       int cc_log_idx; /* index for logging events */
-       int cc_mad_idx; /* index for reporting events */
-       /* end congestion log related entries */
-
-       struct vl_arb_cache vl_arb_cache[MAX_PRIO_TABLE];
-
-       /* port relative counter buffer */
-       u64 *cntrs;
-       /* port relative synthetic counter buffer */
-       u64 *scntrs;
-       /* port_xmit_discards are synthesized from different egress errors */
-       u64 port_xmit_discards;
-       u64 port_xmit_discards_vl[C_VL_COUNT];
-       u64 port_xmit_constraint_errors;
-       u64 port_rcv_constraint_errors;
-       /* count of 'link_err' interrupts from DC */
-       u64 link_downed;
-       /* number of times link retrained successfully */
-       u64 link_up;
-       /* number of times a link unknown frame was reported */
-       u64 unknown_frame_count;
-       /* port_ltp_crc_mode is returned in 'portinfo' MADs */
-       u16 port_ltp_crc_mode;
-       /* port_crc_mode_enabled is the crc we support */
-       u8 port_crc_mode_enabled;
-       /* mgmt_allowed is also returned in 'portinfo' MADs */
-       u8 mgmt_allowed;
-       u8 part_enforce; /* partition enforcement flags */
-       struct link_down_reason local_link_down_reason;
-       struct link_down_reason neigh_link_down_reason;
-       /* Value to be sent to link peer on LinkDown. */
-       u8 remote_link_down_reason;
-       /* Error events that will cause a port bounce. */
-       u32 port_error_action;
-       struct work_struct linkstate_active_work;
-       /* Does this port need to prescan for FECNs */
-       bool cc_prescan;
-};
-
-typedef int (*rhf_rcv_function_ptr)(struct hfi1_packet *packet);
-
-typedef void (*opcode_handler)(struct hfi1_packet *packet);
-
-/* return values for the RHF receive functions */
-#define RHF_RCV_CONTINUE  0    /* keep going */
-#define RHF_RCV_DONE     1     /* stop, this packet processed */
-#define RHF_RCV_REPROCESS 2    /* stop. retain this packet */
-
-struct rcv_array_data {
-       u8 group_size;
-       u16 ngroups;
-       u16 nctxt_extra;
-};
-
-struct per_vl_data {
-       u16 mtu;
-       struct send_context *sc;
-};
-
-/* 16 to directly index */
-#define PER_VL_SEND_CONTEXTS 16
-
-struct err_info_rcvport {
-       u8 status_and_code;
-       u64 packet_flit1;
-       u64 packet_flit2;
-};
-
-struct err_info_constraint {
-       u8 status;
-       u16 pkey;
-       u32 slid;
-};
-
-struct hfi1_temp {
-       unsigned int curr;       /* current temperature */
-       unsigned int lo_lim;     /* low temperature limit */
-       unsigned int hi_lim;     /* high temperature limit */
-       unsigned int crit_lim;   /* critical temperature limit */
-       u8 triggers;      /* temperature triggers */
-};
-
-/* common data between shared ASIC HFIs */
-struct hfi1_asic_data {
-       struct hfi1_devdata *dds[2];    /* back pointers */
-       struct mutex asic_resource_mutex;
-};
-
-/* The device data struct now contains only "general per-device" info.
- * Fields related to a physical IB port are in an hfi1_pportdata struct.
- */
-struct sdma_engine;
-struct sdma_vl_map;
-
-#define BOARD_VERS_MAX 96 /* how long the version string can be */
-#define SERIAL_MAX 16 /* length of the serial number */
-
-typedef int (*send_routine)(struct rvt_qp *, struct hfi1_pkt_state *, u64);
-struct hfi1_devdata {
-       struct hfi1_ibdev verbs_dev;     /* must be first */
-       struct list_head list;
-       /* pointers to related structs for this device */
-       /* pci access data structure */
-       struct pci_dev *pcidev;
-       struct cdev user_cdev;
-       struct cdev diag_cdev;
-       struct cdev ui_cdev;
-       struct device *user_device;
-       struct device *diag_device;
-       struct device *ui_device;
-
-       /* mem-mapped pointer to base of chip regs */
-       u8 __iomem *kregbase;
-       /* end of mem-mapped chip space excluding sendbuf and user regs */
-       u8 __iomem *kregend;
-       /* physical address of chip for io_remap, etc. */
-       resource_size_t physaddr;
-       /* receive context data */
-       struct hfi1_ctxtdata **rcd;
-       /* send context data */
-       struct send_context_info *send_contexts;
-       /* map hardware send contexts to software index */
-       u8 *hw_to_sw;
-       /* spinlock for allocating and releasing send context resources */
-       spinlock_t sc_lock;
-       /* Per VL data. Enough for all VLs but not all elements are set/used. */
-       struct per_vl_data vld[PER_VL_SEND_CONTEXTS];
-       /* lock for pio_map */
-       spinlock_t pio_map_lock;
-       /* array of kernel send contexts */
-       struct send_context **kernel_send_context;
-       /* array of vl maps */
-       struct pio_vl_map __rcu *pio_map;
-       /* seqlock for sc2vl */
-       seqlock_t sc2vl_lock;
-       u64 sc2vl[4];
-       /* Send Context initialization lock. */
-       spinlock_t sc_init_lock;
-
-       /* fields common to all SDMA engines */
-
-       /* default flags to last descriptor */
-       u64 default_desc1;
-       volatile __le64                    *sdma_heads_dma; /* DMA'ed by chip */
-       dma_addr_t                          sdma_heads_phys;
-       void                               *sdma_pad_dma; /* DMA'ed by chip */
-       dma_addr_t                          sdma_pad_phys;
-       /* for deallocation */
-       size_t                              sdma_heads_size;
-       /* number from the chip */
-       u32                                 chip_sdma_engines;
-       /* num used */
-       u32                                 num_sdma;
-       /* lock for sdma_map */
-       spinlock_t                          sde_map_lock;
-       /* array of engines sized by num_sdma */
-       struct sdma_engine                 *per_sdma;
-       /* array of vl maps */
-       struct sdma_vl_map __rcu           *sdma_map;
-       /* SPC freeze waitqueue and variable */
-       wait_queue_head_t                 sdma_unfreeze_wq;
-       atomic_t                          sdma_unfreeze_count;
-
-       /* common data between shared ASIC HFIs in this OS */
-       struct hfi1_asic_data *asic_data;
-
-       /* hfi1_pportdata, points to array of (physical) port-specific
-        * data structs, indexed by pidx (0..n-1)
-        */
-       struct hfi1_pportdata *pport;
-
-       /* mem-mapped pointer to base of PIO buffers */
-       void __iomem *piobase;
-       /*
-        * write-combining mem-mapped pointer to base of RcvArray
-        * memory.
-        */
-       void __iomem *rcvarray_wc;
-       /*
-        * credit return base - a per-NUMA range of DMA address that
-        * the chip will use to update the per-context free counter
-        */
-       struct credit_return_base *cr_base;
-
-       /* send context numbers and sizes for each type */
-       struct sc_config_sizes sc_sizes[SC_MAX];
-
-       u32 lcb_access_count;           /* count of LCB users */
-
-       char *boardname; /* human readable board info */
-
-       /* device (not port) flags, basically device capabilities */
-       u32 flags;
-
-       /* reset value */
-       u64 z_int_counter;
-       u64 z_rcv_limit;
-       u64 z_send_schedule;
-       /* percpu int_counter */
-       u64 __percpu *int_counter;
-       u64 __percpu *rcv_limit;
-       u64 __percpu *send_schedule;
-       /* number of receive contexts in use by the driver */
-       u32 num_rcv_contexts;
-       /* number of pio send contexts in use by the driver */
-       u32 num_send_contexts;
-       /*
-        * number of ctxts available for PSM open
-        */
-       u32 freectxts;
-       /* total number of available user/PSM contexts */
-       u32 num_user_contexts;
-       /* base receive interrupt timeout, in CSR units */
-       u32 rcv_intr_timeout_csr;
-
-       u64 __iomem *egrtidbase;
-       spinlock_t sendctrl_lock; /* protect changes to SendCtrl */
-       spinlock_t rcvctrl_lock; /* protect changes to RcvCtrl */
-       /* around rcd and (user ctxts) ctxt_cnt use (intr vs free) */
-       spinlock_t uctxt_lock; /* rcd and user context changes */
-       /* exclusive access to 8051 */
-       spinlock_t dc8051_lock;
-       /* exclusive access to 8051 memory */
-       spinlock_t dc8051_memlock;
-       int dc8051_timed_out;   /* remember if the 8051 timed out */
-       /*
-        * A page that will hold event notification bitmaps for all
-        * contexts. This page will be mapped into all processes.
-        */
-       unsigned long *events;
-       /*
-        * per unit status, see also portdata statusp
-        * mapped read-only into user processes so they can get unit and
-        * IB link status cheaply
-        */
-       struct hfi1_status *status;
-       u32 freezelen; /* max length of freezemsg */
-
-       /* revision register shadow */
-       u64 revision;
-       /* Base GUID for device (network order) */
-       u64 base_guid;
-
-       /* these are the "32 bit" regs */
-
-       /* value we put in kr_rcvhdrsize */
-       u32 rcvhdrsize;
-       /* number of receive contexts the chip supports */
-       u32 chip_rcv_contexts;
-       /* number of receive array entries */
-       u32 chip_rcv_array_count;
-       /* number of PIO send contexts the chip supports */
-       u32 chip_send_contexts;
-       /* number of bytes in the PIO memory buffer */
-       u32 chip_pio_mem_size;
-       /* number of bytes in the SDMA memory buffer */
-       u32 chip_sdma_mem_size;
-
-       /* size of each rcvegrbuffer */
-       u32 rcvegrbufsize;
-       /* log2 of above */
-       u16 rcvegrbufsize_shift;
-       /* both sides of the PCIe link are gen3 capable */
-       u8 link_gen3_capable;
-       /* localbus width (1, 2, 4, 8, 16, 32) from config space */
-       u32 lbus_width;
-       /* localbus speed in MHz */
-       u32 lbus_speed;
-       int unit; /* unit # of this chip */
-       int node; /* home node of this chip */
-
-       /* save these PCI fields to restore after a reset */
-       u32 pcibar0;
-       u32 pcibar1;
-       u32 pci_rom;
-       u16 pci_command;
-       u16 pcie_devctl;
-       u16 pcie_lnkctl;
-       u16 pcie_devctl2;
-       u32 pci_msix0;
-       u32 pci_lnkctl3;
-       u32 pci_tph2;
-
-       /*
-        * ASCII serial number, from flash, large enough for original
-        * all digit strings, and longer serial number format
-        */
-       u8 serial[SERIAL_MAX];
-       /* human readable board version */
-       u8 boardversion[BOARD_VERS_MAX];
-       u8 lbus_info[32]; /* human readable localbus info */
-       /* chip major rev, from CceRevision */
-       u8 majrev;
-       /* chip minor rev, from CceRevision */
-       u8 minrev;
-       /* hardware ID */
-       u8 hfi1_id;
-       /* implementation code */
-       u8 icode;
-       /* default link down value (poll/sleep) */
-       u8 link_default;
-       /* vAU of this device */
-       u8 vau;
-       /* vCU of this device */
-       u8 vcu;
-       /* link credits of this device */
-       u16 link_credits;
-       /* initial vl15 credits to use */
-       u16 vl15_init;
-
-       /* Misc small ints */
-       /* Number of physical ports available */
-       u8 num_pports;
-       /* Lowest context number which can be used by user processes */
-       u8 first_user_ctxt;
-       u8 n_krcv_queues;
-       u8 qos_shift;
-       u8 qpn_mask;
-
-       u16 rhf_offset; /* offset of RHF within receive header entry */
-       u16 irev;       /* implementation revision */
-       u16 dc8051_ver; /* 8051 firmware version */
-
-       struct platform_config platform_config;
-       struct platform_config_cache pcfg_cache;
-
-       struct diag_client *diag_client;
-       spinlock_t hfi1_diag_trans_lock; /* protect diag observer ops */
-
-       u8 psxmitwait_supported;
-       /* cycle length of PS* counters in HW (in picoseconds) */
-       u16 psxmitwait_check_rate;
-       /* high volume overflow errors deferred to tasklet */
-       struct tasklet_struct error_tasklet;
-
-       /* MSI-X information */
-       struct hfi1_msix_entry *msix_entries;
-       u32 num_msix_entries;
-
-       /* INTx information */
-       u32 requested_intx_irq;         /* did we request one? */
-       char intx_name[MAX_NAME_SIZE];  /* INTx name */
-
-       /* general interrupt: mask of handled interrupts */
-       u64 gi_mask[CCE_NUM_INT_CSRS];
-
-       struct rcv_array_data rcv_entries;
-
-       /*
-        * 64 bit synthetic counters
-        */
-       struct timer_list synth_stats_timer;
-
-       /*
-        * device counters
-        */
-       char *cntrnames;
-       size_t cntrnameslen;
-       size_t ndevcntrs;
-       u64 *cntrs;
-       u64 *scntrs;
-
-       /*
-        * remembered values for synthetic counters
-        */
-       u64 last_tx;
-       u64 last_rx;
-
-       /*
-        * per-port counters
-        */
-       size_t nportcntrs;
-       char *portcntrnames;
-       size_t portcntrnameslen;
-
-       struct hfi1_snoop_data hfi1_snoop;
-
-       struct err_info_rcvport err_info_rcvport;
-       struct err_info_constraint err_info_rcv_constraint;
-       struct err_info_constraint err_info_xmit_constraint;
-       u8 err_info_uncorrectable;
-       u8 err_info_fmconfig;
-
-       atomic_t drop_packet;
-       u8 do_drop;
-
-       /*
-        * Software counters for the status bits defined by the
-        * associated error status registers
-        */
-       u64 cce_err_status_cnt[NUM_CCE_ERR_STATUS_COUNTERS];
-       u64 rcv_err_status_cnt[NUM_RCV_ERR_STATUS_COUNTERS];
-       u64 misc_err_status_cnt[NUM_MISC_ERR_STATUS_COUNTERS];
-       u64 send_pio_err_status_cnt[NUM_SEND_PIO_ERR_STATUS_COUNTERS];
-       u64 send_dma_err_status_cnt[NUM_SEND_DMA_ERR_STATUS_COUNTERS];
-       u64 send_egress_err_status_cnt[NUM_SEND_EGRESS_ERR_STATUS_COUNTERS];
-       u64 send_err_status_cnt[NUM_SEND_ERR_STATUS_COUNTERS];
-
-       /* Software counter that spans all contexts */
-       u64 sw_ctxt_err_status_cnt[NUM_SEND_CTXT_ERR_STATUS_COUNTERS];
-       /* Software counter that spans all DMA engines */
-       u64 sw_send_dma_eng_err_status_cnt[
-               NUM_SEND_DMA_ENG_ERR_STATUS_COUNTERS];
-       /* Software counter that aggregates all cce_err_status errors */
-       u64 sw_cce_err_status_aggregate;
-
-       /* receive interrupt functions */
-       rhf_rcv_function_ptr *rhf_rcv_function_map;
-       rhf_rcv_function_ptr normal_rhf_rcv_functions[8];
-
-       /*
-        * Handlers for outgoing data so that snoop/capture does not
-        * have to have its hooks in the send path
-        */
-       send_routine process_pio_send;
-       send_routine process_dma_send;
-       void (*pio_inline_send)(struct hfi1_devdata *dd, struct pio_buf *pbuf,
-                               u64 pbc, const void *from, size_t count);
-
-       /* OUI comes from the HW. Used everywhere as 3 separate bytes. */
-       u8 oui1;
-       u8 oui2;
-       u8 oui3;
-       /* Timer and counter used to detect RcvBufOvflCnt changes */
-       struct timer_list rcverr_timer;
-       u32 rcv_ovfl_cnt;
-
-       wait_queue_head_t event_queue;
-
-       /* Save the enabled LCB error bits */
-       u64 lcb_err_en;
-       u8 dc_shutdown;
-
-       /* receive context tail dummy address */
-       __le64 *rcvhdrtail_dummy_kvaddr;
-       dma_addr_t rcvhdrtail_dummy_physaddr;
-
-       bool eprom_available;   /* true if EPROM is available for this device */
-       bool aspm_supported;    /* Does HW support ASPM */
-       bool aspm_enabled;      /* ASPM state: enabled/disabled */
-       /* Serialize ASPM enable/disable between multiple verbs contexts */
-       spinlock_t aspm_lock;
-       /* Number of verbs contexts which have disabled ASPM */
-       atomic_t aspm_disabled_cnt;
-
-       struct hfi1_affinity *affinity;
-};
-
-/* 8051 firmware version helper */
-#define dc8051_ver(a, b) ((a) << 8 | (b))
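For example (illustrative only), dc8051_ver(0, 20) packs major version 0 and
minor version 20 into 0x0014, the same encoding stored in the dc8051_ver field
of struct hfi1_devdata.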
-
-/* f_put_tid types */
-#define PT_EXPECTED 0
-#define PT_EAGER    1
-#define PT_INVALID  2
-
-struct tid_rb_node;
-struct mmu_rb_node;
-
-/* Private data for file operations */
-struct hfi1_filedata {
-       struct hfi1_ctxtdata *uctxt;
-       unsigned subctxt;
-       struct hfi1_user_sdma_comp_q *cq;
-       struct hfi1_user_sdma_pkt_q *pq;
-       /* for cpu affinity; -1 if none */
-       int rec_cpu_num;
-       u32 tid_n_pinned;
-       struct rb_root tid_rb_root;
-       struct tid_rb_node **entry_to_rb;
-       spinlock_t tid_lock; /* protect tid_[limit,used] counters */
-       u32 tid_limit;
-       u32 tid_used;
-       u32 *invalid_tids;
-       u32 invalid_tid_idx;
-       /* protect invalid_tids array and invalid_tid_idx */
-       spinlock_t invalid_lock;
-};
-
-extern struct list_head hfi1_dev_list;
-extern spinlock_t hfi1_devs_lock;
-struct hfi1_devdata *hfi1_lookup(int unit);
-extern u32 hfi1_cpulist_count;
-extern unsigned long *hfi1_cpulist;
-
-extern unsigned int snoop_drop_send;
-extern unsigned int snoop_force_capture;
-int hfi1_init(struct hfi1_devdata *, int);
-int hfi1_count_units(int *npresentp, int *nupp);
-int hfi1_count_active_units(void);
-
-int hfi1_diag_add(struct hfi1_devdata *);
-void hfi1_diag_remove(struct hfi1_devdata *);
-void handle_linkup_change(struct hfi1_devdata *dd, u32 linkup);
-
-void handle_user_interrupt(struct hfi1_ctxtdata *rcd);
-
-int hfi1_create_rcvhdrq(struct hfi1_devdata *, struct hfi1_ctxtdata *);
-int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *);
-int hfi1_create_ctxts(struct hfi1_devdata *dd);
-struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *, u32, int);
-void hfi1_init_pportdata(struct pci_dev *, struct hfi1_pportdata *,
-                        struct hfi1_devdata *, u8, u8);
-void hfi1_free_ctxtdata(struct hfi1_devdata *, struct hfi1_ctxtdata *);
-
-int handle_receive_interrupt(struct hfi1_ctxtdata *, int);
-int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *, int);
-int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *, int);
-void set_all_slowpath(struct hfi1_devdata *dd);
-
-/* receive packet handler dispositions */
-#define RCV_PKT_OK      0x0 /* keep going */
-#define RCV_PKT_LIMIT   0x1 /* stop, hit limit, start thread */
-#define RCV_PKT_DONE    0x2 /* stop, no more packets detected */
-
-/* calculate the current RHF address */
-static inline __le32 *get_rhf_addr(struct hfi1_ctxtdata *rcd)
-{
-       return (__le32 *)rcd->rcvhdrq + rcd->head + rcd->dd->rhf_offset;
-}
-
-int hfi1_reset_device(int);
-
-/* return the driver's idea of the logical OPA port state */
-static inline u32 driver_lstate(struct hfi1_pportdata *ppd)
-{
-       return ppd->lstate; /* use the cached value */
-}
-
-void receive_interrupt_work(struct work_struct *work);
-
-/* extract service channel from header and rhf */
-static inline int hdr2sc(struct hfi1_message_header *hdr, u64 rhf)
-{
-       return ((be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf) |
-              ((!!(rhf & RHF_DC_INFO_SMASK)) << 4);
-}
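A quick illustrative example: if SC[3:0] extracted from lrh[0] is 0x3 and the
RHF DC-info bit is set, hdr2sc() returns 0x3 | (1 << 4) = 0x13, i.e. service
channel 19.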
-
-static inline u16 generate_jkey(kuid_t uid)
-{
-       return from_kuid(current_user_ns(), uid) & 0xffff;
-}
-
-/*
- * active_egress_rate
- *
- * returns the active egress rate in units of [10^6 bits/sec]
- */
-static inline u32 active_egress_rate(struct hfi1_pportdata *ppd)
-{
-       u16 link_speed = ppd->link_speed_active;
-       u16 link_width = ppd->link_width_active;
-       u32 egress_rate;
-
-       if (link_speed == OPA_LINK_SPEED_25G)
-               egress_rate = 25000;
-       else /* assume OPA_LINK_SPEED_12_5G */
-               egress_rate = 12500;
-
-       switch (link_width) {
-       case OPA_LINK_WIDTH_4X:
-               egress_rate *= 4;
-               break;
-       case OPA_LINK_WIDTH_3X:
-               egress_rate *= 3;
-               break;
-       case OPA_LINK_WIDTH_2X:
-               egress_rate *= 2;
-               break;
-       default:
-               /* assume IB_WIDTH_1X */
-               break;
-       }
-
-       return egress_rate;
-}
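For example (illustrative numbers only): OPA_LINK_SPEED_25G at
OPA_LINK_WIDTH_4X gives 25000 * 4 = 100000, i.e. an active egress rate of
100 Gbit/s in the [10^6 bits/sec] units used here.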
-
-/*
- * egress_cycles
- *
- * Returns the number of 'fabric clock cycles' to egress a packet
- * of length 'len' bytes, at 'rate' Mbit/s. Since the fabric clock
- * rate is (approximately) 805 MHz, the units of the returned value
- * are (1/805 MHz).
- */
-static inline u32 egress_cycles(u32 len, u32 rate)
-{
-       u32 cycles;
-
-       /*
-        * cycles is:
-        *
-        *          (length) [bits] / (rate) [bits/sec]
-        *  ---------------------------------------------------
-        *  fabric_clock_period == 1 /(805 * 10^6) [cycles/sec]
-        */
-
-       cycles = len * 8; /* bits */
-       cycles *= 805;
-       cycles /= rate;
-
-       return cycles;
-}
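A worked example (illustrative only): an 8192-byte packet at a 100000 Mbit/s
rate gives 8192 * 8 = 65536 bits, so cycles = 65536 * 805 / 100000 = 527; at
805 MHz that is roughly 655 ns, which matches 65536 bits / 100 Gbit/s.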
-
-void set_link_ipg(struct hfi1_pportdata *ppd);
-void process_becn(struct hfi1_pportdata *ppd, u8 sl,  u16 rlid, u32 lqpn,
-                 u32 rqpn, u8 svc_type);
-void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn,
-               u32 pkey, u32 slid, u32 dlid, u8 sc5,
-               const struct ib_grh *old_grh);
-#define PKEY_CHECK_INVALID -1
-int egress_pkey_check(struct hfi1_pportdata *ppd, __be16 *lrh, __be32 *bth,
-                     u8 sc5, int8_t s_pkey_index);
-
-#define PACKET_EGRESS_TIMEOUT 350
-static inline void pause_for_credit_return(struct hfi1_devdata *dd)
-{
-       /* Pause at least 1us, to ensure chip returns all credits */
-       u32 usec = cclock_to_ns(dd, PACKET_EGRESS_TIMEOUT) / 1000;
-
-       udelay(usec ? usec : 1);
-}
-
-/**
- * sc_to_vlt() - reverse lookup SC to VL
- * @dd: devdata
- * @sc5: 5-bit SC
- */
-static inline u8 sc_to_vlt(struct hfi1_devdata *dd, u8 sc5)
-{
-       unsigned seq;
-       u8 rval;
-
-       if (sc5 >= OPA_MAX_SCS)
-               return (u8)(0xff);
-
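-       /* seqlock read side: retry if a writer updated sc2vl mid-read */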
-       do {
-               seq = read_seqbegin(&dd->sc2vl_lock);
-               rval = *(((u8 *)dd->sc2vl) + sc5);
-       } while (read_seqretry(&dd->sc2vl_lock, seq));
-
-       return rval;
-}
-
-#define PKEY_MEMBER_MASK 0x8000
-#define PKEY_LOW_15_MASK 0x7fff
-
-/*
- * ingress_pkey_matches_entry - return 1 if the pkey matches ent (ent
- * being an entry from the ingress partition key table), return 0
- * otherwise. Use the matching criteria for ingress partition keys
- * specified in the OPAv1 spec., section 9.10.14.
- */
-static inline int ingress_pkey_matches_entry(u16 pkey, u16 ent)
-{
-       u16 mkey = pkey & PKEY_LOW_15_MASK;
-       u16 ment = ent & PKEY_LOW_15_MASK;
-
-       if (mkey == ment) {
-               /*
-                * If pkey[15] is clear (limited partition member),
-                * is bit 15 in the corresponding table element
-                * set (full member)?
-                */
-               if (!(pkey & PKEY_MEMBER_MASK))
-                       return !!(ent & PKEY_MEMBER_MASK);
-               return 1;
-       }
-       return 0;
-}
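-
-/*
- * Example (illustrative): a limited-member pkey 0x0001 matches the
- * full-member table entry 0x8001 but not the limited-member entry
- * 0x0001, while a full-member pkey 0x8001 matches either entry.
- */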
-
-/*
- * ingress_pkey_table_search - search the entire pkey table for
- * an entry which matches 'pkey'. return 0 if a match is found,
- * and 1 otherwise.
- */
-static int ingress_pkey_table_search(struct hfi1_pportdata *ppd, u16 pkey)
-{
-       int i;
-
-       for (i = 0; i < MAX_PKEY_VALUES; i++) {
-               if (ingress_pkey_matches_entry(pkey, ppd->pkeys[i]))
-                       return 0;
-       }
-       return 1;
-}
-
-/*
- * ingress_pkey_table_fail - record a failure of ingress pkey validation,
- * i.e., increment port_rcv_constraint_errors for the port, and record
- * the 'error info' for this failure.
- */
-static void ingress_pkey_table_fail(struct hfi1_pportdata *ppd, u16 pkey,
-                                   u16 slid)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-
-       incr_cntr64(&ppd->port_rcv_constraint_errors);
-       if (!(dd->err_info_rcv_constraint.status & OPA_EI_STATUS_SMASK)) {
-               dd->err_info_rcv_constraint.status |= OPA_EI_STATUS_SMASK;
-               dd->err_info_rcv_constraint.slid = slid;
-               dd->err_info_rcv_constraint.pkey = pkey;
-       }
-}
-
-/*
- * ingress_pkey_check - Return 0 if the ingress pkey is valid, return 1
- * otherwise. Use the criteria in the OPAv1 spec, section 9.10.14. idx
- * is a hint as to the best place in the partition key table to begin
- * searching. This function should not be called on the data path, for
- * performance reasons. On the data path the pkey check is expected to be
- * done by HW, and rcv_pkey_check() should be called instead.
- */
-static inline int ingress_pkey_check(struct hfi1_pportdata *ppd, u16 pkey,
-                                    u8 sc5, u8 idx, u16 slid)
-{
-       if (!(ppd->part_enforce & HFI1_PART_ENFORCE_IN))
-               return 0;
-
-       /* If SC15, pkey[0:14] must be 0x7fff */
-       if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
-               goto bad;
-
-       /* Is the pkey = 0x0, or 0x8000? */
-       if ((pkey & PKEY_LOW_15_MASK) == 0)
-               goto bad;
-
-       /* The most likely matching pkey has index 'idx' */
-       if (ingress_pkey_matches_entry(pkey, ppd->pkeys[idx]))
-               return 0;
-
-       /* no match - try the whole table */
-       if (!ingress_pkey_table_search(ppd, pkey))
-               return 0;
-
-bad:
-       ingress_pkey_table_fail(ppd, pkey, slid);
-       return 1;
-}
-
-/*
- * rcv_pkey_check - Return 0 if the ingress pkey is valid, return 1
- * otherwise. It only ensures the pkey is valid for QP0. This function
- * should be called on the data path instead of ingress_pkey_check
- * as on data path, pkey check is done by HW (except for QP0).
- */
-static inline int rcv_pkey_check(struct hfi1_pportdata *ppd, u16 pkey,
-                                u8 sc5, u16 slid)
-{
-       if (!(ppd->part_enforce & HFI1_PART_ENFORCE_IN))
-               return 0;
-
-       /* If SC15, pkey[0:14] must be 0x7fff */
-       if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
-               goto bad;
-
-       return 0;
-bad:
-       ingress_pkey_table_fail(ppd, pkey, slid);
-       return 1;
-}
-
-/* MTU handling */
-
-/* MTU enumeration, 256-4k match IB */
-#define OPA_MTU_0     0
-#define OPA_MTU_256   1
-#define OPA_MTU_512   2
-#define OPA_MTU_1024  3
-#define OPA_MTU_2048  4
-#define OPA_MTU_4096  5
-
-u32 lrh_max_header_bytes(struct hfi1_devdata *dd);
-int mtu_to_enum(u32 mtu, int default_if_bad);
-u16 enum_to_mtu(int);
-static inline int valid_ib_mtu(unsigned int mtu)
-{
-       return mtu == 256 || mtu == 512 ||
-               mtu == 1024 || mtu == 2048 ||
-               mtu == 4096;
-}
-
-static inline int valid_opa_max_mtu(unsigned int mtu)
-{
-       return mtu >= 2048 &&
-               (valid_ib_mtu(mtu) || mtu == 8192 || mtu == 10240);
-}
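-
-/*
- * Examples (illustrative): valid_opa_max_mtu(2048) and
- * valid_opa_max_mtu(10240) are true, while valid_opa_max_mtu(1024) is
- * false because maximum MTUs below 2048 are rejected here.
- */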
-
-int set_mtu(struct hfi1_pportdata *);
-
-int hfi1_set_lid(struct hfi1_pportdata *, u32, u8);
-void hfi1_disable_after_error(struct hfi1_devdata *);
-int hfi1_set_uevent_bits(struct hfi1_pportdata *, const int);
-int hfi1_rcvbuf_validate(u32, u8, u16 *);
-
-int fm_get_table(struct hfi1_pportdata *, int, void *);
-int fm_set_table(struct hfi1_pportdata *, int, void *);
-
-void set_up_vl15(struct hfi1_devdata *dd, u8 vau, u16 vl15buf);
-void reset_link_credits(struct hfi1_devdata *dd);
-void assign_remote_cm_au_table(struct hfi1_devdata *dd, u8 vcu);
-
-int snoop_recv_handler(struct hfi1_packet *packet);
-int snoop_send_dma_handler(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
-                          u64 pbc);
-int snoop_send_pio_handler(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
-                          u64 pbc);
-void snoop_inline_pio_send(struct hfi1_devdata *dd, struct pio_buf *pbuf,
-                          u64 pbc, const void *from, size_t count);
-int set_buffer_control(struct hfi1_pportdata *ppd, struct buffer_control *bc);
-
-static inline struct hfi1_devdata *dd_from_ppd(struct hfi1_pportdata *ppd)
-{
-       return ppd->dd;
-}
-
-static inline struct hfi1_devdata *dd_from_dev(struct hfi1_ibdev *dev)
-{
-       return container_of(dev, struct hfi1_devdata, verbs_dev);
-}
-
-static inline struct hfi1_devdata *dd_from_ibdev(struct ib_device *ibdev)
-{
-       return dd_from_dev(to_idev(ibdev));
-}
-
-static inline struct hfi1_pportdata *ppd_from_ibp(struct hfi1_ibport *ibp)
-{
-       return container_of(ibp, struct hfi1_pportdata, ibport_data);
-}
-
-static inline struct hfi1_ibdev *dev_from_rdi(struct rvt_dev_info *rdi)
-{
-       return container_of(rdi, struct hfi1_ibdev, rdi);
-}
-
-static inline struct hfi1_ibport *to_iport(struct ib_device *ibdev, u8 port)
-{
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       unsigned pidx = port - 1; /* IB number port from 1, hdw from 0 */
-
-       WARN_ON(pidx >= dd->num_pports);
-       return &dd->pport[pidx].ibport_data;
-}
-
-/*
- * Return the indexed PKEY from the port PKEY table.
- */
-static inline u16 hfi1_get_pkey(struct hfi1_ibport *ibp, unsigned index)
-{
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       u16 ret;
-
-       if (index >= ARRAY_SIZE(ppd->pkeys))
-               ret = 0;
-       else
-               ret = ppd->pkeys[index];
-
-       return ret;
-}
-
-/*
- * Readers of cc_state must call get_cc_state() under rcu_read_lock().
- * Writers of cc_state must call get_cc_state() under cc_state_lock.
- */
-static inline struct cc_state *get_cc_state(struct hfi1_pportdata *ppd)
-{
-       return rcu_dereference(ppd->cc_state);
-}
-
-/*
- * values for dd->flags (_device_ related flags)
- */
-#define HFI1_INITTED           0x1    /* chip and driver up and initted */
-#define HFI1_PRESENT           0x2    /* chip accesses can be done */
-#define HFI1_FROZEN            0x4    /* chip in SPC freeze */
-#define HFI1_HAS_SDMA_TIMEOUT  0x8
-#define HFI1_HAS_SEND_DMA      0x10   /* Supports Send DMA */
-#define HFI1_FORCED_FREEZE     0x80   /* driver forced freeze mode */
-
-/* IB dword length mask in PBC (lower 11 bits); same for all chips */
-#define HFI1_PBC_LENGTH_MASK                     ((1 << 11) - 1)
-
-/* ctxt_flag bit offsets */
-               /* context has been setup */
-#define HFI1_CTXT_SETUP_DONE 1
-               /* waiting for a packet to arrive */
-#define HFI1_CTXT_WAITING_RCV   2
-               /* master has not finished initializing */
-#define HFI1_CTXT_MASTER_UNINIT 4
-               /* waiting for an urgent packet to arrive */
-#define HFI1_CTXT_WAITING_URG 5
-
-/* free up any allocated data at close */
-struct hfi1_devdata *hfi1_init_dd(struct pci_dev *,
-                                 const struct pci_device_id *);
-void hfi1_free_devdata(struct hfi1_devdata *);
-void cc_state_reclaim(struct rcu_head *rcu);
-struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra);
-
-/* LED beaconing functions */
-void hfi1_start_led_override(struct hfi1_pportdata *ppd, unsigned int timeon,
-                            unsigned int timeoff);
-void shutdown_led_override(struct hfi1_pportdata *ppd);
-
-#define HFI1_CREDIT_RETURN_RATE (100)
-
-/*
- * The number of words for the KDETH protocol field.  If this is
- * larger than the actual field used, then part of the payload
- * will be in the header.
- *
- * Optimally, we want this sized so that a typical case will
- * use full cache lines.  The typical local KDETH header would
- * be:
- *
- *     Bytes   Field
- *       8     LRH
- *      12     BTH
- *      ??     KDETH
- *       8     RHF
- *     ---
- *      28 + KDETH
- *
- * For a 64-byte cache line, KDETH would need to be 36 bytes or 9 DWORDS
- */
-#define DEFAULT_RCVHDRSIZE 9
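-
-/*
- * Arithmetic check: 9 DWORDs is 36 bytes, so 28 + 36 = 64 bytes, i.e.
- * exactly one 64-byte cache line.
- */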
-
-/*
- * Maximal header byte count:
- *
- *     Bytes   Field
- *       8     LRH
- *      40     GRH (optional)
- *      12     BTH
- *      ??     KDETH
- *       8     RHF
- *     ---
- *      68 + KDETH
- *
- * We also want to maintain a cache line alignment to assist DMA'ing
- * of the header bytes.  Round up to a good size.
- */
-#define DEFAULT_RCVHDR_ENTSIZE 32
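-
-/*
- * Arithmetic check: 32 DWORDs is 128 bytes (two 64-byte cache lines),
- * which covers the 68 bytes above plus a KDETH field of up to 60 bytes.
- */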
-
-bool hfi1_can_pin_pages(struct hfi1_devdata *, u32, u32);
-int hfi1_acquire_user_pages(unsigned long, size_t, bool, struct page **);
-void hfi1_release_user_pages(struct mm_struct *, struct page **, size_t, bool);
-
-static inline void clear_rcvhdrtail(const struct hfi1_ctxtdata *rcd)
-{
-       *((u64 *)rcd->rcvhdrtail_kvaddr) = 0ULL;
-}
-
-static inline u32 get_rcvhdrtail(const struct hfi1_ctxtdata *rcd)
-{
-       /*
-        * volatile because it's a DMA target from the chip, routine is
-        * inlined, and don't want register caching or reordering.
-        */
-       return (u32)le64_to_cpu(*rcd->rcvhdrtail_kvaddr);
-}
-
-/*
- * sysfs interface.
- */
-
-extern const char ib_hfi1_version[];
-
-int hfi1_device_create(struct hfi1_devdata *);
-void hfi1_device_remove(struct hfi1_devdata *);
-
-int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num,
-                          struct kobject *kobj);
-int hfi1_verbs_register_sysfs(struct hfi1_devdata *);
-void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *);
-/* Hook for sysfs read of QSFP */
-int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len);
-
-int hfi1_pcie_init(struct pci_dev *, const struct pci_device_id *);
-void hfi1_pcie_cleanup(struct pci_dev *);
-int hfi1_pcie_ddinit(struct hfi1_devdata *, struct pci_dev *,
-                    const struct pci_device_id *);
-void hfi1_pcie_ddcleanup(struct hfi1_devdata *);
-void hfi1_pcie_flr(struct hfi1_devdata *);
-int pcie_speeds(struct hfi1_devdata *);
-void request_msix(struct hfi1_devdata *, u32 *, struct hfi1_msix_entry *);
-void hfi1_enable_intx(struct pci_dev *);
-void restore_pci_variables(struct hfi1_devdata *dd);
-int do_pcie_gen3_transition(struct hfi1_devdata *dd);
-int parse_platform_config(struct hfi1_devdata *dd);
-int get_platform_config_field(struct hfi1_devdata *dd,
-                             enum platform_config_table_type_encoding
-                             table_type, int table_index, int field_index,
-                             u32 *data, u32 len);
-
-const char *get_unit_name(int unit);
-const char *get_card_name(struct rvt_dev_info *rdi);
-struct pci_dev *get_pci_dev(struct rvt_dev_info *rdi);
-
-/*
- * Flush write combining store buffers (if present) and perform a write
- * barrier.
- */
-static inline void flush_wc(void)
-{
-       asm volatile("sfence" : : : "memory");
-}
-
-void handle_eflags(struct hfi1_packet *packet);
-int process_receive_ib(struct hfi1_packet *packet);
-int process_receive_bypass(struct hfi1_packet *packet);
-int process_receive_error(struct hfi1_packet *packet);
-int kdeth_process_expected(struct hfi1_packet *packet);
-int kdeth_process_eager(struct hfi1_packet *packet);
-int process_receive_invalid(struct hfi1_packet *packet);
-
-extern rhf_rcv_function_ptr snoop_rhf_rcv_functions[8];
-
-void update_sge(struct rvt_sge_state *ss, u32 length);
-
-/* global module parameter variables */
-extern unsigned int hfi1_max_mtu;
-extern unsigned int hfi1_cu;
-extern unsigned int user_credit_return_threshold;
-extern int num_user_contexts;
-extern unsigned n_krcvqs;
-extern uint krcvqs[];
-extern int krcvqsset;
-extern uint kdeth_qp;
-extern uint loopback;
-extern uint quick_linkup;
-extern uint rcv_intr_timeout;
-extern uint rcv_intr_count;
-extern uint rcv_intr_dynamic;
-extern ushort link_crc_mask;
-
-extern struct mutex hfi1_mutex;
-
-/* Number of seconds before our card status check...  */
-#define STATUS_TIMEOUT 60
-
-#define DRIVER_NAME            "hfi1"
-#define HFI1_USER_MINOR_BASE     0
-#define HFI1_TRACE_MINOR         127
-#define HFI1_DIAGPKT_MINOR       128
-#define HFI1_DIAG_MINOR_BASE     129
-#define HFI1_SNOOP_CAPTURE_BASE  200
-#define HFI1_NMINORS             255
-
-#define PCI_VENDOR_ID_INTEL 0x8086
-#define PCI_DEVICE_ID_INTEL0 0x24f0
-#define PCI_DEVICE_ID_INTEL1 0x24f1
-
-#define HFI1_PKT_USER_SC_INTEGRITY                                         \
-       (SEND_CTXT_CHECK_ENABLE_DISALLOW_NON_KDETH_PACKETS_SMASK            \
-       | SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK           \
-       | SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_SMASK              \
-       | SEND_CTXT_CHECK_ENABLE_DISALLOW_GRH_SMASK)
-
-#define HFI1_PKT_KERNEL_SC_INTEGRITY                                       \
-       (SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK)
-
-static inline u64 hfi1_pkt_default_send_ctxt_mask(struct hfi1_devdata *dd,
-                                                 u16 ctxt_type)
-{
-       u64 base_sc_integrity =
-       SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK
-       | SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK
-       | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK
-       | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK
-       | SEND_CTXT_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK
-       | SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK
-       | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK
-       | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK
-       | SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK
-       | SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_SMASK
-       | SEND_CTXT_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK
-       | SEND_CTXT_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK
-       | SEND_CTXT_CHECK_ENABLE_CHECK_OPCODE_SMASK
-       | SEND_CTXT_CHECK_ENABLE_CHECK_SLID_SMASK
-       | SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK
-       | SEND_CTXT_CHECK_ENABLE_CHECK_VL_SMASK
-       | SEND_CTXT_CHECK_ENABLE_CHECK_ENABLE_SMASK;
-
-       if (ctxt_type == SC_USER)
-               base_sc_integrity |= HFI1_PKT_USER_SC_INTEGRITY;
-       else
-               base_sc_integrity |= HFI1_PKT_KERNEL_SC_INTEGRITY;
-
-       if (is_ax(dd))
-               /* turn off send-side job key checks - A0 */
-               return base_sc_integrity &
-                      ~SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
-       return base_sc_integrity;
-}
-
-static inline u64 hfi1_pkt_base_sdma_integrity(struct hfi1_devdata *dd)
-{
-       u64 base_sdma_integrity =
-       SEND_DMA_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK
-       | SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK
-       | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK
-       | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK
-       | SEND_DMA_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK
-       | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK
-       | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK
-       | SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK
-       | SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_SMASK
-       | SEND_DMA_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK
-       | SEND_DMA_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK
-       | SEND_DMA_CHECK_ENABLE_CHECK_OPCODE_SMASK
-       | SEND_DMA_CHECK_ENABLE_CHECK_SLID_SMASK
-       | SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK
-       | SEND_DMA_CHECK_ENABLE_CHECK_VL_SMASK
-       | SEND_DMA_CHECK_ENABLE_CHECK_ENABLE_SMASK;
-
-       if (is_ax(dd))
-               /* turn off send-side job key checks - A0 */
-               return base_sdma_integrity &
-                      ~SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
-       return base_sdma_integrity;
-}
-
-/*
- * hfi1_early_err is used (only!) to print early errors before devdata is
- * allocated, or when dd->pcidev may not be valid, and at the tail end of
- * cleanup when devdata may have been freed, etc.  hfi1_dev_porterr is
- * the same as dd_dev_err, but is used when the message really needs
- * the IB port# to be definitive as to what's happening.
- */
-#define hfi1_early_err(dev, fmt, ...) \
-       dev_err(dev, fmt, ##__VA_ARGS__)
-
-#define hfi1_early_info(dev, fmt, ...) \
-       dev_info(dev, fmt, ##__VA_ARGS__)
-
-#define dd_dev_emerg(dd, fmt, ...) \
-       dev_emerg(&(dd)->pcidev->dev, "%s: " fmt, \
-                 get_unit_name((dd)->unit), ##__VA_ARGS__)
-#define dd_dev_err(dd, fmt, ...) \
-       dev_err(&(dd)->pcidev->dev, "%s: " fmt, \
-                       get_unit_name((dd)->unit), ##__VA_ARGS__)
-#define dd_dev_warn(dd, fmt, ...) \
-       dev_warn(&(dd)->pcidev->dev, "%s: " fmt, \
-                       get_unit_name((dd)->unit), ##__VA_ARGS__)
-
-#define dd_dev_warn_ratelimited(dd, fmt, ...) \
-       dev_warn_ratelimited(&(dd)->pcidev->dev, "%s: " fmt, \
-                       get_unit_name((dd)->unit), ##__VA_ARGS__)
-
-#define dd_dev_info(dd, fmt, ...) \
-       dev_info(&(dd)->pcidev->dev, "%s: " fmt, \
-                       get_unit_name((dd)->unit), ##__VA_ARGS__)
-
-#define dd_dev_dbg(dd, fmt, ...) \
-       dev_dbg(&(dd)->pcidev->dev, "%s: " fmt, \
-               get_unit_name((dd)->unit), ##__VA_ARGS__)
-
-#define hfi1_dev_porterr(dd, port, fmt, ...) \
-       dev_err(&(dd)->pcidev->dev, "%s: IB%u:%u " fmt, \
-                       get_unit_name((dd)->unit), (dd)->unit, (port), \
-                       ##__VA_ARGS__)
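-
-/*
- * Example (illustrative): dd_dev_err(dd, "link down\n") prefixes the
- * message with the unit name from get_unit_name(), typically something
- * like "hfi1_0: link down"; hfi1_dev_porterr() additionally prints
- * "IB<unit>:<port>".
- */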
-
-/*
- * this is used for formatting hw error messages...
- */
-struct hfi1_hwerror_msgs {
-       u64 mask;
-       const char *msg;
-       size_t sz;
-};
-
-/* in intr.c... */
-void hfi1_format_hwerrors(u64 hwerrs,
-                         const struct hfi1_hwerror_msgs *hwerrmsgs,
-                         size_t nhwerrmsgs, char *msg, size_t lmsg);
-
-#define USER_OPCODE_CHECK_VAL 0xC0
-#define USER_OPCODE_CHECK_MASK 0xC0
-#define OPCODE_CHECK_VAL_DISABLED 0x0
-#define OPCODE_CHECK_MASK_DISABLED 0x0
-
-static inline void hfi1_reset_cpu_counters(struct hfi1_devdata *dd)
-{
-       struct hfi1_pportdata *ppd;
-       int i;
-
-       dd->z_int_counter = get_all_cpu_total(dd->int_counter);
-       dd->z_rcv_limit = get_all_cpu_total(dd->rcv_limit);
-       dd->z_send_schedule = get_all_cpu_total(dd->send_schedule);
-
-       ppd = (struct hfi1_pportdata *)(dd + 1);
-       for (i = 0; i < dd->num_pports; i++, ppd++) {
-               ppd->ibport_data.rvp.z_rc_acks =
-                       get_all_cpu_total(ppd->ibport_data.rvp.rc_acks);
-               ppd->ibport_data.rvp.z_rc_qacks =
-                       get_all_cpu_total(ppd->ibport_data.rvp.rc_qacks);
-       }
-}
-
-/* Control LED state */
-static inline void setextled(struct hfi1_devdata *dd, u32 on)
-{
-       if (on)
-               write_csr(dd, DCC_CFG_LED_CNTRL, 0x1F);
-       else
-               write_csr(dd, DCC_CFG_LED_CNTRL, 0x10);
-}
-
-/* return the i2c resource given the target */
-static inline u32 i2c_target(u32 target)
-{
-       return target ? CR_I2C2 : CR_I2C1;
-}
-
-/* return the i2c chain chip resource that this HFI uses for QSFP */
-static inline u32 qsfp_resource(struct hfi1_devdata *dd)
-{
-       return i2c_target(dd->hfi1_id);
-}
-
-int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp);
-
-#endif                          /* _HFI1_KERNEL_H */
diff --git a/drivers/staging/rdma/hfi1/init.c b/drivers/staging/rdma/hfi1/init.c
deleted file mode 100644 (file)
index 502b7cf..0000000
+++ /dev/null
@@ -1,1806 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/pci.h>
-#include <linux/netdevice.h>
-#include <linux/vmalloc.h>
-#include <linux/delay.h>
-#include <linux/idr.h>
-#include <linux/module.h>
-#include <linux/printk.h>
-#include <linux/hrtimer.h>
-#include <rdma/rdma_vt.h>
-
-#include "hfi.h"
-#include "device.h"
-#include "common.h"
-#include "trace.h"
-#include "mad.h"
-#include "sdma.h"
-#include "debugfs.h"
-#include "verbs.h"
-#include "aspm.h"
-
-#undef pr_fmt
-#define pr_fmt(fmt) DRIVER_NAME ": " fmt
-
-/*
- * min buffers we want to have per context, after driver
- */
-#define HFI1_MIN_USER_CTXT_BUFCNT 7
-
-#define HFI1_MIN_HDRQ_EGRBUF_CNT 2
-#define HFI1_MAX_HDRQ_EGRBUF_CNT 16352
-#define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */
-#define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */
-
-/*
- * Number of user receive contexts we are configured to use (to allow for more
- * pio buffers per ctxt, etc.)  Zero means use one user context per CPU.
- */
-int num_user_contexts = -1;
-module_param_named(num_user_contexts, num_user_contexts, int, S_IRUGO);
-MODULE_PARM_DESC(
-       num_user_contexts, "Set max number of user contexts to use");
-
-uint krcvqs[RXE_NUM_DATA_VL];
-int krcvqsset;
-module_param_array(krcvqs, uint, &krcvqsset, S_IRUGO);
-MODULE_PARM_DESC(krcvqs, "Array of the number of non-control kernel receive queues by VL");
-
-/* computed based on above array */
-unsigned n_krcvqs;
-
-static unsigned hfi1_rcvarr_split = 25;
-module_param_named(rcvarr_split, hfi1_rcvarr_split, uint, S_IRUGO);
-MODULE_PARM_DESC(rcvarr_split, "Percent of context's RcvArray entries used for Eager buffers");
-
-static uint eager_buffer_size = (2 << 20); /* 2MB */
-module_param(eager_buffer_size, uint, S_IRUGO);
-MODULE_PARM_DESC(eager_buffer_size, "Size of the eager buffers, default: 2MB");
-
-static uint rcvhdrcnt = 2048; /* 2x the max eager buffer count */
-module_param_named(rcvhdrcnt, rcvhdrcnt, uint, S_IRUGO);
-MODULE_PARM_DESC(rcvhdrcnt, "Receive header queue count (default 2048)");
-
-static uint hfi1_hdrq_entsize = 32;
-module_param_named(hdrq_entsize, hfi1_hdrq_entsize, uint, S_IRUGO);
-MODULE_PARM_DESC(hdrq_entsize, "Size of header queue entries: 2 - 8B, 16 - 64B, 32 - 128B (default)");
-
-unsigned int user_credit_return_threshold = 33;        /* default is 33% */
-module_param(user_credit_return_threshold, uint, S_IRUGO);
-MODULE_PARM_DESC(user_credit_return_threshold, "Credit return threshold for user send contexts, return when unreturned credits pass this many blocks (in percent of allocated blocks, 0 is off)");
-
-static inline u64 encode_rcv_header_entry_size(u16);
-
-static struct idr hfi1_unit_table;
-u32 hfi1_cpulist_count;
-unsigned long *hfi1_cpulist;
-
-/*
- * Common code for creating the receive context array.
- */
-int hfi1_create_ctxts(struct hfi1_devdata *dd)
-{
-       unsigned i;
-       int ret;
-
-       /* Control context always has to be context 0 */
-       BUILD_BUG_ON(HFI1_CTRL_CTXT != 0);
-
-       dd->rcd = kzalloc_node(dd->num_rcv_contexts * sizeof(*dd->rcd),
-                              GFP_KERNEL, dd->node);
-       if (!dd->rcd)
-               goto nomem;
-
-       /* create one or more kernel contexts */
-       for (i = 0; i < dd->first_user_ctxt; ++i) {
-               struct hfi1_pportdata *ppd;
-               struct hfi1_ctxtdata *rcd;
-
-               ppd = dd->pport + (i % dd->num_pports);
-               rcd = hfi1_create_ctxtdata(ppd, i, dd->node);
-               if (!rcd) {
-                       dd_dev_err(dd,
-                                  "Unable to allocate kernel receive context, failing\n");
-                       goto nomem;
-               }
-               /*
-                * Set up the kernel context flags here and now because they
-                * use default values for all receive side memories.  User
-                * contexts will be handled as they are created.
-                */
-               rcd->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) |
-                       HFI1_CAP_KGET(NODROP_RHQ_FULL) |
-                       HFI1_CAP_KGET(NODROP_EGR_FULL) |
-                       HFI1_CAP_KGET(DMA_RTAIL);
-
-               /* Control context must use DMA_RTAIL */
-               if (rcd->ctxt == HFI1_CTRL_CTXT)
-                       rcd->flags |= HFI1_CAP_DMA_RTAIL;
-               rcd->seq_cnt = 1;
-
-               rcd->sc = sc_alloc(dd, SC_ACK, rcd->rcvhdrqentsize, dd->node);
-               if (!rcd->sc) {
-                       dd_dev_err(dd,
-                                  "Unable to allocate kernel send context, failing\n");
-                       dd->rcd[rcd->ctxt] = NULL;
-                       hfi1_free_ctxtdata(dd, rcd);
-                       goto nomem;
-               }
-
-               ret = hfi1_init_ctxt(rcd->sc);
-               if (ret < 0) {
-                       dd_dev_err(dd,
-                                  "Failed to setup kernel receive context, failing\n");
-                       sc_free(rcd->sc);
-                       dd->rcd[rcd->ctxt] = NULL;
-                       hfi1_free_ctxtdata(dd, rcd);
-                       ret = -EFAULT;
-                       goto bail;
-               }
-       }
-
-       /*
-        * Initialize aspm, to be done after gen3 transition and setting up
-        * contexts and before enabling interrupts
-        */
-       aspm_init(dd);
-
-       return 0;
-nomem:
-       ret = -ENOMEM;
-bail:
-       kfree(dd->rcd);
-       dd->rcd = NULL;
-       return ret;
-}
-
-/*
- * Common code for user and kernel context setup.
- */
-struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt,
-                                          int numa)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       struct hfi1_ctxtdata *rcd;
-       unsigned kctxt_ngroups = 0;
-       u32 base;
-
-       if (dd->rcv_entries.nctxt_extra >
-           dd->num_rcv_contexts - dd->first_user_ctxt)
-               kctxt_ngroups = (dd->rcv_entries.nctxt_extra -
-                                (dd->num_rcv_contexts - dd->first_user_ctxt));
-       rcd = kzalloc(sizeof(*rcd), GFP_KERNEL);
-       if (rcd) {
-               u32 rcvtids, max_entries;
-
-               hfi1_cdbg(PROC, "setting up context %u\n", ctxt);
-
-               INIT_LIST_HEAD(&rcd->qp_wait_list);
-               rcd->ppd = ppd;
-               rcd->dd = dd;
-               rcd->cnt = 1;
-               rcd->ctxt = ctxt;
-               dd->rcd[ctxt] = rcd;
-               rcd->numa_id = numa;
-               rcd->rcv_array_groups = dd->rcv_entries.ngroups;
-
-               mutex_init(&rcd->exp_lock);
-
-               /*
-                * Calculate the context's RcvArray entry starting point.
-                * We do this here because we have to take into account all
-                * the RcvArray entries that previous context would have
-                * taken and we have to account for any extra groups
-                * assigned to the kernel or user contexts.
-                */
-               if (ctxt < dd->first_user_ctxt) {
-                       if (ctxt < kctxt_ngroups) {
-                               base = ctxt * (dd->rcv_entries.ngroups + 1);
-                               rcd->rcv_array_groups++;
-                       } else
-                               base = kctxt_ngroups +
-                                       (ctxt * dd->rcv_entries.ngroups);
-               } else {
-                       u16 ct = ctxt - dd->first_user_ctxt;
-
-                       base = ((dd->n_krcv_queues * dd->rcv_entries.ngroups) +
-                               kctxt_ngroups);
-                       if (ct < dd->rcv_entries.nctxt_extra) {
-                               base += ct * (dd->rcv_entries.ngroups + 1);
-                               rcd->rcv_array_groups++;
-                       } else
-                               base += dd->rcv_entries.nctxt_extra +
-                                       (ct * dd->rcv_entries.ngroups);
-               }
-               rcd->eager_base = base * dd->rcv_entries.group_size;
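-               /*
-                * Worked example with hypothetical values (illustrative
-                * only): ngroups = 4, group_size = 8, nctxt_extra = 0,
-                * first_user_ctxt = 2, n_krcv_queues = 2.  Kernel ctxt 1
-                * gets base = 1 * 4 = 4 (eager_base = 32); user ctxt 2
-                * (ct = 0) gets base = 2 * 4 + 0 = 8 (eager_base = 64).
-                */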
-
-               /* Validate and initialize Rcv Hdr Q variables */
-               if (rcvhdrcnt % HDRQ_INCREMENT) {
-                       dd_dev_err(dd,
-                                  "ctxt%u: header queue count %d must be divisible by %lu\n",
-                                  rcd->ctxt, rcvhdrcnt, HDRQ_INCREMENT);
-                       goto bail;
-               }
-               rcd->rcvhdrq_cnt = rcvhdrcnt;
-               rcd->rcvhdrqentsize = hfi1_hdrq_entsize;
-               /*
-                * Simple Eager buffer allocation: we have already pre-allocated
-                * the number of RcvArray entry groups. Each ctxtdata structure
-                * holds the number of groups for that context.
-                *
-                * To follow CSR requirements and maintain cacheline alignment,
-                * make sure all sizes and bases are multiples of group_size.
-                *
-                * The expected entry count is what is left after assigning
-                * eager.
-                */
-               max_entries = rcd->rcv_array_groups *
-                       dd->rcv_entries.group_size;
-               rcvtids = ((max_entries * hfi1_rcvarr_split) / 100);
-               rcd->egrbufs.count = round_down(rcvtids,
-                                               dd->rcv_entries.group_size);
-               if (rcd->egrbufs.count > MAX_EAGER_ENTRIES) {
-                       dd_dev_err(dd, "ctxt%u: requested too many RcvArray entries.\n",
-                                  rcd->ctxt);
-                       rcd->egrbufs.count = MAX_EAGER_ENTRIES;
-               }
-               hfi1_cdbg(PROC,
-                         "ctxt%u: max Eager buffer RcvArray entries: %u\n",
-                         rcd->ctxt, rcd->egrbufs.count);
-
-               /*
-                * Allocate array that will hold the eager buffer accounting
-                * data.
-                * This will allocate the maximum possible buffer count based
-                * on the value of the RcvArray split parameter.
-                * The resulting value will be rounded down to the closest
-                * multiple of dd->rcv_entries.group_size.
-                */
-               rcd->egrbufs.buffers = kcalloc(rcd->egrbufs.count,
-                                              sizeof(*rcd->egrbufs.buffers),
-                                              GFP_KERNEL);
-               if (!rcd->egrbufs.buffers)
-                       goto bail;
-               rcd->egrbufs.rcvtids = kcalloc(rcd->egrbufs.count,
-                                              sizeof(*rcd->egrbufs.rcvtids),
-                                              GFP_KERNEL);
-               if (!rcd->egrbufs.rcvtids)
-                       goto bail;
-               rcd->egrbufs.size = eager_buffer_size;
-               /*
-                * The size of the buffers programmed into the RcvArray
-                * entries needs to be big enough to handle the highest
-                * MTU supported.
-                */
-               if (rcd->egrbufs.size < hfi1_max_mtu) {
-                       rcd->egrbufs.size = __roundup_pow_of_two(hfi1_max_mtu);
-                       hfi1_cdbg(PROC,
-                                 "ctxt%u: eager bufs size too small. Adjusting to %zu\n",
-                                   rcd->ctxt, rcd->egrbufs.size);
-               }
-               rcd->egrbufs.rcvtid_size = HFI1_MAX_EAGER_BUFFER_SIZE;
-
-               if (ctxt < dd->first_user_ctxt) { /* N/A for PSM contexts */
-                       rcd->opstats = kzalloc(sizeof(*rcd->opstats),
-                               GFP_KERNEL);
-                       if (!rcd->opstats)
-                               goto bail;
-               }
-       }
-       return rcd;
-bail:
-       kfree(rcd->egrbufs.rcvtids);
-       kfree(rcd->egrbufs.buffers);
-       kfree(rcd);
-       return NULL;
-}
-
-/*
- * Convert a receive header entry size to the encoding used in the CSR.
- *
- * Return a zero if the given size is invalid.
- */
-static inline u64 encode_rcv_header_entry_size(u16 size)
-{
-       /* there are only 3 valid receive header entry sizes */
-       if (size == 2)
-               return 1;
-       if (size == 16)
-               return 2;
-       if (size == 32)
-               return 4;
-       return 0; /* invalid */
-}
-
-/*
- * Select the largest ccti value over all SLs to determine the intra-
- * packet gap for the link.
- *
- * called with cca_timer_lock held (to protect access to cca_timer
- * array), and rcu_read_lock() (to protect access to cc_state).
- */
-void set_link_ipg(struct hfi1_pportdata *ppd)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       struct cc_state *cc_state;
-       int i;
-       u16 cce, ccti_limit, max_ccti = 0;
-       u16 shift, mult;
-       u64 src;
-       u32 current_egress_rate; /* Mbits /sec */
-       u32 max_pkt_time;
-       /*
-        * max_pkt_time is the maximum packet egress time in units
-        * of the fabric clock period 1/(805 MHz).
-        */
-
-       cc_state = get_cc_state(ppd);
-
-       if (!cc_state)
-               /*
-                * This should _never_ happen - rcu_read_lock() is held,
-                * and set_link_ipg() should not be called if cc_state
-                * is NULL.
-                */
-               return;
-
-       for (i = 0; i < OPA_MAX_SLS; i++) {
-               u16 ccti = ppd->cca_timer[i].ccti;
-
-               if (ccti > max_ccti)
-                       max_ccti = ccti;
-       }
-
-       ccti_limit = cc_state->cct.ccti_limit;
-       if (max_ccti > ccti_limit)
-               max_ccti = ccti_limit;
-
-       cce = cc_state->cct.entries[max_ccti].entry;
-       shift = (cce & 0xc000) >> 14;
-       mult = (cce & 0x3fff);
-
-       current_egress_rate = active_egress_rate(ppd);
-
-       max_pkt_time = egress_cycles(ppd->ibmaxlen, current_egress_rate);
-
-       src = (max_pkt_time >> shift) * mult;
-
-       src &= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK;
-       src <<= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT;
-
-       write_csr(dd, SEND_STATIC_RATE_CONTROL, src);
-}
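-
-/*
- * Worked example (illustrative only): a CCT entry of 0x8003 decodes to
- * shift = 2 and mult = 3, so a max_pkt_time of 1000 fabric cycles gives
- * src = (1000 >> 2) * 3 = 750 before the CSR mask and shift are applied.
- */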
-
-static enum hrtimer_restart cca_timer_fn(struct hrtimer *t)
-{
-       struct cca_timer *cca_timer;
-       struct hfi1_pportdata *ppd;
-       int sl;
-       u16 ccti_timer, ccti_min;
-       struct cc_state *cc_state;
-       unsigned long flags;
-       enum hrtimer_restart ret = HRTIMER_NORESTART;
-
-       cca_timer = container_of(t, struct cca_timer, hrtimer);
-       ppd = cca_timer->ppd;
-       sl = cca_timer->sl;
-
-       rcu_read_lock();
-
-       cc_state = get_cc_state(ppd);
-
-       if (!cc_state) {
-               rcu_read_unlock();
-               return HRTIMER_NORESTART;
-       }
-
-       /*
-        * 1) decrement ccti for SL
-        * 2) calculate IPG for link (set_link_ipg())
-        * 3) restart timer, unless ccti is at min value
-        */
-
-       ccti_min = cc_state->cong_setting.entries[sl].ccti_min;
-       ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer;
-
-       spin_lock_irqsave(&ppd->cca_timer_lock, flags);
-
-       if (cca_timer->ccti > ccti_min) {
-               cca_timer->ccti--;
-               set_link_ipg(ppd);
-       }
-
-       if (cca_timer->ccti > ccti_min) {
-               unsigned long nsec = 1024 * ccti_timer;
-               /* ccti_timer is in units of 1.024 usec */
-               hrtimer_forward_now(t, ns_to_ktime(nsec));
-               ret = HRTIMER_RESTART;
-       }
-
-       spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
-       rcu_read_unlock();
-       return ret;
-}
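-
-/*
- * Note (illustrative): ccti_timer is in units of 1.024 usec, so a value
- * of 500 re-arms the timer roughly every 512 usec (1024 * 500 ns).
- */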
-
-/*
- * Common code for initializing the physical port structure.
- */
-void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd,
-                        struct hfi1_devdata *dd, u8 hw_pidx, u8 port)
-{
-       int i, size;
-       uint default_pkey_idx;
-
-       ppd->dd = dd;
-       ppd->hw_pidx = hw_pidx;
-       ppd->port = port; /* IB port number, not index */
-
-       default_pkey_idx = 1;
-
-       ppd->pkeys[default_pkey_idx] = DEFAULT_P_KEY;
-       if (loopback) {
-               hfi1_early_err(&pdev->dev,
-                              "Faking data partition 0x8001 in idx %u\n",
-                              !default_pkey_idx);
-               ppd->pkeys[!default_pkey_idx] = 0x8001;
-       }
-
-       INIT_WORK(&ppd->link_vc_work, handle_verify_cap);
-       INIT_WORK(&ppd->link_up_work, handle_link_up);
-       INIT_WORK(&ppd->link_down_work, handle_link_down);
-       INIT_WORK(&ppd->freeze_work, handle_freeze);
-       INIT_WORK(&ppd->link_downgrade_work, handle_link_downgrade);
-       INIT_WORK(&ppd->sma_message_work, handle_sma_message);
-       INIT_WORK(&ppd->link_bounce_work, handle_link_bounce);
-       INIT_WORK(&ppd->linkstate_active_work, receive_interrupt_work);
-       INIT_WORK(&ppd->qsfp_info.qsfp_work, qsfp_event);
-
-       mutex_init(&ppd->hls_lock);
-       spin_lock_init(&ppd->sdma_alllock);
-       spin_lock_init(&ppd->qsfp_info.qsfp_lock);
-
-       ppd->qsfp_info.ppd = ppd;
-       ppd->sm_trap_qp = 0x0;
-       ppd->sa_qp = 0x1;
-
-       ppd->hfi1_wq = NULL;
-
-       spin_lock_init(&ppd->cca_timer_lock);
-
-       for (i = 0; i < OPA_MAX_SLS; i++) {
-               hrtimer_init(&ppd->cca_timer[i].hrtimer, CLOCK_MONOTONIC,
-                            HRTIMER_MODE_REL);
-               ppd->cca_timer[i].ppd = ppd;
-               ppd->cca_timer[i].sl = i;
-               ppd->cca_timer[i].ccti = 0;
-               ppd->cca_timer[i].hrtimer.function = cca_timer_fn;
-       }
-
-       ppd->cc_max_table_entries = IB_CC_TABLE_CAP_DEFAULT;
-
-       spin_lock_init(&ppd->cc_state_lock);
-       spin_lock_init(&ppd->cc_log_lock);
-       size = sizeof(struct cc_state);
-       RCU_INIT_POINTER(ppd->cc_state, kzalloc(size, GFP_KERNEL));
-       if (!rcu_dereference(ppd->cc_state))
-               goto bail;
-       return;
-
-bail:
-
-       hfi1_early_err(&pdev->dev,
-                      "Congestion Control Agent disabled for port %d\n", port);
-}
-
-/*
- * Do initialization for device that is only needed on
- * first detect, not on resets.
- */
-static int loadtime_init(struct hfi1_devdata *dd)
-{
-       return 0;
-}
-
-/**
- * init_after_reset - re-initialize after a reset
- * @dd: the hfi1_ib device
- *
- * sanity check at least some of the values after reset, and
- * ensure no receive or transmit (explicitly, in case reset
- * failed).
- */
-static int init_after_reset(struct hfi1_devdata *dd)
-{
-       int i;
-
-       /*
-        * Ensure chip does no sends or receives, tail updates, or
-        * pioavail updates while we re-initialize.  This is mostly
-        * for the driver data structures, not chip registers.
-        */
-       for (i = 0; i < dd->num_rcv_contexts; i++)
-               hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
-                                 HFI1_RCVCTRL_INTRAVAIL_DIS |
-                                 HFI1_RCVCTRL_TAILUPD_DIS, i);
-       pio_send_control(dd, PSC_GLOBAL_DISABLE);
-       for (i = 0; i < dd->num_send_contexts; i++)
-               sc_disable(dd->send_contexts[i].sc);
-
-       return 0;
-}
-
-static void enable_chip(struct hfi1_devdata *dd)
-{
-       u32 rcvmask;
-       u32 i;
-
-       /* enable PIO send */
-       pio_send_control(dd, PSC_GLOBAL_ENABLE);
-
-       /*
-        * Enable kernel ctxts' receive and receive interrupt.
-        * Other ctxts done as user opens and initializes them.
-        */
-       for (i = 0; i < dd->first_user_ctxt; ++i) {
-               rcvmask = HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB;
-               rcvmask |= HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, DMA_RTAIL) ?
-                       HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS;
-               if (!HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, MULTI_PKT_EGR))
-                       rcvmask |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
-               if (HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, NODROP_RHQ_FULL))
-                       rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
-               if (HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, NODROP_EGR_FULL))
-                       rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
-               hfi1_rcvctrl(dd, rcvmask, i);
-               sc_enable(dd->rcd[i]->sc);
-       }
-}
-
-/**
- * create_workqueues - create per port workqueues
- * @dd: the hfi1_ib device
- */
-static int create_workqueues(struct hfi1_devdata *dd)
-{
-       int pidx;
-       struct hfi1_pportdata *ppd;
-
-       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
-               ppd = dd->pport + pidx;
-               if (!ppd->hfi1_wq) {
-                       ppd->hfi1_wq =
-                               alloc_workqueue(
-                                   "hfi%d_%d",
-                                   WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE,
-                                   dd->num_sdma,
-                                   dd->unit, pidx);
-                       if (!ppd->hfi1_wq)
-                               goto wq_error;
-               }
-       }
-       return 0;
-wq_error:
-       pr_err("alloc_workqueue failed for port %d\n", pidx + 1);
-       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
-               ppd = dd->pport + pidx;
-               if (ppd->hfi1_wq) {
-                       destroy_workqueue(ppd->hfi1_wq);
-                       ppd->hfi1_wq = NULL;
-               }
-       }
-       return -ENOMEM;
-}
-
-/**
- * hfi1_init - do the actual initialization sequence on the chip
- * @dd: the hfi1_ib device
- * @reinit: re-initializing, so don't allocate new memory
- *
- * Do the actual initialization sequence on the chip.  This is done
- * both from the init routine called from the PCI infrastructure, and
- * when we reset the chip, or detect that it was reset internally,
- * or it's administratively re-enabled.
- *
- * Memory allocation here and in called routines is only done in
- * the first case (reinit == 0).  We have to be careful, because even
- * without memory allocation, we need to re-write all the chip registers
- * TIDs, etc. after the reset or enable has completed.
- */
-int hfi1_init(struct hfi1_devdata *dd, int reinit)
-{
-       int ret = 0, pidx, lastfail = 0;
-       unsigned i, len;
-       struct hfi1_ctxtdata *rcd;
-       struct hfi1_pportdata *ppd;
-
-       /* Set up recv low level handlers */
-       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_EXPECTED] =
-                                               kdeth_process_expected;
-       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_EAGER] =
-                                               kdeth_process_eager;
-       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_IB] = process_receive_ib;
-       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_ERROR] =
-                                               process_receive_error;
-       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_BYPASS] =
-                                               process_receive_bypass;
-       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID5] =
-                                               process_receive_invalid;
-       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID6] =
-                                               process_receive_invalid;
-       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID7] =
-                                               process_receive_invalid;
-       dd->rhf_rcv_function_map = dd->normal_rhf_rcv_functions;
-
-       /* Set up send low level handlers */
-       dd->process_pio_send = hfi1_verbs_send_pio;
-       dd->process_dma_send = hfi1_verbs_send_dma;
-       dd->pio_inline_send = pio_copy;
-
-       if (is_ax(dd)) {
-               atomic_set(&dd->drop_packet, DROP_PACKET_ON);
-               dd->do_drop = 1;
-       } else {
-               atomic_set(&dd->drop_packet, DROP_PACKET_OFF);
-               dd->do_drop = 0;
-       }
-
-       /* make sure the link is not "up" */
-       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
-               ppd = dd->pport + pidx;
-               ppd->linkup = 0;
-       }
-
-       if (reinit)
-               ret = init_after_reset(dd);
-       else
-               ret = loadtime_init(dd);
-       if (ret)
-               goto done;
-
-       /* allocate dummy tail memory for all receive contexts */
-       dd->rcvhdrtail_dummy_kvaddr = dma_zalloc_coherent(
-               &dd->pcidev->dev, sizeof(u64),
-               &dd->rcvhdrtail_dummy_physaddr,
-               GFP_KERNEL);
-
-       if (!dd->rcvhdrtail_dummy_kvaddr) {
-               dd_dev_err(dd, "cannot allocate dummy tail memory\n");
-               ret = -ENOMEM;
-               goto done;
-       }
-
-       /* dd->rcd can be NULL if early initialization failed */
-       for (i = 0; dd->rcd && i < dd->first_user_ctxt; ++i) {
-               /*
-                * Set up the (kernel) rcvhdr queue and egr TIDs.  If doing
-                * re-init, the simplest way to handle this is to free
-                * existing, and re-allocate.
-                * Need to re-create rest of ctxt 0 ctxtdata as well.
-                */
-               rcd = dd->rcd[i];
-               if (!rcd)
-                       continue;
-
-               rcd->do_interrupt = &handle_receive_interrupt;
-
-               lastfail = hfi1_create_rcvhdrq(dd, rcd);
-               if (!lastfail)
-                       lastfail = hfi1_setup_eagerbufs(rcd);
-               if (lastfail)
-                       dd_dev_err(dd,
-                                  "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n");
-       }
-       if (lastfail)
-               ret = lastfail;
-
-       /* Allocate enough memory for user event notification. */
-       len = PAGE_ALIGN(dd->chip_rcv_contexts * HFI1_MAX_SHARED_CTXTS *
-                        sizeof(*dd->events));
-       dd->events = vmalloc_user(len);
-       if (!dd->events)
-               dd_dev_err(dd, "Failed to allocate user events page\n");
-       /*
-        * Allocate a page for device and port status.
-        * Page will be shared amongst all user processes.
-        */
-       dd->status = vmalloc_user(PAGE_SIZE);
-       if (!dd->status)
-               dd_dev_err(dd, "Failed to allocate dev status page\n");
-       else
-               dd->freezelen = PAGE_SIZE - (sizeof(*dd->status) -
-                                            sizeof(dd->status->freezemsg));
-       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
-               ppd = dd->pport + pidx;
-               if (dd->status)
-                       /* Currently, we only have one port */
-                       ppd->statusp = &dd->status->port;
-
-               set_mtu(ppd);
-       }
-
-       /* enable chip even if we have an error, so we can debug cause */
-       enable_chip(dd);
-
-done:
-       /*
-        * Set status even if port serdes is not initialized
-        * so that diags will work.
-        */
-       if (dd->status)
-               dd->status->dev |= HFI1_STATUS_CHIP_PRESENT |
-                       HFI1_STATUS_INITTED;
-       if (!ret) {
-               /* enable all interrupts from the chip */
-               set_intr_state(dd, 1);
-
-               /* chip is OK for user apps; mark it as initialized */
-               for (pidx = 0; pidx < dd->num_pports; ++pidx) {
-                       ppd = dd->pport + pidx;
-
-                       /*
-                        * start the serdes - must be after interrupts are
-                        * enabled so we are notified when the link goes up
-                        */
-                       lastfail = bringup_serdes(ppd);
-                       if (lastfail)
-                               dd_dev_info(dd,
-                                           "Failed to bring up port %u\n",
-                                           ppd->port);
-
-                       /*
-                        * Set status even if port serdes is not initialized
-                        * so that diags will work.
-                        */
-                       if (ppd->statusp)
-                               *ppd->statusp |= HFI1_STATUS_CHIP_PRESENT |
-                                                       HFI1_STATUS_INITTED;
-                       if (!ppd->link_speed_enabled)
-                               continue;
-               }
-       }
-
-       /* if ret is non-zero, we probably should do some cleanup here... */
-       return ret;
-}
-
-static inline struct hfi1_devdata *__hfi1_lookup(int unit)
-{
-       return idr_find(&hfi1_unit_table, unit);
-}
-
-struct hfi1_devdata *hfi1_lookup(int unit)
-{
-       struct hfi1_devdata *dd;
-       unsigned long flags;
-
-       spin_lock_irqsave(&hfi1_devs_lock, flags);
-       dd = __hfi1_lookup(unit);
-       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
-
-       return dd;
-}
-
-/*
- * Stop the timers during unit shutdown, or after an error late
- * in initialization.
- */
-static void stop_timers(struct hfi1_devdata *dd)
-{
-       struct hfi1_pportdata *ppd;
-       int pidx;
-
-       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
-               ppd = dd->pport + pidx;
-               if (ppd->led_override_timer.data) {
-                       del_timer_sync(&ppd->led_override_timer);
-                       atomic_set(&ppd->led_override_timer_active, 0);
-               }
-       }
-}
-
-/**
- * shutdown_device - shut down a device
- * @dd: the hfi1_ib device
- *
- * This is called to make the device quiet when we are about to
- * unload the driver, and also when the device is administratively
- * disabled.   It does not free any data structures.
- * Everything it does has to be set up again by hfi1_init(dd, 1).
- */
-static void shutdown_device(struct hfi1_devdata *dd)
-{
-       struct hfi1_pportdata *ppd;
-       unsigned pidx;
-       int i;
-
-       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
-               ppd = dd->pport + pidx;
-
-               ppd->linkup = 0;
-               if (ppd->statusp)
-                       *ppd->statusp &= ~(HFI1_STATUS_IB_CONF |
-                                          HFI1_STATUS_IB_READY);
-       }
-       dd->flags &= ~HFI1_INITTED;
-
-       /* mask interrupts, but not errors */
-       set_intr_state(dd, 0);
-
-       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
-               ppd = dd->pport + pidx;
-               for (i = 0; i < dd->num_rcv_contexts; i++)
-                       hfi1_rcvctrl(dd, HFI1_RCVCTRL_TAILUPD_DIS |
-                                         HFI1_RCVCTRL_CTXT_DIS |
-                                         HFI1_RCVCTRL_INTRAVAIL_DIS |
-                                         HFI1_RCVCTRL_PKEY_DIS |
-                                         HFI1_RCVCTRL_ONE_PKT_EGR_DIS, i);
-               /*
-                * Gracefully stop all sends allowing any in progress to
-                * trickle out first.
-                */
-               for (i = 0; i < dd->num_send_contexts; i++)
-                       sc_flush(dd->send_contexts[i].sc);
-       }
-
-       /*
-        * Enough for anything that's going to trickle out to have actually
-        * done so.
-        */
-       udelay(20);
-
-       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
-               ppd = dd->pport + pidx;
-
-               /* disable all contexts */
-               for (i = 0; i < dd->num_send_contexts; i++)
-                       sc_disable(dd->send_contexts[i].sc);
-               /* disable the send device */
-               pio_send_control(dd, PSC_GLOBAL_DISABLE);
-
-               shutdown_led_override(ppd);
-
-               /*
-                * Clear SerdesEnable.
-                * We can't count on interrupts since we are stopping.
-                */
-               hfi1_quiet_serdes(ppd);
-
-               if (ppd->hfi1_wq) {
-                       destroy_workqueue(ppd->hfi1_wq);
-                       ppd->hfi1_wq = NULL;
-               }
-       }
-       sdma_exit(dd);
-}
-
-/**
- * hfi1_free_ctxtdata - free a context's allocated data
- * @dd: the hfi1_ib device
- * @rcd: the ctxtdata structure
- *
- * Free up any allocated data for a context.
- * This should not touch anything that would affect a simultaneous
- * re-allocation of context data, because it is called after hfi1_mutex
- * is released (and can be called from reinit as well).
- * It should never change any chip state, or global driver state.
- */
-void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
-{
-       unsigned e;
-
-       if (!rcd)
-               return;
-
-       if (rcd->rcvhdrq) {
-               dma_free_coherent(&dd->pcidev->dev, rcd->rcvhdrq_size,
-                                 rcd->rcvhdrq, rcd->rcvhdrq_phys);
-               rcd->rcvhdrq = NULL;
-               if (rcd->rcvhdrtail_kvaddr) {
-                       dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
-                                         (void *)rcd->rcvhdrtail_kvaddr,
-                                         rcd->rcvhdrqtailaddr_phys);
-                       rcd->rcvhdrtail_kvaddr = NULL;
-               }
-       }
-
-       /* all the RcvArray entries should have been cleared by now */
-       kfree(rcd->egrbufs.rcvtids);
-
-       for (e = 0; e < rcd->egrbufs.alloced; e++) {
-               if (rcd->egrbufs.buffers[e].phys)
-                       dma_free_coherent(&dd->pcidev->dev,
-                                         rcd->egrbufs.buffers[e].len,
-                                         rcd->egrbufs.buffers[e].addr,
-                                         rcd->egrbufs.buffers[e].phys);
-       }
-       kfree(rcd->egrbufs.buffers);
-
-       sc_free(rcd->sc);
-       vfree(rcd->user_event_mask);
-       vfree(rcd->subctxt_uregbase);
-       vfree(rcd->subctxt_rcvegrbuf);
-       vfree(rcd->subctxt_rcvhdr_base);
-       kfree(rcd->opstats);
-       kfree(rcd);
-}
-
-/*
- * Release our hold on the shared asic data.  If we are the last one,
- * free the structure.  Must be holding hfi1_devs_lock.
- */
-static void release_asic_data(struct hfi1_devdata *dd)
-{
-       int other;
-
-       if (!dd->asic_data)
-               return;
-       dd->asic_data->dds[dd->hfi1_id] = NULL;
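-       /* the ASIC data is shared by two HFIs; check if the other still uses it */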
-       other = dd->hfi1_id ? 0 : 1;
-       if (!dd->asic_data->dds[other]) {
-               /* we are the last holder, free it */
-               kfree(dd->asic_data);
-       }
-       dd->asic_data = NULL;
-}
-
-void hfi1_free_devdata(struct hfi1_devdata *dd)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&hfi1_devs_lock, flags);
-       idr_remove(&hfi1_unit_table, dd->unit);
-       list_del(&dd->list);
-       release_asic_data(dd);
-       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
-       free_platform_config(dd);
-       rcu_barrier(); /* wait for rcu callbacks to complete */
-       free_percpu(dd->int_counter);
-       free_percpu(dd->rcv_limit);
-       hfi1_dev_affinity_free(dd);
-       free_percpu(dd->send_schedule);
-       rvt_dealloc_device(&dd->verbs_dev.rdi);
-}
-
-/*
- * Allocate our primary per-unit data structure.  Must be done via verbs
- * allocator, because the verbs cleanup process both does cleanup and
- * free of the data structure.
- * "extra" is for chip-specific data.
- *
- * Use the idr mechanism to get a unit number for this unit.
- */
-struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra)
-{
-       unsigned long flags;
-       struct hfi1_devdata *dd;
-       int ret, nports;
-
-       /* extra is sizeof(struct hfi1_pportdata) * number of ports */
-       nports = extra / sizeof(struct hfi1_pportdata);
-
-       dd = (struct hfi1_devdata *)rvt_alloc_device(sizeof(*dd) + extra,
-                                                    nports);
-       if (!dd)
-               return ERR_PTR(-ENOMEM);
-       dd->num_pports = nports;
-       dd->pport = (struct hfi1_pportdata *)(dd + 1);
-
-       INIT_LIST_HEAD(&dd->list);
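-       /* preload so idr_alloc() under the lock can safely use GFP_NOWAIT */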
-       idr_preload(GFP_KERNEL);
-       spin_lock_irqsave(&hfi1_devs_lock, flags);
-
-       ret = idr_alloc(&hfi1_unit_table, dd, 0, 0, GFP_NOWAIT);
-       if (ret >= 0) {
-               dd->unit = ret;
-               list_add(&dd->list, &hfi1_dev_list);
-       }
-
-       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
-       idr_preload_end();
-
-       if (ret < 0) {
-               hfi1_early_err(&pdev->dev,
-                              "Could not allocate unit ID: error %d\n", -ret);
-               goto bail;
-       }
-       /*
-        * Initialize all locks for the device. This needs to be as early as
-        * possible so locks are usable.
-        */
-       spin_lock_init(&dd->sc_lock);
-       spin_lock_init(&dd->sendctrl_lock);
-       spin_lock_init(&dd->rcvctrl_lock);
-       spin_lock_init(&dd->uctxt_lock);
-       spin_lock_init(&dd->hfi1_diag_trans_lock);
-       spin_lock_init(&dd->sc_init_lock);
-       spin_lock_init(&dd->dc8051_lock);
-       spin_lock_init(&dd->dc8051_memlock);
-       seqlock_init(&dd->sc2vl_lock);
-       spin_lock_init(&dd->sde_map_lock);
-       spin_lock_init(&dd->pio_map_lock);
-       init_waitqueue_head(&dd->event_queue);
-
-       dd->int_counter = alloc_percpu(u64);
-       if (!dd->int_counter) {
-               ret = -ENOMEM;
-               hfi1_early_err(&pdev->dev,
-                              "Could not allocate per-cpu int_counter\n");
-               goto bail;
-       }
-
-       dd->rcv_limit = alloc_percpu(u64);
-       if (!dd->rcv_limit) {
-               ret = -ENOMEM;
-               hfi1_early_err(&pdev->dev,
-                              "Could not allocate per-cpu rcv_limit\n");
-               goto bail;
-       }
-
-       dd->send_schedule = alloc_percpu(u64);
-       if (!dd->send_schedule) {
-               ret = -ENOMEM;
-               hfi1_early_err(&pdev->dev,
-                              "Could not allocate per-cpu send_schedule\n");
-               goto bail;
-       }
-
-       if (!hfi1_cpulist_count) {
-               u32 count = num_online_cpus();
-
-               hfi1_cpulist = kcalloc(BITS_TO_LONGS(count), sizeof(long),
-                                      GFP_KERNEL);
-               if (hfi1_cpulist)
-                       hfi1_cpulist_count = count;
-               else
-                       hfi1_early_err(
-                       &pdev->dev,
-                       "Could not alloc cpulist info, cpu affinity might be wrong\n");
-       }
-       return dd;
-
-bail:
-       if (!list_empty(&dd->list))
-               list_del_init(&dd->list);
-       rvt_dealloc_device(&dd->verbs_dev.rdi);
-       return ERR_PTR(ret);
-}
-
-/*
- * Called from freeze mode handlers, and from PCI error
- * reporting code.  Should be paranoid about state of
- * system and data structures.
- */
-void hfi1_disable_after_error(struct hfi1_devdata *dd)
-{
-       if (dd->flags & HFI1_INITTED) {
-               u32 pidx;
-
-               dd->flags &= ~HFI1_INITTED;
-               if (dd->pport)
-                       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
-                               struct hfi1_pportdata *ppd;
-
-                               ppd = dd->pport + pidx;
-                               if (dd->flags & HFI1_PRESENT)
-                                       set_link_state(ppd, HLS_DN_DISABLE);
-
-                               if (ppd->statusp)
-                                       *ppd->statusp &= ~HFI1_STATUS_IB_READY;
-                       }
-       }
-
-       /*
-        * Mark as having had an error for driver, and also
-        * for /sys and status word mapped to user programs.
-        * This marks unit as not usable, until reset.
-        */
-       if (dd->status)
-               dd->status->dev |= HFI1_STATUS_HWERROR;
-}
-
-static void remove_one(struct pci_dev *);
-static int init_one(struct pci_dev *, const struct pci_device_id *);
-
-#define DRIVER_LOAD_MSG "Intel " DRIVER_NAME " loaded: "
-#define PFX DRIVER_NAME ": "
-
-static const struct pci_device_id hfi1_pci_tbl[] = {
-       { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL0) },
-       { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL1) },
-       { 0, }
-};
-
-MODULE_DEVICE_TABLE(pci, hfi1_pci_tbl);
-
-static struct pci_driver hfi1_pci_driver = {
-       .name = DRIVER_NAME,
-       .probe = init_one,
-       .remove = remove_one,
-       .id_table = hfi1_pci_tbl,
-       .err_handler = &hfi1_pci_err_handler,
-};
-
-static void __init compute_krcvqs(void)
-{
-       int i;
-
-       for (i = 0; i < krcvqsset; i++)
-               n_krcvqs += krcvqs[i];
-}
-
-/*
- * Do all the generic driver unit- and chip-independent memory
- * allocation and initialization.
- */
-static int __init hfi1_mod_init(void)
-{
-       int ret;
-
-       ret = dev_init();
-       if (ret)
-               goto bail;
-
-       /* validate max MTU before any devices start */
-       if (!valid_opa_max_mtu(hfi1_max_mtu)) {
-               pr_err("Invalid max_mtu 0x%x, using 0x%x instead\n",
-                      hfi1_max_mtu, HFI1_DEFAULT_MAX_MTU);
-               hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
-       }
-       /* valid CUs run from 1-128 in powers of 2 */
-       if (hfi1_cu > 128 || !is_power_of_2(hfi1_cu))
-               hfi1_cu = 1;
-       /* valid credit return threshold is 0-100, variable is unsigned */
-       if (user_credit_return_threshold > 100)
-               user_credit_return_threshold = 100;
-
-       compute_krcvqs();
-       /*
-        * sanitize the receive interrupt count; sanitizing the timeout must
-        * wait until after the hardware type is known
-        */
-       if (rcv_intr_count > RCV_HDR_HEAD_COUNTER_MASK)
-               rcv_intr_count = RCV_HDR_HEAD_COUNTER_MASK;
-       /* reject invalid combinations */
-       if (rcv_intr_count == 0 && rcv_intr_timeout == 0) {
-               pr_err("Invalid mode: both receive interrupt count and available timeout are zero - setting interrupt count to 1\n");
-               rcv_intr_count = 1;
-       }
-       if (rcv_intr_count > 1 && rcv_intr_timeout == 0) {
-               /*
-                * Avoid indefinite packet delivery by requiring a timeout
-                * if count is > 1.
-                */
-               pr_err("Invalid mode: receive interrupt count greater than 1 and available timeout is zero - setting available timeout to 1\n");
-               rcv_intr_timeout = 1;
-       }
-       if (rcv_intr_dynamic && !(rcv_intr_count > 1 && rcv_intr_timeout > 0)) {
-               /*
-                * The dynamic algorithm expects a non-zero timeout
-                * and a count > 1.
-                */
-               pr_err("Invalid mode: dynamic receive interrupt mitigation with invalid count and timeout - turning dynamic off\n");
-               rcv_intr_dynamic = 0;
-       }
-
-       /* sanitize link CRC options */
-       link_crc_mask &= SUPPORTED_CRCS;
-
-       /*
-        * These must be called before the driver is registered with
-        * the PCI subsystem.
-        */
-       idr_init(&hfi1_unit_table);
-
-       hfi1_dbg_init();
-       ret = hfi1_wss_init();
-       if (ret < 0)
-               goto bail_wss;
-       ret = pci_register_driver(&hfi1_pci_driver);
-       if (ret < 0) {
-               pr_err("Unable to register driver: error %d\n", -ret);
-               goto bail_dev;
-       }
-       goto bail; /* all OK */
-
-bail_dev:
-       hfi1_wss_exit();
-bail_wss:
-       hfi1_dbg_exit();
-       idr_destroy(&hfi1_unit_table);
-       dev_cleanup();
-bail:
-       return ret;
-}
-
-module_init(hfi1_mod_init);
-
-/*
- * Do the non-unit driver cleanup, memory free, etc. at unload.
- */
-static void __exit hfi1_mod_cleanup(void)
-{
-       pci_unregister_driver(&hfi1_pci_driver);
-       hfi1_wss_exit();
-       hfi1_dbg_exit();
-       hfi1_cpulist_count = 0;
-       kfree(hfi1_cpulist);
-
-       idr_destroy(&hfi1_unit_table);
-       dispose_firmware();     /* asymmetric with obtain_firmware() */
-       dev_cleanup();
-}
-
-module_exit(hfi1_mod_cleanup);
-
-/* this can only be called after a successful initialization */
-static void cleanup_device_data(struct hfi1_devdata *dd)
-{
-       int ctxt;
-       int pidx;
-       struct hfi1_ctxtdata **tmp;
-       unsigned long flags;
-
-       /* users can't do anything more with chip */
-       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
-               struct hfi1_pportdata *ppd = &dd->pport[pidx];
-               struct cc_state *cc_state;
-               int i;
-
-               if (ppd->statusp)
-                       *ppd->statusp &= ~HFI1_STATUS_CHIP_PRESENT;
-
-               for (i = 0; i < OPA_MAX_SLS; i++)
-                       hrtimer_cancel(&ppd->cca_timer[i].hrtimer);
-
-               spin_lock(&ppd->cc_state_lock);
-               cc_state = get_cc_state(ppd);
-               rcu_assign_pointer(ppd->cc_state, NULL);
-               spin_unlock(&ppd->cc_state_lock);
-
-               if (cc_state)
-                       call_rcu(&cc_state->rcu, cc_state_reclaim);
-       }
-
-       free_credit_return(dd);
-
-       /*
-        * Free any resources still in use (usually just kernel contexts)
-        * at unload; we do this for num_rcv_contexts, because that's what we allocate.
-        * We acquire lock to be really paranoid that rcd isn't being
-        * accessed from some interrupt-related code (that should not happen,
-        * but best to be sure).
-        */
-       spin_lock_irqsave(&dd->uctxt_lock, flags);
-       tmp = dd->rcd;
-       dd->rcd = NULL;
-       spin_unlock_irqrestore(&dd->uctxt_lock, flags);
-
-       if (dd->rcvhdrtail_dummy_kvaddr) {
-               dma_free_coherent(&dd->pcidev->dev, sizeof(u64),
-                                 (void *)dd->rcvhdrtail_dummy_kvaddr,
-                                 dd->rcvhdrtail_dummy_physaddr);
-               dd->rcvhdrtail_dummy_kvaddr = NULL;
-       }
-
-       for (ctxt = 0; tmp && ctxt < dd->num_rcv_contexts; ctxt++) {
-               struct hfi1_ctxtdata *rcd = tmp[ctxt];
-
-               tmp[ctxt] = NULL; /* debugging paranoia */
-               if (rcd) {
-                       hfi1_clear_tids(rcd);
-                       hfi1_free_ctxtdata(dd, rcd);
-               }
-       }
-       kfree(tmp);
-       free_pio_map(dd);
-       /* must follow rcv context free - need to remove rcv's hooks */
-       for (ctxt = 0; ctxt < dd->num_send_contexts; ctxt++)
-               sc_free(dd->send_contexts[ctxt].sc);
-       dd->num_send_contexts = 0;
-       kfree(dd->send_contexts);
-       dd->send_contexts = NULL;
-       kfree(dd->hw_to_sw);
-       dd->hw_to_sw = NULL;
-       kfree(dd->boardname);
-       vfree(dd->events);
-       vfree(dd->status);
-}
-
-/*
- * Clean up on unit shutdown, or error during unit load after
- * successful initialization.
- */
-static void postinit_cleanup(struct hfi1_devdata *dd)
-{
-       hfi1_start_cleanup(dd);
-
-       hfi1_pcie_ddcleanup(dd);
-       hfi1_pcie_cleanup(dd->pcidev);
-
-       cleanup_device_data(dd);
-
-       hfi1_free_devdata(dd);
-}
-
-static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
-{
-       int ret = 0, j, pidx, initfail;
-       struct hfi1_devdata *dd = NULL;
-       struct hfi1_pportdata *ppd;
-
-       /* First, lock the non-writable module parameters */
-       HFI1_CAP_LOCK();
-
-       /* Validate some global module parameters */
-       if (rcvhdrcnt <= HFI1_MIN_HDRQ_EGRBUF_CNT) {
-               hfi1_early_err(&pdev->dev, "Header queue count too small\n");
-               ret = -EINVAL;
-               goto bail;
-       }
-       if (rcvhdrcnt > HFI1_MAX_HDRQ_EGRBUF_CNT) {
-               hfi1_early_err(&pdev->dev,
-                              "Receive header queue count cannot be greater than %u\n",
-                              HFI1_MAX_HDRQ_EGRBUF_CNT);
-               ret = -EINVAL;
-               goto bail;
-       }
-       /* use the encoding function as a sanitization check */
-       if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) {
-               hfi1_early_err(&pdev->dev, "Invalid HdrQ Entry size %u\n",
-                              hfi1_hdrq_entsize);
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       /* The receive eager buffer size must be set before the receive
-        * contexts are created.
-        *
-        * Set the eager buffer size.  Validate that it falls in a range
-        * allowed by the hardware - all powers of 2 between the min and
-        * max.  The maximum valid MTU is within the eager buffer range
-        * so we do not need to cap the max_mtu by an eager buffer size
-        * setting.
-        */
-       if (eager_buffer_size) {
-               if (!is_power_of_2(eager_buffer_size))
-                       eager_buffer_size =
-                               roundup_pow_of_two(eager_buffer_size);
-               eager_buffer_size =
-                       clamp_val(eager_buffer_size,
-                                 MIN_EAGER_BUFFER * 8,
-                                 MAX_EAGER_BUFFER_TOTAL);
-               hfi1_early_info(&pdev->dev, "Eager buffer size %u\n",
-                               eager_buffer_size);
-       } else {
-               hfi1_early_err(&pdev->dev, "Invalid Eager buffer size of 0\n");
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       /* restrict value of hfi1_rcvarr_split */
-       hfi1_rcvarr_split = clamp_val(hfi1_rcvarr_split, 0, 100);
-
-       ret = hfi1_pcie_init(pdev, ent);
-       if (ret)
-               goto bail;
-
-       /*
-        * Do device-specific initialization, function table setup, dd
-        * allocation, etc.
-        */
-       switch (ent->device) {
-       case PCI_DEVICE_ID_INTEL0:
-       case PCI_DEVICE_ID_INTEL1:
-               dd = hfi1_init_dd(pdev, ent);
-               break;
-       default:
-               hfi1_early_err(&pdev->dev,
-                              "Failing on unknown Intel deviceid 0x%x\n",
-                              ent->device);
-               ret = -ENODEV;
-       }
-
-       if (IS_ERR(dd))
-               ret = PTR_ERR(dd);
-       if (ret)
-               goto clean_bail; /* error already printed */
-
-       ret = create_workqueues(dd);
-       if (ret)
-               goto clean_bail;
-
-       /* do the generic initialization */
-       initfail = hfi1_init(dd, 0);
-
-       ret = hfi1_register_ib_device(dd);
-
-       /*
-        * Now ready for use.  This should be cleared whenever we
-        * detect a reset, or initiate one.  If earlier failure,
-        * we still create devices, so diags, etc. can be used
-        * to determine cause of problem.
-        */
-       if (!initfail && !ret) {
-               dd->flags |= HFI1_INITTED;
-               /* create debugfs files after init and ib register */
-               hfi1_dbg_ibdev_init(&dd->verbs_dev);
-       }
-
-       j = hfi1_device_create(dd);
-       if (j)
-               dd_dev_err(dd, "Failed to create /dev devices: %d\n", -j);
-
-       if (initfail || ret) {
-               stop_timers(dd);
-               flush_workqueue(ib_wq);
-               for (pidx = 0; pidx < dd->num_pports; ++pidx) {
-                       hfi1_quiet_serdes(dd->pport + pidx);
-                       ppd = dd->pport + pidx;
-                       if (ppd->hfi1_wq) {
-                               destroy_workqueue(ppd->hfi1_wq);
-                               ppd->hfi1_wq = NULL;
-                       }
-               }
-               if (!j)
-                       hfi1_device_remove(dd);
-               if (!ret)
-                       hfi1_unregister_ib_device(dd);
-               postinit_cleanup(dd);
-               if (initfail)
-                       ret = initfail;
-               goto bail;      /* everything already cleaned */
-       }
-
-       sdma_start(dd);
-
-       return 0;
-
-clean_bail:
-       hfi1_pcie_cleanup(pdev);
-bail:
-       return ret;
-}
-
-static void remove_one(struct pci_dev *pdev)
-{
-       struct hfi1_devdata *dd = pci_get_drvdata(pdev);
-
-       /* close debugfs files before ib unregister */
-       hfi1_dbg_ibdev_exit(&dd->verbs_dev);
-       /* unregister from IB core */
-       hfi1_unregister_ib_device(dd);
-
-       /*
-        * Disable the IB link, disable interrupts on the device,
-        * clear dma engines, etc.
-        */
-       shutdown_device(dd);
-
-       stop_timers(dd);
-
-       /* wait until all of our (qsfp) queue_work() calls complete */
-       flush_workqueue(ib_wq);
-
-       hfi1_device_remove(dd);
-
-       postinit_cleanup(dd);
-}
-
-/**
- * hfi1_create_rcvhdrq - create a receive header queue
- * @dd: the hfi1_ib device
- * @rcd: the context data
- *
- * This must be contiguous memory (from an i/o perspective), and must be
- * DMA'able (which means for some systems, it will go through an IOMMU,
- * or be forced into a low address range).
- */
-int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
-{
-       unsigned amt;
-       u64 reg;
-
-       if (!rcd->rcvhdrq) {
-               dma_addr_t phys_hdrqtail;
-               gfp_t gfp_flags;
-
-               /*
-                * rcvhdrqentsize is in DWs, so we have to convert to bytes
-                * (* sizeof(u32)).
-                */
-               amt = PAGE_ALIGN(rcd->rcvhdrq_cnt * rcd->rcvhdrqentsize *
-                                sizeof(u32));
-
-               gfp_flags = (rcd->ctxt >= dd->first_user_ctxt) ?
-                       GFP_USER : GFP_KERNEL;
-               rcd->rcvhdrq = dma_zalloc_coherent(
-                       &dd->pcidev->dev, amt, &rcd->rcvhdrq_phys,
-                       gfp_flags | __GFP_COMP);
-
-               if (!rcd->rcvhdrq) {
-                       dd_dev_err(dd,
-                                  "attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n",
-                                  amt, rcd->ctxt);
-                       goto bail;
-               }
-
-               if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) {
-                       rcd->rcvhdrtail_kvaddr = dma_zalloc_coherent(
-                               &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail,
-                               gfp_flags);
-                       if (!rcd->rcvhdrtail_kvaddr)
-                               goto bail_free;
-                       rcd->rcvhdrqtailaddr_phys = phys_hdrqtail;
-               }
-
-               rcd->rcvhdrq_size = amt;
-       }
-       /*
-        * These values are per-context:
-        *      RcvHdrCnt
-        *      RcvHdrEntSize
-        *      RcvHdrSize
-        */
-       reg = ((u64)(rcd->rcvhdrq_cnt >> HDRQ_SIZE_SHIFT)
-                       & RCV_HDR_CNT_CNT_MASK)
-               << RCV_HDR_CNT_CNT_SHIFT;
-       write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_CNT, reg);
-       reg = (encode_rcv_header_entry_size(rcd->rcvhdrqentsize)
-                       & RCV_HDR_ENT_SIZE_ENT_SIZE_MASK)
-               << RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT;
-       write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_ENT_SIZE, reg);
-       reg = (dd->rcvhdrsize & RCV_HDR_SIZE_HDR_SIZE_MASK)
-               << RCV_HDR_SIZE_HDR_SIZE_SHIFT;
-       write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_SIZE, reg);
-
-       /*
-        * Program dummy tail address for every receive context
-        * before enabling any receive context
-        */
-       write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_TAIL_ADDR,
-                       dd->rcvhdrtail_dummy_physaddr);
-
-       return 0;
-
-bail_free:
-       dd_dev_err(dd,
-                  "attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n",
-                  rcd->ctxt);
-       vfree(rcd->user_event_mask);
-       rcd->user_event_mask = NULL;
-       dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq,
-                         rcd->rcvhdrq_phys);
-       rcd->rcvhdrq = NULL;
-bail:
-       return -ENOMEM;
-}
-
-/**
- * hfi1_setup_eagerbufs - allocate eager buffers, both kernel and user contexts.
- * @rcd: the context we are setting up.
- *
- * Allocate the eager TID buffers and program them into the chip.
- * They are no longer completely contiguous; we do multiple allocation
- * calls.  Otherwise we get the OOM code involved, by asking for too
- * much per call, with disastrous results on some kernels.
- */
-int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd)
-{
-       struct hfi1_devdata *dd = rcd->dd;
-       u32 max_entries, egrtop, alloced_bytes = 0, idx = 0;
-       gfp_t gfp_flags;
-       u16 order;
-       int ret = 0;
-       u16 round_mtu = roundup_pow_of_two(hfi1_max_mtu);
-
-       /*
-        * GFP_USER, but without GFP_FS, so buffer cache can be
-        * coalesced (we hope); otherwise, even at order 4,
-        * heavy filesystem activity makes these fail, and we can
-        * use compound pages.
-        */
-       gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP;
-
-       /*
-        * The minimum size of the eager buffers is a group of MTU-sized
-        * buffers.
-        * The global eager_buffer_size parameter is checked against the
-        * theoretical lower limit of the value. Here, we check against the
-        * MTU.
-        */
-       if (rcd->egrbufs.size < (round_mtu * dd->rcv_entries.group_size))
-               rcd->egrbufs.size = round_mtu * dd->rcv_entries.group_size;
-       /*
-        * If using one-pkt-per-egr-buffer, lower the eager buffer
-        * size to the max MTU (page-aligned).
-        */
-       if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR))
-               rcd->egrbufs.rcvtid_size = round_mtu;
-
-       /*
-        * Eager buffers sizes of 1MB or less require smaller TID sizes
-        * to satisfy the "multiple of 8 RcvArray entries" requirement.
-        */
-       if (rcd->egrbufs.size <= (1 << 20))
-               rcd->egrbufs.rcvtid_size = max((unsigned long)round_mtu,
-                       rounddown_pow_of_two(rcd->egrbufs.size / 8));
-
-       while (alloced_bytes < rcd->egrbufs.size &&
-              rcd->egrbufs.alloced < rcd->egrbufs.count) {
-               rcd->egrbufs.buffers[idx].addr =
-                       dma_zalloc_coherent(&dd->pcidev->dev,
-                                           rcd->egrbufs.rcvtid_size,
-                                           &rcd->egrbufs.buffers[idx].phys,
-                                           gfp_flags);
-               if (rcd->egrbufs.buffers[idx].addr) {
-                       rcd->egrbufs.buffers[idx].len =
-                               rcd->egrbufs.rcvtid_size;
-                       rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].addr =
-                               rcd->egrbufs.buffers[idx].addr;
-                       rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].phys =
-                               rcd->egrbufs.buffers[idx].phys;
-                       rcd->egrbufs.alloced++;
-                       alloced_bytes += rcd->egrbufs.rcvtid_size;
-                       idx++;
-               } else {
-                       u32 new_size, i, j;
-                       u64 offset = 0;
-
-                       /*
-                        * Fail the eager buffer allocation if:
-                        *   - we are already using the lowest acceptable size
-                        *   - we are using one-pkt-per-egr-buffer (this implies
-                        *     that we are accepting only one size)
-                        */
-                       if (rcd->egrbufs.rcvtid_size == round_mtu ||
-                           !HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) {
-                               dd_dev_err(dd, "ctxt%u: Failed to allocate eager buffers\n",
-                                          rcd->ctxt);
-                               goto bail_rcvegrbuf_phys;
-                       }
-
-                       new_size = rcd->egrbufs.rcvtid_size / 2;
-
-                       /*
-                        * If the first attempt to allocate memory failed, don't
-                        * fail everything but continue with the next lower
-                        * size.
-                        */
-                       if (idx == 0) {
-                               rcd->egrbufs.rcvtid_size = new_size;
-                               continue;
-                       }
-
-                       /*
-                        * Re-partition already allocated buffers to a smaller
-                        * size.
-                        */
-                       rcd->egrbufs.alloced = 0;
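-                       /* i walks the rcvtids, j the buffers being re-carved */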
-                       for (i = 0, j = 0, offset = 0; j < idx; i++) {
-                               if (i >= rcd->egrbufs.count)
-                                       break;
-                               rcd->egrbufs.rcvtids[i].phys =
-                                       rcd->egrbufs.buffers[j].phys + offset;
-                               rcd->egrbufs.rcvtids[i].addr =
-                                       rcd->egrbufs.buffers[j].addr + offset;
-                               rcd->egrbufs.alloced++;
-                               if ((rcd->egrbufs.buffers[j].phys + offset +
-                                    new_size) ==
-                                   (rcd->egrbufs.buffers[j].phys +
-                                    rcd->egrbufs.buffers[j].len)) {
-                                       j++;
-                                       offset = 0;
-                               } else {
-                                       offset += new_size;
-                               }
-                       }
-                       rcd->egrbufs.rcvtid_size = new_size;
-               }
-       }
-       rcd->egrbufs.numbufs = idx;
-       rcd->egrbufs.size = alloced_bytes;
-
-       hfi1_cdbg(PROC,
-                 "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %zuKB\n",
-                 rcd->ctxt, rcd->egrbufs.alloced, rcd->egrbufs.rcvtid_size,
-                 rcd->egrbufs.size);
-
-       /*
-        * Set the context's rcv array head update threshold to the closest
-        * power of 2 (so we can use a mask instead of modulo) below half
-        * the allocated entries.
-        */
-       rcd->egrbufs.threshold =
-               rounddown_pow_of_two(rcd->egrbufs.alloced / 2);
-       /*
-        * Compute the expected RcvArray entry base. This is done after
-        * allocating the eager buffers in order to maximize the
-        * expected RcvArray entries for the context.
-        */
-       max_entries = rcd->rcv_array_groups * dd->rcv_entries.group_size;
-       egrtop = roundup(rcd->egrbufs.alloced, dd->rcv_entries.group_size);
-       rcd->expected_count = max_entries - egrtop;
-       if (rcd->expected_count > MAX_TID_PAIR_ENTRIES * 2)
-               rcd->expected_count = MAX_TID_PAIR_ENTRIES * 2;
-
-       rcd->expected_base = rcd->eager_base + egrtop;
-       hfi1_cdbg(PROC, "ctxt%u: eager:%u, exp:%u, egrbase:%u, expbase:%u\n",
-                 rcd->ctxt, rcd->egrbufs.alloced, rcd->expected_count,
-                 rcd->eager_base, rcd->expected_base);
-
-       if (!hfi1_rcvbuf_validate(rcd->egrbufs.rcvtid_size, PT_EAGER, &order)) {
-               hfi1_cdbg(PROC,
-                         "ctxt%u: current Eager buffer size is invalid %u\n",
-                         rcd->ctxt, rcd->egrbufs.rcvtid_size);
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       for (idx = 0; idx < rcd->egrbufs.alloced; idx++) {
-               hfi1_put_tid(dd, rcd->eager_base + idx, PT_EAGER,
-                            rcd->egrbufs.rcvtids[idx].phys, order);
-               cond_resched();
-       }
-       goto bail;
-
-bail_rcvegrbuf_phys:
-       for (idx = 0; idx < rcd->egrbufs.alloced &&
-            rcd->egrbufs.buffers[idx].addr;
-            idx++) {
-               dma_free_coherent(&dd->pcidev->dev,
-                                 rcd->egrbufs.buffers[idx].len,
-                                 rcd->egrbufs.buffers[idx].addr,
-                                 rcd->egrbufs.buffers[idx].phys);
-               rcd->egrbufs.buffers[idx].addr = NULL;
-               rcd->egrbufs.buffers[idx].phys = 0;
-               rcd->egrbufs.buffers[idx].len = 0;
-       }
-bail:
-       return ret;
-}
diff --git a/drivers/staging/rdma/hfi1/intr.c b/drivers/staging/rdma/hfi1/intr.c
deleted file mode 100644 (file)
index 65348d1..0000000
+++ /dev/null
@@ -1,200 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/pci.h>
-#include <linux/delay.h>
-
-#include "hfi.h"
-#include "common.h"
-#include "sdma.h"
-
-/**
- * format_hwmsg - format a single hwerror message
- * @msg: message buffer
- * @msgl: length of message buffer
- * @hwmsg: message to add to message buffer
- */
-static void format_hwmsg(char *msg, size_t msgl, const char *hwmsg)
-{
-       strlcat(msg, "[", msgl);
-       strlcat(msg, hwmsg, msgl);
-       strlcat(msg, "]", msgl);
-}
-
-/**
- * hfi1_format_hwerrors - format hardware error messages for display
- * @hwerrs: hardware errors bit vector
- * @hwerrmsgs: hardware error descriptions
- * @nhwerrmsgs: number of hwerrmsgs
- * @msg: message buffer
- * @msgl: message buffer length
- */
-void hfi1_format_hwerrors(u64 hwerrs, const struct hfi1_hwerror_msgs *hwerrmsgs,
-                         size_t nhwerrmsgs, char *msg, size_t msgl)
-{
-       int i;
-
-       for (i = 0; i < nhwerrmsgs; i++)
-               if (hwerrs & hwerrmsgs[i].mask)
-                       format_hwmsg(msg, msgl, hwerrmsgs[i].msg);
-}
-
-static void signal_ib_event(struct hfi1_pportdata *ppd, enum ib_event_type ev)
-{
-       struct ib_event event;
-       struct hfi1_devdata *dd = ppd->dd;
-
-       /*
-        * Only call ib_dispatch_event() if the IB device has been
-        * registered.  HFI1_INITTED is set iff the driver has successfully
-        * registered with the IB core.
-        */
-       if (!(dd->flags & HFI1_INITTED))
-               return;
-       event.device = &dd->verbs_dev.rdi.ibdev;
-       event.element.port_num = ppd->port;
-       event.event = ev;
-       ib_dispatch_event(&event);
-}
-
-/*
- * Handle a linkup or link down notification.
- * This is called outside an interrupt.
- */
-void handle_linkup_change(struct hfi1_devdata *dd, u32 linkup)
-{
-       struct hfi1_pportdata *ppd = &dd->pport[0];
-       enum ib_event_type ev;
-
-       if (!(ppd->linkup ^ !!linkup))
-               return; /* no change, nothing to do */
-
-       if (linkup) {
-               /*
-                * Quick linkup and all link up on the simulator does not
-                * trigger or implement:
-                *      - VerifyCap interrupt
-                *      - VerifyCap frames
-                * But rather moves directly to LinkUp.
-                *
-                * Do the work of the VerifyCap interrupt handler,
-                * handle_verify_cap(), but do not try moving the state to
-                * LinkUp as we are already there.
-                *
-                * NOTE: This uses this device's vAU, vCU, and vl15_init for
-                * the remote values.  Both sides must be using the same values.
-                */
-               if (quick_linkup || dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
-                       set_up_vl15(dd, dd->vau, dd->vl15_init);
-                       assign_remote_cm_au_table(dd, dd->vcu);
-                       ppd->neighbor_guid =
-                               read_csr(dd, DC_DC8051_STS_REMOTE_GUID);
-                       ppd->neighbor_type =
-                               read_csr(dd, DC_DC8051_STS_REMOTE_NODE_TYPE) &
-                                       DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK;
-                       ppd->neighbor_port_number =
-                               read_csr(dd, DC_DC8051_STS_REMOTE_PORT_NO) &
-                                        DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK;
-                       dd_dev_info(dd, "Neighbor GUID: %llx Neighbor type %d\n",
-                                   ppd->neighbor_guid,
-                                   ppd->neighbor_type);
-               }
-
-               /* physical link went up */
-               ppd->linkup = 1;
-               ppd->offline_disabled_reason =
-                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE);
-
-               /* link widths are not available until the link is fully up */
-               get_linkup_link_widths(ppd);
-
-       } else {
-               /* physical link went down */
-               ppd->linkup = 0;
-
-               /* clear HW details of the previous connection */
-               reset_link_credits(dd);
-
-               /* freeze after a link down to guarantee a clean egress */
-               start_freeze_handling(ppd, FREEZE_SELF | FREEZE_LINK_DOWN);
-
-               ev = IB_EVENT_PORT_ERR;
-
-               hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LINKDOWN_BIT);
-
-               /* if we are down, the neighbor is down */
-               ppd->neighbor_normal = 0;
-
-               /* notify IB of the link change */
-               signal_ib_event(ppd, ev);
-       }
-}
-
-/*
- * Handle receive or urgent interrupts for user contexts.  This means a user
- * process was waiting for a packet to arrive, and didn't want to poll.
- */
-void handle_user_interrupt(struct hfi1_ctxtdata *rcd)
-{
-       struct hfi1_devdata *dd = rcd->dd;
-       unsigned long flags;
-
-       spin_lock_irqsave(&dd->uctxt_lock, flags);
-       if (!rcd->cnt)
-               goto done;
-
-       if (test_and_clear_bit(HFI1_CTXT_WAITING_RCV, &rcd->event_flags)) {
-               wake_up_interruptible(&rcd->wait);
-               hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_DIS, rcd->ctxt);
-       } else if (test_and_clear_bit(HFI1_CTXT_WAITING_URG,
-                                                       &rcd->event_flags)) {
-               rcd->urgent++;
-               wake_up_interruptible(&rcd->wait);
-       }
-done:
-       spin_unlock_irqrestore(&dd->uctxt_lock, flags);
-}
diff --git a/drivers/staging/rdma/hfi1/iowait.h b/drivers/staging/rdma/hfi1/iowait.h
deleted file mode 100644 (file)
index 2ec6ef3..0000000
+++ /dev/null
@@ -1,300 +0,0 @@
-#ifndef _HFI1_IOWAIT_H
-#define _HFI1_IOWAIT_H
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/list.h>
-#include <linux/workqueue.h>
-#include <linux/sched.h>
-
-#include "sdma_txreq.h"
-
-/*
- * typedef (*restart_t)() - restart callback
- * @work: pointer to work structure
- */
-typedef void (*restart_t)(struct work_struct *work);
-
-struct sdma_txreq;
-struct sdma_engine;
-/**
- * struct iowait - linkage for delayed progress/waiting
- * @list: used to add/insert into QP/PQ wait lists
- * @tx_head: overflow list of sdma_txreq's
- * @sleep: no space callback
- * @wakeup: space callback wakeup
- * @sdma_drained: sdma count drained
- * @iowork: workqueue overhead
- * @wait_dma: wait for sdma_busy == 0
- * @wait_pio: wait for pio_busy == 0
- * @sdma_busy: # of packets in flight
- * @count: total number of descriptors in tx_head'ed list
- * @tx_limit: limit for overflow queuing
- * @tx_count: number of tx entry's in tx_head'ed list
- *
- * This is to be embedded in user's state structure
- * (QP or PQ).
- *
- * The sleep and wakeup members are a
- * bit misnamed.  They do not strictly
- * speaking sleep or wake up, but they
- * are callbacks for the ULP to implement
- * whatever queuing/dequeuing of
- * the embedded iowait and its containing struct
- * is needed when a resource shortage like SDMA ring space is seen.
- *
- * Both are potentially called with locks held,
- * so sleeping is not allowed.
- *
- * The wait_dma and wait_pio members, along with the sdma_busy and
- * pio_busy counts, are used to wait for all outstanding I/O on the
- * iowait to drain.
- */
-
-struct iowait {
-       struct list_head list;
-       struct list_head tx_head;
-       int (*sleep)(
-               struct sdma_engine *sde,
-               struct iowait *wait,
-               struct sdma_txreq *tx,
-               unsigned seq);
-       void (*wakeup)(struct iowait *wait, int reason);
-       void (*sdma_drained)(struct iowait *wait);
-       struct work_struct iowork;
-       wait_queue_head_t wait_dma;
-       wait_queue_head_t wait_pio;
-       atomic_t sdma_busy;
-       atomic_t pio_busy;
-       u32 count;
-       u32 tx_limit;
-       u32 tx_count;
-};
-
-#define SDMA_AVAIL_REASON 0
-
-/**
- * iowait_init() - initialize wait structure
- * @wait: wait struct to initialize
- * @tx_limit: limit for overflow queuing
- * @func: restart function for workqueue
- * @sleep: sleep function for no space
- * @wakeup: wakeup function for no space
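- * @sdma_drained: callback for when all pending sdma has drained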
- *
- * This function initializes the iowait
- * structure embedded in the QP or PQ.
- *
- */
-
-static inline void iowait_init(
-       struct iowait *wait,
-       u32 tx_limit,
-       void (*func)(struct work_struct *work),
-       int (*sleep)(
-               struct sdma_engine *sde,
-               struct iowait *wait,
-               struct sdma_txreq *tx,
-               unsigned seq),
-       void (*wakeup)(struct iowait *wait, int reason),
-       void (*sdma_drained)(struct iowait *wait))
-{
-       wait->count = 0;
-       INIT_LIST_HEAD(&wait->list);
-       INIT_LIST_HEAD(&wait->tx_head);
-       INIT_WORK(&wait->iowork, func);
-       init_waitqueue_head(&wait->wait_dma);
-       init_waitqueue_head(&wait->wait_pio);
-       atomic_set(&wait->sdma_busy, 0);
-       atomic_set(&wait->pio_busy, 0);
-       wait->tx_limit = tx_limit;
-       wait->sleep = sleep;
-       wait->wakeup = wakeup;
-       wait->sdma_drained = sdma_drained;
-}
-
-/**
- * iowait_schedule() - schedule the iowait's work on a workqueue
- * @wait: wait struct whose work to schedule
- * @wq: workqueue to queue the work on
- * @cpu: cpu on which to queue the work
- */
-static inline void iowait_schedule(
-       struct iowait *wait,
-       struct workqueue_struct *wq,
-       int cpu)
-{
-       queue_work_on(cpu, wq, &wait->iowork);
-}
-
-/**
- * iowait_sdma_drain() - wait for DMAs to drain
- *
- * @wait: iowait structure
- *
- * This will delay until the iowait sdmas have
- * completed.
- */
-static inline void iowait_sdma_drain(struct iowait *wait)
-{
-       wait_event(wait->wait_dma, !atomic_read(&wait->sdma_busy));
-}
-
-/**
- * iowait_sdma_pending() - return sdma pending count
- *
- * @wait: iowait structure
- *
- */
-static inline int iowait_sdma_pending(struct iowait *wait)
-{
-       return atomic_read(&wait->sdma_busy);
-}
-
-/**
- * iowait_sdma_inc - note sdma io pending
- * @wait: iowait structure
- */
-static inline void iowait_sdma_inc(struct iowait *wait)
-{
-       atomic_inc(&wait->sdma_busy);
-}
-
-/**
- * iowait_sdma_add - add count to pending
- * @wait: iowait structure
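- * @count: number to add to the sdma_busy count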
- */
-static inline void iowait_sdma_add(struct iowait *wait, int count)
-{
-       atomic_add(count, &wait->sdma_busy);
-}
-
-/**
- * iowait_sdma_dec - note sdma complete
- * @wait: iowait structure
- */
-static inline int iowait_sdma_dec(struct iowait *wait)
-{
-       return atomic_dec_and_test(&wait->sdma_busy);
-}
-
-/**
- * iowait_pio_drain() - wait for pios to drain
- *
- * @wait: iowait structure
- *
- * This will delay until the iowait pios have
- * completed.
- */
-static inline void iowait_pio_drain(struct iowait *wait)
-{
-       wait_event_timeout(wait->wait_pio,
-                          !atomic_read(&wait->pio_busy),
-                          HZ);
-}
-
-/**
- * iowait_pio_pending() - return pio pending count
- *
- * @wait: iowait structure
- *
- */
-static inline int iowait_pio_pending(struct iowait *wait)
-{
-       return atomic_read(&wait->pio_busy);
-}
-
-/**
- * iowait_pio_inc - note pio pending
- * @wait: iowait structure
- */
-static inline void iowait_pio_inc(struct iowait *wait)
-{
-       atomic_inc(&wait->pio_busy);
-}
-
-/**
- * iowait_pio_dec - note pio complete
- * @wait: iowait structure
- */
-static inline int iowait_pio_dec(struct iowait *wait)
-{
-       return atomic_dec_and_test(&wait->pio_busy);
-}
-
-/**
- * iowait_drain_wakeup() - trigger the sdma and pio drain waiters
- *
- * @wait: iowait structure
- *
- * This will trigger any waiters.
- */
-static inline void iowait_drain_wakeup(struct iowait *wait)
-{
-       wake_up(&wait->wait_dma);
-       wake_up(&wait->wait_pio);
-       if (wait->sdma_drained)
-               wait->sdma_drained(wait);
-}
-
-/**
- * iowait_get_txhead() - get packet off of iowait list
- *
- * @wait: wait structure
- */
-static inline struct sdma_txreq *iowait_get_txhead(struct iowait *wait)
-{
-       struct sdma_txreq *tx = NULL;
-
-       if (!list_empty(&wait->tx_head)) {
-               tx = list_first_entry(
-                       &wait->tx_head,
-                       struct sdma_txreq,
-                       list);
-               list_del_init(&tx->list);
-       }
-       return tx;
-}
-
-#endif
diff --git a/drivers/staging/rdma/hfi1/mad.c b/drivers/staging/rdma/hfi1/mad.c
deleted file mode 100644 (file)
index ed58cf2..0000000
+++ /dev/null
@@ -1,4416 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/net.h>
-#define OPA_NUM_PKEY_BLOCKS_PER_SMP (OPA_SMP_DR_DATA_SIZE \
-                       / (OPA_PARTITION_TABLE_BLK_SIZE * sizeof(u16)))
-
-#include "hfi.h"
-#include "mad.h"
-#include "trace.h"
-#include "qp.h"
-
-/* the reset value from the FM is supposed to be 0xffff, handle both */
-#define OPA_LINK_WIDTH_RESET_OLD 0x0fff
-#define OPA_LINK_WIDTH_RESET 0xffff
-
-static int reply(struct ib_mad_hdr *smp)
-{
-       /*
-        * The verbs framework will handle the directed/LID route
-        * packet changes.
-        */
-       smp->method = IB_MGMT_METHOD_GET_RESP;
-       if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
-               smp->status |= IB_SMP_DIRECTION;
-       return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
-}
-
-static inline void clear_opa_smp_data(struct opa_smp *smp)
-{
-       void *data = opa_get_smp_data(smp);
-       size_t size = opa_get_smp_data_size(smp);
-
-       memset(data, 0, size);
-}
-
-static void send_trap(struct hfi1_ibport *ibp, void *data, unsigned len)
-{
-       struct ib_mad_send_buf *send_buf;
-       struct ib_mad_agent *agent;
-       struct opa_smp *smp;
-       int ret;
-       unsigned long flags;
-       unsigned long timeout;
-       int pkey_idx;
-       u32 qpn = ppd_from_ibp(ibp)->sm_trap_qp;
-
-       agent = ibp->rvp.send_agent;
-       if (!agent)
-               return;
-
-       /* o14-3.2.1 */
-       if (ppd_from_ibp(ibp)->lstate != IB_PORT_ACTIVE)
-               return;
-
-       /* o14-2 */
-       if (ibp->rvp.trap_timeout && time_before(jiffies,
-                                                ibp->rvp.trap_timeout))
-               return;
-
-       pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY);
-       if (pkey_idx < 0) {
-               pr_warn("%s: failed to find limited mgmt pkey, defaulting 0x%x\n",
-                       __func__, hfi1_get_pkey(ibp, 1));
-               pkey_idx = 1;
-       }
-
-       send_buf = ib_create_send_mad(agent, qpn, pkey_idx, 0,
-                                     IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
-                                     GFP_ATOMIC, IB_MGMT_BASE_VERSION);
-       if (IS_ERR(send_buf))
-               return;
-
-       smp = send_buf->mad;
-       smp->base_version = OPA_MGMT_BASE_VERSION;
-       smp->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED;
-       smp->class_version = OPA_SMI_CLASS_VERSION;
-       smp->method = IB_MGMT_METHOD_TRAP;
-       ibp->rvp.tid++;
-       smp->tid = cpu_to_be64(ibp->rvp.tid);
-       smp->attr_id = IB_SMP_ATTR_NOTICE;
-       /* o14-1: smp->mkey = 0; */
-       memcpy(smp->route.lid.data, data, len);
-
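-       /* reuse the cached AH to the SM, creating and caching one if needed */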
-       spin_lock_irqsave(&ibp->rvp.lock, flags);
-       if (!ibp->rvp.sm_ah) {
-               if (ibp->rvp.sm_lid != be16_to_cpu(IB_LID_PERMISSIVE)) {
-                       struct ib_ah *ah;
-
-                       ah = hfi1_create_qp0_ah(ibp, ibp->rvp.sm_lid);
-                       if (IS_ERR(ah)) {
-                               ret = PTR_ERR(ah);
-                       } else {
-                               send_buf->ah = ah;
-                               ibp->rvp.sm_ah = ibah_to_rvtah(ah);
-                               ret = 0;
-                       }
-               } else {
-                       ret = -EINVAL;
-               }
-       } else {
-               send_buf->ah = &ibp->rvp.sm_ah->ibah;
-               ret = 0;
-       }
-       spin_unlock_irqrestore(&ibp->rvp.lock, flags);
-
-       if (!ret)
-               ret = ib_post_send_mad(send_buf, NULL);
-       if (!ret) {
-               /* 4.096 usec. */
-               timeout = (4096 * (1UL << ibp->rvp.subnet_timeout)) / 1000;
-               ibp->rvp.trap_timeout = jiffies + usecs_to_jiffies(timeout);
-       } else {
-               ib_free_send_mad(send_buf);
-               ibp->rvp.trap_timeout = 0;
-       }
-}
-
-/*
- * Send a bad [PQ]_Key trap (ch. 14.3.8).
- */
-void hfi1_bad_pqkey(struct hfi1_ibport *ibp, __be16 trap_num, u32 key, u32 sl,
-                   u32 qp1, u32 qp2, u16 lid1, u16 lid2)
-{
-       struct opa_mad_notice_attr data;
-       u32 lid = ppd_from_ibp(ibp)->lid;
-       u32 _lid1 = lid1;
-       u32 _lid2 = lid2;
-
-       memset(&data, 0, sizeof(data));
-
-       if (trap_num == OPA_TRAP_BAD_P_KEY)
-               ibp->rvp.pkey_violations++;
-       else
-               ibp->rvp.qkey_violations++;
-       ibp->rvp.n_pkt_drops++;
-
-       /* Send violation trap */
-       data.generic_type = IB_NOTICE_TYPE_SECURITY;
-       data.prod_type_lsb = IB_NOTICE_PROD_CA;
-       data.trap_num = trap_num;
-       data.issuer_lid = cpu_to_be32(lid);
-       data.ntc_257_258.lid1 = cpu_to_be32(_lid1);
-       data.ntc_257_258.lid2 = cpu_to_be32(_lid2);
-       data.ntc_257_258.key = cpu_to_be32(key);
-       data.ntc_257_258.sl = sl << 3;
-       data.ntc_257_258.qp1 = cpu_to_be32(qp1);
-       data.ntc_257_258.qp2 = cpu_to_be32(qp2);
-
-       send_trap(ibp, &data, sizeof(data));
-}
-
-/*
- * Send a bad M_Key trap (ch. 14.3.9).
- */
-static void bad_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad,
-                    __be64 mkey, __be32 dr_slid, u8 return_path[], u8 hop_cnt)
-{
-       struct opa_mad_notice_attr data;
-       u32 lid = ppd_from_ibp(ibp)->lid;
-
-       memset(&data, 0, sizeof(data));
-       /* Send violation trap */
-       data.generic_type = IB_NOTICE_TYPE_SECURITY;
-       data.prod_type_lsb = IB_NOTICE_PROD_CA;
-       data.trap_num = OPA_TRAP_BAD_M_KEY;
-       data.issuer_lid = cpu_to_be32(lid);
-       data.ntc_256.lid = data.issuer_lid;
-       data.ntc_256.method = mad->method;
-       data.ntc_256.attr_id = mad->attr_id;
-       data.ntc_256.attr_mod = mad->attr_mod;
-       data.ntc_256.mkey = mkey;
-       if (mad->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
-               data.ntc_256.dr_slid = dr_slid;
-               data.ntc_256.dr_trunc_hop = IB_NOTICE_TRAP_DR_NOTICE;
-               if (hop_cnt > ARRAY_SIZE(data.ntc_256.dr_rtn_path)) {
-                       data.ntc_256.dr_trunc_hop |=
-                               IB_NOTICE_TRAP_DR_TRUNC;
-                       hop_cnt = ARRAY_SIZE(data.ntc_256.dr_rtn_path);
-               }
-               data.ntc_256.dr_trunc_hop |= hop_cnt;
-               memcpy(data.ntc_256.dr_rtn_path, return_path,
-                      hop_cnt);
-       }
-
-       send_trap(ibp, &data, sizeof(data));
-}
-
-/*
- * Send a Port Capability Mask Changed trap (ch. 14.3.11).
- */
-void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num)
-{
-       struct opa_mad_notice_attr data;
-       struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
-       struct hfi1_devdata *dd = dd_from_dev(verbs_dev);
-       struct hfi1_ibport *ibp = &dd->pport[port_num - 1].ibport_data;
-       u32 lid = ppd_from_ibp(ibp)->lid;
-
-       memset(&data, 0, sizeof(data));
-
-       data.generic_type = IB_NOTICE_TYPE_INFO;
-       data.prod_type_lsb = IB_NOTICE_PROD_CA;
-       data.trap_num = OPA_TRAP_CHANGE_CAPABILITY;
-       data.issuer_lid = cpu_to_be32(lid);
-       data.ntc_144.lid = data.issuer_lid;
-       data.ntc_144.new_cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags);
-
-       send_trap(ibp, &data, sizeof(data));
-}
-
-/*
- * Send a System Image GUID Changed trap (ch. 14.3.12).
- */
-void hfi1_sys_guid_chg(struct hfi1_ibport *ibp)
-{
-       struct opa_mad_notice_attr data;
-       u32 lid = ppd_from_ibp(ibp)->lid;
-
-       memset(&data, 0, sizeof(data));
-
-       data.generic_type = IB_NOTICE_TYPE_INFO;
-       data.prod_type_lsb = IB_NOTICE_PROD_CA;
-       data.trap_num = OPA_TRAP_CHANGE_SYSGUID;
-       data.issuer_lid = cpu_to_be32(lid);
-       data.ntc_145.new_sys_guid = ib_hfi1_sys_image_guid;
-       data.ntc_145.lid = data.issuer_lid;
-
-       send_trap(ibp, &data, sizeof(data));
-}
-
-/*
- * Send a Node Description Changed trap (ch. 14.3.13).
- */
-void hfi1_node_desc_chg(struct hfi1_ibport *ibp)
-{
-       struct opa_mad_notice_attr data;
-       u32 lid = ppd_from_ibp(ibp)->lid;
-
-       memset(&data, 0, sizeof(data));
-
-       data.generic_type = IB_NOTICE_TYPE_INFO;
-       data.prod_type_lsb = IB_NOTICE_PROD_CA;
-       data.trap_num = OPA_TRAP_CHANGE_CAPABILITY;
-       data.issuer_lid = cpu_to_be32(lid);
-       data.ntc_144.lid = data.issuer_lid;
-       data.ntc_144.change_flags =
-               cpu_to_be16(OPA_NOTICE_TRAP_NODE_DESC_CHG);
-
-       send_trap(ibp, &data, sizeof(data));
-}
-
-static int __subn_get_opa_nodedesc(struct opa_smp *smp, u32 am,
-                                  u8 *data, struct ib_device *ibdev,
-                                  u8 port, u32 *resp_len)
-{
-       struct opa_node_description *nd;
-
-       if (am) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       nd = (struct opa_node_description *)data;
-
-       memcpy(nd->data, ibdev->node_desc, sizeof(nd->data));
-
-       if (resp_len)
-               *resp_len += sizeof(*nd);
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-static int __subn_get_opa_nodeinfo(struct opa_smp *smp, u32 am, u8 *data,
-                                  struct ib_device *ibdev, u8 port,
-                                  u32 *resp_len)
-{
-       struct opa_node_info *ni;
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       unsigned pidx = port - 1; /* IB number port from 1, hw from 0 */
-
-       ni = (struct opa_node_info *)data;
-
-       /* GUID 0 is illegal */
-       if (am || pidx >= dd->num_pports || dd->pport[pidx].guid == 0) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       ni->port_guid = cpu_to_be64(dd->pport[pidx].guid);
-       ni->base_version = OPA_MGMT_BASE_VERSION;
-       ni->class_version = OPA_SMI_CLASS_VERSION;
-       ni->node_type = 1;     /* channel adapter */
-       ni->num_ports = ibdev->phys_port_cnt;
-       /* This is already in network order */
-       ni->system_image_guid = ib_hfi1_sys_image_guid;
-       /* Use first-port GUID as node */
-       ni->node_guid = cpu_to_be64(dd->pport->guid);
-       ni->partition_cap = cpu_to_be16(hfi1_get_npkeys(dd));
-       ni->device_id = cpu_to_be16(dd->pcidev->device);
-       ni->revision = cpu_to_be32(dd->minrev);
-       ni->local_port_num = port;
-       ni->vendor_id[0] = dd->oui1;
-       ni->vendor_id[1] = dd->oui2;
-       ni->vendor_id[2] = dd->oui3;
-
-       if (resp_len)
-               *resp_len += sizeof(*ni);
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-static int subn_get_nodeinfo(struct ib_smp *smp, struct ib_device *ibdev,
-                            u8 port)
-{
-       struct ib_node_info *nip = (struct ib_node_info *)&smp->data;
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       unsigned pidx = port - 1; /* IB number port from 1, hw from 0 */
-
-       /* GUID 0 is illegal */
-       if (smp->attr_mod || pidx >= dd->num_pports ||
-           dd->pport[pidx].guid == 0)
-               smp->status |= IB_SMP_INVALID_FIELD;
-       else
-               nip->port_guid = cpu_to_be64(dd->pport[pidx].guid);
-
-       nip->base_version = OPA_MGMT_BASE_VERSION;
-       nip->class_version = OPA_SMI_CLASS_VERSION;
-       nip->node_type = 1;     /* channel adapter */
-       nip->num_ports = ibdev->phys_port_cnt;
-       /* This is already in network order */
-       nip->sys_guid = ib_hfi1_sys_image_guid;
-       /* Use first-port GUID as node */
-       nip->node_guid = cpu_to_be64(dd->pport->guid);
-       nip->partition_cap = cpu_to_be16(hfi1_get_npkeys(dd));
-       nip->device_id = cpu_to_be16(dd->pcidev->device);
-       nip->revision = cpu_to_be32(dd->minrev);
-       nip->local_port_num = port;
-       nip->vendor_id[0] = dd->oui1;
-       nip->vendor_id[1] = dd->oui2;
-       nip->vendor_id[2] = dd->oui3;
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
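-/*
- * Thin wrappers used when applying PortInfo settings: push the
- * requested link width/speed values down via hfi1_set_ib_cfg(),
- * deliberately ignoring the return value.
- */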
-static void set_link_width_enabled(struct hfi1_pportdata *ppd, u32 w)
-{
-       (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LWID_ENB, w);
-}
-
-static void set_link_width_downgrade_enabled(struct hfi1_pportdata *ppd, u32 w)
-{
-       (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LWID_DG_ENB, w);
-}
-
-static void set_link_speed_enabled(struct hfi1_pportdata *ppd, u32 s)
-{
-       (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_SPD_ENB, s);
-}
-
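-/*
- * Validate the M_Key carried in an incoming SMP against the port's
- * M_Key and protection level.  Returns 0 when the SMP may be
- * processed, or 1 when the check fails for a method that requires a
- * valid M_Key, in which case the violation counter is updated and a
- * BadMKey trap is generated.
- */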
-static int check_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad,
-                     int mad_flags, __be64 mkey, __be32 dr_slid,
-                     u8 return_path[], u8 hop_cnt)
-{
-       int valid_mkey = 0;
-       int ret = 0;
-
-       /* Is the mkey in the process of expiring? */
-       if (ibp->rvp.mkey_lease_timeout &&
-           time_after_eq(jiffies, ibp->rvp.mkey_lease_timeout)) {
-               /* Clear timeout and mkey protection field. */
-               ibp->rvp.mkey_lease_timeout = 0;
-               ibp->rvp.mkeyprot = 0;
-       }
-
-       if ((mad_flags & IB_MAD_IGNORE_MKEY) || ibp->rvp.mkey == 0 ||
-           ibp->rvp.mkey == mkey)
-               valid_mkey = 1;
-
-       /* Unset lease timeout on any valid Get/Set/TrapRepress */
-       if (valid_mkey && ibp->rvp.mkey_lease_timeout &&
-           (mad->method == IB_MGMT_METHOD_GET ||
-            mad->method == IB_MGMT_METHOD_SET ||
-            mad->method == IB_MGMT_METHOD_TRAP_REPRESS))
-               ibp->rvp.mkey_lease_timeout = 0;
-
-       if (!valid_mkey) {
-               switch (mad->method) {
-               case IB_MGMT_METHOD_GET:
-                       /* Bad mkey not a violation below level 2 */
-                       if (ibp->rvp.mkeyprot < 2)
-                               break;
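-                       /* FALLTHROUGH */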
-               case IB_MGMT_METHOD_SET:
-               case IB_MGMT_METHOD_TRAP_REPRESS:
-                       if (ibp->rvp.mkey_violations != 0xFFFF)
-                               ++ibp->rvp.mkey_violations;
-                       if (!ibp->rvp.mkey_lease_timeout &&
-                           ibp->rvp.mkey_lease_period)
-                               ibp->rvp.mkey_lease_timeout = jiffies +
-                                       ibp->rvp.mkey_lease_period * HZ;
-                       /* Generate a trap notice. */
-                       bad_mkey(ibp, mad, mkey, dr_slid, return_path,
-                                hop_cnt);
-                       ret = 1;
-               }
-       }
-
-       return ret;
-}
-
-/*
- * The SMA caches reads from LCB registers in case the LCB is unavailable.
- * (The LCB is unavailable in certain link states, for example.)
- */
-struct lcb_datum {
-       u32 off;
-       u64 val;
-};
-
-static struct lcb_datum lcb_cache[] = {
-       { DC_LCB_STS_ROUND_TRIP_LTP_CNT, 0 },
-};
-
-static int write_lcb_cache(u32 off, u64 val)
-{
-       int i;
-
-       for (i = 0; i < ARRAY_SIZE(lcb_cache); i++) {
-               if (lcb_cache[i].off == off) {
-                       lcb_cache[i].val = val;
-                       return 0;
-               }
-       }
-
-       pr_warn("%s bad offset 0x%x\n", __func__, off);
-       return -1;
-}
-
-static int read_lcb_cache(u32 off, u64 *val)
-{
-       int i;
-
-       for (i = 0; i < ARRAY_SIZE(lcb_cache); i++) {
-               if (lcb_cache[i].off == off) {
-                       *val = lcb_cache[i].val;
-                       return 0;
-               }
-       }
-
-       pr_warn("%s bad offset 0x%x\n", __func__, off);
-       return -1;
-}
-
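-/*
- * Refresh the cached copy of DC_LCB_STS_ROUND_TRIP_LTP_CNT; the
- * cached value is what __subn_get_opa_portinfo() uses to fill in
- * replay_depth.wire.
- */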
-void read_ltp_rtt(struct hfi1_devdata *dd)
-{
-       u64 reg;
-
-       if (read_lcb_csr(dd, DC_LCB_STS_ROUND_TRIP_LTP_CNT, &reg))
-               dd_dev_err(dd, "%s: unable to read LTP RTT\n", __func__);
-       else
-               write_lcb_cache(DC_LCB_STS_ROUND_TRIP_LTP_CNT, reg);
-}
-
-static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
-                                  struct ib_device *ibdev, u8 port,
-                                  u32 *resp_len)
-{
-       int i;
-       struct hfi1_devdata *dd;
-       struct hfi1_pportdata *ppd;
-       struct hfi1_ibport *ibp;
-       struct opa_port_info *pi = (struct opa_port_info *)data;
-       u8 mtu;
-       u8 credit_rate;
-       u8 is_beaconing_active;
-       u32 state;
-       u32 num_ports = OPA_AM_NPORT(am);
-       u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
-       u32 buffer_units;
-       u64 tmp = 0;
-
-       if (num_ports != 1) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       dd = dd_from_ibdev(ibdev);
-       /* IB numbers ports from 1, hw from 0 */
-       ppd = dd->pport + (port - 1);
-       ibp = &ppd->ibport_data;
-
-       if (ppd->vls_supported / 2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) ||
-           ppd->vls_supported > ARRAY_SIZE(dd->vld)) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       pi->lid = cpu_to_be32(ppd->lid);
-
-       /* Only return the mkey if the protection field allows it. */
-       if (!(smp->method == IB_MGMT_METHOD_GET &&
-             ibp->rvp.mkey != smp->mkey &&
-             ibp->rvp.mkeyprot == 1))
-               pi->mkey = ibp->rvp.mkey;
-
-       pi->subnet_prefix = ibp->rvp.gid_prefix;
-       pi->sm_lid = cpu_to_be32(ibp->rvp.sm_lid);
-       pi->ib_cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags);
-       pi->mkey_lease_period = cpu_to_be16(ibp->rvp.mkey_lease_period);
-       pi->sm_trap_qp = cpu_to_be32(ppd->sm_trap_qp);
-       pi->sa_qp = cpu_to_be32(ppd->sa_qp);
-
-       pi->link_width.enabled = cpu_to_be16(ppd->link_width_enabled);
-       pi->link_width.supported = cpu_to_be16(ppd->link_width_supported);
-       pi->link_width.active = cpu_to_be16(ppd->link_width_active);
-
-       pi->link_width_downgrade.supported =
-                       cpu_to_be16(ppd->link_width_downgrade_supported);
-       pi->link_width_downgrade.enabled =
-                       cpu_to_be16(ppd->link_width_downgrade_enabled);
-       pi->link_width_downgrade.tx_active =
-                       cpu_to_be16(ppd->link_width_downgrade_tx_active);
-       pi->link_width_downgrade.rx_active =
-                       cpu_to_be16(ppd->link_width_downgrade_rx_active);
-
-       pi->link_speed.supported = cpu_to_be16(ppd->link_speed_supported);
-       pi->link_speed.active = cpu_to_be16(ppd->link_speed_active);
-       pi->link_speed.enabled = cpu_to_be16(ppd->link_speed_enabled);
-
-       state = driver_lstate(ppd);
-
-       if (start_of_sm_config && (state == IB_PORT_INIT))
-               ppd->is_sm_config_started = 1;
-
-       pi->port_phys_conf = (ppd->port_type & 0xf);
-
-#if PI_LED_ENABLE_SUP
-       pi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4;
-       pi->port_states.ledenable_offlinereason |=
-               ppd->is_sm_config_started << 5;
-       /*
-        * This pairs with the memory barrier in hfi1_start_led_override to
-        * ensure that we read the correct state of LED beaconing represented
-        * by led_override_timer_active
-        */
-       smp_rmb();
-       is_beaconing_active = !!atomic_read(&ppd->led_override_timer_active);
-       pi->port_states.ledenable_offlinereason |= is_beaconing_active << 6;
-       pi->port_states.ledenable_offlinereason |=
-               ppd->offline_disabled_reason;
-#else
-       pi->port_states.offline_reason = ppd->neighbor_normal << 4;
-       pi->port_states.offline_reason |= ppd->is_sm_config_started << 5;
-       pi->port_states.offline_reason |= ppd->offline_disabled_reason;
-#endif /* PI_LED_ENABLE_SUP */
-
-       pi->port_states.portphysstate_portstate =
-               (hfi1_ibphys_portstate(ppd) << 4) | state;
-
-       pi->mkeyprotect_lmc = (ibp->rvp.mkeyprot << 6) | ppd->lmc;
-
-       memset(pi->neigh_mtu.pvlx_to_mtu, 0, sizeof(pi->neigh_mtu.pvlx_to_mtu));
-       for (i = 0; i < ppd->vls_supported; i++) {
-               mtu = mtu_to_enum(dd->vld[i].mtu, HFI1_DEFAULT_ACTIVE_MTU);
-               if ((i % 2) == 0)
-                       pi->neigh_mtu.pvlx_to_mtu[i / 2] |= (mtu << 4);
-               else
-                       pi->neigh_mtu.pvlx_to_mtu[i / 2] |= mtu;
-       }
-       /* don't forget VL 15 */
-       mtu = mtu_to_enum(dd->vld[15].mtu, 2048);
-       pi->neigh_mtu.pvlx_to_mtu[15 / 2] |= mtu;
-       pi->smsl = ibp->rvp.sm_sl & OPA_PI_MASK_SMSL;
-       pi->operational_vls = hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_OP_VLS);
-       pi->partenforce_filterraw |=
-               (ppd->linkinit_reason & OPA_PI_MASK_LINKINIT_REASON);
-       if (ppd->part_enforce & HFI1_PART_ENFORCE_IN)
-               pi->partenforce_filterraw |= OPA_PI_MASK_PARTITION_ENFORCE_IN;
-       if (ppd->part_enforce & HFI1_PART_ENFORCE_OUT)
-               pi->partenforce_filterraw |= OPA_PI_MASK_PARTITION_ENFORCE_OUT;
-       pi->mkey_violations = cpu_to_be16(ibp->rvp.mkey_violations);
-       /* P_KeyViolations are counted by hardware. */
-       pi->pkey_violations = cpu_to_be16(ibp->rvp.pkey_violations);
-       pi->qkey_violations = cpu_to_be16(ibp->rvp.qkey_violations);
-
-       pi->vl.cap = ppd->vls_supported;
-       pi->vl.high_limit = cpu_to_be16(ibp->rvp.vl_high_limit);
-       pi->vl.arb_high_cap = (u8)hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_VL_HIGH_CAP);
-       pi->vl.arb_low_cap = (u8)hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_VL_LOW_CAP);
-
-       pi->clientrereg_subnettimeout = ibp->rvp.subnet_timeout;
-
-       pi->port_link_mode  = cpu_to_be16(OPA_PORT_LINK_MODE_OPA << 10 |
-                                         OPA_PORT_LINK_MODE_OPA << 5 |
-                                         OPA_PORT_LINK_MODE_OPA);
-
-       pi->port_ltp_crc_mode = cpu_to_be16(ppd->port_ltp_crc_mode);
-
-       pi->port_mode = cpu_to_be16(
-                               ppd->is_active_optimize_enabled ?
-                                       OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE : 0);
-
-       pi->port_packet_format.supported =
-               cpu_to_be16(OPA_PORT_PACKET_FORMAT_9B);
-       pi->port_packet_format.enabled =
-               cpu_to_be16(OPA_PORT_PACKET_FORMAT_9B);
-
-       /* flit_control.interleave is (OPA V1, version .76):
-        * bits         use
-        * ----         ---
-        * 2            res
-        * 2            DistanceSupported
-        * 2            DistanceEnabled
-        * 5            MaxNestLevelTxEnabled
-        * 5            MaxNestLevelRxSupported
-        *
-        * HFI supports only "distance mode 1" (see OPA V1, version .76,
-        * section 9.6.2), so set DistanceSupported, DistanceEnabled
-        * to 0x1.
-        */
-       pi->flit_control.interleave = cpu_to_be16(0x1400);
-
-       pi->link_down_reason = ppd->local_link_down_reason.sma;
-       pi->neigh_link_down_reason = ppd->neigh_link_down_reason.sma;
-       pi->port_error_action = cpu_to_be32(ppd->port_error_action);
-       pi->mtucap = mtu_to_enum(hfi1_max_mtu, IB_MTU_4096);
-
-       /* 32.768 usec. response time (guessing) */
-       pi->resptimevalue = 3;
-
-       pi->local_port_num = port;
-
-       /* buffer info for FM */
-       pi->overall_buffer_space = cpu_to_be16(dd->link_credits);
-
-       pi->neigh_node_guid = cpu_to_be64(ppd->neighbor_guid);
-       pi->neigh_port_num = ppd->neighbor_port_number;
-       pi->port_neigh_mode =
-               (ppd->neighbor_type & OPA_PI_MASK_NEIGH_NODE_TYPE) |
-               (ppd->mgmt_allowed ? OPA_PI_MASK_NEIGH_MGMT_ALLOWED : 0) |
-               (ppd->neighbor_fm_security ?
-                       OPA_PI_MASK_NEIGH_FW_AUTH_BYPASS : 0);
-
-       /* HFIs shall always return VL15 credits to their
-        * neighbor in a timely manner, without any credit return pacing.
-        */
-       credit_rate = 0;
-       buffer_units  = (dd->vau) & OPA_PI_MASK_BUF_UNIT_BUF_ALLOC;
-       buffer_units |= (dd->vcu << 3) & OPA_PI_MASK_BUF_UNIT_CREDIT_ACK;
-       buffer_units |= (credit_rate << 6) &
-                               OPA_PI_MASK_BUF_UNIT_VL15_CREDIT_RATE;
-       buffer_units |= (dd->vl15_init << 11) & OPA_PI_MASK_BUF_UNIT_VL15_INIT;
-       pi->buffer_units = cpu_to_be32(buffer_units);
-
-       pi->opa_cap_mask = cpu_to_be16(OPA_CAP_MASK3_IsSharedSpaceSupported);
-
-       /* HFI supports a replay buffer 128 LTPs in size */
-       pi->replay_depth.buffer = 0x80;
-       /* read the cached value of DC_LCB_STS_ROUND_TRIP_LTP_CNT */
-       read_lcb_cache(DC_LCB_STS_ROUND_TRIP_LTP_CNT, &tmp);
-
-       /*
-        * this counter is 16 bits wide, but the replay_depth.wire
-        * variable is only 8 bits
-        */
-       if (tmp > 0xff)
-               tmp = 0xff;
-       pi->replay_depth.wire = tmp;
-
-       if (resp_len)
-               *resp_len += sizeof(struct opa_port_info);
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-/**
- * get_pkeys - return the PKEY table
- * @dd: the hfi1_ib device
- * @port: the IB port number
- * @pkeys: the pkey table is placed here
- */
-static int get_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys)
-{
-       struct hfi1_pportdata *ppd = dd->pport + port - 1;
-
-       memcpy(pkeys, ppd->pkeys, sizeof(ppd->pkeys));
-
-       return 0;
-}
-
-static int __subn_get_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data,
-                                   struct ib_device *ibdev, u8 port,
-                                   u32 *resp_len)
-{
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       u32 n_blocks_req = OPA_AM_NBLK(am);
-       u32 start_block = am & 0x7ff;
-       __be16 *p;
-       u16 *q;
-       int i;
-       u16 n_blocks_avail;
-       unsigned npkeys = hfi1_get_npkeys(dd);
-       size_t size;
-
-       if (n_blocks_req == 0) {
-               pr_warn("OPA Get PKey AM Invalid : P = %d; B = 0x%x; N = 0x%x\n",
-                       port, start_block, n_blocks_req);
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       n_blocks_avail = (u16)(npkeys / OPA_PARTITION_TABLE_BLK_SIZE) + 1;
-
-       size = (n_blocks_req * OPA_PARTITION_TABLE_BLK_SIZE) * sizeof(u16);
-
-       if (start_block + n_blocks_req > n_blocks_avail ||
-           n_blocks_req > OPA_NUM_PKEY_BLOCKS_PER_SMP) {
-               pr_warn("OPA Get PKey AM Invalid : s 0x%x; req 0x%x; avail 0x%x; blk/smp 0x%lx\n",
-                       start_block, n_blocks_req, n_blocks_avail,
-                       OPA_NUM_PKEY_BLOCKS_PER_SMP);
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       p = (__be16 *)data;
-       q = (u16 *)data;
-       /* get the real pkeys if we are requesting the first block */
-       if (start_block == 0) {
-               get_pkeys(dd, port, q);
-               for (i = 0; i < npkeys; i++)
-                       p[i] = cpu_to_be16(q[i]);
-               if (resp_len)
-                       *resp_len += size;
-       } else {
-               smp->status |= IB_SMP_INVALID_FIELD;
-       }
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-enum {
-       HFI_TRANSITION_DISALLOWED,
-       HFI_TRANSITION_IGNORED,
-       HFI_TRANSITION_ALLOWED,
-       HFI_TRANSITION_UNDEFINED,
-};
-
-/*
- * Use shortened names to improve readability of
- * {logical,physical}_state_transitions
- */
-enum {
-       __D = HFI_TRANSITION_DISALLOWED,
-       __I = HFI_TRANSITION_IGNORED,
-       __A = HFI_TRANSITION_ALLOWED,
-       __U = HFI_TRANSITION_UNDEFINED,
-};
-
-/*
- * IB_PORTPHYSSTATE_POLLING (2) through OPA_PORTPHYSSTATE_MAX (11) are
- * represented in physical_state_transitions.
- */
-#define __N_PHYSTATES (OPA_PORTPHYSSTATE_MAX - IB_PORTPHYSSTATE_POLLING + 1)
-
-/*
- * Within physical_state_transitions, rows represent "old" states,
- * columns "new" states, and physical_state_transitions.allowed[old][new]
- * indicates if the transition from old state to new state is legal (see
- * OPAg1v1, Table 6-4).
- */
-static const struct {
-       u8 allowed[__N_PHYSTATES][__N_PHYSTATES];
-} physical_state_transitions = {
-       {
-               /* 2    3    4    5    6    7    8    9   10   11 */
-       /* 2 */ { __A, __A, __D, __D, __D, __D, __D, __D, __D, __D },
-       /* 3 */ { __A, __I, __D, __D, __D, __D, __D, __D, __D, __A },
-       /* 4 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
-       /* 5 */ { __A, __A, __D, __I, __D, __D, __D, __D, __D, __D },
-       /* 6 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
-       /* 7 */ { __D, __A, __D, __D, __D, __I, __D, __D, __D, __D },
-       /* 8 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
-       /* 9 */ { __I, __A, __D, __D, __D, __D, __D, __I, __D, __D },
-       /*10 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
-       /*11 */ { __D, __A, __D, __D, __D, __D, __D, __D, __D, __I },
-       }
-};
-
-/*
- * IB_PORT_DOWN (1) through IB_PORT_ACTIVE_DEFER (5) are represented
- * in logical_state_transitions.
- */
-
-#define __N_LOGICAL_STATES (IB_PORT_ACTIVE_DEFER - IB_PORT_DOWN + 1)
-
-/*
- * Within logical_state_transitions, rows represent "old" states,
- * columns "new" states, and logical_state_transitions.allowed[old][new]
- * indicates if the transition from old state to new state is legal (see
- * OPAg1v1, Table 9-12).
- */
-static const struct {
-       u8 allowed[__N_LOGICAL_STATES][__N_LOGICAL_STATES];
-} logical_state_transitions = {
-       {
-               /* 1    2    3    4    5 */
-       /* 1 */ { __I, __D, __D, __D, __U},
-       /* 2 */ { __D, __I, __A, __D, __U},
-       /* 3 */ { __D, __D, __I, __A, __U},
-       /* 4 */ { __D, __D, __I, __I, __U},
-       /* 5 */ { __U, __U, __U, __U, __U},
-       }
-};
-
-static int logical_transition_allowed(int old, int new)
-{
-       if (old < IB_PORT_NOP || old > IB_PORT_ACTIVE_DEFER ||
-           new < IB_PORT_NOP || new > IB_PORT_ACTIVE_DEFER) {
-               pr_warn("invalid logical state(s) (old %d new %d)\n",
-                       old, new);
-               return HFI_TRANSITION_UNDEFINED;
-       }
-
-       if (new == IB_PORT_NOP)
-               return HFI_TRANSITION_ALLOWED; /* always allowed */
-
-       /* adjust states for indexing into logical_state_transitions */
-       old -= IB_PORT_DOWN;
-       new -= IB_PORT_DOWN;
-
-       if (old < 0 || new < 0)
-               return HFI_TRANSITION_UNDEFINED;
-       return logical_state_transitions.allowed[old][new];
-}
-
-static int physical_transition_allowed(int old, int new)
-{
-       if (old < IB_PORTPHYSSTATE_NOP || old > OPA_PORTPHYSSTATE_MAX ||
-           new < IB_PORTPHYSSTATE_NOP || new > OPA_PORTPHYSSTATE_MAX) {
-               pr_warn("invalid physical state(s) (old %d new %d)\n",
-                       old, new);
-               return HFI_TRANSITION_UNDEFINED;
-       }
-
-       if (new == IB_PORTPHYSSTATE_NOP)
-               return HFI_TRANSITION_ALLOWED; /* always allowed */
-
-       /* adjust states for indexing into physical_state_transitions */
-       old -= IB_PORTPHYSSTATE_POLLING;
-       new -= IB_PORTPHYSSTATE_POLLING;
-
-       if (old < 0 || new < 0)
-               return HFI_TRANSITION_UNDEFINED;
-       return physical_state_transitions.allowed[old][new];
-}
-
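-/*
- * Check a requested logical/physical port state pair against the OPA
- * transition tables above and return one of the HFI_TRANSITION_*
- * codes; an Offline -> Polling physical state request is treated as
- * ignored.
- */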
-static int port_states_transition_allowed(struct hfi1_pportdata *ppd,
-                                         u32 logical_new, u32 physical_new)
-{
-       u32 physical_old = driver_physical_state(ppd);
-       u32 logical_old = driver_logical_state(ppd);
-       int ret, logical_allowed, physical_allowed;
-
-       ret = logical_transition_allowed(logical_old, logical_new);
-       logical_allowed = ret;
-
-       if (ret == HFI_TRANSITION_DISALLOWED ||
-           ret == HFI_TRANSITION_UNDEFINED) {
-               pr_warn("invalid logical state transition %s -> %s\n",
-                       opa_lstate_name(logical_old),
-                       opa_lstate_name(logical_new));
-               return ret;
-       }
-
-       ret = physical_transition_allowed(physical_old, physical_new);
-       physical_allowed = ret;
-
-       if (ret == HFI_TRANSITION_DISALLOWED ||
-           ret == HFI_TRANSITION_UNDEFINED) {
-               pr_warn("invalid physical state transition %s -> %s\n",
-                       opa_pstate_name(physical_old),
-                       opa_pstate_name(physical_new));
-               return ret;
-       }
-
-       if (logical_allowed == HFI_TRANSITION_IGNORED &&
-           physical_allowed == HFI_TRANSITION_IGNORED)
-               return HFI_TRANSITION_IGNORED;
-
-       /*
-        * A change request of Physical Port State from
-        * 'Offline' to 'Polling' should be ignored.
-        */
-       if ((physical_old == OPA_PORTPHYSSTATE_OFFLINE) &&
-           (physical_new == IB_PORTPHYSSTATE_POLLING))
-               return HFI_TRANSITION_IGNORED;
-
-       /*
-        * Either physical_allowed or logical_allowed is
-        * HFI_TRANSITION_ALLOWED.
-        */
-       return HFI_TRANSITION_ALLOWED;
-}
-
-static int set_port_states(struct hfi1_pportdata *ppd, struct opa_smp *smp,
-                          u32 logical_state, u32 phys_state,
-                          int suppress_idle_sma)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       u32 link_state;
-       int ret;
-
-       ret = port_states_transition_allowed(ppd, logical_state, phys_state);
-       if (ret == HFI_TRANSITION_DISALLOWED ||
-           ret == HFI_TRANSITION_UNDEFINED) {
-               /* error message emitted above */
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return 0;
-       }
-
-       if (ret == HFI_TRANSITION_IGNORED)
-               return 0;
-
-       if ((phys_state != IB_PORTPHYSSTATE_NOP) &&
-           !(logical_state == IB_PORT_DOWN ||
-             logical_state == IB_PORT_NOP)) {
-               pr_warn("SubnSet(OPA_PortInfo) port state invalid: logical_state 0x%x physical_state 0x%x\n",
-                       logical_state, phys_state);
-               smp->status |= IB_SMP_INVALID_FIELD;
-       }
-
-       /*
-        * Logical state changes are summarized in OPAv1g1 spec.,
-        * Table 9-12; physical state changes are summarized in
-        * OPAv1g1 spec., Table 6.4.
-        */
-       switch (logical_state) {
-       case IB_PORT_NOP:
-               if (phys_state == IB_PORTPHYSSTATE_NOP)
-                       break;
-               /* FALLTHROUGH */
-       case IB_PORT_DOWN:
-               if (phys_state == IB_PORTPHYSSTATE_NOP) {
-                       link_state = HLS_DN_DOWNDEF;
-               } else if (phys_state == IB_PORTPHYSSTATE_POLLING) {
-                       link_state = HLS_DN_POLL;
-                       set_link_down_reason(ppd, OPA_LINKDOWN_REASON_FM_BOUNCE,
-                                            0, OPA_LINKDOWN_REASON_FM_BOUNCE);
-               } else if (phys_state == IB_PORTPHYSSTATE_DISABLED) {
-                       link_state = HLS_DN_DISABLE;
-               } else {
-                       pr_warn("SubnSet(OPA_PortInfo) invalid physical state 0x%x\n",
-                               phys_state);
-                       smp->status |= IB_SMP_INVALID_FIELD;
-                       break;
-               }
-
-               if (link_state == HLS_DN_POLL ||
-                   link_state == HLS_DN_DOWNDEF) {
-                       /*
-                        * Going to poll.  No matter what the current state,
-                        * always move offline first, then tune and start the
-                        * link.  This correctly handles a FM link bounce and
-                        * a link enable.  Going offline is a no-op if already
-                        * offline.
-                        */
-                       set_link_state(ppd, HLS_DN_OFFLINE);
-                       tune_serdes(ppd);
-                       start_link(ppd);
-               } else {
-                       set_link_state(ppd, link_state);
-               }
-               if (link_state == HLS_DN_DISABLE &&
-                   (ppd->offline_disabled_reason >
-                    HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED) ||
-                    ppd->offline_disabled_reason ==
-                    HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE)))
-                       ppd->offline_disabled_reason =
-                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED);
-               /*
-                * Don't send a reply if the response would be sent
-                * through the disabled port.
-                */
-               if (link_state == HLS_DN_DISABLE && smp->hop_cnt)
-                       return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
-               break;
-       case IB_PORT_ARMED:
-               ret = set_link_state(ppd, HLS_UP_ARMED);
-               if ((ret == 0) && (suppress_idle_sma == 0))
-                       send_idle_sma(dd, SMA_IDLE_ARM);
-               break;
-       case IB_PORT_ACTIVE:
-               if (ppd->neighbor_normal) {
-                       ret = set_link_state(ppd, HLS_UP_ACTIVE);
-                       if (ret == 0)
-                               send_idle_sma(dd, SMA_IDLE_ACTIVE);
-               } else {
-                       pr_warn("SubnSet(OPA_PortInfo) Cannot move to Active with NeighborNormal 0\n");
-                       smp->status |= IB_SMP_INVALID_FIELD;
-               }
-               break;
-       default:
-               pr_warn("SubnSet(OPA_PortInfo) invalid logical state 0x%x\n",
-                       logical_state);
-               smp->status |= IB_SMP_INVALID_FIELD;
-       }
-
-       return 0;
-}
-
-/**
- * __subn_set_opa_portinfo - set port information
- * @smp: the incoming SM packet
- * @ibdev: the infiniband device
- * @port: the port on the device
- */
-static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
-                                  struct ib_device *ibdev, u8 port,
-                                  u32 *resp_len)
-{
-       struct opa_port_info *pi = (struct opa_port_info *)data;
-       struct ib_event event;
-       struct hfi1_devdata *dd;
-       struct hfi1_pportdata *ppd;
-       struct hfi1_ibport *ibp;
-       u8 clientrereg;
-       unsigned long flags;
-       u32 smlid, opa_lid; /* tmp vars to hold LID values */
-       u16 lid;
-       u8 ls_old, ls_new, ps_new;
-       u8 vls;
-       u8 msl;
-       u8 crc_enabled;
-       u16 lse, lwe, mtu;
-       u32 num_ports = OPA_AM_NPORT(am);
-       u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
-       int ret, i, invalid = 0, call_set_mtu = 0;
-       int call_link_downgrade_policy = 0;
-
-       if (num_ports != 1) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       opa_lid = be32_to_cpu(pi->lid);
-       if (opa_lid & 0xFFFF0000) {
-               pr_warn("OPA_PortInfo lid out of range: %X\n", opa_lid);
-               smp->status |= IB_SMP_INVALID_FIELD;
-               goto get_only;
-       }
-
-       lid = (u16)(opa_lid & 0x0000FFFF);
-
-       smlid = be32_to_cpu(pi->sm_lid);
-       if (smlid & 0xFFFF0000) {
-               pr_warn("OPA_PortInfo SM lid out of range: %X\n", smlid);
-               smp->status |= IB_SMP_INVALID_FIELD;
-               goto get_only;
-       }
-       smlid &= 0x0000FFFF;
-
-       clientrereg = (pi->clientrereg_subnettimeout &
-                       OPA_PI_MASK_CLIENT_REREGISTER);
-
-       dd = dd_from_ibdev(ibdev);
-       /* IB numbers ports from 1, hw from 0 */
-       ppd = dd->pport + (port - 1);
-       ibp = &ppd->ibport_data;
-       event.device = ibdev;
-       event.element.port_num = port;
-
-       ls_old = driver_lstate(ppd);
-
-       ibp->rvp.mkey = pi->mkey;
-       ibp->rvp.gid_prefix = pi->subnet_prefix;
-       ibp->rvp.mkey_lease_period = be16_to_cpu(pi->mkey_lease_period);
-
-       /* Must be a valid unicast LID address. */
-       if ((lid == 0 && ls_old > IB_PORT_INIT) ||
-           lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               pr_warn("SubnSet(OPA_PortInfo) lid invalid 0x%x\n",
-                       lid);
-       } else if (ppd->lid != lid ||
-                ppd->lmc != (pi->mkeyprotect_lmc & OPA_PI_MASK_LMC)) {
-               if (ppd->lid != lid)
-                       hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LID_CHANGE_BIT);
-               if (ppd->lmc != (pi->mkeyprotect_lmc & OPA_PI_MASK_LMC))
-                       hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LMC_CHANGE_BIT);
-               hfi1_set_lid(ppd, lid, pi->mkeyprotect_lmc & OPA_PI_MASK_LMC);
-               event.event = IB_EVENT_LID_CHANGE;
-               ib_dispatch_event(&event);
-       }
-
-       msl = pi->smsl & OPA_PI_MASK_SMSL;
-       if (pi->partenforce_filterraw & OPA_PI_MASK_LINKINIT_REASON)
-               ppd->linkinit_reason =
-                       (pi->partenforce_filterraw &
-                        OPA_PI_MASK_LINKINIT_REASON);
-       /* enable/disable SW pkey checking as per FM control */
-       if (pi->partenforce_filterraw & OPA_PI_MASK_PARTITION_ENFORCE_IN)
-               ppd->part_enforce |= HFI1_PART_ENFORCE_IN;
-       else
-               ppd->part_enforce &= ~HFI1_PART_ENFORCE_IN;
-
-       if (pi->partenforce_filterraw & OPA_PI_MASK_PARTITION_ENFORCE_OUT)
-               ppd->part_enforce |= HFI1_PART_ENFORCE_OUT;
-       else
-               ppd->part_enforce &= ~HFI1_PART_ENFORCE_OUT;
-
-       /* Must be a valid unicast LID address. */
-       if ((smlid == 0 && ls_old > IB_PORT_INIT) ||
-           smlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               pr_warn("SubnSet(OPA_PortInfo) smlid invalid 0x%x\n", smlid);
-       } else if (smlid != ibp->rvp.sm_lid || msl != ibp->rvp.sm_sl) {
-               pr_warn("SubnSet(OPA_PortInfo) smlid 0x%x\n", smlid);
-               spin_lock_irqsave(&ibp->rvp.lock, flags);
-               if (ibp->rvp.sm_ah) {
-                       if (smlid != ibp->rvp.sm_lid)
-                               ibp->rvp.sm_ah->attr.dlid = smlid;
-                       if (msl != ibp->rvp.sm_sl)
-                               ibp->rvp.sm_ah->attr.sl = msl;
-               }
-               spin_unlock_irqrestore(&ibp->rvp.lock, flags);
-               if (smlid != ibp->rvp.sm_lid)
-                       ibp->rvp.sm_lid = smlid;
-               if (msl != ibp->rvp.sm_sl)
-                       ibp->rvp.sm_sl = msl;
-               event.event = IB_EVENT_SM_CHANGE;
-               ib_dispatch_event(&event);
-       }
-
-       if (pi->link_down_reason == 0) {
-               ppd->local_link_down_reason.sma = 0;
-               ppd->local_link_down_reason.latest = 0;
-       }
-
-       if (pi->neigh_link_down_reason == 0) {
-               ppd->neigh_link_down_reason.sma = 0;
-               ppd->neigh_link_down_reason.latest = 0;
-       }
-
-       ppd->sm_trap_qp = be32_to_cpu(pi->sm_trap_qp);
-       ppd->sa_qp = be32_to_cpu(pi->sa_qp);
-
-       ppd->port_error_action = be32_to_cpu(pi->port_error_action);
-       lwe = be16_to_cpu(pi->link_width.enabled);
-       if (lwe) {
-               if (lwe == OPA_LINK_WIDTH_RESET ||
-                   lwe == OPA_LINK_WIDTH_RESET_OLD)
-                       set_link_width_enabled(ppd, ppd->link_width_supported);
-               else if ((lwe & ~ppd->link_width_supported) == 0)
-                       set_link_width_enabled(ppd, lwe);
-               else
-                       smp->status |= IB_SMP_INVALID_FIELD;
-       }
-       lwe = be16_to_cpu(pi->link_width_downgrade.enabled);
-       /* LWD.E is always applied - 0 means "disabled" */
-       if (lwe == OPA_LINK_WIDTH_RESET ||
-           lwe == OPA_LINK_WIDTH_RESET_OLD) {
-               set_link_width_downgrade_enabled(ppd,
-                               ppd->link_width_downgrade_supported);
-       } else if ((lwe & ~ppd->link_width_downgrade_supported) == 0) {
-               /* only set and apply if something changed */
-               if (lwe != ppd->link_width_downgrade_enabled) {
-                       set_link_width_downgrade_enabled(ppd, lwe);
-                       call_link_downgrade_policy = 1;
-               }
-       } else {
-               smp->status |= IB_SMP_INVALID_FIELD;
-       }
-       lse = be16_to_cpu(pi->link_speed.enabled);
-       if (lse) {
-               if (lse & be16_to_cpu(pi->link_speed.supported))
-                       set_link_speed_enabled(ppd, lse);
-               else
-                       smp->status |= IB_SMP_INVALID_FIELD;
-       }
-
-       ibp->rvp.mkeyprot =
-               (pi->mkeyprotect_lmc & OPA_PI_MASK_MKEY_PROT_BIT) >> 6;
-       ibp->rvp.vl_high_limit = be16_to_cpu(pi->vl.high_limit) & 0xFF;
-       (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_VL_HIGH_LIMIT,
-                                   ibp->rvp.vl_high_limit);
-
-       if (ppd->vls_supported / 2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) ||
-           ppd->vls_supported > ARRAY_SIZE(dd->vld)) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-       for (i = 0; i < ppd->vls_supported; i++) {
-               if ((i % 2) == 0)
-                       mtu = enum_to_mtu((pi->neigh_mtu.pvlx_to_mtu[i / 2] >>
-                                          4) & 0xF);
-               else
-                       mtu = enum_to_mtu(pi->neigh_mtu.pvlx_to_mtu[i / 2] &
-                                         0xF);
-               if (mtu == 0xffff) {
-                       pr_warn("SubnSet(OPA_PortInfo) mtu invalid %d (0x%x)\n",
-                               mtu,
-                               (pi->neigh_mtu.pvlx_to_mtu[0] >> 4) & 0xF);
-                       smp->status |= IB_SMP_INVALID_FIELD;
-                       mtu = hfi1_max_mtu; /* use a valid MTU */
-               }
-               if (dd->vld[i].mtu != mtu) {
-                       dd_dev_info(dd,
-                                   "MTU change on vl %d from %d to %d\n",
-                                   i, dd->vld[i].mtu, mtu);
-                       dd->vld[i].mtu = mtu;
-                       call_set_mtu++;
-               }
-       }
-       /* As per the OPAv1 spec: VL15 must be supported and must be
-        * configured to operate with an MTU of 2048 bytes or larger.
-        */
-       mtu = enum_to_mtu(pi->neigh_mtu.pvlx_to_mtu[15 / 2] & 0xF);
-       if (mtu < 2048 || mtu == 0xffff)
-               mtu = 2048;
-       if (dd->vld[15].mtu != mtu) {
-               dd_dev_info(dd,
-                           "MTU change on vl 15 from %d to %d\n",
-                           dd->vld[15].mtu, mtu);
-               dd->vld[15].mtu = mtu;
-               call_set_mtu++;
-       }
-       if (call_set_mtu)
-               set_mtu(ppd);
-
-       /* Set operational VLs */
-       vls = pi->operational_vls & OPA_PI_MASK_OPERATIONAL_VL;
-       if (vls) {
-               if (vls > ppd->vls_supported) {
-                       pr_warn("SubnSet(OPA_PortInfo) VL's supported invalid %d\n",
-                               pi->operational_vls);
-                       smp->status |= IB_SMP_INVALID_FIELD;
-               } else {
-                       if (hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_OP_VLS,
-                                           vls) == -EINVAL)
-                               smp->status |= IB_SMP_INVALID_FIELD;
-               }
-       }
-
-       if (pi->mkey_violations == 0)
-               ibp->rvp.mkey_violations = 0;
-
-       if (pi->pkey_violations == 0)
-               ibp->rvp.pkey_violations = 0;
-
-       if (pi->qkey_violations == 0)
-               ibp->rvp.qkey_violations = 0;
-
-       ibp->rvp.subnet_timeout =
-               pi->clientrereg_subnettimeout & OPA_PI_MASK_SUBNET_TIMEOUT;
-
-       crc_enabled = be16_to_cpu(pi->port_ltp_crc_mode);
-       crc_enabled >>= 4;
-       crc_enabled &= 0xf;
-
-       if (crc_enabled != 0)
-               ppd->port_crc_mode_enabled = port_ltp_to_cap(crc_enabled);
-
-       ppd->is_active_optimize_enabled =
-                       !!(be16_to_cpu(pi->port_mode)
-                                       & OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE);
-
-       ls_new = pi->port_states.portphysstate_portstate &
-                       OPA_PI_MASK_PORT_STATE;
-       ps_new = (pi->port_states.portphysstate_portstate &
-                       OPA_PI_MASK_PORT_PHYSICAL_STATE) >> 4;
-
-       if (ls_old == IB_PORT_INIT) {
-               if (start_of_sm_config) {
-                       if (ls_new == ls_old || (ls_new == IB_PORT_ARMED))
-                               ppd->is_sm_config_started = 1;
-               } else if (ls_new == IB_PORT_ARMED) {
-                       if (ppd->is_sm_config_started == 0)
-                               invalid = 1;
-               }
-       }
-
-       /* Handle CLIENT_REREGISTER event b/c SM asked us for it */
-       if (clientrereg) {
-               event.event = IB_EVENT_CLIENT_REREGISTER;
-               ib_dispatch_event(&event);
-       }
-
-       /*
-        * Do the port state change now that the other link parameters
-        * have been set.
-        * Changing the port physical state only makes sense if the link
-        * is down or is being set to down.
-        */
-
-       ret = set_port_states(ppd, smp, ls_new, ps_new, invalid);
-       if (ret)
-               return ret;
-
-       ret = __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len);
-
-       /* restore re-reg bit per o14-12.2.1 */
-       pi->clientrereg_subnettimeout |= clientrereg;
-
-       /*
-        * Apply the new link downgrade policy.  This may result in a link
-        * bounce.  Do this after everything else so things are settled.
-        * Possible problem: if setting the port state above fails, then
-        * the policy change is not applied.
-        */
-       if (call_link_downgrade_policy)
-               apply_link_downgrade_policy(ppd, 0);
-
-       return ret;
-
-get_only:
-       return __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len);
-}
-
-/**
- * set_pkeys - set the PKEY table for ctxt 0
- * @dd: the hfi1_ib device
- * @port: the IB port number
- * @pkeys: the PKEY table
- */
-static int set_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys)
-{
-       struct hfi1_pportdata *ppd;
-       int i;
-       int changed = 0;
-       int update_includes_mgmt_partition = 0;
-
-       /*
-        * IB port one/two always maps to context zero/one,
-        * always a kernel context, no locking needed
-        * If we get here with ppd setup, no need to check
-        * that rcd is valid.
-        */
-       ppd = dd->pport + (port - 1);
-       /*
-        * If the update does not include the management pkey, don't do it.
-        */
-       for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) {
-               if (pkeys[i] == LIM_MGMT_P_KEY) {
-                       update_includes_mgmt_partition = 1;
-                       break;
-               }
-       }
-
-       if (!update_includes_mgmt_partition)
-               return 1;
-
-       for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) {
-               u16 key = pkeys[i];
-               u16 okey = ppd->pkeys[i];
-
-               if (key == okey)
-                       continue;
-               /*
-                * The SM gives us the complete PKey table. We have
-                * to ensure that we put the PKeys in the matching
-                * slots.
-                */
-               ppd->pkeys[i] = key;
-               changed = 1;
-       }
-
-       if (changed) {
-               struct ib_event event;
-
-               (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0);
-
-               event.event = IB_EVENT_PKEY_CHANGE;
-               event.device = &dd->verbs_dev.rdi.ibdev;
-               event.element.port_num = port;
-               ib_dispatch_event(&event);
-       }
-       return 0;
-}
-
-static int __subn_set_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data,
-                                   struct ib_device *ibdev, u8 port,
-                                   u32 *resp_len)
-{
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       u32 n_blocks_sent = OPA_AM_NBLK(am);
-       u32 start_block = am & 0x7ff;
-       u16 *p = (u16 *)data;
-       __be16 *q = (__be16 *)data;
-       int i;
-       u16 n_blocks_avail;
-       unsigned npkeys = hfi1_get_npkeys(dd);
-
-       if (n_blocks_sent == 0) {
-               pr_warn("OPA Set PKey AM Invalid : P = %d; B = 0x%x; N = 0x%x\n",
-                       port, start_block, n_blocks_sent);
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       n_blocks_avail = (u16)(npkeys / OPA_PARTITION_TABLE_BLK_SIZE) + 1;
-
-       if (start_block + n_blocks_sent > n_blocks_avail ||
-           n_blocks_sent > OPA_NUM_PKEY_BLOCKS_PER_SMP) {
-               pr_warn("OPA Set PKey AM Invalid : s 0x%x; req 0x%x; avail 0x%x; blk/smp 0x%lx\n",
-                       start_block, n_blocks_sent, n_blocks_avail,
-                       OPA_NUM_PKEY_BLOCKS_PER_SMP);
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       for (i = 0; i < n_blocks_sent * OPA_PARTITION_TABLE_BLK_SIZE; i++)
-               p[i] = be16_to_cpu(q[i]);
-
-       if (start_block == 0 && set_pkeys(dd, port, p) != 0) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       return __subn_get_opa_pkeytable(smp, am, data, ibdev, port, resp_len);
-}
-
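-/*
- * Read the four SEND_SC2VLT CSRs (the SC -> VLt mapping) into the
- * caller's buffer.
- */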
-static int get_sc2vlt_tables(struct hfi1_devdata *dd, void *data)
-{
-       u64 *val = data;
-
-       *val++ = read_csr(dd, SEND_SC2VLT0);
-       *val++ = read_csr(dd, SEND_SC2VLT1);
-       *val++ = read_csr(dd, SEND_SC2VLT2);
-       *val++ = read_csr(dd, SEND_SC2VLT3);
-       return 0;
-}
-
-#define ILLEGAL_VL 12
-/*
- * filter_sc2vlt changes mappings to VL15 to ILLEGAL_VL (except
- * for SC15, which must map to VL15). If we don't remap things this
- * way it is possible for VL15 counters to increment when we try to
- * send on a SC which is mapped to an invalid VL.
- */
-static void filter_sc2vlt(void *data)
-{
-       int i;
-       u8 *pd = data;
-
-       for (i = 0; i < OPA_MAX_SCS; i++) {
-               if (i == 15)
-                       continue;
-               if ((pd[i] & 0x1f) == 0xf)
-                       pd[i] = ILLEGAL_VL;
-       }
-}
-
-static int set_sc2vlt_tables(struct hfi1_devdata *dd, void *data)
-{
-       u64 *val = data;
-
-       filter_sc2vlt(data);
-
-       write_csr(dd, SEND_SC2VLT0, *val++);
-       write_csr(dd, SEND_SC2VLT1, *val++);
-       write_csr(dd, SEND_SC2VLT2, *val++);
-       write_csr(dd, SEND_SC2VLT3, *val++);
-       write_seqlock_irq(&dd->sc2vl_lock);
-       memcpy(dd->sc2vl, data, sizeof(dd->sc2vl));
-       write_sequnlock_irq(&dd->sc2vl_lock);
-       return 0;
-}
-
-static int __subn_get_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data,
-                                  struct ib_device *ibdev, u8 port,
-                                  u32 *resp_len)
-{
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-       u8 *p = data;
-       size_t size = ARRAY_SIZE(ibp->sl_to_sc); /* == 32 */
-       unsigned i;
-
-       if (am) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       for (i = 0; i < ARRAY_SIZE(ibp->sl_to_sc); i++)
-               *p++ = ibp->sl_to_sc[i];
-
-       if (resp_len)
-               *resp_len += size;
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-static int __subn_set_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data,
-                                  struct ib_device *ibdev, u8 port,
-                                  u32 *resp_len)
-{
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-       u8 *p = data;
-       int i;
-       u8 sc;
-
-       if (am) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       for (i = 0; i <  ARRAY_SIZE(ibp->sl_to_sc); i++) {
-               sc = *p++;
-               if (ibp->sl_to_sc[i] != sc) {
-                       ibp->sl_to_sc[i] = sc;
-
-                       /* Put all stale qps into error state */
-                       hfi1_error_port_qps(ibp, i);
-               }
-       }
-
-       return __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port, resp_len);
-}
-
-static int __subn_get_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data,
-                                  struct ib_device *ibdev, u8 port,
-                                  u32 *resp_len)
-{
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-       u8 *p = data;
-       size_t size = ARRAY_SIZE(ibp->sc_to_sl); /* == 32 */
-       unsigned i;
-
-       if (am) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       for (i = 0; i < ARRAY_SIZE(ibp->sc_to_sl); i++)
-               *p++ = ibp->sc_to_sl[i];
-
-       if (resp_len)
-               *resp_len += size;
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-static int __subn_set_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data,
-                                  struct ib_device *ibdev, u8 port,
-                                  u32 *resp_len)
-{
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-       u8 *p = data;
-       int i;
-
-       if (am) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       for (i = 0; i < ARRAY_SIZE(ibp->sc_to_sl); i++)
-               ibp->sc_to_sl[i] = *p++;
-
-       return __subn_get_opa_sc_to_sl(smp, am, data, ibdev, port, resp_len);
-}
-
-static int __subn_get_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data,
-                                   struct ib_device *ibdev, u8 port,
-                                   u32 *resp_len)
-{
-       u32 n_blocks = OPA_AM_NBLK(am);
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       void *vp = (void *)data;
-       size_t size = 4 * sizeof(u64);
-
-       if (n_blocks != 1) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       get_sc2vlt_tables(dd, vp);
-
-       if (resp_len)
-               *resp_len += size;
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-static int __subn_set_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data,
-                                   struct ib_device *ibdev, u8 port,
-                                   u32 *resp_len)
-{
-       u32 n_blocks = OPA_AM_NBLK(am);
-       int async_update = OPA_AM_ASYNC(am);
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       void *vp = (void *)data;
-       struct hfi1_pportdata *ppd;
-       int lstate;
-
-       if (n_blocks != 1 || async_update) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       /* IB numbers ports from 1, hw from 0 */
-       ppd = dd->pport + (port - 1);
-       lstate = driver_lstate(ppd);
-       /*
-        * it's known that async_update is 0 by this point, but include
-        * the explicit check for clarity
-        */
-       if (!async_update &&
-           (lstate == IB_PORT_ARMED || lstate == IB_PORT_ACTIVE)) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       set_sc2vlt_tables(dd, vp);
-
-       return __subn_get_opa_sc_to_vlt(smp, am, data, ibdev, port, resp_len);
-}
-
-static int __subn_get_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data,
-                                    struct ib_device *ibdev, u8 port,
-                                    u32 *resp_len)
-{
-       u32 n_blocks = OPA_AM_NPORT(am);
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       struct hfi1_pportdata *ppd;
-       void *vp = (void *)data;
-       int size;
-
-       if (n_blocks != 1) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       ppd = dd->pport + (port - 1);
-
-       size = fm_get_table(ppd, FM_TBL_SC2VLNT, vp);
-
-       if (resp_len)
-               *resp_len += size;
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-static int __subn_set_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data,
-                                    struct ib_device *ibdev, u8 port,
-                                    u32 *resp_len)
-{
-       u32 n_blocks = OPA_AM_NPORT(am);
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       struct hfi1_pportdata *ppd;
-       void *vp = (void *)data;
-       int lstate;
-
-       if (n_blocks != 1) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       /* IB numbers ports from 1, hw from 0 */
-       ppd = dd->pport + (port - 1);
-       lstate = driver_lstate(ppd);
-       if (lstate == IB_PORT_ARMED || lstate == IB_PORT_ACTIVE) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       fm_set_table(ppd, FM_TBL_SC2VLNT, vp);
-
-       return __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port,
-                                        resp_len);
-}
-
-static int __subn_get_opa_psi(struct opa_smp *smp, u32 am, u8 *data,
-                             struct ib_device *ibdev, u8 port,
-                             u32 *resp_len)
-{
-       u32 nports = OPA_AM_NPORT(am);
-       u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
-       u32 lstate;
-       struct hfi1_ibport *ibp;
-       struct hfi1_pportdata *ppd;
-       struct opa_port_state_info *psi = (struct opa_port_state_info *)data;
-
-       if (nports != 1) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       ibp = to_iport(ibdev, port);
-       ppd = ppd_from_ibp(ibp);
-
-       lstate = driver_lstate(ppd);
-
-       if (start_of_sm_config && (lstate == IB_PORT_INIT))
-               ppd->is_sm_config_started = 1;
-
-#if PI_LED_ENABLE_SUP
-       psi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4;
-       psi->port_states.ledenable_offlinereason |=
-               ppd->is_sm_config_started << 5;
-       psi->port_states.ledenable_offlinereason |=
-               ppd->offline_disabled_reason;
-#else
-       psi->port_states.offline_reason = ppd->neighbor_normal << 4;
-       psi->port_states.offline_reason |= ppd->is_sm_config_started << 5;
-       psi->port_states.offline_reason |= ppd->offline_disabled_reason;
-#endif /* PI_LED_ENABLE_SUP */
-
-       psi->port_states.portphysstate_portstate =
-               (hfi1_ibphys_portstate(ppd) << 4) | (lstate & 0xf);
-       psi->link_width_downgrade_tx_active =
-               cpu_to_be16(ppd->link_width_downgrade_tx_active);
-       psi->link_width_downgrade_rx_active =
-               cpu_to_be16(ppd->link_width_downgrade_rx_active);
-       if (resp_len)
-               *resp_len += sizeof(struct opa_port_state_info);
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-static int __subn_set_opa_psi(struct opa_smp *smp, u32 am, u8 *data,
-                             struct ib_device *ibdev, u8 port,
-                             u32 *resp_len)
-{
-       u32 nports = OPA_AM_NPORT(am);
-       u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
-       u32 ls_old;
-       u8 ls_new, ps_new;
-       struct hfi1_ibport *ibp;
-       struct hfi1_pportdata *ppd;
-       struct opa_port_state_info *psi = (struct opa_port_state_info *)data;
-       int ret, invalid = 0;
-
-       if (nports != 1) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       ibp = to_iport(ibdev, port);
-       ppd = ppd_from_ibp(ibp);
-
-       ls_old = driver_lstate(ppd);
-
-       ls_new = port_states_to_logical_state(&psi->port_states);
-       ps_new = port_states_to_phys_state(&psi->port_states);
-
-       if (ls_old == IB_PORT_INIT) {
-               if (start_of_sm_config) {
-                       if (ls_new == ls_old || (ls_new == IB_PORT_ARMED))
-                               ppd->is_sm_config_started = 1;
-               } else if (ls_new == IB_PORT_ARMED) {
-                       if (ppd->is_sm_config_started == 0)
-                               invalid = 1;
-               }
-       }
-
-       ret = set_port_states(ppd, smp, ls_new, ps_new, invalid);
-       if (ret)
-               return ret;
-
-       if (invalid)
-               smp->status |= IB_SMP_INVALID_FIELD;
-
-       return __subn_get_opa_psi(smp, am, data, ibdev, port, resp_len);
-}
-
-static int __subn_get_opa_cable_info(struct opa_smp *smp, u32 am, u8 *data,
-                                    struct ib_device *ibdev, u8 port,
-                                    u32 *resp_len)
-{
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       u32 addr = OPA_AM_CI_ADDR(am);
-       u32 len = OPA_AM_CI_LEN(am) + 1;
-       int ret;
-
-#define __CI_PAGE_SIZE BIT(7) /* 128 bytes */
-#define __CI_PAGE_MASK ~(__CI_PAGE_SIZE - 1)
-#define __CI_PAGE_NUM(a) ((a) & __CI_PAGE_MASK)
-
-       /*
-        * check that addr is within spec, and
-        * addr and (addr + len - 1) are on the same "page"
-        */
-       if (addr >= 4096 ||
-           (__CI_PAGE_NUM(addr) != __CI_PAGE_NUM(addr + len - 1))) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       ret = get_cable_info(dd, port, addr, len, data);
-
-       if (ret == -ENODEV) {
-               smp->status |= IB_SMP_UNSUP_METH_ATTR;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       /* The address range for the CableInfo SMA query is wider than the
-        * memory available on the QSFP cable. We want to return a valid
-        * response, albeit zeroed out, for address ranges beyond available
-        * memory but that are within the CableInfo query spec
-        */
-       if (ret < 0 && ret != -ERANGE) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       if (resp_len)
-               *resp_len += len;
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-static int __subn_get_opa_bct(struct opa_smp *smp, u32 am, u8 *data,
-                             struct ib_device *ibdev, u8 port, u32 *resp_len)
-{
-       u32 num_ports = OPA_AM_NPORT(am);
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       struct hfi1_pportdata *ppd;
-       struct buffer_control *p = (struct buffer_control *)data;
-       int size;
-
-       if (num_ports != 1) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       ppd = dd->pport + (port - 1);
-       size = fm_get_table(ppd, FM_TBL_BUFFER_CONTROL, p);
-       trace_bct_get(dd, p);
-       if (resp_len)
-               *resp_len += size;
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-static int __subn_set_opa_bct(struct opa_smp *smp, u32 am, u8 *data,
-                             struct ib_device *ibdev, u8 port, u32 *resp_len)
-{
-       u32 num_ports = OPA_AM_NPORT(am);
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       struct hfi1_pportdata *ppd;
-       struct buffer_control *p = (struct buffer_control *)data;
-
-       if (num_ports != 1) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-       ppd = dd->pport + (port - 1);
-       trace_bct_set(dd, p);
-       if (fm_set_table(ppd, FM_TBL_BUFFER_CONTROL, p) < 0) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       return __subn_get_opa_bct(smp, am, data, ibdev, port, resp_len);
-}
-
-static int __subn_get_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data,
-                                struct ib_device *ibdev, u8 port,
-                                u32 *resp_len)
-{
-       struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
-       u32 num_ports = OPA_AM_NPORT(am);
-       u8 section = (am & 0x00ff0000) >> 16;
-       u8 *p = data;
-       int size = 0;
-
-       if (num_ports != 1) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       switch (section) {
-       case OPA_VLARB_LOW_ELEMENTS:
-               size = fm_get_table(ppd, FM_TBL_VL_LOW_ARB, p);
-               break;
-       case OPA_VLARB_HIGH_ELEMENTS:
-               size = fm_get_table(ppd, FM_TBL_VL_HIGH_ARB, p);
-               break;
-       case OPA_VLARB_PREEMPT_ELEMENTS:
-               size = fm_get_table(ppd, FM_TBL_VL_PREEMPT_ELEMS, p);
-               break;
-       case OPA_VLARB_PREEMPT_MATRIX:
-               size = fm_get_table(ppd, FM_TBL_VL_PREEMPT_MATRIX, p);
-               break;
-       default:
-               pr_warn("OPA SubnGet(VL Arb) AM Invalid : 0x%x\n",
-                       be32_to_cpu(smp->attr_mod));
-               smp->status |= IB_SMP_INVALID_FIELD;
-               break;
-       }
-
-       if (size > 0 && resp_len)
-               *resp_len += size;
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-static int __subn_set_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data,
-                                struct ib_device *ibdev, u8 port,
-                                u32 *resp_len)
-{
-       struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
-       u32 num_ports = OPA_AM_NPORT(am);
-       u8 section = (am & 0x00ff0000) >> 16;
-       u8 *p = data;
-
-       if (num_ports != 1) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       switch (section) {
-       case OPA_VLARB_LOW_ELEMENTS:
-               (void)fm_set_table(ppd, FM_TBL_VL_LOW_ARB, p);
-               break;
-       case OPA_VLARB_HIGH_ELEMENTS:
-               (void)fm_set_table(ppd, FM_TBL_VL_HIGH_ARB, p);
-               break;
-       /*
-        * neither OPA_VLARB_PREEMPT_ELEMENTS nor OPA_VLARB_PREEMPT_MATRIX
-        * can be changed from the default values
-        */
-       case OPA_VLARB_PREEMPT_ELEMENTS:
-               /* FALLTHROUGH */
-       case OPA_VLARB_PREEMPT_MATRIX:
-               smp->status |= IB_SMP_UNSUP_METH_ATTR;
-               break;
-       default:
-               pr_warn("OPA SubnSet(VL Arb) AM Invalid : 0x%x\n",
-                       be32_to_cpu(smp->attr_mod));
-               smp->status |= IB_SMP_INVALID_FIELD;
-               break;
-       }
-
-       return __subn_get_opa_vl_arb(smp, am, data, ibdev, port, resp_len);
-}
-
-struct opa_pma_mad {
-       struct ib_mad_hdr mad_hdr;
-       u8 data[2024];
-} __packed;
-
-struct opa_class_port_info {
-       u8 base_version;
-       u8 class_version;
-       __be16 cap_mask;
-       __be32 cap_mask2_resp_time;
-
-       u8 redirect_gid[16];
-       __be32 redirect_tc_fl;
-       __be32 redirect_lid;
-       __be32 redirect_sl_qp;
-       __be32 redirect_qkey;
-
-       u8 trap_gid[16];
-       __be32 trap_tc_fl;
-       __be32 trap_lid;
-       __be32 trap_hl_qp;
-       __be32 trap_qkey;
-
-       __be16 trap_pkey;
-       __be16 redirect_pkey;
-
-       u8 trap_sl_rsvd;
-       u8 reserved[3];
-} __packed;
-
-struct opa_port_status_req {
-       __u8 port_num;
-       __u8 reserved[3];
-       __be32 vl_select_mask;
-};
-
-#define VL_MASK_ALL            0x000080ff
-
-struct opa_port_status_rsp {
-       __u8 port_num;
-       __u8 reserved[3];
-       __be32  vl_select_mask;
-
-       /* Data counters */
-       __be64 port_xmit_data;
-       __be64 port_rcv_data;
-       __be64 port_xmit_pkts;
-       __be64 port_rcv_pkts;
-       __be64 port_multicast_xmit_pkts;
-       __be64 port_multicast_rcv_pkts;
-       __be64 port_xmit_wait;
-       __be64 sw_port_congestion;
-       __be64 port_rcv_fecn;
-       __be64 port_rcv_becn;
-       __be64 port_xmit_time_cong;
-       __be64 port_xmit_wasted_bw;
-       __be64 port_xmit_wait_data;
-       __be64 port_rcv_bubble;
-       __be64 port_mark_fecn;
-       /* Error counters */
-       __be64 port_rcv_constraint_errors;
-       __be64 port_rcv_switch_relay_errors;
-       __be64 port_xmit_discards;
-       __be64 port_xmit_constraint_errors;
-       __be64 port_rcv_remote_physical_errors;
-       __be64 local_link_integrity_errors;
-       __be64 port_rcv_errors;
-       __be64 excessive_buffer_overruns;
-       __be64 fm_config_errors;
-       __be32 link_error_recovery;
-       __be32 link_downed;
-       u8 uncorrectable_errors;
-
-       u8 link_quality_indicator; /* 5res, 3bit */
-       u8 res2[6];
-       struct _vls_pctrs {
-               /* per-VL Data counters */
-               __be64 port_vl_xmit_data;
-               __be64 port_vl_rcv_data;
-               __be64 port_vl_xmit_pkts;
-               __be64 port_vl_rcv_pkts;
-               __be64 port_vl_xmit_wait;
-               __be64 sw_port_vl_congestion;
-               __be64 port_vl_rcv_fecn;
-               __be64 port_vl_rcv_becn;
-               __be64 port_xmit_time_cong;
-               __be64 port_vl_xmit_wasted_bw;
-               __be64 port_vl_xmit_wait_data;
-               __be64 port_vl_rcv_bubble;
-               __be64 port_vl_mark_fecn;
-               __be64 port_vl_xmit_discards;
-       } vls[0]; /* real array size defined by # bits set in vl_select_mask */
-};
-
-enum counter_selects {
-       CS_PORT_XMIT_DATA                       = (1 << 31),
-       CS_PORT_RCV_DATA                        = (1 << 30),
-       CS_PORT_XMIT_PKTS                       = (1 << 29),
-       CS_PORT_RCV_PKTS                        = (1 << 28),
-       CS_PORT_MCAST_XMIT_PKTS                 = (1 << 27),
-       CS_PORT_MCAST_RCV_PKTS                  = (1 << 26),
-       CS_PORT_XMIT_WAIT                       = (1 << 25),
-       CS_SW_PORT_CONGESTION                   = (1 << 24),
-       CS_PORT_RCV_FECN                        = (1 << 23),
-       CS_PORT_RCV_BECN                        = (1 << 22),
-       CS_PORT_XMIT_TIME_CONG                  = (1 << 21),
-       CS_PORT_XMIT_WASTED_BW                  = (1 << 20),
-       CS_PORT_XMIT_WAIT_DATA                  = (1 << 19),
-       CS_PORT_RCV_BUBBLE                      = (1 << 18),
-       CS_PORT_MARK_FECN                       = (1 << 17),
-       CS_PORT_RCV_CONSTRAINT_ERRORS           = (1 << 16),
-       CS_PORT_RCV_SWITCH_RELAY_ERRORS         = (1 << 15),
-       CS_PORT_XMIT_DISCARDS                   = (1 << 14),
-       CS_PORT_XMIT_CONSTRAINT_ERRORS          = (1 << 13),
-       CS_PORT_RCV_REMOTE_PHYSICAL_ERRORS      = (1 << 12),
-       CS_LOCAL_LINK_INTEGRITY_ERRORS          = (1 << 11),
-       CS_PORT_RCV_ERRORS                      = (1 << 10),
-       CS_EXCESSIVE_BUFFER_OVERRUNS            = (1 << 9),
-       CS_FM_CONFIG_ERRORS                     = (1 << 8),
-       CS_LINK_ERROR_RECOVERY                  = (1 << 7),
-       CS_LINK_DOWNED                          = (1 << 6),
-       CS_UNCORRECTABLE_ERRORS                 = (1 << 5),
-};
-
-struct opa_clear_port_status {
-       __be64 port_select_mask[4];
-       __be32 counter_select_mask;
-};
-
-struct opa_aggregate {
-       __be16 attr_id;
-       __be16 err_reqlength;   /* 1 bit, 8 res, 7 bit */
-       __be32 attr_mod;
-       u8 data[0];
-};
-
-#define MSK_LLI 0x000000f0
-#define MSK_LLI_SFT 4
-#define MSK_LER 0x0000000f
-#define MSK_LER_SFT 0
-#define ADD_LLI 8
-#define ADD_LER 2
-
-/* Request contains first three fields, response contains those plus the rest */
-struct opa_port_data_counters_msg {
-       __be64 port_select_mask[4];
-       __be32 vl_select_mask;
-       __be32 resolution;
-
-       /* Response fields follow */
-       struct _port_dctrs {
-               u8 port_number;
-               u8 reserved2[3];
-               __be32 link_quality_indicator; /* 29res, 3bit */
-
-               /* Data counters */
-               __be64 port_xmit_data;
-               __be64 port_rcv_data;
-               __be64 port_xmit_pkts;
-               __be64 port_rcv_pkts;
-               __be64 port_multicast_xmit_pkts;
-               __be64 port_multicast_rcv_pkts;
-               __be64 port_xmit_wait;
-               __be64 sw_port_congestion;
-               __be64 port_rcv_fecn;
-               __be64 port_rcv_becn;
-               __be64 port_xmit_time_cong;
-               __be64 port_xmit_wasted_bw;
-               __be64 port_xmit_wait_data;
-               __be64 port_rcv_bubble;
-               __be64 port_mark_fecn;
-
-               __be64 port_error_counter_summary;
-               /* Sum of error counts/port */
-
-               struct _vls_dctrs {
-                       /* per-VL Data counters */
-                       __be64 port_vl_xmit_data;
-                       __be64 port_vl_rcv_data;
-                       __be64 port_vl_xmit_pkts;
-                       __be64 port_vl_rcv_pkts;
-                       __be64 port_vl_xmit_wait;
-                       __be64 sw_port_vl_congestion;
-                       __be64 port_vl_rcv_fecn;
-                       __be64 port_vl_rcv_becn;
-                       __be64 port_xmit_time_cong;
-                       __be64 port_vl_xmit_wasted_bw;
-                       __be64 port_vl_xmit_wait_data;
-                       __be64 port_vl_rcv_bubble;
-                       __be64 port_vl_mark_fecn;
-               } vls[0];
-               /* array size defined by #bits set in vl_select_mask */
-       } port[1]; /* array size defined by #ports in attribute modifier */
-};
-
-struct opa_port_error_counters64_msg {
-       /*
-        * Request contains first two fields, response contains the
-        * whole structure
-        */
-       __be64 port_select_mask[4];
-       __be32 vl_select_mask;
-
-       /* Response-only fields follow */
-       __be32 reserved1;
-       struct _port_ectrs {
-               u8 port_number;
-               u8 reserved2[7];
-               __be64 port_rcv_constraint_errors;
-               __be64 port_rcv_switch_relay_errors;
-               __be64 port_xmit_discards;
-               __be64 port_xmit_constraint_errors;
-               __be64 port_rcv_remote_physical_errors;
-               __be64 local_link_integrity_errors;
-               __be64 port_rcv_errors;
-               __be64 excessive_buffer_overruns;
-               __be64 fm_config_errors;
-               __be32 link_error_recovery;
-               __be32 link_downed;
-               u8 uncorrectable_errors;
-               u8 reserved3[7];
-               struct _vls_ectrs {
-                       __be64 port_vl_xmit_discards;
-               } vls[0];
-               /* array size defined by #bits set in vl_select_mask */
-       } port[1]; /* array size defined by #ports in attribute modifier */
-};
-
-struct opa_port_error_info_msg {
-       __be64 port_select_mask[4];
-       __be32 error_info_select_mask;
-       __be32 reserved1;
-       struct _port_ei {
-               u8 port_number;
-               u8 reserved2[7];
-
-               /* PortRcvErrorInfo */
-               struct {
-                       u8 status_and_code;
-                       union {
-                               u8 raw[17];
-                               struct {
-                                       /* EI1to12 format */
-                                       u8 packet_flit1[8];
-                                       u8 packet_flit2[8];
-                                       u8 remaining_flit_bits12;
-                               } ei1to12;
-                               struct {
-                                       u8 packet_bytes[8];
-                                       u8 remaining_flit_bits;
-                               } ei13;
-                       } ei;
-                       u8 reserved3[6];
-               } __packed port_rcv_ei;
-
-               /* ExcessiveBufferOverrunInfo */
-               struct {
-                       u8 status_and_sc;
-                       u8 reserved4[7];
-               } __packed excessive_buffer_overrun_ei;
-
-               /* PortXmitConstraintErrorInfo */
-               struct {
-                       u8 status;
-                       u8 reserved5;
-                       __be16 pkey;
-                       __be32 slid;
-               } __packed port_xmit_constraint_ei;
-
-               /* PortRcvConstraintErrorInfo */
-               struct {
-                       u8 status;
-                       u8 reserved6;
-                       __be16 pkey;
-                       __be32 slid;
-               } __packed port_rcv_constraint_ei;
-
-               /* PortRcvSwitchRelayErrorInfo */
-               struct {
-                       u8 status_and_code;
-                       u8 reserved7[3];
-                       __u32 error_info;
-               } __packed port_rcv_switch_relay_ei;
-
-               /* UncorrectableErrorInfo */
-               struct {
-                       u8 status_and_code;
-                       u8 reserved8;
-               } __packed uncorrectable_ei;
-
-               /* FMConfigErrorInfo */
-               struct {
-                       u8 status_and_code;
-                       u8 error_info;
-               } __packed fm_config_ei;
-               __u32 reserved9;
-       } port[1]; /* actual array size defined by #ports in attr modifier */
-};
-
-/* opa_port_error_info_msg error_info_select_mask bit definitions */
-enum error_info_selects {
-       ES_PORT_RCV_ERROR_INFO                  = (1 << 31),
-       ES_EXCESSIVE_BUFFER_OVERRUN_INFO        = (1 << 30),
-       ES_PORT_XMIT_CONSTRAINT_ERROR_INFO      = (1 << 29),
-       ES_PORT_RCV_CONSTRAINT_ERROR_INFO       = (1 << 28),
-       ES_PORT_RCV_SWITCH_RELAY_ERROR_INFO     = (1 << 27),
-       ES_UNCORRECTABLE_ERROR_INFO             = (1 << 26),
-       ES_FM_CONFIG_ERROR_INFO                 = (1 << 25)
-};
-
-static int pma_get_opa_classportinfo(struct opa_pma_mad *pmp,
-                                    struct ib_device *ibdev, u32 *resp_len)
-{
-       struct opa_class_port_info *p =
-               (struct opa_class_port_info *)pmp->data;
-
-       memset(pmp->data, 0, sizeof(pmp->data));
-
-       if (pmp->mad_hdr.attr_mod != 0)
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-
-       p->base_version = OPA_MGMT_BASE_VERSION;
-       p->class_version = OPA_SMI_CLASS_VERSION;
-       /*
-        * Expected response time is 4.096 usec * 2^18 == 1.073741824 sec.
-        */
-       p->cap_mask2_resp_time = cpu_to_be32(18);
-
-       if (resp_len)
-               *resp_len += sizeof(*p);
-
-       return reply((struct ib_mad_hdr *)pmp);
-}
-
-static void a0_portstatus(struct hfi1_pportdata *ppd,
-                         struct opa_port_status_rsp *rsp, u32 vl_select_mask)
-{
-       if (!is_bx(ppd->dd)) {
-               unsigned long vl;
-               u64 sum_vl_xmit_wait = 0;
-               u32 vl_all_mask = VL_MASK_ALL;
-
-               for_each_set_bit(vl, (unsigned long *)&(vl_all_mask),
-                                8 * sizeof(vl_all_mask)) {
-                       u64 tmp = sum_vl_xmit_wait +
-                                 read_port_cntr(ppd, C_TX_WAIT_VL,
-                                                idx_from_vl(vl));
-                       if (tmp < sum_vl_xmit_wait) {
-                               /* we wrapped */
-                               sum_vl_xmit_wait = (u64)~0;
-                               break;
-                       }
-                       sum_vl_xmit_wait = tmp;
-               }
-               if (be64_to_cpu(rsp->port_xmit_wait) > sum_vl_xmit_wait)
-                       rsp->port_xmit_wait = cpu_to_be64(sum_vl_xmit_wait);
-       }
-}
-
-static int pma_get_opa_portstatus(struct opa_pma_mad *pmp,
-                                 struct ib_device *ibdev,
-                                 u8 port, u32 *resp_len)
-{
-       struct opa_port_status_req *req =
-               (struct opa_port_status_req *)pmp->data;
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       struct opa_port_status_rsp *rsp;
-       u32 vl_select_mask = be32_to_cpu(req->vl_select_mask);
-       unsigned long vl;
-       size_t response_data_size;
-       u32 nports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
-       u8 port_num = req->port_num;
-       u8 num_vls = hweight32(vl_select_mask);
-       struct _vls_pctrs *vlinfo;
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       int vfi;
-       u64 tmp, tmp2;
-
-       response_data_size = sizeof(struct opa_port_status_rsp) +
-                               num_vls * sizeof(struct _vls_pctrs);
-       if (response_data_size > sizeof(pmp->data)) {
-               pmp->mad_hdr.status |= OPA_PM_STATUS_REQUEST_TOO_LARGE;
-               return reply((struct ib_mad_hdr *)pmp);
-       }
-
-       if (nports != 1 || (port_num && port_num != port) ||
-           num_vls > OPA_MAX_VLS || (vl_select_mask & ~VL_MASK_ALL)) {
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)pmp);
-       }
-
-       memset(pmp->data, 0, sizeof(pmp->data));
-
-       rsp = (struct opa_port_status_rsp *)pmp->data;
-       if (port_num)
-               rsp->port_num = port_num;
-       else
-               rsp->port_num = port;
-
-       rsp->port_rcv_constraint_errors =
-               cpu_to_be64(read_port_cntr(ppd, C_SW_RCV_CSTR_ERR,
-                                          CNTR_INVALID_VL));
-
-       hfi1_read_link_quality(dd, &rsp->link_quality_indicator);
-
-       rsp->vl_select_mask = cpu_to_be32(vl_select_mask);
-       rsp->port_xmit_data = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_FLITS,
-                                         CNTR_INVALID_VL));
-       rsp->port_rcv_data = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FLITS,
-                                        CNTR_INVALID_VL));
-       rsp->port_xmit_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_PKTS,
-                                         CNTR_INVALID_VL));
-       rsp->port_rcv_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_PKTS,
-                                        CNTR_INVALID_VL));
-       rsp->port_multicast_xmit_pkts =
-               cpu_to_be64(read_dev_cntr(dd, C_DC_MC_XMIT_PKTS,
-                                         CNTR_INVALID_VL));
-       rsp->port_multicast_rcv_pkts =
-               cpu_to_be64(read_dev_cntr(dd, C_DC_MC_RCV_PKTS,
-                                         CNTR_INVALID_VL));
-       rsp->port_xmit_wait =
-               cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL));
-       rsp->port_rcv_fecn =
-               cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL));
-       rsp->port_rcv_becn =
-               cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL));
-       rsp->port_xmit_discards =
-               cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD,
-                                          CNTR_INVALID_VL));
-       rsp->port_xmit_constraint_errors =
-               cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR,
-                                          CNTR_INVALID_VL));
-       rsp->port_rcv_remote_physical_errors =
-               cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
-                                         CNTR_INVALID_VL));
-       tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL);
-       tmp2 = tmp + read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL);
-       if (tmp2 < tmp) {
-               /* overflow/wrapped */
-               rsp->local_link_integrity_errors = cpu_to_be64(~0);
-       } else {
-               rsp->local_link_integrity_errors = cpu_to_be64(tmp2);
-       }
-       tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL);
-       tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
-                                  CNTR_INVALID_VL);
-       if (tmp2 > (u32)UINT_MAX || tmp2 < tmp) {
-               /* overflow/wrapped */
-               rsp->link_error_recovery = cpu_to_be32(~0);
-       } else {
-               rsp->link_error_recovery = cpu_to_be32(tmp2);
-       }
-       rsp->port_rcv_errors =
-               cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL));
-       rsp->excessive_buffer_overruns =
-               cpu_to_be64(read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL));
-       rsp->fm_config_errors =
-               cpu_to_be64(read_dev_cntr(dd, C_DC_FM_CFG_ERR,
-                                         CNTR_INVALID_VL));
-       rsp->link_downed = cpu_to_be32(read_port_cntr(ppd, C_SW_LINK_DOWN,
-                                                     CNTR_INVALID_VL));
-
-       /* rsp->uncorrectable_errors is 8 bits wide, and it pegs at 0xff */
-       tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL);
-       rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff;
-
-       vlinfo = &rsp->vls[0];
-       vfi = 0;
-       /* The vl_select_mask has been checked above, and we know
-        * that it contains only entries which represent valid VLs.
-        * So in the for_each_set_bit() loop below, we don't need
-        * any additional checks for vl.
-        */
-       for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
-                        8 * sizeof(vl_select_mask)) {
-               memset(vlinfo, 0, sizeof(*vlinfo));
-
-               tmp = read_dev_cntr(dd, C_DC_RX_FLIT_VL, idx_from_vl(vl));
-               rsp->vls[vfi].port_vl_rcv_data = cpu_to_be64(tmp);
-
-               rsp->vls[vfi].port_vl_rcv_pkts =
-                       cpu_to_be64(read_dev_cntr(dd, C_DC_RX_PKT_VL,
-                                                 idx_from_vl(vl)));
-
-               rsp->vls[vfi].port_vl_xmit_data =
-                       cpu_to_be64(read_port_cntr(ppd, C_TX_FLIT_VL,
-                                                  idx_from_vl(vl)));
-
-               rsp->vls[vfi].port_vl_xmit_pkts =
-                       cpu_to_be64(read_port_cntr(ppd, C_TX_PKT_VL,
-                                                  idx_from_vl(vl)));
-
-               rsp->vls[vfi].port_vl_xmit_wait =
-                       cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT_VL,
-                                                  idx_from_vl(vl)));
-
-               rsp->vls[vfi].port_vl_rcv_fecn =
-                       cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN_VL,
-                                                 idx_from_vl(vl)));
-
-               rsp->vls[vfi].port_vl_rcv_becn =
-                       cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL,
-                                                 idx_from_vl(vl)));
-
-               vlinfo++;
-               vfi++;
-       }
-
-       a0_portstatus(ppd, rsp, vl_select_mask);
-
-       if (resp_len)
-               *resp_len += response_data_size;
-
-       return reply((struct ib_mad_hdr *)pmp);
-}
-
-static u64 get_error_counter_summary(struct ib_device *ibdev, u8 port,
-                                    u8 res_lli, u8 res_ler)
-{
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       u64 error_counter_summary = 0, tmp;
-
-       error_counter_summary += read_port_cntr(ppd, C_SW_RCV_CSTR_ERR,
-                                               CNTR_INVALID_VL);
-       /* port_rcv_switch_relay_errors is 0 for HFIs */
-       error_counter_summary += read_port_cntr(ppd, C_SW_XMIT_DSCD,
-                                               CNTR_INVALID_VL);
-       error_counter_summary += read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR,
-                                               CNTR_INVALID_VL);
-       error_counter_summary += read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
-                                              CNTR_INVALID_VL);
-       /* local link integrity must be right-shifted by the lli resolution */
-       tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL);
-       tmp += read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL);
-       error_counter_summary += (tmp >> res_lli);
-       /* link error recovery must be right-shifted by the ler resolution */
-       tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL);
-       tmp += read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, CNTR_INVALID_VL);
-       error_counter_summary += (tmp >> res_ler);
-       error_counter_summary += read_dev_cntr(dd, C_DC_RCV_ERR,
-                                              CNTR_INVALID_VL);
-       error_counter_summary += read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL);
-       error_counter_summary += read_dev_cntr(dd, C_DC_FM_CFG_ERR,
-                                              CNTR_INVALID_VL);
-       /* ppd->link_downed is a 32-bit value */
-       error_counter_summary += read_port_cntr(ppd, C_SW_LINK_DOWN,
-                                               CNTR_INVALID_VL);
-       tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL);
-       /* this is an 8-bit quantity */
-       error_counter_summary += tmp < 0x100 ? (tmp & 0xff) : 0xff;
-
-       return error_counter_summary;
-}
-
-static void a0_datacounters(struct hfi1_pportdata *ppd, struct _port_dctrs *rsp,
-                           u32 vl_select_mask)
-{
-       if (!is_bx(ppd->dd)) {
-               unsigned long vl;
-               u64 sum_vl_xmit_wait = 0;
-               u32 vl_all_mask = VL_MASK_ALL;
-
-               for_each_set_bit(vl, (unsigned long *)&(vl_all_mask),
-                                8 * sizeof(vl_all_mask)) {
-                       u64 tmp = sum_vl_xmit_wait +
-                                 read_port_cntr(ppd, C_TX_WAIT_VL,
-                                                idx_from_vl(vl));
-                       if (tmp < sum_vl_xmit_wait) {
-                               /* we wrapped */
-                               sum_vl_xmit_wait = (u64)~0;
-                               break;
-                       }
-                       sum_vl_xmit_wait = tmp;
-               }
-               if (be64_to_cpu(rsp->port_xmit_wait) > sum_vl_xmit_wait)
-                       rsp->port_xmit_wait = cpu_to_be64(sum_vl_xmit_wait);
-       }
-}
-
-static void pma_get_opa_port_dctrs(struct ib_device *ibdev,
-                                  struct _port_dctrs *rsp)
-{
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-
-       rsp->port_xmit_data = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_FLITS,
-                                               CNTR_INVALID_VL));
-       rsp->port_rcv_data = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FLITS,
-                                               CNTR_INVALID_VL));
-       rsp->port_xmit_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_PKTS,
-                                               CNTR_INVALID_VL));
-       rsp->port_rcv_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_PKTS,
-                                               CNTR_INVALID_VL));
-       rsp->port_multicast_xmit_pkts =
-               cpu_to_be64(read_dev_cntr(dd, C_DC_MC_XMIT_PKTS,
-                                         CNTR_INVALID_VL));
-       rsp->port_multicast_rcv_pkts =
-               cpu_to_be64(read_dev_cntr(dd, C_DC_MC_RCV_PKTS,
-                                         CNTR_INVALID_VL));
-}
-
-static int pma_get_opa_datacounters(struct opa_pma_mad *pmp,
-                                   struct ib_device *ibdev,
-                                   u8 port, u32 *resp_len)
-{
-       struct opa_port_data_counters_msg *req =
-               (struct opa_port_data_counters_msg *)pmp->data;
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       struct _port_dctrs *rsp;
-       struct _vls_dctrs *vlinfo;
-       size_t response_data_size;
-       u32 num_ports;
-       u8 num_pslm;
-       u8 lq, num_vls;
-       u8 res_lli, res_ler;
-       u64 port_mask;
-       unsigned long port_num;
-       unsigned long vl;
-       u32 vl_select_mask;
-       int vfi;
-
-       num_ports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
-       num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
-       num_vls = hweight32(be32_to_cpu(req->vl_select_mask));
-       vl_select_mask = be32_to_cpu(req->vl_select_mask);
-       res_lli = (u8)(be32_to_cpu(req->resolution) & MSK_LLI) >> MSK_LLI_SFT;
-       res_lli = res_lli ? res_lli + ADD_LLI : 0;
-       res_ler = (u8)(be32_to_cpu(req->resolution) & MSK_LER) >> MSK_LER_SFT;
-       res_ler = res_ler ? res_ler + ADD_LER : 0;
-
-       if (num_ports != 1 || (vl_select_mask & ~VL_MASK_ALL)) {
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)pmp);
-       }
-
-       /* Sanity check */
-       response_data_size = sizeof(struct opa_port_data_counters_msg) +
-                               num_vls * sizeof(struct _vls_dctrs);
-
-       if (response_data_size > sizeof(pmp->data)) {
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)pmp);
-       }
-
-       /*
-        * The bit set in the mask needs to be consistent with the
-        * port the request came in on.
-        */
-       port_mask = be64_to_cpu(req->port_select_mask[3]);
-       port_num = find_first_bit((unsigned long *)&port_mask,
-                                 sizeof(port_mask));
-
-       if ((u8)port_num != port) {
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)pmp);
-       }
-
-       rsp = &req->port[0];
-       memset(rsp, 0, sizeof(*rsp));
-
-       rsp->port_number = port;
-       /*
-        * Note that link_quality_indicator is a 32 bit quantity in
-        * 'datacounters' queries (as opposed to 'portinfo' queries,
-        * where it's a byte).
-        */
-       hfi1_read_link_quality(dd, &lq);
-       rsp->link_quality_indicator = cpu_to_be32((u32)lq);
-       pma_get_opa_port_dctrs(ibdev, rsp);
-
-       rsp->port_xmit_wait =
-               cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL));
-       rsp->port_rcv_fecn =
-               cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL));
-       rsp->port_rcv_becn =
-               cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL));
-       rsp->port_error_counter_summary =
-               cpu_to_be64(get_error_counter_summary(ibdev, port,
-                                                     res_lli, res_ler));
-
-       vlinfo = &rsp->vls[0];
-       vfi = 0;
-       /* The vl_select_mask has been checked above, and we know
-        * that it contains only entries which represent valid VLs.
-        * So in the for_each_set_bit() loop below, we don't need
-        * any additional checks for vl.
-        */
-       for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
-                        8 * sizeof(req->vl_select_mask)) {
-               memset(vlinfo, 0, sizeof(*vlinfo));
-
-               rsp->vls[vfi].port_vl_xmit_data =
-                       cpu_to_be64(read_port_cntr(ppd, C_TX_FLIT_VL,
-                                                  idx_from_vl(vl)));
-
-               rsp->vls[vfi].port_vl_rcv_data =
-                       cpu_to_be64(read_dev_cntr(dd, C_DC_RX_FLIT_VL,
-                                                 idx_from_vl(vl)));
-
-               rsp->vls[vfi].port_vl_xmit_pkts =
-                       cpu_to_be64(read_port_cntr(ppd, C_TX_PKT_VL,
-                                                  idx_from_vl(vl)));
-
-               rsp->vls[vfi].port_vl_rcv_pkts =
-                       cpu_to_be64(read_dev_cntr(dd, C_DC_RX_PKT_VL,
-                                                 idx_from_vl(vl)));
-
-               rsp->vls[vfi].port_vl_xmit_wait =
-                       cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT_VL,
-                                                  idx_from_vl(vl)));
-
-               rsp->vls[vfi].port_vl_rcv_fecn =
-                       cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN_VL,
-                                                 idx_from_vl(vl)));
-               rsp->vls[vfi].port_vl_rcv_becn =
-                       cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL,
-                                                 idx_from_vl(vl)));
-
-               /* rsp->port_vl_xmit_time_cong is 0 for HFIs */
-               /* rsp->port_vl_xmit_wasted_bw ??? */
-               /* port_vl_xmit_wait_data - TXE (table 13-9 HFI spec) ???
-                * does this differ from rsp->vls[vfi].port_vl_xmit_wait
-                */
-               /*rsp->vls[vfi].port_vl_mark_fecn =
-                *      cpu_to_be64(read_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT
-                *              + offset));
-                */
-               vlinfo++;
-               vfi++;
-       }
-
-       a0_datacounters(ppd, rsp, vl_select_mask);
-
-       if (resp_len)
-               *resp_len += response_data_size;
-
-       return reply((struct ib_mad_hdr *)pmp);
-}
-
-static int pma_get_ib_portcounters_ext(struct ib_pma_mad *pmp,
-                                      struct ib_device *ibdev, u8 port)
-{
-       struct ib_pma_portcounters_ext *p = (struct ib_pma_portcounters_ext *)
-                                               pmp->data;
-       struct _port_dctrs rsp;
-
-       if (pmp->mad_hdr.attr_mod != 0 || p->port_select != port) {
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-               goto bail;
-       }
-
-       memset(&rsp, 0, sizeof(rsp));
-       pma_get_opa_port_dctrs(ibdev, &rsp);
-
-       p->port_xmit_data = rsp.port_xmit_data;
-       p->port_rcv_data = rsp.port_rcv_data;
-       p->port_xmit_packets = rsp.port_xmit_pkts;
-       p->port_rcv_packets = rsp.port_rcv_pkts;
-       p->port_unicast_xmit_packets = 0;
-       p->port_unicast_rcv_packets =  0;
-       p->port_multicast_xmit_packets = rsp.port_multicast_xmit_pkts;
-       p->port_multicast_rcv_packets = rsp.port_multicast_rcv_pkts;
-
-bail:
-       return reply((struct ib_mad_hdr *)pmp);
-}
-
-static void pma_get_opa_port_ectrs(struct ib_device *ibdev,
-                                  struct _port_ectrs *rsp, u8 port)
-{
-       u64 tmp, tmp2;
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-
-       tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL);
-       tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
-                                       CNTR_INVALID_VL);
-       if (tmp2 > (u32)UINT_MAX || tmp2 < tmp) {
-               /* overflow/wrapped */
-               rsp->link_error_recovery = cpu_to_be32(~0);
-       } else {
-               rsp->link_error_recovery = cpu_to_be32(tmp2);
-       }
-
-       rsp->link_downed = cpu_to_be32(read_port_cntr(ppd, C_SW_LINK_DOWN,
-                                               CNTR_INVALID_VL));
-       rsp->port_rcv_errors =
-               cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL));
-       rsp->port_rcv_remote_physical_errors =
-               cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
-                                         CNTR_INVALID_VL));
-       rsp->port_rcv_switch_relay_errors = 0;
-       rsp->port_xmit_discards =
-               cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD,
-                                          CNTR_INVALID_VL));
-       rsp->port_xmit_constraint_errors =
-               cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR,
-                                          CNTR_INVALID_VL));
-       rsp->port_rcv_constraint_errors =
-               cpu_to_be64(read_port_cntr(ppd, C_SW_RCV_CSTR_ERR,
-                                          CNTR_INVALID_VL));
-       tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL);
-       tmp2 = tmp + read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL);
-       if (tmp2 < tmp) {
-               /* overflow/wrapped */
-               rsp->local_link_integrity_errors = cpu_to_be64(~0);
-       } else {
-               rsp->local_link_integrity_errors = cpu_to_be64(tmp2);
-       }
-       rsp->excessive_buffer_overruns =
-               cpu_to_be64(read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL));
-}
-
-static int pma_get_opa_porterrors(struct opa_pma_mad *pmp,
-                                 struct ib_device *ibdev,
-                                 u8 port, u32 *resp_len)
-{
-       size_t response_data_size;
-       struct _port_ectrs *rsp;
-       u8 port_num;
-       struct opa_port_error_counters64_msg *req;
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       u32 num_ports;
-       u8 num_pslm;
-       u8 num_vls;
-       struct hfi1_ibport *ibp;
-       struct hfi1_pportdata *ppd;
-       struct _vls_ectrs *vlinfo;
-       unsigned long vl;
-       u64 port_mask, tmp;
-       u32 vl_select_mask;
-       int vfi;
-
-       req = (struct opa_port_error_counters64_msg *)pmp->data;
-
-       num_ports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
-
-       num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
-       num_vls = hweight32(be32_to_cpu(req->vl_select_mask));
-
-       if (num_ports != 1 || num_ports != num_pslm) {
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)pmp);
-       }
-
-       response_data_size = sizeof(struct opa_port_error_counters64_msg) +
-                               num_vls * sizeof(struct _vls_ectrs);
-
-       if (response_data_size > sizeof(pmp->data)) {
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)pmp);
-       }
-       /*
-        * The bit set in the mask needs to be consistent with the
-        * port the request came in on.
-        */
-       port_mask = be64_to_cpu(req->port_select_mask[3]);
-       port_num = find_first_bit((unsigned long *)&port_mask,
-                                 sizeof(port_mask));
-
-       if (port_num != port) {
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)pmp);
-       }
-
-       rsp = &req->port[0];
-
-       ibp = to_iport(ibdev, port_num);
-       ppd = ppd_from_ibp(ibp);
-
-       memset(rsp, 0, sizeof(*rsp));
-       rsp->port_number = port_num;
-
-       pma_get_opa_port_ectrs(ibdev, rsp, port_num);
-
-       rsp->port_rcv_remote_physical_errors =
-               cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
-                                         CNTR_INVALID_VL));
-       rsp->fm_config_errors =
-               cpu_to_be64(read_dev_cntr(dd, C_DC_FM_CFG_ERR,
-                                         CNTR_INVALID_VL));
-       tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL);
-
-       rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff;
-
-       vlinfo = &rsp->vls[0];
-       vfi = 0;
-       vl_select_mask = be32_to_cpu(req->vl_select_mask);
-       for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
-                        8 * sizeof(req->vl_select_mask)) {
-               memset(vlinfo, 0, sizeof(*vlinfo));
-               /* vlinfo->vls[vfi].port_vl_xmit_discards ??? */
-               vlinfo += 1;
-               vfi++;
-       }
-
-       if (resp_len)
-               *resp_len += response_data_size;
-
-       return reply((struct ib_mad_hdr *)pmp);
-}
-
-static int pma_get_ib_portcounters(struct ib_pma_mad *pmp,
-                                  struct ib_device *ibdev, u8 port)
-{
-       struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
-               pmp->data;
-       struct _port_ectrs rsp;
-       u64 temp_link_overrun_errors;
-       u64 temp_64;
-       u32 temp_32;
-
-       memset(&rsp, 0, sizeof(rsp));
-       pma_get_opa_port_ectrs(ibdev, &rsp, port);
-
-       if (pmp->mad_hdr.attr_mod != 0 || p->port_select != port) {
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-               goto bail;
-       }
-
-       p->symbol_error_counter = 0; /* N/A for OPA */
-
-       temp_32 = be32_to_cpu(rsp.link_error_recovery);
-       if (temp_32 > 0xFFUL)
-               p->link_error_recovery_counter = 0xFF;
-       else
-               p->link_error_recovery_counter = (u8)temp_32;
-
-       temp_32 = be32_to_cpu(rsp.link_downed);
-       if (temp_32 > 0xFFUL)
-               p->link_downed_counter = 0xFF;
-       else
-               p->link_downed_counter = (u8)temp_32;
-
-       temp_64 = be64_to_cpu(rsp.port_rcv_errors);
-       if (temp_64 > 0xFFFFUL)
-               p->port_rcv_errors = cpu_to_be16(0xFFFF);
-       else
-               p->port_rcv_errors = cpu_to_be16((u16)temp_64);
-
-       temp_64 = be64_to_cpu(rsp.port_rcv_remote_physical_errors);
-       if (temp_64 > 0xFFFFUL)
-               p->port_rcv_remphys_errors = cpu_to_be16(0xFFFF);
-       else
-               p->port_rcv_remphys_errors = cpu_to_be16((u16)temp_64);
-
-       temp_64 = be64_to_cpu(rsp.port_rcv_switch_relay_errors);
-       p->port_rcv_switch_relay_errors = cpu_to_be16((u16)temp_64);
-
-       temp_64 = be64_to_cpu(rsp.port_xmit_discards);
-       if (temp_64 > 0xFFFFUL)
-               p->port_xmit_discards = cpu_to_be16(0xFFFF);
-       else
-               p->port_xmit_discards = cpu_to_be16((u16)temp_64);
-
-       temp_64 = be64_to_cpu(rsp.port_xmit_constraint_errors);
-       if (temp_64 > 0xFFUL)
-               p->port_xmit_constraint_errors = 0xFF;
-       else
-               p->port_xmit_constraint_errors = (u8)temp_64;
-
-       temp_64 = be64_to_cpu(rsp.port_rcv_constraint_errors);
-       if (temp_64 > 0xFFUL)
-               p->port_rcv_constraint_errors = 0xFFUL;
-       else
-               p->port_rcv_constraint_errors = (u8)temp_64;
-
-       /* LocalLink: 7:4, BufferOverrun: 3:0 */
-       temp_64 = be64_to_cpu(rsp.local_link_integrity_errors);
-       if (temp_64 > 0xFUL)
-               temp_64 = 0xFUL;
-
-       temp_link_overrun_errors = temp_64 << 4;
-
-       temp_64 = be64_to_cpu(rsp.excessive_buffer_overruns);
-       if (temp_64 > 0xFUL)
-               temp_64 = 0xFUL;
-       temp_link_overrun_errors |= temp_64;
-
-       p->link_overrun_errors = (u8)temp_link_overrun_errors;
-
-       p->vl15_dropped = 0; /* N/A for OPA */
-
-bail:
-       return reply((struct ib_mad_hdr *)pmp);
-}
-
-static int pma_get_opa_errorinfo(struct opa_pma_mad *pmp,
-                                struct ib_device *ibdev,
-                                u8 port, u32 *resp_len)
-{
-       size_t response_data_size;
-       struct _port_ei *rsp;
-       struct opa_port_error_info_msg *req;
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       u64 port_mask;
-       u32 num_ports;
-       u8 port_num;
-       u8 num_pslm;
-       u64 reg;
-
-       req = (struct opa_port_error_info_msg *)pmp->data;
-       rsp = &req->port[0];
-
-       num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod));
-       num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
-
-       memset(rsp, 0, sizeof(*rsp));
-
-       if (num_ports != 1 || num_ports != num_pslm) {
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)pmp);
-       }
-
-       /* Sanity check */
-       response_data_size = sizeof(struct opa_port_error_info_msg);
-
-       if (response_data_size > sizeof(pmp->data)) {
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)pmp);
-       }
-
-       /*
-        * The bit set in the mask needs to be consistent with the port
-        * the request came in on.
-        */
-       port_mask = be64_to_cpu(req->port_select_mask[3]);
-       port_num = find_first_bit((unsigned long *)&port_mask,
-                                 sizeof(port_mask));
-
-       if (port_num != port) {
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)pmp);
-       }
-
-       /* PortRcvErrorInfo */
-       rsp->port_rcv_ei.status_and_code =
-               dd->err_info_rcvport.status_and_code;
-       memcpy(&rsp->port_rcv_ei.ei.ei1to12.packet_flit1,
-              &dd->err_info_rcvport.packet_flit1, sizeof(u64));
-       memcpy(&rsp->port_rcv_ei.ei.ei1to12.packet_flit2,
-              &dd->err_info_rcvport.packet_flit2, sizeof(u64));
-
-       /* ExcessiveBufferOverrunInfo */
-       reg = read_csr(dd, RCV_ERR_INFO);
-       if (reg & RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK) {
-               /*
-                * if the RcvExcessBufferOverrun bit is set, save SC of
-                * first pkt that encountered an excess buffer overrun
-                */
-               u8 tmp = (u8)reg;
-
-               tmp &=  RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SC_SMASK;
-               tmp <<= 2;
-               rsp->excessive_buffer_overrun_ei.status_and_sc = tmp;
-               /* set the status bit */
-               rsp->excessive_buffer_overrun_ei.status_and_sc |= 0x80;
-       }
-
-       rsp->port_xmit_constraint_ei.status =
-               dd->err_info_xmit_constraint.status;
-       rsp->port_xmit_constraint_ei.pkey =
-               cpu_to_be16(dd->err_info_xmit_constraint.pkey);
-       rsp->port_xmit_constraint_ei.slid =
-               cpu_to_be32(dd->err_info_xmit_constraint.slid);
-
-       rsp->port_rcv_constraint_ei.status =
-               dd->err_info_rcv_constraint.status;
-       rsp->port_rcv_constraint_ei.pkey =
-               cpu_to_be16(dd->err_info_rcv_constraint.pkey);
-       rsp->port_rcv_constraint_ei.slid =
-               cpu_to_be32(dd->err_info_rcv_constraint.slid);
-
-       /* UncorrectableErrorInfo */
-       rsp->uncorrectable_ei.status_and_code = dd->err_info_uncorrectable;
-
-       /* FMConfigErrorInfo */
-       rsp->fm_config_ei.status_and_code = dd->err_info_fmconfig;
-
-       if (resp_len)
-               *resp_len += response_data_size;
-
-       return reply((struct ib_mad_hdr *)pmp);
-}
-
-static int pma_set_opa_portstatus(struct opa_pma_mad *pmp,
-                                 struct ib_device *ibdev,
-                                 u8 port, u32 *resp_len)
-{
-       struct opa_clear_port_status *req =
-               (struct opa_clear_port_status *)pmp->data;
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       u32 nports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
-       u64 portn = be64_to_cpu(req->port_select_mask[3]);
-       u32 counter_select = be32_to_cpu(req->counter_select_mask);
-       u32 vl_select_mask = VL_MASK_ALL; /* clear all per-vl cnts */
-       unsigned long vl;
-
-       if ((nports != 1) || (portn != 1 << port)) {
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)pmp);
-       }
-       /*
-        * only counters returned by pma_get_opa_portstatus() are
-        * handled, so when pma_get_opa_portstatus() gets a fix,
-        * the corresponding change should be made here as well.
-        */
-
-       if (counter_select & CS_PORT_XMIT_DATA)
-               write_dev_cntr(dd, C_DC_XMIT_FLITS, CNTR_INVALID_VL, 0);
-
-       if (counter_select & CS_PORT_RCV_DATA)
-               write_dev_cntr(dd, C_DC_RCV_FLITS, CNTR_INVALID_VL, 0);
-
-       if (counter_select & CS_PORT_XMIT_PKTS)
-               write_dev_cntr(dd, C_DC_XMIT_PKTS, CNTR_INVALID_VL, 0);
-
-       if (counter_select & CS_PORT_RCV_PKTS)
-               write_dev_cntr(dd, C_DC_RCV_PKTS, CNTR_INVALID_VL, 0);
-
-       if (counter_select & CS_PORT_MCAST_XMIT_PKTS)
-               write_dev_cntr(dd, C_DC_MC_XMIT_PKTS, CNTR_INVALID_VL, 0);
-
-       if (counter_select & CS_PORT_MCAST_RCV_PKTS)
-               write_dev_cntr(dd, C_DC_MC_RCV_PKTS, CNTR_INVALID_VL, 0);
-
-       if (counter_select & CS_PORT_XMIT_WAIT)
-               write_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL, 0);
-
-       /* ignore cs_sw_portCongestion for HFIs */
-
-       if (counter_select & CS_PORT_RCV_FECN)
-               write_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL, 0);
-
-       if (counter_select & CS_PORT_RCV_BECN)
-               write_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL, 0);
-
-       /* ignore cs_port_xmit_time_cong for HFIs */
-       /* ignore cs_port_xmit_wasted_bw for now */
-       /* ignore cs_port_xmit_wait_data for now */
-       if (counter_select & CS_PORT_RCV_BUBBLE)
-               write_dev_cntr(dd, C_DC_RCV_BBL, CNTR_INVALID_VL, 0);
-
-       /* Only applicable for switch */
-       /* if (counter_select & CS_PORT_MARK_FECN)
-        *      write_csr(dd, DCC_PRF_PORT_MARK_FECN_CNT, 0);
-        */
-
-       if (counter_select & CS_PORT_RCV_CONSTRAINT_ERRORS)
-               write_port_cntr(ppd, C_SW_RCV_CSTR_ERR, CNTR_INVALID_VL, 0);
-
-       /* ignore cs_port_rcv_switch_relay_errors for HFIs */
-       if (counter_select & CS_PORT_XMIT_DISCARDS)
-               write_port_cntr(ppd, C_SW_XMIT_DSCD, CNTR_INVALID_VL, 0);
-
-       if (counter_select & CS_PORT_XMIT_CONSTRAINT_ERRORS)
-               write_port_cntr(ppd, C_SW_XMIT_CSTR_ERR, CNTR_INVALID_VL, 0);
-
-       if (counter_select & CS_PORT_RCV_REMOTE_PHYSICAL_ERRORS)
-               write_dev_cntr(dd, C_DC_RMT_PHY_ERR, CNTR_INVALID_VL, 0);
-
-       if (counter_select & CS_LOCAL_LINK_INTEGRITY_ERRORS) {
-               write_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL, 0);
-               write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0);
-       }
-
-       if (counter_select & CS_LINK_ERROR_RECOVERY) {
-               write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0);
-               write_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
-                              CNTR_INVALID_VL, 0);
-       }
-
-       if (counter_select & CS_PORT_RCV_ERRORS)
-               write_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL, 0);
-
-       if (counter_select & CS_EXCESSIVE_BUFFER_OVERRUNS) {
-               write_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL, 0);
-               dd->rcv_ovfl_cnt = 0;
-       }
-
-       if (counter_select & CS_FM_CONFIG_ERRORS)
-               write_dev_cntr(dd, C_DC_FM_CFG_ERR, CNTR_INVALID_VL, 0);
-
-       if (counter_select & CS_LINK_DOWNED)
-               write_port_cntr(ppd, C_SW_LINK_DOWN, CNTR_INVALID_VL, 0);
-
-       if (counter_select & CS_UNCORRECTABLE_ERRORS)
-               write_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL, 0);
-
-       for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
-                        8 * sizeof(vl_select_mask)) {
-               if (counter_select & CS_PORT_XMIT_DATA)
-                       write_port_cntr(ppd, C_TX_FLIT_VL, idx_from_vl(vl), 0);
-
-               if (counter_select & CS_PORT_RCV_DATA)
-                       write_dev_cntr(dd, C_DC_RX_FLIT_VL, idx_from_vl(vl), 0);
-
-               if (counter_select & CS_PORT_XMIT_PKTS)
-                       write_port_cntr(ppd, C_TX_PKT_VL, idx_from_vl(vl), 0);
-
-               if (counter_select & CS_PORT_RCV_PKTS)
-                       write_dev_cntr(dd, C_DC_RX_PKT_VL, idx_from_vl(vl), 0);
-
-               if (counter_select & CS_PORT_XMIT_WAIT)
-                       write_port_cntr(ppd, C_TX_WAIT_VL, idx_from_vl(vl), 0);
-
-               /* sw_port_vl_congestion is 0 for HFIs */
-               if (counter_select & CS_PORT_RCV_FECN)
-                       write_dev_cntr(dd, C_DC_RCV_FCN_VL, idx_from_vl(vl), 0);
-
-               if (counter_select & CS_PORT_RCV_BECN)
-                       write_dev_cntr(dd, C_DC_RCV_BCN_VL, idx_from_vl(vl), 0);
-
-               /* port_vl_xmit_time_cong is 0 for HFIs */
-               /* port_vl_xmit_wasted_bw ??? */
-               /* port_vl_xmit_wait_data - TXE (table 13-9 HFI spec) ??? */
-               if (counter_select & CS_PORT_RCV_BUBBLE)
-                       write_dev_cntr(dd, C_DC_RCV_BBL_VL, idx_from_vl(vl), 0);
-
-               /* if (counter_select & CS_PORT_MARK_FECN)
-                *     write_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT + offset, 0);
-                */
-               /* port_vl_xmit_discards ??? */
-       }
-
-       if (resp_len)
-               *resp_len += sizeof(*req);
-
-       return reply((struct ib_mad_hdr *)pmp);
-}
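
/*
 * The per-VL clears in the function above walk vl_select_mask with
 * for_each_set_bit(), touching only the virtual lanes the request named.
 * A stand-alone illustration of that bit-walk, using a plain loop in
 * place of the kernel macro (values and names here are illustrative
 * only, not part of the driver):
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t vl_select_mask = 0x00008005;   /* VLs 0, 2 and 15 selected */
        unsigned int vl;

        for (vl = 0; vl < 32; vl++)
                if (vl_select_mask & (1u << vl))
                        printf("clear per-VL counters for VL %u\n", vl);

        return 0;
}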
-
-static int pma_set_opa_errorinfo(struct opa_pma_mad *pmp,
-                                struct ib_device *ibdev,
-                                u8 port, u32 *resp_len)
-{
-       struct _port_ei *rsp;
-       struct opa_port_error_info_msg *req;
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       u64 port_mask;
-       u32 num_ports;
-       u8 port_num;
-       u8 num_pslm;
-       u32 error_info_select;
-
-       req = (struct opa_port_error_info_msg *)pmp->data;
-       rsp = &req->port[0];
-
-       num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod));
-       num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
-
-       memset(rsp, 0, sizeof(*rsp));
-
-       if (num_ports != 1 || num_ports != num_pslm) {
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)pmp);
-       }
-
-       /*
-        * The bit set in the mask needs to be consistent with the port
-        * the request came in on.
-        */
-       port_mask = be64_to_cpu(req->port_select_mask[3]);
-       port_num = find_first_bit((unsigned long *)&port_mask,
-                                 sizeof(port_mask));
-
-       if (port_num != port) {
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)pmp);
-       }
-
-       error_info_select = be32_to_cpu(req->error_info_select_mask);
-
-       /* PortRcvErrorInfo */
-       if (error_info_select & ES_PORT_RCV_ERROR_INFO)
-               /* turn off status bit */
-               dd->err_info_rcvport.status_and_code &= ~OPA_EI_STATUS_SMASK;
-
-       /* ExcessiveBufferOverrunInfo */
-       if (error_info_select & ES_EXCESSIVE_BUFFER_OVERRUN_INFO)
-               /*
-                * status bit is essentially kept in the h/w - bit 5 of
-                * RCV_ERR_INFO
-                */
-               write_csr(dd, RCV_ERR_INFO,
-                         RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK);
-
-       if (error_info_select & ES_PORT_XMIT_CONSTRAINT_ERROR_INFO)
-               dd->err_info_xmit_constraint.status &= ~OPA_EI_STATUS_SMASK;
-
-       if (error_info_select & ES_PORT_RCV_CONSTRAINT_ERROR_INFO)
-               dd->err_info_rcv_constraint.status &= ~OPA_EI_STATUS_SMASK;
-
-       /* UncorrectableErrorInfo */
-       if (error_info_select & ES_UNCORRECTABLE_ERROR_INFO)
-               /* turn off status bit */
-               dd->err_info_uncorrectable &= ~OPA_EI_STATUS_SMASK;
-
-       /* FMConfigErrorInfo */
-       if (error_info_select & ES_FM_CONFIG_ERROR_INFO)
-               /* turn off status bit */
-               dd->err_info_fmconfig &= ~OPA_EI_STATUS_SMASK;
-
-       if (resp_len)
-               *resp_len += sizeof(*req);
-
-       return reply((struct ib_mad_hdr *)pmp);
-}
-
-struct opa_congestion_info_attr {
-       __be16 congestion_info;
-       u8 control_table_cap;   /* Multiple of 64 entry unit CCTs */
-       u8 congestion_log_length;
-} __packed;
-
-static int __subn_get_opa_cong_info(struct opa_smp *smp, u32 am, u8 *data,
-                                   struct ib_device *ibdev, u8 port,
-                                   u32 *resp_len)
-{
-       struct opa_congestion_info_attr *p =
-               (struct opa_congestion_info_attr *)data;
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-
-       p->congestion_info = 0;
-       p->control_table_cap = ppd->cc_max_table_entries;
-       p->congestion_log_length = OPA_CONG_LOG_ELEMS;
-
-       if (resp_len)
-               *resp_len += sizeof(*p);
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-static int __subn_get_opa_cong_setting(struct opa_smp *smp, u32 am,
-                                      u8 *data, struct ib_device *ibdev,
-                                      u8 port, u32 *resp_len)
-{
-       int i;
-       struct opa_congestion_setting_attr *p =
-               (struct opa_congestion_setting_attr *)data;
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       struct opa_congestion_setting_entry_shadow *entries;
-       struct cc_state *cc_state;
-
-       rcu_read_lock();
-
-       cc_state = get_cc_state(ppd);
-
-       if (!cc_state) {
-               rcu_read_unlock();
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       entries = cc_state->cong_setting.entries;
-       p->port_control = cpu_to_be16(cc_state->cong_setting.port_control);
-       p->control_map = cpu_to_be32(cc_state->cong_setting.control_map);
-       for (i = 0; i < OPA_MAX_SLS; i++) {
-               p->entries[i].ccti_increase = entries[i].ccti_increase;
-               p->entries[i].ccti_timer = cpu_to_be16(entries[i].ccti_timer);
-               p->entries[i].trigger_threshold =
-                       entries[i].trigger_threshold;
-               p->entries[i].ccti_min = entries[i].ccti_min;
-       }
-
-       rcu_read_unlock();
-
-       if (resp_len)
-               *resp_len += sizeof(*p);
-
-       return reply((struct ib_mad_hdr *)smp);
-}
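
/*
 * The congestion-setting reader above follows the usual RCU read-side
 * pattern: enter the read-side critical section, dereference the
 * protected pointer (get_cc_state() here), copy out what is needed, then
 * leave the critical section.  A minimal sketch of that pattern with
 * generic names rather than the driver's own types (assumes
 * <linux/rcupdate.h> and <linux/types.h>):
 */
struct shared_cfg {
        struct rcu_head rcu;
        u32 value;
};

static u32 read_cfg_value(struct shared_cfg __rcu **slot)
{
        struct shared_cfg *cfg;
        u32 v = 0;

        rcu_read_lock();
        cfg = rcu_dereference(*slot);   /* may be NULL while unconfigured */
        if (cfg)
                v = cfg->value;         /* copy while still inside the read section */
        rcu_read_unlock();

        return v;
}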
-
-static int __subn_set_opa_cong_setting(struct opa_smp *smp, u32 am, u8 *data,
-                                      struct ib_device *ibdev, u8 port,
-                                      u32 *resp_len)
-{
-       struct opa_congestion_setting_attr *p =
-               (struct opa_congestion_setting_attr *)data;
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       struct opa_congestion_setting_entry_shadow *entries;
-       int i;
-
-       ppd->cc_sl_control_map = be32_to_cpu(p->control_map);
-
-       entries = ppd->congestion_entries;
-       for (i = 0; i < OPA_MAX_SLS; i++) {
-               entries[i].ccti_increase = p->entries[i].ccti_increase;
-               entries[i].ccti_timer = be16_to_cpu(p->entries[i].ccti_timer);
-               entries[i].trigger_threshold =
-                       p->entries[i].trigger_threshold;
-               entries[i].ccti_min = p->entries[i].ccti_min;
-       }
-
-       return __subn_get_opa_cong_setting(smp, am, data, ibdev, port,
-                                          resp_len);
-}
-
-static int __subn_get_opa_hfi1_cong_log(struct opa_smp *smp, u32 am,
-                                       u8 *data, struct ib_device *ibdev,
-                                       u8 port, u32 *resp_len)
-{
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       struct opa_hfi1_cong_log *cong_log = (struct opa_hfi1_cong_log *)data;
-       s64 ts;
-       int i;
-
-       if (am != 0) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       spin_lock_irq(&ppd->cc_log_lock);
-
-       cong_log->log_type = OPA_CC_LOG_TYPE_HFI;
-       cong_log->congestion_flags = 0;
-       cong_log->threshold_event_counter =
-               cpu_to_be16(ppd->threshold_event_counter);
-       memcpy(cong_log->threshold_cong_event_map,
-              ppd->threshold_cong_event_map,
-              sizeof(cong_log->threshold_cong_event_map));
-       /* keep timestamp in units of 1.024 usec */
-       ts = ktime_to_ns(ktime_get()) / 1024;
-       cong_log->current_time_stamp = cpu_to_be32(ts);
-       for (i = 0; i < OPA_CONG_LOG_ELEMS; i++) {
-               struct opa_hfi1_cong_log_event_internal *cce =
-                       &ppd->cc_events[ppd->cc_mad_idx++];
-               if (ppd->cc_mad_idx == OPA_CONG_LOG_ELEMS)
-                       ppd->cc_mad_idx = 0;
-               /*
-                * Entries which are older than twice the time
-                * required to wrap the counter are supposed to
-                * be zeroed (CA10-49 IBTA, release 1.2.1, V1).
-                */
-               if ((u64)(ts - cce->timestamp) > (2 * UINT_MAX))
-                       continue;
-               memcpy(cong_log->events[i].local_qp_cn_entry, &cce->lqpn, 3);
-               memcpy(cong_log->events[i].remote_qp_number_cn_entry,
-                      &cce->rqpn, 3);
-               cong_log->events[i].sl_svc_type_cn_entry =
-                       ((cce->sl & 0x1f) << 3) | (cce->svc_type & 0x7);
-               cong_log->events[i].remote_lid_cn_entry =
-                       cpu_to_be32(cce->rlid);
-               cong_log->events[i].timestamp_cn_entry =
-                       cpu_to_be32(cce->timestamp);
-       }
-
-       /*
-        * Reset threshold_cong_event_map and threshold_event_counter
-        * to 0 when the log is read.
-        */
-       memset(ppd->threshold_cong_event_map, 0x0,
-              sizeof(ppd->threshold_cong_event_map));
-       ppd->threshold_event_counter = 0;
-
-       spin_unlock_irq(&ppd->cc_log_lock);
-
-       if (resp_len)
-               *resp_len += sizeof(struct opa_hfi1_cong_log);
-
-       return reply((struct ib_mad_hdr *)smp);
-}
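
/*
 * Worked numbers for the congestion-log timestamps above: ktime is in
 * nanoseconds, so dividing by 1024 yields ticks of 1.024 us.  The 32-bit
 * timestamp_cn_entry field therefore wraps after roughly
 * UINT_MAX * 1.024 us ~= 4398 s (about 73 minutes), and the
 * "2 * UINT_MAX" comparison in the loop discards entries older than
 * twice that window, per the cited CA10-49 requirement.
 */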
-
-static int __subn_get_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data,
-                                  struct ib_device *ibdev, u8 port,
-                                  u32 *resp_len)
-{
-       struct ib_cc_table_attr *cc_table_attr =
-               (struct ib_cc_table_attr *)data;
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       u32 start_block = OPA_AM_START_BLK(am);
-       u32 n_blocks = OPA_AM_NBLK(am);
-       struct ib_cc_table_entry_shadow *entries;
-       int i, j;
-       u32 sentry, eentry;
-       struct cc_state *cc_state;
-
-       /* sanity check n_blocks, start_block */
-       if (n_blocks == 0 ||
-           start_block + n_blocks > ppd->cc_max_table_entries) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       rcu_read_lock();
-
-       cc_state = get_cc_state(ppd);
-
-       if (!cc_state) {
-               rcu_read_unlock();
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       sentry = start_block * IB_CCT_ENTRIES;
-       eentry = sentry + (IB_CCT_ENTRIES * n_blocks);
-
-       cc_table_attr->ccti_limit = cpu_to_be16(cc_state->cct.ccti_limit);
-
-       entries = cc_state->cct.entries;
-
-       /* return n_blocks, though the last block may not be full */
-       for (j = 0, i = sentry; i < eentry; j++, i++)
-               cc_table_attr->ccti_entries[j].entry =
-                       cpu_to_be16(entries[i].entry);
-
-       rcu_read_unlock();
-
-       if (resp_len)
-               *resp_len += sizeof(u16) * (IB_CCT_ENTRIES * n_blocks + 1);
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-void cc_state_reclaim(struct rcu_head *rcu)
-{
-       struct cc_state *cc_state = container_of(rcu, struct cc_state, rcu);
-
-       kfree(cc_state);
-}
-
-static int __subn_set_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data,
-                                  struct ib_device *ibdev, u8 port,
-                                  u32 *resp_len)
-{
-       struct ib_cc_table_attr *p = (struct ib_cc_table_attr *)data;
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       u32 start_block = OPA_AM_START_BLK(am);
-       u32 n_blocks = OPA_AM_NBLK(am);
-       struct ib_cc_table_entry_shadow *entries;
-       int i, j;
-       u32 sentry, eentry;
-       u16 ccti_limit;
-       struct cc_state *old_cc_state, *new_cc_state;
-
-       /* sanity check n_blocks, start_block */
-       if (n_blocks == 0 ||
-           start_block + n_blocks > ppd->cc_max_table_entries) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       sentry = start_block * IB_CCT_ENTRIES;
-       eentry = sentry + ((n_blocks - 1) * IB_CCT_ENTRIES) +
-                (be16_to_cpu(p->ccti_limit)) % IB_CCT_ENTRIES + 1;
-
-       /* sanity check ccti_limit */
-       ccti_limit = be16_to_cpu(p->ccti_limit);
-       if (ccti_limit + 1 > eentry) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       new_cc_state = kzalloc(sizeof(*new_cc_state), GFP_KERNEL);
-       if (!new_cc_state)
-               goto getit;
-
-       spin_lock(&ppd->cc_state_lock);
-
-       old_cc_state = get_cc_state(ppd);
-
-       if (!old_cc_state) {
-               spin_unlock(&ppd->cc_state_lock);
-               kfree(new_cc_state);
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       *new_cc_state = *old_cc_state;
-
-       new_cc_state->cct.ccti_limit = ccti_limit;
-
-       entries = ppd->ccti_entries;
-       ppd->total_cct_entry = ccti_limit + 1;
-
-       for (j = 0, i = sentry; i < eentry; j++, i++)
-               entries[i].entry = be16_to_cpu(p->ccti_entries[j].entry);
-
-       memcpy(new_cc_state->cct.entries, entries,
-              eentry * sizeof(struct ib_cc_table_entry));
-
-       new_cc_state->cong_setting.port_control = IB_CC_CCS_PC_SL_BASED;
-       new_cc_state->cong_setting.control_map = ppd->cc_sl_control_map;
-       memcpy(new_cc_state->cong_setting.entries, ppd->congestion_entries,
-              OPA_MAX_SLS * sizeof(struct opa_congestion_setting_entry));
-
-       rcu_assign_pointer(ppd->cc_state, new_cc_state);
-
-       spin_unlock(&ppd->cc_state_lock);
-
-       call_rcu(&old_cc_state->rcu, cc_state_reclaim);
-
-getit:
-       return __subn_get_opa_cc_table(smp, am, data, ibdev, port, resp_len);
-}
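
/*
 * The table update above is the matching RCU write side: build the new
 * cc_state off to the side, publish it with rcu_assign_pointer() while
 * holding the writer lock, then hand the old copy to call_rcu() so it is
 * freed only after a grace period (cc_state_reclaim() above).  A minimal
 * sketch of that sequence, reusing the struct shared_cfg from the reader
 * sketch earlier (assumes <linux/slab.h>, <linux/spinlock.h> and
 * <linux/rcupdate.h>; names are illustrative only):
 */
static void cfg_reclaim(struct rcu_head *rcu)
{
        kfree(container_of(rcu, struct shared_cfg, rcu));
}

static int update_cfg_value(struct shared_cfg __rcu **slot,
                            spinlock_t *lock, u32 new_value)
{
        struct shared_cfg *old, *new;

        new = kzalloc(sizeof(*new), GFP_KERNEL);
        if (!new)
                return -ENOMEM;

        spin_lock(lock);
        old = rcu_dereference_protected(*slot, lockdep_is_held(lock));
        if (old)
                *new = *old;                    /* start from the current state */
        new->value = new_value;
        rcu_assign_pointer(*slot, new);         /* visible to new readers from here */
        spin_unlock(lock);

        if (old)
                call_rcu(&old->rcu, cfg_reclaim);       /* freed after a grace period */
        return 0;
}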
-
-struct opa_led_info {
-       __be32 rsvd_led_mask;
-       __be32 rsvd;
-};
-
-#define OPA_LED_SHIFT  31
-#define OPA_LED_MASK   BIT(OPA_LED_SHIFT)
-
-static int __subn_get_opa_led_info(struct opa_smp *smp, u32 am, u8 *data,
-                                  struct ib_device *ibdev, u8 port,
-                                  u32 *resp_len)
-{
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       struct hfi1_pportdata *ppd = dd->pport;
-       struct opa_led_info *p = (struct opa_led_info *)data;
-       u32 nport = OPA_AM_NPORT(am);
-       u32 is_beaconing_active;
-
-       if (nport != 1) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       /*
-        * This pairs with the memory barrier in hfi1_start_led_override to
-        * ensure that we read the correct state of LED beaconing represented
-        * by led_override_timer_active
-        */
-       smp_rmb();
-       is_beaconing_active = !!atomic_read(&ppd->led_override_timer_active);
-       p->rsvd_led_mask = cpu_to_be32(is_beaconing_active << OPA_LED_SHIFT);
-
-       if (resp_len)
-               *resp_len += sizeof(struct opa_led_info);
-
-       return reply((struct ib_mad_hdr *)smp);
-}
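
/*
 * The smp_rmb() above pairs with a barrier on the producer side (inside
 * hfi1_start_led_override(), which is outside this hunk) so that the
 * beaconing flag is read consistently with the state it guards.  The
 * generic flag/payload publication pattern such pairings implement looks
 * like this (illustrative names only; assumes <linux/atomic.h> and
 * <asm/barrier.h>):
 */
static atomic_t beacon_active = ATOMIC_INIT(0);
static u32 beacon_period;

static void producer_start(u32 period)
{
        beacon_period = period;         /* publish the payload first */
        smp_wmb();                      /* order the payload before the flag */
        atomic_set(&beacon_active, 1);
}

static int consumer_read(u32 *period)
{
        int active = atomic_read(&beacon_active);

        smp_rmb();                      /* order the flag read before the payload read */
        *period = beacon_period;
        return active;
}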
-
-static int __subn_set_opa_led_info(struct opa_smp *smp, u32 am, u8 *data,
-                                  struct ib_device *ibdev, u8 port,
-                                  u32 *resp_len)
-{
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       struct opa_led_info *p = (struct opa_led_info *)data;
-       u32 nport = OPA_AM_NPORT(am);
-       int on = !!(be32_to_cpu(p->rsvd_led_mask) & OPA_LED_MASK);
-
-       if (nport != 1) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       if (on)
-               hfi1_start_led_override(dd->pport, 2000, 1500);
-       else
-               shutdown_led_override(dd->pport);
-
-       return __subn_get_opa_led_info(smp, am, data, ibdev, port, resp_len);
-}
-
-static int subn_get_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am,
-                           u8 *data, struct ib_device *ibdev, u8 port,
-                           u32 *resp_len)
-{
-       int ret;
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-
-       switch (attr_id) {
-       case IB_SMP_ATTR_NODE_DESC:
-               ret = __subn_get_opa_nodedesc(smp, am, data, ibdev, port,
-                                             resp_len);
-               break;
-       case IB_SMP_ATTR_NODE_INFO:
-               ret = __subn_get_opa_nodeinfo(smp, am, data, ibdev, port,
-                                             resp_len);
-               break;
-       case IB_SMP_ATTR_PORT_INFO:
-               ret = __subn_get_opa_portinfo(smp, am, data, ibdev, port,
-                                             resp_len);
-               break;
-       case IB_SMP_ATTR_PKEY_TABLE:
-               ret = __subn_get_opa_pkeytable(smp, am, data, ibdev, port,
-                                              resp_len);
-               break;
-       case OPA_ATTRIB_ID_SL_TO_SC_MAP:
-               ret = __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port,
-                                             resp_len);
-               break;
-       case OPA_ATTRIB_ID_SC_TO_SL_MAP:
-               ret = __subn_get_opa_sc_to_sl(smp, am, data, ibdev, port,
-                                             resp_len);
-               break;
-       case OPA_ATTRIB_ID_SC_TO_VLT_MAP:
-               ret = __subn_get_opa_sc_to_vlt(smp, am, data, ibdev, port,
-                                              resp_len);
-               break;
-       case OPA_ATTRIB_ID_SC_TO_VLNT_MAP:
-               ret = __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port,
-                                               resp_len);
-               break;
-       case OPA_ATTRIB_ID_PORT_STATE_INFO:
-               ret = __subn_get_opa_psi(smp, am, data, ibdev, port,
-                                        resp_len);
-               break;
-       case OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE:
-               ret = __subn_get_opa_bct(smp, am, data, ibdev, port,
-                                        resp_len);
-               break;
-       case OPA_ATTRIB_ID_CABLE_INFO:
-               ret = __subn_get_opa_cable_info(smp, am, data, ibdev, port,
-                                               resp_len);
-               break;
-       case IB_SMP_ATTR_VL_ARB_TABLE:
-               ret = __subn_get_opa_vl_arb(smp, am, data, ibdev, port,
-                                           resp_len);
-               break;
-       case OPA_ATTRIB_ID_CONGESTION_INFO:
-               ret = __subn_get_opa_cong_info(smp, am, data, ibdev, port,
-                                              resp_len);
-               break;
-       case OPA_ATTRIB_ID_HFI_CONGESTION_SETTING:
-               ret = __subn_get_opa_cong_setting(smp, am, data, ibdev,
-                                                 port, resp_len);
-               break;
-       case OPA_ATTRIB_ID_HFI_CONGESTION_LOG:
-               ret = __subn_get_opa_hfi1_cong_log(smp, am, data, ibdev,
-                                                  port, resp_len);
-               break;
-       case OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE:
-               ret = __subn_get_opa_cc_table(smp, am, data, ibdev, port,
-                                             resp_len);
-               break;
-       case IB_SMP_ATTR_LED_INFO:
-               ret = __subn_get_opa_led_info(smp, am, data, ibdev, port,
-                                             resp_len);
-               break;
-       case IB_SMP_ATTR_SM_INFO:
-               if (ibp->rvp.port_cap_flags & IB_PORT_SM_DISABLED)
-                       return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
-               if (ibp->rvp.port_cap_flags & IB_PORT_SM)
-                       return IB_MAD_RESULT_SUCCESS;
-               /* FALLTHROUGH */
-       default:
-               smp->status |= IB_SMP_UNSUP_METH_ATTR;
-               ret = reply((struct ib_mad_hdr *)smp);
-               break;
-       }
-       return ret;
-}
-
-static int subn_set_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am,
-                           u8 *data, struct ib_device *ibdev, u8 port,
-                           u32 *resp_len)
-{
-       int ret;
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-
-       switch (attr_id) {
-       case IB_SMP_ATTR_PORT_INFO:
-               ret = __subn_set_opa_portinfo(smp, am, data, ibdev, port,
-                                             resp_len);
-               break;
-       case IB_SMP_ATTR_PKEY_TABLE:
-               ret = __subn_set_opa_pkeytable(smp, am, data, ibdev, port,
-                                              resp_len);
-               break;
-       case OPA_ATTRIB_ID_SL_TO_SC_MAP:
-               ret = __subn_set_opa_sl_to_sc(smp, am, data, ibdev, port,
-                                             resp_len);
-               break;
-       case OPA_ATTRIB_ID_SC_TO_SL_MAP:
-               ret = __subn_set_opa_sc_to_sl(smp, am, data, ibdev, port,
-                                             resp_len);
-               break;
-       case OPA_ATTRIB_ID_SC_TO_VLT_MAP:
-               ret = __subn_set_opa_sc_to_vlt(smp, am, data, ibdev, port,
-                                              resp_len);
-               break;
-       case OPA_ATTRIB_ID_SC_TO_VLNT_MAP:
-               ret = __subn_set_opa_sc_to_vlnt(smp, am, data, ibdev, port,
-                                               resp_len);
-               break;
-       case OPA_ATTRIB_ID_PORT_STATE_INFO:
-               ret = __subn_set_opa_psi(smp, am, data, ibdev, port,
-                                        resp_len);
-               break;
-       case OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE:
-               ret = __subn_set_opa_bct(smp, am, data, ibdev, port,
-                                        resp_len);
-               break;
-       case IB_SMP_ATTR_VL_ARB_TABLE:
-               ret = __subn_set_opa_vl_arb(smp, am, data, ibdev, port,
-                                           resp_len);
-               break;
-       case OPA_ATTRIB_ID_HFI_CONGESTION_SETTING:
-               ret = __subn_set_opa_cong_setting(smp, am, data, ibdev,
-                                                 port, resp_len);
-               break;
-       case OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE:
-               ret = __subn_set_opa_cc_table(smp, am, data, ibdev, port,
-                                             resp_len);
-               break;
-       case IB_SMP_ATTR_LED_INFO:
-               ret = __subn_set_opa_led_info(smp, am, data, ibdev, port,
-                                             resp_len);
-               break;
-       case IB_SMP_ATTR_SM_INFO:
-               if (ibp->rvp.port_cap_flags & IB_PORT_SM_DISABLED)
-                       return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
-               if (ibp->rvp.port_cap_flags & IB_PORT_SM)
-                       return IB_MAD_RESULT_SUCCESS;
-               /* FALLTHROUGH */
-       default:
-               smp->status |= IB_SMP_UNSUP_METH_ATTR;
-               ret = reply((struct ib_mad_hdr *)smp);
-               break;
-       }
-       return ret;
-}
-
-static inline void set_aggr_error(struct opa_aggregate *ag)
-{
-       ag->err_reqlength |= cpu_to_be16(0x8000);
-}
-
-static int subn_get_opa_aggregate(struct opa_smp *smp,
-                                 struct ib_device *ibdev, u8 port,
-                                 u32 *resp_len)
-{
-       int i;
-       u32 num_attr = be32_to_cpu(smp->attr_mod) & 0x000000ff;
-       u8 *next_smp = opa_get_smp_data(smp);
-
-       if (num_attr < 1 || num_attr > 117) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       for (i = 0; i < num_attr; i++) {
-               struct opa_aggregate *agg;
-               size_t agg_data_len;
-               size_t agg_size;
-               u32 am;
-
-               agg = (struct opa_aggregate *)next_smp;
-               agg_data_len = (be16_to_cpu(agg->err_reqlength) & 0x007f) * 8;
-               agg_size = sizeof(*agg) + agg_data_len;
-               am = be32_to_cpu(agg->attr_mod);
-
-               *resp_len += agg_size;
-
-               if (next_smp + agg_size > ((u8 *)smp) + sizeof(*smp)) {
-                       smp->status |= IB_SMP_INVALID_FIELD;
-                       return reply((struct ib_mad_hdr *)smp);
-               }
-
-               /* zero the payload for this segment */
-               memset(next_smp + sizeof(*agg), 0, agg_data_len);
-
-               (void)subn_get_opa_sma(agg->attr_id, smp, am, agg->data,
-                                       ibdev, port, NULL);
-               if (smp->status & ~IB_SMP_DIRECTION) {
-                       set_aggr_error(agg);
-                       return reply((struct ib_mad_hdr *)smp);
-               }
-               next_smp += agg_size;
-       }
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-static int subn_set_opa_aggregate(struct opa_smp *smp,
-                                 struct ib_device *ibdev, u8 port,
-                                 u32 *resp_len)
-{
-       int i;
-       u32 num_attr = be32_to_cpu(smp->attr_mod) & 0x000000ff;
-       u8 *next_smp = opa_get_smp_data(smp);
-
-       if (num_attr < 1 || num_attr > 117) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       for (i = 0; i < num_attr; i++) {
-               struct opa_aggregate *agg;
-               size_t agg_data_len;
-               size_t agg_size;
-               u32 am;
-
-               agg = (struct opa_aggregate *)next_smp;
-               agg_data_len = (be16_to_cpu(agg->err_reqlength) & 0x007f) * 8;
-               agg_size = sizeof(*agg) + agg_data_len;
-               am = be32_to_cpu(agg->attr_mod);
-
-               *resp_len += agg_size;
-
-               if (next_smp + agg_size > ((u8 *)smp) + sizeof(*smp)) {
-                       smp->status |= IB_SMP_INVALID_FIELD;
-                       return reply((struct ib_mad_hdr *)smp);
-               }
-
-               (void)subn_set_opa_sma(agg->attr_id, smp, am, agg->data,
-                                       ibdev, port, NULL);
-               if (smp->status & ~IB_SMP_DIRECTION) {
-                       set_aggr_error(agg);
-                       return reply((struct ib_mad_hdr *)smp);
-               }
-               next_smp += agg_size;
-       }
-
-       return reply((struct ib_mad_hdr *)smp);
-}
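
/*
 * In the two aggregate handlers above, err_reqlength packs two fields:
 * the low 7 bits give the segment's payload length in 8-byte units, and
 * bit 15 is the per-segment error flag that set_aggr_error() raises.  A
 * stand-alone decode of that field (host-order value for clarity; the
 * wire format is big-endian as in the code above):
 */
#include <stddef.h>
#include <stdint.h>

struct agg_hdr_fields {
        size_t data_bytes;      /* payload length in bytes */
        int error;              /* non-zero if the responder flagged this segment */
};

static struct agg_hdr_fields decode_err_reqlength(uint16_t err_reqlength)
{
        struct agg_hdr_fields f;

        f.data_bytes = (size_t)(err_reqlength & 0x007f) * 8;
        f.error = (err_reqlength & 0x8000) != 0;
        return f;
}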
-
-/*
- * OPAv1 specifies that, on the transition to link up, these counters
- * are cleared:
- *   PortRcvErrors [*]
- *   LinkErrorRecovery
- *   LocalLinkIntegrityErrors
- *   ExcessiveBufferOverruns [*]
- *
- * [*] Error info associated with these counters is retained, but the
- * error info status is reset to 0.
- */
-void clear_linkup_counters(struct hfi1_devdata *dd)
-{
-       /* PortRcvErrors */
-       write_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL, 0);
-       dd->err_info_rcvport.status_and_code &= ~OPA_EI_STATUS_SMASK;
-       /* LinkErrorRecovery */
-       write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0);
-       write_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, CNTR_INVALID_VL, 0);
-       /* LocalLinkIntegrityErrors */
-       write_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL, 0);
-       write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0);
-       /* ExcessiveBufferOverruns */
-       write_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL, 0);
-       dd->rcv_ovfl_cnt = 0;
-       dd->err_info_xmit_constraint.status &= ~OPA_EI_STATUS_SMASK;
-}
-
-/*
- * is_local_mad() returns 1 if 'mad' was sent from, and is destined to,
- * the local node, and 0 otherwise.
- */
-static int is_local_mad(struct hfi1_ibport *ibp, const struct opa_mad *mad,
-                       const struct ib_wc *in_wc)
-{
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       const struct opa_smp *smp = (const struct opa_smp *)mad;
-
-       if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
-               return (smp->hop_cnt == 0 &&
-                       smp->route.dr.dr_slid == OPA_LID_PERMISSIVE &&
-                       smp->route.dr.dr_dlid == OPA_LID_PERMISSIVE);
-       }
-
-       return (in_wc->slid == ppd->lid);
-}
-
-/*
- * opa_local_smp_check() should only be called on MADs for which
- * is_local_mad() returns true. It applies the SMP checks that are
- * specific to SMPs sent from, and destined to, this node.
- * opa_local_smp_check() returns 0 if the SMP passes its checks, 1
- * otherwise.
- *
- * SMPs which arrive from other nodes are instead checked by
- * opa_smp_check().
- */
-static int opa_local_smp_check(struct hfi1_ibport *ibp,
-                              const struct ib_wc *in_wc)
-{
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       u16 slid = in_wc->slid;
-       u16 pkey;
-
-       if (in_wc->pkey_index >= ARRAY_SIZE(ppd->pkeys))
-               return 1;
-
-       pkey = ppd->pkeys[in_wc->pkey_index];
-       /*
-        * We need to do the "node-local" checks specified in OPAv1,
-        * rev 0.90, section 9.10.26, which are:
-        *   - pkey is 0x7fff, or 0xffff
-        *   - Source QPN == 0 || Destination QPN == 0
-        *   - the MAD header's management class is either
-        *     IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE or
-        *     IB_MGMT_CLASS_SUBN_LID_ROUTED
-        *   - SLID != 0
-        *
-        * However, we know (and so don't need to check again) that,
-        * for local SMPs, the MAD stack passes MADs with:
-        *   - Source QPN of 0
-        *   - MAD mgmt_class is IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
-        *   - SLID is either: OPA_LID_PERMISSIVE (0xFFFFFFFF), or
-        *     our own port's lid
-        *
-        */
-       if (pkey == LIM_MGMT_P_KEY || pkey == FULL_MGMT_P_KEY)
-               return 0;
-       ingress_pkey_table_fail(ppd, pkey, slid);
-       return 1;
-}
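
/*
 * The two P_Keys accepted above differ only in the membership bit: per
 * the comment, LIM_MGMT_P_KEY is 0x7fff (limited member) and
 * FULL_MGMT_P_KEY is 0xffff (bit 15 set, full member).  A tiny
 * stand-alone check expressing the same idea (illustrative only):
 */
#include <stdint.h>

static int is_mgmt_pkey(uint16_t pkey)
{
        /* ignore the membership bit (bit 15) and compare the base key */
        return (pkey & 0x7fff) == 0x7fff;
}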
-
-static int process_subn_opa(struct ib_device *ibdev, int mad_flags,
-                           u8 port, const struct opa_mad *in_mad,
-                           struct opa_mad *out_mad,
-                           u32 *resp_len)
-{
-       struct opa_smp *smp = (struct opa_smp *)out_mad;
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-       u8 *data;
-       u32 am;
-       __be16 attr_id;
-       int ret;
-
-       *out_mad = *in_mad;
-       data = opa_get_smp_data(smp);
-
-       am = be32_to_cpu(smp->attr_mod);
-       attr_id = smp->attr_id;
-       if (smp->class_version != OPA_SMI_CLASS_VERSION) {
-               smp->status |= IB_SMP_UNSUP_VERSION;
-               ret = reply((struct ib_mad_hdr *)smp);
-               return ret;
-       }
-       ret = check_mkey(ibp, (struct ib_mad_hdr *)smp, mad_flags, smp->mkey,
-                        smp->route.dr.dr_slid, smp->route.dr.return_path,
-                        smp->hop_cnt);
-       if (ret) {
-               u32 port_num = be32_to_cpu(smp->attr_mod);
-
-               /*
-                * If this is a get/set portinfo, we already check the
-                * M_Key if the MAD is for another port and the M_Key
-                * is OK on the receiving port. This check is needed
-                * to increment the error counters when the M_Key
-                * fails to match on *both* ports.
-                */
-               if (attr_id == IB_SMP_ATTR_PORT_INFO &&
-                   (smp->method == IB_MGMT_METHOD_GET ||
-                    smp->method == IB_MGMT_METHOD_SET) &&
-                   port_num && port_num <= ibdev->phys_port_cnt &&
-                   port != port_num)
-                       (void)check_mkey(to_iport(ibdev, port_num),
-                                         (struct ib_mad_hdr *)smp, 0,
-                                         smp->mkey, smp->route.dr.dr_slid,
-                                         smp->route.dr.return_path,
-                                         smp->hop_cnt);
-               ret = IB_MAD_RESULT_FAILURE;
-               return ret;
-       }
-
-       *resp_len = opa_get_smp_header_size(smp);
-
-       switch (smp->method) {
-       case IB_MGMT_METHOD_GET:
-               switch (attr_id) {
-               default:
-                       clear_opa_smp_data(smp);
-                       ret = subn_get_opa_sma(attr_id, smp, am, data,
-                                              ibdev, port, resp_len);
-                       break;
-               case OPA_ATTRIB_ID_AGGREGATE:
-                       ret = subn_get_opa_aggregate(smp, ibdev, port,
-                                                    resp_len);
-                       break;
-               }
-               break;
-       case IB_MGMT_METHOD_SET:
-               switch (attr_id) {
-               default:
-                       ret = subn_set_opa_sma(attr_id, smp, am, data,
-                                              ibdev, port, resp_len);
-                       break;
-               case OPA_ATTRIB_ID_AGGREGATE:
-                       ret = subn_set_opa_aggregate(smp, ibdev, port,
-                                                    resp_len);
-                       break;
-               }
-               break;
-       case IB_MGMT_METHOD_TRAP:
-       case IB_MGMT_METHOD_REPORT:
-       case IB_MGMT_METHOD_REPORT_RESP:
-       case IB_MGMT_METHOD_GET_RESP:
-               /*
-                * The ib_mad module will call us to process responses
-                * before checking for other consumers.
-                * Just tell the caller to process it normally.
-                */
-               ret = IB_MAD_RESULT_SUCCESS;
-               break;
-       default:
-               smp->status |= IB_SMP_UNSUP_METHOD;
-               ret = reply((struct ib_mad_hdr *)smp);
-               break;
-       }
-
-       return ret;
-}
-
-static int process_subn(struct ib_device *ibdev, int mad_flags,
-                       u8 port, const struct ib_mad *in_mad,
-                       struct ib_mad *out_mad)
-{
-       struct ib_smp *smp = (struct ib_smp *)out_mad;
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-       int ret;
-
-       *out_mad = *in_mad;
-       if (smp->class_version != 1) {
-               smp->status |= IB_SMP_UNSUP_VERSION;
-               ret = reply((struct ib_mad_hdr *)smp);
-               return ret;
-       }
-
-       ret = check_mkey(ibp, (struct ib_mad_hdr *)smp, mad_flags,
-                        smp->mkey, (__force __be32)smp->dr_slid,
-                        smp->return_path, smp->hop_cnt);
-       if (ret) {
-               u32 port_num = be32_to_cpu(smp->attr_mod);
-
-               /*
-                * If this is a get/set portinfo, we already check the
-                * M_Key if the MAD is for another port and the M_Key
-                * is OK on the receiving port. This check is needed
-                * to increment the error counters when the M_Key
-                * fails to match on *both* ports.
-                */
-               if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO &&
-                   (smp->method == IB_MGMT_METHOD_GET ||
-                    smp->method == IB_MGMT_METHOD_SET) &&
-                   port_num && port_num <= ibdev->phys_port_cnt &&
-                   port != port_num)
-                       (void)check_mkey(to_iport(ibdev, port_num),
-                                        (struct ib_mad_hdr *)smp, 0,
-                                        smp->mkey,
-                                        (__force __be32)smp->dr_slid,
-                                        smp->return_path, smp->hop_cnt);
-               ret = IB_MAD_RESULT_FAILURE;
-               return ret;
-       }
-
-       switch (smp->method) {
-       case IB_MGMT_METHOD_GET:
-               switch (smp->attr_id) {
-               case IB_SMP_ATTR_NODE_INFO:
-                       ret = subn_get_nodeinfo(smp, ibdev, port);
-                       break;
-               default:
-                       smp->status |= IB_SMP_UNSUP_METH_ATTR;
-                       ret = reply((struct ib_mad_hdr *)smp);
-                       break;
-               }
-               break;
-       }
-
-       return ret;
-}
-
-static int process_perf(struct ib_device *ibdev, u8 port,
-                       const struct ib_mad *in_mad,
-                       struct ib_mad *out_mad)
-{
-       struct ib_pma_mad *pmp = (struct ib_pma_mad *)out_mad;
-       struct ib_class_port_info *cpi = (struct ib_class_port_info *)
-                                               &pmp->data;
-       int ret = IB_MAD_RESULT_FAILURE;
-
-       *out_mad = *in_mad;
-       if (pmp->mad_hdr.class_version != 1) {
-               pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION;
-               ret = reply((struct ib_mad_hdr *)pmp);
-               return ret;
-       }
-
-       switch (pmp->mad_hdr.method) {
-       case IB_MGMT_METHOD_GET:
-               switch (pmp->mad_hdr.attr_id) {
-               case IB_PMA_PORT_COUNTERS:
-                       ret = pma_get_ib_portcounters(pmp, ibdev, port);
-                       break;
-               case IB_PMA_PORT_COUNTERS_EXT:
-                       ret = pma_get_ib_portcounters_ext(pmp, ibdev, port);
-                       break;
-               case IB_PMA_CLASS_PORT_INFO:
-                       cpi->capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH;
-                       ret = reply((struct ib_mad_hdr *)pmp);
-                       break;
-               default:
-                       pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
-                       ret = reply((struct ib_mad_hdr *)pmp);
-                       break;
-               }
-               break;
-
-       case IB_MGMT_METHOD_SET:
-               if (pmp->mad_hdr.attr_id) {
-                       pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
-                       ret = reply((struct ib_mad_hdr *)pmp);
-               }
-               break;
-
-       case IB_MGMT_METHOD_TRAP:
-       case IB_MGMT_METHOD_GET_RESP:
-               /*
-                * The ib_mad module will call us to process responses
-                * before checking for other consumers.
-                * Just tell the caller to process it normally.
-                */
-               ret = IB_MAD_RESULT_SUCCESS;
-               break;
-
-       default:
-               pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD;
-               ret = reply((struct ib_mad_hdr *)pmp);
-               break;
-       }
-
-       return ret;
-}
-
-static int process_perf_opa(struct ib_device *ibdev, u8 port,
-                           const struct opa_mad *in_mad,
-                           struct opa_mad *out_mad, u32 *resp_len)
-{
-       struct opa_pma_mad *pmp = (struct opa_pma_mad *)out_mad;
-       int ret;
-
-       *out_mad = *in_mad;
-
-       if (pmp->mad_hdr.class_version != OPA_SMI_CLASS_VERSION) {
-               pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION;
-               return reply((struct ib_mad_hdr *)pmp);
-       }
-
-       *resp_len = sizeof(pmp->mad_hdr);
-
-       switch (pmp->mad_hdr.method) {
-       case IB_MGMT_METHOD_GET:
-               switch (pmp->mad_hdr.attr_id) {
-               case IB_PMA_CLASS_PORT_INFO:
-                       ret = pma_get_opa_classportinfo(pmp, ibdev, resp_len);
-                       break;
-               case OPA_PM_ATTRIB_ID_PORT_STATUS:
-                       ret = pma_get_opa_portstatus(pmp, ibdev, port,
-                                                    resp_len);
-                       break;
-               case OPA_PM_ATTRIB_ID_DATA_PORT_COUNTERS:
-                       ret = pma_get_opa_datacounters(pmp, ibdev, port,
-                                                      resp_len);
-                       break;
-               case OPA_PM_ATTRIB_ID_ERROR_PORT_COUNTERS:
-                       ret = pma_get_opa_porterrors(pmp, ibdev, port,
-                                                    resp_len);
-                       break;
-               case OPA_PM_ATTRIB_ID_ERROR_INFO:
-                       ret = pma_get_opa_errorinfo(pmp, ibdev, port,
-                                                   resp_len);
-                       break;
-               default:
-                       pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
-                       ret = reply((struct ib_mad_hdr *)pmp);
-                       break;
-               }
-               break;
-
-       case IB_MGMT_METHOD_SET:
-               switch (pmp->mad_hdr.attr_id) {
-               case OPA_PM_ATTRIB_ID_CLEAR_PORT_STATUS:
-                       ret = pma_set_opa_portstatus(pmp, ibdev, port,
-                                                    resp_len);
-                       break;
-               case OPA_PM_ATTRIB_ID_ERROR_INFO:
-                       ret = pma_set_opa_errorinfo(pmp, ibdev, port,
-                                                   resp_len);
-                       break;
-               default:
-                       pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
-                       ret = reply((struct ib_mad_hdr *)pmp);
-                       break;
-               }
-               break;
-
-       case IB_MGMT_METHOD_TRAP:
-       case IB_MGMT_METHOD_GET_RESP:
-               /*
-                * The ib_mad module will call us to process responses
-                * before checking for other consumers.
-                * Just tell the caller to process it normally.
-                */
-               ret = IB_MAD_RESULT_SUCCESS;
-               break;
-
-       default:
-               pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD;
-               ret = reply((struct ib_mad_hdr *)pmp);
-               break;
-       }
-
-       return ret;
-}
-
-static int hfi1_process_opa_mad(struct ib_device *ibdev, int mad_flags,
-                               u8 port, const struct ib_wc *in_wc,
-                               const struct ib_grh *in_grh,
-                               const struct opa_mad *in_mad,
-                               struct opa_mad *out_mad, size_t *out_mad_size,
-                               u16 *out_mad_pkey_index)
-{
-       int ret;
-       int pkey_idx;
-       u32 resp_len = 0;
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-
-       pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY);
-       if (pkey_idx < 0) {
-               pr_warn("failed to find limited mgmt pkey, defaulting 0x%x\n",
-                       hfi1_get_pkey(ibp, 1));
-               pkey_idx = 1;
-       }
-       *out_mad_pkey_index = (u16)pkey_idx;
-
-       switch (in_mad->mad_hdr.mgmt_class) {
-       case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
-       case IB_MGMT_CLASS_SUBN_LID_ROUTED:
-               if (is_local_mad(ibp, in_mad, in_wc)) {
-                       ret = opa_local_smp_check(ibp, in_wc);
-                       if (ret)
-                               return IB_MAD_RESULT_FAILURE;
-               }
-               ret = process_subn_opa(ibdev, mad_flags, port, in_mad,
-                                      out_mad, &resp_len);
-               goto bail;
-       case IB_MGMT_CLASS_PERF_MGMT:
-               ret = process_perf_opa(ibdev, port, in_mad, out_mad,
-                                      &resp_len);
-               goto bail;
-
-       default:
-               ret = IB_MAD_RESULT_SUCCESS;
-       }
-
-bail:
-       if (ret & IB_MAD_RESULT_REPLY)
-               *out_mad_size = round_up(resp_len, 8);
-       else if (ret & IB_MAD_RESULT_SUCCESS)
-               *out_mad_size = in_wc->byte_len - sizeof(struct ib_grh);
-
-       return ret;
-}
-
-static int hfi1_process_ib_mad(struct ib_device *ibdev, int mad_flags, u8 port,
-                              const struct ib_wc *in_wc,
-                              const struct ib_grh *in_grh,
-                              const struct ib_mad *in_mad,
-                              struct ib_mad *out_mad)
-{
-       int ret;
-
-       switch (in_mad->mad_hdr.mgmt_class) {
-       case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
-       case IB_MGMT_CLASS_SUBN_LID_ROUTED:
-               ret = process_subn(ibdev, mad_flags, port, in_mad, out_mad);
-               break;
-       case IB_MGMT_CLASS_PERF_MGMT:
-               ret = process_perf(ibdev, port, in_mad, out_mad);
-               break;
-       default:
-               ret = IB_MAD_RESULT_SUCCESS;
-               break;
-       }
-
-       return ret;
-}
-
-/**
- * hfi1_process_mad - process an incoming MAD packet
- * @ibdev: the infiniband device this packet came in on
- * @mad_flags: MAD flags
- * @port: the port number this packet came in on
- * @in_wc: the work completion entry for this packet
- * @in_grh: the global route header for this packet
- * @in_mad: the incoming MAD
- * @out_mad: any outgoing MAD reply
- *
- * Returns IB_MAD_RESULT_SUCCESS if this is a MAD that we are not
- * interested in processing.
- *
- * Note that the verbs framework has already done the MAD sanity checks,
- * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
- * MADs.
- *
- * This is called by the ib_mad module.
- */
-int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port,
-                    const struct ib_wc *in_wc, const struct ib_grh *in_grh,
-                    const struct ib_mad_hdr *in_mad, size_t in_mad_size,
-                    struct ib_mad_hdr *out_mad, size_t *out_mad_size,
-                    u16 *out_mad_pkey_index)
-{
-       switch (in_mad->base_version) {
-       case OPA_MGMT_BASE_VERSION:
-               if (unlikely(in_mad_size != sizeof(struct opa_mad))) {
-                       dev_err(ibdev->dma_device, "invalid in_mad_size\n");
-                       return IB_MAD_RESULT_FAILURE;
-               }
-               return hfi1_process_opa_mad(ibdev, mad_flags, port,
-                                           in_wc, in_grh,
-                                           (struct opa_mad *)in_mad,
-                                           (struct opa_mad *)out_mad,
-                                           out_mad_size,
-                                           out_mad_pkey_index);
-       case IB_MGMT_BASE_VERSION:
-               return hfi1_process_ib_mad(ibdev, mad_flags, port,
-                                         in_wc, in_grh,
-                                         (const struct ib_mad *)in_mad,
-                                         (struct ib_mad *)out_mad);
-       default:
-               break;
-       }
-
-       return IB_MAD_RESULT_FAILURE;
-}
diff --git a/drivers/staging/rdma/hfi1/mad.h b/drivers/staging/rdma/hfi1/mad.h
deleted file mode 100644 (file)
index 55ee086..0000000
+++ /dev/null
@@ -1,437 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#ifndef _HFI1_MAD_H
-#define _HFI1_MAD_H
-
-#include <rdma/ib_pma.h>
-#define USE_PI_LED_ENABLE      1 /*
-                                  * use led enabled bit in struct
-                                  * opa_port_states, if available
-                                  */
-#include <rdma/opa_smi.h>
-#include <rdma/opa_port_info.h>
-#ifndef PI_LED_ENABLE_SUP
-#define PI_LED_ENABLE_SUP 0
-#endif
-#include "opa_compat.h"
-
-/*
- * OPA Traps
- */
-#define OPA_TRAP_GID_NOW_IN_SERVICE             cpu_to_be16(64)
-#define OPA_TRAP_GID_OUT_OF_SERVICE             cpu_to_be16(65)
-#define OPA_TRAP_ADD_MULTICAST_GROUP            cpu_to_be16(66)
-#define OPA_TRAL_DEL_MULTICAST_GROUP            cpu_to_be16(67)
-#define OPA_TRAP_UNPATH                         cpu_to_be16(68)
-#define OPA_TRAP_REPATH                         cpu_to_be16(69)
-#define OPA_TRAP_PORT_CHANGE_STATE              cpu_to_be16(128)
-#define OPA_TRAP_LINK_INTEGRITY                 cpu_to_be16(129)
-#define OPA_TRAP_EXCESSIVE_BUFFER_OVERRUN       cpu_to_be16(130)
-#define OPA_TRAP_FLOW_WATCHDOG                  cpu_to_be16(131)
-#define OPA_TRAP_CHANGE_CAPABILITY              cpu_to_be16(144)
-#define OPA_TRAP_CHANGE_SYSGUID                 cpu_to_be16(145)
-#define OPA_TRAP_BAD_M_KEY                      cpu_to_be16(256)
-#define OPA_TRAP_BAD_P_KEY                      cpu_to_be16(257)
-#define OPA_TRAP_BAD_Q_KEY                      cpu_to_be16(258)
-#define OPA_TRAP_SWITCH_BAD_PKEY                cpu_to_be16(259)
-#define OPA_SMA_TRAP_DATA_LINK_WIDTH            cpu_to_be16(2048)
-
-/*
- * Generic trap/notice other local changes flags (trap 144).
- */
-#define        OPA_NOTICE_TRAP_LWDE_CHG        0x08 /* Link Width Downgrade Enable
-                                             * changed
-                                             */
-#define OPA_NOTICE_TRAP_LSE_CHG         0x04 /* Link Speed Enable changed */
-#define OPA_NOTICE_TRAP_LWE_CHG         0x02 /* Link Width Enable changed */
-#define OPA_NOTICE_TRAP_NODE_DESC_CHG   0x01
-
-struct opa_mad_notice_attr {
-       u8 generic_type;
-       u8 prod_type_msb;
-       __be16 prod_type_lsb;
-       __be16 trap_num;
-       __be16 toggle_count;
-       __be32 issuer_lid;
-       __be32 reserved1;
-       union ib_gid issuer_gid;
-
-       union {
-               struct {
-                       u8      details[64];
-               } raw_data;
-
-               struct {
-                       union ib_gid    gid;
-               } __packed ntc_64_65_66_67;
-
-               struct {
-                       __be32  lid;
-               } __packed ntc_128;
-
-               struct {
-                       __be32  lid;            /* where violation happened */
-                       u8      port_num;       /* where violation happened */
-               } __packed ntc_129_130_131;
-
-               struct {
-                       __be32  lid;            /* LID where change occurred */
-                       __be32  new_cap_mask;   /* new capability mask */
-                       __be16  reserved2;
-                       __be16  cap_mask;
-                       __be16  change_flags;   /* low 4 bits only */
-               } __packed ntc_144;
-
-               struct {
-                       __be64  new_sys_guid;
-                       __be32  lid;            /* lid where sys guid changed */
-               } __packed ntc_145;
-
-               struct {
-                       __be32  lid;
-                       __be32  dr_slid;
-                       u8      method;
-                       u8      dr_trunc_hop;
-                       __be16  attr_id;
-                       __be32  attr_mod;
-                       __be64  mkey;
-                       u8      dr_rtn_path[30];
-               } __packed ntc_256;
-
-               struct {
-                       __be32          lid1;
-                       __be32          lid2;
-                       __be32          key;
-                       u8              sl;     /* SL: high 5 bits */
-                       u8              reserved3[3];
-                       union ib_gid    gid1;
-                       union ib_gid    gid2;
-                       __be32          qp1;    /* high 8 bits reserved */
-                       __be32          qp2;    /* high 8 bits reserved */
-               } __packed ntc_257_258;
-
-               struct {
-                       __be16          flags;  /* low 8 bits reserved */
-                       __be16          pkey;
-                       __be32          lid1;
-                       __be32          lid2;
-                       u8              sl;     /* SL: high 5 bits */
-                       u8              reserved4[3];
-                       union ib_gid    gid1;
-                       union ib_gid    gid2;
-                       __be32          qp1;    /* high 8 bits reserved */
-                       __be32          qp2;    /* high 8 bits reserved */
-               } __packed ntc_259;
-
-               struct {
-                       __be32  lid;
-               } __packed ntc_2048;
-
-       };
-       u8      class_data[0];
-};
-
-#define IB_VLARB_LOWPRI_0_31    1
-#define IB_VLARB_LOWPRI_32_63   2
-#define IB_VLARB_HIGHPRI_0_31   3
-#define IB_VLARB_HIGHPRI_32_63  4
-
-#define OPA_MAX_PREEMPT_CAP         32
-#define OPA_VLARB_LOW_ELEMENTS       0
-#define OPA_VLARB_HIGH_ELEMENTS      1
-#define OPA_VLARB_PREEMPT_ELEMENTS   2
-#define OPA_VLARB_PREEMPT_MATRIX     3
-
-#define IB_PMA_PORT_COUNTERS_CONG       cpu_to_be16(0xFF00)
-
-struct ib_pma_portcounters_cong {
-       u8 reserved;
-       u8 reserved1;
-       __be16 port_check_rate;
-       __be16 symbol_error_counter;
-       u8 link_error_recovery_counter;
-       u8 link_downed_counter;
-       __be16 port_rcv_errors;
-       __be16 port_rcv_remphys_errors;
-       __be16 port_rcv_switch_relay_errors;
-       __be16 port_xmit_discards;
-       u8 port_xmit_constraint_errors;
-       u8 port_rcv_constraint_errors;
-       u8 reserved2;
-       u8 link_overrun_errors; /* LocalLink: 7:4, BufferOverrun: 3:0 */
-       __be16 reserved3;
-       __be16 vl15_dropped;
-       __be64 port_xmit_data;
-       __be64 port_rcv_data;
-       __be64 port_xmit_packets;
-       __be64 port_rcv_packets;
-       __be64 port_xmit_wait;
-       __be64 port_adr_events;
-} __packed;
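
/*
 * link_overrun_errors above is another packed byte: bits 7:4 count
 * LocalLinkIntegrity errors and bits 3:0 count ExcessiveBufferOverruns,
 * per the field comment.  A stand-alone decode (illustrative only):
 */
#include <stdint.h>

static void unpack_link_overrun(uint8_t v, uint8_t *local_link,
                                uint8_t *buf_overrun)
{
        *local_link = v >> 4;
        *buf_overrun = v & 0x0f;
}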
-
-#define IB_SMP_UNSUP_VERSION    cpu_to_be16(0x0004)
-#define IB_SMP_UNSUP_METHOD     cpu_to_be16(0x0008)
-#define IB_SMP_UNSUP_METH_ATTR  cpu_to_be16(0x000C)
-#define IB_SMP_INVALID_FIELD    cpu_to_be16(0x001C)
-
-#define OPA_MAX_PREEMPT_CAP         32
-#define OPA_VLARB_LOW_ELEMENTS       0
-#define OPA_VLARB_HIGH_ELEMENTS      1
-#define OPA_VLARB_PREEMPT_ELEMENTS   2
-#define OPA_VLARB_PREEMPT_MATRIX     3
-
-#define HFI1_XMIT_RATE_UNSUPPORTED               0x0
-#define HFI1_XMIT_RATE_PICO                      0x7
-/* number of 4nsec cycles equaling 2secs */
-#define HFI1_CONG_TIMER_PSINTERVAL               0x1DCD64EC
-
-#define IB_CC_SVCTYPE_RC 0x0
-#define IB_CC_SVCTYPE_UC 0x1
-#define IB_CC_SVCTYPE_RD 0x2
-#define IB_CC_SVCTYPE_UD 0x3
-
-/*
- * There should be an equivalent IB #define for the following, but
- * I cannot find it.
- */
-#define OPA_CC_LOG_TYPE_HFI    2
-
-struct opa_hfi1_cong_log_event_internal {
-       u32 lqpn;
-       u32 rqpn;
-       u8 sl;
-       u8 svc_type;
-       u32 rlid;
-       s64 timestamp; /* wider than 32 bits to detect 32 bit rollover */
-};
-
-struct opa_hfi1_cong_log_event {
-       u8 local_qp_cn_entry[3];
-       u8 remote_qp_number_cn_entry[3];
-       u8 sl_svc_type_cn_entry; /* 5 bits SL, 3 bits svc type */
-       u8 reserved;
-       __be32 remote_lid_cn_entry;
-       __be32 timestamp_cn_entry;
-} __packed;
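/*
 * Editor's note: a stand-alone sketch (not part of the removed header) of
 * how an on-the-wire congestion log entry like the one above could be
 * filled from the internal form.  The 24-bit QP number is assumed to be
 * split across three bytes most-significant byte first (matching the
 * big-endian wire format of the other fields), and, following the
 * "high 5 bits" SL convention used elsewhere in this header, the SL is
 * assumed to occupy the upper five bits of sl_svc_type_cn_entry with the
 * service type in the lower three.
 */
#include <stdio.h>
#include <stdint.h>

static void pack_qpn(uint8_t out[3], uint32_t qpn)
{
	out[0] = (qpn >> 16) & 0xff;	/* assumed MSB-first layout */
	out[1] = (qpn >> 8) & 0xff;
	out[2] = qpn & 0xff;
}

int main(void)
{
	uint8_t lqpn[3];
	uint8_t sl = 5, svc_type = 0x0;		/* IB_CC_SVCTYPE_RC */
	uint8_t sl_svc = (uint8_t)((sl << 3) | (svc_type & 0x7));

	pack_qpn(lqpn, 0x123456);
	printf("qpn bytes: %02x %02x %02x, sl_svc_type: 0x%02x\n",
	       lqpn[0], lqpn[1], lqpn[2], sl_svc);
	return 0;
}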
-
-#define OPA_CONG_LOG_ELEMS     96
-
-struct opa_hfi1_cong_log {
-       u8 log_type;
-       u8 congestion_flags;
-       __be16 threshold_event_counter;
-       __be32 current_time_stamp;
-       u8 threshold_cong_event_map[OPA_MAX_SLS / 8];
-       struct opa_hfi1_cong_log_event events[OPA_CONG_LOG_ELEMS];
-} __packed;
-
-#define IB_CC_TABLE_CAP_DEFAULT 31
-
-/* Port control flags */
-#define IB_CC_CCS_PC_SL_BASED 0x01
-
-struct opa_congestion_setting_entry {
-       u8 ccti_increase;
-       u8 reserved;
-       __be16 ccti_timer;
-       u8 trigger_threshold;
-       u8 ccti_min; /* min CCTI for cc table */
-} __packed;
-
-struct opa_congestion_setting_entry_shadow {
-       u8 ccti_increase;
-       u8 reserved;
-       u16 ccti_timer;
-       u8 trigger_threshold;
-       u8 ccti_min; /* min CCTI for cc table */
-} __packed;
-
-struct opa_congestion_setting_attr {
-       __be32 control_map;
-       __be16 port_control;
-       struct opa_congestion_setting_entry entries[OPA_MAX_SLS];
-} __packed;
-
-struct opa_congestion_setting_attr_shadow {
-       u32 control_map;
-       u16 port_control;
-       struct opa_congestion_setting_entry_shadow entries[OPA_MAX_SLS];
-} __packed;
-
-#define IB_CC_TABLE_ENTRY_INCREASE_DEFAULT 1
-#define IB_CC_TABLE_ENTRY_TIMER_DEFAULT 1
-
-/* 64 Congestion Control table entries in a single MAD */
-#define IB_CCT_ENTRIES 64
-#define IB_CCT_MIN_ENTRIES (IB_CCT_ENTRIES * 2)
-
-struct ib_cc_table_entry {
-       __be16 entry; /* shift:2, multiplier:14 */
-};
-
-struct ib_cc_table_entry_shadow {
-       u16 entry; /* shift:2, multiplier:14 */
-};
-
-struct ib_cc_table_attr {
-       __be16 ccti_limit; /* max CCTI for cc table */
-       struct ib_cc_table_entry ccti_entries[IB_CCT_ENTRIES];
-} __packed;
-
-struct ib_cc_table_attr_shadow {
-       u16 ccti_limit; /* max CCTI for cc table */
-       struct ib_cc_table_entry_shadow ccti_entries[IB_CCT_ENTRIES];
-} __packed;
-
-#define CC_TABLE_SHADOW_MAX \
-       (IB_CC_TABLE_CAP_DEFAULT * IB_CCT_ENTRIES)
-
-struct cc_table_shadow {
-       u16 ccti_limit; /* max CCTI for cc table */
-       struct ib_cc_table_entry_shadow entries[CC_TABLE_SHADOW_MAX];
-} __packed;
-
-/*
- * struct cc_state combines the (active) per-port congestion control
- * table, and the (active) per-SL congestion settings. cc_state data
- * may need to be read in code paths that we want to be fast, so it
- * is an RCU protected structure.
- */
-struct cc_state {
-       struct rcu_head rcu;
-       struct cc_table_shadow cct;
-       struct opa_congestion_setting_attr_shadow cong_setting;
-};
-
-/*
- * OPA BufferControl MAD
- */
-
-/* attribute modifier macros */
-#define OPA_AM_NPORT_SHIFT     24
-#define OPA_AM_NPORT_MASK      0xff
-#define OPA_AM_NPORT_SMASK     (OPA_AM_NPORT_MASK << OPA_AM_NPORT_SHIFT)
-#define OPA_AM_NPORT(am)       (((am) >> OPA_AM_NPORT_SHIFT) & \
-                                       OPA_AM_NPORT_MASK)
-
-#define OPA_AM_NBLK_SHIFT      24
-#define OPA_AM_NBLK_MASK       0xff
-#define OPA_AM_NBLK_SMASK      (OPA_AM_NBLK_MASK << OPA_AM_NBLK_SHIFT)
-#define OPA_AM_NBLK(am)                (((am) >> OPA_AM_NBLK_SHIFT) & \
-                                       OPA_AM_NBLK_MASK)
-
-#define OPA_AM_START_BLK_SHIFT 0
-#define OPA_AM_START_BLK_MASK  0xff
-#define OPA_AM_START_BLK_SMASK (OPA_AM_START_BLK_MASK << \
-                                       OPA_AM_START_BLK_SHIFT)
-#define OPA_AM_START_BLK(am)   (((am) >> OPA_AM_START_BLK_SHIFT) & \
-                                       OPA_AM_START_BLK_MASK)
-
-#define OPA_AM_PORTNUM_SHIFT   0
-#define OPA_AM_PORTNUM_MASK    0xff
-#define OPA_AM_PORTNUM_SMASK   (OPA_AM_PORTNUM_MASK << OPA_AM_PORTNUM_SHIFT)
-#define OPA_AM_PORTNUM(am)     (((am) >> OPA_AM_PORTNUM_SHIFT) & \
-                                       OPA_AM_PORTNUM_MASK)
-
-#define OPA_AM_ASYNC_SHIFT     12
-#define OPA_AM_ASYNC_MASK      0x1
-#define OPA_AM_ASYNC_SMASK     (OPA_AM_ASYNC_MASK << OPA_AM_ASYNC_SHIFT)
-#define OPA_AM_ASYNC(am)       (((am) >> OPA_AM_ASYNC_SHIFT) & \
-                                       OPA_AM_ASYNC_MASK)
-
-#define OPA_AM_START_SM_CFG_SHIFT      9
-#define OPA_AM_START_SM_CFG_MASK       0x1
-#define OPA_AM_START_SM_CFG_SMASK      (OPA_AM_START_SM_CFG_MASK << \
-                                               OPA_AM_START_SM_CFG_SHIFT)
-#define OPA_AM_START_SM_CFG(am)                (((am) >> OPA_AM_START_SM_CFG_SHIFT) \
-                                               & OPA_AM_START_SM_CFG_MASK)
-
-#define OPA_AM_CI_ADDR_SHIFT   19
-#define OPA_AM_CI_ADDR_MASK    0xfff
-#define OPA_AM_CI_ADDR_SMASK   (OPA_AM_CI_ADDR_MASK << OPA_AM_CI_ADDR_SHIFT)
-#define OPA_AM_CI_ADDR(am)     (((am) >> OPA_AM_CI_ADDR_SHIFT) & \
-                                       OPA_AM_CI_ADDR_MASK)
-
-#define OPA_AM_CI_LEN_SHIFT    13
-#define OPA_AM_CI_LEN_MASK     0x3f
-#define OPA_AM_CI_LEN_SMASK    (OPA_AM_CI_LEN_MASK << OPA_AM_CI_LEN_SHIFT)
-#define OPA_AM_CI_LEN(am)      (((am) >> OPA_AM_CI_LEN_SHIFT) & \
-                                       OPA_AM_CI_LEN_MASK)
-
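/*
 * Editor's note: a small stand-alone sketch (not part of the removed file)
 * of how the attribute-modifier accessors above are meant to be used.  The
 * 32-bit AM carries a block/port count in bits 31:24 and a start block or
 * port number in bits 7:0; the macros below simply mirror the definitions
 * above so the example compiles on its own.
 */
#include <stdio.h>
#include <stdint.h>

#define OPA_AM_NBLK(am)		(((am) >> 24) & 0xff)
#define OPA_AM_START_BLK(am)	((am) & 0xff)

int main(void)
{
	uint32_t am = (3u << 24) | 5u;	/* 3 blocks starting at block 5 */

	printf("nblocks=%u start=%u\n", OPA_AM_NBLK(am), OPA_AM_START_BLK(am));
	return 0;
}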
-/* error info macros */
-#define OPA_EI_STATUS_SMASK    0x80
-#define OPA_EI_CODE_SMASK      0x0f
-
-struct vl_limit {
-       __be16 dedicated;
-       __be16 shared;
-};
-
-struct buffer_control {
-       __be16 reserved;
-       __be16 overall_shared_limit;
-       struct vl_limit vl[OPA_MAX_VLS];
-};
-
-struct sc2vlnt {
-       u8 vlnt[32]; /* 5 bit VL, 3 bits reserved */
-};
-
-/*
- * The PortSamplesControl.CounterMasks field is an array of 3 bit fields
- * which specify the N'th counter's capabilities. See ch. 16.1.3.2.
- * We support 5 counters which only count the mandatory quantities.
- */
-#define COUNTER_MASK(q, n) (q << ((9 - n) * 3))
-#define COUNTER_MASK0_9 \
-       cpu_to_be32(COUNTER_MASK(1, 0) | \
-                   COUNTER_MASK(1, 1) | \
-                   COUNTER_MASK(1, 2) | \
-                   COUNTER_MASK(1, 3) | \
-                   COUNTER_MASK(1, 4))
-
-#endif                         /* _HFI1_MAD_H */
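/*
 * Editor's note: a minimal, self-contained sketch (not part of the removed
 * driver) of how the COUNTER_MASK() macro above packs 3-bit per-counter
 * capability fields into PortSamplesControl.CounterMasks.  Counter n
 * occupies bits [(9 - n) * 3 .. (9 - n) * 3 + 2], so enabling the five
 * mandatory counters with value 1 yields 0x09248000 (before the
 * cpu_to_be32() conversion done in the header).
 */
#include <stdio.h>
#include <stdint.h>

#define COUNTER_MASK(q, n) ((uint32_t)(q) << ((9 - (n)) * 3))

int main(void)
{
	uint32_t mask = 0;
	int n;

	for (n = 0; n < 5; n++)		/* counters 0..4, capability value 1 */
		mask |= COUNTER_MASK(1, n);

	printf("CounterMasks (host order): 0x%08x\n", mask);	/* 0x09248000 */
	return 0;
}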
diff --git a/drivers/staging/rdma/hfi1/mmu_rb.c b/drivers/staging/rdma/hfi1/mmu_rb.c
deleted file mode 100644 (file)
index 2b0e91d..0000000
+++ /dev/null
@@ -1,325 +0,0 @@
-/*
- * Copyright(c) 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#include <linux/list.h>
-#include <linux/mmu_notifier.h>
-#include <linux/interval_tree_generic.h>
-
-#include "mmu_rb.h"
-#include "trace.h"
-
-struct mmu_rb_handler {
-       struct list_head list;
-       struct mmu_notifier mn;
-       struct rb_root *root;
-       spinlock_t lock;        /* protect the RB tree */
-       struct mmu_rb_ops *ops;
-};
-
-static LIST_HEAD(mmu_rb_handlers);
-static DEFINE_SPINLOCK(mmu_rb_lock); /* protect mmu_rb_handlers list */
-
-static unsigned long mmu_node_start(struct mmu_rb_node *);
-static unsigned long mmu_node_last(struct mmu_rb_node *);
-static struct mmu_rb_handler *find_mmu_handler(struct rb_root *);
-static inline void mmu_notifier_page(struct mmu_notifier *, struct mm_struct *,
-                                    unsigned long);
-static inline void mmu_notifier_range_start(struct mmu_notifier *,
-                                           struct mm_struct *,
-                                           unsigned long, unsigned long);
-static void mmu_notifier_mem_invalidate(struct mmu_notifier *,
-                                       struct mm_struct *,
-                                       unsigned long, unsigned long);
-static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *,
-                                          unsigned long, unsigned long);
-
-static struct mmu_notifier_ops mn_opts = {
-       .invalidate_page = mmu_notifier_page,
-       .invalidate_range_start = mmu_notifier_range_start,
-};
-
-INTERVAL_TREE_DEFINE(struct mmu_rb_node, node, unsigned long, __last,
-                    mmu_node_start, mmu_node_last, static, __mmu_int_rb);
-
-static unsigned long mmu_node_start(struct mmu_rb_node *node)
-{
-       return node->addr & PAGE_MASK;
-}
-
-static unsigned long mmu_node_last(struct mmu_rb_node *node)
-{
-       return PAGE_ALIGN(node->addr + node->len) - 1;
-}
-
-int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops)
-{
-       struct mmu_rb_handler *handlr;
-       unsigned long flags;
-
-       if (!ops->invalidate)
-               return -EINVAL;
-
-       handlr = kmalloc(sizeof(*handlr), GFP_KERNEL);
-       if (!handlr)
-               return -ENOMEM;
-
-       handlr->root = root;
-       handlr->ops = ops;
-       INIT_HLIST_NODE(&handlr->mn.hlist);
-       spin_lock_init(&handlr->lock);
-       handlr->mn.ops = &mn_opts;
-       spin_lock_irqsave(&mmu_rb_lock, flags);
-       list_add_tail(&handlr->list, &mmu_rb_handlers);
-       spin_unlock_irqrestore(&mmu_rb_lock, flags);
-
-       return mmu_notifier_register(&handlr->mn, current->mm);
-}
-
-void hfi1_mmu_rb_unregister(struct rb_root *root)
-{
-       struct mmu_rb_handler *handler = find_mmu_handler(root);
-       unsigned long flags;
-
-       if (!handler)
-               return;
-
-       /* Unregister first so we don't get any more notifications. */
-       if (current->mm)
-               mmu_notifier_unregister(&handler->mn, current->mm);
-
-       spin_lock_irqsave(&mmu_rb_lock, flags);
-       list_del(&handler->list);
-       spin_unlock_irqrestore(&mmu_rb_lock, flags);
-
-       spin_lock_irqsave(&handler->lock, flags);
-       if (!RB_EMPTY_ROOT(root)) {
-               struct rb_node *node;
-               struct mmu_rb_node *rbnode;
-
-               while ((node = rb_first(root))) {
-                       rbnode = rb_entry(node, struct mmu_rb_node, node);
-                       rb_erase(node, root);
-                       if (handler->ops->remove)
-                               handler->ops->remove(root, rbnode, NULL);
-               }
-       }
-       spin_unlock_irqrestore(&handler->lock, flags);
-
-       kfree(handler);
-}
-
-int hfi1_mmu_rb_insert(struct rb_root *root, struct mmu_rb_node *mnode)
-{
-       struct mmu_rb_handler *handler = find_mmu_handler(root);
-       struct mmu_rb_node *node;
-       unsigned long flags;
-       int ret = 0;
-
-       if (!handler)
-               return -EINVAL;
-
-       spin_lock_irqsave(&handler->lock, flags);
-       hfi1_cdbg(MMU, "Inserting node addr 0x%llx, len %u", mnode->addr,
-                 mnode->len);
-       node = __mmu_rb_search(handler, mnode->addr, mnode->len);
-       if (node) {
-               ret = -EINVAL;
-               goto unlock;
-       }
-       __mmu_int_rb_insert(mnode, root);
-
-       if (handler->ops->insert) {
-               ret = handler->ops->insert(root, mnode);
-               if (ret)
-                       __mmu_int_rb_remove(mnode, root);
-       }
-unlock:
-       spin_unlock_irqrestore(&handler->lock, flags);
-       return ret;
-}
-
-/* Caller must hold handler lock */
-static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *handler,
-                                          unsigned long addr,
-                                          unsigned long len)
-{
-       struct mmu_rb_node *node = NULL;
-
-       hfi1_cdbg(MMU, "Searching for addr 0x%llx, len %u", addr, len);
-       if (!handler->ops->filter) {
-               node = __mmu_int_rb_iter_first(handler->root, addr,
-                                              (addr + len) - 1);
-       } else {
-               for (node = __mmu_int_rb_iter_first(handler->root, addr,
-                                                   (addr + len) - 1);
-                    node;
-                    node = __mmu_int_rb_iter_next(node, addr,
-                                                  (addr + len) - 1)) {
-                       if (handler->ops->filter(node, addr, len))
-                               return node;
-               }
-       }
-       return node;
-}
-
-/* Caller must *not* hold handler lock. */
-static void __mmu_rb_remove(struct mmu_rb_handler *handler,
-                           struct mmu_rb_node *node, struct mm_struct *mm)
-{
-       unsigned long flags;
-
-       /* Validity of handler and node pointers has been checked by caller. */
-       hfi1_cdbg(MMU, "Removing node addr 0x%llx, len %u", node->addr,
-                 node->len);
-       spin_lock_irqsave(&handler->lock, flags);
-       __mmu_int_rb_remove(node, handler->root);
-       spin_unlock_irqrestore(&handler->lock, flags);
-
-       if (handler->ops->remove)
-               handler->ops->remove(handler->root, node, mm);
-}
-
-struct mmu_rb_node *hfi1_mmu_rb_search(struct rb_root *root, unsigned long addr,
-                                      unsigned long len)
-{
-       struct mmu_rb_handler *handler = find_mmu_handler(root);
-       struct mmu_rb_node *node;
-       unsigned long flags;
-
-       if (!handler)
-               return ERR_PTR(-EINVAL);
-
-       spin_lock_irqsave(&handler->lock, flags);
-       node = __mmu_rb_search(handler, addr, len);
-       spin_unlock_irqrestore(&handler->lock, flags);
-
-       return node;
-}
-
-struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *root,
-                                       unsigned long addr, unsigned long len)
-{
-       struct mmu_rb_handler *handler = find_mmu_handler(root);
-       struct mmu_rb_node *node;
-       unsigned long flags;
-
-       if (!handler)
-               return ERR_PTR(-EINVAL);
-
-       spin_lock_irqsave(&handler->lock, flags);
-       node = __mmu_rb_search(handler, addr, len);
-       if (node)
-               __mmu_int_rb_remove(node, handler->root);
-       spin_unlock_irqrestore(&handler->lock, flags);
-
-       return node;
-}
-
-void hfi1_mmu_rb_remove(struct rb_root *root, struct mmu_rb_node *node)
-{
-       struct mmu_rb_handler *handler = find_mmu_handler(root);
-
-       if (!handler || !node)
-               return;
-
-       __mmu_rb_remove(handler, node, NULL);
-}
-
-static struct mmu_rb_handler *find_mmu_handler(struct rb_root *root)
-{
-       struct mmu_rb_handler *handler;
-       unsigned long flags;
-
-       spin_lock_irqsave(&mmu_rb_lock, flags);
-       list_for_each_entry(handler, &mmu_rb_handlers, list) {
-               if (handler->root == root)
-                       goto unlock;
-       }
-       handler = NULL;
-unlock:
-       spin_unlock_irqrestore(&mmu_rb_lock, flags);
-       return handler;
-}
-
-static inline void mmu_notifier_page(struct mmu_notifier *mn,
-                                    struct mm_struct *mm, unsigned long addr)
-{
-       mmu_notifier_mem_invalidate(mn, mm, addr, addr + PAGE_SIZE);
-}
-
-static inline void mmu_notifier_range_start(struct mmu_notifier *mn,
-                                           struct mm_struct *mm,
-                                           unsigned long start,
-                                           unsigned long end)
-{
-       mmu_notifier_mem_invalidate(mn, mm, start, end);
-}
-
-static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn,
-                                       struct mm_struct *mm,
-                                       unsigned long start, unsigned long end)
-{
-       struct mmu_rb_handler *handler =
-               container_of(mn, struct mmu_rb_handler, mn);
-       struct rb_root *root = handler->root;
-       struct mmu_rb_node *node, *ptr = NULL;
-       unsigned long flags;
-
-       spin_lock_irqsave(&handler->lock, flags);
-       for (node = __mmu_int_rb_iter_first(root, start, end - 1);
-            node; node = ptr) {
-               /* Guard against node removal. */
-               ptr = __mmu_int_rb_iter_next(node, start, end - 1);
-               hfi1_cdbg(MMU, "Invalidating node addr 0x%llx, len %u",
-                         node->addr, node->len);
-               if (handler->ops->invalidate(root, node)) {
-                       __mmu_int_rb_remove(node, root);
-                       if (handler->ops->remove)
-                               handler->ops->remove(root, node, mm);
-               }
-       }
-       spin_unlock_irqrestore(&handler->lock, flags);
-}
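/*
 * Editor's note: a stand-alone model (not driver code) of the interval
 * arithmetic used above.  Nodes are widened to page granularity, with
 * start = addr & PAGE_MASK and last = PAGE_ALIGN(addr + len) - 1, and a
 * query range [addr, addr + len) hits a node when the two closed
 * intervals overlap.  PAGE_SIZE is assumed to be 4 KiB here.
 */
#include <stdio.h>
#include <stdbool.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & PAGE_MASK)

struct node { unsigned long addr, len; };

static unsigned long node_start(const struct node *n)
{
	return n->addr & PAGE_MASK;
}

static unsigned long node_last(const struct node *n)
{
	return PAGE_ALIGN(n->addr + n->len) - 1;
}

static bool overlaps(const struct node *n, unsigned long addr, unsigned long len)
{
	return node_start(n) <= (addr + len) - 1 && addr <= node_last(n);
}

int main(void)
{
	struct node n = { .addr = 0x10100, .len = 0x200 };

	/* the query touches the same page as the node, so it matches */
	printf("%d\n", overlaps(&n, 0x10800, 0x100));
	return 0;
}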
diff --git a/drivers/staging/rdma/hfi1/mmu_rb.h b/drivers/staging/rdma/hfi1/mmu_rb.h
deleted file mode 100644 (file)
index 7a57b9c..0000000
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright(c) 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#ifndef _HFI1_MMU_RB_H
-#define _HFI1_MMU_RB_H
-
-#include "hfi.h"
-
-struct mmu_rb_node {
-       unsigned long addr;
-       unsigned long len;
-       unsigned long __last;
-       struct rb_node node;
-};
-
-struct mmu_rb_ops {
-       bool (*filter)(struct mmu_rb_node *, unsigned long, unsigned long);
-       int (*insert)(struct rb_root *, struct mmu_rb_node *);
-       void (*remove)(struct rb_root *, struct mmu_rb_node *,
-                      struct mm_struct *);
-       int (*invalidate)(struct rb_root *, struct mmu_rb_node *);
-};
-
-int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops);
-void hfi1_mmu_rb_unregister(struct rb_root *);
-int hfi1_mmu_rb_insert(struct rb_root *, struct mmu_rb_node *);
-void hfi1_mmu_rb_remove(struct rb_root *, struct mmu_rb_node *);
-struct mmu_rb_node *hfi1_mmu_rb_search(struct rb_root *, unsigned long,
-                                      unsigned long);
-struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *, unsigned long,
-                                       unsigned long);
-
-#endif /* _HFI1_MMU_RB_H */
diff --git a/drivers/staging/rdma/hfi1/opa_compat.h b/drivers/staging/rdma/hfi1/opa_compat.h
deleted file mode 100644 (file)
index 6ef3c1c..0000000
+++ /dev/null
@@ -1,111 +0,0 @@
-#ifndef _LINUX_H
-#define _LINUX_H
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-/*
- * This header file is for OPA-specific definitions which are
- * required by the HFI driver, and which aren't yet in the Linux
- * IB core. We'll collect these all here, then merge them into
- * the kernel when that's convenient.
- */
-
-/* OPA SMA attribute IDs */
-#define OPA_ATTRIB_ID_CONGESTION_INFO          cpu_to_be16(0x008b)
-#define OPA_ATTRIB_ID_HFI_CONGESTION_LOG       cpu_to_be16(0x008f)
-#define OPA_ATTRIB_ID_HFI_CONGESTION_SETTING   cpu_to_be16(0x0090)
-#define OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE cpu_to_be16(0x0091)
-
-/* OPA PMA attribute IDs */
-#define OPA_PM_ATTRIB_ID_PORT_STATUS           cpu_to_be16(0x0040)
-#define OPA_PM_ATTRIB_ID_CLEAR_PORT_STATUS     cpu_to_be16(0x0041)
-#define OPA_PM_ATTRIB_ID_DATA_PORT_COUNTERS    cpu_to_be16(0x0042)
-#define OPA_PM_ATTRIB_ID_ERROR_PORT_COUNTERS   cpu_to_be16(0x0043)
-#define OPA_PM_ATTRIB_ID_ERROR_INFO            cpu_to_be16(0x0044)
-
-/* OPA status codes */
-#define OPA_PM_STATUS_REQUEST_TOO_LARGE                cpu_to_be16(0x100)
-
-static inline u8 port_states_to_logical_state(struct opa_port_states *ps)
-{
-       return ps->portphysstate_portstate & OPA_PI_MASK_PORT_STATE;
-}
-
-static inline u8 port_states_to_phys_state(struct opa_port_states *ps)
-{
-       return ((ps->portphysstate_portstate &
-                 OPA_PI_MASK_PORT_PHYSICAL_STATE) >> 4) & 0xf;
-}
-
-/*
- * OPA port physical states
- * IB Volume 1, Table 146 PortInfo/IB Volume 2 Section 5.4.2(1) PortPhysState
- * values.
- *
- * When writing, only values 0-3 are valid, other values are ignored.
- * When reading, 0 is reserved.
- *
- * Returned by the ibphys_portstate() routine.
- */
-enum opa_port_phys_state {
-       IB_PORTPHYSSTATE_NOP = 0,
-       /* 1 is reserved */
-       IB_PORTPHYSSTATE_POLLING = 2,
-       IB_PORTPHYSSTATE_DISABLED = 3,
-       IB_PORTPHYSSTATE_TRAINING = 4,
-       IB_PORTPHYSSTATE_LINKUP = 5,
-       IB_PORTPHYSSTATE_LINK_ERROR_RECOVERY = 6,
-       IB_PORTPHYSSTATE_PHY_TEST = 7,
-       /* 8 is reserved */
-       OPA_PORTPHYSSTATE_OFFLINE = 9,
-       OPA_PORTPHYSSTATE_GANGED = 10,
-       OPA_PORTPHYSSTATE_TEST = 11,
-       OPA_PORTPHYSSTATE_MAX = 11,
-       /* values 12-15 are reserved/ignored */
-};
-
-#endif /* _LINUX_H */
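/*
 * Editor's note: a stand-alone illustration (not part of the removed
 * header) of how the two helpers above split the packed
 * portphysstate_portstate byte: the logical port state lives in the low
 * nibble and the physical state in the high nibble.  The mask values here
 * are local stand-ins for OPA_PI_MASK_PORT_STATE and
 * OPA_PI_MASK_PORT_PHYSICAL_STATE, which are defined elsewhere.
 */
#include <stdio.h>
#include <stdint.h>

#define PORT_STATE_MASK		0x0f	/* assumed: low nibble, logical state */
#define PORT_PHYS_STATE_MASK	0xf0	/* assumed: high nibble, physical state */

int main(void)
{
	/* physical state 5 = LinkUp (per the enum above), logical state 4 = Active */
	uint8_t packed = (5 << 4) | 4;

	printf("logical=%d physical=%d\n",
	       packed & PORT_STATE_MASK,
	       (packed & PORT_PHYS_STATE_MASK) >> 4);
	return 0;
}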
diff --git a/drivers/staging/rdma/hfi1/pcie.c b/drivers/staging/rdma/hfi1/pcie.c
deleted file mode 100644 (file)
index 0bac21e..0000000
+++ /dev/null
@@ -1,1338 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/pci.h>
-#include <linux/io.h>
-#include <linux/delay.h>
-#include <linux/vmalloc.h>
-#include <linux/aer.h>
-#include <linux/module.h>
-
-#include "hfi.h"
-#include "chip_registers.h"
-#include "aspm.h"
-
-/* link speed vector for Gen3 speed - not in Linux headers */
-#define GEN1_SPEED_VECTOR 0x1
-#define GEN2_SPEED_VECTOR 0x2
-#define GEN3_SPEED_VECTOR 0x3
-
-/*
- * This file contains PCIe utility routines.
- */
-
-/*
- * Code to adjust PCIe capabilities.
- */
-static void tune_pcie_caps(struct hfi1_devdata *);
-
-/*
- * Do all the common PCIe setup and initialization.
- * devdata is not yet allocated, and is not allocated until after this
- * routine returns success.  Therefore dd_dev_err() can't be used for error
- * printing.
- */
-int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent)
-{
-       int ret;
-
-       ret = pci_enable_device(pdev);
-       if (ret) {
-               /*
-                * This can happen (in theory) iff:
-                * We did a chip reset, and then failed to reprogram the
-                * BAR, or the chip reset due to an internal error.  We then
-                * unloaded the driver and reloaded it.
-                *
-                * Both reset cases set the BAR back to initial state.  For
-                * the latter case, the AER sticky error bit at offset 0x718
-                * should be set, but the Linux kernel doesn't yet know
-                * about that, it appears.  If the original BAR was retained
-                * in the kernel data structures, this may be OK.
-                */
-               hfi1_early_err(&pdev->dev, "pci enable failed: error %d\n",
-                              -ret);
-               goto done;
-       }
-
-       ret = pci_request_regions(pdev, DRIVER_NAME);
-       if (ret) {
-               hfi1_early_err(&pdev->dev,
-                              "pci_request_regions fails: err %d\n", -ret);
-               goto bail;
-       }
-
-       ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
-       if (ret) {
-               /*
-                * If the 64 bit setup fails, try 32 bit.  Some systems
-                * do not setup 64 bit maps on systems with 2GB or less
-                * memory installed.
-                */
-               ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
-               if (ret) {
-                       hfi1_early_err(&pdev->dev,
-                                      "Unable to set DMA mask: %d\n", ret);
-                       goto bail;
-               }
-               ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
-       } else {
-               ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
-       }
-       if (ret) {
-               hfi1_early_err(&pdev->dev,
-                              "Unable to set DMA consistent mask: %d\n", ret);
-               goto bail;
-       }
-
-       pci_set_master(pdev);
-       (void)pci_enable_pcie_error_reporting(pdev);
-       goto done;
-
-bail:
-       hfi1_pcie_cleanup(pdev);
-done:
-       return ret;
-}
-
-/*
- * Clean what was done in hfi1_pcie_init()
- */
-void hfi1_pcie_cleanup(struct pci_dev *pdev)
-{
-       pci_disable_device(pdev);
-       /*
-        * Release regions should be called after the disable. OK to
-        * call if request regions has not been called or failed.
-        */
-       pci_release_regions(pdev);
-}
-
-/*
- * Do remaining PCIe setup, once dd is allocated, and save away
- * fields required to re-initialize after a chip reset, or for
- * various other purposes
- */
-int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev,
-                    const struct pci_device_id *ent)
-{
-       unsigned long len;
-       resource_size_t addr;
-
-       dd->pcidev = pdev;
-       pci_set_drvdata(pdev, dd);
-
-       addr = pci_resource_start(pdev, 0);
-       len = pci_resource_len(pdev, 0);
-
-       /*
-        * The TXE PIO buffers are at the tail end of the chip space.
-        * Cut them off and map them separately.
-        */
-
-       /* sanity check vs expectations */
-       if (len != TXE_PIO_SEND + TXE_PIO_SIZE) {
-               dd_dev_err(dd, "chip PIO range does not match\n");
-               return -EINVAL;
-       }
-
-       dd->kregbase = ioremap_nocache(addr, TXE_PIO_SEND);
-       if (!dd->kregbase)
-               return -ENOMEM;
-
-       dd->piobase = ioremap_wc(addr + TXE_PIO_SEND, TXE_PIO_SIZE);
-       if (!dd->piobase) {
-               iounmap(dd->kregbase);
-               return -ENOMEM;
-       }
-
-       dd->flags |= HFI1_PRESENT;      /* now register routines work */
-
-       dd->kregend = dd->kregbase + TXE_PIO_SEND;
-       dd->physaddr = addr;        /* used for io_remap, etc. */
-
-       /*
-        * Re-map the chip's RcvArray as write-combining to allow us
-        * to write an entire cacheline worth of entries in one shot.
-        * If this re-map fails, just continue - the RcvArray programming
-        * function will handle both cases.
-        */
-       dd->chip_rcv_array_count = read_csr(dd, RCV_ARRAY_CNT);
-       dd->rcvarray_wc = ioremap_wc(addr + RCV_ARRAY,
-                                    dd->chip_rcv_array_count * 8);
-       dd_dev_info(dd, "WC Remapped RcvArray: %p\n", dd->rcvarray_wc);
-       /*
-        * Save BARs and command to rewrite after device reset.
-        */
-       dd->pcibar0 = addr;
-       dd->pcibar1 = addr >> 32;
-       pci_read_config_dword(dd->pcidev, PCI_ROM_ADDRESS, &dd->pci_rom);
-       pci_read_config_word(dd->pcidev, PCI_COMMAND, &dd->pci_command);
-       pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, &dd->pcie_devctl);
-       pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL, &dd->pcie_lnkctl);
-       pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL2,
-                                 &dd->pcie_devctl2);
-       pci_read_config_dword(dd->pcidev, PCI_CFG_MSIX0, &dd->pci_msix0);
-       pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE1, &dd->pci_lnkctl3);
-       pci_read_config_dword(dd->pcidev, PCIE_CFG_TPH2, &dd->pci_tph2);
-
-       return 0;
-}
-
-/*
- * Do PCIe cleanup related to dd, after chip-specific cleanup, etc.  Just prior
- * to releasing the dd memory.
- * Void because all of the core pcie cleanup functions are void.
- */
-void hfi1_pcie_ddcleanup(struct hfi1_devdata *dd)
-{
-       u64 __iomem *base = (void __iomem *)dd->kregbase;
-
-       dd->flags &= ~HFI1_PRESENT;
-       dd->kregbase = NULL;
-       iounmap(base);
-       if (dd->rcvarray_wc)
-               iounmap(dd->rcvarray_wc);
-       if (dd->piobase)
-               iounmap(dd->piobase);
-}
-
-/*
- * Do a Function Level Reset (FLR) on the device.
- * Based on static function drivers/pci/pci.c:pcie_flr().
- */
-void hfi1_pcie_flr(struct hfi1_devdata *dd)
-{
-       int i;
-       u16 status;
-
-       /* no need to check for the capability - we know the device has it */
-
-       /* wait for Transaction Pending bit to clear, at most a few ms */
-       for (i = 0; i < 4; i++) {
-               if (i)
-                       msleep((1 << (i - 1)) * 100);
-
-               pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVSTA, &status);
-               if (!(status & PCI_EXP_DEVSTA_TRPND))
-                       goto clear;
-       }
-
-       dd_dev_err(dd, "Transaction Pending bit is not clearing, proceeding with reset anyway\n");
-
-clear:
-       pcie_capability_set_word(dd->pcidev, PCI_EXP_DEVCTL,
-                                PCI_EXP_DEVCTL_BCR_FLR);
-       /* PCIe spec requires the function to be back within 100ms */
-       msleep(100);
-}
-
-static void msix_setup(struct hfi1_devdata *dd, int pos, u32 *msixcnt,
-                      struct hfi1_msix_entry *hfi1_msix_entry)
-{
-       int ret;
-       int nvec = *msixcnt;
-       struct msix_entry *msix_entry;
-       int i;
-
-       /*
-        * We can't pass hfi1_msix_entry array to msix_setup
-        * so use a dummy msix_entry array and copy the allocated
-        * irq back to the hfi1_msix_entry array.
-        */
-       msix_entry = kmalloc_array(nvec, sizeof(*msix_entry), GFP_KERNEL);
-       if (!msix_entry) {
-               ret = -ENOMEM;
-               goto do_intx;
-       }
-
-       for (i = 0; i < nvec; i++)
-               msix_entry[i] = hfi1_msix_entry[i].msix;
-
-       ret = pci_enable_msix_range(dd->pcidev, msix_entry, 1, nvec);
-       if (ret < 0)
-               goto free_msix_entry;
-       nvec = ret;
-
-       for (i = 0; i < nvec; i++)
-               hfi1_msix_entry[i].msix = msix_entry[i];
-
-       kfree(msix_entry);
-       *msixcnt = nvec;
-       return;
-
-free_msix_entry:
-       kfree(msix_entry);
-
-do_intx:
-       dd_dev_err(dd, "pci_enable_msix_range %d vectors failed: %d, falling back to INTx\n",
-                  nvec, ret);
-       *msixcnt = 0;
-       hfi1_enable_intx(dd->pcidev);
-}
-
-/* return the PCIe link speed from the given link status */
-static u32 extract_speed(u16 linkstat)
-{
-       u32 speed;
-
-       switch (linkstat & PCI_EXP_LNKSTA_CLS) {
-       default: /* not defined, assume Gen1 */
-       case PCI_EXP_LNKSTA_CLS_2_5GB:
-               speed = 2500; /* Gen 1, 2.5GHz */
-               break;
-       case PCI_EXP_LNKSTA_CLS_5_0GB:
-               speed = 5000; /* Gen 2, 5GHz */
-               break;
-       case GEN3_SPEED_VECTOR:
-               speed = 8000; /* Gen 3, 8GHz */
-               break;
-       }
-       return speed;
-}
-
-/* return the PCIe link width from the given link status */
-static u32 extract_width(u16 linkstat)
-{
-       return (linkstat & PCI_EXP_LNKSTA_NLW) >> PCI_EXP_LNKSTA_NLW_SHIFT;
-}
-
-/* read the link status and set dd->{lbus_width,lbus_speed,lbus_info} */
-static void update_lbus_info(struct hfi1_devdata *dd)
-{
-       u16 linkstat;
-
-       pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKSTA, &linkstat);
-       dd->lbus_width = extract_width(linkstat);
-       dd->lbus_speed = extract_speed(linkstat);
-       snprintf(dd->lbus_info, sizeof(dd->lbus_info),
-                "PCIe,%uMHz,x%u", dd->lbus_speed, dd->lbus_width);
-}
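/*
 * Editor's note: a user-space sketch (not driver code) of the link-status
 * decoding done by extract_speed()/extract_width() above.  The field masks
 * below mirror the PCI_EXP_LNKSTA_* values from <uapi/linux/pci_regs.h>:
 * current link speed code in bits 3:0, negotiated link width in bits 9:4.
 */
#include <stdio.h>
#include <stdint.h>

#define LNKSTA_CLS		0x000f	/* current link speed code */
#define LNKSTA_NLW		0x03f0	/* negotiated link width */
#define LNKSTA_NLW_SHIFT	4

int main(void)
{
	uint16_t linkstat = 0x0083;	/* speed code 3 (8 GT/s), width x8 */
	static const unsigned int mhz[] = { 0, 2500, 5000, 8000 };
	unsigned int code = linkstat & LNKSTA_CLS;
	unsigned int width = (linkstat & LNKSTA_NLW) >> LNKSTA_NLW_SHIFT;

	/* same format string as dd->lbus_info above */
	printf("PCIe,%uMHz,x%u\n", code < 4 ? mhz[code] : 0, width);
	return 0;
}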
-
-/*
- * Read in the current PCIe link width and speed.  Find if the link is
- * Gen3 capable.
- */
-int pcie_speeds(struct hfi1_devdata *dd)
-{
-       u32 linkcap;
-       struct pci_dev *parent = dd->pcidev->bus->self;
-
-       if (!pci_is_pcie(dd->pcidev)) {
-               dd_dev_err(dd, "Can't find PCI Express capability!\n");
-               return -EINVAL;
-       }
-
-       /* find if our max speed is Gen3 and parent supports Gen3 speeds */
-       dd->link_gen3_capable = 1;
-
-       pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &linkcap);
-       if ((linkcap & PCI_EXP_LNKCAP_SLS) != GEN3_SPEED_VECTOR) {
-               dd_dev_info(dd,
-                           "This HFI is not Gen3 capable, max speed 0x%x, need 0x3\n",
-                           linkcap & PCI_EXP_LNKCAP_SLS);
-               dd->link_gen3_capable = 0;
-       }
-
-       /*
-        * bus->max_bus_speed is set from the bridge's linkcap Max Link Speed
-        */
-       if (parent && dd->pcidev->bus->max_bus_speed != PCIE_SPEED_8_0GT) {
-               dd_dev_info(dd, "Parent PCIe bridge does not support Gen3\n");
-               dd->link_gen3_capable = 0;
-       }
-
-       /* obtain the link width and current speed */
-       update_lbus_info(dd);
-
-       dd_dev_info(dd, "%s\n", dd->lbus_info);
-
-       return 0;
-}
-
-/*
- * Returns in *nent:
- *     - actual number of interrupts allocated
- *     - 0 if fell back to INTx.
- */
-void request_msix(struct hfi1_devdata *dd, u32 *nent,
-                 struct hfi1_msix_entry *entry)
-{
-       int pos;
-
-       pos = dd->pcidev->msix_cap;
-       if (*nent && pos) {
-               msix_setup(dd, pos, nent, entry);
-               /* did it, either MSI-X or INTx */
-       } else {
-               *nent = 0;
-               hfi1_enable_intx(dd->pcidev);
-       }
-
-       tune_pcie_caps(dd);
-}
-
-void hfi1_enable_intx(struct pci_dev *pdev)
-{
-       /* first, turn on INTx */
-       pci_intx(pdev, 1);
-       /* then turn off MSI-X */
-       pci_disable_msix(pdev);
-}
-
-/* restore command and BARs after a reset has wiped them out */
-void restore_pci_variables(struct hfi1_devdata *dd)
-{
-       pci_write_config_word(dd->pcidev, PCI_COMMAND, dd->pci_command);
-       pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_0, dd->pcibar0);
-       pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_1, dd->pcibar1);
-       pci_write_config_dword(dd->pcidev, PCI_ROM_ADDRESS, dd->pci_rom);
-       pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, dd->pcie_devctl);
-       pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL, dd->pcie_lnkctl);
-       pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL2,
-                                  dd->pcie_devctl2);
-       pci_write_config_dword(dd->pcidev, PCI_CFG_MSIX0, dd->pci_msix0);
-       pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE1, dd->pci_lnkctl3);
-       pci_write_config_dword(dd->pcidev, PCIE_CFG_TPH2, dd->pci_tph2);
-}
-
-/*
- * BIOS may not set PCIe bus-utilization parameters for best performance.
- * Check and optionally adjust them to maximize our throughput.
- */
-static int hfi1_pcie_caps;
-module_param_named(pcie_caps, hfi1_pcie_caps, int, S_IRUGO);
-MODULE_PARM_DESC(pcie_caps, "Max PCIe tuning: Payload (0..3), ReadReq (4..7)");
-
-uint aspm_mode = ASPM_MODE_DISABLED;
-module_param_named(aspm, aspm_mode, uint, S_IRUGO);
-MODULE_PARM_DESC(aspm, "PCIe ASPM: 0: disable, 1: enable, 2: dynamic");
-
-static void tune_pcie_caps(struct hfi1_devdata *dd)
-{
-       struct pci_dev *parent;
-       u16 rc_mpss, rc_mps, ep_mpss, ep_mps;
-       u16 rc_mrrs, ep_mrrs, max_mrrs, ectl;
-
-       /*
-        * Turn on extended tags in DevCtl in case the BIOS has turned it off
-        * to improve WFR SDMA bandwidth
-        */
-       pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, &ectl);
-       if (!(ectl & PCI_EXP_DEVCTL_EXT_TAG)) {
-               dd_dev_info(dd, "Enabling PCIe extended tags\n");
-               ectl |= PCI_EXP_DEVCTL_EXT_TAG;
-               pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, ectl);
-       }
-       /* Find out supported and configured values for parent (root) */
-       parent = dd->pcidev->bus->self;
-       /*
-        * The driver cannot perform the tuning if it does not have
-        * access to the upstream component.
-        */
-       if (!parent)
-               return;
-       if (!pci_is_root_bus(parent->bus)) {
-               dd_dev_info(dd, "Parent not root\n");
-               return;
-       }
-
-       if (!pci_is_pcie(parent) || !pci_is_pcie(dd->pcidev))
-               return;
-       rc_mpss = parent->pcie_mpss;
-       rc_mps = ffs(pcie_get_mps(parent)) - 8;
-       /* Find out supported and configured values for endpoint (us) */
-       ep_mpss = dd->pcidev->pcie_mpss;
-       ep_mps = ffs(pcie_get_mps(dd->pcidev)) - 8;
-
-       /* Find max payload supported by root, endpoint */
-       if (rc_mpss > ep_mpss)
-               rc_mpss = ep_mpss;
-
-       /* If Supported greater than limit in module param, limit it */
-       if (rc_mpss > (hfi1_pcie_caps & 7))
-               rc_mpss = hfi1_pcie_caps & 7;
-       /* If less than (allowed, supported), bump root payload */
-       if (rc_mpss > rc_mps) {
-               rc_mps = rc_mpss;
-               pcie_set_mps(parent, 128 << rc_mps);
-       }
-       /* If less than (allowed, supported), bump endpoint payload */
-       if (rc_mpss > ep_mps) {
-               ep_mps = rc_mpss;
-               pcie_set_mps(dd->pcidev, 128 << ep_mps);
-       }
-
-       /*
-        * Now the Read Request size.
-        * No field for max supported, but PCIe spec limits it to 4096,
-        * which is code '5' (log2(4096) - 7)
-        */
-       max_mrrs = 5;
-       if (max_mrrs > ((hfi1_pcie_caps >> 4) & 7))
-               max_mrrs = (hfi1_pcie_caps >> 4) & 7;
-
-       max_mrrs = 128 << max_mrrs;
-       rc_mrrs = pcie_get_readrq(parent);
-       ep_mrrs = pcie_get_readrq(dd->pcidev);
-
-       if (max_mrrs > rc_mrrs) {
-               rc_mrrs = max_mrrs;
-               pcie_set_readrq(parent, rc_mrrs);
-       }
-       if (max_mrrs > ep_mrrs) {
-               ep_mrrs = max_mrrs;
-               pcie_set_readrq(dd->pcidev, ep_mrrs);
-       }
-}
-
-/* End of PCIe capability tuning */
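/*
 * Editor's note: a stand-alone sketch (not driver code) of the arithmetic
 * used by tune_pcie_caps() above.  MaxPayload and MaxReadReq are encoded
 * as a 3-bit code where bytes = 128 << code, and the pcie_caps module
 * parameter packs the payload limit in bits 2:0 and the read-request
 * limit in bits 6:4, exactly as masked in the function.
 */
#include <stdio.h>

int main(void)
{
	int pcie_caps = 0x53;		/* example: payload code 3, readreq code 5 */
	int mps_code = pcie_caps & 7;
	int mrrs_code = (pcie_caps >> 4) & 7;

	printf("MaxPayload limit = %d bytes\n", 128 << mps_code);	/* 1024 */
	printf("MaxReadReq limit = %d bytes\n", 128 << mrrs_code);	/* 4096 */
	return 0;
}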
-
-/*
- * From here through hfi1_pci_err_handler definition is invoked via
- * PCI error infrastructure, registered via pci
- */
-static pci_ers_result_t
-pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
-{
-       struct hfi1_devdata *dd = pci_get_drvdata(pdev);
-       pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED;
-
-       switch (state) {
-       case pci_channel_io_normal:
-               dd_dev_info(dd, "State Normal, ignoring\n");
-               break;
-
-       case pci_channel_io_frozen:
-               dd_dev_info(dd, "State Frozen, requesting reset\n");
-               pci_disable_device(pdev);
-               ret = PCI_ERS_RESULT_NEED_RESET;
-               break;
-
-       case pci_channel_io_perm_failure:
-               if (dd) {
-                       dd_dev_info(dd, "State Permanent Failure, disabling\n");
-                       /* no more register accesses! */
-                       dd->flags &= ~HFI1_PRESENT;
-                       hfi1_disable_after_error(dd);
-               }
-                /* else early, or other problem */
-               ret = PCI_ERS_RESULT_DISCONNECT;
-               break;
-
-       default: /* shouldn't happen */
-               dd_dev_info(dd, "HFI1 PCI errors detected (state %d)\n",
-                           state);
-               break;
-       }
-       return ret;
-}
-
-static pci_ers_result_t
-pci_mmio_enabled(struct pci_dev *pdev)
-{
-       u64 words = 0U;
-       struct hfi1_devdata *dd = pci_get_drvdata(pdev);
-       pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED;
-
-       if (dd && dd->pport) {
-               words = read_port_cntr(dd->pport, C_RX_WORDS, CNTR_INVALID_VL);
-               if (words == ~0ULL)
-                       ret = PCI_ERS_RESULT_NEED_RESET;
-               dd_dev_info(dd,
-                           "HFI1 mmio_enabled function called, read wordscntr %Lx, returning %d\n",
-                           words, ret);
-       }
-       return  ret;
-}
-
-static pci_ers_result_t
-pci_slot_reset(struct pci_dev *pdev)
-{
-       struct hfi1_devdata *dd = pci_get_drvdata(pdev);
-
-       dd_dev_info(dd, "HFI1 slot_reset function called, ignored\n");
-       return PCI_ERS_RESULT_CAN_RECOVER;
-}
-
-static pci_ers_result_t
-pci_link_reset(struct pci_dev *pdev)
-{
-       struct hfi1_devdata *dd = pci_get_drvdata(pdev);
-
-       dd_dev_info(dd, "HFI1 link_reset function called, ignored\n");
-       return PCI_ERS_RESULT_CAN_RECOVER;
-}
-
-static void
-pci_resume(struct pci_dev *pdev)
-{
-       struct hfi1_devdata *dd = pci_get_drvdata(pdev);
-
-       dd_dev_info(dd, "HFI1 resume function called\n");
-       pci_cleanup_aer_uncorrect_error_status(pdev);
-       /*
-        * Running jobs will fail, since it's asynchronous
-        * unlike sysfs-requested reset.   Better than
-        * doing nothing.
-        */
-       hfi1_init(dd, 1); /* same as re-init after reset */
-}
-
-const struct pci_error_handlers hfi1_pci_err_handler = {
-       .error_detected = pci_error_detected,
-       .mmio_enabled = pci_mmio_enabled,
-       .link_reset = pci_link_reset,
-       .slot_reset = pci_slot_reset,
-       .resume = pci_resume,
-};
-
-/*============================================================================*/
-/* PCIe Gen3 support */
-
-/*
- * This code is separated out because it is expected to be removed in the
- * final shipping product.  If not, then it will be revisited and items
- * will be moved to more standard locations.
- */
-
-/* ASIC_PCI_SD_HOST_STATUS.FW_DNLD_STS field values */
-#define DL_STATUS_HFI0 0x1     /* hfi0 firmware download complete */
-#define DL_STATUS_HFI1 0x2     /* hfi1 firmware download complete */
-#define DL_STATUS_BOTH 0x3     /* hfi0 and hfi1 firmware download complete */
-
-/* ASIC_PCI_SD_HOST_STATUS.FW_DNLD_ERR field values */
-#define DL_ERR_NONE            0x0     /* no error */
-#define DL_ERR_SWAP_PARITY     0x1     /* parity error in SerDes interrupt */
-                                       /*   or response data */
-#define DL_ERR_DISABLED        0x2     /* hfi disabled */
-#define DL_ERR_SECURITY        0x3     /* security check failed */
-#define DL_ERR_SBUS            0x4     /* SBus status error */
-#define DL_ERR_XFR_PARITY      0x5     /* parity error during ROM transfer */
-
-/* gasket block secondary bus reset delay */
-#define SBR_DELAY_US 200000    /* 200ms */
-
-/* mask for PCIe capability register lnkctl2 target link speed */
-#define LNKCTL2_TARGET_LINK_SPEED_MASK 0xf
-
-static uint pcie_target = 3;
-module_param(pcie_target, uint, S_IRUGO);
-MODULE_PARM_DESC(pcie_target, "PCIe target speed (0 skip, 1-3 Gen1-3)");
-
-static uint pcie_force;
-module_param(pcie_force, uint, S_IRUGO);
-MODULE_PARM_DESC(pcie_force, "Force driver to do a PCIe firmware download even if already at target speed");
-
-static uint pcie_retry = 5;
-module_param(pcie_retry, uint, S_IRUGO);
-MODULE_PARM_DESC(pcie_retry, "Driver will try this many times to reach requested speed");
-
-#define UNSET_PSET 255
-#define DEFAULT_DISCRETE_PSET 2        /* discrete HFI */
-#define DEFAULT_MCP_PSET 4     /* MCP HFI */
-static uint pcie_pset = UNSET_PSET;
-module_param(pcie_pset, uint, S_IRUGO);
-MODULE_PARM_DESC(pcie_pset, "PCIe Eq Pset value to use, range is 0-10");
-
-/* equalization columns */
-#define PREC 0
-#define ATTN 1
-#define POST 2
-
-/* discrete silicon preliminary equalization values */
-static const u8 discrete_preliminary_eq[11][3] = {
-       /* prec   attn   post */
-       {  0x00,  0x00,  0x12 },        /* p0 */
-       {  0x00,  0x00,  0x0c },        /* p1 */
-       {  0x00,  0x00,  0x0f },        /* p2 */
-       {  0x00,  0x00,  0x09 },        /* p3 */
-       {  0x00,  0x00,  0x00 },        /* p4 */
-       {  0x06,  0x00,  0x00 },        /* p5 */
-       {  0x09,  0x00,  0x00 },        /* p6 */
-       {  0x06,  0x00,  0x0f },        /* p7 */
-       {  0x09,  0x00,  0x09 },        /* p8 */
-       {  0x0c,  0x00,  0x00 },        /* p9 */
-       {  0x00,  0x00,  0x18 },        /* p10 */
-};
-
-/* integrated silicon preliminary equalization values */
-static const u8 integrated_preliminary_eq[11][3] = {
-       /* prec   attn   post */
-       {  0x00,  0x1e,  0x07 },        /* p0 */
-       {  0x00,  0x1e,  0x05 },        /* p1 */
-       {  0x00,  0x1e,  0x06 },        /* p2 */
-       {  0x00,  0x1e,  0x04 },        /* p3 */
-       {  0x00,  0x1e,  0x00 },        /* p4 */
-       {  0x03,  0x1e,  0x00 },        /* p5 */
-       {  0x04,  0x1e,  0x00 },        /* p6 */
-       {  0x03,  0x1e,  0x06 },        /* p7 */
-       {  0x03,  0x1e,  0x04 },        /* p8 */
-       {  0x05,  0x1e,  0x00 },        /* p9 */
-       {  0x00,  0x1e,  0x0a },        /* p10 */
-};
-
-/* helper to format the value to write to hardware */
-#define eq_value(pre, curr, post) \
-       ((((u32)(pre)) << \
-                       PCIE_CFG_REG_PL102_GEN3_EQ_PRE_CURSOR_PSET_SHIFT) \
-       | (((u32)(curr)) << PCIE_CFG_REG_PL102_GEN3_EQ_CURSOR_PSET_SHIFT) \
-       | (((u32)(post)) << \
-               PCIE_CFG_REG_PL102_GEN3_EQ_POST_CURSOR_PSET_SHIFT))
-
-/*
- * Load the given EQ preset table into the PCIe hardware.
- */
-static int load_eq_table(struct hfi1_devdata *dd, const u8 eq[11][3], u8 fs,
-                        u8 div)
-{
-       struct pci_dev *pdev = dd->pcidev;
-       u32 hit_error = 0;
-       u32 violation;
-       u32 i;
-       u8 c_minus1, c0, c_plus1;
-
-       for (i = 0; i < 11; i++) {
-               /* set index */
-               pci_write_config_dword(pdev, PCIE_CFG_REG_PL103, i);
-               /* write the value */
-               c_minus1 = eq[i][PREC] / div;
-               c0 = fs - (eq[i][PREC] / div) - (eq[i][POST] / div);
-               c_plus1 = eq[i][POST] / div;
-               pci_write_config_dword(pdev, PCIE_CFG_REG_PL102,
-                                      eq_value(c_minus1, c0, c_plus1));
-               /* check if these coefficients violate EQ rules */
-               pci_read_config_dword(dd->pcidev, PCIE_CFG_REG_PL105,
-                                     &violation);
-               if (violation
-                   & PCIE_CFG_REG_PL105_GEN3_EQ_VIOLATE_COEF_RULES_SMASK){
-                       if (hit_error == 0) {
-                               dd_dev_err(dd,
-                                          "Gen3 EQ Table Coefficient rule violations\n");
-                               dd_dev_err(dd, "         prec   attn   post\n");
-                       }
-                       dd_dev_err(dd, "   p%02d:   %02x     %02x     %02x\n",
-                                  i, (u32)eq[i][0], (u32)eq[i][1],
-                                  (u32)eq[i][2]);
-                       dd_dev_err(dd, "            %02x     %02x     %02x\n",
-                                  (u32)c_minus1, (u32)c0, (u32)c_plus1);
-                       hit_error = 1;
-               }
-       }
-       if (hit_error)
-               return -EINVAL;
-       return 0;
-}
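/*
 * Editor's note: a stand-alone sketch (not driver code) of the Gen3
 * equalization coefficient math used in load_eq_table() above.  For each
 * preset the pre/post cursor values are scaled by "div" and the main
 * cursor absorbs the remainder of the full swing "fs", so
 * c(-1) + c(0) + c(+1) == fs by construction.  The fs/div values below
 * are placeholders; the real ones come from the platform configuration.
 */
#include <stdio.h>

int main(void)
{
	const unsigned char eq[3] = { 0x00, 0x00, 0x12 };	/* preset p0: prec, attn, post */
	unsigned int fs = 24, div = 1;				/* placeholder swing/scale */
	unsigned int c_minus1 = eq[0] / div;
	unsigned int c_plus1 = eq[2] / div;
	unsigned int c0 = fs - c_minus1 - c_plus1;

	printf("c-1=%u c0=%u c+1=%u (sum=%u)\n",
	       c_minus1, c0, c_plus1, c_minus1 + c0 + c_plus1);
	return 0;
}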
-
-/*
- * Steps to be done after the PCIe firmware is downloaded and
- * before the SBR for the PCIe Gen3.
- * The SBus resource is already being held.
- */
-static void pcie_post_steps(struct hfi1_devdata *dd)
-{
-       int i;
-
-       set_sbus_fast_mode(dd);
-       /*
-        * Write to the PCIe PCSes to set the G3_LOCKED_NEXT bits to 1.
-        * This avoids a spurious framing error that can otherwise be
-        * generated by the MAC layer.
-        *
-        * Use individual addresses since no broadcast is set up.
-        */
-       for (i = 0; i < NUM_PCIE_SERDES; i++) {
-               sbus_request(dd, pcie_pcs_addrs[dd->hfi1_id][i],
-                            0x03, WRITE_SBUS_RECEIVER, 0x00022132);
-       }
-
-       clear_sbus_fast_mode(dd);
-}
-
-/*
- * Trigger a secondary bus reset (SBR) on ourselves using our parent.
- *
- * Based on pci_parent_bus_reset() which is not exported by the
- * kernel core.
- */
-static int trigger_sbr(struct hfi1_devdata *dd)
-{
-       struct pci_dev *dev = dd->pcidev;
-       struct pci_dev *pdev;
-
-       /* need a parent */
-       if (!dev->bus->self) {
-               dd_dev_err(dd, "%s: no parent device\n", __func__);
-               return -ENOTTY;
-       }
-
-       /* should not be anyone else on the bus */
-       list_for_each_entry(pdev, &dev->bus->devices, bus_list)
-               if (pdev != dev) {
-                       dd_dev_err(dd,
-                                  "%s: another device is on the same bus\n",
-                                  __func__);
-                       return -ENOTTY;
-               }
-
-       /*
-        * A secondary bus reset (SBR) issues a hot reset to our device.
-        * The following routine does a 1s wait after the reset is dropped
-        * per PCI Trhfa (recovery time).  PCIe 3.0 section 6.6.1 -
-        * Conventional Reset, paragraph 3, line 35 also says that a 1s
-        * delay after a reset is required.  Per spec requirements,
-        * the link is either working or not after that point.
-        */
-       pci_reset_bridge_secondary_bus(dev->bus->self);
-
-       return 0;
-}
-
-/*
- * Write the given gasket interrupt register.
- */
-static void write_gasket_interrupt(struct hfi1_devdata *dd, int index,
-                                  u16 code, u16 data)
-{
-       write_csr(dd, ASIC_PCIE_SD_INTRPT_LIST + (index * 8),
-                 (((u64)code << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_CODE_SHIFT) |
-                  ((u64)data << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_DATA_SHIFT)));
-}
-
-/*
- * Tell the gasket logic how to react to the reset.
- */
-static void arm_gasket_logic(struct hfi1_devdata *dd)
-{
-       u64 reg;
-
-       reg = (((u64)1 << dd->hfi1_id) <<
-              ASIC_PCIE_SD_HOST_CMD_INTRPT_CMD_SHIFT) |
-             ((u64)pcie_serdes_broadcast[dd->hfi1_id] <<
-              ASIC_PCIE_SD_HOST_CMD_SBUS_RCVR_ADDR_SHIFT |
-              ASIC_PCIE_SD_HOST_CMD_SBR_MODE_SMASK |
-              ((u64)SBR_DELAY_US & ASIC_PCIE_SD_HOST_CMD_TIMER_MASK) <<
-              ASIC_PCIE_SD_HOST_CMD_TIMER_SHIFT);
-       write_csr(dd, ASIC_PCIE_SD_HOST_CMD, reg);
-       /* read back to push the write */
-       read_csr(dd, ASIC_PCIE_SD_HOST_CMD);
-}
-
-/*
- * CCE_PCIE_CTRL long name helpers
- * We redefine these shorter macros to use in the code while leaving
- * chip_registers.h to be autogenerated from the hardware spec.
- */
-#define LANE_BUNDLE_MASK              CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_MASK
-#define LANE_BUNDLE_SHIFT             CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_SHIFT
-#define LANE_DELAY_MASK               CCE_PCIE_CTRL_PCIE_LANE_DELAY_MASK
-#define LANE_DELAY_SHIFT              CCE_PCIE_CTRL_PCIE_LANE_DELAY_SHIFT
-#define MARGIN_OVERWRITE_ENABLE_SHIFT CCE_PCIE_CTRL_XMT_MARGIN_OVERWRITE_ENABLE_SHIFT
-#define MARGIN_SHIFT                  CCE_PCIE_CTRL_XMT_MARGIN_SHIFT
-#define MARGIN_G1_G2_OVERWRITE_MASK   CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_MASK
-#define MARGIN_G1_G2_OVERWRITE_SHIFT  CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_SHIFT
-#define MARGIN_GEN1_GEN2_MASK         CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_MASK
-#define MARGIN_GEN1_GEN2_SHIFT        CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_SHIFT
-
- /*
-  * Write xmt_margin for full-swing (WFR-B) or half-swing (WFR-C).
-  */
-static void write_xmt_margin(struct hfi1_devdata *dd, const char *fname)
-{
-       u64 pcie_ctrl;
-       u64 xmt_margin;
-       u64 xmt_margin_oe;
-       u64 lane_delay;
-       u64 lane_bundle;
-
-       pcie_ctrl = read_csr(dd, CCE_PCIE_CTRL);
-
-       /*
-        * For Discrete, use full-swing.
-        *  - PCIe TX defaults to full-swing.
-        *    Leave this register as default.
-        * For Integrated, use half-swing
-        *  - Copy xmt_margin and xmt_margin_oe
-        *    from Gen1/Gen2 to Gen3.
-        */
-       if (dd->pcidev->device == PCI_DEVICE_ID_INTEL1) { /* integrated */
-               /* extract initial fields */
-               xmt_margin = (pcie_ctrl >> MARGIN_GEN1_GEN2_SHIFT)
-                             & MARGIN_GEN1_GEN2_MASK;
-               xmt_margin_oe = (pcie_ctrl >> MARGIN_G1_G2_OVERWRITE_SHIFT)
-                                & MARGIN_G1_G2_OVERWRITE_MASK;
-               lane_delay = (pcie_ctrl >> LANE_DELAY_SHIFT) & LANE_DELAY_MASK;
-               lane_bundle = (pcie_ctrl >> LANE_BUNDLE_SHIFT)
-                              & LANE_BUNDLE_MASK;
-
-               /*
-                * For A0, EFUSE values are not set.  Override with the
-                * correct values.
-                */
-               if (is_ax(dd)) {
-                       /*
-                        * xmt_margin and OverwriteEnable should be the
-                        * same for Gen1/Gen2 and Gen3
-                        */
-                       xmt_margin = 0x5;
-                       xmt_margin_oe = 0x1;
-                       lane_delay = 0xF; /* Delay 240ns. */
-                       lane_bundle = 0x0; /* Set to 1 lane. */
-               }
-
-               /* overwrite existing values */
-               pcie_ctrl = (xmt_margin << MARGIN_GEN1_GEN2_SHIFT)
-                       | (xmt_margin_oe << MARGIN_G1_G2_OVERWRITE_SHIFT)
-                       | (xmt_margin << MARGIN_SHIFT)
-                       | (xmt_margin_oe << MARGIN_OVERWRITE_ENABLE_SHIFT)
-                       | (lane_delay << LANE_DELAY_SHIFT)
-                       | (lane_bundle << LANE_BUNDLE_SHIFT);
-
-               write_csr(dd, CCE_PCIE_CTRL, pcie_ctrl);
-       }
-
-       dd_dev_dbg(dd, "%s: program XMT margin, CcePcieCtrl 0x%llx\n",
-                  fname, pcie_ctrl);
-}
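(Editor's aside, not part of the deleted file.) The function above copies the Gen1/Gen2 XMT margin fields into the Gen3 fields using the usual shift/mask extract-and-reinsert pattern on a packed CSR. A minimal hypothetical sketch of that pattern, with the mask and shift parameters standing in for the CCE_PCIE_CTRL field definitions:

static inline u64 copy_csr_field(u64 reg, u64 field_mask,
                                 u32 src_shift, u32 dst_shift)
{
        u64 val = (reg >> src_shift) & field_mask;  /* extract source field */

        reg &= ~(field_mask << dst_shift);          /* clear destination field */
        return reg | (val << dst_shift);            /* insert at destination */
}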
-
-/*
- * Do all the steps needed to transition the PCIe link to Gen3 speed.
- */
-int do_pcie_gen3_transition(struct hfi1_devdata *dd)
-{
-       struct pci_dev *parent = dd->pcidev->bus->self;
-       u64 fw_ctrl;
-       u64 reg, therm;
-       u32 reg32, fs, lf;
-       u32 status, err;
-       int ret;
-       int do_retry, retry_count = 0;
-       uint default_pset;
-       u16 target_vector, target_speed;
-       u16 lnkctl2, vendor;
-       u8 div;
-       const u8 (*eq)[3];
-       int return_error = 0;
-
-       /* PCIe Gen3 is for the ASIC only */
-       if (dd->icode != ICODE_RTL_SILICON)
-               return 0;
-
-       if (pcie_target == 1) {                 /* target Gen1 */
-               target_vector = GEN1_SPEED_VECTOR;
-               target_speed = 2500;
-       } else if (pcie_target == 2) {          /* target Gen2 */
-               target_vector = GEN2_SPEED_VECTOR;
-               target_speed = 5000;
-       } else if (pcie_target == 3) {          /* target Gen3 */
-               target_vector = GEN3_SPEED_VECTOR;
-               target_speed = 8000;
-       } else {
-               /* off or invalid target - skip */
-               dd_dev_info(dd, "%s: Skipping PCIe transition\n", __func__);
-               return 0;
-       }
-
-       /* if already at target speed, done (unless forced) */
-       if (dd->lbus_speed == target_speed) {
-               dd_dev_info(dd, "%s: PCIe already at gen%d, %s\n", __func__,
-                           pcie_target,
-                           pcie_force ? "re-doing anyway" : "skipping");
-               if (!pcie_force)
-                       return 0;
-       }
-
-       /*
-        * The driver cannot do the transition if it has no access to the
-        * upstream component
-        */
-       if (!parent) {
-               dd_dev_info(dd, "%s: No upstream device, can't do Gen3 transition\n",
-                           __func__);
-               return 0;
-       }
-
-       /*
-        * Do the Gen3 transition.  Steps are those of the PCIe Gen3
-        * recipe.
-        */
-
-       /* step 1: pcie link working in gen1/gen2 */
-
-       /* step 2: if either side is not capable of Gen3, done */
-       if (pcie_target == 3 && !dd->link_gen3_capable) {
-               dd_dev_err(dd, "The PCIe link is not Gen3 capable\n");
-               ret = -ENOSYS;
-               goto done_no_mutex;
-       }
-
-       /* hold the SBus resource across the firmware download and SBR */
-       ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT);
-       if (ret) {
-               dd_dev_err(dd, "%s: unable to acquire SBus resource\n",
-                          __func__);
-               return ret;
-       }
-
-       /* make sure thermal polling is not causing interrupts */
-       therm = read_csr(dd, ASIC_CFG_THERM_POLL_EN);
-       if (therm) {
-               write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x0);
-               msleep(100);
-               dd_dev_info(dd, "%s: Disabled therm polling\n",
-                           __func__);
-       }
-
-retry:
-       /* the SBus download will reset the spico for thermal */
-
-       /* step 3: download SBus Master firmware */
-       /* step 4: download PCIe Gen3 SerDes firmware */
-       dd_dev_info(dd, "%s: downloading firmware\n", __func__);
-       ret = load_pcie_firmware(dd);
-       if (ret) {
-               /* do not proceed if the firmware cannot be downloaded */
-               return_error = 1;
-               goto done;
-       }
-
-       /* step 5: set up device parameter settings */
-       dd_dev_info(dd, "%s: setting PCIe registers\n", __func__);
-
-       /*
-        * PcieCfgSpcie1 - Link Control 3
-        * Leave at reset value.  No need to set PerfEq - link equalization
-        * will be performed automatically after the SBR when the target
-        * speed is 8GT/s.
-        */
-
-       /* clear all 16 per-lane error bits (PCIe: Lane Error Status) */
-       pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE2, 0xffff);
-
-       /* step 5a: Set Synopsys Port Logic registers */
-
-       /*
-        * PcieCfgRegPl2 - Port Force Link
-        *
-        * Set the low power field to 0x10 to avoid unnecessary power
-        * management messages.  All other fields are zero.
-        */
-       reg32 = 0x10ul << PCIE_CFG_REG_PL2_LOW_PWR_ENT_CNT_SHIFT;
-       pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL2, reg32);
-
-       /*
-        * PcieCfgRegPl100 - Gen3 Control
-        *
-        * turn off PcieCfgRegPl100.Gen3ZRxDcNonCompl
-        * turn on PcieCfgRegPl100.EqEieosCnt
-        * Everything else zero.
-        */
-       reg32 = PCIE_CFG_REG_PL100_EQ_EIEOS_CNT_SMASK;
-       pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL100, reg32);
-
-       /*
-        * PcieCfgRegPl101 - Gen3 EQ FS and LF
-        * PcieCfgRegPl102 - Gen3 EQ Presets to Coefficients Mapping
-        * PcieCfgRegPl103 - Gen3 EQ Preset Index
-        * PcieCfgRegPl105 - Gen3 EQ Status
-        *
-        * Give initial EQ settings.
-        */
-       if (dd->pcidev->device == PCI_DEVICE_ID_INTEL0) { /* discrete */
-               /* 1000mV, FS=24, LF = 8 */
-               fs = 24;
-               lf = 8;
-               div = 3;
-               eq = discrete_preliminary_eq;
-               default_pset = DEFAULT_DISCRETE_PSET;
-       } else {
-               /* 400mV, FS=29, LF = 9 */
-               fs = 29;
-               lf = 9;
-               div = 1;
-               eq = integrated_preliminary_eq;
-               default_pset = DEFAULT_MCP_PSET;
-       }
-       pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL101,
-                              (fs <<
-                               PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_FS_SHIFT) |
-                              (lf <<
-                               PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_LF_SHIFT));
-       ret = load_eq_table(dd, eq, fs, div);
-       if (ret)
-               goto done;
-
-       /*
-        * PcieCfgRegPl106 - Gen3 EQ Control
-        *
-        * Set Gen3EqPsetReqVec, leave other fields 0.
-        */
-       if (pcie_pset == UNSET_PSET)
-               pcie_pset = default_pset;
-       if (pcie_pset > 10) {   /* valid range is 0-10, inclusive */
-               dd_dev_err(dd, "%s: Invalid Eq Pset %u, setting to %d\n",
-                          __func__, pcie_pset, default_pset);
-               pcie_pset = default_pset;
-       }
-       dd_dev_info(dd, "%s: using EQ Pset %u\n", __func__, pcie_pset);
-       pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL106,
-                              ((1 << pcie_pset) <<
-                       PCIE_CFG_REG_PL106_GEN3_EQ_PSET_REQ_VEC_SHIFT) |
-                       PCIE_CFG_REG_PL106_GEN3_EQ_EVAL2MS_DISABLE_SMASK |
-                       PCIE_CFG_REG_PL106_GEN3_EQ_PHASE23_EXIT_MODE_SMASK);
-
-       /*
-        * step 5b: Do post firmware download steps via SBus
-        */
-       dd_dev_info(dd, "%s: doing pcie post steps\n", __func__);
-       pcie_post_steps(dd);
-
-       /*
-        * step 5c: Program gasket interrupts
-        */
-       /* set the Rx Bit Rate to REFCLK ratio */
-       write_gasket_interrupt(dd, 0, 0x0006, 0x0050);
-       /* disable pCal for PCIe Gen3 RX equalization */
-       write_gasket_interrupt(dd, 1, 0x0026, 0x5b01);
-       /*
-        * Enable iCal for PCIe Gen3 RX equalization, and set which
-        * evaluation of RX_EQ_EVAL will launch the iCal procedure.
-        */
-       write_gasket_interrupt(dd, 2, 0x0026, 0x5202);
-       /* terminate list */
-       write_gasket_interrupt(dd, 3, 0x0000, 0x0000);
-
-       /*
-        * step 5d: program XMT margin
-        */
-       write_xmt_margin(dd, __func__);
-
-       /*
-        * step 5e: disable active state power management (ASPM). It
-        * will be enabled if required later
-        */
-       dd_dev_info(dd, "%s: clearing ASPM\n", __func__);
-       aspm_hw_disable_l1(dd);
-
-       /*
-        * step 5f: clear DirectSpeedChange
-        * PcieCfgRegPl67.DirectSpeedChange must be zero to prevent the
-        * change in the speed target from starting before we are ready.
-        * This field defaults to 0 and we are not changing it, so nothing
-        * needs to be done.
-        */
-
-       /* step 5g: Set target link speed */
-       /*
-        * Set the target link speed on both the device and its parent.
-        * On setting the parent: Some system BIOSes "helpfully" set the
-        * parent target speed to Gen2 to match the ASIC's initial speed.
-        * We can set the target to Gen3 because we have already checked
-        * earlier that the link is Gen3 capable.
-        */
-       dd_dev_info(dd, "%s: setting parent target link speed\n", __func__);
-       pcie_capability_read_word(parent, PCI_EXP_LNKCTL2, &lnkctl2);
-       dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__,
-                   (u32)lnkctl2);
-       /* only write to parent if target is not as high as ours */
-       if ((lnkctl2 & LNKCTL2_TARGET_LINK_SPEED_MASK) < target_vector) {
-               lnkctl2 &= ~LNKCTL2_TARGET_LINK_SPEED_MASK;
-               lnkctl2 |= target_vector;
-               dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__,
-                           (u32)lnkctl2);
-               pcie_capability_write_word(parent, PCI_EXP_LNKCTL2, lnkctl2);
-       } else {
-               dd_dev_info(dd, "%s: ..target speed is OK\n", __func__);
-       }
-
-       dd_dev_info(dd, "%s: setting target link speed\n", __func__);
-       pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL2, &lnkctl2);
-       dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__,
-                   (u32)lnkctl2);
-       lnkctl2 &= ~LNKCTL2_TARGET_LINK_SPEED_MASK;
-       lnkctl2 |= target_vector;
-       dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__,
-                   (u32)lnkctl2);
-       pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL2, lnkctl2);
-
-       /* step 5h: arm gasket logic */
-       /* hold DC in reset across the SBR */
-       write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_DC_RESET_SMASK);
-       (void)read_csr(dd, CCE_DC_CTRL); /* DC reset hold */
-       /* save firmware control across the SBR */
-       fw_ctrl = read_csr(dd, MISC_CFG_FW_CTRL);
-
-       dd_dev_info(dd, "%s: arming gasket logic\n", __func__);
-       arm_gasket_logic(dd);
-
-       /*
-        * step 6: quiesce PCIe link
-        * The chip has already been reset, so there will be no traffic
-        * from the chip.  Linux has no easy way to enforce that it will
-        * not try to access the device, so we just need to hope it doesn't
-        * do it while we are doing the reset.
-        */
-
-       /*
-        * step 7: initiate the secondary bus reset (SBR)
-        * step 8: hardware brings the links back up
-        * step 9: wait for link speed transition to be complete
-        */
-       dd_dev_info(dd, "%s: calling trigger_sbr\n", __func__);
-       ret = trigger_sbr(dd);
-       if (ret)
-               goto done;
-
-       /* step 10: decide what to do next */
-
-       /* check if we can read PCI space */
-       ret = pci_read_config_word(dd->pcidev, PCI_VENDOR_ID, &vendor);
-       if (ret) {
-               dd_dev_info(dd,
-                           "%s: read of VendorID failed after SBR, err %d\n",
-                           __func__, ret);
-               return_error = 1;
-               goto done;
-       }
-       if (vendor == 0xffff) {
-               dd_dev_info(dd, "%s: VendorID is all 1s after SBR\n", __func__);
-               return_error = 1;
-               ret = -EIO;
-               goto done;
-       }
-
-       /* restore PCI space registers we know were reset */
-       dd_dev_info(dd, "%s: calling restore_pci_variables\n", __func__);
-       restore_pci_variables(dd);
-       /* restore firmware control */
-       write_csr(dd, MISC_CFG_FW_CTRL, fw_ctrl);
-
-       /*
-        * Check the gasket block status.
-        *
-        * This is the first CSR read after the SBR.  If the read returns
-        * all 1s (fails), the link did not make it back.
-        *
-        * Once we're sure we can read and write, clear the DC reset after
-        * the SBR.  Then check for any per-lane errors. Then look over
-        * the status.
-        */
-       reg = read_csr(dd, ASIC_PCIE_SD_HOST_STATUS);
-       dd_dev_info(dd, "%s: gasket block status: 0x%llx\n", __func__, reg);
-       if (reg == ~0ull) {     /* PCIe read failed/timeout */
-               dd_dev_err(dd, "SBR failed - unable to read from device\n");
-               return_error = 1;
-               ret = -ENOSYS;
-               goto done;
-       }
-
-       /* clear the DC reset */
-       write_csr(dd, CCE_DC_CTRL, 0);
-
-       /* Set the LED off */
-       setextled(dd, 0);
-
-       /* check for any per-lane errors */
-       pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE2, &reg32);
-       dd_dev_info(dd, "%s: per-lane errors: 0x%x\n", __func__, reg32);
-
-       /* extract status, look for our HFI */
-       status = (reg >> ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_SHIFT)
-                       & ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_MASK;
-       if ((status & (1 << dd->hfi1_id)) == 0) {
-               dd_dev_err(dd,
-                          "%s: gasket status 0x%x, expecting 0x%x\n",
-                          __func__, status, 1 << dd->hfi1_id);
-               ret = -EIO;
-               goto done;
-       }
-
-       /* extract error */
-       err = (reg >> ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_SHIFT)
-               & ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_MASK;
-       if (err) {
-               dd_dev_err(dd, "%s: gasket error %d\n", __func__, err);
-               ret = -EIO;
-               goto done;
-       }
-
-       /* update our link information cache */
-       update_lbus_info(dd);
-       dd_dev_info(dd, "%s: new speed and width: %s\n", __func__,
-                   dd->lbus_info);
-
-       if (dd->lbus_speed != target_speed) { /* not target */
-               /* maybe retry */
-               do_retry = retry_count < pcie_retry;
-               dd_dev_err(dd, "PCIe link speed did not switch to Gen%d%s\n",
-                          pcie_target, do_retry ? ", retrying" : "");
-               retry_count++;
-               if (do_retry) {
-                       msleep(100); /* allow time to settle */
-                       goto retry;
-               }
-               ret = -EIO;
-       }
-
-done:
-       if (therm) {
-               write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x1);
-               msleep(100);
-               dd_dev_info(dd, "%s: Re-enable therm polling\n",
-                           __func__);
-       }
-       release_chip_resource(dd, CR_SBUS);
-done_no_mutex:
-       /* return no error if it is OK to be at current speed */
-       if (ret && !return_error) {
-               dd_dev_err(dd, "Proceeding at current PCIe speed\n");
-               ret = 0;
-       }
-
-       dd_dev_info(dd, "%s: done\n", __func__);
-       return ret;
-}
diff --git a/drivers/staging/rdma/hfi1/pio.c b/drivers/staging/rdma/hfi1/pio.c
deleted file mode 100644 (file)
index c67b9ad..0000000
+++ /dev/null
@@ -1,2073 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/delay.h>
-#include "hfi.h"
-#include "qp.h"
-#include "trace.h"
-
-#define SC_CTXT_PACKET_EGRESS_TIMEOUT 350 /* in chip cycles */
-
-#define SC(name) SEND_CTXT_##name
-/*
- * Send Context functions
- */
-static void sc_wait_for_packet_egress(struct send_context *sc, int pause);
-
-/*
- * Set the CM reset bit and wait for it to clear.  Use the provided
- * sendctrl register.  This routine has no locking.
- */
-void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl)
-{
-       write_csr(dd, SEND_CTRL, sendctrl | SEND_CTRL_CM_RESET_SMASK);
-       while (1) {
-               udelay(1);
-               sendctrl = read_csr(dd, SEND_CTRL);
-               if ((sendctrl & SEND_CTRL_CM_RESET_SMASK) == 0)
-                       break;
-       }
-}
-
-/* defined in header release 48 and higher */
-#ifndef SEND_CTRL_UNSUPPORTED_VL_SHIFT
-#define SEND_CTRL_UNSUPPORTED_VL_SHIFT 3
-#define SEND_CTRL_UNSUPPORTED_VL_MASK 0xffull
-#define SEND_CTRL_UNSUPPORTED_VL_SMASK (SEND_CTRL_UNSUPPORTED_VL_MASK \
-               << SEND_CTRL_UNSUPPORTED_VL_SHIFT)
-#endif
-
-/* global control of PIO send */
-void pio_send_control(struct hfi1_devdata *dd, int op)
-{
-       u64 reg, mask;
-       unsigned long flags;
-       int write = 1;  /* write sendctrl back */
-       int flush = 0;  /* re-read sendctrl to make sure it is flushed */
-
-       spin_lock_irqsave(&dd->sendctrl_lock, flags);
-
-       reg = read_csr(dd, SEND_CTRL);
-       switch (op) {
-       case PSC_GLOBAL_ENABLE:
-               reg |= SEND_CTRL_SEND_ENABLE_SMASK;
-       /* Fall through */
-       case PSC_DATA_VL_ENABLE:
-               /* Disallow sending on VLs not enabled */
-               mask = (((~0ull) << num_vls) & SEND_CTRL_UNSUPPORTED_VL_MASK) <<
-                               SEND_CTRL_UNSUPPORTED_VL_SHIFT;
-               reg = (reg & ~SEND_CTRL_UNSUPPORTED_VL_SMASK) | mask;
-               break;
-       case PSC_GLOBAL_DISABLE:
-               reg &= ~SEND_CTRL_SEND_ENABLE_SMASK;
-               break;
-       case PSC_GLOBAL_VLARB_ENABLE:
-               reg |= SEND_CTRL_VL_ARBITER_ENABLE_SMASK;
-               break;
-       case PSC_GLOBAL_VLARB_DISABLE:
-               reg &= ~SEND_CTRL_VL_ARBITER_ENABLE_SMASK;
-               break;
-       case PSC_CM_RESET:
-               __cm_reset(dd, reg);
-               write = 0; /* CSR already written (and flushed) */
-               break;
-       case PSC_DATA_VL_DISABLE:
-               reg |= SEND_CTRL_UNSUPPORTED_VL_SMASK;
-               flush = 1;
-               break;
-       default:
-               dd_dev_err(dd, "%s: invalid control %d\n", __func__, op);
-               break;
-       }
-
-       if (write) {
-               write_csr(dd, SEND_CTRL, reg);
-               if (flush)
-                       (void)read_csr(dd, SEND_CTRL); /* flush write */
-       }
-
-       spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
-}
-
-/* number of send context memory pools */
-#define NUM_SC_POOLS 2
-
-/* Send Context Size (SCS) wildcards */
-#define SCS_POOL_0 -1
-#define SCS_POOL_1 -2
-
-/* Send Context Count (SCC) wildcards */
-#define SCC_PER_VL -1
-#define SCC_PER_CPU  -2
-#define SCC_PER_KRCVQ  -3
-
-/* Send Context Size (SCS) constants */
-#define SCS_ACK_CREDITS  32
-#define SCS_VL15_CREDITS 102   /* 3 pkts of 2048B data + 128B header */
-
-#define PIO_THRESHOLD_CEILING 4096
-
-#define PIO_WAIT_BATCH_SIZE 5
-
-/* default send context sizes */
-static struct sc_config_sizes sc_config_sizes[SC_MAX] = {
-       [SC_KERNEL] = { .size  = SCS_POOL_0,    /* even divide, pool 0 */
-                       .count = SCC_PER_VL },  /* one per NUMA */
-       [SC_ACK]    = { .size  = SCS_ACK_CREDITS,
-                       .count = SCC_PER_KRCVQ },
-       [SC_USER]   = { .size  = SCS_POOL_0,    /* even divide, pool 0 */
-                       .count = SCC_PER_CPU }, /* one per CPU */
-       [SC_VL15]   = { .size  = SCS_VL15_CREDITS,
-                       .count = 1 },
-
-};
-
-/* send context memory pool configuration */
-struct mem_pool_config {
-       int centipercent;       /* % of memory, in 100ths of 1% */
-       int absolute_blocks;    /* absolute block count */
-};
-
-/* default memory pool configuration: 100% in pool 0 */
-static struct mem_pool_config sc_mem_pool_config[NUM_SC_POOLS] = {
-       /* centi%, abs blocks */
-       {  10000,     -1 },             /* pool 0 */
-       {      0,     -1 },             /* pool 1 */
-};
-
-/* memory pool information, used when calculating final sizes */
-struct mem_pool_info {
-       int centipercent;       /*
-                                * 100th of 1% of memory to use, -1 if blocks
-                                * already set
-                                */
-       int count;              /* count of contexts in the pool */
-       int blocks;             /* block size of the pool */
-       int size;               /* context size, in blocks */
-};
-
-/*
- * Convert a pool wildcard to a valid pool index.  The wildcards
- * start at -1 and grow more negative.  Map them as:
- *     -1 => 0
- *     -2 => 1
- *     etc.
- *
- * Return -1 on non-wildcard input, otherwise convert to a pool number.
- */
-static int wildcard_to_pool(int wc)
-{
-       if (wc >= 0)
-               return -1;      /* non-wildcard */
-       return -wc - 1;
-}
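(Editor's example, not from the file.) A quick check of the mapping against the wildcards defined above:

/*
 *   wildcard_to_pool(SCS_POOL_0)      -> 0   (-1 maps to pool 0)
 *   wildcard_to_pool(SCS_POOL_1)      -> 1   (-2 maps to pool 1)
 *   wildcard_to_pool(SCS_ACK_CREDITS) -> -1  (32 is a fixed size, not a wildcard)
 */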
-
-static const char *sc_type_names[SC_MAX] = {
-       "kernel",
-       "ack",
-       "user",
-       "vl15"
-};
-
-static const char *sc_type_name(int index)
-{
-       if (index < 0 || index >= SC_MAX)
-               return "unknown";
-       return sc_type_names[index];
-}
-
-/*
- * Read the send context memory pool configuration and send context
- * size configuration.  Replace any wildcards and come up with final
- * counts and sizes for the send context types.
- */
-int init_sc_pools_and_sizes(struct hfi1_devdata *dd)
-{
-       struct mem_pool_info mem_pool_info[NUM_SC_POOLS] = { { 0 } };
-       int total_blocks = (dd->chip_pio_mem_size / PIO_BLOCK_SIZE) - 1;
-       int total_contexts = 0;
-       int fixed_blocks;
-       int pool_blocks;
-       int used_blocks;
-       int cp_total;           /* centipercent total */
-       int ab_total;           /* absolute block total */
-       int extra;
-       int i;
-
-       /*
-        * When SDMA is enabled, kernel context pio packet size is capped by
-        * "piothreshold". Reduce pio buffer allocation for kernel context by
-        * setting it to a fixed size. The allocation allows 3-deep buffering
-        * of the largest pio packets plus up to 128 bytes header, sufficient
-        * to maintain verbs performance.
-        *
-        * When SDMA is disabled, keep the default pooling allocation.
-        */
-       if (HFI1_CAP_IS_KSET(SDMA)) {
-               u16 max_pkt_size = (piothreshold < PIO_THRESHOLD_CEILING) ?
-                                        piothreshold : PIO_THRESHOLD_CEILING;
-               sc_config_sizes[SC_KERNEL].size =
-                       3 * (max_pkt_size + 128) / PIO_BLOCK_SIZE;
-       }
-
-       /*
-        * Step 0:
-        *      - copy the centipercents/absolute sizes from the pool config
-        *      - sanity check these values
-        *      - add up centipercents, then later check for full value
-        *      - add up absolute blocks, then later check for over-commit
-        */
-       cp_total = 0;
-       ab_total = 0;
-       for (i = 0; i < NUM_SC_POOLS; i++) {
-               int cp = sc_mem_pool_config[i].centipercent;
-               int ab = sc_mem_pool_config[i].absolute_blocks;
-
-               /*
-                * A negative value is "unused" or "invalid".  Both *can*
-                * be valid, but centipercent wins, so check that first
-                */
-               if (cp >= 0) {                  /* centipercent valid */
-                       cp_total += cp;
-               } else if (ab >= 0) {           /* absolute blocks valid */
-                       ab_total += ab;
-               } else {                        /* neither valid */
-                       dd_dev_err(
-                               dd,
-                               "Send context memory pool %d: both the block count and centipercent are invalid\n",
-                               i);
-                       return -EINVAL;
-               }
-
-               mem_pool_info[i].centipercent = cp;
-               mem_pool_info[i].blocks = ab;
-       }
-
-       /* do not use both % and absolute blocks for different pools */
-       if (cp_total != 0 && ab_total != 0) {
-               dd_dev_err(
-                       dd,
-                       "All send context memory pools must be described as either centipercent or blocks, no mixing between pools\n");
-               return -EINVAL;
-       }
-
-       /* if any percentages are present, they must add up to 100% x 100 */
-       if (cp_total != 0 && cp_total != 10000) {
-               dd_dev_err(
-                       dd,
-                       "Send context memory pool centipercent is %d, expecting 10000\n",
-                       cp_total);
-               return -EINVAL;
-       }
-
-       /* the absolute pool total cannot be more than the mem total */
-       if (ab_total > total_blocks) {
-               dd_dev_err(
-                       dd,
-                       "Send context memory pool absolute block count %d is larger than the memory size %d\n",
-                       ab_total, total_blocks);
-               return -EINVAL;
-       }
-
-       /*
-        * Step 2:
-        *      - copy from the context size config
-        *      - replace context type wildcard counts with real values
-        *      - add up non-memory pool block sizes
-        *      - add up memory pool user counts
-        */
-       fixed_blocks = 0;
-       for (i = 0; i < SC_MAX; i++) {
-               int count = sc_config_sizes[i].count;
-               int size = sc_config_sizes[i].size;
-               int pool;
-
-               /*
-                * Sanity check count: Either a positive value or
-                * one of the expected wildcards is valid.  The positive
-                * value is checked later when we compare against total
-                * memory available.
-                */
-               if (i == SC_ACK) {
-                       count = dd->n_krcv_queues;
-               } else if (i == SC_KERNEL) {
-                       count = INIT_SC_PER_VL * num_vls;
-               } else if (count == SCC_PER_CPU) {
-                       count = dd->num_rcv_contexts - dd->n_krcv_queues;
-               } else if (count < 0) {
-                       dd_dev_err(
-                               dd,
-                               "%s send context invalid count wildcard %d\n",
-                               sc_type_name(i), count);
-                       return -EINVAL;
-               }
-               if (total_contexts + count > dd->chip_send_contexts)
-                       count = dd->chip_send_contexts - total_contexts;
-
-               total_contexts += count;
-
-               /*
-                * Sanity check pool: The conversion will return a pool
-                * number or -1 if a fixed (non-negative) value.  The fixed
-                * value is checked later when we compare against
-                * total memory available.
-                */
-               pool = wildcard_to_pool(size);
-               if (pool == -1) {                       /* non-wildcard */
-                       fixed_blocks += size * count;
-               } else if (pool < NUM_SC_POOLS) {       /* valid wildcard */
-                       mem_pool_info[pool].count += count;
-               } else {                                /* invalid wildcard */
-                       dd_dev_err(
-                               dd,
-                               "%s send context invalid pool wildcard %d\n",
-                               sc_type_name(i), size);
-                       return -EINVAL;
-               }
-
-               dd->sc_sizes[i].count = count;
-               dd->sc_sizes[i].size = size;
-       }
-       if (fixed_blocks > total_blocks) {
-               dd_dev_err(
-                       dd,
-                       "Send context fixed block count, %u, larger than total block count %u\n",
-                       fixed_blocks, total_blocks);
-               return -EINVAL;
-       }
-
-       /* step 3: calculate the blocks in the pools, and pool context sizes */
-       pool_blocks = total_blocks - fixed_blocks;
-       if (ab_total > pool_blocks) {
-               dd_dev_err(
-                       dd,
-                       "Send context fixed pool sizes, %u, larger than pool block count %u\n",
-                       ab_total, pool_blocks);
-               return -EINVAL;
-       }
-       /* subtract off the fixed pool blocks */
-       pool_blocks -= ab_total;
-
-       for (i = 0; i < NUM_SC_POOLS; i++) {
-               struct mem_pool_info *pi = &mem_pool_info[i];
-
-               /* % beats absolute blocks */
-               if (pi->centipercent >= 0)
-                       pi->blocks = (pool_blocks * pi->centipercent) / 10000;
-
-               if (pi->blocks == 0 && pi->count != 0) {
-                       dd_dev_err(
-                               dd,
-                               "Send context memory pool %d has %u contexts, but no blocks\n",
-                               i, pi->count);
-                       return -EINVAL;
-               }
-               if (pi->count == 0) {
-                       /* warn about wasted blocks */
-                       if (pi->blocks != 0)
-                               dd_dev_err(
-                                       dd,
-                                       "Send context memory pool %d has %u blocks, but zero contexts\n",
-                                       i, pi->blocks);
-                       pi->size = 0;
-               } else {
-                       pi->size = pi->blocks / pi->count;
-               }
-       }
-
-       /* step 4: fill in the context type sizes from the pool sizes */
-       used_blocks = 0;
-       for (i = 0; i < SC_MAX; i++) {
-               if (dd->sc_sizes[i].size < 0) {
-                       unsigned pool = wildcard_to_pool(dd->sc_sizes[i].size);
-
-                       WARN_ON_ONCE(pool >= NUM_SC_POOLS);
-                       dd->sc_sizes[i].size = mem_pool_info[pool].size;
-               }
-               /* make sure we are not larger than what is allowed by the HW */
-#define PIO_MAX_BLOCKS 1024
-               if (dd->sc_sizes[i].size > PIO_MAX_BLOCKS)
-                       dd->sc_sizes[i].size = PIO_MAX_BLOCKS;
-
-               /* calculate our total usage */
-               used_blocks += dd->sc_sizes[i].size * dd->sc_sizes[i].count;
-       }
-       extra = total_blocks - used_blocks;
-       if (extra != 0)
-               dd_dev_info(dd, "unused send context blocks: %d\n", extra);
-
-       return total_contexts;
-}
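(Editor's worked example, not part of the deleted file; all numbers are made up and only the arithmetic mirrors the steps above.)

/* Hypothetical pool sizing: 100% of the pooled memory in pool 0. */
static int example_pool_context_size(void)
{
        int total_blocks = 1000;        /* PIO memory, in blocks */
        int fixed_blocks = 200;         /* e.g. ack + vl15 contexts, fixed sizes */
        int pool_centipercent = 10000;  /* pool 0 configured as 100.00% */
        int pool_contexts = 20;         /* kernel + user contexts drawing from pool 0 */
        int pool_blocks, pool0_blocks;

        pool_blocks = total_blocks - fixed_blocks;                /* 800 */
        pool0_blocks = (pool_blocks * pool_centipercent) / 10000; /* 800 */

        return pool0_blocks / pool_contexts;  /* each pooled context: 40 blocks */
}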
-
-int init_send_contexts(struct hfi1_devdata *dd)
-{
-       u16 base;
-       int ret, i, j, context;
-
-       ret = init_credit_return(dd);
-       if (ret)
-               return ret;
-
-       dd->hw_to_sw = kmalloc_array(TXE_NUM_CONTEXTS, sizeof(u8),
-                                       GFP_KERNEL);
-       dd->send_contexts = kcalloc(dd->num_send_contexts,
-                                       sizeof(struct send_context_info),
-                                       GFP_KERNEL);
-       if (!dd->send_contexts || !dd->hw_to_sw) {
-               kfree(dd->hw_to_sw);
-               kfree(dd->send_contexts);
-               free_credit_return(dd);
-               return -ENOMEM;
-       }
-
-       /* hardware context map starts with invalid send context indices */
-       for (i = 0; i < TXE_NUM_CONTEXTS; i++)
-               dd->hw_to_sw[i] = INVALID_SCI;
-
-       /*
-        * All send contexts have their credit sizes.  Allocate credits
-        * for each context one after another from the global space.
-        */
-       context = 0;
-       base = 1;
-       for (i = 0; i < SC_MAX; i++) {
-               struct sc_config_sizes *scs = &dd->sc_sizes[i];
-
-               for (j = 0; j < scs->count; j++) {
-                       struct send_context_info *sci =
-                                               &dd->send_contexts[context];
-                       sci->type = i;
-                       sci->base = base;
-                       sci->credits = scs->size;
-
-                       context++;
-                       base += scs->size;
-               }
-       }
-
-       return 0;
-}
-
-/*
- * Allocate a software index and hardware context of the given type.
- *
- * Must be called with dd->sc_lock held.
- */
-static int sc_hw_alloc(struct hfi1_devdata *dd, int type, u32 *sw_index,
-                      u32 *hw_context)
-{
-       struct send_context_info *sci;
-       u32 index;
-       u32 context;
-
-       for (index = 0, sci = &dd->send_contexts[0];
-                       index < dd->num_send_contexts; index++, sci++) {
-               if (sci->type == type && sci->allocated == 0) {
-                       sci->allocated = 1;
-                       /* use a 1:1 mapping, but make them non-equal */
-                       context = dd->chip_send_contexts - index - 1;
-                       dd->hw_to_sw[context] = index;
-                       *sw_index = index;
-                       *hw_context = context;
-                       return 0; /* success */
-               }
-       }
-       dd_dev_err(dd, "Unable to locate a free type %d send context\n", type);
-       return -ENOSPC;
-}
-
-/*
- * Free the send context given by its software index.
- *
- * Must be called with dd->sc_lock held.
- */
-static void sc_hw_free(struct hfi1_devdata *dd, u32 sw_index, u32 hw_context)
-{
-       struct send_context_info *sci;
-
-       sci = &dd->send_contexts[sw_index];
-       if (!sci->allocated) {
-               dd_dev_err(dd, "%s: sw_index %u not allocated? hw_context %u\n",
-                          __func__, sw_index, hw_context);
-       }
-       sci->allocated = 0;
-       dd->hw_to_sw[hw_context] = INVALID_SCI;
-}
-
-/* return the base context of a context in a group */
-static inline u32 group_context(u32 context, u32 group)
-{
-       return (context >> group) << group;
-}
-
-/* return the size of a group */
-static inline u32 group_size(u32 group)
-{
-       return 1 << group;
-}
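(Editor's example, not from the file; values chosen only to illustrate the helpers above.)

/*
 *   group == 2 ("group of four"):
 *     group_size(2)           == 4
 *     group_context(8..11, 2) == 8   (contexts 8..11 share base context 8)
 */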
-
-/*
- * Obtain the credit return addresses, kernel virtual and physical, for the
- * given sc.
- *
- * To understand this routine:
- * o va and pa are arrays of struct credit_return.  One for each physical
- *   send context, per NUMA.
- * o Each send context always looks in its relative location in a struct
- *   credit_return for its credit return.
- * o Each send context in a group must have its return address CSR programmed
- *   with the same value.  Use the address of the first send context in the
- *   group.
- */
-static void cr_group_addresses(struct send_context *sc, dma_addr_t *pa)
-{
-       u32 gc = group_context(sc->hw_context, sc->group);
-       u32 index = sc->hw_context & 0x7;
-
-       sc->hw_free = &sc->dd->cr_base[sc->node].va[gc].cr[index];
-       *pa = (unsigned long)
-              &((struct credit_return *)sc->dd->cr_base[sc->node].pa)[gc];
-}
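(Editor's example, not part of the deleted file. The "& 0x7" above implies eight per-context slots in each struct credit_return; the numbers below are illustrative.)

/*
 *   hw_context == 10, group == 2:
 *     gc    = group_context(10, 2) = 8   (group base)
 *     index = 10 & 0x7             = 2   (this context's slot)
 *   The context polls cr_base[node].va[8].cr[2] for its credit return,
 *   while its CREDIT_RETURN_ADDR CSR is programmed with the PA of entry 8.
 */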
-
-/*
- * Work queue function triggered in error interrupt routine for
- * kernel contexts.
- */
-static void sc_halted(struct work_struct *work)
-{
-       struct send_context *sc;
-
-       sc = container_of(work, struct send_context, halt_work);
-       sc_restart(sc);
-}
-
-/*
- * Calculate PIO block threshold for this send context using the given MTU.
- * Trigger a return when one MTU plus optional header of credits remain.
- *
- * Parameter mtu is in bytes.
- * Parameter hdrqentsize is in DWORDs.
- *
- * Return value is what to write into the CSR: trigger return when
- * unreturned credits pass this count.
- */
-u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize)
-{
-       u32 release_credits;
-       u32 threshold;
-
-       /* add in the header size, then divide by the PIO block size */
-       mtu += hdrqentsize << 2;
-       release_credits = DIV_ROUND_UP(mtu, PIO_BLOCK_SIZE);
-
-       /* check against this context's credits */
-       if (sc->credits <= release_credits)
-               threshold = 1;
-       else
-               threshold = sc->credits - release_credits;
-
-       return threshold;
-}
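(Editor's worked example with made-up numbers; a 64-byte PIO block is assumed here.)

/*
 *   mtu = 4096 bytes, hdrqentsize = 32 DWORDs (128 bytes of header)
 *   release_credits = DIV_ROUND_UP(4096 + 128, 64) = 66 blocks
 *   sc->credits = 160  ->  threshold = 160 - 66 = 94
 *   sc->credits = 64   ->  threshold = 1   (context smaller than one MTU)
 */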
-
-/*
- * Calculate credit threshold in terms of percent of the allocated credits.
- * Trigger when unreturned credits equal or exceed the percentage of the whole.
- *
- * Return value is what to write into the CSR: trigger return when
- * unreturned credits pass this count.
- */
-u32 sc_percent_to_threshold(struct send_context *sc, u32 percent)
-{
-       return (sc->credits * percent) / 100;
-}
-
-/*
- * Set the credit return threshold.
- */
-void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold)
-{
-       unsigned long flags;
-       u32 old_threshold;
-       int force_return = 0;
-
-       spin_lock_irqsave(&sc->credit_ctrl_lock, flags);
-
-       old_threshold = (sc->credit_ctrl >>
-                               SC(CREDIT_CTRL_THRESHOLD_SHIFT))
-                        & SC(CREDIT_CTRL_THRESHOLD_MASK);
-
-       if (new_threshold != old_threshold) {
-               sc->credit_ctrl =
-                       (sc->credit_ctrl
-                               & ~SC(CREDIT_CTRL_THRESHOLD_SMASK))
-                       | ((new_threshold
-                               & SC(CREDIT_CTRL_THRESHOLD_MASK))
-                          << SC(CREDIT_CTRL_THRESHOLD_SHIFT));
-               write_kctxt_csr(sc->dd, sc->hw_context,
-                               SC(CREDIT_CTRL), sc->credit_ctrl);
-
-               /* force a credit return on change to avoid a possible stall */
-               force_return = 1;
-       }
-
-       spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags);
-
-       if (force_return)
-               sc_return_credits(sc);
-}
-
-/*
- * set_pio_integrity
- *
- * Set the CHECK_ENABLE register for the send context 'sc'.
- */
-void set_pio_integrity(struct send_context *sc)
-{
-       struct hfi1_devdata *dd = sc->dd;
-       u64 reg = 0;
-       u32 hw_context = sc->hw_context;
-       int type = sc->type;
-
-       /*
-        * No integrity checks if HFI1_CAP_NO_INTEGRITY is set, or if
-        * we're snooping.
-        */
-       if (likely(!HFI1_CAP_IS_KSET(NO_INTEGRITY)) &&
-           dd->hfi1_snoop.mode_flag != HFI1_PORT_SNOOP_MODE)
-               reg = hfi1_pkt_default_send_ctxt_mask(dd, type);
-
-       write_kctxt_csr(dd, hw_context, SC(CHECK_ENABLE), reg);
-}
-
-static u32 get_buffers_allocated(struct send_context *sc)
-{
-       int cpu;
-       u32 ret = 0;
-
-       for_each_possible_cpu(cpu)
-               ret += *per_cpu_ptr(sc->buffers_allocated, cpu);
-       return ret;
-}
-
-static void reset_buffers_allocated(struct send_context *sc)
-{
-       int cpu;
-
-       for_each_possible_cpu(cpu)
-               (*per_cpu_ptr(sc->buffers_allocated, cpu)) = 0;
-}
-
-/*
- * Allocate a NUMA relative send context structure of the given type along
- * with a HW context.
- */
-struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
-                             uint hdrqentsize, int numa)
-{
-       struct send_context_info *sci;
-       struct send_context *sc = NULL;
-       dma_addr_t pa;
-       unsigned long flags;
-       u64 reg;
-       u32 thresh;
-       u32 sw_index;
-       u32 hw_context;
-       int ret;
-       u8 opval, opmask;
-
-       /* do not allocate while frozen */
-       if (dd->flags & HFI1_FROZEN)
-               return NULL;
-
-       sc = kzalloc_node(sizeof(*sc), GFP_KERNEL, numa);
-       if (!sc)
-               return NULL;
-
-       sc->buffers_allocated = alloc_percpu(u32);
-       if (!sc->buffers_allocated) {
-               kfree(sc);
-               dd_dev_err(dd,
-                          "Cannot allocate buffers_allocated per cpu counters\n"
-                         );
-               return NULL;
-       }
-
-       spin_lock_irqsave(&dd->sc_lock, flags);
-       ret = sc_hw_alloc(dd, type, &sw_index, &hw_context);
-       if (ret) {
-               spin_unlock_irqrestore(&dd->sc_lock, flags);
-               free_percpu(sc->buffers_allocated);
-               kfree(sc);
-               return NULL;
-       }
-
-       sci = &dd->send_contexts[sw_index];
-       sci->sc = sc;
-
-       sc->dd = dd;
-       sc->node = numa;
-       sc->type = type;
-       spin_lock_init(&sc->alloc_lock);
-       spin_lock_init(&sc->release_lock);
-       spin_lock_init(&sc->credit_ctrl_lock);
-       INIT_LIST_HEAD(&sc->piowait);
-       INIT_WORK(&sc->halt_work, sc_halted);
-       init_waitqueue_head(&sc->halt_wait);
-
-       /* grouping is always single context for now */
-       sc->group = 0;
-
-       sc->sw_index = sw_index;
-       sc->hw_context = hw_context;
-       cr_group_addresses(sc, &pa);
-       sc->credits = sci->credits;
-
-/* PIO Send Memory Address details */
-#define PIO_ADDR_CONTEXT_MASK 0xfful
-#define PIO_ADDR_CONTEXT_SHIFT 16
-       sc->base_addr = dd->piobase + ((hw_context & PIO_ADDR_CONTEXT_MASK)
-                                       << PIO_ADDR_CONTEXT_SHIFT);
-
-       /* set base and credits */
-       reg = ((sci->credits & SC(CTRL_CTXT_DEPTH_MASK))
-                                       << SC(CTRL_CTXT_DEPTH_SHIFT))
-               | ((sci->base & SC(CTRL_CTXT_BASE_MASK))
-                                       << SC(CTRL_CTXT_BASE_SHIFT));
-       write_kctxt_csr(dd, hw_context, SC(CTRL), reg);
-
-       set_pio_integrity(sc);
-
-       /* unmask all errors */
-       write_kctxt_csr(dd, hw_context, SC(ERR_MASK), (u64)-1);
-
-       /* set the default partition key */
-       write_kctxt_csr(dd, hw_context, SC(CHECK_PARTITION_KEY),
-                       (SC(CHECK_PARTITION_KEY_VALUE_MASK) &
-                        DEFAULT_PKEY) <<
-                       SC(CHECK_PARTITION_KEY_VALUE_SHIFT));
-
-       /* per context type checks */
-       if (type == SC_USER) {
-               opval = USER_OPCODE_CHECK_VAL;
-               opmask = USER_OPCODE_CHECK_MASK;
-       } else {
-               opval = OPCODE_CHECK_VAL_DISABLED;
-               opmask = OPCODE_CHECK_MASK_DISABLED;
-       }
-
-       /* set the send context check opcode mask and value */
-       write_kctxt_csr(dd, hw_context, SC(CHECK_OPCODE),
-                       ((u64)opmask << SC(CHECK_OPCODE_MASK_SHIFT)) |
-                       ((u64)opval << SC(CHECK_OPCODE_VALUE_SHIFT)));
-
-       /* set up credit return */
-       reg = pa & SC(CREDIT_RETURN_ADDR_ADDRESS_SMASK);
-       write_kctxt_csr(dd, hw_context, SC(CREDIT_RETURN_ADDR), reg);
-
-       /*
-        * Calculate the initial credit return threshold.
-        *
-        * For Ack contexts, set a threshold for half the credits.
-        * For User contexts use the given percentage.  This has been
-        * sanitized on driver start-up.
-        * For Kernel contexts, use the default MTU plus a header
-        * or half the credits, whichever is smaller. This should
-        * work for both the 3-deep buffering allocation and the
-        * pooling allocation.
-        */
-       if (type == SC_ACK) {
-               thresh = sc_percent_to_threshold(sc, 50);
-       } else if (type == SC_USER) {
-               thresh = sc_percent_to_threshold(sc,
-                                                user_credit_return_threshold);
-       } else { /* kernel */
-               thresh = min(sc_percent_to_threshold(sc, 50),
-                            sc_mtu_to_threshold(sc, hfi1_max_mtu,
-                                                hdrqentsize));
-       }
-       reg = thresh << SC(CREDIT_CTRL_THRESHOLD_SHIFT);
-       /* add in early return */
-       if (type == SC_USER && HFI1_CAP_IS_USET(EARLY_CREDIT_RETURN))
-               reg |= SC(CREDIT_CTRL_EARLY_RETURN_SMASK);
-       else if (HFI1_CAP_IS_KSET(EARLY_CREDIT_RETURN)) /* kernel, ack */
-               reg |= SC(CREDIT_CTRL_EARLY_RETURN_SMASK);
-
-       /* set up write-through credit_ctrl */
-       sc->credit_ctrl = reg;
-       write_kctxt_csr(dd, hw_context, SC(CREDIT_CTRL), reg);
-
-       /* User send contexts should not allow sending on VL15 */
-       if (type == SC_USER) {
-               reg = 1ULL << 15;
-               write_kctxt_csr(dd, hw_context, SC(CHECK_VL), reg);
-       }
-
-       spin_unlock_irqrestore(&dd->sc_lock, flags);
-
-       /*
-        * Allocate shadow ring to track outstanding PIO buffers _after_
-        * unlocking.  We don't know the size until the lock is held and
-        * we can't allocate while the lock is held.  No one is using
-        * the context yet, so allocate it now.
-        *
-        * User contexts do not get a shadow ring.
-        */
-       if (type != SC_USER) {
-               /*
-                * Size the shadow ring 1 larger than the number of credits
-                * so head == tail can mean empty.
-                */
-               sc->sr_size = sci->credits + 1;
-               sc->sr = kzalloc_node(sizeof(union pio_shadow_ring) *
-                               sc->sr_size, GFP_KERNEL, numa);
-               if (!sc->sr) {
-                       sc_free(sc);
-                       return NULL;
-               }
-       }
-
-       hfi1_cdbg(PIO,
-                 "Send context %u(%u) %s group %u credits %u credit_ctrl 0x%llx threshold %u\n",
-                 sw_index,
-                 hw_context,
-                 sc_type_name(type),
-                 sc->group,
-                 sc->credits,
-                 sc->credit_ctrl,
-                 thresh);
-
-       return sc;
-}
-
-/* free a per-NUMA send context structure */
-void sc_free(struct send_context *sc)
-{
-       struct hfi1_devdata *dd;
-       unsigned long flags;
-       u32 sw_index;
-       u32 hw_context;
-
-       if (!sc)
-               return;
-
-       sc->flags |= SCF_IN_FREE;       /* ensure no restarts */
-       dd = sc->dd;
-       if (!list_empty(&sc->piowait))
-               dd_dev_err(dd, "piowait list not empty!\n");
-       sw_index = sc->sw_index;
-       hw_context = sc->hw_context;
-       sc_disable(sc); /* make sure the HW is disabled */
-       flush_work(&sc->halt_work);
-
-       spin_lock_irqsave(&dd->sc_lock, flags);
-       dd->send_contexts[sw_index].sc = NULL;
-
-       /* clear/disable all registers set in sc_alloc */
-       write_kctxt_csr(dd, hw_context, SC(CTRL), 0);
-       write_kctxt_csr(dd, hw_context, SC(CHECK_ENABLE), 0);
-       write_kctxt_csr(dd, hw_context, SC(ERR_MASK), 0);
-       write_kctxt_csr(dd, hw_context, SC(CHECK_PARTITION_KEY), 0);
-       write_kctxt_csr(dd, hw_context, SC(CHECK_OPCODE), 0);
-       write_kctxt_csr(dd, hw_context, SC(CREDIT_RETURN_ADDR), 0);
-       write_kctxt_csr(dd, hw_context, SC(CREDIT_CTRL), 0);
-
-       /* release the index and context for re-use */
-       sc_hw_free(dd, sw_index, hw_context);
-       spin_unlock_irqrestore(&dd->sc_lock, flags);
-
-       kfree(sc->sr);
-       free_percpu(sc->buffers_allocated);
-       kfree(sc);
-}
-
-/* disable the context */
-void sc_disable(struct send_context *sc)
-{
-       u64 reg;
-       unsigned long flags;
-       struct pio_buf *pbuf;
-
-       if (!sc)
-               return;
-
-       /* do all steps, even if already disabled */
-       spin_lock_irqsave(&sc->alloc_lock, flags);
-       reg = read_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL));
-       reg &= ~SC(CTRL_CTXT_ENABLE_SMASK);
-       sc->flags &= ~SCF_ENABLED;
-       sc_wait_for_packet_egress(sc, 1);
-       write_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL), reg);
-       spin_unlock_irqrestore(&sc->alloc_lock, flags);
-
-       /*
-        * Flush any waiters.  Once the context is disabled,
-        * credit return interrupts are stopped (although there
-        * could be one in-process when the context is disabled).
-        * Wait one microsecond for any lingering interrupts, then
-        * proceed with the flush.
-        */
-       udelay(1);
-       spin_lock_irqsave(&sc->release_lock, flags);
-       if (sc->sr) {   /* this context has a shadow ring */
-               while (sc->sr_tail != sc->sr_head) {
-                       pbuf = &sc->sr[sc->sr_tail].pbuf;
-                       if (pbuf->cb)
-                               (*pbuf->cb)(pbuf->arg, PRC_SC_DISABLE);
-                       sc->sr_tail++;
-                       if (sc->sr_tail >= sc->sr_size)
-                               sc->sr_tail = 0;
-               }
-       }
-       spin_unlock_irqrestore(&sc->release_lock, flags);
-}
-
-/* return SendEgressCtxtStatus.PacketOccupancy */
-#define packet_occupancy(r) \
-       (((r) & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SMASK)\
-       >> SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SHIFT)
-
-/* is egress halted on the context? */
-#define egress_halted(r) \
-       ((r) & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_HALT_STATUS_SMASK)
-
-/* wait for packet egress, optionally pause for credit return  */
-static void sc_wait_for_packet_egress(struct send_context *sc, int pause)
-{
-       struct hfi1_devdata *dd = sc->dd;
-       u64 reg = 0;
-       u64 reg_prev;
-       u32 loop = 0;
-
-       while (1) {
-               reg_prev = reg;
-               reg = read_csr(dd, sc->hw_context * 8 +
-                              SEND_EGRESS_CTXT_STATUS);
-               /* done if egress is stopped */
-               if (egress_halted(reg))
-                       break;
-               reg = packet_occupancy(reg);
-               if (reg == 0)
-                       break;
-               /* counter is reset if occupancy count changes */
-               if (reg != reg_prev)
-                       loop = 0;
-               if (loop > 500) {
-                       /* timed out - bounce the link */
-                       dd_dev_err(dd,
-                                  "%s: context %u(%u) timeout waiting for packets to egress, remaining count %u, bouncing link\n",
-                                  __func__, sc->sw_index,
-                                  sc->hw_context, (u32)reg);
-                       queue_work(dd->pport->hfi1_wq,
-                                  &dd->pport->link_bounce_work);
-                       break;
-               }
-               loop++;
-               udelay(1);
-       }
-
-       if (pause)
-               /* Add additional delay to ensure chip returns all credits */
-               pause_for_credit_return(dd);
-}
-
-void sc_wait(struct hfi1_devdata *dd)
-{
-       int i;
-
-       for (i = 0; i < dd->num_send_contexts; i++) {
-               struct send_context *sc = dd->send_contexts[i].sc;
-
-               if (!sc)
-                       continue;
-               sc_wait_for_packet_egress(sc, 0);
-       }
-}
-
-/*
- * Restart a context after it has been halted due to error.
- *
- * If the first step (waiting for the halt to be asserted) fails, return early.
- * Otherwise complain about timeouts but keep going.
- *
- * It is expected that allocations (enabled flag bit) have been shut off
- * already (only applies to kernel contexts).
- */
-int sc_restart(struct send_context *sc)
-{
-       struct hfi1_devdata *dd = sc->dd;
-       u64 reg;
-       u32 loop;
-       int count;
-
-       /* bounce off if not halted, or being free'd */
-       if (!(sc->flags & SCF_HALTED) || (sc->flags & SCF_IN_FREE))
-               return -EINVAL;
-
-       dd_dev_info(dd, "restarting send context %u(%u)\n", sc->sw_index,
-                   sc->hw_context);
-
-       /*
-        * Step 1: Wait for the context to actually halt.
-        *
-        * The error interrupt is asynchronous to actually setting halt
-        * on the context.
-        */
-       loop = 0;
-       while (1) {
-               reg = read_kctxt_csr(dd, sc->hw_context, SC(STATUS));
-               if (reg & SC(STATUS_CTXT_HALTED_SMASK))
-                       break;
-               if (loop > 100) {
-                       dd_dev_err(dd, "%s: context %u(%u) not halting, skipping\n",
-                                  __func__, sc->sw_index, sc->hw_context);
-                       return -ETIME;
-               }
-               loop++;
-               udelay(1);
-       }
-
-       /*
-        * Step 2: Ensure no users are still trying to write to PIO.
-        *
-        * For kernel contexts, we have already turned off buffer allocation.
-        * Now wait for the buffer count to go to zero.
-        *
-        * For user contexts, the user handling code has cut off write access
-        * to the context's PIO pages before calling this routine and will
-        * restore write access after this routine returns.
-        */
-       if (sc->type != SC_USER) {
-               /* kernel context */
-               loop = 0;
-               while (1) {
-                       count = get_buffers_allocated(sc);
-                       if (count == 0)
-                               break;
-                       if (loop > 100) {
-                               dd_dev_err(dd,
-                                          "%s: context %u(%u) timeout waiting for PIO buffers to zero, remaining %d\n",
-                                          __func__, sc->sw_index,
-                                          sc->hw_context, count);
-                       }
-                       loop++;
-                       udelay(1);
-               }
-       }
-
-       /*
-        * Step 3: Wait for all packets to egress.
-        * This is done while disabling the send context
-        *
-        * Step 4: Disable the context
-        *
-        * This is a superset of the halt.  After the disable, the
-        * errors can be cleared.
-        */
-       sc_disable(sc);
-
-       /*
-        * Step 5: Enable the context
-        *
-        * This enable will clear the halted flag and per-send context
-        * error flags.
-        */
-       return sc_enable(sc);
-}
-
-/*
- * PIO freeze processing.  To be called after the TXE block is fully frozen.
- * Go through all frozen send contexts and disable them.  The contexts are
- * already stopped by the freeze.
- */
-void pio_freeze(struct hfi1_devdata *dd)
-{
-       struct send_context *sc;
-       int i;
-
-       for (i = 0; i < dd->num_send_contexts; i++) {
-               sc = dd->send_contexts[i].sc;
-               /*
-                * Don't disable unallocated, unfrozen, or user send contexts.
-                * User send contexts will be disabled when the process
-                * calls into the driver to reset its context.
-                */
-               if (!sc || !(sc->flags & SCF_FROZEN) || sc->type == SC_USER)
-                       continue;
-
-               /* only need to disable, the context is already stopped */
-               sc_disable(sc);
-       }
-}
-
-/*
- * Unfreeze PIO for kernel send contexts.  The precondition for calling this
- * is that all PIO send contexts have been disabled and the SPC freeze has
- * been cleared.  Now perform the last step and re-enable each kernel context.
- * User (PSM) processing will occur when PSM calls into the kernel to
- * acknowledge the freeze.
- */
-void pio_kernel_unfreeze(struct hfi1_devdata *dd)
-{
-       struct send_context *sc;
-       int i;
-
-       for (i = 0; i < dd->num_send_contexts; i++) {
-               sc = dd->send_contexts[i].sc;
-               if (!sc || !(sc->flags & SCF_FROZEN) || sc->type == SC_USER)
-                       continue;
-
-               sc_enable(sc);  /* will clear the sc frozen flag */
-       }
-}
-
-/*
- * Wait for the SendPioInitCtxt.PioInitInProgress bit to clear.
- * Returns:
- *     -ETIMEDOUT - if we wait too long
- *     -EIO       - if there was an error
- */
-static int pio_init_wait_progress(struct hfi1_devdata *dd)
-{
-       u64 reg;
-       int max, count = 0;
-
-       /* max is the longest possible HW init time / delay */
-       max = (dd->icode == ICODE_FPGA_EMULATION) ? 120 : 5;
-       while (1) {
-               reg = read_csr(dd, SEND_PIO_INIT_CTXT);
-               if (!(reg & SEND_PIO_INIT_CTXT_PIO_INIT_IN_PROGRESS_SMASK))
-                       break;
-               if (count >= max)
-                       return -ETIMEDOUT;
-               udelay(5);
-               count++;
-       }
-
-       return reg & SEND_PIO_INIT_CTXT_PIO_INIT_ERR_SMASK ? -EIO : 0;
-}
-
-/*
- * Reset all of the send contexts to their power-on state.  Used
- * only during manual init - no lock against sc_enable needed.
- */
-void pio_reset_all(struct hfi1_devdata *dd)
-{
-       int ret;
-
-       /* make sure the init engine is not busy */
-       ret = pio_init_wait_progress(dd);
-       /* ignore any timeout */
-       if (ret == -EIO) {
-               /* clear the error */
-               write_csr(dd, SEND_PIO_ERR_CLEAR,
-                         SEND_PIO_ERR_CLEAR_PIO_INIT_SM_IN_ERR_SMASK);
-       }
-
-       /* reset init all */
-       write_csr(dd, SEND_PIO_INIT_CTXT,
-                 SEND_PIO_INIT_CTXT_PIO_ALL_CTXT_INIT_SMASK);
-       udelay(2);
-       ret = pio_init_wait_progress(dd);
-       if (ret < 0) {
-               dd_dev_err(dd,
-                          "PIO send context init %s while initializing all PIO blocks\n",
-                          ret == -ETIMEDOUT ? "is stuck" : "had an error");
-       }
-}
-
-/* enable the context */
-int sc_enable(struct send_context *sc)
-{
-       u64 sc_ctrl, reg, pio;
-       struct hfi1_devdata *dd;
-       unsigned long flags;
-       int ret = 0;
-
-       if (!sc)
-               return -EINVAL;
-       dd = sc->dd;
-
-       /*
-        * Obtain the allocator lock to guard against any allocation
-        * attempts (which should not happen prior to context being
-        * enabled). On the release/disable side we don't need to
-        * worry about locking since the releaser will not do anything
-        * if the context accounting values have not changed.
-        */
-       spin_lock_irqsave(&sc->alloc_lock, flags);
-       sc_ctrl = read_kctxt_csr(dd, sc->hw_context, SC(CTRL));
-       if ((sc_ctrl & SC(CTRL_CTXT_ENABLE_SMASK)))
-               goto unlock; /* already enabled */
-
-       /* IMPORTANT: only clear free and fill if transitioning 0 -> 1 */
-
-       *sc->hw_free = 0;
-       sc->free = 0;
-       sc->alloc_free = 0;
-       sc->fill = 0;
-       sc->sr_head = 0;
-       sc->sr_tail = 0;
-       sc->flags = 0;
-       /* the alloc lock ensures no fast path allocation */
-       reset_buffers_allocated(sc);
-
-       /*
-        * Clear all per-context errors.  Some of these will be set when
-        * we are re-enabling after a context halt.  Now that the context
-        * is disabled, the halt will not clear until after the PIO init
-        * engine runs below.
-        */
-       reg = read_kctxt_csr(dd, sc->hw_context, SC(ERR_STATUS));
-       if (reg)
-               write_kctxt_csr(dd, sc->hw_context, SC(ERR_CLEAR), reg);
-
-       /*
-        * The HW PIO initialization engine can handle only one init
-        * request at a time. Serialize access to each device's engine.
-        */
-       spin_lock(&dd->sc_init_lock);
-       /*
-        * Since access to this code block is serialized and
-        * each access waits for the initialization to complete
-        * before releasing the lock, the PIO initialization engine
-        * should not be in use, so we don't have to wait for the
-        * InProgress bit to go down.
-        */
-       pio = ((sc->hw_context & SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_MASK) <<
-              SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_SHIFT) |
-               SEND_PIO_INIT_CTXT_PIO_SINGLE_CTXT_INIT_SMASK;
-       write_csr(dd, SEND_PIO_INIT_CTXT, pio);
-       /*
-        * Wait until the engine is done.  Give the chip the required time
-        * so, hopefully, we read the register just once.
-        */
-       udelay(2);
-       ret = pio_init_wait_progress(dd);
-       spin_unlock(&dd->sc_init_lock);
-       if (ret) {
-               dd_dev_err(dd,
-                          "sctxt%u(%u): Context not enabled due to init failure %d\n",
-                          sc->sw_index, sc->hw_context, ret);
-               goto unlock;
-       }
-
-       /*
-        * All is well. Enable the context.
-        */
-       sc_ctrl |= SC(CTRL_CTXT_ENABLE_SMASK);
-       write_kctxt_csr(dd, sc->hw_context, SC(CTRL), sc_ctrl);
-       /*
-        * Read SendCtxtCtrl to force the write out and prevent a timing
-        * hazard where a PIO write may reach the context before the enable.
-        */
-       read_kctxt_csr(dd, sc->hw_context, SC(CTRL));
-       sc->flags |= SCF_ENABLED;
-
-unlock:
-       spin_unlock_irqrestore(&sc->alloc_lock, flags);
-
-       return ret;
-}
-
-/* force a credit return on the context */
-void sc_return_credits(struct send_context *sc)
-{
-       if (!sc)
-               return;
-
-       /* a 0->1 transition schedules a credit return */
-       write_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE),
-                       SC(CREDIT_FORCE_FORCE_RETURN_SMASK));
-       /*
-        * Ensure that the write is flushed and the credit return is
-        * scheduled. We care more about the 0 -> 1 transition.
-        */
-       read_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE));
-       /* set back to 0 for next time */
-       write_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE), 0);
-}
-
-/* allow all in-flight packets to drain on the context */
-void sc_flush(struct send_context *sc)
-{
-       if (!sc)
-               return;
-
-       sc_wait_for_packet_egress(sc, 1);
-}
-
-/* drop all packets on the context, no waiting until they are sent */
-void sc_drop(struct send_context *sc)
-{
-       if (!sc)
-               return;
-
-       dd_dev_info(sc->dd, "%s: context %u(%u) - not implemented\n",
-                   __func__, sc->sw_index, sc->hw_context);
-}
-
-/*
- * Start the software reaction to a context halt or SPC freeze:
- *     - mark the context as halted or frozen
- *     - stop buffer allocations
- *
- * Called from the error interrupt.  Other work is deferred until
- * out of the interrupt.
- */
-void sc_stop(struct send_context *sc, int flag)
-{
-       unsigned long flags;
-
-       /* mark the context */
-       sc->flags |= flag;
-
-       /* stop buffer allocations */
-       spin_lock_irqsave(&sc->alloc_lock, flags);
-       sc->flags &= ~SCF_ENABLED;
-       spin_unlock_irqrestore(&sc->alloc_lock, flags);
-       wake_up(&sc->halt_wait);
-}
-
-#define BLOCK_DWORDS (PIO_BLOCK_SIZE / sizeof(u32))
-#define dwords_to_blocks(x) DIV_ROUND_UP(x, BLOCK_DWORDS)
-
-/*
- * The send context buffer "allocator".
- *
- * @sc: the PIO send context we are allocating from
- * @len: length of whole packet - including PBC - in dwords
- * @cb: optional callback to call when the buffer is finished sending
- * @arg: argument for cb
- *
- * Return a pointer to a PIO buffer if successful, NULL if not enough room.
- */
-struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len,
-                               pio_release_cb cb, void *arg)
-{
-       struct pio_buf *pbuf = NULL;
-       unsigned long flags;
-       unsigned long avail;
-       unsigned long blocks = dwords_to_blocks(dw_len);
-       unsigned long start_fill;
-       int trycount = 0;
-       u32 head, next;
-
-       spin_lock_irqsave(&sc->alloc_lock, flags);
-       if (!(sc->flags & SCF_ENABLED)) {
-               spin_unlock_irqrestore(&sc->alloc_lock, flags);
-               goto done;
-       }
-
-retry:
-       avail = (unsigned long)sc->credits - (sc->fill - sc->alloc_free);
-       if (blocks > avail) {
-               /* not enough room */
-               if (unlikely(trycount)) { /* already tried to get more room */
-                       spin_unlock_irqrestore(&sc->alloc_lock, flags);
-                       goto done;
-               }
-               /* copy from receiver cache line and recalculate */
-               sc->alloc_free = ACCESS_ONCE(sc->free);
-               avail =
-                       (unsigned long)sc->credits -
-                       (sc->fill - sc->alloc_free);
-               if (blocks > avail) {
-                       /* still no room, actively update */
-                       spin_unlock_irqrestore(&sc->alloc_lock, flags);
-                       sc_release_update(sc);
-                       spin_lock_irqsave(&sc->alloc_lock, flags);
-                       sc->alloc_free = ACCESS_ONCE(sc->free);
-                       trycount++;
-                       goto retry;
-               }
-       }
-
-       /* there is enough room */
-
-       preempt_disable();
-       this_cpu_inc(*sc->buffers_allocated);
-
-       /* read this once */
-       head = sc->sr_head;
-
-       /* "allocate" the buffer */
-       start_fill = sc->fill;
-       sc->fill += blocks;
-
-       /*
-        * Fill the parts that the releaser looks at before moving the head.
-        * The only necessary piece is the sent_at field.  The credits
-        * we have just allocated cannot have been returned yet, so the
-        * cb and arg will not be looked at for a "while".  Put them
-        * on this side of the memory barrier anyway.
-        */
-       pbuf = &sc->sr[head].pbuf;
-       pbuf->sent_at = sc->fill;
-       pbuf->cb = cb;
-       pbuf->arg = arg;
-       pbuf->sc = sc;  /* could be filled in at sc->sr init time */
-       /* make sure this is in memory before updating the head */
-
-       /* calculate next head index, do not store */
-       next = head + 1;
-       if (next >= sc->sr_size)
-               next = 0;
-       /*
-        * update the head - must be last! - the releaser can look at fields
-        * in pbuf once we move the head
-        */
-       smp_wmb();
-       sc->sr_head = next;
-       spin_unlock_irqrestore(&sc->alloc_lock, flags);
-
-       /* finish filling in the buffer outside the lock */
-       pbuf->start = sc->base_addr + ((start_fill % sc->credits)
-                                                       * PIO_BLOCK_SIZE);
-       pbuf->size = sc->credits * PIO_BLOCK_SIZE;
-       pbuf->end = sc->base_addr + pbuf->size;
-       pbuf->block_count = blocks;
-       pbuf->qw_written = 0;
-       pbuf->carry_bytes = 0;
-       pbuf->carry.val64 = 0;
-done:
-       return pbuf;
-}
-
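The room check above relies on free-running unsigned counters: fill only grows as buffers are handed out and free/alloc_free only grow as credits come back, so credits - (fill - alloc_free) stays correct even when the counters wrap. Below is a minimal userspace sketch of that arithmetic; the names mirror the fields above, but the values are invented and this is not driver code.

#include <stdio.h>

/*
 * Wrap-safe "room left" arithmetic mirroring sc_buffer_alloc():
 * avail = credits - (fill - alloc_free).  Both counters only ever
 * grow, so the subtraction is correct even across wrap-around.
 */
int main(void)
{
        unsigned long credits = 64;               /* blocks in the context */
        unsigned long fill = (unsigned long)-10;  /* alloc counter, about to wrap */
        unsigned long alloc_free = fill - 50;     /* 50 blocks still outstanding */
        unsigned long blocks = 20;                /* size of this request */
        unsigned long avail = credits - (fill - alloc_free);

        printf("outstanding=%lu avail=%lu -> %s\n", fill - alloc_free, avail,
               blocks > avail ? "no room, retry" : "allocate");
        return 0;
}
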
-/*
- * There are at least two entities that can turn on credit return
- * interrupts and they can overlap.  Avoid problems by implementing
- * a count scheme that is enforced by a lock.  The lock is needed because
- * the count and CSR write must be paired.
- */
-
-/*
- * Start credit return interrupts.  This is managed by a count.  If already
- * on, just increment the count.
- */
-void sc_add_credit_return_intr(struct send_context *sc)
-{
-       unsigned long flags;
-
-       /* lock must surround both the count change and the CSR update */
-       spin_lock_irqsave(&sc->credit_ctrl_lock, flags);
-       if (sc->credit_intr_count == 0) {
-               sc->credit_ctrl |= SC(CREDIT_CTRL_CREDIT_INTR_SMASK);
-               write_kctxt_csr(sc->dd, sc->hw_context,
-                               SC(CREDIT_CTRL), sc->credit_ctrl);
-       }
-       sc->credit_intr_count++;
-       spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags);
-}
-
-/*
- * Stop credit return interrupts.  This is managed by a count.  Decrement the
- * count, if the last user, then turn the credit interrupts off.
- */
-void sc_del_credit_return_intr(struct send_context *sc)
-{
-       unsigned long flags;
-
-       WARN_ON(sc->credit_intr_count == 0);
-
-       /* lock must surround both the count change and the CSR update */
-       spin_lock_irqsave(&sc->credit_ctrl_lock, flags);
-       sc->credit_intr_count--;
-       if (sc->credit_intr_count == 0) {
-               sc->credit_ctrl &= ~SC(CREDIT_CTRL_CREDIT_INTR_SMASK);
-               write_kctxt_csr(sc->dd, sc->hw_context,
-                               SC(CREDIT_CTRL), sc->credit_ctrl);
-       }
-       spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags);
-}
-
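The add/del pair above behaves like a reference count on the interrupt enable bit: the CSR is only written on the 0 to 1 and 1 to 0 transitions, so nested users just bump the count. A tiny sketch of the pairing contract with plain counters follows (no CSR access, not driver code).

#include <stdio.h>

/*
 * Reference-count pattern behind sc_add/del_credit_return_intr():
 * the "hardware" enable bit only changes on 0<->1 transitions.
 */
static unsigned int intr_count;
static int intr_enabled;

static void add_intr(void) { if (intr_count++ == 0) intr_enabled = 1; }
static void del_intr(void) { if (--intr_count == 0) intr_enabled = 0; }

int main(void)
{
        add_intr();                     /* first user: enable written */
        add_intr();                     /* nested user: count only */
        del_intr();                     /* one user left: stays enabled */
        printf("count=%u enabled=%d\n", intr_count, intr_enabled);  /* 1 1 */
        del_intr();                     /* last user: disable written */
        printf("count=%u enabled=%d\n", intr_count, intr_enabled);  /* 0 0 */
        return 0;
}
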
-/*
- * The caller must be careful when calling this: every call with needint
- * set must be paired with a matching call with needint clear.
- */
-void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint)
-{
-       if (needint)
-               sc_add_credit_return_intr(sc);
-       else
-               sc_del_credit_return_intr(sc);
-       trace_hfi1_wantpiointr(sc, needint, sc->credit_ctrl);
-       if (needint) {
-               mmiowb();
-               sc_return_credits(sc);
-       }
-}
-
-/**
- * sc_piobufavail - callback when a PIO buffer is available
- * @sc: the send context
- *
- * This is called from the interrupt handler when a PIO buffer is
- * available after hfi1_verbs_send() returned an error that no buffers were
- * available. Disable the interrupt if there are no more QPs waiting.
- */
-static void sc_piobufavail(struct send_context *sc)
-{
-       struct hfi1_devdata *dd = sc->dd;
-       struct hfi1_ibdev *dev = &dd->verbs_dev;
-       struct list_head *list;
-       struct rvt_qp *qps[PIO_WAIT_BATCH_SIZE];
-       struct rvt_qp *qp;
-       struct hfi1_qp_priv *priv;
-       unsigned long flags;
-       unsigned i, n = 0;
-
-       if (dd->send_contexts[sc->sw_index].type != SC_KERNEL &&
-           dd->send_contexts[sc->sw_index].type != SC_VL15)
-               return;
-       list = &sc->piowait;
-       /*
-        * Note: checking that the piowait list is empty and clearing
-        * the buffer available interrupt needs to be atomic or we
-        * could end up with QPs on the wait list with the interrupt
-        * disabled.
-        */
-       write_seqlock_irqsave(&dev->iowait_lock, flags);
-       while (!list_empty(list)) {
-               struct iowait *wait;
-
-               if (n == ARRAY_SIZE(qps))
-                       break;
-               wait = list_first_entry(list, struct iowait, list);
-               qp = iowait_to_qp(wait);
-               priv = qp->priv;
-               list_del_init(&priv->s_iowait.list);
-               /* refcount held until actual wake up */
-               qps[n++] = qp;
-       }
-       /*
-        * If there had been waiters and there are more,
-        * ensure that we redo the force to avoid a potential hang.
-        */
-       if (n) {
-               hfi1_sc_wantpiobuf_intr(sc, 0);
-               if (!list_empty(list))
-                       hfi1_sc_wantpiobuf_intr(sc, 1);
-       }
-       write_sequnlock_irqrestore(&dev->iowait_lock, flags);
-
-       for (i = 0; i < n; i++)
-               hfi1_qp_wakeup(qps[i],
-                              RVT_S_WAIT_PIO | RVT_S_WAIT_PIO_DRAIN);
-}
-
-/* translate a send credit update to a bit code of reasons */
-static inline int fill_code(u64 hw_free)
-{
-       int code = 0;
-
-       if (hw_free & CR_STATUS_SMASK)
-               code |= PRC_STATUS_ERR;
-       if (hw_free & CR_CREDIT_RETURN_DUE_TO_PBC_SMASK)
-               code |= PRC_PBC;
-       if (hw_free & CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SMASK)
-               code |= PRC_THRESHOLD;
-       if (hw_free & CR_CREDIT_RETURN_DUE_TO_ERR_SMASK)
-               code |= PRC_FILL_ERR;
-       if (hw_free & CR_CREDIT_RETURN_DUE_TO_FORCE_SMASK)
-               code |= PRC_SC_DISABLE;
-       return code;
-}
-
-/* use the jiffies compare to get the wrap right */
-#define sent_before(a, b) time_before(a, b)    /* a < b */
-
-/*
- * The send context buffer "releaser".
- */
-void sc_release_update(struct send_context *sc)
-{
-       struct pio_buf *pbuf;
-       u64 hw_free;
-       u32 head, tail;
-       unsigned long old_free;
-       unsigned long free;
-       unsigned long extra;
-       unsigned long flags;
-       int code;
-
-       if (!sc)
-               return;
-
-       spin_lock_irqsave(&sc->release_lock, flags);
-       /* update free */
-       hw_free = le64_to_cpu(*sc->hw_free);            /* volatile read */
-       old_free = sc->free;
-       extra = (((hw_free & CR_COUNTER_SMASK) >> CR_COUNTER_SHIFT)
-                       - (old_free & CR_COUNTER_MASK))
-                               & CR_COUNTER_MASK;
-       free = old_free + extra;
-       trace_hfi1_piofree(sc, extra);
-
-       /* call sent buffer callbacks */
-       code = -1;                              /* code not yet set */
-       head = ACCESS_ONCE(sc->sr_head);        /* snapshot the head */
-       tail = sc->sr_tail;
-       while (head != tail) {
-               pbuf = &sc->sr[tail].pbuf;
-
-               if (sent_before(free, pbuf->sent_at)) {
-                       /* not sent yet */
-                       break;
-               }
-               if (pbuf->cb) {
-                       if (code < 0) /* fill in code on first user */
-                               code = fill_code(hw_free);
-                       (*pbuf->cb)(pbuf->arg, code);
-               }
-
-               tail++;
-               if (tail >= sc->sr_size)
-                       tail = 0;
-       }
-       sc->sr_tail = tail;
-       /* make sure tail is updated before free */
-       smp_wmb();
-       sc->free = free;
-       spin_unlock_irqrestore(&sc->release_lock, flags);
-       sc_piobufavail(sc);
-}
-
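The extra calculation above is modular: the hardware free counter is narrower than the software free count, so the newly returned credits are recovered as (hw_counter - (old_free & MASK)) & MASK and added to the running total. A standalone sketch of that wrap-safe delta follows, using a hypothetical 16-bit mask in place of CR_COUNTER_MASK (the real width is whatever the chip defines).

#include <stdio.h>

/*
 * Modular "credits returned since last look" math from
 * sc_release_update().  CR_MASK stands in for CR_COUNTER_MASK and is
 * assumed 16 bits wide purely for illustration.
 */
#define CR_MASK 0xFFFFUL

int main(void)
{
        unsigned long old_free = 0x12FFF0;      /* running software free count */
        unsigned long hw_counter = 0x0005;      /* narrow counter read from hw_free */
        unsigned long extra = (hw_counter - (old_free & CR_MASK)) & CR_MASK;
        unsigned long free = old_free + extra;

        printf("extra=%lu new free=0x%lx\n", extra, free);  /* extra=21, 0x130005 */
        return 0;
}
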
-/*
- * Send context group releaser.  Argument is the send context that caused
- * the interrupt.  Called from the send context interrupt handler.
- *
- * Call release on all contexts in the group.
- *
- * This routine takes the sc_lock without an irqsave because it is only
- * called from an interrupt handler.  Adjust if that changes.
- */
-void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context)
-{
-       struct send_context *sc;
-       u32 sw_index;
-       u32 gc, gc_end;
-
-       spin_lock(&dd->sc_lock);
-       sw_index = dd->hw_to_sw[hw_context];
-       if (unlikely(sw_index >= dd->num_send_contexts)) {
-               dd_dev_err(dd, "%s: invalid hw (%u) to sw (%u) mapping\n",
-                          __func__, hw_context, sw_index);
-               goto done;
-       }
-       sc = dd->send_contexts[sw_index].sc;
-       if (unlikely(!sc))
-               goto done;
-
-       gc = group_context(hw_context, sc->group);
-       gc_end = gc + group_size(sc->group);
-       for (; gc < gc_end; gc++) {
-               sw_index = dd->hw_to_sw[gc];
-               if (unlikely(sw_index >= dd->num_send_contexts)) {
-                       dd_dev_err(dd,
-                                  "%s: invalid hw (%u) to sw (%u) mapping\n",
-                                  __func__, hw_context, sw_index);
-                       continue;
-               }
-               sc_release_update(dd->send_contexts[sw_index].sc);
-       }
-done:
-       spin_unlock(&dd->sc_lock);
-}
-
-/*
- * pio_select_send_context_vl() - select send context
- * @dd: devdata
- * @selector: a spreading factor
- * @vl: this vl
- *
- * This function returns a send context based on the selector and a vl.
- * The mapping fields are protected by RCU
- */
-struct send_context *pio_select_send_context_vl(struct hfi1_devdata *dd,
-                                               u32 selector, u8 vl)
-{
-       struct pio_vl_map *m;
-       struct pio_map_elem *e;
-       struct send_context *rval;
-
-       /*
-        * NOTE: This should only happen if SC->VL changed after the initial
-        * checks on the QP/AH
-        * Default will return VL0's send context below
-        */
-       if (unlikely(vl >= num_vls)) {
-               rval = NULL;
-               goto done;
-       }
-
-       rcu_read_lock();
-       m = rcu_dereference(dd->pio_map);
-       if (unlikely(!m)) {
-               rcu_read_unlock();
-               return dd->vld[0].sc;
-       }
-       e = m->map[vl & m->mask];
-       rval = e->ksc[selector & e->mask];
-       rcu_read_unlock();
-
-done:
-       rval = !rval ? dd->vld[0].sc : rval;
-       return rval;
-}
-
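Both selection paths index with selector & mask rather than a true modulo; because the map sizes are rounded up to powers of two, the AND is equivalent to a modulo by the array size, and any padding slots simply repeat earlier send contexts. A small sketch of that indexing with invented sizes follows (not the driver's data structures).

#include <stdio.h>

/*
 * Power-of-two "mod by mask" indexing used by the pio_map lookups:
 * selector & mask == selector % size when size is a power of two.
 */
static unsigned int roundup_pow2(unsigned int n)
{
        unsigned int p = 1;

        while (p < n)
                p <<= 1;
        return p;
}

int main(void)
{
        unsigned int actual = 3;                  /* e.g. 3 contexts for this VL */
        unsigned int size = roundup_pow2(actual); /* 4 slots; the extra slot repeats */
        unsigned int mask = size - 1;
        unsigned int selector;

        for (selector = 0; selector < 8; selector++)
                printf("selector %u -> ksc[%u]\n", selector, selector & mask);
        return 0;
}
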
-/*
- * pio_select_send_context_sc() - select send context
- * @dd: devdata
- * @selector: a spreading factor
- * @sc5: the 5 bit sc
- *
- * This function returns a send context based on the selector and an sc.
- */
-struct send_context *pio_select_send_context_sc(struct hfi1_devdata *dd,
-                                               u32 selector, u8 sc5)
-{
-       u8 vl = sc_to_vlt(dd, sc5);
-
-       return pio_select_send_context_vl(dd, selector, vl);
-}
-
-/*
- * Free the indicated map struct
- */
-static void pio_map_free(struct pio_vl_map *m)
-{
-       int i;
-
-       for (i = 0; m && i < m->actual_vls; i++)
-               kfree(m->map[i]);
-       kfree(m);
-}
-
-/*
- * Handle RCU callback
- */
-static void pio_map_rcu_callback(struct rcu_head *list)
-{
-       struct pio_vl_map *m = container_of(list, struct pio_vl_map, list);
-
-       pio_map_free(m);
-}
-
-/*
- * pio_map_init - called when #vls change
- * @dd: hfi1_devdata
- * @port: port number
- * @num_vls: number of vls
- * @vl_scontexts: per vl send context mapping (optional)
- *
- * This routine changes the mapping based on the number of vls.
- *
- * vl_scontexts is used to specify a non-uniform vl/send context
- * loading. NULL implies auto computing the loading and giving each
- * VL a uniform distribution of send contexts per VL.
- *
- * The auto algorithm computes the sc_per_vl and the number of extra
- * send contexts. Any extra send contexts are added from the last VL
- * on down
- *
- * rcu locking is used here to control access to the mapping fields.
- *
- * If either num_vls or num_send_contexts is not a power of 2, the
- * array sizes in the struct pio_vl_map and the struct pio_map_elem are
- * rounded up to the next highest power of 2 and the first entry is
- * reused in a round robin fashion.
- *
- * If an error occurs, the map change is not done and the mapping is not
- * changed.
- *
- */
-int pio_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, u8 *vl_scontexts)
-{
-       int i, j;
-       int extra, sc_per_vl;
-       int scontext = 1;
-       int num_kernel_send_contexts = 0;
-       u8 lvl_scontexts[OPA_MAX_VLS];
-       struct pio_vl_map *oldmap, *newmap;
-
-       if (!vl_scontexts) {
-               /* send context 0 reserved for VL15 */
-               for (i = 1; i < dd->num_send_contexts; i++)
-                       if (dd->send_contexts[i].type == SC_KERNEL)
-                               num_kernel_send_contexts++;
-               /* truncate divide */
-               sc_per_vl = num_kernel_send_contexts / num_vls;
-               /* extras */
-               extra = num_kernel_send_contexts % num_vls;
-               vl_scontexts = lvl_scontexts;
-               /* add extras from last vl down */
-               for (i = num_vls - 1; i >= 0; i--, extra--)
-                       vl_scontexts[i] = sc_per_vl + (extra > 0 ? 1 : 0);
-       }
-       /* build new map */
-       newmap = kzalloc(sizeof(*newmap) +
-                        roundup_pow_of_two(num_vls) *
-                        sizeof(struct pio_map_elem *),
-                        GFP_KERNEL);
-       if (!newmap)
-               goto bail;
-       newmap->actual_vls = num_vls;
-       newmap->vls = roundup_pow_of_two(num_vls);
-       newmap->mask = (1 << ilog2(newmap->vls)) - 1;
-       for (i = 0; i < newmap->vls; i++) {
-               /* save for wrap around */
-               int first_scontext = scontext;
-
-               if (i < newmap->actual_vls) {
-                       int sz = roundup_pow_of_two(vl_scontexts[i]);
-
-                       /* only allocate once */
-                       newmap->map[i] = kzalloc(sizeof(*newmap->map[i]) +
-                                                sz * sizeof(struct
-                                                            send_context *),
-                                                GFP_KERNEL);
-                       if (!newmap->map[i])
-                               goto bail;
-                       newmap->map[i]->mask = (1 << ilog2(sz)) - 1;
-                       /* assign send contexts */
-                       for (j = 0; j < sz; j++) {
-                               if (dd->kernel_send_context[scontext])
-                                       newmap->map[i]->ksc[j] =
-                                       dd->kernel_send_context[scontext];
-                               if (++scontext >= first_scontext +
-                                                 vl_scontexts[i])
-                                       /* wrap back to first send context */
-                                       scontext = first_scontext;
-                       }
-               } else {
-                       /* just re-use entry without allocating */
-                       newmap->map[i] = newmap->map[i % num_vls];
-               }
-               scontext = first_scontext + vl_scontexts[i];
-       }
-       /* newmap in hand, save old map */
-       spin_lock_irq(&dd->pio_map_lock);
-       oldmap = rcu_dereference_protected(dd->pio_map,
-                                          lockdep_is_held(&dd->pio_map_lock));
-
-       /* publish newmap */
-       rcu_assign_pointer(dd->pio_map, newmap);
-
-       spin_unlock_irq(&dd->pio_map_lock);
-       /* success, free any old map after grace period */
-       if (oldmap)
-               call_rcu(&oldmap->list, pio_map_rcu_callback);
-       return 0;
-bail:
-       /* free any partial allocation */
-       pio_map_free(newmap);
-       return -ENOMEM;
-}
-
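When vl_scontexts is NULL, the routine above splits the kernel send contexts evenly across the VLs and hands any remainder out starting from the last VL. A standalone sketch of that distribution follows, with invented counts.

#include <stdio.h>

/*
 * Automatic per-VL send context distribution from pio_map_init():
 * even split, with the remainder handed out from the last VL down.
 */
int main(void)
{
        int num_kernel_send_contexts = 10;      /* invented count */
        int num_vls = 4;
        int vl_scontexts[4];
        int sc_per_vl = num_kernel_send_contexts / num_vls;    /* 2 */
        int extra = num_kernel_send_contexts % num_vls;        /* 2 */
        int i;

        for (i = num_vls - 1; i >= 0; i--, extra--)
                vl_scontexts[i] = sc_per_vl + (extra > 0 ? 1 : 0);

        for (i = 0; i < num_vls; i++)
                printf("VL%d: %d send contexts\n", i, vl_scontexts[i]);
        return 0;                               /* prints 2, 2, 3, 3 */
}
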
-void free_pio_map(struct hfi1_devdata *dd)
-{
-       /* Free PIO map if allocated */
-       if (rcu_access_pointer(dd->pio_map)) {
-               spin_lock_irq(&dd->pio_map_lock);
-               pio_map_free(rcu_access_pointer(dd->pio_map));
-               RCU_INIT_POINTER(dd->pio_map, NULL);
-               spin_unlock_irq(&dd->pio_map_lock);
-               synchronize_rcu();
-       }
-       kfree(dd->kernel_send_context);
-       dd->kernel_send_context = NULL;
-}
-
-int init_pervl_scs(struct hfi1_devdata *dd)
-{
-       int i;
-       u64 mask, all_vl_mask = (u64)0x80ff; /* VLs 0-7, 15 */
-       u64 data_vls_mask = (u64)0x00ff; /* VLs 0-7 */
-       u32 ctxt;
-       struct hfi1_pportdata *ppd = dd->pport;
-
-       dd->vld[15].sc = sc_alloc(dd, SC_VL15,
-                                 dd->rcd[0]->rcvhdrqentsize, dd->node);
-       if (!dd->vld[15].sc)
-               goto nomem;
-       hfi1_init_ctxt(dd->vld[15].sc);
-       dd->vld[15].mtu = enum_to_mtu(OPA_MTU_2048);
-
-       dd->kernel_send_context = kmalloc_node(dd->num_send_contexts *
-                                       sizeof(struct send_context *),
-                                       GFP_KERNEL, dd->node);
-       dd->kernel_send_context[0] = dd->vld[15].sc;
-
-       for (i = 0; i < num_vls; i++) {
-               /*
-                * Since this function does not deal with a specific
-                * receive context but we need the RcvHdrQ entry size,
-                * use the size from rcd[0]. It is guaranteed to be
-                * valid at this point and will remain the same for all
-                * receive contexts.
-                */
-               dd->vld[i].sc = sc_alloc(dd, SC_KERNEL,
-                                        dd->rcd[0]->rcvhdrqentsize, dd->node);
-               if (!dd->vld[i].sc)
-                       goto nomem;
-               dd->kernel_send_context[i + 1] = dd->vld[i].sc;
-               hfi1_init_ctxt(dd->vld[i].sc);
-               /* non VL15 start with the max MTU */
-               dd->vld[i].mtu = hfi1_max_mtu;
-       }
-       for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++) {
-               dd->kernel_send_context[i + 1] =
-               sc_alloc(dd, SC_KERNEL, dd->rcd[0]->rcvhdrqentsize, dd->node);
-               if (!dd->kernel_send_context[i + 1])
-                       goto nomem;
-               hfi1_init_ctxt(dd->kernel_send_context[i + 1]);
-       }
-
-       sc_enable(dd->vld[15].sc);
-       ctxt = dd->vld[15].sc->hw_context;
-       mask = all_vl_mask & ~(1LL << 15);
-       write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask);
-       dd_dev_info(dd,
-                   "Using send context %u(%u) for VL15\n",
-                   dd->vld[15].sc->sw_index, ctxt);
-
-       for (i = 0; i < num_vls; i++) {
-               sc_enable(dd->vld[i].sc);
-               ctxt = dd->vld[i].sc->hw_context;
-               mask = all_vl_mask & ~(data_vls_mask);
-               write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask);
-       }
-       for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++) {
-               sc_enable(dd->kernel_send_context[i + 1]);
-               ctxt = dd->kernel_send_context[i + 1]->hw_context;
-               mask = all_vl_mask & ~(data_vls_mask);
-               write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask);
-       }
-
-       if (pio_map_init(dd, ppd->port - 1, num_vls, NULL))
-               goto nomem;
-       return 0;
-nomem:
-       sc_free(dd->vld[15].sc);
-       for (i = 0; i < num_vls; i++)
-               sc_free(dd->vld[i].sc);
-       for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++)
-               sc_free(dd->kernel_send_context[i + 1]);
-       return -ENOMEM;
-}
-
-int init_credit_return(struct hfi1_devdata *dd)
-{
-       int ret;
-       int num_numa;
-       int i;
-
-       num_numa = num_online_nodes();
-       /* enforce the expectation that the numas are compact */
-       for (i = 0; i < num_numa; i++) {
-               if (!node_online(i)) {
-                       dd_dev_err(dd, "NUMA nodes are not compact\n");
-                       ret = -EINVAL;
-                       goto done;
-               }
-       }
-
-       dd->cr_base = kcalloc(
-               num_numa,
-               sizeof(struct credit_return_base),
-               GFP_KERNEL);
-       if (!dd->cr_base) {
-               dd_dev_err(dd, "Unable to allocate credit return base\n");
-               ret = -ENOMEM;
-               goto done;
-       }
-       for (i = 0; i < num_numa; i++) {
-               int bytes = TXE_NUM_CONTEXTS * sizeof(struct credit_return);
-
-               set_dev_node(&dd->pcidev->dev, i);
-               dd->cr_base[i].va = dma_zalloc_coherent(
-                                       &dd->pcidev->dev,
-                                       bytes,
-                                       &dd->cr_base[i].pa,
-                                       GFP_KERNEL);
-               if (!dd->cr_base[i].va) {
-                       set_dev_node(&dd->pcidev->dev, dd->node);
-                       dd_dev_err(dd,
-                                  "Unable to allocate credit return DMA range for NUMA %d\n",
-                                  i);
-                       ret = -ENOMEM;
-                       goto done;
-               }
-       }
-       set_dev_node(&dd->pcidev->dev, dd->node);
-
-       ret = 0;
-done:
-       return ret;
-}
-
-void free_credit_return(struct hfi1_devdata *dd)
-{
-       int num_numa;
-       int i;
-
-       if (!dd->cr_base)
-               return;
-
-       num_numa = num_online_nodes();
-       for (i = 0; i < num_numa; i++) {
-               if (dd->cr_base[i].va) {
-                       dma_free_coherent(&dd->pcidev->dev,
-                                         TXE_NUM_CONTEXTS *
-                                         sizeof(struct credit_return),
-                                         dd->cr_base[i].va,
-                                         dd->cr_base[i].pa);
-               }
-       }
-       kfree(dd->cr_base);
-       dd->cr_base = NULL;
-}
diff --git a/drivers/staging/rdma/hfi1/pio.h b/drivers/staging/rdma/hfi1/pio.h
deleted file mode 100644 (file)
index 53a08ed..0000000
+++ /dev/null
@@ -1,328 +0,0 @@
-#ifndef _PIO_H
-#define _PIO_H
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-/* send context types */
-#define SC_KERNEL 0
-#define SC_ACK    1
-#define SC_USER   2
-#define SC_VL15   3
-#define SC_MAX    4
-
-/* invalid send context index */
-#define INVALID_SCI 0xff
-
-/* PIO buffer release callback function */
-typedef void (*pio_release_cb)(void *arg, int code);
-
-/* PIO release codes - in bits, as there could more than one that apply */
-#define PRC_OK         0       /* no known error */
-#define PRC_STATUS_ERR 0x01    /* credit return due to status error */
-#define PRC_PBC                0x02    /* credit return due to PBC */
-#define PRC_THRESHOLD  0x04    /* credit return due to threshold */
-#define PRC_FILL_ERR   0x08    /* credit return due to fill error */
-#define PRC_FORCE      0x10    /* credit return due to credit force */
-#define PRC_SC_DISABLE 0x20    /* clean-up after a context disable */
-
-/* byte helper */
-union mix {
-       u64 val64;
-       u32 val32[2];
-       u8  val8[8];
-};
-
-/* an allocated PIO buffer */
-struct pio_buf {
-       struct send_context *sc;/* back pointer to owning send context */
-       pio_release_cb cb;      /* called when the buffer is released */
-       void *arg;              /* argument for cb */
-       void __iomem *start;    /* buffer start address */
-       void __iomem *end;      /* context end address */
-       unsigned long size;     /* context size, in bytes */
-       unsigned long sent_at;  /* buffer is sent when <= free */
-       u32 block_count;        /* size of buffer, in blocks */
-       u32 qw_written;         /* QW written so far */
-       u32 carry_bytes;        /* number of valid bytes in carry */
-       union mix carry;        /* pending unwritten bytes */
-};
-
-/* cache line aligned pio buffer array */
-union pio_shadow_ring {
-       struct pio_buf pbuf;
-       u64 unused[16];         /* cache line spacer */
-} ____cacheline_aligned;
-
-/* per-NUMA send context */
-struct send_context {
-       /* read-only after init */
-       struct hfi1_devdata *dd;                /* device */
-       void __iomem *base_addr;        /* start of PIO memory */
-       union pio_shadow_ring *sr;      /* shadow ring */
-
-       volatile __le64 *hw_free;       /* HW free counter */
-       struct work_struct halt_work;   /* halted context work queue entry */
-       unsigned long flags;            /* flags */
-       int node;                       /* context home node */
-       int type;                       /* context type */
-       u32 sw_index;                   /* software index number */
-       u32 hw_context;                 /* hardware context number */
-       u32 credits;                    /* number of blocks in context */
-       u32 sr_size;                    /* size of the shadow ring */
-       u32 group;                      /* credit return group */
-       /* allocator fields */
-       spinlock_t alloc_lock ____cacheline_aligned_in_smp;
-       unsigned long fill;             /* official alloc count */
-       unsigned long alloc_free;       /* copy of free (less cache thrash) */
-       u32 sr_head;                    /* shadow ring head */
-       /* releaser fields */
-       spinlock_t release_lock ____cacheline_aligned_in_smp;
-       unsigned long free;             /* official free count */
-       u32 sr_tail;                    /* shadow ring tail */
-       /* list for PIO waiters */
-       struct list_head piowait  ____cacheline_aligned_in_smp;
-       spinlock_t credit_ctrl_lock ____cacheline_aligned_in_smp;
-       u64 credit_ctrl;                /* cache for credit control */
-       u32 credit_intr_count;          /* count of credit intr users */
-       u32 __percpu *buffers_allocated;/* count of buffers allocated */
-       wait_queue_head_t halt_wait;    /* wait until kernel sees interrupt */
-};
-
-/* send context flags */
-#define SCF_ENABLED 0x01
-#define SCF_IN_FREE 0x02
-#define SCF_HALTED  0x04
-#define SCF_FROZEN  0x08
-
-struct send_context_info {
-       struct send_context *sc;        /* allocated working context */
-       u16 allocated;                  /* has this been allocated? */
-       u16 type;                       /* context type */
-       u16 base;                       /* base in PIO array */
-       u16 credits;                    /* size in PIO array */
-};
-
-/* DMA credit return, index is always (context & 0x7) */
-struct credit_return {
-       volatile __le64 cr[8];
-};
-
-/* NUMA indexed credit return array */
-struct credit_return_base {
-       struct credit_return *va;
-       dma_addr_t pa;
-};
-
-/* send context configuration sizes (one per type) */
-struct sc_config_sizes {
-       short int size;
-       short int count;
-};
-
-/*
- * The diagram below details the relationship of the mapping structures
- *
- * Since the mapping now allows for non-uniform send contexts per vl, the
- * number of send contexts for a vl is either the vl_scontexts[vl] or
- * a computation based on num_kernel_send_contexts/num_vls:
- *
- * For example:
- * nactual = vl_scontexts ? vl_scontexts[vl] : num_kernel_send_contexts/num_vls
- *
- * n = roundup to next highest power of 2 using nactual
- *
- * In the case where num_kernel_send_contexts/num_vls doesn't divide
- * evenly, the extras are added from the last vl downward.
- *
- * For the case where n > nactual, the send contexts are assigned
- * in a round robin fashion wrapping back to the first send context
- * for a particular vl.
- *
- *               dd->pio_map
- *                    |                                   pio_map_elem[0]
- *                    |                                +--------------------+
- *                    v                                |       mask         |
- *               pio_vl_map                            |--------------------|
- *      +--------------------------+                   | ksc[0] -> sc 1     |
- *      |    list (RCU)            |                   |--------------------|
- *      |--------------------------|                 ->| ksc[1] -> sc 2     |
- *      |    mask                  |              --/  |--------------------|
- *      |--------------------------|            -/     |        *           |
- *      |    actual_vls (max 8)    |          -/       |--------------------|
- *      |--------------------------|       --/         | ksc[n] -> sc n     |
- *      |    vls (max 8)           |     -/            +--------------------+
- *      |--------------------------|  --/
- *      |    map[0]                |-/
- *      |--------------------------|                   +--------------------+
- *      |    map[1]                |---                |       mask         |
- *      |--------------------------|   \----           |--------------------|
- *      |           *              |        \--        | ksc[0] -> sc 1+n   |
- *      |           *              |           \----   |--------------------|
- *      |           *              |                \->| ksc[1] -> sc 2+n   |
- *      |--------------------------|                   |--------------------|
- *      |   map[vls - 1]           |-                  |         *          |
- *      +--------------------------+ \-                |--------------------|
- *                                     \-              | ksc[m] -> sc m+n   |
- *                                       \             +--------------------+
- *                                        \-
- *                                          \
- *                                           \-        +--------------------+
- *                                             \-      |       mask         |
- *                                               \     |--------------------|
- *                                                \-   | ksc[0] -> sc 1+m+n |
- *                                                  \- |--------------------|
- *                                                    >| ksc[1] -> sc 2+m+n |
- *                                                     |--------------------|
- *                                                     |         *          |
- *                                                     |--------------------|
- *                                                     | ksc[o] -> sc o+m+n |
- *                                                     +--------------------+
- *
- */
-
-/* Initial number of send contexts per VL */
-#define INIT_SC_PER_VL 2
-
-/*
- * struct pio_map_elem - mapping for a vl
- * @mask - selector mask
- * @ksc - array of kernel send contexts for this vl
- *
- * The mask is used to "mod" the selector to
- * produce index into the trailing array of
- * kscs
- */
-struct pio_map_elem {
-       u32 mask;
-       struct send_context *ksc[0];
-};
-
-/*
- * struct pio_vl_map - mapping for a vl
- * @list - rcu head for free callback
- * @mask - vl mask to "mod" the vl to produce an index to map array
- * @actual_vls - number of vls
- * @vls - numbers of vls rounded to next power of 2
- * @map - array of pio_map_elem entries
- *
- * This is the parent mapping structure. The trailing members of the
- * struct point to pio_map_elem entries, which in turn point to an
- * array of kscs for that vl.
- */
-struct pio_vl_map {
-       struct rcu_head list;
-       u32 mask;
-       u8 actual_vls;
-       u8 vls;
-       struct pio_map_elem *map[0];
-};
-
-int pio_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls,
-                u8 *vl_scontexts);
-void free_pio_map(struct hfi1_devdata *dd);
-struct send_context *pio_select_send_context_vl(struct hfi1_devdata *dd,
-                                               u32 selector, u8 vl);
-struct send_context *pio_select_send_context_sc(struct hfi1_devdata *dd,
-                                               u32 selector, u8 sc5);
-
-/* send context functions */
-int init_credit_return(struct hfi1_devdata *dd);
-void free_credit_return(struct hfi1_devdata *dd);
-int init_sc_pools_and_sizes(struct hfi1_devdata *dd);
-int init_send_contexts(struct hfi1_devdata *dd);
-int init_credit_return(struct hfi1_devdata *dd);
-int init_pervl_scs(struct hfi1_devdata *dd);
-struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
-                             uint hdrqentsize, int numa);
-void sc_free(struct send_context *sc);
-int sc_enable(struct send_context *sc);
-void sc_disable(struct send_context *sc);
-int sc_restart(struct send_context *sc);
-void sc_return_credits(struct send_context *sc);
-void sc_flush(struct send_context *sc);
-void sc_drop(struct send_context *sc);
-void sc_stop(struct send_context *sc, int bit);
-struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len,
-                               pio_release_cb cb, void *arg);
-void sc_release_update(struct send_context *sc);
-void sc_return_credits(struct send_context *sc);
-void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context);
-void sc_add_credit_return_intr(struct send_context *sc);
-void sc_del_credit_return_intr(struct send_context *sc);
-void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold);
-u32 sc_percent_to_threshold(struct send_context *sc, u32 percent);
-u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize);
-void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint);
-void sc_wait(struct hfi1_devdata *dd);
-void set_pio_integrity(struct send_context *sc);
-
-/* support functions */
-void pio_reset_all(struct hfi1_devdata *dd);
-void pio_freeze(struct hfi1_devdata *dd);
-void pio_kernel_unfreeze(struct hfi1_devdata *dd);
-
-/* global PIO send control operations */
-#define PSC_GLOBAL_ENABLE 0
-#define PSC_GLOBAL_DISABLE 1
-#define PSC_GLOBAL_VLARB_ENABLE 2
-#define PSC_GLOBAL_VLARB_DISABLE 3
-#define PSC_CM_RESET 4
-#define PSC_DATA_VL_ENABLE 5
-#define PSC_DATA_VL_DISABLE 6
-
-void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl);
-void pio_send_control(struct hfi1_devdata *dd, int op);
-
-/* PIO copy routines */
-void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc,
-             const void *from, size_t count);
-void seg_pio_copy_start(struct pio_buf *pbuf, u64 pbc,
-                       const void *from, size_t nbytes);
-void seg_pio_copy_mid(struct pio_buf *pbuf, const void *from, size_t nbytes);
-void seg_pio_copy_end(struct pio_buf *pbuf);
-
-#endif /* _PIO_H */
diff --git a/drivers/staging/rdma/hfi1/pio_copy.c b/drivers/staging/rdma/hfi1/pio_copy.c
deleted file mode 100644 (file)
index 8c25e1b..0000000
+++ /dev/null
@@ -1,867 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "hfi.h"
-
-/* additive distance between non-SOP and SOP space */
-#define SOP_DISTANCE (TXE_PIO_SIZE / 2)
-#define PIO_BLOCK_MASK (PIO_BLOCK_SIZE - 1)
-/* number of QUADWORDs in a block */
-#define PIO_BLOCK_QWS (PIO_BLOCK_SIZE / sizeof(u64))
-
-/**
- * pio_copy - copy data block to MMIO space
- * @pbuf: a number of blocks allocated within a PIO send context
- * @pbc: PBC to send
- * @from: source, must be 8 byte aligned
- * @count: number of DWORD (32-bit) quantities to copy from source
- *
- * Copy data from source to PIO Send Buffer memory, 8 bytes at a time.
- * Must always write full BLOCK_SIZE bytes blocks.  The first block must
- * be written to the corresponding SOP=1 address.
- *
- * Known:
- * o pbuf->start always starts on a block boundary
- * o pbuf can wrap only at a block boundary
- */
-void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc,
-             const void *from, size_t count)
-{
-       void __iomem *dest = pbuf->start + SOP_DISTANCE;
-       void __iomem *send = dest + PIO_BLOCK_SIZE;
-       void __iomem *dend;                     /* 8-byte data end */
-
-       /* write the PBC */
-       writeq(pbc, dest);
-       dest += sizeof(u64);
-
-       /* calculate where the QWORD data ends - in SOP=1 space */
-       dend = dest + ((count >> 1) * sizeof(u64));
-
-       if (dend < send) {
-               /*
-                * all QWORD data is within the SOP block, does *not*
-                * reach the end of the SOP block
-                */
-
-               while (dest < dend) {
-                       writeq(*(u64 *)from, dest);
-                       from += sizeof(u64);
-                       dest += sizeof(u64);
-               }
-               /*
-                * No boundary checks are needed here:
-                * 0. We're not on the SOP block boundary
-                * 1. The possible DWORD dangle will still be within
-                *    the SOP block
-                * 2. We cannot wrap except on a block boundary.
-                */
-       } else {
-               /* QWORD data extends _to_ or beyond the SOP block */
-
-               /* write 8-byte SOP chunk data */
-               while (dest < send) {
-                       writeq(*(u64 *)from, dest);
-                       from += sizeof(u64);
-                       dest += sizeof(u64);
-               }
-               /* drop out of the SOP range */
-               dest -= SOP_DISTANCE;
-               dend -= SOP_DISTANCE;
-
-               /*
-                * If the wrap comes before or matches the data end,
-                * copy until the wrap, then wrap.
-                *
-                * If the data ends at the end of the SOP above and
-                * the buffer wraps, then pbuf->end == dend == dest
-                * and nothing will get written, but we will wrap in
-                * case there is a dangling DWORD.
-                */
-               if (pbuf->end <= dend) {
-                       while (dest < pbuf->end) {
-                               writeq(*(u64 *)from, dest);
-                               from += sizeof(u64);
-                               dest += sizeof(u64);
-                       }
-
-                       dest -= pbuf->size;
-                       dend -= pbuf->size;
-               }
-
-               /* write 8-byte non-SOP, non-wrap chunk data */
-               while (dest < dend) {
-                       writeq(*(u64 *)from, dest);
-                       from += sizeof(u64);
-                       dest += sizeof(u64);
-               }
-       }
-       /* at this point we have wrapped if we are going to wrap */
-
-       /* write dangling u32, if any */
-       if (count & 1) {
-               union mix val;
-
-               val.val64 = 0;
-               val.val32[0] = *(u32 *)from;
-               writeq(val.val64, dest);
-               dest += sizeof(u64);
-       }
-       /*
-        * fill in rest of block, no need to check pbuf->end
-        * as we only wrap on a block boundary
-        */
-       while (((unsigned long)dest & PIO_BLOCK_MASK) != 0) {
-               writeq(0, dest);
-               dest += sizeof(u64);
-       }
-
-       /* finished with this buffer */
-       this_cpu_dec(*pbuf->sc->buffers_allocated);
-       preempt_enable();
-}
-
-/* USE_SHIFTS is faster in user-space tests on a Xeon X5570 @ 2.93GHz */
-#define USE_SHIFTS 1
-#ifdef USE_SHIFTS
-/*
- * Handle carry bytes using shifts and masks.
- *
- * NOTE: the unused portion of carry is expected to always be zero.
- */
-
-/*
- * "zero" shift - bit shift used to zero out upper bytes.  Input is
- * the count of LSB bytes to preserve.
- */
-#define zshift(x) (8 * (8 - (x)))
-
-/*
- * "merge" shift - bit shift used to merge with carry bytes.  Input is
- * the LSB byte count to move beyond.
- */
-#define mshift(x) (8 * (x))
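-/*
- * Example: zshift(3) == 40, so (v << zshift(3)) >> zshift(3) keeps only
- * the 3 LSB bytes of v; mshift(3) == 24 moves a value up past 3 carry
- * bytes.
- */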
-
-/*
- * Read nbytes bytes from "from" and return them in the LSB bytes
- * of pbuf->carry.  Other bytes are zeroed.  Any previous value
- * in pbuf->carry is lost.
- *
- * NOTES:
- * o do not read from "from" if nbytes is zero
- * o from may _not_ be u64 aligned
- * o nbytes must not span a QW boundary
- */
-static inline void read_low_bytes(struct pio_buf *pbuf, const void *from,
-                                 unsigned int nbytes)
-{
-       unsigned long off;
-
-       if (nbytes == 0) {
-               pbuf->carry.val64 = 0;
-       } else {
-               /* align our pointer */
-               off = (unsigned long)from & 0x7;
-               from = (void *)((unsigned long)from & ~0x7l);
-               pbuf->carry.val64 = ((*(u64 *)from)
-                               << zshift(nbytes + off))/* zero upper bytes */
-                               >> zshift(nbytes);      /* place at bottom */
-       }
-       pbuf->carry_bytes = nbytes;
-}
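-/*
- * Worked example of the shifts above (little-endian layout assumed):
- * nbytes == 3 and "from" sits at offset 2 within its QW, so off == 2.
- * zshift(5) == 24 shifts out the 3 upper bytes, then zshift(3) == 40
- * moves the 3 wanted bytes down into the LSBs of carry.
- */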
-
-/*
- * Read nbytes bytes from "from" and put them at the next significant bytes
- * of pbuf->carry.  Unused bytes are zeroed.  It is expected that the extra
- * read does not overfill carry.
- *
- * NOTES:
- * o from may _not_ be u64 aligned
- * o nbytes may span a QW boundary
- */
-static inline void read_extra_bytes(struct pio_buf *pbuf,
-                                   const void *from, unsigned int nbytes)
-{
-       unsigned long off = (unsigned long)from & 0x7;
-       unsigned int room, xbytes;
-
-       /* align our pointer */
-       from = (void *)((unsigned long)from & ~0x7l);
-
-       /* check count first - don't read anything if count is zero */
-       while (nbytes) {
-               /* find the number of bytes in this u64 */
-               room = 8 - off; /* this u64 has room for this many bytes */
-               xbytes = min(room, nbytes);
-
-               /*
-                * shift down to zero lower bytes, shift up to zero upper
-                * bytes, shift back down to move into place
-                */
-               pbuf->carry.val64 |= (((*(u64 *)from)
-                                       >> mshift(off))
-                                       << zshift(xbytes))
-                                       >> zshift(xbytes + pbuf->carry_bytes);
-               off = 0;
-               pbuf->carry_bytes += xbytes;
-               nbytes -= xbytes;
-               from += sizeof(u64);
-       }
-}
-
-/*
- * Zero extra bytes from the end of pbuf->carry.
- *
- * NOTES:
- * o zbytes <= pbuf->carry_bytes (the carry byte count on entry)
- */
-static inline void zero_extra_bytes(struct pio_buf *pbuf, unsigned int zbytes)
-{
-       unsigned int remaining;
-
-       if (zbytes == 0)        /* nothing to do */
-               return;
-
-       remaining = pbuf->carry_bytes - zbytes; /* remaining bytes */
-
-       /* NOTE: zshift only guaranteed to work if remaining != 0 */
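-       /* (zshift(0) is 64, and shifting a u64 by 64 bits is undefined in C) */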
-       if (remaining)
-               pbuf->carry.val64 = (pbuf->carry.val64 << zshift(remaining))
-                                       >> zshift(remaining);
-       else
-               pbuf->carry.val64 = 0;
-       pbuf->carry_bytes = remaining;
-}
-
-/*
- * Write a quad word using parts of pbuf->carry and the next 8 bytes of src.
- * Put the unused part of the next 8 bytes of src into the LSB bytes of
- * pbuf->carry with the upper bytes zeroed.
- *
- * NOTES:
- * o result must keep unused bytes zeroed
- * o src must be u64 aligned
- */
-static inline void merge_write8(
-       struct pio_buf *pbuf,
-       void __iomem *dest,
-       const void *src)
-{
-       u64 new, temp;
-
-       new = *(u64 *)src;
-       temp = pbuf->carry.val64 | (new << mshift(pbuf->carry_bytes));
-       writeq(temp, dest);
-       pbuf->carry.val64 = new >> zshift(pbuf->carry_bytes);
-}
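-/*
- * Example: with carry_bytes == 3, mshift(3) == 24 places the new QW
- * above the 3 carry bytes for the write, and zshift(3) == 40 leaves the
- * 3 bytes of "new" that did not fit as the next carry.
- */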
-
-/*
- * Write a quad word using all bytes of carry.
- */
-static inline void carry8_write8(union mix carry, void __iomem *dest)
-{
-       writeq(carry.val64, dest);
-}
-
-/*
- * Write a quad word using all the valid bytes of carry.  If carry
- * has zero valid bytes, nothing is written.
- * Returns 0 on nothing written, non-zero on quad word written.
- */
-static inline int carry_write8(struct pio_buf *pbuf, void __iomem *dest)
-{
-       if (pbuf->carry_bytes) {
-               /* unused bytes are always kept zeroed, so just write */
-               writeq(pbuf->carry.val64, dest);
-               return 1;
-       }
-
-       return 0;
-}
-
-#else /* USE_SHIFTS */
-/*
- * Handle carry bytes using byte copies.
- *
- * NOTE: the unused portion of carry is left uninitialized.
- */
-
-/*
- * Jump copy - no-loop copy for < 8 bytes.
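- * Each case intentionally falls through, so exactly n bytes are copied.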
- */
-static inline void jcopy(u8 *dest, const u8 *src, u32 n)
-{
-       switch (n) {
-       case 7:
-               *dest++ = *src++;
-       case 6:
-               *dest++ = *src++;
-       case 5:
-               *dest++ = *src++;
-       case 4:
-               *dest++ = *src++;
-       case 3:
-               *dest++ = *src++;
-       case 2:
-               *dest++ = *src++;
-       case 1:
-               *dest++ = *src++;
-       }
-}
-
-/*
- * Read nbytes bytes from "from" and place them in the low bytes
- * of pbuf->carry.  Other bytes are left as-is.  Any previous
- * value in pbuf->carry is lost.
- *
- * NOTES:
- * o do not read from "from" if nbytes is zero
- * o from may _not_ be u64 aligned.
- */
-static inline void read_low_bytes(struct pio_buf *pbuf, const void *from,
-                                 unsigned int nbytes)
-{
-       jcopy(&pbuf->carry.val8[0], from, nbytes);
-       pbuf->carry_bytes = nbytes;
-}
-
-/*
- * Read nbytes bytes from "from" and put them at the end of pbuf->carry.
- * It is expected that the extra read does not overfill carry.
- *
- * NOTES:
- * o from may _not_ be u64 aligned
- * o nbytes may span a QW boundary
- */
-static inline void read_extra_bytes(struct pio_buf *pbuf,
-                                   const void *from, unsigned int nbytes)
-{
-       jcopy(&pbuf->carry.val8[pbuf->carry_bytes], from, nbytes);
-       pbuf->carry_bytes += nbytes;
-}
-
-/*
- * Zero extra bytes from the end of pbuf->carry.
- *
- * We do not care about the value of unused bytes in carry, so just
- * reduce the byte count.
- *
- * NOTES:
- * o zbytes <= pbuf->carry_bytes (the carry byte count on entry)
- */
-static inline void zero_extra_bytes(struct pio_buf *pbuf, unsigned int zbytes)
-{
-       pbuf->carry_bytes -= zbytes;
-}
-
-/*
- * Write a quad word using parts of pbuf->carry and the next 8 bytes of src.
- * Put the unused part of the next 8 bytes of src into the low bytes of
- * pbuf->carry.
- */
-static inline void merge_write8(
-       struct pio_buf *pbuf,
-       void *dest,
-       const void *src)
-{
-       u32 remainder = 8 - pbuf->carry_bytes;
-
-       jcopy(&pbuf->carry.val8[pbuf->carry_bytes], src, remainder);
-       writeq(pbuf->carry.val64, dest);
-       jcopy(&pbuf->carry.val8[0], src + remainder, pbuf->carry_bytes);
-}
-
-/*
- * Write a quad word using all bytes of carry.
- */
-static inline void carry8_write8(union mix carry, void *dest)
-{
-       writeq(carry.val64, dest);
-}
-
-/*
- * Write a quad word using all the valid bytes of carry.  If carry
- * has zero valid bytes, nothing is written.
- * Returns 0 on nothing written, non-zero on quad word written.
- */
-static inline int carry_write8(struct pio_buf *pbuf, void *dest)
-{
-       if (pbuf->carry_bytes) {
-               u64 zero = 0;
-
-               jcopy(&pbuf->carry.val8[pbuf->carry_bytes], (u8 *)&zero,
-                     8 - pbuf->carry_bytes);
-               writeq(pbuf->carry.val64, dest);
-               return 1;
-       }
-
-       return 0;
-}
-#endif /* USE_SHIFTS */
-
-/*
- * Segmented PIO Copy - start
- *
- * Start a PIO copy.
- *
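- * A segmented copy is driven as seg_pio_copy_start(), then zero or more
- * seg_pio_copy_mid() calls, then seg_pio_copy_end() to flush any carry
- * bytes and zero-fill the remainder of the final block.
- *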
- * @pbuf: destination buffer
- * @pbc: the PBC for the PIO buffer
- * @from: data source, QWORD aligned
- * @nbytes: bytes to copy
- */
-void seg_pio_copy_start(struct pio_buf *pbuf, u64 pbc,
-                       const void *from, size_t nbytes)
-{
-       void __iomem *dest = pbuf->start + SOP_DISTANCE;
-       void __iomem *send = dest + PIO_BLOCK_SIZE;
-       void __iomem *dend;                     /* 8-byte data end */
-
-       writeq(pbc, dest);
-       dest += sizeof(u64);
-
-       /* calculate where the QWORD data ends - in SOP=1 space */
-       dend = dest + ((nbytes >> 3) * sizeof(u64));
-
-       if (dend < send) {
-               /*
-                * all QWORD data is within the SOP block, does *not*
-                * reach the end of the SOP block
-                */
-
-               while (dest < dend) {
-                       writeq(*(u64 *)from, dest);
-                       from += sizeof(u64);
-                       dest += sizeof(u64);
-               }
-               /*
-                * No boundary checks are needed here:
-                * 0. We're not on the SOP block boundary
-                * 1. The possible DWORD dangle will still be within
-                *    the SOP block
-                * 2. We cannot wrap except on a block boundary.
-                */
-       } else {
-               /* QWORD data extends _to_ or beyond the SOP block */
-
-               /* write 8-byte SOP chunk data */
-               while (dest < send) {
-                       writeq(*(u64 *)from, dest);
-                       from += sizeof(u64);
-                       dest += sizeof(u64);
-               }
-               /* drop out of the SOP range */
-               dest -= SOP_DISTANCE;
-               dend -= SOP_DISTANCE;
-
-               /*
-                * If the wrap comes before or matches the data end,
-                * copy until the wrap, then wrap.
-                *
-                * If the data ends at the end of the SOP above and
-                * the buffer wraps, then pbuf->end == dend == dest
-                * and nothing will get written, but we will wrap in
-                * case there is a dangling DWORD.
-                */
-               if (pbuf->end <= dend) {
-                       while (dest < pbuf->end) {
-                               writeq(*(u64 *)from, dest);
-                               from += sizeof(u64);
-                               dest += sizeof(u64);
-                       }
-
-                       dest -= pbuf->size;
-                       dend -= pbuf->size;
-               }
-
-               /* write 8-byte non-SOP, non-wrap chunk data */
-               while (dest < dend) {
-                       writeq(*(u64 *)from, dest);
-                       from += sizeof(u64);
-                       dest += sizeof(u64);
-               }
-       }
-       /* at this point we have wrapped if we are going to wrap */
-
-       /* ...but it doesn't matter as we're done writing */
-
-       /* save dangling bytes, if any */
-       read_low_bytes(pbuf, from, nbytes & 0x7);
-
-       pbuf->qw_written = 1 /*PBC*/ + (nbytes >> 3);
-}
-
-/*
- * Mid copy helper, "mixed case" - source is 64-bit aligned but carry
- * bytes are non-zero.
- *
- * Whole u64s must be written to the chip, so bytes must be manually merged.
- *
- * @pbuf: destination buffer
- * @from: data source, is QWORD aligned.
- * @nbytes: bytes to copy
- *
- * Must handle nbytes < 8.
- */
-static void mid_copy_mix(struct pio_buf *pbuf, const void *from, size_t nbytes)
-{
-       void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
-       void __iomem *dend;                     /* 8-byte data end */
-       unsigned long qw_to_write = (pbuf->carry_bytes + nbytes) >> 3;
-       unsigned long bytes_left = (pbuf->carry_bytes + nbytes) & 0x7;
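-       /* e.g. carry_bytes == 3 and nbytes == 13: qw_to_write == 2, bytes_left == 0 */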
-
-       /* calculate 8-byte data end */
-       dend = dest + (qw_to_write * sizeof(u64));
-
-       if (pbuf->qw_written < PIO_BLOCK_QWS) {
-               /*
-                * Still within SOP block.  We don't need to check for
-                * wrap because we are still in the first block and
-                * can only wrap on block boundaries.
-                */
-               void __iomem *send;             /* SOP end */
-               void __iomem *xend;
-
-               /*
-                * calculate the end of data or end of block, whichever
-                * comes first
-                */
-               send = pbuf->start + PIO_BLOCK_SIZE;
-               xend = min(send, dend);
-
-               /* shift up to SOP=1 space */
-               dest += SOP_DISTANCE;
-               xend += SOP_DISTANCE;
-
-               /* write 8-byte chunk data */
-               while (dest < xend) {
-                       merge_write8(pbuf, dest, from);
-                       from += sizeof(u64);
-                       dest += sizeof(u64);
-               }
-
-               /* shift down to SOP=0 space */
-               dest -= SOP_DISTANCE;
-       }
-       /*
-        * At this point dest could be (either, both, or neither):
-        * - at dend
-        * - at the wrap
-        */
-
-       /*
-        * If the wrap comes before or matches the data end,
-        * copy until the wrap, then wrap.
-        *
-        * If dest is already at the wrap, we will fall into the if,
-        * skip the loop, and just wrap.
-        *
-        * If the data ends at the end of the SOP above and
-        * the buffer wraps, then pbuf->end == dend == dest
-        * and nothing will get written.
-        */
-       if (pbuf->end <= dend) {
-               while (dest < pbuf->end) {
-                       merge_write8(pbuf, dest, from);
-                       from += sizeof(u64);
-                       dest += sizeof(u64);
-               }
-
-               dest -= pbuf->size;
-               dend -= pbuf->size;
-       }
-
-       /* write 8-byte non-SOP, non-wrap chunk data */
-       while (dest < dend) {
-               merge_write8(pbuf, dest, from);
-               from += sizeof(u64);
-               dest += sizeof(u64);
-       }
-
-       /* adjust carry */
-       if (pbuf->carry_bytes < bytes_left) {
-               /* need to read more */
-               read_extra_bytes(pbuf, from, bytes_left - pbuf->carry_bytes);
-       } else {
-               /* remove invalid bytes */
-               zero_extra_bytes(pbuf, pbuf->carry_bytes - bytes_left);
-       }
-
-       pbuf->qw_written += qw_to_write;
-}
-
-/*
- * Mid copy helper, "straight case" - source pointer is 64-bit aligned
- * with no carry bytes.
- *
- * @pbuf: destination buffer
- * @from: data source, is QWORD aligned
- * @nbytes: bytes to copy
- *
- * Must handle nbytes < 8.
- */
-static void mid_copy_straight(struct pio_buf *pbuf,
-                             const void *from, size_t nbytes)
-{
-       void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
-       void __iomem *dend;                     /* 8-byte data end */
-
-       /* calculate 8-byte data end */
-       dend = dest + ((nbytes >> 3) * sizeof(u64));
-
-       if (pbuf->qw_written < PIO_BLOCK_QWS) {
-               /*
-                * Still within SOP block.  We don't need to check for
-                * wrap because we are still in the first block and
-                * can only wrap on block boundaries.
-                */
-               void __iomem *send;             /* SOP end */
-               void __iomem *xend;
-
-               /*
-                * calculate the end of data or end of block, whichever
-                * comes first
-                */
-               send = pbuf->start + PIO_BLOCK_SIZE;
-               xend = min(send, dend);
-
-               /* shift up to SOP=1 space */
-               dest += SOP_DISTANCE;
-               xend += SOP_DISTANCE;
-
-               /* write 8-byte chunk data */
-               while (dest < xend) {
-                       writeq(*(u64 *)from, dest);
-                       from += sizeof(u64);
-                       dest += sizeof(u64);
-               }
-
-               /* shift down to SOP=0 space */
-               dest -= SOP_DISTANCE;
-       }
-       /*
-        * At this point dest could be (either, both, or neither):
-        * - at dend
-        * - at the wrap
-        */
-
-       /*
-        * If the wrap comes before or matches the data end,
-        * copy until the wrap, then wrap.
-        *
-        * If dest is already at the wrap, we will fall into the if,
-        * skip the loop, and just wrap.
-        *
-        * If the data ends at the end of the SOP above and
-        * the buffer wraps, then pbuf->end == dend == dest
-        * and nothing will get written.
-        */
-       if (pbuf->end <= dend) {
-               while (dest < pbuf->end) {
-                       writeq(*(u64 *)from, dest);
-                       from += sizeof(u64);
-                       dest += sizeof(u64);
-               }
-
-               dest -= pbuf->size;
-               dend -= pbuf->size;
-       }
-
-       /* write 8-byte non-SOP, non-wrap chunk data */
-       while (dest < dend) {
-               writeq(*(u64 *)from, dest);
-               from += sizeof(u64);
-               dest += sizeof(u64);
-       }
-
-       /* we know carry_bytes was zero on entry to this routine */
-       read_low_bytes(pbuf, from, nbytes & 0x7);
-
-       pbuf->qw_written += nbytes >> 3;
-}
-
-/*
- * Segmented PIO Copy - middle
- *
- * Must handle any carried tail and a source of any alignment, with any byte count.
- *
- * @pbuf: a number of blocks allocated within a PIO send context
- * @from: data source
- * @nbytes: number of bytes to copy
- */
-void seg_pio_copy_mid(struct pio_buf *pbuf, const void *from, size_t nbytes)
-{
-       unsigned long from_align = (unsigned long)from & 0x7;
-
-       if (pbuf->carry_bytes + nbytes < 8) {
-               /* not enough bytes to fill a QW */
-               read_extra_bytes(pbuf, from, nbytes);
-               return;
-       }
-
-       if (from_align) {
-               /* misaligned source pointer - align it */
-               unsigned long to_align;
-
-               /* bytes to read to align "from" */
-               to_align = 8 - from_align;
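-               /* e.g. from_align == 5: read 3 more bytes to reach the next QW */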
-
-               /*
-                * In the advance-to-alignment logic below, we do not need
-                * to check if we are using more than nbytes.  This is because
-                * if we are here, we already know that carry+nbytes will
-                * fill at least one QW.
-                */
-               if (pbuf->carry_bytes + to_align < 8) {
-                       /* not enough align bytes to fill a QW */
-                       read_extra_bytes(pbuf, from, to_align);
-                       from += to_align;
-                       nbytes -= to_align;
-               } else {
-                       /* bytes to fill carry */
-                       unsigned long to_fill = 8 - pbuf->carry_bytes;
-                       /* bytes left over to be read */
-                       unsigned long extra = to_align - to_fill;
-                       void __iomem *dest;
-
-                       /* fill carry... */
-                       read_extra_bytes(pbuf, from, to_fill);
-                       from += to_fill;
-                       nbytes -= to_fill;
-
-                       /* ...now write carry */
-                       dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
-
-                       /*
-                        * The two checks immediately below cannot both be
-                        * true, hence the else.  If we have wrapped, we
-                        * cannot still be within the first block.
-                        * Conversely, if we are still in the first block, we
-                        * cannot have wrapped.  We do the wrap check first
-                        * as that is more likely.
-                        */
-                       /* adjust if we've wrapped */
-                       if (dest >= pbuf->end)
-                               dest -= pbuf->size;
-                       /* jump to SOP range if within the first block */
-                       else if (pbuf->qw_written < PIO_BLOCK_QWS)
-                               dest += SOP_DISTANCE;
-
-                       carry8_write8(pbuf->carry, dest);
-                       pbuf->qw_written++;
-
-                       /* read any extra bytes to do final alignment */
-                       /* this will overwrite anything in pbuf->carry */
-                       read_low_bytes(pbuf, from, extra);
-                       from += extra;
-                       nbytes -= extra;
-               }
-
-               /* at this point, from is QW aligned */
-       }
-
-       if (pbuf->carry_bytes)
-               mid_copy_mix(pbuf, from, nbytes);
-       else
-               mid_copy_straight(pbuf, from, nbytes);
-}
-
-/*
- * Segmented PIO Copy - end
- *
- * Write any remainder (in pbuf->carry) and finish writing the whole block.
- *
- * @pbuf: a number of blocks allocated within a PIO send context
- */
-void seg_pio_copy_end(struct pio_buf *pbuf)
-{
-       void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
-
-       /*
-        * The two checks immediately below cannot both be true, hence the
-        * else.  If we have wrapped, we cannot still be within the first
-        * block.  Conversely, if we are still in the first block, we
-        * cannot have wrapped.  We do the wrap check first as that is
-        * more likely.
-        */
-       /* adjust if we have wrapped */
-       if (dest >= pbuf->end)
-               dest -= pbuf->size;
-       /* jump to the SOP range if within the first block */
-       else if (pbuf->qw_written < PIO_BLOCK_QWS)
-               dest += SOP_DISTANCE;
-
-       /* write final bytes, if any */
-       if (carry_write8(pbuf, dest)) {
-               dest += sizeof(u64);
-               /*
-                * NOTE: We do not need to recalculate whether dest needs
-                * SOP_DISTANCE or not.
-                *
-                * If we are in the first block and the dangle write
-                * keeps us in the same block, dest will need
-                * to retain SOP_DISTANCE in the loop below.
-                *
-                * If we are in the first block and the dangle write pushes
-                * us to the next block, then loop below will not run
-                * and dest is not used.  Hence we do not need to update
-                * it.
-                *
-                * If we are past the first block, then SOP_DISTANCE
-                * was never added, so there is nothing to do.
-                */
-       }
-
-       /* fill in rest of block */
-       while (((unsigned long)dest & PIO_BLOCK_MASK) != 0) {
-               writeq(0, dest);
-               dest += sizeof(u64);
-       }
-
-       /* finished with this buffer */
-       this_cpu_dec(*pbuf->sc->buffers_allocated);
-       preempt_enable();
-}
diff --git a/drivers/staging/rdma/hfi1/platform.c b/drivers/staging/rdma/hfi1/platform.c
deleted file mode 100644 (file)
index 8fe8a20..0000000
+++ /dev/null
@@ -1,902 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "hfi.h"
-#include "efivar.h"
-
-void get_platform_config(struct hfi1_devdata *dd)
-{
-       int ret = 0;
-       unsigned long size = 0;
-       u8 *temp_platform_config = NULL;
-
-       ret = read_hfi1_efi_var(dd, "configuration", &size,
-                               (void **)&temp_platform_config);
-       if (ret) {
-               dd_dev_info(dd,
-                           "%s: Failed to get platform config from UEFI, falling back to request firmware\n",
-                           __func__);
-               /* fall back to request firmware */
-               platform_config_load = 1;
-               goto bail;
-       }
-
-       dd->platform_config.data = temp_platform_config;
-       dd->platform_config.size = size;
-
-bail:
-       /* exit */;
-}
-
-void free_platform_config(struct hfi1_devdata *dd)
-{
-       if (!platform_config_load) {
-               /*
-                * was loaded from EFI, release memory
-                * allocated by read_efi_var
-                */
-               kfree(dd->platform_config.data);
-       }
-       /*
-        * else do nothing, dispose_firmware will release
-        * struct firmware platform_config on driver exit
-        */
-}
-
-int set_qsfp_tx(struct hfi1_pportdata *ppd, int on)
-{
-       u8 tx_ctrl_byte = on ? 0x0 : 0xF;
-       int ret = 0;
-
-       ret = qsfp_write(ppd, ppd->dd->hfi1_id, QSFP_TX_CTRL_BYTE_OFFS,
-                        &tx_ctrl_byte, 1);
-       /* we expected 1, so consider 0 an error */
-       if (ret == 0)
-               ret = -EIO;
-       else if (ret == 1)
-               ret = 0;
-       return ret;
-}
-
-static int qual_power(struct hfi1_pportdata *ppd)
-{
-       u32 cable_power_class = 0, power_class_max = 0;
-       u8 *cache = ppd->qsfp_info.cache;
-       int ret = 0;
-
-       ret = get_platform_config_field(
-               ppd->dd, PLATFORM_CONFIG_SYSTEM_TABLE, 0,
-               SYSTEM_TABLE_QSFP_POWER_CLASS_MAX, &power_class_max, 4);
-       if (ret)
-               return ret;
-
-       cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
-
-       if (cable_power_class > power_class_max)
-               ppd->offline_disabled_reason =
-                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY);
-
-       if (ppd->offline_disabled_reason ==
-                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY)) {
-               dd_dev_info(
-                       ppd->dd,
-                       "%s: Port disabled due to system power restrictions\n",
-                       __func__);
-               ret = -EPERM;
-       }
-       return ret;
-}
-
-static int qual_bitrate(struct hfi1_pportdata *ppd)
-{
-       u16 lss = ppd->link_speed_supported, lse = ppd->link_speed_enabled;
-       u8 *cache = ppd->qsfp_info.cache;
-
-       if ((lss & OPA_LINK_SPEED_25G) && (lse & OPA_LINK_SPEED_25G) &&
-           cache[QSFP_NOM_BIT_RATE_250_OFFS] < 0x64)
-               ppd->offline_disabled_reason =
-                          HFI1_ODR_MASK(OPA_LINKDOWN_REASON_LINKSPEED_POLICY);
-
-       if ((lss & OPA_LINK_SPEED_12_5G) && (lse & OPA_LINK_SPEED_12_5G) &&
-           cache[QSFP_NOM_BIT_RATE_100_OFFS] < 0x7D)
-               ppd->offline_disabled_reason =
-                          HFI1_ODR_MASK(OPA_LINKDOWN_REASON_LINKSPEED_POLICY);
-
-       if (ppd->offline_disabled_reason ==
-                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_LINKSPEED_POLICY)) {
-               dd_dev_info(
-                       ppd->dd,
-                       "%s: Cable failed bitrate check, disabling port\n",
-                       __func__);
-               return -EPERM;
-       }
-       return 0;
-}
-
-static int set_qsfp_high_power(struct hfi1_pportdata *ppd)
-{
-       u8 cable_power_class = 0, power_ctrl_byte = 0;
-       u8 *cache = ppd->qsfp_info.cache;
-       int ret;
-
-       cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
-
-       if (cable_power_class > QSFP_POWER_CLASS_1) {
-               power_ctrl_byte = cache[QSFP_PWR_CTRL_BYTE_OFFS];
-
-               power_ctrl_byte |= 1;
-               power_ctrl_byte &= ~(0x2);
-
-               ret = qsfp_write(ppd, ppd->dd->hfi1_id,
-                                QSFP_PWR_CTRL_BYTE_OFFS,
-                                &power_ctrl_byte, 1);
-               if (ret != 1)
-                       return -EIO;
-
-               if (cable_power_class > QSFP_POWER_CLASS_4) {
-                       power_ctrl_byte |= (1 << 2);
-                       ret = qsfp_write(ppd, ppd->dd->hfi1_id,
-                                        QSFP_PWR_CTRL_BYTE_OFFS,
-                                        &power_ctrl_byte, 1);
-                       if (ret != 1)
-                               return -EIO;
-               }
-
-               /* SFF 8679 rev 1.7 LPMode Deassert time */
-               msleep(300);
-       }
-       return 0;
-}
-
-static void apply_rx_cdr(struct hfi1_pportdata *ppd,
-                        u32 rx_preset_index,
-                        u8 *cdr_ctrl_byte)
-{
-       u32 rx_preset;
-       u8 *cache = ppd->qsfp_info.cache;
-       int cable_power_class;
-
-       if (!((cache[QSFP_MOD_PWR_OFFS] & 0x4) &&
-             (cache[QSFP_CDR_INFO_OFFS] & 0x40)))
-               return;
-
-       /* RX CDR present, bypass supported */
-       cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
-
-       if (cable_power_class <= QSFP_POWER_CLASS_3) {
-               /* Power class <= 3, ignore config & turn RX CDR on */
-               *cdr_ctrl_byte |= 0xF;
-               return;
-       }
-
-       get_platform_config_field(
-               ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE,
-               rx_preset_index, RX_PRESET_TABLE_QSFP_RX_CDR_APPLY,
-               &rx_preset, 4);
-
-       if (!rx_preset) {
-               dd_dev_info(
-                       ppd->dd,
-                       "%s: RX_CDR_APPLY is set to disabled\n",
-                       __func__);
-               return;
-       }
-       get_platform_config_field(
-               ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE,
-               rx_preset_index, RX_PRESET_TABLE_QSFP_RX_CDR,
-               &rx_preset, 4);
-
-       /* Expand cdr setting to all 4 lanes */
-       rx_preset = (rx_preset | (rx_preset << 1) |
-                       (rx_preset << 2) | (rx_preset << 3));
-
-       if (rx_preset) {
-               *cdr_ctrl_byte |= rx_preset;
-       } else {
-               *cdr_ctrl_byte &= rx_preset;
-               /* Preserve current TX CDR status */
-               *cdr_ctrl_byte |= (cache[QSFP_CDR_CTRL_BYTE_OFFS] & 0xF0);
-       }
-}
-
-static void apply_tx_cdr(struct hfi1_pportdata *ppd,
-                        u32 tx_preset_index,
-                        u8 *cdr_ctrl_byte)
-{
-       u32 tx_preset;
-       u8 *cache = ppd->qsfp_info.cache;
-       int cable_power_class;
-
-       if (!((cache[QSFP_MOD_PWR_OFFS] & 0x8) &&
-             (cache[QSFP_CDR_INFO_OFFS] & 0x80)))
-               return;
-
-       /* TX CDR present, bypass supported */
-       cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
-
-       if (cable_power_class <= QSFP_POWER_CLASS_3) {
-               /* Power class <= 3, ignore config & turn TX CDR on */
-               *cdr_ctrl_byte |= 0xF0;
-               return;
-       }
-
-       get_platform_config_field(
-               ppd->dd,
-               PLATFORM_CONFIG_TX_PRESET_TABLE, tx_preset_index,
-               TX_PRESET_TABLE_QSFP_TX_CDR_APPLY, &tx_preset, 4);
-
-       if (!tx_preset) {
-               dd_dev_info(
-                       ppd->dd,
-                       "%s: TX_CDR_APPLY is set to disabled\n",
-                       __func__);
-               return;
-       }
-       get_platform_config_field(
-               ppd->dd,
-               PLATFORM_CONFIG_TX_PRESET_TABLE,
-               tx_preset_index,
-               TX_PRESET_TABLE_QSFP_TX_CDR, &tx_preset, 4);
-
-       /* Expand cdr setting to all 4 lanes */
-       tx_preset = (tx_preset | (tx_preset << 1) |
-                       (tx_preset << 2) | (tx_preset << 3));
-
-       if (tx_preset)
-               *cdr_ctrl_byte |= (tx_preset << 4);
-       else
-               /* Preserve current/determined RX CDR status */
-               *cdr_ctrl_byte &= ((tx_preset << 4) | 0xF);
-}
-
-static void apply_cdr_settings(
-               struct hfi1_pportdata *ppd, u32 rx_preset_index,
-               u32 tx_preset_index)
-{
-       u8 *cache = ppd->qsfp_info.cache;
-       u8 cdr_ctrl_byte = cache[QSFP_CDR_CTRL_BYTE_OFFS];
-
-       apply_rx_cdr(ppd, rx_preset_index, &cdr_ctrl_byte);
-
-       apply_tx_cdr(ppd, tx_preset_index, &cdr_ctrl_byte);
-
-       qsfp_write(ppd, ppd->dd->hfi1_id, QSFP_CDR_CTRL_BYTE_OFFS,
-                  &cdr_ctrl_byte, 1);
-}
-
-static void apply_tx_eq_auto(struct hfi1_pportdata *ppd)
-{
-       u8 *cache = ppd->qsfp_info.cache;
-       u8 tx_eq;
-
-       if (!(cache[QSFP_EQ_INFO_OFFS] & 0x8))
-               return;
-       /* Disable adaptive TX EQ if present */
-       tx_eq = cache[(128 * 3) + 241];
-       tx_eq &= 0xF0;
-       qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 241, &tx_eq, 1);
-}
-
-static void apply_tx_eq_prog(struct hfi1_pportdata *ppd, u32 tx_preset_index)
-{
-       u8 *cache = ppd->qsfp_info.cache;
-       u32 tx_preset;
-       u8 tx_eq;
-
-       if (!(cache[QSFP_EQ_INFO_OFFS] & 0x4))
-               return;
-
-       get_platform_config_field(
-               ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE,
-               tx_preset_index, TX_PRESET_TABLE_QSFP_TX_EQ_APPLY,
-               &tx_preset, 4);
-       if (!tx_preset) {
-               dd_dev_info(
-                       ppd->dd,
-                       "%s: TX_EQ_APPLY is set to disabled\n",
-                       __func__);
-               return;
-       }
-       get_platform_config_field(
-                       ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE,
-                       tx_preset_index, TX_PRESET_TABLE_QSFP_TX_EQ,
-                       &tx_preset, 4);
-
-       if (((cache[(128 * 3) + 224] & 0xF0) >> 4) < tx_preset) {
-               dd_dev_info(
-                       ppd->dd,
-                       "%s: TX EQ %x unsupported\n",
-                       __func__, tx_preset);
-
-               dd_dev_info(
-                       ppd->dd,
-                       "%s: Applying EQ %x\n",
-                       __func__, cache[608] & 0xF0);
-
-               tx_preset = (cache[608] & 0xF0) >> 4;
-       }
-
-       tx_eq = tx_preset | (tx_preset << 4);
-       qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 234, &tx_eq, 1);
-       qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 235, &tx_eq, 1);
-}
-
-static void apply_rx_eq_emp(struct hfi1_pportdata *ppd, u32 rx_preset_index)
-{
-       u32 rx_preset;
-       u8 rx_eq, *cache = ppd->qsfp_info.cache;
-
-       if (!(cache[QSFP_EQ_INFO_OFFS] & 0x2))
-               return;
-       get_platform_config_field(
-                       ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE,
-                       rx_preset_index, RX_PRESET_TABLE_QSFP_RX_EMP_APPLY,
-                       &rx_preset, 4);
-
-       if (!rx_preset) {
-               dd_dev_info(
-                       ppd->dd,
-                       "%s: RX_EMP_APPLY is set to disabled\n",
-                       __func__);
-               return;
-       }
-       get_platform_config_field(
-               ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE,
-               rx_preset_index, RX_PRESET_TABLE_QSFP_RX_EMP,
-               &rx_preset, 4);
-
-       if ((cache[(128 * 3) + 224] & 0xF) < rx_preset) {
-               dd_dev_info(
-                       ppd->dd,
-                       "%s: Requested RX EMP %x\n",
-                       __func__, rx_preset);
-
-               dd_dev_info(
-                       ppd->dd,
-                       "%s: Applying supported EMP %x\n",
-                       __func__, cache[608] & 0xF);
-
-               rx_preset = cache[608] & 0xF;
-       }
-
-       rx_eq = rx_preset | (rx_preset << 4);
-
-       qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 236, &rx_eq, 1);
-       qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 237, &rx_eq, 1);
-}
-
-static void apply_eq_settings(struct hfi1_pportdata *ppd,
-                             u32 rx_preset_index, u32 tx_preset_index)
-{
-       u8 *cache = ppd->qsfp_info.cache;
-
-       /* no point going on w/o a page 3 */
-       if (cache[2] & 4) {
-               dd_dev_info(ppd->dd,
-                           "%s: Upper page 03 not present\n",
-                           __func__);
-               return;
-       }
-
-       apply_tx_eq_auto(ppd);
-
-       apply_tx_eq_prog(ppd, tx_preset_index);
-
-       apply_rx_eq_emp(ppd, rx_preset_index);
-}
-
-static void apply_rx_amplitude_settings(
-               struct hfi1_pportdata *ppd, u32 rx_preset_index,
-               u32 tx_preset_index)
-{
-       u32 rx_preset;
-       u8 rx_amp = 0, i = 0, preferred = 0, *cache = ppd->qsfp_info.cache;
-
-       /* no point going on w/o a page 3 */
-       if (cache[2] & 4) {
-               dd_dev_info(ppd->dd,
-                           "%s: Upper page 03 not present\n",
-                           __func__);
-               return;
-       }
-       if (!(cache[QSFP_EQ_INFO_OFFS] & 0x1)) {
-               dd_dev_info(ppd->dd,
-                           "%s: RX amplitude programming not supported by this module\n",
-                           __func__);
-               return;
-       }
-
-       get_platform_config_field(ppd->dd,
-                                 PLATFORM_CONFIG_RX_PRESET_TABLE,
-                                 rx_preset_index,
-                                 RX_PRESET_TABLE_QSFP_RX_AMP_APPLY,
-                                 &rx_preset, 4);
-
-       if (!rx_preset) {
-               dd_dev_info(ppd->dd,
-                           "%s: RX_AMP_APPLY is set to disabled\n",
-                           __func__);
-               return;
-       }
-       get_platform_config_field(ppd->dd,
-                                 PLATFORM_CONFIG_RX_PRESET_TABLE,
-                                 rx_preset_index,
-                                 RX_PRESET_TABLE_QSFP_RX_AMP,
-                                 &rx_preset, 4);
-
-       dd_dev_info(ppd->dd,
-                   "%s: Requested RX AMP %x\n",
-                   __func__,
-                   rx_preset);
-
-       for (i = 0; i < 4; i++) {
-               if (cache[(128 * 3) + 225] & (1 << i)) {
-                       preferred = i;
-                       if (preferred == rx_preset)
-                               break;
-               }
-       }
-
-       /*
-        * Verify that preferred RX amplitude is not just a
-        * fall through of the default
-        */
-       if (!preferred && !(cache[(128 * 3) + 225] & 0x1)) {
-               dd_dev_info(ppd->dd, "No supported RX AMP, not applying\n");
-               return;
-       }
-
-       dd_dev_info(ppd->dd,
-                   "%s: Applying RX AMP %x\n", __func__, preferred);
-
-       rx_amp = preferred | (preferred << 4);
-       qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 238, &rx_amp, 1);
-       qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 239, &rx_amp, 1);
-}
-
-#define OPA_INVALID_INDEX 0xFFF
-
-static void apply_tx_lanes(struct hfi1_pportdata *ppd, u8 field_id,
-                          u32 config_data, const char *message)
-{
-       u8 i;
-       int ret = HCMD_SUCCESS;
-
-       for (i = 0; i < 4; i++) {
-               ret = load_8051_config(ppd->dd, field_id, i, config_data);
-               if (ret != HCMD_SUCCESS) {
-                       dd_dev_err(
-                               ppd->dd,
-                               "%s: %s for lane %u failed\n",
-                               message, __func__, i);
-               }
-       }
-}
-
-static void apply_tunings(
-               struct hfi1_pportdata *ppd, u32 tx_preset_index,
-               u8 tuning_method, u32 total_atten, u8 limiting_active)
-{
-       int ret = 0;
-       u32 config_data = 0, tx_preset = 0;
-       u8 precur = 0, attn = 0, postcur = 0, external_device_config = 0;
-       u8 *cache = ppd->qsfp_info.cache;
-
-       /* Enable external device config if channel is limiting active */
-       read_8051_config(ppd->dd, LINK_OPTIMIZATION_SETTINGS,
-                        GENERAL_CONFIG, &config_data);
-       config_data |= limiting_active;
-       ret = load_8051_config(ppd->dd, LINK_OPTIMIZATION_SETTINGS,
-                              GENERAL_CONFIG, config_data);
-       if (ret != HCMD_SUCCESS)
-               dd_dev_err(
-                       ppd->dd,
-                       "%s: Failed to enable external device config\n",
-                       __func__);
-
-       config_data = 0; /* re-init  */
-       /* Pass tuning method to 8051 */
-       read_8051_config(ppd->dd, LINK_TUNING_PARAMETERS, GENERAL_CONFIG,
-                        &config_data);
-       config_data |= tuning_method;
-       ret = load_8051_config(ppd->dd, LINK_TUNING_PARAMETERS, GENERAL_CONFIG,
-                              config_data);
-       if (ret != HCMD_SUCCESS)
-               dd_dev_err(ppd->dd, "%s: Failed to set tuning method\n",
-                          __func__);
-
-       /* Set same channel loss for both TX and RX */
-       config_data = 0 | (total_atten << 16) | (total_atten << 24);
-       apply_tx_lanes(ppd, CHANNEL_LOSS_SETTINGS, config_data,
-                      "Setting channel loss");
-
-       /* Inform 8051 of cable capabilities */
-       if (ppd->qsfp_info.cache_valid) {
-               external_device_config =
-                       ((cache[QSFP_MOD_PWR_OFFS] & 0x4) << 3) |
-                       ((cache[QSFP_MOD_PWR_OFFS] & 0x8) << 2) |
-                       ((cache[QSFP_EQ_INFO_OFFS] & 0x2) << 1) |
-                       (cache[QSFP_EQ_INFO_OFFS] & 0x4);
-               ret = read_8051_config(ppd->dd, DC_HOST_COMM_SETTINGS,
-                                      GENERAL_CONFIG, &config_data);
-               /* Clear, then set the external device config field */
-               config_data &= ~(0xFF << 24);
-               config_data |= (external_device_config << 24);
-               ret = load_8051_config(ppd->dd, DC_HOST_COMM_SETTINGS,
-                                      GENERAL_CONFIG, config_data);
-               if (ret != HCMD_SUCCESS)
-                       dd_dev_info(ppd->dd,
-                                   "%s: Failed to set ext device config params\n",
-                                   __func__);
-       }
-
-       if (tx_preset_index == OPA_INVALID_INDEX) {
-               if (ppd->port_type == PORT_TYPE_QSFP && limiting_active)
-                       dd_dev_info(ppd->dd, "%s: Invalid Tx preset index\n",
-                                   __func__);
-               return;
-       }
-
-       /* Following for limiting active channels only */
-       get_platform_config_field(
-               ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE, tx_preset_index,
-               TX_PRESET_TABLE_PRECUR, &tx_preset, 4);
-       precur = tx_preset;
-
-       get_platform_config_field(
-               ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE,
-               tx_preset_index, TX_PRESET_TABLE_ATTN, &tx_preset, 4);
-       attn = tx_preset;
-
-       get_platform_config_field(
-               ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE,
-               tx_preset_index, TX_PRESET_TABLE_POSTCUR, &tx_preset, 4);
-       postcur = tx_preset;
-
-       config_data = precur | (attn << 8) | (postcur << 16);
-
-       apply_tx_lanes(ppd, TX_EQ_SETTINGS, config_data,
-                      "Applying TX settings");
-}
-
-/* Must be holding the QSFP i2c resource */
-static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,
-                           u32 *ptr_rx_preset, u32 *ptr_total_atten)
-{
-       int ret;
-       u16 lss = ppd->link_speed_supported, lse = ppd->link_speed_enabled;
-       u8 *cache = ppd->qsfp_info.cache;
-
-       ppd->qsfp_info.limiting_active = 1;
-
-       ret = set_qsfp_tx(ppd, 0);
-       if (ret)
-               return ret;
-
-       ret = qual_power(ppd);
-       if (ret)
-               return ret;
-
-       ret = qual_bitrate(ppd);
-       if (ret)
-               return ret;
-
-       if (ppd->qsfp_info.reset_needed) {
-               reset_qsfp(ppd);
-               ppd->qsfp_info.reset_needed = 0;
-               refresh_qsfp_cache(ppd, &ppd->qsfp_info);
-       } else {
-               ppd->qsfp_info.reset_needed = 1;
-       }
-
-       ret = set_qsfp_high_power(ppd);
-       if (ret)
-               return ret;
-
-       if (cache[QSFP_EQ_INFO_OFFS] & 0x4) {
-               ret = get_platform_config_field(
-                       ppd->dd,
-                       PLATFORM_CONFIG_PORT_TABLE, 0,
-                       PORT_TABLE_TX_PRESET_IDX_ACTIVE_EQ,
-                       ptr_tx_preset, 4);
-               if (ret) {
-                       *ptr_tx_preset = OPA_INVALID_INDEX;
-                       return ret;
-               }
-       } else {
-               ret = get_platform_config_field(
-                       ppd->dd,
-                       PLATFORM_CONFIG_PORT_TABLE, 0,
-                       PORT_TABLE_TX_PRESET_IDX_ACTIVE_NO_EQ,
-                       ptr_tx_preset, 4);
-               if (ret) {
-                       *ptr_tx_preset = OPA_INVALID_INDEX;
-                       return ret;
-               }
-       }
-
-       ret = get_platform_config_field(
-               ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
-               PORT_TABLE_RX_PRESET_IDX, ptr_rx_preset, 4);
-       if (ret) {
-               *ptr_rx_preset = OPA_INVALID_INDEX;
-               return ret;
-       }
-
-       if ((lss & OPA_LINK_SPEED_25G) && (lse & OPA_LINK_SPEED_25G))
-               get_platform_config_field(
-                       ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
-                       PORT_TABLE_LOCAL_ATTEN_25G, ptr_total_atten, 4);
-       else if ((lss & OPA_LINK_SPEED_12_5G) && (lse & OPA_LINK_SPEED_12_5G))
-               get_platform_config_field(
-                       ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
-                       PORT_TABLE_LOCAL_ATTEN_12G, ptr_total_atten, 4);
-
-       apply_cdr_settings(ppd, *ptr_rx_preset, *ptr_tx_preset);
-
-       apply_eq_settings(ppd, *ptr_rx_preset, *ptr_tx_preset);
-
-       apply_rx_amplitude_settings(ppd, *ptr_rx_preset, *ptr_tx_preset);
-
-       ret = set_qsfp_tx(ppd, 1);
-
-       return ret;
-}
-
-static int tune_qsfp(struct hfi1_pportdata *ppd,
-                    u32 *ptr_tx_preset, u32 *ptr_rx_preset,
-                    u8 *ptr_tuning_method, u32 *ptr_total_atten)
-{
-       u32 cable_atten = 0, remote_atten = 0, platform_atten = 0;
-       u16 lss = ppd->link_speed_supported, lse = ppd->link_speed_enabled;
-       int ret = 0;
-       u8 *cache = ppd->qsfp_info.cache;
-
-       switch ((cache[QSFP_MOD_TECH_OFFS] & 0xF0) >> 4) {
-       case 0xA ... 0xB:
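-               /* passive copper cable: derive total attenuation, tune passively */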
-               ret = get_platform_config_field(
-                       ppd->dd,
-                       PLATFORM_CONFIG_PORT_TABLE, 0,
-                       PORT_TABLE_LOCAL_ATTEN_25G,
-                       &platform_atten, 4);
-               if (ret)
-                       return ret;
-
-               if ((lss & OPA_LINK_SPEED_25G) && (lse & OPA_LINK_SPEED_25G))
-                       cable_atten = cache[QSFP_CU_ATTEN_12G_OFFS];
-               else if ((lss & OPA_LINK_SPEED_12_5G) &&
-                        (lse & OPA_LINK_SPEED_12_5G))
-                       cable_atten = cache[QSFP_CU_ATTEN_7G_OFFS];
-
-               /* Fallback to configured attenuation if cable memory is bad */
-               if (cable_atten == 0 || cable_atten > 36) {
-                       ret = get_platform_config_field(
-                               ppd->dd,
-                               PLATFORM_CONFIG_SYSTEM_TABLE, 0,
-                               SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_25G,
-                               &cable_atten, 4);
-                       if (ret)
-                               return ret;
-               }
-
-               ret = get_platform_config_field(
-                       ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
-                       PORT_TABLE_REMOTE_ATTEN_25G, &remote_atten, 4);
-               if (ret)
-                       return ret;
-
-               *ptr_total_atten = platform_atten + cable_atten + remote_atten;
-
-               *ptr_tuning_method = OPA_PASSIVE_TUNING;
-               break;
-       case 0x0 ... 0x9: /* fallthrough */
-       case 0xC: /* fallthrough */
-       case 0xE:
-               ret = tune_active_qsfp(ppd, ptr_tx_preset, ptr_rx_preset,
-                                      ptr_total_atten);
-               if (ret)
-                       return ret;
-
-               *ptr_tuning_method = OPA_ACTIVE_TUNING;
-               break;
-       case 0xD: /* fallthrough */
-       case 0xF:
-       default:
-               dd_dev_info(ppd->dd, "%s: Unknown/unsupported cable\n",
-                           __func__);
-               break;
-       }
-       return ret;
-}
-
-/*
- * This function communicates its success or failure via ppd->driver_link_ready.
- * Thus, it depends on its association with start_link(...), which checks
- * driver_link_ready before proceeding with the link negotiation and
- * initialization process.
- */
-void tune_serdes(struct hfi1_pportdata *ppd)
-{
-       int ret = 0;
-       u32 total_atten = 0;
-       u32 remote_atten = 0, platform_atten = 0;
-       u32 rx_preset_index, tx_preset_index;
-       u8 tuning_method = 0, limiting_active = 0;
-       struct hfi1_devdata *dd = ppd->dd;
-
-       rx_preset_index = OPA_INVALID_INDEX;
-       tx_preset_index = OPA_INVALID_INDEX;
-
-       /* the link defaults to enabled */
-       ppd->link_enabled = 1;
-       /* the driver link ready state defaults to not ready */
-       ppd->driver_link_ready = 0;
-       ppd->offline_disabled_reason = HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE);
-
-       /* Skip the tuning for testing (loopback != none) and simulations */
-       if (loopback != LOOPBACK_NONE ||
-           ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
-               ppd->driver_link_ready = 1;
-               return;
-       }
-
-       ret = get_platform_config_field(ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
-                                       PORT_TABLE_PORT_TYPE, &ppd->port_type,
-                                       4);
-       if (ret)
-               ppd->port_type = PORT_TYPE_UNKNOWN;
-
-       switch (ppd->port_type) {
-       case PORT_TYPE_DISCONNECTED:
-               ppd->offline_disabled_reason =
-                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_DISCONNECTED);
-               dd_dev_info(dd, "%s: Port disconnected, disabling port\n",
-                           __func__);
-               goto bail;
-       case PORT_TYPE_FIXED:
-               /* platform_atten, remote_atten pre-zeroed to catch error */
-               get_platform_config_field(
-                       ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
-                       PORT_TABLE_LOCAL_ATTEN_25G, &platform_atten, 4);
-
-               get_platform_config_field(
-                       ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
-                       PORT_TABLE_REMOTE_ATTEN_25G, &remote_atten, 4);
-
-               total_atten = platform_atten + remote_atten;
-
-               tuning_method = OPA_PASSIVE_TUNING;
-               break;
-       case PORT_TYPE_VARIABLE:
-               if (qsfp_mod_present(ppd)) {
-                       /*
-                        * platform_atten, remote_atten pre-zeroed to
-                        * catch error
-                        */
-                       get_platform_config_field(
-                               ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
-                               PORT_TABLE_LOCAL_ATTEN_25G,
-                               &platform_atten, 4);
-
-                       get_platform_config_field(
-                               ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
-                               PORT_TABLE_REMOTE_ATTEN_25G,
-                               &remote_atten, 4);
-
-                       total_atten = platform_atten + remote_atten;
-
-                       tuning_method = OPA_PASSIVE_TUNING;
-               } else {
-                       ppd->offline_disabled_reason =
-                            HFI1_ODR_MASK(OPA_LINKDOWN_REASON_CHASSIS_CONFIG);
-                       goto bail;
-               }
-               break;
-       case PORT_TYPE_QSFP:
-               if (qsfp_mod_present(ppd)) {
-                       ret = acquire_chip_resource(ppd->dd,
-                                                   qsfp_resource(ppd->dd),
-                                                   QSFP_WAIT);
-                       if (ret) {
-                               dd_dev_err(ppd->dd, "%s: hfi%d: cannot lock i2c chain\n",
-                                          __func__, (int)ppd->dd->hfi1_id);
-                               goto bail;
-                       }
-                       refresh_qsfp_cache(ppd, &ppd->qsfp_info);
-
-                       if (ppd->qsfp_info.cache_valid) {
-                               ret = tune_qsfp(ppd,
-                                               &tx_preset_index,
-                                               &rx_preset_index,
-                                               &tuning_method,
-                                               &total_atten);
-
-                               /*
-                                * We may have modified the QSFP memory, so
-                                * update the cache to reflect the changes
-                                */
-                               refresh_qsfp_cache(ppd, &ppd->qsfp_info);
-                               limiting_active =
-                                               ppd->qsfp_info.limiting_active;
-                       } else {
-                               dd_dev_err(dd,
-                                          "%s: Reading QSFP memory failed\n",
-                                          __func__);
-                               ret = -EINVAL; /* a fail indication */
-                       }
-                       release_chip_resource(ppd->dd, qsfp_resource(ppd->dd));
-                       if (ret)
-                               goto bail;
-               } else {
-                       ppd->offline_disabled_reason =
-                          HFI1_ODR_MASK(
-                               OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED);
-                       goto bail;
-               }
-               break;
-       default:
-               dd_dev_info(ppd->dd, "%s: Unknown port type\n", __func__);
-               ppd->port_type = PORT_TYPE_UNKNOWN;
-               tuning_method = OPA_UNKNOWN_TUNING;
-               total_atten = 0;
-               limiting_active = 0;
-               tx_preset_index = OPA_INVALID_INDEX;
-               break;
-       }
-
-       if (ppd->offline_disabled_reason ==
-                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE))
-               apply_tunings(ppd, tx_preset_index, tuning_method,
-                             total_atten, limiting_active);
-
-       if (!ret)
-               ppd->driver_link_ready = 1;
-
-       return;
-bail:
-       ppd->driver_link_ready = 0;
-}
diff --git a/drivers/staging/rdma/hfi1/platform.h b/drivers/staging/rdma/hfi1/platform.h
deleted file mode 100644 (file)
index 19620cf..0000000
+++ /dev/null
@@ -1,304 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#ifndef __PLATFORM_H
-#define __PLATFORM_H
-
-#define METADATA_TABLE_FIELD_START_SHIFT               0
-#define METADATA_TABLE_FIELD_START_LEN_BITS            15
-#define METADATA_TABLE_FIELD_LEN_SHIFT                 16
-#define METADATA_TABLE_FIELD_LEN_LEN_BITS              16
-
-/* Header structure */
-#define PLATFORM_CONFIG_HEADER_RECORD_IDX_SHIFT                        0
-#define PLATFORM_CONFIG_HEADER_RECORD_IDX_LEN_BITS             6
-#define PLATFORM_CONFIG_HEADER_TABLE_LENGTH_SHIFT              16
-#define PLATFORM_CONFIG_HEADER_TABLE_LENGTH_LEN_BITS           12
-#define PLATFORM_CONFIG_HEADER_TABLE_TYPE_SHIFT                        28
-#define PLATFORM_CONFIG_HEADER_TABLE_TYPE_LEN_BITS             4
-
-enum platform_config_table_type_encoding {
-       PLATFORM_CONFIG_TABLE_RESERVED,
-       PLATFORM_CONFIG_SYSTEM_TABLE,
-       PLATFORM_CONFIG_PORT_TABLE,
-       PLATFORM_CONFIG_RX_PRESET_TABLE,
-       PLATFORM_CONFIG_TX_PRESET_TABLE,
-       PLATFORM_CONFIG_QSFP_ATTEN_TABLE,
-       PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE,
-       PLATFORM_CONFIG_TABLE_MAX
-};
-
-enum platform_config_system_table_fields {
-       SYSTEM_TABLE_RESERVED,
-       SYSTEM_TABLE_NODE_STRING,
-       SYSTEM_TABLE_SYSTEM_IMAGE_GUID,
-       SYSTEM_TABLE_NODE_GUID,
-       SYSTEM_TABLE_REVISION,
-       SYSTEM_TABLE_VENDOR_OUI,
-       SYSTEM_TABLE_META_VERSION,
-       SYSTEM_TABLE_DEVICE_ID,
-       SYSTEM_TABLE_PARTITION_ENFORCEMENT_CAP,
-       SYSTEM_TABLE_QSFP_POWER_CLASS_MAX,
-       SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_12G,
-       SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_25G,
-       SYSTEM_TABLE_VARIABLE_TABLE_ENTRIES_PER_PORT,
-       SYSTEM_TABLE_MAX
-};
-
-enum platform_config_port_table_fields {
-       PORT_TABLE_RESERVED,
-       PORT_TABLE_PORT_TYPE,
-       PORT_TABLE_LOCAL_ATTEN_12G,
-       PORT_TABLE_LOCAL_ATTEN_25G,
-       PORT_TABLE_LINK_SPEED_SUPPORTED,
-       PORT_TABLE_LINK_WIDTH_SUPPORTED,
-       PORT_TABLE_AUTO_LANE_SHEDDING_ENABLED,
-       PORT_TABLE_EXTERNAL_LOOPBACK_ALLOWED,
-       PORT_TABLE_VL_CAP,
-       PORT_TABLE_MTU_CAP,
-       PORT_TABLE_TX_LANE_ENABLE_MASK,
-       PORT_TABLE_LOCAL_MAX_TIMEOUT,
-       PORT_TABLE_REMOTE_ATTEN_12G,
-       PORT_TABLE_REMOTE_ATTEN_25G,
-       PORT_TABLE_TX_PRESET_IDX_ACTIVE_NO_EQ,
-       PORT_TABLE_TX_PRESET_IDX_ACTIVE_EQ,
-       PORT_TABLE_RX_PRESET_IDX,
-       PORT_TABLE_CABLE_REACH_CLASS,
-       PORT_TABLE_MAX
-};
-
-enum platform_config_rx_preset_table_fields {
-       RX_PRESET_TABLE_RESERVED,
-       RX_PRESET_TABLE_QSFP_RX_CDR_APPLY,
-       RX_PRESET_TABLE_QSFP_RX_EMP_APPLY,
-       RX_PRESET_TABLE_QSFP_RX_AMP_APPLY,
-       RX_PRESET_TABLE_QSFP_RX_CDR,
-       RX_PRESET_TABLE_QSFP_RX_EMP,
-       RX_PRESET_TABLE_QSFP_RX_AMP,
-       RX_PRESET_TABLE_MAX
-};
-
-enum platform_config_tx_preset_table_fields {
-       TX_PRESET_TABLE_RESERVED,
-       TX_PRESET_TABLE_PRECUR,
-       TX_PRESET_TABLE_ATTN,
-       TX_PRESET_TABLE_POSTCUR,
-       TX_PRESET_TABLE_QSFP_TX_CDR_APPLY,
-       TX_PRESET_TABLE_QSFP_TX_EQ_APPLY,
-       TX_PRESET_TABLE_QSFP_TX_CDR,
-       TX_PRESET_TABLE_QSFP_TX_EQ,
-       TX_PRESET_TABLE_MAX
-};
-
-enum platform_config_qsfp_attn_table_fields {
-       QSFP_ATTEN_TABLE_RESERVED,
-       QSFP_ATTEN_TABLE_TX_PRESET_IDX,
-       QSFP_ATTEN_TABLE_RX_PRESET_IDX,
-       QSFP_ATTEN_TABLE_MAX
-};
-
-enum platform_config_variable_settings_table_fields {
-       VARIABLE_SETTINGS_TABLE_RESERVED,
-       VARIABLE_SETTINGS_TABLE_TX_PRESET_IDX,
-       VARIABLE_SETTINGS_TABLE_RX_PRESET_IDX,
-       VARIABLE_SETTINGS_TABLE_MAX
-};
-
-struct platform_config {
-       size_t size;
-       const u8 *data;
-};
-
-struct platform_config_data {
-       u32 *table;
-       u32 *table_metadata;
-       u32 num_table;
-};
-
-/*
- * This struct acts as a quick reference into the platform_data binary image
- * and is populated by parse_platform_config(...) depending on the specific
- * META_VERSION
- */
-struct platform_config_cache {
-       u8  cache_valid;
-       struct platform_config_data config_tables[PLATFORM_CONFIG_TABLE_MAX];
-};
-
-static const u32 platform_config_table_limits[PLATFORM_CONFIG_TABLE_MAX] = {
-       0,
-       SYSTEM_TABLE_MAX,
-       PORT_TABLE_MAX,
-       RX_PRESET_TABLE_MAX,
-       TX_PRESET_TABLE_MAX,
-       QSFP_ATTEN_TABLE_MAX,
-       VARIABLE_SETTINGS_TABLE_MAX
-};
-
-/* This section defines default values and encodings for the
- * fields defined for each table above
- */
-
-/*
- * =====================================================
- *  System table encodings
- * =====================================================
- */
-#define PLATFORM_CONFIG_MAGIC_NUM              0x3d4f5041
-#define PLATFORM_CONFIG_MAGIC_NUMBER_LEN       4
-
-/*
- * These power classes are the same as defined in SFF 8636 spec rev 2.4
- * describing byte 129 in table 6-16, except enumerated in a different order
- */
-enum platform_config_qsfp_power_class_encoding {
-       QSFP_POWER_CLASS_1 = 1,
-       QSFP_POWER_CLASS_2,
-       QSFP_POWER_CLASS_3,
-       QSFP_POWER_CLASS_4,
-       QSFP_POWER_CLASS_5,
-       QSFP_POWER_CLASS_6,
-       QSFP_POWER_CLASS_7
-};
-
-/*
- * ====================================================
- *  Port table encodings
- * ====================================================
- */
-enum platform_config_port_type_encoding {
-       PORT_TYPE_UNKNOWN,
-       PORT_TYPE_DISCONNECTED,
-       PORT_TYPE_FIXED,
-       PORT_TYPE_VARIABLE,
-       PORT_TYPE_QSFP,
-       PORT_TYPE_MAX
-};
-
-enum platform_config_link_speed_supported_encoding {
-       LINK_SPEED_SUPP_12G = 1,
-       LINK_SPEED_SUPP_25G,
-       LINK_SPEED_SUPP_12G_25G,
-       LINK_SPEED_SUPP_MAX
-};
-
-/*
- * This is a (non-strict) subset of the supported link
- * downgrades. The supported link downgrades are expected
- * to be supplied to the driver by another entity, such as
- * the fabric manager.
- */
-enum platform_config_link_width_supported_encoding {
-       LINK_WIDTH_SUPP_1X = 1,
-       LINK_WIDTH_SUPP_2X,
-       LINK_WIDTH_SUPP_2X_1X,
-       LINK_WIDTH_SUPP_3X,
-       LINK_WIDTH_SUPP_3X_1X,
-       LINK_WIDTH_SUPP_3X_2X,
-       LINK_WIDTH_SUPP_3X_2X_1X,
-       LINK_WIDTH_SUPP_4X,
-       LINK_WIDTH_SUPP_4X_1X,
-       LINK_WIDTH_SUPP_4X_2X,
-       LINK_WIDTH_SUPP_4X_2X_1X,
-       LINK_WIDTH_SUPP_4X_3X,
-       LINK_WIDTH_SUPP_4X_3X_1X,
-       LINK_WIDTH_SUPP_4X_3X_2X,
-       LINK_WIDTH_SUPP_4X_3X_2X_1X,
-       LINK_WIDTH_SUPP_MAX
-};
-
-enum platform_config_virtual_lane_capability_encoding {
-       VL_CAP_VL0 = 1,
-       VL_CAP_VL0_1,
-       VL_CAP_VL0_2,
-       VL_CAP_VL0_3,
-       VL_CAP_VL0_4,
-       VL_CAP_VL0_5,
-       VL_CAP_VL0_6,
-       VL_CAP_VL0_7,
-       VL_CAP_VL0_8,
-       VL_CAP_VL0_9,
-       VL_CAP_VL0_10,
-       VL_CAP_VL0_11,
-       VL_CAP_VL0_12,
-       VL_CAP_VL0_13,
-       VL_CAP_VL0_14,
-       VL_CAP_MAX
-};
-
-/* Max MTU */
-enum platform_config_mtu_capability_encoding {
-       MTU_CAP_256   = 1,
-       MTU_CAP_512   = 2,
-       MTU_CAP_1024  = 3,
-       MTU_CAP_2048  = 4,
-       MTU_CAP_4096  = 5,
-       MTU_CAP_8192  = 6,
-       MTU_CAP_10240 = 7
-};
-
-enum platform_config_local_max_timeout_encoding {
-       LOCAL_MAX_TIMEOUT_10_MS = 1,
-       LOCAL_MAX_TIMEOUT_100_MS,
-       LOCAL_MAX_TIMEOUT_1_S,
-       LOCAL_MAX_TIMEOUT_10_S,
-       LOCAL_MAX_TIMEOUT_100_S,
-       LOCAL_MAX_TIMEOUT_1000_S
-};
-
-enum link_tuning_encoding {
-       OPA_PASSIVE_TUNING,
-       OPA_ACTIVE_TUNING,
-       OPA_UNKNOWN_TUNING
-};
-
-/* platform.c */
-void get_platform_config(struct hfi1_devdata *dd);
-void free_platform_config(struct hfi1_devdata *dd);
-int set_qsfp_tx(struct hfi1_pportdata *ppd, int on);
-void tune_serdes(struct hfi1_pportdata *ppd);
-
-#endif                 /*__PLATFORM_H*/
diff --git a/drivers/staging/rdma/hfi1/qp.c b/drivers/staging/rdma/hfi1/qp.c
deleted file mode 100644 (file)
index 91eb423..0000000
+++ /dev/null
@@ -1,977 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/err.h>
-#include <linux/vmalloc.h>
-#include <linux/hash.h>
-#include <linux/module.h>
-#include <linux/random.h>
-#include <linux/seq_file.h>
-#include <rdma/rdma_vt.h>
-#include <rdma/rdmavt_qp.h>
-
-#include "hfi.h"
-#include "qp.h"
-#include "trace.h"
-#include "verbs_txreq.h"
-
-unsigned int hfi1_qp_table_size = 256;
-module_param_named(qp_table_size, hfi1_qp_table_size, uint, S_IRUGO);
-MODULE_PARM_DESC(qp_table_size, "QP table size");
-
-static void flush_tx_list(struct rvt_qp *qp);
-static int iowait_sleep(
-       struct sdma_engine *sde,
-       struct iowait *wait,
-       struct sdma_txreq *stx,
-       unsigned seq);
-static void iowait_wakeup(struct iowait *wait, int reason);
-static void iowait_sdma_drained(struct iowait *wait);
-static void qp_pio_drain(struct rvt_qp *qp);
-
-static inline unsigned mk_qpn(struct rvt_qpn_table *qpt,
-                             struct rvt_qpn_map *map, unsigned off)
-{
-       return (map - qpt->map) * RVT_BITS_PER_PAGE + off;
-}
-
-/*
- * Convert the AETH credit code into the number of credits.
- */
-static const u16 credit_table[31] = {
-       0,                      /* 0 */
-       1,                      /* 1 */
-       2,                      /* 2 */
-       3,                      /* 3 */
-       4,                      /* 4 */
-       6,                      /* 5 */
-       8,                      /* 6 */
-       12,                     /* 7 */
-       16,                     /* 8 */
-       24,                     /* 9 */
-       32,                     /* A */
-       48,                     /* B */
-       64,                     /* C */
-       96,                     /* D */
-       128,                    /* E */
-       192,                    /* F */
-       256,                    /* 10 */
-       384,                    /* 11 */
-       512,                    /* 12 */
-       768,                    /* 13 */
-       1024,                   /* 14 */
-       1536,                   /* 15 */
-       2048,                   /* 16 */
-       3072,                   /* 17 */
-       4096,                   /* 18 */
-       6144,                   /* 19 */
-       8192,                   /* 1A */
-       12288,                  /* 1B */
-       16384,                  /* 1C */
-       24576,                  /* 1D */
-       32768                   /* 1E */
-};
-
-static void flush_tx_list(struct rvt_qp *qp)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-
-       while (!list_empty(&priv->s_iowait.tx_head)) {
-               struct sdma_txreq *tx;
-
-               tx = list_first_entry(
-                       &priv->s_iowait.tx_head,
-                       struct sdma_txreq,
-                       list);
-               list_del_init(&tx->list);
-               hfi1_put_txreq(
-                       container_of(tx, struct verbs_txreq, txreq));
-       }
-}
-
-static void flush_iowait(struct rvt_qp *qp)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-       struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
-       unsigned long flags;
-
-       write_seqlock_irqsave(&dev->iowait_lock, flags);
-       if (!list_empty(&priv->s_iowait.list)) {
-               list_del_init(&priv->s_iowait.list);
-               if (atomic_dec_and_test(&qp->refcount))
-                       wake_up(&qp->wait);
-       }
-       write_sequnlock_irqrestore(&dev->iowait_lock, flags);
-}
-
-static inline int opa_mtu_enum_to_int(int mtu)
-{
-       switch (mtu) {
-       case OPA_MTU_8192:  return 8192;
-       case OPA_MTU_10240: return 10240;
-       default:            return -1;
-       }
-}
-
-/**
- * This function is what we would push to the core layer if we wanted to be a
- * "first class citizen".  Instead we hide this here and rely on Verbs ULPs
- * to blindly pass the MTU enum value from the PathRecord to us.
- *
- * The actual flag used to determine "8k MTU" will change and is currently
- * unknown.
- */
-static inline int verbs_mtu_enum_to_int(struct ib_device *dev, enum ib_mtu mtu)
-{
-       int val;
-
-       /* Constraining 10KB packets to 8KB packets */
-       if (mtu == (enum ib_mtu)OPA_MTU_10240)
-               mtu = OPA_MTU_8192;
-       val = opa_mtu_enum_to_int((int)mtu);
-       if (val > 0)
-               return val;
-       return ib_mtu_enum_to_int(mtu);
-}
-
-int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
-                        int attr_mask, struct ib_udata *udata)
-{
-       struct ib_qp *ibqp = &qp->ibqp;
-       struct hfi1_ibdev *dev = to_idev(ibqp->device);
-       struct hfi1_devdata *dd = dd_from_dev(dev);
-       u8 sc;
-
-       if (attr_mask & IB_QP_AV) {
-               sc = ah_to_sc(ibqp->device, &attr->ah_attr);
-               if (sc == 0xf)
-                       return -EINVAL;
-
-               if (!qp_to_sdma_engine(qp, sc) &&
-                   dd->flags & HFI1_HAS_SEND_DMA)
-                       return -EINVAL;
-
-               if (!qp_to_send_context(qp, sc))
-                       return -EINVAL;
-       }
-
-       if (attr_mask & IB_QP_ALT_PATH) {
-               sc = ah_to_sc(ibqp->device, &attr->alt_ah_attr);
-               if (sc == 0xf)
-                       return -EINVAL;
-
-               if (!qp_to_sdma_engine(qp, sc) &&
-                   dd->flags & HFI1_HAS_SEND_DMA)
-                       return -EINVAL;
-
-               if (!qp_to_send_context(qp, sc))
-                       return -EINVAL;
-       }
-
-       return 0;
-}
-
-void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
-                   int attr_mask, struct ib_udata *udata)
-{
-       struct ib_qp *ibqp = &qp->ibqp;
-       struct hfi1_qp_priv *priv = qp->priv;
-
-       if (attr_mask & IB_QP_AV) {
-               priv->s_sc = ah_to_sc(ibqp->device, &qp->remote_ah_attr);
-               priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc);
-               priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc);
-       }
-
-       if (attr_mask & IB_QP_PATH_MIG_STATE &&
-           attr->path_mig_state == IB_MIG_MIGRATED &&
-           qp->s_mig_state == IB_MIG_ARMED) {
-               qp->s_flags |= RVT_S_AHG_CLEAR;
-               priv->s_sc = ah_to_sc(ibqp->device, &qp->remote_ah_attr);
-               priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc);
-               priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc);
-       }
-}
-
-/**
- * hfi1_check_send_wqe - validate wqe
- * @qp: The qp
- * @wqe: The built wqe
- *
- * Validate the wqe.  This is called
- * prior to inserting the wqe into
- * the ring, but after the wqe has been
- * set up.
- *
- * Returns 0 on success, -EINVAL on failure
- *
- */
-int hfi1_check_send_wqe(struct rvt_qp *qp,
-                       struct rvt_swqe *wqe)
-{
-       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
-       struct rvt_ah *ah;
-
-       switch (qp->ibqp.qp_type) {
-       case IB_QPT_RC:
-       case IB_QPT_UC:
-               if (wqe->length > 0x80000000U)
-                       return -EINVAL;
-               break;
-       case IB_QPT_SMI:
-               ah = ibah_to_rvtah(wqe->ud_wr.ah);
-               if (wqe->length > (1 << ah->log_pmtu))
-                       return -EINVAL;
-               break;
-       case IB_QPT_GSI:
-       case IB_QPT_UD:
-               ah = ibah_to_rvtah(wqe->ud_wr.ah);
-               if (wqe->length > (1 << ah->log_pmtu))
-                       return -EINVAL;
-               if (ibp->sl_to_sc[ah->attr.sl] == 0xf)
-                       return -EINVAL;
-       default:
-               break;
-       }
-       return wqe->length <= piothreshold;
-}
-
-/**
- * hfi1_compute_aeth - compute the AETH (syndrome + MSN)
- * @qp: the queue pair to compute the AETH for
- *
- * Returns the AETH.
- */
-__be32 hfi1_compute_aeth(struct rvt_qp *qp)
-{
-       u32 aeth = qp->r_msn & HFI1_MSN_MASK;
-
-       if (qp->ibqp.srq) {
-               /*
-                * Shared receive queues don't generate credits.
-                * Set the credit field to the invalid value.
-                */
-               aeth |= HFI1_AETH_CREDIT_INVAL << HFI1_AETH_CREDIT_SHIFT;
-       } else {
-               u32 min, max, x;
-               u32 credits;
-               struct rvt_rwq *wq = qp->r_rq.wq;
-               u32 head;
-               u32 tail;
-
-               /* sanity check pointers before trusting them */
-               head = wq->head;
-               if (head >= qp->r_rq.size)
-                       head = 0;
-               tail = wq->tail;
-               if (tail >= qp->r_rq.size)
-                       tail = 0;
-               /*
-                * Compute the number of credits available (RWQEs).
-                * There is a small chance that the pair of reads are
-                * not atomic, which is OK, since the fuzziness is
-                * resolved as further ACKs go out.
-                */
-               credits = head - tail;
-               if ((int)credits < 0)
-                       credits += qp->r_rq.size;
-               /*
-                * Binary search the credit table to find the code to
-                * use.
-                */
-               min = 0;
-               max = 31;
-               for (;;) {
-                       x = (min + max) / 2;
-                       if (credit_table[x] == credits)
-                               break;
-                       if (credit_table[x] > credits) {
-                               max = x;
-                       } else {
-                               if (min == x)
-                                       break;
-                               min = x;
-                       }
-               }
-               aeth |= x << HFI1_AETH_CREDIT_SHIFT;
-       }
-       return cpu_to_be32(aeth);
-}
-
-/**
- * _hfi1_schedule_send - schedule progress
- * @qp: the QP
- *
- * This schedules qp progress w/o regard to the s_flags.
- *
- * It is only used in the post send, which doesn't hold
- * the s_lock.
- */
-void _hfi1_schedule_send(struct rvt_qp *qp)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-       struct hfi1_ibport *ibp =
-               to_iport(qp->ibqp.device, qp->port_num);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
-
-       iowait_schedule(&priv->s_iowait, ppd->hfi1_wq,
-                       priv->s_sde ?
-                       priv->s_sde->cpu :
-                       cpumask_first(cpumask_of_node(dd->node)));
-}
-
-static void qp_pio_drain(struct rvt_qp *qp)
-{
-       struct hfi1_ibdev *dev;
-       struct hfi1_qp_priv *priv = qp->priv;
-
-       if (!priv->s_sendcontext)
-               return;
-       dev = to_idev(qp->ibqp.device);
-       while (iowait_pio_pending(&priv->s_iowait)) {
-               write_seqlock_irq(&dev->iowait_lock);
-               hfi1_sc_wantpiobuf_intr(priv->s_sendcontext, 1);
-               write_sequnlock_irq(&dev->iowait_lock);
-               iowait_pio_drain(&priv->s_iowait);
-               write_seqlock_irq(&dev->iowait_lock);
-               hfi1_sc_wantpiobuf_intr(priv->s_sendcontext, 0);
-               write_sequnlock_irq(&dev->iowait_lock);
-       }
-}
-
-/**
- * hfi1_schedule_send - schedule progress
- * @qp: the QP
- *
- * This schedules qp progress; the caller should hold
- * the s_lock.
- */
-void hfi1_schedule_send(struct rvt_qp *qp)
-{
-       if (hfi1_send_ok(qp))
-               _hfi1_schedule_send(qp);
-}
-
-/**
- * hfi1_get_credit - handle the AETH credit field of a QP
- * @qp: the qp whose credit state to update
- * @aeth: the Acknowledge Extended Transport Header
- *
- * The QP s_lock should be held.
- */
-void hfi1_get_credit(struct rvt_qp *qp, u32 aeth)
-{
-       u32 credit = (aeth >> HFI1_AETH_CREDIT_SHIFT) & HFI1_AETH_CREDIT_MASK;
-
-       /*
-        * If the credit is invalid, we can send
-        * as many packets as we like.  Otherwise, we have to
-        * honor the credit field.
-        */
-       if (credit == HFI1_AETH_CREDIT_INVAL) {
-               if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) {
-                       qp->s_flags |= RVT_S_UNLIMITED_CREDIT;
-                       if (qp->s_flags & RVT_S_WAIT_SSN_CREDIT) {
-                               qp->s_flags &= ~RVT_S_WAIT_SSN_CREDIT;
-                               hfi1_schedule_send(qp);
-                       }
-               }
-       } else if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) {
-               /* Compute new LSN (i.e., MSN + credit) */
-               credit = (aeth + credit_table[credit]) & HFI1_MSN_MASK;
-               if (cmp_msn(credit, qp->s_lsn) > 0) {
-                       qp->s_lsn = credit;
-                       if (qp->s_flags & RVT_S_WAIT_SSN_CREDIT) {
-                               qp->s_flags &= ~RVT_S_WAIT_SSN_CREDIT;
-                               hfi1_schedule_send(qp);
-                       }
-               }
-       }
-}
-
-void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&qp->s_lock, flags);
-       if (qp->s_flags & flag) {
-               qp->s_flags &= ~flag;
-               trace_hfi1_qpwakeup(qp, flag);
-               hfi1_schedule_send(qp);
-       }
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-       /* Notify hfi1_destroy_qp() if it is waiting. */
-       if (atomic_dec_and_test(&qp->refcount))
-               wake_up(&qp->wait);
-}
-
-static int iowait_sleep(
-       struct sdma_engine *sde,
-       struct iowait *wait,
-       struct sdma_txreq *stx,
-       unsigned seq)
-{
-       struct verbs_txreq *tx = container_of(stx, struct verbs_txreq, txreq);
-       struct rvt_qp *qp;
-       struct hfi1_qp_priv *priv;
-       unsigned long flags;
-       int ret = 0;
-       struct hfi1_ibdev *dev;
-
-       qp = tx->qp;
-       priv = qp->priv;
-
-       spin_lock_irqsave(&qp->s_lock, flags);
-       if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
-               /*
-                * If we couldn't queue the DMA request, save the info
-                * and try again later rather than destroying the
-                * buffer and undoing the side effects of the copy.
-                */
-               /* Make a common routine? */
-               dev = &sde->dd->verbs_dev;
-               list_add_tail(&stx->list, &wait->tx_head);
-               write_seqlock(&dev->iowait_lock);
-               if (sdma_progress(sde, seq, stx))
-                       goto eagain;
-               if (list_empty(&priv->s_iowait.list)) {
-                       struct hfi1_ibport *ibp =
-                               to_iport(qp->ibqp.device, qp->port_num);
-
-                       ibp->rvp.n_dmawait++;
-                       qp->s_flags |= RVT_S_WAIT_DMA_DESC;
-                       list_add_tail(&priv->s_iowait.list, &sde->dmawait);
-                       trace_hfi1_qpsleep(qp, RVT_S_WAIT_DMA_DESC);
-                       atomic_inc(&qp->refcount);
-               }
-               write_sequnlock(&dev->iowait_lock);
-               qp->s_flags &= ~RVT_S_BUSY;
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-               ret = -EBUSY;
-       } else {
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-               hfi1_put_txreq(tx);
-       }
-       return ret;
-eagain:
-       write_sequnlock(&dev->iowait_lock);
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-       list_del_init(&stx->list);
-       return -EAGAIN;
-}
-
-static void iowait_wakeup(struct iowait *wait, int reason)
-{
-       struct rvt_qp *qp = iowait_to_qp(wait);
-
-       WARN_ON(reason != SDMA_AVAIL_REASON);
-       hfi1_qp_wakeup(qp, RVT_S_WAIT_DMA_DESC);
-}
-
-static void iowait_sdma_drained(struct iowait *wait)
-{
-       struct rvt_qp *qp = iowait_to_qp(wait);
-
-       /*
-        * This happens when the send engine notes
-        * a QP in the error state and cannot
-        * do the flush work until that QP's
-        * sdma work has finished.
-        */
-       spin_lock(&qp->s_lock);
-       if (qp->s_flags & RVT_S_WAIT_DMA) {
-               qp->s_flags &= ~RVT_S_WAIT_DMA;
-               hfi1_schedule_send(qp);
-       }
-       spin_unlock(&qp->s_lock);
-}
-
-/**
- *
- * qp_to_sdma_engine - map a qp to a send engine
- * @qp: the QP
- * @sc5: the 5 bit sc
- *
- * Return:
- * A send engine for the qp or NULL for SMI type qp.
- */
-struct sdma_engine *qp_to_sdma_engine(struct rvt_qp *qp, u8 sc5)
-{
-       struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
-       struct sdma_engine *sde;
-
-       if (!(dd->flags & HFI1_HAS_SEND_DMA))
-               return NULL;
-       switch (qp->ibqp.qp_type) {
-       case IB_QPT_SMI:
-               return NULL;
-       default:
-               break;
-       }
-       sde = sdma_select_engine_sc(dd, qp->ibqp.qp_num >> dd->qos_shift, sc5);
-       return sde;
-}
-
-/*
- * qp_to_send_context - map a qp to a send context
- * @qp: the QP
- * @sc5: the 5 bit sc
- *
- * Return:
- * A send context for the qp
- */
-struct send_context *qp_to_send_context(struct rvt_qp *qp, u8 sc5)
-{
-       struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
-
-       switch (qp->ibqp.qp_type) {
-       case IB_QPT_SMI:
-               /* SMA packets to VL15 */
-               return dd->vld[15].sc;
-       default:
-               break;
-       }
-
-       return pio_select_send_context_sc(dd, qp->ibqp.qp_num >> dd->qos_shift,
-                                         sc5);
-}
-
-struct qp_iter {
-       struct hfi1_ibdev *dev;
-       struct rvt_qp *qp;
-       int specials;
-       int n;
-};
-
-struct qp_iter *qp_iter_init(struct hfi1_ibdev *dev)
-{
-       struct qp_iter *iter;
-
-       iter = kzalloc(sizeof(*iter), GFP_KERNEL);
-       if (!iter)
-               return NULL;
-
-       iter->dev = dev;
-       iter->specials = dev->rdi.ibdev.phys_port_cnt * 2;
-       if (qp_iter_next(iter)) {
-               kfree(iter);
-               return NULL;
-       }
-
-       return iter;
-}
-
-int qp_iter_next(struct qp_iter *iter)
-{
-       struct hfi1_ibdev *dev = iter->dev;
-       int n = iter->n;
-       int ret = 1;
-       struct rvt_qp *pqp = iter->qp;
-       struct rvt_qp *qp;
-
-       /*
-        * The approach is to consider the special qps
-        * as additional table entries before the
-        * real hash table.  Since the qp code sets
-        * the qp->next hash link to NULL, this works just fine.
-        *
-        * iter->specials is 2 * # ports
-        *
-        * n = 0..iter->specials is the special qp indices
-        *
-        * n = iter->specials..dev->rdi.qp_dev->qp_table_size+iter->specials are
-        * the potential hash bucket entries
-        *
-        */
-       for (; n <  dev->rdi.qp_dev->qp_table_size + iter->specials; n++) {
-               if (pqp) {
-                       qp = rcu_dereference(pqp->next);
-               } else {
-                       if (n < iter->specials) {
-                               struct hfi1_pportdata *ppd;
-                               struct hfi1_ibport *ibp;
-                               int pidx;
-
-                               pidx = n % dev->rdi.ibdev.phys_port_cnt;
-                               ppd = &dd_from_dev(dev)->pport[pidx];
-                               ibp = &ppd->ibport_data;
-
-                               if (!(n & 1))
-                                       qp = rcu_dereference(ibp->rvp.qp[0]);
-                               else
-                                       qp = rcu_dereference(ibp->rvp.qp[1]);
-                       } else {
-                               qp = rcu_dereference(
-                                       dev->rdi.qp_dev->qp_table[
-                                               (n - iter->specials)]);
-                       }
-               }
-               pqp = qp;
-               if (qp) {
-                       iter->qp = qp;
-                       iter->n = n;
-                       return 0;
-               }
-       }
-       return ret;
-}
-
-static const char * const qp_type_str[] = {
-       "SMI", "GSI", "RC", "UC", "UD",
-};
-
-static int qp_idle(struct rvt_qp *qp)
-{
-       return
-               qp->s_last == qp->s_acked &&
-               qp->s_acked == qp->s_cur &&
-               qp->s_cur == qp->s_tail &&
-               qp->s_tail == qp->s_head;
-}
-
-void qp_iter_print(struct seq_file *s, struct qp_iter *iter)
-{
-       struct rvt_swqe *wqe;
-       struct rvt_qp *qp = iter->qp;
-       struct hfi1_qp_priv *priv = qp->priv;
-       struct sdma_engine *sde;
-       struct send_context *send_context;
-
-       sde = qp_to_sdma_engine(qp, priv->s_sc);
-       wqe = rvt_get_swqe_ptr(qp, qp->s_last);
-       send_context = qp_to_send_context(qp, priv->s_sc);
-       seq_printf(s,
-                  "N %d %s QP %x R %u %s %u %u %u f=%x %u %u %u %u %u %u PSN %x %x %x %x %x (%u %u %u %u %u %u %u) RQP %x LID %x SL %u MTU %u %u %u %u SDE %p,%u SC %p,%u SCQ %u %u PID %d\n",
-                  iter->n,
-                  qp_idle(qp) ? "I" : "B",
-                  qp->ibqp.qp_num,
-                  atomic_read(&qp->refcount),
-                  qp_type_str[qp->ibqp.qp_type],
-                  qp->state,
-                  wqe ? wqe->wr.opcode : 0,
-                  qp->s_hdrwords,
-                  qp->s_flags,
-                  iowait_sdma_pending(&priv->s_iowait),
-                  iowait_pio_pending(&priv->s_iowait),
-                  !list_empty(&priv->s_iowait.list),
-                  qp->timeout,
-                  wqe ? wqe->ssn : 0,
-                  qp->s_lsn,
-                  qp->s_last_psn,
-                  qp->s_psn, qp->s_next_psn,
-                  qp->s_sending_psn, qp->s_sending_hpsn,
-                  qp->s_last, qp->s_acked, qp->s_cur,
-                  qp->s_tail, qp->s_head, qp->s_size,
-                  qp->s_avail,
-                  qp->remote_qpn,
-                  qp->remote_ah_attr.dlid,
-                  qp->remote_ah_attr.sl,
-                  qp->pmtu,
-                  qp->s_retry,
-                  qp->s_retry_cnt,
-                  qp->s_rnr_retry_cnt,
-                  sde,
-                  sde ? sde->this_idx : 0,
-                  send_context,
-                  send_context ? send_context->sw_index : 0,
-                  ibcq_to_rvtcq(qp->ibqp.send_cq)->queue->head,
-                  ibcq_to_rvtcq(qp->ibqp.send_cq)->queue->tail,
-                  qp->pid);
-}
-
-void qp_comm_est(struct rvt_qp *qp)
-{
-       qp->r_flags |= RVT_R_COMM_EST;
-       if (qp->ibqp.event_handler) {
-               struct ib_event ev;
-
-               ev.device = qp->ibqp.device;
-               ev.element.qp = &qp->ibqp;
-               ev.event = IB_EVENT_COMM_EST;
-               qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
-       }
-}
-
-void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp,
-                   gfp_t gfp)
-{
-       struct hfi1_qp_priv *priv;
-
-       priv = kzalloc_node(sizeof(*priv), gfp, rdi->dparms.node);
-       if (!priv)
-               return ERR_PTR(-ENOMEM);
-
-       priv->owner = qp;
-
-       priv->s_hdr = kzalloc_node(sizeof(*priv->s_hdr), gfp, rdi->dparms.node);
-       if (!priv->s_hdr) {
-               kfree(priv);
-               return ERR_PTR(-ENOMEM);
-       }
-       setup_timer(&priv->s_rnr_timer, hfi1_rc_rnr_retry, (unsigned long)qp);
-       qp->s_timer.function = hfi1_rc_timeout;
-       return priv;
-}
-
-void qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-
-       kfree(priv->s_hdr);
-       kfree(priv);
-}
-
-unsigned free_all_qps(struct rvt_dev_info *rdi)
-{
-       struct hfi1_ibdev *verbs_dev = container_of(rdi,
-                                                   struct hfi1_ibdev,
-                                                   rdi);
-       struct hfi1_devdata *dd = container_of(verbs_dev,
-                                              struct hfi1_devdata,
-                                              verbs_dev);
-       int n;
-       unsigned qp_inuse = 0;
-
-       for (n = 0; n < dd->num_pports; n++) {
-               struct hfi1_ibport *ibp = &dd->pport[n].ibport_data;
-
-               rcu_read_lock();
-               if (rcu_dereference(ibp->rvp.qp[0]))
-                       qp_inuse++;
-               if (rcu_dereference(ibp->rvp.qp[1]))
-                       qp_inuse++;
-               rcu_read_unlock();
-       }
-
-       return qp_inuse;
-}
-
-void flush_qp_waiters(struct rvt_qp *qp)
-{
-       flush_iowait(qp);
-       hfi1_stop_rc_timers(qp);
-}
-
-void stop_send_queue(struct rvt_qp *qp)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-
-       cancel_work_sync(&priv->s_iowait.iowork);
-       hfi1_del_timers_sync(qp);
-}
-
-void quiesce_qp(struct rvt_qp *qp)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-
-       iowait_sdma_drain(&priv->s_iowait);
-       qp_pio_drain(qp);
-       flush_tx_list(qp);
-}
-
-void notify_qp_reset(struct rvt_qp *qp)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-
-       iowait_init(
-               &priv->s_iowait,
-               1,
-               _hfi1_do_send,
-               iowait_sleep,
-               iowait_wakeup,
-               iowait_sdma_drained);
-       priv->r_adefered = 0;
-       clear_ahg(qp);
-}
-
-/*
- * Switch to alternate path.
- * The QP s_lock should be held and interrupts disabled.
- */
-void hfi1_migrate_qp(struct rvt_qp *qp)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-       struct ib_event ev;
-
-       qp->s_mig_state = IB_MIG_MIGRATED;
-       qp->remote_ah_attr = qp->alt_ah_attr;
-       qp->port_num = qp->alt_ah_attr.port_num;
-       qp->s_pkey_index = qp->s_alt_pkey_index;
-       qp->s_flags |= RVT_S_AHG_CLEAR;
-       priv->s_sc = ah_to_sc(qp->ibqp.device, &qp->remote_ah_attr);
-       priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc);
-
-       ev.device = qp->ibqp.device;
-       ev.element.qp = &qp->ibqp;
-       ev.event = IB_EVENT_PATH_MIG;
-       qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
-}
-
-int mtu_to_path_mtu(u32 mtu)
-{
-       return mtu_to_enum(mtu, OPA_MTU_8192);
-}
-
-u32 mtu_from_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, u32 pmtu)
-{
-       u32 mtu;
-       struct hfi1_ibdev *verbs_dev = container_of(rdi,
-                                                   struct hfi1_ibdev,
-                                                   rdi);
-       struct hfi1_devdata *dd = container_of(verbs_dev,
-                                              struct hfi1_devdata,
-                                              verbs_dev);
-       struct hfi1_ibport *ibp;
-       u8 sc, vl;
-
-       ibp = &dd->pport[qp->port_num - 1].ibport_data;
-       sc = ibp->sl_to_sc[qp->remote_ah_attr.sl];
-       vl = sc_to_vlt(dd, sc);
-
-       mtu = verbs_mtu_enum_to_int(qp->ibqp.device, pmtu);
-       if (vl < PER_VL_SEND_CONTEXTS)
-               mtu = min_t(u32, mtu, dd->vld[vl].mtu);
-       return mtu;
-}
-
-int get_pmtu_from_attr(struct rvt_dev_info *rdi, struct rvt_qp *qp,
-                      struct ib_qp_attr *attr)
-{
-       int mtu, pidx = qp->port_num - 1;
-       struct hfi1_ibdev *verbs_dev = container_of(rdi,
-                                                   struct hfi1_ibdev,
-                                                   rdi);
-       struct hfi1_devdata *dd = container_of(verbs_dev,
-                                              struct hfi1_devdata,
-                                              verbs_dev);
-       mtu = verbs_mtu_enum_to_int(qp->ibqp.device, attr->path_mtu);
-       if (mtu == -1)
-               return -1; /* values less than 0 are error */
-
-       if (mtu > dd->pport[pidx].ibmtu)
-               return mtu_to_enum(dd->pport[pidx].ibmtu, IB_MTU_2048);
-       else
-               return attr->path_mtu;
-}
-
-void notify_error_qp(struct rvt_qp *qp)
-{
-       struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
-       struct hfi1_qp_priv *priv = qp->priv;
-
-       write_seqlock(&dev->iowait_lock);
-       if (!list_empty(&priv->s_iowait.list) && !(qp->s_flags & RVT_S_BUSY)) {
-               qp->s_flags &= ~RVT_S_ANY_WAIT_IO;
-               list_del_init(&priv->s_iowait.list);
-               if (atomic_dec_and_test(&qp->refcount))
-                       wake_up(&qp->wait);
-       }
-       write_sequnlock(&dev->iowait_lock);
-
-       if (!(qp->s_flags & RVT_S_BUSY)) {
-               qp->s_hdrwords = 0;
-               if (qp->s_rdma_mr) {
-                       rvt_put_mr(qp->s_rdma_mr);
-                       qp->s_rdma_mr = NULL;
-               }
-               flush_tx_list(qp);
-       }
-}
-
-/**
- * hfi1_error_port_qps - put a port's RC/UC qps into error state
- * @ibp: the ibport.
- * @sl: the service level.
- *
- * This function places all RC/UC qps with a given service level into error
- * state. It is generally called to force upper-layer apps to abandon stale qps
- * after an sl->sc mapping change.
- */
-void hfi1_error_port_qps(struct hfi1_ibport *ibp, u8 sl)
-{
-       struct rvt_qp *qp = NULL;
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       struct hfi1_ibdev *dev = &ppd->dd->verbs_dev;
-       int n;
-       int lastwqe;
-       struct ib_event ev;
-
-       rcu_read_lock();
-
-       /* Deal only with RC/UC qps that use the given SL. */
-       for (n = 0; n < dev->rdi.qp_dev->qp_table_size; n++) {
-               for (qp = rcu_dereference(dev->rdi.qp_dev->qp_table[n]); qp;
-                       qp = rcu_dereference(qp->next)) {
-                       if (qp->port_num == ppd->port &&
-                           (qp->ibqp.qp_type == IB_QPT_UC ||
-                            qp->ibqp.qp_type == IB_QPT_RC) &&
-                           qp->remote_ah_attr.sl == sl &&
-                           (ib_rvt_state_ops[qp->state] &
-                            RVT_POST_SEND_OK)) {
-                               spin_lock_irq(&qp->r_lock);
-                               spin_lock(&qp->s_hlock);
-                               spin_lock(&qp->s_lock);
-                               lastwqe = rvt_error_qp(qp,
-                                                      IB_WC_WR_FLUSH_ERR);
-                               spin_unlock(&qp->s_lock);
-                               spin_unlock(&qp->s_hlock);
-                               spin_unlock_irq(&qp->r_lock);
-                               if (lastwqe) {
-                                       ev.device = qp->ibqp.device;
-                                       ev.element.qp = &qp->ibqp;
-                                       ev.event =
-                                               IB_EVENT_QP_LAST_WQE_REACHED;
-                                       qp->ibqp.event_handler(&ev,
-                                               qp->ibqp.qp_context);
-                               }
-                       }
-               }
-       }
-
-       rcu_read_unlock();
-}
diff --git a/drivers/staging/rdma/hfi1/qp.h b/drivers/staging/rdma/hfi1/qp.h
deleted file mode 100644 (file)
index e7bc8d6..0000000
+++ /dev/null
@@ -1,160 +0,0 @@
-#ifndef _QP_H
-#define _QP_H
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/hash.h>
-#include <rdma/rdmavt_qp.h>
-#include "verbs.h"
-#include "sdma.h"
-
-extern unsigned int hfi1_qp_table_size;
-
-/*
- * clear_ahg - clear ahg from QP
- */
-static inline void clear_ahg(struct rvt_qp *qp)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-
-       priv->s_hdr->ahgcount = 0;
-       qp->s_flags &= ~(RVT_S_AHG_VALID | RVT_S_AHG_CLEAR);
-       if (priv->s_sde && qp->s_ahgidx >= 0)
-               sdma_ahg_free(priv->s_sde, qp->s_ahgidx);
-       qp->s_ahgidx = -1;
-}
-
-/**
- * hfi1_compute_aeth - compute the AETH (syndrome + MSN)
- * @qp: the queue pair to compute the AETH for
- *
- * Returns the AETH.
- */
-__be32 hfi1_compute_aeth(struct rvt_qp *qp);
-
-/**
- * hfi1_create_qp - create a queue pair for a device
- * @ibpd: the protection domain whose device we create the queue pair for
- * @init_attr: the attributes of the queue pair
- * @udata: user data for libibverbs.so
- *
- * Returns the queue pair on success, otherwise returns an errno.
- *
- * Called by the ib_create_qp() core verbs function.
- */
-struct ib_qp *hfi1_create_qp(struct ib_pd *ibpd,
-                            struct ib_qp_init_attr *init_attr,
-                            struct ib_udata *udata);
-/**
- * hfi1_get_credit - handle the AETH credit field of a QP
- * @qp: the qp whose credit state to update
- * @aeth: the Acknowledge Extended Transport Header
- *
- * The QP s_lock should be held.
- */
-void hfi1_get_credit(struct rvt_qp *qp, u32 aeth);
-
-/**
- * hfi1_qp_wakeup - wake up on the indicated event
- * @qp: the QP
- * @flag: the flag on which the qp is stalled
- */
-void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag);
-
-struct sdma_engine *qp_to_sdma_engine(struct rvt_qp *qp, u8 sc5);
-struct send_context *qp_to_send_context(struct rvt_qp *qp, u8 sc5);
-
-struct qp_iter;
-
-/**
- * qp_iter_init - initialize the iterator for the qp hash list
- * @dev: the hfi1_ibdev
- */
-struct qp_iter *qp_iter_init(struct hfi1_ibdev *dev);
-
-/**
- * qp_iter_next - Find the next qp in the hash list
- * @iter: the iterator for the qp hash list
- */
-int qp_iter_next(struct qp_iter *iter);
-
-/**
- * qp_iter_print - print the qp information to seq_file
- * @s: the seq_file to emit the qp information on
- * @iter: the iterator for the qp hash list
- */
-void qp_iter_print(struct seq_file *s, struct qp_iter *iter);
-
-/**
- * qp_comm_est - handle trap with QP established
- * @qp: the QP
- */
-void qp_comm_est(struct rvt_qp *qp);
-
-void _hfi1_schedule_send(struct rvt_qp *qp);
-void hfi1_schedule_send(struct rvt_qp *qp);
-
-void hfi1_migrate_qp(struct rvt_qp *qp);
-
-/*
- * Functions provided by hfi1 driver for rdmavt to use
- */
-void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp,
-                   gfp_t gfp);
-void qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp);
-unsigned free_all_qps(struct rvt_dev_info *rdi);
-void notify_qp_reset(struct rvt_qp *qp);
-int get_pmtu_from_attr(struct rvt_dev_info *rdi, struct rvt_qp *qp,
-                      struct ib_qp_attr *attr);
-void flush_qp_waiters(struct rvt_qp *qp);
-void notify_error_qp(struct rvt_qp *qp);
-void stop_send_queue(struct rvt_qp *qp);
-void quiesce_qp(struct rvt_qp *qp);
-u32 mtu_from_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, u32 pmtu);
-int mtu_to_path_mtu(u32 mtu);
-void hfi1_error_port_qps(struct hfi1_ibport *ibp, u8 sl);
-#endif /* _QP_H */
diff --git a/drivers/staging/rdma/hfi1/qsfp.c b/drivers/staging/rdma/hfi1/qsfp.c
deleted file mode 100644 (file)
index 2441669..0000000
+++ /dev/null
@@ -1,632 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/delay.h>
-#include <linux/pci.h>
-#include <linux/vmalloc.h>
-
-#include "hfi.h"
-#include "twsi.h"
-
-/*
- * QSFP support for hfi driver, using "Two Wire Serial Interface" driver
- * in twsi.c
- */
-#define I2C_MAX_RETRY 4
-
-/*
- * Raw i2c write.  No set-up or lock checking.
- */
-static int __i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
-                      int offset, void *bp, int len)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       int ret, cnt;
-       u8 *buff = bp;
-
-       cnt = 0;
-       while (cnt < len) {
-               int wlen = len - cnt;
-
-               ret = hfi1_twsi_blk_wr(dd, target, i2c_addr, offset,
-                                      buff + cnt, wlen);
-               if (ret) {
-                       /* hfi1_twsi_blk_wr() returns 1 on error, else 0 */
-                       return -EIO;
-               }
-               offset += wlen;
-               cnt += wlen;
-       }
-
-       /* Must wait min 20us between qsfp i2c transactions */
-       udelay(20);
-
-       return cnt;
-}
-
-/*
- * Caller must hold the i2c chain resource.
- */
-int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
-             void *bp, int len)
-{
-       int ret;
-
-       if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
-               return -EACCES;
-
-       /* make sure the TWSI bus is in a sane state */
-       ret = hfi1_twsi_reset(ppd->dd, target);
-       if (ret) {
-               hfi1_dev_porterr(ppd->dd, ppd->port,
-                                "I2C chain %d write interface reset failed\n",
-                                target);
-               return ret;
-       }
-
-       return __i2c_write(ppd, target, i2c_addr, offset, bp, len);
-}
-
-/*
- * Raw i2c read.  No set-up or lock checking.
- */
-static int __i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
-                     int offset, void *bp, int len)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       int ret, cnt, pass = 0;
-       int orig_offset = offset;
-
-       cnt = 0;
-       while (cnt < len) {
-               int rlen = len - cnt;
-
-               ret = hfi1_twsi_blk_rd(dd, target, i2c_addr, offset,
-                                      bp + cnt, rlen);
-               /* Some QSFPs fail on the first attempt, so retry */
-               if (ret && cnt == 0 && ++pass < I2C_MAX_RETRY)
-                       continue;
-               if (ret) {
-                       /* hfi1_twsi_blk_rd() returns 1 on error, else 0 */
-                       ret = -EIO;
-                       goto exit;
-               }
-               offset += rlen;
-               cnt += rlen;
-       }
-
-       ret = cnt;
-
-exit:
-       if (ret < 0) {
-               hfi1_dev_porterr(dd, ppd->port,
-                                "I2C chain %d read failed, addr 0x%x, offset 0x%x, len %d\n",
-                                target, i2c_addr, orig_offset, len);
-       }
-
-       /* Must wait min 20us between qsfp i2c transactions */
-       udelay(20);
-
-       return ret;
-}
-
-/*
- * Caller must hold the i2c chain resource.
- */
-int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
-            void *bp, int len)
-{
-       int ret;
-
-       if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
-               return -EACCES;
-
-       /* make sure the TWSI bus is in a sane state */
-       ret = hfi1_twsi_reset(ppd->dd, target);
-       if (ret) {
-               hfi1_dev_porterr(ppd->dd, ppd->port,
-                                "I2C chain %d read interface reset failed\n",
-                                target);
-               return ret;
-       }
-
-       return __i2c_read(ppd, target, i2c_addr, offset, bp, len);
-}
-
-/*
- * Write page n, offset m of QSFP memory as defined by SFF 8636
- * by writing @addr = ((256 * n) + m)
- *
- * Caller must hold the i2c chain resource.
- */
-int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
-              int len)
-{
-       int count = 0;
-       int offset;
-       int nwrite;
-       int ret;
-       u8 page;
-
-       if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
-               return -EACCES;
-
-       /* make sure the TWSI bus is in a sane state */
-       ret = hfi1_twsi_reset(ppd->dd, target);
-       if (ret) {
-               hfi1_dev_porterr(ppd->dd, ppd->port,
-                                "QSFP chain %d write interface reset failed\n",
-                                target);
-               return ret;
-       }
-
-       while (count < len) {
-               /*
-                * Set the qsfp page based on a zero-based address
-                * and a page size of QSFP_PAGESIZE bytes.
-                */
-               page = (u8)(addr / QSFP_PAGESIZE);
-
-               ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE,
-                                 QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1);
-               if (ret != 1) {
-                       hfi1_dev_porterr(ppd->dd, ppd->port,
-                                        "QSFP chain %d can't write QSFP_PAGE_SELECT_BYTE: %d\n",
-                                        target, ret);
-                       ret = -EIO;
-                       break;
-               }
-
-               offset = addr % QSFP_PAGESIZE;
-               nwrite = len - count;
-               /* truncate write to boundary if crossing boundary */
-               if (((addr % QSFP_RW_BOUNDARY) + nwrite) > QSFP_RW_BOUNDARY)
-                       nwrite = QSFP_RW_BOUNDARY - (addr % QSFP_RW_BOUNDARY);
-
-               ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE,
-                                 offset, bp + count, nwrite);
-               if (ret <= 0)   /* stop on error or nothing written */
-                       break;
-
-               count += ret;
-               addr += ret;
-       }
-
-       if (ret < 0)
-               return ret;
-       return count;
-}
-
-/*
- * Perform a stand-alone single QSFP write.  Acquire the resource, do the
- * write, then release the resource.
- */
-int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
-                  int len)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       u32 resource = qsfp_resource(dd);
-       int ret;
-
-       ret = acquire_chip_resource(dd, resource, QSFP_WAIT);
-       if (ret)
-               return ret;
-       ret = qsfp_write(ppd, target, addr, bp, len);
-       release_chip_resource(dd, resource);
-
-       return ret;
-}
-
-/*
- * Access page n, offset m of QSFP memory as defined by SFF 8636
- * by reading @addr = ((256 * n) + m)
- *
- * Caller must hold the i2c chain resource.
- */
-int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
-             int len)
-{
-       int count = 0;
-       int offset;
-       int nread;
-       int ret;
-       u8 page;
-
-       if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
-               return -EACCES;
-
-       /* make sure the TWSI bus is in a sane state */
-       ret = hfi1_twsi_reset(ppd->dd, target);
-       if (ret) {
-               hfi1_dev_porterr(ppd->dd, ppd->port,
-                                "QSFP chain %d read interface reset failed\n",
-                                target);
-               return ret;
-       }
-
-       while (count < len) {
-               /*
-                * Set the qsfp page based on a zero-based address
-                * and a page size of QSFP_PAGESIZE bytes.
-                */
-               page = (u8)(addr / QSFP_PAGESIZE);
-               ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE,
-                                 QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1);
-               if (ret != 1) {
-                       hfi1_dev_porterr(ppd->dd, ppd->port,
-                                        "QSFP chain %d can't write QSFP_PAGE_SELECT_BYTE: %d\n",
-                                        target, ret);
-                       ret = -EIO;
-                       break;
-               }
-
-               offset = addr % QSFP_PAGESIZE;
-               nread = len - count;
-               /* truncate read to boundary if crossing boundary */
-               if (((addr % QSFP_RW_BOUNDARY) + nread) > QSFP_RW_BOUNDARY)
-                       nread = QSFP_RW_BOUNDARY - (addr % QSFP_RW_BOUNDARY);
-
-               /* QSFPs require a 5-10msec delay after write operations */
-               mdelay(5);
-               ret = __i2c_read(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE,
-                                offset, bp + count, nread);
-               if (ret <= 0)   /* stop on error or nothing read */
-                       break;
-
-               count += ret;
-               addr += ret;
-       }
-
-       if (ret < 0)
-               return ret;
-       return count;
-}
-
-/*
- * Perform a stand-alone single QSFP read.  Acquire the resource, do the
- * read, then release the resource.
- */
-int one_qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
-                 int len)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       u32 resource = qsfp_resource(dd);
-       int ret;
-
-       ret = acquire_chip_resource(dd, resource, QSFP_WAIT);
-       if (ret)
-               return ret;
-       ret = qsfp_read(ppd, target, addr, bp, len);
-       release_chip_resource(dd, resource);
-
-       return ret;
-}
-
-/*
- * This function caches the QSFP memory range in 128 byte chunks.
- * As an example, the next byte after address 255 is byte 128 from
- * upper page 01H (if existing) rather than byte 0 from lower page 00H.
- * Access page n, offset m of QSFP memory as defined by SFF 8636
- * in the cache by reading byte ((128 * n) + m)
- * The calls to qsfp_{read,write} in this function correctly handle the
- * address map difference between this mapping and the mapping implemented
- * by those functions
- *
- * The caller must be holding the QSFP i2c chain resource.
- */
-int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp)
-{
-       u32 target = ppd->dd->hfi1_id;
-       int ret;
-       unsigned long flags;
-       u8 *cache = &cp->cache[0];
-
-       /* ensure sane contents on invalid reads, for cable swaps */
-       memset(cache, 0, (QSFP_MAX_NUM_PAGES * 128));
-       spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
-       ppd->qsfp_info.cache_valid = 0;
-       spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags);
-
-       if (!qsfp_mod_present(ppd)) {
-               ret = -ENODEV;
-               goto bail;
-       }
-
-       ret = qsfp_read(ppd, target, 0, cache, QSFP_PAGESIZE);
-       if (ret != QSFP_PAGESIZE) {
-               dd_dev_info(ppd->dd,
-                           "%s: Page 0 read failed, expected %d, got %d\n",
-                           __func__, QSFP_PAGESIZE, ret);
-               goto bail;
-       }
-
-       /* Is paging enabled? */
-       if (!(cache[2] & 4)) {
-               /* Paging enabled, page 03 required */
-               if ((cache[195] & 0xC0) == 0xC0) {
-                       /* all of pages 1, 2 and 3 */
-                       ret = qsfp_read(ppd, target, 384, cache + 256, 128);
-                       if (ret <= 0 || ret != 128) {
-                               dd_dev_info(ppd->dd, "%s failed\n", __func__);
-                               goto bail;
-                       }
-                       ret = qsfp_read(ppd, target, 640, cache + 384, 128);
-                       if (ret <= 0 || ret != 128) {
-                               dd_dev_info(ppd->dd, "%s failed\n", __func__);
-                               goto bail;
-                       }
-                       ret = qsfp_read(ppd, target, 896, cache + 512, 128);
-                       if (ret <= 0 || ret != 128) {
-                               dd_dev_info(ppd->dd, "%s failed\n", __func__);
-                               goto bail;
-                       }
-               } else if ((cache[195] & 0x80) == 0x80) {
-                       /* only page 2 and 3 */
-                       ret = qsfp_read(ppd, target, 640, cache + 384, 128);
-                       if (ret <= 0 || ret != 128) {
-                               dd_dev_info(ppd->dd, "%s failed\n", __func__);
-                               goto bail;
-                       }
-                       ret = qsfp_read(ppd, target, 896, cache + 512, 128);
-                       if (ret <= 0 || ret != 128) {
-                               dd_dev_info(ppd->dd, "%s failed\n", __func__);
-                               goto bail;
-                       }
-               } else if ((cache[195] & 0x40) == 0x40) {
-                       /* only page 1 and 3 */
-                       ret = qsfp_read(ppd, target, 384, cache + 256, 128);
-                       if (ret <= 0 || ret != 128) {
-                               dd_dev_info(ppd->dd, "%s failed\n", __func__);
-                               goto bail;
-                       }
-                       ret = qsfp_read(ppd, target, 896, cache + 512, 128);
-                       if (ret <= 0 || ret != 128) {
-                               dd_dev_info(ppd->dd, "%s failed\n", __func__);
-                               goto bail;
-                       }
-               } else {
-                       /* only page 3 */
-                       ret = qsfp_read(ppd, target, 896, cache + 512, 128);
-                       if (ret <= 0 || ret != 128) {
-                               dd_dev_info(ppd->dd, "%s failed\n", __func__);
-                               goto bail;
-                       }
-               }
-       }
-
-       spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
-       ppd->qsfp_info.cache_valid = 1;
-       ppd->qsfp_info.cache_refresh_required = 0;
-       spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags);
-
-       return 0;
-
-bail:
-       memset(cache, 0, (QSFP_MAX_NUM_PAGES * 128));
-       return ret;
-}
-
-const char * const hfi1_qsfp_devtech[16] = {
-       "850nm VCSEL", "1310nm VCSEL", "1550nm VCSEL", "1310nm FP",
-       "1310nm DFB", "1550nm DFB", "1310nm EML", "1550nm EML",
-       "Cu Misc", "1490nm DFB", "Cu NoEq", "Cu Eq",
-       "Undef", "Cu Active BothEq", "Cu FarEq", "Cu NearEq"
-};
-
-#define QSFP_DUMP_CHUNK 16 /* Holds longest string */
-#define QSFP_DEFAULT_HDR_CNT 224
-
-#define QSFP_PWR(pbyte) (((pbyte) >> 6) & 3)
-#define QSFP_HIGH_PWR(pbyte) ((pbyte) & 3)
-/* For use with QSFP_HIGH_PWR macro */
-#define QSFP_HIGH_PWR_UNUSED   0 /* Bits [1:0] = 00 implies low power module */
-
-/*
- * Takes power class byte [Page 00 Byte 129] in SFF 8636
- * Returns power class as integer (1 through 7, per SFF 8636 rev 2.4)
- */
-int get_qsfp_power_class(u8 power_byte)
-{
-       if (QSFP_HIGH_PWR(power_byte) == QSFP_HIGH_PWR_UNUSED)
-               /* power classes count from 1, their bit encodings from 0 */
-               return (QSFP_PWR(power_byte) + 1);
-       /*
-        * In the high power class field, 00 means unused, so valid
-        * encodings start at 1.  Adding 4 maps encodings 1..3 onto
-        * power classes 5..7.
-        */
-       return (QSFP_HIGH_PWR(power_byte) + 4);
-}
-
-int qsfp_mod_present(struct hfi1_pportdata *ppd)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       u64 reg;
-
-       reg = read_csr(dd, dd->hfi1_id ? ASIC_QSFP2_IN : ASIC_QSFP1_IN);
-       return !(reg & QSFP_HFI0_MODPRST_N);
-}
-
-/*
- * This function maps QSFP memory addresses in 128 byte chunks in the following
- * fashion per the CableInfo SMA query definition in the IBA 1.3 spec/OPA Gen 1
- * spec
- * For addr 000-127, lower page 00h
- * For addr 128-255, upper page 00h
- * For addr 256-383, upper page 01h
- * For addr 384-511, upper page 02h
- * For addr 512-639, upper page 03h
- *
- * For addresses beyond this range, the out-of-range portion of the data
- * buffer is returned set to 0.
- * For optional upper pages that are not present, the corresponding bytes
- * in the data buffer are returned set to 0.
- */
-int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr, u32 len,
-                  u8 *data)
-{
-       struct hfi1_pportdata *ppd;
-       u32 excess_len = 0;
-       int ret = 0;
-
-       if (port_num > dd->num_pports || port_num < 1) {
-               dd_dev_info(dd, "%s: Invalid port number %d\n",
-                           __func__, port_num);
-               ret = -EINVAL;
-               goto set_zeroes;
-       }
-
-       ppd = dd->pport + (port_num - 1);
-       if (!qsfp_mod_present(ppd)) {
-               ret = -ENODEV;
-               goto set_zeroes;
-       }
-
-       if (!ppd->qsfp_info.cache_valid) {
-               ret = -EINVAL;
-               goto set_zeroes;
-       }
-
-       if (addr >= (QSFP_MAX_NUM_PAGES * 128)) {
-               ret = -ERANGE;
-               goto set_zeroes;
-       }
-
-       if ((addr + len) > (QSFP_MAX_NUM_PAGES * 128)) {
-               excess_len = (addr + len) - (QSFP_MAX_NUM_PAGES * 128);
-               memcpy(data, &ppd->qsfp_info.cache[addr], (len - excess_len));
-               data += (len - excess_len);
-               goto set_zeroes;
-       }
-
-       memcpy(data, &ppd->qsfp_info.cache[addr], len);
-       return 0;
-
-set_zeroes:
-       memset(data, 0, excess_len);
-       return ret;
-}
-
-static const char *pwr_codes[8] = {"N/AW",
-                                 "1.5W",
-                                 "2.0W",
-                                 "2.5W",
-                                 "3.5W",
-                                 "4.0W",
-                                 "4.5W",
-                                 "5.0W"
-                                };
-
-int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len)
-{
-       u8 *cache = &ppd->qsfp_info.cache[0];
-       u8 bin_buff[QSFP_DUMP_CHUNK];
-       char lenstr[6];
-       int sofar;
-       int bidx = 0;
-       u8 *atten = &cache[QSFP_ATTEN_OFFS];
-       u8 *vendor_oui = &cache[QSFP_VOUI_OFFS];
-       u8 power_byte = 0;
-
-       sofar = 0;
-       lenstr[0] = ' ';
-       lenstr[1] = '\0';
-
-       if (ppd->qsfp_info.cache_valid) {
-               if (QSFP_IS_CU(cache[QSFP_MOD_TECH_OFFS]))
-                       sprintf(lenstr, "%dM ", cache[QSFP_MOD_LEN_OFFS]);
-
-               power_byte = cache[QSFP_MOD_PWR_OFFS];
-               sofar += scnprintf(buf + sofar, len - sofar, "PWR:%.3sW\n",
-                               pwr_codes[get_qsfp_power_class(power_byte)]);
-
-               sofar += scnprintf(buf + sofar, len - sofar, "TECH:%s%s\n",
-                               lenstr,
-                       hfi1_qsfp_devtech[(cache[QSFP_MOD_TECH_OFFS]) >> 4]);
-
-               sofar += scnprintf(buf + sofar, len - sofar, "Vendor:%.*s\n",
-                                  QSFP_VEND_LEN, &cache[QSFP_VEND_OFFS]);
-
-               sofar += scnprintf(buf + sofar, len - sofar, "OUI:%06X\n",
-                                  QSFP_OUI(vendor_oui));
-
-               sofar += scnprintf(buf + sofar, len - sofar, "Part#:%.*s\n",
-                                  QSFP_PN_LEN, &cache[QSFP_PN_OFFS]);
-
-               sofar += scnprintf(buf + sofar, len - sofar, "Rev:%.*s\n",
-                                  QSFP_REV_LEN, &cache[QSFP_REV_OFFS]);
-
-               if (QSFP_IS_CU(cache[QSFP_MOD_TECH_OFFS]))
-                       sofar += scnprintf(buf + sofar, len - sofar,
-                               "Atten:%d, %d\n",
-                               QSFP_ATTEN_SDR(atten),
-                               QSFP_ATTEN_DDR(atten));
-
-               sofar += scnprintf(buf + sofar, len - sofar, "Serial:%.*s\n",
-                                  QSFP_SN_LEN, &cache[QSFP_SN_OFFS]);
-
-               sofar += scnprintf(buf + sofar, len - sofar, "Date:%.*s\n",
-                                  QSFP_DATE_LEN, &cache[QSFP_DATE_OFFS]);
-
-               sofar += scnprintf(buf + sofar, len - sofar, "Lot:%.*s\n",
-                                  QSFP_LOT_LEN, &cache[QSFP_LOT_OFFS]);
-
-               while (bidx < QSFP_DEFAULT_HDR_CNT) {
-                       int iidx;
-
-                       memcpy(bin_buff, &cache[bidx], QSFP_DUMP_CHUNK);
-                       for (iidx = 0; iidx < QSFP_DUMP_CHUNK; ++iidx) {
-                               sofar += scnprintf(buf + sofar, len - sofar,
-                                       " %02X", bin_buff[iidx]);
-                       }
-                       sofar += scnprintf(buf + sofar, len - sofar, "\n");
-                       bidx += QSFP_DUMP_CHUNK;
-               }
-       }
-       return sofar;
-}
diff --git a/drivers/staging/rdma/hfi1/qsfp.h b/drivers/staging/rdma/hfi1/qsfp.h
deleted file mode 100644 (file)
index dadc66c..0000000
+++ /dev/null
@@ -1,240 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-/* QSFP support common definitions, for hfi driver */
-
-#define QSFP_DEV 0xA0
-#define QSFP_PWR_LAG_MSEC 2000
-#define QSFP_MODPRS_LAG_MSEC 20
-/* 128 byte pages, per SFF 8636 rev 2.4 */
-#define QSFP_MAX_NUM_PAGES     5
-
-/*
- * Below are masks for QSFP pins.  Pins are the same for HFI0 and HFI1.
- * _N means asserted low
- */
-#define QSFP_HFI0_I2CCLK    BIT(0)
-#define QSFP_HFI0_I2CDAT    BIT(1)
-#define QSFP_HFI0_RESET_N   BIT(2)
-#define QSFP_HFI0_INT_N            BIT(3)
-#define QSFP_HFI0_MODPRST_N BIT(4)
-
-/* QSFP is paged at 256 bytes */
-#define QSFP_PAGESIZE 256
-/* Reads/writes cannot cross 128 byte boundaries */
-#define QSFP_RW_BOUNDARY 128
-
-/* number of bytes in i2c offset for QSFP devices */
-#define __QSFP_OFFSET_SIZE 1                           /* num address bytes */
-#define QSFP_OFFSET_SIZE (__QSFP_OFFSET_SIZE << 8)     /* shifted value */
-
-/* Defined fields that Intel requires of qualified cables */
-/* Byte 0 is Identifier, not checked */
-/* Byte 1 is reserved "status MSB" */
-#define QSFP_TX_CTRL_BYTE_OFFS 86
-#define QSFP_PWR_CTRL_BYTE_OFFS 93
-#define QSFP_CDR_CTRL_BYTE_OFFS 98
-
-#define QSFP_PAGE_SELECT_BYTE_OFFS 127
-/* Byte 128 is Identifier: must be 0x0c for QSFP, or 0x0d for QSFP+ */
-#define QSFP_MOD_ID_OFFS 128
-/*
- * Byte 129 is "Extended Identifier".
- * For bits [7:6]: 0:1.5W, 1:2.0W, 2:2.5W, 3:3.5W
- * For bits [1:0]: 0:Unused, 1:4W, 2:4.5W, 3:5W
- */
-#define QSFP_MOD_PWR_OFFS 129
-/* Byte 130 is Connector type. Not Intel req'd */
-/* Bytes 131..138 are Transceiver types, bit maps for various tech, none IB */
-/* Byte 139 is encoding. code 0x01 is 8b10b. Not Intel req'd */
-/* byte 140 is nominal bit-rate, in units of 100Mbits/sec */
-#define QSFP_NOM_BIT_RATE_100_OFFS 140
-/* Byte 141 is Extended Rate Select. Not Intel req'd */
-/* Bytes 142..145 are lengths for various fiber types. Not Intel req'd */
-/* Byte 146 is length for Copper. Units of 1 meter */
-#define QSFP_MOD_LEN_OFFS 146
-/*
- * Byte 147 is Device technology. D0..3 not Intel req'd
- * D4..7 select from 15 choices, translated by table:
- */
-#define QSFP_MOD_TECH_OFFS 147
-extern const char *const hfi1_qsfp_devtech[16];
-/* Active Equalization includes fiber, copper full EQ, and copper near Eq */
-#define QSFP_IS_ACTIVE(tech) ((0xA2FF >> ((tech) >> 4)) & 1)
-/* Active Equalization includes fiber, copper full EQ, and copper far Eq */
-#define QSFP_IS_ACTIVE_FAR(tech) ((0x32FF >> ((tech) >> 4)) & 1)
-/* Attenuation should be valid for copper other than full/near Eq */
-#define QSFP_HAS_ATTEN(tech) ((0x4D00 >> ((tech) >> 4)) & 1)
-/* Length is only valid if technology is "copper" */
-#define QSFP_IS_CU(tech) ((0xED00 >> ((tech) >> 4)) & 1)
-#define QSFP_TECH_1490 9
-
-#define QSFP_OUI(oui) (((unsigned)oui[0] << 16) | ((unsigned)oui[1] << 8) | \
-                       oui[2])
-#define QSFP_OUI_AMPHENOL 0x415048
-#define QSFP_OUI_FINISAR  0x009065
-#define QSFP_OUI_GORE     0x002177
-
-/* Bytes 148..163 are Vendor Name, Left-justified Blank-filled */
-#define QSFP_VEND_OFFS 148
-#define QSFP_VEND_LEN 16
-/* Byte 164 is IB Extended transceiver codes Bits D0..3 are SDR,DDR,QDR,EDR */
-#define QSFP_IBXCV_OFFS 164
-/* Bytes 165..167 are Vendor OUI number */
-#define QSFP_VOUI_OFFS 165
-#define QSFP_VOUI_LEN 3
-/* Bytes 168..183 are Vendor Part Number, string */
-#define QSFP_PN_OFFS 168
-#define QSFP_PN_LEN 16
-/* Bytes 184,185 are Vendor Rev. Left Justified, Blank-filled */
-#define QSFP_REV_OFFS 184
-#define QSFP_REV_LEN 2
-/*
- * Bytes 186,187 are Wavelength, if Optical. Not Intel req'd
- *  If copper, they are attenuation in dB:
- * Byte 186 is at 2.5Gb/sec (SDR), Byte 187 at 5.0Gb/sec (DDR)
- */
-#define QSFP_ATTEN_OFFS 186
-#define QSFP_ATTEN_LEN 2
-/*
- * Bytes 188,189 are Wavelength tolerance, if optical
- * If copper, they are attenuation in dB:
- * Byte 188 is at 12.5 Gb/s, Byte 189 at 25 Gb/s
- */
-#define QSFP_CU_ATTEN_7G_OFFS 188
-#define QSFP_CU_ATTEN_12G_OFFS 189
-/* Byte 190 is Max Case Temp. Not Intel req'd */
-/* Byte 191 is LSB of sum of bytes 128..190. Not Intel req'd */
-#define QSFP_CC_OFFS 191
-#define QSFP_EQ_INFO_OFFS 193
-#define QSFP_CDR_INFO_OFFS 194
-/* Bytes 196..211 are Serial Number, String */
-#define QSFP_SN_OFFS 196
-#define QSFP_SN_LEN 16
-/* Bytes 212..219 are date-code YYMMDD (MM==1 for Jan) */
-#define QSFP_DATE_OFFS 212
-#define QSFP_DATE_LEN 6
-/* Bytes 218,219 are optional lot-code, string */
-#define QSFP_LOT_OFFS 218
-#define QSFP_LOT_LEN 2
-/* Bytes 220, 221 indicate monitoring options, Not Intel req'd */
-/* Byte 222 indicates nominal bitrate in units of 250Mbits/sec */
-#define QSFP_NOM_BIT_RATE_250_OFFS 222
-/* Byte 223 is LSB of sum of bytes 192..222 */
-#define QSFP_CC_EXT_OFFS 223
-
-/*
- * Interrupt flag masks
- */
-#define QSFP_DATA_NOT_READY            0x01
-
-#define QSFP_HIGH_TEMP_ALARM           0x80
-#define QSFP_LOW_TEMP_ALARM            0x40
-#define QSFP_HIGH_TEMP_WARNING         0x20
-#define QSFP_LOW_TEMP_WARNING          0x10
-
-#define QSFP_HIGH_VCC_ALARM            0x80
-#define QSFP_LOW_VCC_ALARM             0x40
-#define QSFP_HIGH_VCC_WARNING          0x20
-#define QSFP_LOW_VCC_WARNING           0x10
-
-#define QSFP_HIGH_POWER_ALARM          0x88
-#define QSFP_LOW_POWER_ALARM           0x44
-#define QSFP_HIGH_POWER_WARNING                0x22
-#define QSFP_LOW_POWER_WARNING         0x11
-
-#define QSFP_HIGH_BIAS_ALARM           0x88
-#define QSFP_LOW_BIAS_ALARM            0x44
-#define QSFP_HIGH_BIAS_WARNING         0x22
-#define QSFP_LOW_BIAS_WARNING          0x11
-
-#define QSFP_ATTEN_SDR(attenarray) (attenarray[0])
-#define QSFP_ATTEN_DDR(attenarray) (attenarray[1])
-
-/*
- * struct qsfp_data encapsulates state of QSFP device for one port.
- * it will be part of port-specific data if a board supports QSFP.
- *
- * Since multiple board-types use QSFP, and their pport_data structs
- * differ (in the chip-specific section), we need a pointer to its head.
- *
- * Avoiding premature optimization, we will have one work_struct per port,
- * and let the qsfp_lock arbitrate access to common resources.
- *
- */
-struct qsfp_data {
-       /* Helps to find our way */
-       struct hfi1_pportdata *ppd;
-       struct work_struct qsfp_work;
-       u8 cache[QSFP_MAX_NUM_PAGES * 128];
-       /* protect qsfp data */
-       spinlock_t qsfp_lock;
-       u8 check_interrupt_flags;
-       u8 reset_needed;
-       u8 limiting_active;
-       u8 cache_valid;
-       u8 cache_refresh_required;
-};
-
-int refresh_qsfp_cache(struct hfi1_pportdata *ppd,
-                      struct qsfp_data *cp);
-int get_qsfp_power_class(u8 power_byte);
-int qsfp_mod_present(struct hfi1_pportdata *ppd);
-int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr,
-                  u32 len, u8 *data);
-
-int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
-             int offset, void *bp, int len);
-int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
-            int offset, void *bp, int len);
-int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
-              int len);
-int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
-             int len);
-int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
-                  int len);
-int one_qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
-                 int len);
diff --git a/drivers/staging/rdma/hfi1/rc.c b/drivers/staging/rdma/hfi1/rc.c
deleted file mode 100644 (file)
index 792f15e..0000000
+++ /dev/null
@@ -1,2580 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/io.h>
-#include <rdma/rdma_vt.h>
-#include <rdma/rdmavt_qp.h>
-
-#include "hfi.h"
-#include "qp.h"
-#include "verbs_txreq.h"
-#include "trace.h"
-
-/* cut down ridiculously long IB macro names */
-#define OP(x) IB_OPCODE_RC_##x
-
-/**
- * hfi1_add_retry_timer - add/start a retry timer
- * @qp - the QP
- *
- * add a retry timer on the QP
- */
-static inline void hfi1_add_retry_timer(struct rvt_qp *qp)
-{
-       struct ib_qp *ibqp = &qp->ibqp;
-       struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
-
-       qp->s_flags |= RVT_S_TIMER;
-       /* 4.096 usec. * (1 << qp->timeout) */
-       qp->s_timer.expires = jiffies + qp->timeout_jiffies +
-                             rdi->busy_jiffies;
-       add_timer(&qp->s_timer);
-}
-
-/**
- * hfi1_add_rnr_timer - add/start an rnr timer
- * @qp - the QP
- * @to - timeout in usecs
- *
- * add an rnr timer on the QP
- */
-void hfi1_add_rnr_timer(struct rvt_qp *qp, u32 to)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-
-       qp->s_flags |= RVT_S_WAIT_RNR;
-       qp->s_timer.expires = jiffies + usecs_to_jiffies(to);
-       add_timer(&priv->s_rnr_timer);
-}
-
-/**
- * hfi1_mod_retry_timer - mod a retry timer
- * @qp - the QP
- *
- * Modify a potentially already running retry
- * timer
- */
-static inline void hfi1_mod_retry_timer(struct rvt_qp *qp)
-{
-       struct ib_qp *ibqp = &qp->ibqp;
-       struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
-
-       qp->s_flags |= RVT_S_TIMER;
-       /* 4.096 usec. * (1 << qp->timeout) */
-       mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies +
-                 rdi->busy_jiffies);
-}
-
-/**
- * hfi1_stop_retry_timer - stop a retry timer
- * @qp - the QP
- *
- * stop a retry timer and return if the timer
- * had been pending.
- */
-static inline int hfi1_stop_retry_timer(struct rvt_qp *qp)
-{
-       int rval = 0;
-
-       /* Remove QP from retry */
-       if (qp->s_flags & RVT_S_TIMER) {
-               qp->s_flags &= ~RVT_S_TIMER;
-               rval = del_timer(&qp->s_timer);
-       }
-       return rval;
-}
-
-/**
- * hfi1_stop_rc_timers - stop all timers
- * @qp - the QP
- *
- * stop any pending timers
- */
-void hfi1_stop_rc_timers(struct rvt_qp *qp)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-
-       /* Remove QP from all timers */
-       if (qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR)) {
-               qp->s_flags &= ~(RVT_S_TIMER | RVT_S_WAIT_RNR);
-               del_timer(&qp->s_timer);
-               del_timer(&priv->s_rnr_timer);
-       }
-}
-
-/**
- * hfi1_stop_rnr_timer - stop an rnr timer
- * @qp - the QP
- *
- * stop an rnr timer and return if the timer
- * had been pending.
- */
-static inline int hfi1_stop_rnr_timer(struct rvt_qp *qp)
-{
-       int rval = 0;
-       struct hfi1_qp_priv *priv = qp->priv;
-
-       /* Remove QP from rnr timer */
-       if (qp->s_flags & RVT_S_WAIT_RNR) {
-               qp->s_flags &= ~RVT_S_WAIT_RNR;
-               rval = del_timer(&priv->s_rnr_timer);
-       }
-       return rval;
-}
-
-/**
- * hfi1_del_timers_sync - wait for any timeout routines to exit
- * @qp - the QP
- */
-void hfi1_del_timers_sync(struct rvt_qp *qp)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-
-       del_timer_sync(&qp->s_timer);
-       del_timer_sync(&priv->s_rnr_timer);
-}
-
-/* only opcode mask for adaptive pio */
-const u32 rc_only_opcode =
-       BIT(OP(SEND_ONLY) & 0x1f) |
-       BIT(OP(SEND_ONLY_WITH_IMMEDIATE & 0x1f)) |
-       BIT(OP(RDMA_WRITE_ONLY & 0x1f)) |
-       BIT(OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE & 0x1f)) |
-       BIT(OP(RDMA_READ_REQUEST & 0x1f)) |
-       BIT(OP(ACKNOWLEDGE & 0x1f)) |
-       BIT(OP(ATOMIC_ACKNOWLEDGE & 0x1f)) |
-       BIT(OP(COMPARE_SWAP & 0x1f)) |
-       BIT(OP(FETCH_ADD & 0x1f));
-
-static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
-                      u32 psn, u32 pmtu)
-{
-       u32 len;
-
-       len = delta_psn(psn, wqe->psn) * pmtu;
-       ss->sge = wqe->sg_list[0];
-       ss->sg_list = wqe->sg_list + 1;
-       ss->num_sge = wqe->wr.num_sge;
-       ss->total_len = wqe->length;
-       hfi1_skip_sge(ss, len, 0);
-       return wqe->length - len;
-}
-
-/**
- * make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
- * @dev: the device for this QP
- * @qp: a pointer to the QP
- * @ohdr: a pointer to the IB header being constructed
- * @ps: the xmit packet state
- *
- * Return 1 if constructed; otherwise, return 0.
- * Note that we are in the responder's side of the QP context.
- * Note the QP s_lock must be held.
- */
-static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
-                      struct hfi1_other_headers *ohdr,
-                      struct hfi1_pkt_state *ps)
-{
-       struct rvt_ack_entry *e;
-       u32 hwords;
-       u32 len;
-       u32 bth0;
-       u32 bth2;
-       int middle = 0;
-       u32 pmtu = qp->pmtu;
-       struct hfi1_qp_priv *priv = qp->priv;
-
-       /* Don't send an ACK if we aren't supposed to. */
-       if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
-               goto bail;
-
-       /* header size in 32-bit words LRH+BTH = (8+12)/4. */
-       hwords = 5;
-
-       switch (qp->s_ack_state) {
-       case OP(RDMA_READ_RESPONSE_LAST):
-       case OP(RDMA_READ_RESPONSE_ONLY):
-               e = &qp->s_ack_queue[qp->s_tail_ack_queue];
-               if (e->rdma_sge.mr) {
-                       rvt_put_mr(e->rdma_sge.mr);
-                       e->rdma_sge.mr = NULL;
-               }
-               /* FALLTHROUGH */
-       case OP(ATOMIC_ACKNOWLEDGE):
-               /*
-                * We can increment the tail pointer now that the last
-                * response has been sent instead of only being
-                * constructed.
-                */
-               if (++qp->s_tail_ack_queue > HFI1_MAX_RDMA_ATOMIC)
-                       qp->s_tail_ack_queue = 0;
-               /* FALLTHROUGH */
-       case OP(SEND_ONLY):
-       case OP(ACKNOWLEDGE):
-               /* Check for no next entry in the queue. */
-               if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
-                       if (qp->s_flags & RVT_S_ACK_PENDING)
-                               goto normal;
-                       goto bail;
-               }
-
-               e = &qp->s_ack_queue[qp->s_tail_ack_queue];
-               if (e->opcode == OP(RDMA_READ_REQUEST)) {
-                       /*
-                        * If a RDMA read response is being resent and
-                        * we haven't seen the duplicate request yet,
-                        * then stop sending the remaining responses the
-                        * responder has seen until the requester re-sends it.
-                        */
-                       len = e->rdma_sge.sge_length;
-                       if (len && !e->rdma_sge.mr) {
-                               qp->s_tail_ack_queue = qp->r_head_ack_queue;
-                               goto bail;
-                       }
-                       /* Copy SGE state in case we need to resend */
-                       ps->s_txreq->mr = e->rdma_sge.mr;
-                       if (ps->s_txreq->mr)
-                               rvt_get_mr(ps->s_txreq->mr);
-                       qp->s_ack_rdma_sge.sge = e->rdma_sge;
-                       qp->s_ack_rdma_sge.num_sge = 1;
-                       qp->s_cur_sge = &qp->s_ack_rdma_sge;
-                       if (len > pmtu) {
-                               len = pmtu;
-                               qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
-                       } else {
-                               qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
-                               e->sent = 1;
-                       }
-                       ohdr->u.aeth = hfi1_compute_aeth(qp);
-                       hwords++;
-                       qp->s_ack_rdma_psn = e->psn;
-                       bth2 = mask_psn(qp->s_ack_rdma_psn++);
-               } else {
-                       /* COMPARE_SWAP or FETCH_ADD */
-                       qp->s_cur_sge = NULL;
-                       len = 0;
-                       qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
-                       ohdr->u.at.aeth = hfi1_compute_aeth(qp);
-                       ohdr->u.at.atomic_ack_eth[0] =
-                               cpu_to_be32(e->atomic_data >> 32);
-                       ohdr->u.at.atomic_ack_eth[1] =
-                               cpu_to_be32(e->atomic_data);
-                       hwords += sizeof(ohdr->u.at) / sizeof(u32);
-                       bth2 = mask_psn(e->psn);
-                       e->sent = 1;
-               }
-               bth0 = qp->s_ack_state << 24;
-               break;
-
-       case OP(RDMA_READ_RESPONSE_FIRST):
-               qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
-               /* FALLTHROUGH */
-       case OP(RDMA_READ_RESPONSE_MIDDLE):
-               qp->s_cur_sge = &qp->s_ack_rdma_sge;
-               ps->s_txreq->mr = qp->s_ack_rdma_sge.sge.mr;
-               if (ps->s_txreq->mr)
-                       rvt_get_mr(ps->s_txreq->mr);
-               len = qp->s_ack_rdma_sge.sge.sge_length;
-               if (len > pmtu) {
-                       len = pmtu;
-                       middle = HFI1_CAP_IS_KSET(SDMA_AHG);
-               } else {
-                       ohdr->u.aeth = hfi1_compute_aeth(qp);
-                       hwords++;
-                       qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
-                       e = &qp->s_ack_queue[qp->s_tail_ack_queue];
-                       e->sent = 1;
-               }
-               bth0 = qp->s_ack_state << 24;
-               bth2 = mask_psn(qp->s_ack_rdma_psn++);
-               break;
-
-       default:
-normal:
-               /*
-                * Send a regular ACK.
-                * Set the s_ack_state so we wait until after sending
-                * the ACK before setting s_ack_state to ACKNOWLEDGE
-                * (see above).
-                */
-               qp->s_ack_state = OP(SEND_ONLY);
-               qp->s_flags &= ~RVT_S_ACK_PENDING;
-               qp->s_cur_sge = NULL;
-               if (qp->s_nak_state)
-                       ohdr->u.aeth =
-                               cpu_to_be32((qp->r_msn & HFI1_MSN_MASK) |
-                                           (qp->s_nak_state <<
-                                            HFI1_AETH_CREDIT_SHIFT));
-               else
-                       ohdr->u.aeth = hfi1_compute_aeth(qp);
-               hwords++;
-               len = 0;
-               bth0 = OP(ACKNOWLEDGE) << 24;
-               bth2 = mask_psn(qp->s_ack_psn);
-       }
-       qp->s_rdma_ack_cnt++;
-       qp->s_hdrwords = hwords;
-       ps->s_txreq->sde = priv->s_sde;
-       qp->s_cur_size = len;
-       hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle, ps);
-       /* pbc */
-       ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
-       return 1;
-
-bail:
-       qp->s_ack_state = OP(ACKNOWLEDGE);
-       /*
-        * Ensure s_rdma_ack_cnt changes are committed prior to resetting
-        * RVT_S_RESP_PENDING
-        */
-       smp_wmb();
-       qp->s_flags &= ~(RVT_S_RESP_PENDING
-                               | RVT_S_ACK_PENDING
-                               | RVT_S_AHG_VALID);
-       return 0;
-}
-
-/**
- * hfi1_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
- * @qp: a pointer to the QP
- *
- * Assumes s_lock is held.
- *
- * Return 1 if constructed; otherwise, return 0.
- */
-int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-       struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
-       struct hfi1_other_headers *ohdr;
-       struct rvt_sge_state *ss;
-       struct rvt_swqe *wqe;
-       /* header size in 32-bit words LRH+BTH = (8+12)/4. */
-       u32 hwords = 5;
-       u32 len;
-       u32 bth0 = 0;
-       u32 bth2;
-       u32 pmtu = qp->pmtu;
-       char newreq;
-       int middle = 0;
-       int delta;
-
-       ps->s_txreq = get_txreq(ps->dev, qp);
-       if (IS_ERR(ps->s_txreq))
-               goto bail_no_tx;
-
-       ohdr = &ps->s_txreq->phdr.hdr.u.oth;
-       if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
-               ohdr = &ps->s_txreq->phdr.hdr.u.l.oth;
-
-       /* Sending responses has higher priority over sending requests. */
-       if ((qp->s_flags & RVT_S_RESP_PENDING) &&
-           make_rc_ack(dev, qp, ohdr, ps))
-               return 1;
-
-       if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) {
-               if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
-                       goto bail;
-               /* We are in the error state, flush the work request. */
-               smp_read_barrier_depends(); /* see post_one_send() */
-               if (qp->s_last == ACCESS_ONCE(qp->s_head))
-                       goto bail;
-               /* If DMAs are in progress, we can't flush immediately. */
-               if (iowait_sdma_pending(&priv->s_iowait)) {
-                       qp->s_flags |= RVT_S_WAIT_DMA;
-                       goto bail;
-               }
-               clear_ahg(qp);
-               wqe = rvt_get_swqe_ptr(qp, qp->s_last);
-               hfi1_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
-                       IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
-               /* will get called again */
-               goto done_free_tx;
-       }
-
-       if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK))
-               goto bail;
-
-       if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) {
-               if (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) {
-                       qp->s_flags |= RVT_S_WAIT_PSN;
-                       goto bail;
-               }
-               qp->s_sending_psn = qp->s_psn;
-               qp->s_sending_hpsn = qp->s_psn - 1;
-       }
-
-       /* Send a request. */
-       wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
-       switch (qp->s_state) {
-       default:
-               if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK))
-                       goto bail;
-               /*
-                * Resend an old request or start a new one.
-                *
-                * We keep track of the current SWQE so that
-                * we don't reset the "furthest progress" state
-                * if we need to back up.
-                */
-               newreq = 0;
-               if (qp->s_cur == qp->s_tail) {
-                       /* Check if send work queue is empty. */
-                       if (qp->s_tail == qp->s_head) {
-                               clear_ahg(qp);
-                               goto bail;
-                       }
-                       /*
-                        * If a fence is requested, wait for previous
-                        * RDMA read and atomic operations to finish.
-                        */
-                       if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
-                           qp->s_num_rd_atomic) {
-                               qp->s_flags |= RVT_S_WAIT_FENCE;
-                               goto bail;
-                       }
-                       newreq = 1;
-                       qp->s_psn = wqe->psn;
-               }
-               /*
-                * Note that we have to be careful not to modify the
-                * original work request since we may need to resend
-                * it.
-                */
-               len = wqe->length;
-               ss = &qp->s_sge;
-               bth2 = mask_psn(qp->s_psn);
-               switch (wqe->wr.opcode) {
-               case IB_WR_SEND:
-               case IB_WR_SEND_WITH_IMM:
-                       /* If no credit, return. */
-                       if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
-                           cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
-                               qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
-                               goto bail;
-                       }
-                       if (len > pmtu) {
-                               qp->s_state = OP(SEND_FIRST);
-                               len = pmtu;
-                               break;
-                       }
-                       if (wqe->wr.opcode == IB_WR_SEND) {
-                               qp->s_state = OP(SEND_ONLY);
-                       } else {
-                               qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
-                               /* Immediate data comes after the BTH */
-                               ohdr->u.imm_data = wqe->wr.ex.imm_data;
-                               hwords += 1;
-                       }
-                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-                               bth0 |= IB_BTH_SOLICITED;
-                       bth2 |= IB_BTH_REQ_ACK;
-                       if (++qp->s_cur == qp->s_size)
-                               qp->s_cur = 0;
-                       break;
-
-               case IB_WR_RDMA_WRITE:
-                       if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
-                               qp->s_lsn++;
-                       /* FALLTHROUGH */
-               case IB_WR_RDMA_WRITE_WITH_IMM:
-                       /* If no credit, return. */
-                       if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
-                           cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
-                               qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
-                               goto bail;
-                       }
-                       ohdr->u.rc.reth.vaddr =
-                               cpu_to_be64(wqe->rdma_wr.remote_addr);
-                       ohdr->u.rc.reth.rkey =
-                               cpu_to_be32(wqe->rdma_wr.rkey);
-                       ohdr->u.rc.reth.length = cpu_to_be32(len);
-                       hwords += sizeof(struct ib_reth) / sizeof(u32);
-                       if (len > pmtu) {
-                               qp->s_state = OP(RDMA_WRITE_FIRST);
-                               len = pmtu;
-                               break;
-                       }
-                       if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
-                               qp->s_state = OP(RDMA_WRITE_ONLY);
-                       } else {
-                               qp->s_state =
-                                       OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
-                               /* Immediate data comes after RETH */
-                               ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
-                               hwords += 1;
-                               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-                                       bth0 |= IB_BTH_SOLICITED;
-                       }
-                       bth2 |= IB_BTH_REQ_ACK;
-                       if (++qp->s_cur == qp->s_size)
-                               qp->s_cur = 0;
-                       break;
-
-               case IB_WR_RDMA_READ:
-                       /*
-                        * Don't allow more operations to be started
-                        * than the QP limits allow.
-                        */
-                       if (newreq) {
-                               if (qp->s_num_rd_atomic >=
-                                   qp->s_max_rd_atomic) {
-                                       qp->s_flags |= RVT_S_WAIT_RDMAR;
-                                       goto bail;
-                               }
-                               qp->s_num_rd_atomic++;
-                               if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
-                                       qp->s_lsn++;
-                       }
-                       ohdr->u.rc.reth.vaddr =
-                               cpu_to_be64(wqe->rdma_wr.remote_addr);
-                       ohdr->u.rc.reth.rkey =
-                               cpu_to_be32(wqe->rdma_wr.rkey);
-                       ohdr->u.rc.reth.length = cpu_to_be32(len);
-                       qp->s_state = OP(RDMA_READ_REQUEST);
-                       hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
-                       ss = NULL;
-                       len = 0;
-                       bth2 |= IB_BTH_REQ_ACK;
-                       if (++qp->s_cur == qp->s_size)
-                               qp->s_cur = 0;
-                       break;
-
-               case IB_WR_ATOMIC_CMP_AND_SWP:
-               case IB_WR_ATOMIC_FETCH_AND_ADD:
-                       /*
-                        * Don't allow more operations to be started
-                        * than the QP limits allow.
-                        */
-                       if (newreq) {
-                               if (qp->s_num_rd_atomic >=
-                                   qp->s_max_rd_atomic) {
-                                       qp->s_flags |= RVT_S_WAIT_RDMAR;
-                                       goto bail;
-                               }
-                               qp->s_num_rd_atomic++;
-                               if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
-                                       qp->s_lsn++;
-                       }
-                       if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
-                               qp->s_state = OP(COMPARE_SWAP);
-                               ohdr->u.atomic_eth.swap_data = cpu_to_be64(
-                                       wqe->atomic_wr.swap);
-                               ohdr->u.atomic_eth.compare_data = cpu_to_be64(
-                                       wqe->atomic_wr.compare_add);
-                       } else {
-                               qp->s_state = OP(FETCH_ADD);
-                               ohdr->u.atomic_eth.swap_data = cpu_to_be64(
-                                       wqe->atomic_wr.compare_add);
-                               ohdr->u.atomic_eth.compare_data = 0;
-                       }
-                       ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32(
-                               wqe->atomic_wr.remote_addr >> 32);
-                       ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32(
-                               wqe->atomic_wr.remote_addr);
-                       ohdr->u.atomic_eth.rkey = cpu_to_be32(
-                               wqe->atomic_wr.rkey);
-                       hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
-                       ss = NULL;
-                       len = 0;
-                       bth2 |= IB_BTH_REQ_ACK;
-                       if (++qp->s_cur == qp->s_size)
-                               qp->s_cur = 0;
-                       break;
-
-               default:
-                       goto bail;
-               }
-               qp->s_sge.sge = wqe->sg_list[0];
-               qp->s_sge.sg_list = wqe->sg_list + 1;
-               qp->s_sge.num_sge = wqe->wr.num_sge;
-               qp->s_sge.total_len = wqe->length;
-               qp->s_len = wqe->length;
-               if (newreq) {
-                       qp->s_tail++;
-                       if (qp->s_tail >= qp->s_size)
-                               qp->s_tail = 0;
-               }
-               if (wqe->wr.opcode == IB_WR_RDMA_READ)
-                       qp->s_psn = wqe->lpsn + 1;
-               else
-                       qp->s_psn++;
-               break;
-
-       case OP(RDMA_READ_RESPONSE_FIRST):
-               /*
-                * qp->s_state is normally set to the opcode of the
-                * last packet constructed for new requests and therefore
-                * is never set to RDMA read response.
-                * RDMA_READ_RESPONSE_FIRST is used by the ACK processing
-                * thread to indicate a SEND needs to be restarted from an
-                * earlier PSN without interfering with the sending thread.
-                * See restart_rc().
-                */
-               qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
-               /* FALLTHROUGH */
-       case OP(SEND_FIRST):
-               qp->s_state = OP(SEND_MIDDLE);
-               /* FALLTHROUGH */
-       case OP(SEND_MIDDLE):
-               bth2 = mask_psn(qp->s_psn++);
-               ss = &qp->s_sge;
-               len = qp->s_len;
-               if (len > pmtu) {
-                       len = pmtu;
-                       middle = HFI1_CAP_IS_KSET(SDMA_AHG);
-                       break;
-               }
-               if (wqe->wr.opcode == IB_WR_SEND) {
-                       qp->s_state = OP(SEND_LAST);
-               } else {
-                       qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
-                       /* Immediate data comes after the BTH */
-                       ohdr->u.imm_data = wqe->wr.ex.imm_data;
-                       hwords += 1;
-               }
-               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-                       bth0 |= IB_BTH_SOLICITED;
-               bth2 |= IB_BTH_REQ_ACK;
-               qp->s_cur++;
-               if (qp->s_cur >= qp->s_size)
-                       qp->s_cur = 0;
-               break;
-
-       case OP(RDMA_READ_RESPONSE_LAST):
-               /*
-                * qp->s_state is normally set to the opcode of the
-                * last packet constructed for new requests and therefore
-                * is never set to RDMA read response.
-                * RDMA_READ_RESPONSE_LAST is used by the ACK processing
-                * thread to indicate a RDMA write needs to be restarted from
-                * an earlier PSN without interfering with the sending thread.
-                * See restart_rc().
-                */
-               qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
-               /* FALLTHROUGH */
-       case OP(RDMA_WRITE_FIRST):
-               qp->s_state = OP(RDMA_WRITE_MIDDLE);
-               /* FALLTHROUGH */
-       case OP(RDMA_WRITE_MIDDLE):
-               bth2 = mask_psn(qp->s_psn++);
-               ss = &qp->s_sge;
-               len = qp->s_len;
-               if (len > pmtu) {
-                       len = pmtu;
-                       middle = HFI1_CAP_IS_KSET(SDMA_AHG);
-                       break;
-               }
-               if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
-                       qp->s_state = OP(RDMA_WRITE_LAST);
-               } else {
-                       qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
-                       /* Immediate data comes after the BTH */
-                       ohdr->u.imm_data = wqe->wr.ex.imm_data;
-                       hwords += 1;
-                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-                               bth0 |= IB_BTH_SOLICITED;
-               }
-               bth2 |= IB_BTH_REQ_ACK;
-               qp->s_cur++;
-               if (qp->s_cur >= qp->s_size)
-                       qp->s_cur = 0;
-               break;
-
-       case OP(RDMA_READ_RESPONSE_MIDDLE):
-               /*
-                * qp->s_state is normally set to the opcode of the
-                * last packet constructed for new requests and therefore
-                * is never set to RDMA read response.
-                * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing
-                * thread to indicate a RDMA read needs to be restarted from
-                * an earlier PSN without interfering with the sending thread.
-                * See restart_rc().
-                */
-               len = (delta_psn(qp->s_psn, wqe->psn)) * pmtu;
-               ohdr->u.rc.reth.vaddr =
-                       cpu_to_be64(wqe->rdma_wr.remote_addr + len);
-               ohdr->u.rc.reth.rkey =
-                       cpu_to_be32(wqe->rdma_wr.rkey);
-               ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len);
-               qp->s_state = OP(RDMA_READ_REQUEST);
-               hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
-               bth2 = mask_psn(qp->s_psn) | IB_BTH_REQ_ACK;
-               qp->s_psn = wqe->lpsn + 1;
-               ss = NULL;
-               len = 0;
-               qp->s_cur++;
-               if (qp->s_cur == qp->s_size)
-                       qp->s_cur = 0;
-               break;
-       }
-       qp->s_sending_hpsn = bth2;
-       delta = delta_psn(bth2, wqe->psn);
-       if (delta && delta % HFI1_PSN_CREDIT == 0)
-               bth2 |= IB_BTH_REQ_ACK;
-       if (qp->s_flags & RVT_S_SEND_ONE) {
-               qp->s_flags &= ~RVT_S_SEND_ONE;
-               qp->s_flags |= RVT_S_WAIT_ACK;
-               bth2 |= IB_BTH_REQ_ACK;
-       }
-       qp->s_len -= len;
-       qp->s_hdrwords = hwords;
-       ps->s_txreq->sde = priv->s_sde;
-       qp->s_cur_sge = ss;
-       qp->s_cur_size = len;
-       hfi1_make_ruc_header(
-               qp,
-               ohdr,
-               bth0 | (qp->s_state << 24),
-               bth2,
-               middle,
-               ps);
-       /* pbc */
-       ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
-       return 1;
-
-done_free_tx:
-       hfi1_put_txreq(ps->s_txreq);
-       ps->s_txreq = NULL;
-       return 1;
-
-bail:
-       hfi1_put_txreq(ps->s_txreq);
-
-bail_no_tx:
-       ps->s_txreq = NULL;
-       qp->s_flags &= ~RVT_S_BUSY;
-       qp->s_hdrwords = 0;
-       return 0;
-}
-
-/**
- * hfi1_send_rc_ack - Construct an ACK packet and send it
- * @qp: a pointer to the QP
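- * @rcd: the receive context
- * @is_fecn: non-zero if a FECN was received, so the BECN bit is set in the ACK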
- *
- * This is called from hfi1_rc_rcv() and handle_receive_interrupt().
- * Note that RDMA reads and atomics are handled in the
- * send side QP state and tasklet.
- */
-void hfi1_send_rc_ack(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp,
-                     int is_fecn)
-{
-       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       u64 pbc, pbc_flags = 0;
-       u16 lrh0;
-       u16 sc5;
-       u32 bth0;
-       u32 hwords;
-       u32 vl, plen;
-       struct send_context *sc;
-       struct pio_buf *pbuf;
-       struct hfi1_ib_header hdr;
-       struct hfi1_other_headers *ohdr;
-       unsigned long flags;
-
-       /* Don't send ACK or NAK if an RDMA read or atomic is pending. */
-       if (qp->s_flags & RVT_S_RESP_PENDING)
-               goto queue_ack;
-
-       /* Ensure s_rdma_ack_cnt changes are committed */
-       smp_read_barrier_depends();
-       if (qp->s_rdma_ack_cnt)
-               goto queue_ack;
-
-       /* Construct the header */
-       /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4 */
-       hwords = 6;
-       if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
-               hwords += hfi1_make_grh(ibp, &hdr.u.l.grh,
-                                      &qp->remote_ah_attr.grh, hwords, 0);
-               ohdr = &hdr.u.l.oth;
-               lrh0 = HFI1_LRH_GRH;
-       } else {
-               ohdr = &hdr.u.oth;
-               lrh0 = HFI1_LRH_BTH;
-       }
-       /* read pkey_index w/o lock (it's atomic) */
-       bth0 = hfi1_get_pkey(ibp, qp->s_pkey_index) | (OP(ACKNOWLEDGE) << 24);
-       if (qp->s_mig_state == IB_MIG_MIGRATED)
-               bth0 |= IB_BTH_MIG_REQ;
-       if (qp->r_nak_state)
-               ohdr->u.aeth = cpu_to_be32((qp->r_msn & HFI1_MSN_MASK) |
-                                           (qp->r_nak_state <<
-                                            HFI1_AETH_CREDIT_SHIFT));
-       else
-               ohdr->u.aeth = hfi1_compute_aeth(qp);
-       sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
-       /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
-       pbc_flags |= ((!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT);
-       lrh0 |= (sc5 & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4;
-       hdr.lrh[0] = cpu_to_be16(lrh0);
-       hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
-       hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
-       hdr.lrh[3] = cpu_to_be16(ppd->lid | qp->remote_ah_attr.src_path_bits);
-       ohdr->bth[0] = cpu_to_be32(bth0);
-       ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
-       ohdr->bth[1] |= cpu_to_be32((!!is_fecn) << HFI1_BECN_SHIFT);
-       ohdr->bth[2] = cpu_to_be32(mask_psn(qp->r_ack_psn));
-
-       /* Don't try to send ACKs if the link isn't ACTIVE */
-       if (driver_lstate(ppd) != IB_PORT_ACTIVE)
-               return;
-
-       sc = rcd->sc;
-       plen = 2 /* PBC */ + hwords;
-       vl = sc_to_vlt(ppd->dd, sc5);
-       pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
-
-       pbuf = sc_buffer_alloc(sc, plen, NULL, NULL);
-       if (!pbuf) {
-               /*
-                * We have no room to send at the moment.  Pass
-                * responsibility for sending the ACK to the send tasklet
-                * so that when enough buffer space becomes available,
-                * the ACK is sent ahead of other outgoing packets.
-                */
-               goto queue_ack;
-       }
-
-       trace_ack_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &hdr);
-
-       /* write the pbc and data */
-       ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc, &hdr, hwords);
-
-       return;
-
-queue_ack:
-       this_cpu_inc(*ibp->rvp.rc_qacks);
-       spin_lock_irqsave(&qp->s_lock, flags);
-       qp->s_flags |= RVT_S_ACK_PENDING | RVT_S_RESP_PENDING;
-       qp->s_nak_state = qp->r_nak_state;
-       qp->s_ack_psn = qp->r_ack_psn;
-       if (is_fecn)
-               qp->s_flags |= RVT_S_ECN;
-
-       /* Schedule the send tasklet. */
-       hfi1_schedule_send(qp);
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-}
-
-/**
- * reset_psn - reset the QP state to send starting from PSN
- * @qp: the QP
- * @psn: the packet sequence number to restart at
- *
- * This is called from hfi1_rc_rcv() to process an incoming RC ACK
- * for the given QP.
- * Called at interrupt level with the QP s_lock held.
- */
-static void reset_psn(struct rvt_qp *qp, u32 psn)
-{
-       u32 n = qp->s_acked;
-       struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n);
-       u32 opcode;
-
-       qp->s_cur = n;
-
-       /*
-        * If we are starting the request from the beginning,
-        * let the normal send code handle initialization.
-        */
-       if (cmp_psn(psn, wqe->psn) <= 0) {
-               qp->s_state = OP(SEND_LAST);
-               goto done;
-       }
-
-       /* Find the work request opcode corresponding to the given PSN. */
-       opcode = wqe->wr.opcode;
-       for (;;) {
-               int diff;
-
-               if (++n == qp->s_size)
-                       n = 0;
-               if (n == qp->s_tail)
-                       break;
-               wqe = rvt_get_swqe_ptr(qp, n);
-               diff = cmp_psn(psn, wqe->psn);
-               if (diff < 0)
-                       break;
-               qp->s_cur = n;
-               /*
-                * If we are starting the request from the beginning,
-                * let the normal send code handle initialization.
-                */
-               if (diff == 0) {
-                       qp->s_state = OP(SEND_LAST);
-                       goto done;
-               }
-               opcode = wqe->wr.opcode;
-       }
-
-       /*
-        * Set the state to restart in the middle of a request.
-        * Don't change the s_sge, s_cur_sge, or s_cur_size.
-        * See hfi1_make_rc_req().
-        */
-       switch (opcode) {
-       case IB_WR_SEND:
-       case IB_WR_SEND_WITH_IMM:
-               qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
-               break;
-
-       case IB_WR_RDMA_WRITE:
-       case IB_WR_RDMA_WRITE_WITH_IMM:
-               qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
-               break;
-
-       case IB_WR_RDMA_READ:
-               qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
-               break;
-
-       default:
-               /*
-                * This case shouldn't happen since there is only
-                * one PSN per request.
-                */
-               qp->s_state = OP(SEND_LAST);
-       }
-done:
-       qp->s_psn = psn;
-       /*
-        * Set RVT_S_WAIT_PSN as rc_complete() may start the timer
-        * asynchronously before the send tasklet can get scheduled.
-        * Doing it in hfi1_make_rc_req() is too late.
-        */
-       if ((cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) &&
-           (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
-               qp->s_flags |= RVT_S_WAIT_PSN;
-       qp->s_flags &= ~RVT_S_AHG_VALID;
-}
-
-/*
- * Back up requester to resend the last un-ACKed request.
- * The QP r_lock and s_lock should be held and interrupts disabled.
- */
-static void restart_rc(struct rvt_qp *qp, u32 psn, int wait)
-{
-       struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
-       struct hfi1_ibport *ibp;
-
-       if (qp->s_retry == 0) {
-               if (qp->s_mig_state == IB_MIG_ARMED) {
-                       hfi1_migrate_qp(qp);
-                       qp->s_retry = qp->s_retry_cnt;
-               } else if (qp->s_last == qp->s_acked) {
-                       hfi1_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
-                       rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
-                       return;
-               } else { /* need to handle delayed completion */
-                       return;
-               }
-       } else {
-               qp->s_retry--;
-       }
-
-       ibp = to_iport(qp->ibqp.device, qp->port_num);
-       if (wqe->wr.opcode == IB_WR_RDMA_READ)
-               ibp->rvp.n_rc_resends++;
-       else
-               ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
-
-       qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR |
-                        RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN |
-                        RVT_S_WAIT_ACK);
-       if (wait)
-               qp->s_flags |= RVT_S_SEND_ONE;
-       reset_psn(qp, psn);
-}
-
-/*
- * This is called from s_timer for missing responses.
- */
-void hfi1_rc_timeout(unsigned long arg)
-{
-       struct rvt_qp *qp = (struct rvt_qp *)arg;
-       struct hfi1_ibport *ibp;
-       unsigned long flags;
-
-       spin_lock_irqsave(&qp->r_lock, flags);
-       spin_lock(&qp->s_lock);
-       if (qp->s_flags & RVT_S_TIMER) {
-               ibp = to_iport(qp->ibqp.device, qp->port_num);
-               ibp->rvp.n_rc_timeouts++;
-               qp->s_flags &= ~RVT_S_TIMER;
-               del_timer(&qp->s_timer);
-               trace_hfi1_rc_timeout(qp, qp->s_last_psn + 1);
-               restart_rc(qp, qp->s_last_psn + 1, 1);
-               hfi1_schedule_send(qp);
-       }
-       spin_unlock(&qp->s_lock);
-       spin_unlock_irqrestore(&qp->r_lock, flags);
-}
-
-/*
- * This is called from s_timer for RNR timeouts.
- */
-void hfi1_rc_rnr_retry(unsigned long arg)
-{
-       struct rvt_qp *qp = (struct rvt_qp *)arg;
-       unsigned long flags;
-
-       spin_lock_irqsave(&qp->s_lock, flags);
-       hfi1_stop_rnr_timer(qp);
-       hfi1_schedule_send(qp);
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-}
-
-/*
- * Set qp->s_sending_psn to the next PSN after the given one.
- * This would be psn+1 except when RDMA reads are present.
- */
-static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
-{
-       struct rvt_swqe *wqe;
-       u32 n = qp->s_last;
-
-       /* Find the work request corresponding to the given PSN. */
-       for (;;) {
-               wqe = rvt_get_swqe_ptr(qp, n);
-               if (cmp_psn(psn, wqe->lpsn) <= 0) {
-                       if (wqe->wr.opcode == IB_WR_RDMA_READ)
-                               qp->s_sending_psn = wqe->lpsn + 1;
-                       else
-                               qp->s_sending_psn = psn + 1;
-                       break;
-               }
-               if (++n == qp->s_size)
-                       n = 0;
-               if (n == qp->s_tail)
-                       break;
-       }
-}
-
-/*
- * This should be called with the QP s_lock held and interrupts disabled.
- */
-void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_ib_header *hdr)
-{
-       struct hfi1_other_headers *ohdr;
-       struct rvt_swqe *wqe;
-       struct ib_wc wc;
-       unsigned i;
-       u32 opcode;
-       u32 psn;
-
-       if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND))
-               return;
-
-       /* Find out where the BTH is */
-       if ((be16_to_cpu(hdr->lrh[0]) & 3) == HFI1_LRH_BTH)
-               ohdr = &hdr->u.oth;
-       else
-               ohdr = &hdr->u.l.oth;
-
-       opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
-       if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
-           opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
-               WARN_ON(!qp->s_rdma_ack_cnt);
-               qp->s_rdma_ack_cnt--;
-               return;
-       }
-
-       psn = be32_to_cpu(ohdr->bth[2]);
-       reset_sending_psn(qp, psn);
-
-       /*
-        * Start timer after a packet requesting an ACK has been sent and
-        * there are still requests that haven't been acked.
-        */
-       if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
-           !(qp->s_flags &
-               (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
-               (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
-               hfi1_add_retry_timer(qp);
-
-       while (qp->s_last != qp->s_acked) {
-               u32 s_last;
-
-               wqe = rvt_get_swqe_ptr(qp, qp->s_last);
-               if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 &&
-                   cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
-                       break;
-               s_last = qp->s_last;
-               if (++s_last >= qp->s_size)
-                       s_last = 0;
-               qp->s_last = s_last;
-               /* see post_send() */
-               barrier();
-               for (i = 0; i < wqe->wr.num_sge; i++) {
-                       struct rvt_sge *sge = &wqe->sg_list[i];
-
-                       rvt_put_mr(sge->mr);
-               }
-               /* Post a send completion queue entry if requested. */
-               if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) ||
-                   (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
-                       memset(&wc, 0, sizeof(wc));
-                       wc.wr_id = wqe->wr.wr_id;
-                       wc.status = IB_WC_SUCCESS;
-                       wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
-                       wc.byte_len = wqe->length;
-                       wc.qp = &qp->ibqp;
-                       rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc, 0);
-               }
-       }
-       /*
-        * If we were waiting for sends to complete before re-sending,
-        * and they are now complete, restart sending.
-        */
-       trace_hfi1_rc_sendcomplete(qp, psn);
-       if (qp->s_flags & RVT_S_WAIT_PSN &&
-           cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
-               qp->s_flags &= ~RVT_S_WAIT_PSN;
-               qp->s_sending_psn = qp->s_psn;
-               qp->s_sending_hpsn = qp->s_psn - 1;
-               hfi1_schedule_send(qp);
-       }
-}
-
-static inline void update_last_psn(struct rvt_qp *qp, u32 psn)
-{
-       qp->s_last_psn = psn;
-}
-
-/*
- * Generate a SWQE completion.
- * This is similar to hfi1_send_complete but has to check to be sure
- * that the SGEs are not being referenced if the SWQE is being resent.
- */
-static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
-                                        struct rvt_swqe *wqe,
-                                        struct hfi1_ibport *ibp)
-{
-       struct ib_wc wc;
-       unsigned i;
-
-       /*
-        * Don't decrement refcount and don't generate a
-        * completion if the SWQE is being resent until the send
-        * is finished.
-        */
-       if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 ||
-           cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
-               u32 s_last;
-
-               for (i = 0; i < wqe->wr.num_sge; i++) {
-                       struct rvt_sge *sge = &wqe->sg_list[i];
-
-                       rvt_put_mr(sge->mr);
-               }
-               s_last = qp->s_last;
-               if (++s_last >= qp->s_size)
-                       s_last = 0;
-               qp->s_last = s_last;
-               /* see post_send() */
-               barrier();
-               /* Post a send completion queue entry if requested. */
-               if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) ||
-                   (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
-                       memset(&wc, 0, sizeof(wc));
-                       wc.wr_id = wqe->wr.wr_id;
-                       wc.status = IB_WC_SUCCESS;
-                       wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
-                       wc.byte_len = wqe->length;
-                       wc.qp = &qp->ibqp;
-                       rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc, 0);
-               }
-       } else {
-               struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-
-               this_cpu_inc(*ibp->rvp.rc_delayed_comp);
-               /*
-                * If send progress is not running, attempt to progress the
-                * SDMA queue.
-                */
-               if (ppd->dd->flags & HFI1_HAS_SEND_DMA) {
-                       struct sdma_engine *engine;
-                       u8 sc5;
-
-                       /* For now use sc to find engine */
-                       sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
-                       engine = qp_to_sdma_engine(qp, sc5);
-                       sdma_engine_progress_schedule(engine);
-               }
-       }
-
-       qp->s_retry = qp->s_retry_cnt;
-       update_last_psn(qp, wqe->lpsn);
-
-       /*
-        * If we are completing a request which is in the process of
-        * being resent, we can stop re-sending it since we know the
-        * responder has already seen it.
-        */
-       if (qp->s_acked == qp->s_cur) {
-               if (++qp->s_cur >= qp->s_size)
-                       qp->s_cur = 0;
-               qp->s_acked = qp->s_cur;
-               wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
-               if (qp->s_acked != qp->s_tail) {
-                       qp->s_state = OP(SEND_LAST);
-                       qp->s_psn = wqe->psn;
-               }
-       } else {
-               if (++qp->s_acked >= qp->s_size)
-                       qp->s_acked = 0;
-               if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur)
-                       qp->s_draining = 0;
-               wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
-       }
-       return wqe;
-}
-
-/**
- * do_rc_ack - process an incoming RC ACK
- * @qp: the QP the ACK came in on
- * @psn: the packet sequence number of the ACK
- * @opcode: the opcode of the response packet carrying the ACK
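- * @aeth: the ACK extended transport header from the packet
- * @val: the data returned with an atomic acknowledge (ATOMIC_ACKNOWLEDGE)
- * @rcd: the receive context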
- *
- * This is called from rc_rcv_resp() to process an incoming RC ACK
- * for the given QP.
- * May be called at interrupt level, with the QP s_lock held.
- * Returns 1 if OK, 0 if current operation should be aborted (NAK).
- */
-static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
-                    u64 val, struct hfi1_ctxtdata *rcd)
-{
-       struct hfi1_ibport *ibp;
-       enum ib_wc_status status;
-       struct rvt_swqe *wqe;
-       int ret = 0;
-       u32 ack_psn;
-       int diff;
-       unsigned long to;
-
-       /*
-        * Note that NAKs implicitly ACK outstanding SEND and RDMA write
-        * requests and implicitly NAK RDMA read and atomic requests issued
-        * before the NAK'ed request.  The MSN won't include the NAK'ed
-        * request but will include the ACK'ed request(s).
-        */
-       ack_psn = psn;
-       if (aeth >> 29)
-               ack_psn--;
-       wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
-       ibp = to_iport(qp->ibqp.device, qp->port_num);
-
-       /*
-        * The MSN might be for a later WQE than the PSN indicates so
-        * only complete WQEs that the PSN finishes.
-        */
-       while ((diff = delta_psn(ack_psn, wqe->lpsn)) >= 0) {
-               /*
-                * RDMA_READ_RESPONSE_ONLY is a special case since
-                * we want to generate completion events for everything
-                * before the RDMA read, copy the data, then generate
-                * the completion for the read.
-                */
-               if (wqe->wr.opcode == IB_WR_RDMA_READ &&
-                   opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
-                   diff == 0) {
-                       ret = 1;
-                       goto bail_stop;
-               }
-               /*
-                * If this request is an RDMA read or atomic, and the ACK is
-                * for a later operation, this ACK NAKs the RDMA read or
-                * atomic.  In other words, only an RDMA_READ_LAST or ONLY
-                * can ACK an RDMA read, and likewise for atomic ops.  Note
-                * that the NAK case can only happen if relaxed ordering is
-                * used and requests are sent after an RDMA read or atomic
-                * is sent but before the response is received.
-                */
-               if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
-                    (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
-                   ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
-                     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
-                    (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
-                       /* Retry this request. */
-                       if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
-                               qp->r_flags |= RVT_R_RDMAR_SEQ;
-                               restart_rc(qp, qp->s_last_psn + 1, 0);
-                               if (list_empty(&qp->rspwait)) {
-                                       qp->r_flags |= RVT_R_RSP_SEND;
-                                       atomic_inc(&qp->refcount);
-                                       list_add_tail(&qp->rspwait,
-                                                     &rcd->qp_wait_list);
-                               }
-                       }
-                       /*
-                        * No need to process the ACK/NAK since we are
-                        * restarting an earlier request.
-                        */
-                       goto bail_stop;
-               }
-               if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
-                   wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
-                       u64 *vaddr = wqe->sg_list[0].vaddr;
-                       *vaddr = val;
-               }
-               if (qp->s_num_rd_atomic &&
-                   (wqe->wr.opcode == IB_WR_RDMA_READ ||
-                    wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
-                    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
-                       qp->s_num_rd_atomic--;
-                       /* Restart sending task if fence is complete */
-                       if ((qp->s_flags & RVT_S_WAIT_FENCE) &&
-                           !qp->s_num_rd_atomic) {
-                               qp->s_flags &= ~(RVT_S_WAIT_FENCE |
-                                                RVT_S_WAIT_ACK);
-                               hfi1_schedule_send(qp);
-                       } else if (qp->s_flags & RVT_S_WAIT_RDMAR) {
-                               qp->s_flags &= ~(RVT_S_WAIT_RDMAR |
-                                                RVT_S_WAIT_ACK);
-                               hfi1_schedule_send(qp);
-                       }
-               }
-               wqe = do_rc_completion(qp, wqe, ibp);
-               if (qp->s_acked == qp->s_tail)
-                       break;
-       }
-
-       switch (aeth >> 29) {
-       case 0:         /* ACK */
-               this_cpu_inc(*ibp->rvp.rc_acks);
-               if (qp->s_acked != qp->s_tail) {
-                       /*
-                        * We are expecting more ACKs so
-                        * mod the retry timer.
-                        */
-                       hfi1_mod_retry_timer(qp);
-                       /*
-                        * We can stop re-sending the earlier packets and
-                        * continue with the next packet the receiver wants.
-                        */
-                       if (cmp_psn(qp->s_psn, psn) <= 0)
-                               reset_psn(qp, psn + 1);
-               } else {
-                       /* No more acks - kill all timers */
-                       hfi1_stop_rc_timers(qp);
-                       if (cmp_psn(qp->s_psn, psn) <= 0) {
-                               qp->s_state = OP(SEND_LAST);
-                               qp->s_psn = psn + 1;
-                       }
-               }
-               if (qp->s_flags & RVT_S_WAIT_ACK) {
-                       qp->s_flags &= ~RVT_S_WAIT_ACK;
-                       hfi1_schedule_send(qp);
-               }
-               hfi1_get_credit(qp, aeth);
-               qp->s_rnr_retry = qp->s_rnr_retry_cnt;
-               qp->s_retry = qp->s_retry_cnt;
-               update_last_psn(qp, psn);
-               return 1;
-
-       case 1:         /* RNR NAK */
-               ibp->rvp.n_rnr_naks++;
-               if (qp->s_acked == qp->s_tail)
-                       goto bail_stop;
-               if (qp->s_flags & RVT_S_WAIT_RNR)
-                       goto bail_stop;
-               if (qp->s_rnr_retry == 0) {
-                       status = IB_WC_RNR_RETRY_EXC_ERR;
-                       goto class_b;
-               }
-               if (qp->s_rnr_retry_cnt < 7)
-                       qp->s_rnr_retry--;
-
-               /* The last valid PSN is the previous PSN. */
-               update_last_psn(qp, psn - 1);
-
-               ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
-
-               reset_psn(qp, psn);
-
-               qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK);
-               hfi1_stop_rc_timers(qp);
-               to =
-                       ib_hfi1_rnr_table[(aeth >> HFI1_AETH_CREDIT_SHIFT) &
-                                          HFI1_AETH_CREDIT_MASK];
-               hfi1_add_rnr_timer(qp, to);
-               return 0;
-
-       case 3:         /* NAK */
-               if (qp->s_acked == qp->s_tail)
-                       goto bail_stop;
-               /* The last valid PSN is the previous PSN. */
-               update_last_psn(qp, psn - 1);
-               switch ((aeth >> HFI1_AETH_CREDIT_SHIFT) &
-                       HFI1_AETH_CREDIT_MASK) {
-               case 0: /* PSN sequence error */
-                       ibp->rvp.n_seq_naks++;
-                       /*
-                        * Back up to the responder's expected PSN.
-                        * Note that we might get a NAK in the middle of an
-                        * RDMA READ response which terminates the RDMA
-                        * READ.
-                        */
-                       restart_rc(qp, psn, 0);
-                       hfi1_schedule_send(qp);
-                       break;
-
-               case 1: /* Invalid Request */
-                       status = IB_WC_REM_INV_REQ_ERR;
-                       ibp->rvp.n_other_naks++;
-                       goto class_b;
-
-               case 2: /* Remote Access Error */
-                       status = IB_WC_REM_ACCESS_ERR;
-                       ibp->rvp.n_other_naks++;
-                       goto class_b;
-
-               case 3: /* Remote Operation Error */
-                       status = IB_WC_REM_OP_ERR;
-                       ibp->rvp.n_other_naks++;
-class_b:
-                       if (qp->s_last == qp->s_acked) {
-                               hfi1_send_complete(qp, wqe, status);
-                               rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
-                       }
-                       break;
-
-               default:
-                       /* Ignore other reserved NAK error codes */
-                       goto reserved;
-               }
-               qp->s_retry = qp->s_retry_cnt;
-               qp->s_rnr_retry = qp->s_rnr_retry_cnt;
-               goto bail_stop;
-
-       default:                /* 2: reserved */
-reserved:
-               /* Ignore reserved NAK codes. */
-               goto bail_stop;
-       }
-       /* cannot be reached  */
-bail_stop:
-       hfi1_stop_rc_timers(qp);
-       return ret;
-}
-
-/*
- * We have seen an out of sequence RDMA read middle or last packet.
- * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE.
- */
-static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn,
-                        struct hfi1_ctxtdata *rcd)
-{
-       struct rvt_swqe *wqe;
-
-       /* Remove QP from retry timer */
-       hfi1_stop_rc_timers(qp);
-
-       wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
-
-       while (cmp_psn(psn, wqe->lpsn) > 0) {
-               if (wqe->wr.opcode == IB_WR_RDMA_READ ||
-                   wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
-                   wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
-                       break;
-               wqe = do_rc_completion(qp, wqe, ibp);
-       }
-
-       ibp->rvp.n_rdma_seq++;
-       qp->r_flags |= RVT_R_RDMAR_SEQ;
-       restart_rc(qp, qp->s_last_psn + 1, 0);
-       if (list_empty(&qp->rspwait)) {
-               qp->r_flags |= RVT_R_RSP_SEND;
-               atomic_inc(&qp->refcount);
-               list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
-       }
-}
-
-/**
- * rc_rcv_resp - process an incoming RC response packet
- * @ibp: the port this packet came in on
- * @ohdr: the other headers for this packet
- * @data: the packet data
- * @tlen: the packet length
- * @qp: the QP for this packet
- * @opcode: the opcode for this packet
- * @psn: the packet sequence number for this packet
- * @hdrsize: the header length
- * @pmtu: the path MTU
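- * @rcd: the receive context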
- *
- * This is called from hfi1_rc_rcv() to process an incoming RC response
- * packet for the given QP.
- * Called at interrupt level.
- */
-static void rc_rcv_resp(struct hfi1_ibport *ibp,
-                       struct hfi1_other_headers *ohdr,
-                       void *data, u32 tlen, struct rvt_qp *qp,
-                       u32 opcode, u32 psn, u32 hdrsize, u32 pmtu,
-                       struct hfi1_ctxtdata *rcd)
-{
-       struct rvt_swqe *wqe;
-       enum ib_wc_status status;
-       unsigned long flags;
-       int diff;
-       u32 pad;
-       u32 aeth;
-       u64 val;
-
-       spin_lock_irqsave(&qp->s_lock, flags);
-
-       trace_hfi1_rc_ack(qp, psn);
-
-       /* Ignore invalid responses. */
-       smp_read_barrier_depends(); /* see post_one_send */
-       if (cmp_psn(psn, ACCESS_ONCE(qp->s_next_psn)) >= 0)
-               goto ack_done;
-
-       /* Ignore duplicate responses. */
-       diff = cmp_psn(psn, qp->s_last_psn);
-       if (unlikely(diff <= 0)) {
-               /* Update credits for "ghost" ACKs */
-               if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
-                       aeth = be32_to_cpu(ohdr->u.aeth);
-                       if ((aeth >> 29) == 0)
-                               hfi1_get_credit(qp, aeth);
-               }
-               goto ack_done;
-       }
-
-       /*
-        * Skip everything other than the PSN we expect, if we are waiting
-        * for a reply to a restarted RDMA read or atomic op.
-        */
-       if (qp->r_flags & RVT_R_RDMAR_SEQ) {
-               if (cmp_psn(psn, qp->s_last_psn + 1) != 0)
-                       goto ack_done;
-               qp->r_flags &= ~RVT_R_RDMAR_SEQ;
-       }
-
-       if (unlikely(qp->s_acked == qp->s_tail))
-               goto ack_done;
-       wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
-       status = IB_WC_SUCCESS;
-
-       switch (opcode) {
-       case OP(ACKNOWLEDGE):
-       case OP(ATOMIC_ACKNOWLEDGE):
-       case OP(RDMA_READ_RESPONSE_FIRST):
-               aeth = be32_to_cpu(ohdr->u.aeth);
-               if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {
-                       __be32 *p = ohdr->u.at.atomic_ack_eth;
-
-                       val = ((u64)be32_to_cpu(p[0]) << 32) |
-                               be32_to_cpu(p[1]);
-               } else {
-                       val = 0;
-               }
-               if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) ||
-                   opcode != OP(RDMA_READ_RESPONSE_FIRST))
-                       goto ack_done;
-               wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
-               if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
-                       goto ack_op_err;
-               /*
-                * If this is a response to a resent RDMA read, we
-                * have to be careful to copy the data to the right
-                * location.
-                */
-               qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
-                                                 wqe, psn, pmtu);
-               goto read_middle;
-
-       case OP(RDMA_READ_RESPONSE_MIDDLE):
-               /* no AETH, no ACK */
-               if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
-                       goto ack_seq_err;
-               if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
-                       goto ack_op_err;
-read_middle:
-               if (unlikely(tlen != (hdrsize + pmtu + 4)))
-                       goto ack_len_err;
-               if (unlikely(pmtu >= qp->s_rdma_read_len))
-                       goto ack_len_err;
-
-               /*
-                * We got a response so update the timeout.
-                * 4.096 usec. * (1 << qp->timeout)
-                */
-               qp->s_flags |= RVT_S_TIMER;
-               mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies);
-               if (qp->s_flags & RVT_S_WAIT_ACK) {
-                       qp->s_flags &= ~RVT_S_WAIT_ACK;
-                       hfi1_schedule_send(qp);
-               }
-
-               if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
-                       qp->s_retry = qp->s_retry_cnt;
-
-               /*
-                * Update the RDMA receive state but do the copy w/o
-                * holding the locks and blocking interrupts.
-                */
-               qp->s_rdma_read_len -= pmtu;
-               update_last_psn(qp, psn);
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-               hfi1_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0, 0);
-               goto bail;
-
-       case OP(RDMA_READ_RESPONSE_ONLY):
-               aeth = be32_to_cpu(ohdr->u.aeth);
-               if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
-                       goto ack_done;
-               /* Get the number of bytes the message was padded by. */
-               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-               /*
-                * Check that the data size is >= 0 && <= pmtu.
-                * Remember to account for ICRC (4).
-                */
-               if (unlikely(tlen < (hdrsize + pad + 4)))
-                       goto ack_len_err;
-               /*
-                * If this is a response to a resent RDMA read, we
-                * have to be careful to copy the data to the right
-                * location.
-                */
-               wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
-               qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
-                                                 wqe, psn, pmtu);
-               goto read_last;
-
-       case OP(RDMA_READ_RESPONSE_LAST):
-               /* ACKs READ req. */
-               if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
-                       goto ack_seq_err;
-               if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
-                       goto ack_op_err;
-               /* Get the number of bytes the message was padded by. */
-               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-               /*
-                * Check that the data size is >= 1 && <= pmtu.
-                * Remember to account for ICRC (4).
-                */
-               if (unlikely(tlen <= (hdrsize + pad + 4)))
-                       goto ack_len_err;
-read_last:
-               tlen -= hdrsize + pad + 4;
-               if (unlikely(tlen != qp->s_rdma_read_len))
-                       goto ack_len_err;
-               aeth = be32_to_cpu(ohdr->u.aeth);
-               hfi1_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0, 0);
-               WARN_ON(qp->s_rdma_read_sge.num_sge);
-               (void)do_rc_ack(qp, aeth, psn,
-                                OP(RDMA_READ_RESPONSE_LAST), 0, rcd);
-               goto ack_done;
-       }
-
-ack_op_err:
-       status = IB_WC_LOC_QP_OP_ERR;
-       goto ack_err;
-
-ack_seq_err:
-       rdma_seq_err(qp, ibp, psn, rcd);
-       goto ack_done;
-
-ack_len_err:
-       status = IB_WC_LOC_LEN_ERR;
-ack_err:
-       if (qp->s_last == qp->s_acked) {
-               hfi1_send_complete(qp, wqe, status);
-               rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
-       }
-ack_done:
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-bail:
-       return;
-}
-
-static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd,
-                                 struct rvt_qp *qp)
-{
-       if (list_empty(&qp->rspwait)) {
-               qp->r_flags |= RVT_R_RSP_NAK;
-               atomic_inc(&qp->refcount);
-               list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
-       }
-}
-
-static inline void rc_cancel_ack(struct rvt_qp *qp)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-
-       priv->r_adefered = 0;
-       if (list_empty(&qp->rspwait))
-               return;
-       list_del_init(&qp->rspwait);
-       qp->r_flags &= ~RVT_R_RSP_NAK;
-       if (atomic_dec_and_test(&qp->refcount))
-               wake_up(&qp->wait);
-}
-
-/**
- * rc_rcv_error - process an incoming duplicate or error RC packet
- * @ohdr: the other headers for this packet
- * @data: the packet data
- * @qp: the QP for this packet
- * @opcode: the opcode for this packet
- * @psn: the packet sequence number for this packet
- * @diff: the difference between the PSN and the expected PSN
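- * @rcd: the receive context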
- *
- * This is called from hfi1_rc_rcv() to process an unexpected
- * incoming RC packet for the given QP.
- * Called at interrupt level.
- * Return 1 if no more processing is needed; otherwise return 0 to
- * schedule a response to be sent.
- */
-static noinline int rc_rcv_error(struct hfi1_other_headers *ohdr, void *data,
-                                struct rvt_qp *qp, u32 opcode, u32 psn,
-                                int diff, struct hfi1_ctxtdata *rcd)
-{
-       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
-       struct rvt_ack_entry *e;
-       unsigned long flags;
-       u8 i, prev;
-       int old_req;
-
-       trace_hfi1_rc_rcv_error(qp, psn);
-       if (diff > 0) {
-               /*
-                * Packet sequence error.
-                * A NAK will ACK earlier sends and RDMA writes.
-                * Don't queue the NAK if we already sent one.
-                */
-               if (!qp->r_nak_state) {
-                       ibp->rvp.n_rc_seqnak++;
-                       qp->r_nak_state = IB_NAK_PSN_ERROR;
-                       /* Use the expected PSN. */
-                       qp->r_ack_psn = qp->r_psn;
-                       /*
-                        * Wait to send the sequence NAK until all packets
-                        * in the receive queue have been processed.
-                        * Otherwise, we end up propagating congestion.
-                        */
-                       rc_defered_ack(rcd, qp);
-               }
-               goto done;
-       }
-
-       /*
-        * Handle a duplicate request.  Don't re-execute SEND, RDMA
-        * write or atomic op.  Don't NAK errors, just silently drop
-        * the duplicate request.  Note that r_sge, r_len, and
-        * r_rcv_len may be in use so don't modify them.
-        *
-        * We are supposed to ACK the earliest duplicate PSN but we
-        * can coalesce an outstanding duplicate ACK.  We have to
-        * send the earliest so that RDMA reads can be restarted at
-        * the requester's expected PSN.
-        *
-        * First, find where this duplicate PSN falls within the
-        * ACKs previously sent.
-        * old_req is true if there is an older response that is scheduled
-        * to be sent before sending this one.
-        */
-       e = NULL;
-       old_req = 1;
-       ibp->rvp.n_rc_dupreq++;
-
-       spin_lock_irqsave(&qp->s_lock, flags);
-
-       for (i = qp->r_head_ack_queue; ; i = prev) {
-               if (i == qp->s_tail_ack_queue)
-                       old_req = 0;
-               if (i)
-                       prev = i - 1;
-               else
-                       prev = HFI1_MAX_RDMA_ATOMIC;
-               if (prev == qp->r_head_ack_queue) {
-                       e = NULL;
-                       break;
-               }
-               e = &qp->s_ack_queue[prev];
-               if (!e->opcode) {
-                       e = NULL;
-                       break;
-               }
-               if (cmp_psn(psn, e->psn) >= 0) {
-                       if (prev == qp->s_tail_ack_queue &&
-                           cmp_psn(psn, e->lpsn) <= 0)
-                               old_req = 0;
-                       break;
-               }
-       }
-       switch (opcode) {
-       case OP(RDMA_READ_REQUEST): {
-               struct ib_reth *reth;
-               u32 offset;
-               u32 len;
-
-               /*
-                * If we didn't find the RDMA read request in the ack queue,
-                * we can ignore this request.
-                */
-               if (!e || e->opcode != OP(RDMA_READ_REQUEST))
-                       goto unlock_done;
-               /* RETH comes after BTH */
-               reth = &ohdr->u.rc.reth;
-               /*
-                * Address range must be a subset of the original
-                * request and start on pmtu boundaries.
-                * We reuse the old ack_queue slot since the requester
-                * should not back up and request an earlier PSN for the
-                * same request.
-                */
-               offset = delta_psn(psn, e->psn) * qp->pmtu;
-               len = be32_to_cpu(reth->length);
-               if (unlikely(offset + len != e->rdma_sge.sge_length))
-                       goto unlock_done;
-               if (e->rdma_sge.mr) {
-                       rvt_put_mr(e->rdma_sge.mr);
-                       e->rdma_sge.mr = NULL;
-               }
-               if (len != 0) {
-                       u32 rkey = be32_to_cpu(reth->rkey);
-                       u64 vaddr = be64_to_cpu(reth->vaddr);
-                       int ok;
-
-                       ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
-                                        IB_ACCESS_REMOTE_READ);
-                       if (unlikely(!ok))
-                               goto unlock_done;
-               } else {
-                       e->rdma_sge.vaddr = NULL;
-                       e->rdma_sge.length = 0;
-                       e->rdma_sge.sge_length = 0;
-               }
-               e->psn = psn;
-               if (old_req)
-                       goto unlock_done;
-               qp->s_tail_ack_queue = prev;
-               break;
-       }
-
-       case OP(COMPARE_SWAP):
-       case OP(FETCH_ADD): {
-               /*
-                * If we didn't find the atomic request in the ack queue
-                * or the send tasklet is already backed up to send an
-                * earlier entry, we can ignore this request.
-                */
-               if (!e || e->opcode != (u8)opcode || old_req)
-                       goto unlock_done;
-               qp->s_tail_ack_queue = prev;
-               break;
-       }
-
-       default:
-               /*
-                * Ignore this operation if it doesn't request an ACK
-                * or an earlier RDMA read or atomic is going to be resent.
-                */
-               if (!(psn & IB_BTH_REQ_ACK) || old_req)
-                       goto unlock_done;
-               /*
-                * Resend the most recent ACK if this request is
-                * after all the previous RDMA reads and atomics.
-                */
-               if (i == qp->r_head_ack_queue) {
-                       spin_unlock_irqrestore(&qp->s_lock, flags);
-                       qp->r_nak_state = 0;
-                       qp->r_ack_psn = qp->r_psn - 1;
-                       goto send_ack;
-               }
-
-               /*
-                * Resend the RDMA read or atomic op which
-                * ACKs this duplicate request.
-                */
-               qp->s_tail_ack_queue = i;
-               break;
-       }
-       qp->s_ack_state = OP(ACKNOWLEDGE);
-       qp->s_flags |= RVT_S_RESP_PENDING;
-       qp->r_nak_state = 0;
-       hfi1_schedule_send(qp);
-
-unlock_done:
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-done:
-       return 1;
-
-send_ack:
-       return 0;
-}
-
-void hfi1_rc_error(struct rvt_qp *qp, enum ib_wc_status err)
-{
-       unsigned long flags;
-       int lastwqe;
-
-       spin_lock_irqsave(&qp->s_lock, flags);
-       lastwqe = rvt_error_qp(qp, err);
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-
-       if (lastwqe) {
-               struct ib_event ev;
-
-               ev.device = qp->ibqp.device;
-               ev.element.qp = &qp->ibqp;
-               ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
-               qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
-       }
-}
-
-static inline void update_ack_queue(struct rvt_qp *qp, unsigned n)
-{
-       unsigned next;
-
-       next = n + 1;
-       if (next > HFI1_MAX_RDMA_ATOMIC)
-               next = 0;
-       qp->s_tail_ack_queue = next;
-       qp->s_ack_state = OP(ACKNOWLEDGE);
-}
-
-static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid,
-                         u32 lqpn, u32 rqpn, u8 svc_type)
-{
-       struct opa_hfi1_cong_log_event_internal *cc_event;
-       unsigned long flags;
-
-       if (sl >= OPA_MAX_SLS)
-               return;
-
-       spin_lock_irqsave(&ppd->cc_log_lock, flags);
-
-       ppd->threshold_cong_event_map[sl / 8] |= 1 << (sl % 8);
-       ppd->threshold_event_counter++;
-
-       cc_event = &ppd->cc_events[ppd->cc_log_idx++];
-       if (ppd->cc_log_idx == OPA_CONG_LOG_ELEMS)
-               ppd->cc_log_idx = 0;
-       cc_event->lqpn = lqpn & RVT_QPN_MASK;
-       cc_event->rqpn = rqpn & RVT_QPN_MASK;
-       cc_event->sl = sl;
-       cc_event->svc_type = svc_type;
-       cc_event->rlid = rlid;
-       /* keep timestamp in units of 1.024 usec */
-       cc_event->timestamp = ktime_to_ns(ktime_get()) / 1024;
-
-       spin_unlock_irqrestore(&ppd->cc_log_lock, flags);
-}
-
-void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,
-                 u32 rqpn, u8 svc_type)
-{
-       struct cca_timer *cca_timer;
-       u16 ccti, ccti_incr, ccti_timer, ccti_limit;
-       u8 trigger_threshold;
-       struct cc_state *cc_state;
-       unsigned long flags;
-
-       if (sl >= OPA_MAX_SLS)
-               return;
-
-       cc_state = get_cc_state(ppd);
-
-       if (!cc_state)
-               return;
-
-       /*
-        * 1) increase CCTI (for this SL)
-        * 2) select IPG (i.e., call set_link_ipg())
-        * 3) start timer
-        */
-       ccti_limit = cc_state->cct.ccti_limit;
-       ccti_incr = cc_state->cong_setting.entries[sl].ccti_increase;
-       ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer;
-       trigger_threshold =
-               cc_state->cong_setting.entries[sl].trigger_threshold;
-
-       spin_lock_irqsave(&ppd->cca_timer_lock, flags);
-
-       cca_timer = &ppd->cca_timer[sl];
-       if (cca_timer->ccti < ccti_limit) {
-               if (cca_timer->ccti + ccti_incr <= ccti_limit)
-                       cca_timer->ccti += ccti_incr;
-               else
-                       cca_timer->ccti = ccti_limit;
-               set_link_ipg(ppd);
-       }
-
-       ccti = cca_timer->ccti;
-
-       if (!hrtimer_active(&cca_timer->hrtimer)) {
-               /* ccti_timer is in units of 1.024 usec */
-               unsigned long nsec = 1024 * ccti_timer;
-
-               hrtimer_start(&cca_timer->hrtimer, ns_to_ktime(nsec),
-                             HRTIMER_MODE_REL);
-       }
-
-       spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
-
-       if ((trigger_threshold != 0) && (ccti >= trigger_threshold))
-               log_cca_event(ppd, sl, rlid, lqpn, rqpn, svc_type);
-}
-
-/**
- * hfi1_rc_rcv - process an incoming RC packet
- * @packet: the incoming packet, carrying the receive context, header,
- *          receive flags, data, length, and QP for this packet
- *
- * This is called from qp_rcv() to process an incoming RC packet
- * for the given QP.
- * May be called at interrupt level.
- */
-void hfi1_rc_rcv(struct hfi1_packet *packet)
-{
-       struct hfi1_ctxtdata *rcd = packet->rcd;
-       struct hfi1_ib_header *hdr = packet->hdr;
-       u32 rcv_flags = packet->rcv_flags;
-       void *data = packet->ebuf;
-       u32 tlen = packet->tlen;
-       struct rvt_qp *qp = packet->qp;
-       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       struct hfi1_other_headers *ohdr = packet->ohdr;
-       u32 bth0, opcode;
-       u32 hdrsize = packet->hlen;
-       u32 psn;
-       u32 pad;
-       struct ib_wc wc;
-       u32 pmtu = qp->pmtu;
-       int diff;
-       struct ib_reth *reth;
-       unsigned long flags;
-       u32 bth1;
-       int ret, is_fecn = 0;
-       int copy_last = 0;
-
-       bth0 = be32_to_cpu(ohdr->bth[0]);
-       if (hfi1_ruc_check_hdr(ibp, hdr, rcv_flags & HFI1_HAS_GRH, qp, bth0))
-               return;
-
-       bth1 = be32_to_cpu(ohdr->bth[1]);
-       if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) {
-               if (bth1 & HFI1_BECN_SMASK) {
-                       u16 rlid = qp->remote_ah_attr.dlid;
-                       u32 lqpn, rqpn;
-
-                       lqpn = qp->ibqp.qp_num;
-                       rqpn = qp->remote_qpn;
-                       process_becn(
-                               ppd,
-                               qp->remote_ah_attr.sl,
-                               rlid, lqpn, rqpn,
-                               IB_CC_SVCTYPE_RC);
-               }
-               is_fecn = bth1 & HFI1_FECN_SMASK;
-       }
-
-       psn = be32_to_cpu(ohdr->bth[2]);
-       opcode = (bth0 >> 24) & 0xff;
-
-       /*
-        * Process responses (ACKs) before anything else.  Note that the
-        * packet sequence number will be for something in the send work
-        * queue rather than the expected receive packet sequence number.
-        * In other words, this QP is the requester.
-        */
-       if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
-           opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
-               rc_rcv_resp(ibp, ohdr, data, tlen, qp, opcode, psn,
-                           hdrsize, pmtu, rcd);
-               if (is_fecn)
-                       goto send_ack;
-               return;
-       }
-
-       /* Compute 24 bits worth of difference. */
-       diff = delta_psn(psn, qp->r_psn);
-       if (unlikely(diff)) {
-               if (rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd))
-                       return;
-               goto send_ack;
-       }
-
-       /* Check for opcode sequence errors. */
-       switch (qp->r_state) {
-       case OP(SEND_FIRST):
-       case OP(SEND_MIDDLE):
-               if (opcode == OP(SEND_MIDDLE) ||
-                   opcode == OP(SEND_LAST) ||
-                   opcode == OP(SEND_LAST_WITH_IMMEDIATE))
-                       break;
-               goto nack_inv;
-
-       case OP(RDMA_WRITE_FIRST):
-       case OP(RDMA_WRITE_MIDDLE):
-               if (opcode == OP(RDMA_WRITE_MIDDLE) ||
-                   opcode == OP(RDMA_WRITE_LAST) ||
-                   opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
-                       break;
-               goto nack_inv;
-
-       default:
-               if (opcode == OP(SEND_MIDDLE) ||
-                   opcode == OP(SEND_LAST) ||
-                   opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
-                   opcode == OP(RDMA_WRITE_MIDDLE) ||
-                   opcode == OP(RDMA_WRITE_LAST) ||
-                   opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
-                       goto nack_inv;
-               /*
-                * Note that it is up to the requester to not send a new
-                * RDMA read or atomic operation before receiving an ACK
-                * for the previous operation.
-                */
-               break;
-       }
-
-       if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
-               qp_comm_est(qp);
-
-       /* OK, process the packet. */
-       switch (opcode) {
-       case OP(SEND_FIRST):
-               ret = hfi1_rvt_get_rwqe(qp, 0);
-               if (ret < 0)
-                       goto nack_op_err;
-               if (!ret)
-                       goto rnr_nak;
-               qp->r_rcv_len = 0;
-               /* FALLTHROUGH */
-       case OP(SEND_MIDDLE):
-       case OP(RDMA_WRITE_MIDDLE):
-send_middle:
-               /* Check for invalid length PMTU or posted rwqe len. */
-               if (unlikely(tlen != (hdrsize + pmtu + 4)))
-                       goto nack_inv;
-               qp->r_rcv_len += pmtu;
-               if (unlikely(qp->r_rcv_len > qp->r_len))
-                       goto nack_inv;
-               hfi1_copy_sge(&qp->r_sge, data, pmtu, 1, 0);
-               break;
-
-       case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
-               /* consume RWQE */
-               ret = hfi1_rvt_get_rwqe(qp, 1);
-               if (ret < 0)
-                       goto nack_op_err;
-               if (!ret)
-                       goto rnr_nak;
-               goto send_last_imm;
-
-       case OP(SEND_ONLY):
-       case OP(SEND_ONLY_WITH_IMMEDIATE):
-               ret = hfi1_rvt_get_rwqe(qp, 0);
-               if (ret < 0)
-                       goto nack_op_err;
-               if (!ret)
-                       goto rnr_nak;
-               qp->r_rcv_len = 0;
-               if (opcode == OP(SEND_ONLY))
-                       goto no_immediate_data;
-               /* FALLTHROUGH for SEND_ONLY_WITH_IMMEDIATE */
-       case OP(SEND_LAST_WITH_IMMEDIATE):
-send_last_imm:
-               wc.ex.imm_data = ohdr->u.imm_data;
-               wc.wc_flags = IB_WC_WITH_IMM;
-               goto send_last;
-       case OP(RDMA_WRITE_LAST):
-               copy_last = ibpd_to_rvtpd(qp->ibqp.pd)->user;
-               /* fall through */
-       case OP(SEND_LAST):
-no_immediate_data:
-               wc.wc_flags = 0;
-               wc.ex.imm_data = 0;
-send_last:
-               /* Get the number of bytes the message was padded by. */
-               pad = (bth0 >> 20) & 3;
-               /* Check for invalid length. */
-               /* LAST len should be >= 1 */
-               if (unlikely(tlen < (hdrsize + pad + 4)))
-                       goto nack_inv;
-               /* Don't count the CRC. */
-               tlen -= (hdrsize + pad + 4);
-               wc.byte_len = tlen + qp->r_rcv_len;
-               if (unlikely(wc.byte_len > qp->r_len))
-                       goto nack_inv;
-               hfi1_copy_sge(&qp->r_sge, data, tlen, 1, copy_last);
-               rvt_put_ss(&qp->r_sge);
-               qp->r_msn++;
-               if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
-                       break;
-               wc.wr_id = qp->r_wr_id;
-               wc.status = IB_WC_SUCCESS;
-               if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
-                   opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
-                       wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
-               else
-                       wc.opcode = IB_WC_RECV;
-               wc.qp = &qp->ibqp;
-               wc.src_qp = qp->remote_qpn;
-               wc.slid = qp->remote_ah_attr.dlid;
-               /*
-                * It seems that IB mandates the presence of an SL in a
-                * work completion only for the UD transport (see section
-                * 11.4.2 of IBTA Vol. 1).
-                *
-                * However, the way the SL is chosen below is consistent
-                * with the way that IB/qib works and is trying to avoid
-                * introducing incompatibilities.
-                *
-                * See also OPA Vol. 1, section 9.7.6, and table 9-17.
-                */
-               wc.sl = qp->remote_ah_attr.sl;
-               /* zero fields that are N/A */
-               wc.vendor_err = 0;
-               wc.pkey_index = 0;
-               wc.dlid_path_bits = 0;
-               wc.port_num = 0;
-               /* Signal completion event if the solicited bit is set. */
-               rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-                            (bth0 & IB_BTH_SOLICITED) != 0);
-               break;
-
-       case OP(RDMA_WRITE_ONLY):
-               copy_last = 1;
-               /* fall through */
-       case OP(RDMA_WRITE_FIRST):
-       case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
-               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
-                       goto nack_inv;
-               /* consume RWQE */
-               reth = &ohdr->u.rc.reth;
-               qp->r_len = be32_to_cpu(reth->length);
-               qp->r_rcv_len = 0;
-               qp->r_sge.sg_list = NULL;
-               if (qp->r_len != 0) {
-                       u32 rkey = be32_to_cpu(reth->rkey);
-                       u64 vaddr = be64_to_cpu(reth->vaddr);
-                       int ok;
-
-                       /* Check rkey & NAK */
-                       ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr,
-                                        rkey, IB_ACCESS_REMOTE_WRITE);
-                       if (unlikely(!ok))
-                               goto nack_acc;
-                       qp->r_sge.num_sge = 1;
-               } else {
-                       qp->r_sge.num_sge = 0;
-                       qp->r_sge.sge.mr = NULL;
-                       qp->r_sge.sge.vaddr = NULL;
-                       qp->r_sge.sge.length = 0;
-                       qp->r_sge.sge.sge_length = 0;
-               }
-               if (opcode == OP(RDMA_WRITE_FIRST))
-                       goto send_middle;
-               else if (opcode == OP(RDMA_WRITE_ONLY))
-                       goto no_immediate_data;
-               ret = hfi1_rvt_get_rwqe(qp, 1);
-               if (ret < 0)
-                       goto nack_op_err;
-               if (!ret)
-                       goto rnr_nak;
-               wc.ex.imm_data = ohdr->u.rc.imm_data;
-               wc.wc_flags = IB_WC_WITH_IMM;
-               goto send_last;
-
-       case OP(RDMA_READ_REQUEST): {
-               struct rvt_ack_entry *e;
-               u32 len;
-               u8 next;
-
-               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
-                       goto nack_inv;
-               next = qp->r_head_ack_queue + 1;
-               /* s_ack_queue is size HFI1_MAX_RDMA_ATOMIC+1 so use > not >= */
-               if (next > HFI1_MAX_RDMA_ATOMIC)
-                       next = 0;
-               spin_lock_irqsave(&qp->s_lock, flags);
-               if (unlikely(next == qp->s_tail_ack_queue)) {
-                       if (!qp->s_ack_queue[next].sent)
-                               goto nack_inv_unlck;
-                       update_ack_queue(qp, next);
-               }
-               e = &qp->s_ack_queue[qp->r_head_ack_queue];
-               if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
-                       rvt_put_mr(e->rdma_sge.mr);
-                       e->rdma_sge.mr = NULL;
-               }
-               reth = &ohdr->u.rc.reth;
-               len = be32_to_cpu(reth->length);
-               if (len) {
-                       u32 rkey = be32_to_cpu(reth->rkey);
-                       u64 vaddr = be64_to_cpu(reth->vaddr);
-                       int ok;
-
-                       /* Check rkey & NAK */
-                       ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr,
-                                        rkey, IB_ACCESS_REMOTE_READ);
-                       if (unlikely(!ok))
-                               goto nack_acc_unlck;
-                       /*
-                        * Update the next expected PSN.  We add 1 later
-                        * below, so only add the remainder here.
-                        */
-                       if (len > pmtu)
-                               qp->r_psn += (len - 1) / pmtu;
-               } else {
-                       e->rdma_sge.mr = NULL;
-                       e->rdma_sge.vaddr = NULL;
-                       e->rdma_sge.length = 0;
-                       e->rdma_sge.sge_length = 0;
-               }
-               e->opcode = opcode;
-               e->sent = 0;
-               e->psn = psn;
-               e->lpsn = qp->r_psn;
-               /*
-                * We need to increment the MSN here instead of when we
-                * finish sending the result since a duplicate request would
-                * increment it more than once.
-                */
-               qp->r_msn++;
-               qp->r_psn++;
-               qp->r_state = opcode;
-               qp->r_nak_state = 0;
-               qp->r_head_ack_queue = next;
-
-               /* Schedule the send tasklet. */
-               qp->s_flags |= RVT_S_RESP_PENDING;
-               hfi1_schedule_send(qp);
-
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-               if (is_fecn)
-                       goto send_ack;
-               return;
-       }
-
-       case OP(COMPARE_SWAP):
-       case OP(FETCH_ADD): {
-               struct ib_atomic_eth *ateth;
-               struct rvt_ack_entry *e;
-               u64 vaddr;
-               atomic64_t *maddr;
-               u64 sdata;
-               u32 rkey;
-               u8 next;
-
-               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
-                       goto nack_inv;
-               next = qp->r_head_ack_queue + 1;
-               if (next > HFI1_MAX_RDMA_ATOMIC)
-                       next = 0;
-               spin_lock_irqsave(&qp->s_lock, flags);
-               if (unlikely(next == qp->s_tail_ack_queue)) {
-                       if (!qp->s_ack_queue[next].sent)
-                               goto nack_inv_unlck;
-                       update_ack_queue(qp, next);
-               }
-               e = &qp->s_ack_queue[qp->r_head_ack_queue];
-               if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
-                       rvt_put_mr(e->rdma_sge.mr);
-                       e->rdma_sge.mr = NULL;
-               }
-               ateth = &ohdr->u.atomic_eth;
-               vaddr = ((u64)be32_to_cpu(ateth->vaddr[0]) << 32) |
-                       be32_to_cpu(ateth->vaddr[1]);
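-               /* The target address of an atomic must be naturally (8-byte) aligned. */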
-               if (unlikely(vaddr & (sizeof(u64) - 1)))
-                       goto nack_inv_unlck;
-               rkey = be32_to_cpu(ateth->rkey);
-               /* Check rkey & NAK */
-               if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
-                                         vaddr, rkey,
-                                         IB_ACCESS_REMOTE_ATOMIC)))
-                       goto nack_acc_unlck;
-               /* Perform atomic OP and save result. */
-               maddr = (atomic64_t *)qp->r_sge.sge.vaddr;
-               sdata = be64_to_cpu(ateth->swap_data);
-               e->atomic_data = (opcode == OP(FETCH_ADD)) ?
-                       (u64)atomic64_add_return(sdata, maddr) - sdata :
-                       (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr,
-                                     be64_to_cpu(ateth->compare_data),
-                                     sdata);
-               rvt_put_mr(qp->r_sge.sge.mr);
-               qp->r_sge.num_sge = 0;
-               e->opcode = opcode;
-               e->sent = 0;
-               e->psn = psn;
-               e->lpsn = psn;
-               qp->r_msn++;
-               qp->r_psn++;
-               qp->r_state = opcode;
-               qp->r_nak_state = 0;
-               qp->r_head_ack_queue = next;
-
-               /* Schedule the send tasklet. */
-               qp->s_flags |= RVT_S_RESP_PENDING;
-               hfi1_schedule_send(qp);
-
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-               if (is_fecn)
-                       goto send_ack;
-               return;
-       }
-
-       default:
-               /* NAK unknown opcodes. */
-               goto nack_inv;
-       }
-       qp->r_psn++;
-       qp->r_state = opcode;
-       qp->r_ack_psn = psn;
-       qp->r_nak_state = 0;
-       /* Send an ACK if requested or required. */
-       if (psn & IB_BTH_REQ_ACK) {
-               struct hfi1_qp_priv *priv = qp->priv;
-
-               if (packet->numpkt == 0) {
-                       rc_cancel_ack(qp);
-                       goto send_ack;
-               }
-               if (priv->r_adefered >= HFI1_PSN_CREDIT) {
-                       rc_cancel_ack(qp);
-                       goto send_ack;
-               }
-               if (unlikely(is_fecn)) {
-                       rc_cancel_ack(qp);
-                       goto send_ack;
-               }
-               priv->r_adefered++;
-               rc_defered_ack(rcd, qp);
-       }
-       return;
-
-rnr_nak:
-       qp->r_nak_state = qp->r_min_rnr_timer | IB_RNR_NAK;
-       qp->r_ack_psn = qp->r_psn;
-       /* Queue RNR NAK for later */
-       rc_defered_ack(rcd, qp);
-       return;
-
-nack_op_err:
-       hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
-       qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
-       qp->r_ack_psn = qp->r_psn;
-       /* Queue NAK for later */
-       rc_defered_ack(rcd, qp);
-       return;
-
-nack_inv_unlck:
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-nack_inv:
-       hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
-       qp->r_nak_state = IB_NAK_INVALID_REQUEST;
-       qp->r_ack_psn = qp->r_psn;
-       /* Queue NAK for later */
-       rc_defered_ack(rcd, qp);
-       return;
-
-nack_acc_unlck:
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-nack_acc:
-       hfi1_rc_error(qp, IB_WC_LOC_PROT_ERR);
-       qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
-       qp->r_ack_psn = qp->r_psn;
-send_ack:
-       hfi1_send_rc_ack(rcd, qp, is_fecn);
-}
-
-void hfi1_rc_hdrerr(
-       struct hfi1_ctxtdata *rcd,
-       struct hfi1_ib_header *hdr,
-       u32 rcv_flags,
-       struct rvt_qp *qp)
-{
-       int has_grh = rcv_flags & HFI1_HAS_GRH;
-       struct hfi1_other_headers *ohdr;
-       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
-       int diff;
-       u32 opcode;
-       u32 psn, bth0;
-
-       /* Check for GRH */
-       ohdr = &hdr->u.oth;
-       if (has_grh)
-               ohdr = &hdr->u.l.oth;
-
-       bth0 = be32_to_cpu(ohdr->bth[0]);
-       if (hfi1_ruc_check_hdr(ibp, hdr, has_grh, qp, bth0))
-               return;
-
-       psn = be32_to_cpu(ohdr->bth[2]);
-       opcode = (bth0 >> 24) & 0xff;
-
-       /* Only deal with RDMA Writes for now */
-       if (opcode < IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) {
-               diff = delta_psn(psn, qp->r_psn);
-               if (!qp->r_nak_state && diff >= 0) {
-                       ibp->rvp.n_rc_seqnak++;
-                       qp->r_nak_state = IB_NAK_PSN_ERROR;
-                       /* Use the expected PSN. */
-                       qp->r_ack_psn = qp->r_psn;
-                       /*
-                        * Wait to send the sequence
-                        * NAK until all packets
-                        * in the receive queue have
-                        * been processed.
-                        * Otherwise, we end up
-                        * propagating congestion.
-                        */
-                       rc_defered_ack(rcd, qp);
-               } /* Out of sequence NAK */
-       } /* QP Request NAKs */
-}
diff --git a/drivers/staging/rdma/hfi1/ruc.c b/drivers/staging/rdma/hfi1/ruc.c
deleted file mode 100644 (file)
index a659aec..0000000
+++ /dev/null
@@ -1,979 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/spinlock.h>
-
-#include "hfi.h"
-#include "mad.h"
-#include "qp.h"
-#include "verbs_txreq.h"
-#include "trace.h"
-
-/*
- * Convert the AETH RNR timeout code into the number of microseconds.
- */
-const u32 ib_hfi1_rnr_table[32] = {
-       655360, /* 00: 655.36 */
-       10,     /* 01:    .01 */
-       20,     /* 02:    .02 */
-       30,     /* 03:    .03 */
-       40,     /* 04:    .04 */
-       60,     /* 05:    .06 */
-       80,     /* 06:    .08 */
-       120,    /* 07:    .12 */
-       160,    /* 08:    .16 */
-       240,    /* 09:    .24 */
-       320,    /* 0A:    .32 */
-       480,    /* 0B:    .48 */
-       640,    /* 0C:    .64 */
-       960,    /* 0D:    .96 */
-       1280,   /* 0E:   1.28 */
-       1920,   /* 0F:   1.92 */
-       2560,   /* 10:   2.56 */
-       3840,   /* 11:   3.84 */
-       5120,   /* 12:   5.12 */
-       7680,   /* 13:   7.68 */
-       10240,  /* 14:  10.24 */
-       15360,  /* 15:  15.36 */
-       20480,  /* 16:  20.48 */
-       30720,  /* 17:  30.72 */
-       40960,  /* 18:  40.96 */
-       61440,  /* 19:  61.44 */
-       81920,  /* 1A:  81.92 */
-       122880, /* 1B: 122.88 */
-       163840, /* 1C: 163.84 */
-       245760, /* 1D: 245.76 */
-       327680, /* 1E: 327.68 */
-       491520  /* 1F: 491.52 */
-};
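The table above is consulted with a 5-bit RNR timer code to obtain a delay in
microseconds, as the rnr_nak path of ruc_loopback() below does via
ib_hfi1_rnr_table[qp->r_min_rnr_timer].  A minimal standalone sketch of that
lookup (the table values are copied from above; the code value is hypothetical):

    #include <stdio.h>

    /* Copy of ib_hfi1_rnr_table[] above: a 5-bit AETH RNR code selects a
     * delay in microseconds. */
    static const unsigned int rnr_usecs[32] = {
            655360, 10, 20, 30, 40, 60, 80, 120,
            160, 240, 320, 480, 640, 960, 1280, 1920,
            2560, 3840, 5120, 7680, 10240, 15360, 20480, 30720,
            40960, 61440, 81920, 122880, 163840, 245760, 327680, 491520
    };

    int main(void)
    {
            unsigned int code = 0x0c;       /* hypothetical 5-bit RNR timer code */

            printf("RNR code 0x%02x -> %u us\n", code, rnr_usecs[code & 0x1f]);
            return 0;
    }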
-
-/*
- * Validate a RWQE and fill in the SGE state.
- * Return 1 if OK.
- */
-static int init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe)
-{
-       int i, j, ret;
-       struct ib_wc wc;
-       struct rvt_lkey_table *rkt;
-       struct rvt_pd *pd;
-       struct rvt_sge_state *ss;
-
-       rkt = &to_idev(qp->ibqp.device)->rdi.lkey_table;
-       pd = ibpd_to_rvtpd(qp->ibqp.srq ? qp->ibqp.srq->pd : qp->ibqp.pd);
-       ss = &qp->r_sge;
-       ss->sg_list = qp->r_sg_list;
-       qp->r_len = 0;
-       for (i = j = 0; i < wqe->num_sge; i++) {
-               if (wqe->sg_list[i].length == 0)
-                       continue;
-               /* Check LKEY */
-               if (!rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge,
-                                &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE))
-                       goto bad_lkey;
-               qp->r_len += wqe->sg_list[i].length;
-               j++;
-       }
-       ss->num_sge = j;
-       ss->total_len = qp->r_len;
-       ret = 1;
-       goto bail;
-
-bad_lkey:
-       while (j) {
-               struct rvt_sge *sge = --j ? &ss->sg_list[j - 1] : &ss->sge;
-
-               rvt_put_mr(sge->mr);
-       }
-       ss->num_sge = 0;
-       memset(&wc, 0, sizeof(wc));
-       wc.wr_id = wqe->wr_id;
-       wc.status = IB_WC_LOC_PROT_ERR;
-       wc.opcode = IB_WC_RECV;
-       wc.qp = &qp->ibqp;
-       /* Signal solicited completion event. */
-       rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
-       ret = 0;
-bail:
-       return ret;
-}
-
-/**
- * hfi1_rvt_get_rwqe - copy the next RWQE into the QP's RWQE
- * @qp: the QP
- * @wr_id_only: update qp->r_wr_id only, not qp->r_sge
- *
- * Return -1 if there is a local error, 0 if no RWQE is available,
- * otherwise return 1.
- *
- * Can be called from interrupt level.
- */
-int hfi1_rvt_get_rwqe(struct rvt_qp *qp, int wr_id_only)
-{
-       unsigned long flags;
-       struct rvt_rq *rq;
-       struct rvt_rwq *wq;
-       struct rvt_srq *srq;
-       struct rvt_rwqe *wqe;
-       void (*handler)(struct ib_event *, void *);
-       u32 tail;
-       int ret;
-
-       if (qp->ibqp.srq) {
-               srq = ibsrq_to_rvtsrq(qp->ibqp.srq);
-               handler = srq->ibsrq.event_handler;
-               rq = &srq->rq;
-       } else {
-               srq = NULL;
-               handler = NULL;
-               rq = &qp->r_rq;
-       }
-
-       spin_lock_irqsave(&rq->lock, flags);
-       if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
-               ret = 0;
-               goto unlock;
-       }
-
-       wq = rq->wq;
-       tail = wq->tail;
-       /* Validate tail before using it since it is user writable. */
-       if (tail >= rq->size)
-               tail = 0;
-       if (unlikely(tail == wq->head)) {
-               ret = 0;
-               goto unlock;
-       }
-       /* Make sure entry is read after head index is read. */
-       smp_rmb();
-       wqe = rvt_get_rwqe_ptr(rq, tail);
-       /*
-        * Even though we update the tail index in memory, the verbs
-        * consumer is not supposed to post more entries until a
-        * completion is generated.
-        */
-       if (++tail >= rq->size)
-               tail = 0;
-       wq->tail = tail;
-       if (!wr_id_only && !init_sge(qp, wqe)) {
-               ret = -1;
-               goto unlock;
-       }
-       qp->r_wr_id = wqe->wr_id;
-
-       ret = 1;
-       set_bit(RVT_R_WRID_VALID, &qp->r_aflags);
-       if (handler) {
-               u32 n;
-
-               /*
-                * Validate head pointer value and compute
-                * the number of remaining WQEs.
-                */
-               n = wq->head;
-               if (n >= rq->size)
-                       n = 0;
-               if (n < tail)
-                       n += rq->size - tail;
-               else
-                       n -= tail;
-               if (n < srq->limit) {
-                       struct ib_event ev;
-
-                       srq->limit = 0;
-                       spin_unlock_irqrestore(&rq->lock, flags);
-                       ev.device = qp->ibqp.device;
-                       ev.element.srq = qp->ibqp.srq;
-                       ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
-                       handler(&ev, srq->ibsrq.srq_context);
-                       goto bail;
-               }
-       }
-unlock:
-       spin_unlock_irqrestore(&rq->lock, flags);
-bail:
-       return ret;
-}
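The three-way return convention documented above drives every caller in this
diff (see, for example, the SEND_FIRST handling in rc.c earlier).  A small
self-contained sketch of that convention, with a stub standing in for
hfi1_rvt_get_rwqe() and strings standing in for the goto targets:

    #include <stdio.h>

    /* Stand-in for hfi1_rvt_get_rwqe(): <0 local error, 0 no RWQE, 1 RWQE consumed. */
    static int get_rwqe_stub(int outcome)
    {
            return outcome;
    }

    /* Hypothetical caller showing the three-way handling used in rc.c/ruc.c. */
    static const char *consume_rwqe(int outcome)
    {
            int ret = get_rwqe_stub(outcome);

            if (ret < 0)
                    return "nack_op_err";   /* local QP operation error -> NAK */
            if (!ret)
                    return "rnr_nak";       /* receive queue empty -> RNR NAK */
            return "process packet";        /* RWQE available, continue */
    }

    int main(void)
    {
            printf("%s\n", consume_rwqe(1));
            printf("%s\n", consume_rwqe(0));
            printf("%s\n", consume_rwqe(-1));
            return 0;
    }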
-
-static __be64 get_sguid(struct hfi1_ibport *ibp, unsigned index)
-{
-       if (!index) {
-               struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-
-               return cpu_to_be64(ppd->guid);
-       }
-       return ibp->guids[index - 1];
-}
-
-static int gid_ok(union ib_gid *gid, __be64 gid_prefix, __be64 id)
-{
-       return (gid->global.interface_id == id &&
-               (gid->global.subnet_prefix == gid_prefix ||
-                gid->global.subnet_prefix == IB_DEFAULT_GID_PREFIX));
-}
-
-/*
- * Validate the header of an incoming packet against the QP's primary or
- * alternate path (GRH presence, GIDs, partition key, SLID and port), and
- * migrate the QP to the alternate path when an armed migration request
- * arrives.  Return 0 if the packet is acceptable, 1 if it should be dropped.
- *
- * This should be called with the QP r_lock held.
- *
- * The s_lock will be acquired around the hfi1_migrate_qp() call.
- */
-int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_ib_header *hdr,
-                      int has_grh, struct rvt_qp *qp, u32 bth0)
-{
-       __be64 guid;
-       unsigned long flags;
-       u8 sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
-
-       if (qp->s_mig_state == IB_MIG_ARMED && (bth0 & IB_BTH_MIG_REQ)) {
-               if (!has_grh) {
-                       if (qp->alt_ah_attr.ah_flags & IB_AH_GRH)
-                               goto err;
-               } else {
-                       if (!(qp->alt_ah_attr.ah_flags & IB_AH_GRH))
-                               goto err;
-                       guid = get_sguid(ibp, qp->alt_ah_attr.grh.sgid_index);
-                       if (!gid_ok(&hdr->u.l.grh.dgid, ibp->rvp.gid_prefix,
-                                   guid))
-                               goto err;
-                       if (!gid_ok(
-                               &hdr->u.l.grh.sgid,
-                               qp->alt_ah_attr.grh.dgid.global.subnet_prefix,
-                               qp->alt_ah_attr.grh.dgid.global.interface_id))
-                               goto err;
-               }
-               if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), (u16)bth0,
-                                           sc5, be16_to_cpu(hdr->lrh[3])))) {
-                       hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY,
-                                      (u16)bth0,
-                                      (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
-                                      0, qp->ibqp.qp_num,
-                                      be16_to_cpu(hdr->lrh[3]),
-                                      be16_to_cpu(hdr->lrh[1]));
-                       goto err;
-               }
-               /* Validate the SLID. See Ch. 9.6.1.5 and 17.2.8 */
-               if (be16_to_cpu(hdr->lrh[3]) != qp->alt_ah_attr.dlid ||
-                   ppd_from_ibp(ibp)->port != qp->alt_ah_attr.port_num)
-                       goto err;
-               spin_lock_irqsave(&qp->s_lock, flags);
-               hfi1_migrate_qp(qp);
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-       } else {
-               if (!has_grh) {
-                       if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
-                               goto err;
-               } else {
-                       if (!(qp->remote_ah_attr.ah_flags & IB_AH_GRH))
-                               goto err;
-                       guid = get_sguid(ibp,
-                                        qp->remote_ah_attr.grh.sgid_index);
-                       if (!gid_ok(&hdr->u.l.grh.dgid, ibp->rvp.gid_prefix,
-                                   guid))
-                               goto err;
-                       if (!gid_ok(
-                            &hdr->u.l.grh.sgid,
-                            qp->remote_ah_attr.grh.dgid.global.subnet_prefix,
-                            qp->remote_ah_attr.grh.dgid.global.interface_id))
-                               goto err;
-               }
-               if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), (u16)bth0,
-                                           sc5, be16_to_cpu(hdr->lrh[3])))) {
-                       hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY,
-                                      (u16)bth0,
-                                      (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
-                                      0, qp->ibqp.qp_num,
-                                      be16_to_cpu(hdr->lrh[3]),
-                                      be16_to_cpu(hdr->lrh[1]));
-                       goto err;
-               }
-               /* Validate the SLID. See Ch. 9.6.1.5 */
-               if (be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid ||
-                   ppd_from_ibp(ibp)->port != qp->port_num)
-                       goto err;
-               if (qp->s_mig_state == IB_MIG_REARM &&
-                   !(bth0 & IB_BTH_MIG_REQ))
-                       qp->s_mig_state = IB_MIG_ARMED;
-       }
-
-       return 0;
-
-err:
-       return 1;
-}
-
-/**
- * ruc_loopback - handle UC and RC loopback requests
- * @sqp: the sending QP
- *
- * This is called from hfi1_do_send() to
- * forward a WQE addressed to the same HFI.
- * Note that although we are single threaded due to the tasklet, we still
- * have to protect against post_send().  We don't have to worry about
- * receive interrupts since this is a connected protocol and all packets
- * will pass through here.
- */
-static void ruc_loopback(struct rvt_qp *sqp)
-{
-       struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num);
-       struct rvt_qp *qp;
-       struct rvt_swqe *wqe;
-       struct rvt_sge *sge;
-       unsigned long flags;
-       struct ib_wc wc;
-       u64 sdata;
-       atomic64_t *maddr;
-       enum ib_wc_status send_status;
-       int release;
-       int ret;
-       int copy_last = 0;
-       u32 to;
-
-       rcu_read_lock();
-
-       /*
-        * Note that we check the responder QP state after
-        * checking the requester's state.
-        */
-       qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp,
-                           sqp->remote_qpn);
-
-       spin_lock_irqsave(&sqp->s_lock, flags);
-
-       /* Return if we are already busy processing a work request. */
-       if ((sqp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT)) ||
-           !(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND))
-               goto unlock;
-
-       sqp->s_flags |= RVT_S_BUSY;
-
-again:
-       smp_read_barrier_depends(); /* see post_one_send() */
-       if (sqp->s_last == ACCESS_ONCE(sqp->s_head))
-               goto clr_busy;
-       wqe = rvt_get_swqe_ptr(sqp, sqp->s_last);
-
-       /* Return if it is not OK to start a new work request. */
-       if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_NEXT_SEND_OK)) {
-               if (!(ib_rvt_state_ops[sqp->state] & RVT_FLUSH_SEND))
-                       goto clr_busy;
-               /* We are in the error state, flush the work request. */
-               send_status = IB_WC_WR_FLUSH_ERR;
-               goto flush_send;
-       }
-
-       /*
-        * We can rely on the entry not changing without the s_lock
-        * being held until we update s_last.
-        * We increment s_cur to indicate s_last is in progress.
-        */
-       if (sqp->s_last == sqp->s_cur) {
-               if (++sqp->s_cur >= sqp->s_size)
-                       sqp->s_cur = 0;
-       }
-       spin_unlock_irqrestore(&sqp->s_lock, flags);
-
-       if (!qp || !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) ||
-           qp->ibqp.qp_type != sqp->ibqp.qp_type) {
-               ibp->rvp.n_pkt_drops++;
-               /*
-                * For RC, the requester would timeout and retry so
-                * shortcut the timeouts and just signal too many retries.
-                */
-               if (sqp->ibqp.qp_type == IB_QPT_RC)
-                       send_status = IB_WC_RETRY_EXC_ERR;
-               else
-                       send_status = IB_WC_SUCCESS;
-               goto serr;
-       }
-
-       memset(&wc, 0, sizeof(wc));
-       send_status = IB_WC_SUCCESS;
-
-       release = 1;
-       sqp->s_sge.sge = wqe->sg_list[0];
-       sqp->s_sge.sg_list = wqe->sg_list + 1;
-       sqp->s_sge.num_sge = wqe->wr.num_sge;
-       sqp->s_len = wqe->length;
-       switch (wqe->wr.opcode) {
-       case IB_WR_SEND_WITH_IMM:
-               wc.wc_flags = IB_WC_WITH_IMM;
-               wc.ex.imm_data = wqe->wr.ex.imm_data;
-               /* FALLTHROUGH */
-       case IB_WR_SEND:
-               ret = hfi1_rvt_get_rwqe(qp, 0);
-               if (ret < 0)
-                       goto op_err;
-               if (!ret)
-                       goto rnr_nak;
-               break;
-
-       case IB_WR_RDMA_WRITE_WITH_IMM:
-               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
-                       goto inv_err;
-               wc.wc_flags = IB_WC_WITH_IMM;
-               wc.ex.imm_data = wqe->wr.ex.imm_data;
-               ret = hfi1_rvt_get_rwqe(qp, 1);
-               if (ret < 0)
-                       goto op_err;
-               if (!ret)
-                       goto rnr_nak;
-               /* skip copy_last set and qp_access_flags recheck */
-               goto do_write;
-       case IB_WR_RDMA_WRITE:
-               copy_last = ibpd_to_rvtpd(qp->ibqp.pd)->user;
-               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
-                       goto inv_err;
-do_write:
-               if (wqe->length == 0)
-                       break;
-               if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, wqe->length,
-                                         wqe->rdma_wr.remote_addr,
-                                         wqe->rdma_wr.rkey,
-                                         IB_ACCESS_REMOTE_WRITE)))
-                       goto acc_err;
-               qp->r_sge.sg_list = NULL;
-               qp->r_sge.num_sge = 1;
-               qp->r_sge.total_len = wqe->length;
-               break;
-
-       case IB_WR_RDMA_READ:
-               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
-                       goto inv_err;
-               if (unlikely(!rvt_rkey_ok(qp, &sqp->s_sge.sge, wqe->length,
-                                         wqe->rdma_wr.remote_addr,
-                                         wqe->rdma_wr.rkey,
-                                         IB_ACCESS_REMOTE_READ)))
-                       goto acc_err;
-               release = 0;
-               sqp->s_sge.sg_list = NULL;
-               sqp->s_sge.num_sge = 1;
-               qp->r_sge.sge = wqe->sg_list[0];
-               qp->r_sge.sg_list = wqe->sg_list + 1;
-               qp->r_sge.num_sge = wqe->wr.num_sge;
-               qp->r_sge.total_len = wqe->length;
-               break;
-
-       case IB_WR_ATOMIC_CMP_AND_SWP:
-       case IB_WR_ATOMIC_FETCH_AND_ADD:
-               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
-                       goto inv_err;
-               if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
-                                         wqe->atomic_wr.remote_addr,
-                                         wqe->atomic_wr.rkey,
-                                         IB_ACCESS_REMOTE_ATOMIC)))
-                       goto acc_err;
-               /* Perform atomic OP and save result. */
-               maddr = (atomic64_t *)qp->r_sge.sge.vaddr;
-               sdata = wqe->atomic_wr.compare_add;
-               *(u64 *)sqp->s_sge.sge.vaddr =
-                       (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ?
-                       (u64)atomic64_add_return(sdata, maddr) - sdata :
-                       (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr,
-                                     sdata, wqe->atomic_wr.swap);
-               rvt_put_mr(qp->r_sge.sge.mr);
-               qp->r_sge.num_sge = 0;
-               goto send_comp;
-
-       default:
-               send_status = IB_WC_LOC_QP_OP_ERR;
-               goto serr;
-       }
-
-       sge = &sqp->s_sge.sge;
-       while (sqp->s_len) {
-               u32 len = sqp->s_len;
-
-               if (len > sge->length)
-                       len = sge->length;
-               if (len > sge->sge_length)
-                       len = sge->sge_length;
-               WARN_ON_ONCE(len == 0);
-               hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, release, copy_last);
-               sge->vaddr += len;
-               sge->length -= len;
-               sge->sge_length -= len;
-               if (sge->sge_length == 0) {
-                       if (!release)
-                               rvt_put_mr(sge->mr);
-                       if (--sqp->s_sge.num_sge)
-                               *sge = *sqp->s_sge.sg_list++;
-               } else if (sge->length == 0 && sge->mr->lkey) {
-                       if (++sge->n >= RVT_SEGSZ) {
-                               if (++sge->m >= sge->mr->mapsz)
-                                       break;
-                               sge->n = 0;
-                       }
-                       sge->vaddr =
-                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
-                       sge->length =
-                               sge->mr->map[sge->m]->segs[sge->n].length;
-               }
-               sqp->s_len -= len;
-       }
-       if (release)
-               rvt_put_ss(&qp->r_sge);
-
-       if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
-               goto send_comp;
-
-       if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
-               wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
-       else
-               wc.opcode = IB_WC_RECV;
-       wc.wr_id = qp->r_wr_id;
-       wc.status = IB_WC_SUCCESS;
-       wc.byte_len = wqe->length;
-       wc.qp = &qp->ibqp;
-       wc.src_qp = qp->remote_qpn;
-       wc.slid = qp->remote_ah_attr.dlid;
-       wc.sl = qp->remote_ah_attr.sl;
-       wc.port_num = 1;
-       /* Signal completion event if the solicited bit is set. */
-       rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-                    wqe->wr.send_flags & IB_SEND_SOLICITED);
-
-send_comp:
-       spin_lock_irqsave(&sqp->s_lock, flags);
-       ibp->rvp.n_loop_pkts++;
-flush_send:
-       sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
-       hfi1_send_complete(sqp, wqe, send_status);
-       goto again;
-
-rnr_nak:
-       /* Handle RNR NAK */
-       if (qp->ibqp.qp_type == IB_QPT_UC)
-               goto send_comp;
-       ibp->rvp.n_rnr_naks++;
-       /*
-        * Note: we don't need the s_lock held since the BUSY flag
-        * makes this single threaded.
-        */
-       if (sqp->s_rnr_retry == 0) {
-               send_status = IB_WC_RNR_RETRY_EXC_ERR;
-               goto serr;
-       }
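-       /* An RNR retry count of 7 means retry indefinitely, so only
-        * decrement the remaining count when a finite limit was configured.
-        */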
-       if (sqp->s_rnr_retry_cnt < 7)
-               sqp->s_rnr_retry--;
-       spin_lock_irqsave(&sqp->s_lock, flags);
-       if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK))
-               goto clr_busy;
-       to = ib_hfi1_rnr_table[qp->r_min_rnr_timer];
-       hfi1_add_rnr_timer(sqp, to);
-       goto clr_busy;
-
-op_err:
-       send_status = IB_WC_REM_OP_ERR;
-       wc.status = IB_WC_LOC_QP_OP_ERR;
-       goto err;
-
-inv_err:
-       send_status = IB_WC_REM_INV_REQ_ERR;
-       wc.status = IB_WC_LOC_QP_OP_ERR;
-       goto err;
-
-acc_err:
-       send_status = IB_WC_REM_ACCESS_ERR;
-       wc.status = IB_WC_LOC_PROT_ERR;
-err:
-       /* responder goes to error state */
-       hfi1_rc_error(qp, wc.status);
-
-serr:
-       spin_lock_irqsave(&sqp->s_lock, flags);
-       hfi1_send_complete(sqp, wqe, send_status);
-       if (sqp->ibqp.qp_type == IB_QPT_RC) {
-               int lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
-
-               sqp->s_flags &= ~RVT_S_BUSY;
-               spin_unlock_irqrestore(&sqp->s_lock, flags);
-               if (lastwqe) {
-                       struct ib_event ev;
-
-                       ev.device = sqp->ibqp.device;
-                       ev.element.qp = &sqp->ibqp;
-                       ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
-                       sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context);
-               }
-               goto done;
-       }
-clr_busy:
-       sqp->s_flags &= ~RVT_S_BUSY;
-unlock:
-       spin_unlock_irqrestore(&sqp->s_lock, flags);
-done:
-       rcu_read_unlock();
-}
-
-/**
- * hfi1_make_grh - construct a GRH header
- * @ibp: a pointer to the IB port
- * @hdr: a pointer to the GRH header being constructed
- * @grh: the global route address to send to
- * @hwords: the number of 32 bit words of header being sent
- * @nwords: the number of 32 bit words of data being sent
- *
- * Return the size of the header in 32 bit words.
- */
-u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr,
-                 struct ib_global_route *grh, u32 hwords, u32 nwords)
-{
-       hdr->version_tclass_flow =
-               cpu_to_be32((IB_GRH_VERSION << IB_GRH_VERSION_SHIFT) |
-                           (grh->traffic_class << IB_GRH_TCLASS_SHIFT) |
-                           (grh->flow_label << IB_GRH_FLOW_SHIFT));
-       hdr->paylen = cpu_to_be16((hwords - 2 + nwords + SIZE_OF_CRC) << 2);
-       /* next_hdr is defined by C8-7 in ch. 8.4.1 */
-       hdr->next_hdr = IB_GRH_NEXT_HDR;
-       hdr->hop_limit = grh->hop_limit;
-       /* The SGID is 32-bit aligned. */
-       hdr->sgid.global.subnet_prefix = ibp->rvp.gid_prefix;
-       hdr->sgid.global.interface_id =
-               grh->sgid_index && grh->sgid_index < ARRAY_SIZE(ibp->guids) ?
-               ibp->guids[grh->sgid_index - 1] :
-                       cpu_to_be64(ppd_from_ibp(ibp)->guid);
-       hdr->dgid = grh->dgid;
-
-       /* GRH header size in 32-bit words. */
-       return sizeof(struct ib_grh) / sizeof(u32);
-}
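A short standalone sketch of the two size computations above, assuming the
standard 40-byte InfiniBand GRH and the driver's convention that SIZE_OF_CRC
counts one 32-bit ICRC word; hwords and nwords below are hypothetical values:

    #include <stdio.h>

    #define SIZE_OF_CRC 1u          /* assumed: one 32-bit word of ICRC */
    #define IB_GRH_BYTES 40u        /* standard InfiniBand GRH size */

    int main(void)
    {
            unsigned int hwords = 7;        /* hypothetical header words so far */
            unsigned int nwords = 64;       /* hypothetical payload words */

            /* mirrors the paylen computation in hfi1_make_grh() above */
            unsigned int paylen_bytes = (hwords - 2 + nwords + SIZE_OF_CRC) << 2;
            /* mirrors the return value: GRH length in 32-bit words */
            unsigned int grh_words = IB_GRH_BYTES / 4;

            printf("paylen=%u bytes, grh=%u words\n", paylen_bytes, grh_words);
            return 0;
    }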
-
-#define BTH2_OFFSET (offsetof(struct hfi1_pio_header, hdr.u.oth.bth[2]) / 4)
-
-/**
- * build_ahg - create ahg in s_hdr
- * @qp: a pointer to QP
- * @npsn: the next PSN for the request/response
- *
- * This routine handles the AHG by allocating an AHG entry and causing the
- * header of the first middle packet to be copied.
- *
- * Subsequent middle packets reuse the copied entry, editing the
- * PSN with one or two edits.
- */
-static inline void build_ahg(struct rvt_qp *qp, u32 npsn)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-
-       if (unlikely(qp->s_flags & RVT_S_AHG_CLEAR))
-               clear_ahg(qp);
-       if (!(qp->s_flags & RVT_S_AHG_VALID)) {
-               /* first middle that needs a copy */
-               if (qp->s_ahgidx < 0)
-                       qp->s_ahgidx = sdma_ahg_alloc(priv->s_sde);
-               if (qp->s_ahgidx >= 0) {
-                       qp->s_ahgpsn = npsn;
-                       priv->s_hdr->tx_flags |= SDMA_TXREQ_F_AHG_COPY;
-                       /* save to protect against a change in another thread */
-                       priv->s_hdr->sde = priv->s_sde;
-                       priv->s_hdr->ahgidx = qp->s_ahgidx;
-                       qp->s_flags |= RVT_S_AHG_VALID;
-               }
-       } else {
-               /* subsequent middle after valid */
-               if (qp->s_ahgidx >= 0) {
-                       priv->s_hdr->tx_flags |= SDMA_TXREQ_F_USE_AHG;
-                       priv->s_hdr->ahgidx = qp->s_ahgidx;
-                       priv->s_hdr->ahgcount++;
-                       priv->s_hdr->ahgdesc[0] =
-                               sdma_build_ahg_descriptor(
-                                       (__force u16)cpu_to_be16((u16)npsn),
-                                       BTH2_OFFSET,
-                                       16,
-                                       16);
-                       if ((npsn & 0xffff0000) !=
-                                       (qp->s_ahgpsn & 0xffff0000)) {
-                               priv->s_hdr->ahgcount++;
-                               priv->s_hdr->ahgdesc[1] =
-                                       sdma_build_ahg_descriptor(
-                                               (__force u16)cpu_to_be16(
-                                                       (u16)(npsn >> 16)),
-                                               BTH2_OFFSET,
-                                               0,
-                                               16);
-                       }
-               }
-       }
-}
-
-void hfi1_make_ruc_header(struct rvt_qp *qp, struct hfi1_other_headers *ohdr,
-                         u32 bth0, u32 bth2, int middle,
-                         struct hfi1_pkt_state *ps)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-       struct hfi1_ibport *ibp = ps->ibp;
-       u16 lrh0;
-       u32 nwords;
-       u32 extra_bytes;
-       u32 bth1;
-
-       /* Construct the header. */
-       extra_bytes = -qp->s_cur_size & 3;
-       nwords = (qp->s_cur_size + extra_bytes) >> 2;
-       lrh0 = HFI1_LRH_BTH;
-       if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
-               qp->s_hdrwords += hfi1_make_grh(ibp,
-                                               &ps->s_txreq->phdr.hdr.u.l.grh,
-                                               &qp->remote_ah_attr.grh,
-                                               qp->s_hdrwords, nwords);
-               lrh0 = HFI1_LRH_GRH;
-               middle = 0;
-       }
-       lrh0 |= (priv->s_sc & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4;
-       /*
-        * reset s_hdr/AHG fields
-        *
-        * This ensures that the ahgentry/ahgcount
-        * are at a non-AHG default to protect
-        * build_verbs_tx_desc() from using
-        * a stale ahgidx.
-        *
-        * build_ahg() will modify as appropriate
-        * to use the AHG feature.
-        */
-       priv->s_hdr->tx_flags = 0;
-       priv->s_hdr->ahgcount = 0;
-       priv->s_hdr->ahgidx = 0;
-       priv->s_hdr->sde = NULL;
-       if (qp->s_mig_state == IB_MIG_MIGRATED)
-               bth0 |= IB_BTH_MIG_REQ;
-       else
-               middle = 0;
-       if (middle)
-               build_ahg(qp, bth2);
-       else
-               qp->s_flags &= ~RVT_S_AHG_VALID;
-       ps->s_txreq->phdr.hdr.lrh[0] = cpu_to_be16(lrh0);
-       ps->s_txreq->phdr.hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
-       ps->s_txreq->phdr.hdr.lrh[2] =
-               cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC);
-       ps->s_txreq->phdr.hdr.lrh[3] = cpu_to_be16(ppd_from_ibp(ibp)->lid |
-                                      qp->remote_ah_attr.src_path_bits);
-       bth0 |= hfi1_get_pkey(ibp, qp->s_pkey_index);
-       bth0 |= extra_bytes << 20;
-       ohdr->bth[0] = cpu_to_be32(bth0);
-       bth1 = qp->remote_qpn;
-       if (qp->s_flags & RVT_S_ECN) {
-               qp->s_flags &= ~RVT_S_ECN;
-               /* we recently received a FECN, so return a BECN */
-               bth1 |= (HFI1_BECN_MASK << HFI1_BECN_SHIFT);
-       }
-       ohdr->bth[1] = cpu_to_be32(bth1);
-       ohdr->bth[2] = cpu_to_be32(bth2);
-}
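The extra_bytes/nwords arithmetic at the top of hfi1_make_ruc_header() rounds
the payload up to a 4-byte boundary; a tiny standalone sketch of that
arithmetic over a few example payload sizes:

    #include <stdio.h>

    int main(void)
    {
            unsigned int sizes[] = { 0, 1, 2, 3, 4, 5, 7, 8 };
            unsigned int i;

            for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
                    unsigned int size = sizes[i];
                    /* same arithmetic as hfi1_make_ruc_header(): pad to a 4-byte boundary */
                    unsigned int extra_bytes = -size & 3;
                    unsigned int nwords = (size + extra_bytes) >> 2;

                    printf("size=%u pad=%u words=%u\n", size, extra_bytes, nwords);
            }
            return 0;
    }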
-
-/* when sending, force a reschedule every one of these periods */
-#define SEND_RESCHED_TIMEOUT (5 * HZ)  /* 5s in jiffies */
-
-void _hfi1_do_send(struct work_struct *work)
-{
-       struct iowait *wait = container_of(work, struct iowait, iowork);
-       struct rvt_qp *qp = iowait_to_qp(wait);
-
-       hfi1_do_send(qp);
-}
-
-/**
- * hfi1_do_send - perform a send on a QP
- * @qp: a pointer to the QP
- *
- * Process entries in the send work queue until credit or queue is
- * exhausted.  Only allow one CPU to send a packet per QP (tasklet).
- * Otherwise, two threads could send packets out of order.
- */
-void hfi1_do_send(struct rvt_qp *qp)
-{
-       struct hfi1_pkt_state ps;
-       struct hfi1_qp_priv *priv = qp->priv;
-       int (*make_req)(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
-       unsigned long timeout;
-       unsigned long timeout_int;
-       int cpu;
-
-       ps.dev = to_idev(qp->ibqp.device);
-       ps.ibp = to_iport(qp->ibqp.device, qp->port_num);
-       ps.ppd = ppd_from_ibp(ps.ibp);
-
-       switch (qp->ibqp.qp_type) {
-       case IB_QPT_RC:
-               if (!loopback && ((qp->remote_ah_attr.dlid & ~((1 << ps.ppd->lmc
-                                                               ) - 1)) ==
-                                ps.ppd->lid)) {
-                       ruc_loopback(qp);
-                       return;
-               }
-               make_req = hfi1_make_rc_req;
-               timeout_int = (qp->timeout_jiffies);
-               break;
-       case IB_QPT_UC:
-               if (!loopback && ((qp->remote_ah_attr.dlid & ~((1 << ps.ppd->lmc
-                                                               ) - 1)) ==
-                                ps.ppd->lid)) {
-                       ruc_loopback(qp);
-                       return;
-               }
-               make_req = hfi1_make_uc_req;
-               timeout_int = SEND_RESCHED_TIMEOUT;
-               break;
-       default:
-               make_req = hfi1_make_ud_req;
-               timeout_int = SEND_RESCHED_TIMEOUT;
-       }
-
-       spin_lock_irqsave(&qp->s_lock, ps.flags);
-
-       /* Return if we are already busy processing a work request. */
-       if (!hfi1_send_ok(qp)) {
-               spin_unlock_irqrestore(&qp->s_lock, ps.flags);
-               return;
-       }
-
-       qp->s_flags |= RVT_S_BUSY;
-
-       timeout = jiffies + (timeout_int) / 8;
-       cpu = priv->s_sde ? priv->s_sde->cpu :
-                       cpumask_first(cpumask_of_node(ps.ppd->dd->node));
-       /* ensure a pre-built packet is handled */
-       ps.s_txreq = get_waiting_verbs_txreq(qp);
-       do {
-               /* Check for a constructed packet to be sent. */
-               if (qp->s_hdrwords != 0) {
-                       spin_unlock_irqrestore(&qp->s_lock, ps.flags);
-                       /*
-                        * If the packet cannot be sent now, return and
-                        * the send tasklet will be woken up later.
-                        */
-                       if (hfi1_verbs_send(qp, &ps))
-                               return;
-                       /* Record that s_hdr is empty. */
-                       qp->s_hdrwords = 0;
-                       /* allow other tasks to run */
-                       if (unlikely(time_after(jiffies, timeout))) {
-                               if (workqueue_congested(cpu,
-                                                       ps.ppd->hfi1_wq)) {
-                                       spin_lock_irqsave(
-                                               &qp->s_lock,
-                                               ps.flags);
-                                       qp->s_flags &= ~RVT_S_BUSY;
-                                       hfi1_schedule_send(qp);
-                                       spin_unlock_irqrestore(
-                                               &qp->s_lock,
-                                               ps.flags);
-                                       this_cpu_inc(
-                                               *ps.ppd->dd->send_schedule);
-                                       return;
-                               }
-                               if (!irqs_disabled()) {
-                                       cond_resched();
-                                       this_cpu_inc(
-                                          *ps.ppd->dd->send_schedule);
-                               }
-                               timeout = jiffies + (timeout_int) / 8;
-                       }
-                       spin_lock_irqsave(&qp->s_lock, ps.flags);
-               }
-       } while (make_req(qp, &ps));
-
-       spin_unlock_irqrestore(&qp->s_lock, ps.flags);
-}
-
-/*
- * This should be called with s_lock held.
- */
-void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
-                       enum ib_wc_status status)
-{
-       u32 old_last, last;
-       unsigned i;
-
-       if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND))
-               return;
-
-       last = qp->s_last;
-       old_last = last;
-       if (++last >= qp->s_size)
-               last = 0;
-       qp->s_last = last;
-       /* See post_send() */
-       barrier();
-       for (i = 0; i < wqe->wr.num_sge; i++) {
-               struct rvt_sge *sge = &wqe->sg_list[i];
-
-               rvt_put_mr(sge->mr);
-       }
-       if (qp->ibqp.qp_type == IB_QPT_UD ||
-           qp->ibqp.qp_type == IB_QPT_SMI ||
-           qp->ibqp.qp_type == IB_QPT_GSI)
-               atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount);
-
-       /* See ch. 11.2.4.1 and 10.7.3.1 */
-       if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) ||
-           (wqe->wr.send_flags & IB_SEND_SIGNALED) ||
-           status != IB_WC_SUCCESS) {
-               struct ib_wc wc;
-
-               memset(&wc, 0, sizeof(wc));
-               wc.wr_id = wqe->wr.wr_id;
-               wc.status = status;
-               wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
-               wc.qp = &qp->ibqp;
-               if (status == IB_WC_SUCCESS)
-                       wc.byte_len = wqe->length;
-               rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc,
-                            status != IB_WC_SUCCESS);
-       }
-
-       if (qp->s_acked == old_last)
-               qp->s_acked = last;
-       if (qp->s_cur == old_last)
-               qp->s_cur = last;
-       if (qp->s_tail == old_last)
-               qp->s_tail = last;
-       if (qp->state == IB_QPS_SQD && last == qp->s_cur)
-               qp->s_draining = 0;
-}
diff --git a/drivers/staging/rdma/hfi1/sdma.c b/drivers/staging/rdma/hfi1/sdma.c
deleted file mode 100644 (file)
index abb8ebc..0000000
+++ /dev/null
@@ -1,3052 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/spinlock.h>
-#include <linux/seqlock.h>
-#include <linux/netdevice.h>
-#include <linux/moduleparam.h>
-#include <linux/bitops.h>
-#include <linux/timer.h>
-#include <linux/vmalloc.h>
-#include <linux/highmem.h>
-
-#include "hfi.h"
-#include "common.h"
-#include "qp.h"
-#include "sdma.h"
-#include "iowait.h"
-#include "trace.h"
-
-/* must be a power of 2, >= 64 and <= 32768 */
-#define SDMA_DESCQ_CNT 2048
-#define SDMA_DESC_INTR 64
-#define INVALID_TAIL 0xffff
-
-static uint sdma_descq_cnt = SDMA_DESCQ_CNT;
-module_param(sdma_descq_cnt, uint, S_IRUGO);
-MODULE_PARM_DESC(sdma_descq_cnt, "Number of SDMA descq entries");
-
-static uint sdma_idle_cnt = 250;
-module_param(sdma_idle_cnt, uint, S_IRUGO);
-MODULE_PARM_DESC(sdma_idle_cnt, "sdma interrupt idle delay (ns,default 250)");
-
-uint mod_num_sdma;
-module_param_named(num_sdma, mod_num_sdma, uint, S_IRUGO);
-MODULE_PARM_DESC(num_sdma, "Set max number SDMA engines to use");
-
-static uint sdma_desct_intr = SDMA_DESC_INTR;
-module_param_named(desct_intr, sdma_desct_intr, uint, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(desct_intr, "Number of SDMA descriptor before interrupt");
-
-#define SDMA_WAIT_BATCH_SIZE 20
-/* max wait time for an SDMA engine to indicate it has halted */
-#define SDMA_ERR_HALT_TIMEOUT 10 /* ms */
-/* all SDMA engine errors that cause a halt */
-
-#define SD(name) SEND_DMA_##name
-#define ALL_SDMA_ENG_HALT_ERRS \
-       (SD(ENG_ERR_STATUS_SDMA_WRONG_DW_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_GEN_MISMATCH_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_TOO_LONG_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_TAIL_OUT_OF_BOUNDS_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_FIRST_DESC_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_MEM_READ_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_LENGTH_MISMATCH_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_PACKET_DESC_OVERFLOW_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_HEADER_SELECT_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_HEADER_ADDRESS_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_HEADER_LENGTH_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_TIMEOUT_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_DESC_TABLE_UNC_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_ASSEMBLY_UNC_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_PACKET_TRACKING_UNC_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_HEADER_STORAGE_UNC_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SMASK))
-
-/* sdma_sendctrl operations */
-#define SDMA_SENDCTRL_OP_ENABLE    BIT(0)
-#define SDMA_SENDCTRL_OP_INTENABLE BIT(1)
-#define SDMA_SENDCTRL_OP_HALT      BIT(2)
-#define SDMA_SENDCTRL_OP_CLEANUP   BIT(3)
-
-/* handle long defines */
-#define SDMA_EGRESS_PACKET_OCCUPANCY_SMASK \
-SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SMASK
-#define SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT \
-SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT
-
-static const char * const sdma_state_names[] = {
-       [sdma_state_s00_hw_down]                = "s00_HwDown",
-       [sdma_state_s10_hw_start_up_halt_wait]  = "s10_HwStartUpHaltWait",
-       [sdma_state_s15_hw_start_up_clean_wait] = "s15_HwStartUpCleanWait",
-       [sdma_state_s20_idle]                   = "s20_Idle",
-       [sdma_state_s30_sw_clean_up_wait]       = "s30_SwCleanUpWait",
-       [sdma_state_s40_hw_clean_up_wait]       = "s40_HwCleanUpWait",
-       [sdma_state_s50_hw_halt_wait]           = "s50_HwHaltWait",
-       [sdma_state_s60_idle_halt_wait]         = "s60_IdleHaltWait",
-       [sdma_state_s80_hw_freeze]              = "s80_HwFreeze",
-       [sdma_state_s82_freeze_sw_clean]        = "s82_FreezeSwClean",
-       [sdma_state_s99_running]                = "s99_Running",
-};
-
-static const char * const sdma_event_names[] = {
-       [sdma_event_e00_go_hw_down]   = "e00_GoHwDown",
-       [sdma_event_e10_go_hw_start]  = "e10_GoHwStart",
-       [sdma_event_e15_hw_halt_done] = "e15_HwHaltDone",
-       [sdma_event_e25_hw_clean_up_done] = "e25_HwCleanUpDone",
-       [sdma_event_e30_go_running]   = "e30_GoRunning",
-       [sdma_event_e40_sw_cleaned]   = "e40_SwCleaned",
-       [sdma_event_e50_hw_cleaned]   = "e50_HwCleaned",
-       [sdma_event_e60_hw_halted]    = "e60_HwHalted",
-       [sdma_event_e70_go_idle]      = "e70_GoIdle",
-       [sdma_event_e80_hw_freeze]    = "e80_HwFreeze",
-       [sdma_event_e81_hw_frozen]    = "e81_HwFrozen",
-       [sdma_event_e82_hw_unfreeze]  = "e82_HwUnfreeze",
-       [sdma_event_e85_link_down]    = "e85_LinkDown",
-       [sdma_event_e90_sw_halted]    = "e90_SwHalted",
-};
-
-static const struct sdma_set_state_action sdma_action_table[] = {
-       [sdma_state_s00_hw_down] = {
-               .go_s99_running_tofalse = 1,
-               .op_enable = 0,
-               .op_intenable = 0,
-               .op_halt = 0,
-               .op_cleanup = 0,
-       },
-       [sdma_state_s10_hw_start_up_halt_wait] = {
-               .op_enable = 0,
-               .op_intenable = 0,
-               .op_halt = 1,
-               .op_cleanup = 0,
-       },
-       [sdma_state_s15_hw_start_up_clean_wait] = {
-               .op_enable = 0,
-               .op_intenable = 1,
-               .op_halt = 0,
-               .op_cleanup = 1,
-       },
-       [sdma_state_s20_idle] = {
-               .op_enable = 0,
-               .op_intenable = 1,
-               .op_halt = 0,
-               .op_cleanup = 0,
-       },
-       [sdma_state_s30_sw_clean_up_wait] = {
-               .op_enable = 0,
-               .op_intenable = 0,
-               .op_halt = 0,
-               .op_cleanup = 0,
-       },
-       [sdma_state_s40_hw_clean_up_wait] = {
-               .op_enable = 0,
-               .op_intenable = 0,
-               .op_halt = 0,
-               .op_cleanup = 1,
-       },
-       [sdma_state_s50_hw_halt_wait] = {
-               .op_enable = 0,
-               .op_intenable = 0,
-               .op_halt = 0,
-               .op_cleanup = 0,
-       },
-       [sdma_state_s60_idle_halt_wait] = {
-               .go_s99_running_tofalse = 1,
-               .op_enable = 0,
-               .op_intenable = 0,
-               .op_halt = 1,
-               .op_cleanup = 0,
-       },
-       [sdma_state_s80_hw_freeze] = {
-               .op_enable = 0,
-               .op_intenable = 0,
-               .op_halt = 0,
-               .op_cleanup = 0,
-       },
-       [sdma_state_s82_freeze_sw_clean] = {
-               .op_enable = 0,
-               .op_intenable = 0,
-               .op_halt = 0,
-               .op_cleanup = 0,
-       },
-       [sdma_state_s99_running] = {
-               .op_enable = 1,
-               .op_intenable = 1,
-               .op_halt = 0,
-               .op_cleanup = 0,
-               .go_s99_running_totrue = 1,
-       },
-};
-
-#define SDMA_TAIL_UPDATE_THRESH 0x1F
-
-/* declare all statics here rather than keep sorting */
-static void sdma_complete(struct kref *);
-static void sdma_finalput(struct sdma_state *);
-static void sdma_get(struct sdma_state *);
-static void sdma_hw_clean_up_task(unsigned long);
-static void sdma_put(struct sdma_state *);
-static void sdma_set_state(struct sdma_engine *, enum sdma_states);
-static void sdma_start_hw_clean_up(struct sdma_engine *);
-static void sdma_sw_clean_up_task(unsigned long);
-static void sdma_sendctrl(struct sdma_engine *, unsigned);
-static void init_sdma_regs(struct sdma_engine *, u32, uint);
-static void sdma_process_event(
-       struct sdma_engine *sde,
-       enum sdma_events event);
-static void __sdma_process_event(
-       struct sdma_engine *sde,
-       enum sdma_events event);
-static void dump_sdma_state(struct sdma_engine *sde);
-static void sdma_make_progress(struct sdma_engine *sde, u64 status);
-static void sdma_desc_avail(struct sdma_engine *sde, unsigned avail);
-static void sdma_flush_descq(struct sdma_engine *sde);
-
-/**
- * sdma_state_name() - return state string from enum
- * @state: state
- */
-static const char *sdma_state_name(enum sdma_states state)
-{
-       return sdma_state_names[state];
-}
-
-static void sdma_get(struct sdma_state *ss)
-{
-       kref_get(&ss->kref);
-}
-
-static void sdma_complete(struct kref *kref)
-{
-       struct sdma_state *ss =
-               container_of(kref, struct sdma_state, kref);
-
-       complete(&ss->comp);
-}
-
-static void sdma_put(struct sdma_state *ss)
-{
-       kref_put(&ss->kref, sdma_complete);
-}
-
-static void sdma_finalput(struct sdma_state *ss)
-{
-       sdma_put(ss);
-       wait_for_completion(&ss->comp);
-}
-
-static inline void write_sde_csr(
-       struct sdma_engine *sde,
-       u32 offset0,
-       u64 value)
-{
-       write_kctxt_csr(sde->dd, sde->this_idx, offset0, value);
-}
-
-static inline u64 read_sde_csr(
-       struct sdma_engine *sde,
-       u32 offset0)
-{
-       return read_kctxt_csr(sde->dd, sde->this_idx, offset0);
-}
-
-/*
- * sdma_wait_for_packet_egress() - wait for the VL FIFO occupancy for
- * sdma engine 'sde' to drop to 0.
- */
-static void sdma_wait_for_packet_egress(struct sdma_engine *sde,
-                                       int pause)
-{
-       u64 off = 8 * sde->this_idx;
-       struct hfi1_devdata *dd = sde->dd;
-       int lcnt = 0;
-       u64 reg_prev;
-       u64 reg = 0;
-
-       while (1) {
-               reg_prev = reg;
-               reg = read_csr(dd, off + SEND_EGRESS_SEND_DMA_STATUS);
-
-               reg &= SDMA_EGRESS_PACKET_OCCUPANCY_SMASK;
-               reg >>= SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT;
-               if (reg == 0)
-                       break;
-               /* counter is reset if the occupancy count changes */
-               if (reg != reg_prev)
-                       lcnt = 0;
-               if (lcnt++ > 500) {
-                       /* timed out - bounce the link */
-                       dd_dev_err(dd, "%s: engine %u timeout waiting for packets to egress, remaining count %u, bouncing link\n",
-                                  __func__, sde->this_idx, (u32)reg);
-                       queue_work(dd->pport->hfi1_wq,
-                                  &dd->pport->link_bounce_work);
-                       break;
-               }
-               udelay(1);
-       }
-}
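
The loop above is a progress-aware poll: the stall counter only advances while the occupancy reading does not change, so the 500-iteration limit bounds time without progress rather than total drain time. Below is a minimal, driver-independent sketch of that pattern; read_occupancy() and ctx are hypothetical stand-ins for the CSR read.

#include <stdbool.h>
#include <stdint.h>

/*
 * Progress-aware poll, mirroring sdma_wait_for_packet_egress():
 * give up only after 'limit' consecutive reads with no change.
 * read_occupancy() is a hypothetical stand-in for the CSR read.
 */
static bool wait_for_drain(uint64_t (*read_occupancy)(void *ctx), void *ctx,
			   unsigned int limit)
{
	uint64_t prev, cur = 0;
	unsigned int stalls = 0;

	while (1) {
		prev = cur;
		cur = read_occupancy(ctx);
		if (cur == 0)
			return true;	/* fully drained */
		if (cur != prev)
			stalls = 0;	/* progress resets the stall counter */
		if (stalls++ > limit)
			return false;	/* stalled; the driver bounces the link */
		/* a real caller would delay here, e.g. udelay(1) */
	}
}
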
-
-/*
- * sdma_wait() - wait for packet egress to complete for all SDMA engines,
- * and pause for credit return.
- */
-void sdma_wait(struct hfi1_devdata *dd)
-{
-       int i;
-
-       for (i = 0; i < dd->num_sdma; i++) {
-               struct sdma_engine *sde = &dd->per_sdma[i];
-
-               sdma_wait_for_packet_egress(sde, 0);
-       }
-}
-
-static inline void sdma_set_desc_cnt(struct sdma_engine *sde, unsigned cnt)
-{
-       u64 reg;
-
-       if (!(sde->dd->flags & HFI1_HAS_SDMA_TIMEOUT))
-               return;
-       reg = cnt;
-       reg &= SD(DESC_CNT_CNT_MASK);
-       reg <<= SD(DESC_CNT_CNT_SHIFT);
-       write_sde_csr(sde, SD(DESC_CNT), reg);
-}
-
-static inline void complete_tx(struct sdma_engine *sde,
-                              struct sdma_txreq *tx,
-                              int res)
-{
-       /* cache these before the completion callback can modify the tx */
-       struct iowait *wait = tx->wait;
-       callback_t complete = tx->complete;
-
-#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
-       trace_hfi1_sdma_out_sn(sde, tx->sn);
-       if (WARN_ON_ONCE(sde->head_sn != tx->sn))
-               dd_dev_err(sde->dd, "expected %llu got %llu\n",
-                          sde->head_sn, tx->sn);
-       sde->head_sn++;
-#endif
-       sdma_txclean(sde->dd, tx);
-       if (complete)
-               (*complete)(tx, res);
-       if (iowait_sdma_dec(wait) && wait)
-               iowait_drain_wakeup(wait);
-}
-
-/*
- * Complete all the sdma requests with an SDMA_TXREQ_S_ABORTED status
- *
- * Depending on timing there can be txreqs in two places:
- * - in the descq ring
- * - in the flush list
- *
- * To avoid ordering issues the descq ring needs to be flushed
- * first followed by the flush list.
- *
- * This routine is called from two places
- * - From a work queue item
- * - Directly from the state machine just before setting the
- *   state to running
- *
- * Must be called with head_lock held
- *
- */
-static void sdma_flush(struct sdma_engine *sde)
-{
-       struct sdma_txreq *txp, *txp_next;
-       LIST_HEAD(flushlist);
-       unsigned long flags;
-
-       /* flush from head to tail */
-       sdma_flush_descq(sde);
-       spin_lock_irqsave(&sde->flushlist_lock, flags);
-       /* copy flush list */
-       list_for_each_entry_safe(txp, txp_next, &sde->flushlist, list) {
-               list_del_init(&txp->list);
-               list_add_tail(&txp->list, &flushlist);
-       }
-       spin_unlock_irqrestore(&sde->flushlist_lock, flags);
-       /* flush from flush list */
-       list_for_each_entry_safe(txp, txp_next, &flushlist, list)
-               complete_tx(sde, txp, SDMA_TXREQ_S_ABORTED);
-}
-
-/*
- * Fields a work request for flushing the descq ring
- * and the flush list
- *
- * If the engine has been brought to running during
- * the scheduling delay, the flush is ignored, assuming
- * that the process of bringing the engine to running
- * would have done this flush prior to going to running.
- *
- */
-static void sdma_field_flush(struct work_struct *work)
-{
-       unsigned long flags;
-       struct sdma_engine *sde =
-               container_of(work, struct sdma_engine, flush_worker);
-
-       write_seqlock_irqsave(&sde->head_lock, flags);
-       if (!__sdma_running(sde))
-               sdma_flush(sde);
-       write_sequnlock_irqrestore(&sde->head_lock, flags);
-}
-
-static void sdma_err_halt_wait(struct work_struct *work)
-{
-       struct sdma_engine *sde = container_of(work, struct sdma_engine,
-                                               err_halt_worker);
-       u64 statuscsr;
-       unsigned long timeout;
-
-       timeout = jiffies + msecs_to_jiffies(SDMA_ERR_HALT_TIMEOUT);
-       while (1) {
-               statuscsr = read_sde_csr(sde, SD(STATUS));
-               statuscsr &= SD(STATUS_ENG_HALTED_SMASK);
-               if (statuscsr)
-                       break;
-               if (time_after(jiffies, timeout)) {
-                       dd_dev_err(sde->dd,
-                                  "SDMA engine %d - timeout waiting for engine to halt\n",
-                                  sde->this_idx);
-                       /*
-                        * Continue anyway.  This could happen if there was
-                        * an uncorrectable error in the wrong spot.
-                        */
-                       break;
-               }
-               usleep_range(80, 120);
-       }
-
-       sdma_process_event(sde, sdma_event_e15_hw_halt_done);
-}
-
-static void sdma_err_progress_check_schedule(struct sdma_engine *sde)
-{
-       if (!is_bx(sde->dd) && HFI1_CAP_IS_KSET(SDMA_AHG)) {
-               unsigned index;
-               struct hfi1_devdata *dd = sde->dd;
-
-               for (index = 0; index < dd->num_sdma; index++) {
-                       struct sdma_engine *curr_sdma = &dd->per_sdma[index];
-
-                       if (curr_sdma != sde)
-                               curr_sdma->progress_check_head =
-                                                       curr_sdma->descq_head;
-               }
-               dd_dev_err(sde->dd,
-                          "SDMA engine %d - check scheduled\n",
-                               sde->this_idx);
-               mod_timer(&sde->err_progress_check_timer, jiffies + 10);
-       }
-}
-
-static void sdma_err_progress_check(unsigned long data)
-{
-       unsigned index;
-       struct sdma_engine *sde = (struct sdma_engine *)data;
-
-       dd_dev_err(sde->dd, "SDE progress check event\n");
-       for (index = 0; index < sde->dd->num_sdma; index++) {
-               struct sdma_engine *curr_sde = &sde->dd->per_sdma[index];
-               unsigned long flags;
-
-               /* check progress on each engine except the current one */
-               if (curr_sde == sde)
-                       continue;
-               /*
-                * We must lock interrupts when acquiring sde->lock,
-                * to avoid a deadlock if interrupt triggers and spins on
-                * the same lock on same CPU
-                */
-               spin_lock_irqsave(&curr_sde->tail_lock, flags);
-               write_seqlock(&curr_sde->head_lock);
-
-               /* skip non-running queues */
-               if (curr_sde->state.current_state != sdma_state_s99_running) {
-                       write_sequnlock(&curr_sde->head_lock);
-                       spin_unlock_irqrestore(&curr_sde->tail_lock, flags);
-                       continue;
-               }
-
-               if ((curr_sde->descq_head != curr_sde->descq_tail) &&
-                   (curr_sde->descq_head ==
-                               curr_sde->progress_check_head))
-                       __sdma_process_event(curr_sde,
-                                            sdma_event_e90_sw_halted);
-               write_sequnlock(&curr_sde->head_lock);
-               spin_unlock_irqrestore(&curr_sde->tail_lock, flags);
-       }
-       schedule_work(&sde->err_halt_worker);
-}
-
-static void sdma_hw_clean_up_task(unsigned long opaque)
-{
-       struct sdma_engine *sde = (struct sdma_engine *)opaque;
-       u64 statuscsr;
-
-       while (1) {
-#ifdef CONFIG_SDMA_VERBOSITY
-               dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
-                          sde->this_idx, slashstrip(__FILE__), __LINE__,
-                       __func__);
-#endif
-               statuscsr = read_sde_csr(sde, SD(STATUS));
-               statuscsr &= SD(STATUS_ENG_CLEANED_UP_SMASK);
-               if (statuscsr)
-                       break;
-               udelay(10);
-       }
-
-       sdma_process_event(sde, sdma_event_e25_hw_clean_up_done);
-}
-
-static inline struct sdma_txreq *get_txhead(struct sdma_engine *sde)
-{
-       smp_read_barrier_depends(); /* see sdma_update_tail() */
-       return sde->tx_ring[sde->tx_head & sde->sdma_mask];
-}
-
-/*
- * flush ring for recovery
- */
-static void sdma_flush_descq(struct sdma_engine *sde)
-{
-       u16 head, tail;
-       int progress = 0;
-       struct sdma_txreq *txp = get_txhead(sde);
-
-       /* The reason for some of the complexity of this code is that
-        * not all descriptors have corresponding txps.  So, we have to
-        * be able to skip over descs until we wander into the range of
-        * the next txp on the list.
-        */
-       head = sde->descq_head & sde->sdma_mask;
-       tail = sde->descq_tail & sde->sdma_mask;
-       while (head != tail) {
-               /* advance head, wrap if needed */
-               head = ++sde->descq_head & sde->sdma_mask;
-               /* if now past this txp's descs, do the callback */
-               if (txp && txp->next_descq_idx == head) {
-                       /* remove from list */
-                       sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL;
-                       complete_tx(sde, txp, SDMA_TXREQ_S_ABORTED);
-                       trace_hfi1_sdma_progress(sde, head, tail, txp);
-                       txp = get_txhead(sde);
-               }
-               progress++;
-       }
-       if (progress)
-               sdma_desc_avail(sde, sdma_descq_freecnt(sde));
-}
-
-static void sdma_sw_clean_up_task(unsigned long opaque)
-{
-       struct sdma_engine *sde = (struct sdma_engine *)opaque;
-       unsigned long flags;
-
-       spin_lock_irqsave(&sde->tail_lock, flags);
-       write_seqlock(&sde->head_lock);
-
-       /*
-        * At this point, the following should always be true:
-        * - We are halted, so no more descriptors are getting retired.
-        * - We are not running, so no one is submitting new work.
-        * - Only we can send the e40_sw_cleaned, so we can't start
-        *   running again until we say so.  So, the active list and
-        *   descq are ours to play with.
-        */
-
-       /*
-        * In the error clean up sequence, software clean must be called
-        * before the hardware clean so we can use the hardware head in
-        * the progress routine.  A hardware clean or SPC unfreeze will
-        * reset the hardware head.
-        *
-        * Process all retired requests. The progress routine will use the
-        * latest physical hardware head - we are not running so speed does
-        * not matter.
-        */
-       sdma_make_progress(sde, 0);
-
-       sdma_flush(sde);
-
-       /*
-        * Reset our notion of head and tail.
-        * Note that the HW registers have been reset via an earlier
-        * clean up.
-        */
-       sde->descq_tail = 0;
-       sde->descq_head = 0;
-       sde->desc_avail = sdma_descq_freecnt(sde);
-       *sde->head_dma = 0;
-
-       __sdma_process_event(sde, sdma_event_e40_sw_cleaned);
-
-       write_sequnlock(&sde->head_lock);
-       spin_unlock_irqrestore(&sde->tail_lock, flags);
-}
-
-static void sdma_sw_tear_down(struct sdma_engine *sde)
-{
-       struct sdma_state *ss = &sde->state;
-
-       /* Releasing this reference means the state machine has stopped. */
-       sdma_put(ss);
-
-       /* stop waiting for all unfreeze events to complete */
-       atomic_set(&sde->dd->sdma_unfreeze_count, -1);
-       wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
-}
-
-static void sdma_start_hw_clean_up(struct sdma_engine *sde)
-{
-       tasklet_hi_schedule(&sde->sdma_hw_clean_up_task);
-}
-
-static void sdma_set_state(struct sdma_engine *sde,
-                          enum sdma_states next_state)
-{
-       struct sdma_state *ss = &sde->state;
-       const struct sdma_set_state_action *action = sdma_action_table;
-       unsigned op = 0;
-
-       trace_hfi1_sdma_state(
-               sde,
-               sdma_state_names[ss->current_state],
-               sdma_state_names[next_state]);
-
-       /* debugging bookkeeping */
-       ss->previous_state = ss->current_state;
-       ss->previous_op = ss->current_op;
-       ss->current_state = next_state;
-
-       if (ss->previous_state != sdma_state_s99_running &&
-           next_state == sdma_state_s99_running)
-               sdma_flush(sde);
-
-       if (action[next_state].op_enable)
-               op |= SDMA_SENDCTRL_OP_ENABLE;
-
-       if (action[next_state].op_intenable)
-               op |= SDMA_SENDCTRL_OP_INTENABLE;
-
-       if (action[next_state].op_halt)
-               op |= SDMA_SENDCTRL_OP_HALT;
-
-       if (action[next_state].op_cleanup)
-               op |= SDMA_SENDCTRL_OP_CLEANUP;
-
-       if (action[next_state].go_s99_running_tofalse)
-               ss->go_s99_running = 0;
-
-       if (action[next_state].go_s99_running_totrue)
-               ss->go_s99_running = 1;
-
-       ss->current_op = op;
-       sdma_sendctrl(sde, ss->current_op);
-}
-
-/**
- * sdma_get_descq_cnt() - called when device probed
- *
- * Return a validated descq count.
- *
- * This is currently only used in the verbs initialization to build the tx
- * list.
- *
- * This will probably be deleted in favor of a more scalable approach to
- * alloc tx's.
- *
- */
-u16 sdma_get_descq_cnt(void)
-{
-       u16 count = sdma_descq_cnt;
-
-       if (!count)
-               return SDMA_DESCQ_CNT;
-       /* count must be a power of 2 between 64 and 32768, inclusive.
-        * Otherwise return the default.
-        */
-       if (!is_power_of_2(count))
-               return SDMA_DESCQ_CNT;
-       if (count < 64 || count > 32768)
-               return SDMA_DESCQ_CNT;
-       return count;
-}
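
For reference, the same validation rule as a standalone sketch: the requested count is used only if it is a power of 2 within [64, 32768], otherwise the default wins. The default value below is a stand-in; the real SDMA_DESCQ_CNT is defined elsewhere in the driver.

#include <stdbool.h>
#include <stdint.h>

#define DESCQ_CNT_DEFAULT 2048	/* stand-in for SDMA_DESCQ_CNT */

static bool is_pow2(uint32_t v)
{
	return v && !(v & (v - 1));
}

/* mirrors sdma_get_descq_cnt(): accept only a power of 2 in [64, 32768] */
static uint16_t validate_descq_cnt(uint32_t requested)
{
	if (!requested || !is_pow2(requested) ||
	    requested < 64 || requested > 32768)
		return DESCQ_CNT_DEFAULT;
	return (uint16_t)requested;
}
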
-
-/**
- * sdma_select_engine_vl() - select sdma engine
- * @dd: devdata
- * @selector: a spreading factor
- * @vl: this vl
- *
- *
- * This function returns an engine based on the selector and a vl.  The
- * mapping fields are protected by RCU.
- */
-struct sdma_engine *sdma_select_engine_vl(
-       struct hfi1_devdata *dd,
-       u32 selector,
-       u8 vl)
-{
-       struct sdma_vl_map *m;
-       struct sdma_map_elem *e;
-       struct sdma_engine *rval;
-
-       /* NOTE: This should only happen if the SC->VL mapping changed after
-        *      the initial checks on the QP/AH.
-        *      The default path below will return engine 0.
-        */
-       if (vl >= num_vls) {
-               rval = NULL;
-               goto done;
-       }
-
-       rcu_read_lock();
-       m = rcu_dereference(dd->sdma_map);
-       if (unlikely(!m)) {
-               rcu_read_unlock();
-               return &dd->per_sdma[0];
-       }
-       e = m->map[vl & m->mask];
-       rval = e->sde[selector & e->mask];
-       rcu_read_unlock();
-
-done:
-       rval =  !rval ? &dd->per_sdma[0] : rval;
-       trace_hfi1_sdma_engine_select(dd, selector, vl, rval->this_idx);
-       return rval;
-}
-
-/**
- * sdma_select_engine_sc() - select sdma engine
- * @dd: devdata
- * @selector: a spreading factor
- * @sc5: the 5 bit sc
- *
- *
- * This function returns an engine based on the selector and an sc.
- */
-struct sdma_engine *sdma_select_engine_sc(
-       struct hfi1_devdata *dd,
-       u32 selector,
-       u8 sc5)
-{
-       u8 vl = sc_to_vlt(dd, sc5);
-
-       return sdma_select_engine_vl(dd, selector, vl);
-}
-
-/*
- * Free the indicated map struct
- */
-static void sdma_map_free(struct sdma_vl_map *m)
-{
-       int i;
-
-       for (i = 0; m && i < m->actual_vls; i++)
-               kfree(m->map[i]);
-       kfree(m);
-}
-
-/*
- * Handle RCU callback
- */
-static void sdma_map_rcu_callback(struct rcu_head *list)
-{
-       struct sdma_vl_map *m = container_of(list, struct sdma_vl_map, list);
-
-       sdma_map_free(m);
-}
-
-/**
- * sdma_map_init - called when # vls change
- * @dd: hfi1_devdata
- * @port: port number
- * @num_vls: number of vls
- * @vl_engines: per vl engine mapping (optional)
- *
- * This routine changes the mapping based on the number of vls.
- *
- * vl_engines is used to specify a non-uniform vl/engine loading. NULL
- * implies auto-computing the loading and giving each VL a uniform
- * distribution of engines per VL.
- *
- * The auto algorithm computes the sde_per_vl and the number of extra
- * engines.  Any extra engines are added from the last VL on down.
- *
- * rcu locking is used here to control access to the mapping fields.
- *
- * If either num_vls or num_sdma is not a power of 2, the array sizes
- * in the struct sdma_vl_map and the struct sdma_map_elem are rounded
- * up to the next highest power of 2 and the first entry is reused
- * in a round robin fashion.
- *
- * If an error occurs the map change is not done and the mapping is
- * not changed.
- *
- */
-int sdma_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, u8 *vl_engines)
-{
-       int i, j;
-       int extra, sde_per_vl;
-       int engine = 0;
-       u8 lvl_engines[OPA_MAX_VLS];
-       struct sdma_vl_map *oldmap, *newmap;
-
-       if (!(dd->flags & HFI1_HAS_SEND_DMA))
-               return 0;
-
-       if (!vl_engines) {
-               /* truncate divide */
-               sde_per_vl = dd->num_sdma / num_vls;
-               /* extras */
-               extra = dd->num_sdma % num_vls;
-               vl_engines = lvl_engines;
-               /* add extras from last vl down */
-               for (i = num_vls - 1; i >= 0; i--, extra--)
-                       vl_engines[i] = sde_per_vl + (extra > 0 ? 1 : 0);
-       }
-       /* build new map */
-       newmap = kzalloc(
-               sizeof(struct sdma_vl_map) +
-                       roundup_pow_of_two(num_vls) *
-                       sizeof(struct sdma_map_elem *),
-               GFP_KERNEL);
-       if (!newmap)
-               goto bail;
-       newmap->actual_vls = num_vls;
-       newmap->vls = roundup_pow_of_two(num_vls);
-       newmap->mask = (1 << ilog2(newmap->vls)) - 1;
-       /* initialize back-map */
-       for (i = 0; i < TXE_NUM_SDMA_ENGINES; i++)
-               newmap->engine_to_vl[i] = -1;
-       for (i = 0; i < newmap->vls; i++) {
-               /* save for wrap around */
-               int first_engine = engine;
-
-               if (i < newmap->actual_vls) {
-                       int sz = roundup_pow_of_two(vl_engines[i]);
-
-                       /* only allocate once */
-                       newmap->map[i] = kzalloc(
-                               sizeof(struct sdma_map_elem) +
-                                       sz * sizeof(struct sdma_engine *),
-                               GFP_KERNEL);
-                       if (!newmap->map[i])
-                               goto bail;
-                       newmap->map[i]->mask = (1 << ilog2(sz)) - 1;
-                       /* assign engines */
-                       for (j = 0; j < sz; j++) {
-                               newmap->map[i]->sde[j] =
-                                       &dd->per_sdma[engine];
-                               if (++engine >= first_engine + vl_engines[i])
-                                       /* wrap back to first engine */
-                                       engine = first_engine;
-                       }
-                       /* assign back-map */
-                       for (j = 0; j < vl_engines[i]; j++)
-                               newmap->engine_to_vl[first_engine + j] = i;
-               } else {
-                       /* just re-use entry without allocating */
-                       newmap->map[i] = newmap->map[i % num_vls];
-               }
-               engine = first_engine + vl_engines[i];
-       }
-       /* newmap in hand, save old map */
-       spin_lock_irq(&dd->sde_map_lock);
-       oldmap = rcu_dereference_protected(dd->sdma_map,
-                                          lockdep_is_held(&dd->sde_map_lock));
-
-       /* publish newmap */
-       rcu_assign_pointer(dd->sdma_map, newmap);
-
-       spin_unlock_irq(&dd->sde_map_lock);
-       /* success, free any old map after grace period */
-       if (oldmap)
-               call_rcu(&oldmap->list, sdma_map_rcu_callback);
-       return 0;
-bail:
-       /* free any partial allocation */
-       sdma_map_free(newmap);
-       return -ENOMEM;
-}
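
The auto-loading computation described in the comment block can be seen in isolation below: a truncating divide gives the per-VL engine count and the remainder is handed out one engine at a time starting from the last VL. Names and the example sizes are illustrative only.

#include <stdio.h>

/* mirrors the NULL-vl_engines branch of sdma_map_init():
 * distribute num_sdma engines across num_vls, extras from the last VL down */
static void auto_vl_engines(unsigned int num_sdma, unsigned int num_vls,
			    unsigned char *vl_engines)
{
	unsigned int sde_per_vl = num_sdma / num_vls;	/* truncating divide */
	int extra = num_sdma % num_vls;			/* leftover engines */
	int i;

	for (i = (int)num_vls - 1; i >= 0; i--, extra--)
		vl_engines[i] = sde_per_vl + (extra > 0 ? 1 : 0);
}

int main(void)
{
	unsigned char vl_engines[8];
	int i;

	auto_vl_engines(16, 3, vl_engines);	/* e.g. 16 engines, 3 VLs */
	for (i = 0; i < 3; i++)
		printf("VL%d -> %u engines\n", i, vl_engines[i]);
	return 0;				/* prints 5, 5, 6 */
}
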
-
-/*
- * Clean up allocated memory.
- *
- * This routine can be called regardless of the success of sdma_init().
- *
- */
-static void sdma_clean(struct hfi1_devdata *dd, size_t num_engines)
-{
-       size_t i;
-       struct sdma_engine *sde;
-
-       if (dd->sdma_pad_dma) {
-               dma_free_coherent(&dd->pcidev->dev, 4,
-                                 (void *)dd->sdma_pad_dma,
-                                 dd->sdma_pad_phys);
-               dd->sdma_pad_dma = NULL;
-               dd->sdma_pad_phys = 0;
-       }
-       if (dd->sdma_heads_dma) {
-               dma_free_coherent(&dd->pcidev->dev, dd->sdma_heads_size,
-                                 (void *)dd->sdma_heads_dma,
-                                 dd->sdma_heads_phys);
-               dd->sdma_heads_dma = NULL;
-               dd->sdma_heads_phys = 0;
-       }
-       for (i = 0; dd->per_sdma && i < num_engines; ++i) {
-               sde = &dd->per_sdma[i];
-
-               sde->head_dma = NULL;
-               sde->head_phys = 0;
-
-               if (sde->descq) {
-                       dma_free_coherent(
-                               &dd->pcidev->dev,
-                               sde->descq_cnt * sizeof(u64[2]),
-                               sde->descq,
-                               sde->descq_phys
-                       );
-                       sde->descq = NULL;
-                       sde->descq_phys = 0;
-               }
-               kvfree(sde->tx_ring);
-               sde->tx_ring = NULL;
-       }
-       spin_lock_irq(&dd->sde_map_lock);
-       sdma_map_free(rcu_access_pointer(dd->sdma_map));
-       RCU_INIT_POINTER(dd->sdma_map, NULL);
-       spin_unlock_irq(&dd->sde_map_lock);
-       synchronize_rcu();
-       kfree(dd->per_sdma);
-       dd->per_sdma = NULL;
-}
-
-/**
- * sdma_init() - called when device probed
- * @dd: hfi1_devdata
- * @port: port number (currently only zero)
- *
- * sdma_init initializes the specified number of engines.
- *
- * The code initializes each sde and its csrs.  Interrupts
- * are not required to be enabled.
- *
- * Returns:
- * 0 - success, -errno on failure
- */
-int sdma_init(struct hfi1_devdata *dd, u8 port)
-{
-       unsigned this_idx;
-       struct sdma_engine *sde;
-       u16 descq_cnt;
-       void *curr_head;
-       struct hfi1_pportdata *ppd = dd->pport + port;
-       u32 per_sdma_credits;
-       uint idle_cnt = sdma_idle_cnt;
-       size_t num_engines = dd->chip_sdma_engines;
-
-       if (!HFI1_CAP_IS_KSET(SDMA)) {
-               HFI1_CAP_CLEAR(SDMA_AHG);
-               return 0;
-       }
-       if (mod_num_sdma &&
-           /* can't exceed chip support */
-           mod_num_sdma <= dd->chip_sdma_engines &&
-           /* count must be >= vls */
-           mod_num_sdma >= num_vls)
-               num_engines = mod_num_sdma;
-
-       dd_dev_info(dd, "SDMA mod_num_sdma: %u\n", mod_num_sdma);
-       dd_dev_info(dd, "SDMA chip_sdma_engines: %u\n", dd->chip_sdma_engines);
-       dd_dev_info(dd, "SDMA chip_sdma_mem_size: %u\n",
-                   dd->chip_sdma_mem_size);
-
-       per_sdma_credits =
-               dd->chip_sdma_mem_size / (num_engines * SDMA_BLOCK_SIZE);
-
-       /* set up freeze waitqueue */
-       init_waitqueue_head(&dd->sdma_unfreeze_wq);
-       atomic_set(&dd->sdma_unfreeze_count, 0);
-
-       descq_cnt = sdma_get_descq_cnt();
-       dd_dev_info(dd, "SDMA engines %zu descq_cnt %u\n",
-                   num_engines, descq_cnt);
-
-       /* alloc memory for array of send engines */
-       dd->per_sdma = kcalloc(num_engines, sizeof(*dd->per_sdma), GFP_KERNEL);
-       if (!dd->per_sdma)
-               return -ENOMEM;
-
-       idle_cnt = ns_to_cclock(dd, idle_cnt);
-       if (!sdma_desct_intr)
-               sdma_desct_intr = SDMA_DESC_INTR;
-
-       /* Allocate memory for SendDMA descriptor FIFOs */
-       for (this_idx = 0; this_idx < num_engines; ++this_idx) {
-               sde = &dd->per_sdma[this_idx];
-               sde->dd = dd;
-               sde->ppd = ppd;
-               sde->this_idx = this_idx;
-               sde->descq_cnt = descq_cnt;
-               sde->desc_avail = sdma_descq_freecnt(sde);
-               sde->sdma_shift = ilog2(descq_cnt);
-               sde->sdma_mask = (1 << sde->sdma_shift) - 1;
-
-               /* Create a mask specifically for each interrupt source */
-               sde->int_mask = (u64)1 << (0 * TXE_NUM_SDMA_ENGINES +
-                                          this_idx);
-               sde->progress_mask = (u64)1 << (1 * TXE_NUM_SDMA_ENGINES +
-                                               this_idx);
-               sde->idle_mask = (u64)1 << (2 * TXE_NUM_SDMA_ENGINES +
-                                           this_idx);
-               /* Create a combined mask to cover all 3 interrupt sources */
-               sde->imask = sde->int_mask | sde->progress_mask |
-                            sde->idle_mask;
-
-               spin_lock_init(&sde->tail_lock);
-               seqlock_init(&sde->head_lock);
-               spin_lock_init(&sde->senddmactrl_lock);
-               spin_lock_init(&sde->flushlist_lock);
-               /* ensure there is always a zero bit */
-               sde->ahg_bits = 0xfffffffe00000000ULL;
-
-               sdma_set_state(sde, sdma_state_s00_hw_down);
-
-               /* set up reference counting */
-               kref_init(&sde->state.kref);
-               init_completion(&sde->state.comp);
-
-               INIT_LIST_HEAD(&sde->flushlist);
-               INIT_LIST_HEAD(&sde->dmawait);
-
-               sde->tail_csr =
-                       get_kctxt_csr_addr(dd, this_idx, SD(TAIL));
-
-               if (idle_cnt)
-                       dd->default_desc1 =
-                               SDMA_DESC1_HEAD_TO_HOST_FLAG;
-               else
-                       dd->default_desc1 =
-                               SDMA_DESC1_INT_REQ_FLAG;
-
-               tasklet_init(&sde->sdma_hw_clean_up_task, sdma_hw_clean_up_task,
-                            (unsigned long)sde);
-
-               tasklet_init(&sde->sdma_sw_clean_up_task, sdma_sw_clean_up_task,
-                            (unsigned long)sde);
-               INIT_WORK(&sde->err_halt_worker, sdma_err_halt_wait);
-               INIT_WORK(&sde->flush_worker, sdma_field_flush);
-
-               sde->progress_check_head = 0;
-
-               setup_timer(&sde->err_progress_check_timer,
-                           sdma_err_progress_check, (unsigned long)sde);
-
-               sde->descq = dma_zalloc_coherent(
-                       &dd->pcidev->dev,
-                       descq_cnt * sizeof(u64[2]),
-                       &sde->descq_phys,
-                       GFP_KERNEL
-               );
-               if (!sde->descq)
-                       goto bail;
-               sde->tx_ring =
-                       kcalloc(descq_cnt, sizeof(struct sdma_txreq *),
-                               GFP_KERNEL);
-               if (!sde->tx_ring)
-                       sde->tx_ring =
-                               vzalloc(
-                                       sizeof(struct sdma_txreq *) *
-                                       descq_cnt);
-               if (!sde->tx_ring)
-                       goto bail;
-       }
-
-       dd->sdma_heads_size = L1_CACHE_BYTES * num_engines;
-       /* Allocate memory for DMA of head registers to memory */
-       dd->sdma_heads_dma = dma_zalloc_coherent(
-               &dd->pcidev->dev,
-               dd->sdma_heads_size,
-               &dd->sdma_heads_phys,
-               GFP_KERNEL
-       );
-       if (!dd->sdma_heads_dma) {
-               dd_dev_err(dd, "failed to allocate SendDMA head memory\n");
-               goto bail;
-       }
-
-       /* Allocate memory for pad */
-       dd->sdma_pad_dma = dma_zalloc_coherent(
-               &dd->pcidev->dev,
-               sizeof(u32),
-               &dd->sdma_pad_phys,
-               GFP_KERNEL
-       );
-       if (!dd->sdma_pad_dma) {
-               dd_dev_err(dd, "failed to allocate SendDMA pad memory\n");
-               goto bail;
-       }
-
-       /* assign each engine to different cacheline and init registers */
-       curr_head = (void *)dd->sdma_heads_dma;
-       for (this_idx = 0; this_idx < num_engines; ++this_idx) {
-               unsigned long phys_offset;
-
-               sde = &dd->per_sdma[this_idx];
-
-               sde->head_dma = curr_head;
-               curr_head += L1_CACHE_BYTES;
-               phys_offset = (unsigned long)sde->head_dma -
-                             (unsigned long)dd->sdma_heads_dma;
-               sde->head_phys = dd->sdma_heads_phys + phys_offset;
-               init_sdma_regs(sde, per_sdma_credits, idle_cnt);
-       }
-       dd->flags |= HFI1_HAS_SEND_DMA;
-       dd->flags |= idle_cnt ? HFI1_HAS_SDMA_TIMEOUT : 0;
-       dd->num_sdma = num_engines;
-       if (sdma_map_init(dd, port, ppd->vls_operational, NULL))
-               goto bail;
-       dd_dev_info(dd, "SDMA num_sdma: %u\n", dd->num_sdma);
-       return 0;
-
-bail:
-       sdma_clean(dd, num_engines);
-       return -ENOMEM;
-}
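
A worked example of the credit split performed near the top of sdma_init(): the shared send-DMA memory is divided evenly, giving each engine per_sdma_credits blocks starting at credits * this_idx (see init_sdma_regs() below). The block size and memory size here are assumed values for illustration, not the chip's real figures.

#include <stdio.h>

int main(void)
{
	/* illustrative values only; the real sizes come from chip CSRs */
	const unsigned int block_size = 64;	/* assumed bytes per credit */
	const unsigned int chip_sdma_mem_size = 2 * 1024 * 1024;
	const unsigned int num_engines = 16;
	const unsigned int per_sdma_credits =
		chip_sdma_mem_size / (num_engines * block_size);
	unsigned int this_idx;

	for (this_idx = 0; this_idx < num_engines; this_idx++)
		printf("engine %2u: %u credits at index %u\n",
		       this_idx, per_sdma_credits,
		       per_sdma_credits * this_idx);
	return 0;
}
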
-
-/**
- * sdma_all_running() - called when the link goes up
- * @dd: hfi1_devdata
- *
- * This routine moves all engines to the running state.
- */
-void sdma_all_running(struct hfi1_devdata *dd)
-{
-       struct sdma_engine *sde;
-       unsigned int i;
-
-       /* move all engines to running */
-       for (i = 0; i < dd->num_sdma; ++i) {
-               sde = &dd->per_sdma[i];
-               sdma_process_event(sde, sdma_event_e30_go_running);
-       }
-}
-
-/**
- * sdma_all_idle() - called when the link goes down
- * @dd: hfi1_devdata
- *
- * This routine moves all engines to the idle state.
- */
-void sdma_all_idle(struct hfi1_devdata *dd)
-{
-       struct sdma_engine *sde;
-       unsigned int i;
-
-       /* idle all engines */
-       for (i = 0; i < dd->num_sdma; ++i) {
-               sde = &dd->per_sdma[i];
-               sdma_process_event(sde, sdma_event_e70_go_idle);
-       }
-}
-
-/**
- * sdma_start() - called to kick off state processing for all engines
- * @dd: hfi1_devdata
- *
- * This routine is for kicking off the state processing for all required
- * sdma engines.  Interrupts need to be working at this point.
- *
- */
-void sdma_start(struct hfi1_devdata *dd)
-{
-       unsigned i;
-       struct sdma_engine *sde;
-
-       /* kick off the engines state processing */
-       for (i = 0; i < dd->num_sdma; ++i) {
-               sde = &dd->per_sdma[i];
-               sdma_process_event(sde, sdma_event_e10_go_hw_start);
-       }
-}
-
-/**
- * sdma_exit() - used when module is removed
- * @dd: hfi1_devdata
- */
-void sdma_exit(struct hfi1_devdata *dd)
-{
-       unsigned this_idx;
-       struct sdma_engine *sde;
-
-       for (this_idx = 0; dd->per_sdma && this_idx < dd->num_sdma;
-                       ++this_idx) {
-               sde = &dd->per_sdma[this_idx];
-               if (!list_empty(&sde->dmawait))
-                       dd_dev_err(dd, "sde %u: dmawait list not empty!\n",
-                                  sde->this_idx);
-               sdma_process_event(sde, sdma_event_e00_go_hw_down);
-
-               del_timer_sync(&sde->err_progress_check_timer);
-
-               /*
-                * This waits for the state machine to exit so it is not
-                * necessary to kill the sdma_sw_clean_up_task to make sure
-                * it is not running.
-                */
-               sdma_finalput(&sde->state);
-       }
-       sdma_clean(dd, dd->num_sdma);
-}
-
-/*
- * unmap the indicated descriptor
- */
-static inline void sdma_unmap_desc(
-       struct hfi1_devdata *dd,
-       struct sdma_desc *descp)
-{
-       switch (sdma_mapping_type(descp)) {
-       case SDMA_MAP_SINGLE:
-               dma_unmap_single(
-                       &dd->pcidev->dev,
-                       sdma_mapping_addr(descp),
-                       sdma_mapping_len(descp),
-                       DMA_TO_DEVICE);
-               break;
-       case SDMA_MAP_PAGE:
-               dma_unmap_page(
-                       &dd->pcidev->dev,
-                       sdma_mapping_addr(descp),
-                       sdma_mapping_len(descp),
-                       DMA_TO_DEVICE);
-               break;
-       }
-}
-
-/*
- * return the mode as indicated by the first
- * descriptor in the tx.
- */
-static inline u8 ahg_mode(struct sdma_txreq *tx)
-{
-       return (tx->descp[0].qw[1] & SDMA_DESC1_HEADER_MODE_SMASK)
-               >> SDMA_DESC1_HEADER_MODE_SHIFT;
-}
-
-/**
- * sdma_txclean() - clean tx of mappings, descp *kmalloc's
- * @dd: hfi1_devdata for unmapping
- * @tx: tx request to clean
- *
- * This is used in the progress routine to clean the tx or
- * by the ULP to toss an in-process tx build.
- *
- * The code can be called multiple times without issue.
- *
- */
-void sdma_txclean(
-       struct hfi1_devdata *dd,
-       struct sdma_txreq *tx)
-{
-       u16 i;
-
-       if (tx->num_desc) {
-               u8 skip = 0, mode = ahg_mode(tx);
-
-               /* unmap first */
-               sdma_unmap_desc(dd, &tx->descp[0]);
-               /* determine number of AHG descriptors to skip */
-               if (mode > SDMA_AHG_APPLY_UPDATE1)
-                       skip = mode >> 1;
-               for (i = 1 + skip; i < tx->num_desc; i++)
-                       sdma_unmap_desc(dd, &tx->descp[i]);
-               tx->num_desc = 0;
-       }
-       kfree(tx->coalesce_buf);
-       tx->coalesce_buf = NULL;
-       /* kmalloc'ed descp */
-       if (unlikely(tx->desc_limit > ARRAY_SIZE(tx->descs))) {
-               tx->desc_limit = ARRAY_SIZE(tx->descs);
-               kfree(tx->descp);
-       }
-}
-
-static inline u16 sdma_gethead(struct sdma_engine *sde)
-{
-       struct hfi1_devdata *dd = sde->dd;
-       int use_dmahead;
-       u16 hwhead;
-
-#ifdef CONFIG_SDMA_VERBOSITY
-       dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
-                  sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
-#endif
-
-retry:
-       use_dmahead = HFI1_CAP_IS_KSET(USE_SDMA_HEAD) && __sdma_running(sde) &&
-                                       (dd->flags & HFI1_HAS_SDMA_TIMEOUT);
-       hwhead = use_dmahead ?
-               (u16)le64_to_cpu(*sde->head_dma) :
-               (u16)read_sde_csr(sde, SD(HEAD));
-
-       if (unlikely(HFI1_CAP_IS_KSET(SDMA_HEAD_CHECK))) {
-               u16 cnt;
-               u16 swtail;
-               u16 swhead;
-               int sane;
-
-               swhead = sde->descq_head & sde->sdma_mask;
-               /* this code is really bad for cache line trading */
-               swtail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
-               cnt = sde->descq_cnt;
-
-               if (swhead < swtail)
-                       /* not wrapped */
-                       sane = (hwhead >= swhead) & (hwhead <= swtail);
-               else if (swhead > swtail)
-                       /* wrapped around */
-                       sane = ((hwhead >= swhead) && (hwhead < cnt)) ||
-                               (hwhead <= swtail);
-               else
-                       /* empty */
-                       sane = (hwhead == swhead);
-
-               if (unlikely(!sane)) {
-                       dd_dev_err(dd, "SDMA(%u) bad head (%s) hwhd=%hu swhd=%hu swtl=%hu cnt=%hu\n",
-                                  sde->this_idx,
-                                  use_dmahead ? "dma" : "kreg",
-                                  hwhead, swhead, swtail, cnt);
-                       if (use_dmahead) {
-                               /* try one more time, using csr */
-                               use_dmahead = 0;
-                               goto retry;
-                       }
-                       /* proceed as if no progress */
-                       hwhead = swhead;
-               }
-       }
-       return hwhead;
-}
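
The SDMA_HEAD_CHECK sanity test above reduces to a range test on a circular queue; a standalone restatement (hypothetical helper, not part of the driver) is shown below.

#include <stdbool.h>
#include <stdint.h>

/* mirrors the sanity check in sdma_gethead():
 * hwhead must lie between swhead and swtail on a ring of size cnt */
static bool hwhead_is_sane(uint16_t hwhead, uint16_t swhead,
			   uint16_t swtail, uint16_t cnt)
{
	if (swhead < swtail)			/* not wrapped */
		return hwhead >= swhead && hwhead <= swtail;
	if (swhead > swtail)			/* wrapped around */
		return (hwhead >= swhead && hwhead < cnt) ||
		       hwhead <= swtail;
	return hwhead == swhead;		/* empty ring */
}
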
-
-/*
- * This is called when there are send DMA descriptors that might be
- * available.
- *
- * This is called with head_lock held.
- */
-static void sdma_desc_avail(struct sdma_engine *sde, unsigned avail)
-{
-       struct iowait *wait, *nw;
-       struct iowait *waits[SDMA_WAIT_BATCH_SIZE];
-       unsigned i, n = 0, seq;
-       struct sdma_txreq *stx;
-       struct hfi1_ibdev *dev = &sde->dd->verbs_dev;
-
-#ifdef CONFIG_SDMA_VERBOSITY
-       dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
-                  slashstrip(__FILE__), __LINE__, __func__);
-       dd_dev_err(sde->dd, "avail: %u\n", avail);
-#endif
-
-       do {
-               seq = read_seqbegin(&dev->iowait_lock);
-               if (!list_empty(&sde->dmawait)) {
-                       /* at least one item */
-                       write_seqlock(&dev->iowait_lock);
-                       /* Harvest waiters wanting DMA descriptors */
-                       list_for_each_entry_safe(
-                                       wait,
-                                       nw,
-                                       &sde->dmawait,
-                                       list) {
-                               u16 num_desc = 0;
-
-                               if (!wait->wakeup)
-                                       continue;
-                               if (n == ARRAY_SIZE(waits))
-                                       break;
-                               if (!list_empty(&wait->tx_head)) {
-                                       stx = list_first_entry(
-                                               &wait->tx_head,
-                                               struct sdma_txreq,
-                                               list);
-                                       num_desc = stx->num_desc;
-                               }
-                               if (num_desc > avail)
-                                       break;
-                               avail -= num_desc;
-                               list_del_init(&wait->list);
-                               waits[n++] = wait;
-                       }
-                       write_sequnlock(&dev->iowait_lock);
-                       break;
-               }
-       } while (read_seqretry(&dev->iowait_lock, seq));
-
-       for (i = 0; i < n; i++)
-               waits[i]->wakeup(waits[i], SDMA_AVAIL_REASON);
-}
-
-/* head_lock must be held */
-static void sdma_make_progress(struct sdma_engine *sde, u64 status)
-{
-       struct sdma_txreq *txp = NULL;
-       int progress = 0;
-       u16 hwhead, swhead;
-       int idle_check_done = 0;
-
-       hwhead = sdma_gethead(sde);
-
-       /* The reason for some of the complexity of this code is that
-        * not all descriptors have corresponding txps.  So, we have to
-        * be able to skip over descs until we wander into the range of
-        * the next txp on the list.
-        */
-
-retry:
-       txp = get_txhead(sde);
-       swhead = sde->descq_head & sde->sdma_mask;
-       trace_hfi1_sdma_progress(sde, hwhead, swhead, txp);
-       while (swhead != hwhead) {
-               /* advance head, wrap if needed */
-               swhead = ++sde->descq_head & sde->sdma_mask;
-
-               /* if now past this txp's descs, do the callback */
-               if (txp && txp->next_descq_idx == swhead) {
-                       /* remove from list */
-                       sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL;
-                       complete_tx(sde, txp, SDMA_TXREQ_S_OK);
-                       /* see if there is another txp */
-                       txp = get_txhead(sde);
-               }
-               trace_hfi1_sdma_progress(sde, hwhead, swhead, txp);
-               progress++;
-       }
-
-       /*
-        * The SDMA idle interrupt is not guaranteed to be ordered with respect
-        * to updates to the dma_head location in host memory. The head
-        * value read might not be fully up to date. If there are pending
-        * descriptors and the SDMA idle interrupt fired then read from the
-        * CSR SDMA head instead to get the latest value from the hardware.
-        * The hardware SDMA head should be read at most once in this invocation
-        * of sdma_make_progress(), which is ensured by the
-        * idle_check_done flag.
-        */
-       if ((status & sde->idle_mask) && !idle_check_done) {
-               u16 swtail;
-
-               swtail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
-               if (swtail != hwhead) {
-                       hwhead = (u16)read_sde_csr(sde, SD(HEAD));
-                       idle_check_done = 1;
-                       goto retry;
-               }
-       }
-
-       sde->last_status = status;
-       if (progress)
-               sdma_desc_avail(sde, sdma_descq_freecnt(sde));
-}
-
-/*
- * sdma_engine_interrupt() - interrupt handler for engine
- * @sde: sdma engine
- * @status: sdma interrupt reason
- *
- * Status is a mask of the 3 possible interrupts for this engine.  It will
- * contain bits _only_ for this SDMA engine.  It will contain at least one
- * bit, it may contain more.
- */
-void sdma_engine_interrupt(struct sdma_engine *sde, u64 status)
-{
-       trace_hfi1_sdma_engine_interrupt(sde, status);
-       write_seqlock(&sde->head_lock);
-       sdma_set_desc_cnt(sde, sdma_desct_intr);
-       if (status & sde->idle_mask)
-               sde->idle_int_cnt++;
-       else if (status & sde->progress_mask)
-               sde->progress_int_cnt++;
-       else if (status & sde->int_mask)
-               sde->sdma_int_cnt++;
-       sdma_make_progress(sde, status);
-       write_sequnlock(&sde->head_lock);
-}
-
-/**
- * sdma_engine_error() - error handler for engine
- * @sde: sdma engine
- * @status: sdma interrupt reason
- */
-void sdma_engine_error(struct sdma_engine *sde, u64 status)
-{
-       unsigned long flags;
-
-#ifdef CONFIG_SDMA_VERBOSITY
-       dd_dev_err(sde->dd, "CONFIG SDMA(%u) error status 0x%llx state %s\n",
-                  sde->this_idx,
-                  (unsigned long long)status,
-                  sdma_state_names[sde->state.current_state]);
-#endif
-       spin_lock_irqsave(&sde->tail_lock, flags);
-       write_seqlock(&sde->head_lock);
-       if (status & ALL_SDMA_ENG_HALT_ERRS)
-               __sdma_process_event(sde, sdma_event_e60_hw_halted);
-       if (status & ~SD(ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK)) {
-               dd_dev_err(sde->dd,
-                          "SDMA (%u) engine error: 0x%llx state %s\n",
-                          sde->this_idx,
-                          (unsigned long long)status,
-                          sdma_state_names[sde->state.current_state]);
-               dump_sdma_state(sde);
-       }
-       write_sequnlock(&sde->head_lock);
-       spin_unlock_irqrestore(&sde->tail_lock, flags);
-}
-
-static void sdma_sendctrl(struct sdma_engine *sde, unsigned op)
-{
-       u64 set_senddmactrl = 0;
-       u64 clr_senddmactrl = 0;
-       unsigned long flags;
-
-#ifdef CONFIG_SDMA_VERBOSITY
-       dd_dev_err(sde->dd, "CONFIG SDMA(%u) senddmactrl E=%d I=%d H=%d C=%d\n",
-                  sde->this_idx,
-                  (op & SDMA_SENDCTRL_OP_ENABLE) ? 1 : 0,
-                  (op & SDMA_SENDCTRL_OP_INTENABLE) ? 1 : 0,
-                  (op & SDMA_SENDCTRL_OP_HALT) ? 1 : 0,
-                  (op & SDMA_SENDCTRL_OP_CLEANUP) ? 1 : 0);
-#endif
-
-       if (op & SDMA_SENDCTRL_OP_ENABLE)
-               set_senddmactrl |= SD(CTRL_SDMA_ENABLE_SMASK);
-       else
-               clr_senddmactrl |= SD(CTRL_SDMA_ENABLE_SMASK);
-
-       if (op & SDMA_SENDCTRL_OP_INTENABLE)
-               set_senddmactrl |= SD(CTRL_SDMA_INT_ENABLE_SMASK);
-       else
-               clr_senddmactrl |= SD(CTRL_SDMA_INT_ENABLE_SMASK);
-
-       if (op & SDMA_SENDCTRL_OP_HALT)
-               set_senddmactrl |= SD(CTRL_SDMA_HALT_SMASK);
-       else
-               clr_senddmactrl |= SD(CTRL_SDMA_HALT_SMASK);
-
-       spin_lock_irqsave(&sde->senddmactrl_lock, flags);
-
-       sde->p_senddmactrl |= set_senddmactrl;
-       sde->p_senddmactrl &= ~clr_senddmactrl;
-
-       if (op & SDMA_SENDCTRL_OP_CLEANUP)
-               write_sde_csr(sde, SD(CTRL),
-                             sde->p_senddmactrl |
-                             SD(CTRL_SDMA_CLEANUP_SMASK));
-       else
-               write_sde_csr(sde, SD(CTRL), sde->p_senddmactrl);
-
-       spin_unlock_irqrestore(&sde->senddmactrl_lock, flags);
-
-#ifdef CONFIG_SDMA_VERBOSITY
-       sdma_dumpstate(sde);
-#endif
-}
-
-static void sdma_setlengen(struct sdma_engine *sde)
-{
-#ifdef CONFIG_SDMA_VERBOSITY
-       dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
-                  sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
-#endif
-
-       /*
-        * Set SendDmaLenGen and clear-then-set the MSB of the generation
-        * count to enable generation checking and load the internal
-        * generation counter.
-        */
-       write_sde_csr(sde, SD(LEN_GEN),
-                     (sde->descq_cnt / 64) << SD(LEN_GEN_LENGTH_SHIFT));
-       write_sde_csr(sde, SD(LEN_GEN),
-                     ((sde->descq_cnt / 64) << SD(LEN_GEN_LENGTH_SHIFT)) |
-                     (4ULL << SD(LEN_GEN_GENERATION_SHIFT)));
-}
-
-static inline void sdma_update_tail(struct sdma_engine *sde, u16 tail)
-{
-       /* Commit writes to memory and advance the tail on the chip */
-       smp_wmb(); /* see get_txhead() */
-       writeq(tail, sde->tail_csr);
-}
-
-/*
- * This is called when changing to state s10_hw_start_up_halt_wait as
- * a result of send buffer errors or send DMA descriptor errors.
- */
-static void sdma_hw_start_up(struct sdma_engine *sde)
-{
-       u64 reg;
-
-#ifdef CONFIG_SDMA_VERBOSITY
-       dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
-                  sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
-#endif
-
-       sdma_setlengen(sde);
-       sdma_update_tail(sde, 0); /* Set SendDmaTail */
-       *sde->head_dma = 0;
-
-       reg = SD(ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_MASK) <<
-             SD(ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SHIFT);
-       write_sde_csr(sde, SD(ENG_ERR_CLEAR), reg);
-}
-
-#define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \
-(r &= ~SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
-
-#define SET_STATIC_RATE_CONTROL_SMASK(r) \
-(r |= SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
-/*
- * set_sdma_integrity
- *
- * Set the SEND_DMA_CHECK_ENABLE register for send DMA engine 'sde'.
- */
-static void set_sdma_integrity(struct sdma_engine *sde)
-{
-       struct hfi1_devdata *dd = sde->dd;
-       u64 reg;
-
-       if (unlikely(HFI1_CAP_IS_KSET(NO_INTEGRITY)))
-               return;
-
-       reg = hfi1_pkt_base_sdma_integrity(dd);
-
-       if (HFI1_CAP_IS_KSET(STATIC_RATE_CTRL))
-               CLEAR_STATIC_RATE_CONTROL_SMASK(reg);
-       else
-               SET_STATIC_RATE_CONTROL_SMASK(reg);
-
-       write_sde_csr(sde, SD(CHECK_ENABLE), reg);
-}
-
-static void init_sdma_regs(
-       struct sdma_engine *sde,
-       u32 credits,
-       uint idle_cnt)
-{
-       u8 opval, opmask;
-#ifdef CONFIG_SDMA_VERBOSITY
-       struct hfi1_devdata *dd = sde->dd;
-
-       dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n",
-                  sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
-#endif
-
-       write_sde_csr(sde, SD(BASE_ADDR), sde->descq_phys);
-       sdma_setlengen(sde);
-       sdma_update_tail(sde, 0); /* Set SendDmaTail */
-       write_sde_csr(sde, SD(RELOAD_CNT), idle_cnt);
-       write_sde_csr(sde, SD(DESC_CNT), 0);
-       write_sde_csr(sde, SD(HEAD_ADDR), sde->head_phys);
-       write_sde_csr(sde, SD(MEMORY),
-                     ((u64)credits << SD(MEMORY_SDMA_MEMORY_CNT_SHIFT)) |
-                     ((u64)(credits * sde->this_idx) <<
-                      SD(MEMORY_SDMA_MEMORY_INDEX_SHIFT)));
-       write_sde_csr(sde, SD(ENG_ERR_MASK), ~0ull);
-       set_sdma_integrity(sde);
-       opmask = OPCODE_CHECK_MASK_DISABLED;
-       opval = OPCODE_CHECK_VAL_DISABLED;
-       write_sde_csr(sde, SD(CHECK_OPCODE),
-                     (opmask << SEND_CTXT_CHECK_OPCODE_MASK_SHIFT) |
-                     (opval << SEND_CTXT_CHECK_OPCODE_VALUE_SHIFT));
-}
-
-#ifdef CONFIG_SDMA_VERBOSITY
-
-#define sdma_dumpstate_helper0(reg) do { \
-               csr = read_csr(sde->dd, reg); \
-               dd_dev_err(sde->dd, "%36s     0x%016llx\n", #reg, csr); \
-       } while (0)
-
-#define sdma_dumpstate_helper(reg) do { \
-               csr = read_sde_csr(sde, reg); \
-               dd_dev_err(sde->dd, "%36s[%02u] 0x%016llx\n", \
-                       #reg, sde->this_idx, csr); \
-       } while (0)
-
-#define sdma_dumpstate_helper2(reg) do { \
-               csr = read_csr(sde->dd, reg + (8 * i)); \
-               dd_dev_err(sde->dd, "%33s_%02u     0x%016llx\n", \
-                               #reg, i, csr); \
-       } while (0)
-
-void sdma_dumpstate(struct sdma_engine *sde)
-{
-       u64 csr;
-       unsigned i;
-
-       sdma_dumpstate_helper(SD(CTRL));
-       sdma_dumpstate_helper(SD(STATUS));
-       sdma_dumpstate_helper0(SD(ERR_STATUS));
-       sdma_dumpstate_helper0(SD(ERR_MASK));
-       sdma_dumpstate_helper(SD(ENG_ERR_STATUS));
-       sdma_dumpstate_helper(SD(ENG_ERR_MASK));
-
-       for (i = 0; i < CCE_NUM_INT_CSRS; ++i) {
-               sdma_dumpstate_helper2(CCE_INT_STATUS);
-               sdma_dumpstate_helper2(CCE_INT_MASK);
-               sdma_dumpstate_helper2(CCE_INT_BLOCKED);
-       }
-
-       sdma_dumpstate_helper(SD(TAIL));
-       sdma_dumpstate_helper(SD(HEAD));
-       sdma_dumpstate_helper(SD(PRIORITY_THLD));
-       sdma_dumpstate_helper(SD(IDLE_CNT));
-       sdma_dumpstate_helper(SD(RELOAD_CNT));
-       sdma_dumpstate_helper(SD(DESC_CNT));
-       sdma_dumpstate_helper(SD(DESC_FETCHED_CNT));
-       sdma_dumpstate_helper(SD(MEMORY));
-       sdma_dumpstate_helper0(SD(ENGINES));
-       sdma_dumpstate_helper0(SD(MEM_SIZE));
-       /* sdma_dumpstate_helper(SEND_EGRESS_SEND_DMA_STATUS);  */
-       sdma_dumpstate_helper(SD(BASE_ADDR));
-       sdma_dumpstate_helper(SD(LEN_GEN));
-       sdma_dumpstate_helper(SD(HEAD_ADDR));
-       sdma_dumpstate_helper(SD(CHECK_ENABLE));
-       sdma_dumpstate_helper(SD(CHECK_VL));
-       sdma_dumpstate_helper(SD(CHECK_JOB_KEY));
-       sdma_dumpstate_helper(SD(CHECK_PARTITION_KEY));
-       sdma_dumpstate_helper(SD(CHECK_SLID));
-       sdma_dumpstate_helper(SD(CHECK_OPCODE));
-}
-#endif
-
-static void dump_sdma_state(struct sdma_engine *sde)
-{
-       struct hw_sdma_desc *descq;
-       struct hw_sdma_desc *descqp;
-       u64 desc[2];
-       u64 addr;
-       u8 gen;
-       u16 len;
-       u16 head, tail, cnt;
-
-       head = sde->descq_head & sde->sdma_mask;
-       tail = sde->descq_tail & sde->sdma_mask;
-       cnt = sdma_descq_freecnt(sde);
-       descq = sde->descq;
-
-       dd_dev_err(sde->dd,
-                  "SDMA (%u) descq_head: %u descq_tail: %u freecnt: %u FLE %d\n",
-                  sde->this_idx, head, tail, cnt,
-                  !list_empty(&sde->flushlist));
-
-       /* print info for each entry in the descriptor queue */
-       while (head != tail) {
-               char flags[6] = { 'x', 'x', 'x', 'x', 0 };
-
-               descqp = &sde->descq[head];
-               desc[0] = le64_to_cpu(descqp->qw[0]);
-               desc[1] = le64_to_cpu(descqp->qw[1]);
-               flags[0] = (desc[1] & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-';
-               flags[1] = (desc[1] & SDMA_DESC1_HEAD_TO_HOST_FLAG) ?
-                               'H' : '-';
-               flags[2] = (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-';
-               flags[3] = (desc[0] & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-';
-               addr = (desc[0] >> SDMA_DESC0_PHY_ADDR_SHIFT)
-                       & SDMA_DESC0_PHY_ADDR_MASK;
-               gen = (desc[1] >> SDMA_DESC1_GENERATION_SHIFT)
-                       & SDMA_DESC1_GENERATION_MASK;
-               len = (desc[0] >> SDMA_DESC0_BYTE_COUNT_SHIFT)
-                       & SDMA_DESC0_BYTE_COUNT_MASK;
-               dd_dev_err(sde->dd,
-                          "SDMA sdmadesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n",
-                          head, flags, addr, gen, len);
-               dd_dev_err(sde->dd,
-                          "\tdesc0:0x%016llx desc1 0x%016llx\n",
-                          desc[0], desc[1]);
-               if (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG)
-                       dd_dev_err(sde->dd,
-                                  "\taidx: %u amode: %u alen: %u\n",
-                                  (u8)((desc[1] &
-                                        SDMA_DESC1_HEADER_INDEX_SMASK) >>
-                                       SDMA_DESC1_HEADER_INDEX_SHIFT),
-                                  (u8)((desc[1] &
-                                        SDMA_DESC1_HEADER_MODE_SMASK) >>
-                                       SDMA_DESC1_HEADER_MODE_SHIFT),
-                                  (u8)((desc[1] &
-                                        SDMA_DESC1_HEADER_DWS_SMASK) >>
-                                       SDMA_DESC1_HEADER_DWS_SHIFT));
-               head++;
-               head &= sde->sdma_mask;
-       }
-}
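
The field extraction in dump_sdma_state() is pure bit arithmetic on the two descriptor qwords. As an illustration only, here is a minimal standalone userspace sketch (not driver code) that decodes the same fields using the shift/width values defined in sdma.h later in this diff; the sample descriptor value is invented purely for the demonstration.

    #include <stdio.h>
    #include <stdint.h>
    #include <inttypes.h>

    /* values mirror the SDMA_DESC0/SDMA_DESC1 definitions in sdma.h below */
    #define DESC0_FIRST_FLAG   (1ULL << 63)
    #define DESC0_LAST_FLAG    (1ULL << 62)
    #define DESC0_LEN_SHIFT    48
    #define DESC0_LEN_MASK     ((1ULL << 14) - 1)
    #define DESC0_ADDR_MASK    ((1ULL << 48) - 1)
    #define DESC1_GEN_SHIFT    2
    #define DESC1_GEN_MASK     ((1ULL << 2) - 1)

    int main(void)
    {
        /* invented example: first+last descriptor, 256 bytes at 0x1000, generation 3 */
        uint64_t qw0 = DESC0_FIRST_FLAG | DESC0_LAST_FLAG |
                       (256ULL << DESC0_LEN_SHIFT) | 0x1000ULL;
        uint64_t qw1 = 3ULL << DESC1_GEN_SHIFT;

        uint64_t addr = qw0 & DESC0_ADDR_MASK;
        unsigned len  = (qw0 >> DESC0_LEN_SHIFT) & DESC0_LEN_MASK;
        unsigned gen  = (qw1 >> DESC1_GEN_SHIFT) & DESC1_GEN_MASK;

        printf("flags:%c%c addr:0x%016" PRIx64 " gen:%u len:%u bytes\n",
               (qw0 & DESC0_FIRST_FLAG) ? 'F' : '-',
               (qw0 & DESC0_LAST_FLAG) ? 'L' : '-',
               addr, gen, len);
        return 0;
    }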
-
-#define SDE_FMT \
-       "SDE %u CPU %d STE %s C 0x%llx S 0x%016llx E 0x%llx T(HW) 0x%llx T(SW) 0x%x H(HW) 0x%llx H(SW) 0x%x H(D) 0x%llx DM 0x%llx GL 0x%llx R 0x%llx LIS 0x%llx AHGI 0x%llx TXT %u TXH %u DT %u DH %u FLNE %d DQF %u SLC 0x%llx\n"
-/**
- * sdma_seqfile_dump_sde() - debugfs dump of sde
- * @s: seq file
- * @sde: send dma engine to dump
- *
- * This routine dumps the sde to the indicated seq file.
- */
-void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *sde)
-{
-       u16 head, tail;
-       struct hw_sdma_desc *descqp;
-       u64 desc[2];
-       u64 addr;
-       u8 gen;
-       u16 len;
-
-       head = sde->descq_head & sde->sdma_mask;
-       tail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
-       seq_printf(s, SDE_FMT, sde->this_idx,
-                  sde->cpu,
-                  sdma_state_name(sde->state.current_state),
-                  (unsigned long long)read_sde_csr(sde, SD(CTRL)),
-                  (unsigned long long)read_sde_csr(sde, SD(STATUS)),
-                  (unsigned long long)read_sde_csr(sde, SD(ENG_ERR_STATUS)),
-                  (unsigned long long)read_sde_csr(sde, SD(TAIL)), tail,
-                  (unsigned long long)read_sde_csr(sde, SD(HEAD)), head,
-                  (unsigned long long)le64_to_cpu(*sde->head_dma),
-                  (unsigned long long)read_sde_csr(sde, SD(MEMORY)),
-                  (unsigned long long)read_sde_csr(sde, SD(LEN_GEN)),
-                  (unsigned long long)read_sde_csr(sde, SD(RELOAD_CNT)),
-                  (unsigned long long)sde->last_status,
-                  (unsigned long long)sde->ahg_bits,
-                  sde->tx_tail,
-                  sde->tx_head,
-                  sde->descq_tail,
-                  sde->descq_head,
-                  !list_empty(&sde->flushlist),
-                  sde->descq_full_count,
-                  (unsigned long long)read_sde_csr(sde, SEND_DMA_CHECK_SLID));
-
-       /* print info for each entry in the descriptor queue */
-       while (head != tail) {
-               char flags[6] = { 'x', 'x', 'x', 'x', 0 };
-
-               descqp = &sde->descq[head];
-               desc[0] = le64_to_cpu(descqp->qw[0]);
-               desc[1] = le64_to_cpu(descqp->qw[1]);
-               flags[0] = (desc[1] & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-';
-               flags[1] = (desc[1] & SDMA_DESC1_HEAD_TO_HOST_FLAG) ?
-                               'H' : '-';
-               flags[2] = (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-';
-               flags[3] = (desc[0] & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-';
-               addr = (desc[0] >> SDMA_DESC0_PHY_ADDR_SHIFT)
-                       & SDMA_DESC0_PHY_ADDR_MASK;
-               gen = (desc[1] >> SDMA_DESC1_GENERATION_SHIFT)
-                       & SDMA_DESC1_GENERATION_MASK;
-               len = (desc[0] >> SDMA_DESC0_BYTE_COUNT_SHIFT)
-                       & SDMA_DESC0_BYTE_COUNT_MASK;
-               seq_printf(s,
-                          "\tdesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n",
-                          head, flags, addr, gen, len);
-               if (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG)
-                       seq_printf(s, "\t\tahgidx: %u ahgmode: %u\n",
-                                  (u8)((desc[1] &
-                                        SDMA_DESC1_HEADER_INDEX_SMASK) >>
-                                       SDMA_DESC1_HEADER_INDEX_SHIFT),
-                                  (u8)((desc[1] &
-                                        SDMA_DESC1_HEADER_MODE_SMASK) >>
-                                       SDMA_DESC1_HEADER_MODE_SHIFT));
-               head = (head + 1) & sde->sdma_mask;
-       }
-}
-
-/*
- * add the generation number into
- * the qw1 and return
- */
-static inline u64 add_gen(struct sdma_engine *sde, u64 qw1)
-{
-       u8 generation = (sde->descq_tail >> sde->sdma_shift) & 3;
-
-       qw1 &= ~SDMA_DESC1_GENERATION_SMASK;
-       qw1 |= ((u64)generation & SDMA_DESC1_GENERATION_MASK)
-                       << SDMA_DESC1_GENERATION_SHIFT;
-       return qw1;
-}
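
add_gen() stamps a 2-bit generation into qw1, where the generation is simply the descriptor tail index divided by the ring size. A tiny standalone sketch of that arithmetic follows; the ring size (and therefore sdma_shift) is an assumed value chosen only to keep the output short.

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        const unsigned sdma_shift = 4;   /* assumed ring of 1 << 4 = 16 descriptors */
        const unsigned gen_shift = 2;    /* SDMA_DESC1_GENERATION_SHIFT */
        const uint64_t gen_mask = 3;     /* SDMA_DESC1_GENERATION_MASK */

        for (unsigned descq_tail = 0; descq_tail < 80; descq_tail += 16) {
            uint64_t generation = (descq_tail >> sdma_shift) & 3;
            uint64_t qw1_bits = (generation & gen_mask) << gen_shift;
            printf("descq_tail %2u -> generation %llu (qw1 bits 0x%llx)\n",
                   descq_tail, (unsigned long long)generation,
                   (unsigned long long)qw1_bits);
        }
        return 0;
    }

With these assumed sizes the generation advances once per trip around the ring and wraps back to 0 after four trips.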
-
-/*
- * This routine submits the indicated tx
- *
- * Space has already been guaranteed and
- * tail side of ring is locked.
- *
- * The hardware tail update is done
- * in the caller and that is facilitated
- * by returning the new tail.
- *
- * There is special case logic for ahg
- * to not add the generation number for
- * up to 2 descriptors that follow the
- * first descriptor.
- *
- */
-static inline u16 submit_tx(struct sdma_engine *sde, struct sdma_txreq *tx)
-{
-       int i;
-       u16 tail;
-       struct sdma_desc *descp = tx->descp;
-       u8 skip = 0, mode = ahg_mode(tx);
-
-       tail = sde->descq_tail & sde->sdma_mask;
-       sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]);
-       sde->descq[tail].qw[1] = cpu_to_le64(add_gen(sde, descp->qw[1]));
-       trace_hfi1_sdma_descriptor(sde, descp->qw[0], descp->qw[1],
-                                  tail, &sde->descq[tail]);
-       tail = ++sde->descq_tail & sde->sdma_mask;
-       descp++;
-       if (mode > SDMA_AHG_APPLY_UPDATE1)
-               skip = mode >> 1;
-       for (i = 1; i < tx->num_desc; i++, descp++) {
-               u64 qw1;
-
-               sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]);
-               if (skip) {
-                       /* edits don't have generation */
-                       qw1 = descp->qw[1];
-                       skip--;
-               } else {
-                       /* replace generation with real one for non-edits */
-                       qw1 = add_gen(sde, descp->qw[1]);
-               }
-               sde->descq[tail].qw[1] = cpu_to_le64(qw1);
-               trace_hfi1_sdma_descriptor(sde, descp->qw[0], qw1,
-                                          tail, &sde->descq[tail]);
-               tail = ++sde->descq_tail & sde->sdma_mask;
-       }
-       tx->next_descq_idx = tail;
-#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
-       tx->sn = sde->tail_sn++;
-       trace_hfi1_sdma_in_sn(sde, tx->sn);
-       WARN_ON_ONCE(sde->tx_ring[sde->tx_tail & sde->sdma_mask]);
-#endif
-       sde->tx_ring[sde->tx_tail++ & sde->sdma_mask] = tx;
-       sde->desc_avail -= tx->num_desc;
-       return tail;
-}
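
The "skip = mode >> 1" shortcut above leans on the SDMA_AHG_APPLY_UPDATE* values defined in sdma.h later in this diff (2, 3 and 4). A small standalone check of what that evaluates to for each mode (UPDATE1 stamps the generation on every descriptor, UPDATE2 leaves one following descriptor untouched, UPDATE3 leaves two):

    #include <stdio.h>

    #define SDMA_AHG_APPLY_UPDATE1 2
    #define SDMA_AHG_APPLY_UPDATE2 3
    #define SDMA_AHG_APPLY_UPDATE3 4

    /* same test submit_tx() uses to avoid stamping the generation on AHG edit descriptors */
    static unsigned gen_skip(unsigned mode)
    {
        return (mode > SDMA_AHG_APPLY_UPDATE1) ? (mode >> 1) : 0;
    }

    int main(void)
    {
        printf("UPDATE1 -> skip %u descriptor(s)\n", gen_skip(SDMA_AHG_APPLY_UPDATE1));
        printf("UPDATE2 -> skip %u descriptor(s)\n", gen_skip(SDMA_AHG_APPLY_UPDATE2));
        printf("UPDATE3 -> skip %u descriptor(s)\n", gen_skip(SDMA_AHG_APPLY_UPDATE3));
        return 0;
    }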
-
-/*
- * Check for progress
- */
-static int sdma_check_progress(
-       struct sdma_engine *sde,
-       struct iowait *wait,
-       struct sdma_txreq *tx)
-{
-       int ret;
-
-       sde->desc_avail = sdma_descq_freecnt(sde);
-       if (tx->num_desc <= sde->desc_avail)
-               return -EAGAIN;
-       /* pulse the head_lock */
-       if (wait && wait->sleep) {
-               unsigned seq;
-
-               seq = raw_seqcount_begin(
-                       (const seqcount_t *)&sde->head_lock.seqcount);
-               ret = wait->sleep(sde, wait, tx, seq);
-               if (ret == -EAGAIN)
-                       sde->desc_avail = sdma_descq_freecnt(sde);
-       } else {
-               ret = -EBUSY;
-       }
-       return ret;
-}
-
-/**
- * sdma_send_txreq() - submit a tx req to ring
- * @sde: sdma engine to use
- * @wait: wait structure to use when full (may be NULL)
- * @tx: sdma_txreq to submit
- *
- * The call submits the tx into the ring.  If an iowait structure is non-NULL
- * the packet will be queued to the list in wait.
- *
- * Return:
- * 0 - Success, -EINVAL - sdma_txreq incomplete, -EBUSY - no space in
- * ring (wait == NULL)
- * -EIOCBQUEUED - tx queued to iowait, -ECOMM - bad sdma state
- */
-int sdma_send_txreq(struct sdma_engine *sde,
-                   struct iowait *wait,
-                   struct sdma_txreq *tx)
-{
-       int ret = 0;
-       u16 tail;
-       unsigned long flags;
-
-       /* user should have supplied entire packet */
-       if (unlikely(tx->tlen))
-               return -EINVAL;
-       tx->wait = wait;
-       spin_lock_irqsave(&sde->tail_lock, flags);
-retry:
-       if (unlikely(!__sdma_running(sde)))
-               goto unlock_noconn;
-       if (unlikely(tx->num_desc > sde->desc_avail))
-               goto nodesc;
-       tail = submit_tx(sde, tx);
-       if (wait)
-               iowait_sdma_inc(wait);
-       sdma_update_tail(sde, tail);
-unlock:
-       spin_unlock_irqrestore(&sde->tail_lock, flags);
-       return ret;
-unlock_noconn:
-       if (wait)
-               iowait_sdma_inc(wait);
-       tx->next_descq_idx = 0;
-#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
-       tx->sn = sde->tail_sn++;
-       trace_hfi1_sdma_in_sn(sde, tx->sn);
-#endif
-       spin_lock(&sde->flushlist_lock);
-       list_add_tail(&tx->list, &sde->flushlist);
-       spin_unlock(&sde->flushlist_lock);
-       if (wait) {
-               wait->tx_count++;
-               wait->count += tx->num_desc;
-       }
-       schedule_work(&sde->flush_worker);
-       ret = -ECOMM;
-       goto unlock;
-nodesc:
-       ret = sdma_check_progress(sde, wait, tx);
-       if (ret == -EAGAIN) {
-               ret = 0;
-               goto retry;
-       }
-       sde->descq_full_count++;
-       goto unlock;
-}
-
-/**
- * sdma_send_txlist() - submit a list of tx req to ring
- * @sde: sdma engine to use
- * @wait: wait structure to use when full (may be NULL)
- * @tx_list: list of sdma_txreqs to submit
- *
- * The call submits the list into the ring.
- *
- * If the iowait structure is non-NULL and not equal to the iowait list
- * the unprocessed part of the list will be appended to the list in wait.
- *
- * In all cases, the tx_list will be updated so the head of the tx_list is
- * the list of descriptors that have yet to be transmitted.
- *
- * The intent of this call is to provide a more efficient
- * way of submitting multiple packets to SDMA while holding the
- * tail-side lock.
- *
- * Return:
- * > 0 - Success (value is number of sdma_txreq's submitted),
- * -EINVAL - sdma_txreq incomplete, -EBUSY - no space in ring (wait == NULL)
- * -EIOCBQUEUED - tx queued to iowait, -ECOMM - bad sdma state
- */
-int sdma_send_txlist(struct sdma_engine *sde, struct iowait *wait,
-                    struct list_head *tx_list)
-{
-       struct sdma_txreq *tx, *tx_next;
-       int ret = 0;
-       unsigned long flags;
-       u16 tail = INVALID_TAIL;
-       int count = 0;
-
-       spin_lock_irqsave(&sde->tail_lock, flags);
-retry:
-       list_for_each_entry_safe(tx, tx_next, tx_list, list) {
-               tx->wait = wait;
-               if (unlikely(!__sdma_running(sde)))
-                       goto unlock_noconn;
-               if (unlikely(tx->num_desc > sde->desc_avail))
-                       goto nodesc;
-               if (unlikely(tx->tlen)) {
-                       ret = -EINVAL;
-                       goto update_tail;
-               }
-               list_del_init(&tx->list);
-               tail = submit_tx(sde, tx);
-               count++;
-               if (tail != INVALID_TAIL &&
-                   (count & SDMA_TAIL_UPDATE_THRESH) == 0) {
-                       sdma_update_tail(sde, tail);
-                       tail = INVALID_TAIL;
-               }
-       }
-update_tail:
-       if (wait)
-               iowait_sdma_add(wait, count);
-       if (tail != INVALID_TAIL)
-               sdma_update_tail(sde, tail);
-       spin_unlock_irqrestore(&sde->tail_lock, flags);
-       return ret == 0 ? count : ret;
-unlock_noconn:
-       spin_lock(&sde->flushlist_lock);
-       list_for_each_entry_safe(tx, tx_next, tx_list, list) {
-               tx->wait = wait;
-               list_del_init(&tx->list);
-               if (wait)
-                       iowait_sdma_inc(wait);
-               tx->next_descq_idx = 0;
-#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
-               tx->sn = sde->tail_sn++;
-               trace_hfi1_sdma_in_sn(sde, tx->sn);
-#endif
-               list_add_tail(&tx->list, &sde->flushlist);
-               if (wait) {
-                       wait->tx_count++;
-                       wait->count += tx->num_desc;
-               }
-       }
-       spin_unlock(&sde->flushlist_lock);
-       schedule_work(&sde->flush_worker);
-       ret = -ECOMM;
-       goto update_tail;
-nodesc:
-       ret = sdma_check_progress(sde, wait, tx);
-       if (ret == -EAGAIN) {
-               ret = 0;
-               goto retry;
-       }
-       sde->descq_full_count++;
-       goto update_tail;
-}
-
-static void sdma_process_event(struct sdma_engine *sde, enum sdma_events event)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&sde->tail_lock, flags);
-       write_seqlock(&sde->head_lock);
-
-       __sdma_process_event(sde, event);
-
-       if (sde->state.current_state == sdma_state_s99_running)
-               sdma_desc_avail(sde, sdma_descq_freecnt(sde));
-
-       write_sequnlock(&sde->head_lock);
-       spin_unlock_irqrestore(&sde->tail_lock, flags);
-}
-
-static void __sdma_process_event(struct sdma_engine *sde,
-                                enum sdma_events event)
-{
-       struct sdma_state *ss = &sde->state;
-       int need_progress = 0;
-
-       /* CONFIG SDMA temporary */
-#ifdef CONFIG_SDMA_VERBOSITY
-       dd_dev_err(sde->dd, "CONFIG SDMA(%u) [%s] %s\n", sde->this_idx,
-                  sdma_state_names[ss->current_state],
-                  sdma_event_names[event]);
-#endif
-
-       switch (ss->current_state) {
-       case sdma_state_s00_hw_down:
-               switch (event) {
-               case sdma_event_e00_go_hw_down:
-                       break;
-               case sdma_event_e30_go_running:
-                       /*
-                        * If down, but running is requested (usually the
-                        * result of a link coming up), then we need to
-                        * start up.  This can happen when a hw down is
-                        * requested while bringing the link up with
-                        * traffic active, e.g. on the 7220.
-                        */
-                       ss->go_s99_running = 1;
-                       /* fall through and start dma engine */
-               case sdma_event_e10_go_hw_start:
-                       /* This reference means the state machine is started */
-                       sdma_get(&sde->state);
-                       sdma_set_state(sde,
-                                      sdma_state_s10_hw_start_up_halt_wait);
-                       break;
-               case sdma_event_e15_hw_halt_done:
-                       break;
-               case sdma_event_e25_hw_clean_up_done:
-                       break;
-               case sdma_event_e40_sw_cleaned:
-                       sdma_sw_tear_down(sde);
-                       break;
-               case sdma_event_e50_hw_cleaned:
-                       break;
-               case sdma_event_e60_hw_halted:
-                       break;
-               case sdma_event_e70_go_idle:
-                       break;
-               case sdma_event_e80_hw_freeze:
-                       break;
-               case sdma_event_e81_hw_frozen:
-                       break;
-               case sdma_event_e82_hw_unfreeze:
-                       break;
-               case sdma_event_e85_link_down:
-                       break;
-               case sdma_event_e90_sw_halted:
-                       break;
-               }
-               break;
-
-       case sdma_state_s10_hw_start_up_halt_wait:
-               switch (event) {
-               case sdma_event_e00_go_hw_down:
-                       sdma_set_state(sde, sdma_state_s00_hw_down);
-                       sdma_sw_tear_down(sde);
-                       break;
-               case sdma_event_e10_go_hw_start:
-                       break;
-               case sdma_event_e15_hw_halt_done:
-                       sdma_set_state(sde,
-                                      sdma_state_s15_hw_start_up_clean_wait);
-                       sdma_start_hw_clean_up(sde);
-                       break;
-               case sdma_event_e25_hw_clean_up_done:
-                       break;
-               case sdma_event_e30_go_running:
-                       ss->go_s99_running = 1;
-                       break;
-               case sdma_event_e40_sw_cleaned:
-                       break;
-               case sdma_event_e50_hw_cleaned:
-                       break;
-               case sdma_event_e60_hw_halted:
-                       schedule_work(&sde->err_halt_worker);
-                       break;
-               case sdma_event_e70_go_idle:
-                       ss->go_s99_running = 0;
-                       break;
-               case sdma_event_e80_hw_freeze:
-                       break;
-               case sdma_event_e81_hw_frozen:
-                       break;
-               case sdma_event_e82_hw_unfreeze:
-                       break;
-               case sdma_event_e85_link_down:
-                       break;
-               case sdma_event_e90_sw_halted:
-                       break;
-               }
-               break;
-
-       case sdma_state_s15_hw_start_up_clean_wait:
-               switch (event) {
-               case sdma_event_e00_go_hw_down:
-                       sdma_set_state(sde, sdma_state_s00_hw_down);
-                       sdma_sw_tear_down(sde);
-                       break;
-               case sdma_event_e10_go_hw_start:
-                       break;
-               case sdma_event_e15_hw_halt_done:
-                       break;
-               case sdma_event_e25_hw_clean_up_done:
-                       sdma_hw_start_up(sde);
-                       sdma_set_state(sde, ss->go_s99_running ?
-                                      sdma_state_s99_running :
-                                      sdma_state_s20_idle);
-                       break;
-               case sdma_event_e30_go_running:
-                       ss->go_s99_running = 1;
-                       break;
-               case sdma_event_e40_sw_cleaned:
-                       break;
-               case sdma_event_e50_hw_cleaned:
-                       break;
-               case sdma_event_e60_hw_halted:
-                       break;
-               case sdma_event_e70_go_idle:
-                       ss->go_s99_running = 0;
-                       break;
-               case sdma_event_e80_hw_freeze:
-                       break;
-               case sdma_event_e81_hw_frozen:
-                       break;
-               case sdma_event_e82_hw_unfreeze:
-                       break;
-               case sdma_event_e85_link_down:
-                       break;
-               case sdma_event_e90_sw_halted:
-                       break;
-               }
-               break;
-
-       case sdma_state_s20_idle:
-               switch (event) {
-               case sdma_event_e00_go_hw_down:
-                       sdma_set_state(sde, sdma_state_s00_hw_down);
-                       sdma_sw_tear_down(sde);
-                       break;
-               case sdma_event_e10_go_hw_start:
-                       break;
-               case sdma_event_e15_hw_halt_done:
-                       break;
-               case sdma_event_e25_hw_clean_up_done:
-                       break;
-               case sdma_event_e30_go_running:
-                       sdma_set_state(sde, sdma_state_s99_running);
-                       ss->go_s99_running = 1;
-                       break;
-               case sdma_event_e40_sw_cleaned:
-                       break;
-               case sdma_event_e50_hw_cleaned:
-                       break;
-               case sdma_event_e60_hw_halted:
-                       sdma_set_state(sde, sdma_state_s50_hw_halt_wait);
-                       schedule_work(&sde->err_halt_worker);
-                       break;
-               case sdma_event_e70_go_idle:
-                       break;
-               case sdma_event_e85_link_down:
-                       /* fall through */
-               case sdma_event_e80_hw_freeze:
-                       sdma_set_state(sde, sdma_state_s80_hw_freeze);
-                       atomic_dec(&sde->dd->sdma_unfreeze_count);
-                       wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
-                       break;
-               case sdma_event_e81_hw_frozen:
-                       break;
-               case sdma_event_e82_hw_unfreeze:
-                       break;
-               case sdma_event_e90_sw_halted:
-                       break;
-               }
-               break;
-
-       case sdma_state_s30_sw_clean_up_wait:
-               switch (event) {
-               case sdma_event_e00_go_hw_down:
-                       sdma_set_state(sde, sdma_state_s00_hw_down);
-                       break;
-               case sdma_event_e10_go_hw_start:
-                       break;
-               case sdma_event_e15_hw_halt_done:
-                       break;
-               case sdma_event_e25_hw_clean_up_done:
-                       break;
-               case sdma_event_e30_go_running:
-                       ss->go_s99_running = 1;
-                       break;
-               case sdma_event_e40_sw_cleaned:
-                       sdma_set_state(sde, sdma_state_s40_hw_clean_up_wait);
-                       sdma_start_hw_clean_up(sde);
-                       break;
-               case sdma_event_e50_hw_cleaned:
-                       break;
-               case sdma_event_e60_hw_halted:
-                       break;
-               case sdma_event_e70_go_idle:
-                       ss->go_s99_running = 0;
-                       break;
-               case sdma_event_e80_hw_freeze:
-                       break;
-               case sdma_event_e81_hw_frozen:
-                       break;
-               case sdma_event_e82_hw_unfreeze:
-                       break;
-               case sdma_event_e85_link_down:
-                       ss->go_s99_running = 0;
-                       break;
-               case sdma_event_e90_sw_halted:
-                       break;
-               }
-               break;
-
-       case sdma_state_s40_hw_clean_up_wait:
-               switch (event) {
-               case sdma_event_e00_go_hw_down:
-                       sdma_set_state(sde, sdma_state_s00_hw_down);
-                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
-                       break;
-               case sdma_event_e10_go_hw_start:
-                       break;
-               case sdma_event_e15_hw_halt_done:
-                       break;
-               case sdma_event_e25_hw_clean_up_done:
-                       sdma_hw_start_up(sde);
-                       sdma_set_state(sde, ss->go_s99_running ?
-                                      sdma_state_s99_running :
-                                      sdma_state_s20_idle);
-                       break;
-               case sdma_event_e30_go_running:
-                       ss->go_s99_running = 1;
-                       break;
-               case sdma_event_e40_sw_cleaned:
-                       break;
-               case sdma_event_e50_hw_cleaned:
-                       break;
-               case sdma_event_e60_hw_halted:
-                       break;
-               case sdma_event_e70_go_idle:
-                       ss->go_s99_running = 0;
-                       break;
-               case sdma_event_e80_hw_freeze:
-                       break;
-               case sdma_event_e81_hw_frozen:
-                       break;
-               case sdma_event_e82_hw_unfreeze:
-                       break;
-               case sdma_event_e85_link_down:
-                       ss->go_s99_running = 0;
-                       break;
-               case sdma_event_e90_sw_halted:
-                       break;
-               }
-               break;
-
-       case sdma_state_s50_hw_halt_wait:
-               switch (event) {
-               case sdma_event_e00_go_hw_down:
-                       sdma_set_state(sde, sdma_state_s00_hw_down);
-                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
-                       break;
-               case sdma_event_e10_go_hw_start:
-                       break;
-               case sdma_event_e15_hw_halt_done:
-                       sdma_set_state(sde, sdma_state_s30_sw_clean_up_wait);
-                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
-                       break;
-               case sdma_event_e25_hw_clean_up_done:
-                       break;
-               case sdma_event_e30_go_running:
-                       ss->go_s99_running = 1;
-                       break;
-               case sdma_event_e40_sw_cleaned:
-                       break;
-               case sdma_event_e50_hw_cleaned:
-                       break;
-               case sdma_event_e60_hw_halted:
-                       schedule_work(&sde->err_halt_worker);
-                       break;
-               case sdma_event_e70_go_idle:
-                       ss->go_s99_running = 0;
-                       break;
-               case sdma_event_e80_hw_freeze:
-                       break;
-               case sdma_event_e81_hw_frozen:
-                       break;
-               case sdma_event_e82_hw_unfreeze:
-                       break;
-               case sdma_event_e85_link_down:
-                       ss->go_s99_running = 0;
-                       break;
-               case sdma_event_e90_sw_halted:
-                       break;
-               }
-               break;
-
-       case sdma_state_s60_idle_halt_wait:
-               switch (event) {
-               case sdma_event_e00_go_hw_down:
-                       sdma_set_state(sde, sdma_state_s00_hw_down);
-                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
-                       break;
-               case sdma_event_e10_go_hw_start:
-                       break;
-               case sdma_event_e15_hw_halt_done:
-                       sdma_set_state(sde, sdma_state_s30_sw_clean_up_wait);
-                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
-                       break;
-               case sdma_event_e25_hw_clean_up_done:
-                       break;
-               case sdma_event_e30_go_running:
-                       ss->go_s99_running = 1;
-                       break;
-               case sdma_event_e40_sw_cleaned:
-                       break;
-               case sdma_event_e50_hw_cleaned:
-                       break;
-               case sdma_event_e60_hw_halted:
-                       schedule_work(&sde->err_halt_worker);
-                       break;
-               case sdma_event_e70_go_idle:
-                       ss->go_s99_running = 0;
-                       break;
-               case sdma_event_e80_hw_freeze:
-                       break;
-               case sdma_event_e81_hw_frozen:
-                       break;
-               case sdma_event_e82_hw_unfreeze:
-                       break;
-               case sdma_event_e85_link_down:
-                       break;
-               case sdma_event_e90_sw_halted:
-                       break;
-               }
-               break;
-
-       case sdma_state_s80_hw_freeze:
-               switch (event) {
-               case sdma_event_e00_go_hw_down:
-                       sdma_set_state(sde, sdma_state_s00_hw_down);
-                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
-                       break;
-               case sdma_event_e10_go_hw_start:
-                       break;
-               case sdma_event_e15_hw_halt_done:
-                       break;
-               case sdma_event_e25_hw_clean_up_done:
-                       break;
-               case sdma_event_e30_go_running:
-                       ss->go_s99_running = 1;
-                       break;
-               case sdma_event_e40_sw_cleaned:
-                       break;
-               case sdma_event_e50_hw_cleaned:
-                       break;
-               case sdma_event_e60_hw_halted:
-                       break;
-               case sdma_event_e70_go_idle:
-                       ss->go_s99_running = 0;
-                       break;
-               case sdma_event_e80_hw_freeze:
-                       break;
-               case sdma_event_e81_hw_frozen:
-                       sdma_set_state(sde, sdma_state_s82_freeze_sw_clean);
-                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
-                       break;
-               case sdma_event_e82_hw_unfreeze:
-                       break;
-               case sdma_event_e85_link_down:
-                       break;
-               case sdma_event_e90_sw_halted:
-                       break;
-               }
-               break;
-
-       case sdma_state_s82_freeze_sw_clean:
-               switch (event) {
-               case sdma_event_e00_go_hw_down:
-                       sdma_set_state(sde, sdma_state_s00_hw_down);
-                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
-                       break;
-               case sdma_event_e10_go_hw_start:
-                       break;
-               case sdma_event_e15_hw_halt_done:
-                       break;
-               case sdma_event_e25_hw_clean_up_done:
-                       break;
-               case sdma_event_e30_go_running:
-                       ss->go_s99_running = 1;
-                       break;
-               case sdma_event_e40_sw_cleaned:
-                       /* notify caller this engine is done cleaning */
-                       atomic_dec(&sde->dd->sdma_unfreeze_count);
-                       wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
-                       break;
-               case sdma_event_e50_hw_cleaned:
-                       break;
-               case sdma_event_e60_hw_halted:
-                       break;
-               case sdma_event_e70_go_idle:
-                       ss->go_s99_running = 0;
-                       break;
-               case sdma_event_e80_hw_freeze:
-                       break;
-               case sdma_event_e81_hw_frozen:
-                       break;
-               case sdma_event_e82_hw_unfreeze:
-                       sdma_hw_start_up(sde);
-                       sdma_set_state(sde, ss->go_s99_running ?
-                                      sdma_state_s99_running :
-                                      sdma_state_s20_idle);
-                       break;
-               case sdma_event_e85_link_down:
-                       break;
-               case sdma_event_e90_sw_halted:
-                       break;
-               }
-               break;
-
-       case sdma_state_s99_running:
-               switch (event) {
-               case sdma_event_e00_go_hw_down:
-                       sdma_set_state(sde, sdma_state_s00_hw_down);
-                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
-                       break;
-               case sdma_event_e10_go_hw_start:
-                       break;
-               case sdma_event_e15_hw_halt_done:
-                       break;
-               case sdma_event_e25_hw_clean_up_done:
-                       break;
-               case sdma_event_e30_go_running:
-                       break;
-               case sdma_event_e40_sw_cleaned:
-                       break;
-               case sdma_event_e50_hw_cleaned:
-                       break;
-               case sdma_event_e60_hw_halted:
-                       need_progress = 1;
-                       sdma_err_progress_check_schedule(sde);
-               case sdma_event_e90_sw_halted:
-                       /*
-                        * A SW-initiated halt does not perform an engine
-                        * progress check.
-                        */
-                       sdma_set_state(sde, sdma_state_s50_hw_halt_wait);
-                       schedule_work(&sde->err_halt_worker);
-                       break;
-               case sdma_event_e70_go_idle:
-                       sdma_set_state(sde, sdma_state_s60_idle_halt_wait);
-                       break;
-               case sdma_event_e85_link_down:
-                       ss->go_s99_running = 0;
-                       /* fall through */
-               case sdma_event_e80_hw_freeze:
-                       sdma_set_state(sde, sdma_state_s80_hw_freeze);
-                       atomic_dec(&sde->dd->sdma_unfreeze_count);
-                       wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
-                       break;
-               case sdma_event_e81_hw_frozen:
-                       break;
-               case sdma_event_e82_hw_unfreeze:
-                       break;
-               }
-               break;
-       }
-
-       ss->last_event = event;
-       if (need_progress)
-               sdma_make_progress(sde, 0);
-}
-
-/*
- * _extend_sdma_tx_descs() - helper to extend txreq
- *
- * This is called once the initial nominal allocation
- * of descriptors in the sdma_txreq is exhausted.
- *
- * The code will bump the allocation up to the max
- * of MAX_DESC (64) descriptors. There doesn't seem
- * to be much point in an interim step. The last
- * descriptor is reserved for the coalesce buffer in
- * order to support cases where the input packet has
- * >MAX_DESC iovecs.
- *
- */
-static int _extend_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
-{
-       int i;
-
-       /* Handle last descriptor */
-       if (unlikely((tx->num_desc == (MAX_DESC - 1)))) {
-               /* if tlen is 0, it is for padding, release last descriptor */
-               if (!tx->tlen) {
-                       tx->desc_limit = MAX_DESC;
-               } else if (!tx->coalesce_buf) {
-                       /* allocate coalesce buffer with space for padding */
-                       tx->coalesce_buf = kmalloc(tx->tlen + sizeof(u32),
-                                                  GFP_ATOMIC);
-                       if (!tx->coalesce_buf)
-                               goto enomem;
-                       tx->coalesce_idx = 0;
-               }
-               return 0;
-       }
-
-       if (unlikely(tx->num_desc == MAX_DESC))
-               goto enomem;
-
-       tx->descp = kmalloc_array(
-                       MAX_DESC,
-                       sizeof(struct sdma_desc),
-                       GFP_ATOMIC);
-       if (!tx->descp)
-               goto enomem;
-
-       /* reserve last descriptor for coalescing */
-       tx->desc_limit = MAX_DESC - 1;
-       /* copy ones already built */
-       for (i = 0; i < tx->num_desc; i++)
-               tx->descp[i] = tx->descs[i];
-       return 0;
-enomem:
-       sdma_txclean(dd, tx);
-       return -ENOMEM;
-}
-
-/*
- * ext_coal_sdma_tx_descs() - extend or coalesce sdma tx descriptors
- *
- * This is called once the initial nominal allocation of descriptors
- * in the sdma_txreq is exhausted.
- *
- * This function calls _extend_sdma_tx_descs to extend or allocate the
- * coalesce buffer. If there is an allocated coalesce buffer, it will
- * copy the input packet data into the coalesce buffer. It also adds the
- * coalesce buffer descriptor once the whole packet has been received.
- *
- * Return:
- * <0 - error
- * 0 - coalescing, don't populate descriptor
- * 1 - continue with populating descriptor
- */
-int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx,
-                          int type, void *kvaddr, struct page *page,
-                          unsigned long offset, u16 len)
-{
-       int pad_len, rval;
-       dma_addr_t addr;
-
-       rval = _extend_sdma_tx_descs(dd, tx);
-       if (rval) {
-               sdma_txclean(dd, tx);
-               return rval;
-       }
-
-       /* If coalesce buffer is allocated, copy data into it */
-       if (tx->coalesce_buf) {
-               if (type == SDMA_MAP_NONE) {
-                       sdma_txclean(dd, tx);
-                       return -EINVAL;
-               }
-
-               if (type == SDMA_MAP_PAGE) {
-                       kvaddr = kmap(page);
-                       kvaddr += offset;
-               } else if (WARN_ON(!kvaddr)) {
-                       sdma_txclean(dd, tx);
-                       return -EINVAL;
-               }
-
-               memcpy(tx->coalesce_buf + tx->coalesce_idx, kvaddr, len);
-               tx->coalesce_idx += len;
-               if (type == SDMA_MAP_PAGE)
-                       kunmap(page);
-
-               /* If there is more data, return */
-               if (tx->tlen - tx->coalesce_idx)
-                       return 0;
-
-               /* Whole packet is received; add any padding */
-               pad_len = tx->packet_len & (sizeof(u32) - 1);
-               if (pad_len) {
-                       pad_len = sizeof(u32) - pad_len;
-                       memset(tx->coalesce_buf + tx->coalesce_idx, 0, pad_len);
-                       /* padding is taken care of for coalescing case */
-                       tx->packet_len += pad_len;
-                       tx->tlen += pad_len;
-               }
-
-               /* dma map the coalesce buffer */
-               addr = dma_map_single(&dd->pcidev->dev,
-                                     tx->coalesce_buf,
-                                     tx->tlen,
-                                     DMA_TO_DEVICE);
-
-               if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) {
-                       sdma_txclean(dd, tx);
-                       return -ENOSPC;
-               }
-
-               /* Add descriptor for coalesce buffer */
-               tx->desc_limit = MAX_DESC;
-               return _sdma_txadd_daddr(dd, SDMA_MAP_SINGLE, tx,
-                                        addr, tx->tlen);
-       }
-
-       return 1;
-}
-
-/* Update sdes when the lmc changes */
-void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid)
-{
-       struct sdma_engine *sde;
-       int i;
-       u64 sreg;
-
-       sreg = ((mask & SD(CHECK_SLID_MASK_MASK)) <<
-               SD(CHECK_SLID_MASK_SHIFT)) |
-               (((lid & mask) & SD(CHECK_SLID_VALUE_MASK)) <<
-               SD(CHECK_SLID_VALUE_SHIFT));
-
-       for (i = 0; i < dd->num_sdma; i++) {
-               hfi1_cdbg(LINKVERB, "SendDmaEngine[%d].SLID_CHECK = 0x%x",
-                         i, (u32)sreg);
-               sde = &dd->per_sdma[i];
-               write_sde_csr(sde, SD(CHECK_SLID), sreg);
-       }
-}
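
sdma_update_lmc() packs an SLID mask/value pair into a single CHECK_SLID CSR. The shape of that packing is sketched below in standalone form; the MASK/VALUE shift and mask constants here are placeholders (the real ones come from the chip register definitions, which are not part of this hunk), so only the structure of the computation is meaningful.

    #include <stdio.h>
    #include <stdint.h>

    /* placeholder field layout - NOT the real CHECK_SLID CSR definition */
    #define CHECK_SLID_MASK_MASK    0xffffULL
    #define CHECK_SLID_MASK_SHIFT   16
    #define CHECK_SLID_VALUE_MASK   0xffffULL
    #define CHECK_SLID_VALUE_SHIFT  0

    int main(void)
    {
        uint64_t mask = 0xfff0;   /* example: low 4 lid bits are "don't care" */
        uint32_t lid  = 0x1234;

        uint64_t sreg = ((mask & CHECK_SLID_MASK_MASK) << CHECK_SLID_MASK_SHIFT) |
                        (((lid & mask) & CHECK_SLID_VALUE_MASK) << CHECK_SLID_VALUE_SHIFT);

        printf("SendDmaEngine SLID_CHECK = 0x%016llx\n", (unsigned long long)sreg);
        return 0;
    }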
-
-/* tx not dword sized - pad */
-int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
-{
-       int rval = 0;
-
-       tx->num_desc++;
-       if ((unlikely(tx->num_desc == tx->desc_limit))) {
-               rval = _extend_sdma_tx_descs(dd, tx);
-               if (rval) {
-                       sdma_txclean(dd, tx);
-                       return rval;
-               }
-       }
-       /* finish the one just added */
-       make_tx_sdma_desc(
-               tx,
-               SDMA_MAP_NONE,
-               dd->sdma_pad_phys,
-               sizeof(u32) - (tx->packet_len & (sizeof(u32) - 1)));
-       _sdma_close_tx(dd, tx);
-       return rval;
-}
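
The pad length passed to make_tx_sdma_desc() above is simply the byte count needed to reach the next dword boundary. A short standalone check of that arithmetic, for the non-dword-aligned lengths this path is called with:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        for (uint16_t packet_len = 61; packet_len <= 63; packet_len++) {
            uint16_t pad = sizeof(uint32_t) - (packet_len & (sizeof(uint32_t) - 1));
            printf("packet_len %u -> pad %u -> padded total %u\n",
                   (unsigned)packet_len, (unsigned)pad,
                   (unsigned)(packet_len + pad));
        }
        return 0;
    }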
-
-/*
- * Add ahg to the sdma_txreq
- *
- * The logic will consume up to 3
- * descriptors at the beginning of
- * sdma_txreq.
- */
-void _sdma_txreq_ahgadd(
-       struct sdma_txreq *tx,
-       u8 num_ahg,
-       u8 ahg_entry,
-       u32 *ahg,
-       u8 ahg_hlen)
-{
-       u32 i, shift = 0, desc = 0;
-       u8 mode;
-
-       WARN_ON_ONCE(num_ahg > 9 || (ahg_hlen & 3) || ahg_hlen == 4);
-       /* compute mode */
-       if (num_ahg == 1)
-               mode = SDMA_AHG_APPLY_UPDATE1;
-       else if (num_ahg <= 5)
-               mode = SDMA_AHG_APPLY_UPDATE2;
-       else
-               mode = SDMA_AHG_APPLY_UPDATE3;
-       tx->num_desc++;
-       /* initialize the consumed descriptors to zero */
-       switch (mode) {
-       case SDMA_AHG_APPLY_UPDATE3:
-               tx->num_desc++;
-               tx->descs[2].qw[0] = 0;
-               tx->descs[2].qw[1] = 0;
-               /* FALLTHROUGH */
-       case SDMA_AHG_APPLY_UPDATE2:
-               tx->num_desc++;
-               tx->descs[1].qw[0] = 0;
-               tx->descs[1].qw[1] = 0;
-               break;
-       }
-       ahg_hlen >>= 2;
-       tx->descs[0].qw[1] |=
-               (((u64)ahg_entry & SDMA_DESC1_HEADER_INDEX_MASK)
-                       << SDMA_DESC1_HEADER_INDEX_SHIFT) |
-               (((u64)ahg_hlen & SDMA_DESC1_HEADER_DWS_MASK)
-                       << SDMA_DESC1_HEADER_DWS_SHIFT) |
-               (((u64)mode & SDMA_DESC1_HEADER_MODE_MASK)
-                       << SDMA_DESC1_HEADER_MODE_SHIFT) |
-               (((u64)ahg[0] & SDMA_DESC1_HEADER_UPDATE1_MASK)
-                       << SDMA_DESC1_HEADER_UPDATE1_SHIFT);
-       for (i = 0; i < (num_ahg - 1); i++) {
-               if (!shift && !(i & 2))
-                       desc++;
-               tx->descs[desc].qw[!!(i & 2)] |=
-                       (((u64)ahg[i + 1])
-                               << shift);
-               shift = (shift + 32) & 63;
-       }
-}
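
The mode selection at the top of _sdma_txreq_ahgadd() maps the requested number of AHG updates onto one of three layouts, and each layout consumes a fixed number of leading descriptors (one for UPDATE1, two for UPDATE2, three for UPDATE3, matching the num_desc increments above). A standalone restatement of just that mapping, using the mode values from sdma.h in this diff:

    #include <stdio.h>

    #define SDMA_AHG_APPLY_UPDATE1 2
    #define SDMA_AHG_APPLY_UPDATE2 3
    #define SDMA_AHG_APPLY_UPDATE3 4

    static unsigned ahg_mode_for(unsigned num_ahg)
    {
        if (num_ahg == 1)
            return SDMA_AHG_APPLY_UPDATE1;
        if (num_ahg <= 5)
            return SDMA_AHG_APPLY_UPDATE2;
        return SDMA_AHG_APPLY_UPDATE3;
    }

    int main(void)
    {
        for (unsigned num_ahg = 1; num_ahg <= 9; num_ahg++) {
            unsigned mode = ahg_mode_for(num_ahg);
            /* descriptors consumed: UPDATE1 -> 1, UPDATE2 -> 2, UPDATE3 -> 3 */
            printf("num_ahg %u -> mode %u -> %u leading descriptor(s)\n",
                   num_ahg, mode, mode - 1);
        }
        return 0;
    }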
-
-/**
- * sdma_ahg_alloc - allocate an AHG entry
- * @sde: engine to allocate from
- *
- * Return:
- * 0-31 when successful, -EOPNOTSUPP if AHG is not enabled,
- * -ENOSPC if an entry is not available
- */
-int sdma_ahg_alloc(struct sdma_engine *sde)
-{
-       int nr;
-       int oldbit;
-
-       if (!sde) {
-               trace_hfi1_ahg_allocate(sde, -EINVAL);
-               return -EINVAL;
-       }
-       while (1) {
-               nr = ffz(ACCESS_ONCE(sde->ahg_bits));
-               if (nr > 31) {
-                       trace_hfi1_ahg_allocate(sde, -ENOSPC);
-                       return -ENOSPC;
-               }
-               oldbit = test_and_set_bit(nr, &sde->ahg_bits);
-               if (!oldbit)
-                       break;
-               cpu_relax();
-       }
-       trace_hfi1_ahg_allocate(sde, nr);
-       return nr;
-}
-
-/**
- * sdma_ahg_free - free an AHG entry
- * @sde: engine to return AHG entry
- * @ahg_index: index to free
- *
- * This routine frees the indicated AHG entry.
- */
-void sdma_ahg_free(struct sdma_engine *sde, int ahg_index)
-{
-       if (!sde)
-               return;
-       trace_hfi1_ahg_deallocate(sde, ahg_index);
-       if (ahg_index < 0 || ahg_index > 31)
-               return;
-       clear_bit(ahg_index, &sde->ahg_bits);
-}
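
sdma_ahg_alloc()/sdma_ahg_free() implement a small lock-free bitmap allocator: find the first zero bit, then atomically test-and-set it, retrying if another CPU won the race. The following is only a userspace analogue of that pattern using C11 atomics and a GCC builtin (ffz() and test_and_set_bit() are kernel helpers and are not reproduced here).

    #include <stdio.h>
    #include <stdint.h>
    #include <stdatomic.h>

    static _Atomic uint32_t ahg_bits;

    /* returns 0-31 on success, -1 when all entries are in use */
    static int ahg_alloc(void)
    {
        for (;;) {
            uint32_t bits = atomic_load(&ahg_bits);
            if (bits == UINT32_MAX)
                return -1;                      /* no zero bit left */
            int nr = __builtin_ctz(~bits);      /* first zero bit */
            uint32_t newbits = bits | (1u << nr);
            if (atomic_compare_exchange_weak(&ahg_bits, &bits, newbits))
                return nr;                      /* we claimed it */
            /* lost the race; retry */
        }
    }

    static void ahg_free(int nr)
    {
        if (nr >= 0 && nr <= 31)
            atomic_fetch_and(&ahg_bits, ~(1u << nr));
    }

    int main(void)
    {
        int a = ahg_alloc(), b = ahg_alloc();
        printf("allocated %d and %d\n", a, b);
        ahg_free(a);
        printf("after free, next alloc gets %d\n", ahg_alloc());
        return 0;
    }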
-
-/*
- * SPC freeze handling for SDMA engines.  Called when the driver knows
- * the SPC is going into a freeze but before the freeze is fully
- * settled.  Generally an error interrupt.
- *
- * This event will pull the engine out of running so no more entries can be
- * added to the engine's queue.
- */
-void sdma_freeze_notify(struct hfi1_devdata *dd, int link_down)
-{
-       int i;
-       enum sdma_events event = link_down ? sdma_event_e85_link_down :
-                                            sdma_event_e80_hw_freeze;
-
-       /* set up the wait but do not wait here */
-       atomic_set(&dd->sdma_unfreeze_count, dd->num_sdma);
-
-       /* tell all engines to stop running and wait */
-       for (i = 0; i < dd->num_sdma; i++)
-               sdma_process_event(&dd->per_sdma[i], event);
-
-       /* sdma_freeze() will wait for all engines to have stopped */
-}
-
-/*
- * SPC freeze handling for SDMA engines.  Called when the driver knows
- * the SPC is fully frozen.
- */
-void sdma_freeze(struct hfi1_devdata *dd)
-{
-       int i;
-       int ret;
-
-       /*
-        * Make sure all engines have moved out of the running state before
-        * continuing.
-        */
-       ret = wait_event_interruptible(dd->sdma_unfreeze_wq,
-                                      atomic_read(&dd->sdma_unfreeze_count) <=
-                                      0);
-       /* interrupted or count is negative, then unloading - just exit */
-       if (ret || atomic_read(&dd->sdma_unfreeze_count) < 0)
-               return;
-
-       /* set up the count for the next wait */
-       atomic_set(&dd->sdma_unfreeze_count, dd->num_sdma);
-
-       /* tell all engines that the SPC is frozen, they can start cleaning */
-       for (i = 0; i < dd->num_sdma; i++)
-               sdma_process_event(&dd->per_sdma[i], sdma_event_e81_hw_frozen);
-
-       /*
-        * Wait for everyone to finish software clean before exiting.  The
-        * software clean will read engine CSRs, so must be completed before
-        * the next step, which will clear the engine CSRs.
-        */
-       (void)wait_event_interruptible(dd->sdma_unfreeze_wq,
-                               atomic_read(&dd->sdma_unfreeze_count) <= 0);
-       /* no need to check results - done no matter what */
-}
-
-/*
- * SPC freeze handling for the SDMA engines.  Called after the SPC is unfrozen.
- *
- * The SPC freeze acts like a SDMA halt and a hardware clean combined.  All
- * that is left is a software clean.  We could do it after the SPC is fully
- * frozen, but then we'd have to add another state to wait for the unfreeze.
- * Instead, just defer the software clean until the unfreeze step.
- */
-void sdma_unfreeze(struct hfi1_devdata *dd)
-{
-       int i;
-
-       /* tell all engines to start freeze clean up */
-       for (i = 0; i < dd->num_sdma; i++)
-               sdma_process_event(&dd->per_sdma[i],
-                                  sdma_event_e82_hw_unfreeze);
-}
-
-/**
- * _sdma_engine_progress_schedule() - schedule progress on engine
- * @sde: sdma_engine to schedule progress
- *
- */
-void _sdma_engine_progress_schedule(
-       struct sdma_engine *sde)
-{
-       trace_hfi1_sdma_engine_progress(sde, sde->progress_mask);
-       /* assume we have selected a good cpu */
-       write_csr(sde->dd,
-                 CCE_INT_FORCE + (8 * (IS_SDMA_START / 64)),
-                 sde->progress_mask);
-}
diff --git a/drivers/staging/rdma/hfi1/sdma.h b/drivers/staging/rdma/hfi1/sdma.h
deleted file mode 100644 (file)
index 8f50c99..0000000
+++ /dev/null
@@ -1,1082 +0,0 @@
-#ifndef _HFI1_SDMA_H
-#define _HFI1_SDMA_H
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/types.h>
-#include <linux/list.h>
-#include <asm/byteorder.h>
-#include <linux/workqueue.h>
-#include <linux/rculist.h>
-
-#include "hfi.h"
-#include "verbs.h"
-#include "sdma_txreq.h"
-
-/* Hardware limit */
-#define MAX_DESC 64
-/* Hardware limit for SDMA packet size */
-#define MAX_SDMA_PKT_SIZE ((16 * 1024) - 1)
-
-#define SDMA_TXREQ_S_OK        0
-#define SDMA_TXREQ_S_SENDERROR 1
-#define SDMA_TXREQ_S_ABORTED   2
-#define SDMA_TXREQ_S_SHUTDOWN  3
-
-/* flags bits */
-#define SDMA_TXREQ_F_URGENT       0x0001
-#define SDMA_TXREQ_F_AHG_COPY     0x0002
-#define SDMA_TXREQ_F_USE_AHG      0x0004
-
-#define SDMA_MAP_NONE          0
-#define SDMA_MAP_SINGLE        1
-#define SDMA_MAP_PAGE          2
-
-#define SDMA_AHG_VALUE_MASK          0xffff
-#define SDMA_AHG_VALUE_SHIFT         0
-#define SDMA_AHG_INDEX_MASK          0xf
-#define SDMA_AHG_INDEX_SHIFT         16
-#define SDMA_AHG_FIELD_LEN_MASK      0xf
-#define SDMA_AHG_FIELD_LEN_SHIFT     20
-#define SDMA_AHG_FIELD_START_MASK    0x1f
-#define SDMA_AHG_FIELD_START_SHIFT   24
-#define SDMA_AHG_UPDATE_ENABLE_MASK  0x1
-#define SDMA_AHG_UPDATE_ENABLE_SHIFT 31
-
-/* AHG modes */
-
-/*
- * Be aware the ordering and values
- * for SDMA_AHG_APPLY_UPDATE[123]
- * are assumed in generating a skip
- * count in submit_tx() in sdma.c
- */
-#define SDMA_AHG_NO_AHG              0
-#define SDMA_AHG_COPY                1
-#define SDMA_AHG_APPLY_UPDATE1       2
-#define SDMA_AHG_APPLY_UPDATE2       3
-#define SDMA_AHG_APPLY_UPDATE3       4
-
-/*
- * Bits defined in the send DMA descriptor.
- */
-#define SDMA_DESC0_FIRST_DESC_FLAG      BIT_ULL(63)
-#define SDMA_DESC0_LAST_DESC_FLAG       BIT_ULL(62)
-#define SDMA_DESC0_BYTE_COUNT_SHIFT     48
-#define SDMA_DESC0_BYTE_COUNT_WIDTH     14
-#define SDMA_DESC0_BYTE_COUNT_MASK \
-       ((1ULL << SDMA_DESC0_BYTE_COUNT_WIDTH) - 1)
-#define SDMA_DESC0_BYTE_COUNT_SMASK \
-       (SDMA_DESC0_BYTE_COUNT_MASK << SDMA_DESC0_BYTE_COUNT_SHIFT)
-#define SDMA_DESC0_PHY_ADDR_SHIFT       0
-#define SDMA_DESC0_PHY_ADDR_WIDTH       48
-#define SDMA_DESC0_PHY_ADDR_MASK \
-       ((1ULL << SDMA_DESC0_PHY_ADDR_WIDTH) - 1)
-#define SDMA_DESC0_PHY_ADDR_SMASK \
-       (SDMA_DESC0_PHY_ADDR_MASK << SDMA_DESC0_PHY_ADDR_SHIFT)
-
-#define SDMA_DESC1_HEADER_UPDATE1_SHIFT 32
-#define SDMA_DESC1_HEADER_UPDATE1_WIDTH 32
-#define SDMA_DESC1_HEADER_UPDATE1_MASK \
-       ((1ULL << SDMA_DESC1_HEADER_UPDATE1_WIDTH) - 1)
-#define SDMA_DESC1_HEADER_UPDATE1_SMASK \
-       (SDMA_DESC1_HEADER_UPDATE1_MASK << SDMA_DESC1_HEADER_UPDATE1_SHIFT)
-#define SDMA_DESC1_HEADER_MODE_SHIFT    13
-#define SDMA_DESC1_HEADER_MODE_WIDTH    3
-#define SDMA_DESC1_HEADER_MODE_MASK \
-       ((1ULL << SDMA_DESC1_HEADER_MODE_WIDTH) - 1)
-#define SDMA_DESC1_HEADER_MODE_SMASK \
-       (SDMA_DESC1_HEADER_MODE_MASK << SDMA_DESC1_HEADER_MODE_SHIFT)
-#define SDMA_DESC1_HEADER_INDEX_SHIFT   8
-#define SDMA_DESC1_HEADER_INDEX_WIDTH   5
-#define SDMA_DESC1_HEADER_INDEX_MASK \
-       ((1ULL << SDMA_DESC1_HEADER_INDEX_WIDTH) - 1)
-#define SDMA_DESC1_HEADER_INDEX_SMASK \
-       (SDMA_DESC1_HEADER_INDEX_MASK << SDMA_DESC1_HEADER_INDEX_SHIFT)
-#define SDMA_DESC1_HEADER_DWS_SHIFT     4
-#define SDMA_DESC1_HEADER_DWS_WIDTH     4
-#define SDMA_DESC1_HEADER_DWS_MASK \
-       ((1ULL << SDMA_DESC1_HEADER_DWS_WIDTH) - 1)
-#define SDMA_DESC1_HEADER_DWS_SMASK \
-       (SDMA_DESC1_HEADER_DWS_MASK << SDMA_DESC1_HEADER_DWS_SHIFT)
-#define SDMA_DESC1_GENERATION_SHIFT     2
-#define SDMA_DESC1_GENERATION_WIDTH     2
-#define SDMA_DESC1_GENERATION_MASK \
-       ((1ULL << SDMA_DESC1_GENERATION_WIDTH) - 1)
-#define SDMA_DESC1_GENERATION_SMASK \
-       (SDMA_DESC1_GENERATION_MASK << SDMA_DESC1_GENERATION_SHIFT)
-#define SDMA_DESC1_INT_REQ_FLAG         BIT_ULL(1)
-#define SDMA_DESC1_HEAD_TO_HOST_FLAG    BIT_ULL(0)
-
-enum sdma_states {
-       sdma_state_s00_hw_down,
-       sdma_state_s10_hw_start_up_halt_wait,
-       sdma_state_s15_hw_start_up_clean_wait,
-       sdma_state_s20_idle,
-       sdma_state_s30_sw_clean_up_wait,
-       sdma_state_s40_hw_clean_up_wait,
-       sdma_state_s50_hw_halt_wait,
-       sdma_state_s60_idle_halt_wait,
-       sdma_state_s80_hw_freeze,
-       sdma_state_s82_freeze_sw_clean,
-       sdma_state_s99_running,
-};
-
-enum sdma_events {
-       sdma_event_e00_go_hw_down,
-       sdma_event_e10_go_hw_start,
-       sdma_event_e15_hw_halt_done,
-       sdma_event_e25_hw_clean_up_done,
-       sdma_event_e30_go_running,
-       sdma_event_e40_sw_cleaned,
-       sdma_event_e50_hw_cleaned,
-       sdma_event_e60_hw_halted,
-       sdma_event_e70_go_idle,
-       sdma_event_e80_hw_freeze,
-       sdma_event_e81_hw_frozen,
-       sdma_event_e82_hw_unfreeze,
-       sdma_event_e85_link_down,
-       sdma_event_e90_sw_halted,
-};
-
-struct sdma_set_state_action {
-       unsigned op_enable:1;
-       unsigned op_intenable:1;
-       unsigned op_halt:1;
-       unsigned op_cleanup:1;
-       unsigned go_s99_running_tofalse:1;
-       unsigned go_s99_running_totrue:1;
-};
-
-struct sdma_state {
-       struct kref          kref;
-       struct completion    comp;
-       enum sdma_states current_state;
-       unsigned             current_op;
-       unsigned             go_s99_running;
-       /* debugging/development */
-       enum sdma_states previous_state;
-       unsigned             previous_op;
-       enum sdma_events last_event;
-};
-
-/**
- * DOC: sdma exported routines
- *
- * These sdma routines fit into three categories:
- * - The SDMA API for building and submitting packets
- *   to the ring
- *
- * - Initialization and tear down routines to build up
- *   and tear down SDMA
- *
- * - ISR entrances to handle interrupts, state changes
- *   and errors
- */
-
-/**
- * DOC: sdma PSM/verbs API
- *
- * The sdma API is designed to be used by both PSM
- * and verbs to supply packets to the SDMA ring.
- *
- * The usage of the API is as follows:
- *
- * Embed a struct iowait in the QP or
- * PQ.  The iowait should be initialized with a
- * call to iowait_init().
- *
- * The user of the API should create an allocation method
- * for their version of the txreq. Slabs, pre-allocated lists,
- * and dma pools can be used.  Once the user's overload of
- * the sdma_txreq has been allocated, the sdma_txreq member
- * must be initialized with sdma_txinit() or sdma_txinit_ahg().
- *
- * The user's txreq structure must declare its embedded struct sdma_txreq
- * as the first member.
- *
- * The tx request, once initialized, is manipulated with calls to
- * sdma_txadd_daddr(), sdma_txadd_page(), or sdma_txadd_kvaddr()
- * for each disjoint memory location.  It is the user's responsibility
- * to understand the packet boundaries and page boundaries to do the
- * appropriate number of sdma_txadd_* calls.  The user
- * must be prepared to deal with failures from these routines due to
- * either memory allocation or dma_mapping failures.
- *
- * The mapping specifics for each memory location are recorded
- * in the tx. Memory locations added with sdma_txadd_page()
- * and sdma_txadd_kvaddr() are automatically mapped when added
- * to the tx and unmapped as part of the progress processing in the
- * SDMA interrupt handling.
- *
- * sdma_txadd_daddr() is used to add a dma_addr_t memory to the
- * tx.   An example of a use case would be a pre-allocated
- * set of headers allocated via dma_pool_alloc() or
- * dma_alloc_coherent().  For these memory locations, it
- * is the responsibility of the user to handle that unmapping.
- * (This would usually be at an unload or job termination.)
- *
- * The routine sdma_send_txreq() is used to submit
- * a tx to the ring after the appropriate number of
- * sdma_txadd_* have been done.
- *
- * If it is desired to send a burst of sdma_txreqs, sdma_send_txlist()
- * can be used to submit a list of packets.
- *
- * The user is free to use the link overhead in the struct sdma_txreq as
- * long as the tx isn't in flight.
- *
- * The extreme degenerate case of the number of descriptors
- * exceeding the ring size is automatically handled as
- * memory locations are added.  An overflow of the descriptor
- * array that is part of the sdma_txreq is also automatically
- * handled.
- *
- */
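[Editorial note: a minimal sketch of the flow described above, for illustration only. The wrapper struct my_txreq, the handler my_tx_complete(), and my_send() are hypothetical names; error unwinding is elided. Only the sdma_*() and iowait entry points come from this API.]

struct my_txreq {			/* hypothetical user overload */
	struct sdma_txreq txreq;	/* must be the first member */
	void *cookie;
};

static void my_tx_complete(struct sdma_txreq *tx, int status)
{
	/* may run from ISR/tasklet/thread context - no sleeping here */
}

static int my_send(struct sdma_engine *sde, struct iowait *wait,
		   struct my_txreq *tx, void *hdr, u16 hdrlen,
		   struct page *page, unsigned long offset, u16 datalen)
{
	int ret;

	ret = sdma_txinit(&tx->txreq, 0, hdrlen + datalen, my_tx_complete);
	if (ret)
		return ret;
	ret = sdma_txadd_kvaddr(sde->dd, &tx->txreq, hdr, hdrlen);
	if (ret)
		return ret;
	ret = sdma_txadd_page(sde->dd, &tx->txreq, page, offset, datalen);
	if (ret)
		return ret;
	/* wait is the iowait embedded in the QP/PQ, set up with iowait_init() */
	return sdma_send_txreq(sde, wait, &tx->txreq);
}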
-
-/**
- * DOC: Infrastructure calls
- *
- * sdma_init() is used to initialize data structures and
- * CSRs for the desired number of SDMA engines.
- *
- * sdma_start() is used to kick the SDMA engines initialized
- * with sdma_init().   Interrupts must be enabled at this
- * point since aspects of the state machine are interrupt
- * driven.
- *
- * sdma_engine_error() and sdma_engine_interrupt() are
- * entrances for interrupts.
- *
- * sdma_map_init() is for the management of the mapping
- * table when the number of vls is changed.
- *
- */
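[Editorial note: a hedged sketch of the bring-up ordering implied above. bring_up_sdma() is a hypothetical wrapper, not the driver's actual init path, and error unwinding is omitted.]

static int bring_up_sdma(struct hfi1_devdata *dd, u8 port, u8 num_vls,
			 u8 *vl_engines)
{
	int ret;

	ret = sdma_init(dd, port);	/* data structures and CSRs */
	if (ret)
		return ret;
	ret = sdma_map_init(dd, port, num_vls, vl_engines); /* vl -> engine map */
	if (ret)
		return ret;
	/* interrupts must be enabled before this: the state machine is IRQ driven */
	sdma_start(dd);
	return 0;
}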
-
-/*
- * struct hw_sdma_desc - raw 128 bit SDMA descriptor
- *
- * This is the raw descriptor in the SDMA ring
- */
-struct hw_sdma_desc {
-       /* private:  don't use directly */
-       __le64 qw[2];
-};
-
-/**
- * struct sdma_engine - Data pertaining to each SDMA engine.
- * @dd: a back-pointer to the device data
- * @ppd: per port back-pointer
- * @imask: mask for irq manipulation
- * @idle_mask: mask for determining if an interrupt is due to sdma_idle
- *
- * This structure has the state for each sdma_engine.
- *
- * Accessing non-public fields is not supported
- * since the private members are subject to change.
- */
-struct sdma_engine {
-       /* read mostly */
-       struct hfi1_devdata *dd;
-       struct hfi1_pportdata *ppd;
-       /* private: */
-       void __iomem *tail_csr;
-       u64 imask;                      /* clear interrupt mask */
-       u64 idle_mask;
-       u64 progress_mask;
-       u64 int_mask;
-       /* private: */
-       volatile __le64      *head_dma; /* DMA'ed by chip */
-       /* private: */
-       dma_addr_t            head_phys;
-       /* private: */
-       struct hw_sdma_desc *descq;
-       /* private: */
-       unsigned descq_full_count;
-       struct sdma_txreq **tx_ring;
-       /* private: */
-       dma_addr_t            descq_phys;
-       /* private */
-       u32 sdma_mask;
-       /* private */
-       struct sdma_state state;
-       /* private */
-       int cpu;
-       /* private: */
-       u8 sdma_shift;
-       /* private: */
-       u8 this_idx; /* zero relative engine */
-       /* protect changes to senddmactrl shadow */
-       spinlock_t senddmactrl_lock;
-       /* private: */
-       u64 p_senddmactrl;              /* shadow per-engine SendDmaCtrl */
-
-       /* read/write using tail_lock */
-       spinlock_t            tail_lock ____cacheline_aligned_in_smp;
-#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
-       /* private: */
-       u64                   tail_sn;
-#endif
-       /* private: */
-       u32                   descq_tail;
-       /* private: */
-       unsigned long         ahg_bits;
-       /* private: */
-       u16                   desc_avail;
-       /* private: */
-       u16                   tx_tail;
-       /* private: */
-       u16 descq_cnt;
-
-       /* read/write using head_lock */
-       /* private: */
-       seqlock_t            head_lock ____cacheline_aligned_in_smp;
-#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
-       /* private: */
-       u64                   head_sn;
-#endif
-       /* private: */
-       u32                   descq_head;
-       /* private: */
-       u16                   tx_head;
-       /* private: */
-       u64                   last_status;
-       /* private */
-       u64                     err_cnt;
-       /* private */
-       u64                     sdma_int_cnt;
-       u64                     idle_int_cnt;
-       u64                     progress_int_cnt;
-
-       /* private: */
-       struct list_head      dmawait;
-
-       /* CONFIG SDMA for now, just blindly duplicate */
-       /* private: */
-       struct tasklet_struct sdma_hw_clean_up_task
-               ____cacheline_aligned_in_smp;
-
-       /* private: */
-       struct tasklet_struct sdma_sw_clean_up_task
-               ____cacheline_aligned_in_smp;
-       /* private: */
-       struct work_struct err_halt_worker;
-       /* private */
-       struct timer_list     err_progress_check_timer;
-       u32                   progress_check_head;
-       /* private: */
-       struct work_struct flush_worker;
-       /* protect flush list */
-       spinlock_t flushlist_lock;
-       /* private: */
-       struct list_head flushlist;
-};
-
-int sdma_init(struct hfi1_devdata *dd, u8 port);
-void sdma_start(struct hfi1_devdata *dd);
-void sdma_exit(struct hfi1_devdata *dd);
-void sdma_all_running(struct hfi1_devdata *dd);
-void sdma_all_idle(struct hfi1_devdata *dd);
-void sdma_freeze_notify(struct hfi1_devdata *dd, int go_idle);
-void sdma_freeze(struct hfi1_devdata *dd);
-void sdma_unfreeze(struct hfi1_devdata *dd);
-void sdma_wait(struct hfi1_devdata *dd);
-
-/**
- * sdma_empty() - idle engine test
- * @sde: sdma engine
- *
- * Currently used by verbs as a latency optimization.
- *
- * Return:
- * 1 - empty, 0 - non-empty
- */
-static inline int sdma_empty(struct sdma_engine *sde)
-{
-       return sde->descq_tail == sde->descq_head;
-}
-
-static inline u16 sdma_descq_freecnt(struct sdma_engine *sde)
-{
-       return sde->descq_cnt -
-               (sde->descq_tail -
-                ACCESS_ONCE(sde->descq_head)) - 1;
-}
-
-static inline u16 sdma_descq_inprocess(struct sdma_engine *sde)
-{
-       return sde->descq_cnt - sdma_descq_freecnt(sde);
-}
-
-/*
- * Either head_lock or tail lock required to see
- * a steady state.
- */
-static inline int __sdma_running(struct sdma_engine *engine)
-{
-       return engine->state.current_state == sdma_state_s99_running;
-}
-
-/**
- * sdma_running() - state suitability test
- * @engine: sdma engine
- *
- * sdma_running probes the internal state to determine if it is suitable
- * for submitting packets.
- *
- * Return:
- * 1 - ok to submit, 0 - not ok to submit
- *
- */
-static inline int sdma_running(struct sdma_engine *engine)
-{
-       unsigned long flags;
-       int ret;
-
-       spin_lock_irqsave(&engine->tail_lock, flags);
-       ret = __sdma_running(engine);
-       spin_unlock_irqrestore(&engine->tail_lock, flags);
-       return ret;
-}
-
-void _sdma_txreq_ahgadd(
-       struct sdma_txreq *tx,
-       u8 num_ahg,
-       u8 ahg_entry,
-       u32 *ahg,
-       u8 ahg_hlen);
-
-/**
- * sdma_txinit_ahg() - initialize an sdma_txreq struct with AHG
- * @tx: tx request to initialize
- * @flags: flags to key last descriptor additions
- * @tlen: total packet length (pbc + headers + data)
- * @ahg_entry: ahg entry to use  (0 - 31)
- * @num_ahg: number of AHG descriptors for the first descriptor (0 - 9)
- * @ahg: array of AHG descriptors (up to 9 entries)
- * @ahg_hlen: number of bytes from ASIC entry to use
- * @cb: callback
- *
- * The allocation of the sdma_txreq and its enclosing structure is user
- * dependent.  This routine must be called to initialize the user independent
- * fields.
- *
- * The currently supported flags are SDMA_TXREQ_F_URGENT,
- * SDMA_TXREQ_F_AHG_COPY, and SDMA_TXREQ_F_USE_AHG.
- *
- * SDMA_TXREQ_F_URGENT is used for latency sensitive situations where the
- * completion is desired as soon as possible.
- *
- * SDMA_TXREQ_F_AHG_COPY causes the header in the first descriptor to be
- * copied to the chip entry.  SDMA_TXREQ_F_USE_AHG causes the code to add
- * the AHG descriptors into the first 1 to 3 descriptors.
- *
- * Completions of submitted requests can be gotten on selected
- * txreqs by giving a completion routine callback to sdma_txinit() or
- * sdma_txinit_ahg().  The environment in which the callback runs
- * can be from an ISR, a tasklet, or a thread, so no sleeping
- * kernel routines can be used.   Aspects of the sdma ring may
- * be locked so care should be taken with locking.
- *
- * The callback pointer can be NULL to avoid any callback for the packet
- * being submitted. The callback will be provided this tx, a status, and a flag.
- *
- * The status will be one of SDMA_TXREQ_S_OK, SDMA_TXREQ_S_SENDERROR,
- * SDMA_TXREQ_S_ABORTED, or SDMA_TXREQ_S_SHUTDOWN.
- *
- * The flag, if the iowait was used, indicates that the iowait
- * sdma_busy count has reached zero.
- *
- * The user data portion of tlen should be precise.   The sdma_txadd_* entrances
- * will pad with a descriptor referencing 1 - 3 bytes when the number of bytes
- * specified in tlen has been supplied to the sdma_txreq.
- *
- * ahg_hlen is used to determine the number of on-chip entry bytes to
- * use as the header.   This is for cases where the stored header is
- * larger than the header to be used in a packet.  This is typical
- * for verbs where an RDMA_WRITE_FIRST is larger than the packet in
- * an RDMA_WRITE_MIDDLE.
- *
- */
-static inline int sdma_txinit_ahg(
-       struct sdma_txreq *tx,
-       u16 flags,
-       u16 tlen,
-       u8 ahg_entry,
-       u8 num_ahg,
-       u32 *ahg,
-       u8 ahg_hlen,
-       void (*cb)(struct sdma_txreq *, int))
-{
-       if (tlen == 0)
-               return -ENODATA;
-       if (tlen > MAX_SDMA_PKT_SIZE)
-               return -EMSGSIZE;
-       tx->desc_limit = ARRAY_SIZE(tx->descs);
-       tx->descp = &tx->descs[0];
-       INIT_LIST_HEAD(&tx->list);
-       tx->num_desc = 0;
-       tx->flags = flags;
-       tx->complete = cb;
-       tx->coalesce_buf = NULL;
-       tx->wait = NULL;
-       tx->packet_len = tlen;
-       tx->tlen = tx->packet_len;
-       tx->descs[0].qw[0] = SDMA_DESC0_FIRST_DESC_FLAG;
-       tx->descs[0].qw[1] = 0;
-       if (flags & SDMA_TXREQ_F_AHG_COPY)
-               tx->descs[0].qw[1] |=
-                       (((u64)ahg_entry & SDMA_DESC1_HEADER_INDEX_MASK)
-                               << SDMA_DESC1_HEADER_INDEX_SHIFT) |
-                       (((u64)SDMA_AHG_COPY & SDMA_DESC1_HEADER_MODE_MASK)
-                               << SDMA_DESC1_HEADER_MODE_SHIFT);
-       else if (flags & SDMA_TXREQ_F_USE_AHG && num_ahg)
-               _sdma_txreq_ahgadd(tx, num_ahg, ahg_entry, ahg, ahg_hlen);
-       return 0;
-}
-
-/**
- * sdma_txinit() - initialize an sdma_txreq struct (no AHG)
- * @tx: tx request to initialize
- * @flags: flags to key last descriptor additions
- * @tlen: total packet length (pbc + headers + data)
- * @cb: callback pointer
- *
- * The allocation of the sdma_txreq and its enclosing structure is user
- * dependent.  This routine must be called to initialize the user
- * independent fields.
- *
- * The currently supported flag is SDMA_TXREQ_F_URGENT.
- *
- * SDMA_TXREQ_F_URGENT is used for latency sensitive situations where the
- * completion is desired as soon as possible.
- *
- * Completions of submitted requests can be gotten on selected
- * txreqs by giving a completion routine callback to sdma_txinit() or
- * sdma_txinit_ahg().  The environment in which the callback runs
- * can be from an ISR, a tasklet, or a thread, so no sleeping
- * kernel routines can be used.   Aspects of the sdma ring may
- * be locked so care should be taken with locking.
- *
- * The callback pointer can be NULL to avoid any callback for the packet
- * being submitted.
- *
- * The callback, if non-NULL,  will be provided this tx and a status.  The
- * status will be one of SDMA_TXREQ_S_OK, SDMA_TXREQ_S_SENDERROR,
- * SDMA_TXREQ_S_ABORTED, or SDMA_TXREQ_S_SHUTDOWN.
- *
- */
-static inline int sdma_txinit(
-       struct sdma_txreq *tx,
-       u16 flags,
-       u16 tlen,
-       void (*cb)(struct sdma_txreq *, int))
-{
-       return sdma_txinit_ahg(tx, flags, tlen, 0, 0, NULL, 0, cb);
-}
-
-/* helpers - don't use */
-static inline int sdma_mapping_type(struct sdma_desc *d)
-{
-       return (d->qw[1] & SDMA_DESC1_GENERATION_SMASK)
-               >> SDMA_DESC1_GENERATION_SHIFT;
-}
-
-static inline size_t sdma_mapping_len(struct sdma_desc *d)
-{
-       return (d->qw[0] & SDMA_DESC0_BYTE_COUNT_SMASK)
-               >> SDMA_DESC0_BYTE_COUNT_SHIFT;
-}
-
-static inline dma_addr_t sdma_mapping_addr(struct sdma_desc *d)
-{
-       return (d->qw[0] & SDMA_DESC0_PHY_ADDR_SMASK)
-               >> SDMA_DESC0_PHY_ADDR_SHIFT;
-}
-
-static inline void make_tx_sdma_desc(
-       struct sdma_txreq *tx,
-       int type,
-       dma_addr_t addr,
-       size_t len)
-{
-       struct sdma_desc *desc = &tx->descp[tx->num_desc];
-
-       if (!tx->num_desc) {
-               /* qw[0] zero; qw[1] first, ahg mode already in from init */
-               desc->qw[1] |= ((u64)type & SDMA_DESC1_GENERATION_MASK)
-                               << SDMA_DESC1_GENERATION_SHIFT;
-       } else {
-               desc->qw[0] = 0;
-               desc->qw[1] = ((u64)type & SDMA_DESC1_GENERATION_MASK)
-                               << SDMA_DESC1_GENERATION_SHIFT;
-       }
-       desc->qw[0] |= (((u64)addr & SDMA_DESC0_PHY_ADDR_MASK)
-                               << SDMA_DESC0_PHY_ADDR_SHIFT) |
-                       (((u64)len & SDMA_DESC0_BYTE_COUNT_MASK)
-                               << SDMA_DESC0_BYTE_COUNT_SHIFT);
-}
-
-/* helper to extend txreq */
-int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx,
-                          int type, void *kvaddr, struct page *page,
-                          unsigned long offset, u16 len);
-int _pad_sdma_tx_descs(struct hfi1_devdata *, struct sdma_txreq *);
-void sdma_txclean(struct hfi1_devdata *, struct sdma_txreq *);
-
-/* helpers used by public routines */
-static inline void _sdma_close_tx(struct hfi1_devdata *dd,
-                                 struct sdma_txreq *tx)
-{
-       tx->descp[tx->num_desc].qw[0] |=
-               SDMA_DESC0_LAST_DESC_FLAG;
-       tx->descp[tx->num_desc].qw[1] |=
-               dd->default_desc1;
-       if (tx->flags & SDMA_TXREQ_F_URGENT)
-               tx->descp[tx->num_desc].qw[1] |=
-                       (SDMA_DESC1_HEAD_TO_HOST_FLAG |
-                        SDMA_DESC1_INT_REQ_FLAG);
-}
-
-static inline int _sdma_txadd_daddr(
-       struct hfi1_devdata *dd,
-       int type,
-       struct sdma_txreq *tx,
-       dma_addr_t addr,
-       u16 len)
-{
-       int rval = 0;
-
-       make_tx_sdma_desc(
-               tx,
-               type,
-               addr, len);
-       WARN_ON(len > tx->tlen);
-       tx->tlen -= len;
-       /* special cases for last */
-       if (!tx->tlen) {
-               if (tx->packet_len & (sizeof(u32) - 1)) {
-                       rval = _pad_sdma_tx_descs(dd, tx);
-                       if (rval)
-                               return rval;
-               } else {
-                       _sdma_close_tx(dd, tx);
-               }
-       }
-       tx->num_desc++;
-       return rval;
-}
-
-/**
- * sdma_txadd_page() - add a page to the sdma_txreq
- * @dd: the device to use for mapping
- * @tx: tx request to which the page is added
- * @page: page to map
- * @offset: offset within the page
- * @len: length in bytes
- *
- * This is used to add a page/offset/length descriptor.
- *
- * The mapping/unmapping of the page/offset/len is automatically handled.
- *
- * Return:
- * 0 - success, -ENOSPC - mapping failure, -ENOMEM - couldn't
- * extend/coalesce descriptor array
- */
-static inline int sdma_txadd_page(
-       struct hfi1_devdata *dd,
-       struct sdma_txreq *tx,
-       struct page *page,
-       unsigned long offset,
-       u16 len)
-{
-       dma_addr_t addr;
-       int rval;
-
-       if ((unlikely(tx->num_desc == tx->desc_limit))) {
-               rval = ext_coal_sdma_tx_descs(dd, tx, SDMA_MAP_PAGE,
-                                             NULL, page, offset, len);
-               if (rval <= 0)
-                       return rval;
-       }
-
-       addr = dma_map_page(
-                      &dd->pcidev->dev,
-                      page,
-                      offset,
-                      len,
-                      DMA_TO_DEVICE);
-
-       if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) {
-               sdma_txclean(dd, tx);
-               return -ENOSPC;
-       }
-
-       return _sdma_txadd_daddr(
-                       dd, SDMA_MAP_PAGE, tx, addr, len);
-}
-
-/**
- * sdma_txadd_daddr() - add a dma address to the sdma_txreq
- * @dd: the device to use for mapping
- * @tx: sdma_txreq to which the page is added
- * @addr: dma address mapped by caller
- * @len: length in bytes
- *
- * This is used to add a descriptor for memory that is already dma mapped.
- *
- * In this case, there is no unmapping as part of the progress processing for
- * this memory location.
- *
- * Return:
- * 0 - success, -ENOMEM - couldn't extend descriptor array
- */
-
-static inline int sdma_txadd_daddr(
-       struct hfi1_devdata *dd,
-       struct sdma_txreq *tx,
-       dma_addr_t addr,
-       u16 len)
-{
-       int rval;
-
-       if ((unlikely(tx->num_desc == tx->desc_limit))) {
-               rval = ext_coal_sdma_tx_descs(dd, tx, SDMA_MAP_NONE,
-                                             NULL, NULL, 0, 0);
-               if (rval <= 0)
-                       return rval;
-       }
-
-       return _sdma_txadd_daddr(dd, SDMA_MAP_NONE, tx, addr, len);
-}
-
-/**
- * sdma_txadd_kvaddr() - add a kernel virtual address to sdma_txreq
- * @dd: the device to use for mapping
- * @tx: sdma_txreq to which the page is added
- * @kvaddr: the kernel virtual address
- * @len: length in bytes
- *
- * This is used to add a descriptor referenced by the indicated kvaddr and
- * len.
- *
- * The mapping/unmapping of the kvaddr and len is automatically handled.
- *
- * Return:
- * 0 - success, -ENOSPC - mapping failure, -ENOMEM - couldn't extend/coalesce
- * descriptor array
- */
-static inline int sdma_txadd_kvaddr(
-       struct hfi1_devdata *dd,
-       struct sdma_txreq *tx,
-       void *kvaddr,
-       u16 len)
-{
-       dma_addr_t addr;
-       int rval;
-
-       if ((unlikely(tx->num_desc == tx->desc_limit))) {
-               rval = ext_coal_sdma_tx_descs(dd, tx, SDMA_MAP_SINGLE,
-                                             kvaddr, NULL, 0, len);
-               if (rval <= 0)
-                       return rval;
-       }
-
-       addr = dma_map_single(
-                      &dd->pcidev->dev,
-                      kvaddr,
-                      len,
-                      DMA_TO_DEVICE);
-
-       if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) {
-               sdma_txclean(dd, tx);
-               return -ENOSPC;
-       }
-
-       return _sdma_txadd_daddr(
-                       dd, SDMA_MAP_SINGLE, tx, addr, len);
-}
-
-struct iowait;
-
-int sdma_send_txreq(struct sdma_engine *sde,
-                   struct iowait *wait,
-                   struct sdma_txreq *tx);
-int sdma_send_txlist(struct sdma_engine *sde,
-                    struct iowait *wait,
-                    struct list_head *tx_list);
-
-int sdma_ahg_alloc(struct sdma_engine *sde);
-void sdma_ahg_free(struct sdma_engine *sde, int ahg_index);
-
-/**
- * sdma_build_ahg_descriptor() - build ahg descriptor
- * @data: value to place in the header field
- * @dwindex: index of the header dword to update
- * @startbit: starting bit within the dword
- * @bits: width of the field in bits
- *
- * Build and return a 32 bit descriptor.
- */
-static inline u32 sdma_build_ahg_descriptor(
-       u16 data,
-       u8 dwindex,
-       u8 startbit,
-       u8 bits)
-{
-       return (u32)(1UL << SDMA_AHG_UPDATE_ENABLE_SHIFT |
-               ((startbit & SDMA_AHG_FIELD_START_MASK) <<
-               SDMA_AHG_FIELD_START_SHIFT) |
-               ((bits & SDMA_AHG_FIELD_LEN_MASK) <<
-               SDMA_AHG_FIELD_LEN_SHIFT) |
-               ((dwindex & SDMA_AHG_INDEX_MASK) <<
-               SDMA_AHG_INDEX_SHIFT) |
-               ((data & SDMA_AHG_VALUE_MASK) <<
-               SDMA_AHG_VALUE_SHIFT));
-}
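[Editorial note: an illustrative, made-up use of the helper above, reusing the hypothetical my_txreq/my_tx_complete from the earlier sketch. It builds one AHG update word that writes the 16-bit value 0x1234 into header dword 7, bits 0-15, and hands it to sdma_txinit_ahg() with SDMA_TXREQ_F_USE_AHG; all values are invented.]

static int my_txinit_with_ahg(struct my_txreq *tx, u16 tlen, u8 ahg_index,
			      u8 hlen)
{
	u32 ahg[1];

	/* one AHG word: place 0x1234 into header dword 7, bits 0..15 */
	ahg[0] = sdma_build_ahg_descriptor(0x1234, 7, 0, 16);
	return sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_USE_AHG, tlen,
			       ahg_index, 1, ahg, hlen, my_tx_complete);
}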
-
-/**
- * sdma_progress - use seq number to detect head progress
- * @sde: sdma_engine to check
- * @seq: base seq count
- * @tx: txreq for which we need to check descriptor availability
- *
- * This is used in the appropriate spot in the sleep routine
- * to check for potential ring progress.  This routine gets the
- * seqcount before queuing the iowait structure for progress.
- *
- * If the seqcount indicates that progress needs to be checked,
- * re-submission is detected by checking whether the descriptor
- * queue has enough descriptors for the txreq.
- */
-static inline unsigned sdma_progress(struct sdma_engine *sde, unsigned seq,
-                                    struct sdma_txreq *tx)
-{
-       if (read_seqretry(&sde->head_lock, seq)) {
-               sde->desc_avail = sdma_descq_freecnt(sde);
-               if (tx->num_desc > sde->desc_avail)
-                       return 0;
-               return 1;
-       }
-       return 0;
-}
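[Editorial note: a rough sketch of how a sleep callback might use sdma_progress() with the seqcount it sampled earlier. The function, its return convention, and the surrounding queuing logic are hypothetical.]

static int my_sleep(struct sdma_engine *sde, struct iowait *wait,
		    struct sdma_txreq *tx, unsigned seq)
{
	/* seq came from read_seqbegin(&sde->head_lock) before deciding to wait */
	if (sdma_progress(sde, seq, tx))
		return -EAGAIN;	/* head advanced and there is room - retry the submit */

	/* otherwise queue the iowait and sleep until the engine makes progress */
	return -EBUSY;
}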
-
-/**
- * sdma_iowait_schedule() - schedule the wait structure
- * @sde: sdma_engine to schedule
- * @wait: wait struct to schedule
- *
- * This function schedules the iowait
- * structure embedded in the QP or PQ
- * on the port's workqueue for progress.
- *
- */
-static inline void sdma_iowait_schedule(
-       struct sdma_engine *sde,
-       struct iowait *wait)
-{
-       struct hfi1_pportdata *ppd = sde->dd->pport;
-
-       iowait_schedule(wait, ppd->hfi1_wq, sde->cpu);
-}
-
-/* for use by interrupt handling */
-void sdma_engine_error(struct sdma_engine *sde, u64 status);
-void sdma_engine_interrupt(struct sdma_engine *sde, u64 status);
-
-/*
- *
- * The diagram below details the relationship of the mapping structures
- *
- * Since the mapping now allows for non-uniform engines per vl, the
- * number of engines for a vl is either the vl_engines[vl] or
- * a computation based on num_sdma/num_vls:
- *
- * For example:
- * nactual = vl_engines ? vl_engines[vl] : num_sdma/num_vls
- *
- * n = roundup to next highest power of 2 using nactual
- *
- * In the case where num_sdma/num_vls doesn't divide
- * evenly, the extras are added from the last vl downward.
- *
- * For the case where n > nactual, the engines are assigned
- * in a round robin fashion wrapping back to the first engine
- * for a particular vl.
- *
- *               dd->sdma_map
- *                    |                                   sdma_map_elem[0]
- *                    |                                +--------------------+
- *                    v                                |       mask         |
- *               sdma_vl_map                           |--------------------|
- *      +--------------------------+                   | sde[0] -> eng 1    |
- *      |    list (RCU)            |                   |--------------------|
- *      |--------------------------|                 ->| sde[1] -> eng 2    |
- *      |    mask                  |              --/  |--------------------|
- *      |--------------------------|            -/     |        *           |
- *      |    actual_vls (max 8)    |          -/       |--------------------|
- *      |--------------------------|       --/         | sde[n] -> eng n    |
- *      |    vls (max 8)           |     -/            +--------------------+
- *      |--------------------------|  --/
- *      |    map[0]                |-/
- *      |--------------------------|                   +--------------------+
- *      |    map[1]                |---                |       mask         |
- *      |--------------------------|   \----           |--------------------|
- *      |           *              |        \--        | sde[0] -> eng 1+n  |
- *      |           *              |           \----   |--------------------|
- *      |           *              |                \->| sde[1] -> eng 2+n  |
- *      |--------------------------|                   |--------------------|
- *      |   map[vls - 1]           |-                  |         *          |
- *      +--------------------------+ \-                |--------------------|
- *                                     \-              | sde[m] -> eng m+n  |
- *                                       \             +--------------------+
- *                                        \-
- *                                          \
- *                                           \-        +--------------------+
- *                                             \-      |       mask         |
- *                                               \     |--------------------|
- *                                                \-   | sde[0] -> eng 1+m+n|
- *                                                  \- |--------------------|
- *                                                    >| sde[1] -> eng 2+m+n|
- *                                                     |--------------------|
- *                                                     |         *          |
- *                                                     |--------------------|
- *                                                     | sde[o] -> eng o+m+n|
- *                                                     +--------------------+
- *
- */
-
-/**
- * struct sdma_map_elem - mapping for a vl
- * @mask - selector mask
- * @sde - array of engines for this vl
- *
- * The mask is used to "mod" the selector
- * to produce an index into the trailing
- * array of sdes.
- */
-struct sdma_map_elem {
-       u32 mask;
-       struct sdma_engine *sde[0];
-};
-
-/**
- * struct sdma_vl_map - parent structure mapping vls to engines
- * @engine_to_vl - map of an engine to a vl
- * @list - rcu head for free callback
- * @mask - vl mask to "mod" the vl to produce an index to map array
- * @actual_vls - number of vls
- * @vls - number of vls rounded to next power of 2
- * @map - array of sdma_map_elem entries
- *
- * This is the parent mapping structure.  The trailing
- * members of the struct point to sdma_map_elem entries, which
- * in turn point to an array of sde's for that vl.
- */
-struct sdma_vl_map {
-       s8 engine_to_vl[TXE_NUM_SDMA_ENGINES];
-       struct rcu_head list;
-       u32 mask;
-       u8 actual_vls;
-       u8 vls;
-       struct sdma_map_elem *map[0];
-};
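[Editorial note: to make the selection concrete, a hedged sketch of how a vl and selector resolve to an engine through these structures. This mirrors, but is not, the driver's actual lookup, and skips the rcu_read_lock() and bounds checks the real code needs.]

static struct sdma_engine *pick_engine(struct sdma_vl_map *m,
				       u8 vl, u32 selector)
{
	struct sdma_map_elem *e;

	e = m->map[vl & m->mask];		/* "mod" the vl into map[] */
	return e->sde[selector & e->mask];	/* "mod" the selector into sde[] */
}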
-
-int sdma_map_init(
-       struct hfi1_devdata *dd,
-       u8 port,
-       u8 num_vls,
-       u8 *vl_engines);
-
-/* slow path */
-void _sdma_engine_progress_schedule(struct sdma_engine *sde);
-
-/**
- * sdma_engine_progress_schedule() - schedule progress on engine
- * @sde: sdma_engine to schedule progress
- *
- * This is the fast path.
- *
- */
-static inline void sdma_engine_progress_schedule(
-       struct sdma_engine *sde)
-{
-       if (!sde || sdma_descq_inprocess(sde) < (sde->descq_cnt / 8))
-               return;
-       _sdma_engine_progress_schedule(sde);
-}
-
-struct sdma_engine *sdma_select_engine_sc(
-       struct hfi1_devdata *dd,
-       u32 selector,
-       u8 sc5);
-
-struct sdma_engine *sdma_select_engine_vl(
-       struct hfi1_devdata *dd,
-       u32 selector,
-       u8 vl);
-
-void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *);
-
-#ifdef CONFIG_SDMA_VERBOSITY
-void sdma_dumpstate(struct sdma_engine *);
-#endif
-static inline char *slashstrip(char *s)
-{
-       char *r = s;
-
-       while (*s)
-               if (*s++ == '/')
-                       r = s;
-       return r;
-}
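[Editorial note: for illustration, slashstrip() returns a pointer to the text after the last '/' in the string, or the whole string if there is none.]

char path[] = "drivers/staging/rdma/hfi1/sdma.c";
char *base = slashstrip(path);	/* base now points at "sdma.c" */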
-
-u16 sdma_get_descq_cnt(void);
-
-extern uint mod_num_sdma;
-
-void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid);
-
-#endif
diff --git a/drivers/staging/rdma/hfi1/sdma_txreq.h b/drivers/staging/rdma/hfi1/sdma_txreq.h
deleted file mode 100644 (file)
index bf7d777..0000000
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright(c) 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifndef HFI1_SDMA_TXREQ_H
-#define HFI1_SDMA_TXREQ_H
-
-/* increased for AHG */
-#define NUM_DESC 6
-
-/*
- * struct sdma_desc - canonical fragment descriptor
- *
- * This is the descriptor carried in the tx request
- * corresponding to each fragment.
- *
- */
-struct sdma_desc {
-       /* private:  don't use directly */
-       u64 qw[2];
-};
-
-/**
- * struct sdma_txreq - the sdma_txreq structure (one per packet)
- * @list: for use by user and by queuing for wait
- *
- * This is the representation of a packet which consists of some
- * number of fragments.   Storage is provided within the structure
- * for all fragments.
- *
- * The storage for the descriptors is automatically extended as needed
- * when the current allocation is exceeded.
- *
- * The user (Verbs or PSM) may overload this structure with fields
- * specific to their use by putting this struct first in their struct.
- * The method of allocation of the overloaded structure is user dependent.
- *
- * The list is the only public field in the structure.
- *
- */
-
-#define SDMA_TXREQ_S_OK        0
-#define SDMA_TXREQ_S_SENDERROR 1
-#define SDMA_TXREQ_S_ABORTED   2
-#define SDMA_TXREQ_S_SHUTDOWN  3
-
-/* flags bits */
-#define SDMA_TXREQ_F_URGENT       0x0001
-#define SDMA_TXREQ_F_AHG_COPY     0x0002
-#define SDMA_TXREQ_F_USE_AHG      0x0004
-
-struct sdma_txreq;
-typedef void (*callback_t)(struct sdma_txreq *, int);
-
-struct iowait;
-struct sdma_txreq {
-       struct list_head list;
-       /* private: */
-       struct sdma_desc *descp;
-       /* private: */
-       void *coalesce_buf;
-       /* private: */
-       struct iowait *wait;
-       /* private: */
-       callback_t                  complete;
-#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
-       u64 sn;
-#endif
-       /* private: - used in coalesce/pad processing */
-       u16                         packet_len;
-       /* private: - down-counted to trigger last */
-       u16                         tlen;
-       /* private: */
-       u16                         num_desc;
-       /* private: */
-       u16                         desc_limit;
-       /* private: */
-       u16                         next_descq_idx;
-       /* private: */
-       u16 coalesce_idx;
-       /* private: flags */
-       u16                         flags;
-       /* private: */
-       struct sdma_desc descs[NUM_DESC];
-};
-
-static inline int sdma_txreq_built(struct sdma_txreq *tx)
-{
-       return tx->num_desc;
-}
-
-#endif                          /* HFI1_SDMA_TXREQ_H */
diff --git a/drivers/staging/rdma/hfi1/sysfs.c b/drivers/staging/rdma/hfi1/sysfs.c
deleted file mode 100644 (file)
index 8cd6df8..0000000
+++ /dev/null
@@ -1,785 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#include <linux/ctype.h>
-
-#include "hfi.h"
-#include "mad.h"
-#include "trace.h"
-
-/*
- * Start of per-port congestion control structures and support code
- */
-
-/*
- * Congestion control table size followed by table entries
- */
-static ssize_t read_cc_table_bin(struct file *filp, struct kobject *kobj,
-                                struct bin_attribute *bin_attr,
-                                char *buf, loff_t pos, size_t count)
-{
-       int ret;
-       struct hfi1_pportdata *ppd =
-               container_of(kobj, struct hfi1_pportdata, pport_cc_kobj);
-       struct cc_state *cc_state;
-
-       ret = ppd->total_cct_entry * sizeof(struct ib_cc_table_entry_shadow)
-                + sizeof(__be16);
-
-       if (pos > ret)
-               return -EINVAL;
-
-       if (count > ret - pos)
-               count = ret - pos;
-
-       if (!count)
-               return count;
-
-       rcu_read_lock();
-       cc_state = get_cc_state(ppd);
-       if (!cc_state) {
-               rcu_read_unlock();
-               return -EINVAL;
-       }
-       memcpy(buf, (void *)&cc_state->cct + pos, count);
-       rcu_read_unlock();
-
-       return count;
-}
-
-static void port_release(struct kobject *kobj)
-{
-       /* nothing to do since memory is freed by hfi1_free_devdata() */
-}
-
-static struct bin_attribute cc_table_bin_attr = {
-       .attr = {.name = "cc_table_bin", .mode = 0444},
-       .read = read_cc_table_bin,
-       .size = PAGE_SIZE,
-};
-
-/*
- * Congestion settings: port control, control map and an array of 16
- * entries for the congestion entries - increase, timer, event log
- * trigger threshold and the minimum injection rate delay.
- */
-static ssize_t read_cc_setting_bin(struct file *filp, struct kobject *kobj,
-                                  struct bin_attribute *bin_attr,
-                                  char *buf, loff_t pos, size_t count)
-{
-       int ret;
-       struct hfi1_pportdata *ppd =
-               container_of(kobj, struct hfi1_pportdata, pport_cc_kobj);
-       struct cc_state *cc_state;
-
-       ret = sizeof(struct opa_congestion_setting_attr_shadow);
-
-       if (pos > ret)
-               return -EINVAL;
-       if (count > ret - pos)
-               count = ret - pos;
-
-       if (!count)
-               return count;
-
-       rcu_read_lock();
-       cc_state = get_cc_state(ppd);
-       if (!cc_state) {
-               rcu_read_unlock();
-               return -EINVAL;
-       }
-       memcpy(buf, (void *)&cc_state->cong_setting + pos, count);
-       rcu_read_unlock();
-
-       return count;
-}
-
-static struct bin_attribute cc_setting_bin_attr = {
-       .attr = {.name = "cc_settings_bin", .mode = 0444},
-       .read = read_cc_setting_bin,
-       .size = PAGE_SIZE,
-};
-
-struct hfi1_port_attr {
-       struct attribute attr;
-       ssize_t (*show)(struct hfi1_pportdata *, char *);
-       ssize_t (*store)(struct hfi1_pportdata *, const char *, size_t);
-};
-
-static ssize_t cc_prescan_show(struct hfi1_pportdata *ppd, char *buf)
-{
-       return sprintf(buf, "%s\n", ppd->cc_prescan ? "on" : "off");
-}
-
-static ssize_t cc_prescan_store(struct hfi1_pportdata *ppd, const char *buf,
-                               size_t count)
-{
-       if (!memcmp(buf, "on", 2))
-               ppd->cc_prescan = true;
-       else if (!memcmp(buf, "off", 3))
-               ppd->cc_prescan = false;
-
-       return count;
-}
-
-static struct hfi1_port_attr cc_prescan_attr =
-               __ATTR(cc_prescan, 0600, cc_prescan_show, cc_prescan_store);
-
-static ssize_t cc_attr_show(struct kobject *kobj, struct attribute *attr,
-                           char *buf)
-{
-       struct hfi1_port_attr *port_attr =
-               container_of(attr, struct hfi1_port_attr, attr);
-       struct hfi1_pportdata *ppd =
-               container_of(kobj, struct hfi1_pportdata, pport_cc_kobj);
-
-       return port_attr->show(ppd, buf);
-}
-
-static ssize_t cc_attr_store(struct kobject *kobj, struct attribute *attr,
-                            const char *buf, size_t count)
-{
-       struct hfi1_port_attr *port_attr =
-               container_of(attr, struct hfi1_port_attr, attr);
-       struct hfi1_pportdata *ppd =
-               container_of(kobj, struct hfi1_pportdata, pport_cc_kobj);
-
-       return port_attr->store(ppd, buf, count);
-}
-
-static const struct sysfs_ops port_cc_sysfs_ops = {
-       .show = cc_attr_show,
-       .store = cc_attr_store
-};
-
-static struct attribute *port_cc_default_attributes[] = {
-       &cc_prescan_attr.attr,
-       NULL    /* sysfs expects a NULL-terminated default_attrs array */
-};
-
-static struct kobj_type port_cc_ktype = {
-       .release = port_release,
-       .sysfs_ops = &port_cc_sysfs_ops,
-       .default_attrs = port_cc_default_attributes
-};
-
-/* Start sc2vl */
-#define HFI1_SC2VL_ATTR(N)                                 \
-       static struct hfi1_sc2vl_attr hfi1_sc2vl_attr_##N = { \
-               .attr = { .name = __stringify(N), .mode = 0444 }, \
-               .sc = N \
-       }
-
-struct hfi1_sc2vl_attr {
-       struct attribute attr;
-       int sc;
-};
-
-HFI1_SC2VL_ATTR(0);
-HFI1_SC2VL_ATTR(1);
-HFI1_SC2VL_ATTR(2);
-HFI1_SC2VL_ATTR(3);
-HFI1_SC2VL_ATTR(4);
-HFI1_SC2VL_ATTR(5);
-HFI1_SC2VL_ATTR(6);
-HFI1_SC2VL_ATTR(7);
-HFI1_SC2VL_ATTR(8);
-HFI1_SC2VL_ATTR(9);
-HFI1_SC2VL_ATTR(10);
-HFI1_SC2VL_ATTR(11);
-HFI1_SC2VL_ATTR(12);
-HFI1_SC2VL_ATTR(13);
-HFI1_SC2VL_ATTR(14);
-HFI1_SC2VL_ATTR(15);
-HFI1_SC2VL_ATTR(16);
-HFI1_SC2VL_ATTR(17);
-HFI1_SC2VL_ATTR(18);
-HFI1_SC2VL_ATTR(19);
-HFI1_SC2VL_ATTR(20);
-HFI1_SC2VL_ATTR(21);
-HFI1_SC2VL_ATTR(22);
-HFI1_SC2VL_ATTR(23);
-HFI1_SC2VL_ATTR(24);
-HFI1_SC2VL_ATTR(25);
-HFI1_SC2VL_ATTR(26);
-HFI1_SC2VL_ATTR(27);
-HFI1_SC2VL_ATTR(28);
-HFI1_SC2VL_ATTR(29);
-HFI1_SC2VL_ATTR(30);
-HFI1_SC2VL_ATTR(31);
-
-static struct attribute *sc2vl_default_attributes[] = {
-       &hfi1_sc2vl_attr_0.attr,
-       &hfi1_sc2vl_attr_1.attr,
-       &hfi1_sc2vl_attr_2.attr,
-       &hfi1_sc2vl_attr_3.attr,
-       &hfi1_sc2vl_attr_4.attr,
-       &hfi1_sc2vl_attr_5.attr,
-       &hfi1_sc2vl_attr_6.attr,
-       &hfi1_sc2vl_attr_7.attr,
-       &hfi1_sc2vl_attr_8.attr,
-       &hfi1_sc2vl_attr_9.attr,
-       &hfi1_sc2vl_attr_10.attr,
-       &hfi1_sc2vl_attr_11.attr,
-       &hfi1_sc2vl_attr_12.attr,
-       &hfi1_sc2vl_attr_13.attr,
-       &hfi1_sc2vl_attr_14.attr,
-       &hfi1_sc2vl_attr_15.attr,
-       &hfi1_sc2vl_attr_16.attr,
-       &hfi1_sc2vl_attr_17.attr,
-       &hfi1_sc2vl_attr_18.attr,
-       &hfi1_sc2vl_attr_19.attr,
-       &hfi1_sc2vl_attr_20.attr,
-       &hfi1_sc2vl_attr_21.attr,
-       &hfi1_sc2vl_attr_22.attr,
-       &hfi1_sc2vl_attr_23.attr,
-       &hfi1_sc2vl_attr_24.attr,
-       &hfi1_sc2vl_attr_25.attr,
-       &hfi1_sc2vl_attr_26.attr,
-       &hfi1_sc2vl_attr_27.attr,
-       &hfi1_sc2vl_attr_28.attr,
-       &hfi1_sc2vl_attr_29.attr,
-       &hfi1_sc2vl_attr_30.attr,
-       &hfi1_sc2vl_attr_31.attr,
-       NULL
-};
-
-static ssize_t sc2vl_attr_show(struct kobject *kobj, struct attribute *attr,
-                              char *buf)
-{
-       struct hfi1_sc2vl_attr *sattr =
-               container_of(attr, struct hfi1_sc2vl_attr, attr);
-       struct hfi1_pportdata *ppd =
-               container_of(kobj, struct hfi1_pportdata, sc2vl_kobj);
-       struct hfi1_devdata *dd = ppd->dd;
-
-       return sprintf(buf, "%u\n", *((u8 *)dd->sc2vl + sattr->sc));
-}
-
-static const struct sysfs_ops hfi1_sc2vl_ops = {
-       .show = sc2vl_attr_show,
-};
-
-static struct kobj_type hfi1_sc2vl_ktype = {
-       .release = port_release,
-       .sysfs_ops = &hfi1_sc2vl_ops,
-       .default_attrs = sc2vl_default_attributes
-};
-
-/* End sc2vl */
-
-/* Start sl2sc */
-#define HFI1_SL2SC_ATTR(N)                                 \
-       static struct hfi1_sl2sc_attr hfi1_sl2sc_attr_##N = {     \
-               .attr = { .name = __stringify(N), .mode = 0444 }, \
-               .sl = N                                           \
-       }
-
-struct hfi1_sl2sc_attr {
-       struct attribute attr;
-       int sl;
-};
-
-HFI1_SL2SC_ATTR(0);
-HFI1_SL2SC_ATTR(1);
-HFI1_SL2SC_ATTR(2);
-HFI1_SL2SC_ATTR(3);
-HFI1_SL2SC_ATTR(4);
-HFI1_SL2SC_ATTR(5);
-HFI1_SL2SC_ATTR(6);
-HFI1_SL2SC_ATTR(7);
-HFI1_SL2SC_ATTR(8);
-HFI1_SL2SC_ATTR(9);
-HFI1_SL2SC_ATTR(10);
-HFI1_SL2SC_ATTR(11);
-HFI1_SL2SC_ATTR(12);
-HFI1_SL2SC_ATTR(13);
-HFI1_SL2SC_ATTR(14);
-HFI1_SL2SC_ATTR(15);
-HFI1_SL2SC_ATTR(16);
-HFI1_SL2SC_ATTR(17);
-HFI1_SL2SC_ATTR(18);
-HFI1_SL2SC_ATTR(19);
-HFI1_SL2SC_ATTR(20);
-HFI1_SL2SC_ATTR(21);
-HFI1_SL2SC_ATTR(22);
-HFI1_SL2SC_ATTR(23);
-HFI1_SL2SC_ATTR(24);
-HFI1_SL2SC_ATTR(25);
-HFI1_SL2SC_ATTR(26);
-HFI1_SL2SC_ATTR(27);
-HFI1_SL2SC_ATTR(28);
-HFI1_SL2SC_ATTR(29);
-HFI1_SL2SC_ATTR(30);
-HFI1_SL2SC_ATTR(31);
-
-static struct attribute *sl2sc_default_attributes[] = {
-       &hfi1_sl2sc_attr_0.attr,
-       &hfi1_sl2sc_attr_1.attr,
-       &hfi1_sl2sc_attr_2.attr,
-       &hfi1_sl2sc_attr_3.attr,
-       &hfi1_sl2sc_attr_4.attr,
-       &hfi1_sl2sc_attr_5.attr,
-       &hfi1_sl2sc_attr_6.attr,
-       &hfi1_sl2sc_attr_7.attr,
-       &hfi1_sl2sc_attr_8.attr,
-       &hfi1_sl2sc_attr_9.attr,
-       &hfi1_sl2sc_attr_10.attr,
-       &hfi1_sl2sc_attr_11.attr,
-       &hfi1_sl2sc_attr_12.attr,
-       &hfi1_sl2sc_attr_13.attr,
-       &hfi1_sl2sc_attr_14.attr,
-       &hfi1_sl2sc_attr_15.attr,
-       &hfi1_sl2sc_attr_16.attr,
-       &hfi1_sl2sc_attr_17.attr,
-       &hfi1_sl2sc_attr_18.attr,
-       &hfi1_sl2sc_attr_19.attr,
-       &hfi1_sl2sc_attr_20.attr,
-       &hfi1_sl2sc_attr_21.attr,
-       &hfi1_sl2sc_attr_22.attr,
-       &hfi1_sl2sc_attr_23.attr,
-       &hfi1_sl2sc_attr_24.attr,
-       &hfi1_sl2sc_attr_25.attr,
-       &hfi1_sl2sc_attr_26.attr,
-       &hfi1_sl2sc_attr_27.attr,
-       &hfi1_sl2sc_attr_28.attr,
-       &hfi1_sl2sc_attr_29.attr,
-       &hfi1_sl2sc_attr_30.attr,
-       &hfi1_sl2sc_attr_31.attr,
-       NULL
-};
-
-static ssize_t sl2sc_attr_show(struct kobject *kobj, struct attribute *attr,
-                              char *buf)
-{
-       struct hfi1_sl2sc_attr *sattr =
-               container_of(attr, struct hfi1_sl2sc_attr, attr);
-       struct hfi1_pportdata *ppd =
-               container_of(kobj, struct hfi1_pportdata, sl2sc_kobj);
-       struct hfi1_ibport *ibp = &ppd->ibport_data;
-
-       return sprintf(buf, "%u\n", ibp->sl_to_sc[sattr->sl]);
-}
-
-static const struct sysfs_ops hfi1_sl2sc_ops = {
-       .show = sl2sc_attr_show,
-};
-
-static struct kobj_type hfi1_sl2sc_ktype = {
-       .release = port_release,
-       .sysfs_ops = &hfi1_sl2sc_ops,
-       .default_attrs = sl2sc_default_attributes
-};
-
-/* End sl2sc */
-
-/* Start vl2mtu */
-
-#define HFI1_VL2MTU_ATTR(N) \
-       static struct hfi1_vl2mtu_attr hfi1_vl2mtu_attr_##N = { \
-               .attr = { .name = __stringify(N), .mode = 0444 }, \
-               .vl = N                                           \
-       }
-
-struct hfi1_vl2mtu_attr {
-       struct attribute attr;
-       int vl;
-};
-
-HFI1_VL2MTU_ATTR(0);
-HFI1_VL2MTU_ATTR(1);
-HFI1_VL2MTU_ATTR(2);
-HFI1_VL2MTU_ATTR(3);
-HFI1_VL2MTU_ATTR(4);
-HFI1_VL2MTU_ATTR(5);
-HFI1_VL2MTU_ATTR(6);
-HFI1_VL2MTU_ATTR(7);
-HFI1_VL2MTU_ATTR(8);
-HFI1_VL2MTU_ATTR(9);
-HFI1_VL2MTU_ATTR(10);
-HFI1_VL2MTU_ATTR(11);
-HFI1_VL2MTU_ATTR(12);
-HFI1_VL2MTU_ATTR(13);
-HFI1_VL2MTU_ATTR(14);
-HFI1_VL2MTU_ATTR(15);
-
-static struct attribute *vl2mtu_default_attributes[] = {
-       &hfi1_vl2mtu_attr_0.attr,
-       &hfi1_vl2mtu_attr_1.attr,
-       &hfi1_vl2mtu_attr_2.attr,
-       &hfi1_vl2mtu_attr_3.attr,
-       &hfi1_vl2mtu_attr_4.attr,
-       &hfi1_vl2mtu_attr_5.attr,
-       &hfi1_vl2mtu_attr_6.attr,
-       &hfi1_vl2mtu_attr_7.attr,
-       &hfi1_vl2mtu_attr_8.attr,
-       &hfi1_vl2mtu_attr_9.attr,
-       &hfi1_vl2mtu_attr_10.attr,
-       &hfi1_vl2mtu_attr_11.attr,
-       &hfi1_vl2mtu_attr_12.attr,
-       &hfi1_vl2mtu_attr_13.attr,
-       &hfi1_vl2mtu_attr_14.attr,
-       &hfi1_vl2mtu_attr_15.attr,
-       NULL
-};
-
-static ssize_t vl2mtu_attr_show(struct kobject *kobj, struct attribute *attr,
-                               char *buf)
-{
-       struct hfi1_vl2mtu_attr *vlattr =
-               container_of(attr, struct hfi1_vl2mtu_attr, attr);
-       struct hfi1_pportdata *ppd =
-               container_of(kobj, struct hfi1_pportdata, vl2mtu_kobj);
-       struct hfi1_devdata *dd = ppd->dd;
-
-       return sprintf(buf, "%u\n", dd->vld[vlattr->vl].mtu);
-}
-
-static const struct sysfs_ops hfi1_vl2mtu_ops = {
-       .show = vl2mtu_attr_show,
-};
-
-static struct kobj_type hfi1_vl2mtu_ktype = {
-       .release = port_release,
-       .sysfs_ops = &hfi1_vl2mtu_ops,
-       .default_attrs = vl2mtu_default_attributes
-};
-
-/* end of per-port file structures and support code */
-
-/*
- * Start of per-unit (or driver, in some cases, but replicated
- * per unit) functions (these get a device *)
- */
-static ssize_t show_rev(struct device *device, struct device_attribute *attr,
-                       char *buf)
-{
-       struct hfi1_ibdev *dev =
-               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
-
-       return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev);
-}
-
-static ssize_t show_hfi(struct device *device, struct device_attribute *attr,
-                       char *buf)
-{
-       struct hfi1_ibdev *dev =
-               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
-       struct hfi1_devdata *dd = dd_from_dev(dev);
-       int ret;
-
-       if (!dd->boardname)
-               ret = -EINVAL;
-       else
-               ret = scnprintf(buf, PAGE_SIZE, "%s\n", dd->boardname);
-       return ret;
-}
-
-static ssize_t show_boardversion(struct device *device,
-                                struct device_attribute *attr, char *buf)
-{
-       struct hfi1_ibdev *dev =
-               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
-       struct hfi1_devdata *dd = dd_from_dev(dev);
-
-       /* The string printed here is already newline-terminated. */
-       return scnprintf(buf, PAGE_SIZE, "%s", dd->boardversion);
-}
-
-static ssize_t show_nctxts(struct device *device,
-                          struct device_attribute *attr, char *buf)
-{
-       struct hfi1_ibdev *dev =
-               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
-       struct hfi1_devdata *dd = dd_from_dev(dev);
-
-       /*
-        * Return the smaller of send and receive contexts.
-        * Normally, user level applications would require both a send
-        * and a receive context, so returning the smaller of the two counts
-        * gives a more accurate picture of total contexts available.
-        */
-       return scnprintf(buf, PAGE_SIZE, "%u\n",
-                        min(dd->num_rcv_contexts - dd->first_user_ctxt,
-                            (u32)dd->sc_sizes[SC_USER].count));
-}
-
-static ssize_t show_nfreectxts(struct device *device,
-                              struct device_attribute *attr, char *buf)
-{
-       struct hfi1_ibdev *dev =
-               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
-       struct hfi1_devdata *dd = dd_from_dev(dev);
-
-       /* Return the number of free user ports (contexts) available. */
-       return scnprintf(buf, PAGE_SIZE, "%u\n", dd->freectxts);
-}
-
-static ssize_t show_serial(struct device *device,
-                          struct device_attribute *attr, char *buf)
-{
-       struct hfi1_ibdev *dev =
-               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
-       struct hfi1_devdata *dd = dd_from_dev(dev);
-
-       return scnprintf(buf, PAGE_SIZE, "%s", dd->serial);
-}
-
-static ssize_t store_chip_reset(struct device *device,
-                               struct device_attribute *attr, const char *buf,
-                               size_t count)
-{
-       struct hfi1_ibdev *dev =
-               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
-       struct hfi1_devdata *dd = dd_from_dev(dev);
-       int ret;
-
-       if (count < 5 || memcmp(buf, "reset", 5) || !dd->diag_client) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       ret = hfi1_reset_device(dd->unit);
-bail:
-       return ret < 0 ? ret : count;
-}
-
-/*
- * Convert the reported temperature from an integer (reported in
- * units of 0.25C) to a decimal string with two fractional digits.
- */
-#define temp2str(temp, buf, size, idx)                                 \
-       scnprintf((buf) + (idx), (size) - (idx), "%u.%02u ",            \
-                             ((temp) >> 2), ((temp) & 0x3) * 25)
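[Editorial note: a worked example of the fixed-point conversion with an invented reading. A raw value of 107 (units of 0.25 C) splits into 107 >> 2 = 26 whole degrees and (107 & 0x3) * 25 = 75 hundredths, so the macro appends "26.75 " to the buffer.]

char buf[16];
int idx = 0;

idx += temp2str(107, buf, sizeof(buf), idx);	/* buf now holds "26.75 " */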
-
-/*
- * Dump tempsense values, in decimal, to ease shell-scripts.
- */
-static ssize_t show_tempsense(struct device *device,
-                             struct device_attribute *attr, char *buf)
-{
-       struct hfi1_ibdev *dev =
-               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
-       struct hfi1_devdata *dd = dd_from_dev(dev);
-       struct hfi1_temp temp;
-       int ret;
-
-       ret = hfi1_tempsense_rd(dd, &temp);
-       if (!ret) {
-               int idx = 0;
-
-               idx += temp2str(temp.curr, buf, PAGE_SIZE, idx);
-               idx += temp2str(temp.lo_lim, buf, PAGE_SIZE, idx);
-               idx += temp2str(temp.hi_lim, buf, PAGE_SIZE, idx);
-               idx += temp2str(temp.crit_lim, buf, PAGE_SIZE, idx);
-               idx += scnprintf(buf + idx, PAGE_SIZE - idx,
-                               "%u %u %u\n", temp.triggers & 0x1,
-                               temp.triggers & 0x2, temp.triggers & 0x4);
-               ret = idx;
-       }
-       return ret;
-}
-
-/*
- * end of per-unit (or driver, in some cases, but replicated
- * per unit) functions
- */
-
-/* start of per-unit file structures and support code */
-static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(board_id, S_IRUGO, show_hfi, NULL);
-static DEVICE_ATTR(nctxts, S_IRUGO, show_nctxts, NULL);
-static DEVICE_ATTR(nfreectxts, S_IRUGO, show_nfreectxts, NULL);
-static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL);
-static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL);
-static DEVICE_ATTR(tempsense, S_IRUGO, show_tempsense, NULL);
-static DEVICE_ATTR(chip_reset, S_IWUSR, NULL, store_chip_reset);
-
-static struct device_attribute *hfi1_attributes[] = {
-       &dev_attr_hw_rev,
-       &dev_attr_board_id,
-       &dev_attr_nctxts,
-       &dev_attr_nfreectxts,
-       &dev_attr_serial,
-       &dev_attr_boardversion,
-       &dev_attr_tempsense,
-       &dev_attr_chip_reset,
-};
-
-int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num,
-                          struct kobject *kobj)
-{
-       struct hfi1_pportdata *ppd;
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       int ret;
-
-       if (!port_num || port_num > dd->num_pports) {
-               dd_dev_err(dd,
-                          "Skipping infiniband class with invalid port %u\n",
-                          port_num);
-               return -ENODEV;
-       }
-       ppd = &dd->pport[port_num - 1];
-
-       ret = kobject_init_and_add(&ppd->sc2vl_kobj, &hfi1_sc2vl_ktype, kobj,
-                                  "sc2vl");
-       if (ret) {
-               dd_dev_err(dd,
-                          "Skipping sc2vl sysfs info, (err %d) port %u\n",
-                          ret, port_num);
-               goto bail;
-       }
-       kobject_uevent(&ppd->sc2vl_kobj, KOBJ_ADD);
-
-       ret = kobject_init_and_add(&ppd->sl2sc_kobj, &hfi1_sl2sc_ktype, kobj,
-                                  "sl2sc");
-       if (ret) {
-               dd_dev_err(dd,
-                          "Skipping sl2sc sysfs info, (err %d) port %u\n",
-                          ret, port_num);
-               goto bail_sc2vl;
-       }
-       kobject_uevent(&ppd->sl2sc_kobj, KOBJ_ADD);
-
-       ret = kobject_init_and_add(&ppd->vl2mtu_kobj, &hfi1_vl2mtu_ktype, kobj,
-                                  "vl2mtu");
-       if (ret) {
-               dd_dev_err(dd,
-                          "Skipping vl2mtu sysfs info, (err %d) port %u\n",
-                          ret, port_num);
-               goto bail_sl2sc;
-       }
-       kobject_uevent(&ppd->vl2mtu_kobj, KOBJ_ADD);
-
-       ret = kobject_init_and_add(&ppd->pport_cc_kobj, &port_cc_ktype,
-                                  kobj, "CCMgtA");
-       if (ret) {
-               dd_dev_err(dd,
-                          "Skipping Congestion Control sysfs info, (err %d) port %u\n",
-                          ret, port_num);
-               goto bail_vl2mtu;
-       }
-
-       kobject_uevent(&ppd->pport_cc_kobj, KOBJ_ADD);
-
-       ret = sysfs_create_bin_file(&ppd->pport_cc_kobj, &cc_setting_bin_attr);
-       if (ret) {
-               dd_dev_err(dd,
-                          "Skipping Congestion Control setting sysfs info, (err %d) port %u\n",
-                          ret, port_num);
-               goto bail_cc;
-       }
-
-       ret = sysfs_create_bin_file(&ppd->pport_cc_kobj, &cc_table_bin_attr);
-       if (ret) {
-               dd_dev_err(dd,
-                          "Skipping Congestion Control table sysfs info, (err %d) port %u\n",
-                          ret, port_num);
-               goto bail_cc_entry_bin;
-       }
-
-       dd_dev_info(dd,
-                   "IB%u: Congestion Control Agent enabled for port %d\n",
-                   dd->unit, port_num);
-
-       return 0;
-
-bail_cc_entry_bin:
-       sysfs_remove_bin_file(&ppd->pport_cc_kobj,
-                             &cc_setting_bin_attr);
-bail_cc:
-       kobject_put(&ppd->pport_cc_kobj);
-bail_vl2mtu:
-       kobject_put(&ppd->vl2mtu_kobj);
-bail_sl2sc:
-       kobject_put(&ppd->sl2sc_kobj);
-bail_sc2vl:
-       kobject_put(&ppd->sc2vl_kobj);
-bail:
-       return ret;
-}
-
-/*
- * Register and create our files in /sys/class/infiniband.
- */
-int hfi1_verbs_register_sysfs(struct hfi1_devdata *dd)
-{
-       struct ib_device *dev = &dd->verbs_dev.rdi.ibdev;
-       int i, ret;
-
-       for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i) {
-               ret = device_create_file(&dev->dev, hfi1_attributes[i]);
-               if (ret)
-                       goto bail;
-       }
-
-       return 0;
-bail:
-       for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i)
-               device_remove_file(&dev->dev, hfi1_attributes[i]);
-       return ret;
-}
-
-/*
- * Unregister and remove our files in /sys/class/infiniband.
- */
-void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *dd)
-{
-       struct hfi1_pportdata *ppd;
-       int i;
-
-       for (i = 0; i < dd->num_pports; i++) {
-               ppd = &dd->pport[i];
-
-               sysfs_remove_bin_file(&ppd->pport_cc_kobj,
-                                     &cc_setting_bin_attr);
-               sysfs_remove_bin_file(&ppd->pport_cc_kobj,
-                                     &cc_table_bin_attr);
-               kobject_put(&ppd->pport_cc_kobj);
-               kobject_put(&ppd->vl2mtu_kobj);
-               kobject_put(&ppd->sl2sc_kobj);
-               kobject_put(&ppd->sc2vl_kobj);
-       }
-}
diff --git a/drivers/staging/rdma/hfi1/trace.c b/drivers/staging/rdma/hfi1/trace.c
deleted file mode 100644
index 8b62fef..0000000
--- a/drivers/staging/rdma/hfi1/trace.c
+++ /dev/null
@@ -1,235 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#define CREATE_TRACE_POINTS
-#include "trace.h"
-
-u8 ibhdr_exhdr_len(struct hfi1_ib_header *hdr)
-{
-       struct hfi1_other_headers *ohdr;
-       u8 opcode;
-       u8 lnh = (u8)(be16_to_cpu(hdr->lrh[0]) & 3);
-
-       if (lnh == HFI1_LRH_BTH)
-               ohdr = &hdr->u.oth;
-       else
-               ohdr = &hdr->u.l.oth;
-       opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
-       return hdr_len_by_opcode[opcode] == 0 ?
-              0 : hdr_len_by_opcode[opcode] - (12 + 8);
-}
-
-#define IMM_PRN  "imm %d"
-#define RETH_PRN "reth vaddr 0x%.16llx rkey 0x%.8x dlen 0x%.8x"
-#define AETH_PRN "aeth syn 0x%.2x %s msn 0x%.8x"
-#define DETH_PRN "deth qkey 0x%.8x sqpn 0x%.6x"
-#define ATOMICACKETH_PRN "origdata %lld"
-#define ATOMICETH_PRN "vaddr 0x%llx rkey 0x%.8x sdata %lld cdata %lld"
-
-#define OP(transport, op) IB_OPCODE_## transport ## _ ## op
-
-static u64 ib_u64_get(__be32 *p)
-{
-       return ((u64)be32_to_cpu(p[0]) << 32) | be32_to_cpu(p[1]);
-}
-
-static const char *parse_syndrome(u8 syndrome)
-{
-       switch (syndrome >> 5) {
-       case 0:
-               return "ACK";
-       case 1:
-               return "RNRNAK";
-       case 3:
-               return "NAK";
-       }
-       return "";
-}
-
-const char *parse_everbs_hdrs(
-       struct trace_seq *p,
-       u8 opcode,
-       void *ehdrs)
-{
-       union ib_ehdrs *eh = ehdrs;
-       const char *ret = trace_seq_buffer_ptr(p);
-
-       switch (opcode) {
-       /* imm */
-       case OP(RC, SEND_LAST_WITH_IMMEDIATE):
-       case OP(UC, SEND_LAST_WITH_IMMEDIATE):
-       case OP(RC, SEND_ONLY_WITH_IMMEDIATE):
-       case OP(UC, SEND_ONLY_WITH_IMMEDIATE):
-       case OP(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE):
-       case OP(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE):
-               trace_seq_printf(p, IMM_PRN,
-                                be32_to_cpu(eh->imm_data));
-               break;
-       /* reth + imm */
-       case OP(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE):
-       case OP(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE):
-               trace_seq_printf(p, RETH_PRN " " IMM_PRN,
-                                (unsigned long long)ib_u64_get(
-                                (__be32 *)&eh->rc.reth.vaddr),
-                                be32_to_cpu(eh->rc.reth.rkey),
-                                be32_to_cpu(eh->rc.reth.length),
-                                be32_to_cpu(eh->rc.imm_data));
-               break;
-       /* reth */
-       case OP(RC, RDMA_READ_REQUEST):
-       case OP(RC, RDMA_WRITE_FIRST):
-       case OP(UC, RDMA_WRITE_FIRST):
-       case OP(RC, RDMA_WRITE_ONLY):
-       case OP(UC, RDMA_WRITE_ONLY):
-               trace_seq_printf(p, RETH_PRN,
-                                (unsigned long long)ib_u64_get(
-                                (__be32 *)&eh->rc.reth.vaddr),
-                                be32_to_cpu(eh->rc.reth.rkey),
-                                be32_to_cpu(eh->rc.reth.length));
-               break;
-       case OP(RC, RDMA_READ_RESPONSE_FIRST):
-       case OP(RC, RDMA_READ_RESPONSE_LAST):
-       case OP(RC, RDMA_READ_RESPONSE_ONLY):
-       case OP(RC, ACKNOWLEDGE):
-               trace_seq_printf(p, AETH_PRN, be32_to_cpu(eh->aeth) >> 24,
-                                parse_syndrome(be32_to_cpu(eh->aeth) >> 24),
-                                be32_to_cpu(eh->aeth) & HFI1_MSN_MASK);
-               break;
-       /* aeth + atomicacketh */
-       case OP(RC, ATOMIC_ACKNOWLEDGE):
-               trace_seq_printf(p, AETH_PRN " " ATOMICACKETH_PRN,
-                                be32_to_cpu(eh->at.aeth) >> 24,
-                                parse_syndrome(be32_to_cpu(eh->at.aeth) >> 24),
-                                be32_to_cpu(eh->at.aeth) & HFI1_MSN_MASK,
-                                (unsigned long long)
-                                ib_u64_get(eh->at.atomic_ack_eth));
-               break;
-       /* atomiceth */
-       case OP(RC, COMPARE_SWAP):
-       case OP(RC, FETCH_ADD):
-               trace_seq_printf(p, ATOMICETH_PRN,
-                                (unsigned long long)ib_u64_get(
-                                eh->atomic_eth.vaddr),
-                                eh->atomic_eth.rkey,
-                                (unsigned long long)ib_u64_get(
-                                (__be32 *)&eh->atomic_eth.swap_data),
-                                (unsigned long long)ib_u64_get(
-                                (__be32 *)&eh->atomic_eth.compare_data));
-               break;
-       /* deth */
-       case OP(UD, SEND_ONLY):
-       case OP(UD, SEND_ONLY_WITH_IMMEDIATE):
-               trace_seq_printf(p, DETH_PRN,
-                                be32_to_cpu(eh->ud.deth[0]),
-                                be32_to_cpu(eh->ud.deth[1]) & RVT_QPN_MASK);
-               break;
-       }
-       trace_seq_putc(p, 0);
-       return ret;
-}
-
-const char *parse_sdma_flags(
-       struct trace_seq *p,
-       u64 desc0, u64 desc1)
-{
-       const char *ret = trace_seq_buffer_ptr(p);
-       char flags[5] = { 'x', 'x', 'x', 'x', 0 };
-
-       flags[0] = (desc1 & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-';
-       flags[1] = (desc1 & SDMA_DESC1_HEAD_TO_HOST_FLAG) ?  'H' : '-';
-       flags[2] = (desc0 & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-';
-       flags[3] = (desc0 & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-';
-       trace_seq_printf(p, "%s", flags);
-       if (desc0 & SDMA_DESC0_FIRST_DESC_FLAG)
-               trace_seq_printf(p, " amode:%u aidx:%u alen:%u",
-                                (u8)((desc1 >> SDMA_DESC1_HEADER_MODE_SHIFT) &
-                                     SDMA_DESC1_HEADER_MODE_MASK),
-                                (u8)((desc1 >> SDMA_DESC1_HEADER_INDEX_SHIFT) &
-                                     SDMA_DESC1_HEADER_INDEX_MASK),
-                                (u8)((desc1 >> SDMA_DESC1_HEADER_DWS_SHIFT) &
-                                     SDMA_DESC1_HEADER_DWS_MASK));
-       return ret;
-}
-
-const char *print_u32_array(
-       struct trace_seq *p,
-       u32 *arr, int len)
-{
-       int i;
-       const char *ret = trace_seq_buffer_ptr(p);
-
-       for (i = 0; i < len ; i++)
-               trace_seq_printf(p, "%s%#x", i == 0 ? "" : " ", arr[i]);
-       trace_seq_putc(p, 0);
-       return ret;
-}
-
-const char *print_u64_array(
-       struct trace_seq *p,
-       u64 *arr, int len)
-{
-       int i;
-       const char *ret = trace_seq_buffer_ptr(p);
-
-       for (i = 0; i < len; i++)
-               trace_seq_printf(p, "%s0x%016llx", i == 0 ? "" : " ", arr[i]);
-       trace_seq_putc(p, 0);
-       return ret;
-}
-
-__hfi1_trace_fn(PKT);
-__hfi1_trace_fn(PROC);
-__hfi1_trace_fn(SDMA);
-__hfi1_trace_fn(LINKVERB);
-__hfi1_trace_fn(DEBUG);
-__hfi1_trace_fn(SNOOP);
-__hfi1_trace_fn(CNTR);
-__hfi1_trace_fn(PIO);
-__hfi1_trace_fn(DC8051);
-__hfi1_trace_fn(FIRMWARE);
-__hfi1_trace_fn(RCVCTRL);
-__hfi1_trace_fn(TID);
-__hfi1_trace_fn(MMU);
diff --git a/drivers/staging/rdma/hfi1/trace.h b/drivers/staging/rdma/hfi1/trace.h
deleted file mode 100644
index 963dc94..0000000
--- a/drivers/staging/rdma/hfi1/trace.h
+++ /dev/null
@@ -1,1369 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#undef TRACE_SYSTEM_VAR
-#define TRACE_SYSTEM_VAR hfi1
-
-#if !defined(__HFI1_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
-#define __HFI1_TRACE_H
-
-#include <linux/tracepoint.h>
-#include <linux/trace_seq.h>
-
-#include "hfi.h"
-#include "mad.h"
-#include "sdma.h"
-
-#define DD_DEV_ENTRY(dd)       __string(dev, dev_name(&(dd)->pcidev->dev))
-#define DD_DEV_ASSIGN(dd)      __assign_str(dev, dev_name(&(dd)->pcidev->dev))
-
-#define packettype_name(etype) { RHF_RCV_TYPE_##etype, #etype }
-#define show_packettype(etype)                  \
-__print_symbolic(etype,                         \
-       packettype_name(EXPECTED),              \
-       packettype_name(EAGER),                 \
-       packettype_name(IB),                    \
-       packettype_name(ERROR),                 \
-       packettype_name(BYPASS))
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_rx
-
-TRACE_EVENT(hfi1_rcvhdr,
-           TP_PROTO(struct hfi1_devdata *dd,
-                    u64 eflags,
-                    u32 ctxt,
-                    u32 etype,
-                    u32 hlen,
-                    u32 tlen,
-                    u32 updegr,
-                    u32 etail
-                    ),
-           TP_ARGS(dd, ctxt, eflags, etype, hlen, tlen, updegr, etail),
-           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
-                            __field(u64, eflags)
-                            __field(u32, ctxt)
-                            __field(u32, etype)
-                            __field(u32, hlen)
-                            __field(u32, tlen)
-                            __field(u32, updegr)
-                            __field(u32, etail)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(dd);
-                          __entry->eflags = eflags;
-                          __entry->ctxt = ctxt;
-                          __entry->etype = etype;
-                          __entry->hlen = hlen;
-                          __entry->tlen = tlen;
-                          __entry->updegr = updegr;
-                          __entry->etail = etail;
-                          ),
-           TP_printk(
-                     "[%s] ctxt %d eflags 0x%llx etype %d,%s hlen %d tlen %d updegr %d etail %d",
-                     __get_str(dev),
-                     __entry->ctxt,
-                     __entry->eflags,
-                     __entry->etype, show_packettype(__entry->etype),
-                     __entry->hlen,
-                     __entry->tlen,
-                     __entry->updegr,
-                     __entry->etail
-                     )
-);
-
-TRACE_EVENT(hfi1_receive_interrupt,
-           TP_PROTO(struct hfi1_devdata *dd, u32 ctxt),
-           TP_ARGS(dd, ctxt),
-           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
-                            __field(u32, ctxt)
-                            __field(u8, slow_path)
-                            __field(u8, dma_rtail)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(dd);
-                          __entry->ctxt = ctxt;
-                          if (dd->rcd[ctxt]->do_interrupt ==
-                              &handle_receive_interrupt) {
-                               __entry->slow_path = 1;
-                               __entry->dma_rtail = 0xFF;
-                          } else if (dd->rcd[ctxt]->do_interrupt ==
-                                     &handle_receive_interrupt_dma_rtail){
-                               __entry->dma_rtail = 1;
-                               __entry->slow_path = 0;
-                          } else if (dd->rcd[ctxt]->do_interrupt ==
-                                     &handle_receive_interrupt_nodma_rtail) {
-                               __entry->dma_rtail = 0;
-                               __entry->slow_path = 0;
-                          }
-                          ),
-           TP_printk("[%s] ctxt %d SlowPath: %d DmaRtail: %d",
-                     __get_str(dev),
-                     __entry->ctxt,
-                     __entry->slow_path,
-                     __entry->dma_rtail
-                     )
-);
-
-TRACE_EVENT(hfi1_exp_tid_reg,
-           TP_PROTO(unsigned ctxt, u16 subctxt, u32 rarr,
-                    u32 npages, unsigned long va, unsigned long pa,
-                    dma_addr_t dma),
-           TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma),
-           TP_STRUCT__entry(
-                   __field(unsigned, ctxt)
-                   __field(u16, subctxt)
-                   __field(u32, rarr)
-                   __field(u32, npages)
-                   __field(unsigned long, va)
-                   __field(unsigned long, pa)
-                   __field(dma_addr_t, dma)
-                   ),
-           TP_fast_assign(
-                   __entry->ctxt = ctxt;
-                   __entry->subctxt = subctxt;
-                   __entry->rarr = rarr;
-                   __entry->npages = npages;
-                   __entry->va = va;
-                   __entry->pa = pa;
-                   __entry->dma = dma;
-                   ),
-           TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx",
-                     __entry->ctxt,
-                     __entry->subctxt,
-                     __entry->rarr,
-                     __entry->npages,
-                     __entry->pa,
-                     __entry->va,
-                     __entry->dma
-                   )
-       );
-
-TRACE_EVENT(hfi1_exp_tid_unreg,
-           TP_PROTO(unsigned ctxt, u16 subctxt, u32 rarr, u32 npages,
-                    unsigned long va, unsigned long pa, dma_addr_t dma),
-           TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma),
-           TP_STRUCT__entry(
-                   __field(unsigned, ctxt)
-                   __field(u16, subctxt)
-                   __field(u32, rarr)
-                   __field(u32, npages)
-                   __field(unsigned long, va)
-                   __field(unsigned long, pa)
-                   __field(dma_addr_t, dma)
-                   ),
-           TP_fast_assign(
-                   __entry->ctxt = ctxt;
-                   __entry->subctxt = subctxt;
-                   __entry->rarr = rarr;
-                   __entry->npages = npages;
-                   __entry->va = va;
-                   __entry->pa = pa;
-                   __entry->dma = dma;
-                   ),
-           TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx",
-                     __entry->ctxt,
-                     __entry->subctxt,
-                     __entry->rarr,
-                     __entry->npages,
-                     __entry->pa,
-                     __entry->va,
-                     __entry->dma
-                   )
-       );
-
-TRACE_EVENT(hfi1_exp_tid_inval,
-           TP_PROTO(unsigned ctxt, u16 subctxt, unsigned long va, u32 rarr,
-                    u32 npages, dma_addr_t dma),
-           TP_ARGS(ctxt, subctxt, va, rarr, npages, dma),
-           TP_STRUCT__entry(
-                   __field(unsigned, ctxt)
-                   __field(u16, subctxt)
-                   __field(unsigned long, va)
-                   __field(u32, rarr)
-                   __field(u32, npages)
-                   __field(dma_addr_t, dma)
-                   ),
-           TP_fast_assign(
-                   __entry->ctxt = ctxt;
-                   __entry->subctxt = subctxt;
-                   __entry->va = va;
-                   __entry->rarr = rarr;
-                   __entry->npages = npages;
-                   __entry->dma = dma;
-                   ),
-           TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx dma: 0x%llx",
-                     __entry->ctxt,
-                     __entry->subctxt,
-                     __entry->rarr,
-                     __entry->npages,
-                     __entry->va,
-                     __entry->dma
-                   )
-       );
-
-TRACE_EVENT(hfi1_mmu_invalidate,
-           TP_PROTO(unsigned ctxt, u16 subctxt, const char *type,
-                    unsigned long start, unsigned long end),
-           TP_ARGS(ctxt, subctxt, type, start, end),
-           TP_STRUCT__entry(
-                   __field(unsigned, ctxt)
-                   __field(u16, subctxt)
-                   __string(type, type)
-                   __field(unsigned long, start)
-                   __field(unsigned long, end)
-                   ),
-           TP_fast_assign(
-                   __entry->ctxt = ctxt;
-                   __entry->subctxt = subctxt;
-                   __assign_str(type, type);
-                   __entry->start = start;
-                   __entry->end = end;
-                   ),
-           TP_printk("[%3u:%02u] MMU Invalidate (%s) 0x%lx - 0x%lx",
-                     __entry->ctxt,
-                     __entry->subctxt,
-                     __get_str(type),
-                     __entry->start,
-                     __entry->end
-                   )
-       );
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_tx
-
-TRACE_EVENT(hfi1_piofree,
-           TP_PROTO(struct send_context *sc, int extra),
-           TP_ARGS(sc, extra),
-           TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd)
-                            __field(u32, sw_index)
-                            __field(u32, hw_context)
-                            __field(int, extra)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(sc->dd);
-                          __entry->sw_index = sc->sw_index;
-                          __entry->hw_context = sc->hw_context;
-                          __entry->extra = extra;
-                          ),
-           TP_printk("[%s] ctxt %u(%u) extra %d",
-                     __get_str(dev),
-                     __entry->sw_index,
-                     __entry->hw_context,
-                     __entry->extra
-                     )
-);
-
-TRACE_EVENT(hfi1_wantpiointr,
-           TP_PROTO(struct send_context *sc, u32 needint, u64 credit_ctrl),
-           TP_ARGS(sc, needint, credit_ctrl),
-           TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd)
-                            __field(u32, sw_index)
-                            __field(u32, hw_context)
-                            __field(u32, needint)
-                            __field(u64, credit_ctrl)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(sc->dd);
-                          __entry->sw_index = sc->sw_index;
-                          __entry->hw_context = sc->hw_context;
-                          __entry->needint = needint;
-                          __entry->credit_ctrl = credit_ctrl;
-                          ),
-           TP_printk("[%s] ctxt %u(%u) on %d credit_ctrl 0x%llx",
-                     __get_str(dev),
-                     __entry->sw_index,
-                     __entry->hw_context,
-                     __entry->needint,
-                     (unsigned long long)__entry->credit_ctrl
-                      )
-);
-
-DECLARE_EVENT_CLASS(hfi1_qpsleepwakeup_template,
-                   TP_PROTO(struct rvt_qp *qp, u32 flags),
-                   TP_ARGS(qp, flags),
-                   TP_STRUCT__entry(
-                           DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
-                           __field(u32, qpn)
-                           __field(u32, flags)
-                           __field(u32, s_flags)
-                           ),
-                   TP_fast_assign(
-                           DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
-                           __entry->flags = flags;
-                           __entry->qpn = qp->ibqp.qp_num;
-                           __entry->s_flags = qp->s_flags;
-                           ),
-                   TP_printk(
-                           "[%s] qpn 0x%x flags 0x%x s_flags 0x%x",
-                           __get_str(dev),
-                           __entry->qpn,
-                           __entry->flags,
-                           __entry->s_flags
-                           )
-);
-
-DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpwakeup,
-            TP_PROTO(struct rvt_qp *qp, u32 flags),
-            TP_ARGS(qp, flags));
-
-DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpsleep,
-            TP_PROTO(struct rvt_qp *qp, u32 flags),
-            TP_ARGS(qp, flags));
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_ibhdrs
-
-u8 ibhdr_exhdr_len(struct hfi1_ib_header *hdr);
-const char *parse_everbs_hdrs(struct trace_seq *p, u8 opcode, void *ehdrs);
-
-#define __parse_ib_ehdrs(op, ehdrs) parse_everbs_hdrs(p, op, ehdrs)
-
-const char *parse_sdma_flags(struct trace_seq *p, u64 desc0, u64 desc1);
-
-#define __parse_sdma_flags(desc0, desc1) parse_sdma_flags(p, desc0, desc1)
-
-#define lrh_name(lrh) { HFI1_##lrh, #lrh }
-#define show_lnh(lrh)                    \
-__print_symbolic(lrh,                    \
-       lrh_name(LRH_BTH),               \
-       lrh_name(LRH_GRH))
-
-#define ib_opcode_name(opcode) { IB_OPCODE_##opcode, #opcode  }
-#define show_ib_opcode(opcode)                             \
-__print_symbolic(opcode,                                   \
-       ib_opcode_name(RC_SEND_FIRST),                     \
-       ib_opcode_name(RC_SEND_MIDDLE),                    \
-       ib_opcode_name(RC_SEND_LAST),                      \
-       ib_opcode_name(RC_SEND_LAST_WITH_IMMEDIATE),       \
-       ib_opcode_name(RC_SEND_ONLY),                      \
-       ib_opcode_name(RC_SEND_ONLY_WITH_IMMEDIATE),       \
-       ib_opcode_name(RC_RDMA_WRITE_FIRST),               \
-       ib_opcode_name(RC_RDMA_WRITE_MIDDLE),              \
-       ib_opcode_name(RC_RDMA_WRITE_LAST),                \
-       ib_opcode_name(RC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \
-       ib_opcode_name(RC_RDMA_WRITE_ONLY),                \
-       ib_opcode_name(RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \
-       ib_opcode_name(RC_RDMA_READ_REQUEST),              \
-       ib_opcode_name(RC_RDMA_READ_RESPONSE_FIRST),       \
-       ib_opcode_name(RC_RDMA_READ_RESPONSE_MIDDLE),      \
-       ib_opcode_name(RC_RDMA_READ_RESPONSE_LAST),        \
-       ib_opcode_name(RC_RDMA_READ_RESPONSE_ONLY),        \
-       ib_opcode_name(RC_ACKNOWLEDGE),                    \
-       ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE),             \
-       ib_opcode_name(RC_COMPARE_SWAP),                   \
-       ib_opcode_name(RC_FETCH_ADD),                      \
-       ib_opcode_name(UC_SEND_FIRST),                     \
-       ib_opcode_name(UC_SEND_MIDDLE),                    \
-       ib_opcode_name(UC_SEND_LAST),                      \
-       ib_opcode_name(UC_SEND_LAST_WITH_IMMEDIATE),       \
-       ib_opcode_name(UC_SEND_ONLY),                      \
-       ib_opcode_name(UC_SEND_ONLY_WITH_IMMEDIATE),       \
-       ib_opcode_name(UC_RDMA_WRITE_FIRST),               \
-       ib_opcode_name(UC_RDMA_WRITE_MIDDLE),              \
-       ib_opcode_name(UC_RDMA_WRITE_LAST),                \
-       ib_opcode_name(UC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \
-       ib_opcode_name(UC_RDMA_WRITE_ONLY),                \
-       ib_opcode_name(UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \
-       ib_opcode_name(UD_SEND_ONLY),                      \
-       ib_opcode_name(UD_SEND_ONLY_WITH_IMMEDIATE),       \
-       ib_opcode_name(CNP))
-
-#define LRH_PRN "vl %d lver %d sl %d lnh %d,%s dlid %.4x len %d slid %.4x"
-#define BTH_PRN \
-       "op 0x%.2x,%s se %d m %d pad %d tver %d pkey 0x%.4x " \
-       "f %d b %d qpn 0x%.6x a %d psn 0x%.8x"
-#define EHDR_PRN "%s"
-
-DECLARE_EVENT_CLASS(hfi1_ibhdr_template,
-                   TP_PROTO(struct hfi1_devdata *dd,
-                            struct hfi1_ib_header *hdr),
-                   TP_ARGS(dd, hdr),
-                   TP_STRUCT__entry(
-                           DD_DEV_ENTRY(dd)
-                           /* LRH */
-                           __field(u8, vl)
-                           __field(u8, lver)
-                           __field(u8, sl)
-                           __field(u8, lnh)
-                           __field(u16, dlid)
-                           __field(u16, len)
-                           __field(u16, slid)
-                           /* BTH */
-                           __field(u8, opcode)
-                           __field(u8, se)
-                           __field(u8, m)
-                           __field(u8, pad)
-                           __field(u8, tver)
-                           __field(u16, pkey)
-                           __field(u8, f)
-                           __field(u8, b)
-                           __field(u32, qpn)
-                           __field(u8, a)
-                           __field(u32, psn)
-                           /* extended headers */
-                           __dynamic_array(u8, ehdrs, ibhdr_exhdr_len(hdr))
-                           ),
-                   TP_fast_assign(
-                          struct hfi1_other_headers *ohdr;
-
-                          DD_DEV_ASSIGN(dd);
-                          /* LRH */
-                          __entry->vl =
-                          (u8)(be16_to_cpu(hdr->lrh[0]) >> 12);
-                          __entry->lver =
-                          (u8)(be16_to_cpu(hdr->lrh[0]) >> 8) & 0xf;
-                          __entry->sl =
-                          (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf;
-                          __entry->lnh =
-                          (u8)(be16_to_cpu(hdr->lrh[0]) & 3);
-                          __entry->dlid =
-                          be16_to_cpu(hdr->lrh[1]);
-                          /* allow for larger len */
-                          __entry->len =
-                          be16_to_cpu(hdr->lrh[2]);
-                          __entry->slid =
-                          be16_to_cpu(hdr->lrh[3]);
-                          /* BTH */
-                          if (__entry->lnh == HFI1_LRH_BTH)
-                               ohdr = &hdr->u.oth;
-                          else
-                               ohdr = &hdr->u.l.oth;
-                         __entry->opcode =
-                         (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
-                         __entry->se =
-                         (be32_to_cpu(ohdr->bth[0]) >> 23) & 1;
-                         __entry->m =
-                         (be32_to_cpu(ohdr->bth[0]) >> 22) & 1;
-                         __entry->pad =
-                         (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-                         __entry->tver =
-                         (be32_to_cpu(ohdr->bth[0]) >> 16) & 0xf;
-                         __entry->pkey =
-                         be32_to_cpu(ohdr->bth[0]) & 0xffff;
-                         __entry->f =
-                         (be32_to_cpu(ohdr->bth[1]) >> HFI1_FECN_SHIFT) &
-                         HFI1_FECN_MASK;
-                         __entry->b =
-                         (be32_to_cpu(ohdr->bth[1]) >> HFI1_BECN_SHIFT) &
-                         HFI1_BECN_MASK;
-                         __entry->qpn =
-                         be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
-                         __entry->a =
-                         (be32_to_cpu(ohdr->bth[2]) >> 31) & 1;
-                         /* allow for larger PSN */
-                         __entry->psn =
-                         be32_to_cpu(ohdr->bth[2]) & 0x7fffffff;
-                         /* extended headers */
-                         memcpy(__get_dynamic_array(ehdrs), &ohdr->u,
-                                ibhdr_exhdr_len(hdr));
-                        ),
-                   TP_printk("[%s] " LRH_PRN " " BTH_PRN " " EHDR_PRN,
-                             __get_str(dev),
-                             /* LRH */
-                             __entry->vl,
-                             __entry->lver,
-                             __entry->sl,
-                             __entry->lnh, show_lnh(__entry->lnh),
-                             __entry->dlid,
-                             __entry->len,
-                             __entry->slid,
-                             /* BTH */
-                             __entry->opcode, show_ib_opcode(__entry->opcode),
-                             __entry->se,
-                             __entry->m,
-                             __entry->pad,
-                             __entry->tver,
-                             __entry->pkey,
-                             __entry->f,
-                             __entry->b,
-                             __entry->qpn,
-                             __entry->a,
-                             __entry->psn,
-                             /* extended headers */
-                             __parse_ib_ehdrs(
-                                       __entry->opcode,
-                                       (void *)__get_dynamic_array(ehdrs))
-                            )
-);
-
-DEFINE_EVENT(hfi1_ibhdr_template, input_ibhdr,
-            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
-            TP_ARGS(dd, hdr));
-
-DEFINE_EVENT(hfi1_ibhdr_template, pio_output_ibhdr,
-            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
-            TP_ARGS(dd, hdr));
-
-DEFINE_EVENT(hfi1_ibhdr_template, ack_output_ibhdr,
-            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
-            TP_ARGS(dd, hdr));
-
-DEFINE_EVENT(hfi1_ibhdr_template, sdma_output_ibhdr,
-            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
-            TP_ARGS(dd, hdr));
-
-#define SNOOP_PRN \
-       "slid %.4x dlid %.4x qpn 0x%.6x opcode 0x%.2x,%s " \
-       "svc lvl %d pkey 0x%.4x [header = %d bytes] [data = %d bytes]"
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_snoop
-
-TRACE_EVENT(snoop_capture,
-           TP_PROTO(struct hfi1_devdata *dd,
-                    int hdr_len,
-                    struct hfi1_ib_header *hdr,
-                    int data_len,
-                    void *data),
-           TP_ARGS(dd, hdr_len, hdr, data_len, data),
-           TP_STRUCT__entry(
-               DD_DEV_ENTRY(dd)
-               __field(u16, slid)
-               __field(u16, dlid)
-               __field(u32, qpn)
-               __field(u8, opcode)
-               __field(u8, sl)
-               __field(u16, pkey)
-               __field(u32, hdr_len)
-               __field(u32, data_len)
-               __field(u8, lnh)
-               __dynamic_array(u8, raw_hdr, hdr_len)
-               __dynamic_array(u8, raw_pkt, data_len)
-               ),
-           TP_fast_assign(
-               struct hfi1_other_headers *ohdr;
-
-               __entry->lnh = (u8)(be16_to_cpu(hdr->lrh[0]) & 3);
-               if (__entry->lnh == HFI1_LRH_BTH)
-                       ohdr = &hdr->u.oth;
-               else
-                       ohdr = &hdr->u.l.oth;
-               DD_DEV_ASSIGN(dd);
-               __entry->slid = be16_to_cpu(hdr->lrh[3]);
-               __entry->dlid = be16_to_cpu(hdr->lrh[1]);
-               __entry->qpn = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
-               __entry->opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
-               __entry->sl = (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf;
-               __entry->pkey = be32_to_cpu(ohdr->bth[0]) & 0xffff;
-               __entry->hdr_len = hdr_len;
-               __entry->data_len = data_len;
-               memcpy(__get_dynamic_array(raw_hdr), hdr, hdr_len);
-               memcpy(__get_dynamic_array(raw_pkt), data, data_len);
-               ),
-           TP_printk(
-               "[%s] " SNOOP_PRN,
-               __get_str(dev),
-               __entry->slid,
-               __entry->dlid,
-               __entry->qpn,
-               __entry->opcode,
-               show_ib_opcode(__entry->opcode),
-               __entry->sl,
-               __entry->pkey,
-               __entry->hdr_len,
-               __entry->data_len
-               )
-);
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_ctxts
-
-#define UCTXT_FMT \
-       "cred:%u, credaddr:0x%llx, piobase:0x%llx, rcvhdr_cnt:%u, "     \
-       "rcvbase:0x%llx, rcvegrc:%u, rcvegrb:0x%llx"
-TRACE_EVENT(hfi1_uctxtdata,
-           TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ctxtdata *uctxt),
-           TP_ARGS(dd, uctxt),
-           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
-                            __field(unsigned, ctxt)
-                            __field(u32, credits)
-                            __field(u64, hw_free)
-                            __field(u64, piobase)
-                            __field(u16, rcvhdrq_cnt)
-                            __field(u64, rcvhdrq_phys)
-                            __field(u32, eager_cnt)
-                            __field(u64, rcvegr_phys)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(dd);
-                          __entry->ctxt = uctxt->ctxt;
-                          __entry->credits = uctxt->sc->credits;
-                          __entry->hw_free = (u64)uctxt->sc->hw_free;
-                          __entry->piobase = (u64)uctxt->sc->base_addr;
-                          __entry->rcvhdrq_cnt = uctxt->rcvhdrq_cnt;
-                          __entry->rcvhdrq_phys = uctxt->rcvhdrq_phys;
-                          __entry->eager_cnt = uctxt->egrbufs.alloced;
-                          __entry->rcvegr_phys =
-                          uctxt->egrbufs.rcvtids[0].phys;
-                          ),
-           TP_printk("[%s] ctxt %u " UCTXT_FMT,
-                     __get_str(dev),
-                     __entry->ctxt,
-                     __entry->credits,
-                     __entry->hw_free,
-                     __entry->piobase,
-                     __entry->rcvhdrq_cnt,
-                     __entry->rcvhdrq_phys,
-                     __entry->eager_cnt,
-                     __entry->rcvegr_phys
-                     )
-);
-
-#define CINFO_FMT \
-       "egrtids:%u, egr_size:%u, hdrq_cnt:%u, hdrq_size:%u, sdma_ring_size:%u"
-TRACE_EVENT(hfi1_ctxt_info,
-           TP_PROTO(struct hfi1_devdata *dd, unsigned ctxt, unsigned subctxt,
-                    struct hfi1_ctxt_info cinfo),
-           TP_ARGS(dd, ctxt, subctxt, cinfo),
-           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
-                            __field(unsigned, ctxt)
-                            __field(unsigned, subctxt)
-                            __field(u16, egrtids)
-                            __field(u16, rcvhdrq_cnt)
-                            __field(u16, rcvhdrq_size)
-                            __field(u16, sdma_ring_size)
-                            __field(u32, rcvegr_size)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(dd);
-                           __entry->ctxt = ctxt;
-                           __entry->subctxt = subctxt;
-                           __entry->egrtids = cinfo.egrtids;
-                           __entry->rcvhdrq_cnt = cinfo.rcvhdrq_cnt;
-                           __entry->rcvhdrq_size = cinfo.rcvhdrq_entsize;
-                           __entry->sdma_ring_size = cinfo.sdma_ring_size;
-                           __entry->rcvegr_size = cinfo.rcvegr_size;
-                           ),
-           TP_printk("[%s] ctxt %u:%u " CINFO_FMT,
-                     __get_str(dev),
-                     __entry->ctxt,
-                     __entry->subctxt,
-                     __entry->egrtids,
-                     __entry->rcvegr_size,
-                     __entry->rcvhdrq_cnt,
-                     __entry->rcvhdrq_size,
-                     __entry->sdma_ring_size
-                     )
-);
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_sma
-
-#define BCT_FORMAT \
-       "shared_limit %x vls 0-7 [%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x] 15 [%x,%x]"
-
-#define BCT(field) \
-       be16_to_cpu( \
-               ((struct buffer_control *)__get_dynamic_array(bct))->field \
-       )
-
-DECLARE_EVENT_CLASS(hfi1_bct_template,
-                   TP_PROTO(struct hfi1_devdata *dd,
-                            struct buffer_control *bc),
-                   TP_ARGS(dd, bc),
-                   TP_STRUCT__entry(DD_DEV_ENTRY(dd)
-                                    __dynamic_array(u8, bct, sizeof(*bc))
-                                    ),
-                   TP_fast_assign(DD_DEV_ASSIGN(dd);
-                                  memcpy(__get_dynamic_array(bct), bc,
-                                         sizeof(*bc));
-                                  ),
-                   TP_printk(BCT_FORMAT,
-                             BCT(overall_shared_limit),
-
-                             BCT(vl[0].dedicated),
-                             BCT(vl[0].shared),
-
-                             BCT(vl[1].dedicated),
-                             BCT(vl[1].shared),
-
-                             BCT(vl[2].dedicated),
-                             BCT(vl[2].shared),
-
-                             BCT(vl[3].dedicated),
-                             BCT(vl[3].shared),
-
-                             BCT(vl[4].dedicated),
-                             BCT(vl[4].shared),
-
-                             BCT(vl[5].dedicated),
-                             BCT(vl[5].shared),
-
-                             BCT(vl[6].dedicated),
-                             BCT(vl[6].shared),
-
-                             BCT(vl[7].dedicated),
-                             BCT(vl[7].shared),
-
-                             BCT(vl[15].dedicated),
-                             BCT(vl[15].shared)
-                             )
-);
-
-DEFINE_EVENT(hfi1_bct_template, bct_set,
-            TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc),
-            TP_ARGS(dd, bc));
-
-DEFINE_EVENT(hfi1_bct_template, bct_get,
-            TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc),
-            TP_ARGS(dd, bc));
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_sdma
-
-TRACE_EVENT(hfi1_sdma_descriptor,
-           TP_PROTO(struct sdma_engine *sde,
-                    u64 desc0,
-                    u64 desc1,
-                    u16 e,
-                    void *descp),
-       TP_ARGS(sde, desc0, desc1, e, descp),
-       TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
-                        __field(void *, descp)
-                        __field(u64, desc0)
-                        __field(u64, desc1)
-                        __field(u16, e)
-                        __field(u8, idx)
-                        ),
-       TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
-                      __entry->desc0 = desc0;
-                      __entry->desc1 = desc1;
-                      __entry->idx = sde->this_idx;
-                      __entry->descp = descp;
-                      __entry->e = e;
-                      ),
-       TP_printk(
-                 "[%s] SDE(%u) flags:%s addr:0x%016llx gen:%u len:%u d0:%016llx d1:%016llx to %p,%u",
-                 __get_str(dev),
-                 __entry->idx,
-                 __parse_sdma_flags(__entry->desc0, __entry->desc1),
-                 (__entry->desc0 >> SDMA_DESC0_PHY_ADDR_SHIFT) &
-                 SDMA_DESC0_PHY_ADDR_MASK,
-                 (u8)((__entry->desc1 >> SDMA_DESC1_GENERATION_SHIFT) &
-                      SDMA_DESC1_GENERATION_MASK),
-                 (u16)((__entry->desc0 >> SDMA_DESC0_BYTE_COUNT_SHIFT) &
-                       SDMA_DESC0_BYTE_COUNT_MASK),
-                 __entry->desc0,
-                 __entry->desc1,
-                 __entry->descp,
-                 __entry->e
-                 )
-);
-
-TRACE_EVENT(hfi1_sdma_engine_select,
-           TP_PROTO(struct hfi1_devdata *dd, u32 sel, u8 vl, u8 idx),
-           TP_ARGS(dd, sel, vl, idx),
-           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
-                            __field(u32, sel)
-                            __field(u8, vl)
-                            __field(u8, idx)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(dd);
-                          __entry->sel = sel;
-                          __entry->vl = vl;
-                          __entry->idx = idx;
-                          ),
-           TP_printk("[%s] selecting SDE %u sel 0x%x vl %u",
-                     __get_str(dev),
-                     __entry->idx,
-                     __entry->sel,
-                     __entry->vl
-                     )
-);
-
-DECLARE_EVENT_CLASS(hfi1_sdma_engine_class,
-                   TP_PROTO(struct sdma_engine *sde, u64 status),
-                   TP_ARGS(sde, status),
-                   TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
-                                    __field(u64, status)
-                                    __field(u8, idx)
-                                    ),
-                   TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
-                                  __entry->status = status;
-                                  __entry->idx = sde->this_idx;
-                                  ),
-                   TP_printk("[%s] SDE(%u) status %llx",
-                             __get_str(dev),
-                             __entry->idx,
-                             (unsigned long long)__entry->status
-                             )
-);
-
-DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_interrupt,
-            TP_PROTO(struct sdma_engine *sde, u64 status),
-            TP_ARGS(sde, status)
-);
-
-DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_progress,
-            TP_PROTO(struct sdma_engine *sde, u64 status),
-            TP_ARGS(sde, status)
-);
-
-DECLARE_EVENT_CLASS(hfi1_sdma_ahg_ad,
-                   TP_PROTO(struct sdma_engine *sde, int aidx),
-                   TP_ARGS(sde, aidx),
-                   TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
-                                    __field(int, aidx)
-                                    __field(u8, idx)
-                                    ),
-                   TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
-                                  __entry->idx = sde->this_idx;
-                                  __entry->aidx = aidx;
-                                  ),
-                   TP_printk("[%s] SDE(%u) aidx %d",
-                             __get_str(dev),
-                             __entry->idx,
-                             __entry->aidx
-                             )
-);
-
-DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_allocate,
-            TP_PROTO(struct sdma_engine *sde, int aidx),
-            TP_ARGS(sde, aidx));
-
-DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_deallocate,
-            TP_PROTO(struct sdma_engine *sde, int aidx),
-            TP_ARGS(sde, aidx));
-
-#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
-TRACE_EVENT(hfi1_sdma_progress,
-           TP_PROTO(struct sdma_engine *sde,
-                    u16 hwhead,
-                    u16 swhead,
-                    struct sdma_txreq *txp
-                    ),
-           TP_ARGS(sde, hwhead, swhead, txp),
-           TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
-                            __field(u64, sn)
-                            __field(u16, hwhead)
-                            __field(u16, swhead)
-                            __field(u16, txnext)
-                            __field(u16, tx_tail)
-                            __field(u16, tx_head)
-                            __field(u8, idx)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
-                          __entry->hwhead = hwhead;
-                          __entry->swhead = swhead;
-                          __entry->tx_tail = sde->tx_tail;
-                          __entry->tx_head = sde->tx_head;
-                          __entry->txnext = txp ? txp->next_descq_idx : ~0;
-                          __entry->idx = sde->this_idx;
-                          __entry->sn = txp ? txp->sn : ~0;
-                          ),
-           TP_printk(
-                     "[%s] SDE(%u) sn %llu hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u",
-                     __get_str(dev),
-                     __entry->idx,
-                     __entry->sn,
-                     __entry->hwhead,
-                     __entry->swhead,
-                     __entry->txnext,
-                     __entry->tx_head,
-                     __entry->tx_tail
-                     )
-);
-#else
-TRACE_EVENT(hfi1_sdma_progress,
-           TP_PROTO(struct sdma_engine *sde,
-                    u16 hwhead, u16 swhead,
-                    struct sdma_txreq *txp
-           ),
-       TP_ARGS(sde, hwhead, swhead, txp),
-       TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
-                        __field(u16, hwhead)
-                        __field(u16, swhead)
-                        __field(u16, txnext)
-                        __field(u16, tx_tail)
-                        __field(u16, tx_head)
-                        __field(u8, idx)
-                        ),
-       TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
-                      __entry->hwhead = hwhead;
-                      __entry->swhead = swhead;
-                      __entry->tx_tail = sde->tx_tail;
-                      __entry->tx_head = sde->tx_head;
-                      __entry->txnext = txp ? txp->next_descq_idx : ~0;
-                      __entry->idx = sde->this_idx;
-                      ),
-       TP_printk(
-                 "[%s] SDE(%u) hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u",
-                 __get_str(dev),
-                 __entry->idx,
-                 __entry->hwhead,
-                 __entry->swhead,
-                 __entry->txnext,
-                 __entry->tx_head,
-                 __entry->tx_tail
-                 )
-);
-#endif
-
-DECLARE_EVENT_CLASS(hfi1_sdma_sn,
-                   TP_PROTO(struct sdma_engine *sde, u64 sn),
-                   TP_ARGS(sde, sn),
-                   TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
-                                    __field(u64, sn)
-                                    __field(u8, idx)
-                                    ),
-                   TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
-                                  __entry->sn = sn;
-                                  __entry->idx = sde->this_idx;
-                                  ),
-                   TP_printk("[%s] SDE(%u) sn %llu",
-                             __get_str(dev),
-                             __entry->idx,
-                             __entry->sn
-                             )
-);
-
-DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_out_sn,
-            TP_PROTO(
-               struct sdma_engine *sde,
-               u64 sn
-            ),
-            TP_ARGS(sde, sn)
-);
-
-DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_in_sn,
-            TP_PROTO(struct sdma_engine *sde, u64 sn),
-            TP_ARGS(sde, sn)
-);
-
-#define USDMA_HDR_FORMAT \
-       "[%s:%u:%u:%u] PBC=(0x%x 0x%x) LRH=(0x%x 0x%x) BTH=(0x%x 0x%x 0x%x) KDETH=(0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x) TIDVal=0x%x"
-
-TRACE_EVENT(hfi1_sdma_user_header,
-           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req,
-                    struct hfi1_pkt_header *hdr, u32 tidval),
-           TP_ARGS(dd, ctxt, subctxt, req, hdr, tidval),
-           TP_STRUCT__entry(
-                   DD_DEV_ENTRY(dd)
-                   __field(u16, ctxt)
-                   __field(u8, subctxt)
-                   __field(u16, req)
-                   __field(__le32, pbc0)
-                   __field(__le32, pbc1)
-                   __field(__be32, lrh0)
-                   __field(__be32, lrh1)
-                   __field(__be32, bth0)
-                   __field(__be32, bth1)
-                   __field(__be32, bth2)
-                   __field(__le32, kdeth0)
-                   __field(__le32, kdeth1)
-                   __field(__le32, kdeth2)
-                   __field(__le32, kdeth3)
-                   __field(__le32, kdeth4)
-                   __field(__le32, kdeth5)
-                   __field(__le32, kdeth6)
-                   __field(__le32, kdeth7)
-                   __field(__le32, kdeth8)
-                   __field(u32, tidval)
-                   ),
-           TP_fast_assign(
-                   __le32 *pbc = (__le32 *)hdr->pbc;
-                   __be32 *lrh = (__be32 *)hdr->lrh;
-                   __be32 *bth = (__be32 *)hdr->bth;
-                   __le32 *kdeth = (__le32 *)&hdr->kdeth;
-
-                   DD_DEV_ASSIGN(dd);
-                   __entry->ctxt = ctxt;
-                   __entry->subctxt = subctxt;
-                   __entry->req = req;
-                   __entry->pbc0 = pbc[0];
-                   __entry->pbc1 = pbc[1];
-                   __entry->lrh0 = be32_to_cpu(lrh[0]);
-                   __entry->lrh1 = be32_to_cpu(lrh[1]);
-                   __entry->bth0 = be32_to_cpu(bth[0]);
-                   __entry->bth1 = be32_to_cpu(bth[1]);
-                   __entry->bth2 = be32_to_cpu(bth[2]);
-                   __entry->kdeth0 = kdeth[0];
-                   __entry->kdeth1 = kdeth[1];
-                   __entry->kdeth2 = kdeth[2];
-                   __entry->kdeth3 = kdeth[3];
-                   __entry->kdeth4 = kdeth[4];
-                   __entry->kdeth5 = kdeth[5];
-                   __entry->kdeth6 = kdeth[6];
-                   __entry->kdeth7 = kdeth[7];
-                   __entry->kdeth8 = kdeth[8];
-                   __entry->tidval = tidval;
-                   ),
-           TP_printk(USDMA_HDR_FORMAT,
-                     __get_str(dev),
-                     __entry->ctxt,
-                     __entry->subctxt,
-                     __entry->req,
-                     __entry->pbc1,
-                     __entry->pbc0,
-                     __entry->lrh0,
-                     __entry->lrh1,
-                     __entry->bth0,
-                     __entry->bth1,
-                     __entry->bth2,
-                     __entry->kdeth0,
-                     __entry->kdeth1,
-                     __entry->kdeth2,
-                     __entry->kdeth3,
-                     __entry->kdeth4,
-                     __entry->kdeth5,
-                     __entry->kdeth6,
-                     __entry->kdeth7,
-                     __entry->kdeth8,
-                     __entry->tidval
-                   )
-       );
-
-#define SDMA_UREQ_FMT \
-       "[%s:%u:%u] ver/op=0x%x, iovcnt=%u, npkts=%u, frag=%u, idx=%u"
-TRACE_EVENT(hfi1_sdma_user_reqinfo,
-           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 *i),
-           TP_ARGS(dd, ctxt, subctxt, i),
-           TP_STRUCT__entry(
-                   DD_DEV_ENTRY(dd);
-                   __field(u16, ctxt)
-                   __field(u8, subctxt)
-                   __field(u8, ver_opcode)
-                   __field(u8, iovcnt)
-                   __field(u16, npkts)
-                   __field(u16, fragsize)
-                   __field(u16, comp_idx)
-                   ),
-           TP_fast_assign(
-                   DD_DEV_ASSIGN(dd);
-                   __entry->ctxt = ctxt;
-                   __entry->subctxt = subctxt;
-                   __entry->ver_opcode = i[0] & 0xff;
-                   __entry->iovcnt = (i[0] >> 8) & 0xff;
-                   __entry->npkts = i[1];
-                   __entry->fragsize = i[2];
-                   __entry->comp_idx = i[3];
-                   ),
-           TP_printk(SDMA_UREQ_FMT,
-                     __get_str(dev),
-                     __entry->ctxt,
-                     __entry->subctxt,
-                     __entry->ver_opcode,
-                     __entry->iovcnt,
-                     __entry->npkts,
-                     __entry->fragsize,
-                     __entry->comp_idx
-                   )
-       );
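
The reqinfo event above unpacks the first four 16-bit words of the user SDMA
request header. As a minimal sketch (the helper name and signature are invented
here for illustration, not part of the driver), the same decode could be
written as:

	static inline void sdma_reqinfo_decode(const u16 *i, u8 *ver_opcode,
					       u8 *iovcnt, u16 *npkts,
					       u16 *fragsize, u16 *comp_idx)
	{
		*ver_opcode = i[0] & 0xff;	/* low byte: version/opcode */
		*iovcnt = (i[0] >> 8) & 0xff;	/* high byte: iovec count */
		*npkts = i[1];			/* packets in this request */
		*fragsize = i[2];		/* fragment size */
		*comp_idx = i[3];		/* completion ring index */
	}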
-
-#define usdma_complete_name(st) { st, #st }
-#define show_usdma_complete_state(st)                  \
-       __print_symbolic(st,                            \
-                        usdma_complete_name(FREE),     \
-                        usdma_complete_name(QUEUED),   \
-                        usdma_complete_name(COMPLETE), \
-                        usdma_complete_name(ERROR))
-
-TRACE_EVENT(hfi1_sdma_user_completion,
-           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 idx,
-                    u8 state, int code),
-           TP_ARGS(dd, ctxt, subctxt, idx, state, code),
-           TP_STRUCT__entry(
-                   DD_DEV_ENTRY(dd)
-                   __field(u16, ctxt)
-                   __field(u8, subctxt)
-                   __field(u16, idx)
-                   __field(u8, state)
-                   __field(int, code)
-                   ),
-           TP_fast_assign(
-                   DD_DEV_ASSIGN(dd);
-                   __entry->ctxt = ctxt;
-                   __entry->subctxt = subctxt;
-                   __entry->idx = idx;
-                   __entry->state = state;
-                   __entry->code = code;
-                   ),
-           TP_printk("[%s:%u:%u:%u] SDMA completion state %s (%d)",
-                     __get_str(dev), __entry->ctxt, __entry->subctxt,
-                     __entry->idx, show_usdma_complete_state(__entry->state),
-                     __entry->code)
-       );
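
For illustration only (the device name and index values below are made up), a
line emitted by this event renders the completion state symbolically via
show_usdma_complete_state(), e.g.:

	[hfi1_0:3:0:12] SDMA completion state COMPLETE (0)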
-
-const char *print_u32_array(struct trace_seq *, u32 *, int);
-#define __print_u32_hex(arr, len) print_u32_array(p, arr, len)
-
-TRACE_EVENT(hfi1_sdma_user_header_ahg,
-           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req,
-                    u8 sde, u8 ahgidx, u32 *ahg, int len, u32 tidval),
-           TP_ARGS(dd, ctxt, subctxt, req, sde, ahgidx, ahg, len, tidval),
-           TP_STRUCT__entry(
-                   DD_DEV_ENTRY(dd)
-                   __field(u16, ctxt)
-                   __field(u8, subctxt)
-                   __field(u16, req)
-                   __field(u8, sde)
-                   __field(u8, idx)
-                   __field(int, len)
-                   __field(u32, tidval)
-                   __array(u32, ahg, 10)
-                   ),
-           TP_fast_assign(
-                   DD_DEV_ASSIGN(dd);
-                   __entry->ctxt = ctxt;
-                   __entry->subctxt = subctxt;
-                   __entry->req = req;
-                   __entry->sde = sde;
-                   __entry->idx = ahgidx;
-                   __entry->len = len;
-                   __entry->tidval = tidval;
-                   memcpy(__entry->ahg, ahg, len * sizeof(u32));
-                   ),
-           TP_printk("[%s:%u:%u:%u] (SDE%u/AHG%u) ahg[0-%d]=(%s) TIDVal=0x%x",
-                     __get_str(dev),
-                     __entry->ctxt,
-                     __entry->subctxt,
-                     __entry->req,
-                     __entry->sde,
-                     __entry->idx,
-                     __entry->len - 1,
-                     __print_u32_hex(__entry->ahg, __entry->len),
-                     __entry->tidval
-                   )
-       );
-
-TRACE_EVENT(hfi1_sdma_state,
-           TP_PROTO(struct sdma_engine *sde,
-                    const char *cstate,
-                    const char *nstate
-                    ),
-           TP_ARGS(sde, cstate, nstate),
-           TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
-                            __string(curstate, cstate)
-                            __string(newstate, nstate)
-                            ),
-       TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
-                      __assign_str(curstate, cstate);
-                      __assign_str(newstate, nstate);
-                      ),
-       TP_printk("[%s] current state %s new state %s",
-                 __get_str(dev),
-                 __get_str(curstate),
-                 __get_str(newstate)
-                 )
-);
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_rc
-
-DECLARE_EVENT_CLASS(hfi1_rc_template,
-                   TP_PROTO(struct rvt_qp *qp, u32 psn),
-                   TP_ARGS(qp, psn),
-                   TP_STRUCT__entry(
-                       DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
-                       __field(u32, qpn)
-                       __field(u32, s_flags)
-                       __field(u32, psn)
-                       __field(u32, s_psn)
-                       __field(u32, s_next_psn)
-                       __field(u32, s_sending_psn)
-                       __field(u32, s_sending_hpsn)
-                       __field(u32, r_psn)
-                       ),
-                   TP_fast_assign(
-                       DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
-                       __entry->qpn = qp->ibqp.qp_num;
-                       __entry->s_flags = qp->s_flags;
-                       __entry->psn = psn;
-                       __entry->s_psn = qp->s_psn;
-                       __entry->s_next_psn = qp->s_next_psn;
-                       __entry->s_sending_psn = qp->s_sending_psn;
-                       __entry->s_sending_hpsn = qp->s_sending_hpsn;
-                       __entry->r_psn = qp->r_psn;
-                       ),
-                   TP_printk(
-                       "[%s] qpn 0x%x s_flags 0x%x psn 0x%x s_psn 0x%x s_next_psn 0x%x s_sending_psn 0x%x sending_hpsn 0x%x r_psn 0x%x",
-                       __get_str(dev),
-                       __entry->qpn,
-                       __entry->s_flags,
-                       __entry->psn,
-                       __entry->s_psn,
-                       __entry->s_next_psn,
-                       __entry->s_sending_psn,
-                       __entry->s_sending_hpsn,
-                       __entry->r_psn
-                       )
-);
-
-DEFINE_EVENT(hfi1_rc_template, hfi1_rc_sendcomplete,
-            TP_PROTO(struct rvt_qp *qp, u32 psn),
-            TP_ARGS(qp, psn)
-);
-
-DEFINE_EVENT(hfi1_rc_template, hfi1_rc_ack,
-            TP_PROTO(struct rvt_qp *qp, u32 psn),
-            TP_ARGS(qp, psn)
-);
-
-DEFINE_EVENT(hfi1_rc_template, hfi1_rc_timeout,
-            TP_PROTO(struct rvt_qp *qp, u32 psn),
-            TP_ARGS(qp, psn)
-);
-
-DEFINE_EVENT(hfi1_rc_template, hfi1_rc_rcv_error,
-            TP_PROTO(struct rvt_qp *qp, u32 psn),
-            TP_ARGS(qp, psn)
-);
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_misc
-
-TRACE_EVENT(hfi1_interrupt,
-           TP_PROTO(struct hfi1_devdata *dd, const struct is_table *is_entry,
-                    int src),
-           TP_ARGS(dd, is_entry, src),
-           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
-                            __array(char, buf, 64)
-                            __field(int, src)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(dd)
-                          is_entry->is_name(__entry->buf, 64,
-                                            src - is_entry->start);
-                          __entry->src = src;
-                          ),
-           TP_printk("[%s] source: %s [%d]", __get_str(dev), __entry->buf,
-                     __entry->src)
-);
-
-/*
- * Note:
- * This produces a REALLY ugly trace in the console output when the string is
- * too long.
- */
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_trace
-
-#define MAX_MSG_LEN 512
-
-DECLARE_EVENT_CLASS(hfi1_trace_template,
-                   TP_PROTO(const char *function, struct va_format *vaf),
-                   TP_ARGS(function, vaf),
-                   TP_STRUCT__entry(__string(function, function)
-                                    __dynamic_array(char, msg, MAX_MSG_LEN)
-                                    ),
-                   TP_fast_assign(__assign_str(function, function);
-                                  WARN_ON_ONCE(vsnprintf
-                                               (__get_dynamic_array(msg),
-                                                MAX_MSG_LEN, vaf->fmt,
-                                                *vaf->va) >=
-                                               MAX_MSG_LEN);
-                                  ),
-                   TP_printk("(%s) %s",
-                             __get_str(function),
-                             __get_str(msg))
-);
-
-/*
- * It might be nice to macroize __hfi1_trace, but the va_* handling requires
- * an actual function and cannot live in a macro.
- */
-#define __hfi1_trace_def(lvl) \
-void __hfi1_trace_##lvl(const char *funct, char *fmt, ...);            \
-                                                                       \
-DEFINE_EVENT(hfi1_trace_template, hfi1_ ##lvl,                         \
-       TP_PROTO(const char *function, struct va_format *vaf),          \
-       TP_ARGS(function, vaf))
-
-#define __hfi1_trace_fn(lvl) \
-void __hfi1_trace_##lvl(const char *func, char *fmt, ...)              \
-{                                                                      \
-       struct va_format vaf = {                                        \
-               .fmt = fmt,                                             \
-       };                                                              \
-       va_list args;                                                   \
-                                                                       \
-       va_start(args, fmt);                                            \
-       vaf.va = &args;                                                 \
-       trace_hfi1_ ##lvl(func, &vaf);                                  \
-       va_end(args);                                                   \
-       return;                                                         \
-}
-
-/*
- * To create a new trace level simply define it below and as a __hfi1_trace_fn
- * in trace.c. This will create all the hooks for calling
- * hfi1_cdbg(LVL, fmt, ...); as well as take care of all
- * the debugfs stuff.
- */
-__hfi1_trace_def(PKT);
-__hfi1_trace_def(PROC);
-__hfi1_trace_def(SDMA);
-__hfi1_trace_def(LINKVERB);
-__hfi1_trace_def(DEBUG);
-__hfi1_trace_def(SNOOP);
-__hfi1_trace_def(CNTR);
-__hfi1_trace_def(PIO);
-__hfi1_trace_def(DC8051);
-__hfi1_trace_def(FIRMWARE);
-__hfi1_trace_def(RCVCTRL);
-__hfi1_trace_def(TID);
-__hfi1_trace_def(MMU);
-
-#define hfi1_cdbg(which, fmt, ...) \
-       __hfi1_trace_##which(__func__, fmt, ##__VA_ARGS__)
-
-#define hfi1_dbg(fmt, ...) \
-       hfi1_cdbg(DEBUG, fmt, ##__VA_ARGS__)
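
The comment above describes how new trace levels are wired up. As a usage
sketch only (trace.c supplies the __hfi1_trace_fn(LVL) bodies, and the
variables here are hypothetical), a call site would look like:

	/* per-subsystem level, routed through the SDMA tracepoint */
	hfi1_cdbg(SDMA, "ctxt %u: queued %u packets", ctxt, npkts);

	/* generic debug level via the convenience wrapper */
	hfi1_dbg("link state 0x%x", state);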
-
-/*
- * Define HFI1_EARLY_DBG at compile time or here to enable early trace
- * messages. Do not check in an enablement for this.
- */
-
-#ifdef HFI1_EARLY_DBG
-#define hfi1_dbg_early(fmt, ...) \
-       trace_printk(fmt, ##__VA_ARGS__)
-#else
-#define hfi1_dbg_early(fmt, ...)
-#endif
-
-#endif /* __HFI1_TRACE_H */
-
-#undef TRACE_INCLUDE_PATH
-#undef TRACE_INCLUDE_FILE
-#define TRACE_INCLUDE_PATH .
-#define TRACE_INCLUDE_FILE trace
-#include <trace/define_trace.h>
diff --git a/drivers/staging/rdma/hfi1/twsi.c b/drivers/staging/rdma/hfi1/twsi.c
deleted file mode 100644 (file)
index e82e52a..0000000
+++ /dev/null
@@ -1,489 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/delay.h>
-#include <linux/pci.h>
-#include <linux/vmalloc.h>
-
-#include "hfi.h"
-#include "twsi.h"
-
-/*
- * "Two Wire Serial Interface" support.
- *
- * Originally written for a not-quite-i2c serial eeprom, which is
- * still used on some supported boards. Later boards have added a
- * variety of other uses, most board-specific, so the bit-banging
- * part has been split off to this file, while the other parts
- * have been moved to chip-specific files.
- *
- * We have also dropped all pretense of a fully generic interface (e.g.
- * pretending we don't know whether '1' is the higher voltage), as the
- * restrictions of the generic i2c interface (e.g. no access from the
- * driver itself) make it unsuitable for this use.
- */
-
-#define READ_CMD 1
-#define WRITE_CMD 0
-
-/**
- * i2c_wait_for_writes - wait for a write
- * @dd: the hfi1_ib device
- *
- * We use this instead of udelay directly, so we can make sure
- * that previous register writes have been flushed all the way
- * to the chip.  Since we are delaying anyway, the cost doesn't
- * hurt, and makes the bit twiddling more regular
- */
-static void i2c_wait_for_writes(struct hfi1_devdata *dd, u32 target)
-{
-       /*
-        * implicit read of EXTStatus is as good as explicit
-        * read of scratch, if all we want to do is flush
-        * writes.
-        */
-       hfi1_gpio_mod(dd, target, 0, 0, 0);
-       rmb(); /* inlined, so prevent compiler reordering */
-}
-
-/*
- * QSFP modules are allowed to hold SCL low for 500uSec. Allow twice that
- * for "almost compliant" modules
- */
-#define SCL_WAIT_USEC 1000
-
-/* TWSI_BUF_WAIT_USEC is the time the bus must be free between a STOP or ACK
- * and the next START. Should be 20, but some chips need more.
- */
-#define TWSI_BUF_WAIT_USEC 60
-
-static void scl_out(struct hfi1_devdata *dd, u32 target, u8 bit)
-{
-       u32 mask;
-
-       udelay(1);
-
-       mask = QSFP_HFI0_I2CCLK;
-
-       /* SCL is meant to be bare-drain, so never set "OUT", just DIR */
-       hfi1_gpio_mod(dd, target, 0, bit ? 0 : mask, mask);
-
-       /*
-        * Allow for slow slaves by simple
-        * delay for falling edge, sampling on rise.
-        */
-       if (!bit) {
-               udelay(2);
-       } else {
-               int rise_usec;
-
-               for (rise_usec = SCL_WAIT_USEC; rise_usec > 0; rise_usec -= 2) {
-                       if (mask & hfi1_gpio_mod(dd, target, 0, 0, 0))
-                               break;
-                       udelay(2);
-               }
-               if (rise_usec <= 0)
-                       dd_dev_err(dd, "SCL interface stuck low > %d uSec\n",
-                                  SCL_WAIT_USEC);
-       }
-       i2c_wait_for_writes(dd, target);
-}
-
-static u8 scl_in(struct hfi1_devdata *dd, u32 target, int wait)
-{
-       u32 read_val, mask;
-
-       mask = QSFP_HFI0_I2CCLK;
-       /* SCL is meant to be bare-drain, so never set "OUT", just DIR */
-       hfi1_gpio_mod(dd, target, 0, 0, mask);
-       read_val = hfi1_gpio_mod(dd, target, 0, 0, 0);
-       if (wait)
-               i2c_wait_for_writes(dd, target);
-       return (read_val & mask) >> GPIO_SCL_NUM;
-}
-
-static void sda_out(struct hfi1_devdata *dd, u32 target, u8 bit)
-{
-       u32 mask;
-
-       mask = QSFP_HFI0_I2CDAT;
-
-       /* SDA is meant to be bare-drain, so never set "OUT", just DIR */
-       hfi1_gpio_mod(dd, target, 0, bit ? 0 : mask, mask);
-
-       i2c_wait_for_writes(dd, target);
-       udelay(2);
-}
-
-static u8 sda_in(struct hfi1_devdata *dd, u32 target, int wait)
-{
-       u32 read_val, mask;
-
-       mask = QSFP_HFI0_I2CDAT;
-       /* SDA is meant to be bare-drain, so never set "OUT", just DIR */
-       hfi1_gpio_mod(dd, target, 0, 0, mask);
-       read_val = hfi1_gpio_mod(dd, target, 0, 0, 0);
-       if (wait)
-               i2c_wait_for_writes(dd, target);
-       return (read_val & mask) >> GPIO_SDA_NUM;
-}
-
-/**
- * i2c_ackrcv - see if ack following write is true
- * @dd: the hfi1_ib device
- */
-static int i2c_ackrcv(struct hfi1_devdata *dd, u32 target)
-{
-       u8 ack_received;
-
-       /* AT ENTRY SCL = LOW */
-       /* change direction, ignore data */
-       ack_received = sda_in(dd, target, 1);
-       scl_out(dd, target, 1);
-       ack_received = sda_in(dd, target, 1) == 0;
-       scl_out(dd, target, 0);
-       return ack_received;
-}
-
-static void stop_cmd(struct hfi1_devdata *dd, u32 target);
-
-/**
- * rd_byte - read a byte, sending STOP on last, else ACK
- * @dd: the hfi1_ib device
- *
- * Returns byte shifted out of device
- */
-static int rd_byte(struct hfi1_devdata *dd, u32 target, int last)
-{
-       int bit_cntr, data;
-
-       data = 0;
-
-       for (bit_cntr = 7; bit_cntr >= 0; --bit_cntr) {
-               data <<= 1;
-               scl_out(dd, target, 1);
-               data |= sda_in(dd, target, 0);
-               scl_out(dd, target, 0);
-       }
-       if (last) {
-               scl_out(dd, target, 1);
-               stop_cmd(dd, target);
-       } else {
-               sda_out(dd, target, 0);
-               scl_out(dd, target, 1);
-               scl_out(dd, target, 0);
-               sda_out(dd, target, 1);
-       }
-       return data;
-}
-
-/**
- * wr_byte - write a byte, one bit at a time
- * @dd: the hfi1_ib device
- * @data: the byte to write
- *
- * Returns 0 if we got the following ack, otherwise 1
- */
-static int wr_byte(struct hfi1_devdata *dd, u32 target, u8 data)
-{
-       int bit_cntr;
-       u8 bit;
-
-       for (bit_cntr = 7; bit_cntr >= 0; bit_cntr--) {
-               bit = (data >> bit_cntr) & 1;
-               sda_out(dd, target, bit);
-               scl_out(dd, target, 1);
-               scl_out(dd, target, 0);
-       }
-       return (!i2c_ackrcv(dd, target)) ? 1 : 0;
-}
-
-/*
- * issue TWSI start sequence:
- * (both clock/data high, clock high, data low while clock is high)
- */
-static void start_seq(struct hfi1_devdata *dd, u32 target)
-{
-       sda_out(dd, target, 1);
-       scl_out(dd, target, 1);
-       sda_out(dd, target, 0);
-       udelay(1);
-       scl_out(dd, target, 0);
-}
-
-/**
- * stop_seq - transmit the stop sequence
- * @dd: the hfi1_ib device
- *
- * (both clock/data low, clock high, data high while clock is high)
- */
-static void stop_seq(struct hfi1_devdata *dd, u32 target)
-{
-       scl_out(dd, target, 0);
-       sda_out(dd, target, 0);
-       scl_out(dd, target, 1);
-       sda_out(dd, target, 1);
-}
-
-/**
- * stop_cmd - transmit the stop condition
- * @dd: the hfi1_ib device
- *
- * (both clock/data low, clock high, data high while clock is high)
- */
-static void stop_cmd(struct hfi1_devdata *dd, u32 target)
-{
-       stop_seq(dd, target);
-       udelay(TWSI_BUF_WAIT_USEC);
-}
-
-/**
- * hfi1_twsi_reset - reset I2C communication
- * @dd: the hfi1_ib device
- * returns 0 if ok, -EIO on error
- */
-int hfi1_twsi_reset(struct hfi1_devdata *dd, u32 target)
-{
-       int clock_cycles_left = 9;
-       u32 mask;
-
-       /* Both SCL and SDA should be high. If not, there
-        * is something wrong.
-        */
-       mask = QSFP_HFI0_I2CCLK | QSFP_HFI0_I2CDAT;
-
-       /*
-        * Force pins to desired innocuous state.
-        * This is the default power-on state with out=0 and dir=0,
-        * so the pins are tri-stated and should float high (barring HW problems)
-        */
-       hfi1_gpio_mod(dd, target, 0, 0, mask);
-
-       /* Check if SCL is low; if so, a slave device is misbehaving and
-        * there is not much we can do.
-        */
-       if (!scl_in(dd, target, 0))
-               return -EIO;
-
-       /* Check if SDA is low; if so, clock SCL up to 9 times to get the
-        * slave device to release the bus
-        */
-       while (clock_cycles_left--) {
-               if (sda_in(dd, target, 0))
-                       return 0;
-               scl_out(dd, target, 0);
-               scl_out(dd, target, 1);
-       }
-
-       return -EIO;
-}
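
A hypothetical call site (not taken from this driver; dd and target are
assumed to be in scope) would recover the bus before starting a transfer:

	if (hfi1_twsi_reset(dd, target))
		dd_dev_err(dd, "TWSI bus stuck low, skipping transfer\n");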
-
-#define HFI1_TWSI_START 0x100
-#define HFI1_TWSI_STOP 0x200
-
-/* Write byte to TWSI, optionally prefixed with START or suffixed with
- * STOP.
- * returns 0 if OK (ACK received), else != 0
- */
-static int twsi_wr(struct hfi1_devdata *dd, u32 target, int data, int flags)
-{
-       int ret = 1;
-
-       if (flags & HFI1_TWSI_START)
-               start_seq(dd, target);
-
-       /* Leaves SCL low (from i2c_ackrcv()) */
-       ret = wr_byte(dd, target, data);
-
-       if (flags & HFI1_TWSI_STOP)
-               stop_cmd(dd, target);
-       return ret;
-}
-
-/* Added functionality for IBA7220-based cards */
-#define HFI1_TEMP_DEV 0x98
-
-/*
- * hfi1_twsi_blk_rd
- * General interface for data transfer from twsi devices.
- * One vestige of its former role is that it recognizes a device
- * HFI1_TWSI_NO_DEV and does the correct operation for the legacy part,
- * which responded to all TWSI device codes, interpreting them as
- * address within the device. On all other devices found on boards handled
- * by this driver, the device is followed by an N-byte "address" which selects
- * the "register" or "offset" within the device from which data should
- * be read.
- */
-int hfi1_twsi_blk_rd(struct hfi1_devdata *dd, u32 target, int dev, int addr,
-                    void *buffer, int len)
-{
-       u8 *bp = buffer;
-       int ret = 1;
-       int i;
-       int offset_size;
-
-       /* obtain the offset size, strip it from the device address */
-       offset_size = (dev >> 8) & 0xff;
-       dev &= 0xff;
-
-       /* allow at most a 2 byte offset */
-       if (offset_size > 2)
-               goto bail;
-
-       if (dev == HFI1_TWSI_NO_DEV) {
-               /* legacy not-really-I2C */
-               addr = (addr << 1) | READ_CMD;
-               ret = twsi_wr(dd, target, addr, HFI1_TWSI_START);
-       } else {
-               /* Actual I2C */
-               if (offset_size) {
-                       ret = twsi_wr(dd, target,
-                                     dev | WRITE_CMD, HFI1_TWSI_START);
-                       if (ret) {
-                               stop_cmd(dd, target);
-                               goto bail;
-                       }
-
-                       for (i = 0; i < offset_size; i++) {
-                               ret = twsi_wr(dd, target,
-                                             (addr >> (i * 8)) & 0xff, 0);
-                               udelay(TWSI_BUF_WAIT_USEC);
-                               if (ret) {
-                                       dd_dev_err(dd, "Failed to write byte %d of offset 0x%04X\n",
-                                                  i, addr);
-                                       goto bail;
-                               }
-                       }
-               }
-               ret = twsi_wr(dd, target, dev | READ_CMD, HFI1_TWSI_START);
-       }
-       if (ret) {
-               stop_cmd(dd, target);
-               goto bail;
-       }
-
-       /*
-        * block devices keep clocking data out as long as we ack,
-        * automatically incrementing the address. Some have "pages"
-        * whose boundaries will not be crossed, but the handling
-        * of these is left to the caller, who is in a better
-        * position to know.
-        */
-       while (len-- > 0) {
-               /*
-                * Get and store data, sending ACK if length remaining,
-                * else STOP
-                */
-               *bp++ = rd_byte(dd, target, !len);
-       }
-
-       ret = 0;
-
-bail:
-       return ret;
-}
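
The offset size rides in bits 15:8 of the "dev" argument, as parsed at the top
of the function. A usage sketch with hypothetical values (device address 0xA0,
a 1-byte register offset of 0x10, a 4-byte read; dd and target assumed in
scope, and qsfp_lock held per twsi.h):

	u8 buf[4];

	if (hfi1_twsi_blk_rd(dd, target, (1 << 8) | 0xA0, 0x10, buf,
			     sizeof(buf)))
		dd_dev_err(dd, "TWSI block read failed\n");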
-
-/*
- * hfi1_twsi_blk_wr
- * General interface for data transfer to twsi devices.
- * One vestige of its former role is that it recognizes a device
- * HFI1_TWSI_NO_DEV and does the correct operation for the legacy part,
- * which responded to all TWSI device codes, interpreting them as
- * address within the device. On all other devices found on boards handled
- * by this driver, the device is followed by an N-byte "address" which selects
- * the "register" or "offset" within the device to which data should
- * be written.
- */
-int hfi1_twsi_blk_wr(struct hfi1_devdata *dd, u32 target, int dev, int addr,
-                    const void *buffer, int len)
-{
-       const u8 *bp = buffer;
-       int ret = 1;
-       int i;
-       int offset_size;
-
-       /* obtain the offset size, strip it from the device address */
-       offset_size = (dev >> 8) & 0xff;
-       dev &= 0xff;
-
-       /* allow at most a 2 byte offset */
-       if (offset_size > 2)
-               goto bail;
-
-       if (dev == HFI1_TWSI_NO_DEV) {
-               if (twsi_wr(dd, target, (addr << 1) | WRITE_CMD,
-                           HFI1_TWSI_START)) {
-                       goto failed_write;
-               }
-       } else {
-               /* Real I2C */
-               if (twsi_wr(dd, target, dev | WRITE_CMD, HFI1_TWSI_START))
-                       goto failed_write;
-       }
-
-       for (i = 0; i < offset_size; i++) {
-               ret = twsi_wr(dd, target, (addr >> (i * 8)) & 0xff, 0);
-               udelay(TWSI_BUF_WAIT_USEC);
-               if (ret) {
-                       dd_dev_err(dd, "Failed to write byte %d of offset 0x%04X\n",
-                                  i, addr);
-                       goto bail;
-               }
-       }
-
-       for (i = 0; i < len; i++)
-               if (twsi_wr(dd, target, *bp++, 0))
-                       goto failed_write;
-
-       ret = 0;
-
-failed_write:
-       stop_cmd(dd, target);
-
-bail:
-       return ret;
-}
diff --git a/drivers/staging/rdma/hfi1/twsi.h b/drivers/staging/rdma/hfi1/twsi.h
deleted file mode 100644 (file)
index 5b8a5b5..0000000
+++ /dev/null
@@ -1,65 +0,0 @@
-#ifndef _TWSI_H
-#define _TWSI_H
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#define HFI1_TWSI_NO_DEV 0xFF
-
-struct hfi1_devdata;
-
-/* Bit position of SDA/SCL pins in ASIC_QSFP* registers  */
-#define  GPIO_SDA_NUM 1
-#define  GPIO_SCL_NUM 0
-
-/* these functions must be called with qsfp_lock held */
-int hfi1_twsi_reset(struct hfi1_devdata *dd, u32 target);
-int hfi1_twsi_blk_rd(struct hfi1_devdata *dd, u32 target, int dev, int addr,
-                    void *buffer, int len);
-int hfi1_twsi_blk_wr(struct hfi1_devdata *dd, u32 target, int dev, int addr,
-                    const void *buffer, int len);
-
-#endif /* _TWSI_H */
diff --git a/drivers/staging/rdma/hfi1/uc.c b/drivers/staging/rdma/hfi1/uc.c
deleted file mode 100644 (file)
index df773d4..0000000
+++ /dev/null
@@ -1,604 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "hfi.h"
-#include "verbs_txreq.h"
-#include "qp.h"
-
-/* cut down ridiculously long IB macro names */
-#define OP(x) IB_OPCODE_UC_##x
-
-/* only opcode mask for adaptive pio */
-const u32 uc_only_opcode =
-       BIT(OP(SEND_ONLY) & 0x1f) |
-       BIT(OP(SEND_ONLY_WITH_IMMEDIATE & 0x1f)) |
-       BIT(OP(RDMA_WRITE_ONLY & 0x1f)) |
-       BIT(OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE & 0x1f));
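
uc_only_opcode packs the four UC "only" opcodes into a 32-bit mask keyed by
the low five opcode bits. A sketch of how such a mask is typically consulted
(the helper below is invented here for illustration):

	static inline bool uc_is_only_opcode(u32 opcode)
	{
		return !!(uc_only_opcode & BIT(opcode & 0x1f));
	}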
-
-/**
- * hfi1_make_uc_req - construct a request packet (SEND, RDMA write)
- * @qp: a pointer to the QP
- *
- * Assume s_lock is held.
- *
- * Return 1 if constructed; otherwise, return 0.
- */
-int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-       struct hfi1_other_headers *ohdr;
-       struct rvt_swqe *wqe;
-       u32 hwords = 5;
-       u32 bth0 = 0;
-       u32 len;
-       u32 pmtu = qp->pmtu;
-       int middle = 0;
-
-       ps->s_txreq = get_txreq(ps->dev, qp);
-       if (IS_ERR(ps->s_txreq))
-               goto bail_no_tx;
-
-       if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) {
-               if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
-                       goto bail;
-               /* We are in the error state, flush the work request. */
-               smp_read_barrier_depends(); /* see post_one_send() */
-               if (qp->s_last == ACCESS_ONCE(qp->s_head))
-                       goto bail;
-               /* If DMAs are in progress, we can't flush immediately. */
-               if (iowait_sdma_pending(&priv->s_iowait)) {
-                       qp->s_flags |= RVT_S_WAIT_DMA;
-                       goto bail;
-               }
-               clear_ahg(qp);
-               wqe = rvt_get_swqe_ptr(qp, qp->s_last);
-               hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
-               goto done_free_tx;
-       }
-
-       ohdr = &ps->s_txreq->phdr.hdr.u.oth;
-       if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
-               ohdr = &ps->s_txreq->phdr.hdr.u.l.oth;
-
-       /* Get the next send request. */
-       wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
-       qp->s_wqe = NULL;
-       switch (qp->s_state) {
-       default:
-               if (!(ib_rvt_state_ops[qp->state] &
-                   RVT_PROCESS_NEXT_SEND_OK))
-                       goto bail;
-               /* Check if send work queue is empty. */
-               smp_read_barrier_depends(); /* see post_one_send() */
-               if (qp->s_cur == ACCESS_ONCE(qp->s_head)) {
-                       clear_ahg(qp);
-                       goto bail;
-               }
-               /*
-                * Start a new request.
-                */
-               qp->s_psn = wqe->psn;
-               qp->s_sge.sge = wqe->sg_list[0];
-               qp->s_sge.sg_list = wqe->sg_list + 1;
-               qp->s_sge.num_sge = wqe->wr.num_sge;
-               qp->s_sge.total_len = wqe->length;
-               len = wqe->length;
-               qp->s_len = len;
-               switch (wqe->wr.opcode) {
-               case IB_WR_SEND:
-               case IB_WR_SEND_WITH_IMM:
-                       if (len > pmtu) {
-                               qp->s_state = OP(SEND_FIRST);
-                               len = pmtu;
-                               break;
-                       }
-                       if (wqe->wr.opcode == IB_WR_SEND) {
-                               qp->s_state = OP(SEND_ONLY);
-                       } else {
-                               qp->s_state =
-                                       OP(SEND_ONLY_WITH_IMMEDIATE);
-                               /* Immediate data comes after the BTH */
-                               ohdr->u.imm_data = wqe->wr.ex.imm_data;
-                               hwords += 1;
-                       }
-                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-                               bth0 |= IB_BTH_SOLICITED;
-                       qp->s_wqe = wqe;
-                       if (++qp->s_cur >= qp->s_size)
-                               qp->s_cur = 0;
-                       break;
-
-               case IB_WR_RDMA_WRITE:
-               case IB_WR_RDMA_WRITE_WITH_IMM:
-                       ohdr->u.rc.reth.vaddr =
-                               cpu_to_be64(wqe->rdma_wr.remote_addr);
-                       ohdr->u.rc.reth.rkey =
-                               cpu_to_be32(wqe->rdma_wr.rkey);
-                       ohdr->u.rc.reth.length = cpu_to_be32(len);
-                       hwords += sizeof(struct ib_reth) / 4;
-                       if (len > pmtu) {
-                               qp->s_state = OP(RDMA_WRITE_FIRST);
-                               len = pmtu;
-                               break;
-                       }
-                       if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
-                               qp->s_state = OP(RDMA_WRITE_ONLY);
-                       } else {
-                               qp->s_state =
-                                       OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
-                               /* Immediate data comes after the RETH */
-                               ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
-                               hwords += 1;
-                               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-                                       bth0 |= IB_BTH_SOLICITED;
-                       }
-                       qp->s_wqe = wqe;
-                       if (++qp->s_cur >= qp->s_size)
-                               qp->s_cur = 0;
-                       break;
-
-               default:
-                       goto bail;
-               }
-               break;
-
-       case OP(SEND_FIRST):
-               qp->s_state = OP(SEND_MIDDLE);
-               /* FALLTHROUGH */
-       case OP(SEND_MIDDLE):
-               len = qp->s_len;
-               if (len > pmtu) {
-                       len = pmtu;
-                       middle = HFI1_CAP_IS_KSET(SDMA_AHG);
-                       break;
-               }
-               if (wqe->wr.opcode == IB_WR_SEND) {
-                       qp->s_state = OP(SEND_LAST);
-               } else {
-                       qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
-                       /* Immediate data comes after the BTH */
-                       ohdr->u.imm_data = wqe->wr.ex.imm_data;
-                       hwords += 1;
-               }
-               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-                       bth0 |= IB_BTH_SOLICITED;
-               qp->s_wqe = wqe;
-               if (++qp->s_cur >= qp->s_size)
-                       qp->s_cur = 0;
-               break;
-
-       case OP(RDMA_WRITE_FIRST):
-               qp->s_state = OP(RDMA_WRITE_MIDDLE);
-               /* FALLTHROUGH */
-       case OP(RDMA_WRITE_MIDDLE):
-               len = qp->s_len;
-               if (len > pmtu) {
-                       len = pmtu;
-                       middle = HFI1_CAP_IS_KSET(SDMA_AHG);
-                       break;
-               }
-               if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
-                       qp->s_state = OP(RDMA_WRITE_LAST);
-               } else {
-                       qp->s_state =
-                               OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
-                       /* Immediate data comes after the BTH */
-                       ohdr->u.imm_data = wqe->wr.ex.imm_data;
-                       hwords += 1;
-                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-                               bth0 |= IB_BTH_SOLICITED;
-               }
-               qp->s_wqe = wqe;
-               if (++qp->s_cur >= qp->s_size)
-                       qp->s_cur = 0;
-               break;
-       }
-       qp->s_len -= len;
-       qp->s_hdrwords = hwords;
-       ps->s_txreq->sde = priv->s_sde;
-       qp->s_cur_sge = &qp->s_sge;
-       qp->s_cur_size = len;
-       hfi1_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24),
-                            mask_psn(qp->s_psn++), middle, ps);
-       /* pbc */
-       ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
-       return 1;
-
-done_free_tx:
-       hfi1_put_txreq(ps->s_txreq);
-       ps->s_txreq = NULL;
-       return 1;
-
-bail:
-       hfi1_put_txreq(ps->s_txreq);
-
-bail_no_tx:
-       ps->s_txreq = NULL;
-       qp->s_flags &= ~RVT_S_BUSY;
-       qp->s_hdrwords = 0;
-       return 0;
-}
-
-/**
- * hfi1_uc_rcv - handle an incoming UC packet
- * @packet: the incoming packet (header, receive flags, data, length and QP)
- *
- * This is called from qp_rcv() to process an incoming UC packet
- * for the given QP.
- * Called at interrupt level.
- */
-void hfi1_uc_rcv(struct hfi1_packet *packet)
-{
-       struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
-       struct hfi1_ib_header *hdr = packet->hdr;
-       u32 rcv_flags = packet->rcv_flags;
-       void *data = packet->ebuf;
-       u32 tlen = packet->tlen;
-       struct rvt_qp *qp = packet->qp;
-       struct hfi1_other_headers *ohdr = packet->ohdr;
-       u32 bth0, opcode;
-       u32 hdrsize = packet->hlen;
-       u32 psn;
-       u32 pad;
-       struct ib_wc wc;
-       u32 pmtu = qp->pmtu;
-       struct ib_reth *reth;
-       int has_grh = rcv_flags & HFI1_HAS_GRH;
-       int ret;
-       u32 bth1;
-
-       bth0 = be32_to_cpu(ohdr->bth[0]);
-       if (hfi1_ruc_check_hdr(ibp, hdr, has_grh, qp, bth0))
-               return;
-
-       bth1 = be32_to_cpu(ohdr->bth[1]);
-       if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) {
-               if (bth1 & HFI1_BECN_SMASK) {
-                       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-                       u32 rqpn, lqpn;
-                       u16 rlid = be16_to_cpu(hdr->lrh[3]);
-                       u8 sl, sc5;
-
-                       lqpn = bth1 & RVT_QPN_MASK;
-                       rqpn = qp->remote_qpn;
-
-                       sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
-                       sl = ibp->sc_to_sl[sc5];
-
-                       process_becn(ppd, sl, rlid, lqpn, rqpn,
-                                    IB_CC_SVCTYPE_UC);
-               }
-
-               if (bth1 & HFI1_FECN_SMASK) {
-                       struct ib_grh *grh = NULL;
-                       u16 pkey = (u16)be32_to_cpu(ohdr->bth[0]);
-                       u16 slid = be16_to_cpu(hdr->lrh[3]);
-                       u16 dlid = be16_to_cpu(hdr->lrh[1]);
-                       u32 src_qp = qp->remote_qpn;
-                       u8 sc5;
-
-                       sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
-                       if (has_grh)
-                               grh = &hdr->u.l.grh;
-
-                       return_cnp(ibp, qp, src_qp, pkey, dlid, slid, sc5,
-                                  grh);
-               }
-       }
-
-       psn = be32_to_cpu(ohdr->bth[2]);
-       opcode = (bth0 >> 24) & 0xff;
-
-       /* Compare the PSN against the expected PSN. */
-       if (unlikely(cmp_psn(psn, qp->r_psn) != 0)) {
-               /*
-                * Handle a sequence error.
-                * Silently drop any current message.
-                */
-               qp->r_psn = psn;
-inv:
-               if (qp->r_state == OP(SEND_FIRST) ||
-                   qp->r_state == OP(SEND_MIDDLE)) {
-                       set_bit(RVT_R_REWIND_SGE, &qp->r_aflags);
-                       qp->r_sge.num_sge = 0;
-               } else {
-                       rvt_put_ss(&qp->r_sge);
-               }
-               qp->r_state = OP(SEND_LAST);
-               switch (opcode) {
-               case OP(SEND_FIRST):
-               case OP(SEND_ONLY):
-               case OP(SEND_ONLY_WITH_IMMEDIATE):
-                       goto send_first;
-
-               case OP(RDMA_WRITE_FIRST):
-               case OP(RDMA_WRITE_ONLY):
-               case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
-                       goto rdma_first;
-
-               default:
-                       goto drop;
-               }
-       }
-
-       /* Check for opcode sequence errors. */
-       switch (qp->r_state) {
-       case OP(SEND_FIRST):
-       case OP(SEND_MIDDLE):
-               if (opcode == OP(SEND_MIDDLE) ||
-                   opcode == OP(SEND_LAST) ||
-                   opcode == OP(SEND_LAST_WITH_IMMEDIATE))
-                       break;
-               goto inv;
-
-       case OP(RDMA_WRITE_FIRST):
-       case OP(RDMA_WRITE_MIDDLE):
-               if (opcode == OP(RDMA_WRITE_MIDDLE) ||
-                   opcode == OP(RDMA_WRITE_LAST) ||
-                   opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
-                       break;
-               goto inv;
-
-       default:
-               if (opcode == OP(SEND_FIRST) ||
-                   opcode == OP(SEND_ONLY) ||
-                   opcode == OP(SEND_ONLY_WITH_IMMEDIATE) ||
-                   opcode == OP(RDMA_WRITE_FIRST) ||
-                   opcode == OP(RDMA_WRITE_ONLY) ||
-                   opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
-                       break;
-               goto inv;
-       }
-
-       if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
-               qp_comm_est(qp);
-
-       /* OK, process the packet. */
-       switch (opcode) {
-       case OP(SEND_FIRST):
-       case OP(SEND_ONLY):
-       case OP(SEND_ONLY_WITH_IMMEDIATE):
-send_first:
-               if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) {
-                       qp->r_sge = qp->s_rdma_read_sge;
-               } else {
-                       ret = hfi1_rvt_get_rwqe(qp, 0);
-                       if (ret < 0)
-                               goto op_err;
-                       if (!ret)
-                               goto drop;
-                       /*
-                        * qp->s_rdma_read_sge will be the owner
-                        * of the mr references.
-                        */
-                       qp->s_rdma_read_sge = qp->r_sge;
-               }
-               qp->r_rcv_len = 0;
-               if (opcode == OP(SEND_ONLY))
-                       goto no_immediate_data;
-               else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE))
-                       goto send_last_imm;
-               /* FALLTHROUGH */
-       case OP(SEND_MIDDLE):
-               /* Check for invalid length PMTU or posted rwqe len. */
-               if (unlikely(tlen != (hdrsize + pmtu + 4)))
-                       goto rewind;
-               qp->r_rcv_len += pmtu;
-               if (unlikely(qp->r_rcv_len > qp->r_len))
-                       goto rewind;
-               hfi1_copy_sge(&qp->r_sge, data, pmtu, 0, 0);
-               break;
-
-       case OP(SEND_LAST_WITH_IMMEDIATE):
-send_last_imm:
-               wc.ex.imm_data = ohdr->u.imm_data;
-               wc.wc_flags = IB_WC_WITH_IMM;
-               goto send_last;
-       case OP(SEND_LAST):
-no_immediate_data:
-               wc.ex.imm_data = 0;
-               wc.wc_flags = 0;
-send_last:
-               /* Get the number of bytes the message was padded by. */
-               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-               /* Check for invalid length. */
-               /* LAST len should be >= 1 */
-               if (unlikely(tlen < (hdrsize + pad + 4)))
-                       goto rewind;
-               /* Don't count the CRC. */
-               tlen -= (hdrsize + pad + 4);
-               wc.byte_len = tlen + qp->r_rcv_len;
-               if (unlikely(wc.byte_len > qp->r_len))
-                       goto rewind;
-               wc.opcode = IB_WC_RECV;
-               hfi1_copy_sge(&qp->r_sge, data, tlen, 0, 0);
-               rvt_put_ss(&qp->s_rdma_read_sge);
-last_imm:
-               wc.wr_id = qp->r_wr_id;
-               wc.status = IB_WC_SUCCESS;
-               wc.qp = &qp->ibqp;
-               wc.src_qp = qp->remote_qpn;
-               wc.slid = qp->remote_ah_attr.dlid;
-               /*
-                * It seems that IB mandates the presence of an SL in a
-                * work completion only for the UD transport (see section
-                * 11.4.2 of IBTA Vol. 1).
-                *
-                * However, the way the SL is chosen below is consistent
-                * with the way that IB/qib works and is trying to avoid
-                * introducing incompatibilities.
-                *
-                * See also OPA Vol. 1, section 9.7.6, and table 9-17.
-                */
-               wc.sl = qp->remote_ah_attr.sl;
-               /* zero fields that are N/A */
-               wc.vendor_err = 0;
-               wc.pkey_index = 0;
-               wc.dlid_path_bits = 0;
-               wc.port_num = 0;
-               /* Signal completion event if the solicited bit is set. */
-               rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-                            (ohdr->bth[0] &
-                             cpu_to_be32(IB_BTH_SOLICITED)) != 0);
-               break;
-
-       case OP(RDMA_WRITE_FIRST):
-       case OP(RDMA_WRITE_ONLY):
-       case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): /* consume RWQE */
-rdma_first:
-               if (unlikely(!(qp->qp_access_flags &
-                              IB_ACCESS_REMOTE_WRITE))) {
-                       goto drop;
-               }
-               reth = &ohdr->u.rc.reth;
-               qp->r_len = be32_to_cpu(reth->length);
-               qp->r_rcv_len = 0;
-               qp->r_sge.sg_list = NULL;
-               if (qp->r_len != 0) {
-                       u32 rkey = be32_to_cpu(reth->rkey);
-                       u64 vaddr = be64_to_cpu(reth->vaddr);
-                       int ok;
-
-                       /* Check rkey */
-                       ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len,
-                                        vaddr, rkey, IB_ACCESS_REMOTE_WRITE);
-                       if (unlikely(!ok))
-                               goto drop;
-                       qp->r_sge.num_sge = 1;
-               } else {
-                       qp->r_sge.num_sge = 0;
-                       qp->r_sge.sge.mr = NULL;
-                       qp->r_sge.sge.vaddr = NULL;
-                       qp->r_sge.sge.length = 0;
-                       qp->r_sge.sge.sge_length = 0;
-               }
-               if (opcode == OP(RDMA_WRITE_ONLY)) {
-                       goto rdma_last;
-               } else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) {
-                       wc.ex.imm_data = ohdr->u.rc.imm_data;
-                       goto rdma_last_imm;
-               }
-               /* FALLTHROUGH */
-       case OP(RDMA_WRITE_MIDDLE):
-               /* Check for invalid length PMTU or posted rwqe len. */
-               if (unlikely(tlen != (hdrsize + pmtu + 4)))
-                       goto drop;
-               qp->r_rcv_len += pmtu;
-               if (unlikely(qp->r_rcv_len > qp->r_len))
-                       goto drop;
-               hfi1_copy_sge(&qp->r_sge, data, pmtu, 1, 0);
-               break;
-
-       case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
-               wc.ex.imm_data = ohdr->u.imm_data;
-rdma_last_imm:
-               wc.wc_flags = IB_WC_WITH_IMM;
-
-               /* Get the number of bytes the message was padded by. */
-               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-               /* Check for invalid length. */
-               /* LAST len should be >= 1 */
-               if (unlikely(tlen < (hdrsize + pad + 4)))
-                       goto drop;
-               /* Don't count the CRC. */
-               tlen -= (hdrsize + pad + 4);
-               if (unlikely(tlen + qp->r_rcv_len != qp->r_len))
-                       goto drop;
-               if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) {
-                       rvt_put_ss(&qp->s_rdma_read_sge);
-               } else {
-                       ret = hfi1_rvt_get_rwqe(qp, 1);
-                       if (ret < 0)
-                               goto op_err;
-                       if (!ret)
-                               goto drop;
-               }
-               wc.byte_len = qp->r_len;
-               wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
-               hfi1_copy_sge(&qp->r_sge, data, tlen, 1, 0);
-               rvt_put_ss(&qp->r_sge);
-               goto last_imm;
-
-       case OP(RDMA_WRITE_LAST):
-rdma_last:
-               /* Get the number of bytes the message was padded by. */
-               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-               /* Check for invalid length. */
-               /* LAST len should be >= 1 */
-               if (unlikely(tlen < (hdrsize + pad + 4)))
-                       goto drop;
-               /* Don't count the CRC. */
-               tlen -= (hdrsize + pad + 4);
-               if (unlikely(tlen + qp->r_rcv_len != qp->r_len))
-                       goto drop;
-               hfi1_copy_sge(&qp->r_sge, data, tlen, 1, 0);
-               rvt_put_ss(&qp->r_sge);
-               break;
-
-       default:
-               /* Drop packet for unknown opcodes. */
-               goto drop;
-       }
-       qp->r_psn++;
-       qp->r_state = opcode;
-       return;
-
-rewind:
-       set_bit(RVT_R_REWIND_SGE, &qp->r_aflags);
-       qp->r_sge.num_sge = 0;
-drop:
-       ibp->rvp.n_pkt_drops++;
-       return;
-
-op_err:
-       hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
-}
diff --git a/drivers/staging/rdma/hfi1/ud.c b/drivers/staging/rdma/hfi1/ud.c
deleted file mode 100644 (file)
index 1e503ad..0000000
+++ /dev/null
@@ -1,911 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/net.h>
-#include <rdma/ib_smi.h>
-
-#include "hfi.h"
-#include "mad.h"
-#include "verbs_txreq.h"
-#include "qp.h"
-
-/**
- * ud_loopback - handle send on loopback QPs
- * @sqp: the sending QP
- * @swqe: the send work request
- *
- * This is called from hfi1_make_ud_req() to forward a WQE addressed
- * to the same HFI.
- * Note that the receive interrupt handler may be calling hfi1_ud_rcv()
- * while this is being called.
- */
-static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
-{
-       struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num);
-       struct hfi1_pportdata *ppd;
-       struct rvt_qp *qp;
-       struct ib_ah_attr *ah_attr;
-       unsigned long flags;
-       struct rvt_sge_state ssge;
-       struct rvt_sge *sge;
-       struct ib_wc wc;
-       u32 length;
-       enum ib_qp_type sqptype, dqptype;
-
-       rcu_read_lock();
-
-       qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp,
-                           swqe->ud_wr.remote_qpn);
-       if (!qp) {
-               ibp->rvp.n_pkt_drops++;
-               rcu_read_unlock();
-               return;
-       }
-
-       sqptype = sqp->ibqp.qp_type == IB_QPT_GSI ?
-                       IB_QPT_UD : sqp->ibqp.qp_type;
-       dqptype = qp->ibqp.qp_type == IB_QPT_GSI ?
-                       IB_QPT_UD : qp->ibqp.qp_type;
-
-       if (dqptype != sqptype ||
-           !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
-               ibp->rvp.n_pkt_drops++;
-               goto drop;
-       }
-
-       ah_attr = &ibah_to_rvtah(swqe->ud_wr.ah)->attr;
-       ppd = ppd_from_ibp(ibp);
-
-       if (qp->ibqp.qp_num > 1) {
-               u16 pkey;
-               u16 slid;
-               u8 sc5 = ibp->sl_to_sc[ah_attr->sl];
-
-               pkey = hfi1_get_pkey(ibp, sqp->s_pkey_index);
-               slid = ppd->lid | (ah_attr->src_path_bits &
-                                  ((1 << ppd->lmc) - 1));
-               if (unlikely(ingress_pkey_check(ppd, pkey, sc5,
-                                               qp->s_pkey_index, slid))) {
-                       hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY, pkey,
-                                      ah_attr->sl,
-                                      sqp->ibqp.qp_num, qp->ibqp.qp_num,
-                                      slid, ah_attr->dlid);
-                       goto drop;
-               }
-       }
-
-       /*
-        * Check that the qkey matches (except for QP0, see 9.6.1.4.1).
-        * Qkeys with the high order bit set mean use the
-        * qkey from the QP context instead of the WR (see 10.2.5).
-        */
-       if (qp->ibqp.qp_num) {
-               u32 qkey;
-
-               qkey = (int)swqe->ud_wr.remote_qkey < 0 ?
-                       sqp->qkey : swqe->ud_wr.remote_qkey;
-               if (unlikely(qkey != qp->qkey)) {
-                       u16 lid;
-
-                       lid = ppd->lid | (ah_attr->src_path_bits &
-                                         ((1 << ppd->lmc) - 1));
-                       hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_Q_KEY, qkey,
-                                      ah_attr->sl,
-                                      sqp->ibqp.qp_num, qp->ibqp.qp_num,
-                                      lid,
-                                      ah_attr->dlid);
-                       goto drop;
-               }
-       }
-
-       /*
-        * A GRH is expected to precede the data even if not
-        * present on the wire.
-        */
-       length = swqe->length;
-       memset(&wc, 0, sizeof(wc));
-       wc.byte_len = length + sizeof(struct ib_grh);
-
-       if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
-               wc.wc_flags = IB_WC_WITH_IMM;
-               wc.ex.imm_data = swqe->wr.ex.imm_data;
-       }
-
-       spin_lock_irqsave(&qp->r_lock, flags);
-
-       /*
-        * Get the next work request entry to find where to put the data.
-        */
-       if (qp->r_flags & RVT_R_REUSE_SGE) {
-               qp->r_flags &= ~RVT_R_REUSE_SGE;
-       } else {
-               int ret;
-
-               ret = hfi1_rvt_get_rwqe(qp, 0);
-               if (ret < 0) {
-                       hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
-                       goto bail_unlock;
-               }
-               if (!ret) {
-                       if (qp->ibqp.qp_num == 0)
-                               ibp->rvp.n_vl15_dropped++;
-                       goto bail_unlock;
-               }
-       }
-       /* Silently drop packets which are too big. */
-       if (unlikely(wc.byte_len > qp->r_len)) {
-               qp->r_flags |= RVT_R_REUSE_SGE;
-               ibp->rvp.n_pkt_drops++;
-               goto bail_unlock;
-       }
-
-       if (ah_attr->ah_flags & IB_AH_GRH) {
-               hfi1_copy_sge(&qp->r_sge, &ah_attr->grh,
-                             sizeof(struct ib_grh), 1, 0);
-               wc.wc_flags |= IB_WC_GRH;
-       } else {
-               hfi1_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1);
-       }
-       ssge.sg_list = swqe->sg_list + 1;
-       ssge.sge = *swqe->sg_list;
-       ssge.num_sge = swqe->wr.num_sge;
-       sge = &ssge.sge;
-       while (length) {
-               u32 len = sge->length;
-
-               if (len > length)
-                       len = length;
-               if (len > sge->sge_length)
-                       len = sge->sge_length;
-               WARN_ON_ONCE(len == 0);
-               hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, 1, 0);
-               sge->vaddr += len;
-               sge->length -= len;
-               sge->sge_length -= len;
-               if (sge->sge_length == 0) {
-                       if (--ssge.num_sge)
-                               *sge = *ssge.sg_list++;
-               } else if (sge->length == 0 && sge->mr->lkey) {
-                       if (++sge->n >= RVT_SEGSZ) {
-                               if (++sge->m >= sge->mr->mapsz)
-                                       break;
-                               sge->n = 0;
-                       }
-                       sge->vaddr =
-                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
-                       sge->length =
-                               sge->mr->map[sge->m]->segs[sge->n].length;
-               }
-               length -= len;
-       }
-       rvt_put_ss(&qp->r_sge);
-       if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
-               goto bail_unlock;
-       wc.wr_id = qp->r_wr_id;
-       wc.status = IB_WC_SUCCESS;
-       wc.opcode = IB_WC_RECV;
-       wc.qp = &qp->ibqp;
-       wc.src_qp = sqp->ibqp.qp_num;
-       if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) {
-               if (sqp->ibqp.qp_type == IB_QPT_GSI ||
-                   sqp->ibqp.qp_type == IB_QPT_SMI)
-                       wc.pkey_index = swqe->ud_wr.pkey_index;
-               else
-                       wc.pkey_index = sqp->s_pkey_index;
-       } else {
-               wc.pkey_index = 0;
-       }
-       wc.slid = ppd->lid | (ah_attr->src_path_bits & ((1 << ppd->lmc) - 1));
-       /* Check for loopback when the port lid is not set */
-       if (wc.slid == 0 && sqp->ibqp.qp_type == IB_QPT_GSI)
-               wc.slid = be16_to_cpu(IB_LID_PERMISSIVE);
-       wc.sl = ah_attr->sl;
-       wc.dlid_path_bits = ah_attr->dlid & ((1 << ppd->lmc) - 1);
-       wc.port_num = qp->port_num;
-       /* Signal completion event if the solicited bit is set. */
-       rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-                    swqe->wr.send_flags & IB_SEND_SOLICITED);
-       ibp->rvp.n_loop_pkts++;
-bail_unlock:
-       spin_unlock_irqrestore(&qp->r_lock, flags);
-drop:
-       rcu_read_unlock();
-}
-
-/**
- * hfi1_make_ud_req - construct a UD request packet
- * @qp: the QP
- *
- * Assume s_lock is held.
- *
- * Return 1 if constructed; otherwise, return 0.
- */
-int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-       struct hfi1_other_headers *ohdr;
-       struct ib_ah_attr *ah_attr;
-       struct hfi1_pportdata *ppd;
-       struct hfi1_ibport *ibp;
-       struct rvt_swqe *wqe;
-       u32 nwords;
-       u32 extra_bytes;
-       u32 bth0;
-       u16 lrh0;
-       u16 lid;
-       int next_cur;
-       u8 sc5;
-
-       ps->s_txreq = get_txreq(ps->dev, qp);
-       if (IS_ERR(ps->s_txreq))
-               goto bail_no_tx;
-
-       if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK)) {
-               if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
-                       goto bail;
-               /* We are in the error state, flush the work request. */
-               smp_read_barrier_depends(); /* see post_one_send */
-               if (qp->s_last == ACCESS_ONCE(qp->s_head))
-                       goto bail;
-               /* If DMAs are in progress, we can't flush immediately. */
-               if (iowait_sdma_pending(&priv->s_iowait)) {
-                       qp->s_flags |= RVT_S_WAIT_DMA;
-                       goto bail;
-               }
-               wqe = rvt_get_swqe_ptr(qp, qp->s_last);
-               hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
-               goto done_free_tx;
-       }
-
-       /* see post_one_send() */
-       smp_read_barrier_depends();
-       if (qp->s_cur == ACCESS_ONCE(qp->s_head))
-               goto bail;
-
-       wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
-       next_cur = qp->s_cur + 1;
-       if (next_cur >= qp->s_size)
-               next_cur = 0;
-
-       /* Construct the header. */
-       ibp = to_iport(qp->ibqp.device, qp->port_num);
-       ppd = ppd_from_ibp(ibp);
-       ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr;
-       if (ah_attr->dlid < be16_to_cpu(IB_MULTICAST_LID_BASE) ||
-           ah_attr->dlid == be16_to_cpu(IB_LID_PERMISSIVE)) {
-               lid = ah_attr->dlid & ~((1 << ppd->lmc) - 1);
-               if (unlikely(!loopback &&
-                            (lid == ppd->lid ||
-                             (lid == be16_to_cpu(IB_LID_PERMISSIVE) &&
-                             qp->ibqp.qp_type == IB_QPT_GSI)))) {
-                       unsigned long tflags = ps->flags;
-                       /*
-                        * If DMAs are in progress, we can't generate
-                        * a completion for the loopback packet since
-                        * it would be out of order.
-                        * Instead of waiting, we could queue a
-                        * zero length descriptor so we get a callback.
-                        */
-                       if (iowait_sdma_pending(&priv->s_iowait)) {
-                               qp->s_flags |= RVT_S_WAIT_DMA;
-                               goto bail;
-                       }
-                       qp->s_cur = next_cur;
-                       spin_unlock_irqrestore(&qp->s_lock, tflags);
-                       ud_loopback(qp, wqe);
-                       spin_lock_irqsave(&qp->s_lock, tflags);
-                       ps->flags = tflags;
-                       hfi1_send_complete(qp, wqe, IB_WC_SUCCESS);
-                       goto done_free_tx;
-               }
-       }
-
-       qp->s_cur = next_cur;
-       extra_bytes = -wqe->length & 3;
-       nwords = (wqe->length + extra_bytes) >> 2;
-
-       /* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */
-       qp->s_hdrwords = 7;
-       qp->s_cur_size = wqe->length;
-       qp->s_cur_sge = &qp->s_sge;
-       qp->s_srate = ah_attr->static_rate;
-       qp->srate_mbps = ib_rate_to_mbps(qp->s_srate);
-       qp->s_wqe = wqe;
-       qp->s_sge.sge = wqe->sg_list[0];
-       qp->s_sge.sg_list = wqe->sg_list + 1;
-       qp->s_sge.num_sge = wqe->wr.num_sge;
-       qp->s_sge.total_len = wqe->length;
-
-       if (ah_attr->ah_flags & IB_AH_GRH) {
-               /* Header size in 32-bit words. */
-               qp->s_hdrwords += hfi1_make_grh(ibp,
-                                               &ps->s_txreq->phdr.hdr.u.l.grh,
-                                               &ah_attr->grh,
-                                               qp->s_hdrwords, nwords);
-               lrh0 = HFI1_LRH_GRH;
-               ohdr = &ps->s_txreq->phdr.hdr.u.l.oth;
-               /*
-                * Don't worry about sending to locally attached multicast
-                * QPs.  What happens is unspecified by the spec.
-                */
-       } else {
-               /* Header size in 32-bit words. */
-               lrh0 = HFI1_LRH_BTH;
-               ohdr = &ps->s_txreq->phdr.hdr.u.oth;
-       }
-       if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
-               qp->s_hdrwords++;
-               ohdr->u.ud.imm_data = wqe->wr.ex.imm_data;
-               bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24;
-       } else {
-               bth0 = IB_OPCODE_UD_SEND_ONLY << 24;
-       }
-       sc5 = ibp->sl_to_sc[ah_attr->sl];
-       lrh0 |= (ah_attr->sl & 0xf) << 4;
-       if (qp->ibqp.qp_type == IB_QPT_SMI) {
-               lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */
-               priv->s_sc = 0xf;
-       } else {
-               lrh0 |= (sc5 & 0xf) << 12;
-               priv->s_sc = sc5;
-       }
-       priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc);
-       ps->s_txreq->sde = priv->s_sde;
-       priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc);
-       ps->s_txreq->psc = priv->s_sendcontext;
-       ps->s_txreq->phdr.hdr.lrh[0] = cpu_to_be16(lrh0);
-       ps->s_txreq->phdr.hdr.lrh[1] = cpu_to_be16(ah_attr->dlid);
-       ps->s_txreq->phdr.hdr.lrh[2] =
-               cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC);
-       if (ah_attr->dlid == be16_to_cpu(IB_LID_PERMISSIVE)) {
-               ps->s_txreq->phdr.hdr.lrh[3] = IB_LID_PERMISSIVE;
-       } else {
-               lid = ppd->lid;
-               if (lid) {
-                       lid |= ah_attr->src_path_bits & ((1 << ppd->lmc) - 1);
-                       ps->s_txreq->phdr.hdr.lrh[3] = cpu_to_be16(lid);
-               } else {
-                       ps->s_txreq->phdr.hdr.lrh[3] = IB_LID_PERMISSIVE;
-               }
-       }
-       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-               bth0 |= IB_BTH_SOLICITED;
-       bth0 |= extra_bytes << 20;
-       if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI)
-               bth0 |= hfi1_get_pkey(ibp, wqe->ud_wr.pkey_index);
-       else
-               bth0 |= hfi1_get_pkey(ibp, qp->s_pkey_index);
-       ohdr->bth[0] = cpu_to_be32(bth0);
-       ohdr->bth[1] = cpu_to_be32(wqe->ud_wr.remote_qpn);
-       ohdr->bth[2] = cpu_to_be32(mask_psn(wqe->psn));
-       /*
-        * Qkeys with the high order bit set mean use the
-        * qkey from the QP context instead of the WR (see 10.2.5).
-        */
-       ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.remote_qkey < 0 ?
-                                        qp->qkey : wqe->ud_wr.remote_qkey);
-       ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
-       /* disarm any ahg */
-       priv->s_hdr->ahgcount = 0;
-       priv->s_hdr->ahgidx = 0;
-       priv->s_hdr->tx_flags = 0;
-       priv->s_hdr->sde = NULL;
-       /* pbc */
-       ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
-
-       return 1;
-
-done_free_tx:
-       hfi1_put_txreq(ps->s_txreq);
-       ps->s_txreq = NULL;
-       return 1;
-
-bail:
-       hfi1_put_txreq(ps->s_txreq);
-
-bail_no_tx:
-       ps->s_txreq = NULL;
-       qp->s_flags &= ~RVT_S_BUSY;
-       qp->s_hdrwords = 0;
-       return 0;
-}
-
-/*
- * Hardware can't check this so we do it here.
- *
- * This is a slightly different algorithm than the standard pkey check.  It
- * special cases the management keys and allows for 0x7fff and 0xffff to be in
- * the table at the same time.
- *
- * @returns the index found or -1 if not found
- */
-int hfi1_lookup_pkey_idx(struct hfi1_ibport *ibp, u16 pkey)
-{
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       unsigned i;
-
-       if (pkey == FULL_MGMT_P_KEY || pkey == LIM_MGMT_P_KEY) {
-               unsigned lim_idx = -1;
-
-               for (i = 0; i < ARRAY_SIZE(ppd->pkeys); ++i) {
-                       /* here we look for an exact match */
-                       if (ppd->pkeys[i] == pkey)
-                               return i;
-                       if (ppd->pkeys[i] == LIM_MGMT_P_KEY)
-                               lim_idx = i;
-               }
-
-               /* did not find 0xffff; return 0x7fff idx if found */
-               if (pkey == FULL_MGMT_P_KEY)
-                       return lim_idx;
-
-               /* no match...  */
-               return -1;
-       }
-
-       pkey &= 0x7fff; /* remove limited/full membership bit */
-
-       for (i = 0; i < ARRAY_SIZE(ppd->pkeys); ++i)
-               if ((ppd->pkeys[i] & 0x7fff) == pkey)
-                       return i;
-
-       /*
-        * Should not get here; this means hardware failed to validate pkeys.
-        */
-       return -1;
-}
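/*
 * For example, given a hypothetical pkey table { 0x8001, 0x7fff, 0xffff }:
 * looking up FULL_MGMT_P_KEY (0xffff) returns index 2 by exact match, and
 * would return index 1 (the LIM_MGMT_P_KEY slot) if 0xffff were absent;
 * looking up 0x0001 returns index 0, because the limited/full membership
 * bit is masked off before the comparison against 0x8001.
 */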
-
-void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn,
-               u32 pkey, u32 slid, u32 dlid, u8 sc5,
-               const struct ib_grh *old_grh)
-{
-       u64 pbc, pbc_flags = 0;
-       u32 bth0, plen, vl, hwords = 5;
-       u16 lrh0;
-       u8 sl = ibp->sc_to_sl[sc5];
-       struct hfi1_ib_header hdr;
-       struct hfi1_other_headers *ohdr;
-       struct pio_buf *pbuf;
-       struct send_context *ctxt = qp_to_send_context(qp, sc5);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-
-       if (old_grh) {
-               struct ib_grh *grh = &hdr.u.l.grh;
-
-               grh->version_tclass_flow = old_grh->version_tclass_flow;
-               grh->paylen = cpu_to_be16((hwords - 2 + SIZE_OF_CRC) << 2);
-               grh->hop_limit = 0xff;
-               grh->sgid = old_grh->dgid;
-               grh->dgid = old_grh->sgid;
-               ohdr = &hdr.u.l.oth;
-               lrh0 = HFI1_LRH_GRH;
-               hwords += sizeof(struct ib_grh) / sizeof(u32);
-       } else {
-               ohdr = &hdr.u.oth;
-               lrh0 = HFI1_LRH_BTH;
-       }
-
-       lrh0 |= (sc5 & 0xf) << 12 | sl << 4;
-
-       bth0 = pkey | (IB_OPCODE_CNP << 24);
-       ohdr->bth[0] = cpu_to_be32(bth0);
-
-       ohdr->bth[1] = cpu_to_be32(remote_qpn | (1 << HFI1_BECN_SHIFT));
-       ohdr->bth[2] = 0; /* PSN 0 */
-
-       hdr.lrh[0] = cpu_to_be16(lrh0);
-       hdr.lrh[1] = cpu_to_be16(dlid);
-       hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
-       hdr.lrh[3] = cpu_to_be16(slid);
-
-       plen = 2 /* PBC */ + hwords;
-       pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
-       vl = sc_to_vlt(ppd->dd, sc5);
-       pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
-       if (ctxt) {
-               pbuf = sc_buffer_alloc(ctxt, plen, NULL, NULL);
-               if (pbuf)
-                       ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc,
-                                                &hdr, hwords);
-       }
-}
-
-/*
- * opa_smp_check() - Do the regular pkey checking, and the additional
- * checks for SMPs specified in OPAv1 rev 0.90, section 9.10.26
- * ("SMA Packet Checks").
- *
- * Note that:
- *   - Checks are done using the pkey directly from the packet's BTH,
- *     and specifically _not_ the pkey that we attach to the completion,
- *     which may be different.
- *   - These checks are specifically for "non-local" SMPs (i.e., SMPs
- *     which originated on another node). SMPs which are sent from, and
- *     destined to this node are checked in opa_local_smp_check().
- *
- * At the point where opa_smp_check() is called, we know:
- *   - destination QP is QP0
- *
- * opa_smp_check() returns 0 if all checks succeed, 1 otherwise.
- */
-static int opa_smp_check(struct hfi1_ibport *ibp, u16 pkey, u8 sc5,
-                        struct rvt_qp *qp, u16 slid, struct opa_smp *smp)
-{
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-
-       /*
-        * I don't think it's possible for us to get here with sc != 0xf,
-        * but check it to be certain.
-        */
-       if (sc5 != 0xf)
-               return 1;
-
-       if (rcv_pkey_check(ppd, pkey, sc5, slid))
-               return 1;
-
-       /*
-        * At this point we know (and so don't need to check again) that
-        * the pkey is either LIM_MGMT_P_KEY, or FULL_MGMT_P_KEY
-        * (see ingress_pkey_check).
-        */
-       if (smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE &&
-           smp->mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED) {
-               ingress_pkey_table_fail(ppd, pkey, slid);
-               return 1;
-       }
-
-       /*
-        * SMPs fall into one of four (disjoint) categories:
-        * SMA request, SMA response, trap, or trap repress.
-        * Our response depends, in part, on which type of
-        * SMP we're processing.
-        *
-        * If this is not an SMA request, or trap repress:
-        *   - accept MAD if the port is running an SM
-        *   - pkey == FULL_MGMT_P_KEY =>
-        *       reply with unsupported method (i.e., just mark
-        *       the smp's status field here, and let it be
-        *       processed normally)
-        *   - pkey != LIM_MGMT_P_KEY =>
-        *       increment port recv constraint errors, drop MAD
-        * If this is an SMA request or trap repress:
-        *   - pkey != FULL_MGMT_P_KEY =>
-        *       increment port recv constraint errors, drop MAD
-        */
-       switch (smp->method) {
-       case IB_MGMT_METHOD_GET:
-       case IB_MGMT_METHOD_SET:
-       case IB_MGMT_METHOD_REPORT:
-       case IB_MGMT_METHOD_TRAP_REPRESS:
-               if (pkey != FULL_MGMT_P_KEY) {
-                       ingress_pkey_table_fail(ppd, pkey, slid);
-                       return 1;
-               }
-               break;
-       case IB_MGMT_METHOD_SEND:
-       case IB_MGMT_METHOD_TRAP:
-       case IB_MGMT_METHOD_GET_RESP:
-       case IB_MGMT_METHOD_REPORT_RESP:
-               if (ibp->rvp.port_cap_flags & IB_PORT_SM)
-                       return 0;
-               if (pkey == FULL_MGMT_P_KEY) {
-                       smp->status |= IB_SMP_UNSUP_METHOD;
-                       return 0;
-               }
-               if (pkey != LIM_MGMT_P_KEY) {
-                       ingress_pkey_table_fail(ppd, pkey, slid);
-                       return 1;
-               }
-               break;
-       default:
-               break;
-       }
-       return 0;
-}
-
-/**
- * hfi1_ud_rcv - receive an incoming UD packet
- * @packet: the incoming packet (header, data, length, receive flags, and QP)
- *
- * This is called from qp_rcv() to process an incoming UD packet
- * for the given QP.
- * Called at interrupt level.
- */
-void hfi1_ud_rcv(struct hfi1_packet *packet)
-{
-       struct hfi1_other_headers *ohdr = packet->ohdr;
-       int opcode;
-       u32 hdrsize = packet->hlen;
-       u32 pad;
-       struct ib_wc wc;
-       u32 qkey;
-       u32 src_qp;
-       u16 dlid, pkey;
-       int mgmt_pkey_idx = -1;
-       struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
-       struct hfi1_ib_header *hdr = packet->hdr;
-       u32 rcv_flags = packet->rcv_flags;
-       void *data = packet->ebuf;
-       u32 tlen = packet->tlen;
-       struct rvt_qp *qp = packet->qp;
-       bool has_grh = rcv_flags & HFI1_HAS_GRH;
-       bool sc4_bit = has_sc4_bit(packet);
-       u8 sc;
-       u32 bth1;
-       int is_mcast;
-       struct ib_grh *grh = NULL;
-
-       qkey = be32_to_cpu(ohdr->u.ud.deth[0]);
-       src_qp = be32_to_cpu(ohdr->u.ud.deth[1]) & RVT_QPN_MASK;
-       dlid = be16_to_cpu(hdr->lrh[1]);
-       is_mcast = (dlid > be16_to_cpu(IB_MULTICAST_LID_BASE)) &&
-                       (dlid != be16_to_cpu(IB_LID_PERMISSIVE));
-       bth1 = be32_to_cpu(ohdr->bth[1]);
-       if (unlikely(bth1 & HFI1_BECN_SMASK)) {
-               /*
-                * In pre-B0 h/w the CNP_OPCODE is handled via an
-                * error path.
-                */
-               struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-               u32 lqpn =  be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
-               u8 sl, sc5;
-
-               sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
-               sc5 |= sc4_bit;
-               sl = ibp->sc_to_sl[sc5];
-
-               process_becn(ppd, sl, 0, lqpn, 0, IB_CC_SVCTYPE_UD);
-       }
-
-       /*
-        * The opcode is in the low byte when it's in network order
-        * (top byte when in host order).
-        */
-       opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
-       opcode &= 0xff;
-
-       pkey = (u16)be32_to_cpu(ohdr->bth[0]);
-
-       if (!is_mcast && (opcode != IB_OPCODE_CNP) && bth1 & HFI1_FECN_SMASK) {
-               u16 slid = be16_to_cpu(hdr->lrh[3]);
-               u8 sc5;
-
-               sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
-               sc5 |= sc4_bit;
-
-               return_cnp(ibp, qp, src_qp, pkey, dlid, slid, sc5, grh);
-       }
-       /*
-        * Get the number of bytes the message was padded by
-        * and drop incomplete packets.
-        */
-       pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-       if (unlikely(tlen < (hdrsize + pad + 4)))
-               goto drop;
-
-       tlen -= hdrsize + pad + 4;
-
-       /*
-        * Check that the permissive LID is only used on QP0
-        * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1).
-        */
-       if (qp->ibqp.qp_num) {
-               if (unlikely(hdr->lrh[1] == IB_LID_PERMISSIVE ||
-                            hdr->lrh[3] == IB_LID_PERMISSIVE))
-                       goto drop;
-               if (qp->ibqp.qp_num > 1) {
-                       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-                       u16 slid;
-                       u8 sc5;
-
-                       sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
-                       sc5 |= sc4_bit;
-
-                       slid = be16_to_cpu(hdr->lrh[3]);
-                       if (unlikely(rcv_pkey_check(ppd, pkey, sc5, slid))) {
-                               /*
-                                * Traps will not be sent for packets dropped
-                                * by the HW. This is fine, as sending trap
-                                * for invalid pkeys is optional according to
-                                * IB spec (release 1.3, section 10.9.4)
-                                */
-                               hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY,
-                                              pkey,
-                                              (be16_to_cpu(hdr->lrh[0]) >> 4) &
-                                               0xF,
-                                              src_qp, qp->ibqp.qp_num,
-                                              be16_to_cpu(hdr->lrh[3]),
-                                              be16_to_cpu(hdr->lrh[1]));
-                               return;
-                       }
-               } else {
-                       /* GSI packet */
-                       mgmt_pkey_idx = hfi1_lookup_pkey_idx(ibp, pkey);
-                       if (mgmt_pkey_idx < 0)
-                               goto drop;
-               }
-               if (unlikely(qkey != qp->qkey)) {
-                       hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_Q_KEY, qkey,
-                                      (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
-                                      src_qp, qp->ibqp.qp_num,
-                                      be16_to_cpu(hdr->lrh[3]),
-                                      be16_to_cpu(hdr->lrh[1]));
-                       return;
-               }
-               /* Drop invalid MAD packets (see 13.5.3.1). */
-               if (unlikely(qp->ibqp.qp_num == 1 &&
-                            (tlen > 2048 ||
-                             (be16_to_cpu(hdr->lrh[0]) >> 12) == 15)))
-                       goto drop;
-       } else {
-               /* Received on QP0, and so by definition, this is an SMP */
-               struct opa_smp *smp = (struct opa_smp *)data;
-               u16 slid = be16_to_cpu(hdr->lrh[3]);
-               u8 sc5;
-
-               sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
-               sc5 |= sc4_bit;
-
-               if (opa_smp_check(ibp, pkey, sc5, qp, slid, smp))
-                       goto drop;
-
-               if (tlen > 2048)
-                       goto drop;
-               if ((hdr->lrh[1] == IB_LID_PERMISSIVE ||
-                    hdr->lrh[3] == IB_LID_PERMISSIVE) &&
-                   smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
-                       goto drop;
-
-               /* look up SMI pkey */
-               mgmt_pkey_idx = hfi1_lookup_pkey_idx(ibp, pkey);
-               if (mgmt_pkey_idx < 0)
-                       goto drop;
-       }
-
-       if (qp->ibqp.qp_num > 1 &&
-           opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) {
-               wc.ex.imm_data = ohdr->u.ud.imm_data;
-               wc.wc_flags = IB_WC_WITH_IMM;
-               tlen -= sizeof(u32);
-       } else if (opcode == IB_OPCODE_UD_SEND_ONLY) {
-               wc.ex.imm_data = 0;
-               wc.wc_flags = 0;
-       } else {
-               goto drop;
-       }
-
-       /*
-        * A GRH is expected to precede the data even if not
-        * present on the wire.
-        */
-       wc.byte_len = tlen + sizeof(struct ib_grh);
-
-       /*
-        * Get the next work request entry to find where to put the data.
-        */
-       if (qp->r_flags & RVT_R_REUSE_SGE) {
-               qp->r_flags &= ~RVT_R_REUSE_SGE;
-       } else {
-               int ret;
-
-               ret = hfi1_rvt_get_rwqe(qp, 0);
-               if (ret < 0) {
-                       hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
-                       return;
-               }
-               if (!ret) {
-                       if (qp->ibqp.qp_num == 0)
-                               ibp->rvp.n_vl15_dropped++;
-                       return;
-               }
-       }
-       /* Silently drop packets which are too big. */
-       if (unlikely(wc.byte_len > qp->r_len)) {
-               qp->r_flags |= RVT_R_REUSE_SGE;
-               goto drop;
-       }
-       if (has_grh) {
-               hfi1_copy_sge(&qp->r_sge, &hdr->u.l.grh,
-                             sizeof(struct ib_grh), 1, 0);
-               wc.wc_flags |= IB_WC_GRH;
-       } else {
-               hfi1_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1);
-       }
-       hfi1_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh),
-                     1, 0);
-       rvt_put_ss(&qp->r_sge);
-       if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
-               return;
-       wc.wr_id = qp->r_wr_id;
-       wc.status = IB_WC_SUCCESS;
-       wc.opcode = IB_WC_RECV;
-       wc.vendor_err = 0;
-       wc.qp = &qp->ibqp;
-       wc.src_qp = src_qp;
-
-       if (qp->ibqp.qp_type == IB_QPT_GSI ||
-           qp->ibqp.qp_type == IB_QPT_SMI) {
-               if (mgmt_pkey_idx < 0) {
-                       if (net_ratelimit()) {
-                               struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-                               struct hfi1_devdata *dd = ppd->dd;
-
-                               dd_dev_err(dd, "QP type %d mgmt_pkey_idx < 0 and packet not dropped???\n",
-                                          qp->ibqp.qp_type);
-                               mgmt_pkey_idx = 0;
-                       }
-               }
-               wc.pkey_index = (unsigned)mgmt_pkey_idx;
-       } else {
-               wc.pkey_index = 0;
-       }
-
-       wc.slid = be16_to_cpu(hdr->lrh[3]);
-       sc = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
-       sc |= sc4_bit;
-       wc.sl = ibp->sc_to_sl[sc];
-
-       /*
-        * Save the LMC lower bits if the destination LID is a unicast LID.
-        */
-       wc.dlid_path_bits = dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE) ? 0 :
-               dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1);
-       wc.port_num = qp->port_num;
-       /* Signal completion event if the solicited bit is set. */
-       rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-                    (ohdr->bth[0] &
-                     cpu_to_be32(IB_BTH_SOLICITED)) != 0);
-       return;
-
-drop:
-       ibp->rvp.n_pkt_drops++;
-}
diff --git a/drivers/staging/rdma/hfi1/user_exp_rcv.c b/drivers/staging/rdma/hfi1/user_exp_rcv.c
deleted file mode 100644 (file)
index 1b640a3..0000000
+++ /dev/null
@@ -1,1050 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#include <asm/page.h>
-
-#include "user_exp_rcv.h"
-#include "trace.h"
-#include "mmu_rb.h"
-
-struct tid_group {
-       struct list_head list;
-       unsigned base;
-       u8 size;
-       u8 used;
-       u8 map;
-};
-
-struct tid_rb_node {
-       struct mmu_rb_node mmu;
-       unsigned long phys;
-       struct tid_group *grp;
-       u32 rcventry;
-       dma_addr_t dma_addr;
-       bool freed;
-       unsigned npages;
-       struct page *pages[0];
-};
-
-struct tid_pageset {
-       u16 idx;
-       u16 count;
-};
-
-#define EXP_TID_SET_EMPTY(set) (set.count == 0 && list_empty(&set.list))
-
-#define num_user_pages(vaddr, len)                                    \
-       (1 + (((((unsigned long)(vaddr) +                              \
-                (unsigned long)(len) - 1) & PAGE_MASK) -              \
-              ((unsigned long)vaddr & PAGE_MASK)) >> PAGE_SHIFT))
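/*
 * For example, assuming 4 KiB pages (PAGE_SHIFT = 12), a buffer at
 * vaddr = 0x1003 with len = 0x2000 ends at byte 0x3002 and touches
 * pages 0x1000, 0x2000, and 0x3000, so:
 *
 *   num_user_pages(0x1003, 0x2000)
 *     = 1 + (((0x3002 & PAGE_MASK) - (0x1003 & PAGE_MASK)) >> PAGE_SHIFT)
 *     = 1 + ((0x3000 - 0x1000) >> 12)
 *     = 3
 */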
-
-static void unlock_exp_tids(struct hfi1_ctxtdata *, struct exp_tid_set *,
-                           struct rb_root *);
-static u32 find_phys_blocks(struct page **, unsigned, struct tid_pageset *);
-static int set_rcvarray_entry(struct file *, unsigned long, u32,
-                             struct tid_group *, struct page **, unsigned);
-static int mmu_rb_insert(struct rb_root *, struct mmu_rb_node *);
-static void mmu_rb_remove(struct rb_root *, struct mmu_rb_node *,
-                         struct mm_struct *);
-static int mmu_rb_invalidate(struct rb_root *, struct mmu_rb_node *);
-static int program_rcvarray(struct file *, unsigned long, struct tid_group *,
-                           struct tid_pageset *, unsigned, u16, struct page **,
-                           u32 *, unsigned *, unsigned *);
-static int unprogram_rcvarray(struct file *, u32, struct tid_group **);
-static void clear_tid_node(struct hfi1_filedata *, u16, struct tid_rb_node *);
-
-static struct mmu_rb_ops tid_rb_ops = {
-       .insert = mmu_rb_insert,
-       .remove = mmu_rb_remove,
-       .invalidate = mmu_rb_invalidate
-};
-
-static inline u32 rcventry2tidinfo(u32 rcventry)
-{
-       u32 pair = rcventry & ~0x1;
-
-       return EXP_TID_SET(IDX, pair >> 1) |
-               EXP_TID_SET(CTRL, 1 << (rcventry - pair));
-}
-
-static inline void exp_tid_group_init(struct exp_tid_set *set)
-{
-       INIT_LIST_HEAD(&set->list);
-       set->count = 0;
-}
-
-static inline void tid_group_remove(struct tid_group *grp,
-                                   struct exp_tid_set *set)
-{
-       list_del_init(&grp->list);
-       set->count--;
-}
-
-static inline void tid_group_add_tail(struct tid_group *grp,
-                                     struct exp_tid_set *set)
-{
-       list_add_tail(&grp->list, &set->list);
-       set->count++;
-}
-
-static inline struct tid_group *tid_group_pop(struct exp_tid_set *set)
-{
-       struct tid_group *grp =
-               list_first_entry(&set->list, struct tid_group, list);
-       list_del_init(&grp->list);
-       set->count--;
-       return grp;
-}
-
-static inline void tid_group_move(struct tid_group *group,
-                                 struct exp_tid_set *s1,
-                                 struct exp_tid_set *s2)
-{
-       tid_group_remove(group, s1);
-       tid_group_add_tail(group, s2);
-}
-
-/*
- * Initialize context and file private data needed for Expected
- * receive caching. This needs to be done after the context has
- * been configured with the eager/expected RcvEntry counts.
- */
-int hfi1_user_exp_rcv_init(struct file *fp)
-{
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       struct hfi1_devdata *dd = uctxt->dd;
-       unsigned tidbase;
-       int i, ret = 0;
-
-       spin_lock_init(&fd->tid_lock);
-       spin_lock_init(&fd->invalid_lock);
-       fd->tid_rb_root = RB_ROOT;
-
-       if (!uctxt->subctxt_cnt || !fd->subctxt) {
-               exp_tid_group_init(&uctxt->tid_group_list);
-               exp_tid_group_init(&uctxt->tid_used_list);
-               exp_tid_group_init(&uctxt->tid_full_list);
-
-               tidbase = uctxt->expected_base;
-               for (i = 0; i < uctxt->expected_count /
-                            dd->rcv_entries.group_size; i++) {
-                       struct tid_group *grp;
-
-                       grp = kzalloc(sizeof(*grp), GFP_KERNEL);
-                       if (!grp) {
-                               /*
-                                * If we fail here, the groups already
-                                * allocated will be freed by the close
-                                * call.
-                                */
-                               ret = -ENOMEM;
-                               goto done;
-                       }
-                       grp->size = dd->rcv_entries.group_size;
-                       grp->base = tidbase;
-                       tid_group_add_tail(grp, &uctxt->tid_group_list);
-                       tidbase += dd->rcv_entries.group_size;
-               }
-       }
-
-       fd->entry_to_rb = kcalloc(uctxt->expected_count,
-                                    sizeof(struct rb_node *),
-                                    GFP_KERNEL);
-       if (!fd->entry_to_rb)
-               return -ENOMEM;
-
-       if (!HFI1_CAP_IS_USET(TID_UNMAP)) {
-               fd->invalid_tid_idx = 0;
-               fd->invalid_tids = kzalloc(uctxt->expected_count *
-                                          sizeof(u32), GFP_KERNEL);
-               if (!fd->invalid_tids) {
-                       ret = -ENOMEM;
-                       goto done;
-               }
-
-               /*
-                * Register MMU notifier callbacks. If the registration
-                * fails, continue but turn off the TID caching for
-                * all user contexts.
-                */
-               ret = hfi1_mmu_rb_register(&fd->tid_rb_root, &tid_rb_ops);
-               if (ret) {
-                       dd_dev_info(dd,
-                                   "Failed MMU notifier registration %d\n",
-                                   ret);
-                       HFI1_CAP_USET(TID_UNMAP);
-                       ret = 0;
-               }
-       }
-
-       /*
-        * PSM does not have a good way to separate, count, and
-        * effectively enforce a limit on RcvArray entries used by
-        * subctxts (when context sharing is used) when TID caching
-        * is enabled. To help with that, we calculate a per-process
-        * RcvArray entry share and enforce that.
-        * If TID caching is not in use, PSM deals with usage on its
-        * own. In that case, we allow any subctxt to take all of the
-        * entries.
-        *
-        * Make sure that we set the tid counts only after successful
-        * init.
-        */
-       spin_lock(&fd->tid_lock);
-       if (uctxt->subctxt_cnt && !HFI1_CAP_IS_USET(TID_UNMAP)) {
-               u16 remainder;
-
-               fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
-               remainder = uctxt->expected_count % uctxt->subctxt_cnt;
-               if (remainder && fd->subctxt < remainder)
-                       fd->tid_limit++;
-       } else {
-               fd->tid_limit = uctxt->expected_count;
-       }
-       spin_unlock(&fd->tid_lock);
-done:
-       return ret;
-}
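/*
 * For example, with a hypothetical expected_count of 50 shared by 4
 * subcontexts and TID caching enabled: 50 / 4 = 12 with a remainder of 2,
 * so subcontexts 0 and 1 get a tid_limit of 13 while subcontexts 2 and 3
 * get 12, which together account for all 50 expected RcvArray entries.
 */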
-
-int hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
-{
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       struct tid_group *grp, *gptr;
-
-       if (!test_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags))
-               return 0;
-       /*
-        * The notifier would have been removed when the process's mm
-        * was freed.
-        */
-       if (!HFI1_CAP_IS_USET(TID_UNMAP))
-               hfi1_mmu_rb_unregister(&fd->tid_rb_root);
-
-       kfree(fd->invalid_tids);
-
-       if (!uctxt->cnt) {
-               if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
-                       unlock_exp_tids(uctxt, &uctxt->tid_full_list,
-                                       &fd->tid_rb_root);
-               if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
-                       unlock_exp_tids(uctxt, &uctxt->tid_used_list,
-                                       &fd->tid_rb_root);
-               list_for_each_entry_safe(grp, gptr, &uctxt->tid_group_list.list,
-                                        list) {
-                       list_del_init(&grp->list);
-                       kfree(grp);
-               }
-               hfi1_clear_tids(uctxt);
-       }
-
-       kfree(fd->entry_to_rb);
-       return 0;
-}
-
-/*
- * Write an "empty" RcvArray entry.
- * This function exists so the TID registration code can use it
- * to write to unused/unneeded entries and still take advantage
- * of the WC performance improvements. The HFI will ignore this
- * write to the RcvArray entry.
- */
-static inline void rcv_array_wc_fill(struct hfi1_devdata *dd, u32 index)
-{
-       /*
-        * Doing the WC fill writes only makes sense if the device is
-        * present and the RcvArray has been mapped as WC memory.
-        */
-       if ((dd->flags & HFI1_PRESENT) && dd->rcvarray_wc)
-               writeq(0, dd->rcvarray_wc + (index * 8));
-}
-
-/*
- * RcvArray entry allocation for Expected Receives is done by the
- * following algorithm:
- *
- * The context keeps 3 lists of groups of RcvArray entries:
- *   1. List of empty groups - tid_group_list
- *      This list is created during user context creation and
- *      contains elements which describe sets (of 8) of empty
- *      RcvArray entries.
- *   2. List of partially used groups - tid_used_list
- *      This list contains sets of RcvArray entries which are
- *      not completely used up. Another mapping request could
- *      use some or all of the remaining entries.
- *   3. List of full groups - tid_full_list
- *      This is the list where sets that are completely used
- *      up go.
- *
- * An attempt to optimize the usage of RcvArray entries is
- * made by finding all sets of physically contiguous pages in a
- * user's buffer.
- * These physically contiguous sets are further split into
- * sizes supported by the receive engine of the HFI. The
- * resulting sets of pages are stored in struct tid_pageset,
- * which describes the sets as:
- *    * .count - number of pages in this set
- *    * .idx - starting index into struct page ** array
- *                    of this set
- *
- * From this point on, the algorithm deals with the page sets
- * described above. The number of pagesets is divided by the
- * RcvArray group size to produce the number of full groups
- * needed.
- *
- * Groups from the 3 lists are manipulated using the following
- * rules:
- *   1. For each set of 8 pagesets, a complete group from
- *      tid_group_list is taken, programmed, and moved to
- *      the tid_full_list list.
- *   2. For all remaining pagesets:
- *      2.1 If the tid_used_list is empty and the tid_group_list
- *          is empty, stop processing pagesets and return only
- *          what has been programmed up to this point.
- *      2.2 If the tid_used_list is empty and the tid_group_list
- *          is not empty, move a group from tid_group_list to
- *          tid_used_list.
- *      2.3 For each group in tid_used_list, program as much as
- *          can fit into the group. If the group becomes fully
- *          used, move it to tid_full_list.
- */
-int hfi1_user_exp_rcv_setup(struct file *fp, struct hfi1_tid_info *tinfo)
-{
-       int ret = 0, need_group = 0, pinned;
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       struct hfi1_devdata *dd = uctxt->dd;
-       unsigned npages, ngroups, pageidx = 0, pageset_count, npagesets,
-               tididx = 0, mapped, mapped_pages = 0;
-       unsigned long vaddr = tinfo->vaddr;
-       struct page **pages = NULL;
-       u32 *tidlist = NULL;
-       struct tid_pageset *pagesets = NULL;
-
-       /* Get the number of pages the user buffer spans */
-       npages = num_user_pages(vaddr, tinfo->length);
-       if (!npages)
-               return -EINVAL;
-
-       if (npages > uctxt->expected_count) {
-               dd_dev_err(dd, "Expected buffer too big\n");
-               return -EINVAL;
-       }
-
-       /* Verify that access is OK for the user buffer */
-       if (!access_ok(VERIFY_WRITE, (void __user *)vaddr,
-                      npages * PAGE_SIZE)) {
-               dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n",
-                          (void *)vaddr, npages);
-               return -EFAULT;
-       }
-
-       pagesets = kcalloc(uctxt->expected_count, sizeof(*pagesets),
-                          GFP_KERNEL);
-       if (!pagesets)
-               return -ENOMEM;
-
-       /* Allocate the array of struct page pointers needed for pinning */
-       pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
-       if (!pages) {
-               ret = -ENOMEM;
-               goto bail;
-       }
-
-       /*
-        * Pin all the pages of the user buffer. If we can't pin all the
-        * pages, accept the amount pinned so far and program only that.
-        * User space knows how to deal with partially programmed buffers.
-        */
-       if (!hfi1_can_pin_pages(dd, fd->tid_n_pinned, npages)) {
-               ret = -ENOMEM;
-               goto bail;
-       }
-
-       pinned = hfi1_acquire_user_pages(vaddr, npages, true, pages);
-       if (pinned <= 0) {
-               ret = pinned;
-               goto bail;
-       }
-       fd->tid_n_pinned += npages;
-
-       /* Find sets of physically contiguous pages */
-       npagesets = find_phys_blocks(pages, pinned, pagesets);
-
-       /*
-        * We don't need to access this under a lock since tid_used is per
-        * process and the same process cannot be in hfi1_user_exp_rcv_clear()
-        * and hfi1_user_exp_rcv_setup() at the same time.
-        */
-       spin_lock(&fd->tid_lock);
-       if (fd->tid_used + npagesets > fd->tid_limit)
-               pageset_count = fd->tid_limit - fd->tid_used;
-       else
-               pageset_count = npagesets;
-       spin_unlock(&fd->tid_lock);
-
-       if (!pageset_count)
-               goto bail;
-
-       ngroups = pageset_count / dd->rcv_entries.group_size;
-       tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
-       if (!tidlist) {
-               ret = -ENOMEM;
-               goto nomem;
-       }
-
-       tididx = 0;
-
-       /*
-        * From this point on, we are going to be using shared (between master
-        * and subcontexts) context resources. We need to take the lock.
-        */
-       mutex_lock(&uctxt->exp_lock);
-       /*
-        * The first step is to program the RcvArray entries which are complete
-        * groups.
-        */
-       while (ngroups && uctxt->tid_group_list.count) {
-               struct tid_group *grp =
-                       tid_group_pop(&uctxt->tid_group_list);
-
-               ret = program_rcvarray(fp, vaddr, grp, pagesets,
-                                      pageidx, dd->rcv_entries.group_size,
-                                      pages, tidlist, &tididx, &mapped);
-               /*
-                * If there was a failure to program the RcvArray
-                * entries for the entire group, reset the grp fields
-                * and add the grp back to the free group list.
-                */
-               if (ret <= 0) {
-                       tid_group_add_tail(grp, &uctxt->tid_group_list);
-                       hfi1_cdbg(TID,
-                                 "Failed to program RcvArray group %d", ret);
-                       goto unlock;
-               }
-
-               tid_group_add_tail(grp, &uctxt->tid_full_list);
-               ngroups--;
-               pageidx += ret;
-               mapped_pages += mapped;
-       }
-
-       while (pageidx < pageset_count) {
-               struct tid_group *grp, *ptr;
-               /*
-                * If we don't have any partially used tid groups, check
-                * if we have empty groups. If so, take one from there and
-                * put in the partially used list.
-                */
-               if (!uctxt->tid_used_list.count || need_group) {
-                       if (!uctxt->tid_group_list.count)
-                               goto unlock;
-
-                       grp = tid_group_pop(&uctxt->tid_group_list);
-                       tid_group_add_tail(grp, &uctxt->tid_used_list);
-                       need_group = 0;
-               }
-               /*
-                * There is an optimization opportunity here - instead of
-                * fitting as many page sets as we can, check for a group
-                * later on in the list that could fit all of them.
-                */
-               list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
-                                        list) {
-                       unsigned use = min_t(unsigned, pageset_count - pageidx,
-                                            grp->size - grp->used);
-
-                       ret = program_rcvarray(fp, vaddr, grp, pagesets,
-                                              pageidx, use, pages, tidlist,
-                                              &tididx, &mapped);
-                       if (ret < 0) {
-                               hfi1_cdbg(TID,
-                                         "Failed to program RcvArray entries %d",
-                                         ret);
-                               ret = -EFAULT;
-                               goto unlock;
-                       } else if (ret > 0) {
-                               if (grp->used == grp->size)
-                                       tid_group_move(grp,
-                                                      &uctxt->tid_used_list,
-                                                      &uctxt->tid_full_list);
-                               pageidx += ret;
-                               mapped_pages += mapped;
-                               need_group = 0;
-                               /* Check if we are done so we break out early */
-                               if (pageidx >= pageset_count)
-                                       break;
-                       } else if (WARN_ON(ret == 0)) {
-                               /*
-                                * If ret is 0, we did not program any entries
-                                * into this group, which can only happen if
-                                * we've screwed up the accounting somewhere.
-                                * Warn and try to continue.
-                                */
-                               need_group = 1;
-                       }
-               }
-       }
-unlock:
-       mutex_unlock(&uctxt->exp_lock);
-nomem:
-       hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
-                 mapped_pages, ret);
-       if (tididx) {
-               spin_lock(&fd->tid_lock);
-               fd->tid_used += tididx;
-               spin_unlock(&fd->tid_lock);
-               tinfo->tidcnt = tididx;
-               tinfo->length = mapped_pages * PAGE_SIZE;
-
-               if (copy_to_user((void __user *)(unsigned long)tinfo->tidlist,
-                                tidlist, sizeof(tidlist[0]) * tididx)) {
-                       /*
-                        * On failure to copy to the user level, we need to undo
-                        * everything done so far so we don't leak resources.
-                        */
-                       tinfo->tidlist = (unsigned long)&tidlist;
-                       hfi1_user_exp_rcv_clear(fp, tinfo);
-                       tinfo->tidlist = 0;
-                       ret = -EFAULT;
-                       goto bail;
-               }
-       }
-
-       /*
-        * If not everything was mapped (due to insufficient RcvArray entries,
-        * for example), unpin all unmapped pages so we can pin them next time.
-        */
-       if (mapped_pages != pinned) {
-               hfi1_release_user_pages(current->mm, &pages[mapped_pages],
-                                       pinned - mapped_pages,
-                                       false);
-               fd->tid_n_pinned -= pinned - mapped_pages;
-       }
-bail:
-       kfree(pagesets);
-       kfree(pages);
-       kfree(tidlist);
-       return ret > 0 ? 0 : ret;
-}
-
-int hfi1_user_exp_rcv_clear(struct file *fp, struct hfi1_tid_info *tinfo)
-{
-       int ret = 0;
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       u32 *tidinfo;
-       unsigned tididx;
-
-       tidinfo = kcalloc(tinfo->tidcnt, sizeof(*tidinfo), GFP_KERNEL);
-       if (!tidinfo)
-               return -ENOMEM;
-
-       if (copy_from_user(tidinfo, (void __user *)(unsigned long)
-                          tinfo->tidlist, sizeof(tidinfo[0]) *
-                          tinfo->tidcnt)) {
-               ret = -EFAULT;
-               goto done;
-       }
-
-       mutex_lock(&uctxt->exp_lock);
-       for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
-               ret = unprogram_rcvarray(fp, tidinfo[tididx], NULL);
-               if (ret) {
-                       hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
-                                 ret);
-                       break;
-               }
-       }
-       spin_lock(&fd->tid_lock);
-       fd->tid_used -= tididx;
-       spin_unlock(&fd->tid_lock);
-       tinfo->tidcnt = tididx;
-       mutex_unlock(&uctxt->exp_lock);
-done:
-       kfree(tidinfo);
-       return ret;
-}
-
-int hfi1_user_exp_rcv_invalid(struct file *fp, struct hfi1_tid_info *tinfo)
-{
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       unsigned long *ev = uctxt->dd->events +
-               (((uctxt->ctxt - uctxt->dd->first_user_ctxt) *
-                 HFI1_MAX_SHARED_CTXTS) + fd->subctxt);
-       u32 *array;
-       int ret = 0;
-
-       if (!fd->invalid_tids)
-               return -EINVAL;
-
-       /*
-        * copy_to_user() can sleep, which will leave the invalid_lock
-        * locked and cause the MMU notifier to be blocked on the lock
-        * for a long time.
-        * Copy the data to a local buffer so we can release the lock.
-        */
-       array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL);
-       if (!array)
-               return -EFAULT;
-
-       spin_lock(&fd->invalid_lock);
-       if (fd->invalid_tid_idx) {
-               memcpy(array, fd->invalid_tids, sizeof(*array) *
-                      fd->invalid_tid_idx);
-               memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) *
-                      fd->invalid_tid_idx);
-               tinfo->tidcnt = fd->invalid_tid_idx;
-               fd->invalid_tid_idx = 0;
-               /*
-                * Reset the user flag while still holding the lock.
-                * Otherwise, PSM can miss events.
-                */
-               clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
-       } else {
-               tinfo->tidcnt = 0;
-       }
-       spin_unlock(&fd->invalid_lock);
-
-       if (tinfo->tidcnt) {
-               if (copy_to_user((void __user *)tinfo->tidlist,
-                                array, sizeof(*array) * tinfo->tidcnt))
-                       ret = -EFAULT;
-       }
-       kfree(array);
-
-       return ret;
-}
-
-static u32 find_phys_blocks(struct page **pages, unsigned npages,
-                           struct tid_pageset *list)
-{
-       unsigned pagecount, pageidx, setcount = 0, i;
-       unsigned long pfn, this_pfn;
-
-       if (!npages)
-               return 0;
-
-       /*
-        * Look for sets of physically contiguous pages in the user buffer.
-        * This will allow us to optimize Expected RcvArray entry usage by
-        * using the bigger supported sizes.
-        */
-       pfn = page_to_pfn(pages[0]);
-       for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
-               this_pfn = i < npages ? page_to_pfn(pages[i]) : 0;
-
-               /*
-                * If the PFNs are not sequential, the pages are not physically
-                * contiguous.
-                */
-               if (this_pfn != ++pfn) {
-                       /*
-                        * At this point we have to loop over the set of
-                        * physically contiguous pages and break them down into
-                        * sizes supported by the HW.
-                        * There are two main constraints:
-                        *     1. The max buffer size is MAX_EXPECTED_BUFFER.
-                        *        If the total set size is bigger than that,
-                        *        program only a MAX_EXPECTED_BUFFER chunk.
-                        *     2. The buffer size has to be a power of two. If
-                        *        it is not, round down to the closest power of
-                        *        2 and program that size.
-                        */
-                       while (pagecount) {
-                               int maxpages = pagecount;
-                               u32 bufsize = pagecount * PAGE_SIZE;
-
-                               if (bufsize > MAX_EXPECTED_BUFFER)
-                                       maxpages =
-                                               MAX_EXPECTED_BUFFER >>
-                                               PAGE_SHIFT;
-                               else if (!is_power_of_2(bufsize))
-                                       maxpages =
-                                               rounddown_pow_of_two(bufsize) >>
-                                               PAGE_SHIFT;
-
-                               list[setcount].idx = pageidx;
-                               list[setcount].count = maxpages;
-                               pagecount -= maxpages;
-                               pageidx += maxpages;
-                               setcount++;
-                       }
-                       pageidx = i;
-                       pagecount = 1;
-                       pfn = this_pfn;
-               } else {
-                       pagecount++;
-               }
-       }
-       return setcount;
-}
-
-/**
- * program_rcvarray() - program an RcvArray group with receive buffers
- * @fp: file pointer
- * @vaddr: starting user virtual address
- * @grp: RcvArray group
- * @sets: array of struct tid_pageset holding information on physically
- *        contiguous chunks from the user buffer
- * @start: starting index into sets array
- * @count: number of struct tid_pageset's to program
- * @pages: an array of struct page * for the user buffer
- * @tidlist: the array of u32 elements where the information about the
- *           programmed RcvArray entries is to be encoded.
- * @tididx: starting offset into tidlist
- * @pmapped: (output parameter) number of pages programmed into the RcvArray
- *           entries.
- *
- * This function will program up to 'count' number of RcvArray entries from the
- * group 'grp'. To make best use of write-combining writes, the function will
- * perform writes to the unused RcvArray entries which will be ignored by the
- * HW. Each RcvArray entry will be programmed with a physically contiguous
- * buffer chunk from the user's virtual buffer.
- *
- * Return:
- * -EINVAL if the requested count is larger than the size of the group,
- * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or
- * number of RcvArray entries programmed.
- */
-static int program_rcvarray(struct file *fp, unsigned long vaddr,
-                           struct tid_group *grp,
-                           struct tid_pageset *sets,
-                           unsigned start, u16 count, struct page **pages,
-                           u32 *tidlist, unsigned *tididx, unsigned *pmapped)
-{
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       struct hfi1_devdata *dd = uctxt->dd;
-       u16 idx;
-       u32 tidinfo = 0, rcventry, useidx = 0;
-       int mapped = 0;
-
-       /* Count should never be larger than the group size */
-       if (count > grp->size)
-               return -EINVAL;
-
-       /* Find the first unused entry in the group */
-       for (idx = 0; idx < grp->size; idx++) {
-               if (!(grp->map & (1 << idx))) {
-                       useidx = idx;
-                       break;
-               }
-               rcv_array_wc_fill(dd, grp->base + idx);
-       }
-
-       idx = 0;
-       while (idx < count) {
-               u16 npages, pageidx, setidx = start + idx;
-               int ret = 0;
-
-               /*
-                * If this entry in the group is used, move to the next one.
-                * If we go past the end of the group, exit the loop.
-                */
-               if (useidx >= grp->size) {
-                       break;
-               } else if (grp->map & (1 << useidx)) {
-                       rcv_array_wc_fill(dd, grp->base + useidx);
-                       useidx++;
-                       continue;
-               }
-
-               rcventry = grp->base + useidx;
-               npages = sets[setidx].count;
-               pageidx = sets[setidx].idx;
-
-               ret = set_rcvarray_entry(fp, vaddr + (pageidx * PAGE_SIZE),
-                                        rcventry, grp, pages + pageidx,
-                                        npages);
-               if (ret)
-                       return ret;
-               mapped += npages;
-
-               tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) |
-                       EXP_TID_SET(LEN, npages);
-               tidlist[(*tididx)++] = tidinfo;
-               grp->used++;
-               grp->map |= 1 << useidx++;
-               idx++;
-       }
-
-       /* Fill the rest of the group with "blank" writes */
-       for (; useidx < grp->size; useidx++)
-               rcv_array_wc_fill(dd, grp->base + useidx);
-       *pmapped = mapped;
-       return idx;
-}
-
-static int set_rcvarray_entry(struct file *fp, unsigned long vaddr,
-                             u32 rcventry, struct tid_group *grp,
-                             struct page **pages, unsigned npages)
-{
-       int ret;
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       struct tid_rb_node *node;
-       struct hfi1_devdata *dd = uctxt->dd;
-       struct rb_root *root = &fd->tid_rb_root;
-       dma_addr_t phys;
-
-       /*
-        * Allocate the node first so we can handle a potential
-        * failure before we've programmed anything.
-        */
-       node = kzalloc(sizeof(*node) + (sizeof(struct page *) * npages),
-                      GFP_KERNEL);
-       if (!node)
-               return -ENOMEM;
-
-       phys = pci_map_single(dd->pcidev,
-                             __va(page_to_phys(pages[0])),
-                             npages * PAGE_SIZE, PCI_DMA_FROMDEVICE);
-       if (dma_mapping_error(&dd->pcidev->dev, phys)) {
-               dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n",
-                          phys);
-               kfree(node);
-               return -EFAULT;
-       }
-
-       node->mmu.addr = vaddr;
-       node->mmu.len = npages * PAGE_SIZE;
-       node->phys = page_to_phys(pages[0]);
-       node->npages = npages;
-       node->rcventry = rcventry;
-       node->dma_addr = phys;
-       node->grp = grp;
-       node->freed = false;
-       memcpy(node->pages, pages, sizeof(struct page *) * npages);
-
-       if (HFI1_CAP_IS_USET(TID_UNMAP))
-               ret = mmu_rb_insert(root, &node->mmu);
-       else
-               ret = hfi1_mmu_rb_insert(root, &node->mmu);
-
-       if (ret) {
-               hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
-                         node->rcventry, node->mmu.addr, node->phys, ret);
-               pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE,
-                                PCI_DMA_FROMDEVICE);
-               kfree(node);
-               return -EFAULT;
-       }
-       hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
-       trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages,
-                              node->mmu.addr, node->phys, phys);
-       return 0;
-}
-
-static int unprogram_rcvarray(struct file *fp, u32 tidinfo,
-                             struct tid_group **grp)
-{
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       struct hfi1_devdata *dd = uctxt->dd;
-       struct tid_rb_node *node;
-       u8 tidctrl = EXP_TID_GET(tidinfo, CTRL);
-       u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;
-
-       if (tididx >= uctxt->expected_count) {
-               dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
-                          tididx, uctxt->ctxt);
-               return -EINVAL;
-       }
-
-       if (tidctrl == 0x3)
-               return -EINVAL;
-
-       rcventry = tididx + (tidctrl - 1);
-
-       node = fd->entry_to_rb[rcventry];
-       if (!node || node->rcventry != (uctxt->expected_base + rcventry))
-               return -EBADF;
-       if (HFI1_CAP_IS_USET(TID_UNMAP))
-               mmu_rb_remove(&fd->tid_rb_root, &node->mmu, NULL);
-       else
-               hfi1_mmu_rb_remove(&fd->tid_rb_root, &node->mmu);
-
-       if (grp)
-               *grp = node->grp;
-       clear_tid_node(fd, fd->subctxt, node);
-       return 0;
-}
-
-static void clear_tid_node(struct hfi1_filedata *fd, u16 subctxt,
-                          struct tid_rb_node *node)
-{
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       struct hfi1_devdata *dd = uctxt->dd;
-
-       trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
-                                node->npages, node->mmu.addr, node->phys,
-                                node->dma_addr);
-
-       hfi1_put_tid(dd, node->rcventry, PT_INVALID, 0, 0);
-       /*
-        * Make sure the device has seen the write before we unpin the
-        * pages.
-        */
-       flush_wc();
-
-       pci_unmap_single(dd->pcidev, node->dma_addr, node->mmu.len,
-                        PCI_DMA_FROMDEVICE);
-       hfi1_release_user_pages(current->mm, node->pages, node->npages, true);
-       fd->tid_n_pinned -= node->npages;
-
-       node->grp->used--;
-       node->grp->map &= ~(1 << (node->rcventry - node->grp->base));
-
-       if (node->grp->used == node->grp->size - 1)
-               tid_group_move(node->grp, &uctxt->tid_full_list,
-                              &uctxt->tid_used_list);
-       else if (!node->grp->used)
-               tid_group_move(node->grp, &uctxt->tid_used_list,
-                              &uctxt->tid_group_list);
-       kfree(node);
-}
-
-static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
-                           struct exp_tid_set *set, struct rb_root *root)
-{
-       struct tid_group *grp, *ptr;
-       struct hfi1_filedata *fd = container_of(root, struct hfi1_filedata,
-                                               tid_rb_root);
-       int i;
-
-       list_for_each_entry_safe(grp, ptr, &set->list, list) {
-               list_del_init(&grp->list);
-
-               for (i = 0; i < grp->size; i++) {
-                       if (grp->map & (1 << i)) {
-                               u16 rcventry = grp->base + i;
-                               struct tid_rb_node *node;
-
-                               node = fd->entry_to_rb[rcventry -
-                                                         uctxt->expected_base];
-                               if (!node || node->rcventry != rcventry)
-                                       continue;
-                               if (HFI1_CAP_IS_USET(TID_UNMAP))
-                                       mmu_rb_remove(&fd->tid_rb_root,
-                                                     &node->mmu, NULL);
-                               else
-                                       hfi1_mmu_rb_remove(&fd->tid_rb_root,
-                                                          &node->mmu);
-                               clear_tid_node(fd, -1, node);
-                       }
-               }
-       }
-}
-
-static int mmu_rb_invalidate(struct rb_root *root, struct mmu_rb_node *mnode)
-{
-       struct hfi1_filedata *fdata =
-               container_of(root, struct hfi1_filedata, tid_rb_root);
-       struct hfi1_ctxtdata *uctxt = fdata->uctxt;
-       struct tid_rb_node *node =
-               container_of(mnode, struct tid_rb_node, mmu);
-
-       if (node->freed)
-               return 0;
-
-       trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt, node->mmu.addr,
-                                node->rcventry, node->npages, node->dma_addr);
-       node->freed = true;
-
-       spin_lock(&fdata->invalid_lock);
-       if (fdata->invalid_tid_idx < uctxt->expected_count) {
-               fdata->invalid_tids[fdata->invalid_tid_idx] =
-                       rcventry2tidinfo(node->rcventry - uctxt->expected_base);
-               fdata->invalid_tids[fdata->invalid_tid_idx] |=
-                       EXP_TID_SET(LEN, node->npages);
-               if (!fdata->invalid_tid_idx) {
-                       unsigned long *ev;
-
-                       /*
-                        * hfi1_set_uevent_bits() sets a user event flag
-                        * for all processes. Because calling into the
-                        * driver to process TID cache invalidations is
-                        * expensive and TID cache invalidations are
-                        * handled on a per-process basis, we can
-                        * optimize this to set the flag only for the
-                        * process in question.
-                        */
-                       ev = uctxt->dd->events +
-                               (((uctxt->ctxt - uctxt->dd->first_user_ctxt) *
-                                 HFI1_MAX_SHARED_CTXTS) + fdata->subctxt);
-                       set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
-               }
-               fdata->invalid_tid_idx++;
-       }
-       spin_unlock(&fdata->invalid_lock);
-       return 0;
-}
-
-static int mmu_rb_insert(struct rb_root *root, struct mmu_rb_node *node)
-{
-       struct hfi1_filedata *fdata =
-               container_of(root, struct hfi1_filedata, tid_rb_root);
-       struct tid_rb_node *tnode =
-               container_of(node, struct tid_rb_node, mmu);
-       u32 base = fdata->uctxt->expected_base;
-
-       fdata->entry_to_rb[tnode->rcventry - base] = tnode;
-       return 0;
-}
-
-static void mmu_rb_remove(struct rb_root *root, struct mmu_rb_node *node,
-                         struct mm_struct *mm)
-{
-       struct hfi1_filedata *fdata =
-               container_of(root, struct hfi1_filedata, tid_rb_root);
-       struct tid_rb_node *tnode =
-               container_of(node, struct tid_rb_node, mmu);
-       u32 base = fdata->uctxt->expected_base;
-
-       fdata->entry_to_rb[tnode->rcventry - base] = NULL;
-}
diff --git a/drivers/staging/rdma/hfi1/user_exp_rcv.h b/drivers/staging/rdma/hfi1/user_exp_rcv.h
deleted file mode 100644 (file)
index 9bc8d9f..0000000
+++ /dev/null
@@ -1,79 +0,0 @@
-#ifndef _HFI1_USER_EXP_RCV_H
-#define _HFI1_USER_EXP_RCV_H
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "hfi.h"
-
-#define EXP_TID_TIDLEN_MASK   0x7FFULL
-#define EXP_TID_TIDLEN_SHIFT  0
-#define EXP_TID_TIDCTRL_MASK  0x3ULL
-#define EXP_TID_TIDCTRL_SHIFT 20
-#define EXP_TID_TIDIDX_MASK   0x3FFULL
-#define EXP_TID_TIDIDX_SHIFT  22
-#define EXP_TID_GET(tid, field)        \
-       (((tid) >> EXP_TID_TID##field##_SHIFT) & EXP_TID_TID##field##_MASK)
-
-#define EXP_TID_SET(field, value)                      \
-       (((value) & EXP_TID_TID##field##_MASK) <<       \
-        EXP_TID_TID##field##_SHIFT)
-#define EXP_TID_CLEAR(tid, field) ({                                   \
-               (tid) &= ~(EXP_TID_TID##field##_MASK <<                 \
-                          EXP_TID_TID##field##_SHIFT);                 \
-               })
-#define EXP_TID_RESET(tid, field, value) do {                          \
-               EXP_TID_CLEAR(tid, field);                              \
-               (tid) |= EXP_TID_SET(field, (value));                   \
-       } while (0)
-
-int hfi1_user_exp_rcv_init(struct file *);
-int hfi1_user_exp_rcv_free(struct hfi1_filedata *);
-int hfi1_user_exp_rcv_setup(struct file *, struct hfi1_tid_info *);
-int hfi1_user_exp_rcv_clear(struct file *, struct hfi1_tid_info *);
-int hfi1_user_exp_rcv_invalid(struct file *, struct hfi1_tid_info *);
-
-#endif /* _HFI1_USER_EXP_RCV_H */
diff --git a/drivers/staging/rdma/hfi1/user_pages.c b/drivers/staging/rdma/hfi1/user_pages.c
deleted file mode 100644 (file)
index 88e10b5..0000000
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/device.h>
-#include <linux/module.h>
-
-#include "hfi.h"
-
-static unsigned long cache_size = 256;
-module_param(cache_size, ulong, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(cache_size, "Send and receive side cache size limit (in MB)");
-
-/*
- * Determine whether the caller can pin pages.
- *
- * This function should be used in the implementation of buffer caches.
- * The cache implementation should call this function prior to attempting
- * to pin buffer pages in order to determine whether it should do so.
- * The function computes cache limits based on the configured ulimit and
- * cache size. Use of this function is especially important for caches
- * which are not limited in any other way (e.g. by HW resources) and, thus,
- * could keep caching buffers.
- *
- */
-bool hfi1_can_pin_pages(struct hfi1_devdata *dd, u32 nlocked, u32 npages)
-{
-       unsigned long ulimit = rlimit(RLIMIT_MEMLOCK), pinned, cache_limit,
-               size = (cache_size * (1UL << 20)); /* convert to bytes */
-       unsigned usr_ctxts = dd->num_rcv_contexts - dd->first_user_ctxt;
-       bool can_lock = capable(CAP_IPC_LOCK);
-
-       /*
-        * Calculate per-cache size. The calculation below uses only a quarter
-        * of the available per-context limit. This leaves space for other
-        * pinning. Should we worry about shared ctxts?
-        */
-       cache_limit = (ulimit / usr_ctxts) / 4;
-
-       /* If ulimit isn't set to "unlimited" and is smaller than cache_size. */
-       if (ulimit != (-1UL) && size > cache_limit)
-               size = cache_limit;
-
-       /* Convert to number of pages */
-       size = DIV_ROUND_UP(size, PAGE_SIZE);
-
-       down_read(&current->mm->mmap_sem);
-       pinned = current->mm->pinned_vm;
-       up_read(&current->mm->mmap_sem);
-
-       /* First, check the absolute limit against all pinned pages. */
-       if (pinned + npages >= ulimit && !can_lock)
-               return false;
-
-       return ((nlocked + npages) <= size) || can_lock;
-}
-
-int hfi1_acquire_user_pages(unsigned long vaddr, size_t npages, bool writable,
-                           struct page **pages)
-{
-       int ret;
-
-       ret = get_user_pages_fast(vaddr, npages, writable, pages);
-       if (ret < 0)
-               return ret;
-
-       down_write(&current->mm->mmap_sem);
-       current->mm->pinned_vm += ret;
-       up_write(&current->mm->mmap_sem);
-
-       return ret;
-}
-
-void hfi1_release_user_pages(struct mm_struct *mm, struct page **p,
-                            size_t npages, bool dirty)
-{
-       size_t i;
-
-       for (i = 0; i < npages; i++) {
-               if (dirty)
-                       set_page_dirty_lock(p[i]);
-               put_page(p[i]);
-       }
-
-       if (mm) { /* during close after signal, mm can be NULL */
-               down_write(&mm->mmap_sem);
-               mm->pinned_vm -= npages;
-               up_write(&mm->mmap_sem);
-       }
-}
diff --git a/drivers/staging/rdma/hfi1/user_sdma.c b/drivers/staging/rdma/hfi1/user_sdma.c
deleted file mode 100644 (file)
index 0014c9c..0000000
+++ /dev/null
@@ -1,1623 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#include <linux/mm.h>
-#include <linux/types.h>
-#include <linux/device.h>
-#include <linux/dmapool.h>
-#include <linux/slab.h>
-#include <linux/list.h>
-#include <linux/highmem.h>
-#include <linux/io.h>
-#include <linux/uio.h>
-#include <linux/rbtree.h>
-#include <linux/spinlock.h>
-#include <linux/delay.h>
-#include <linux/kthread.h>
-#include <linux/mmu_context.h>
-#include <linux/module.h>
-#include <linux/vmalloc.h>
-
-#include "hfi.h"
-#include "sdma.h"
-#include "user_sdma.h"
-#include "verbs.h"  /* for the headers */
-#include "common.h" /* for struct hfi1_tid_info */
-#include "trace.h"
-#include "mmu_rb.h"
-
-static uint hfi1_sdma_comp_ring_size = 128;
-module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
-MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");
-
-/* The maximum number of data I/O vectors per message/request */
-#define MAX_VECTORS_PER_REQ 8
-/*
- * Maximum number of packets to send from each message/request
- * before moving to the next one.
- */
-#define MAX_PKTS_PER_QUEUE 16
-
-#define num_pages(x) (1 + ((((x) - 1) & PAGE_MASK) >> PAGE_SHIFT))
-
-#define req_opcode(x) \
-       (((x) >> HFI1_SDMA_REQ_OPCODE_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK)
-#define req_version(x) \
-       (((x) >> HFI1_SDMA_REQ_VERSION_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK)
-#define req_iovcnt(x) \
-       (((x) >> HFI1_SDMA_REQ_IOVCNT_SHIFT) & HFI1_SDMA_REQ_IOVCNT_MASK)
-
-/* Number of BTH.PSN bits used for sequence number in expected rcvs */
-#define BTH_SEQ_MASK 0x7ffull
-
-/*
- * Define fields in the KDETH header so we can update the header
- * template.
- */
-#define KDETH_OFFSET_SHIFT        0
-#define KDETH_OFFSET_MASK         0x7fff
-#define KDETH_OM_SHIFT            15
-#define KDETH_OM_MASK             0x1
-#define KDETH_TID_SHIFT           16
-#define KDETH_TID_MASK            0x3ff
-#define KDETH_TIDCTRL_SHIFT       26
-#define KDETH_TIDCTRL_MASK        0x3
-#define KDETH_INTR_SHIFT          28
-#define KDETH_INTR_MASK           0x1
-#define KDETH_SH_SHIFT            29
-#define KDETH_SH_MASK             0x1
-#define KDETH_HCRC_UPPER_SHIFT    16
-#define KDETH_HCRC_UPPER_MASK     0xff
-#define KDETH_HCRC_LOWER_SHIFT    24
-#define KDETH_HCRC_LOWER_MASK     0xff
-
-#define PBC2LRH(x) ((((x) & 0xfff) << 2) - 4)
-#define LRH2PBC(x) ((((x) >> 2) + 1) & 0xfff)
-
-#define KDETH_GET(val, field)                                          \
-       (((le32_to_cpu((val))) >> KDETH_##field##_SHIFT) & KDETH_##field##_MASK)
-#define KDETH_SET(dw, field, val) do {                                 \
-               u32 dwval = le32_to_cpu(dw);                            \
-               dwval &= ~(KDETH_##field##_MASK << KDETH_##field##_SHIFT); \
-               dwval |= (((val) & KDETH_##field##_MASK) << \
-                         KDETH_##field##_SHIFT);                       \
-               dw = cpu_to_le32(dwval);                                \
-       } while (0)
-
-#define AHG_HEADER_SET(arr, idx, dw, bit, width, value)                        \
-       do {                                                            \
-               if ((idx) < ARRAY_SIZE((arr)))                          \
-                       (arr)[(idx++)] = sdma_build_ahg_descriptor(     \
-                               (__force u16)(value), (dw), (bit),      \
-                                                       (width));       \
-               else                                                    \
-                       return -ERANGE;                                 \
-       } while (0)
-
-/* KDETH OM multipliers and switch over point */
-#define KDETH_OM_SMALL     4
-#define KDETH_OM_LARGE     64
-#define KDETH_OM_MAX_SIZE  (1 << ((KDETH_OM_LARGE / KDETH_OM_SMALL) + 1))
-
-/* Last packet in the request */
-#define TXREQ_FLAGS_REQ_LAST_PKT BIT(0)
-
-#define SDMA_REQ_IN_USE     0
-#define SDMA_REQ_FOR_THREAD 1
-#define SDMA_REQ_SEND_DONE  2
-#define SDMA_REQ_HAVE_AHG   3
-#define SDMA_REQ_HAS_ERROR  4
-#define SDMA_REQ_DONE_ERROR 5
-
-#define SDMA_PKT_Q_INACTIVE BIT(0)
-#define SDMA_PKT_Q_ACTIVE   BIT(1)
-#define SDMA_PKT_Q_DEFERRED BIT(2)
-
-/*
- * Maximum retry attempts to submit a TX request
- * before putting the process to sleep.
- */
-#define MAX_DEFER_RETRY_COUNT 1
-
-static unsigned initial_pkt_count = 8;
-
-#define SDMA_IOWAIT_TIMEOUT 1000 /* in milliseconds */
-
-struct user_sdma_iovec {
-       struct list_head list;
-       struct iovec iov;
-       /* number of pages in this vector */
-       unsigned npages;
-       /* array of pinned pages for this vector */
-       struct page **pages;
-       /*
-        * offset into the virtual address space of the vector at
-        * which we last left off.
-        */
-       u64 offset;
-};
-
-#define SDMA_CACHE_NODE_EVICT BIT(0)
-
-struct sdma_mmu_node {
-       struct mmu_rb_node rb;
-       struct list_head list;
-       struct hfi1_user_sdma_pkt_q *pq;
-       atomic_t refcount;
-       struct page **pages;
-       unsigned npages;
-       unsigned long flags;
-};
-
-struct user_sdma_request {
-       struct sdma_req_info info;
-       struct hfi1_user_sdma_pkt_q *pq;
-       struct hfi1_user_sdma_comp_q *cq;
-       /* This is the original header from user space */
-       struct hfi1_pkt_header hdr;
-       /*
-        * Pointer to the SDMA engine for this request.
-        * Since different requests could be on different VLs,
-        * each request will need its own engine pointer.
-        */
-       struct sdma_engine *sde;
-       u8 ahg_idx;
-       u32 ahg[9];
-       /*
-        * KDETH.Offset (Eager) field
-        * We need to remember the initial value so the headers
-        * can be updated properly.
-        */
-       u32 koffset;
-       /*
-        * KDETH.OFFSET (TID) field
-        * The offset can cover multiple packets, depending on the
-        * size of the TID entry.
-        */
-       u32 tidoffset;
-       /*
-        * KDETH.OM
-        * Remember this because the header template always sets it
-        * to 0.
-        */
-       u8 omfactor;
-       /*
-        * We copy the iovs for this request (based on
-        * info.iovcnt). These are only the data vectors.
-        */
-       unsigned data_iovs;
-       /* total length of the data in the request */
-       u32 data_len;
-       /* progress index moving along the iovs array */
-       unsigned iov_idx;
-       struct user_sdma_iovec iovs[MAX_VECTORS_PER_REQ];
-       /* number of elements copied to the tids array */
-       u16 n_tids;
-       /* TID array values copied from the tid_iov vector */
-       u32 *tids;
-       u16 tididx;
-       u32 sent;
-       u64 seqnum;
-       u64 seqcomp;
-       u64 seqsubmitted;
-       struct list_head txps;
-       unsigned long flags;
-       /* status of the last txreq completed */
-       int status;
-};
-
-/*
- * A single txreq could span up to 3 physical pages when the MTU
- * is sufficiently large (> 4K). Each of the IOV pointers also
- * needs its own set of flags so each vector can be handled
- * independently of the others.
- */
-struct user_sdma_txreq {
-       /* Packet header for the txreq */
-       struct hfi1_pkt_header hdr;
-       struct sdma_txreq txreq;
-       struct list_head list;
-       struct user_sdma_request *req;
-       u16 flags;
-       unsigned busycount;
-       u64 seqnum;
-};
-
-#define SDMA_DBG(req, fmt, ...)                                     \
-       hfi1_cdbg(SDMA, "[%u:%u:%u:%u] " fmt, (req)->pq->dd->unit, \
-                (req)->pq->ctxt, (req)->pq->subctxt, (req)->info.comp_idx, \
-                ##__VA_ARGS__)
-#define SDMA_Q_DBG(pq, fmt, ...)                        \
-       hfi1_cdbg(SDMA, "[%u:%u:%u] " fmt, (pq)->dd->unit, (pq)->ctxt, \
-                (pq)->subctxt, ##__VA_ARGS__)
-
-static int user_sdma_send_pkts(struct user_sdma_request *, unsigned);
-static int num_user_pages(const struct iovec *);
-static void user_sdma_txreq_cb(struct sdma_txreq *, int);
-static inline void pq_update(struct hfi1_user_sdma_pkt_q *);
-static void user_sdma_free_request(struct user_sdma_request *, bool);
-static int pin_vector_pages(struct user_sdma_request *,
-                           struct user_sdma_iovec *);
-static void unpin_vector_pages(struct mm_struct *, struct page **, unsigned,
-                              unsigned);
-static int check_header_template(struct user_sdma_request *,
-                                struct hfi1_pkt_header *, u32, u32);
-static int set_txreq_header(struct user_sdma_request *,
-                           struct user_sdma_txreq *, u32);
-static int set_txreq_header_ahg(struct user_sdma_request *,
-                               struct user_sdma_txreq *, u32);
-static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *,
-                                 struct hfi1_user_sdma_comp_q *,
-                                 u16, enum hfi1_sdma_comp_state, int);
-static inline u32 set_pkt_bth_psn(__be32, u8, u32);
-static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);
-
-static int defer_packet_queue(
-       struct sdma_engine *,
-       struct iowait *,
-       struct sdma_txreq *,
-       unsigned seq);
-static void activate_packet_queue(struct iowait *, int);
-static bool sdma_rb_filter(struct mmu_rb_node *, unsigned long, unsigned long);
-static int sdma_rb_insert(struct rb_root *, struct mmu_rb_node *);
-static void sdma_rb_remove(struct rb_root *, struct mmu_rb_node *,
-                          struct mm_struct *);
-static int sdma_rb_invalidate(struct rb_root *, struct mmu_rb_node *);
-
-static struct mmu_rb_ops sdma_rb_ops = {
-       .filter = sdma_rb_filter,
-       .insert = sdma_rb_insert,
-       .remove = sdma_rb_remove,
-       .invalidate = sdma_rb_invalidate
-};
-
-static int defer_packet_queue(
-       struct sdma_engine *sde,
-       struct iowait *wait,
-       struct sdma_txreq *txreq,
-       unsigned seq)
-{
-       struct hfi1_user_sdma_pkt_q *pq =
-               container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
-       struct hfi1_ibdev *dev = &pq->dd->verbs_dev;
-       struct user_sdma_txreq *tx =
-               container_of(txreq, struct user_sdma_txreq, txreq);
-
-       if (sdma_progress(sde, seq, txreq)) {
-               if (tx->busycount++ < MAX_DEFER_RETRY_COUNT)
-                       goto eagain;
-       }
-       /*
-        * We are assuming that if the list is enqueued somewhere, it
-        * is to the dmawait list since that is the only place where
-        * it is supposed to be enqueued.
-        */
-       xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
-       write_seqlock(&dev->iowait_lock);
-       if (list_empty(&pq->busy.list))
-               list_add_tail(&pq->busy.list, &sde->dmawait);
-       write_sequnlock(&dev->iowait_lock);
-       return -EBUSY;
-eagain:
-       return -EAGAIN;
-}
-
-static void activate_packet_queue(struct iowait *wait, int reason)
-{
-       struct hfi1_user_sdma_pkt_q *pq =
-               container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
-       xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
-       wake_up(&wait->wait_dma);
-};
-
-static void sdma_kmem_cache_ctor(void *obj)
-{
-       struct user_sdma_txreq *tx = obj;
-
-       memset(tx, 0, sizeof(*tx));
-}
-
-int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp)
-{
-       struct hfi1_filedata *fd;
-       int ret = 0;
-       unsigned memsize;
-       char buf[64];
-       struct hfi1_devdata *dd;
-       struct hfi1_user_sdma_comp_q *cq;
-       struct hfi1_user_sdma_pkt_q *pq;
-       unsigned long flags;
-
-       if (!uctxt || !fp) {
-               ret = -EBADF;
-               goto done;
-       }
-
-       fd = fp->private_data;
-
-       if (!hfi1_sdma_comp_ring_size) {
-               ret = -EINVAL;
-               goto done;
-       }
-
-       dd = uctxt->dd;
-
-       pq = kzalloc(sizeof(*pq), GFP_KERNEL);
-       if (!pq)
-               goto pq_nomem;
-
-       memsize = sizeof(*pq->reqs) * hfi1_sdma_comp_ring_size;
-       pq->reqs = kzalloc(memsize, GFP_KERNEL);
-       if (!pq->reqs)
-               goto pq_reqs_nomem;
-
-       INIT_LIST_HEAD(&pq->list);
-       pq->dd = dd;
-       pq->ctxt = uctxt->ctxt;
-       pq->subctxt = fd->subctxt;
-       pq->n_max_reqs = hfi1_sdma_comp_ring_size;
-       pq->state = SDMA_PKT_Q_INACTIVE;
-       atomic_set(&pq->n_reqs, 0);
-       init_waitqueue_head(&pq->wait);
-       pq->sdma_rb_root = RB_ROOT;
-       INIT_LIST_HEAD(&pq->evict);
-       spin_lock_init(&pq->evict_lock);
-
-       iowait_init(&pq->busy, 0, NULL, defer_packet_queue,
-                   activate_packet_queue, NULL);
-       pq->reqidx = 0;
-       snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
-                fd->subctxt);
-       pq->txreq_cache = kmem_cache_create(buf,
-                              sizeof(struct user_sdma_txreq),
-                                           L1_CACHE_BYTES,
-                                           SLAB_HWCACHE_ALIGN,
-                                           sdma_kmem_cache_ctor);
-       if (!pq->txreq_cache) {
-               dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
-                          uctxt->ctxt);
-               goto pq_txreq_nomem;
-       }
-       fd->pq = pq;
-       cq = kzalloc(sizeof(*cq), GFP_KERNEL);
-       if (!cq)
-               goto cq_nomem;
-
-       memsize = PAGE_ALIGN(sizeof(*cq->comps) * hfi1_sdma_comp_ring_size);
-       cq->comps = vmalloc_user(memsize);
-       if (!cq->comps)
-               goto cq_comps_nomem;
-
-       cq->nentries = hfi1_sdma_comp_ring_size;
-       fd->cq = cq;
-
-       ret = hfi1_mmu_rb_register(&pq->sdma_rb_root, &sdma_rb_ops);
-       if (ret) {
-               dd_dev_err(dd, "Failed to register with MMU %d", ret);
-               goto done;
-       }
-
-       spin_lock_irqsave(&uctxt->sdma_qlock, flags);
-       list_add(&pq->list, &uctxt->sdma_queues);
-       spin_unlock_irqrestore(&uctxt->sdma_qlock, flags);
-       goto done;
-
-cq_comps_nomem:
-       kfree(cq);
-cq_nomem:
-       kmem_cache_destroy(pq->txreq_cache);
-pq_txreq_nomem:
-       kfree(pq->reqs);
-pq_reqs_nomem:
-       kfree(pq);
-       fd->pq = NULL;
-pq_nomem:
-       ret = -ENOMEM;
-done:
-       return ret;
-}
-
-int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd)
-{
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       struct hfi1_user_sdma_pkt_q *pq;
-       unsigned long flags;
-
-       hfi1_cdbg(SDMA, "[%u:%u:%u] Freeing user SDMA queues", uctxt->dd->unit,
-                 uctxt->ctxt, fd->subctxt);
-       pq = fd->pq;
-       hfi1_mmu_rb_unregister(&pq->sdma_rb_root);
-       if (pq) {
-               spin_lock_irqsave(&uctxt->sdma_qlock, flags);
-               if (!list_empty(&pq->list))
-                       list_del_init(&pq->list);
-               spin_unlock_irqrestore(&uctxt->sdma_qlock, flags);
-               iowait_sdma_drain(&pq->busy);
-               /* Wait until all requests have been freed. */
-               wait_event_interruptible(
-                       pq->wait,
-                       (ACCESS_ONCE(pq->state) == SDMA_PKT_Q_INACTIVE));
-               kfree(pq->reqs);
-               kmem_cache_destroy(pq->txreq_cache);
-               kfree(pq);
-               fd->pq = NULL;
-       }
-       if (fd->cq) {
-               vfree(fd->cq->comps);
-               kfree(fd->cq);
-               fd->cq = NULL;
-       }
-       return 0;
-}
-
-int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
-                                  unsigned long dim, unsigned long *count)
-{
-       int ret = 0, i = 0;
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       struct hfi1_user_sdma_pkt_q *pq = fd->pq;
-       struct hfi1_user_sdma_comp_q *cq = fd->cq;
-       struct hfi1_devdata *dd = pq->dd;
-       unsigned long idx = 0;
-       u8 pcount = initial_pkt_count;
-       struct sdma_req_info info;
-       struct user_sdma_request *req;
-       u8 opcode, sc, vl;
-
-       if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
-               hfi1_cdbg(
-                  SDMA,
-                  "[%u:%u:%u] First vector not big enough for header %lu/%lu",
-                  dd->unit, uctxt->ctxt, fd->subctxt,
-                  iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
-               return -EINVAL;
-       }
-       ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
-       if (ret) {
-               hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
-                         dd->unit, uctxt->ctxt, fd->subctxt, ret);
-               return -EFAULT;
-       }
-
-       trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
-                                    (u16 *)&info);
-       if (cq->comps[info.comp_idx].status == QUEUED ||
-           test_bit(SDMA_REQ_IN_USE, &pq->reqs[info.comp_idx].flags)) {
-               hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in QUEUED state",
-                         dd->unit, uctxt->ctxt, fd->subctxt,
-                         info.comp_idx);
-               return -EBADSLT;
-       }
-       if (!info.fragsize) {
-               hfi1_cdbg(SDMA,
-                         "[%u:%u:%u:%u] Request does not specify fragsize",
-                         dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
-               return -EINVAL;
-       }
-       /*
-        * We've done all the safety checks that we can up to this point,
-        * so "allocate" the request entry.
-        */
-       hfi1_cdbg(SDMA, "[%u:%u:%u] Using req/comp entry %u\n", dd->unit,
-                 uctxt->ctxt, fd->subctxt, info.comp_idx);
-       req = pq->reqs + info.comp_idx;
-       memset(req, 0, sizeof(*req));
-       /* Mark the request as IN_USE before we start filling it in. */
-       set_bit(SDMA_REQ_IN_USE, &req->flags);
-       req->data_iovs = req_iovcnt(info.ctrl) - 1;
-       req->pq = pq;
-       req->cq = cq;
-       req->status = -1;
-       INIT_LIST_HEAD(&req->txps);
-
-       memcpy(&req->info, &info, sizeof(info));
-
-       if (req_opcode(info.ctrl) == EXPECTED)
-               req->data_iovs--;
-
-       if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
-               SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
-                        MAX_VECTORS_PER_REQ);
-               return -EINVAL;
-       }
-       /* Copy the header from the user buffer */
-       ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
-                            sizeof(req->hdr));
-       if (ret) {
-               SDMA_DBG(req, "Failed to copy header template (%d)", ret);
-               ret = -EFAULT;
-               goto free_req;
-       }
-
-       /* If Static rate control is not enabled, sanitize the header. */
-       if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
-               req->hdr.pbc[2] = 0;
-
-       /* Validate the opcode. Do not trust packets from user space blindly. */
-       opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
-       if ((opcode & USER_OPCODE_CHECK_MASK) !=
-            USER_OPCODE_CHECK_VAL) {
-               SDMA_DBG(req, "Invalid opcode (%d)", opcode);
-               ret = -EINVAL;
-               goto free_req;
-       }
-       /*
-        * Validate the vl. Do not trust packets from user space blindly.
-        * VL comes from PBC, SC comes from LRH, and the VL needs to
-        * match the SC lookup.
-        */
-       vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
-       sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
-             (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
-       if (vl >= dd->pport->vls_operational ||
-           vl != sc_to_vlt(dd, sc)) {
-               SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
-               ret = -EINVAL;
-               goto free_req;
-       }
-
-       /* Checking P_KEY for requests from user-space */
-       if (egress_pkey_check(dd->pport, req->hdr.lrh, req->hdr.bth, sc,
-                             PKEY_CHECK_INVALID)) {
-               ret = -EINVAL;
-               goto free_req;
-       }
-
-       /*
-        * We should also check the BTH.lnh. If it says the next header is GRH then
-        * the RXE parsing will be off and will land in the middle of the KDETH
-        * or miss it entirely.
-        */
-       if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
-               SDMA_DBG(req, "User tried to pass in a GRH");
-               ret = -EINVAL;
-               goto free_req;
-       }
-
-       req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
-       /*
-        * Calculate the initial TID offset based on the values of
-        * KDETH.OFFSET and KDETH.OM that are passed in.
-        */
-       req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
-               (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
-                KDETH_OM_LARGE : KDETH_OM_SMALL);
-       SDMA_DBG(req, "Initial TID offset %u", req->tidoffset);
-       idx++;
-
-       /* Save all the IO vector structures */
-       while (i < req->data_iovs) {
-               INIT_LIST_HEAD(&req->iovs[i].list);
-               memcpy(&req->iovs[i].iov, iovec + idx++, sizeof(struct iovec));
-               ret = pin_vector_pages(req, &req->iovs[i]);
-               if (ret) {
-                       req->status = ret;
-                       goto free_req;
-               }
-               req->data_len += req->iovs[i++].iov.iov_len;
-       }
-       SDMA_DBG(req, "total data length %u", req->data_len);
-
-       if (pcount > req->info.npkts)
-               pcount = req->info.npkts;
-       /*
-        * Copy any TID info
-        * User space will provide the TID info only when the
-        * request type is EXPECTED. This is true even if there is
-        * only one packet in the request and the header is already
-        * set up. The reason for the singular TID case is that the
-        * driver needs to perform safety checks.
-        */
-       if (req_opcode(req->info.ctrl) == EXPECTED) {
-               u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
-
-               if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
-                       ret = -EINVAL;
-                       goto free_req;
-               }
-               req->tids = kcalloc(ntids, sizeof(*req->tids), GFP_KERNEL);
-               if (!req->tids) {
-                       ret = -ENOMEM;
-                       goto free_req;
-               }
-               /*
-                * We have to copy all of the tids because they may vary
-                * in size and, therefore, the TID count might not be
-                * equal to the pkt count. However, there is no way to
-                * tell at this point.
-                */
-               ret = copy_from_user(req->tids, iovec[idx].iov_base,
-                                    ntids * sizeof(*req->tids));
-               if (ret) {
-                       SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
-                                ntids, ret);
-                       ret = -EFAULT;
-                       goto free_req;
-               }
-               req->n_tids = ntids;
-               idx++;
-       }
-
-       /* Have to select the engine */
-       req->sde = sdma_select_engine_vl(dd,
-                                        (u32)(uctxt->ctxt + fd->subctxt),
-                                        vl);
-       if (!req->sde || !sdma_running(req->sde)) {
-               ret = -ECOMM;
-               goto free_req;
-       }
-
-       /* We don't need an AHG entry if the request contains only one packet */
-       if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG)) {
-               int ahg = sdma_ahg_alloc(req->sde);
-
-               if (likely(ahg >= 0)) {
-                       req->ahg_idx = (u8)ahg;
-                       set_bit(SDMA_REQ_HAVE_AHG, &req->flags);
-               }
-       }
-
-       set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
-       atomic_inc(&pq->n_reqs);
-       /* Send the first N packets in the request to buy us some time */
-       ret = user_sdma_send_pkts(req, pcount);
-       if (unlikely(ret < 0 && ret != -EBUSY)) {
-               req->status = ret;
-               goto free_req;
-       }
-
-       /*
-        * It is possible that the SDMA engine would have processed all the
-        * submitted packets by the time we get here. Therefore, only set
-        * packet queue state to ACTIVE if there are still uncompleted
-        * requests.
-        */
-       if (atomic_read(&pq->n_reqs))
-               xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
-
-       /*
-        * This is a somewhat blocking send implementation.
-        * The driver will block the caller until all packets of the
-        * request have been submitted to the SDMA engine. However, it
-        * will not wait for send completions.
-        */
-       while (!test_bit(SDMA_REQ_SEND_DONE, &req->flags)) {
-               ret = user_sdma_send_pkts(req, pcount);
-               if (ret < 0) {
-                       if (ret != -EBUSY) {
-                               req->status = ret;
-                               set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
-                               if (ACCESS_ONCE(req->seqcomp) ==
-                                   req->seqsubmitted - 1)
-                                       goto free_req;
-                               return ret;
-                       }
-                       wait_event_interruptible_timeout(
-                               pq->busy.wait_dma,
-                               (pq->state == SDMA_PKT_Q_ACTIVE),
-                               msecs_to_jiffies(
-                                       SDMA_IOWAIT_TIMEOUT));
-               }
-       }
-       *count += idx;
-       return 0;
-free_req:
-       user_sdma_free_request(req, true);
-       pq_update(pq);
-       set_comp_state(pq, cq, info.comp_idx, ERROR, req->status);
-       return ret;
-}
-
-static inline u32 compute_data_length(struct user_sdma_request *req,
-                                     struct user_sdma_txreq *tx)
-{
-       /*
-        * Determine the proper size of the packet data.
-        * The size of the data of the first packet is in the header
-        * template. However, it includes the header and ICRC, which need
-        * to be subtracted.
-        * The size of the remaining packets is the minimum of the frag
-        * size (MTU) or remaining data in the request.
-        */
-       u32 len;
-
-       if (!req->seqnum) {
-               len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
-                      (sizeof(tx->hdr) - 4));
-       } else if (req_opcode(req->info.ctrl) == EXPECTED) {
-               u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
-                       PAGE_SIZE;
-               /*
-                * Get the data length based on the remaining space in the
-                * TID pair.
-                */
-               len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
-               /* If we've filled up the TID pair, move to the next one. */
-               if (unlikely(!len) && ++req->tididx < req->n_tids &&
-                   req->tids[req->tididx]) {
-                       tidlen = EXP_TID_GET(req->tids[req->tididx],
-                                            LEN) * PAGE_SIZE;
-                       req->tidoffset = 0;
-                       len = min_t(u32, tidlen, req->info.fragsize);
-               }
-               /*
-                * Since the TID pairs map entire pages, make sure that we
-                * are not going to try to send more data than we have
-                * remaining.
-                */
-               len = min(len, req->data_len - req->sent);
-       } else {
-               len = min(req->data_len - req->sent, (u32)req->info.fragsize);
-       }
-       SDMA_DBG(req, "Data Length = %u", len);
-       return len;
-}
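
The first-packet branch above is the inverse of the get_lrh_len() helper below: LRH.PktLen is carried in 32-bit words, so shifting left by two gives the on-the-wire byte count, and subtracting the header (minus the PBC, which the LRH length does not cover) together with the 4-byte ICRC leaves the payload. A standalone sketch of that arithmetic, with the PBC and header sizes assumed purely for illustration:

#include <assert.h>
#include <stdint.h>

#define PBC_BYTES   8u   /* assumed sizeof(hdr.pbc)                     */
#define HDR_BYTES  64u   /* assumed sizeof(struct hfi1_pkt_header)      */
#define ICRC_BYTES  4u

static uint32_t lrh_len(uint32_t payload)        /* mirrors get_lrh_len() */
{
        return (HDR_BYTES - PBC_BYTES) + ICRC_BYTES + payload;
}

static uint32_t first_pkt_payload(uint16_t lrh_pktlen_dw)
{
        /* mirrors the seqnum == 0 case in compute_data_length() */
        return ((uint32_t)lrh_pktlen_dw << 2) - (HDR_BYTES - 4u);
}

int main(void)
{
        uint32_t payload = 2048;
        uint16_t pktlen_dw = lrh_len(payload) >> 2;  /* what user space put in LRH[2] */

        assert(first_pkt_payload(pktlen_dw) == payload);
        return 0;
}
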
-
-static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
-{
-       /* (Size of complete header - size of PBC) + 4B ICRC + data length */
-       return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
-}
-
-static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
-{
-       int ret = 0;
-       unsigned npkts = 0;
-       struct user_sdma_txreq *tx = NULL;
-       struct hfi1_user_sdma_pkt_q *pq = NULL;
-       struct user_sdma_iovec *iovec = NULL;
-
-       if (!req->pq)
-               return -EINVAL;
-
-       pq = req->pq;
-
-       /* If tx completion has reported an error, we are done. */
-       if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) {
-               set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
-               return -EFAULT;
-       }
-
-       /*
-        * Check if we might have sent the entire request already
-        */
-       if (unlikely(req->seqnum == req->info.npkts)) {
-               if (!list_empty(&req->txps))
-                       goto dosend;
-               return ret;
-       }
-
-       if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
-               maxpkts = req->info.npkts - req->seqnum;
-
-       while (npkts < maxpkts) {
-               u32 datalen = 0, queued = 0, data_sent = 0;
-               u64 iov_offset = 0;
-
-               /*
-                * Check whether any of the completions have come back
-                * with errors. If so, we are not going to process any
-                * more packets from this request.
-                */
-               if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) {
-                       set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
-                       return -EFAULT;
-               }
-
-               tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
-               if (!tx)
-                       return -ENOMEM;
-
-               tx->flags = 0;
-               tx->req = req;
-               tx->busycount = 0;
-               INIT_LIST_HEAD(&tx->list);
-
-               if (req->seqnum == req->info.npkts - 1)
-                       tx->flags |= TXREQ_FLAGS_REQ_LAST_PKT;
-
-               /*
-                * Calculate the payload size - this is the minimum of the
-                * fragment (MTU) size and the remaining bytes in the request,
-                * but only if we have payload data.
-                */
-               if (req->data_len) {
-                       iovec = &req->iovs[req->iov_idx];
-                       if (ACCESS_ONCE(iovec->offset) == iovec->iov.iov_len) {
-                               if (++req->iov_idx == req->data_iovs) {
-                                       ret = -EFAULT;
-                                       goto free_txreq;
-                               }
-                               iovec = &req->iovs[req->iov_idx];
-                               WARN_ON(iovec->offset);
-                       }
-
-                       datalen = compute_data_length(req, tx);
-                       if (!datalen) {
-                               SDMA_DBG(req,
-                                        "Request has data but pkt len is 0");
-                               ret = -EFAULT;
-                               goto free_tx;
-                       }
-               }
-
-               if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags)) {
-                       if (!req->seqnum) {
-                               u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
-                               u32 lrhlen = get_lrh_len(req->hdr, datalen);
-                               /*
-                                * Copy the request header into the tx header
-                                * because the HW needs a cacheline-aligned
-                                * address.
-                                * This copy could be optimized out if the hdr
-                                * member of user_sdma_request were also
-                                * cacheline aligned.
-                                */
-                               memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
-                               if (PBC2LRH(pbclen) != lrhlen) {
-                                       pbclen = (pbclen & 0xf000) |
-                                               LRH2PBC(lrhlen);
-                                       tx->hdr.pbc[0] = cpu_to_le16(pbclen);
-                               }
-                               ret = sdma_txinit_ahg(&tx->txreq,
-                                                     SDMA_TXREQ_F_AHG_COPY,
-                                                     sizeof(tx->hdr) + datalen,
-                                                     req->ahg_idx, 0, NULL, 0,
-                                                     user_sdma_txreq_cb);
-                               if (ret)
-                                       goto free_tx;
-                               ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq,
-                                                       &tx->hdr,
-                                                       sizeof(tx->hdr));
-                               if (ret)
-                                       goto free_txreq;
-                       } else {
-                               int changes;
-
-                               changes = set_txreq_header_ahg(req, tx,
-                                                              datalen);
-                               if (changes < 0)
-                                       goto free_tx;
-                               sdma_txinit_ahg(&tx->txreq,
-                                               SDMA_TXREQ_F_USE_AHG,
-                                               datalen, req->ahg_idx, changes,
-                                               req->ahg, sizeof(req->hdr),
-                                               user_sdma_txreq_cb);
-                       }
-               } else {
-                       ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
-                                         datalen, user_sdma_txreq_cb);
-                       if (ret)
-                               goto free_tx;
-                       /*
-                        * Modify the header for this packet. This only needs
-                        * to be done if we are not going to use AHG. Otherwise,
-                        * the HW will do it based on the changes we gave it
-                        * during sdma_txinit_ahg().
-                        */
-                       ret = set_txreq_header(req, tx, datalen);
-                       if (ret)
-                               goto free_txreq;
-               }
-
-               /*
-                * If the request contains any data vectors, add up to
-                * fragsize bytes to the descriptor.
-                */
-               while (queued < datalen &&
-                      (req->sent + data_sent) < req->data_len) {
-                       unsigned long base, offset;
-                       unsigned pageidx, len;
-
-                       base = (unsigned long)iovec->iov.iov_base;
-                       offset = offset_in_page(base + iovec->offset +
-                                               iov_offset);
-                       pageidx = (((iovec->offset + iov_offset +
-                                    base) - (base & PAGE_MASK)) >> PAGE_SHIFT);
-                       len = offset + req->info.fragsize > PAGE_SIZE ?
-                               PAGE_SIZE - offset : req->info.fragsize;
-                       len = min((datalen - queued), len);
-                       ret = sdma_txadd_page(pq->dd, &tx->txreq,
-                                             iovec->pages[pageidx],
-                                             offset, len);
-                       if (ret) {
-                               SDMA_DBG(req, "SDMA txreq add page failed %d\n",
-                                        ret);
-                               goto free_txreq;
-                       }
-                       iov_offset += len;
-                       queued += len;
-                       data_sent += len;
-                       if (unlikely(queued < datalen &&
-                                    pageidx == iovec->npages &&
-                                    req->iov_idx < req->data_iovs - 1)) {
-                               iovec->offset += iov_offset;
-                               iovec = &req->iovs[++req->iov_idx];
-                               iov_offset = 0;
-                       }
-               }
-               /*
-                * The txreq was queued successfully, so we can update
-                * the counters.
-                */
-               req->koffset += datalen;
-               if (req_opcode(req->info.ctrl) == EXPECTED)
-                       req->tidoffset += datalen;
-               req->sent += data_sent;
-               if (req->data_len)
-                       iovec->offset += iov_offset;
-               list_add_tail(&tx->txreq.list, &req->txps);
-               /*
-                * It is important to increment this here as it is used to
-                * generate the BTH.PSN and, therefore, can't be bulk-updated
-                * outside of the loop.
-                */
-               tx->seqnum = req->seqnum++;
-               npkts++;
-       }
-dosend:
-       ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps);
-       if (list_empty(&req->txps)) {
-               req->seqsubmitted = req->seqnum;
-               if (req->seqnum == req->info.npkts) {
-                       set_bit(SDMA_REQ_SEND_DONE, &req->flags);
-                       /*
-                        * The txreq has already been submitted to the HW queue
-                        * so we can free the AHG entry now. Corruption will not
-                        * happen due to the sequential manner in which
-                        * descriptors are processed.
-                        */
-                       if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags))
-                               sdma_ahg_free(req->sde, req->ahg_idx);
-               }
-       } else if (ret > 0) {
-               req->seqsubmitted += ret;
-               ret = 0;
-       }
-       return ret;
-
-free_txreq:
-       sdma_txclean(pq->dd, &tx->txreq);
-free_tx:
-       kmem_cache_free(pq->txreq_cache, tx);
-       return ret;
-}
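
The per-page split inside the loop above reduces to three steps: find the offset of the next byte within its page, find which pinned page it lands in relative to the page containing iov_base, and clamp the chunk to the page boundary, the fragment size, and the bytes still needed for this packet. A small sketch of the same arithmetic, assuming 4 KiB pages and made-up addresses:

#include <assert.h>

#define PG_SHIFT 12u                            /* assumed 4 KiB pages */
#define PG_SIZE  (1ull << PG_SHIFT)
#define PG_MASK  (~(PG_SIZE - 1))

int main(void)
{
        unsigned long long base      = 0x7f3a12345f00ull; /* hypothetical iov_base       */
        unsigned long long consumed  = 0x180;   /* iovec->offset + iov_offset so far     */
        unsigned long long fragsize  = 8192;    /* req->info.fragsize                    */
        unsigned long long remaining = 3000;    /* datalen - queued                      */

        unsigned long long offset  = (base + consumed) & (PG_SIZE - 1);
        unsigned long long pageidx = ((base + consumed) - (base & PG_MASK)) >> PG_SHIFT;
        unsigned long long len     = offset + fragsize > PG_SIZE ? PG_SIZE - offset
                                                                  : fragsize;
        if (len > remaining)
                len = remaining;

        /* 0x...5f00 + 0x180 lands 0x80 bytes into the second pinned page */
        assert(offset == 0x80);
        assert(pageidx == 1);
        assert(len == 3000);    /* capped by the bytes still needed, not the page */
        return 0;
}
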
-
-/*
- * How many pages in this iovec element?
- */
-static inline int num_user_pages(const struct iovec *iov)
-{
-       const unsigned long addr  = (unsigned long)iov->iov_base;
-       const unsigned long len   = iov->iov_len;
-       const unsigned long spage = addr & PAGE_MASK;
-       const unsigned long epage = (addr + len - 1) & PAGE_MASK;
-
-       return 1 + ((epage - spage) >> PAGE_SHIFT);
-}
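
num_user_pages() counts pages by comparing the page frames of the first and last byte, which handles a short buffer straddling a page boundary correctly. A standalone sketch, assuming 4 KiB pages:

#include <assert.h>

#define PG_SHIFT 12u                    /* assumed 4 KiB pages */
#define PG_SIZE  (1ul << PG_SHIFT)
#define PG_MASK  (~(PG_SIZE - 1))

static int npages(unsigned long addr, unsigned long len)
{
        unsigned long spage = addr & PG_MASK;
        unsigned long epage = (addr + len - 1) & PG_MASK;

        return 1 + (int)((epage - spage) >> PG_SHIFT);
}

int main(void)
{
        assert(npages(0x1000, 4096) == 1);   /* one exactly aligned page      */
        assert(npages(0x1ff8, 16)   == 2);   /* 16 bytes straddle a boundary  */
        assert(npages(0x1234, 8192) == 3);   /* unaligned 8 KiB touches three */
        return 0;
}
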
-
-static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
-{
-       u32 cleared = 0;
-       struct sdma_mmu_node *node, *ptr;
-       struct list_head to_evict = LIST_HEAD_INIT(to_evict);
-
-       spin_lock(&pq->evict_lock);
-       list_for_each_entry_safe_reverse(node, ptr, &pq->evict, list) {
-               /* Make sure that no one is still using the node. */
-               if (!atomic_read(&node->refcount)) {
-                       set_bit(SDMA_CACHE_NODE_EVICT, &node->flags);
-                       list_del_init(&node->list);
-                       list_add(&node->list, &to_evict);
-                       cleared += node->npages;
-                       if (cleared >= npages)
-                               break;
-               }
-       }
-       spin_unlock(&pq->evict_lock);
-
-       list_for_each_entry_safe(node, ptr, &to_evict, list)
-               hfi1_mmu_rb_remove(&pq->sdma_rb_root, &node->rb);
-
-       return cleared;
-}
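
sdma_cache_evict() collects idle nodes onto a private list under the spinlock and only afterwards, with the lock dropped, does the expensive RB-tree removal. A generic sketch of that collect-then-process pattern, using plain pthreads and a hand-rolled list rather than the kernel list and locking APIs:

#include <assert.h>
#include <pthread.h>

struct node { int busy; int npages; struct node *next; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* collect idle nodes under the lock, process them after dropping it */
static int evict(struct node **cache, int target)
{
        struct node *victims = NULL, **pp = cache, *n;
        int cleared = 0;

        pthread_mutex_lock(&lock);
        while ((n = *pp) && cleared < target) {
                if (!n->busy) {
                        *pp = n->next;          /* unlink from the cache    */
                        n->next = victims;      /* park on the private list */
                        victims = n;
                        cleared += n->npages;
                } else {
                        pp = &n->next;
                }
        }
        pthread_mutex_unlock(&lock);

        for (n = victims; n; n = n->next)
                ;                               /* slow teardown goes here, unlocked */
        return cleared;
}

int main(void)
{
        struct node c = { 1, 4, NULL }, b = { 0, 8, &c }, a = { 0, 2, &b };
        struct node *cache = &a;

        assert(evict(&cache, 6) == 10);  /* a (2) + b (8); c stays, it is busy */
        assert(cache == &c);
        return 0;
}
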
-
-static int pin_vector_pages(struct user_sdma_request *req,
-                           struct user_sdma_iovec *iovec) {
-       int ret = 0, pinned, npages, cleared;
-       struct page **pages;
-       struct hfi1_user_sdma_pkt_q *pq = req->pq;
-       struct sdma_mmu_node *node = NULL;
-       struct mmu_rb_node *rb_node;
-
-       rb_node = hfi1_mmu_rb_extract(&pq->sdma_rb_root,
-                                     (unsigned long)iovec->iov.iov_base,
-                                     iovec->iov.iov_len);
-       if (rb_node && !IS_ERR(rb_node))
-               node = container_of(rb_node, struct sdma_mmu_node, rb);
-       else
-               rb_node = NULL;
-
-       if (!node) {
-               node = kzalloc(sizeof(*node), GFP_KERNEL);
-               if (!node)
-                       return -ENOMEM;
-
-               node->rb.addr = (unsigned long)iovec->iov.iov_base;
-               node->pq = pq;
-               atomic_set(&node->refcount, 0);
-               INIT_LIST_HEAD(&node->list);
-       }
-
-       npages = num_user_pages(&iovec->iov);
-       if (node->npages < npages) {
-               pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
-               if (!pages) {
-                       SDMA_DBG(req, "Failed page array alloc");
-                       ret = -ENOMEM;
-                       goto bail;
-               }
-               memcpy(pages, node->pages, node->npages * sizeof(*pages));
-
-               npages -= node->npages;
-
-               /*
-                * If rb_node is NULL, this is a brand new node and is,
-                * therefore, not on the eviction list.
-                * If, however, rb_node is non-NULL, the node is already
-                * in the RB tree and, therefore, on the eviction list
-                * (nodes are unconditionally inserted into the eviction
-                * list). In that case, we have to remove the node before
-                * calling the eviction function in order to prevent it
-                * from freeing this node.
-                */
-               if (rb_node) {
-                       spin_lock(&pq->evict_lock);
-                       list_del_init(&node->list);
-                       spin_unlock(&pq->evict_lock);
-               }
-retry:
-               if (!hfi1_can_pin_pages(pq->dd, pq->n_locked, npages)) {
-                       cleared = sdma_cache_evict(pq, npages);
-                       if (cleared >= npages)
-                               goto retry;
-               }
-               pinned = hfi1_acquire_user_pages(
-                       ((unsigned long)iovec->iov.iov_base +
-                        (node->npages * PAGE_SIZE)), npages, 0,
-                       pages + node->npages);
-               if (pinned < 0) {
-                       kfree(pages);
-                       ret = pinned;
-                       goto bail;
-               }
-               if (pinned != npages) {
-                       unpin_vector_pages(current->mm, pages, node->npages,
-                                          pinned);
-                       ret = -EFAULT;
-                       goto bail;
-               }
-               kfree(node->pages);
-               node->rb.len = iovec->iov.iov_len;
-               node->pages = pages;
-               node->npages += pinned;
-               npages = node->npages;
-               spin_lock(&pq->evict_lock);
-               list_add(&node->list, &pq->evict);
-               pq->n_locked += pinned;
-               spin_unlock(&pq->evict_lock);
-       }
-       iovec->pages = node->pages;
-       iovec->npages = npages;
-
-       ret = hfi1_mmu_rb_insert(&req->pq->sdma_rb_root, &node->rb);
-       if (ret) {
-               spin_lock(&pq->evict_lock);
-               if (!list_empty(&node->list))
-                       list_del(&node->list);
-               pq->n_locked -= node->npages;
-               spin_unlock(&pq->evict_lock);
-               goto bail;
-       }
-       return 0;
-bail:
-       if (rb_node)
-               unpin_vector_pages(current->mm, node->pages, 0, node->npages);
-       kfree(node);
-       return ret;
-}
-
-static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
-                              unsigned start, unsigned npages)
-{
-       hfi1_release_user_pages(mm, pages + start, npages, 0);
-       kfree(pages);
-}
-
-static int check_header_template(struct user_sdma_request *req,
-                                struct hfi1_pkt_header *hdr, u32 lrhlen,
-                                u32 datalen)
-{
-       /*
-        * Perform safety checks for any type of packet:
-        *    - transfer size is a multiple of 64 bytes
-        *    - packet length is a multiple of 4 bytes
-        *    - entire request length is a multiple of 4 bytes
-        *    - packet length is not larger than the MTU size
-        *
-        * These checks are only done for the first packet of the
-        * transfer since the header is "given" to us by user space.
-        * For the remainder of the packets we compute the values.
-        */
-       if (req->info.fragsize % PIO_BLOCK_SIZE ||
-           lrhlen & 0x3 || req->data_len & 0x3  ||
-           lrhlen > get_lrh_len(*hdr, req->info.fragsize))
-               return -EINVAL;
-
-       if (req_opcode(req->info.ctrl) == EXPECTED) {
-               /*
-                * The header is checked only on the first packet. Furthermore,
-                * we ensure that at least one TID entry is copied when the
-                * request is submitted. Therefore, we don't have to verify that
-                * tididx points to something sane.
-                */
-               u32 tidval = req->tids[req->tididx],
-                       tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
-                       tididx = EXP_TID_GET(tidval, IDX),
-                       tidctrl = EXP_TID_GET(tidval, CTRL),
-                       tidoff;
-               __le32 kval = hdr->kdeth.ver_tid_offset;
-
-               tidoff = KDETH_GET(kval, OFFSET) *
-                         (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
-                          KDETH_OM_LARGE : KDETH_OM_SMALL);
-               /*
-                * Expected receive packets have the following
-                * additional checks:
-                *     - offset is not larger than the TID size
-                *     - TIDCtrl values match between header and TID array
-                *     - TID indexes match between header and TID array
-                */
-               if ((tidoff + datalen > tidlen) ||
-                   KDETH_GET(kval, TIDCTRL) != tidctrl ||
-                   KDETH_GET(kval, TID) != tididx)
-                       return -EINVAL;
-       }
-       return 0;
-}
-
-/*
- * Correctly set the BTH.PSN field based on the type of
- * transfer - eager packets can just increment the PSN, but
- * expected packets encode generation and sequence in the
- * BTH.PSN field, so just incrementing will result in errors.
- */
-static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
-{
-       u32 val = be32_to_cpu(bthpsn),
-               mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
-                       0xffffffull),
-               psn = val & mask;
-       if (expct)
-               psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK);
-       else
-               psn = psn + frags;
-       return psn & mask;
-}
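
For eager packets the PSN simply advances by the packet count; for expected packets only the low sequence bits advance while the generation bits above them are preserved. A sketch of that split, assuming an 11-bit sequence field purely for illustration (the real BTH_SEQ_MASK is defined in the driver headers) and the 24-bit non-extended PSN mask:

#include <assert.h>
#include <stdint.h>

#define SEQ_MASK 0x7ffu        /* assumed width of the sequence field   */
#define PSN_MASK 0xffffffu     /* 24-bit PSN, the non-extended-PSN case */

static uint32_t next_psn(uint32_t psn, int expected, uint32_t frags)
{
        if (expected)
                psn = (psn & ~SEQ_MASK) | ((psn + frags) & SEQ_MASK);
        else
                psn += frags;
        return psn & PSN_MASK;
}

int main(void)
{
        /* expected: the generation bits above the sequence survive a wrap */
        assert(next_psn(0x123800u | 0x7feu, 1, 4) == (0x123800u | 0x002u));

        /* eager: a plain increment, wrapping at 24 bits */
        assert(next_psn(0xfffffeu, 0, 3) == 0x000001u);
        return 0;
}
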
-
-static int set_txreq_header(struct user_sdma_request *req,
-                           struct user_sdma_txreq *tx, u32 datalen)
-{
-       struct hfi1_user_sdma_pkt_q *pq = req->pq;
-       struct hfi1_pkt_header *hdr = &tx->hdr;
-       u16 pbclen;
-       int ret;
-       u32 tidval = 0, lrhlen = get_lrh_len(*hdr, datalen);
-
-       /* Copy the header template to the request before modification */
-       memcpy(hdr, &req->hdr, sizeof(*hdr));
-
-       /*
-        * Check if the PBC and LRH length are mismatched. If so
-        * adjust both in the header.
-        */
-       pbclen = le16_to_cpu(hdr->pbc[0]);
-       if (PBC2LRH(pbclen) != lrhlen) {
-               pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
-               hdr->pbc[0] = cpu_to_le16(pbclen);
-               hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
-               /*
-                * Third packet
-                * This is the first packet in the sequence that has
-                * a "static" size that can be used for the rest of
-                * the packets (besides the last one).
-                */
-               if (unlikely(req->seqnum == 2)) {
-                       /*
-                        * From this point on the lengths in both the
-                        * PBC and LRH are the same until the last
-                        * packet.
-                        * Adjust the template so we don't have to update
-                        * every packet.
-                        */
-                       req->hdr.pbc[0] = hdr->pbc[0];
-                       req->hdr.lrh[2] = hdr->lrh[2];
-               }
-       }
-       /*
-        * We only have to modify the header if this is not the
-        * first packet in the request. Otherwise, we use the
-        * header given to us.
-        */
-       if (unlikely(!req->seqnum)) {
-               ret = check_header_template(req, hdr, lrhlen, datalen);
-               if (ret)
-                       return ret;
-               goto done;
-       }
-
-       hdr->bth[2] = cpu_to_be32(
-               set_pkt_bth_psn(hdr->bth[2],
-                               (req_opcode(req->info.ctrl) == EXPECTED),
-                               req->seqnum));
-
-       /* Set ACK request on last packet */
-       if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
-               hdr->bth[2] |= cpu_to_be32(1UL << 31);
-
-       /* Set the new offset */
-       hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
-       /* Expected packets have to fill in the new TID information */
-       if (req_opcode(req->info.ctrl) == EXPECTED) {
-               tidval = req->tids[req->tididx];
-               /*
-                * If the offset puts us at the end of the current TID,
-                * advance everything.
-                */
-               if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
-                                        PAGE_SIZE)) {
-                       req->tidoffset = 0;
-                       /*
-                        * Since we don't copy all the TIDs all at once,
-                        * we have to check again.
-                        */
-                       if (++req->tididx > req->n_tids - 1 ||
-                           !req->tids[req->tididx]) {
-                               return -EINVAL;
-                       }
-                       tidval = req->tids[req->tididx];
-               }
-               req->omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
-                       KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE : KDETH_OM_SMALL;
-               /* Set KDETH.TIDCtrl based on value for this TID. */
-               KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
-                         EXP_TID_GET(tidval, CTRL));
-               /* Set KDETH.TID based on value for this TID */
-               KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
-                         EXP_TID_GET(tidval, IDX));
-               /* Clear KDETH.SH only on the last packet */
-               if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
-                       KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
-               /*
-                * Set the KDETH.OFFSET and KDETH.OM based on size of
-                * transfer.
-                */
-               SDMA_DBG(req, "TID offset %ubytes %uunits om%u",
-                        req->tidoffset, req->tidoffset / req->omfactor,
-                        !!(req->omfactor - KDETH_OM_SMALL));
-               KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
-                         req->tidoffset / req->omfactor);
-               KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
-                         !!(req->omfactor - KDETH_OM_SMALL));
-       }
-done:
-       trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
-                                   req->info.comp_idx, hdr, tidval);
-       return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
-}
-
-static int set_txreq_header_ahg(struct user_sdma_request *req,
-                               struct user_sdma_txreq *tx, u32 len)
-{
-       int diff = 0;
-       struct hfi1_user_sdma_pkt_q *pq = req->pq;
-       struct hfi1_pkt_header *hdr = &req->hdr;
-       u16 pbclen = le16_to_cpu(hdr->pbc[0]);
-       u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, len);
-
-       if (PBC2LRH(pbclen) != lrhlen) {
-               /* PBC.PbcLengthDWs */
-               AHG_HEADER_SET(req->ahg, diff, 0, 0, 12,
-                              cpu_to_le16(LRH2PBC(lrhlen)));
-               /* LRH.PktLen (we need the full 16 bits due to byte swap) */
-               AHG_HEADER_SET(req->ahg, diff, 3, 0, 16,
-                              cpu_to_be16(lrhlen >> 2));
-       }
-
-       /*
-        * Do the common updates
-        */
-       /* BTH.PSN and BTH.A */
-       val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
-               (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
-       if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
-               val32 |= 1UL << 31;
-       AHG_HEADER_SET(req->ahg, diff, 6, 0, 16, cpu_to_be16(val32 >> 16));
-       AHG_HEADER_SET(req->ahg, diff, 6, 16, 16, cpu_to_be16(val32 & 0xffff));
-       /* KDETH.Offset */
-       AHG_HEADER_SET(req->ahg, diff, 15, 0, 16,
-                      cpu_to_le16(req->koffset & 0xffff));
-       AHG_HEADER_SET(req->ahg, diff, 15, 16, 16,
-                      cpu_to_le16(req->koffset >> 16));
-       if (req_opcode(req->info.ctrl) == EXPECTED) {
-               __le16 val;
-
-               tidval = req->tids[req->tididx];
-
-               /*
-                * If the offset puts us at the end of the current TID,
-                * advance everything.
-                */
-               if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
-                                        PAGE_SIZE)) {
-                       req->tidoffset = 0;
-                       /*
-                        * Since we don't copy all the TIDs all at once,
-                        * we have to check again.
-                        */
-                       if (++req->tididx > req->n_tids - 1 ||
-                           !req->tids[req->tididx]) {
-                               return -EINVAL;
-                       }
-                       tidval = req->tids[req->tididx];
-               }
-               req->omfactor = ((EXP_TID_GET(tidval, LEN) *
-                                 PAGE_SIZE) >=
-                                KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE :
-                       KDETH_OM_SMALL;
-               /* KDETH.OM and KDETH.OFFSET (TID) */
-               AHG_HEADER_SET(req->ahg, diff, 7, 0, 16,
-                              ((!!(req->omfactor - KDETH_OM_SMALL)) << 15 |
-                               ((req->tidoffset / req->omfactor) & 0x7fff)));
-               /* KDETH.TIDCtrl, KDETH.TID */
-               val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
-                                       (EXP_TID_GET(tidval, IDX) & 0x3ff));
-               /* Clear KDETH.SH on last packet */
-               if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT)) {
-                       val |= cpu_to_le16(KDETH_GET(hdr->kdeth.ver_tid_offset,
-                                                               INTR) >> 16);
-                       val &= cpu_to_le16(~(1U << 13));
-                       AHG_HEADER_SET(req->ahg, diff, 7, 16, 14, val);
-               } else {
-                       AHG_HEADER_SET(req->ahg, diff, 7, 16, 12, val);
-               }
-       }
-
-       trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
-                                       req->info.comp_idx, req->sde->this_idx,
-                                       req->ahg_idx, req->ahg, diff, tidval);
-       return diff;
-}
-
-/*
- * SDMA tx request completion callback. Called when the SDMA progress
- * state machine gets notification that the SDMA descriptors for this
- * tx request have been processed by the DMA engine. Called in
- * interrupt context.
- */
-static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
-{
-       struct user_sdma_txreq *tx =
-               container_of(txreq, struct user_sdma_txreq, txreq);
-       struct user_sdma_request *req;
-       struct hfi1_user_sdma_pkt_q *pq;
-       struct hfi1_user_sdma_comp_q *cq;
-       u16 idx;
-
-       if (!tx->req)
-               return;
-
-       req = tx->req;
-       pq = req->pq;
-       cq = req->cq;
-
-       if (status != SDMA_TXREQ_S_OK) {
-               SDMA_DBG(req, "SDMA completion with error %d",
-                        status);
-               set_bit(SDMA_REQ_HAS_ERROR, &req->flags);
-       }
-
-       req->seqcomp = tx->seqnum;
-       kmem_cache_free(pq->txreq_cache, tx);
-       tx = NULL;
-
-       idx = req->info.comp_idx;
-       if (req->status == -1 && status == SDMA_TXREQ_S_OK) {
-               if (req->seqcomp == req->info.npkts - 1) {
-                       req->status = 0;
-                       user_sdma_free_request(req, false);
-                       pq_update(pq);
-                       set_comp_state(pq, cq, idx, COMPLETE, 0);
-               }
-       } else {
-               if (status != SDMA_TXREQ_S_OK)
-                       req->status = status;
-               if (req->seqcomp == (ACCESS_ONCE(req->seqsubmitted) - 1) &&
-                   (test_bit(SDMA_REQ_SEND_DONE, &req->flags) ||
-                    test_bit(SDMA_REQ_DONE_ERROR, &req->flags))) {
-                       user_sdma_free_request(req, false);
-                       pq_update(pq);
-                       set_comp_state(pq, cq, idx, ERROR, req->status);
-               }
-       }
-}
-
-static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
-{
-       if (atomic_dec_and_test(&pq->n_reqs)) {
-               xchg(&pq->state, SDMA_PKT_Q_INACTIVE);
-               wake_up(&pq->wait);
-       }
-}
-
-static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
-{
-       if (!list_empty(&req->txps)) {
-               struct sdma_txreq *t, *p;
-
-               list_for_each_entry_safe(t, p, &req->txps, list) {
-                       struct user_sdma_txreq *tx =
-                               container_of(t, struct user_sdma_txreq, txreq);
-                       list_del_init(&t->list);
-                       sdma_txclean(req->pq->dd, t);
-                       kmem_cache_free(req->pq->txreq_cache, tx);
-               }
-       }
-       if (req->data_iovs) {
-               struct sdma_mmu_node *node;
-               struct mmu_rb_node *mnode;
-               int i;
-
-               for (i = 0; i < req->data_iovs; i++) {
-                       mnode = hfi1_mmu_rb_search(
-                               &req->pq->sdma_rb_root,
-                               (unsigned long)req->iovs[i].iov.iov_base,
-                               req->iovs[i].iov.iov_len);
-                       if (!mnode || IS_ERR(mnode))
-                               continue;
-
-                       node = container_of(mnode, struct sdma_mmu_node, rb);
-                       if (unpin)
-                               hfi1_mmu_rb_remove(&req->pq->sdma_rb_root,
-                                                  &node->rb);
-                       else
-                               atomic_dec(&node->refcount);
-               }
-       }
-       kfree(req->tids);
-       clear_bit(SDMA_REQ_IN_USE, &req->flags);
-}
-
-static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
-                                 struct hfi1_user_sdma_comp_q *cq,
-                                 u16 idx, enum hfi1_sdma_comp_state state,
-                                 int ret)
-{
-       hfi1_cdbg(SDMA, "[%u:%u:%u:%u] Setting completion status %u %d",
-                 pq->dd->unit, pq->ctxt, pq->subctxt, idx, state, ret);
-       cq->comps[idx].status = state;
-       if (state == ERROR)
-               cq->comps[idx].errcode = -ret;
-       trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
-                                       idx, state, ret);
-}
-
-static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
-                          unsigned long len)
-{
-       return (bool)(node->addr == addr);
-}
-
-static int sdma_rb_insert(struct rb_root *root, struct mmu_rb_node *mnode)
-{
-       struct sdma_mmu_node *node =
-               container_of(mnode, struct sdma_mmu_node, rb);
-
-       atomic_inc(&node->refcount);
-       return 0;
-}
-
-static void sdma_rb_remove(struct rb_root *root, struct mmu_rb_node *mnode,
-                          struct mm_struct *mm)
-{
-       struct sdma_mmu_node *node =
-               container_of(mnode, struct sdma_mmu_node, rb);
-
-       spin_lock(&node->pq->evict_lock);
-       /*
-        * If we've been called by the MMU notifier and this node has
-        * already been scheduled for eviction, the eviction function
-        * will take care of freeing it.
-        * We have to take the lock above first because we are racing
-        * against the setting of the bit in the eviction function.
-        */
-       if (mm && test_bit(SDMA_CACHE_NODE_EVICT, &node->flags)) {
-               spin_unlock(&node->pq->evict_lock);
-               return;
-       }
-
-       if (!list_empty(&node->list))
-               list_del(&node->list);
-       node->pq->n_locked -= node->npages;
-       spin_unlock(&node->pq->evict_lock);
-
-       /*
-        * If mm is set, we are being called by the MMU notifier and we
-        * should not pass a mm_struct to unpin_vector_pages(). This is to
-        * prevent a deadlock when hfi1_release_user_pages() attempts to
-        * take the mmap_sem, which the MMU notifier has already taken.
-        */
-       unpin_vector_pages(mm ? NULL : current->mm, node->pages, 0,
-                          node->npages);
-       /*
-        * If called by the MMU notifier, we have to adjust the pinned
-        * page count ourselves.
-        */
-       if (mm)
-               mm->pinned_vm -= node->npages;
-       kfree(node);
-}
-
-static int sdma_rb_invalidate(struct rb_root *root, struct mmu_rb_node *mnode)
-{
-       struct sdma_mmu_node *node =
-               container_of(mnode, struct sdma_mmu_node, rb);
-
-       if (!atomic_read(&node->refcount))
-               return 1;
-       return 0;
-}
diff --git a/drivers/staging/rdma/hfi1/user_sdma.h b/drivers/staging/rdma/hfi1/user_sdma.h
deleted file mode 100644
index b9240e3..0000000
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#include <linux/device.h>
-#include <linux/wait.h>
-
-#include "common.h"
-#include "iowait.h"
-#include "user_exp_rcv.h"
-
-extern uint extended_psn;
-
-struct hfi1_user_sdma_pkt_q {
-       struct list_head list;
-       unsigned ctxt;
-       unsigned subctxt;
-       u16 n_max_reqs;
-       atomic_t n_reqs;
-       u16 reqidx;
-       struct hfi1_devdata *dd;
-       struct kmem_cache *txreq_cache;
-       struct user_sdma_request *reqs;
-       struct iowait busy;
-       unsigned state;
-       wait_queue_head_t wait;
-       unsigned long unpinned;
-       struct rb_root sdma_rb_root;
-       u32 n_locked;
-       struct list_head evict;
-       spinlock_t evict_lock; /* protect evict and n_locked */
-};
-
-struct hfi1_user_sdma_comp_q {
-       u16 nentries;
-       struct hfi1_sdma_comp_entry *comps;
-};
-
-int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *, struct file *);
-int hfi1_user_sdma_free_queues(struct hfi1_filedata *);
-int hfi1_user_sdma_process_request(struct file *, struct iovec *, unsigned long,
-                                  unsigned long *);
diff --git a/drivers/staging/rdma/hfi1/verbs.c b/drivers/staging/rdma/hfi1/verbs.c
deleted file mode 100644
index 9cdc85f..0000000
+++ /dev/null
@@ -1,1764 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <rdma/ib_mad.h>
-#include <rdma/ib_user_verbs.h>
-#include <linux/io.h>
-#include <linux/module.h>
-#include <linux/utsname.h>
-#include <linux/rculist.h>
-#include <linux/mm.h>
-#include <linux/random.h>
-#include <linux/vmalloc.h>
-
-#include "hfi.h"
-#include "common.h"
-#include "device.h"
-#include "trace.h"
-#include "qp.h"
-#include "verbs_txreq.h"
-
-static unsigned int hfi1_lkey_table_size = 16;
-module_param_named(lkey_table_size, hfi1_lkey_table_size, uint,
-                  S_IRUGO);
-MODULE_PARM_DESC(lkey_table_size,
-                "LKEY table size in bits (2^n, 1 <= n <= 23)");
-
-static unsigned int hfi1_max_pds = 0xFFFF;
-module_param_named(max_pds, hfi1_max_pds, uint, S_IRUGO);
-MODULE_PARM_DESC(max_pds,
-                "Maximum number of protection domains to support");
-
-static unsigned int hfi1_max_ahs = 0xFFFF;
-module_param_named(max_ahs, hfi1_max_ahs, uint, S_IRUGO);
-MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support");
-
-unsigned int hfi1_max_cqes = 0x2FFFF;
-module_param_named(max_cqes, hfi1_max_cqes, uint, S_IRUGO);
-MODULE_PARM_DESC(max_cqes,
-                "Maximum number of completion queue entries to support");
-
-unsigned int hfi1_max_cqs = 0x1FFFF;
-module_param_named(max_cqs, hfi1_max_cqs, uint, S_IRUGO);
-MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support");
-
-unsigned int hfi1_max_qp_wrs = 0x3FFF;
-module_param_named(max_qp_wrs, hfi1_max_qp_wrs, uint, S_IRUGO);
-MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support");
-
-unsigned int hfi1_max_qps = 16384;
-module_param_named(max_qps, hfi1_max_qps, uint, S_IRUGO);
-MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support");
-
-unsigned int hfi1_max_sges = 0x60;
-module_param_named(max_sges, hfi1_max_sges, uint, S_IRUGO);
-MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support");
-
-unsigned int hfi1_max_mcast_grps = 16384;
-module_param_named(max_mcast_grps, hfi1_max_mcast_grps, uint, S_IRUGO);
-MODULE_PARM_DESC(max_mcast_grps,
-                "Maximum number of multicast groups to support");
-
-unsigned int hfi1_max_mcast_qp_attached = 16;
-module_param_named(max_mcast_qp_attached, hfi1_max_mcast_qp_attached,
-                  uint, S_IRUGO);
-MODULE_PARM_DESC(max_mcast_qp_attached,
-                "Maximum number of attached QPs to support");
-
-unsigned int hfi1_max_srqs = 1024;
-module_param_named(max_srqs, hfi1_max_srqs, uint, S_IRUGO);
-MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support");
-
-unsigned int hfi1_max_srq_sges = 128;
-module_param_named(max_srq_sges, hfi1_max_srq_sges, uint, S_IRUGO);
-MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support");
-
-unsigned int hfi1_max_srq_wrs = 0x1FFFF;
-module_param_named(max_srq_wrs, hfi1_max_srq_wrs, uint, S_IRUGO);
-MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs to support");
-
-unsigned short piothreshold = 256;
-module_param(piothreshold, ushort, S_IRUGO);
-MODULE_PARM_DESC(piothreshold, "size used to determine sdma vs. pio");
-
-#define COPY_CACHELESS 1
-#define COPY_ADAPTIVE  2
-static unsigned int sge_copy_mode;
-module_param(sge_copy_mode, uint, S_IRUGO);
-MODULE_PARM_DESC(sge_copy_mode,
-                "Verbs copy mode: 0 use memcpy, 1 use cacheless copy, 2 adapt based on WSS");
-
-static void verbs_sdma_complete(
-       struct sdma_txreq *cookie,
-       int status);
-
-static int pio_wait(struct rvt_qp *qp,
-                   struct send_context *sc,
-                   struct hfi1_pkt_state *ps,
-                   u32 flag);
-
-/* Length of buffer to create verbs txreq cache name */
-#define TXREQ_NAME_LEN 24
-
-static uint wss_threshold;
-module_param(wss_threshold, uint, S_IRUGO);
-MODULE_PARM_DESC(wss_threshold, "Percentage (1-100) of LLC to use as a threshold for a cacheless copy");
-static uint wss_clean_period = 256;
-module_param(wss_clean_period, uint, S_IRUGO);
-MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the page copy table is cleaned");
-
-/* memory working set size */
-struct hfi1_wss {
-       unsigned long *entries;
-       atomic_t total_count;
-       atomic_t clean_counter;
-       atomic_t clean_entry;
-
-       int threshold;
-       int num_entries;
-       long pages_mask;
-};
-
-static struct hfi1_wss wss;
-
-int hfi1_wss_init(void)
-{
-       long llc_size;
-       long llc_bits;
-       long table_size;
-       long table_bits;
-
-       /* check for a valid percent range - default to 80 if none or invalid */
-       if (wss_threshold < 1 || wss_threshold > 100)
-               wss_threshold = 80;
-       /* reject a wildly large period */
-       if (wss_clean_period > 1000000)
-               wss_clean_period = 256;
-       /* reject a zero period */
-       if (wss_clean_period == 0)
-               wss_clean_period = 1;
-
-       /*
-        * Calculate the table size - the next power of 2 larger than the
-        * LLC size.  LLC size is in KiB.
-        */
-       llc_size = wss_llc_size() * 1024;
-       table_size = roundup_pow_of_two(llc_size);
-
-       /* one bit per page in rounded up table */
-       llc_bits = llc_size / PAGE_SIZE;
-       table_bits = table_size / PAGE_SIZE;
-       wss.pages_mask = table_bits - 1;
-       wss.num_entries = table_bits / BITS_PER_LONG;
-
-       wss.threshold = (llc_bits * wss_threshold) / 100;
-       if (wss.threshold == 0)
-               wss.threshold = 1;
-
-       atomic_set(&wss.clean_counter, wss_clean_period);
-
-       wss.entries = kcalloc(wss.num_entries, sizeof(*wss.entries),
-                             GFP_KERNEL);
-       if (!wss.entries) {
-               hfi1_wss_exit();
-               return -ENOMEM;
-       }
-
-       return 0;
-}
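
The sizing in hfi1_wss_init() rounds the LLC byte size up to a power of two, tracks one bit per page of that rounded size, and packs the bits into longs. Worked numbers for a hypothetical 30 MiB LLC with 4 KiB pages on a 64-bit build:

#include <assert.h>

int main(void)
{
        /* hypothetical platform numbers, not read from any real LLC */
        long llc_size      = 30l * 1024 * 1024;   /* wss_llc_size() * 1024 */
        long table_size    = 32l * 1024 * 1024;   /* next power of two     */
        long page_size     = 4096;
        long bits_per_long = 64;

        long llc_bits   = llc_size / page_size;     /* 7680 */
        long table_bits = table_size / page_size;   /* 8192 */

        assert((table_bits & (table_bits - 1)) == 0);   /* power of two       */
        assert(table_bits / bits_per_long == 128);      /* wss.num_entries    */
        assert(llc_bits * 80 / 100 == 6144);            /* default threshold  */
        return 0;
}
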
-
-void hfi1_wss_exit(void)
-{
-       /* coded to handle partially initialized and repeat callers */
-       kfree(wss.entries);
-       wss.entries = NULL;
-}
-
-/*
- * Advance the clean counter.  When the clean period has expired,
- * clean an entry.
- *
- * This is implemented in atomics to avoid locking.  Because multiple
- * variables are involved, it can be racy, which can lead to slightly
- * inaccurate information.  Since this is only a heuristic, this is
- * OK.  Any inaccuracies will clean themselves out as the counter
- * advances.  That said, it is unlikely the entry clean operation will
- * race - the next possible racer will not start until the next clean
- * period.
- *
- * The clean counter is implemented as a decrement to zero.  When zero
- * is reached an entry is cleaned.
- */
-static void wss_advance_clean_counter(void)
-{
-       int entry;
-       int weight;
-       unsigned long bits;
-
-       /* become the cleaner if we decrement the counter to zero */
-       if (atomic_dec_and_test(&wss.clean_counter)) {
-               /*
-                * Set, not add, the clean period.  This avoids an issue
-                * where the counter could decrement below the clean period.
-                * Doing a set can result in lost decrements, slowing the
-                * clean advance.  Since this is a heuristic, this possible
-                * slowdown is OK.
-                *
-                * An alternative is to loop, advancing the counter by a
-                * clean period until the result is > 0. However, this could
-                * lead to several threads keeping another in the clean loop.
-                * This could be mitigated by limiting the number of times
-                * we stay in the loop.
-                */
-               atomic_set(&wss.clean_counter, wss_clean_period);
-
-               /*
-                * Uniquely grab the entry to clean and move to next.
-                * The current entry is always the lower bits of
-                * wss.clean_entry.  The table size, wss.num_entries,
-                * is always a power-of-2.
-                */
-               entry = (atomic_inc_return(&wss.clean_entry) - 1)
-                       & (wss.num_entries - 1);
-
-               /* clear the entry and count the bits */
-               bits = xchg(&wss.entries[entry], 0);
-               weight = hweight64((u64)bits);
-               /* only adjust the contended total count if needed */
-               if (weight)
-                       atomic_sub(weight, &wss.total_count);
-       }
-}
-
-/*
- * Insert the given address into the working set array.
- */
-static void wss_insert(void *address)
-{
-       u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss.pages_mask;
-       u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */
-       u32 nr = page & (BITS_PER_LONG - 1);
-
-       if (!test_and_set_bit(nr, &wss.entries[entry]))
-               atomic_inc(&wss.total_count);
-
-       wss_advance_clean_counter();
-}
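
wss_insert() maps an address to a single bit: take the page frame number, mask it into the table, then split it into a long index and a bit index. A sketch of that mapping, reusing the assumed table size from the sizing example above:

#include <assert.h>
#include <stdint.h>

#define PG_SHIFT      12u      /* assumed 4 KiB pages                  */
#define PAGES_MASK    8191u    /* table_bits - 1 from the sizing above */
#define BITS_PER_LONG 64u      /* assumed 64-bit longs                 */

int main(void)
{
        uint64_t address = 0x7f0012345678ull;          /* hypothetical vaddr   */
        uint32_t page  = (uint32_t)((address >> PG_SHIFT) & PAGES_MASK);
        uint32_t entry = page / BITS_PER_LONG;         /* index into entries[] */
        uint32_t nr    = page & (BITS_PER_LONG - 1);   /* bit within that long */

        assert(page == 837);    /* 0x345: the low 13 bits of the page frame */
        assert(entry == 13);
        assert(nr == 5);
        return 0;
}
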
-
-/*
- * Is the working set larger than the threshold?
- */
-static inline int wss_exceeds_threshold(void)
-{
-       return atomic_read(&wss.total_count) >= wss.threshold;
-}
-
-/*
- * Translate ib_wr_opcode into ib_wc_opcode.
- */
-const enum ib_wc_opcode ib_hfi1_wc_opcode[] = {
-       [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
-       [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
-       [IB_WR_SEND] = IB_WC_SEND,
-       [IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
-       [IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
-       [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
-       [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD
-};
-
-/*
- * Length of header by opcode, 0 --> not supported
- */
-const u8 hdr_len_by_opcode[256] = {
-       /* RC */
-       [IB_OPCODE_RC_SEND_FIRST]                     = 12 + 8,
-       [IB_OPCODE_RC_SEND_MIDDLE]                    = 12 + 8,
-       [IB_OPCODE_RC_SEND_LAST]                      = 12 + 8,
-       [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE]       = 12 + 8 + 4,
-       [IB_OPCODE_RC_SEND_ONLY]                      = 12 + 8,
-       [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 4,
-       [IB_OPCODE_RC_RDMA_WRITE_FIRST]               = 12 + 8 + 16,
-       [IB_OPCODE_RC_RDMA_WRITE_MIDDLE]              = 12 + 8,
-       [IB_OPCODE_RC_RDMA_WRITE_LAST]                = 12 + 8,
-       [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
-       [IB_OPCODE_RC_RDMA_WRITE_ONLY]                = 12 + 8 + 16,
-       [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20,
-       [IB_OPCODE_RC_RDMA_READ_REQUEST]              = 12 + 8 + 16,
-       [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST]       = 12 + 8 + 4,
-       [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE]      = 12 + 8,
-       [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST]        = 12 + 8 + 4,
-       [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY]        = 12 + 8 + 4,
-       [IB_OPCODE_RC_ACKNOWLEDGE]                    = 12 + 8 + 4,
-       [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE]             = 12 + 8 + 4,
-       [IB_OPCODE_RC_COMPARE_SWAP]                   = 12 + 8 + 28,
-       [IB_OPCODE_RC_FETCH_ADD]                      = 12 + 8 + 28,
-       /* UC */
-       [IB_OPCODE_UC_SEND_FIRST]                     = 12 + 8,
-       [IB_OPCODE_UC_SEND_MIDDLE]                    = 12 + 8,
-       [IB_OPCODE_UC_SEND_LAST]                      = 12 + 8,
-       [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE]       = 12 + 8 + 4,
-       [IB_OPCODE_UC_SEND_ONLY]                      = 12 + 8,
-       [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 4,
-       [IB_OPCODE_UC_RDMA_WRITE_FIRST]               = 12 + 8 + 16,
-       [IB_OPCODE_UC_RDMA_WRITE_MIDDLE]              = 12 + 8,
-       [IB_OPCODE_UC_RDMA_WRITE_LAST]                = 12 + 8,
-       [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
-       [IB_OPCODE_UC_RDMA_WRITE_ONLY]                = 12 + 8 + 16,
-       [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20,
-       /* UD */
-       [IB_OPCODE_UD_SEND_ONLY]                      = 12 + 8 + 8,
-       [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 12
-};
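
The entries above read as LRH (8 bytes) plus BTH (12 bytes) plus whatever extended headers the opcode carries, e.g. a 16-byte RETH for RDMA writes, a 4-byte immediate, an 8-byte DETH for UD, or a 28-byte AtomicETH. A few of the sums spelled out with the standard InfiniBand header sizes:

#include <assert.h>

/* InfiniBand header sizes in bytes */
enum { LRH = 8, BTH = 12, RETH = 16, ATOMIC_ETH = 28, DETH = 8, IMM = 4 };

int main(void)
{
        assert(LRH + BTH              == 12 + 8);        /* RC SEND_FIRST              */
        assert(LRH + BTH + RETH       == 12 + 8 + 16);   /* RC RDMA_WRITE_FIRST        */
        assert(LRH + BTH + RETH + IMM == 12 + 8 + 20);   /* RC WRITE_ONLY w/ IMMEDIATE */
        assert(LRH + BTH + ATOMIC_ETH == 12 + 8 + 28);   /* RC COMPARE_SWAP            */
        assert(LRH + BTH + DETH       == 12 + 8 + 8);    /* UD SEND_ONLY               */
        return 0;
}
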
-
-static const opcode_handler opcode_handler_tbl[256] = {
-       /* RC */
-       [IB_OPCODE_RC_SEND_FIRST]                     = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_SEND_MIDDLE]                    = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_SEND_LAST]                      = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE]       = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_SEND_ONLY]                      = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_RDMA_WRITE_FIRST]               = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_RDMA_WRITE_MIDDLE]              = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_RDMA_WRITE_LAST]                = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_RDMA_WRITE_ONLY]                = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_RDMA_READ_REQUEST]              = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST]       = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE]      = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST]        = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY]        = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_ACKNOWLEDGE]                    = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE]             = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_COMPARE_SWAP]                   = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_FETCH_ADD]                      = &hfi1_rc_rcv,
-       /* UC */
-       [IB_OPCODE_UC_SEND_FIRST]                     = &hfi1_uc_rcv,
-       [IB_OPCODE_UC_SEND_MIDDLE]                    = &hfi1_uc_rcv,
-       [IB_OPCODE_UC_SEND_LAST]                      = &hfi1_uc_rcv,
-       [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE]       = &hfi1_uc_rcv,
-       [IB_OPCODE_UC_SEND_ONLY]                      = &hfi1_uc_rcv,
-       [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_uc_rcv,
-       [IB_OPCODE_UC_RDMA_WRITE_FIRST]               = &hfi1_uc_rcv,
-       [IB_OPCODE_UC_RDMA_WRITE_MIDDLE]              = &hfi1_uc_rcv,
-       [IB_OPCODE_UC_RDMA_WRITE_LAST]                = &hfi1_uc_rcv,
-       [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_uc_rcv,
-       [IB_OPCODE_UC_RDMA_WRITE_ONLY]                = &hfi1_uc_rcv,
-       [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_uc_rcv,
-       /* UD */
-       [IB_OPCODE_UD_SEND_ONLY]                      = &hfi1_ud_rcv,
-       [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_ud_rcv,
-       /* CNP */
-       [IB_OPCODE_CNP]                               = &hfi1_cnp_rcv
-};
-
-/*
- * System image GUID.
- */
-__be64 ib_hfi1_sys_image_guid;
-
-/**
- * hfi1_copy_sge - copy data to SGE memory
- * @ss: the SGE state
- * @data: the data to copy
- * @length: the length of the data
- * @copy_last: do a separate copy of the last 8 bytes
- */
-void hfi1_copy_sge(
-       struct rvt_sge_state *ss,
-       void *data, u32 length,
-       int release,
-       int copy_last)
-{
-       struct rvt_sge *sge = &ss->sge;
-       int in_last = 0;
-       int i;
-       int cacheless_copy = 0;
-
-       if (sge_copy_mode == COPY_CACHELESS) {
-               cacheless_copy = length >= PAGE_SIZE;
-       } else if (sge_copy_mode == COPY_ADAPTIVE) {
-               if (length >= PAGE_SIZE) {
-                       /*
-                        * NOTE: this *assumes*:
-                        * o The first vaddr is the dest.
-                        * o If multiple pages, then vaddr is sequential.
-                        */
-                       wss_insert(sge->vaddr);
-                       if (length >= (2 * PAGE_SIZE))
-                               wss_insert(sge->vaddr + PAGE_SIZE);
-
-                       cacheless_copy = wss_exceeds_threshold();
-               } else {
-                       wss_advance_clean_counter();
-               }
-       }
-       if (copy_last) {
-               if (length > 8) {
-                       length -= 8;
-               } else {
-                       copy_last = 0;
-                       in_last = 1;
-               }
-       }
-
-again:
-       while (length) {
-               u32 len = sge->length;
-
-               if (len > length)
-                       len = length;
-               if (len > sge->sge_length)
-                       len = sge->sge_length;
-               WARN_ON_ONCE(len == 0);
-               if (unlikely(in_last)) {
-                       /* enforce byte transfer ordering */
-                       for (i = 0; i < len; i++)
-                               ((u8 *)sge->vaddr)[i] = ((u8 *)data)[i];
-               } else if (cacheless_copy) {
-                       cacheless_memcpy(sge->vaddr, data, len);
-               } else {
-                       memcpy(sge->vaddr, data, len);
-               }
-               sge->vaddr += len;
-               sge->length -= len;
-               sge->sge_length -= len;
-               if (sge->sge_length == 0) {
-                       if (release)
-                               rvt_put_mr(sge->mr);
-                       if (--ss->num_sge)
-                               *sge = *ss->sg_list++;
-               } else if (sge->length == 0 && sge->mr->lkey) {
-                       if (++sge->n >= RVT_SEGSZ) {
-                               if (++sge->m >= sge->mr->mapsz)
-                                       break;
-                               sge->n = 0;
-                       }
-                       sge->vaddr =
-                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
-                       sge->length =
-                               sge->mr->map[sge->m]->segs[sge->n].length;
-               }
-               data += len;
-               length -= len;
-       }
-
-       if (copy_last) {
-               copy_last = 0;
-               in_last = 1;
-               length = 8;
-               goto again;
-       }
-}
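
A minimal standalone sketch of the copy_last idea above (not driver code; plain memcpy stands in for the SGE-walking copies): everything but the final 8 bytes is copied first, then the tail is written byte by byte so it is observed last.

#include <stdint.h>
#include <string.h>

/* Copy 'len' bytes so the last (up to) 8 bytes are written last, in order. */
static void copy_with_ordered_tail(uint8_t *dst, const uint8_t *src, uint32_t len)
{
	uint32_t tail = len > 8 ? 8 : len;
	uint32_t head = len - tail;
	uint32_t i;

	memcpy(dst, src, head);			/* bulk of the payload */
	for (i = 0; i < tail; i++)		/* tail enforces byte ordering */
		dst[head + i] = src[head + i];
}
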
-
-/**
- * hfi1_skip_sge - skip over SGE memory
- * @ss: the SGE state
- * @length: the number of bytes to skip
- * @release: boolean to release MR references as SGEs are consumed
- */
-void hfi1_skip_sge(struct rvt_sge_state *ss, u32 length, int release)
-{
-       struct rvt_sge *sge = &ss->sge;
-
-       while (length) {
-               u32 len = sge->length;
-
-               if (len > length)
-                       len = length;
-               if (len > sge->sge_length)
-                       len = sge->sge_length;
-               WARN_ON_ONCE(len == 0);
-               sge->vaddr += len;
-               sge->length -= len;
-               sge->sge_length -= len;
-               if (sge->sge_length == 0) {
-                       if (release)
-                               rvt_put_mr(sge->mr);
-                       if (--ss->num_sge)
-                               *sge = *ss->sg_list++;
-               } else if (sge->length == 0 && sge->mr->lkey) {
-                       if (++sge->n >= RVT_SEGSZ) {
-                               if (++sge->m >= sge->mr->mapsz)
-                                       break;
-                               sge->n = 0;
-                       }
-                       sge->vaddr =
-                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
-                       sge->length =
-                               sge->mr->map[sge->m]->segs[sge->n].length;
-               }
-               length -= len;
-       }
-}
-
-/*
- * Make sure the QP is ready and able to accept the given opcode.
- */
-static inline int qp_ok(int opcode, struct hfi1_packet *packet)
-{
-       struct hfi1_ibport *ibp;
-
-       if (!(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK))
-               goto dropit;
-       if (((opcode & RVT_OPCODE_QP_MASK) == packet->qp->allowed_ops) ||
-           (opcode == IB_OPCODE_CNP))
-               return 1;
-dropit:
-       ibp = &packet->rcd->ppd->ibport_data;
-       ibp->rvp.n_pkt_drops++;
-       return 0;
-}
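
An illustrative sketch of the mask test in qp_ok() above: IBTA opcodes group by transport in their high bits, so one AND-and-compare checks whether a packet's opcode belongs to the QP's transport type. The 0xE0 mask and the example opcodes below are assumptions for illustration, not the driver's definitions.

#include <stdio.h>

#define OPCODE_QP_MASK 0xE0	/* assumed: top 3 opcode bits = transport class */

static int opcode_matches_qp(unsigned int opcode, unsigned int allowed_ops)
{
	return (opcode & OPCODE_QP_MASK) == allowed_ops;
}

int main(void)
{
	/* an RC-range opcode (0x00) against a QP that only allows UC (0x20) */
	printf("%d\n", opcode_matches_qp(0x00, 0x20));	/* 0: would be dropped */
	printf("%d\n", opcode_matches_qp(0x24, 0x20));	/* 1: accepted */
	return 0;
}
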
-
-/**
- * hfi1_ib_rcv - process an incoming packet
- * @packet: data packet information
- *
- * This is called to process an incoming packet at interrupt level.
- *
- * Tlen is the length of the header + data + CRC in bytes.
- */
-void hfi1_ib_rcv(struct hfi1_packet *packet)
-{
-       struct hfi1_ctxtdata *rcd = packet->rcd;
-       struct hfi1_ib_header *hdr = packet->hdr;
-       u32 tlen = packet->tlen;
-       struct hfi1_pportdata *ppd = rcd->ppd;
-       struct hfi1_ibport *ibp = &ppd->ibport_data;
-       struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi;
-       unsigned long flags;
-       u32 qp_num;
-       int lnh;
-       u8 opcode;
-       u16 lid;
-
-       /* Check for GRH */
-       lnh = be16_to_cpu(hdr->lrh[0]) & 3;
-       if (lnh == HFI1_LRH_BTH) {
-               packet->ohdr = &hdr->u.oth;
-       } else if (lnh == HFI1_LRH_GRH) {
-               u32 vtf;
-
-               packet->ohdr = &hdr->u.l.oth;
-               if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR)
-                       goto drop;
-               vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow);
-               if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION)
-                       goto drop;
-               packet->rcv_flags |= HFI1_HAS_GRH;
-       } else {
-               goto drop;
-       }
-
-       trace_input_ibhdr(rcd->dd, hdr);
-
-       opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24);
-       inc_opstats(tlen, &rcd->opstats->stats[opcode]);
-
-       /* Get the destination QP number. */
-       qp_num = be32_to_cpu(packet->ohdr->bth[1]) & RVT_QPN_MASK;
-       lid = be16_to_cpu(hdr->lrh[1]);
-       if (unlikely((lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) &&
-                    (lid != be16_to_cpu(IB_LID_PERMISSIVE)))) {
-               struct rvt_mcast *mcast;
-               struct rvt_mcast_qp *p;
-
-               if (lnh != HFI1_LRH_GRH)
-                       goto drop;
-               mcast = rvt_mcast_find(&ibp->rvp, &hdr->u.l.grh.dgid);
-               if (!mcast)
-                       goto drop;
-               list_for_each_entry_rcu(p, &mcast->qp_list, list) {
-                       packet->qp = p->qp;
-                       spin_lock_irqsave(&packet->qp->r_lock, flags);
-                       if (likely((qp_ok(opcode, packet))))
-                               opcode_handler_tbl[opcode](packet);
-                       spin_unlock_irqrestore(&packet->qp->r_lock, flags);
-               }
-               /*
-                * Notify rvt_multicast_detach() if it is waiting for us
-                * to finish.
-                */
-               if (atomic_dec_return(&mcast->refcount) <= 1)
-                       wake_up(&mcast->wait);
-       } else {
-               rcu_read_lock();
-               packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
-               if (!packet->qp) {
-                       rcu_read_unlock();
-                       goto drop;
-               }
-               spin_lock_irqsave(&packet->qp->r_lock, flags);
-               if (likely((qp_ok(opcode, packet))))
-                       opcode_handler_tbl[opcode](packet);
-               spin_unlock_irqrestore(&packet->qp->r_lock, flags);
-               rcu_read_unlock();
-       }
-       return;
-
-drop:
-       ibp->rvp.n_pkt_drops++;
-}
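
A standalone sketch of the header dissection above, with values taken in host order purely for illustration (the driver converts from big-endian wire order first): LNH sits in the low two bits of LRH word 0, the opcode in the top byte of BTH dword 0, and the destination QPN in the low 24 bits of BTH dword 1.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint16_t lrh0 = 0x0003;		/* example LNH value in the low 2 bits */
	uint32_t bth0 = 0x64000000;	/* example opcode 0x64 in the top byte */
	uint32_t bth1 = 0x00012345;	/* example destination QPN */

	unsigned int lnh    = lrh0 & 3;
	unsigned int opcode = bth0 >> 24;
	unsigned int qpn    = bth1 & 0xFFFFFF;	/* 24-bit QP number */

	printf("lnh=%u opcode=0x%x qpn=0x%x\n", lnh, opcode, qpn);
	return 0;
}
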
-
-/*
- * This is called from a timer to check for QPs
- * which need kernel memory in order to send a packet.
- */
-static void mem_timer(unsigned long data)
-{
-       struct hfi1_ibdev *dev = (struct hfi1_ibdev *)data;
-       struct list_head *list = &dev->memwait;
-       struct rvt_qp *qp = NULL;
-       struct iowait *wait;
-       unsigned long flags;
-       struct hfi1_qp_priv *priv;
-
-       write_seqlock_irqsave(&dev->iowait_lock, flags);
-       if (!list_empty(list)) {
-               wait = list_first_entry(list, struct iowait, list);
-               qp = iowait_to_qp(wait);
-               priv = qp->priv;
-               list_del_init(&priv->s_iowait.list);
-               /* refcount held until actual wake up */
-               if (!list_empty(list))
-                       mod_timer(&dev->mem_timer, jiffies + 1);
-       }
-       write_sequnlock_irqrestore(&dev->iowait_lock, flags);
-
-       if (qp)
-               hfi1_qp_wakeup(qp, RVT_S_WAIT_KMEM);
-}
-
-void update_sge(struct rvt_sge_state *ss, u32 length)
-{
-       struct rvt_sge *sge = &ss->sge;
-
-       sge->vaddr += length;
-       sge->length -= length;
-       sge->sge_length -= length;
-       if (sge->sge_length == 0) {
-               if (--ss->num_sge)
-                       *sge = *ss->sg_list++;
-       } else if (sge->length == 0 && sge->mr->lkey) {
-               if (++sge->n >= RVT_SEGSZ) {
-                       if (++sge->m >= sge->mr->mapsz)
-                               return;
-                       sge->n = 0;
-               }
-               sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
-               sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
-       }
-}
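
A simplified standalone sketch of the SGE cursor walk used by update_sge() and the copy/skip routines above: consume bytes from the current element and step to the next one when it is exhausted. The real code additionally walks the MR's segment map (the sge->n / sge->m bookkeeping), which is omitted here.

#include <stdint.h>

struct ex_sge {
	uint8_t *vaddr;
	uint32_t length;
};

struct ex_sge_state {
	struct ex_sge sge;	/* current element */
	struct ex_sge *sg_list;	/* remaining elements */
	uint8_t num_sge;
};

/* Advance the cursor by 'len' bytes, assuming len <= sge.length. */
static void ex_advance_sge(struct ex_sge_state *ss, uint32_t len)
{
	ss->sge.vaddr += len;
	ss->sge.length -= len;
	if (ss->sge.length == 0 && --ss->num_sge)
		ss->sge = *ss->sg_list++;
}
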
-
-/*
- * This is called with progress side lock held.
- */
-/* New API */
-static void verbs_sdma_complete(
-       struct sdma_txreq *cookie,
-       int status)
-{
-       struct verbs_txreq *tx =
-               container_of(cookie, struct verbs_txreq, txreq);
-       struct rvt_qp *qp = tx->qp;
-
-       spin_lock(&qp->s_lock);
-       if (tx->wqe) {
-               hfi1_send_complete(qp, tx->wqe, IB_WC_SUCCESS);
-       } else if (qp->ibqp.qp_type == IB_QPT_RC) {
-               struct hfi1_ib_header *hdr;
-
-               hdr = &tx->phdr.hdr;
-               hfi1_rc_send_complete(qp, hdr);
-       }
-       spin_unlock(&qp->s_lock);
-
-       hfi1_put_txreq(tx);
-}
-
-static int wait_kmem(struct hfi1_ibdev *dev,
-                    struct rvt_qp *qp,
-                    struct hfi1_pkt_state *ps)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-       unsigned long flags;
-       int ret = 0;
-
-       spin_lock_irqsave(&qp->s_lock, flags);
-       if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
-               write_seqlock(&dev->iowait_lock);
-               list_add_tail(&ps->s_txreq->txreq.list,
-                             &priv->s_iowait.tx_head);
-               if (list_empty(&priv->s_iowait.list)) {
-                       if (list_empty(&dev->memwait))
-                               mod_timer(&dev->mem_timer, jiffies + 1);
-                       qp->s_flags |= RVT_S_WAIT_KMEM;
-                       list_add_tail(&priv->s_iowait.list, &dev->memwait);
-                       trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM);
-                       atomic_inc(&qp->refcount);
-               }
-               write_sequnlock(&dev->iowait_lock);
-               qp->s_flags &= ~RVT_S_BUSY;
-               ret = -EBUSY;
-       }
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-
-       return ret;
-}
-
-/*
- * This routine calls sdma_txadd_kvaddr() for each SG entry.
- *
- * On an add failure the SGE cursor is reverted.
- */
-static noinline int build_verbs_ulp_payload(
-       struct sdma_engine *sde,
-       struct rvt_sge_state *ss,
-       u32 length,
-       struct verbs_txreq *tx)
-{
-       struct rvt_sge *sg_list = ss->sg_list;
-       struct rvt_sge sge = ss->sge;
-       u8 num_sge = ss->num_sge;
-       u32 len;
-       int ret = 0;
-
-       while (length) {
-               len = ss->sge.length;
-               if (len > length)
-                       len = length;
-               if (len > ss->sge.sge_length)
-                       len = ss->sge.sge_length;
-               WARN_ON_ONCE(len == 0);
-               ret = sdma_txadd_kvaddr(
-                       sde->dd,
-                       &tx->txreq,
-                       ss->sge.vaddr,
-                       len);
-               if (ret)
-                       goto bail_txadd;
-               update_sge(ss, len);
-               length -= len;
-       }
-       return ret;
-bail_txadd:
-       /* unwind cursor */
-       ss->sge = sge;
-       ss->num_sge = num_sge;
-       ss->sg_list = sg_list;
-       return ret;
-}
-
-/*
- * Build the DMA descriptors needed to send length bytes of data.
- *
- * NOTE: DMA mapping is held in the tx until completed in the ring or
- *       the tx desc is freed without having been submitted to the ring
- *
- * This routine ensures all the helper routine calls succeed.
- */
-/* New API */
-static int build_verbs_tx_desc(
-       struct sdma_engine *sde,
-       struct rvt_sge_state *ss,
-       u32 length,
-       struct verbs_txreq *tx,
-       struct ahg_ib_header *ahdr,
-       u64 pbc)
-{
-       int ret = 0;
-       struct hfi1_pio_header *phdr = &tx->phdr;
-       u16 hdrbytes = tx->hdr_dwords << 2;
-
-       if (!ahdr->ahgcount) {
-               ret = sdma_txinit_ahg(
-                       &tx->txreq,
-                       ahdr->tx_flags,
-                       hdrbytes + length,
-                       ahdr->ahgidx,
-                       0,
-                       NULL,
-                       0,
-                       verbs_sdma_complete);
-               if (ret)
-                       goto bail_txadd;
-               phdr->pbc = cpu_to_le64(pbc);
-               ret = sdma_txadd_kvaddr(
-                       sde->dd,
-                       &tx->txreq,
-                       phdr,
-                       hdrbytes);
-               if (ret)
-                       goto bail_txadd;
-       } else {
-               ret = sdma_txinit_ahg(
-                       &tx->txreq,
-                       ahdr->tx_flags,
-                       length,
-                       ahdr->ahgidx,
-                       ahdr->ahgcount,
-                       ahdr->ahgdesc,
-                       hdrbytes,
-                       verbs_sdma_complete);
-               if (ret)
-                       goto bail_txadd;
-       }
-
-       /* add the ulp payload - if any.  ss can be NULL for acks */
-       if (ss)
-               ret = build_verbs_ulp_payload(sde, ss, length, tx);
-bail_txadd:
-       return ret;
-}
-
-int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
-                       u64 pbc)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-       struct ahg_ib_header *ahdr = priv->s_hdr;
-       u32 hdrwords = qp->s_hdrwords;
-       struct rvt_sge_state *ss = qp->s_cur_sge;
-       u32 len = qp->s_cur_size;
-       u32 plen = hdrwords + ((len + 3) >> 2) + 2; /* includes pbc */
-       struct hfi1_ibdev *dev = ps->dev;
-       struct hfi1_pportdata *ppd = ps->ppd;
-       struct verbs_txreq *tx;
-       u64 pbc_flags = 0;
-       u8 sc5 = priv->s_sc;
-
-       int ret;
-
-       tx = ps->s_txreq;
-       if (!sdma_txreq_built(&tx->txreq)) {
-               if (likely(pbc == 0)) {
-                       u32 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5);
-                       /* No vl15 here */
-                       /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
-                       pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
-
-                       pbc = create_pbc(ppd,
-                                        pbc_flags,
-                                        qp->srate_mbps,
-                                        vl,
-                                        plen);
-               }
-               tx->wqe = qp->s_wqe;
-               ret = build_verbs_tx_desc(tx->sde, ss, len, tx, ahdr, pbc);
-               if (unlikely(ret))
-                       goto bail_build;
-       }
-       ret =  sdma_send_txreq(tx->sde, &priv->s_iowait, &tx->txreq);
-       if (unlikely(ret < 0)) {
-               if (ret == -ECOMM)
-                       goto bail_ecomm;
-               return ret;
-       }
-       trace_sdma_output_ibhdr(dd_from_ibdev(qp->ibqp.device),
-                               &ps->s_txreq->phdr.hdr);
-       return ret;
-
-bail_ecomm:
-       /* The current one got "sent" */
-       return 0;
-bail_build:
-       ret = wait_kmem(dev, qp, ps);
-       if (!ret) {
-               /* free txreq - bad state */
-               hfi1_put_txreq(ps->s_txreq);
-               ps->s_txreq = NULL;
-       }
-       return ret;
-}
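
A standalone restatement of the length arithmetic above: plen is counted in dwords and, as the in-line comment notes, includes the 2-dword PBC plus the header dwords and the payload rounded up to a dword boundary.

#include <stdint.h>
#include <stdio.h>

static uint32_t pbc_plen(uint32_t hdrwords, uint32_t payload_bytes)
{
	/* header dwords + payload rounded up to dwords + 2 dwords of PBC */
	return hdrwords + ((payload_bytes + 3) >> 2) + 2;
}

int main(void)
{
	/* e.g. a 7-dword header with a 13-byte payload: 7 + 4 + 2 = 13 */
	printf("plen=%u dwords\n", pbc_plen(7, 13));
	return 0;
}
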
-
-/*
- * If we are now in the error state, return zero to flush the
- * send work request.
- */
-static int pio_wait(struct rvt_qp *qp,
-                   struct send_context *sc,
-                   struct hfi1_pkt_state *ps,
-                   u32 flag)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-       struct hfi1_devdata *dd = sc->dd;
-       struct hfi1_ibdev *dev = &dd->verbs_dev;
-       unsigned long flags;
-       int ret = 0;
-
-       /*
-        * Note that as soon as want_buffer() is called and
-        * possibly before it returns, sc_piobufavail()
-        * could be called. Therefore, put QP on the I/O wait list before
-        * enabling the PIO avail interrupt.
-        */
-       spin_lock_irqsave(&qp->s_lock, flags);
-       if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
-               write_seqlock(&dev->iowait_lock);
-               list_add_tail(&ps->s_txreq->txreq.list,
-                             &priv->s_iowait.tx_head);
-               if (list_empty(&priv->s_iowait.list)) {
-                       struct hfi1_ibdev *dev = &dd->verbs_dev;
-                       int was_empty;
-
-                       dev->n_piowait += !!(flag & RVT_S_WAIT_PIO);
-                       dev->n_piodrain += !!(flag & RVT_S_WAIT_PIO_DRAIN);
-                       dev->n_piowait++;
-                       qp->s_flags |= flag;
-                       was_empty = list_empty(&sc->piowait);
-                       list_add_tail(&priv->s_iowait.list, &sc->piowait);
-                       trace_hfi1_qpsleep(qp, RVT_S_WAIT_PIO);
-                       atomic_inc(&qp->refcount);
-                       /* counting: only call wantpiobuf_intr if first user */
-                       if (was_empty)
-                               hfi1_sc_wantpiobuf_intr(sc, 1);
-               }
-               write_sequnlock(&dev->iowait_lock);
-               qp->s_flags &= ~RVT_S_BUSY;
-               ret = -EBUSY;
-       }
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-       return ret;
-}
-
-static void verbs_pio_complete(void *arg, int code)
-{
-       struct rvt_qp *qp = (struct rvt_qp *)arg;
-       struct hfi1_qp_priv *priv = qp->priv;
-
-       if (iowait_pio_dec(&priv->s_iowait))
-               iowait_drain_wakeup(&priv->s_iowait);
-}
-
-int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
-                       u64 pbc)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-       u32 hdrwords = qp->s_hdrwords;
-       struct rvt_sge_state *ss = qp->s_cur_sge;
-       u32 len = qp->s_cur_size;
-       u32 dwords = (len + 3) >> 2;
-       u32 plen = hdrwords + dwords + 2; /* includes pbc */
-       struct hfi1_pportdata *ppd = ps->ppd;
-       u32 *hdr = (u32 *)&ps->s_txreq->phdr.hdr;
-       u64 pbc_flags = 0;
-       u8 sc5;
-       unsigned long flags = 0;
-       struct send_context *sc;
-       struct pio_buf *pbuf;
-       int wc_status = IB_WC_SUCCESS;
-       int ret = 0;
-       pio_release_cb cb = NULL;
-
-       /* only RC/UC use complete */
-       switch (qp->ibqp.qp_type) {
-       case IB_QPT_RC:
-       case IB_QPT_UC:
-               cb = verbs_pio_complete;
-               break;
-       default:
-               break;
-       }
-
-       /* vl15 special case taken care of in ud.c */
-       sc5 = priv->s_sc;
-       sc = ps->s_txreq->psc;
-
-       if (likely(pbc == 0)) {
-               u8 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5);
-               /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
-               pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
-               pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
-       }
-       if (cb)
-               iowait_pio_inc(&priv->s_iowait);
-       pbuf = sc_buffer_alloc(sc, plen, cb, qp);
-       if (unlikely(!pbuf)) {
-               if (cb)
-                       verbs_pio_complete(qp, 0);
-               if (ppd->host_link_state != HLS_UP_ACTIVE) {
-                       /*
-                        * If we have filled the PIO buffers to capacity and are
-                        * not in an active state, this request is not going to
-                        * go out, so just complete it with an error or else a
-                        * ULP or the core may be stuck waiting.
-                        */
-                       hfi1_cdbg(
-                               PIO,
-                               "alloc failed. state not active, completing");
-                       wc_status = IB_WC_GENERAL_ERR;
-                       goto pio_bail;
-               } else {
-                       /*
-                        * This is a normal occurrence. The PIO buffers are full,
-                        * but we are still able to send, so continue to queue
-                        * the request.
-                        */
-                       hfi1_cdbg(PIO, "alloc failed. state active, queuing");
-                       ret = pio_wait(qp, sc, ps, RVT_S_WAIT_PIO);
-                       if (!ret)
-                               /* txreq not queued - free */
-                               goto bail;
-                       /* tx consumed in wait */
-                       return ret;
-               }
-       }
-
-       if (len == 0) {
-               pio_copy(ppd->dd, pbuf, pbc, hdr, hdrwords);
-       } else {
-               if (ss) {
-                       seg_pio_copy_start(pbuf, pbc, hdr, hdrwords * 4);
-                       while (len) {
-                               void *addr = ss->sge.vaddr;
-                               u32 slen = ss->sge.length;
-
-                               if (slen > len)
-                                       slen = len;
-                               update_sge(ss, slen);
-                               seg_pio_copy_mid(pbuf, addr, slen);
-                               len -= slen;
-                       }
-                       seg_pio_copy_end(pbuf);
-               }
-       }
-
-       trace_pio_output_ibhdr(dd_from_ibdev(qp->ibqp.device),
-                              &ps->s_txreq->phdr.hdr);
-
-pio_bail:
-       if (qp->s_wqe) {
-               spin_lock_irqsave(&qp->s_lock, flags);
-               hfi1_send_complete(qp, qp->s_wqe, wc_status);
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-       } else if (qp->ibqp.qp_type == IB_QPT_RC) {
-               spin_lock_irqsave(&qp->s_lock, flags);
-               hfi1_rc_send_complete(qp, &ps->s_txreq->phdr.hdr);
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-       }
-
-       ret = 0;
-
-bail:
-       hfi1_put_txreq(ps->s_txreq);
-       return ret;
-}
-
-/*
- * egress_pkey_matches_entry - return 1 if the pkey matches ent (ent
- * being an entry from the partition key table), return 0
- * otherwise. Use the matching criteria for egress partition keys
- * specified in the OPAv1 spec., section 9.1l.7.
- */
-static inline int egress_pkey_matches_entry(u16 pkey, u16 ent)
-{
-       u16 mkey = pkey & PKEY_LOW_15_MASK;
-       u16 mentry = ent & PKEY_LOW_15_MASK;
-
-       if (mkey == mentry) {
-               /*
-                * If pkey[15] is set (full partition member),
-                * is bit 15 in the corresponding table element
-                * clear (limited member)?
-                */
-               if (pkey & PKEY_MEMBER_MASK)
-                       return !!(ent & PKEY_MEMBER_MASK);
-               return 1;
-       }
-       return 0;
-}
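
For reference, a minimal standalone restatement of the matching rule above, assuming the conventional 0x7fff/0x8000 split between the key value and the membership bit: the low 15 bits must match, a full-member key may only match a full-member table entry, and a limited-member key matches either.

#include <stdint.h>
#include <stdio.h>

#define EX_PKEY_LOW_15_MASK 0x7FFF
#define EX_PKEY_MEMBER_MASK 0x8000

/* 1 if 'pkey' may egress against table entry 'ent', 0 otherwise */
static int ex_pkey_matches_entry(uint16_t pkey, uint16_t ent)
{
	if ((pkey & EX_PKEY_LOW_15_MASK) != (ent & EX_PKEY_LOW_15_MASK))
		return 0;
	if (pkey & EX_PKEY_MEMBER_MASK)			/* full member ... */
		return !!(ent & EX_PKEY_MEMBER_MASK);	/* ... needs full entry */
	return 1;					/* limited member matches either */
}

int main(void)
{
	printf("%d %d %d\n",
	       ex_pkey_matches_entry(0xFFFF, 0xFFFF),	/* 1 */
	       ex_pkey_matches_entry(0xFFFF, 0x7FFF),	/* 0 */
	       ex_pkey_matches_entry(0x7FFF, 0xFFFF));	/* 1 */
	return 0;
}
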
-
-/**
- * egress_pkey_check - check P_KEY of a packet
- * @ppd:    Physical IB port data
- * @lrh: Local route header
- * @bth: Base transport header
- * @sc5:    SC for packet
- * @s_pkey_index: used as a lookup optimization for kernel contexts only. If it
- * is a negative value, a user context is calling this function.
- *
- * It checks if hdr's pkey is valid.
- *
- * Return: 0 on success, 1 otherwise
- */
-int egress_pkey_check(struct hfi1_pportdata *ppd, __be16 *lrh, __be32 *bth,
-                     u8 sc5, int8_t s_pkey_index)
-{
-       struct hfi1_devdata *dd;
-       int i;
-       u16 pkey;
-       int is_user_ctxt_mechanism = (s_pkey_index < 0);
-
-       if (!(ppd->part_enforce & HFI1_PART_ENFORCE_OUT))
-               return 0;
-
-       pkey = (u16)be32_to_cpu(bth[0]);
-
-       /* If SC15, pkey[0:14] must be 0x7fff */
-       if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
-               goto bad;
-
-       /* Is the pkey = 0x0, or 0x8000? */
-       if ((pkey & PKEY_LOW_15_MASK) == 0)
-               goto bad;
-
-       /*
-        * For kernel contexts only, the most likely matching pkey has
-        * index s_pkey_index, so check that entry first.
-        */
-       if (!is_user_ctxt_mechanism &&
-           egress_pkey_matches_entry(pkey, ppd->pkeys[s_pkey_index])) {
-               return 0;
-       }
-
-       for (i = 0; i < MAX_PKEY_VALUES; i++) {
-               if (egress_pkey_matches_entry(pkey, ppd->pkeys[i]))
-                       return 0;
-       }
-bad:
-       /*
-        * For the user-context mechanism, the P_KEY check would only happen
-        * once per SDMA request, not once per packet.  Therefore, there's no
-        * need to increment the counter for the user-context mechanism.
-        */
-       if (!is_user_ctxt_mechanism) {
-               incr_cntr64(&ppd->port_xmit_constraint_errors);
-               dd = ppd->dd;
-               if (!(dd->err_info_xmit_constraint.status &
-                     OPA_EI_STATUS_SMASK)) {
-                       u16 slid = be16_to_cpu(lrh[3]);
-
-                       dd->err_info_xmit_constraint.status |=
-                               OPA_EI_STATUS_SMASK;
-                       dd->err_info_xmit_constraint.slid = slid;
-                       dd->err_info_xmit_constraint.pkey = pkey;
-               }
-       }
-       return 1;
-}
-
-/**
- * get_send_routine - choose an egress routine
- *
- * Choose an egress routine based on QP type
- * and size
- */
-static inline send_routine get_send_routine(struct rvt_qp *qp,
-                                           struct verbs_txreq *tx)
-{
-       struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
-       struct hfi1_qp_priv *priv = qp->priv;
-       struct hfi1_ib_header *h = &tx->phdr.hdr;
-
-       if (unlikely(!(dd->flags & HFI1_HAS_SEND_DMA)))
-               return dd->process_pio_send;
-       switch (qp->ibqp.qp_type) {
-       case IB_QPT_SMI:
-               return dd->process_pio_send;
-       case IB_QPT_GSI:
-       case IB_QPT_UD:
-               break;
-       case IB_QPT_RC:
-               if (piothreshold &&
-                   qp->s_cur_size <= min(piothreshold, qp->pmtu) &&
-                   (BIT(get_opcode(h) & 0x1f) & rc_only_opcode) &&
-                   iowait_sdma_pending(&priv->s_iowait) == 0 &&
-                   !sdma_txreq_built(&tx->txreq))
-                       return dd->process_pio_send;
-               break;
-       case IB_QPT_UC:
-               if (piothreshold &&
-                   qp->s_cur_size <= min(piothreshold, qp->pmtu) &&
-                   (BIT(get_opcode(h) & 0x1f) & uc_only_opcode) &&
-                   iowait_sdma_pending(&priv->s_iowait) == 0 &&
-                   !sdma_txreq_built(&tx->txreq))
-                       return dd->process_pio_send;
-               break;
-       default:
-               break;
-       }
-       return dd->process_dma_send;
-}
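
A standalone sketch of the opcode-mask test above: each transport class has at most 32 opcodes, so the low five opcode bits index a bit in a 32-bit mask of "PIO-eligible" opcodes. The mask value below is an invented example, not the driver's rc_only_opcode or uc_only_opcode.

#include <stdint.h>
#include <stdio.h>

#define EX_BIT(n) (1u << (n))

static int ex_pio_eligible(uint8_t opcode, uint32_t eligible_mask)
{
	return !!(EX_BIT(opcode & 0x1f) & eligible_mask);
}

int main(void)
{
	/* pretend the opcodes at offsets 0x04 and 0x0a within the class qualify */
	uint32_t mask = EX_BIT(0x04) | EX_BIT(0x0a);

	printf("%d %d\n", ex_pio_eligible(0x04, mask), ex_pio_eligible(0x06, mask));
	return 0;
}
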
-
-/**
- * hfi1_verbs_send - send a packet
- * @qp: the QP to send on
- * @ps: the state of the packet to send
- *
- * Return zero if packet is sent or queued OK.
- * Return non-zero and clear RVT_S_BUSY in qp->s_flags otherwise.
- */
-int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
-{
-       struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
-       struct hfi1_qp_priv *priv = qp->priv;
-       struct hfi1_other_headers *ohdr;
-       struct hfi1_ib_header *hdr;
-       send_routine sr;
-       int ret;
-       u8 lnh;
-
-       hdr = &ps->s_txreq->phdr.hdr;
-       /* locate the pkey within the headers */
-       lnh = be16_to_cpu(hdr->lrh[0]) & 3;
-       if (lnh == HFI1_LRH_GRH)
-               ohdr = &hdr->u.l.oth;
-       else
-               ohdr = &hdr->u.oth;
-
-       sr = get_send_routine(qp, ps->s_txreq);
-       ret = egress_pkey_check(dd->pport,
-                               hdr->lrh,
-                               ohdr->bth,
-                               priv->s_sc,
-                               qp->s_pkey_index);
-       if (unlikely(ret)) {
-               /*
-                * The value we are returning here does not get propagated to
-                * the verbs caller. Thus we need to complete the request with an
-                * error, otherwise the caller could be sitting waiting on the
-                * completion event. Only do this for PIO. SDMA has its own
-                * mechanism for handling the errors. So for SDMA we can just
-                * return.
-                */
-               if (sr == dd->process_pio_send) {
-                       unsigned long flags;
-
-                       hfi1_cdbg(PIO, "%s() Failed. Completing with err",
-                                 __func__);
-                       spin_lock_irqsave(&qp->s_lock, flags);
-                       hfi1_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR);
-                       spin_unlock_irqrestore(&qp->s_lock, flags);
-               }
-               return -EINVAL;
-       }
-       if (sr == dd->process_dma_send && iowait_pio_pending(&priv->s_iowait))
-               return pio_wait(qp,
-                               ps->s_txreq->psc,
-                               ps,
-                               RVT_S_WAIT_PIO_DRAIN);
-       return sr(qp, ps, 0);
-}
-
-/**
- * hfi1_fill_device_attr - Fill in rvt dev info device attributes.
- * @dd: the device data structure
- */
-static void hfi1_fill_device_attr(struct hfi1_devdata *dd)
-{
-       struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
-
-       memset(&rdi->dparms.props, 0, sizeof(rdi->dparms.props));
-
-       rdi->dparms.props.device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
-                       IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
-                       IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
-                       IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE;
-       rdi->dparms.props.page_size_cap = PAGE_SIZE;
-       rdi->dparms.props.vendor_id = dd->oui1 << 16 | dd->oui2 << 8 | dd->oui3;
-       rdi->dparms.props.vendor_part_id = dd->pcidev->device;
-       rdi->dparms.props.hw_ver = dd->minrev;
-       rdi->dparms.props.sys_image_guid = ib_hfi1_sys_image_guid;
-       rdi->dparms.props.max_mr_size = ~0ULL;
-       rdi->dparms.props.max_qp = hfi1_max_qps;
-       rdi->dparms.props.max_qp_wr = hfi1_max_qp_wrs;
-       rdi->dparms.props.max_sge = hfi1_max_sges;
-       rdi->dparms.props.max_sge_rd = hfi1_max_sges;
-       rdi->dparms.props.max_cq = hfi1_max_cqs;
-       rdi->dparms.props.max_ah = hfi1_max_ahs;
-       rdi->dparms.props.max_cqe = hfi1_max_cqes;
-       rdi->dparms.props.max_mr = rdi->lkey_table.max;
-       rdi->dparms.props.max_fmr = rdi->lkey_table.max;
-       rdi->dparms.props.max_map_per_fmr = 32767;
-       rdi->dparms.props.max_pd = hfi1_max_pds;
-       rdi->dparms.props.max_qp_rd_atom = HFI1_MAX_RDMA_ATOMIC;
-       rdi->dparms.props.max_qp_init_rd_atom = 255;
-       rdi->dparms.props.max_srq = hfi1_max_srqs;
-       rdi->dparms.props.max_srq_wr = hfi1_max_srq_wrs;
-       rdi->dparms.props.max_srq_sge = hfi1_max_srq_sges;
-       rdi->dparms.props.atomic_cap = IB_ATOMIC_GLOB;
-       rdi->dparms.props.max_pkeys = hfi1_get_npkeys(dd);
-       rdi->dparms.props.max_mcast_grp = hfi1_max_mcast_grps;
-       rdi->dparms.props.max_mcast_qp_attach = hfi1_max_mcast_qp_attached;
-       rdi->dparms.props.max_total_mcast_qp_attach =
-                                       rdi->dparms.props.max_mcast_qp_attach *
-                                       rdi->dparms.props.max_mcast_grp;
-}
-
-static inline u16 opa_speed_to_ib(u16 in)
-{
-       u16 out = 0;
-
-       if (in & OPA_LINK_SPEED_25G)
-               out |= IB_SPEED_EDR;
-       if (in & OPA_LINK_SPEED_12_5G)
-               out |= IB_SPEED_FDR;
-
-       return out;
-}
-
-/*
- * Convert a single OPA link width (no multiple flags) to an IB value.
- * A zero OPA link width means link down, which means the IB width value
- * is a don't care.
- */
-static inline u16 opa_width_to_ib(u16 in)
-{
-       switch (in) {
-       case OPA_LINK_WIDTH_1X:
-       /* map 2x and 3x to 1x as they don't exist in IB */
-       case OPA_LINK_WIDTH_2X:
-       case OPA_LINK_WIDTH_3X:
-               return IB_WIDTH_1X;
-       default: /* link down or unknown, return our largest width */
-       case OPA_LINK_WIDTH_4X:
-               return IB_WIDTH_4X;
-       }
-}
-
-static int query_port(struct rvt_dev_info *rdi, u8 port_num,
-                     struct ib_port_attr *props)
-{
-       struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
-       struct hfi1_devdata *dd = dd_from_dev(verbs_dev);
-       struct hfi1_pportdata *ppd = &dd->pport[port_num - 1];
-       u16 lid = ppd->lid;
-
-       props->lid = lid ? lid : 0;
-       props->lmc = ppd->lmc;
-       /* OPA logical states match IB logical states */
-       props->state = driver_lstate(ppd);
-       props->phys_state = hfi1_ibphys_portstate(ppd);
-       props->gid_tbl_len = HFI1_GUIDS_PER_PORT;
-       props->active_width = (u8)opa_width_to_ib(ppd->link_width_active);
-       /* see rate_show() in ib core/sysfs.c */
-       props->active_speed = (u8)opa_speed_to_ib(ppd->link_speed_active);
-       props->max_vl_num = ppd->vls_supported;
-
-       /* Once we are a "first class" citizen and have added the OPA MTUs to
-        * the core, we can advertise the larger MTU enum to the ULPs; for now,
-        * advertise only 4K.
-        *
-        * Those applications which are either OPA aware or pass the MTU enum
-        * from the Path Records to us will get the new 8k MTU.  Those that
-        * attempt to process the MTU enum may fail in various ways.
-        */
-       props->max_mtu = mtu_to_enum((!valid_ib_mtu(hfi1_max_mtu) ?
-                                     4096 : hfi1_max_mtu), IB_MTU_4096);
-       props->active_mtu = !valid_ib_mtu(ppd->ibmtu) ? props->max_mtu :
-               mtu_to_enum(ppd->ibmtu, IB_MTU_2048);
-
-       return 0;
-}
-
-static int modify_device(struct ib_device *device,
-                        int device_modify_mask,
-                        struct ib_device_modify *device_modify)
-{
-       struct hfi1_devdata *dd = dd_from_ibdev(device);
-       unsigned i;
-       int ret;
-
-       if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID |
-                                  IB_DEVICE_MODIFY_NODE_DESC)) {
-               ret = -EOPNOTSUPP;
-               goto bail;
-       }
-
-       if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC) {
-               memcpy(device->node_desc, device_modify->node_desc, 64);
-               for (i = 0; i < dd->num_pports; i++) {
-                       struct hfi1_ibport *ibp = &dd->pport[i].ibport_data;
-
-                       hfi1_node_desc_chg(ibp);
-               }
-       }
-
-       if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) {
-               ib_hfi1_sys_image_guid =
-                       cpu_to_be64(device_modify->sys_image_guid);
-               for (i = 0; i < dd->num_pports; i++) {
-                       struct hfi1_ibport *ibp = &dd->pport[i].ibport_data;
-
-                       hfi1_sys_guid_chg(ibp);
-               }
-       }
-
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-static int shut_down_port(struct rvt_dev_info *rdi, u8 port_num)
-{
-       struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
-       struct hfi1_devdata *dd = dd_from_dev(verbs_dev);
-       struct hfi1_pportdata *ppd = &dd->pport[port_num - 1];
-       int ret;
-
-       set_link_down_reason(ppd, OPA_LINKDOWN_REASON_UNKNOWN, 0,
-                            OPA_LINKDOWN_REASON_UNKNOWN);
-       ret = set_link_state(ppd, HLS_DN_DOWNDEF);
-       return ret;
-}
-
-static int hfi1_get_guid_be(struct rvt_dev_info *rdi, struct rvt_ibport *rvp,
-                           int guid_index, __be64 *guid)
-{
-       struct hfi1_ibport *ibp = container_of(rvp, struct hfi1_ibport, rvp);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-
-       if (guid_index == 0)
-               *guid = cpu_to_be64(ppd->guid);
-       else if (guid_index < HFI1_GUIDS_PER_PORT)
-               *guid = ibp->guids[guid_index - 1];
-       else
-               return -EINVAL;
-
-       return 0;
-}
-
-/*
- * Convert an AH's port and SL to an SC.
- */
-u8 ah_to_sc(struct ib_device *ibdev, struct ib_ah_attr *ah)
-{
-       struct hfi1_ibport *ibp = to_iport(ibdev, ah->port_num);
-
-       return ibp->sl_to_sc[ah->sl];
-}
-
-static int hfi1_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr)
-{
-       struct hfi1_ibport *ibp;
-       struct hfi1_pportdata *ppd;
-       struct hfi1_devdata *dd;
-       u8 sc5;
-
-       /* test the mapping for validity */
-       ibp = to_iport(ibdev, ah_attr->port_num);
-       ppd = ppd_from_ibp(ibp);
-       sc5 = ibp->sl_to_sc[ah_attr->sl];
-       dd = dd_from_ppd(ppd);
-       if (sc_to_vlt(dd, sc5) > num_vls && sc_to_vlt(dd, sc5) != 0xf)
-               return -EINVAL;
-       return 0;
-}
-
-static void hfi1_notify_new_ah(struct ib_device *ibdev,
-                              struct ib_ah_attr *ah_attr,
-                              struct rvt_ah *ah)
-{
-       struct hfi1_ibport *ibp;
-       struct hfi1_pportdata *ppd;
-       struct hfi1_devdata *dd;
-       u8 sc5;
-
-       /*
-        * Do not trust reading anything from rvt_ah at this point as it is not
-        * done being set up. We can, however, modify the things we need to set.
-        */
-
-       ibp = to_iport(ibdev, ah_attr->port_num);
-       ppd = ppd_from_ibp(ibp);
-       sc5 = ibp->sl_to_sc[ah->attr.sl];
-       dd = dd_from_ppd(ppd);
-       ah->vl = sc_to_vlt(dd, sc5);
-       if (ah->vl < num_vls || ah->vl == 15)
-               ah->log_pmtu = ilog2(dd->vld[ah->vl].mtu);
-}
-
-struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u16 dlid)
-{
-       struct ib_ah_attr attr;
-       struct ib_ah *ah = ERR_PTR(-EINVAL);
-       struct rvt_qp *qp0;
-
-       memset(&attr, 0, sizeof(attr));
-       attr.dlid = dlid;
-       attr.port_num = ppd_from_ibp(ibp)->port;
-       rcu_read_lock();
-       qp0 = rcu_dereference(ibp->rvp.qp[0]);
-       if (qp0)
-               ah = ib_create_ah(qp0->ibqp.pd, &attr);
-       rcu_read_unlock();
-       return ah;
-}
-
-/**
- * hfi1_get_npkeys - return the size of the PKEY table for context 0
- * @dd: the hfi1_ib device
- */
-unsigned hfi1_get_npkeys(struct hfi1_devdata *dd)
-{
-       return ARRAY_SIZE(dd->pport[0].pkeys);
-}
-
-static void init_ibport(struct hfi1_pportdata *ppd)
-{
-       struct hfi1_ibport *ibp = &ppd->ibport_data;
-       size_t sz = ARRAY_SIZE(ibp->sl_to_sc);
-       int i;
-
-       for (i = 0; i < sz; i++) {
-               ibp->sl_to_sc[i] = i;
-               ibp->sc_to_sl[i] = i;
-       }
-
-       spin_lock_init(&ibp->rvp.lock);
-       /* Set the prefix to the default value (see ch. 4.1.1) */
-       ibp->rvp.gid_prefix = IB_DEFAULT_GID_PREFIX;
-       ibp->rvp.sm_lid = 0;
-       /* Below should only set bits defined in OPA PortInfo.CapabilityMask */
-       ibp->rvp.port_cap_flags = IB_PORT_AUTO_MIGR_SUP |
-               IB_PORT_CAP_MASK_NOTICE_SUP;
-       ibp->rvp.pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA;
-       ibp->rvp.pma_counter_select[1] = IB_PMA_PORT_RCV_DATA;
-       ibp->rvp.pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS;
-       ibp->rvp.pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS;
-       ibp->rvp.pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT;
-
-       RCU_INIT_POINTER(ibp->rvp.qp[0], NULL);
-       RCU_INIT_POINTER(ibp->rvp.qp[1], NULL);
-}
-
-/**
- * hfi1_register_ib_device - register our device with the infiniband core
- * @dd: the device data structure
- * Return 0 if successful, errno if unsuccessful.
- */
-int hfi1_register_ib_device(struct hfi1_devdata *dd)
-{
-       struct hfi1_ibdev *dev = &dd->verbs_dev;
-       struct ib_device *ibdev = &dev->rdi.ibdev;
-       struct hfi1_pportdata *ppd = dd->pport;
-       unsigned i;
-       int ret;
-       size_t lcpysz = IB_DEVICE_NAME_MAX;
-
-       for (i = 0; i < dd->num_pports; i++)
-               init_ibport(ppd + i);
-
-       /* Only need to initialize non-zero fields. */
-
-       setup_timer(&dev->mem_timer, mem_timer, (unsigned long)dev);
-
-       seqlock_init(&dev->iowait_lock);
-       INIT_LIST_HEAD(&dev->txwait);
-       INIT_LIST_HEAD(&dev->memwait);
-
-       ret = verbs_txreq_init(dev);
-       if (ret)
-               goto err_verbs_txreq;
-
-       /*
-        * The system image GUID is supposed to be the same for all
-        * HFIs in a single system but since there can be other
-        * device types in the system, we can't be sure this is unique.
-        */
-       if (!ib_hfi1_sys_image_guid)
-               ib_hfi1_sys_image_guid = cpu_to_be64(ppd->guid);
-       lcpysz = strlcpy(ibdev->name, class_name(), lcpysz);
-       strlcpy(ibdev->name + lcpysz, "_%d", IB_DEVICE_NAME_MAX - lcpysz);
-       ibdev->owner = THIS_MODULE;
-       ibdev->node_guid = cpu_to_be64(ppd->guid);
-       ibdev->phys_port_cnt = dd->num_pports;
-       ibdev->dma_device = &dd->pcidev->dev;
-       ibdev->modify_device = modify_device;
-
-       /* keep process mad in the driver */
-       ibdev->process_mad = hfi1_process_mad;
-
-       strncpy(ibdev->node_desc, init_utsname()->nodename,
-               sizeof(ibdev->node_desc));
-
-       /*
-        * Fill in rvt info object.
-        */
-       dd->verbs_dev.rdi.driver_f.port_callback = hfi1_create_port_files;
-       dd->verbs_dev.rdi.driver_f.get_card_name = get_card_name;
-       dd->verbs_dev.rdi.driver_f.get_pci_dev = get_pci_dev;
-       dd->verbs_dev.rdi.driver_f.check_ah = hfi1_check_ah;
-       dd->verbs_dev.rdi.driver_f.notify_new_ah = hfi1_notify_new_ah;
-       dd->verbs_dev.rdi.driver_f.get_guid_be = hfi1_get_guid_be;
-       dd->verbs_dev.rdi.driver_f.query_port_state = query_port;
-       dd->verbs_dev.rdi.driver_f.shut_down_port = shut_down_port;
-       dd->verbs_dev.rdi.driver_f.cap_mask_chg = hfi1_cap_mask_chg;
-       /*
-        * Fill in rvt info device attributes.
-        */
-       hfi1_fill_device_attr(dd);
-
-       /* queue pair */
-       dd->verbs_dev.rdi.dparms.qp_table_size = hfi1_qp_table_size;
-       dd->verbs_dev.rdi.dparms.qpn_start = 0;
-       dd->verbs_dev.rdi.dparms.qpn_inc = 1;
-       dd->verbs_dev.rdi.dparms.qos_shift = dd->qos_shift;
-       dd->verbs_dev.rdi.dparms.qpn_res_start = kdeth_qp << 16;
-       dd->verbs_dev.rdi.dparms.qpn_res_end =
-               dd->verbs_dev.rdi.dparms.qpn_res_start + 65535;
-       dd->verbs_dev.rdi.dparms.max_rdma_atomic = HFI1_MAX_RDMA_ATOMIC;
-       dd->verbs_dev.rdi.dparms.psn_mask = PSN_MASK;
-       dd->verbs_dev.rdi.dparms.psn_shift = PSN_SHIFT;
-       dd->verbs_dev.rdi.dparms.psn_modify_mask = PSN_MODIFY_MASK;
-       dd->verbs_dev.rdi.dparms.core_cap_flags = RDMA_CORE_PORT_INTEL_OPA;
-       dd->verbs_dev.rdi.dparms.max_mad_size = OPA_MGMT_MAD_SIZE;
-
-       dd->verbs_dev.rdi.driver_f.qp_priv_alloc = qp_priv_alloc;
-       dd->verbs_dev.rdi.driver_f.qp_priv_free = qp_priv_free;
-       dd->verbs_dev.rdi.driver_f.free_all_qps = free_all_qps;
-       dd->verbs_dev.rdi.driver_f.notify_qp_reset = notify_qp_reset;
-       dd->verbs_dev.rdi.driver_f.do_send = hfi1_do_send;
-       dd->verbs_dev.rdi.driver_f.schedule_send = hfi1_schedule_send;
-       dd->verbs_dev.rdi.driver_f.schedule_send_no_lock = _hfi1_schedule_send;
-       dd->verbs_dev.rdi.driver_f.get_pmtu_from_attr = get_pmtu_from_attr;
-       dd->verbs_dev.rdi.driver_f.notify_error_qp = notify_error_qp;
-       dd->verbs_dev.rdi.driver_f.flush_qp_waiters = flush_qp_waiters;
-       dd->verbs_dev.rdi.driver_f.stop_send_queue = stop_send_queue;
-       dd->verbs_dev.rdi.driver_f.quiesce_qp = quiesce_qp;
-       dd->verbs_dev.rdi.driver_f.notify_error_qp = notify_error_qp;
-       dd->verbs_dev.rdi.driver_f.mtu_from_qp = mtu_from_qp;
-       dd->verbs_dev.rdi.driver_f.mtu_to_path_mtu = mtu_to_path_mtu;
-       dd->verbs_dev.rdi.driver_f.check_modify_qp = hfi1_check_modify_qp;
-       dd->verbs_dev.rdi.driver_f.modify_qp = hfi1_modify_qp;
-       dd->verbs_dev.rdi.driver_f.check_send_wqe = hfi1_check_send_wqe;
-
-       /* completion queue */
-       snprintf(dd->verbs_dev.rdi.dparms.cq_name,
-                sizeof(dd->verbs_dev.rdi.dparms.cq_name),
-                "hfi1_cq%d", dd->unit);
-       dd->verbs_dev.rdi.dparms.node = dd->node;
-
-       /* misc settings */
-       dd->verbs_dev.rdi.flags = 0; /* Let rdmavt handle it all */
-       dd->verbs_dev.rdi.dparms.lkey_table_size = hfi1_lkey_table_size;
-       dd->verbs_dev.rdi.dparms.nports = dd->num_pports;
-       dd->verbs_dev.rdi.dparms.npkeys = hfi1_get_npkeys(dd);
-
-       ppd = dd->pport;
-       for (i = 0; i < dd->num_pports; i++, ppd++)
-               rvt_init_port(&dd->verbs_dev.rdi,
-                             &ppd->ibport_data.rvp,
-                             i,
-                             ppd->pkeys);
-
-       ret = rvt_register_device(&dd->verbs_dev.rdi);
-       if (ret)
-               goto err_verbs_txreq;
-
-       ret = hfi1_verbs_register_sysfs(dd);
-       if (ret)
-               goto err_class;
-
-       return ret;
-
-err_class:
-       rvt_unregister_device(&dd->verbs_dev.rdi);
-err_verbs_txreq:
-       verbs_txreq_exit(dev);
-       dd_dev_err(dd, "cannot register verbs: %d!\n", -ret);
-       return ret;
-}
-
-void hfi1_unregister_ib_device(struct hfi1_devdata *dd)
-{
-       struct hfi1_ibdev *dev = &dd->verbs_dev;
-
-       hfi1_verbs_unregister_sysfs(dd);
-
-       rvt_unregister_device(&dd->verbs_dev.rdi);
-
-       if (!list_empty(&dev->txwait))
-               dd_dev_err(dd, "txwait list not empty!\n");
-       if (!list_empty(&dev->memwait))
-               dd_dev_err(dd, "memwait list not empty!\n");
-
-       del_timer_sync(&dev->mem_timer);
-       verbs_txreq_exit(dev);
-}
-
-void hfi1_cnp_rcv(struct hfi1_packet *packet)
-{
-       struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       struct hfi1_ib_header *hdr = packet->hdr;
-       struct rvt_qp *qp = packet->qp;
-       u32 lqpn, rqpn = 0;
-       u16 rlid = 0;
-       u8 sl, sc5, sc4_bit, svc_type;
-       bool sc4_set = has_sc4_bit(packet);
-
-       switch (packet->qp->ibqp.qp_type) {
-       case IB_QPT_UC:
-               rlid = qp->remote_ah_attr.dlid;
-               rqpn = qp->remote_qpn;
-               svc_type = IB_CC_SVCTYPE_UC;
-               break;
-       case IB_QPT_RC:
-               rlid = qp->remote_ah_attr.dlid;
-               rqpn = qp->remote_qpn;
-               svc_type = IB_CC_SVCTYPE_RC;
-               break;
-       case IB_QPT_SMI:
-       case IB_QPT_GSI:
-       case IB_QPT_UD:
-               svc_type = IB_CC_SVCTYPE_UD;
-               break;
-       default:
-               ibp->rvp.n_pkt_drops++;
-               return;
-       }
-
-       sc4_bit = sc4_set << 4;
-       sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
-       sc5 |= sc4_bit;
-       sl = ibp->sc_to_sl[sc5];
-       lqpn = qp->ibqp.qp_num;
-
-       process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type);
-}
diff --git a/drivers/staging/rdma/hfi1/verbs.h b/drivers/staging/rdma/hfi1/verbs.h
deleted file mode 100644 (file)
index 3ee2239..0000000
+++ /dev/null
@@ -1,530 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifndef HFI1_VERBS_H
-#define HFI1_VERBS_H
-
-#include <linux/types.h>
-#include <linux/seqlock.h>
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/kref.h>
-#include <linux/workqueue.h>
-#include <linux/kthread.h>
-#include <linux/completion.h>
-#include <linux/slab.h>
-#include <rdma/ib_pack.h>
-#include <rdma/ib_user_verbs.h>
-#include <rdma/ib_mad.h>
-#include <rdma/rdma_vt.h>
-#include <rdma/rdmavt_qp.h>
-#include <rdma/rdmavt_cq.h>
-
-struct hfi1_ctxtdata;
-struct hfi1_pportdata;
-struct hfi1_devdata;
-struct hfi1_packet;
-
-#include "iowait.h"
-
-#define HFI1_MAX_RDMA_ATOMIC     16
-#define HFI1_GUIDS_PER_PORT    5
-
-/*
- * Increment this value if any changes that break userspace ABI
- * compatibility are made.
- */
-#define HFI1_UVERBS_ABI_VERSION       2
-
-#define IB_SEQ_NAK     (3 << 29)
-
-/* AETH NAK opcode values */
-#define IB_RNR_NAK                      0x20
-#define IB_NAK_PSN_ERROR                0x60
-#define IB_NAK_INVALID_REQUEST          0x61
-#define IB_NAK_REMOTE_ACCESS_ERROR      0x62
-#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63
-#define IB_NAK_INVALID_RD_REQUEST       0x64
-
-/* IB Performance Manager status values */
-#define IB_PMA_SAMPLE_STATUS_DONE       0x00
-#define IB_PMA_SAMPLE_STATUS_STARTED    0x01
-#define IB_PMA_SAMPLE_STATUS_RUNNING    0x02
-
-/* Mandatory IB performance counter select values. */
-#define IB_PMA_PORT_XMIT_DATA   cpu_to_be16(0x0001)
-#define IB_PMA_PORT_RCV_DATA    cpu_to_be16(0x0002)
-#define IB_PMA_PORT_XMIT_PKTS   cpu_to_be16(0x0003)
-#define IB_PMA_PORT_RCV_PKTS    cpu_to_be16(0x0004)
-#define IB_PMA_PORT_XMIT_WAIT   cpu_to_be16(0x0005)
-
-#define HFI1_VENDOR_IPG                cpu_to_be16(0xFFA0)
-
-#define IB_BTH_REQ_ACK         BIT(31)
-#define IB_BTH_SOLICITED       BIT(23)
-#define IB_BTH_MIG_REQ         BIT(22)
-
-#define IB_GRH_VERSION         6
-#define IB_GRH_VERSION_MASK    0xF
-#define IB_GRH_VERSION_SHIFT   28
-#define IB_GRH_TCLASS_MASK     0xFF
-#define IB_GRH_TCLASS_SHIFT    20
-#define IB_GRH_FLOW_MASK       0xFFFFF
-#define IB_GRH_FLOW_SHIFT      0
-#define IB_GRH_NEXT_HDR                0x1B
-
-#define IB_DEFAULT_GID_PREFIX  cpu_to_be64(0xfe80000000000000ULL)
-
-/* flags passed by hfi1_ib_rcv() */
-enum {
-       HFI1_HAS_GRH = (1 << 0),
-};
-
-struct ib_reth {
-       __be64 vaddr;
-       __be32 rkey;
-       __be32 length;
-} __packed;
-
-struct ib_atomic_eth {
-       __be32 vaddr[2];        /* unaligned so access as 2 32-bit words */
-       __be32 rkey;
-       __be64 swap_data;
-       __be64 compare_data;
-} __packed;
-
-union ib_ehdrs {
-       struct {
-               __be32 deth[2];
-               __be32 imm_data;
-       } ud;
-       struct {
-               struct ib_reth reth;
-               __be32 imm_data;
-       } rc;
-       struct {
-               __be32 aeth;
-               __be32 atomic_ack_eth[2];
-       } at;
-       __be32 imm_data;
-       __be32 aeth;
-       struct ib_atomic_eth atomic_eth;
-}  __packed;
-
-struct hfi1_other_headers {
-       __be32 bth[3];
-       union ib_ehdrs u;
-} __packed;
-
-/*
- * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes
- * long (72 w/ imm_data).  Only the first 56 bytes of the IB header
- * will be in the eager header buffer.  The remaining 12 or 16 bytes
- * are in the data buffer.
- */
-struct hfi1_ib_header {
-       __be16 lrh[4];
-       union {
-               struct {
-                       struct ib_grh grh;
-                       struct hfi1_other_headers oth;
-               } l;
-               struct hfi1_other_headers oth;
-       } u;
-} __packed;
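
For reference, the arithmetic in the comment above works out as LRH (8 bytes) + GRH (40) + BTH (12) + DETH (8) = 68 bytes for a UD packet carrying a GRH, or 72 with the 4-byte imm_data; since only the first 56 header bytes fit in the eager header buffer, the remaining 12 or 16 bytes land in the data buffer.
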
-
-struct ahg_ib_header {
-       struct sdma_engine *sde;
-       u32 ahgdesc[2];
-       u16 tx_flags;
-       u8 ahgcount;
-       u8 ahgidx;
-       struct hfi1_ib_header ibh;
-};
-
-struct hfi1_pio_header {
-       __le64 pbc;
-       struct hfi1_ib_header hdr;
-} __packed;
-
-/*
- * hfi1 specific data structures that will be hidden from rvt after the queue
- * pair is made common
- */
-struct hfi1_qp_priv {
-       struct ahg_ib_header *s_hdr;              /* next header to send */
-       struct sdma_engine *s_sde;                /* current sde */
-       struct send_context *s_sendcontext;       /* current sendcontext */
-       u8 s_sc;                                  /* SC[0..4] for next packet */
-       u8 r_adefered;                            /* number of acks deferred */
-       struct iowait s_iowait;
-       struct timer_list s_rnr_timer;
-       struct rvt_qp *owner;
-};
-
-/*
- * This structure is used to hold commonly looked-up and computed values during
- * send engine progress.
- */
-struct hfi1_pkt_state {
-       struct hfi1_ibdev *dev;
-       struct hfi1_ibport *ibp;
-       struct hfi1_pportdata *ppd;
-       struct verbs_txreq *s_txreq;
-       unsigned long flags;
-};
-
-#define HFI1_PSN_CREDIT  16
-
-struct hfi1_opcode_stats {
-       u64 n_packets;          /* number of packets */
-       u64 n_bytes;            /* total number of bytes */
-};
-
-struct hfi1_opcode_stats_perctx {
-       struct hfi1_opcode_stats stats[256];
-};
-
-static inline void inc_opstats(
-       u32 tlen,
-       struct hfi1_opcode_stats *stats)
-{
-#ifdef CONFIG_DEBUG_FS
-       stats->n_bytes += tlen;
-       stats->n_packets++;
-#endif
-}
-
-struct hfi1_ibport {
-       struct rvt_qp __rcu *qp[2];
-       struct rvt_ibport rvp;
-
-       __be64 guids[HFI1_GUIDS_PER_PORT - 1];          /* writable GUIDs */
-
-       /* the first 16 entries are sl_to_vl for !OPA */
-       u8 sl_to_sc[32];
-       u8 sc_to_sl[32];
-};
-
-struct hfi1_ibdev {
-       struct rvt_dev_info rdi; /* Must be first */
-
-       /* QP numbers are shared by all IB ports */
-       /* protect wait lists */
-       seqlock_t iowait_lock;
-       struct list_head txwait;        /* list for wait verbs_txreq */
-       struct list_head memwait;       /* list for wait kernel memory */
-       struct list_head txreq_free;
-       struct kmem_cache *verbs_txreq_cache;
-       struct timer_list mem_timer;
-
-       u64 n_piowait;
-       u64 n_piodrain;
-       u64 n_txwait;
-       u64 n_kmem_wait;
-
-#ifdef CONFIG_DEBUG_FS
-       /* per HFI debugfs */
-       struct dentry *hfi1_ibdev_dbg;
-       /* per HFI symlinks to above */
-       struct dentry *hfi1_ibdev_link;
-#endif
-};
-
-static inline struct hfi1_ibdev *to_idev(struct ib_device *ibdev)
-{
-       struct rvt_dev_info *rdi;
-
-       rdi = container_of(ibdev, struct rvt_dev_info, ibdev);
-       return container_of(rdi, struct hfi1_ibdev, rdi);
-}
-
-static inline struct rvt_qp *iowait_to_qp(struct iowait *s_iowait)
-{
-       struct hfi1_qp_priv *priv;
-
-       priv = container_of(s_iowait, struct hfi1_qp_priv, s_iowait);
-       return priv->owner;
-}
-
-/*
- * Send if not busy or waiting for I/O and either
- * an RC response is pending or we can process send work requests.
- */
-static inline int hfi1_send_ok(struct rvt_qp *qp)
-{
-       return !(qp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT_IO)) &&
-               (qp->s_hdrwords || (qp->s_flags & RVT_S_RESP_PENDING) ||
-                !(qp->s_flags & RVT_S_ANY_WAIT_SEND));
-}
-
-/*
- * This must be called with s_lock held.
- */
-void hfi1_bad_pqkey(struct hfi1_ibport *ibp, __be16 trap_num, u32 key, u32 sl,
-                   u32 qp1, u32 qp2, u16 lid1, u16 lid2);
-void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num);
-void hfi1_sys_guid_chg(struct hfi1_ibport *ibp);
-void hfi1_node_desc_chg(struct hfi1_ibport *ibp);
-int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port,
-                    const struct ib_wc *in_wc, const struct ib_grh *in_grh,
-                    const struct ib_mad_hdr *in_mad, size_t in_mad_size,
-                    struct ib_mad_hdr *out_mad, size_t *out_mad_size,
-                    u16 *out_mad_pkey_index);
-
-/*
- * The PSN_MASK and PSN_SHIFT allow for
- * 1) comparing two PSNs
- * 2) returning the PSN with any upper bits masked
- * 3) returning the difference between two PSNs
- *
- * The number of significant bits in the PSN must
- * necessarily be at least one bit less than
- * the container holding the PSN.
- */
-#ifndef CONFIG_HFI1_VERBS_31BIT_PSN
-#define PSN_MASK 0xFFFFFF
-#define PSN_SHIFT 8
-#else
-#define PSN_MASK 0x7FFFFFFF
-#define PSN_SHIFT 1
-#endif
-#define PSN_MODIFY_MASK 0xFFFFFF
-
-/*
- * Compare the lower 24 bits of the msn values.
- * Returns an integer less than, equal to, or greater than zero.
- */
-static inline int cmp_msn(u32 a, u32 b)
-{
-       return (((int)a) - ((int)b)) << 8;
-}
-
-/*
- * Compare two PSNs
- * Returns an integer less than, equal to, or greater than zero.
- */
-static inline int cmp_psn(u32 a, u32 b)
-{
-       return (((int)a) - ((int)b)) << PSN_SHIFT;
-}
-
-/*
- * Return masked PSN
- */
-static inline u32 mask_psn(u32 a)
-{
-       return a & PSN_MASK;
-}
-
-/*
- * Return delta between two PSNs
- */
-static inline u32 delta_psn(u32 a, u32 b)
-{
-       return (((int)a - (int)b) << PSN_SHIFT) >> PSN_SHIFT;
-}
-
-struct verbs_txreq;
-void hfi1_put_txreq(struct verbs_txreq *tx);
-
-int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
-
-void hfi1_copy_sge(struct rvt_sge_state *ss, void *data, u32 length,
-                  int release, int copy_last);
-
-void hfi1_skip_sge(struct rvt_sge_state *ss, u32 length, int release);
-
-void hfi1_cnp_rcv(struct hfi1_packet *packet);
-
-void hfi1_uc_rcv(struct hfi1_packet *packet);
-
-void hfi1_rc_rcv(struct hfi1_packet *packet);
-
-void hfi1_rc_hdrerr(
-       struct hfi1_ctxtdata *rcd,
-       struct hfi1_ib_header *hdr,
-       u32 rcv_flags,
-       struct rvt_qp *qp);
-
-u8 ah_to_sc(struct ib_device *ibdev, struct ib_ah_attr *ah_attr);
-
-struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u16 dlid);
-
-void hfi1_rc_rnr_retry(unsigned long arg);
-void hfi1_add_rnr_timer(struct rvt_qp *qp, u32 to);
-void hfi1_rc_timeout(unsigned long arg);
-void hfi1_del_timers_sync(struct rvt_qp *qp);
-void hfi1_stop_rc_timers(struct rvt_qp *qp);
-
-void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_ib_header *hdr);
-
-void hfi1_rc_error(struct rvt_qp *qp, enum ib_wc_status err);
-
-void hfi1_ud_rcv(struct hfi1_packet *packet);
-
-int hfi1_lookup_pkey_idx(struct hfi1_ibport *ibp, u16 pkey);
-
-int hfi1_rvt_get_rwqe(struct rvt_qp *qp, int wr_id_only);
-
-void hfi1_migrate_qp(struct rvt_qp *qp);
-
-int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
-                        int attr_mask, struct ib_udata *udata);
-
-void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
-                   int attr_mask, struct ib_udata *udata);
-
-int hfi1_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe);
-
-extern const u32 rc_only_opcode;
-extern const u32 uc_only_opcode;
-
-static inline u8 get_opcode(struct hfi1_ib_header *h)
-{
-       u16 lnh = be16_to_cpu(h->lrh[0]) & 3;
-
-       if (lnh == IB_LNH_IBA_LOCAL)
-               return be32_to_cpu(h->u.oth.bth[0]) >> 24;
-       else
-               return be32_to_cpu(h->u.l.oth.bth[0]) >> 24;
-}
-
-int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_ib_header *hdr,
-                      int has_grh, struct rvt_qp *qp, u32 bth0);
-
-u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr,
-                 struct ib_global_route *grh, u32 hwords, u32 nwords);
-
-void hfi1_make_ruc_header(struct rvt_qp *qp, struct hfi1_other_headers *ohdr,
-                         u32 bth0, u32 bth2, int middle,
-                         struct hfi1_pkt_state *ps);
-
-void _hfi1_do_send(struct work_struct *work);
-
-void hfi1_do_send(struct rvt_qp *qp);
-
-void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
-                       enum ib_wc_status status);
-
-void hfi1_send_rc_ack(struct hfi1_ctxtdata *, struct rvt_qp *qp, int is_fecn);
-
-int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
-
-int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
-
-int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
-
-int hfi1_register_ib_device(struct hfi1_devdata *);
-
-void hfi1_unregister_ib_device(struct hfi1_devdata *);
-
-void hfi1_ib_rcv(struct hfi1_packet *packet);
-
-unsigned hfi1_get_npkeys(struct hfi1_devdata *);
-
-int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
-                       u64 pbc);
-
-int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
-                       u64 pbc);
-
-int hfi1_wss_init(void);
-void hfi1_wss_exit(void);
-
-/* platform specific: return the lowest level cache (llc) size, in KiB */
-static inline int wss_llc_size(void)
-{
-       /* assume that the boot CPU value is universal for all CPUs */
-       return boot_cpu_data.x86_cache_size;
-}
-
-/* platform specific: cacheless copy */
-static inline void cacheless_memcpy(void *dst, void *src, size_t n)
-{
-       /*
-        * Use the only available X64 cacheless copy.  Add a __user cast
-        * to quiet sparse.  The src argument is already in the kernel so
-        * there are no security issues.  The extra fault recovery machinery
-        * is not invoked.
-        */
-       __copy_user_nocache(dst, (void __user *)src, n, 0);
-}
-
-extern const enum ib_wc_opcode ib_hfi1_wc_opcode[];
-
-extern const u8 hdr_len_by_opcode[];
-
-extern const int ib_rvt_state_ops[];
-
-extern __be64 ib_hfi1_sys_image_guid;    /* in network order */
-
-extern unsigned int hfi1_max_cqes;
-
-extern unsigned int hfi1_max_cqs;
-
-extern unsigned int hfi1_max_qp_wrs;
-
-extern unsigned int hfi1_max_qps;
-
-extern unsigned int hfi1_max_sges;
-
-extern unsigned int hfi1_max_mcast_grps;
-
-extern unsigned int hfi1_max_mcast_qp_attached;
-
-extern unsigned int hfi1_max_srqs;
-
-extern unsigned int hfi1_max_srq_sges;
-
-extern unsigned int hfi1_max_srq_wrs;
-
-extern unsigned short piothreshold;
-
-extern const u32 ib_hfi1_rnr_table[];
-
-#endif                          /* HFI1_VERBS_H */
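
The PSN helpers deleted above order packet sequence numbers by shifting the signed difference so that only the 24 significant bits (31 with CONFIG_HFI1_VERBS_31BIT_PSN) decide the sign, which keeps comparisons correct across the counter wrap. Below is a minimal user-space sketch of that arithmetic, assuming the default PSN_MASK/PSN_SHIFT values; the main() harness and printed output are illustrative only, not part of the driver.

#include <stdio.h>

#define PSN_MASK  0xFFFFFF
#define PSN_SHIFT 8

/* Shifting left by PSN_SHIFT drops the bits above the 24-bit PSN, so the
 * sign of the subtraction is evaluated modulo 2^24, mirroring the driver's
 * cmp_psn(). */
static int cmp_psn(unsigned int a, unsigned int b)
{
        return (((int)a) - ((int)b)) << PSN_SHIFT;
}

/* Shift up and back down to sign-extend the 24-bit difference. */
static unsigned int delta_psn(unsigned int a, unsigned int b)
{
        return (((int)a - (int)b) << PSN_SHIFT) >> PSN_SHIFT;
}

int main(void)
{
        /* 0x000001 is two packets after 0xffffff once the PSN wraps */
        printf("cmp_psn > 0: %d\n", cmp_psn(0x000001, 0xffffff) > 0); /* 1 */
        printf("delta_psn:   %u\n", delta_psn(0x000001, 0xffffff));   /* 2 */
        return 0;
}
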
diff --git a/drivers/staging/rdma/hfi1/verbs_txreq.c b/drivers/staging/rdma/hfi1/verbs_txreq.c
deleted file mode 100644
index bc95c41..0000000
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * Copyright(c) 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "hfi.h"
-#include "verbs_txreq.h"
-#include "qp.h"
-#include "trace.h"
-
-#define TXREQ_LEN 24
-
-void hfi1_put_txreq(struct verbs_txreq *tx)
-{
-       struct hfi1_ibdev *dev;
-       struct rvt_qp *qp;
-       unsigned long flags;
-       unsigned int seq;
-       struct hfi1_qp_priv *priv;
-
-       qp = tx->qp;
-       dev = to_idev(qp->ibqp.device);
-
-       if (tx->mr)
-               rvt_put_mr(tx->mr);
-
-       sdma_txclean(dd_from_dev(dev), &tx->txreq);
-
-       /* Free verbs_txreq and return to slab cache */
-       kmem_cache_free(dev->verbs_txreq_cache, tx);
-
-       do {
-               seq = read_seqbegin(&dev->iowait_lock);
-               if (!list_empty(&dev->txwait)) {
-                       struct iowait *wait;
-
-                       write_seqlock_irqsave(&dev->iowait_lock, flags);
-                       wait = list_first_entry(&dev->txwait, struct iowait,
-                                               list);
-                       qp = iowait_to_qp(wait);
-                       priv = qp->priv;
-                       list_del_init(&priv->s_iowait.list);
-                       /* refcount held until actual wake up */
-                       write_sequnlock_irqrestore(&dev->iowait_lock, flags);
-                       hfi1_qp_wakeup(qp, RVT_S_WAIT_TX);
-                       break;
-               }
-       } while (read_seqretry(&dev->iowait_lock, seq));
-}
-
-struct verbs_txreq *__get_txreq(struct hfi1_ibdev *dev,
-                               struct rvt_qp *qp)
-{
-       struct verbs_txreq *tx = ERR_PTR(-EBUSY);
-       unsigned long flags;
-
-       spin_lock_irqsave(&qp->s_lock, flags);
-       write_seqlock(&dev->iowait_lock);
-       if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
-               struct hfi1_qp_priv *priv;
-
-               tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC);
-               if (tx)
-                       goto out;
-               priv = qp->priv;
-               if (list_empty(&priv->s_iowait.list)) {
-                       dev->n_txwait++;
-                       qp->s_flags |= RVT_S_WAIT_TX;
-                       list_add_tail(&priv->s_iowait.list, &dev->txwait);
-                       trace_hfi1_qpsleep(qp, RVT_S_WAIT_TX);
-                       atomic_inc(&qp->refcount);
-               }
-               qp->s_flags &= ~RVT_S_BUSY;
-       }
-out:
-       write_sequnlock(&dev->iowait_lock);
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-       return tx;
-}
-
-static void verbs_txreq_kmem_cache_ctor(void *obj)
-{
-       struct verbs_txreq *tx = (struct verbs_txreq *)obj;
-
-       memset(tx, 0, sizeof(*tx));
-}
-
-int verbs_txreq_init(struct hfi1_ibdev *dev)
-{
-       char buf[TXREQ_LEN];
-       struct hfi1_devdata *dd = dd_from_dev(dev);
-
-       snprintf(buf, sizeof(buf), "hfi1_%u_vtxreq_cache", dd->unit);
-       dev->verbs_txreq_cache = kmem_cache_create(buf,
-                                                  sizeof(struct verbs_txreq),
-                                                  0, SLAB_HWCACHE_ALIGN,
-                                                  verbs_txreq_kmem_cache_ctor);
-       if (!dev->verbs_txreq_cache)
-               return -ENOMEM;
-       return 0;
-}
-
-void verbs_txreq_exit(struct hfi1_ibdev *dev)
-{
-       kmem_cache_destroy(dev->verbs_txreq_cache);
-       dev->verbs_txreq_cache = NULL;
-}
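
hfi1_put_txreq() above frees the txreq and then checks the txwait list with only a seqlock read, taking dev->iowait_lock for writing just when a waiter is actually queued, and retrying the read if a writer raced with it. The following is a rough single-threaded user-space sketch of that control flow; the pthread mutex, the seq counter and the waiters integer are stand-ins for the driver's seqlock and wait list, not APIs taken from it.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_uint seq;         /* bumped twice around every write section */
static int waiters;             /* stand-in for the dev->txwait list */

static void wake_one_waiter(void)
{
        unsigned int start;

        do {
                start = atomic_load(&seq);              /* read_seqbegin() */
                if (waiters > 0) {
                        pthread_mutex_lock(&lock);      /* write_seqlock() */
                        atomic_fetch_add(&seq, 1);
                        if (waiters > 0)                /* re-check under the lock */
                                waiters--;
                        atomic_fetch_add(&seq, 1);
                        pthread_mutex_unlock(&lock);    /* write_sequnlock() */
                        break;
                }
                /* list looked empty: retry only if a writer raced with the read */
        } while (atomic_load(&seq) != start);           /* read_seqretry() */
}

int main(void)
{
        waiters = 1;
        wake_one_waiter();      /* finds a waiter, takes the lock once */
        wake_one_waiter();      /* list empty, never touches the lock */
        printf("waiters left: %d\n", waiters);
        return 0;
}
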
diff --git a/drivers/staging/rdma/hfi1/verbs_txreq.h b/drivers/staging/rdma/hfi1/verbs_txreq.h
deleted file mode 100644
index 1cf69b2..0000000
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright(c) 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifndef HFI1_VERBS_TXREQ_H
-#define HFI1_VERBS_TXREQ_H
-
-#include <linux/types.h>
-#include <linux/slab.h>
-
-#include "verbs.h"
-#include "sdma_txreq.h"
-#include "iowait.h"
-
-struct verbs_txreq {
-       struct hfi1_pio_header  phdr;
-       struct sdma_txreq       txreq;
-       struct rvt_qp           *qp;
-       struct rvt_swqe         *wqe;
-       struct rvt_mregion      *mr;
-       struct rvt_sge_state    *ss;
-       struct sdma_engine     *sde;
-       struct send_context     *psc;
-       u16                     hdr_dwords;
-};
-
-struct hfi1_ibdev;
-struct verbs_txreq *__get_txreq(struct hfi1_ibdev *dev,
-                               struct rvt_qp *qp);
-
-static inline struct verbs_txreq *get_txreq(struct hfi1_ibdev *dev,
-                                           struct rvt_qp *qp)
-{
-       struct verbs_txreq *tx;
-       struct hfi1_qp_priv *priv = qp->priv;
-
-       tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC);
-       if (unlikely(!tx)) {
-               /* call slow path to get the lock */
-               tx = __get_txreq(dev, qp);
-               if (IS_ERR(tx))
-                       return tx;
-       }
-       tx->qp = qp;
-       tx->mr = NULL;
-       tx->sde = priv->s_sde;
-       tx->psc = priv->s_sendcontext;
-       /* so that we can test if the sdma descriptors are there */
-       tx->txreq.num_desc = 0;
-       return tx;
-}
-
-static inline struct sdma_txreq *get_sdma_txreq(struct verbs_txreq *tx)
-{
-       return &tx->txreq;
-}
-
-static inline struct verbs_txreq *get_waiting_verbs_txreq(struct rvt_qp *qp)
-{
-       struct sdma_txreq *stx;
-       struct hfi1_qp_priv *priv = qp->priv;
-
-       stx = iowait_get_txhead(&priv->s_iowait);
-       if (stx)
-               return container_of(stx, struct verbs_txreq, txreq);
-       return NULL;
-}
-
-void hfi1_put_txreq(struct verbs_txreq *tx);
-int verbs_txreq_init(struct hfi1_ibdev *dev);
-void verbs_txreq_exit(struct hfi1_ibdev *dev);
-
-#endif                         /* HFI1_VERBS_TXREQ_H */
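
get_waiting_verbs_txreq() above recovers the verbs_txreq that owns an embedded sdma_txreq via container_of(), the usual kernel idiom for intrusive embedding. Here is a small user-space sketch of that recovery; the struct layouts and field names are placeholders, not the real driver layout.

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct sdma_txreq {
        int num_desc;
};

struct verbs_txreq {
        long pbc;                       /* some leading members */
        struct sdma_txreq txreq;        /* embedded request handed to the sdma layer */
};

int main(void)
{
        struct verbs_txreq vtx = { .pbc = 42, .txreq = { .num_desc = 3 } };
        struct sdma_txreq *stx = &vtx.txreq;    /* what the sdma layer hands back */

        /* recover the enclosing structure, as get_waiting_verbs_txreq() does */
        struct verbs_txreq *back = container_of(stx, struct verbs_txreq, txreq);

        printf("pbc=%ld num_desc=%d same_object=%d\n",
               back->pbc, back->txreq.num_desc, back == &vtx);
        return 0;
}
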
index 8345fb4..bbdbf9c 100644
@@ -7,3 +7,5 @@ config ISCSI_TARGET
        help
        Say M here to enable the ConfigFS enabled Linux-iSCSI.org iSCSI
        Target Mode Stack.
+
+source "drivers/target/iscsi/cxgbit/Kconfig"
index 0f43be9..0f18295 100644
@@ -18,3 +18,4 @@ iscsi_target_mod-y +=         iscsi_target_parameters.o \
                                iscsi_target_transport.o
 
 obj-$(CONFIG_ISCSI_TARGET)     += iscsi_target_mod.o
+obj-$(CONFIG_ISCSI_TARGET_CXGB4) += cxgbit/
diff --git a/drivers/target/iscsi/cxgbit/Kconfig b/drivers/target/iscsi/cxgbit/Kconfig
new file mode 100644
index 0000000..c9b6a3c
--- /dev/null
@@ -0,0 +1,7 @@
+config ISCSI_TARGET_CXGB4
+       tristate "Chelsio iSCSI target offload driver"
+       depends on ISCSI_TARGET && CHELSIO_T4 && INET
+       select CHELSIO_T4_UWIRE
+       ---help---
+       To compile this driver as a module, choose M here: the module
+       will be called cxgbit.
diff --git a/drivers/target/iscsi/cxgbit/Makefile b/drivers/target/iscsi/cxgbit/Makefile
new file mode 100644
index 0000000..bd56c07
--- /dev/null
@@ -0,0 +1,6 @@
+ccflags-y := -Idrivers/net/ethernet/chelsio/cxgb4
+ccflags-y += -Idrivers/target/iscsi
+
+obj-$(CONFIG_ISCSI_TARGET_CXGB4)  += cxgbit.o
+
+cxgbit-y  := cxgbit_main.o cxgbit_cm.o cxgbit_target.o cxgbit_ddp.o
diff --git a/drivers/target/iscsi/cxgbit/cxgbit.h b/drivers/target/iscsi/cxgbit/cxgbit.h
new file mode 100644
index 0000000..625c7f6
--- /dev/null
@@ -0,0 +1,353 @@
+/*
+ * Copyright (c) 2016 Chelsio Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __CXGBIT_H__
+#define __CXGBIT_H__
+
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/idr.h>
+#include <linux/completion.h>
+#include <linux/netdevice.h>
+#include <linux/sched.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/inet.h>
+#include <linux/wait.h>
+#include <linux/kref.h>
+#include <linux/timer.h>
+#include <linux/io.h>
+
+#include <asm/byteorder.h>
+
+#include <net/net_namespace.h>
+
+#include <target/iscsi/iscsi_transport.h>
+#include <iscsi_target_parameters.h>
+#include <iscsi_target_login.h>
+
+#include "t4_regs.h"
+#include "t4_msg.h"
+#include "cxgb4.h"
+#include "cxgb4_uld.h"
+#include "l2t.h"
+#include "cxgb4_ppm.h"
+#include "cxgbit_lro.h"
+
+extern struct mutex cdev_list_lock;
+extern struct list_head cdev_list_head;
+struct cxgbit_np;
+
+struct cxgbit_sock;
+
+struct cxgbit_cmd {
+       struct scatterlist sg;
+       struct cxgbi_task_tag_info ttinfo;
+       bool setup_ddp;
+       bool release;
+};
+
+#define CXGBIT_MAX_ISO_PAYLOAD \
+       min_t(u32, MAX_SKB_FRAGS * PAGE_SIZE, 65535)
+
+struct cxgbit_iso_info {
+       u8 flags;
+       u32 mpdu;
+       u32 len;
+       u32 burst_len;
+};
+
+enum cxgbit_skcb_flags {
+       SKCBF_TX_NEED_HDR       = (1 << 0), /* packet needs a header */
+       SKCBF_TX_FLAG_COMPL     = (1 << 1), /* wr completion flag */
+       SKCBF_TX_ISO            = (1 << 2), /* iso cpl in tx skb */
+       SKCBF_RX_LRO            = (1 << 3), /* lro skb */
+};
+
+struct cxgbit_skb_rx_cb {
+       u8 opcode;
+       void *pdu_cb;
+       void (*backlog_fn)(struct cxgbit_sock *, struct sk_buff *);
+};
+
+struct cxgbit_skb_tx_cb {
+       u8 submode;
+       u32 extra_len;
+};
+
+union cxgbit_skb_cb {
+       struct {
+               u8 flags;
+               union {
+                       struct cxgbit_skb_tx_cb tx;
+                       struct cxgbit_skb_rx_cb rx;
+               };
+       };
+
+       struct {
+               /* This member must be first. */
+               struct l2t_skb_cb l2t;
+               struct sk_buff *wr_next;
+       };
+};
+
+#define CXGBIT_SKB_CB(skb)     ((union cxgbit_skb_cb *)&((skb)->cb[0]))
+#define cxgbit_skcb_flags(skb)         (CXGBIT_SKB_CB(skb)->flags)
+#define cxgbit_skcb_submode(skb)       (CXGBIT_SKB_CB(skb)->tx.submode)
+#define cxgbit_skcb_tx_wr_next(skb)    (CXGBIT_SKB_CB(skb)->wr_next)
+#define cxgbit_skcb_tx_extralen(skb)   (CXGBIT_SKB_CB(skb)->tx.extra_len)
+#define cxgbit_skcb_rx_opcode(skb)     (CXGBIT_SKB_CB(skb)->rx.opcode)
+#define cxgbit_skcb_rx_backlog_fn(skb) (CXGBIT_SKB_CB(skb)->rx.backlog_fn)
+#define cxgbit_rx_pdu_cb(skb)          (CXGBIT_SKB_CB(skb)->rx.pdu_cb)
+
+static inline void *cplhdr(struct sk_buff *skb)
+{
+       return skb->data;
+}
+
+enum cxgbit_cdev_flags {
+       CDEV_STATE_UP = 0,
+       CDEV_ISO_ENABLE,
+       CDEV_DDP_ENABLE,
+};
+
+#define NP_INFO_HASH_SIZE 32
+
+struct np_info {
+       struct np_info *next;
+       struct cxgbit_np *cnp;
+       unsigned int stid;
+};
+
+struct cxgbit_list_head {
+       struct list_head list;
+       /* device lock */
+       spinlock_t lock;
+};
+
+struct cxgbit_device {
+       struct list_head list;
+       struct cxgb4_lld_info lldi;
+       struct np_info *np_hash_tab[NP_INFO_HASH_SIZE];
+       /* np lock */
+       spinlock_t np_lock;
+       u8 selectq[MAX_NPORTS][2];
+       struct cxgbit_list_head cskq;
+       u32 mdsl;
+       struct kref kref;
+       unsigned long flags;
+};
+
+struct cxgbit_wr_wait {
+       struct completion completion;
+       int ret;
+};
+
+enum cxgbit_csk_state {
+       CSK_STATE_IDLE = 0,
+       CSK_STATE_LISTEN,
+       CSK_STATE_CONNECTING,
+       CSK_STATE_ESTABLISHED,
+       CSK_STATE_ABORTING,
+       CSK_STATE_CLOSING,
+       CSK_STATE_MORIBUND,
+       CSK_STATE_DEAD,
+};
+
+enum cxgbit_csk_flags {
+       CSK_TX_DATA_SENT = 0,
+       CSK_LOGIN_PDU_DONE,
+       CSK_LOGIN_DONE,
+       CSK_DDP_ENABLE,
+};
+
+struct cxgbit_sock_common {
+       struct cxgbit_device *cdev;
+       struct sockaddr_storage local_addr;
+       struct sockaddr_storage remote_addr;
+       struct cxgbit_wr_wait wr_wait;
+       enum cxgbit_csk_state state;
+       unsigned long flags;
+};
+
+struct cxgbit_np {
+       struct cxgbit_sock_common com;
+       wait_queue_head_t accept_wait;
+       struct iscsi_np *np;
+       struct completion accept_comp;
+       struct list_head np_accept_list;
+       /* np accept lock */
+       spinlock_t np_accept_lock;
+       struct kref kref;
+       unsigned int stid;
+};
+
+struct cxgbit_sock {
+       struct cxgbit_sock_common com;
+       struct cxgbit_np *cnp;
+       struct iscsi_conn *conn;
+       struct l2t_entry *l2t;
+       struct dst_entry *dst;
+       struct list_head list;
+       struct sk_buff_head rxq;
+       struct sk_buff_head txq;
+       struct sk_buff_head ppodq;
+       struct sk_buff_head backlogq;
+       struct sk_buff_head skbq;
+       struct sk_buff *wr_pending_head;
+       struct sk_buff *wr_pending_tail;
+       struct sk_buff *skb;
+       struct sk_buff *lro_skb;
+       struct sk_buff *lro_hskb;
+       struct list_head accept_node;
+       /* socket lock */
+       spinlock_t lock;
+       wait_queue_head_t waitq;
+       wait_queue_head_t ack_waitq;
+       bool lock_owner;
+       struct kref kref;
+       u32 max_iso_npdu;
+       u32 wr_cred;
+       u32 wr_una_cred;
+       u32 wr_max_cred;
+       u32 snd_una;
+       u32 tid;
+       u32 snd_nxt;
+       u32 rcv_nxt;
+       u32 smac_idx;
+       u32 tx_chan;
+       u32 mtu;
+       u32 write_seq;
+       u32 rx_credits;
+       u32 snd_win;
+       u32 rcv_win;
+       u16 mss;
+       u16 emss;
+       u16 plen;
+       u16 rss_qid;
+       u16 txq_idx;
+       u16 ctrlq_idx;
+       u8 tos;
+       u8 port_id;
+#define CXGBIT_SUBMODE_HCRC 0x1
+#define CXGBIT_SUBMODE_DCRC 0x2
+       u8 submode;
+#ifdef CONFIG_CHELSIO_T4_DCB
+       u8 dcb_priority;
+#endif
+       u8 snd_wscale;
+};
+
+void _cxgbit_free_cdev(struct kref *kref);
+void _cxgbit_free_csk(struct kref *kref);
+void _cxgbit_free_cnp(struct kref *kref);
+
+static inline void cxgbit_get_cdev(struct cxgbit_device *cdev)
+{
+       kref_get(&cdev->kref);
+}
+
+static inline void cxgbit_put_cdev(struct cxgbit_device *cdev)
+{
+       kref_put(&cdev->kref, _cxgbit_free_cdev);
+}
+
+static inline void cxgbit_get_csk(struct cxgbit_sock *csk)
+{
+       kref_get(&csk->kref);
+}
+
+static inline void cxgbit_put_csk(struct cxgbit_sock *csk)
+{
+       kref_put(&csk->kref, _cxgbit_free_csk);
+}
+
+static inline void cxgbit_get_cnp(struct cxgbit_np *cnp)
+{
+       kref_get(&cnp->kref);
+}
+
+static inline void cxgbit_put_cnp(struct cxgbit_np *cnp)
+{
+       kref_put(&cnp->kref, _cxgbit_free_cnp);
+}
+
+static inline void cxgbit_sock_reset_wr_list(struct cxgbit_sock *csk)
+{
+       csk->wr_pending_tail = NULL;
+       csk->wr_pending_head = NULL;
+}
+
+static inline struct sk_buff *cxgbit_sock_peek_wr(const struct cxgbit_sock *csk)
+{
+       return csk->wr_pending_head;
+}
+
+static inline void
+cxgbit_sock_enqueue_wr(struct cxgbit_sock *csk, struct sk_buff *skb)
+{
+       cxgbit_skcb_tx_wr_next(skb) = NULL;
+
+       skb_get(skb);
+
+       if (!csk->wr_pending_head)
+               csk->wr_pending_head = skb;
+       else
+               cxgbit_skcb_tx_wr_next(csk->wr_pending_tail) = skb;
+       csk->wr_pending_tail = skb;
+}
+
+static inline struct sk_buff *cxgbit_sock_dequeue_wr(struct cxgbit_sock *csk)
+{
+       struct sk_buff *skb = csk->wr_pending_head;
+
+       if (likely(skb)) {
+               csk->wr_pending_head = cxgbit_skcb_tx_wr_next(skb);
+               cxgbit_skcb_tx_wr_next(skb) = NULL;
+       }
+       return skb;
+}
+
+typedef void (*cxgbit_cplhandler_func)(struct cxgbit_device *,
+                                      struct sk_buff *);
+
+int cxgbit_setup_np(struct iscsi_np *, struct sockaddr_storage *);
+int cxgbit_setup_conn_digest(struct cxgbit_sock *);
+int cxgbit_accept_np(struct iscsi_np *, struct iscsi_conn *);
+void cxgbit_free_np(struct iscsi_np *);
+void cxgbit_free_conn(struct iscsi_conn *);
+extern cxgbit_cplhandler_func cxgbit_cplhandlers[NUM_CPL_CMDS];
+int cxgbit_get_login_rx(struct iscsi_conn *, struct iscsi_login *);
+int cxgbit_rx_data_ack(struct cxgbit_sock *);
+int cxgbit_l2t_send(struct cxgbit_device *, struct sk_buff *,
+                   struct l2t_entry *);
+void cxgbit_push_tx_frames(struct cxgbit_sock *);
+int cxgbit_put_login_tx(struct iscsi_conn *, struct iscsi_login *, u32);
+int cxgbit_xmit_pdu(struct iscsi_conn *, struct iscsi_cmd *,
+                   struct iscsi_datain_req *, const void *, u32);
+void cxgbit_get_r2t_ttt(struct iscsi_conn *, struct iscsi_cmd *,
+                       struct iscsi_r2t *);
+u32 cxgbit_send_tx_flowc_wr(struct cxgbit_sock *);
+int cxgbit_ofld_send(struct cxgbit_device *, struct sk_buff *);
+void cxgbit_get_rx_pdu(struct iscsi_conn *);
+int cxgbit_validate_params(struct iscsi_conn *);
+struct cxgbit_device *cxgbit_find_device(struct net_device *, u8 *);
+
+/* DDP */
+int cxgbit_ddp_init(struct cxgbit_device *);
+int cxgbit_setup_conn_pgidx(struct cxgbit_sock *, u32);
+int cxgbit_reserve_ttt(struct cxgbit_sock *, struct iscsi_cmd *);
+void cxgbit_release_cmd(struct iscsi_conn *, struct iscsi_cmd *);
+
+static inline
+struct cxgbi_ppm *cdev2ppm(struct cxgbit_device *cdev)
+{
+       return (struct cxgbi_ppm *)(*cdev->lldi.iscsi_ppm);
+}
+#endif /* __CXGBIT_H__ */
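
cxgbit.h above keeps un-acked work-request skbs on a singly linked FIFO threaded through the skb control block (cxgbit_skcb_tx_wr_next()), with head and tail pointers in the socket so cxgbit_sock_enqueue_wr() and cxgbit_sock_dequeue_wr() stay O(1). Below is a small user-space sketch of that intrusive head/tail queue; struct wr and its fields are illustrative stand-ins, not the driver's types.

#include <stdio.h>

/* Stand-in for an skb that carries its queue linkage in its own cb area. */
struct wr {
        int id;
        struct wr *wr_next;     /* plays the role of cxgbit_skcb_tx_wr_next() */
};

struct wr_queue {
        struct wr *head;        /* csk->wr_pending_head */
        struct wr *tail;        /* csk->wr_pending_tail */
};

static void enqueue_wr(struct wr_queue *q, struct wr *w)
{
        w->wr_next = NULL;
        if (!q->head)
                q->head = w;            /* first entry becomes the head */
        else
                q->tail->wr_next = w;   /* otherwise append after the tail */
        q->tail = w;
}

static struct wr *dequeue_wr(struct wr_queue *q)
{
        struct wr *w = q->head;

        if (w) {
                q->head = w->wr_next;
                w->wr_next = NULL;
        }
        return w;
}

int main(void)
{
        struct wr_queue q = { NULL, NULL };
        struct wr a = { .id = 1 }, b = { .id = 2 };
        struct wr *w;

        enqueue_wr(&q, &a);
        enqueue_wr(&q, &b);
        while ((w = dequeue_wr(&q)))
                printf("completed wr %d\n", w->id);     /* 1 then 2 */
        return 0;
}
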
diff --git a/drivers/target/iscsi/cxgbit/cxgbit_cm.c b/drivers/target/iscsi/cxgbit/cxgbit_cm.c
new file mode 100644
index 0000000..0ae0b13
--- /dev/null
@@ -0,0 +1,2086 @@
+/*
+ * Copyright (c) 2016 Chelsio Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/skbuff.h>
+#include <linux/timer.h>
+#include <linux/notifier.h>
+#include <linux/inetdevice.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/if_vlan.h>
+
+#include <net/neighbour.h>
+#include <net/netevent.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+
+#include "cxgbit.h"
+#include "clip_tbl.h"
+
+static void cxgbit_init_wr_wait(struct cxgbit_wr_wait *wr_waitp)
+{
+       wr_waitp->ret = 0;
+       reinit_completion(&wr_waitp->completion);
+}
+
+static void
+cxgbit_wake_up(struct cxgbit_wr_wait *wr_waitp, const char *func, u8 ret)
+{
+       if (ret == CPL_ERR_NONE)
+               wr_waitp->ret = 0;
+       else
+               wr_waitp->ret = -EIO;
+
+       if (wr_waitp->ret)
+               pr_err("%s: err:%u", func, ret);
+
+       complete(&wr_waitp->completion);
+}
+
+static int
+cxgbit_wait_for_reply(struct cxgbit_device *cdev,
+                     struct cxgbit_wr_wait *wr_waitp, u32 tid, u32 timeout,
+                     const char *func)
+{
+       int ret;
+
+       if (!test_bit(CDEV_STATE_UP, &cdev->flags)) {
+               wr_waitp->ret = -EIO;
+               goto out;
+       }
+
+       ret = wait_for_completion_timeout(&wr_waitp->completion, timeout * HZ);
+       if (!ret) {
+               pr_info("%s - Device %s not responding tid %u\n",
+                       func, pci_name(cdev->lldi.pdev), tid);
+               wr_waitp->ret = -ETIMEDOUT;
+       }
+out:
+       if (wr_waitp->ret)
+               pr_info("%s: FW reply %d tid %u\n",
+                       pci_name(cdev->lldi.pdev), wr_waitp->ret, tid);
+       return wr_waitp->ret;
+}
+
+/* Returns whether a CPL status conveys negative advice.
+ */
+static int cxgbit_is_neg_adv(unsigned int status)
+{
+       return status == CPL_ERR_RTX_NEG_ADVICE ||
+               status == CPL_ERR_PERSIST_NEG_ADVICE ||
+               status == CPL_ERR_KEEPALV_NEG_ADVICE;
+}
+
+static int cxgbit_np_hashfn(const struct cxgbit_np *cnp)
+{
+       return ((unsigned long)cnp >> 10) & (NP_INFO_HASH_SIZE - 1);
+}
+
+static struct np_info *
+cxgbit_np_hash_add(struct cxgbit_device *cdev, struct cxgbit_np *cnp,
+                  unsigned int stid)
+{
+       struct np_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
+
+       if (p) {
+               int bucket = cxgbit_np_hashfn(cnp);
+
+               p->cnp = cnp;
+               p->stid = stid;
+               spin_lock(&cdev->np_lock);
+               p->next = cdev->np_hash_tab[bucket];
+               cdev->np_hash_tab[bucket] = p;
+               spin_unlock(&cdev->np_lock);
+       }
+
+       return p;
+}
+
+static int
+cxgbit_np_hash_find(struct cxgbit_device *cdev, struct cxgbit_np *cnp)
+{
+       int stid = -1, bucket = cxgbit_np_hashfn(cnp);
+       struct np_info *p;
+
+       spin_lock(&cdev->np_lock);
+       for (p = cdev->np_hash_tab[bucket]; p; p = p->next) {
+               if (p->cnp == cnp) {
+                       stid = p->stid;
+                       break;
+               }
+       }
+       spin_unlock(&cdev->np_lock);
+
+       return stid;
+}
+
+static int cxgbit_np_hash_del(struct cxgbit_device *cdev, struct cxgbit_np *cnp)
+{
+       int stid = -1, bucket = cxgbit_np_hashfn(cnp);
+       struct np_info *p, **prev = &cdev->np_hash_tab[bucket];
+
+       spin_lock(&cdev->np_lock);
+       for (p = *prev; p; prev = &p->next, p = p->next) {
+               if (p->cnp == cnp) {
+                       stid = p->stid;
+                       *prev = p->next;
+                       kfree(p);
+                       break;
+               }
+       }
+       spin_unlock(&cdev->np_lock);
+
+       return stid;
+}
+
+void _cxgbit_free_cnp(struct kref *kref)
+{
+       struct cxgbit_np *cnp;
+
+       cnp = container_of(kref, struct cxgbit_np, kref);
+       kfree(cnp);
+}
+
+static int
+cxgbit_create_server6(struct cxgbit_device *cdev, unsigned int stid,
+                     struct cxgbit_np *cnp)
+{
+       struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)
+                                    &cnp->com.local_addr;
+       int addr_type;
+       int ret;
+
+       pr_debug("%s: dev = %s; stid = %u; sin6_port = %u\n",
+                __func__, cdev->lldi.ports[0]->name, stid, sin6->sin6_port);
+
+       addr_type = ipv6_addr_type((const struct in6_addr *)
+                                  &sin6->sin6_addr);
+       if (addr_type != IPV6_ADDR_ANY) {
+               ret = cxgb4_clip_get(cdev->lldi.ports[0],
+                                    (const u32 *)&sin6->sin6_addr.s6_addr, 1);
+               if (ret) {
+                       pr_err("Unable to find clip table entry. laddr %pI6. Error:%d.\n",
+                              sin6->sin6_addr.s6_addr, ret);
+                       return -ENOMEM;
+               }
+       }
+
+       cxgbit_get_cnp(cnp);
+       cxgbit_init_wr_wait(&cnp->com.wr_wait);
+
+       ret = cxgb4_create_server6(cdev->lldi.ports[0],
+                                  stid, &sin6->sin6_addr,
+                                  sin6->sin6_port,
+                                  cdev->lldi.rxq_ids[0]);
+       if (!ret)
+               ret = cxgbit_wait_for_reply(cdev, &cnp->com.wr_wait,
+                                           0, 10, __func__);
+       else if (ret > 0)
+               ret = net_xmit_errno(ret);
+       else
+               cxgbit_put_cnp(cnp);
+
+       if (ret) {
+               if (ret != -ETIMEDOUT)
+                       cxgb4_clip_release(cdev->lldi.ports[0],
+                                  (const u32 *)&sin6->sin6_addr.s6_addr, 1);
+
+               pr_err("create server6 err %d stid %d laddr %pI6 lport %d\n",
+                      ret, stid, sin6->sin6_addr.s6_addr,
+                      ntohs(sin6->sin6_port));
+       }
+
+       return ret;
+}
+
+static int
+cxgbit_create_server4(struct cxgbit_device *cdev, unsigned int stid,
+                     struct cxgbit_np *cnp)
+{
+       struct sockaddr_in *sin = (struct sockaddr_in *)
+                                  &cnp->com.local_addr;
+       int ret;
+
+       pr_debug("%s: dev = %s; stid = %u; sin_port = %u\n",
+                __func__, cdev->lldi.ports[0]->name, stid, sin->sin_port);
+
+       cxgbit_get_cnp(cnp);
+       cxgbit_init_wr_wait(&cnp->com.wr_wait);
+
+       ret = cxgb4_create_server(cdev->lldi.ports[0],
+                                 stid, sin->sin_addr.s_addr,
+                                 sin->sin_port, 0,
+                                 cdev->lldi.rxq_ids[0]);
+       if (!ret)
+               ret = cxgbit_wait_for_reply(cdev,
+                                           &cnp->com.wr_wait,
+                                           0, 10, __func__);
+       else if (ret > 0)
+               ret = net_xmit_errno(ret);
+       else
+               cxgbit_put_cnp(cnp);
+
+       if (ret)
+               pr_err("create server failed err %d stid %d laddr %pI4 lport %d\n",
+                      ret, stid, &sin->sin_addr, ntohs(sin->sin_port));
+       return ret;
+}
+
+struct cxgbit_device *cxgbit_find_device(struct net_device *ndev, u8 *port_id)
+{
+       struct cxgbit_device *cdev;
+       u8 i;
+
+       list_for_each_entry(cdev, &cdev_list_head, list) {
+               struct cxgb4_lld_info *lldi = &cdev->lldi;
+
+               for (i = 0; i < lldi->nports; i++) {
+                       if (lldi->ports[i] == ndev) {
+                               if (port_id)
+                                       *port_id = i;
+                               return cdev;
+                       }
+               }
+       }
+
+       return NULL;
+}
+
+static struct net_device *cxgbit_get_real_dev(struct net_device *ndev)
+{
+       if (ndev->priv_flags & IFF_BONDING) {
+               pr_err("Bond devices are not supported. Interface:%s\n",
+                      ndev->name);
+               return NULL;
+       }
+
+       if (is_vlan_dev(ndev))
+               return vlan_dev_real_dev(ndev);
+
+       return ndev;
+}
+
+static struct net_device *cxgbit_ipv4_netdev(__be32 saddr)
+{
+       struct net_device *ndev;
+
+       ndev = __ip_dev_find(&init_net, saddr, false);
+       if (!ndev)
+               return NULL;
+
+       return cxgbit_get_real_dev(ndev);
+}
+
+static struct net_device *cxgbit_ipv6_netdev(struct in6_addr *addr6)
+{
+       struct net_device *ndev = NULL;
+       bool found = false;
+
+       if (IS_ENABLED(CONFIG_IPV6)) {
+               for_each_netdev_rcu(&init_net, ndev)
+                       if (ipv6_chk_addr(&init_net, addr6, ndev, 1)) {
+                               found = true;
+                               break;
+                       }
+       }
+       if (!found)
+               return NULL;
+       return cxgbit_get_real_dev(ndev);
+}
+
+static struct cxgbit_device *cxgbit_find_np_cdev(struct cxgbit_np *cnp)
+{
+       struct sockaddr_storage *sockaddr = &cnp->com.local_addr;
+       int ss_family = sockaddr->ss_family;
+       struct net_device *ndev = NULL;
+       struct cxgbit_device *cdev = NULL;
+
+       rcu_read_lock();
+       if (ss_family == AF_INET) {
+               struct sockaddr_in *sin;
+
+               sin = (struct sockaddr_in *)sockaddr;
+               ndev = cxgbit_ipv4_netdev(sin->sin_addr.s_addr);
+       } else if (ss_family == AF_INET6) {
+               struct sockaddr_in6 *sin6;
+
+               sin6 = (struct sockaddr_in6 *)sockaddr;
+               ndev = cxgbit_ipv6_netdev(&sin6->sin6_addr);
+       }
+       if (!ndev)
+               goto out;
+
+       cdev = cxgbit_find_device(ndev, NULL);
+out:
+       rcu_read_unlock();
+       return cdev;
+}
+
+static bool cxgbit_inaddr_any(struct cxgbit_np *cnp)
+{
+       struct sockaddr_storage *sockaddr = &cnp->com.local_addr;
+       int ss_family = sockaddr->ss_family;
+       int addr_type;
+
+       if (ss_family == AF_INET) {
+               struct sockaddr_in *sin;
+
+               sin = (struct sockaddr_in *)sockaddr;
+               if (sin->sin_addr.s_addr == htonl(INADDR_ANY))
+                       return true;
+       } else if (ss_family == AF_INET6) {
+               struct sockaddr_in6 *sin6;
+
+               sin6 = (struct sockaddr_in6 *)sockaddr;
+               addr_type = ipv6_addr_type((const struct in6_addr *)
+                               &sin6->sin6_addr);
+               if (addr_type == IPV6_ADDR_ANY)
+                       return true;
+       }
+       return false;
+}
+
+static int
+__cxgbit_setup_cdev_np(struct cxgbit_device *cdev, struct cxgbit_np *cnp)
+{
+       int stid, ret;
+       int ss_family = cnp->com.local_addr.ss_family;
+
+       if (!test_bit(CDEV_STATE_UP, &cdev->flags))
+               return -EINVAL;
+
+       stid = cxgb4_alloc_stid(cdev->lldi.tids, ss_family, cnp);
+       if (stid < 0)
+               return -EINVAL;
+
+       if (!cxgbit_np_hash_add(cdev, cnp, stid)) {
+               cxgb4_free_stid(cdev->lldi.tids, stid, ss_family);
+               return -EINVAL;
+       }
+
+       if (ss_family == AF_INET)
+               ret = cxgbit_create_server4(cdev, stid, cnp);
+       else
+               ret = cxgbit_create_server6(cdev, stid, cnp);
+
+       if (ret) {
+               if (ret != -ETIMEDOUT)
+                       cxgb4_free_stid(cdev->lldi.tids, stid,
+                                       ss_family);
+               cxgbit_np_hash_del(cdev, cnp);
+               return ret;
+       }
+       return ret;
+}
+
+static int cxgbit_setup_cdev_np(struct cxgbit_np *cnp)
+{
+       struct cxgbit_device *cdev;
+       int ret = -1;
+
+       mutex_lock(&cdev_list_lock);
+       cdev = cxgbit_find_np_cdev(cnp);
+       if (!cdev)
+               goto out;
+
+       if (cxgbit_np_hash_find(cdev, cnp) >= 0)
+               goto out;
+
+       if (__cxgbit_setup_cdev_np(cdev, cnp))
+               goto out;
+
+       cnp->com.cdev = cdev;
+       ret = 0;
+out:
+       mutex_unlock(&cdev_list_lock);
+       return ret;
+}
+
+static int cxgbit_setup_all_np(struct cxgbit_np *cnp)
+{
+       struct cxgbit_device *cdev;
+       int ret;
+       u32 count = 0;
+
+       mutex_lock(&cdev_list_lock);
+       list_for_each_entry(cdev, &cdev_list_head, list) {
+               if (cxgbit_np_hash_find(cdev, cnp) >= 0) {
+                       mutex_unlock(&cdev_list_lock);
+                       return -1;
+               }
+       }
+
+       list_for_each_entry(cdev, &cdev_list_head, list) {
+               ret = __cxgbit_setup_cdev_np(cdev, cnp);
+               if (ret == -ETIMEDOUT)
+                       break;
+               if (ret != 0)
+                       continue;
+               count++;
+       }
+       mutex_unlock(&cdev_list_lock);
+
+       return count ? 0 : -1;
+}
+
+int cxgbit_setup_np(struct iscsi_np *np, struct sockaddr_storage *ksockaddr)
+{
+       struct cxgbit_np *cnp;
+       int ret;
+
+       if ((ksockaddr->ss_family != AF_INET) &&
+           (ksockaddr->ss_family != AF_INET6))
+               return -EINVAL;
+
+       cnp = kzalloc(sizeof(*cnp), GFP_KERNEL);
+       if (!cnp)
+               return -ENOMEM;
+
+       init_waitqueue_head(&cnp->accept_wait);
+       init_completion(&cnp->com.wr_wait.completion);
+       init_completion(&cnp->accept_comp);
+       INIT_LIST_HEAD(&cnp->np_accept_list);
+       spin_lock_init(&cnp->np_accept_lock);
+       kref_init(&cnp->kref);
+       memcpy(&np->np_sockaddr, ksockaddr,
+              sizeof(struct sockaddr_storage));
+       memcpy(&cnp->com.local_addr, &np->np_sockaddr,
+              sizeof(cnp->com.local_addr));
+
+       cnp->np = np;
+       cnp->com.cdev = NULL;
+
+       if (cxgbit_inaddr_any(cnp))
+               ret = cxgbit_setup_all_np(cnp);
+       else
+               ret = cxgbit_setup_cdev_np(cnp);
+
+       if (ret) {
+               cxgbit_put_cnp(cnp);
+               return -EINVAL;
+       }
+
+       np->np_context = cnp;
+       cnp->com.state = CSK_STATE_LISTEN;
+       return 0;
+}
+
+static void
+cxgbit_set_conn_info(struct iscsi_np *np, struct iscsi_conn *conn,
+                    struct cxgbit_sock *csk)
+{
+       conn->login_family = np->np_sockaddr.ss_family;
+       conn->login_sockaddr = csk->com.remote_addr;
+       conn->local_sockaddr = csk->com.local_addr;
+}
+
+int cxgbit_accept_np(struct iscsi_np *np, struct iscsi_conn *conn)
+{
+       struct cxgbit_np *cnp = np->np_context;
+       struct cxgbit_sock *csk;
+       int ret = 0;
+
+accept_wait:
+       ret = wait_for_completion_interruptible(&cnp->accept_comp);
+       if (ret)
+               return -ENODEV;
+
+       spin_lock_bh(&np->np_thread_lock);
+       if (np->np_thread_state >= ISCSI_NP_THREAD_RESET) {
+               spin_unlock_bh(&np->np_thread_lock);
+               /*
+                * No point in stalling here when np_thread
+                * is in state RESET/SHUTDOWN/EXIT - bail
+                */
+               return -ENODEV;
+       }
+       spin_unlock_bh(&np->np_thread_lock);
+
+       spin_lock_bh(&cnp->np_accept_lock);
+       if (list_empty(&cnp->np_accept_list)) {
+               spin_unlock_bh(&cnp->np_accept_lock);
+               goto accept_wait;
+       }
+
+       csk = list_first_entry(&cnp->np_accept_list,
+                              struct cxgbit_sock,
+                              accept_node);
+
+       list_del_init(&csk->accept_node);
+       spin_unlock_bh(&cnp->np_accept_lock);
+       conn->context = csk;
+       csk->conn = conn;
+
+       cxgbit_set_conn_info(np, conn, csk);
+       return 0;
+}
+
+static int
+__cxgbit_free_cdev_np(struct cxgbit_device *cdev, struct cxgbit_np *cnp)
+{
+       int stid, ret;
+       bool ipv6 = false;
+
+       stid = cxgbit_np_hash_del(cdev, cnp);
+       if (stid < 0)
+               return -EINVAL;
+       if (!test_bit(CDEV_STATE_UP, &cdev->flags))
+               return -EINVAL;
+
+       if (cnp->np->np_sockaddr.ss_family == AF_INET6)
+               ipv6 = true;
+
+       cxgbit_get_cnp(cnp);
+       cxgbit_init_wr_wait(&cnp->com.wr_wait);
+       ret = cxgb4_remove_server(cdev->lldi.ports[0], stid,
+                                 cdev->lldi.rxq_ids[0], ipv6);
+
+       if (ret > 0)
+               ret = net_xmit_errno(ret);
+
+       if (ret) {
+               cxgbit_put_cnp(cnp);
+               return ret;
+       }
+
+       ret = cxgbit_wait_for_reply(cdev, &cnp->com.wr_wait,
+                                   0, 10, __func__);
+       if (ret == -ETIMEDOUT)
+               return ret;
+
+       if (ipv6 && cnp->com.cdev) {
+               struct sockaddr_in6 *sin6;
+
+               sin6 = (struct sockaddr_in6 *)&cnp->com.local_addr;
+               cxgb4_clip_release(cdev->lldi.ports[0],
+                                  (const u32 *)&sin6->sin6_addr.s6_addr,
+                                  1);
+       }
+
+       cxgb4_free_stid(cdev->lldi.tids, stid,
+                       cnp->com.local_addr.ss_family);
+       return 0;
+}
+
+static void cxgbit_free_all_np(struct cxgbit_np *cnp)
+{
+       struct cxgbit_device *cdev;
+       int ret;
+
+       mutex_lock(&cdev_list_lock);
+       list_for_each_entry(cdev, &cdev_list_head, list) {
+               ret = __cxgbit_free_cdev_np(cdev, cnp);
+               if (ret == -ETIMEDOUT)
+                       break;
+       }
+       mutex_unlock(&cdev_list_lock);
+}
+
+static void cxgbit_free_cdev_np(struct cxgbit_np *cnp)
+{
+       struct cxgbit_device *cdev;
+       bool found = false;
+
+       mutex_lock(&cdev_list_lock);
+       list_for_each_entry(cdev, &cdev_list_head, list) {
+               if (cdev == cnp->com.cdev) {
+                       found = true;
+                       break;
+               }
+       }
+       if (!found)
+               goto out;
+
+       __cxgbit_free_cdev_np(cdev, cnp);
+out:
+       mutex_unlock(&cdev_list_lock);
+}
+
+void cxgbit_free_np(struct iscsi_np *np)
+{
+       struct cxgbit_np *cnp = np->np_context;
+
+       cnp->com.state = CSK_STATE_DEAD;
+       if (cnp->com.cdev)
+               cxgbit_free_cdev_np(cnp);
+       else
+               cxgbit_free_all_np(cnp);
+
+       np->np_context = NULL;
+       cxgbit_put_cnp(cnp);
+}
+
+static void cxgbit_send_halfclose(struct cxgbit_sock *csk)
+{
+       struct sk_buff *skb;
+       struct cpl_close_con_req *req;
+       unsigned int len = roundup(sizeof(struct cpl_close_con_req), 16);
+
+       skb = alloc_skb(len, GFP_ATOMIC);
+       if (!skb)
+               return;
+
+       req = (struct cpl_close_con_req *)__skb_put(skb, len);
+       memset(req, 0, len);
+
+       set_wr_txq(skb, CPL_PRIORITY_DATA, csk->txq_idx);
+       INIT_TP_WR(req, csk->tid);
+       OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_CLOSE_CON_REQ,
+                                                   csk->tid));
+       req->rsvd = 0;
+
+       cxgbit_skcb_flags(skb) |= SKCBF_TX_FLAG_COMPL;
+       __skb_queue_tail(&csk->txq, skb);
+       cxgbit_push_tx_frames(csk);
+}
+
+static void cxgbit_arp_failure_discard(void *handle, struct sk_buff *skb)
+{
+       pr_debug("%s cxgbit_device %p\n", __func__, handle);
+       kfree_skb(skb);
+}
+
+static void cxgbit_abort_arp_failure(void *handle, struct sk_buff *skb)
+{
+       struct cxgbit_device *cdev = handle;
+       struct cpl_abort_req *req = cplhdr(skb);
+
+       pr_debug("%s cdev %p\n", __func__, cdev);
+       req->cmd = CPL_ABORT_NO_RST;
+       cxgbit_ofld_send(cdev, skb);
+}
+
+static int cxgbit_send_abort_req(struct cxgbit_sock *csk)
+{
+       struct cpl_abort_req *req;
+       unsigned int len = roundup(sizeof(*req), 16);
+       struct sk_buff *skb;
+
+       pr_debug("%s: csk %p tid %u; state %d\n",
+                __func__, csk, csk->tid, csk->com.state);
+
+       __skb_queue_purge(&csk->txq);
+
+       if (!test_and_set_bit(CSK_TX_DATA_SENT, &csk->com.flags))
+               cxgbit_send_tx_flowc_wr(csk);
+
+       skb = __skb_dequeue(&csk->skbq);
+       req = (struct cpl_abort_req *)__skb_put(skb, len);
+       memset(req, 0, len);
+
+       set_wr_txq(skb, CPL_PRIORITY_DATA, csk->txq_idx);
+       t4_set_arp_err_handler(skb, csk->com.cdev, cxgbit_abort_arp_failure);
+       INIT_TP_WR(req, csk->tid);
+       OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_ABORT_REQ,
+                                                   csk->tid));
+       req->cmd = CPL_ABORT_SEND_RST;
+       return cxgbit_l2t_send(csk->com.cdev, skb, csk->l2t);
+}
+
+void cxgbit_free_conn(struct iscsi_conn *conn)
+{
+       struct cxgbit_sock *csk = conn->context;
+       bool release = false;
+
+       pr_debug("%s: state %d\n",
+                __func__, csk->com.state);
+
+       spin_lock_bh(&csk->lock);
+       switch (csk->com.state) {
+       case CSK_STATE_ESTABLISHED:
+               if (conn->conn_state == TARG_CONN_STATE_IN_LOGOUT) {
+                       csk->com.state = CSK_STATE_CLOSING;
+                       cxgbit_send_halfclose(csk);
+               } else {
+                       csk->com.state = CSK_STATE_ABORTING;
+                       cxgbit_send_abort_req(csk);
+               }
+               break;
+       case CSK_STATE_CLOSING:
+               csk->com.state = CSK_STATE_MORIBUND;
+               cxgbit_send_halfclose(csk);
+               break;
+       case CSK_STATE_DEAD:
+               release = true;
+               break;
+       default:
+               pr_err("%s: csk %p; state %d\n",
+                      __func__, csk, csk->com.state);
+       }
+       spin_unlock_bh(&csk->lock);
+
+       if (release)
+               cxgbit_put_csk(csk);
+}
+
+static void cxgbit_set_emss(struct cxgbit_sock *csk, u16 opt)
+{
+       csk->emss = csk->com.cdev->lldi.mtus[TCPOPT_MSS_G(opt)] -
+                       ((csk->com.remote_addr.ss_family == AF_INET) ?
+                       sizeof(struct iphdr) : sizeof(struct ipv6hdr)) -
+                       sizeof(struct tcphdr);
+       csk->mss = csk->emss;
+       if (TCPOPT_TSTAMP_G(opt))
+               csk->emss -= round_up(TCPOLEN_TIMESTAMP, 4);
+       if (csk->emss < 128)
+               csk->emss = 128;
+       if (csk->emss & 7)
+               pr_info("Warning: misaligned mtu idx %u mss %u emss=%u\n",
+                       TCPOPT_MSS_G(opt), csk->mss, csk->emss);
+       pr_debug("%s mss_idx %u mss %u emss=%u\n", __func__, TCPOPT_MSS_G(opt),
+                csk->mss, csk->emss);
+}
+
+static void cxgbit_free_skb(struct cxgbit_sock *csk)
+{
+       struct sk_buff *skb;
+
+       __skb_queue_purge(&csk->txq);
+       __skb_queue_purge(&csk->rxq);
+       __skb_queue_purge(&csk->backlogq);
+       __skb_queue_purge(&csk->ppodq);
+       __skb_queue_purge(&csk->skbq);
+
+       while ((skb = cxgbit_sock_dequeue_wr(csk)))
+               kfree_skb(skb);
+
+       __kfree_skb(csk->lro_hskb);
+}
+
+void _cxgbit_free_csk(struct kref *kref)
+{
+       struct cxgbit_sock *csk;
+       struct cxgbit_device *cdev;
+
+       csk = container_of(kref, struct cxgbit_sock, kref);
+
+       pr_debug("%s csk %p state %d\n", __func__, csk, csk->com.state);
+
+       if (csk->com.local_addr.ss_family == AF_INET6) {
+               struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)
+                                            &csk->com.local_addr;
+               cxgb4_clip_release(csk->com.cdev->lldi.ports[0],
+                                  (const u32 *)
+                                  &sin6->sin6_addr.s6_addr, 1);
+       }
+
+       cxgb4_remove_tid(csk->com.cdev->lldi.tids, 0, csk->tid);
+       dst_release(csk->dst);
+       cxgb4_l2t_release(csk->l2t);
+
+       cdev = csk->com.cdev;
+       spin_lock_bh(&cdev->cskq.lock);
+       list_del(&csk->list);
+       spin_unlock_bh(&cdev->cskq.lock);
+
+       cxgbit_free_skb(csk);
+       cxgbit_put_cdev(cdev);
+
+       kfree(csk);
+}
+
+static void
+cxgbit_get_tuple_info(struct cpl_pass_accept_req *req, int *iptype,
+                     __u8 *local_ip, __u8 *peer_ip, __be16 *local_port,
+                     __be16 *peer_port)
+{
+       u32 eth_len = ETH_HDR_LEN_G(be32_to_cpu(req->hdr_len));
+       u32 ip_len = IP_HDR_LEN_G(be32_to_cpu(req->hdr_len));
+       struct iphdr *ip = (struct iphdr *)((u8 *)(req + 1) + eth_len);
+       struct ipv6hdr *ip6 = (struct ipv6hdr *)((u8 *)(req + 1) + eth_len);
+       struct tcphdr *tcp = (struct tcphdr *)
+                             ((u8 *)(req + 1) + eth_len + ip_len);
+
+       if (ip->version == 4) {
+               pr_debug("%s saddr 0x%x daddr 0x%x sport %u dport %u\n",
+                        __func__,
+                        ntohl(ip->saddr), ntohl(ip->daddr),
+                        ntohs(tcp->source),
+                        ntohs(tcp->dest));
+               *iptype = 4;
+               memcpy(peer_ip, &ip->saddr, 4);
+               memcpy(local_ip, &ip->daddr, 4);
+       } else {
+               pr_debug("%s saddr %pI6 daddr %pI6 sport %u dport %u\n",
+                        __func__,
+                        ip6->saddr.s6_addr, ip6->daddr.s6_addr,
+                        ntohs(tcp->source),
+                        ntohs(tcp->dest));
+               *iptype = 6;
+               memcpy(peer_ip, ip6->saddr.s6_addr, 16);
+               memcpy(local_ip, ip6->daddr.s6_addr, 16);
+       }
+
+       *peer_port = tcp->source;
+       *local_port = tcp->dest;
+}
+
+static int
+cxgbit_our_interface(struct cxgbit_device *cdev, struct net_device *egress_dev)
+{
+       u8 i;
+
+       egress_dev = cxgbit_get_real_dev(egress_dev);
+       for (i = 0; i < cdev->lldi.nports; i++)
+               if (cdev->lldi.ports[i] == egress_dev)
+                       return 1;
+       return 0;
+}
+
+static struct dst_entry *
+cxgbit_find_route6(struct cxgbit_device *cdev, __u8 *local_ip, __u8 *peer_ip,
+                  __be16 local_port, __be16 peer_port, u8 tos,
+                  __u32 sin6_scope_id)
+{
+       struct dst_entry *dst = NULL;
+
+       if (IS_ENABLED(CONFIG_IPV6)) {
+               struct flowi6 fl6;
+
+               memset(&fl6, 0, sizeof(fl6));
+               memcpy(&fl6.daddr, peer_ip, 16);
+               memcpy(&fl6.saddr, local_ip, 16);
+               if (ipv6_addr_type(&fl6.daddr) & IPV6_ADDR_LINKLOCAL)
+                       fl6.flowi6_oif = sin6_scope_id;
+               dst = ip6_route_output(&init_net, NULL, &fl6);
+               if (!dst)
+                       goto out;
+               if (!cxgbit_our_interface(cdev, ip6_dst_idev(dst)->dev) &&
+                   !(ip6_dst_idev(dst)->dev->flags & IFF_LOOPBACK)) {
+                       dst_release(dst);
+                       dst = NULL;
+               }
+       }
+out:
+       return dst;
+}
+
+static struct dst_entry *
+cxgbit_find_route(struct cxgbit_device *cdev, __be32 local_ip, __be32 peer_ip,
+                 __be16 local_port, __be16 peer_port, u8 tos)
+{
+       struct rtable *rt;
+       struct flowi4 fl4;
+       struct neighbour *n;
+
+       rt = ip_route_output_ports(&init_net, &fl4, NULL, peer_ip,
+                                  local_ip,
+                                  peer_port, local_port, IPPROTO_TCP,
+                                  tos, 0);
+       if (IS_ERR(rt))
+               return NULL;
+       n = dst_neigh_lookup(&rt->dst, &peer_ip);
+       if (!n)
+               return NULL;
+       if (!cxgbit_our_interface(cdev, n->dev) &&
+           !(n->dev->flags & IFF_LOOPBACK)) {
+               neigh_release(n);
+               dst_release(&rt->dst);
+               return NULL;
+       }
+       neigh_release(n);
+       return &rt->dst;
+}
+
+static void cxgbit_set_tcp_window(struct cxgbit_sock *csk, struct port_info *pi)
+{
+       unsigned int linkspeed;
+       u8 scale;
+
+       linkspeed = pi->link_cfg.speed;
+       scale = linkspeed / SPEED_10000;
+
+#define CXGBIT_10G_RCV_WIN (256 * 1024)
+       csk->rcv_win = CXGBIT_10G_RCV_WIN;
+       if (scale)
+               csk->rcv_win *= scale;
+
+#define CXGBIT_10G_SND_WIN (256 * 1024)
+       csk->snd_win = CXGBIT_10G_SND_WIN;
+       if (scale)
+               csk->snd_win *= scale;
+
+       pr_debug("%s snd_win %d rcv_win %d\n",
+                __func__, csk->snd_win, csk->rcv_win);
+}
+
+#ifdef CONFIG_CHELSIO_T4_DCB
+static u8 cxgbit_get_iscsi_dcb_state(struct net_device *ndev)
+{
+       return ndev->dcbnl_ops->getstate(ndev);
+}
+
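+/* Map a DCB priority bitmask to its lowest set priority (0 if none is set). */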
+static int cxgbit_select_priority(int pri_mask)
+{
+       if (!pri_mask)
+               return 0;
+
+       return (ffs(pri_mask) - 1);
+}
+
+static u8 cxgbit_get_iscsi_dcb_priority(struct net_device *ndev, u16 local_port)
+{
+       int ret;
+       u8 caps;
+
+       struct dcb_app iscsi_dcb_app = {
+               .protocol = local_port
+       };
+
+       ret = (int)ndev->dcbnl_ops->getcap(ndev, DCB_CAP_ATTR_DCBX, &caps);
+
+       if (ret)
+               return 0;
+
+       if (caps & DCB_CAP_DCBX_VER_IEEE) {
+               iscsi_dcb_app.selector = IEEE_8021QAZ_APP_SEL_ANY;
+
+               ret = dcb_ieee_getapp_mask(ndev, &iscsi_dcb_app);
+
+       } else if (caps & DCB_CAP_DCBX_VER_CEE) {
+               iscsi_dcb_app.selector = DCB_APP_IDTYPE_PORTNUM;
+
+               ret = dcb_getapp(ndev, &iscsi_dcb_app);
+       }
+
+       pr_info("iSCSI priority is set to %u\n", cxgbit_select_priority(ret));
+
+       return cxgbit_select_priority(ret);
+}
+#endif
+
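+/*
+ * Resolve the neighbour for the peer and program per-connection offload
+ * state: L2T entry, MTU, tx channel, tx/rx queue indices and TCP windows.
+ */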
+static int
+cxgbit_offload_init(struct cxgbit_sock *csk, int iptype, __u8 *peer_ip,
+                   u16 local_port, struct dst_entry *dst,
+                   struct cxgbit_device *cdev)
+{
+       struct neighbour *n;
+       int ret, step;
+       struct net_device *ndev;
+       u16 rxq_idx, port_id;
+#ifdef CONFIG_CHELSIO_T4_DCB
+       u8 priority = 0;
+#endif
+
+       n = dst_neigh_lookup(dst, peer_ip);
+       if (!n)
+               return -ENODEV;
+
+       rcu_read_lock();
+       ret = -ENOMEM;
+       if (n->dev->flags & IFF_LOOPBACK) {
+               if (iptype == 4)
+                       ndev = cxgbit_ipv4_netdev(*(__be32 *)peer_ip);
+               else if (IS_ENABLED(CONFIG_IPV6))
+                       ndev = cxgbit_ipv6_netdev((struct in6_addr *)peer_ip);
+               else
+                       ndev = NULL;
+
+               if (!ndev) {
+                       ret = -ENODEV;
+                       goto out;
+               }
+
+               csk->l2t = cxgb4_l2t_get(cdev->lldi.l2t,
+                                        n, ndev, 0);
+               if (!csk->l2t)
+                       goto out;
+               csk->mtu = ndev->mtu;
+               csk->tx_chan = cxgb4_port_chan(ndev);
+               csk->smac_idx = (cxgb4_port_viid(ndev) & 0x7F) << 1;
+               step = cdev->lldi.ntxq /
+                       cdev->lldi.nchan;
+               csk->txq_idx = cxgb4_port_idx(ndev) * step;
+               step = cdev->lldi.nrxq /
+                       cdev->lldi.nchan;
+               csk->ctrlq_idx = cxgb4_port_idx(ndev);
+               csk->rss_qid = cdev->lldi.rxq_ids[
+                               cxgb4_port_idx(ndev) * step];
+               csk->port_id = cxgb4_port_idx(ndev);
+               cxgbit_set_tcp_window(csk,
+                                     (struct port_info *)netdev_priv(ndev));
+       } else {
+               ndev = cxgbit_get_real_dev(n->dev);
+               if (!ndev) {
+                       ret = -ENODEV;
+                       goto out;
+               }
+
+#ifdef CONFIG_CHELSIO_T4_DCB
+               if (cxgbit_get_iscsi_dcb_state(ndev))
+                       priority = cxgbit_get_iscsi_dcb_priority(ndev,
+                                                                local_port);
+
+               csk->dcb_priority = priority;
+
+               csk->l2t = cxgb4_l2t_get(cdev->lldi.l2t, n, ndev, priority);
+#else
+               csk->l2t = cxgb4_l2t_get(cdev->lldi.l2t, n, ndev, 0);
+#endif
+               if (!csk->l2t)
+                       goto out;
+               port_id = cxgb4_port_idx(ndev);
+               csk->mtu = dst_mtu(dst);
+               csk->tx_chan = cxgb4_port_chan(ndev);
+               csk->smac_idx = (cxgb4_port_viid(ndev) & 0x7F) << 1;
+               step = cdev->lldi.ntxq /
+                       cdev->lldi.nports;
+               csk->txq_idx = (port_id * step) +
+                               (cdev->selectq[port_id][0]++ % step);
+               csk->ctrlq_idx = cxgb4_port_idx(ndev);
+               step = cdev->lldi.nrxq /
+                       cdev->lldi.nports;
+               rxq_idx = (port_id * step) +
+                               (cdev->selectq[port_id][1]++ % step);
+               csk->rss_qid = cdev->lldi.rxq_ids[rxq_idx];
+               csk->port_id = port_id;
+               cxgbit_set_tcp_window(csk,
+                                     (struct port_info *)netdev_priv(ndev));
+       }
+       ret = 0;
+out:
+       rcu_read_unlock();
+       neigh_release(n);
+       return ret;
+}
+
+int cxgbit_ofld_send(struct cxgbit_device *cdev, struct sk_buff *skb)
+{
+       int ret = 0;
+
+       if (!test_bit(CDEV_STATE_UP, &cdev->flags)) {
+               kfree_skb(skb);
+               pr_err("%s - device not up - dropping\n", __func__);
+               return -EIO;
+       }
+
+       ret = cxgb4_ofld_send(cdev->lldi.ports[0], skb);
+       if (ret < 0)
+               kfree_skb(skb);
+       return ret < 0 ? ret : 0;
+}
+
+static void cxgbit_release_tid(struct cxgbit_device *cdev, u32 tid)
+{
+       struct cpl_tid_release *req;
+       unsigned int len = roundup(sizeof(*req), 16);
+       struct sk_buff *skb;
+
+       skb = alloc_skb(len, GFP_ATOMIC);
+       if (!skb)
+               return;
+
+       req = (struct cpl_tid_release *)__skb_put(skb, len);
+       memset(req, 0, len);
+
+       INIT_TP_WR(req, tid);
+       OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(
+                  CPL_TID_RELEASE, tid));
+       set_wr_txq(skb, CPL_PRIORITY_SETUP, 0);
+       cxgbit_ofld_send(cdev, skb);
+}
+
+int
+cxgbit_l2t_send(struct cxgbit_device *cdev, struct sk_buff *skb,
+               struct l2t_entry *l2e)
+{
+       int ret = 0;
+
+       if (!test_bit(CDEV_STATE_UP, &cdev->flags)) {
+               kfree_skb(skb);
+               pr_err("%s - device not up - dropping\n", __func__);
+               return -EIO;
+       }
+
+       ret = cxgb4_l2t_send(cdev->lldi.ports[0], skb, l2e);
+       if (ret < 0)
+               kfree_skb(skb);
+       return ret < 0 ? ret : 0;
+}
+
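+/*
+ * Pick an MTU table index whose TCP payload is 8-byte aligned, accounting
+ * for the IP/TCP headers and, when negotiated, the timestamp option.
+ */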
+static void
+cxgbit_best_mtu(const unsigned short *mtus, unsigned short mtu,
+               unsigned int *idx, int use_ts, int ipv6)
+{
+       unsigned short hdr_size = (ipv6 ? sizeof(struct ipv6hdr) :
+                                  sizeof(struct iphdr)) +
+                                  sizeof(struct tcphdr) +
+                                  (use_ts ? round_up(TCPOLEN_TIMESTAMP,
+                                   4) : 0);
+       unsigned short data_size = mtu - hdr_size;
+
+       cxgb4_best_aligned_mtu(mtus, hdr_size, data_size, 8, idx);
+}
+
+static void cxgbit_send_rx_credits(struct cxgbit_sock *csk, struct sk_buff *skb)
+{
+       if (csk->com.state != CSK_STATE_ESTABLISHED) {
+               __kfree_skb(skb);
+               return;
+       }
+
+       cxgbit_ofld_send(csk->com.cdev, skb);
+}
+
+/*
+ * CPL connection rx data ack: host ->
+ * Send RX credits through an RX_DATA_ACK CPL message.
+ * Returns 0 on success, -1 if the skb could not be allocated.
+ */
+int cxgbit_rx_data_ack(struct cxgbit_sock *csk)
+{
+       struct sk_buff *skb;
+       struct cpl_rx_data_ack *req;
+       unsigned int len = roundup(sizeof(*req), 16);
+
+       skb = alloc_skb(len, GFP_KERNEL);
+       if (!skb)
+               return -1;
+
+       req = (struct cpl_rx_data_ack *)__skb_put(skb, len);
+       memset(req, 0, len);
+
+       set_wr_txq(skb, CPL_PRIORITY_ACK, csk->ctrlq_idx);
+       INIT_TP_WR(req, csk->tid);
+       OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_RX_DATA_ACK,
+                                                   csk->tid));
+       req->credit_dack = cpu_to_be32(RX_DACK_CHANGE_F | RX_DACK_MODE_V(1) |
+                                      RX_CREDITS_V(csk->rx_credits));
+
+       csk->rx_credits = 0;
+
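+       /* If another context owns the sock lock, defer the send via backlogq. */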
+       spin_lock_bh(&csk->lock);
+       if (csk->lock_owner) {
+               cxgbit_skcb_rx_backlog_fn(skb) = cxgbit_send_rx_credits;
+               __skb_queue_tail(&csk->backlogq, skb);
+               spin_unlock_bh(&csk->lock);
+               return 0;
+       }
+
+       cxgbit_send_rx_credits(csk, skb);
+       spin_unlock_bh(&csk->lock);
+
+       return 0;
+}
+
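+/* 9 mandatory FLOWC parameters, plus optional window-scale and DCB entries. */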
+#define FLOWC_WR_NPARAMS_MIN    9
+#define FLOWC_WR_NPARAMS_MAX   11
+static int cxgbit_alloc_csk_skb(struct cxgbit_sock *csk)
+{
+       struct sk_buff *skb;
+       u32 len, flowclen;
+       u8 i;
+
+       flowclen = offsetof(struct fw_flowc_wr,
+                           mnemval[FLOWC_WR_NPARAMS_MAX]);
+
+       len = max_t(u32, sizeof(struct cpl_abort_req),
+                   sizeof(struct cpl_abort_rpl));
+
+       len = max(len, flowclen);
+       len = roundup(len, 16);
+
+       for (i = 0; i < 3; i++) {
+               skb = alloc_skb(len, GFP_ATOMIC);
+               if (!skb)
+                       goto out;
+               __skb_queue_tail(&csk->skbq, skb);
+       }
+
+       skb = alloc_skb(LRO_SKB_MIN_HEADROOM, GFP_ATOMIC);
+       if (!skb)
+               goto out;
+
+       memset(skb->data, 0, LRO_SKB_MIN_HEADROOM);
+       csk->lro_hskb = skb;
+
+       return 0;
+out:
+       __skb_queue_purge(&csk->skbq);
+       return -ENOMEM;
+}
+
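+/* Smallest TCP window scale (capped at 14) such that 65535 << wscale covers win. */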
+static u32 cxgbit_compute_wscale(u32 win)
+{
+       u32 wscale = 0;
+
+       while (wscale < 14 && (65535 << wscale) < win)
+               wscale++;
+       return wscale;
+}
+
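+/*
+ * Build and send a CPL_PASS_ACCEPT_RPL carrying the connection's opt0/opt2
+ * settings (window scale, MSS index, L2T index, congestion control, etc.).
+ */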
+static void
+cxgbit_pass_accept_rpl(struct cxgbit_sock *csk, struct cpl_pass_accept_req *req)
+{
+       struct sk_buff *skb;
+       const struct tcphdr *tcph;
+       struct cpl_t5_pass_accept_rpl *rpl5;
+       unsigned int len = roundup(sizeof(*rpl5), 16);
+       unsigned int mtu_idx;
+       u64 opt0;
+       u32 opt2, hlen;
+       u32 wscale;
+       u32 win;
+
+       pr_debug("%s csk %p tid %u\n", __func__, csk, csk->tid);
+
+       skb = alloc_skb(len, GFP_ATOMIC);
+       if (!skb) {
+               cxgbit_put_csk(csk);
+               return;
+       }
+
+       rpl5 = (struct cpl_t5_pass_accept_rpl *)__skb_put(skb, len);
+       memset(rpl5, 0, len);
+
+       INIT_TP_WR(rpl5, csk->tid);
+       OPCODE_TID(rpl5) = cpu_to_be32(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL,
+                                                    csk->tid));
+       cxgbit_best_mtu(csk->com.cdev->lldi.mtus, csk->mtu, &mtu_idx,
+                       req->tcpopt.tstamp,
+                       (csk->com.remote_addr.ss_family == AF_INET) ? 0 : 1);
+       wscale = cxgbit_compute_wscale(csk->rcv_win);
+       /*
+        * Specify the largest window that will fit in opt0. The
+        * remainder will be specified in the rx_data_ack.
+        */
+       win = csk->rcv_win >> 10;
+       if (win > RCV_BUFSIZ_M)
+               win = RCV_BUFSIZ_M;
+       opt0 =  TCAM_BYPASS_F |
+               WND_SCALE_V(wscale) |
+               MSS_IDX_V(mtu_idx) |
+               L2T_IDX_V(csk->l2t->idx) |
+               TX_CHAN_V(csk->tx_chan) |
+               SMAC_SEL_V(csk->smac_idx) |
+               DSCP_V(csk->tos >> 2) |
+               ULP_MODE_V(ULP_MODE_ISCSI) |
+               RCV_BUFSIZ_V(win);
+
+       opt2 = RX_CHANNEL_V(0) |
+               RSS_QUEUE_VALID_F | RSS_QUEUE_V(csk->rss_qid);
+
+       if (req->tcpopt.tstamp)
+               opt2 |= TSTAMPS_EN_F;
+       if (req->tcpopt.sack)
+               opt2 |= SACK_EN_F;
+       if (wscale)
+               opt2 |= WND_SCALE_EN_F;
+
+       hlen = ntohl(req->hdr_len);
+       tcph = (const void *)(req + 1) + ETH_HDR_LEN_G(hlen) +
+               IP_HDR_LEN_G(hlen);
+
+       if (tcph->ece && tcph->cwr)
+               opt2 |= CCTRL_ECN_V(1);
+
+       opt2 |= RX_COALESCE_V(3);
+       opt2 |= CONG_CNTRL_V(CONG_ALG_NEWRENO);
+
+       opt2 |= T5_ISS_F;
+       rpl5->iss = cpu_to_be32((prandom_u32() & ~7UL) - 1);
+
+       opt2 |= T5_OPT_2_VALID_F;
+
+       rpl5->opt0 = cpu_to_be64(opt0);
+       rpl5->opt2 = cpu_to_be32(opt2);
+       set_wr_txq(skb, CPL_PRIORITY_SETUP, csk->ctrlq_idx);
+       t4_set_arp_err_handler(skb, NULL, cxgbit_arp_failure_discard);
+       cxgbit_l2t_send(csk->com.cdev, skb, csk->l2t);
+}
+
+static void
+cxgbit_pass_accept_req(struct cxgbit_device *cdev, struct sk_buff *skb)
+{
+       struct cxgbit_sock *csk = NULL;
+       struct cxgbit_np *cnp;
+       struct cpl_pass_accept_req *req = cplhdr(skb);
+       unsigned int stid = PASS_OPEN_TID_G(ntohl(req->tos_stid));
+       struct tid_info *t = cdev->lldi.tids;
+       unsigned int tid = GET_TID(req);
+       u16 peer_mss = ntohs(req->tcpopt.mss);
+       unsigned short hdrs;
+
+       struct dst_entry *dst;
+       __u8 local_ip[16], peer_ip[16];
+       __be16 local_port, peer_port;
+       int ret;
+       int iptype;
+
+       pr_debug("%s: cdev = %p; stid = %u; tid = %u\n",
+                __func__, cdev, stid, tid);
+
+       cnp = lookup_stid(t, stid);
+       if (!cnp) {
+               pr_err("%s connect request on invalid stid %d\n",
+                      __func__, stid);
+               goto rel_skb;
+       }
+
+       if (cnp->com.state != CSK_STATE_LISTEN) {
+               pr_err("%s - listening parent not in CSK_STATE_LISTEN\n",
+                      __func__);
+               goto reject;
+       }
+
+       csk = lookup_tid(t, tid);
+       if (csk) {
+               pr_err("%s csk not null tid %u\n",
+                      __func__, tid);
+               goto rel_skb;
+       }
+
+       cxgbit_get_tuple_info(req, &iptype, local_ip, peer_ip,
+                             &local_port, &peer_port);
+
+       /* Find output route */
+       if (iptype == 4)  {
+               pr_debug("%s parent sock %p tid %u laddr %pI4 raddr %pI4 "
+                        "lport %d rport %d peer_mss %d\n"
+                        , __func__, cnp, tid,
+                        local_ip, peer_ip, ntohs(local_port),
+                        ntohs(peer_port), peer_mss);
+               dst = cxgbit_find_route(cdev, *(__be32 *)local_ip,
+                                       *(__be32 *)peer_ip,
+                                       local_port, peer_port,
+                                       PASS_OPEN_TOS_G(ntohl(req->tos_stid)));
+       } else {
+               pr_debug("%s parent sock %p tid %u laddr %pI6 raddr %pI6 "
+                        "lport %d rport %d peer_mss %d\n"
+                        , __func__, cnp, tid,
+                        local_ip, peer_ip, ntohs(local_port),
+                        ntohs(peer_port), peer_mss);
+               dst = cxgbit_find_route6(cdev, local_ip, peer_ip,
+                                        local_port, peer_port,
+                                        PASS_OPEN_TOS_G(ntohl(req->tos_stid)),
+                                        ((struct sockaddr_in6 *)
+                                        &cnp->com.local_addr)->sin6_scope_id);
+       }
+       if (!dst) {
+               pr_err("%s - failed to find dst entry!\n",
+                      __func__);
+               goto reject;
+       }
+
+       csk = kzalloc(sizeof(*csk), GFP_ATOMIC);
+       if (!csk) {
+               dst_release(dst);
+               goto rel_skb;
+       }
+
+       ret = cxgbit_offload_init(csk, iptype, peer_ip, ntohs(local_port),
+                                 dst, cdev);
+       if (ret) {
+               pr_err("%s - failed to allocate l2t entry!\n",
+                      __func__);
+               dst_release(dst);
+               kfree(csk);
+               goto reject;
+       }
+
+       kref_init(&csk->kref);
+       init_completion(&csk->com.wr_wait.completion);
+
+       INIT_LIST_HEAD(&csk->accept_node);
+
+       hdrs = (iptype == 4 ? sizeof(struct iphdr) : sizeof(struct ipv6hdr)) +
+               sizeof(struct tcphdr) + (req->tcpopt.tstamp ? 12 : 0);
+       if (peer_mss && csk->mtu > (peer_mss + hdrs))
+               csk->mtu = peer_mss + hdrs;
+
+       csk->com.state = CSK_STATE_CONNECTING;
+       csk->com.cdev = cdev;
+       csk->cnp = cnp;
+       csk->tos = PASS_OPEN_TOS_G(ntohl(req->tos_stid));
+       csk->dst = dst;
+       csk->tid = tid;
+       csk->wr_cred = cdev->lldi.wr_cred -
+                       DIV_ROUND_UP(sizeof(struct cpl_abort_req), 16);
+       csk->wr_max_cred = csk->wr_cred;
+       csk->wr_una_cred = 0;
+
+       if (iptype == 4) {
+               struct sockaddr_in *sin = (struct sockaddr_in *)
+                                         &csk->com.local_addr;
+               sin->sin_family = AF_INET;
+               sin->sin_port = local_port;
+               sin->sin_addr.s_addr = *(__be32 *)local_ip;
+
+               sin = (struct sockaddr_in *)&csk->com.remote_addr;
+               sin->sin_family = AF_INET;
+               sin->sin_port = peer_port;
+               sin->sin_addr.s_addr = *(__be32 *)peer_ip;
+       } else {
+               struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)
+                                           &csk->com.local_addr;
+
+               sin6->sin6_family = PF_INET6;
+               sin6->sin6_port = local_port;
+               memcpy(sin6->sin6_addr.s6_addr, local_ip, 16);
+               cxgb4_clip_get(cdev->lldi.ports[0],
+                              (const u32 *)&sin6->sin6_addr.s6_addr,
+                              1);
+
+               sin6 = (struct sockaddr_in6 *)&csk->com.remote_addr;
+               sin6->sin6_family = PF_INET6;
+               sin6->sin6_port = peer_port;
+               memcpy(sin6->sin6_addr.s6_addr, peer_ip, 16);
+       }
+
+       skb_queue_head_init(&csk->rxq);
+       skb_queue_head_init(&csk->txq);
+       skb_queue_head_init(&csk->ppodq);
+       skb_queue_head_init(&csk->backlogq);
+       skb_queue_head_init(&csk->skbq);
+       cxgbit_sock_reset_wr_list(csk);
+       spin_lock_init(&csk->lock);
+       init_waitqueue_head(&csk->waitq);
+       init_waitqueue_head(&csk->ack_waitq);
+       csk->lock_owner = false;
+
+       if (cxgbit_alloc_csk_skb(csk)) {
+               dst_release(dst);
+               kfree(csk);
+               goto rel_skb;
+       }
+
+       cxgbit_get_cdev(cdev);
+
+       spin_lock(&cdev->cskq.lock);
+       list_add_tail(&csk->list, &cdev->cskq.list);
+       spin_unlock(&cdev->cskq.lock);
+
+       cxgb4_insert_tid(t, csk, tid);
+       cxgbit_pass_accept_rpl(csk, req);
+       goto rel_skb;
+
+reject:
+       cxgbit_release_tid(cdev, tid);
+rel_skb:
+       __kfree_skb(skb);
+}
+
+static u32
+cxgbit_tx_flowc_wr_credits(struct cxgbit_sock *csk, u32 *nparamsp,
+                          u32 *flowclenp)
+{
+       u32 nparams, flowclen16, flowclen;
+
+       nparams = FLOWC_WR_NPARAMS_MIN;
+
+       if (csk->snd_wscale)
+               nparams++;
+
+#ifdef CONFIG_CHELSIO_T4_DCB
+       nparams++;
+#endif
+       flowclen = offsetof(struct fw_flowc_wr, mnemval[nparams]);
+       flowclen16 = DIV_ROUND_UP(flowclen, 16);
+       flowclen = flowclen16 * 16;
+       /*
+        * Return the number of 16-byte credits used by the flowc request.
+        * Pass back the nparams and actual flowc length if requested.
+        */
+       if (nparamsp)
+               *nparamsp = nparams;
+       if (flowclenp)
+               *flowclenp = flowclen;
+       return flowclen16;
+}
+
+u32 cxgbit_send_tx_flowc_wr(struct cxgbit_sock *csk)
+{
+       struct cxgbit_device *cdev = csk->com.cdev;
+       struct fw_flowc_wr *flowc;
+       u32 nparams, flowclen16, flowclen;
+       struct sk_buff *skb;
+       u8 index;
+
+#ifdef CONFIG_CHELSIO_T4_DCB
+       u16 vlan = ((struct l2t_entry *)csk->l2t)->vlan;
+#endif
+
+       flowclen16 = cxgbit_tx_flowc_wr_credits(csk, &nparams, &flowclen);
+
+       skb = __skb_dequeue(&csk->skbq);
+       flowc = (struct fw_flowc_wr *)__skb_put(skb, flowclen);
+       memset(flowc, 0, flowclen);
+
+       flowc->op_to_nparams = cpu_to_be32(FW_WR_OP_V(FW_FLOWC_WR) |
+                                          FW_FLOWC_WR_NPARAMS_V(nparams));
+       flowc->flowid_len16 = cpu_to_be32(FW_WR_LEN16_V(flowclen16) |
+                                         FW_WR_FLOWID_V(csk->tid));
+       flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
+       flowc->mnemval[0].val = cpu_to_be32(FW_PFVF_CMD_PFN_V
+                                           (csk->com.cdev->lldi.pf));
+       flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
+       flowc->mnemval[1].val = cpu_to_be32(csk->tx_chan);
+       flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
+       flowc->mnemval[2].val = cpu_to_be32(csk->tx_chan);
+       flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
+       flowc->mnemval[3].val = cpu_to_be32(csk->rss_qid);
+       flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDNXT;
+       flowc->mnemval[4].val = cpu_to_be32(csk->snd_nxt);
+       flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_RCVNXT;
+       flowc->mnemval[5].val = cpu_to_be32(csk->rcv_nxt);
+       flowc->mnemval[6].mnemonic = FW_FLOWC_MNEM_SNDBUF;
+       flowc->mnemval[6].val = cpu_to_be32(csk->snd_win);
+       flowc->mnemval[7].mnemonic = FW_FLOWC_MNEM_MSS;
+       flowc->mnemval[7].val = cpu_to_be32(csk->emss);
+
+       flowc->mnemval[8].mnemonic = FW_FLOWC_MNEM_TXDATAPLEN_MAX;
+       if (test_bit(CDEV_ISO_ENABLE, &cdev->flags))
+               flowc->mnemval[8].val = cpu_to_be32(CXGBIT_MAX_ISO_PAYLOAD);
+       else
+               flowc->mnemval[8].val = cpu_to_be32(16384);
+
+       index = 9;
+
+       if (csk->snd_wscale) {
+               flowc->mnemval[index].mnemonic = FW_FLOWC_MNEM_RCV_SCALE;
+               flowc->mnemval[index].val = cpu_to_be32(csk->snd_wscale);
+               index++;
+       }
+
+#ifdef CONFIG_CHELSIO_T4_DCB
+       flowc->mnemval[index].mnemonic = FW_FLOWC_MNEM_DCBPRIO;
+       if (vlan == VLAN_NONE) {
+               pr_warn("csk %u without VLAN Tag on DCB Link\n", csk->tid);
+               flowc->mnemval[index].val = cpu_to_be32(0);
+       } else {
+               flowc->mnemval[index].val = cpu_to_be32(
+                               (vlan & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT);
+       }
+#endif
+
+       pr_debug("%s: csk %p; tx_chan = %u; rss_qid = %u; snd_seq = %u;"
+                " rcv_seq = %u; snd_win = %u; emss = %u\n",
+                __func__, csk, csk->tx_chan, csk->rss_qid, csk->snd_nxt,
+                csk->rcv_nxt, csk->snd_win, csk->emss);
+       set_wr_txq(skb, CPL_PRIORITY_DATA, csk->txq_idx);
+       cxgbit_ofld_send(csk->com.cdev, skb);
+       return flowclen16;
+}
+
+int cxgbit_setup_conn_digest(struct cxgbit_sock *csk)
+{
+       struct sk_buff *skb;
+       struct cpl_set_tcb_field *req;
+       u8 hcrc = csk->submode & CXGBIT_SUBMODE_HCRC;
+       u8 dcrc = csk->submode & CXGBIT_SUBMODE_DCRC;
+       unsigned int len = roundup(sizeof(*req), 16);
+       int ret;
+
+       skb = alloc_skb(len, GFP_KERNEL);
+       if (!skb)
+               return -ENOMEM;
+
+       /*  set up ulp submode */
+       req = (struct cpl_set_tcb_field *)__skb_put(skb, len);
+       memset(req, 0, len);
+
+       INIT_TP_WR(req, csk->tid);
+       OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, csk->tid));
+       req->reply_ctrl = htons(NO_REPLY_V(0) | QUEUENO_V(csk->rss_qid));
+       req->word_cookie = htons(0);
+       req->mask = cpu_to_be64(0x3 << 4);
+       req->val = cpu_to_be64(((hcrc ? ULP_CRC_HEADER : 0) |
+                               (dcrc ? ULP_CRC_DATA : 0)) << 4);
+       set_wr_txq(skb, CPL_PRIORITY_CONTROL, csk->ctrlq_idx);
+
+       cxgbit_get_csk(csk);
+       cxgbit_init_wr_wait(&csk->com.wr_wait);
+
+       cxgbit_ofld_send(csk->com.cdev, skb);
+
+       ret = cxgbit_wait_for_reply(csk->com.cdev,
+                                   &csk->com.wr_wait,
+                                   csk->tid, 5, __func__);
+       if (ret)
+               return -1;
+
+       return 0;
+}
+
+int cxgbit_setup_conn_pgidx(struct cxgbit_sock *csk, u32 pg_idx)
+{
+       struct sk_buff *skb;
+       struct cpl_set_tcb_field *req;
+       unsigned int len = roundup(sizeof(*req), 16);
+       int ret;
+
+       skb = alloc_skb(len, GFP_KERNEL);
+       if (!skb)
+               return -ENOMEM;
+
+       req = (struct cpl_set_tcb_field *)__skb_put(skb, len);
+       memset(req, 0, len);
+
+       INIT_TP_WR(req, csk->tid);
+       OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, csk->tid));
+       req->reply_ctrl = htons(NO_REPLY_V(0) | QUEUENO_V(csk->rss_qid));
+       req->word_cookie = htons(0);
+       req->mask = cpu_to_be64(0x3 << 8);
+       req->val = cpu_to_be64(pg_idx << 8);
+       set_wr_txq(skb, CPL_PRIORITY_CONTROL, csk->ctrlq_idx);
+
+       cxgbit_get_csk(csk);
+       cxgbit_init_wr_wait(&csk->com.wr_wait);
+
+       cxgbit_ofld_send(csk->com.cdev, skb);
+
+       ret = cxgbit_wait_for_reply(csk->com.cdev,
+                                   &csk->com.wr_wait,
+                                   csk->tid, 5, __func__);
+       if (ret)
+               return -1;
+
+       return 0;
+}
+
+static void
+cxgbit_pass_open_rpl(struct cxgbit_device *cdev, struct sk_buff *skb)
+{
+       struct cpl_pass_open_rpl *rpl = cplhdr(skb);
+       struct tid_info *t = cdev->lldi.tids;
+       unsigned int stid = GET_TID(rpl);
+       struct cxgbit_np *cnp = lookup_stid(t, stid);
+
+       pr_debug("%s: cnp = %p; stid = %u; status = %d\n",
+                __func__, cnp, stid, rpl->status);
+
+       if (!cnp) {
+               pr_info("%s stid %d lookup failure\n", __func__, stid);
+               return;
+       }
+
+       cxgbit_wake_up(&cnp->com.wr_wait, __func__, rpl->status);
+       cxgbit_put_cnp(cnp);
+}
+
+static void
+cxgbit_close_listsrv_rpl(struct cxgbit_device *cdev, struct sk_buff *skb)
+{
+       struct cpl_close_listsvr_rpl *rpl = cplhdr(skb);
+       struct tid_info *t = cdev->lldi.tids;
+       unsigned int stid = GET_TID(rpl);
+       struct cxgbit_np *cnp = lookup_stid(t, stid);
+
+       pr_debug("%s: cnp = %p; stid = %u; status = %d\n",
+                __func__, cnp, stid, rpl->status);
+
+       if (!cnp) {
+               pr_info("%s stid %d lookup failure\n", __func__, stid);
+               return;
+       }
+
+       cxgbit_wake_up(&cnp->com.wr_wait, __func__, rpl->status);
+       cxgbit_put_cnp(cnp);
+}
+
+static void
+cxgbit_pass_establish(struct cxgbit_device *cdev, struct sk_buff *skb)
+{
+       struct cpl_pass_establish *req = cplhdr(skb);
+       struct tid_info *t = cdev->lldi.tids;
+       unsigned int tid = GET_TID(req);
+       struct cxgbit_sock *csk;
+       struct cxgbit_np *cnp;
+       u16 tcp_opt = be16_to_cpu(req->tcp_opt);
+       u32 snd_isn = be32_to_cpu(req->snd_isn);
+       u32 rcv_isn = be32_to_cpu(req->rcv_isn);
+
+       csk = lookup_tid(t, tid);
+       if (unlikely(!csk)) {
+               pr_err("can't find connection for tid %u.\n", tid);
+               goto rel_skb;
+       }
+       cnp = csk->cnp;
+
+       pr_debug("%s: csk %p; tid %u; cnp %p\n",
+                __func__, csk, tid, cnp);
+
+       csk->write_seq = snd_isn;
+       csk->snd_una = snd_isn;
+       csk->snd_nxt = snd_isn;
+
+       csk->rcv_nxt = rcv_isn;
+
+       if (csk->rcv_win > (RCV_BUFSIZ_M << 10))
+               csk->rx_credits = (csk->rcv_win - (RCV_BUFSIZ_M << 10));
+
+       csk->snd_wscale = TCPOPT_SND_WSCALE_G(tcp_opt);
+       cxgbit_set_emss(csk, tcp_opt);
+       dst_confirm(csk->dst);
+       csk->com.state = CSK_STATE_ESTABLISHED;
+       spin_lock_bh(&cnp->np_accept_lock);
+       list_add_tail(&csk->accept_node, &cnp->np_accept_list);
+       spin_unlock_bh(&cnp->np_accept_lock);
+       complete(&cnp->accept_comp);
+rel_skb:
+       __kfree_skb(skb);
+}
+
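+/* Queue the skb on the connection's rx queue and wake up any rx waiter. */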
+static void cxgbit_queue_rx_skb(struct cxgbit_sock *csk, struct sk_buff *skb)
+{
+       cxgbit_skcb_flags(skb) = 0;
+       spin_lock_bh(&csk->rxq.lock);
+       __skb_queue_tail(&csk->rxq, skb);
+       spin_unlock_bh(&csk->rxq.lock);
+       wake_up(&csk->waitq);
+}
+
+static void cxgbit_peer_close(struct cxgbit_sock *csk, struct sk_buff *skb)
+{
+       pr_debug("%s: csk %p; tid %u; state %d\n",
+                __func__, csk, csk->tid, csk->com.state);
+
+       switch (csk->com.state) {
+       case CSK_STATE_ESTABLISHED:
+               csk->com.state = CSK_STATE_CLOSING;
+               cxgbit_queue_rx_skb(csk, skb);
+               return;
+       case CSK_STATE_CLOSING:
+               /* simultaneous close */
+               csk->com.state = CSK_STATE_MORIBUND;
+               break;
+       case CSK_STATE_MORIBUND:
+               csk->com.state = CSK_STATE_DEAD;
+               cxgbit_put_csk(csk);
+               break;
+       case CSK_STATE_ABORTING:
+               break;
+       default:
+               pr_info("%s: cpl_peer_close in bad state %d\n",
+                       __func__, csk->com.state);
+       }
+
+       __kfree_skb(skb);
+}
+
+static void cxgbit_close_con_rpl(struct cxgbit_sock *csk, struct sk_buff *skb)
+{
+       pr_debug("%s: csk %p; tid %u; state %d\n",
+                __func__, csk, csk->tid, csk->com.state);
+
+       switch (csk->com.state) {
+       case CSK_STATE_CLOSING:
+               csk->com.state = CSK_STATE_MORIBUND;
+               break;
+       case CSK_STATE_MORIBUND:
+               csk->com.state = CSK_STATE_DEAD;
+               cxgbit_put_csk(csk);
+               break;
+       case CSK_STATE_ABORTING:
+       case CSK_STATE_DEAD:
+               break;
+       default:
+               pr_info("%s: cpl_close_con_rpl in bad state %d\n",
+                       __func__, csk->com.state);
+       }
+
+       __kfree_skb(skb);
+}
+
+static void cxgbit_abort_req_rss(struct cxgbit_sock *csk, struct sk_buff *skb)
+{
+       struct cpl_abort_req_rss *hdr = cplhdr(skb);
+       unsigned int tid = GET_TID(hdr);
+       struct cpl_abort_rpl *rpl;
+       struct sk_buff *rpl_skb;
+       bool release = false;
+       bool wakeup_thread = false;
+       unsigned int len = roundup(sizeof(*rpl), 16);
+
+       pr_debug("%s: csk %p; tid %u; state %d\n",
+                __func__, csk, tid, csk->com.state);
+
+       if (cxgbit_is_neg_adv(hdr->status)) {
+               pr_err("%s: got neg advise %d on tid %u\n",
+                      __func__, hdr->status, tid);
+               goto rel_skb;
+       }
+
+       switch (csk->com.state) {
+       case CSK_STATE_CONNECTING:
+       case CSK_STATE_MORIBUND:
+               csk->com.state = CSK_STATE_DEAD;
+               release = true;
+               break;
+       case CSK_STATE_ESTABLISHED:
+               csk->com.state = CSK_STATE_DEAD;
+               wakeup_thread = true;
+               break;
+       case CSK_STATE_CLOSING:
+               csk->com.state = CSK_STATE_DEAD;
+               if (!csk->conn)
+                       release = true;
+               break;
+       case CSK_STATE_ABORTING:
+               break;
+       default:
+               pr_info("%s: cpl_abort_req_rss in bad state %d\n",
+                       __func__, csk->com.state);
+               csk->com.state = CSK_STATE_DEAD;
+       }
+
+       __skb_queue_purge(&csk->txq);
+
+       if (!test_and_set_bit(CSK_TX_DATA_SENT, &csk->com.flags))
+               cxgbit_send_tx_flowc_wr(csk);
+
+       rpl_skb = __skb_dequeue(&csk->skbq);
+       set_wr_txq(rpl_skb, CPL_PRIORITY_DATA, csk->txq_idx);
+
+       rpl = (struct cpl_abort_rpl *)__skb_put(rpl_skb, len);
+       memset(rpl, 0, len);
+
+       INIT_TP_WR(rpl, csk->tid);
+       OPCODE_TID(rpl) = cpu_to_be32(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
+       rpl->cmd = CPL_ABORT_NO_RST;
+       cxgbit_ofld_send(csk->com.cdev, rpl_skb);
+
+       if (wakeup_thread) {
+               cxgbit_queue_rx_skb(csk, skb);
+               return;
+       }
+
+       if (release)
+               cxgbit_put_csk(csk);
+rel_skb:
+       __kfree_skb(skb);
+}
+
+static void cxgbit_abort_rpl_rss(struct cxgbit_sock *csk, struct sk_buff *skb)
+{
+       pr_debug("%s: csk %p; tid %u; state %d\n",
+                __func__, csk, csk->tid, csk->com.state);
+
+       switch (csk->com.state) {
+       case CSK_STATE_ABORTING:
+               csk->com.state = CSK_STATE_DEAD;
+               cxgbit_put_csk(csk);
+               break;
+       default:
+               pr_info("%s: cpl_abort_rpl_rss in state %d\n",
+                       __func__, csk->com.state);
+       }
+
+       __kfree_skb(skb);
+}
+
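+/* Sanity check: free credits plus credits held by pending WRs must equal the maximum. */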
+static bool cxgbit_credit_err(const struct cxgbit_sock *csk)
+{
+       const struct sk_buff *skb = csk->wr_pending_head;
+       u32 credit = 0;
+
+       if (unlikely(csk->wr_cred > csk->wr_max_cred)) {
+               pr_err("csk 0x%p, tid %u, credit %u > %u\n",
+                      csk, csk->tid, csk->wr_cred, csk->wr_max_cred);
+               return true;
+       }
+
+       while (skb) {
+               credit += skb->csum;
+               skb = cxgbit_skcb_tx_wr_next(skb);
+       }
+
+       if (unlikely((csk->wr_cred + credit) != csk->wr_max_cred)) {
+               pr_err("csk 0x%p, tid %u, credit %u + %u != %u.\n",
+                      csk, csk->tid, csk->wr_cred,
+                      credit, csk->wr_max_cred);
+
+               return true;
+       }
+
+       return false;
+}
+
+static void cxgbit_fw4_ack(struct cxgbit_sock *csk, struct sk_buff *skb)
+{
+       struct cpl_fw4_ack *rpl = (struct cpl_fw4_ack *)cplhdr(skb);
+       u32 credits = rpl->credits;
+       u32 snd_una = ntohl(rpl->snd_una);
+
+       csk->wr_cred += credits;
+       if (csk->wr_una_cred > (csk->wr_max_cred - csk->wr_cred))
+               csk->wr_una_cred = csk->wr_max_cred - csk->wr_cred;
+
+       while (credits) {
+               struct sk_buff *p = cxgbit_sock_peek_wr(csk);
+
+               if (unlikely(!p)) {
+                       pr_err("csk 0x%p,%u, cr %u,%u+%u, empty.\n",
+                              csk, csk->tid, credits,
+                              csk->wr_cred, csk->wr_una_cred);
+                       break;
+               }
+
+               if (unlikely(credits < p->csum)) {
+                       pr_warn("csk 0x%p,%u, cr %u,%u+%u, < %u.\n",
+                               csk,  csk->tid,
+                               credits, csk->wr_cred, csk->wr_una_cred,
+                               p->csum);
+                       p->csum -= credits;
+                       break;
+               }
+
+               cxgbit_sock_dequeue_wr(csk);
+               credits -= p->csum;
+               kfree_skb(p);
+       }
+
+       if (unlikely(cxgbit_credit_err(csk))) {
+               cxgbit_queue_rx_skb(csk, skb);
+               return;
+       }
+
+       if (rpl->seq_vld & CPL_FW4_ACK_FLAGS_SEQVAL) {
+               if (unlikely(before(snd_una, csk->snd_una))) {
+                       pr_warn("csk 0x%p,%u, snd_una %u/%u.",
+                               csk, csk->tid, snd_una,
+                               csk->snd_una);
+                       goto rel_skb;
+               }
+
+               if (csk->snd_una != snd_una) {
+                       csk->snd_una = snd_una;
+                       dst_confirm(csk->dst);
+                       wake_up(&csk->ack_waitq);
+               }
+       }
+
+       if (skb_queue_len(&csk->txq))
+               cxgbit_push_tx_frames(csk);
+
+rel_skb:
+       __kfree_skb(skb);
+}
+
+static void cxgbit_set_tcb_rpl(struct cxgbit_device *cdev, struct sk_buff *skb)
+{
+       struct cxgbit_sock *csk;
+       struct cpl_set_tcb_rpl *rpl = (struct cpl_set_tcb_rpl *)skb->data;
+       unsigned int tid = GET_TID(rpl);
+       struct cxgb4_lld_info *lldi = &cdev->lldi;
+       struct tid_info *t = lldi->tids;
+
+       csk = lookup_tid(t, tid);
+       if (unlikely(!csk)) {
+               pr_err("can't find connection for tid %u.\n", tid);
+               return;
+       }
+
+       cxgbit_wake_up(&csk->com.wr_wait, __func__, rpl->status);
+       cxgbit_put_csk(csk);
+}
+
+static void cxgbit_rx_data(struct cxgbit_device *cdev, struct sk_buff *skb)
+{
+       struct cxgbit_sock *csk;
+       struct cpl_rx_data *cpl = cplhdr(skb);
+       unsigned int tid = GET_TID(cpl);
+       struct cxgb4_lld_info *lldi = &cdev->lldi;
+       struct tid_info *t = lldi->tids;
+
+       csk = lookup_tid(t, tid);
+       if (unlikely(!csk)) {
+               pr_err("can't find conn. for tid %u.\n", tid);
+               goto rel_skb;
+       }
+
+       cxgbit_queue_rx_skb(csk, skb);
+       return;
+rel_skb:
+       __kfree_skb(skb);
+}
+
+static void
+__cxgbit_process_rx_cpl(struct cxgbit_sock *csk, struct sk_buff *skb)
+{
+       spin_lock(&csk->lock);
+       if (csk->lock_owner) {
+               __skb_queue_tail(&csk->backlogq, skb);
+               spin_unlock(&csk->lock);
+               return;
+       }
+
+       cxgbit_skcb_rx_backlog_fn(skb)(csk, skb);
+       spin_unlock(&csk->lock);
+}
+
+static void cxgbit_process_rx_cpl(struct cxgbit_sock *csk, struct sk_buff *skb)
+{
+       cxgbit_get_csk(csk);
+       __cxgbit_process_rx_cpl(csk, skb);
+       cxgbit_put_csk(csk);
+}
+
+static void cxgbit_rx_cpl(struct cxgbit_device *cdev, struct sk_buff *skb)
+{
+       struct cxgbit_sock *csk;
+       struct cpl_tx_data *cpl = cplhdr(skb);
+       struct cxgb4_lld_info *lldi = &cdev->lldi;
+       struct tid_info *t = lldi->tids;
+       unsigned int tid = GET_TID(cpl);
+       u8 opcode = cxgbit_skcb_rx_opcode(skb);
+       bool ref = true;
+
+       switch (opcode) {
+       case CPL_FW4_ACK:
+               cxgbit_skcb_rx_backlog_fn(skb) = cxgbit_fw4_ack;
+               ref = false;
+               break;
+       case CPL_PEER_CLOSE:
+               cxgbit_skcb_rx_backlog_fn(skb) = cxgbit_peer_close;
+               break;
+       case CPL_CLOSE_CON_RPL:
+               cxgbit_skcb_rx_backlog_fn(skb) = cxgbit_close_con_rpl;
+               break;
+       case CPL_ABORT_REQ_RSS:
+               cxgbit_skcb_rx_backlog_fn(skb) = cxgbit_abort_req_rss;
+               break;
+       case CPL_ABORT_RPL_RSS:
+               cxgbit_skcb_rx_backlog_fn(skb) = cxgbit_abort_rpl_rss;
+               break;
+       default:
+               goto rel_skb;
+       }
+
+       csk = lookup_tid(t, tid);
+       if (unlikely(!csk)) {
+               pr_err("can't find conn. for tid %u.\n", tid);
+               goto rel_skb;
+       }
+
+       if (ref)
+               cxgbit_process_rx_cpl(csk, skb);
+       else
+               __cxgbit_process_rx_cpl(csk, skb);
+
+       return;
+rel_skb:
+       __kfree_skb(skb);
+}
+
+cxgbit_cplhandler_func cxgbit_cplhandlers[NUM_CPL_CMDS] = {
+       [CPL_PASS_OPEN_RPL]     = cxgbit_pass_open_rpl,
+       [CPL_CLOSE_LISTSRV_RPL] = cxgbit_close_listsrv_rpl,
+       [CPL_PASS_ACCEPT_REQ]   = cxgbit_pass_accept_req,
+       [CPL_PASS_ESTABLISH]    = cxgbit_pass_establish,
+       [CPL_SET_TCB_RPL]       = cxgbit_set_tcb_rpl,
+       [CPL_RX_DATA]           = cxgbit_rx_data,
+       [CPL_FW4_ACK]           = cxgbit_rx_cpl,
+       [CPL_PEER_CLOSE]        = cxgbit_rx_cpl,
+       [CPL_CLOSE_CON_RPL]     = cxgbit_rx_cpl,
+       [CPL_ABORT_REQ_RSS]     = cxgbit_rx_cpl,
+       [CPL_ABORT_RPL_RSS]     = cxgbit_rx_cpl,
+};
diff --git a/drivers/target/iscsi/cxgbit/cxgbit_ddp.c b/drivers/target/iscsi/cxgbit/cxgbit_ddp.c
new file mode 100644 (file)
index 0000000..5d78bdb
--- /dev/null
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) 2016 Chelsio Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include "cxgbit.h"
+
+static void
+cxgbit_set_one_ppod(struct cxgbi_pagepod *ppod,
+                   struct cxgbi_task_tag_info *ttinfo,
+                   struct scatterlist **sg_pp, unsigned int *sg_off)
+{
+       struct scatterlist *sg = sg_pp ? *sg_pp : NULL;
+       unsigned int offset = sg_off ? *sg_off : 0;
+       dma_addr_t addr = 0UL;
+       unsigned int len = 0;
+       int i;
+
+       memcpy(ppod, &ttinfo->hdr, sizeof(struct cxgbi_pagepod_hdr));
+
+       if (sg) {
+               addr = sg_dma_address(sg);
+               len = sg_dma_len(sg);
+       }
+
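+       /* Fill the page addresses covered by this pagepod. */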
+       for (i = 0; i < PPOD_PAGES_MAX; i++) {
+               if (sg) {
+                       ppod->addr[i] = cpu_to_be64(addr + offset);
+                       offset += PAGE_SIZE;
+                       if (offset == (len + sg->offset)) {
+                               offset = 0;
+                               sg = sg_next(sg);
+                               if (sg) {
+                                       addr = sg_dma_address(sg);
+                                       len = sg_dma_len(sg);
+                               }
+                       }
+               } else {
+                       ppod->addr[i] = 0ULL;
+               }
+       }
+
+       /*
+        * the fifth address needs to be repeated in the next ppod, so do
+        * not move sg
+        */
+       if (sg_pp) {
+               *sg_pp = sg;
+               *sg_off = offset;
+       }
+
+       if (offset == len) {
+               offset = 0;
+               if (sg) {
+                       sg = sg_next(sg);
+                       if (sg)
+                               addr = sg_dma_address(sg);
+               }
+       }
+       ppod->addr[i] = sg ? cpu_to_be64(addr + offset) : 0ULL;
+}
+
+static struct sk_buff *
+cxgbit_ppod_init_idata(struct cxgbit_device *cdev, struct cxgbi_ppm *ppm,
+                      unsigned int idx, unsigned int npods, unsigned int tid)
+{
+       struct ulp_mem_io *req;
+       struct ulptx_idata *idata;
+       unsigned int pm_addr = (idx << PPOD_SIZE_SHIFT) + ppm->llimit;
+       unsigned int dlen = npods << PPOD_SIZE_SHIFT;
+       unsigned int wr_len = roundup(sizeof(struct ulp_mem_io) +
+                               sizeof(struct ulptx_idata) + dlen, 16);
+       struct sk_buff *skb;
+
+       skb  = alloc_skb(wr_len, GFP_KERNEL);
+       if (!skb)
+               return NULL;
+
+       req = (struct ulp_mem_io *)__skb_put(skb, wr_len);
+       INIT_ULPTX_WR(req, wr_len, 0, tid);
+       req->wr.wr_hi = htonl(FW_WR_OP_V(FW_ULPTX_WR) |
+               FW_WR_ATOMIC_V(0));
+       req->cmd = htonl(ULPTX_CMD_V(ULP_TX_MEM_WRITE) |
+               ULP_MEMIO_ORDER_V(0) |
+               T5_ULP_MEMIO_IMM_V(1));
+       req->dlen = htonl(ULP_MEMIO_DATA_LEN_V(dlen >> 5));
+       req->lock_addr = htonl(ULP_MEMIO_ADDR_V(pm_addr >> 5));
+       req->len16 = htonl(DIV_ROUND_UP(wr_len - sizeof(req->wr), 16));
+
+       idata = (struct ulptx_idata *)(req + 1);
+       idata->cmd_more = htonl(ULPTX_CMD_V(ULP_TX_SC_IMM));
+       idata->len = htonl(dlen);
+
+       return skb;
+}
+
+static int
+cxgbit_ppod_write_idata(struct cxgbi_ppm *ppm, struct cxgbit_sock *csk,
+                       struct cxgbi_task_tag_info *ttinfo, unsigned int idx,
+                       unsigned int npods, struct scatterlist **sg_pp,
+                       unsigned int *sg_off)
+{
+       struct cxgbit_device *cdev = csk->com.cdev;
+       struct sk_buff *skb;
+       struct ulp_mem_io *req;
+       struct ulptx_idata *idata;
+       struct cxgbi_pagepod *ppod;
+       unsigned int i;
+
+       skb = cxgbit_ppod_init_idata(cdev, ppm, idx, npods, csk->tid);
+       if (!skb)
+               return -ENOMEM;
+
+       req = (struct ulp_mem_io *)skb->data;
+       idata = (struct ulptx_idata *)(req + 1);
+       ppod = (struct cxgbi_pagepod *)(idata + 1);
+
+       for (i = 0; i < npods; i++, ppod++)
+               cxgbit_set_one_ppod(ppod, ttinfo, sg_pp, sg_off);
+
+       __skb_queue_tail(&csk->ppodq, skb);
+
+       return 0;
+}
+
+static int
+cxgbit_ddp_set_map(struct cxgbi_ppm *ppm, struct cxgbit_sock *csk,
+                  struct cxgbi_task_tag_info *ttinfo)
+{
+       unsigned int pidx = ttinfo->idx;
+       unsigned int npods = ttinfo->npods;
+       unsigned int i, cnt;
+       struct scatterlist *sg = ttinfo->sgl;
+       unsigned int offset = 0;
+       int ret = 0;
+
+       for (i = 0; i < npods; i += cnt, pidx += cnt) {
+               cnt = npods - i;
+
+               if (cnt > ULPMEM_IDATA_MAX_NPPODS)
+                       cnt = ULPMEM_IDATA_MAX_NPPODS;
+
+               ret = cxgbit_ppod_write_idata(ppm, csk, ttinfo, pidx, cnt,
+                                             &sg, &offset);
+               if (ret < 0)
+                       break;
+       }
+
+       return ret;
+}
+
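+/*
+ * DDP needs a page-aligned SGL: only the first entry may start at a non-zero
+ * (4-byte aligned) offset and only the last may end short of a page boundary.
+ */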
+static int cxgbit_ddp_sgl_check(struct scatterlist *sg,
+                               unsigned int nents)
+{
+       unsigned int last_sgidx = nents - 1;
+       unsigned int i;
+
+       for (i = 0; i < nents; i++, sg = sg_next(sg)) {
+               unsigned int len = sg->length + sg->offset;
+
+               if ((sg->offset & 0x3) || (i && sg->offset) ||
+                   ((i != last_sgidx) && (len != PAGE_SIZE))) {
+                       return -EINVAL;
+               }
+       }
+
+       return 0;
+}
+
+static int
+cxgbit_ddp_reserve(struct cxgbit_sock *csk, struct cxgbi_task_tag_info *ttinfo,
+                  unsigned int xferlen)
+{
+       struct cxgbit_device *cdev = csk->com.cdev;
+       struct cxgbi_ppm *ppm = cdev2ppm(cdev);
+       struct scatterlist *sgl = ttinfo->sgl;
+       unsigned int sgcnt = ttinfo->nents;
+       unsigned int sg_offset = sgl->offset;
+       int ret;
+
+       if ((xferlen < DDP_THRESHOLD) || (!sgcnt)) {
+               pr_debug("ppm 0x%p, pgidx %u, xfer %u, sgcnt %u, NO ddp.\n",
+                        ppm, ppm->tformat.pgsz_idx_dflt,
+                        xferlen, ttinfo->nents);
+               return -EINVAL;
+       }
+
+       if (cxgbit_ddp_sgl_check(sgl, sgcnt) < 0)
+               return -EINVAL;
+
+       ttinfo->nr_pages = (xferlen + sgl->offset +
+                           (1 << PAGE_SHIFT) - 1) >> PAGE_SHIFT;
+
+       /*
+        * the ddp tag will be used for the ttt in the outgoing r2t pdu
+        */
+       ret = cxgbi_ppm_ppods_reserve(ppm, ttinfo->nr_pages, 0, &ttinfo->idx,
+                                     &ttinfo->tag, 0);
+       if (ret < 0)
+               return ret;
+       ttinfo->npods = ret;
+
+       sgl->offset = 0;
+       ret = dma_map_sg(&ppm->pdev->dev, sgl, sgcnt, DMA_FROM_DEVICE);
+       sgl->offset = sg_offset;
+       if (!ret) {
+               pr_info("%s: 0x%x, xfer %u, sgl %u dma mapping err.\n",
+                       __func__, 0, xferlen, sgcnt);
+               goto rel_ppods;
+       }
+
+       cxgbi_ppm_make_ppod_hdr(ppm, ttinfo->tag, csk->tid, sgl->offset,
+                               xferlen, &ttinfo->hdr);
+
+       ret = cxgbit_ddp_set_map(ppm, csk, ttinfo);
+       if (ret < 0) {
+               __skb_queue_purge(&csk->ppodq);
+               dma_unmap_sg(&ppm->pdev->dev, sgl, sgcnt, DMA_FROM_DEVICE);
+               goto rel_ppods;
+       }
+
+       return 0;
+
+rel_ppods:
+       cxgbi_ppm_ppod_release(ppm, ttinfo->idx);
+       return -EINVAL;
+}
+
+void
+cxgbit_get_r2t_ttt(struct iscsi_conn *conn, struct iscsi_cmd *cmd,
+                  struct iscsi_r2t *r2t)
+{
+       struct cxgbit_sock *csk = conn->context;
+       struct cxgbit_device *cdev = csk->com.cdev;
+       struct cxgbit_cmd *ccmd = iscsit_priv_cmd(cmd);
+       struct cxgbi_task_tag_info *ttinfo = &ccmd->ttinfo;
+       int ret = -EINVAL;
+
+       if ((!ccmd->setup_ddp) ||
+           (!test_bit(CSK_DDP_ENABLE, &csk->com.flags)))
+               goto out;
+
+       ccmd->setup_ddp = false;
+
+       ttinfo->sgl = cmd->se_cmd.t_data_sg;
+       ttinfo->nents = cmd->se_cmd.t_data_nents;
+
+       ret = cxgbit_ddp_reserve(csk, ttinfo, cmd->se_cmd.data_length);
+       if (ret < 0) {
+               pr_info("csk 0x%p, cmd 0x%p, xfer len %u, sgcnt %u no ddp.\n",
+                       csk, cmd, cmd->se_cmd.data_length, ttinfo->nents);
+
+               ttinfo->sgl = NULL;
+               ttinfo->nents = 0;
+       } else {
+               ccmd->release = true;
+       }
+out:
+       pr_debug("cdev 0x%p, cmd 0x%p, tag 0x%x\n", cdev, cmd, ttinfo->tag);
+       r2t->targ_xfer_tag = ttinfo->tag;
+}
+
+void cxgbit_release_cmd(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
+{
+       struct cxgbit_cmd *ccmd = iscsit_priv_cmd(cmd);
+
+       if (ccmd->release) {
+               struct cxgbi_task_tag_info *ttinfo = &ccmd->ttinfo;
+
+               if (ttinfo->sgl) {
+                       struct cxgbit_sock *csk = conn->context;
+                       struct cxgbit_device *cdev = csk->com.cdev;
+                       struct cxgbi_ppm *ppm = cdev2ppm(cdev);
+
+                       cxgbi_ppm_ppod_release(ppm, ttinfo->idx);
+
+                       dma_unmap_sg(&ppm->pdev->dev, ttinfo->sgl,
+                                    ttinfo->nents, DMA_FROM_DEVICE);
+               } else {
+                       put_page(sg_page(&ccmd->sg));
+               }
+
+               ccmd->release = false;
+       }
+}
+
+int cxgbit_ddp_init(struct cxgbit_device *cdev)
+{
+       struct cxgb4_lld_info *lldi = &cdev->lldi;
+       struct net_device *ndev = cdev->lldi.ports[0];
+       struct cxgbi_tag_format tformat;
+       unsigned int ppmax;
+       int ret, i;
+
+       if (!lldi->vr->iscsi.size) {
+               pr_warn("%s, iscsi NOT enabled, check config!\n", ndev->name);
+               return -EACCES;
+       }
+
+       ppmax = lldi->vr->iscsi.size >> PPOD_SIZE_SHIFT;
+
+       memset(&tformat, 0, sizeof(struct cxgbi_tag_format));
+       for (i = 0; i < 4; i++)
+               tformat.pgsz_order[i] = (lldi->iscsi_pgsz_order >> (i << 3))
+                                        & 0xF;
+       cxgbi_tagmask_check(lldi->iscsi_tagmask, &tformat);
+
+       ret = cxgbi_ppm_init(lldi->iscsi_ppm, cdev->lldi.ports[0],
+                            cdev->lldi.pdev, &cdev->lldi, &tformat,
+                            ppmax, lldi->iscsi_llimit,
+                            lldi->vr->iscsi.start, 2);
+       if (ret >= 0) {
+               struct cxgbi_ppm *ppm = (struct cxgbi_ppm *)(*lldi->iscsi_ppm);
+
+               if ((ppm->tformat.pgsz_idx_dflt < DDP_PGIDX_MAX) &&
+                   (ppm->ppmax >= 1024))
+                       set_bit(CDEV_DDP_ENABLE, &cdev->flags);
+               ret = 0;
+       }
+
+       return ret;
+}
diff --git a/drivers/target/iscsi/cxgbit/cxgbit_lro.h b/drivers/target/iscsi/cxgbit/cxgbit_lro.h
new file mode 100644 (file)
index 0000000..28c11bd
--- /dev/null
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2016 Chelsio Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ *
+ */
+
+#ifndef        __CXGBIT_LRO_H__
+#define        __CXGBIT_LRO_H__
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+
+#define LRO_FLUSH_LEN_MAX      65535
+
+struct cxgbit_lro_cb {
+       struct cxgbit_sock *csk;
+       u32 pdu_totallen;
+       u32 offset;
+       u8 pdu_idx;
+       bool complete;
+};
+
+enum cxgbit_pducb_flags {
+       PDUCBF_RX_HDR           = (1 << 0), /* received pdu header */
+       PDUCBF_RX_DATA          = (1 << 1), /* received pdu payload */
+       PDUCBF_RX_STATUS        = (1 << 2), /* received ddp status */
+       PDUCBF_RX_DATA_DDPD     = (1 << 3), /* pdu payload ddp'd */
+       PDUCBF_RX_HCRC_ERR      = (1 << 4), /* header digest error */
+       PDUCBF_RX_DCRC_ERR      = (1 << 5), /* data digest error */
+};
+
+struct cxgbit_lro_pdu_cb {
+       u8 flags;
+       u8 frags;
+       u8 hfrag_idx;
+       u8 nr_dfrags;
+       u8 dfrag_idx;
+       bool complete;
+       u32 seq;
+       u32 pdulen;
+       u32 hlen;
+       u32 dlen;
+       u32 doffset;
+       u32 ddigest;
+       void *hdr;
+};
+
+#define LRO_SKB_MAX_HEADROOM  \
+               (sizeof(struct cxgbit_lro_cb) + \
+                (MAX_SKB_FRAGS * sizeof(struct cxgbit_lro_pdu_cb)))
+
+#define LRO_SKB_MIN_HEADROOM  \
+               (sizeof(struct cxgbit_lro_cb) + \
+                sizeof(struct cxgbit_lro_pdu_cb))
+
+#define cxgbit_skb_lro_cb(skb) ((struct cxgbit_lro_cb *)skb->data)
+#define cxgbit_skb_lro_pdu_cb(skb, i)  \
+       ((struct cxgbit_lro_pdu_cb *)(skb->data + sizeof(struct cxgbit_lro_cb) \
+               + (i * sizeof(struct cxgbit_lro_pdu_cb))))
+
+#define CPL_RX_ISCSI_DDP_STATUS_DDP_SHIFT      16 /* ddp'able */
+#define CPL_RX_ISCSI_DDP_STATUS_PAD_SHIFT      19 /* pad error */
+#define CPL_RX_ISCSI_DDP_STATUS_HCRC_SHIFT     20 /* hcrc error */
+#define CPL_RX_ISCSI_DDP_STATUS_DCRC_SHIFT     21 /* dcrc error */
+
+#endif /* __CXGBIT_LRO_H__ */
diff --git a/drivers/target/iscsi/cxgbit/cxgbit_main.c b/drivers/target/iscsi/cxgbit/cxgbit_main.c
new file mode 100644 (file)
index 0000000..60dccd0
--- /dev/null
@@ -0,0 +1,702 @@
+/*
+ * Copyright (c) 2016 Chelsio Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define DRV_NAME "cxgbit"
+#define DRV_VERSION "1.0.0-ko"
+#define pr_fmt(fmt) DRV_NAME ": " fmt
+
+#include "cxgbit.h"
+
+#ifdef CONFIG_CHELSIO_T4_DCB
+#include <net/dcbevent.h>
+#include "cxgb4_dcb.h"
+#endif
+
+LIST_HEAD(cdev_list_head);
+/* cdev list lock */
+DEFINE_MUTEX(cdev_list_lock);
+
+void _cxgbit_free_cdev(struct kref *kref)
+{
+       struct cxgbit_device *cdev;
+
+       cdev = container_of(kref, struct cxgbit_device, kref);
+       kfree(cdev);
+}
+
+static void cxgbit_set_mdsl(struct cxgbit_device *cdev)
+{
+       struct cxgb4_lld_info *lldi = &cdev->lldi;
+       u32 mdsl;
+
+#define ULP2_MAX_PKT_LEN 16224
+#define ISCSI_PDU_NONPAYLOAD_LEN 312
+       mdsl = min_t(u32, lldi->iscsi_iolen - ISCSI_PDU_NONPAYLOAD_LEN,
+                    ULP2_MAX_PKT_LEN - ISCSI_PDU_NONPAYLOAD_LEN);
+       mdsl = min_t(u32, mdsl, 8192);
+       mdsl = min_t(u32, mdsl, (MAX_SKB_FRAGS - 1) * PAGE_SIZE);
+
+       cdev->mdsl = mdsl;
+}
+
+static void *cxgbit_uld_add(const struct cxgb4_lld_info *lldi)
+{
+       struct cxgbit_device *cdev;
+
+       if (is_t4(lldi->adapter_type))
+               return ERR_PTR(-ENODEV);
+
+       cdev = kzalloc(sizeof(*cdev), GFP_KERNEL);
+       if (!cdev)
+               return ERR_PTR(-ENOMEM);
+
+       kref_init(&cdev->kref);
+
+       cdev->lldi = *lldi;
+
+       cxgbit_set_mdsl(cdev);
+
+       if (cxgbit_ddp_init(cdev) < 0) {
+               kfree(cdev);
+               return ERR_PTR(-EINVAL);
+       }
+
+       if (!test_bit(CDEV_DDP_ENABLE, &cdev->flags))
+               pr_info("cdev %s ddp init failed\n",
+                       pci_name(lldi->pdev));
+
+       if (lldi->fw_vers >= 0x10d2b00)
+               set_bit(CDEV_ISO_ENABLE, &cdev->flags);
+
+       spin_lock_init(&cdev->cskq.lock);
+       INIT_LIST_HEAD(&cdev->cskq.list);
+
+       mutex_lock(&cdev_list_lock);
+       list_add_tail(&cdev->list, &cdev_list_head);
+       mutex_unlock(&cdev_list_lock);
+
+       pr_info("cdev %s added for iSCSI target transport\n",
+               pci_name(lldi->pdev));
+
+       return cdev;
+}
+
+static void cxgbit_close_conn(struct cxgbit_device *cdev)
+{
+       struct cxgbit_sock *csk;
+       struct sk_buff *skb;
+       bool wakeup_thread = false;
+
+       spin_lock_bh(&cdev->cskq.lock);
+       list_for_each_entry(csk, &cdev->cskq.list, list) {
+               skb = alloc_skb(0, GFP_ATOMIC);
+               if (!skb)
+                       continue;
+
+               spin_lock_bh(&csk->rxq.lock);
+               __skb_queue_tail(&csk->rxq, skb);
+               if (skb_queue_len(&csk->rxq) == 1)
+                       wakeup_thread = true;
+               spin_unlock_bh(&csk->rxq.lock);
+
+               if (wakeup_thread) {
+                       wake_up(&csk->waitq);
+                       wakeup_thread = false;
+               }
+       }
+       spin_unlock_bh(&cdev->cskq.lock);
+}
+
+static void cxgbit_detach_cdev(struct cxgbit_device *cdev)
+{
+       bool free_cdev = false;
+
+       spin_lock_bh(&cdev->cskq.lock);
+       if (list_empty(&cdev->cskq.list))
+               free_cdev = true;
+       spin_unlock_bh(&cdev->cskq.lock);
+
+       if (free_cdev) {
+               mutex_lock(&cdev_list_lock);
+               list_del(&cdev->list);
+               mutex_unlock(&cdev_list_lock);
+
+               cxgbit_put_cdev(cdev);
+       } else {
+               cxgbit_close_conn(cdev);
+       }
+}
+
+static int cxgbit_uld_state_change(void *handle, enum cxgb4_state state)
+{
+       struct cxgbit_device *cdev = handle;
+
+       switch (state) {
+       case CXGB4_STATE_UP:
+               set_bit(CDEV_STATE_UP, &cdev->flags);
+               pr_info("cdev %s state UP.\n", pci_name(cdev->lldi.pdev));
+               break;
+       case CXGB4_STATE_START_RECOVERY:
+               clear_bit(CDEV_STATE_UP, &cdev->flags);
+               cxgbit_close_conn(cdev);
+               pr_info("cdev %s state RECOVERY.\n", pci_name(cdev->lldi.pdev));
+               break;
+       case CXGB4_STATE_DOWN:
+               pr_info("cdev %s state DOWN.\n", pci_name(cdev->lldi.pdev));
+               break;
+       case CXGB4_STATE_DETACH:
+               clear_bit(CDEV_STATE_UP, &cdev->flags);
+               pr_info("cdev %s state DETACH.\n", pci_name(cdev->lldi.pdev));
+               cxgbit_detach_cdev(cdev);
+               break;
+       default:
+               pr_info("cdev %s unknown state %d.\n",
+                       pci_name(cdev->lldi.pdev), state);
+               break;
+       }
+       return 0;
+}
+
+static void
+cxgbit_proc_ddp_status(unsigned int tid, struct cpl_rx_data_ddp *cpl,
+                      struct cxgbit_lro_pdu_cb *pdu_cb)
+{
+       unsigned int status = ntohl(cpl->ddpvld);
+
+       pdu_cb->flags |= PDUCBF_RX_STATUS;
+       pdu_cb->ddigest = ntohl(cpl->ulp_crc);
+       pdu_cb->pdulen = ntohs(cpl->len);
+
+       if (status & (1 << CPL_RX_ISCSI_DDP_STATUS_HCRC_SHIFT)) {
+               pr_info("tid 0x%x, status 0x%x, hcrc bad.\n", tid, status);
+               pdu_cb->flags |= PDUCBF_RX_HCRC_ERR;
+       }
+
+       if (status & (1 << CPL_RX_ISCSI_DDP_STATUS_DCRC_SHIFT)) {
+               pr_info("tid 0x%x, status 0x%x, dcrc bad.\n", tid, status);
+               pdu_cb->flags |= PDUCBF_RX_DCRC_ERR;
+       }
+
+       if (status & (1 << CPL_RX_ISCSI_DDP_STATUS_PAD_SHIFT))
+               pr_info("tid 0x%x, status 0x%x, pad bad.\n", tid, status);
+
+       if ((status & (1 << CPL_RX_ISCSI_DDP_STATUS_DDP_SHIFT)) &&
+           (!(pdu_cb->flags & PDUCBF_RX_DATA))) {
+               pdu_cb->flags |= PDUCBF_RX_DATA_DDPD;
+       }
+}
+
+static void
+cxgbit_lro_add_packet_rsp(struct sk_buff *skb, u8 op, const __be64 *rsp)
+{
+       struct cxgbit_lro_cb *lro_cb = cxgbit_skb_lro_cb(skb);
+       struct cxgbit_lro_pdu_cb *pdu_cb = cxgbit_skb_lro_pdu_cb(skb,
+                                               lro_cb->pdu_idx);
+       struct cpl_rx_iscsi_ddp *cpl = (struct cpl_rx_iscsi_ddp *)(rsp + 1);
+
+       cxgbit_proc_ddp_status(lro_cb->csk->tid, cpl, pdu_cb);
+
+       if (pdu_cb->flags & PDUCBF_RX_HDR)
+               pdu_cb->complete = true;
+
+       lro_cb->complete = true;
+       lro_cb->pdu_totallen += pdu_cb->pdulen;
+       lro_cb->pdu_idx++;
+}
+
+static void
+cxgbit_copy_frags(struct sk_buff *skb, const struct pkt_gl *gl,
+                 unsigned int offset)
+{
+       u8 skb_frag_idx = skb_shinfo(skb)->nr_frags;
+       u8 i;
+
+       /* usually there's just one frag */
+       __skb_fill_page_desc(skb, skb_frag_idx, gl->frags[0].page,
+                            gl->frags[0].offset + offset,
+                            gl->frags[0].size - offset);
+       for (i = 1; i < gl->nfrags; i++)
+               __skb_fill_page_desc(skb, skb_frag_idx + i,
+                                    gl->frags[i].page,
+                                    gl->frags[i].offset,
+                                    gl->frags[i].size);
+
+       skb_shinfo(skb)->nr_frags += gl->nfrags;
+
+       /* get a reference to the last page, we don't own it */
+       get_page(gl->frags[gl->nfrags - 1].page);
+}
+
+static void
+cxgbit_lro_add_packet_gl(struct sk_buff *skb, u8 op, const struct pkt_gl *gl)
+{
+       struct cxgbit_lro_cb *lro_cb = cxgbit_skb_lro_cb(skb);
+       struct cxgbit_lro_pdu_cb *pdu_cb = cxgbit_skb_lro_pdu_cb(skb,
+                                               lro_cb->pdu_idx);
+       u32 len, offset;
+
+       if (op == CPL_ISCSI_HDR) {
+               struct cpl_iscsi_hdr *cpl = (struct cpl_iscsi_hdr *)gl->va;
+
+               offset = sizeof(struct cpl_iscsi_hdr);
+               pdu_cb->flags |= PDUCBF_RX_HDR;
+               pdu_cb->seq = ntohl(cpl->seq);
+               len = ntohs(cpl->len);
+               pdu_cb->hdr = gl->va + offset;
+               pdu_cb->hlen = len;
+               pdu_cb->hfrag_idx = skb_shinfo(skb)->nr_frags;
+
+               if (unlikely(gl->nfrags > 1))
+                       cxgbit_skcb_flags(skb) = 0;
+
+               lro_cb->complete = false;
+       } else {
+               struct cpl_iscsi_data *cpl = (struct cpl_iscsi_data *)gl->va;
+
+               offset = sizeof(struct cpl_iscsi_data);
+               pdu_cb->flags |= PDUCBF_RX_DATA;
+               len = ntohs(cpl->len);
+               pdu_cb->dlen = len;
+               pdu_cb->doffset = lro_cb->offset;
+               pdu_cb->nr_dfrags = gl->nfrags;
+               pdu_cb->dfrag_idx = skb_shinfo(skb)->nr_frags;
+       }
+
+       cxgbit_copy_frags(skb, gl, offset);
+
+       pdu_cb->frags += gl->nfrags;
+       lro_cb->offset += len;
+       skb->len += len;
+       skb->data_len += len;
+       skb->truesize += len;
+}
+
+static struct sk_buff *
+cxgbit_lro_init_skb(struct cxgbit_sock *csk, u8 op, const struct pkt_gl *gl,
+                   const __be64 *rsp, struct napi_struct *napi)
+{
+       struct sk_buff *skb;
+       struct cxgbit_lro_cb *lro_cb;
+
+       skb = napi_alloc_skb(napi, LRO_SKB_MAX_HEADROOM);
+
+       if (unlikely(!skb))
+               return NULL;
+
+       memset(skb->data, 0, LRO_SKB_MAX_HEADROOM);
+
+       cxgbit_skcb_flags(skb) |= SKCBF_RX_LRO;
+
+       lro_cb = cxgbit_skb_lro_cb(skb);
+
+       cxgbit_get_csk(csk);
+
+       lro_cb->csk = csk;
+
+       return skb;
+}
+
+static void cxgbit_queue_lro_skb(struct cxgbit_sock *csk, struct sk_buff *skb)
+{
+       bool wakeup_thread = false;
+
+       spin_lock(&csk->rxq.lock);
+       __skb_queue_tail(&csk->rxq, skb);
+       if (skb_queue_len(&csk->rxq) == 1)
+               wakeup_thread = true;
+       spin_unlock(&csk->rxq.lock);
+
+       if (wakeup_thread)
+               wake_up(&csk->waitq);
+}
+
+static void cxgbit_lro_flush(struct t4_lro_mgr *lro_mgr, struct sk_buff *skb)
+{
+       struct cxgbit_lro_cb *lro_cb = cxgbit_skb_lro_cb(skb);
+       struct cxgbit_sock *csk = lro_cb->csk;
+
+       csk->lro_skb = NULL;
+
+       __skb_unlink(skb, &lro_mgr->lroq);
+       cxgbit_queue_lro_skb(csk, skb);
+
+       cxgbit_put_csk(csk);
+
+       lro_mgr->lro_pkts++;
+       lro_mgr->lro_session_cnt--;
+}
+
+static void cxgbit_uld_lro_flush(struct t4_lro_mgr *lro_mgr)
+{
+       struct sk_buff *skb;
+
+       while ((skb = skb_peek(&lro_mgr->lroq)))
+               cxgbit_lro_flush(lro_mgr, skb);
+}
+
+static int
+cxgbit_lro_receive(struct cxgbit_sock *csk, u8 op, const __be64 *rsp,
+                  const struct pkt_gl *gl, struct t4_lro_mgr *lro_mgr,
+                  struct napi_struct *napi)
+{
+       struct sk_buff *skb;
+       struct cxgbit_lro_cb *lro_cb;
+
+       if (!csk) {
+               pr_err("%s: csk NULL, op 0x%x.\n", __func__, op);
+               goto out;
+       }
+
+       if (csk->lro_skb)
+               goto add_packet;
+
+start_lro:
+       if (lro_mgr->lro_session_cnt >= MAX_LRO_SESSIONS) {
+               cxgbit_uld_lro_flush(lro_mgr);
+               goto start_lro;
+       }
+
+       skb = cxgbit_lro_init_skb(csk, op, gl, rsp, napi);
+       if (unlikely(!skb))
+               goto out;
+
+       csk->lro_skb = skb;
+
+       __skb_queue_tail(&lro_mgr->lroq, skb);
+       lro_mgr->lro_session_cnt++;
+
+add_packet:
+       skb = csk->lro_skb;
+       lro_cb = cxgbit_skb_lro_cb(skb);
+
+       if ((gl && (((skb_shinfo(skb)->nr_frags + gl->nfrags) >
+           MAX_SKB_FRAGS) || (lro_cb->pdu_totallen >= LRO_FLUSH_LEN_MAX))) ||
+           (lro_cb->pdu_idx >= MAX_SKB_FRAGS)) {
+               cxgbit_lro_flush(lro_mgr, skb);
+               goto start_lro;
+       }
+
+       if (gl)
+               cxgbit_lro_add_packet_gl(skb, op, gl);
+       else
+               cxgbit_lro_add_packet_rsp(skb, op, rsp);
+
+       lro_mgr->lro_merged++;
+
+       return 0;
+
+out:
+       return -1;
+}
+
+static int
+cxgbit_uld_lro_rx_handler(void *hndl, const __be64 *rsp,
+                         const struct pkt_gl *gl, struct t4_lro_mgr *lro_mgr,
+                         struct napi_struct *napi)
+{
+       struct cxgbit_device *cdev = hndl;
+       struct cxgb4_lld_info *lldi = &cdev->lldi;
+       struct cpl_tx_data *rpl = NULL;
+       struct cxgbit_sock *csk = NULL;
+       unsigned int tid = 0;
+       struct sk_buff *skb;
+       unsigned int op = *(u8 *)rsp;
+       bool lro_flush = true;
+
+       switch (op) {
+       case CPL_ISCSI_HDR:
+       case CPL_ISCSI_DATA:
+       case CPL_RX_ISCSI_DDP:
+       case CPL_FW4_ACK:
+               lro_flush = false;
+               /* fall through */
+       case CPL_ABORT_RPL_RSS:
+       case CPL_PASS_ESTABLISH:
+       case CPL_PEER_CLOSE:
+       case CPL_CLOSE_CON_RPL:
+       case CPL_ABORT_REQ_RSS:
+       case CPL_SET_TCB_RPL:
+       case CPL_RX_DATA:
+               rpl = gl ? (struct cpl_tx_data *)gl->va :
+                          (struct cpl_tx_data *)(rsp + 1);
+               tid = GET_TID(rpl);
+               csk = lookup_tid(lldi->tids, tid);
+               break;
+       default:
+               break;
+       }
+
+       if (csk && csk->lro_skb && lro_flush)
+               cxgbit_lro_flush(lro_mgr, csk->lro_skb);
+
+       if (!gl) {
+               unsigned int len;
+
+               if (op == CPL_RX_ISCSI_DDP) {
+                       if (!cxgbit_lro_receive(csk, op, rsp, NULL, lro_mgr,
+                                               napi))
+                               return 0;
+               }
+
+               len = 64 - sizeof(struct rsp_ctrl) - 8;
+               skb = napi_alloc_skb(napi, len);
+               if (!skb)
+                       goto nomem;
+               __skb_put(skb, len);
+               skb_copy_to_linear_data(skb, &rsp[1], len);
+       } else {
+               if (unlikely(op != *(u8 *)gl->va)) {
+                       pr_info("? FL 0x%p,RSS%#llx,FL %#llx,len %u.\n",
+                               gl->va, be64_to_cpu(*rsp),
+                               be64_to_cpu(*(u64 *)gl->va),
+                               gl->tot_len);
+                       return 0;
+               }
+
+               if (op == CPL_ISCSI_HDR || op == CPL_ISCSI_DATA) {
+                       if (!cxgbit_lro_receive(csk, op, rsp, gl, lro_mgr,
+                                               napi))
+                               return 0;
+               }
+
+#define RX_PULL_LEN 128
+               skb = cxgb4_pktgl_to_skb(gl, RX_PULL_LEN, RX_PULL_LEN);
+               if (unlikely(!skb))
+                       goto nomem;
+       }
+
+       rpl = (struct cpl_tx_data *)skb->data;
+       op = rpl->ot.opcode;
+       cxgbit_skcb_rx_opcode(skb) = op;
+
+       pr_debug("cdev %p, opcode 0x%x(0x%x,0x%x), skb %p.\n",
+                cdev, op, rpl->ot.opcode_tid,
+                ntohl(rpl->ot.opcode_tid), skb);
+
+       if (op < NUM_CPL_CMDS && cxgbit_cplhandlers[op]) {
+               cxgbit_cplhandlers[op](cdev, skb);
+       } else {
+               pr_err("No handler for opcode 0x%x.\n", op);
+               __kfree_skb(skb);
+       }
+       return 0;
+nomem:
+       pr_err("%s OOM bailing out.\n", __func__);
+       return 1;
+}
+
+#ifdef CONFIG_CHELSIO_T4_DCB
+struct cxgbit_dcb_work {
+       struct dcb_app_type dcb_app;
+       struct work_struct work;
+};
+
+static void
+cxgbit_update_dcb_priority(struct cxgbit_device *cdev, u8 port_id,
+                          u8 dcb_priority, u16 port_num)
+{
+       struct cxgbit_sock *csk;
+       struct sk_buff *skb;
+       u16 local_port;
+       bool wakeup_thread = false;
+
+       spin_lock_bh(&cdev->cskq.lock);
+       list_for_each_entry(csk, &cdev->cskq.list, list) {
+               if (csk->port_id != port_id)
+                       continue;
+
+               if (csk->com.local_addr.ss_family == AF_INET6) {
+                       struct sockaddr_in6 *sock_in6;
+
+                       sock_in6 = (struct sockaddr_in6 *)&csk->com.local_addr;
+                       local_port = ntohs(sock_in6->sin6_port);
+               } else {
+                       struct sockaddr_in *sock_in;
+
+                       sock_in = (struct sockaddr_in *)&csk->com.local_addr;
+                       local_port = ntohs(sock_in->sin_port);
+               }
+
+               if (local_port != port_num)
+                       continue;
+
+               if (csk->dcb_priority == dcb_priority)
+                       continue;
+
+               skb = alloc_skb(0, GFP_ATOMIC);
+               if (!skb)
+                       continue;
+
+               spin_lock(&csk->rxq.lock);
+               __skb_queue_tail(&csk->rxq, skb);
+               if (skb_queue_len(&csk->rxq) == 1)
+                       wakeup_thread = true;
+               spin_unlock(&csk->rxq.lock);
+
+               if (wakeup_thread) {
+                       wake_up(&csk->waitq);
+                       wakeup_thread = false;
+               }
+       }
+       spin_unlock_bh(&cdev->cskq.lock);
+}
+
+static void cxgbit_dcb_workfn(struct work_struct *work)
+{
+       struct cxgbit_dcb_work *dcb_work;
+       struct net_device *ndev;
+       struct cxgbit_device *cdev = NULL;
+       struct dcb_app_type *iscsi_app;
+       u8 priority, port_id = 0xff;
+
+       dcb_work = container_of(work, struct cxgbit_dcb_work, work);
+       iscsi_app = &dcb_work->dcb_app;
+
+       if (iscsi_app->dcbx & DCB_CAP_DCBX_VER_IEEE) {
+               if (iscsi_app->app.selector != IEEE_8021QAZ_APP_SEL_ANY)
+                       goto out;
+
+               priority = iscsi_app->app.priority;
+
+       } else if (iscsi_app->dcbx & DCB_CAP_DCBX_VER_CEE) {
+               if (iscsi_app->app.selector != DCB_APP_IDTYPE_PORTNUM)
+                       goto out;
+
+               if (!iscsi_app->app.priority)
+                       goto out;
+
+               priority = ffs(iscsi_app->app.priority) - 1;
+       } else {
+               goto out;
+       }
+
+       pr_debug("priority for ifid %d is %u\n",
+                iscsi_app->ifindex, priority);
+
+       ndev = dev_get_by_index(&init_net, iscsi_app->ifindex);
+
+       if (!ndev)
+               goto out;
+
+       mutex_lock(&cdev_list_lock);
+       cdev = cxgbit_find_device(ndev, &port_id);
+
+       dev_put(ndev);
+
+       if (!cdev) {
+               mutex_unlock(&cdev_list_lock);
+               goto out;
+       }
+
+       cxgbit_update_dcb_priority(cdev, port_id, priority,
+                                  iscsi_app->app.protocol);
+       mutex_unlock(&cdev_list_lock);
+out:
+       kfree(dcb_work);
+}
+
+static int
+cxgbit_dcbevent_notify(struct notifier_block *nb, unsigned long action,
+                      void *data)
+{
+       struct cxgbit_dcb_work *dcb_work;
+       struct dcb_app_type *dcb_app = data;
+
+       dcb_work = kzalloc(sizeof(*dcb_work), GFP_ATOMIC);
+       if (!dcb_work)
+               return NOTIFY_DONE;
+
+       dcb_work->dcb_app = *dcb_app;
+       INIT_WORK(&dcb_work->work, cxgbit_dcb_workfn);
+       schedule_work(&dcb_work->work);
+       return NOTIFY_OK;
+}
+#endif
+
+static enum target_prot_op cxgbit_get_sup_prot_ops(struct iscsi_conn *conn)
+{
+       return TARGET_PROT_NORMAL;
+}
+
+static struct iscsit_transport cxgbit_transport = {
+       .name                   = DRV_NAME,
+       .transport_type         = ISCSI_CXGBIT,
+       .rdma_shutdown          = false,
+       .priv_size              = sizeof(struct cxgbit_cmd),
+       .owner                  = THIS_MODULE,
+       .iscsit_setup_np        = cxgbit_setup_np,
+       .iscsit_accept_np       = cxgbit_accept_np,
+       .iscsit_free_np         = cxgbit_free_np,
+       .iscsit_free_conn       = cxgbit_free_conn,
+       .iscsit_get_login_rx    = cxgbit_get_login_rx,
+       .iscsit_put_login_tx    = cxgbit_put_login_tx,
+       .iscsit_immediate_queue = iscsit_immediate_queue,
+       .iscsit_response_queue  = iscsit_response_queue,
+       .iscsit_get_dataout     = iscsit_build_r2ts_for_cmd,
+       .iscsit_queue_data_in   = iscsit_queue_rsp,
+       .iscsit_queue_status    = iscsit_queue_rsp,
+       .iscsit_xmit_pdu        = cxgbit_xmit_pdu,
+       .iscsit_get_r2t_ttt     = cxgbit_get_r2t_ttt,
+       .iscsit_get_rx_pdu      = cxgbit_get_rx_pdu,
+       .iscsit_validate_params = cxgbit_validate_params,
+       .iscsit_release_cmd     = cxgbit_release_cmd,
+       .iscsit_aborted_task    = iscsit_aborted_task,
+       .iscsit_get_sup_prot_ops = cxgbit_get_sup_prot_ops,
+};
+
+static struct cxgb4_uld_info cxgbit_uld_info = {
+       .name           = DRV_NAME,
+       .add            = cxgbit_uld_add,
+       .state_change   = cxgbit_uld_state_change,
+       .lro_rx_handler = cxgbit_uld_lro_rx_handler,
+       .lro_flush      = cxgbit_uld_lro_flush,
+};
+
+#ifdef CONFIG_CHELSIO_T4_DCB
+static struct notifier_block cxgbit_dcbevent_nb = {
+       .notifier_call = cxgbit_dcbevent_notify,
+};
+#endif
+
+static int __init cxgbit_init(void)
+{
+       cxgb4_register_uld(CXGB4_ULD_ISCSIT, &cxgbit_uld_info);
+       iscsit_register_transport(&cxgbit_transport);
+
+#ifdef CONFIG_CHELSIO_T4_DCB
+       pr_info("%s dcb enabled.\n", DRV_NAME);
+       register_dcbevent_notifier(&cxgbit_dcbevent_nb);
+#endif
+       BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, cb) <
+                    sizeof(union cxgbit_skb_cb));
+       return 0;
+}
+
+static void __exit cxgbit_exit(void)
+{
+       struct cxgbit_device *cdev, *tmp;
+
+#ifdef CONFIG_CHELSIO_T4_DCB
+       unregister_dcbevent_notifier(&cxgbit_dcbevent_nb);
+#endif
+       mutex_lock(&cdev_list_lock);
+       list_for_each_entry_safe(cdev, tmp, &cdev_list_head, list) {
+               list_del(&cdev->list);
+               cxgbit_put_cdev(cdev);
+       }
+       mutex_unlock(&cdev_list_lock);
+       iscsit_unregister_transport(&cxgbit_transport);
+       cxgb4_unregister_uld(CXGB4_ULD_ISCSIT);
+}
+
+module_init(cxgbit_init);
+module_exit(cxgbit_exit);
+
+MODULE_DESCRIPTION("Chelsio iSCSI target offload driver");
+MODULE_AUTHOR("Chelsio Communications");
+MODULE_VERSION(DRV_VERSION);
+MODULE_LICENSE("GPL");
diff --git a/drivers/target/iscsi/cxgbit/cxgbit_target.c b/drivers/target/iscsi/cxgbit/cxgbit_target.c
new file mode 100644 (file)
index 0000000..d02bf58
--- /dev/null
@@ -0,0 +1,1561 @@
+/*
+ * Copyright (c) 2016 Chelsio Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <asm/unaligned.h>
+#include <target/target_core_base.h>
+#include <target/target_core_fabric.h>
+#include "cxgbit.h"
+
+struct sge_opaque_hdr {
+       void *dev;
+       dma_addr_t addr[MAX_SKB_FRAGS + 1];
+};
+
+static const u8 cxgbit_digest_len[] = {0, 4, 4, 8};
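+/*
+ * Illustrative note on the table above: it is indexed by the ULP submode,
+ * where (consistent with cxgbit_cpl_tx_data_iso() below) header CRC is
+ * bit 0 and data CRC is bit 1.  Submode 0 therefore adds no digest bytes,
+ * HCRC-only or DCRC-only adds one 4-byte CRC32C digest, and both together
+ * (submode 3) add 8 bytes per PDU.
+ */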
+
+#define TX_HDR_LEN (sizeof(struct sge_opaque_hdr) + \
+                   sizeof(struct fw_ofld_tx_data_wr))
+
+static struct sk_buff *
+__cxgbit_alloc_skb(struct cxgbit_sock *csk, u32 len, bool iso)
+{
+       struct sk_buff *skb = NULL;
+       u8 submode = 0;
+       int errcode;
+       static const u32 hdr_len = TX_HDR_LEN + ISCSI_HDR_LEN;
+
+       if (len) {
+               skb = alloc_skb_with_frags(hdr_len, len,
+                                          0, &errcode,
+                                          GFP_KERNEL);
+               if (!skb)
+                       return NULL;
+
+               skb_reserve(skb, TX_HDR_LEN);
+               skb_reset_transport_header(skb);
+               __skb_put(skb, ISCSI_HDR_LEN);
+               skb->data_len = len;
+               skb->len += len;
+               submode |= (csk->submode & CXGBIT_SUBMODE_DCRC);
+
+       } else {
+               u32 iso_len = iso ? sizeof(struct cpl_tx_data_iso) : 0;
+
+               skb = alloc_skb(hdr_len + iso_len, GFP_KERNEL);
+               if (!skb)
+                       return NULL;
+
+               skb_reserve(skb, TX_HDR_LEN + iso_len);
+               skb_reset_transport_header(skb);
+               __skb_put(skb, ISCSI_HDR_LEN);
+       }
+
+       submode |= (csk->submode & CXGBIT_SUBMODE_HCRC);
+       cxgbit_skcb_submode(skb) = submode;
+       cxgbit_skcb_tx_extralen(skb) = cxgbit_digest_len[submode];
+       cxgbit_skcb_flags(skb) |= SKCBF_TX_NEED_HDR;
+       return skb;
+}
+
+static struct sk_buff *cxgbit_alloc_skb(struct cxgbit_sock *csk, u32 len)
+{
+       return __cxgbit_alloc_skb(csk, len, false);
+}
+
+/*
+ * cxgbit_is_ofld_imm - check whether a packet can be sent as immediate data
+ * @skb: the packet
+ *
+ * Returns true if a packet can be sent as an offload WR with immediate
+ * data.  We currently use the same limit as for Ethernet packets.
+ */
+static int cxgbit_is_ofld_imm(const struct sk_buff *skb)
+{
+       int length = skb->len;
+
+       if (likely(cxgbit_skcb_flags(skb) & SKCBF_TX_NEED_HDR))
+               length += sizeof(struct fw_ofld_tx_data_wr);
+
+       if (likely(cxgbit_skcb_flags(skb) & SKCBF_TX_ISO))
+               length += sizeof(struct cpl_tx_data_iso);
+
+#define MAX_IMM_TX_PKT_LEN     256
+       return length <= MAX_IMM_TX_PKT_LEN;
+}
+
+/*
+ * cxgbit_sgl_len - calculates the size of an SGL of the given capacity
+ * @n: the number of SGL entries
+ * Calculates the number of flits needed for a scatter/gather list that
+ * can hold the given number of entries.
+ */
+static inline unsigned int cxgbit_sgl_len(unsigned int n)
+{
+       n--;
+       return (3 * n) / 2 + (n & 1) + 2;
+}
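+
+/*
+ * Illustrative arithmetic for the formula above (a flit is 8 bytes,
+ * consistent with the DIV_ROUND_UP(skb->len, 8) in
+ * cxgbit_calc_tx_flits_ofld() below): for n = 3 entries, after the
+ * decrement the result is (3 * 2) / 2 + (2 & 1) + 2 = 5 flits; for
+ * n = 4 it is (3 * 3) / 2 + (3 & 1) + 2 = 7 flits.
+ */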
+
+/*
+ * cxgbit_calc_tx_flits_ofld - calculate # of flits for an offload packet
+ * @skb: the packet
+ *
+ * Returns the number of flits needed for the given offload packet.
+ * These packets are already fully constructed and no additional headers
+ * will be added.
+ */
+static unsigned int cxgbit_calc_tx_flits_ofld(const struct sk_buff *skb)
+{
+       unsigned int flits, cnt;
+
+       if (cxgbit_is_ofld_imm(skb))
+               return DIV_ROUND_UP(skb->len, 8);
+       flits = skb_transport_offset(skb) / 8;
+       cnt = skb_shinfo(skb)->nr_frags;
+       if (skb_tail_pointer(skb) != skb_transport_header(skb))
+               cnt++;
+       return flits + cxgbit_sgl_len(cnt);
+}
+
+#define CXGBIT_ISO_FSLICE 0x1
+#define CXGBIT_ISO_LSLICE 0x2
+static void
+cxgbit_cpl_tx_data_iso(struct sk_buff *skb, struct cxgbit_iso_info *iso_info)
+{
+       struct cpl_tx_data_iso *cpl;
+       unsigned int submode = cxgbit_skcb_submode(skb);
+       unsigned int fslice = !!(iso_info->flags & CXGBIT_ISO_FSLICE);
+       unsigned int lslice = !!(iso_info->flags & CXGBIT_ISO_LSLICE);
+
+       cpl = (struct cpl_tx_data_iso *)__skb_push(skb, sizeof(*cpl));
+
+       cpl->op_to_scsi = htonl(CPL_TX_DATA_ISO_OP_V(CPL_TX_DATA_ISO) |
+                       CPL_TX_DATA_ISO_FIRST_V(fslice) |
+                       CPL_TX_DATA_ISO_LAST_V(lslice) |
+                       CPL_TX_DATA_ISO_CPLHDRLEN_V(0) |
+                       CPL_TX_DATA_ISO_HDRCRC_V(submode & 1) |
+                       CPL_TX_DATA_ISO_PLDCRC_V(((submode >> 1) & 1)) |
+                       CPL_TX_DATA_ISO_IMMEDIATE_V(0) |
+                       CPL_TX_DATA_ISO_SCSI_V(2));
+
+       cpl->ahs_len = 0;
+       cpl->mpdu = htons(DIV_ROUND_UP(iso_info->mpdu, 4));
+       cpl->burst_size = htonl(DIV_ROUND_UP(iso_info->burst_len, 4));
+       cpl->len = htonl(iso_info->len);
+       cpl->reserved2_seglen_offset = htonl(0);
+       cpl->datasn_offset = htonl(0);
+       cpl->buffer_offset = htonl(0);
+       cpl->reserved3 = 0;
+
+       __skb_pull(skb, sizeof(*cpl));
+}
+
+static void
+cxgbit_tx_data_wr(struct cxgbit_sock *csk, struct sk_buff *skb, u32 dlen,
+                 u32 len, u32 credits, u32 compl)
+{
+       struct fw_ofld_tx_data_wr *req;
+       u32 submode = cxgbit_skcb_submode(skb);
+       u32 wr_ulp_mode = 0;
+       u32 hdr_size = sizeof(*req);
+       u32 opcode = FW_OFLD_TX_DATA_WR;
+       u32 immlen = 0;
+       u32 force = TX_FORCE_V(!submode);
+
+       if (cxgbit_skcb_flags(skb) & SKCBF_TX_ISO) {
+               opcode = FW_ISCSI_TX_DATA_WR;
+               immlen += sizeof(struct cpl_tx_data_iso);
+               hdr_size += sizeof(struct cpl_tx_data_iso);
+               submode |= 8;
+       }
+
+       if (cxgbit_is_ofld_imm(skb))
+               immlen += dlen;
+
+       req = (struct fw_ofld_tx_data_wr *)__skb_push(skb,
+                                                       hdr_size);
+       req->op_to_immdlen = cpu_to_be32(FW_WR_OP_V(opcode) |
+                                       FW_WR_COMPL_V(compl) |
+                                       FW_WR_IMMDLEN_V(immlen));
+       req->flowid_len16 = cpu_to_be32(FW_WR_FLOWID_V(csk->tid) |
+                                       FW_WR_LEN16_V(credits));
+       req->plen = htonl(len);
+       wr_ulp_mode = FW_OFLD_TX_DATA_WR_ULPMODE_V(ULP_MODE_ISCSI) |
+                               FW_OFLD_TX_DATA_WR_ULPSUBMODE_V(submode);
+
+       req->tunnel_to_proxy = htonl((wr_ulp_mode) | force |
+                FW_OFLD_TX_DATA_WR_SHOVE_V(skb_peek(&csk->txq) ? 0 : 1));
+}
+
+static void cxgbit_arp_failure_skb_discard(void *handle, struct sk_buff *skb)
+{
+       kfree_skb(skb);
+}
+
+void cxgbit_push_tx_frames(struct cxgbit_sock *csk)
+{
+       struct sk_buff *skb;
+
+       while (csk->wr_cred && ((skb = skb_peek(&csk->txq)) != NULL)) {
+               u32 dlen = skb->len;
+               u32 len = skb->len;
+               u32 credits_needed;
+               u32 compl = 0;
+               u32 flowclen16 = 0;
+               u32 iso_cpl_len = 0;
+
+               if (cxgbit_skcb_flags(skb) & SKCBF_TX_ISO)
+                       iso_cpl_len = sizeof(struct cpl_tx_data_iso);
+
+               if (cxgbit_is_ofld_imm(skb))
+                       credits_needed = DIV_ROUND_UP(dlen + iso_cpl_len, 16);
+               else
+                       credits_needed = DIV_ROUND_UP((8 *
+                                       cxgbit_calc_tx_flits_ofld(skb)) +
+                                       iso_cpl_len, 16);
+
+               if (likely(cxgbit_skcb_flags(skb) & SKCBF_TX_NEED_HDR))
+                       credits_needed += DIV_ROUND_UP(
+                               sizeof(struct fw_ofld_tx_data_wr), 16);
+               /*
+                * Assumes the initial credit allocation is large enough to
+                * cover a fw_flowc_wr plus the largest possible first payload.
+                */
+
+               if (!test_and_set_bit(CSK_TX_DATA_SENT, &csk->com.flags)) {
+                       flowclen16 = cxgbit_send_tx_flowc_wr(csk);
+                       csk->wr_cred -= flowclen16;
+                       csk->wr_una_cred += flowclen16;
+               }
+
+               if (csk->wr_cred < credits_needed) {
+                       pr_debug("csk 0x%p, skb %u/%u, wr %d < %u.\n",
+                                csk, skb->len, skb->data_len,
+                                credits_needed, csk->wr_cred);
+                       break;
+               }
+               __skb_unlink(skb, &csk->txq);
+               set_wr_txq(skb, CPL_PRIORITY_DATA, csk->txq_idx);
+               skb->csum = credits_needed + flowclen16;
+               csk->wr_cred -= credits_needed;
+               csk->wr_una_cred += credits_needed;
+
+               pr_debug("csk 0x%p, skb %u/%u, wr %d, left %u, unack %u.\n",
+                        csk, skb->len, skb->data_len, credits_needed,
+                        csk->wr_cred, csk->wr_una_cred);
+
+               if (likely(cxgbit_skcb_flags(skb) & SKCBF_TX_NEED_HDR)) {
+                       len += cxgbit_skcb_tx_extralen(skb);
+
+                       if ((csk->wr_una_cred >= (csk->wr_max_cred / 2)) ||
+                           (!before(csk->write_seq,
+                                    csk->snd_una + csk->snd_win))) {
+                               compl = 1;
+                               csk->wr_una_cred = 0;
+                       }
+
+                       cxgbit_tx_data_wr(csk, skb, dlen, len, credits_needed,
+                                         compl);
+                       csk->snd_nxt += len;
+
+               } else if ((cxgbit_skcb_flags(skb) & SKCBF_TX_FLAG_COMPL) ||
+                          (csk->wr_una_cred >= (csk->wr_max_cred / 2))) {
+                       struct cpl_close_con_req *req =
+                               (struct cpl_close_con_req *)skb->data;
+                       req->wr.wr_hi |= htonl(FW_WR_COMPL_F);
+                       csk->wr_una_cred = 0;
+               }
+
+               cxgbit_sock_enqueue_wr(csk, skb);
+               t4_set_arp_err_handler(skb, csk,
+                                      cxgbit_arp_failure_skb_discard);
+
+               pr_debug("csk 0x%p,%u, skb 0x%p, %u.\n",
+                        csk, csk->tid, skb, len);
+
+               cxgbit_l2t_send(csk->com.cdev, skb, csk->l2t);
+       }
+}
+
+static bool cxgbit_lock_sock(struct cxgbit_sock *csk)
+{
+       spin_lock_bh(&csk->lock);
+
+       if (before(csk->write_seq, csk->snd_una + csk->snd_win))
+               csk->lock_owner = true;
+
+       spin_unlock_bh(&csk->lock);
+
+       return csk->lock_owner;
+}
+
+static void cxgbit_unlock_sock(struct cxgbit_sock *csk)
+{
+       struct sk_buff_head backlogq;
+       struct sk_buff *skb;
+       void (*fn)(struct cxgbit_sock *, struct sk_buff *);
+
+       skb_queue_head_init(&backlogq);
+
+       spin_lock_bh(&csk->lock);
+       while (skb_queue_len(&csk->backlogq)) {
+               skb_queue_splice_init(&csk->backlogq, &backlogq);
+               spin_unlock_bh(&csk->lock);
+
+               while ((skb = __skb_dequeue(&backlogq))) {
+                       fn = cxgbit_skcb_rx_backlog_fn(skb);
+                       fn(csk, skb);
+               }
+
+               spin_lock_bh(&csk->lock);
+       }
+
+       csk->lock_owner = false;
+       spin_unlock_bh(&csk->lock);
+}
+
+static int cxgbit_queue_skb(struct cxgbit_sock *csk, struct sk_buff *skb)
+{
+       int ret = 0;
+
+       wait_event_interruptible(csk->ack_waitq, cxgbit_lock_sock(csk));
+
+       if (unlikely((csk->com.state != CSK_STATE_ESTABLISHED) ||
+                    signal_pending(current))) {
+               __kfree_skb(skb);
+               __skb_queue_purge(&csk->ppodq);
+               ret = -1;
+               spin_lock_bh(&csk->lock);
+               if (csk->lock_owner) {
+                       spin_unlock_bh(&csk->lock);
+                       goto unlock;
+               }
+               spin_unlock_bh(&csk->lock);
+               return ret;
+       }
+
+       csk->write_seq += skb->len +
+                         cxgbit_skcb_tx_extralen(skb);
+
+       skb_queue_splice_tail_init(&csk->ppodq, &csk->txq);
+       __skb_queue_tail(&csk->txq, skb);
+       cxgbit_push_tx_frames(csk);
+
+unlock:
+       cxgbit_unlock_sock(csk);
+       return ret;
+}
+
+static int
+cxgbit_map_skb(struct iscsi_cmd *cmd, struct sk_buff *skb, u32 data_offset,
+              u32 data_length)
+{
+       u32 i = 0, nr_frags = MAX_SKB_FRAGS;
+       u32 padding = ((-data_length) & 3);
+       struct scatterlist *sg;
+       struct page *page;
+       unsigned int page_off;
+
+       if (padding)
+               nr_frags--;
+
+       /*
+        * We know each entry in t_data_sg contains a page.
+        */
+       sg = &cmd->se_cmd.t_data_sg[data_offset / PAGE_SIZE];
+       page_off = (data_offset % PAGE_SIZE);
+
+       while (data_length && (i < nr_frags)) {
+               u32 cur_len = min_t(u32, data_length, sg->length - page_off);
+
+               page = sg_page(sg);
+
+               get_page(page);
+               skb_fill_page_desc(skb, i, page, sg->offset + page_off,
+                                  cur_len);
+               skb->data_len += cur_len;
+               skb->len += cur_len;
+               skb->truesize += cur_len;
+
+               data_length -= cur_len;
+               page_off = 0;
+               sg = sg_next(sg);
+               i++;
+       }
+
+       if (data_length)
+               return -1;
+
+       if (padding) {
+               page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+               if (!page)
+                       return -1;
+               skb_fill_page_desc(skb, i, page, 0, padding);
+               skb->data_len += padding;
+               skb->len += padding;
+               skb->truesize += padding;
+       }
+
+       return 0;
+}
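+
+/*
+ * Note on the padding math used above and elsewhere in this file:
+ * ((-data_length) & 3) is the number of bytes needed to round the payload
+ * up to a 4-byte boundary, e.g. a 1027-byte segment gets 1 pad byte while
+ * a 1024-byte segment gets none (example values are illustrative).
+ */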
+
+static int
+cxgbit_tx_datain_iso(struct cxgbit_sock *csk, struct iscsi_cmd *cmd,
+                    struct iscsi_datain_req *dr)
+{
+       struct iscsi_conn *conn = csk->conn;
+       struct sk_buff *skb;
+       struct iscsi_datain datain;
+       struct cxgbit_iso_info iso_info;
+       u32 data_length = cmd->se_cmd.data_length;
+       u32 mrdsl = conn->conn_ops->MaxRecvDataSegmentLength;
+       u32 num_pdu, plen, tx_data = 0;
+       bool task_sense = !!(cmd->se_cmd.se_cmd_flags &
+               SCF_TRANSPORT_TASK_SENSE);
+       bool set_statsn = false;
+       int ret = -1;
+
+       while (data_length) {
+               num_pdu = (data_length + mrdsl - 1) / mrdsl;
+               if (num_pdu > csk->max_iso_npdu)
+                       num_pdu = csk->max_iso_npdu;
+
+               plen = num_pdu * mrdsl;
+               if (plen > data_length)
+                       plen = data_length;
+
+               skb = __cxgbit_alloc_skb(csk, 0, true);
+               if (unlikely(!skb))
+                       return -ENOMEM;
+
+               memset(skb->data, 0, ISCSI_HDR_LEN);
+               cxgbit_skcb_flags(skb) |= SKCBF_TX_ISO;
+               cxgbit_skcb_submode(skb) |= (csk->submode &
+                               CXGBIT_SUBMODE_DCRC);
+               cxgbit_skcb_tx_extralen(skb) = (num_pdu *
+                               cxgbit_digest_len[cxgbit_skcb_submode(skb)]) +
+                                               ((num_pdu - 1) * ISCSI_HDR_LEN);
+
+               memset(&datain, 0, sizeof(struct iscsi_datain));
+               memset(&iso_info, 0, sizeof(iso_info));
+
+               if (!tx_data)
+                       iso_info.flags |= CXGBIT_ISO_FSLICE;
+
+               if (!(data_length - plen)) {
+                       iso_info.flags |= CXGBIT_ISO_LSLICE;
+                       if (!task_sense) {
+                               datain.flags = ISCSI_FLAG_DATA_STATUS;
+                               iscsit_increment_maxcmdsn(cmd, conn->sess);
+                               cmd->stat_sn = conn->stat_sn++;
+                               set_statsn = true;
+                       }
+               }
+
+               iso_info.burst_len = num_pdu * mrdsl;
+               iso_info.mpdu = mrdsl;
+               iso_info.len = ISCSI_HDR_LEN + plen;
+
+               cxgbit_cpl_tx_data_iso(skb, &iso_info);
+
+               datain.offset = tx_data;
+               datain.data_sn = cmd->data_sn - 1;
+
+               iscsit_build_datain_pdu(cmd, conn, &datain,
+                                       (struct iscsi_data_rsp *)skb->data,
+                                       set_statsn);
+
+               ret = cxgbit_map_skb(cmd, skb, tx_data, plen);
+               if (unlikely(ret)) {
+                       __kfree_skb(skb);
+                       goto out;
+               }
+
+               ret = cxgbit_queue_skb(csk, skb);
+               if (unlikely(ret))
+                       goto out;
+
+               tx_data += plen;
+               data_length -= plen;
+
+               cmd->read_data_done += plen;
+               cmd->data_sn += num_pdu;
+       }
+
+       dr->dr_complete = DATAIN_COMPLETE_NORMAL;
+
+       return 0;
+
+out:
+       return ret;
+}
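+
+/*
+ * Illustrative walk-through of the ISO loop above (example values only):
+ * for a 1 MiB Data-In with MaxRecvDataSegmentLength = 8192 and
+ * csk->max_iso_npdu = 7, each iteration sends num_pdu = 7 PDUs covering
+ * plen = 57344 bytes, so the hardware segments one large work request into
+ * seven DataIN PDUs per pass until data_length is exhausted.
+ */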
+
+static int
+cxgbit_tx_datain(struct cxgbit_sock *csk, struct iscsi_cmd *cmd,
+                const struct iscsi_datain *datain)
+{
+       struct sk_buff *skb;
+       int ret = 0;
+
+       skb = cxgbit_alloc_skb(csk, 0);
+       if (unlikely(!skb))
+               return -ENOMEM;
+
+       memcpy(skb->data, cmd->pdu, ISCSI_HDR_LEN);
+
+       if (datain->length) {
+               cxgbit_skcb_submode(skb) |= (csk->submode &
+                               CXGBIT_SUBMODE_DCRC);
+               cxgbit_skcb_tx_extralen(skb) =
+                               cxgbit_digest_len[cxgbit_skcb_submode(skb)];
+       }
+
+       ret = cxgbit_map_skb(cmd, skb, datain->offset, datain->length);
+       if (ret < 0) {
+               __kfree_skb(skb);
+               return ret;
+       }
+
+       return cxgbit_queue_skb(csk, skb);
+}
+
+static int
+cxgbit_xmit_datain_pdu(struct iscsi_conn *conn, struct iscsi_cmd *cmd,
+                      struct iscsi_datain_req *dr,
+                      const struct iscsi_datain *datain)
+{
+       struct cxgbit_sock *csk = conn->context;
+       u32 data_length = cmd->se_cmd.data_length;
+       u32 padding = ((-data_length) & 3);
+       u32 mrdsl = conn->conn_ops->MaxRecvDataSegmentLength;
+
+       if ((data_length > mrdsl) && (!dr->recovery) &&
+           (!padding) && (!datain->offset) && csk->max_iso_npdu) {
+               atomic_long_add(data_length - datain->length,
+                               &conn->sess->tx_data_octets);
+               return cxgbit_tx_datain_iso(csk, cmd, dr);
+       }
+
+       return cxgbit_tx_datain(csk, cmd, datain);
+}
+
+static int
+cxgbit_xmit_nondatain_pdu(struct iscsi_conn *conn, struct iscsi_cmd *cmd,
+                         const void *data_buf, u32 data_buf_len)
+{
+       struct cxgbit_sock *csk = conn->context;
+       struct sk_buff *skb;
+       u32 padding = ((-data_buf_len) & 3);
+
+       skb = cxgbit_alloc_skb(csk, data_buf_len + padding);
+       if (unlikely(!skb))
+               return -ENOMEM;
+
+       memcpy(skb->data, cmd->pdu, ISCSI_HDR_LEN);
+
+       if (data_buf_len) {
+               u32 pad_bytes = 0;
+
+               skb_store_bits(skb, ISCSI_HDR_LEN, data_buf, data_buf_len);
+
+               if (padding)
+                       skb_store_bits(skb, ISCSI_HDR_LEN + data_buf_len,
+                                      &pad_bytes, padding);
+       }
+
+       cxgbit_skcb_tx_extralen(skb) = cxgbit_digest_len[
+                                      cxgbit_skcb_submode(skb)];
+
+       return cxgbit_queue_skb(csk, skb);
+}
+
+int
+cxgbit_xmit_pdu(struct iscsi_conn *conn, struct iscsi_cmd *cmd,
+               struct iscsi_datain_req *dr, const void *buf, u32 buf_len)
+{
+       if (dr)
+               return cxgbit_xmit_datain_pdu(conn, cmd, dr, buf);
+       else
+               return cxgbit_xmit_nondatain_pdu(conn, cmd, buf, buf_len);
+}
+
+int cxgbit_validate_params(struct iscsi_conn *conn)
+{
+       struct cxgbit_sock *csk = conn->context;
+       struct cxgbit_device *cdev = csk->com.cdev;
+       struct iscsi_param *param;
+       u32 max_xmitdsl;
+
+       param = iscsi_find_param_from_key(MAXXMITDATASEGMENTLENGTH,
+                                         conn->param_list);
+       if (!param)
+               return -1;
+
+       if (kstrtou32(param->value, 0, &max_xmitdsl) < 0)
+               return -1;
+
+       if (max_xmitdsl > cdev->mdsl) {
+               if (iscsi_change_param_sprintf(
+                       conn, "MaxXmitDataSegmentLength=%u", cdev->mdsl))
+                       return -1;
+       }
+
+       return 0;
+}
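+
+/*
+ * Example of the check above (illustrative values): if the initiator
+ * proposes MaxXmitDataSegmentLength=262144 but cdev->mdsl was computed as
+ * 8192, the key is rewritten to "MaxXmitDataSegmentLength=8192" so the
+ * negotiated value never exceeds what the adapter can handle.
+ */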
+
+static int cxgbit_set_digest(struct cxgbit_sock *csk)
+{
+       struct iscsi_conn *conn = csk->conn;
+       struct iscsi_param *param;
+
+       param = iscsi_find_param_from_key(HEADERDIGEST, conn->param_list);
+       if (!param) {
+               pr_err("param not found key %s\n", HEADERDIGEST);
+               return -1;
+       }
+
+       if (!strcmp(param->value, CRC32C))
+               csk->submode |= CXGBIT_SUBMODE_HCRC;
+
+       param = iscsi_find_param_from_key(DATADIGEST, conn->param_list);
+       if (!param) {
+               csk->submode = 0;
+               pr_err("param not found key %s\n", DATADIGEST);
+               return -1;
+       }
+
+       if (!strcmp(param->value, CRC32C))
+               csk->submode |= CXGBIT_SUBMODE_DCRC;
+
+       if (cxgbit_setup_conn_digest(csk)) {
+               csk->submode = 0;
+               return -1;
+       }
+
+       return 0;
+}
+
+static int cxgbit_set_iso_npdu(struct cxgbit_sock *csk)
+{
+       struct iscsi_conn *conn = csk->conn;
+       struct iscsi_conn_ops *conn_ops = conn->conn_ops;
+       struct iscsi_param *param;
+       u32 mrdsl, mbl;
+       u32 max_npdu, max_iso_npdu;
+
+       if (conn->login->leading_connection) {
+               param = iscsi_find_param_from_key(DATASEQUENCEINORDER,
+                                                 conn->param_list);
+               if (!param) {
+                       pr_err("param not found key %s\n", DATASEQUENCEINORDER);
+                       return -1;
+               }
+
+               if (strcmp(param->value, YES))
+                       return 0;
+
+               param = iscsi_find_param_from_key(DATAPDUINORDER,
+                                                 conn->param_list);
+               if (!param) {
+                       pr_err("param not found key %s\n", DATAPDUINORDER);
+                       return -1;
+               }
+
+               if (strcmp(param->value, YES))
+                       return 0;
+
+               param = iscsi_find_param_from_key(MAXBURSTLENGTH,
+                                                 conn->param_list);
+               if (!param) {
+                       pr_err("param not found key %s\n", MAXBURSTLENGTH);
+                       return -1;
+               }
+
+               if (kstrtou32(param->value, 0, &mbl) < 0)
+                       return -1;
+       } else {
+               if (!conn->sess->sess_ops->DataSequenceInOrder)
+                       return 0;
+               if (!conn->sess->sess_ops->DataPDUInOrder)
+                       return 0;
+
+               mbl = conn->sess->sess_ops->MaxBurstLength;
+       }
+
+       mrdsl = conn_ops->MaxRecvDataSegmentLength;
+       max_npdu = mbl / mrdsl;
+
+       max_iso_npdu = CXGBIT_MAX_ISO_PAYLOAD /
+                       (ISCSI_HDR_LEN + mrdsl +
+                       cxgbit_digest_len[csk->submode]);
+
+       csk->max_iso_npdu = min(max_npdu, max_iso_npdu);
+
+       if (csk->max_iso_npdu <= 1)
+               csk->max_iso_npdu = 0;
+
+       return 0;
+}
+
+static int cxgbit_set_params(struct iscsi_conn *conn)
+{
+       struct cxgbit_sock *csk = conn->context;
+       struct cxgbit_device *cdev = csk->com.cdev;
+       struct cxgbi_ppm *ppm = *csk->com.cdev->lldi.iscsi_ppm;
+       struct iscsi_conn_ops *conn_ops = conn->conn_ops;
+       struct iscsi_param *param;
+       u8 erl;
+
+       if (conn_ops->MaxRecvDataSegmentLength > cdev->mdsl)
+               conn_ops->MaxRecvDataSegmentLength = cdev->mdsl;
+
+       if (conn->login->leading_connection) {
+               param = iscsi_find_param_from_key(ERRORRECOVERYLEVEL,
+                                                 conn->param_list);
+               if (!param) {
+                       pr_err("param not found key %s\n", ERRORRECOVERYLEVEL);
+                       return -1;
+               }
+               if (kstrtou8(param->value, 0, &erl) < 0)
+                       return -1;
+       } else {
+               erl = conn->sess->sess_ops->ErrorRecoveryLevel;
+       }
+
+       if (!erl) {
+               if (test_bit(CDEV_ISO_ENABLE, &cdev->flags)) {
+                       if (cxgbit_set_iso_npdu(csk))
+                               return -1;
+               }
+
+               if (test_bit(CDEV_DDP_ENABLE, &cdev->flags)) {
+                       if (cxgbit_setup_conn_pgidx(csk,
+                                                   ppm->tformat.pgsz_idx_dflt))
+                               return -1;
+                       set_bit(CSK_DDP_ENABLE, &csk->com.flags);
+               }
+       }
+
+       if (cxgbit_set_digest(csk))
+               return -1;
+
+       return 0;
+}
+
+int
+cxgbit_put_login_tx(struct iscsi_conn *conn, struct iscsi_login *login,
+                   u32 length)
+{
+       struct cxgbit_sock *csk = conn->context;
+       struct sk_buff *skb;
+       u32 padding_buf = 0;
+       u8 padding = ((-length) & 3);
+
+       skb = cxgbit_alloc_skb(csk, length + padding);
+       if (!skb)
+               return -ENOMEM;
+       skb_store_bits(skb, 0, login->rsp, ISCSI_HDR_LEN);
+       skb_store_bits(skb, ISCSI_HDR_LEN, login->rsp_buf, length);
+
+       if (padding)
+               skb_store_bits(skb, ISCSI_HDR_LEN + length,
+                              &padding_buf, padding);
+
+       if (login->login_complete) {
+               if (cxgbit_set_params(conn)) {
+                       kfree_skb(skb);
+                       return -1;
+               }
+
+               set_bit(CSK_LOGIN_DONE, &csk->com.flags);
+       }
+
+       if (cxgbit_queue_skb(csk, skb))
+               return -1;
+
+       if ((!login->login_complete) && (!login->login_failed))
+               schedule_delayed_work(&conn->login_work, 0);
+
+       return 0;
+}
+
+static void
+cxgbit_skb_copy_to_sg(struct sk_buff *skb, struct scatterlist *sg,
+                     unsigned int nents)
+{
+       struct skb_seq_state st;
+       const u8 *buf;
+       unsigned int consumed = 0, buf_len;
+       struct cxgbit_lro_pdu_cb *pdu_cb = cxgbit_rx_pdu_cb(skb);
+
+       skb_prepare_seq_read(skb, pdu_cb->doffset,
+                            pdu_cb->doffset + pdu_cb->dlen,
+                            &st);
+
+       while (true) {
+               buf_len = skb_seq_read(consumed, &buf, &st);
+               if (!buf_len) {
+                       skb_abort_seq_read(&st);
+                       break;
+               }
+
+               consumed += sg_pcopy_from_buffer(sg, nents, (void *)buf,
+                                                buf_len, consumed);
+       }
+}
+
+static struct iscsi_cmd *cxgbit_allocate_cmd(struct cxgbit_sock *csk)
+{
+       struct iscsi_conn *conn = csk->conn;
+       struct cxgbi_ppm *ppm = cdev2ppm(csk->com.cdev);
+       struct cxgbit_cmd *ccmd;
+       struct iscsi_cmd *cmd;
+
+       cmd = iscsit_allocate_cmd(conn, TASK_INTERRUPTIBLE);
+       if (!cmd) {
+               pr_err("Unable to allocate iscsi_cmd + cxgbit_cmd\n");
+               return NULL;
+       }
+
+       ccmd = iscsit_priv_cmd(cmd);
+       ccmd->ttinfo.tag = ppm->tformat.no_ddp_mask;
+       ccmd->setup_ddp = true;
+
+       return cmd;
+}
+
+static int
+cxgbit_handle_immediate_data(struct iscsi_cmd *cmd, struct iscsi_scsi_req *hdr,
+                            u32 length)
+{
+       struct iscsi_conn *conn = cmd->conn;
+       struct cxgbit_sock *csk = conn->context;
+       struct cxgbit_lro_pdu_cb *pdu_cb = cxgbit_rx_pdu_cb(csk->skb);
+
+       if (pdu_cb->flags & PDUCBF_RX_DCRC_ERR) {
+               pr_err("ImmediateData CRC32C DataDigest error\n");
+               if (!conn->sess->sess_ops->ErrorRecoveryLevel) {
+                       pr_err("Unable to recover from"
+                              " Immediate Data digest failure while"
+                              " in ERL=0.\n");
+                       iscsit_reject_cmd(cmd, ISCSI_REASON_DATA_DIGEST_ERROR,
+                                         (unsigned char *)hdr);
+                       return IMMEDIATE_DATA_CANNOT_RECOVER;
+               }
+
+               iscsit_reject_cmd(cmd, ISCSI_REASON_DATA_DIGEST_ERROR,
+                                 (unsigned char *)hdr);
+               return IMMEDIATE_DATA_ERL1_CRC_FAILURE;
+       }
+
+       if (cmd->se_cmd.se_cmd_flags & SCF_PASSTHROUGH_SG_TO_MEM_NOALLOC) {
+               struct cxgbit_cmd *ccmd = iscsit_priv_cmd(cmd);
+               struct skb_shared_info *ssi = skb_shinfo(csk->skb);
+               skb_frag_t *dfrag = &ssi->frags[pdu_cb->dfrag_idx];
+
+               sg_init_table(&ccmd->sg, 1);
+               sg_set_page(&ccmd->sg, dfrag->page.p, skb_frag_size(dfrag),
+                           dfrag->page_offset);
+               get_page(dfrag->page.p);
+
+               cmd->se_cmd.t_data_sg = &ccmd->sg;
+               cmd->se_cmd.t_data_nents = 1;
+
+               ccmd->release = true;
+       } else {
+               struct scatterlist *sg = &cmd->se_cmd.t_data_sg[0];
+               u32 sg_nents = max(1UL, DIV_ROUND_UP(pdu_cb->dlen, PAGE_SIZE));
+
+               cxgbit_skb_copy_to_sg(csk->skb, sg, sg_nents);
+       }
+
+       cmd->write_data_done += pdu_cb->dlen;
+
+       if (cmd->write_data_done == cmd->se_cmd.data_length) {
+               spin_lock_bh(&cmd->istate_lock);
+               cmd->cmd_flags |= ICF_GOT_LAST_DATAOUT;
+               cmd->i_state = ISTATE_RECEIVED_LAST_DATAOUT;
+               spin_unlock_bh(&cmd->istate_lock);
+       }
+
+       return IMMEDIATE_DATA_NORMAL_OPERATION;
+}
+
+static int
+cxgbit_get_immediate_data(struct iscsi_cmd *cmd, struct iscsi_scsi_req *hdr,
+                         bool dump_payload)
+{
+       struct iscsi_conn *conn = cmd->conn;
+       int cmdsn_ret = 0, immed_ret = IMMEDIATE_DATA_NORMAL_OPERATION;
+       /*
+        * Special case for Unsupported SAM WRITE Opcodes and ImmediateData=Yes.
+        */
+       if (dump_payload)
+               goto after_immediate_data;
+
+       immed_ret = cxgbit_handle_immediate_data(cmd, hdr,
+                                                cmd->first_burst_len);
+after_immediate_data:
+       if (immed_ret == IMMEDIATE_DATA_NORMAL_OPERATION) {
+               /*
+                * A PDU/CmdSN carrying Immediate Data passed
+                * DataCRC, check against ExpCmdSN/MaxCmdSN if
+                * Immediate Bit is not set.
+                */
+               cmdsn_ret = iscsit_sequence_cmd(conn, cmd,
+                                               (unsigned char *)hdr,
+                                               hdr->cmdsn);
+               if (cmdsn_ret == CMDSN_ERROR_CANNOT_RECOVER)
+                       return -1;
+
+               if (cmd->sense_reason || cmdsn_ret == CMDSN_LOWER_THAN_EXP) {
+                       target_put_sess_cmd(&cmd->se_cmd);
+                       return 0;
+               } else if (cmd->unsolicited_data) {
+                       iscsit_set_unsoliticed_dataout(cmd);
+               }
+
+       } else if (immed_ret == IMMEDIATE_DATA_ERL1_CRC_FAILURE) {
+               /*
+                * Immediate Data failed DataCRC and ERL>=1,
+                * silently drop this PDU and let the initiator
+                * plug the CmdSN gap.
+                *
+                * FIXME: Send Unsolicited NOPIN with reserved
+                * TTT here to help the initiator figure out
+                * the missing CmdSN, although they should be
+                * intelligent enough to determine the missing
+                * CmdSN and issue a retry to plug the sequence.
+                */
+               cmd->i_state = ISTATE_REMOVE;
+               iscsit_add_cmd_to_immediate_queue(cmd, conn, cmd->i_state);
+       } else /* immed_ret == IMMEDIATE_DATA_CANNOT_RECOVER */
+               return -1;
+
+       return 0;
+}
+
+static int
+cxgbit_handle_scsi_cmd(struct cxgbit_sock *csk, struct iscsi_cmd *cmd)
+{
+       struct iscsi_conn *conn = csk->conn;
+       struct cxgbit_lro_pdu_cb *pdu_cb = cxgbit_rx_pdu_cb(csk->skb);
+       struct iscsi_scsi_req *hdr = (struct iscsi_scsi_req *)pdu_cb->hdr;
+       int rc;
+       bool dump_payload = false;
+
+       rc = iscsit_setup_scsi_cmd(conn, cmd, (unsigned char *)hdr);
+       if (rc < 0)
+               return rc;
+
+       if (pdu_cb->dlen && (pdu_cb->dlen == cmd->se_cmd.data_length) &&
+           (pdu_cb->nr_dfrags == 1))
+               cmd->se_cmd.se_cmd_flags |= SCF_PASSTHROUGH_SG_TO_MEM_NOALLOC;
+
+       rc = iscsit_process_scsi_cmd(conn, cmd, hdr);
+       if (rc < 0)
+               return 0;
+       else if (rc > 0)
+               dump_payload = true;
+
+       if (!pdu_cb->dlen)
+               return 0;
+
+       return cxgbit_get_immediate_data(cmd, hdr, dump_payload);
+}
+
+static int cxgbit_handle_iscsi_dataout(struct cxgbit_sock *csk)
+{
+       struct scatterlist *sg_start;
+       struct iscsi_conn *conn = csk->conn;
+       struct iscsi_cmd *cmd = NULL;
+       struct cxgbit_lro_pdu_cb *pdu_cb = cxgbit_rx_pdu_cb(csk->skb);
+       struct iscsi_data *hdr = (struct iscsi_data *)pdu_cb->hdr;
+       u32 data_offset = be32_to_cpu(hdr->offset);
+       u32 data_len = pdu_cb->dlen;
+       int rc, sg_nents, sg_off;
+       bool dcrc_err = false;
+
+       rc = iscsit_check_dataout_hdr(conn, (unsigned char *)hdr, &cmd);
+       if (rc < 0)
+               return rc;
+       else if (!cmd)
+               return 0;
+
+       if (pdu_cb->flags & PDUCBF_RX_DCRC_ERR) {
+               pr_err("ITT: 0x%08x, Offset: %u, Length: %u,"
+                      " DataSN: 0x%08x\n",
+                      hdr->itt, hdr->offset, data_len,
+                      hdr->datasn);
+
+               dcrc_err = true;
+               goto check_payload;
+       }
+
+       pr_debug("DataOut data_len: %u, "
+               "write_data_done: %u, data_length: %u\n",
+                 data_len,  cmd->write_data_done,
+                 cmd->se_cmd.data_length);
+
+       if (!(pdu_cb->flags & PDUCBF_RX_DATA_DDPD)) {
+               sg_off = data_offset / PAGE_SIZE;
+               sg_start = &cmd->se_cmd.t_data_sg[sg_off];
+               sg_nents = max(1UL, DIV_ROUND_UP(data_len, PAGE_SIZE));
+
+               cxgbit_skb_copy_to_sg(csk->skb, sg_start, sg_nents);
+       }
+
+check_payload:
+
+       rc = iscsit_check_dataout_payload(cmd, hdr, dcrc_err);
+       if (rc < 0)
+               return rc;
+
+       return 0;
+}
+
+static int cxgbit_handle_nop_out(struct cxgbit_sock *csk, struct iscsi_cmd *cmd)
+{
+       struct iscsi_conn *conn = csk->conn;
+       struct cxgbit_lro_pdu_cb *pdu_cb = cxgbit_rx_pdu_cb(csk->skb);
+       struct iscsi_nopout *hdr = (struct iscsi_nopout *)pdu_cb->hdr;
+       unsigned char *ping_data = NULL;
+       u32 payload_length = pdu_cb->dlen;
+       int ret;
+
+       ret = iscsit_setup_nop_out(conn, cmd, hdr);
+       if (ret < 0)
+               return 0;
+
+       if (pdu_cb->flags & PDUCBF_RX_DCRC_ERR) {
+               if (!conn->sess->sess_ops->ErrorRecoveryLevel) {
+                       pr_err("Unable to recover from"
+                              " NOPOUT Ping DataCRC failure while in"
+                              " ERL=0.\n");
+                       ret = -1;
+                       goto out;
+               } else {
+                       /*
+                        * drop this PDU and let the
+                        * initiator plug the CmdSN gap.
+                        */
+                       pr_info("Dropping NOPOUT"
+                               " Command CmdSN: 0x%08x due to"
+                               " DataCRC error.\n", hdr->cmdsn);
+                       ret = 0;
+                       goto out;
+               }
+       }
+
+       /*
+        * Handle any ping data carried in the NOP-OUT payload.
+        */
+       if (payload_length && hdr->ttt == cpu_to_be32(0xFFFFFFFF)) {
+               ping_data = kzalloc(payload_length + 1, GFP_KERNEL);
+               if (!ping_data) {
+                       pr_err("Unable to allocate memory for"
+                               " NOPOUT ping data.\n");
+                       ret = -1;
+                       goto out;
+               }
+
+               skb_copy_bits(csk->skb, pdu_cb->doffset,
+                             ping_data, payload_length);
+
+               ping_data[payload_length] = '\0';
+               /*
+                * Attach ping data to struct iscsi_cmd->buf_ptr.
+                */
+               cmd->buf_ptr = ping_data;
+               cmd->buf_ptr_size = payload_length;
+
+               pr_debug("Got %u bytes of NOPOUT ping"
+                       " data.\n", payload_length);
+               pr_debug("Ping Data: \"%s\"\n", ping_data);
+       }
+
+       return iscsit_process_nop_out(conn, cmd, hdr);
+out:
+       if (cmd)
+               iscsit_free_cmd(cmd, false);
+       return ret;
+}
+
+static int
+cxgbit_handle_text_cmd(struct cxgbit_sock *csk, struct iscsi_cmd *cmd)
+{
+       struct iscsi_conn *conn = csk->conn;
+       struct cxgbit_lro_pdu_cb *pdu_cb = cxgbit_rx_pdu_cb(csk->skb);
+       struct iscsi_text *hdr = (struct iscsi_text *)pdu_cb->hdr;
+       u32 payload_length = pdu_cb->dlen;
+       int rc;
+       unsigned char *text_in = NULL;
+
+       rc = iscsit_setup_text_cmd(conn, cmd, hdr);
+       if (rc < 0)
+               return rc;
+
+       if (pdu_cb->flags & PDUCBF_RX_DCRC_ERR) {
+               if (!conn->sess->sess_ops->ErrorRecoveryLevel) {
+                       pr_err("Unable to recover from"
+                              " Text Data digest failure while in"
+                              " ERL=0.\n");
+                       goto reject;
+               } else {
+                       /*
+                        * drop this PDU and let the
+                        * initiator plug the CmdSN gap.
+                        */
+                       pr_info("Dropping Text"
+                               " Command CmdSN: 0x%08x due to"
+                               " DataCRC error.\n", hdr->cmdsn);
+                       return 0;
+               }
+       }
+
+       if (payload_length) {
+               text_in = kzalloc(payload_length, GFP_KERNEL);
+               if (!text_in) {
+                       pr_err("Unable to allocate text_in of payload_length: %u\n",
+                              payload_length);
+                       return -ENOMEM;
+               }
+               skb_copy_bits(csk->skb, pdu_cb->doffset,
+                             text_in, payload_length);
+
+               text_in[payload_length - 1] = '\0';
+
+               cmd->text_in_ptr = text_in;
+       }
+
+       return iscsit_process_text_cmd(conn, cmd, hdr);
+
+reject:
+       return iscsit_reject_cmd(cmd, ISCSI_REASON_PROTOCOL_ERROR,
+                                pdu_cb->hdr);
+}
+
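+/*
+ * Dispatch a fully received PDU to the handler for its iSCSI opcode,
+ * allocating a new iscsi_cmd first where the opcode requires one.
+ */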
+static int cxgbit_target_rx_opcode(struct cxgbit_sock *csk)
+{
+       struct cxgbit_lro_pdu_cb *pdu_cb = cxgbit_rx_pdu_cb(csk->skb);
+       struct iscsi_hdr *hdr = (struct iscsi_hdr *)pdu_cb->hdr;
+       struct iscsi_conn *conn = csk->conn;
+       struct iscsi_cmd *cmd = NULL;
+       u8 opcode = (hdr->opcode & ISCSI_OPCODE_MASK);
+       int ret = -EINVAL;
+
+       switch (opcode) {
+       case ISCSI_OP_SCSI_CMD:
+               cmd = cxgbit_allocate_cmd(csk);
+               if (!cmd)
+                       goto reject;
+
+               ret = cxgbit_handle_scsi_cmd(csk, cmd);
+               break;
+       case ISCSI_OP_SCSI_DATA_OUT:
+               ret = cxgbit_handle_iscsi_dataout(csk);
+               break;
+       case ISCSI_OP_NOOP_OUT:
+               if (hdr->ttt == cpu_to_be32(0xFFFFFFFF)) {
+                       cmd = cxgbit_allocate_cmd(csk);
+                       if (!cmd)
+                               goto reject;
+               }
+
+               ret = cxgbit_handle_nop_out(csk, cmd);
+               break;
+       case ISCSI_OP_SCSI_TMFUNC:
+               cmd = cxgbit_allocate_cmd(csk);
+               if (!cmd)
+                       goto reject;
+
+               ret = iscsit_handle_task_mgt_cmd(conn, cmd,
+                                                (unsigned char *)hdr);
+               break;
+       case ISCSI_OP_TEXT:
+               if (hdr->ttt != cpu_to_be32(0xFFFFFFFF)) {
+                       cmd = iscsit_find_cmd_from_itt(conn, hdr->itt);
+                       if (!cmd)
+                               goto reject;
+               } else {
+                       cmd = cxgbit_allocate_cmd(csk);
+                       if (!cmd)
+                               goto reject;
+               }
+
+               ret = cxgbit_handle_text_cmd(csk, cmd);
+               break;
+       case ISCSI_OP_LOGOUT:
+               cmd = cxgbit_allocate_cmd(csk);
+               if (!cmd)
+                       goto reject;
+
+               ret = iscsit_handle_logout_cmd(conn, cmd, (unsigned char *)hdr);
+               if (ret > 0)
+                       wait_for_completion_timeout(&conn->conn_logout_comp,
+                                                   SECONDS_FOR_LOGOUT_COMP
+                                                   * HZ);
+               break;
+       case ISCSI_OP_SNACK:
+               ret = iscsit_handle_snack(conn, (unsigned char *)hdr);
+               break;
+       default:
+               pr_err("Got unknown iSCSI OpCode: 0x%02x\n", opcode);
+               dump_stack();
+               break;
+       }
+
+       return ret;
+
+reject:
+       return iscsit_add_reject(conn, ISCSI_REASON_BOOKMARK_NO_RESOURCES,
+                                (unsigned char *)hdr);
+}
+
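+/*
+ * Validate a received PDU (header-digest status, connection state,
+ * opcodes allowed in a Discovery session) before handing it to
+ * cxgbit_target_rx_opcode().
+ */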
+static int cxgbit_rx_opcode(struct cxgbit_sock *csk)
+{
+       struct cxgbit_lro_pdu_cb *pdu_cb = cxgbit_rx_pdu_cb(csk->skb);
+       struct iscsi_conn *conn = csk->conn;
+       struct iscsi_hdr *hdr = pdu_cb->hdr;
+       u8 opcode;
+
+       if (pdu_cb->flags & PDUCBF_RX_HCRC_ERR) {
+               atomic_long_inc(&conn->sess->conn_digest_errors);
+               goto transport_err;
+       }
+
+       if (conn->conn_state == TARG_CONN_STATE_IN_LOGOUT)
+               goto transport_err;
+
+       opcode = hdr->opcode & ISCSI_OPCODE_MASK;
+
+       if (conn->sess->sess_ops->SessionType &&
+           ((!(opcode & ISCSI_OP_TEXT)) ||
+            (!(opcode & ISCSI_OP_LOGOUT)))) {
+               pr_err("Received illegal iSCSI Opcode: 0x%02x"
+                       " while in Discovery Session, rejecting.\n", opcode);
+               iscsit_add_reject(conn, ISCSI_REASON_PROTOCOL_ERROR,
+                                 (unsigned char *)hdr);
+               goto transport_err;
+       }
+
+       if (cxgbit_target_rx_opcode(csk) < 0)
+               goto transport_err;
+
+       return 0;
+
+transport_err:
+       return -1;
+}
+
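+/*
+ * Copy the Login Request header out of the received PDU, seed the
+ * iscsi_login state from the leading request, and copy the key=value
+ * payload into login->req_buf for negotiation.
+ */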
+static int cxgbit_rx_login_pdu(struct cxgbit_sock *csk)
+{
+       struct iscsi_conn *conn = csk->conn;
+       struct iscsi_login *login = conn->login;
+       struct cxgbit_lro_pdu_cb *pdu_cb = cxgbit_rx_pdu_cb(csk->skb);
+       struct iscsi_login_req *login_req;
+
+       login_req = (struct iscsi_login_req *)login->req;
+       memcpy(login_req, pdu_cb->hdr, sizeof(*login_req));
+
+       pr_debug("Got Login Command, Flags 0x%02x, ITT: 0x%08x,"
+               " CmdSN: 0x%08x, ExpStatSN: 0x%08x, CID: %hu, Length: %u\n",
+               login_req->flags, login_req->itt, login_req->cmdsn,
+               login_req->exp_statsn, login_req->cid, pdu_cb->dlen);
+       /*
+        * Set up the initial iscsi_login values from the leading
+        * login request PDU.
+        */
+       if (login->first_request) {
+               login_req = (struct iscsi_login_req *)login->req;
+               login->leading_connection = (!login_req->tsih) ? 1 : 0;
+               login->current_stage    = ISCSI_LOGIN_CURRENT_STAGE(
+                               login_req->flags);
+               login->version_min      = login_req->min_version;
+               login->version_max      = login_req->max_version;
+               memcpy(login->isid, login_req->isid, 6);
+               login->cmd_sn           = be32_to_cpu(login_req->cmdsn);
+               login->init_task_tag    = login_req->itt;
+               login->initial_exp_statsn = be32_to_cpu(login_req->exp_statsn);
+               login->cid              = be16_to_cpu(login_req->cid);
+               login->tsih             = be16_to_cpu(login_req->tsih);
+       }
+
+       if (iscsi_target_check_login_request(conn, login) < 0)
+               return -1;
+
+       memset(login->req_buf, 0, MAX_KEY_VALUE_PAIRS);
+       skb_copy_bits(csk->skb, pdu_cb->doffset, login->req_buf, pdu_cb->dlen);
+
+       return 0;
+}
+
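+/*
+ * Process the PDU at index idx of an LRO skb: login PDUs go through
+ * cxgbit_rx_login_pdu() until login completes, everything else through
+ * cxgbit_rx_opcode().
+ */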
+static int
+cxgbit_process_iscsi_pdu(struct cxgbit_sock *csk, struct sk_buff *skb, int idx)
+{
+       struct cxgbit_lro_pdu_cb *pdu_cb = cxgbit_skb_lro_pdu_cb(skb, idx);
+       int ret;
+
+       cxgbit_rx_pdu_cb(skb) = pdu_cb;
+
+       csk->skb = skb;
+
+       if (!test_bit(CSK_LOGIN_DONE, &csk->com.flags)) {
+               ret = cxgbit_rx_login_pdu(csk);
+               set_bit(CSK_LOGIN_PDU_DONE, &csk->com.flags);
+       } else {
+               ret = cxgbit_rx_opcode(csk);
+       }
+
+       return ret;
+}
+
+static void cxgbit_lro_skb_dump(struct sk_buff *skb)
+{
+       struct skb_shared_info *ssi = skb_shinfo(skb);
+       struct cxgbit_lro_cb *lro_cb = cxgbit_skb_lro_cb(skb);
+       struct cxgbit_lro_pdu_cb *pdu_cb = cxgbit_skb_lro_pdu_cb(skb, 0);
+       u8 i;
+
+       pr_info("skb 0x%p, head 0x%p, 0x%p, len %u,%u, frags %u.\n",
+               skb, skb->head, skb->data, skb->len, skb->data_len,
+               ssi->nr_frags);
+       pr_info("skb 0x%p, lro_cb, csk 0x%p, pdu %u, %u.\n",
+               skb, lro_cb->csk, lro_cb->pdu_idx, lro_cb->pdu_totallen);
+
+       for (i = 0; i < lro_cb->pdu_idx; i++, pdu_cb++)
+               pr_info("skb 0x%p, pdu %d, %u, f 0x%x, seq 0x%x, dcrc 0x%x, "
+                       "frags %u.\n",
+                       skb, i, pdu_cb->pdulen, pdu_cb->flags, pdu_cb->seq,
+                       pdu_cb->ddigest, pdu_cb->frags);
+       for (i = 0; i < ssi->nr_frags; i++)
+               pr_info("skb 0x%p, frag %d, off %u, sz %u.\n",
+                       skb, i, ssi->frags[i].page_offset, ssi->frags[i].size);
+}
+
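+/*
+ * Drop the header skb's page fragment references and clear its
+ * control data before it is reused for the next partial PDU.
+ */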
+static void cxgbit_lro_hskb_reset(struct cxgbit_sock *csk)
+{
+       struct sk_buff *skb = csk->lro_hskb;
+       struct skb_shared_info *ssi = skb_shinfo(skb);
+       u8 i;
+
+       memset(skb->data, 0, LRO_SKB_MIN_HEADROOM);
+       for (i = 0; i < ssi->nr_frags; i++)
+               put_page(skb_frag_page(&ssi->frags[i]));
+       ssi->nr_frags = 0;
+}
+
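+/*
+ * Accumulate the header, data and status pieces of a PDU that spans
+ * multiple LRO skbs into the per-connection header skb (lro_hskb),
+ * taking an extra page reference on each copied fragment.
+ */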
+static void
+cxgbit_lro_skb_merge(struct cxgbit_sock *csk, struct sk_buff *skb, u8 pdu_idx)
+{
+       struct sk_buff *hskb = csk->lro_hskb;
+       struct cxgbit_lro_pdu_cb *hpdu_cb = cxgbit_skb_lro_pdu_cb(hskb, 0);
+       struct cxgbit_lro_pdu_cb *pdu_cb = cxgbit_skb_lro_pdu_cb(skb, pdu_idx);
+       struct skb_shared_info *hssi = skb_shinfo(hskb);
+       struct skb_shared_info *ssi = skb_shinfo(skb);
+       unsigned int len = 0;
+
+       if (pdu_cb->flags & PDUCBF_RX_HDR) {
+               hpdu_cb->flags = pdu_cb->flags;
+               hpdu_cb->seq = pdu_cb->seq;
+               hpdu_cb->hdr = pdu_cb->hdr;
+               hpdu_cb->hlen = pdu_cb->hlen;
+
+               memcpy(&hssi->frags[0], &ssi->frags[pdu_cb->hfrag_idx],
+                      sizeof(skb_frag_t));
+
+               get_page(skb_frag_page(&hssi->frags[0]));
+               hssi->nr_frags = 1;
+               hpdu_cb->frags = 1;
+               hpdu_cb->hfrag_idx = 0;
+
+               len = hssi->frags[0].size;
+               hskb->len = len;
+               hskb->data_len = len;
+               hskb->truesize = len;
+       }
+
+       if (pdu_cb->flags & PDUCBF_RX_DATA) {
+               u8 hfrag_idx = 1, i;
+
+               hpdu_cb->flags |= pdu_cb->flags;
+
+               len = 0;
+               for (i = 0; i < pdu_cb->nr_dfrags; hfrag_idx++, i++) {
+                       memcpy(&hssi->frags[hfrag_idx],
+                              &ssi->frags[pdu_cb->dfrag_idx + i],
+                              sizeof(skb_frag_t));
+
+                       get_page(skb_frag_page(&hssi->frags[hfrag_idx]));
+
+                       len += hssi->frags[hfrag_idx].size;
+
+                       hssi->nr_frags++;
+                       hpdu_cb->frags++;
+               }
+
+               hpdu_cb->dlen = pdu_cb->dlen;
+               hpdu_cb->doffset = hpdu_cb->hlen;
+               hpdu_cb->nr_dfrags = pdu_cb->nr_dfrags;
+               hpdu_cb->dfrag_idx = 1;
+               hskb->len += len;
+               hskb->data_len += len;
+               hskb->truesize += len;
+       }
+
+       if (pdu_cb->flags & PDUCBF_RX_STATUS) {
+               hpdu_cb->flags |= pdu_cb->flags;
+
+               if (hpdu_cb->flags & PDUCBF_RX_DATA)
+                       hpdu_cb->flags &= ~PDUCBF_RX_DATA_DDPD;
+
+               hpdu_cb->ddigest = pdu_cb->ddigest;
+               hpdu_cb->pdulen = pdu_cb->pdulen;
+       }
+}
+
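+/*
+ * Walk every PDU carried by an LRO skb.  An incomplete leading PDU is
+ * first merged into lro_hskb and processed once its status arrives; a
+ * trailing partial PDU is merged so the next skb can complete it.
+ */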
+static int cxgbit_process_lro_skb(struct cxgbit_sock *csk, struct sk_buff *skb)
+{
+       struct cxgbit_lro_cb *lro_cb = cxgbit_skb_lro_cb(skb);
+       struct cxgbit_lro_pdu_cb *pdu_cb = cxgbit_skb_lro_pdu_cb(skb, 0);
+       u8 pdu_idx = 0, last_idx = 0;
+       int ret = 0;
+
+       if (!pdu_cb->complete) {
+               cxgbit_lro_skb_merge(csk, skb, 0);
+
+               if (pdu_cb->flags & PDUCBF_RX_STATUS) {
+                       struct sk_buff *hskb = csk->lro_hskb;
+
+                       ret = cxgbit_process_iscsi_pdu(csk, hskb, 0);
+
+                       cxgbit_lro_hskb_reset(csk);
+
+                       if (ret < 0)
+                               goto out;
+               }
+
+               pdu_idx = 1;
+       }
+
+       if (lro_cb->pdu_idx)
+               last_idx = lro_cb->pdu_idx - 1;
+
+       for (; pdu_idx <= last_idx; pdu_idx++) {
+               ret = cxgbit_process_iscsi_pdu(csk, skb, pdu_idx);
+               if (ret < 0)
+                       goto out;
+       }
+
+       if ((!lro_cb->complete) && lro_cb->pdu_idx)
+               cxgbit_lro_skb_merge(csk, skb, lro_cb->pdu_idx);
+
+out:
+       return ret;
+}
+
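+/*
+ * Check the sequence number of an LRO skb against the expected rcv_nxt,
+ * process its PDUs, and acknowledge the received data once a quarter of
+ * the receive window worth of credits has accumulated.
+ */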
+static int cxgbit_rx_lro_skb(struct cxgbit_sock *csk, struct sk_buff *skb)
+{
+       struct cxgbit_lro_cb *lro_cb = cxgbit_skb_lro_cb(skb);
+       struct cxgbit_lro_pdu_cb *pdu_cb = cxgbit_skb_lro_pdu_cb(skb, 0);
+       int ret = -1;
+
+       if ((pdu_cb->flags & PDUCBF_RX_HDR) &&
+           (pdu_cb->seq != csk->rcv_nxt)) {
+               pr_info("csk 0x%p, tid 0x%x, seq 0x%x != 0x%x.\n",
+                       csk, csk->tid, pdu_cb->seq, csk->rcv_nxt);
+               cxgbit_lro_skb_dump(skb);
+               return ret;
+       }
+
+       csk->rcv_nxt += lro_cb->pdu_totallen;
+
+       ret = cxgbit_process_lro_skb(csk, skb);
+
+       csk->rx_credits += lro_cb->pdu_totallen;
+
+       if (csk->rx_credits >= (csk->rcv_win / 4))
+               cxgbit_rx_data_ack(csk);
+
+       return ret;
+}
+
+static int cxgbit_rx_skb(struct cxgbit_sock *csk, struct sk_buff *skb)
+{
+       int ret = -1;
+
+       if (likely(cxgbit_skcb_flags(skb) & SKCBF_RX_LRO))
+               ret = cxgbit_rx_lro_skb(csk, skb);
+
+       __kfree_skb(skb);
+       return ret;
+}
+
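+/*
+ * Splice any skbs pending on csk->rxq onto the caller's list under the
+ * queue lock; returns true if there was anything to take.
+ */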
+static bool cxgbit_rxq_len(struct cxgbit_sock *csk, struct sk_buff_head *rxq)
+{
+       spin_lock_bh(&csk->rxq.lock);
+       if (skb_queue_len(&csk->rxq)) {
+               skb_queue_splice_init(&csk->rxq, rxq);
+               spin_unlock_bh(&csk->rxq.lock);
+               return true;
+       }
+       spin_unlock_bh(&csk->rxq.lock);
+       return false;
+}
+
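+/*
+ * Sleep until skbs are queued for this connection, then process each
+ * one; returns -1 on a pending signal or a PDU processing error.
+ */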
+static int cxgbit_wait_rxq(struct cxgbit_sock *csk)
+{
+       struct sk_buff *skb;
+       struct sk_buff_head rxq;
+
+       skb_queue_head_init(&rxq);
+
+       wait_event_interruptible(csk->waitq, cxgbit_rxq_len(csk, &rxq));
+
+       if (signal_pending(current))
+               goto out;
+
+       while ((skb = __skb_dequeue(&rxq))) {
+               if (cxgbit_rx_skb(csk, skb))
+                       goto out;
+       }
+
+       return 0;
+out:
+       __skb_queue_purge(&rxq);
+       return -1;
+}
+
+int cxgbit_get_login_rx(struct iscsi_conn *conn, struct iscsi_login *login)
+{
+       struct cxgbit_sock *csk = conn->context;
+       int ret = -1;
+
+       while (!test_and_clear_bit(CSK_LOGIN_PDU_DONE, &csk->com.flags)) {
+               ret = cxgbit_wait_rxq(csk);
+               if (ret) {
+                       clear_bit(CSK_LOGIN_PDU_DONE, &csk->com.flags);
+                       break;
+               }
+       }
+
+       return ret;
+}
+
+void cxgbit_get_rx_pdu(struct iscsi_conn *conn)
+{
+       struct cxgbit_sock *csk = conn->context;
+
+       while (!kthread_should_stop()) {
+               iscsit_thread_check_cpumask(conn, current, 0);
+               if (cxgbit_wait_rxq(csk))
+                       return;
+       }
+}
index 961202f..50f3d3a 100644 (file)
@@ -478,16 +478,16 @@ int iscsit_del_np(struct iscsi_np *np)
        return 0;
 }
 
-static int iscsit_immediate_queue(struct iscsi_conn *, struct iscsi_cmd *, int);
-static int iscsit_response_queue(struct iscsi_conn *, struct iscsi_cmd *, int);
+static void iscsit_get_rx_pdu(struct iscsi_conn *);
 
-static int iscsit_queue_rsp(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
+int iscsit_queue_rsp(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
 {
        iscsit_add_cmd_to_response_queue(cmd, cmd->conn, cmd->i_state);
        return 0;
 }
+EXPORT_SYMBOL(iscsit_queue_rsp);
 
-static void iscsit_aborted_task(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
+void iscsit_aborted_task(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
 {
        bool scsi_cmd = (cmd->iscsi_opcode == ISCSI_OP_SCSI_CMD);
 
@@ -498,6 +498,169 @@ static void iscsit_aborted_task(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
 
        __iscsit_free_cmd(cmd, scsi_cmd, true);
 }
+EXPORT_SYMBOL(iscsit_aborted_task);
+
+static void iscsit_do_crypto_hash_buf(struct ahash_request *, const void *,
+                                     u32, u32, u8 *, u8 *);
+static void iscsit_tx_thread_wait_for_tcp(struct iscsi_conn *);
+
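+/*
+ * Transmit a non-DataIN PDU over the traditional TCP socket: the header,
+ * optional HeaderDigest, payload plus pad bytes and optional DataDigest
+ * are gathered into cmd->iov_misc and sent with iscsit_send_tx_data().
+ */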
+static int
+iscsit_xmit_nondatain_pdu(struct iscsi_conn *conn, struct iscsi_cmd *cmd,
+                         const void *data_buf, u32 data_buf_len)
+{
+       struct iscsi_hdr *hdr = (struct iscsi_hdr *)cmd->pdu;
+       struct kvec *iov;
+       u32 niov = 0, tx_size = ISCSI_HDR_LEN;
+       int ret;
+
+       iov = &cmd->iov_misc[0];
+       iov[niov].iov_base      = cmd->pdu;
+       iov[niov++].iov_len     = ISCSI_HDR_LEN;
+
+       if (conn->conn_ops->HeaderDigest) {
+               u32 *header_digest = (u32 *)&cmd->pdu[ISCSI_HDR_LEN];
+
+               iscsit_do_crypto_hash_buf(conn->conn_tx_hash, hdr,
+                                         ISCSI_HDR_LEN, 0, NULL,
+                                         (u8 *)header_digest);
+
+               iov[0].iov_len += ISCSI_CRC_LEN;
+               tx_size += ISCSI_CRC_LEN;
+               pr_debug("Attaching CRC32C HeaderDigest"
+                        " to opcode 0x%x 0x%08x\n",
+                        hdr->opcode, *header_digest);
+       }
+
+       if (data_buf_len) {
+               u32 padding = ((-data_buf_len) & 3);
+
+               iov[niov].iov_base      = (void *)data_buf;
+               iov[niov++].iov_len     = data_buf_len;
+               tx_size += data_buf_len;
+
+               if (padding != 0) {
+                       iov[niov].iov_base = &cmd->pad_bytes;
+                       iov[niov++].iov_len = padding;
+                       tx_size += padding;
+                       pr_debug("Attaching %u additional"
+                                " padding bytes.\n", padding);
+               }
+
+               if (conn->conn_ops->DataDigest) {
+                       iscsit_do_crypto_hash_buf(conn->conn_tx_hash,
+                                                 data_buf, data_buf_len,
+                                                 padding,
+                                                 (u8 *)&cmd->pad_bytes,
+                                                 (u8 *)&cmd->data_crc);
+
+                       iov[niov].iov_base = &cmd->data_crc;
+                       iov[niov++].iov_len = ISCSI_CRC_LEN;
+                       tx_size += ISCSI_CRC_LEN;
+                       pr_debug("Attached DataDigest for %u"
+                                " bytes opcode 0x%x, CRC 0x%08x\n",
+                                data_buf_len, hdr->opcode, cmd->data_crc);
+               }
+       }
+
+       cmd->iov_misc_count = niov;
+       cmd->tx_size = tx_size;
+
+       ret = iscsit_send_tx_data(cmd, conn, 1);
+       if (ret < 0) {
+               iscsit_tx_thread_wait_for_tcp(conn);
+               return ret;
+       }
+
+       return 0;
+}
+
+static int iscsit_map_iovec(struct iscsi_cmd *, struct kvec *, u32, u32);
+static void iscsit_unmap_iovec(struct iscsi_cmd *);
+static u32 iscsit_do_crypto_hash_sg(struct ahash_request *, struct iscsi_cmd *,
+                                   u32, u32, u32, u8 *);
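+/*
+ * Transmit a DataIN PDU over the traditional TCP socket: the payload is
+ * mapped straight from the command scatterlist into cmd->iov_data and
+ * sent with iscsit_fe_sendpage_sg(), with optional digests and padding.
+ */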
+static int
+iscsit_xmit_datain_pdu(struct iscsi_conn *conn, struct iscsi_cmd *cmd,
+                      const struct iscsi_datain *datain)
+{
+       struct kvec *iov;
+       u32 iov_count = 0, tx_size = 0;
+       int ret, iov_ret;
+
+       iov = &cmd->iov_data[0];
+       iov[iov_count].iov_base = cmd->pdu;
+       iov[iov_count++].iov_len = ISCSI_HDR_LEN;
+       tx_size += ISCSI_HDR_LEN;
+
+       if (conn->conn_ops->HeaderDigest) {
+               u32 *header_digest = (u32 *)&cmd->pdu[ISCSI_HDR_LEN];
+
+               iscsit_do_crypto_hash_buf(conn->conn_tx_hash, cmd->pdu,
+                                         ISCSI_HDR_LEN, 0, NULL,
+                                         (u8 *)header_digest);
+
+               iov[0].iov_len += ISCSI_CRC_LEN;
+               tx_size += ISCSI_CRC_LEN;
+
+               pr_debug("Attaching CRC32 HeaderDigest for DataIN PDU 0x%08x\n",
+                        *header_digest);
+       }
+
+       iov_ret = iscsit_map_iovec(cmd, &cmd->iov_data[1],
+                                  datain->offset, datain->length);
+       if (iov_ret < 0)
+               return -1;
+
+       iov_count += iov_ret;
+       tx_size += datain->length;
+
+       cmd->padding = ((-datain->length) & 3);
+       if (cmd->padding) {
+               iov[iov_count].iov_base         = cmd->pad_bytes;
+               iov[iov_count++].iov_len        = cmd->padding;
+               tx_size += cmd->padding;
+
+               pr_debug("Attaching %u padding bytes\n", cmd->padding);
+       }
+
+       if (conn->conn_ops->DataDigest) {
+               cmd->data_crc = iscsit_do_crypto_hash_sg(conn->conn_tx_hash,
+                                                        cmd, datain->offset,
+                                                        datain->length,
+                                                        cmd->padding,
+                                                        cmd->pad_bytes);
+
+               iov[iov_count].iov_base = &cmd->data_crc;
+               iov[iov_count++].iov_len = ISCSI_CRC_LEN;
+               tx_size += ISCSI_CRC_LEN;
+
+               pr_debug("Attached CRC32C DataDigest %d bytes, crc 0x%08x\n",
+                        datain->length + cmd->padding, cmd->data_crc);
+       }
+
+       cmd->iov_data_count = iov_count;
+       cmd->tx_size = tx_size;
+
+       ret = iscsit_fe_sendpage_sg(cmd, conn);
+
+       iscsit_unmap_iovec(cmd);
+
+       if (ret < 0) {
+               iscsit_tx_thread_wait_for_tcp(conn);
+               return ret;
+       }
+
+       return 0;
+}
+
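+/*
+ * iscsit_xmit_pdu() backs the iscsit_transport->iscsit_xmit_pdu callback
+ * for iSCSI/TCP: DataIN PDUs (dr != NULL) take the scatter-gather path,
+ * all other PDUs take the kvec path above.
+ */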
+static int iscsit_xmit_pdu(struct iscsi_conn *conn, struct iscsi_cmd *cmd,
+                          struct iscsi_datain_req *dr, const void *buf,
+                          u32 buf_len)
+{
+       if (dr)
+               return iscsit_xmit_datain_pdu(conn, cmd, buf);
+       else
+               return iscsit_xmit_nondatain_pdu(conn, cmd, buf, buf_len);
+}
 
 static enum target_prot_op iscsit_get_sup_prot_ops(struct iscsi_conn *conn)
 {
@@ -507,6 +670,7 @@ static enum target_prot_op iscsit_get_sup_prot_ops(struct iscsi_conn *conn)
 static struct iscsit_transport iscsi_target_transport = {
        .name                   = "iSCSI/TCP",
        .transport_type         = ISCSI_TCP,
+       .rdma_shutdown          = false,
        .owner                  = NULL,
        .iscsit_setup_np        = iscsit_setup_np,
        .iscsit_accept_np       = iscsit_accept_np,
@@ -519,6 +683,8 @@ static struct iscsit_transport iscsi_target_transport = {
        .iscsit_queue_data_in   = iscsit_queue_rsp,
        .iscsit_queue_status    = iscsit_queue_rsp,
        .iscsit_aborted_task    = iscsit_aborted_task,
+       .iscsit_xmit_pdu        = iscsit_xmit_pdu,
+       .iscsit_get_rx_pdu      = iscsit_get_rx_pdu,
        .iscsit_get_sup_prot_ops = iscsit_get_sup_prot_ops,
 };
 
@@ -634,7 +800,7 @@ static void __exit iscsi_target_cleanup_module(void)
        kfree(iscsit_global);
 }
 
-static int iscsit_add_reject(
+int iscsit_add_reject(
        struct iscsi_conn *conn,
        u8 reason,
        unsigned char *buf)
@@ -664,6 +830,7 @@ static int iscsit_add_reject(
 
        return -1;
 }
+EXPORT_SYMBOL(iscsit_add_reject);
 
 static int iscsit_add_reject_from_cmd(
        struct iscsi_cmd *cmd,
@@ -719,6 +886,7 @@ int iscsit_reject_cmd(struct iscsi_cmd *cmd, u8 reason, unsigned char *buf)
 {
        return iscsit_add_reject_from_cmd(cmd, reason, false, buf);
 }
+EXPORT_SYMBOL(iscsit_reject_cmd);
 
 /*
  * Map some portion of the allocated scatterlist to an iovec, suitable for
@@ -737,7 +905,14 @@ static int iscsit_map_iovec(
        /*
         * We know each entry in t_data_sg contains a page.
         */
-       sg = &cmd->se_cmd.t_data_sg[data_offset / PAGE_SIZE];
+       u32 ent = data_offset / PAGE_SIZE;
+
+       if (ent >= cmd->se_cmd.t_data_nents) {
+               pr_err("Initial page entry out-of-bounds\n");
+               return -1;
+       }
+
+       sg = &cmd->se_cmd.t_data_sg[ent];
        page_off = (data_offset % PAGE_SIZE);
 
        cmd->first_data_sg = sg;
@@ -2335,7 +2510,7 @@ iscsit_handle_logout_cmd(struct iscsi_conn *conn, struct iscsi_cmd *cmd,
 }
 EXPORT_SYMBOL(iscsit_handle_logout_cmd);
 
-static int iscsit_handle_snack(
+int iscsit_handle_snack(
        struct iscsi_conn *conn,
        unsigned char *buf)
 {
@@ -2388,6 +2563,7 @@ static int iscsit_handle_snack(
 
        return 0;
 }
+EXPORT_SYMBOL(iscsit_handle_snack);
 
 static void iscsit_rx_thread_wait_for_tcp(struct iscsi_conn *conn)
 {
@@ -2534,7 +2710,6 @@ static int iscsit_send_conn_drop_async_message(
 {
        struct iscsi_async *hdr;
 
-       cmd->tx_size = ISCSI_HDR_LEN;
        cmd->iscsi_opcode = ISCSI_OP_ASYNC_EVENT;
 
        hdr                     = (struct iscsi_async *) cmd->pdu;
@@ -2552,25 +2727,11 @@ static int iscsit_send_conn_drop_async_message(
        hdr->param2             = cpu_to_be16(conn->sess->sess_ops->DefaultTime2Wait);
        hdr->param3             = cpu_to_be16(conn->sess->sess_ops->DefaultTime2Retain);
 
-       if (conn->conn_ops->HeaderDigest) {
-               u32 *header_digest = (u32 *)&cmd->pdu[ISCSI_HDR_LEN];
-
-               iscsit_do_crypto_hash_buf(conn->conn_tx_hash, hdr,
-                               ISCSI_HDR_LEN, 0, NULL, (u8 *)header_digest);
-
-               cmd->tx_size += ISCSI_CRC_LEN;
-               pr_debug("Attaching CRC32C HeaderDigest to"
-                       " Async Message 0x%08x\n", *header_digest);
-       }
-
-       cmd->iov_misc[0].iov_base       = cmd->pdu;
-       cmd->iov_misc[0].iov_len        = cmd->tx_size;
-       cmd->iov_misc_count             = 1;
-
        pr_debug("Sending Connection Dropped Async Message StatSN:"
                " 0x%08x, for CID: %hu on CID: %hu\n", cmd->stat_sn,
                        cmd->logout_cid, conn->cid);
-       return 0;
+
+       return conn->conn_transport->iscsit_xmit_pdu(conn, cmd, NULL, NULL, 0);
 }
 
 static void iscsit_tx_thread_wait_for_tcp(struct iscsi_conn *conn)
@@ -2583,7 +2744,7 @@ static void iscsit_tx_thread_wait_for_tcp(struct iscsi_conn *conn)
        }
 }
 
-static void
+void
 iscsit_build_datain_pdu(struct iscsi_cmd *cmd, struct iscsi_conn *conn,
                        struct iscsi_datain *datain, struct iscsi_data_rsp *hdr,
                        bool set_statsn)
@@ -2627,15 +2788,14 @@ iscsit_build_datain_pdu(struct iscsi_cmd *cmd, struct iscsi_conn *conn,
                cmd->init_task_tag, ntohl(hdr->statsn), ntohl(hdr->datasn),
                ntohl(hdr->offset), datain->length, conn->cid);
 }
+EXPORT_SYMBOL(iscsit_build_datain_pdu);
 
 static int iscsit_send_datain(struct iscsi_cmd *cmd, struct iscsi_conn *conn)
 {
        struct iscsi_data_rsp *hdr = (struct iscsi_data_rsp *)&cmd->pdu[0];
        struct iscsi_datain datain;
        struct iscsi_datain_req *dr;
-       struct kvec *iov;
-       u32 iov_count = 0, tx_size = 0;
-       int eodr = 0, ret, iov_ret;
+       int eodr = 0, ret;
        bool set_statsn = false;
 
        memset(&datain, 0, sizeof(struct iscsi_datain));
@@ -2677,64 +2837,9 @@ static int iscsit_send_datain(struct iscsi_cmd *cmd, struct iscsi_conn *conn)
 
        iscsit_build_datain_pdu(cmd, conn, &datain, hdr, set_statsn);
 
-       iov = &cmd->iov_data[0];
-       iov[iov_count].iov_base = cmd->pdu;
-       iov[iov_count++].iov_len        = ISCSI_HDR_LEN;
-       tx_size += ISCSI_HDR_LEN;
-
-       if (conn->conn_ops->HeaderDigest) {
-               u32 *header_digest = (u32 *)&cmd->pdu[ISCSI_HDR_LEN];
-
-               iscsit_do_crypto_hash_buf(conn->conn_tx_hash, cmd->pdu,
-                               ISCSI_HDR_LEN, 0, NULL, (u8 *)header_digest);
-
-               iov[0].iov_len += ISCSI_CRC_LEN;
-               tx_size += ISCSI_CRC_LEN;
-
-               pr_debug("Attaching CRC32 HeaderDigest"
-                       " for DataIN PDU 0x%08x\n", *header_digest);
-       }
-
-       iov_ret = iscsit_map_iovec(cmd, &cmd->iov_data[1],
-                               datain.offset, datain.length);
-       if (iov_ret < 0)
-               return -1;
-
-       iov_count += iov_ret;
-       tx_size += datain.length;
-
-       cmd->padding = ((-datain.length) & 3);
-       if (cmd->padding) {
-               iov[iov_count].iov_base         = cmd->pad_bytes;
-               iov[iov_count++].iov_len        = cmd->padding;
-               tx_size += cmd->padding;
-
-               pr_debug("Attaching %u padding bytes\n",
-                               cmd->padding);
-       }
-       if (conn->conn_ops->DataDigest) {
-               cmd->data_crc = iscsit_do_crypto_hash_sg(conn->conn_tx_hash, cmd,
-                        datain.offset, datain.length, cmd->padding, cmd->pad_bytes);
-
-               iov[iov_count].iov_base = &cmd->data_crc;
-               iov[iov_count++].iov_len = ISCSI_CRC_LEN;
-               tx_size += ISCSI_CRC_LEN;
-
-               pr_debug("Attached CRC32C DataDigest %d bytes, crc"
-                       " 0x%08x\n", datain.length+cmd->padding, cmd->data_crc);
-       }
-
-       cmd->iov_data_count = iov_count;
-       cmd->tx_size = tx_size;
-
-       ret = iscsit_fe_sendpage_sg(cmd, conn);
-
-       iscsit_unmap_iovec(cmd);
-
-       if (ret < 0) {
-               iscsit_tx_thread_wait_for_tcp(conn);
+       ret = conn->conn_transport->iscsit_xmit_pdu(conn, cmd, dr, &datain, 0);
+       if (ret < 0)
                return ret;
-       }
 
        if (dr->dr_complete) {
                eodr = (cmd->se_cmd.se_cmd_flags & SCF_TRANSPORT_TASK_SENSE) ?
@@ -2843,34 +2948,14 @@ EXPORT_SYMBOL(iscsit_build_logout_rsp);
 static int
 iscsit_send_logout(struct iscsi_cmd *cmd, struct iscsi_conn *conn)
 {
-       struct kvec *iov;
-       int niov = 0, tx_size, rc;
+       int rc;
 
        rc = iscsit_build_logout_rsp(cmd, conn,
                        (struct iscsi_logout_rsp *)&cmd->pdu[0]);
        if (rc < 0)
                return rc;
 
-       tx_size = ISCSI_HDR_LEN;
-       iov = &cmd->iov_misc[0];
-       iov[niov].iov_base      = cmd->pdu;
-       iov[niov++].iov_len     = ISCSI_HDR_LEN;
-
-       if (conn->conn_ops->HeaderDigest) {
-               u32 *header_digest = (u32 *)&cmd->pdu[ISCSI_HDR_LEN];
-
-               iscsit_do_crypto_hash_buf(conn->conn_tx_hash, &cmd->pdu[0],
-                               ISCSI_HDR_LEN, 0, NULL, (u8 *)header_digest);
-
-               iov[0].iov_len += ISCSI_CRC_LEN;
-               tx_size += ISCSI_CRC_LEN;
-               pr_debug("Attaching CRC32C HeaderDigest to"
-                       " Logout Response 0x%08x\n", *header_digest);
-       }
-       cmd->iov_misc_count = niov;
-       cmd->tx_size = tx_size;
-
-       return 0;
+       return conn->conn_transport->iscsit_xmit_pdu(conn, cmd, NULL, NULL, 0);
 }
 
 void
@@ -2910,34 +2995,16 @@ static int iscsit_send_unsolicited_nopin(
        int want_response)
 {
        struct iscsi_nopin *hdr = (struct iscsi_nopin *)&cmd->pdu[0];
-       int tx_size = ISCSI_HDR_LEN, ret;
+       int ret;
 
        iscsit_build_nopin_rsp(cmd, conn, hdr, false);
 
-       if (conn->conn_ops->HeaderDigest) {
-               u32 *header_digest = (u32 *)&cmd->pdu[ISCSI_HDR_LEN];
-
-               iscsit_do_crypto_hash_buf(conn->conn_tx_hash, hdr,
-                               ISCSI_HDR_LEN, 0, NULL, (u8 *)header_digest);
-
-               tx_size += ISCSI_CRC_LEN;
-               pr_debug("Attaching CRC32C HeaderDigest to"
-                       " NopIN 0x%08x\n", *header_digest);
-       }
-
-       cmd->iov_misc[0].iov_base       = cmd->pdu;
-       cmd->iov_misc[0].iov_len        = tx_size;
-       cmd->iov_misc_count     = 1;
-       cmd->tx_size            = tx_size;
-
        pr_debug("Sending Unsolicited NOPIN TTT: 0x%08x StatSN:"
                " 0x%08x CID: %hu\n", hdr->ttt, cmd->stat_sn, conn->cid);
 
-       ret = iscsit_send_tx_data(cmd, conn, 1);
-       if (ret < 0) {
-               iscsit_tx_thread_wait_for_tcp(conn);
+       ret = conn->conn_transport->iscsit_xmit_pdu(conn, cmd, NULL, NULL, 0);
+       if (ret < 0)
                return ret;
-       }
 
        spin_lock_bh(&cmd->istate_lock);
        cmd->i_state = want_response ?
@@ -2951,75 +3018,24 @@ static int
 iscsit_send_nopin(struct iscsi_cmd *cmd, struct iscsi_conn *conn)
 {
        struct iscsi_nopin *hdr = (struct iscsi_nopin *)&cmd->pdu[0];
-       struct kvec *iov;
-       u32 padding = 0;
-       int niov = 0, tx_size;
 
        iscsit_build_nopin_rsp(cmd, conn, hdr, true);
 
-       tx_size = ISCSI_HDR_LEN;
-       iov = &cmd->iov_misc[0];
-       iov[niov].iov_base      = cmd->pdu;
-       iov[niov++].iov_len     = ISCSI_HDR_LEN;
-
-       if (conn->conn_ops->HeaderDigest) {
-               u32 *header_digest = (u32 *)&cmd->pdu[ISCSI_HDR_LEN];
-
-               iscsit_do_crypto_hash_buf(conn->conn_tx_hash, hdr,
-                               ISCSI_HDR_LEN, 0, NULL, (u8 *)header_digest);
-
-               iov[0].iov_len += ISCSI_CRC_LEN;
-               tx_size += ISCSI_CRC_LEN;
-               pr_debug("Attaching CRC32C HeaderDigest"
-                       " to NopIn 0x%08x\n", *header_digest);
-       }
-
        /*
         * NOPOUT Ping Data is attached to struct iscsi_cmd->buf_ptr.
         * NOPOUT DataSegmentLength is at struct iscsi_cmd->buf_ptr_size.
         */
-       if (cmd->buf_ptr_size) {
-               iov[niov].iov_base      = cmd->buf_ptr;
-               iov[niov++].iov_len     = cmd->buf_ptr_size;
-               tx_size += cmd->buf_ptr_size;
-
-               pr_debug("Echoing back %u bytes of ping"
-                       " data.\n", cmd->buf_ptr_size);
-
-               padding = ((-cmd->buf_ptr_size) & 3);
-               if (padding != 0) {
-                       iov[niov].iov_base = &cmd->pad_bytes;
-                       iov[niov++].iov_len = padding;
-                       tx_size += padding;
-                       pr_debug("Attaching %u additional"
-                               " padding bytes.\n", padding);
-               }
-               if (conn->conn_ops->DataDigest) {
-                       iscsit_do_crypto_hash_buf(conn->conn_tx_hash,
-                               cmd->buf_ptr, cmd->buf_ptr_size,
-                               padding, (u8 *)&cmd->pad_bytes,
-                               (u8 *)&cmd->data_crc);
-
-                       iov[niov].iov_base = &cmd->data_crc;
-                       iov[niov++].iov_len = ISCSI_CRC_LEN;
-                       tx_size += ISCSI_CRC_LEN;
-                       pr_debug("Attached DataDigest for %u"
-                               " bytes of ping data, CRC 0x%08x\n",
-                               cmd->buf_ptr_size, cmd->data_crc);
-               }
-       }
+       pr_debug("Echoing back %u bytes of ping data.\n", cmd->buf_ptr_size);
 
-       cmd->iov_misc_count = niov;
-       cmd->tx_size = tx_size;
-
-       return 0;
+       return conn->conn_transport->iscsit_xmit_pdu(conn, cmd, NULL,
+                                                    cmd->buf_ptr,
+                                                    cmd->buf_ptr_size);
 }
 
 static int iscsit_send_r2t(
        struct iscsi_cmd *cmd,
        struct iscsi_conn *conn)
 {
-       int tx_size = 0;
        struct iscsi_r2t *r2t;
        struct iscsi_r2t_rsp *hdr;
        int ret;
@@ -3035,7 +3051,10 @@ static int iscsit_send_r2t(
        int_to_scsilun(cmd->se_cmd.orig_fe_lun,
                        (struct scsi_lun *)&hdr->lun);
        hdr->itt                = cmd->init_task_tag;
-       r2t->targ_xfer_tag      = session_get_next_ttt(conn->sess);
+       if (conn->conn_transport->iscsit_get_r2t_ttt)
+               conn->conn_transport->iscsit_get_r2t_ttt(conn, cmd, r2t);
+       else
+               r2t->targ_xfer_tag = session_get_next_ttt(conn->sess);
        hdr->ttt                = cpu_to_be32(r2t->targ_xfer_tag);
        hdr->statsn             = cpu_to_be32(conn->stat_sn);
        hdr->exp_cmdsn          = cpu_to_be32(conn->sess->exp_cmd_sn);
@@ -3044,38 +3063,18 @@ static int iscsit_send_r2t(
        hdr->data_offset        = cpu_to_be32(r2t->offset);
        hdr->data_length        = cpu_to_be32(r2t->xfer_len);
 
-       cmd->iov_misc[0].iov_base       = cmd->pdu;
-       cmd->iov_misc[0].iov_len        = ISCSI_HDR_LEN;
-       tx_size += ISCSI_HDR_LEN;
-
-       if (conn->conn_ops->HeaderDigest) {
-               u32 *header_digest = (u32 *)&cmd->pdu[ISCSI_HDR_LEN];
-
-               iscsit_do_crypto_hash_buf(conn->conn_tx_hash, hdr,
-                               ISCSI_HDR_LEN, 0, NULL, (u8 *)header_digest);
-
-               cmd->iov_misc[0].iov_len += ISCSI_CRC_LEN;
-               tx_size += ISCSI_CRC_LEN;
-               pr_debug("Attaching CRC32 HeaderDigest for R2T"
-                       " PDU 0x%08x\n", *header_digest);
-       }
-
        pr_debug("Built %sR2T, ITT: 0x%08x, TTT: 0x%08x, StatSN:"
                " 0x%08x, R2TSN: 0x%08x, Offset: %u, DDTL: %u, CID: %hu\n",
                (!r2t->recovery_r2t) ? "" : "Recovery ", cmd->init_task_tag,
                r2t->targ_xfer_tag, ntohl(hdr->statsn), r2t->r2t_sn,
                        r2t->offset, r2t->xfer_len, conn->cid);
 
-       cmd->iov_misc_count = 1;
-       cmd->tx_size = tx_size;
-
        spin_lock_bh(&cmd->r2t_lock);
        r2t->sent_r2t = 1;
        spin_unlock_bh(&cmd->r2t_lock);
 
-       ret = iscsit_send_tx_data(cmd, conn, 1);
+       ret = conn->conn_transport->iscsit_xmit_pdu(conn, cmd, NULL, NULL, 0);
        if (ret < 0) {
-               iscsit_tx_thread_wait_for_tcp(conn);
                return ret;
        }
 
@@ -3166,6 +3165,7 @@ int iscsit_build_r2ts_for_cmd(
 
        return 0;
 }
+EXPORT_SYMBOL(iscsit_build_r2ts_for_cmd);
 
 void iscsit_build_rsp_pdu(struct iscsi_cmd *cmd, struct iscsi_conn *conn,
                        bool inc_stat_sn, struct iscsi_scsi_rsp *hdr)
@@ -3204,18 +3204,12 @@ EXPORT_SYMBOL(iscsit_build_rsp_pdu);
 static int iscsit_send_response(struct iscsi_cmd *cmd, struct iscsi_conn *conn)
 {
        struct iscsi_scsi_rsp *hdr = (struct iscsi_scsi_rsp *)&cmd->pdu[0];
-       struct kvec *iov;
-       u32 padding = 0, tx_size = 0;
-       int iov_count = 0;
        bool inc_stat_sn = (cmd->i_state == ISTATE_SEND_STATUS);
+       void *data_buf = NULL;
+       u32 padding = 0, data_buf_len = 0;
 
        iscsit_build_rsp_pdu(cmd, conn, inc_stat_sn, hdr);
 
-       iov = &cmd->iov_misc[0];
-       iov[iov_count].iov_base = cmd->pdu;
-       iov[iov_count++].iov_len = ISCSI_HDR_LEN;
-       tx_size += ISCSI_HDR_LEN;
-
        /*
         * Attach SENSE DATA payload to iSCSI Response PDU
         */
@@ -3227,56 +3221,23 @@ static int iscsit_send_response(struct iscsi_cmd *cmd, struct iscsi_conn *conn)
 
                padding         = -(cmd->se_cmd.scsi_sense_length) & 3;
                hton24(hdr->dlength, (u32)cmd->se_cmd.scsi_sense_length);
-               iov[iov_count].iov_base = cmd->sense_buffer;
-               iov[iov_count++].iov_len =
-                               (cmd->se_cmd.scsi_sense_length + padding);
-               tx_size += cmd->se_cmd.scsi_sense_length;
+               data_buf = cmd->sense_buffer;
+               data_buf_len = cmd->se_cmd.scsi_sense_length + padding;
 
                if (padding) {
                        memset(cmd->sense_buffer +
                                cmd->se_cmd.scsi_sense_length, 0, padding);
-                       tx_size += padding;
                        pr_debug("Adding %u bytes of padding to"
                                " SENSE.\n", padding);
                }
 
-               if (conn->conn_ops->DataDigest) {
-                       iscsit_do_crypto_hash_buf(conn->conn_tx_hash,
-                               cmd->sense_buffer,
-                               (cmd->se_cmd.scsi_sense_length + padding),
-                               0, NULL, (u8 *)&cmd->data_crc);
-
-                       iov[iov_count].iov_base    = &cmd->data_crc;
-                       iov[iov_count++].iov_len     = ISCSI_CRC_LEN;
-                       tx_size += ISCSI_CRC_LEN;
-
-                       pr_debug("Attaching CRC32 DataDigest for"
-                               " SENSE, %u bytes CRC 0x%08x\n",
-                               (cmd->se_cmd.scsi_sense_length + padding),
-                               cmd->data_crc);
-               }
-
                pr_debug("Attaching SENSE DATA: %u bytes to iSCSI"
                                " Response PDU\n",
                                cmd->se_cmd.scsi_sense_length);
        }
 
-       if (conn->conn_ops->HeaderDigest) {
-               u32 *header_digest = (u32 *)&cmd->pdu[ISCSI_HDR_LEN];
-
-               iscsit_do_crypto_hash_buf(conn->conn_tx_hash, cmd->pdu,
-                               ISCSI_HDR_LEN, 0, NULL, (u8 *)header_digest);
-
-               iov[0].iov_len += ISCSI_CRC_LEN;
-               tx_size += ISCSI_CRC_LEN;
-               pr_debug("Attaching CRC32 HeaderDigest for Response"
-                               " PDU 0x%08x\n", *header_digest);
-       }
-
-       cmd->iov_misc_count = iov_count;
-       cmd->tx_size = tx_size;
-
-       return 0;
+       return conn->conn_transport->iscsit_xmit_pdu(conn, cmd, NULL, data_buf,
+                                                    data_buf_len);
 }
 
 static u8 iscsit_convert_tcm_tmr_rsp(struct se_tmr_req *se_tmr)
@@ -3323,30 +3284,10 @@ static int
 iscsit_send_task_mgt_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn)
 {
        struct iscsi_tm_rsp *hdr = (struct iscsi_tm_rsp *)&cmd->pdu[0];
-       u32 tx_size = 0;
 
        iscsit_build_task_mgt_rsp(cmd, conn, hdr);
 
-       cmd->iov_misc[0].iov_base       = cmd->pdu;
-       cmd->iov_misc[0].iov_len        = ISCSI_HDR_LEN;
-       tx_size += ISCSI_HDR_LEN;
-
-       if (conn->conn_ops->HeaderDigest) {
-               u32 *header_digest = (u32 *)&cmd->pdu[ISCSI_HDR_LEN];
-
-               iscsit_do_crypto_hash_buf(conn->conn_tx_hash, hdr,
-                               ISCSI_HDR_LEN, 0, NULL, (u8 *)header_digest);
-
-               cmd->iov_misc[0].iov_len += ISCSI_CRC_LEN;
-               tx_size += ISCSI_CRC_LEN;
-               pr_debug("Attaching CRC32 HeaderDigest for Task"
-                       " Mgmt Response PDU 0x%08x\n", *header_digest);
-       }
-
-       cmd->iov_misc_count = 1;
-       cmd->tx_size = tx_size;
-
-       return 0;
+       return conn->conn_transport->iscsit_xmit_pdu(conn, cmd, NULL, NULL, 0);
 }
 
 static bool iscsit_check_inaddr_any(struct iscsi_np *np)
@@ -3583,53 +3524,16 @@ static int iscsit_send_text_rsp(
        struct iscsi_conn *conn)
 {
        struct iscsi_text_rsp *hdr = (struct iscsi_text_rsp *)cmd->pdu;
-       struct kvec *iov;
-       u32 tx_size = 0;
-       int text_length, iov_count = 0, rc;
-
-       rc = iscsit_build_text_rsp(cmd, conn, hdr, ISCSI_TCP);
-       if (rc < 0)
-               return rc;
-
-       text_length = rc;
-       iov = &cmd->iov_misc[0];
-       iov[iov_count].iov_base = cmd->pdu;
-       iov[iov_count++].iov_len = ISCSI_HDR_LEN;
-       iov[iov_count].iov_base = cmd->buf_ptr;
-       iov[iov_count++].iov_len = text_length;
-
-       tx_size += (ISCSI_HDR_LEN + text_length);
-
-       if (conn->conn_ops->HeaderDigest) {
-               u32 *header_digest = (u32 *)&cmd->pdu[ISCSI_HDR_LEN];
-
-               iscsit_do_crypto_hash_buf(conn->conn_tx_hash, hdr,
-                               ISCSI_HDR_LEN, 0, NULL, (u8 *)header_digest);
-
-               iov[0].iov_len += ISCSI_CRC_LEN;
-               tx_size += ISCSI_CRC_LEN;
-               pr_debug("Attaching CRC32 HeaderDigest for"
-                       " Text Response PDU 0x%08x\n", *header_digest);
-       }
-
-       if (conn->conn_ops->DataDigest) {
-               iscsit_do_crypto_hash_buf(conn->conn_tx_hash,
-                               cmd->buf_ptr, text_length,
-                               0, NULL, (u8 *)&cmd->data_crc);
-
-               iov[iov_count].iov_base = &cmd->data_crc;
-               iov[iov_count++].iov_len = ISCSI_CRC_LEN;
-               tx_size += ISCSI_CRC_LEN;
-
-               pr_debug("Attaching DataDigest for %u bytes of text"
-                       " data, CRC 0x%08x\n", text_length,
-                       cmd->data_crc);
-       }
+       int text_length;
 
-       cmd->iov_misc_count = iov_count;
-       cmd->tx_size = tx_size;
+       text_length = iscsit_build_text_rsp(cmd, conn, hdr,
+                               conn->conn_transport->transport_type);
+       if (text_length < 0)
+               return text_length;
 
-       return 0;
+       return conn->conn_transport->iscsit_xmit_pdu(conn, cmd, NULL,
+                                                    cmd->buf_ptr,
+                                                    text_length);
 }
 
 void
@@ -3654,49 +3558,15 @@ static int iscsit_send_reject(
        struct iscsi_conn *conn)
 {
        struct iscsi_reject *hdr = (struct iscsi_reject *)&cmd->pdu[0];
-       struct kvec *iov;
-       u32 iov_count = 0, tx_size;
 
        iscsit_build_reject(cmd, conn, hdr);
 
-       iov = &cmd->iov_misc[0];
-       iov[iov_count].iov_base = cmd->pdu;
-       iov[iov_count++].iov_len = ISCSI_HDR_LEN;
-       iov[iov_count].iov_base = cmd->buf_ptr;
-       iov[iov_count++].iov_len = ISCSI_HDR_LEN;
-
-       tx_size = (ISCSI_HDR_LEN + ISCSI_HDR_LEN);
-
-       if (conn->conn_ops->HeaderDigest) {
-               u32 *header_digest = (u32 *)&cmd->pdu[ISCSI_HDR_LEN];
-
-               iscsit_do_crypto_hash_buf(conn->conn_tx_hash, hdr,
-                               ISCSI_HDR_LEN, 0, NULL, (u8 *)header_digest);
-
-               iov[0].iov_len += ISCSI_CRC_LEN;
-               tx_size += ISCSI_CRC_LEN;
-               pr_debug("Attaching CRC32 HeaderDigest for"
-                       " REJECT PDU 0x%08x\n", *header_digest);
-       }
-
-       if (conn->conn_ops->DataDigest) {
-               iscsit_do_crypto_hash_buf(conn->conn_tx_hash, cmd->buf_ptr,
-                               ISCSI_HDR_LEN, 0, NULL, (u8 *)&cmd->data_crc);
-
-               iov[iov_count].iov_base = &cmd->data_crc;
-               iov[iov_count++].iov_len  = ISCSI_CRC_LEN;
-               tx_size += ISCSI_CRC_LEN;
-               pr_debug("Attaching CRC32 DataDigest for REJECT"
-                               " PDU 0x%08x\n", cmd->data_crc);
-       }
-
-       cmd->iov_misc_count = iov_count;
-       cmd->tx_size = tx_size;
-
        pr_debug("Built Reject PDU StatSN: 0x%08x, Reason: 0x%02x,"
                " CID: %hu\n", ntohl(hdr->statsn), hdr->reason, conn->cid);
 
-       return 0;
+       return conn->conn_transport->iscsit_xmit_pdu(conn, cmd, NULL,
+                                                    cmd->buf_ptr,
+                                                    ISCSI_HDR_LEN);
 }
 
 void iscsit_thread_get_cpumask(struct iscsi_conn *conn)
@@ -3724,33 +3594,7 @@ void iscsit_thread_get_cpumask(struct iscsi_conn *conn)
        cpumask_setall(conn->conn_cpumask);
 }
 
-static inline void iscsit_thread_check_cpumask(
-       struct iscsi_conn *conn,
-       struct task_struct *p,
-       int mode)
-{
-       /*
-        * mode == 1 signals iscsi_target_tx_thread() usage.
-        * mode == 0 signals iscsi_target_rx_thread() usage.
-        */
-       if (mode == 1) {
-               if (!conn->conn_tx_reset_cpumask)
-                       return;
-               conn->conn_tx_reset_cpumask = 0;
-       } else {
-               if (!conn->conn_rx_reset_cpumask)
-                       return;
-               conn->conn_rx_reset_cpumask = 0;
-       }
-       /*
-        * Update the CPU mask for this single kthread so that
-        * both TX and RX kthreads are scheduled to run on the
-        * same CPU.
-        */
-       set_cpus_allowed_ptr(p, conn->conn_cpumask);
-}
-
-static int
+int
 iscsit_immediate_queue(struct iscsi_conn *conn, struct iscsi_cmd *cmd, int state)
 {
        int ret;
@@ -3792,6 +3636,7 @@ iscsit_immediate_queue(struct iscsi_conn *conn, struct iscsi_cmd *cmd, int state
 err:
        return -1;
 }
+EXPORT_SYMBOL(iscsit_immediate_queue);
 
 static int
 iscsit_handle_immediate_queue(struct iscsi_conn *conn)
@@ -3816,7 +3661,7 @@ iscsit_handle_immediate_queue(struct iscsi_conn *conn)
        return 0;
 }
 
-static int
+int
 iscsit_response_queue(struct iscsi_conn *conn, struct iscsi_cmd *cmd, int state)
 {
        int ret;
@@ -3889,13 +3734,6 @@ check_rsp_state:
        if (ret < 0)
                goto err;
 
-       if (iscsit_send_tx_data(cmd, conn, 1) < 0) {
-               iscsit_tx_thread_wait_for_tcp(conn);
-               iscsit_unmap_iovec(cmd);
-               goto err;
-       }
-       iscsit_unmap_iovec(cmd);
-
        switch (state) {
        case ISTATE_SEND_LOGOUTRSP:
                if (!iscsit_logout_post_handler(cmd, conn))
@@ -3928,6 +3766,7 @@ check_rsp_state:
 err:
        return -1;
 }
+EXPORT_SYMBOL(iscsit_response_queue);
 
 static int iscsit_handle_response_queue(struct iscsi_conn *conn)
 {
@@ -4087,36 +3926,12 @@ static bool iscsi_target_check_conn_state(struct iscsi_conn *conn)
        return ret;
 }
 
-int iscsi_target_rx_thread(void *arg)
+static void iscsit_get_rx_pdu(struct iscsi_conn *conn)
 {
-       int ret, rc;
+       int ret;
        u8 buffer[ISCSI_HDR_LEN], opcode;
        u32 checksum = 0, digest = 0;
-       struct iscsi_conn *conn = arg;
        struct kvec iov;
-       /*
-        * Allow ourselves to be interrupted by SIGINT so that a
-        * connection recovery / failure event can be triggered externally.
-        */
-       allow_signal(SIGINT);
-       /*
-        * Wait for iscsi_post_login_handler() to complete before allowing
-        * incoming iscsi/tcp socket I/O, and/or failing the connection.
-        */
-       rc = wait_for_completion_interruptible(&conn->rx_login_comp);
-       if (rc < 0 || iscsi_target_check_conn_state(conn))
-               return 0;
-
-       if (conn->conn_transport->transport_type == ISCSI_INFINIBAND) {
-               struct completion comp;
-
-               init_completion(&comp);
-               rc = wait_for_completion_interruptible(&comp);
-               if (rc < 0)
-                       goto transport_err;
-
-               goto transport_err;
-       }
 
        while (!kthread_should_stop()) {
                /*
@@ -4134,7 +3949,7 @@ int iscsi_target_rx_thread(void *arg)
                ret = rx_data(conn, &iov, 1, ISCSI_HDR_LEN);
                if (ret != ISCSI_HDR_LEN) {
                        iscsit_rx_thread_wait_for_tcp(conn);
-                       goto transport_err;
+                       return;
                }
 
                if (conn->conn_ops->HeaderDigest) {
@@ -4144,7 +3959,7 @@ int iscsi_target_rx_thread(void *arg)
                        ret = rx_data(conn, &iov, 1, ISCSI_CRC_LEN);
                        if (ret != ISCSI_CRC_LEN) {
                                iscsit_rx_thread_wait_for_tcp(conn);
-                               goto transport_err;
+                               return;
                        }
 
                        iscsit_do_crypto_hash_buf(conn->conn_rx_hash,
@@ -4168,7 +3983,7 @@ int iscsi_target_rx_thread(void *arg)
                }
 
                if (conn->conn_state == TARG_CONN_STATE_IN_LOGOUT)
-                       goto transport_err;
+                       return;
 
                opcode = buffer[0] & ISCSI_OPCODE_MASK;
 
@@ -4179,15 +3994,38 @@ int iscsi_target_rx_thread(void *arg)
                        " while in Discovery Session, rejecting.\n", opcode);
                        iscsit_add_reject(conn, ISCSI_REASON_PROTOCOL_ERROR,
                                          buffer);
-                       goto transport_err;
+                       return;
                }
 
                ret = iscsi_target_rx_opcode(conn, buffer);
                if (ret < 0)
-                       goto transport_err;
+                       return;
        }
+}
+
+int iscsi_target_rx_thread(void *arg)
+{
+       int rc;
+       struct iscsi_conn *conn = arg;
+
+       /*
+        * Allow ourselves to be interrupted by SIGINT so that a
+        * connection recovery / failure event can be triggered externally.
+        */
+       allow_signal(SIGINT);
+       /*
+        * Wait for iscsi_post_login_handler() to complete before allowing
+        * incoming iscsi/tcp socket I/O, and/or failing the connection.
+        */
+       rc = wait_for_completion_interruptible(&conn->rx_login_comp);
+       if (rc < 0 || iscsi_target_check_conn_state(conn))
+               return 0;
+
+       if (!conn->conn_transport->iscsit_get_rx_pdu)
+               return 0;
+
+       conn->conn_transport->iscsit_get_rx_pdu(conn);
 
-transport_err:
        if (!signal_pending(current))
                atomic_set(&conn->transport_failed, 1);
        iscsit_take_action_for_connection_exit(conn);
@@ -4240,16 +4078,17 @@ int iscsit_close_connection(
        pr_debug("Closing iSCSI connection CID %hu on SID:"
                " %u\n", conn->cid, sess->sid);
        /*
-        * Always up conn_logout_comp for the traditional TCP case just in case
-        * the RX Thread in iscsi_target_rx_opcode() is sleeping and the logout
-        * response never got sent because the connection failed.
+        * Always up conn_logout_comp for the traditional TCP and HW_OFFLOAD
+        * cases, just in case the RX Thread in iscsi_target_rx_opcode() is
+        * sleeping and the logout response never got sent because the
+        * connection failed.
         *
         * However for iser-target, isert_wait4logout() is using conn_logout_comp
         * to signal logout response TX interrupt completion.  Go ahead and skip
         * this for iser since isert_rx_opcode() does not wait on logout failure,
         * and to avoid iscsi_conn pointer dereference in iser-target code.
         */
-       if (conn->conn_transport->transport_type == ISCSI_TCP)
+       if (!conn->conn_transport->rdma_shutdown)
                complete(&conn->conn_logout_comp);
 
        if (!strcmp(current->comm, ISCSI_RX_THREAD_NAME)) {
@@ -4438,7 +4277,7 @@ int iscsit_close_connection(
        if (!atomic_read(&sess->session_reinstatement) &&
             atomic_read(&sess->session_fall_back_to_erl0)) {
                spin_unlock_bh(&sess->conn_lock);
-               target_put_session(sess->se_sess);
+               iscsit_close_session(sess);
 
                return 0;
        } else if (atomic_read(&sess->session_logout)) {
@@ -4467,6 +4306,10 @@ int iscsit_close_connection(
        }
 }
 
+/*
+ * If the iSCSI Session for the iSCSI Initiator Node exists,
+ * forcefully shut down the iSCSI NEXUS.
+ */
 int iscsit_close_session(struct iscsi_session *sess)
 {
        struct iscsi_portal_group *tpg = sess->tpg;
@@ -4556,7 +4399,7 @@ static void iscsit_logout_post_handler_closesession(
         * always sleep waiting for RX/TX thread shutdown to complete
         * within iscsit_close_connection().
         */
-       if (conn->conn_transport->transport_type == ISCSI_TCP)
+       if (!conn->conn_transport->rdma_shutdown)
                sleep = cmpxchg(&conn->tx_thread_active, true, false);
 
        atomic_set(&conn->conn_logout_remove, 0);
@@ -4565,7 +4408,7 @@ static void iscsit_logout_post_handler_closesession(
        iscsit_dec_conn_usage_count(conn);
        iscsit_stop_session(sess, sleep, sleep);
        iscsit_dec_session_usage_count(sess);
-       target_put_session(sess->se_sess);
+       iscsit_close_session(sess);
 }
 
 static void iscsit_logout_post_handler_samecid(
@@ -4573,7 +4416,7 @@ static void iscsit_logout_post_handler_samecid(
 {
        int sleep = 1;
 
-       if (conn->conn_transport->transport_type == ISCSI_TCP)
+       if (!conn->conn_transport->rdma_shutdown)
                sleep = cmpxchg(&conn->tx_thread_active, true, false);
 
        atomic_set(&conn->conn_logout_remove, 0);
@@ -4736,7 +4579,7 @@ int iscsit_free_session(struct iscsi_session *sess)
        } else
                spin_unlock_bh(&sess->conn_lock);
 
-       target_put_session(sess->se_sess);
+       iscsit_close_session(sess);
        return 0;
 }
 
index 97e5b69..923c032 100644 (file)
@@ -43,14 +43,15 @@ static inline struct iscsi_tpg_np *to_iscsi_tpg_np(struct config_item *item)
        return container_of(to_tpg_np(item), struct iscsi_tpg_np, se_tpg_np);
 }
 
-static ssize_t lio_target_np_sctp_show(struct config_item *item, char *page)
+static ssize_t lio_target_np_driver_show(struct config_item *item, char *page,
+                                        enum iscsit_transport_type type)
 {
        struct iscsi_tpg_np *tpg_np = to_iscsi_tpg_np(item);
-       struct iscsi_tpg_np *tpg_np_sctp;
+       struct iscsi_tpg_np *tpg_np_new;
        ssize_t rb;
 
-       tpg_np_sctp = iscsit_tpg_locate_child_np(tpg_np, ISCSI_SCTP_TCP);
-       if (tpg_np_sctp)
+       tpg_np_new = iscsit_tpg_locate_child_np(tpg_np, type);
+       if (tpg_np_new)
                rb = sprintf(page, "1\n");
        else
                rb = sprintf(page, "0\n");
@@ -58,19 +59,20 @@ static ssize_t lio_target_np_sctp_show(struct config_item *item, char *page)
        return rb;
 }
 
-static ssize_t lio_target_np_sctp_store(struct config_item *item,
-               const char *page, size_t count)
+static ssize_t lio_target_np_driver_store(struct config_item *item,
+               const char *page, size_t count, enum iscsit_transport_type type,
+               const char *mod_name)
 {
        struct iscsi_tpg_np *tpg_np = to_iscsi_tpg_np(item);
        struct iscsi_np *np;
        struct iscsi_portal_group *tpg;
-       struct iscsi_tpg_np *tpg_np_sctp = NULL;
+       struct iscsi_tpg_np *tpg_np_new = NULL;
        u32 op;
-       int ret;
+       int rc;
 
-       ret = kstrtou32(page, 0, &op);
-       if (ret)
-               return ret;
+       rc = kstrtou32(page, 0, &op);
+       if (rc)
+               return rc;
        if ((op != 1) && (op != 0)) {
                pr_err("Illegal value for tpg_enable: %u\n", op);
                return -EINVAL;
@@ -87,107 +89,64 @@ static ssize_t lio_target_np_sctp_store(struct config_item *item,
                return -EINVAL;
 
        if (op) {
-               /*
-                * Use existing np->np_sockaddr for SCTP network portal reference
-                */
-               tpg_np_sctp = iscsit_tpg_add_network_portal(tpg, &np->np_sockaddr,
-                                       tpg_np, ISCSI_SCTP_TCP);
-               if (!tpg_np_sctp || IS_ERR(tpg_np_sctp))
-                       goto out;
-       } else {
-               tpg_np_sctp = iscsit_tpg_locate_child_np(tpg_np, ISCSI_SCTP_TCP);
-               if (!tpg_np_sctp)
-                       goto out;
+               if (strlen(mod_name)) {
+                       rc = request_module(mod_name);
+                       if (rc != 0) {
+                               pr_warn("Unable to request_module for %s\n",
+                                       mod_name);
+                               rc = 0;
+                       }
+               }
 
-               ret = iscsit_tpg_del_network_portal(tpg, tpg_np_sctp);
-               if (ret < 0)
+               tpg_np_new = iscsit_tpg_add_network_portal(tpg,
+                                       &np->np_sockaddr, tpg_np, type);
+               if (IS_ERR(tpg_np_new))
                        goto out;
+       } else {
+               tpg_np_new = iscsit_tpg_locate_child_np(tpg_np, type);
+               if (tpg_np_new) {
+                       rc = iscsit_tpg_del_network_portal(tpg, tpg_np_new);
+                       if (rc < 0)
+                               goto out;
+               }
        }
 
        iscsit_put_tpg(tpg);
        return count;
 out:
        iscsit_put_tpg(tpg);
-       return -EINVAL;
+       return rc;
 }
 
 static ssize_t lio_target_np_iser_show(struct config_item *item, char *page)
 {
-       struct iscsi_tpg_np *tpg_np = to_iscsi_tpg_np(item);
-       struct iscsi_tpg_np *tpg_np_iser;
-       ssize_t rb;
-
-       tpg_np_iser = iscsit_tpg_locate_child_np(tpg_np, ISCSI_INFINIBAND);
-       if (tpg_np_iser)
-               rb = sprintf(page, "1\n");
-       else
-               rb = sprintf(page, "0\n");
-
-       return rb;
+       return lio_target_np_driver_show(item, page, ISCSI_INFINIBAND);
 }
 
 static ssize_t lio_target_np_iser_store(struct config_item *item,
-               const char *page, size_t count)
+                                       const char *page, size_t count)
 {
-       struct iscsi_tpg_np *tpg_np = to_iscsi_tpg_np(item);
-       struct iscsi_np *np;
-       struct iscsi_portal_group *tpg;
-       struct iscsi_tpg_np *tpg_np_iser = NULL;
-       char *endptr;
-       u32 op;
-       int rc = 0;
-
-       op = simple_strtoul(page, &endptr, 0);
-       if ((op != 1) && (op != 0)) {
-               pr_err("Illegal value for tpg_enable: %u\n", op);
-               return -EINVAL;
-       }
-       np = tpg_np->tpg_np;
-       if (!np) {
-               pr_err("Unable to locate struct iscsi_np from"
-                               " struct iscsi_tpg_np\n");
-               return -EINVAL;
-       }
-
-       tpg = tpg_np->tpg;
-       if (iscsit_get_tpg(tpg) < 0)
-               return -EINVAL;
-
-       if (op) {
-               rc = request_module("ib_isert");
-               if (rc != 0) {
-                       pr_warn("Unable to request_module for ib_isert\n");
-                       rc = 0;
-               }
-
-               tpg_np_iser = iscsit_tpg_add_network_portal(tpg, &np->np_sockaddr,
-                               tpg_np, ISCSI_INFINIBAND);
-               if (IS_ERR(tpg_np_iser)) {
-                       rc = PTR_ERR(tpg_np_iser);
-                       goto out;
-               }
-       } else {
-               tpg_np_iser = iscsit_tpg_locate_child_np(tpg_np, ISCSI_INFINIBAND);
-               if (tpg_np_iser) {
-                       rc = iscsit_tpg_del_network_portal(tpg, tpg_np_iser);
-                       if (rc < 0)
-                               goto out;
-               }
-       }
+       return lio_target_np_driver_store(item, page, count,
+                                         ISCSI_INFINIBAND, "ib_isert");
+}
+CONFIGFS_ATTR(lio_target_np_, iser);
 
-       iscsit_put_tpg(tpg);
-       return count;
-out:
-       iscsit_put_tpg(tpg);
-       return rc;
+static ssize_t lio_target_np_cxgbit_show(struct config_item *item, char *page)
+{
+       return lio_target_np_driver_show(item, page, ISCSI_CXGBIT);
 }
 
-CONFIGFS_ATTR(lio_target_np_, sctp);
-CONFIGFS_ATTR(lio_target_np_, iser);
+static ssize_t lio_target_np_cxgbit_store(struct config_item *item,
+                                         const char *page, size_t count)
+{
+       return lio_target_np_driver_store(item, page, count,
+                                         ISCSI_CXGBIT, "cxgbit");
+}
+CONFIGFS_ATTR(lio_target_np_, cxgbit);
 
 static struct configfs_attribute *lio_target_portal_attrs[] = {
-       &lio_target_np_attr_sctp,
        &lio_target_np_attr_iser,
+       &lio_target_np_attr_cxgbit,
        NULL,
 };
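With lio_target_np_driver_show()/lio_target_np_driver_store() factored out above, exposing a configfs enable knob for another offload transport reduces to two thin wrappers plus one entry in lio_target_portal_attrs[]. A sketch for a hypothetical transport; ISCSI_FOO and the "foo_target" module name are invented:

static ssize_t lio_target_np_foo_show(struct config_item *item, char *page)
{
        return lio_target_np_driver_show(item, page, ISCSI_FOO);
}

static ssize_t lio_target_np_foo_store(struct config_item *item,
                                       const char *page, size_t count)
{
        return lio_target_np_driver_store(item, page, count,
                                          ISCSI_FOO, "foo_target");
}
CONFIGFS_ATTR(lio_target_np_, foo);

The matching &lio_target_np_attr_foo entry would then be appended to lio_target_portal_attrs[].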
 
@@ -1554,7 +1513,7 @@ static int lio_tpg_check_prot_fabric_only(
  * This function calls iscsit_inc_session_usage_count() on the
  * struct iscsi_session in question.
  */
-static int lio_tpg_shutdown_session(struct se_session *se_sess)
+static void lio_tpg_close_session(struct se_session *se_sess)
 {
        struct iscsi_session *sess = se_sess->fabric_sess_ptr;
        struct se_portal_group *se_tpg = &sess->tpg->tpg_se_tpg;
@@ -1566,7 +1525,7 @@ static int lio_tpg_shutdown_session(struct se_session *se_sess)
            (sess->time2retain_timer_flags & ISCSI_TF_EXPIRED)) {
                spin_unlock(&sess->conn_lock);
                spin_unlock_bh(&se_tpg->session_lock);
-               return 0;
+               return;
        }
        atomic_set(&sess->session_reinstatement, 1);
        spin_unlock(&sess->conn_lock);
@@ -1575,20 +1534,6 @@ static int lio_tpg_shutdown_session(struct se_session *se_sess)
        spin_unlock_bh(&se_tpg->session_lock);
 
        iscsit_stop_session(sess, 1, 1);
-       return 1;
-}
-
-/*
- * Calls iscsit_dec_session_usage_count() as inverse of
- * lio_tpg_shutdown_session()
- */
-static void lio_tpg_close_session(struct se_session *se_sess)
-{
-       struct iscsi_session *sess = se_sess->fabric_sess_ptr;
-       /*
-        * If the iSCSI Session for the iSCSI Initiator Node exists,
-        * forcefully shutdown the iSCSI NEXUS.
-        */
        iscsit_close_session(sess);
 }
 
@@ -1640,7 +1585,6 @@ const struct target_core_fabric_ops iscsi_ops = {
        .tpg_get_inst_index             = lio_tpg_get_inst_index,
        .check_stop_free                = lio_check_stop_free,
        .release_cmd                    = lio_release_cmd,
-       .shutdown_session               = lio_tpg_shutdown_session,
        .close_session                  = lio_tpg_close_session,
        .sess_get_index                 = lio_sess_get_index,
        .sess_get_initiator_sid         = lio_sess_get_initiator_sid,
index fb3b52b..647d4a5 100644 (file)
@@ -524,3 +524,4 @@ struct iscsi_datain_req *iscsit_get_datain_values(
 
        return NULL;
 }
+EXPORT_SYMBOL(iscsit_get_datain_values);
index 210f6e4..b54e72c 100644 (file)
@@ -786,7 +786,7 @@ static void iscsit_handle_time2retain_timeout(unsigned long data)
        }
 
        spin_unlock_bh(&se_tpg->session_lock);
-       target_put_session(sess->se_sess);
+       iscsit_close_session(sess);
 }
 
 void iscsit_start_time2retain_handler(struct iscsi_session *sess)
index 8436d56..b5212f0 100644 (file)
@@ -228,7 +228,7 @@ int iscsi_check_for_session_reinstatement(struct iscsi_conn *conn)
        if (sess->session_state == TARG_SESS_STATE_FAILED) {
                spin_unlock_bh(&sess->conn_lock);
                iscsit_dec_session_usage_count(sess);
-               target_put_session(sess->se_sess);
+               iscsit_close_session(sess);
                return 0;
        }
        spin_unlock_bh(&sess->conn_lock);
@@ -236,7 +236,7 @@ int iscsi_check_for_session_reinstatement(struct iscsi_conn *conn)
        iscsit_stop_session(sess, 1, 1);
        iscsit_dec_session_usage_count(sess);
 
-       target_put_session(sess->se_sess);
+       iscsit_close_session(sess);
        return 0;
 }
 
@@ -258,7 +258,7 @@ static void iscsi_login_set_conn_values(
        mutex_unlock(&auth_id_lock);
 }
 
-static __printf(2, 3) int iscsi_change_param_sprintf(
+__printf(2, 3) int iscsi_change_param_sprintf(
        struct iscsi_conn *conn,
        const char *fmt, ...)
 {
@@ -279,6 +279,7 @@ static __printf(2, 3) int iscsi_change_param_sprintf(
 
        return 0;
 }
+EXPORT_SYMBOL(iscsi_change_param_sprintf);
 
 /*
  *     This is the leading connection of a new session,
@@ -1387,6 +1388,16 @@ static int __iscsi_target_login_thread(struct iscsi_np *np)
                        goto old_sess_out;
        }
 
+       if (conn->conn_transport->iscsit_validate_params) {
+               ret = conn->conn_transport->iscsit_validate_params(conn);
+               if (ret < 0) {
+                       if (zero_tsih)
+                               goto new_sess_out;
+                       else
+                               goto old_sess_out;
+               }
+       }
+
        ret = iscsi_target_start_negotiation(login, conn);
        if (ret < 0)
                goto new_sess_out;
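The new ->iscsit_validate_params() hook lets an offload transport veto negotiated login parameters its hardware cannot honour before negotiation continues; the EXPORT_SYMBOL of iscsi_find_param_from_key() later in this series supports exactly this kind of use. A hedged sketch for a hypothetical driver; foo_validate_params() and FOO_MAX_XMIT_DSL are invented:

static int foo_validate_params(struct iscsi_conn *conn)
{
        struct iscsi_param *param;

        param = iscsi_find_param_from_key(MAXXMITDATASEGMENTLENGTH,
                                          conn->param_list);
        if (!param)
                return -EINVAL;

        /* FOO_MAX_XMIT_DSL is a made-up hardware limit */
        if (simple_strtoul(param->value, NULL, 0) > FOO_MAX_XMIT_DSL)
                return -EINVAL;         /* fails the login, as above */

        return 0;
}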
index 9fc9117..89d34bd 100644 (file)
@@ -269,6 +269,7 @@ int iscsi_target_check_login_request(
 
        return 0;
 }
+EXPORT_SYMBOL(iscsi_target_check_login_request);
 
 static int iscsi_target_check_first_request(
        struct iscsi_conn *conn,
@@ -1246,16 +1247,16 @@ int iscsi_target_start_negotiation(
 {
        int ret;
 
-       ret = iscsi_target_do_login(conn, login);
-       if (!ret) {
-               if (conn->sock) {
-                       struct sock *sk = conn->sock->sk;
+       if (conn->sock) {
+               struct sock *sk = conn->sock->sk;
 
-                       write_lock_bh(&sk->sk_callback_lock);
-                       set_bit(LOGIN_FLAGS_READY, &conn->login_flags);
-                       write_unlock_bh(&sk->sk_callback_lock);
-               }
-       } else if (ret < 0) {
+               write_lock_bh(&sk->sk_callback_lock);
+               set_bit(LOGIN_FLAGS_READY, &conn->login_flags);
+               write_unlock_bh(&sk->sk_callback_lock);
+       }
+
+       ret = iscsi_target_do_login(conn, login);
+       if (ret < 0) {
                cancel_delayed_work_sync(&conn->login_work);
                cancel_delayed_work_sync(&conn->login_cleanup_work);
                iscsi_target_restore_sock_callbacks(conn);
index 3a1f9a7..0efa80b 100644 (file)
@@ -680,6 +680,7 @@ struct iscsi_param *iscsi_find_param_from_key(
        pr_err("Unable to locate key \"%s\".\n", key);
        return NULL;
 }
+EXPORT_SYMBOL(iscsi_find_param_from_key);
 
 int iscsi_extract_key_value(char *textbuf, char **key, char **value)
 {
index 5772038..1f38177 100644 (file)
@@ -514,6 +514,7 @@ void iscsit_add_cmd_to_immediate_queue(
 
        wake_up(&conn->queues_wq);
 }
+EXPORT_SYMBOL(iscsit_add_cmd_to_immediate_queue);
 
 struct iscsi_queue_req *iscsit_get_cmd_from_immediate_queue(struct iscsi_conn *conn)
 {
@@ -725,6 +726,9 @@ void __iscsit_free_cmd(struct iscsi_cmd *cmd, bool scsi_cmd,
                iscsit_remove_cmd_from_immediate_queue(cmd, conn);
                iscsit_remove_cmd_from_response_queue(cmd, conn);
        }
+
+       if (conn && conn->conn_transport->iscsit_release_cmd)
+               conn->conn_transport->iscsit_release_cmd(conn, cmd);
 }
 
 void iscsit_free_cmd(struct iscsi_cmd *cmd, bool shutdown)
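The new ->iscsit_release_cmd() callback gives the transport a final hook while the core tears an iscsi_cmd down, typically to release per-command driver state. A sketch under that assumption; every foo_ name below is hypothetical:

static void foo_release_cmd(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
{
        struct foo_cmd *fcmd = foo_priv(cmd);   /* hypothetical accessor */

        if (fcmd->dma_mapped)
                foo_dma_unmap(conn, fcmd);
        foo_free_cmd_resources(fcmd);
}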
@@ -773,6 +777,7 @@ void iscsit_free_cmd(struct iscsi_cmd *cmd, bool shutdown)
                break;
        }
 }
+EXPORT_SYMBOL(iscsit_free_cmd);
 
 int iscsit_check_session_usage_count(struct iscsi_session *sess)
 {
index 0ad5ac5..5091b31 100644 (file)
@@ -601,16 +601,6 @@ static int tcm_loop_get_cmd_state(struct se_cmd *se_cmd)
        return tl_cmd->sc_cmd_state;
 }
 
-static int tcm_loop_shutdown_session(struct se_session *se_sess)
-{
-       return 0;
-}
-
-static void tcm_loop_close_session(struct se_session *se_sess)
-{
-       return;
-};
-
 static int tcm_loop_write_pending(struct se_cmd *se_cmd)
 {
        /*
@@ -1243,8 +1233,6 @@ static const struct target_core_fabric_ops loop_ops = {
        .tpg_get_inst_index             = tcm_loop_get_inst_index,
        .check_stop_free                = tcm_loop_check_stop_free,
        .release_cmd                    = tcm_loop_release_cmd,
-       .shutdown_session               = tcm_loop_shutdown_session,
-       .close_session                  = tcm_loop_close_session,
        .sess_get_index                 = tcm_loop_sess_get_index,
        .write_pending                  = tcm_loop_write_pending,
        .write_pending_status           = tcm_loop_write_pending_status,
index c57e788..58bb6ed 100644 (file)
@@ -1726,16 +1726,6 @@ static void sbp_release_cmd(struct se_cmd *se_cmd)
        sbp_free_request(req);
 }
 
-static int sbp_shutdown_session(struct se_session *se_sess)
-{
-       return 0;
-}
-
-static void sbp_close_session(struct se_session *se_sess)
-{
-       return;
-}
-
 static u32 sbp_sess_get_index(struct se_session *se_sess)
 {
        return 0;
@@ -2349,8 +2339,6 @@ static const struct target_core_fabric_ops sbp_ops = {
        .tpg_check_prod_mode_write_protect = sbp_check_false,
        .tpg_get_inst_index             = sbp_tpg_get_inst_index,
        .release_cmd                    = sbp_release_cmd,
-       .shutdown_session               = sbp_shutdown_session,
-       .close_session                  = sbp_close_session,
        .sess_get_index                 = sbp_sess_get_index,
        .write_pending                  = sbp_write_pending,
        .write_pending_status           = sbp_write_pending_status,
index 49aba4a..4c82bbe 100644 (file)
@@ -932,7 +932,7 @@ static int core_alua_update_tpg_primary_metadata(
                        tg_pt_gp->tg_pt_gp_alua_access_status);
 
        snprintf(path, ALUA_METADATA_PATH_LEN,
-               "/var/target/alua/tpgs_%s/%s", &wwn->unit_serial[0],
+               "%s/alua/tpgs_%s/%s", db_root, &wwn->unit_serial[0],
                config_item_name(&tg_pt_gp->tg_pt_gp_group.cg_item));
 
        rc = core_alua_write_tpg_metadata(path, md_buf, len);
@@ -1275,8 +1275,8 @@ static int core_alua_update_tpg_secondary_metadata(struct se_lun *lun)
                        atomic_read(&lun->lun_tg_pt_secondary_offline),
                        lun->lun_tg_pt_secondary_stat);
 
-       snprintf(path, ALUA_METADATA_PATH_LEN, "/var/target/alua/%s/%s/lun_%llu",
-                       se_tpg->se_tpg_tfo->get_fabric_name(), wwn,
+       snprintf(path, ALUA_METADATA_PATH_LEN, "%s/alua/%s/%s/lun_%llu",
+                       db_root, se_tpg->se_tpg_tfo->get_fabric_name(), wwn,
                        lun->unpacked_lun);
 
        rc = core_alua_write_tpg_metadata(path, md_buf, len);
index d498533..2001005 100644 (file)
@@ -99,6 +99,67 @@ static ssize_t target_core_item_version_show(struct config_item *item,
 
 CONFIGFS_ATTR_RO(target_core_item_, version);
 
+char db_root[DB_ROOT_LEN] = DB_ROOT_DEFAULT;
+static char db_root_stage[DB_ROOT_LEN];
+
+static ssize_t target_core_item_dbroot_show(struct config_item *item,
+                                           char *page)
+{
+       return sprintf(page, "%s\n", db_root);
+}
+
+static ssize_t target_core_item_dbroot_store(struct config_item *item,
+                                       const char *page, size_t count)
+{
+       ssize_t read_bytes;
+       struct file *fp;
+
+       mutex_lock(&g_tf_lock);
+       if (!list_empty(&g_tf_list)) {
+               mutex_unlock(&g_tf_lock);
+               pr_err("db_root: cannot be changed: target drivers registered");
+               return -EINVAL;
+       }
+
+       if (count > (DB_ROOT_LEN - 1)) {
+               mutex_unlock(&g_tf_lock);
+               pr_err("db_root: count %d exceeds DB_ROOT_LEN-1: %u\n",
+                      (int)count, DB_ROOT_LEN - 1);
+               return -EINVAL;
+       }
+
+       read_bytes = snprintf(db_root_stage, DB_ROOT_LEN, "%s", page);
+       if (!read_bytes) {
+               mutex_unlock(&g_tf_lock);
+               return -EINVAL;
+       }
+       if (db_root_stage[read_bytes - 1] == '\n')
+               db_root_stage[read_bytes - 1] = '\0';
+
+       /* validate new db root before accepting it */
+       fp = filp_open(db_root_stage, O_RDONLY, 0);
+       if (IS_ERR(fp)) {
+               mutex_unlock(&g_tf_lock);
+               pr_err("db_root: cannot open: %s\n", db_root_stage);
+               return -EINVAL;
+       }
+       if (!S_ISDIR(fp->f_inode->i_mode)) {
+               filp_close(fp, 0);
+               mutex_unlock(&g_tf_lock);
+               pr_err("db_root: not a directory: %s\n", db_root_stage);
+               return -EINVAL;
+       }
+       filp_close(fp, 0);
+
+       strncpy(db_root, db_root_stage, read_bytes);
+
+       mutex_unlock(&g_tf_lock);
+
+       return read_bytes;
+}
+
+CONFIGFS_ATTR(target_core_item_, dbroot);
+
 static struct target_fabric_configfs *target_core_get_fabric(
        const char *name)
 {
@@ -239,6 +300,7 @@ static struct configfs_group_operations target_core_fabric_group_ops = {
  */
 static struct configfs_attribute *target_core_fabric_item_attrs[] = {
        &target_core_item_attr_version,
+       &target_core_item_attr_dbroot,
        NULL,
 };
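Taken together, the dbroot attribute lets an administrator relocate the target metadata store away from the built-in /var/target default, but only before any fabric driver has registered and only to an existing directory (the store handler above enforces both). Consumers then build metadata paths from db_root; a minimal sketch of that pattern, mirroring the PR/ALUA changes elsewhere in this series (wwn comes from the surrounding target code):

        char path[512];

        snprintf(path, sizeof(path), "%s/pr/aptpl_%s", db_root,
                 &wwn->unit_serial[0]);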
 
@@ -323,14 +385,6 @@ static int target_fabric_tf_ops_check(const struct target_core_fabric_ops *tfo)
                pr_err("Missing tfo->release_cmd()\n");
                return -EINVAL;
        }
-       if (!tfo->shutdown_session) {
-               pr_err("Missing tfo->shutdown_session()\n");
-               return -EINVAL;
-       }
-       if (!tfo->close_session) {
-               pr_err("Missing tfo->close_session()\n");
-               return -EINVAL;
-       }
        if (!tfo->sess_get_index) {
                pr_err("Missing tfo->sess_get_index()\n");
                return -EINVAL;
index 86b4a83..fc91e85 100644 (file)
@@ -155,4 +155,10 @@ void       target_stat_setup_mappedlun_default_groups(struct se_lun_acl *);
 /* target_core_xcopy.c */
 extern struct se_portal_group xcopy_pt_tpg;
 
+/* target_core_configfs.c */
+#define DB_ROOT_LEN            4096
+#define        DB_ROOT_DEFAULT         "/var/target"
+
+extern char db_root[];
+
 #endif /* TARGET_CORE_INTERNAL_H */
index b179573..47463c9 100644 (file)
@@ -1985,7 +1985,7 @@ static int __core_scsi3_write_aptpl_to_file(
                return -EMSGSIZE;
        }
 
-       snprintf(path, 512, "/var/target/pr/aptpl_%s", &wwn->unit_serial[0]);
+       snprintf(path, 512, "%s/pr/aptpl_%s", db_root, &wwn->unit_serial[0]);
        file = filp_open(path, flags, 0600);
        if (IS_ERR(file)) {
                pr_err("filp_open(%s) for APTPL metadata"
index 47a833f..24b36fd 100644 (file)
@@ -403,7 +403,6 @@ static sense_reason_t rd_do_prot_rw(struct se_cmd *cmd, bool is_read)
        struct se_device *se_dev = cmd->se_dev;
        struct rd_dev *dev = RD_DEV(se_dev);
        struct rd_dev_sg_table *prot_table;
-       bool need_to_release = false;
        struct scatterlist *prot_sg;
        u32 sectors = cmd->data_length / se_dev->dev_attrib.block_size;
        u32 prot_offset, prot_page;
@@ -432,9 +431,6 @@ static sense_reason_t rd_do_prot_rw(struct se_cmd *cmd, bool is_read)
        if (!rc)
                sbc_dif_copy_prot(cmd, sectors, is_read, prot_sg, prot_offset);
 
-       if (need_to_release)
-               kfree(prot_sg);
-
        return rc;
 }
 
index ddf0460..d99752c 100644 (file)
@@ -336,44 +336,39 @@ struct se_node_acl *core_tpg_add_initiator_node_acl(
        return acl;
 }
 
-void core_tpg_del_initiator_node_acl(struct se_node_acl *acl)
+static void target_shutdown_sessions(struct se_node_acl *acl)
 {
-       struct se_portal_group *tpg = acl->se_tpg;
-       LIST_HEAD(sess_list);
-       struct se_session *sess, *sess_tmp;
+       struct se_session *sess;
        unsigned long flags;
-       int rc;
-
-       mutex_lock(&tpg->acl_node_mutex);
-       if (acl->dynamic_node_acl) {
-               acl->dynamic_node_acl = 0;
-       }
-       list_del(&acl->acl_list);
-       mutex_unlock(&tpg->acl_node_mutex);
 
+restart:
        spin_lock_irqsave(&acl->nacl_sess_lock, flags);
-       acl->acl_stop = 1;
-
-       list_for_each_entry_safe(sess, sess_tmp, &acl->acl_sess_list,
-                               sess_acl_list) {
-               if (sess->sess_tearing_down != 0)
+       list_for_each_entry(sess, &acl->acl_sess_list, sess_acl_list) {
+               if (sess->sess_tearing_down)
                        continue;
 
-               if (!target_get_session(sess))
-                       continue;
-               list_move(&sess->sess_acl_list, &sess_list);
+               list_del_init(&sess->sess_acl_list);
+               spin_unlock_irqrestore(&acl->nacl_sess_lock, flags);
+
+               if (acl->se_tpg->se_tpg_tfo->close_session)
+                       acl->se_tpg->se_tpg_tfo->close_session(sess);
+               goto restart;
        }
        spin_unlock_irqrestore(&acl->nacl_sess_lock, flags);
+}
 
-       list_for_each_entry_safe(sess, sess_tmp, &sess_list, sess_acl_list) {
-               list_del(&sess->sess_acl_list);
+void core_tpg_del_initiator_node_acl(struct se_node_acl *acl)
+{
+       struct se_portal_group *tpg = acl->se_tpg;
+
+       mutex_lock(&tpg->acl_node_mutex);
+       if (acl->dynamic_node_acl)
+               acl->dynamic_node_acl = 0;
+       list_del(&acl->acl_list);
+       mutex_unlock(&tpg->acl_node_mutex);
+
+       target_shutdown_sessions(acl);
 
-               rc = tpg->se_tpg_tfo->shutdown_session(sess);
-               target_put_session(sess);
-               if (!rc)
-                       continue;
-               target_put_session(sess);
-       }
        target_put_nacl(acl);
        /*
         * Wait for last target_put_nacl() to complete in target_complete_nacl()
@@ -400,11 +395,7 @@ int core_tpg_set_initiator_node_queue_depth(
        struct se_node_acl *acl,
        u32 queue_depth)
 {
-       LIST_HEAD(sess_list);
        struct se_portal_group *tpg = acl->se_tpg;
-       struct se_session *sess, *sess_tmp;
-       unsigned long flags;
-       int rc;
 
        /*
         * User has requested to change the queue depth for a Initiator Node.
@@ -413,30 +404,10 @@ int core_tpg_set_initiator_node_queue_depth(
         */
        target_set_nacl_queue_depth(tpg, acl, queue_depth);
 
-       spin_lock_irqsave(&acl->nacl_sess_lock, flags);
-       list_for_each_entry_safe(sess, sess_tmp, &acl->acl_sess_list,
-                                sess_acl_list) {
-               if (sess->sess_tearing_down != 0)
-                       continue;
-               if (!target_get_session(sess))
-                       continue;
-               spin_unlock_irqrestore(&acl->nacl_sess_lock, flags);
-
-               /*
-                * Finally call tpg->se_tpg_tfo->close_session() to force session
-                * reinstatement to occur if there is an active session for the
-                * $FABRIC_MOD Initiator Node in question.
-                */
-               rc = tpg->se_tpg_tfo->shutdown_session(sess);
-               target_put_session(sess);
-               if (!rc) {
-                       spin_lock_irqsave(&acl->nacl_sess_lock, flags);
-                       continue;
-               }
-               target_put_session(sess);
-               spin_lock_irqsave(&acl->nacl_sess_lock, flags);
-       }
-       spin_unlock_irqrestore(&acl->nacl_sess_lock, flags);
+       /*
+        * Shutdown all pending sessions to force session reinstatement.
+        */
+       target_shutdown_sessions(acl);
 
        pr_debug("Successfully changed queue depth to: %d for Initiator"
                " Node: %s on %s Target Portal Group: %u\n", acl->queue_depth,
index 590384a..5ab3967 100644 (file)
@@ -239,7 +239,6 @@ struct se_session *transport_init_session(enum target_prot_op sup_prot_ops)
        INIT_LIST_HEAD(&se_sess->sess_cmd_list);
        INIT_LIST_HEAD(&se_sess->sess_wait_list);
        spin_lock_init(&se_sess->sess_cmd_lock);
-       kref_init(&se_sess->sess_kref);
        se_sess->sup_prot_ops = sup_prot_ops;
 
        return se_sess;
@@ -430,27 +429,6 @@ target_alloc_session(struct se_portal_group *tpg,
 }
 EXPORT_SYMBOL(target_alloc_session);
 
-static void target_release_session(struct kref *kref)
-{
-       struct se_session *se_sess = container_of(kref,
-                       struct se_session, sess_kref);
-       struct se_portal_group *se_tpg = se_sess->se_tpg;
-
-       se_tpg->se_tpg_tfo->close_session(se_sess);
-}
-
-int target_get_session(struct se_session *se_sess)
-{
-       return kref_get_unless_zero(&se_sess->sess_kref);
-}
-EXPORT_SYMBOL(target_get_session);
-
-void target_put_session(struct se_session *se_sess)
-{
-       kref_put(&se_sess->sess_kref, target_release_session);
-}
-EXPORT_SYMBOL(target_put_session);
-
 ssize_t target_show_dynamic_sessions(struct se_portal_group *se_tpg, char *page)
 {
        struct se_session *se_sess;
@@ -499,8 +477,8 @@ void transport_deregister_session_configfs(struct se_session *se_sess)
        se_nacl = se_sess->se_node_acl;
        if (se_nacl) {
                spin_lock_irqsave(&se_nacl->nacl_sess_lock, flags);
-               if (se_nacl->acl_stop == 0)
-                       list_del(&se_sess->sess_acl_list);
+               if (!list_empty(&se_sess->sess_acl_list))
+                       list_del_init(&se_sess->sess_acl_list);
                /*
                 * If the session list is empty, then clear the pointer.
                 * Otherwise, set the struct se_session pointer from the tail
index c30003b..e28209b 100644 (file)
@@ -139,7 +139,6 @@ extern unsigned int ft_debug_logging;
  * Session ops.
  */
 void ft_sess_put(struct ft_sess *);
-int ft_sess_shutdown(struct se_session *);
 void ft_sess_close(struct se_session *);
 u32 ft_sess_get_index(struct se_session *);
 u32 ft_sess_get_port_name(struct se_session *, unsigned char *, u32);
index 4d375e9..42ee911 100644 (file)
@@ -442,7 +442,6 @@ static const struct target_core_fabric_ops ft_fabric_ops = {
        .tpg_get_inst_index =           ft_tpg_get_inst_index,
        .check_stop_free =              ft_check_stop_free,
        .release_cmd =                  ft_release_cmd,
-       .shutdown_session =             ft_sess_shutdown,
        .close_session =                ft_sess_close,
        .sess_get_index =               ft_sess_get_index,
        .sess_get_initiator_sid =       NULL,
index d0c3e18..f5186a7 100644 (file)
@@ -302,18 +302,6 @@ static void ft_sess_delete_all(struct ft_tport *tport)
  * TCM ops for sessions.
  */
 
-/*
- * Determine whether session is allowed to be shutdown in the current context.
- * Returns non-zero if the session should be shutdown.
- */
-int ft_sess_shutdown(struct se_session *se_sess)
-{
-       struct ft_sess *sess = se_sess->fabric_sess_ptr;
-
-       pr_debug("port_id %x\n", sess->port_id);
-       return 1;
-}
-
 /*
  * Remove session and send PRLO.
  * This is called when the ACL is being deleted or queue depth is changing.
index d89d60c..2d702ca 100644 (file)
@@ -260,16 +260,6 @@ config ARMADA_THERMAL
          Enable this option if you want to have support for thermal management
          controller present in Armada 370 and Armada XP SoC.
 
-config TEGRA_SOCTHERM
-       tristate "Tegra SOCTHERM thermal management"
-       depends on ARCH_TEGRA
-       help
-         Enable this option for integrated thermal management support on NVIDIA
-         Tegra124 systems-on-chip. The driver supports four thermal zones
-         (CPU, GPU, MEM, PLLX). Cooling devices can be bound to the thermal
-         zones to manage temperatures. This option is also required for the
-         emergency thermal reset (thermtrip) feature to function.
-
 config DB8500_CPUFREQ_COOLING
        tristate "DB8500 cpufreq cooling"
        depends on ARCH_U8500 || COMPILE_TEST
@@ -377,6 +367,17 @@ depends on ARCH_STI && OF
 source "drivers/thermal/st/Kconfig"
 endmenu
 
+config TANGO_THERMAL
+       tristate "Tango thermal management"
+       depends on ARCH_TANGO || COMPILE_TEST
+       help
+         Enable the Tango thermal driver, which supports the primitive
+         temperature sensor embedded in Tango chips since the SMP8758.
+         This sensor only generates a 1-bit signal to indicate whether
+         the die temperature exceeds a programmable threshold.
+
+source "drivers/thermal/tegra/Kconfig"
+
 config QCOM_SPMI_TEMP_ALARM
        tristate "Qualcomm SPMI PMIC Temperature Alarm"
        depends on OF && SPMI && IIO
@@ -388,4 +389,14 @@ config QCOM_SPMI_TEMP_ALARM
          real time die temperature if an ADC is present or an estimate of the
          temperature based upon the over temperature stage value.
 
+config GENERIC_ADC_THERMAL
+       tristate "Generic ADC based thermal sensor"
+       depends on IIO
+       help
+         This enables a thermal sysfs driver for a temperature sensor
+         connected to a general-purpose ADC. The ADC channel is read via
+         the IIO framework and the channel information is provided to
+         this driver. The driver reports the temperature by reading the
+         ADC channel and converting it to a temperature using a lookup
+         table.
+
 endif
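The GENERIC_ADC_THERMAL help text above describes the usual IIO-plus-lookup-table arrangement: the driver's temperature read path fetches the ADC code through the IIO consumer interface and maps it to millidegrees. A hedged sketch of that mechanism; the function and parameter names are illustrative, not the driver's actual symbols:

#include <linux/iio/consumer.h>
#include <linux/kernel.h>

static int gadc_read_temp_sketch(struct iio_channel *chan,
                                 const int *lookup_mdegc, int n_entries,
                                 int *temp)
{
        int val, ret;

        ret = iio_read_channel_processed(chan, &val);
        if (ret < 0)
                ret = iio_read_channel_raw(chan, &val);
        if (ret < 0)
                return ret;

        /* lookup_mdegc[] maps ADC code -> millidegrees Celsius */
        *temp = lookup_mdegc[clamp(val, 0, n_entries - 1)];
        return 0;
}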
index 8e9cbc3..10b07c1 100644 (file)
@@ -35,6 +35,7 @@ obj-y                         += samsung/
 obj-$(CONFIG_DOVE_THERMAL)     += dove_thermal.o
 obj-$(CONFIG_DB8500_THERMAL)   += db8500_thermal.o
 obj-$(CONFIG_ARMADA_THERMAL)   += armada_thermal.o
+obj-$(CONFIG_TANGO_THERMAL)    += tango_thermal.o
 obj-$(CONFIG_IMX_THERMAL)      += imx_thermal.o
 obj-$(CONFIG_DB8500_CPUFREQ_COOLING)   += db8500_cpufreq_cooling.o
 obj-$(CONFIG_INTEL_POWERCLAMP) += intel_powerclamp.o
@@ -46,6 +47,7 @@ obj-$(CONFIG_TI_SOC_THERMAL)  += ti-soc-thermal/
 obj-$(CONFIG_INT340X_THERMAL)  += int340x_thermal/
 obj-$(CONFIG_INTEL_PCH_THERMAL)        += intel_pch_thermal.o
 obj-$(CONFIG_ST_THERMAL)       += st/
-obj-$(CONFIG_TEGRA_SOCTHERM)   += tegra_soctherm.o
+obj-$(CONFIG_TEGRA_SOCTHERM)   += tegra/
 obj-$(CONFIG_HISI_THERMAL)     += hisi_thermal.o
 obj-$(CONFIG_MTK_THERMAL)      += mtk_thermal.o
+obj-$(CONFIG_GENERIC_ADC_THERMAL)      += thermal-generic-adc.o
index 70836c5..fc52016 100644 (file)
@@ -29,7 +29,13 @@ static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip)
        struct thermal_instance *instance;
 
        tz->ops->get_trip_temp(tz, trip, &trip_temp);
-       tz->ops->get_trip_hyst(tz, trip, &trip_hyst);
+
+       if (!tz->ops->get_trip_hyst) {
+               pr_warn_once("Undefined get_trip_hyst for thermal zone %s - "
+                               "running with default hysteresis zero\n", tz->type);
+               trip_hyst = 0;
+       } else
+               tz->ops->get_trip_hyst(tz, trip, &trip_hyst);
 
        dev_dbg(&tz->device, "Trip%d[temp=%d]:temp=%d:hyst=%d\n",
                                trip, trip_temp, tz->temperature,
index 5e820b5..97fad8f 100644 (file)
@@ -160,7 +160,7 @@ static int hisi_thermal_get_temp(void *_sensor, int *temp)
        struct hisi_thermal_sensor *sensor = _sensor;
        struct hisi_thermal_data *data = sensor->thermal;
 
-       int sensor_id = 0, i;
+       int sensor_id = -1, i;
        long max_temp = 0;
 
        *temp = hisi_thermal_get_sensor_temp(data, sensor);
@@ -168,12 +168,19 @@ static int hisi_thermal_get_temp(void *_sensor, int *temp)
        sensor->sensor_temp = *temp;
 
        for (i = 0; i < HISI_MAX_SENSORS; i++) {
+               if (!data->sensors[i].tzd)
+                       continue;
+
                if (data->sensors[i].sensor_temp >= max_temp) {
                        max_temp = data->sensors[i].sensor_temp;
                        sensor_id = i;
                }
        }
 
+       /* If no sensor has been enabled, skip enabling the irq */
+       if (sensor_id == -1)
+               return 0;
+
        mutex_lock(&data->thermal_lock);
        data->irq_bind_sensor = sensor_id;
        mutex_unlock(&data->thermal_lock);
@@ -226,8 +233,12 @@ static irqreturn_t hisi_thermal_alarm_irq_thread(int irq, void *dev)
                 sensor->thres_temp / 1000);
        mutex_unlock(&data->thermal_lock);
 
-       for (i = 0; i < HISI_MAX_SENSORS; i++)
+       for (i = 0; i < HISI_MAX_SENSORS; i++) {
+               if (!data->sensors[i].tzd)
+                       continue;
+
                thermal_zone_device_update(data->sensors[i].tzd);
+       }
 
        return IRQ_HANDLED;
 }
@@ -243,10 +254,11 @@ static int hisi_thermal_register_sensor(struct platform_device *pdev,
        sensor->id = index;
        sensor->thermal = data;
 
-       sensor->tzd = thermal_zone_of_sensor_register(&pdev->dev, sensor->id,
-                               sensor, &hisi_of_thermal_ops);
+       sensor->tzd = devm_thermal_zone_of_sensor_register(&pdev->dev,
+                               sensor->id, sensor, &hisi_of_thermal_ops);
        if (IS_ERR(sensor->tzd)) {
                ret = PTR_ERR(sensor->tzd);
+               sensor->tzd = NULL;
                dev_err(&pdev->dev, "failed to register sensor id %d: %d\n",
                        sensor->id, ret);
                return ret;
@@ -331,28 +343,21 @@ static int hisi_thermal_probe(struct platform_device *pdev)
                return ret;
        }
 
+       hisi_thermal_enable_bind_irq_sensor(data);
+       irq_get_irqchip_state(data->irq, IRQCHIP_STATE_MASKED,
+                             &data->irq_enabled);
+
        for (i = 0; i < HISI_MAX_SENSORS; ++i) {
                ret = hisi_thermal_register_sensor(pdev, data,
                                                   &data->sensors[i], i);
-               if (ret) {
+               if (ret)
                        dev_err(&pdev->dev,
                                "failed to register thermal sensor: %d\n", ret);
-                       goto err_get_sensor_data;
-               }
+               else
+                       hisi_thermal_toggle_sensor(&data->sensors[i], true);
        }
 
-       hisi_thermal_enable_bind_irq_sensor(data);
-       data->irq_enabled = true;
-
-       for (i = 0; i < HISI_MAX_SENSORS; i++)
-               hisi_thermal_toggle_sensor(&data->sensors[i], true);
-
        return 0;
-
-err_get_sensor_data:
-       clk_disable_unprepare(data->clk);
-
-       return ret;
 }
 
 static int hisi_thermal_remove(struct platform_device *pdev)
@@ -363,8 +368,10 @@ static int hisi_thermal_remove(struct platform_device *pdev)
        for (i = 0; i < HISI_MAX_SENSORS; i++) {
                struct hisi_thermal_sensor *sensor = &data->sensors[i];
 
+               if (!sensor->tzd)
+                       continue;
+
                hisi_thermal_toggle_sensor(sensor, false);
-               thermal_zone_of_sensor_unregister(&pdev->dev, sensor->tzd);
        }
 
        hisi_thermal_disable_sensor(data);
index 13d431c..a578cd2 100644 (file)
@@ -177,7 +177,7 @@ static int int3406_thermal_probe(struct platform_device *pdev)
                return -ENODEV;
        d->raw_bd = bd;
 
-       ret = acpi_video_get_levels(ACPI_COMPANION(&pdev->dev), &d->br);
+       ret = acpi_video_get_levels(ACPI_COMPANION(&pdev->dev), &d->br, NULL);
        if (ret)
                return ret;
 
index 36fa724..42c1ac0 100644 (file)
@@ -198,49 +198,33 @@ static struct thermal_zone_device_ops proc_thermal_local_ops = {
        .get_temp       = proc_thermal_get_zone_temp,
 };
 
-static int proc_thermal_add(struct device *dev,
-                           struct proc_thermal_device **priv)
+static int proc_thermal_read_ppcc(struct proc_thermal_device *proc_priv)
 {
-       struct proc_thermal_device *proc_priv;
-       struct acpi_device *adev;
+       int i;
        acpi_status status;
        struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL };
        union acpi_object *elements, *ppcc;
        union acpi_object *p;
-       unsigned long long tmp;
-       struct thermal_zone_device_ops *ops = NULL;
-       int i;
-       int ret;
-
-       adev = ACPI_COMPANION(dev);
-       if (!adev)
-               return -ENODEV;
+       int ret = 0;
 
-       status = acpi_evaluate_object(adev->handle, "PPCC", NULL, &buf);
+       status = acpi_evaluate_object(proc_priv->adev->handle, "PPCC",
+                                     NULL, &buf);
        if (ACPI_FAILURE(status))
                return -ENODEV;
 
        p = buf.pointer;
        if (!p || (p->type != ACPI_TYPE_PACKAGE)) {
-               dev_err(dev, "Invalid PPCC data\n");
+               dev_err(proc_priv->dev, "Invalid PPCC data\n");
                ret = -EFAULT;
                goto free_buffer;
        }
+
        if (!p->package.count) {
-               dev_err(dev, "Invalid PPCC package size\n");
+               dev_err(proc_priv->dev, "Invalid PPCC package size\n");
                ret = -EFAULT;
                goto free_buffer;
        }
 
-       proc_priv = devm_kzalloc(dev, sizeof(*proc_priv), GFP_KERNEL);
-       if (!proc_priv) {
-               ret = -ENOMEM;
-               goto free_buffer;
-       }
-
-       proc_priv->dev = dev;
-       proc_priv->adev = adev;
-
        for (i = 0; i < min((int)p->package.count - 1, 2); ++i) {
                elements = &(p->package.elements[i+1]);
                if (elements->type != ACPI_TYPE_PACKAGE ||
@@ -257,12 +241,62 @@ static int proc_thermal_add(struct device *dev,
                proc_priv->power_limits[i].step_uw = ppcc[5].integer.value;
        }
 
+free_buffer:
+       kfree(buf.pointer);
+
+       return ret;
+}
+
+#define PROC_POWER_CAPABILITY_CHANGED  0x83
+static void proc_thermal_notify(acpi_handle handle, u32 event, void *data)
+{
+       struct proc_thermal_device *proc_priv = data;
+
+       if (!proc_priv)
+               return;
+
+       switch (event) {
+       case PROC_POWER_CAPABILITY_CHANGED:
+               proc_thermal_read_ppcc(proc_priv);
+               int340x_thermal_zone_device_update(proc_priv->int340x_zone);
+               break;
+       default:
+               dev_err(proc_priv->dev, "Unsupported event [0x%x]\n", event);
+               break;
+       }
+}
+
+
+static int proc_thermal_add(struct device *dev,
+                           struct proc_thermal_device **priv)
+{
+       struct proc_thermal_device *proc_priv;
+       struct acpi_device *adev;
+       acpi_status status;
+       unsigned long long tmp;
+       struct thermal_zone_device_ops *ops = NULL;
+       int ret;
+
+       adev = ACPI_COMPANION(dev);
+       if (!adev)
+               return -ENODEV;
+
+       proc_priv = devm_kzalloc(dev, sizeof(*proc_priv), GFP_KERNEL);
+       if (!proc_priv)
+               return -ENOMEM;
+
+       proc_priv->dev = dev;
+       proc_priv->adev = adev;
        *priv = proc_priv;
 
-       ret = sysfs_create_group(&dev->kobj,
-                                &power_limit_attribute_group);
+       ret = proc_thermal_read_ppcc(proc_priv);
+       if (!ret) {
+               ret = sysfs_create_group(&dev->kobj,
+                                        &power_limit_attribute_group);
+
+       }
        if (ret)
-               goto free_buffer;
+               return ret;
 
        status = acpi_evaluate_integer(adev->handle, "_TMP", NULL, &tmp);
        if (ACPI_FAILURE(status)) {
@@ -274,20 +308,32 @@ static int proc_thermal_add(struct device *dev,
 
        proc_priv->int340x_zone = int340x_thermal_zone_add(adev, ops);
        if (IS_ERR(proc_priv->int340x_zone)) {
-               sysfs_remove_group(&proc_priv->dev->kobj,
-                          &power_limit_attribute_group);
                ret = PTR_ERR(proc_priv->int340x_zone);
+               goto remove_group;
        } else
                ret = 0;
 
-free_buffer:
-       kfree(buf.pointer);
+       ret = acpi_install_notify_handler(adev->handle, ACPI_DEVICE_NOTIFY,
+                                         proc_thermal_notify,
+                                         (void *)proc_priv);
+       if (ret)
+               goto remove_zone;
+
+       return 0;
+
+remove_zone:
+       int340x_thermal_zone_remove(proc_priv->int340x_zone);
+remove_group:
+       sysfs_remove_group(&proc_priv->dev->kobj,
+                          &power_limit_attribute_group);
 
        return ret;
 }
 
 static void proc_thermal_remove(struct proc_thermal_device *proc_priv)
 {
+       acpi_remove_notify_handler(proc_priv->adev->handle,
+                                  ACPI_DEVICE_NOTIFY, proc_thermal_notify);
        int340x_thermal_zone_remove(proc_priv->int340x_zone);
        sysfs_remove_group(&proc_priv->dev->kobj,
                           &power_limit_attribute_group);
index 6c79588..015ce2e 100644 (file)
@@ -510,12 +510,6 @@ static int start_power_clamp(void)
        unsigned long cpu;
        struct task_struct *thread;
 
-       /* check if pkg cstate counter is completely 0, abort in this case */
-       if (!has_pkg_state_counter()) {
-               pr_err("pkg cstate counter not functional, abort\n");
-               return -EINVAL;
-       }
-
        set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
        /* prevent cpu hotplug */
        get_online_cpus();
@@ -672,35 +666,11 @@ static struct thermal_cooling_device_ops powerclamp_cooling_ops = {
        .set_cur_state = powerclamp_set_cur_state,
 };
 
-/* runs on Nehalem and later */
 static const struct x86_cpu_id intel_powerclamp_ids[] __initconst = {
-       { X86_VENDOR_INTEL, 6, 0x1a},
-       { X86_VENDOR_INTEL, 6, 0x1c},
-       { X86_VENDOR_INTEL, 6, 0x1e},
-       { X86_VENDOR_INTEL, 6, 0x1f},
-       { X86_VENDOR_INTEL, 6, 0x25},
-       { X86_VENDOR_INTEL, 6, 0x26},
-       { X86_VENDOR_INTEL, 6, 0x2a},
-       { X86_VENDOR_INTEL, 6, 0x2c},
-       { X86_VENDOR_INTEL, 6, 0x2d},
-       { X86_VENDOR_INTEL, 6, 0x2e},
-       { X86_VENDOR_INTEL, 6, 0x2f},
-       { X86_VENDOR_INTEL, 6, 0x37},
-       { X86_VENDOR_INTEL, 6, 0x3a},
-       { X86_VENDOR_INTEL, 6, 0x3c},
-       { X86_VENDOR_INTEL, 6, 0x3d},
-       { X86_VENDOR_INTEL, 6, 0x3e},
-       { X86_VENDOR_INTEL, 6, 0x3f},
-       { X86_VENDOR_INTEL, 6, 0x45},
-       { X86_VENDOR_INTEL, 6, 0x46},
-       { X86_VENDOR_INTEL, 6, 0x47},
-       { X86_VENDOR_INTEL, 6, 0x4c},
-       { X86_VENDOR_INTEL, 6, 0x4d},
-       { X86_VENDOR_INTEL, 6, 0x4e},
-       { X86_VENDOR_INTEL, 6, 0x4f},
-       { X86_VENDOR_INTEL, 6, 0x56},
-       { X86_VENDOR_INTEL, 6, 0x57},
-       { X86_VENDOR_INTEL, 6, 0x5e},
+       { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_MWAIT },
+       { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_ARAT },
+       { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_NONSTOP_TSC },
+       { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_CONSTANT_TSC},
        {}
 };
 MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);
@@ -712,11 +682,12 @@ static int __init powerclamp_probe(void)
                                boot_cpu_data.x86, boot_cpu_data.x86_model);
                return -ENODEV;
        }
-       if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ||
-               !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ||
-               !boot_cpu_has(X86_FEATURE_MWAIT) ||
-               !boot_cpu_has(X86_FEATURE_ARAT))
+
+       /* The goal for idle time alignment is to achieve package cstate. */
+       if (!has_pkg_state_counter()) {
+               pr_info("No package C-state available");
                return -ENODEV;
+       }
 
        /* find the deepest mwait value */
        find_target_mwait();
index 507632b..262ab0a 100644 (file)
@@ -144,7 +144,6 @@ struct mtk_thermal {
        s32 o_slope;
        s32 vts[MT8173_NUM_SENSORS];
 
-       struct thermal_zone_device *tzd;
 };
 
 struct mtk_thermal_bank_cfg {
@@ -572,16 +571,11 @@ static int mtk_thermal_probe(struct platform_device *pdev)
 
        platform_set_drvdata(pdev, mt);
 
-       mt->tzd = thermal_zone_of_sensor_register(&pdev->dev, 0, mt,
-                               &mtk_thermal_ops);
-       if (IS_ERR(mt->tzd))
-               goto err_register;
+       devm_thermal_zone_of_sensor_register(&pdev->dev, 0, mt,
+                                            &mtk_thermal_ops);
 
        return 0;
 
-err_register:
-       clk_disable_unprepare(mt->clk_peri_therm);
-
 err_disable_clk_auxadc:
        clk_disable_unprepare(mt->clk_auxadc);
 
@@ -592,8 +586,6 @@ static int mtk_thermal_remove(struct platform_device *pdev)
 {
        struct mtk_thermal *mt = platform_get_drvdata(pdev);
 
-       thermal_zone_of_sensor_unregister(&pdev->dev, mt->tzd);
-
        clk_disable_unprepare(mt->clk_peri_therm);
        clk_disable_unprepare(mt->clk_auxadc);
 
index d8ec44b..b8e509c 100644 (file)
@@ -331,6 +331,14 @@ static int of_thermal_set_trip_temp(struct thermal_zone_device *tz, int trip,
        if (trip >= data->ntrips || trip < 0)
                return -EDOM;
 
+       if (data->ops->set_trip_temp) {
+               int ret;
+
+               ret = data->ops->set_trip_temp(data->sensor_data, trip, temp);
+               if (ret)
+                       return ret;
+       }
+
        /* thermal framework should take care of data->mask & (1 << trip) */
        data->trips[trip].temperature = temp;
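The added pass-through lets an of-thermal sensor driver react when a trip temperature is changed, typically by reprogramming a hardware threshold, before the framework caches the new value. A sketch of such a ->set_trip_temp() callback; every foo_ name and register below is hypothetical:

static int foo_set_trip_temp(void *data, int trip, int temp)
{
        struct foo_sensor *s = data;

        if (trip != FOO_HW_TRIP_INDEX)          /* only one hardware trip */
                return -EINVAL;

        /* program the alarm threshold; temp is in millidegrees Celsius */
        writel(foo_mcelsius_to_code(temp), s->regs + FOO_REG_TRIP_THRESH);
        return 0;
}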
 
@@ -906,7 +914,7 @@ finish:
        return tz;
 
 free_tbps:
-       for (i = 0; i < tz->num_tbps; i++)
+       for (i = i - 1; i >= 0; i--)
                of_node_put(tz->tbps[i].cooling_device);
        kfree(tz->tbps);
 free_trips:
index b677aad..f8a3c60 100644 (file)
@@ -260,7 +260,7 @@ static int qpnp_tm_probe(struct platform_device *pdev)
        if (ret < 0)
                goto fail;
 
-       chip->tz_dev = thermal_zone_of_sensor_register(&pdev->dev, 0, chip,
+       chip->tz_dev = devm_thermal_zone_of_sensor_register(&pdev->dev, 0, chip,
                                                        &qpnp_tm_sensor_ops);
        if (IS_ERR(chip->tz_dev)) {
                dev_err(&pdev->dev, "failed to register sensor\n");
@@ -281,7 +281,6 @@ static int qpnp_tm_remove(struct platform_device *pdev)
 {
        struct qpnp_tm_chip *chip = dev_get_drvdata(&pdev->dev);
 
-       thermal_zone_of_sensor_unregister(&pdev->dev, chip->tz_dev);
        if (!IS_ERR(chip->adc))
                iio_channel_release(chip->adc);
 
index 82daba0..71a3392 100644 (file)
@@ -492,7 +492,7 @@ static int rcar_thermal_probe(struct platform_device *pdev)
                        goto error_unregister;
 
                if (of_data == USE_OF_THERMAL)
-                       priv->zone = thermal_zone_of_sensor_register(
+                       priv->zone = devm_thermal_zone_of_sensor_register(
                                                dev, i, priv,
                                                &rcar_thermal_zone_of_ops);
                else
index 233a564..5d491f1 100644 (file)
@@ -1,7 +1,5 @@
 /*
- * Copyright (c) 2014, Fuzhou Rockchip Electronics Co., Ltd
- *
- * Copyright (c) 2015, Fuzhou Rockchip Electronics Co., Ltd
+ * Copyright (c) 2014-2016, Fuzhou Rockchip Electronics Co., Ltd
  * Caesar Wang <wxt@rock-chips.com>
  *
  * This program is free software; you can redistribute it and/or modify it
 #include <linux/of_address.h>
 #include <linux/of_irq.h>
 #include <linux/platform_device.h>
+#include <linux/regmap.h>
 #include <linux/reset.h>
 #include <linux/thermal.h>
+#include <linux/mfd/syscon.h>
 #include <linux/pinctrl/consumer.h>
 
 /**
@@ -73,7 +73,7 @@ enum adc_sort_mode {
 #define SOC_MAX_SENSORS        2
 
 /**
- * struct chip_tsadc_table: hold information about chip-specific differences
+ * struct chip_tsadc_table - hold information about chip-specific differences
  * @id: conversion table
  * @length: size of conversion table
  * @data_mask: mask to apply on data inputs
@@ -86,6 +86,20 @@ struct chip_tsadc_table {
        enum adc_sort_mode mode;
 };
 
+/**
+ * struct rockchip_tsadc_chip - hold the private data of tsadc chip
+ * @chn_id[SOC_MAX_SENSORS]: the sensor id of the chip corresponding to the channel
+ * @chn_num: the channel number of tsadc chip
+ * @tshut_temp: the hardware-controlled shutdown temperature value
+ * @tshut_mode: the hardware-controlled shutdown mode (0:CRU 1:GPIO)
+ * @tshut_polarity: the hardware-controlled active polarity (0:LOW 1:HIGH)
+ * @initialize: SoC-specific tsadc controller initialization method
+ * @irq_ack: clear the interrupt
+ * @get_temp: get the temperature
+ * @set_tshut_temp: set the hardware-controlled shutdown temperature
+ * @set_tshut_mode: set the hardware-controlled shutdown mode
+ * @table: the chip-specific conversion table
+ */
 struct rockchip_tsadc_chip {
        /* The sensor id of chip correspond to the ADC channel */
        int chn_id[SOC_MAX_SENSORS];
@@ -97,7 +111,8 @@ struct rockchip_tsadc_chip {
        enum tshut_polarity tshut_polarity;
 
        /* Chip-wide methods */
-       void (*initialize)(void __iomem *reg, enum tshut_polarity p);
+       void (*initialize)(struct regmap *grf,
+                          void __iomem *reg, enum tshut_polarity p);
        void (*irq_ack)(void __iomem *reg);
        void (*control)(void __iomem *reg, bool on);
 
@@ -112,12 +127,32 @@ struct rockchip_tsadc_chip {
        struct chip_tsadc_table table;
 };
 
+/**
+ * struct rockchip_thermal_sensor - hold the information of thermal sensor
+ * @thermal:  pointer to the platform/configuration data
+ * @tzd: pointer to a thermal zone
+ * @id: identifier of the thermal sensor
+ */
 struct rockchip_thermal_sensor {
        struct rockchip_thermal_data *thermal;
        struct thermal_zone_device *tzd;
        int id;
 };
 
+/**
+ * struct rockchip_thermal_data - hold the private data of thermal driver
+ * @chip: pointer to the platform/configuration data
+ * @pdev: platform device of thermal
+ * @reset: the reset controller of tsadc
+ * @sensors[SOC_MAX_SENSORS]: the thermal sensor
+ * @clk: the controller clock, divided down from the external 24MHz clock
+ * @pclk: the advanced peripherals bus clock
+ * @grf: the general register file, used by software for static configuration
+ * @regs: the base address of tsadc controller
+ * @tshut_temp: the hardware-controlled shutdown temperature value
+ * @tshut_mode: the hardware-controlled shutdown mode (0:CRU 1:GPIO)
+ * @tshut_polarity: the hardware-controlled active polarity (0:LOW 1:HIGH)
+ */
 struct rockchip_thermal_data {
        const struct rockchip_tsadc_chip *chip;
        struct platform_device *pdev;
@@ -128,6 +163,7 @@ struct rockchip_thermal_data {
        struct clk *clk;
        struct clk *pclk;
 
+       struct regmap *grf;
        void __iomem *regs;
 
        int tshut_temp;
@@ -142,6 +178,7 @@ struct rockchip_thermal_data {
  * TSADCV3_* are used for newer SoCs than RK3288. (e.g: RK3228, RK3399)
  *
  */
+#define TSADCV2_USER_CON                       0x00
 #define TSADCV2_AUTO_CON                       0x04
 #define TSADCV2_INT_EN                         0x08
 #define TSADCV2_INT_PD                         0x0c
@@ -155,12 +192,7 @@ struct rockchip_thermal_data {
 #define TSADCV2_AUTO_EN                                BIT(0)
 #define TSADCV2_AUTO_SRC_EN(chn)               BIT(4 + (chn))
 #define TSADCV2_AUTO_TSHUT_POLARITY_HIGH       BIT(8)
-/**
- * TSADCV1_AUTO_Q_SEL_EN:
- * whether select (1024 - tsadc_q) as output
- * 1'b0:use tsadc_q as output(temperature-code is rising sequence)
- * 1'b1:use(1024 - tsadc_q) as output (temperature-code is falling sequence)
- */
+
 #define TSADCV3_AUTO_Q_SEL_EN                  BIT(1)
 
 #define TSADCV2_INT_SRC_EN(chn)                        BIT(chn)
@@ -177,19 +209,32 @@ struct rockchip_thermal_data {
 #define TSADCV2_HIGHT_TSHUT_DEBOUNCE_COUNT     4
 #define TSADCV2_AUTO_PERIOD_TIME               250 /* msec */
 #define TSADCV2_AUTO_PERIOD_HT_TIME            50  /* msec */
+#define TSADCV2_USER_INTER_PD_SOC              0x340 /* 13 clocks */
 
-struct tsadc_table {
-       u32 code;
-       int temp;
-};
+#define GRF_SARADC_TESTBIT                     0x0e644
+#define GRF_TSADC_TESTBIT_L                    0x0e648
+#define GRF_TSADC_TESTBIT_H                    0x0e64c
+
+#define GRF_TSADC_TSEN_PD_ON                   (0x30003 << 0)
+#define GRF_TSADC_TSEN_PD_OFF                  (0x30000 << 0)
+#define GRF_SARADC_TESTBIT_ON                  (0x10001 << 2)
+#define GRF_TSADC_TESTBIT_H_ON                 (0x10001 << 2)
 
 /**
+ * struct tsadc_table - code to temperature conversion table
+ * @code: the value of adc channel
+ * @temp: the temperature
  * Note:
- * Code to Temperature mapping of the Temperature sensor is a piece wise linear
+ * code to temperature mapping of the temperature sensor is a piece wise linear
  * curve.Any temperature, code faling between to 2 give temperatures can be
  * linearly interpolated.
- * Code to Temperature mapping should be updated based on sillcon results.
+ * Code to Temperature mapping should be updated based on manufacturer results.
  */
+struct tsadc_table {
+       u32 code;
+       int temp;
+};
+
 static const struct tsadc_table rk3228_code_table[] = {
        {0, -40000},
        {588, -40000},
@@ -308,40 +353,40 @@ static const struct tsadc_table rk3368_code_table[] = {
 
 static const struct tsadc_table rk3399_code_table[] = {
        {0, -40000},
-       {593, -40000},
-       {598, -35000},
-       {603, -30000},
-       {609, -25000},
-       {614, -20000},
-       {619, -15000},
-       {625, -10000},
-       {630, -5000},
-       {635, 0},
-       {641, 5000},
-       {646, 10000},
-       {651, 15000},
-       {657, 20000},
-       {662, 25000},
-       {667, 30000},
-       {673, 35000},
-       {678, 40000},
-       {684, 45000},
-       {689, 50000},
-       {694, 55000},
-       {700, 60000},
-       {705, 65000},
-       {711, 70000},
-       {716, 75000},
-       {722, 80000},
-       {727, 85000},
-       {733, 90000},
-       {738, 95000},
-       {743, 100000},
-       {749, 105000},
-       {754, 110000},
-       {760, 115000},
-       {765, 120000},
-       {771, 125000},
+       {402, -40000},
+       {410, -35000},
+       {419, -30000},
+       {427, -25000},
+       {436, -20000},
+       {444, -15000},
+       {453, -10000},
+       {461, -5000},
+       {470, 0},
+       {478, 5000},
+       {487, 10000},
+       {496, 15000},
+       {504, 20000},
+       {513, 25000},
+       {521, 30000},
+       {530, 35000},
+       {538, 40000},
+       {547, 45000},
+       {555, 50000},
+       {564, 55000},
+       {573, 60000},
+       {581, 65000},
+       {590, 70000},
+       {599, 75000},
+       {607, 80000},
+       {616, 85000},
+       {624, 90000},
+       {633, 95000},
+       {642, 100000},
+       {650, 105000},
+       {659, 110000},
+       {668, 115000},
+       {677, 120000},
+       {685, 125000},
        {TSADCV3_DATA_MASK, 125000},
 };
 
@@ -405,8 +450,8 @@ static int rk_tsadcv2_code_to_temp(struct chip_tsadc_table table, u32 code,
                        return -EAGAIN;         /* Incorrect reading */
 
                while (low <= high) {
-                       if (code >= table.id[mid - 1].code &&
-                           code < table.id[mid].code)
+                       if (code <= table.id[mid].code &&
+                           code > table.id[mid - 1].code)
                                break;
                        else if (code > table.id[mid].code)
                                low = mid + 1;
@@ -449,7 +494,7 @@ static int rk_tsadcv2_code_to_temp(struct chip_tsadc_table table, u32 code,
  *     If the temperature is higher than COMP_INT or COMP_SHUT for
  *     "debounce" times, TSADC controller will generate interrupt or TSHUT.
  */
-static void rk_tsadcv2_initialize(void __iomem *regs,
+static void rk_tsadcv2_initialize(struct regmap *grf, void __iomem *regs,
                                  enum tshut_polarity tshut_polarity)
 {
        if (tshut_polarity == TSHUT_HIGH_ACTIVE)
@@ -466,6 +511,62 @@ static void rk_tsadcv2_initialize(void __iomem *regs,
                       regs + TSADCV2_AUTO_PERIOD_HT);
        writel_relaxed(TSADCV2_HIGHT_TSHUT_DEBOUNCE_COUNT,
                       regs + TSADCV2_HIGHT_TSHUT_DEBOUNCE);
+
+       if (IS_ERR(grf)) {
+               pr_warn("%s: Missing rockchip,grf property\n", __func__);
+               return;
+       }
+}
+
+/**
+ * rk_tsadcv3_initialize - initialize the TSADC controller.
+ *
+ * (1) Run the TSADC power-up control sequence.
+ *
+ * (2) Set TSADCV2_AUTO_PERIOD:
+ *     Configure the interval between every two accesses of the
+ *     TSADC in normal operation.
+ *
+ * (3) Set TSADCV2_AUTO_PERIOD_HT:
+ *     Configure the interval between every two accesses of the
+ *     TSADC after the temperature is higher than COMP_SHUT or COMP_INT.
+ *
+ * (4) Set TSADCV2_HIGHT_INT_DEBOUNCE and TSADCV2_HIGHT_TSHUT_DEBOUNCE:
+ *     If the temperature is higher than COMP_INT or COMP_SHUT for
+ *     "debounce" times, the TSADC controller will generate an interrupt or TSHUT.
+ */
+static void rk_tsadcv3_initialize(struct regmap *grf, void __iomem *regs,
+                                 enum tshut_polarity tshut_polarity)
+{
+       /* The tsadc control power sequence */
+       if (IS_ERR(grf)) {
+               /* Set the interleave value to work around an IC time-sync issue */
+               writel_relaxed(TSADCV2_USER_INTER_PD_SOC, regs +
+                              TSADCV2_USER_CON);
+       } else {
+               regmap_write(grf, GRF_TSADC_TESTBIT_L, GRF_TSADC_TSEN_PD_ON);
+               mdelay(10);
+               regmap_write(grf, GRF_TSADC_TESTBIT_L, GRF_TSADC_TSEN_PD_OFF);
+               usleep_range(15, 100); /* The spec note says at least 15 us */
+               regmap_write(grf, GRF_SARADC_TESTBIT, GRF_SARADC_TESTBIT_ON);
+               regmap_write(grf, GRF_TSADC_TESTBIT_H, GRF_TSADC_TESTBIT_H_ON);
+               usleep_range(90, 200); /* The spec note says at least 90 us */
+       }
+
+       if (tshut_polarity == TSHUT_HIGH_ACTIVE)
+               writel_relaxed(0U | TSADCV2_AUTO_TSHUT_POLARITY_HIGH,
+                              regs + TSADCV2_AUTO_CON);
+       else
+               writel_relaxed(0U & ~TSADCV2_AUTO_TSHUT_POLARITY_HIGH,
+                              regs + TSADCV2_AUTO_CON);
+
+       writel_relaxed(TSADCV2_AUTO_PERIOD_TIME, regs + TSADCV2_AUTO_PERIOD);
+       writel_relaxed(TSADCV2_HIGHT_INT_DEBOUNCE_COUNT,
+                      regs + TSADCV2_HIGHT_INT_DEBOUNCE);
+       writel_relaxed(TSADCV2_AUTO_PERIOD_HT_TIME,
+                      regs + TSADCV2_AUTO_PERIOD_HT);
+       writel_relaxed(TSADCV2_HIGHT_TSHUT_DEBOUNCE_COUNT,
+                      regs + TSADCV2_HIGHT_TSHUT_DEBOUNCE);
 }
 
 static void rk_tsadcv2_irq_ack(void __iomem *regs)
@@ -498,10 +599,11 @@ static void rk_tsadcv2_control(void __iomem *regs, bool enable)
 }
 
 /**
- * @rk_tsadcv3_control:
- * TSADC controller works at auto mode, and some SoCs need set the tsadc_q_sel
- * bit on TSADCV2_AUTO_CON[1]. The (1024 - tsadc_q) as output adc value if
- * setting this bit to enable.
+ * rk_tsadcv3_control - enable or disable the TSADC controller.
+ *
+ * NOTE: The TSADC controller works in auto mode, and some SoCs need the
+ * tsadc_q_sel bit set in TSADCV2_AUTO_CON[1]. When this bit is enabled,
+ * (1024 - tsadc_q) is used as the output ADC value.
  */
 static void rk_tsadcv3_control(void __iomem *regs, bool enable)
 {
@@ -603,6 +705,30 @@ static const struct rockchip_tsadc_chip rk3288_tsadc_data = {
        },
 };
 
+static const struct rockchip_tsadc_chip rk3366_tsadc_data = {
+       .chn_id[SENSOR_CPU] = 0, /* cpu sensor is channel 0 */
+       .chn_id[SENSOR_GPU] = 1, /* gpu sensor is channel 1 */
+       .chn_num = 2, /* two channels for tsadc */
+
+       .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO give PMIC */
+       .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */
+       .tshut_temp = 95000,
+
+       .initialize = rk_tsadcv3_initialize,
+       .irq_ack = rk_tsadcv3_irq_ack,
+       .control = rk_tsadcv3_control,
+       .get_temp = rk_tsadcv2_get_temp,
+       .set_tshut_temp = rk_tsadcv2_tshut_temp,
+       .set_tshut_mode = rk_tsadcv2_tshut_mode,
+
+       .table = {
+               .id = rk3228_code_table,
+               .length = ARRAY_SIZE(rk3228_code_table),
+               .data_mask = TSADCV3_DATA_MASK,
+               .mode = ADC_INCREMENT,
+       },
+};
+
 static const struct rockchip_tsadc_chip rk3368_tsadc_data = {
        .chn_id[SENSOR_CPU] = 0, /* cpu sensor is channel 0 */
        .chn_id[SENSOR_GPU] = 1, /* gpu sensor is channel 1 */
@@ -636,7 +762,7 @@ static const struct rockchip_tsadc_chip rk3399_tsadc_data = {
        .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */
        .tshut_temp = 95000,
 
-       .initialize = rk_tsadcv2_initialize,
+       .initialize = rk_tsadcv3_initialize,
        .irq_ack = rk_tsadcv3_irq_ack,
        .control = rk_tsadcv3_control,
        .get_temp = rk_tsadcv2_get_temp,
@@ -660,6 +786,10 @@ static const struct of_device_id of_rockchip_thermal_match[] = {
                .compatible = "rockchip,rk3288-tsadc",
                .data = (void *)&rk3288_tsadc_data,
        },
+       {
+               .compatible = "rockchip,rk3366-tsadc",
+               .data = (void *)&rk3366_tsadc_data,
+       },
        {
                .compatible = "rockchip,rk3368-tsadc",
                .data = (void *)&rk3368_tsadc_data,
@@ -768,6 +898,11 @@ static int rockchip_configure_from_dt(struct device *dev,
                return -EINVAL;
        }
 
+       /*
+        * The tsadc doesn't handle a lookup error here, since some SoCs don't
+        * need this property; the initialize() callbacks check IS_ERR(grf)
+        * instead.
+        */
+       thermal->grf = syscon_regmap_lookup_by_phandle(np, "rockchip,grf");
+
        return 0;
 }
 
@@ -786,8 +921,8 @@ rockchip_thermal_register_sensor(struct platform_device *pdev,
 
        sensor->thermal = thermal;
        sensor->id = id;
-       sensor->tzd = thermal_zone_of_sensor_register(&pdev->dev, id, sensor,
-                                                     &rockchip_of_thermal_ops);
+       sensor->tzd = devm_thermal_zone_of_sensor_register(&pdev->dev, id,
+                                       sensor, &rockchip_of_thermal_ops);
        if (IS_ERR(sensor->tzd)) {
                error = PTR_ERR(sensor->tzd);
                dev_err(&pdev->dev, "failed to register sensor %d: %d\n",
@@ -815,7 +950,7 @@ static int rockchip_thermal_probe(struct platform_device *pdev)
        const struct of_device_id *match;
        struct resource *res;
        int irq;
-       int i, j;
+       int i;
        int error;
 
        match = of_match_node(of_rockchip_thermal_match, np);
@@ -888,7 +1023,8 @@ static int rockchip_thermal_probe(struct platform_device *pdev)
                goto err_disable_pclk;
        }
 
-       thermal->chip->initialize(thermal->regs, thermal->tshut_polarity);
+       thermal->chip->initialize(thermal->grf, thermal->regs,
+                                 thermal->tshut_polarity);
 
        for (i = 0; i < thermal->chip->chn_num; i++) {
                error = rockchip_thermal_register_sensor(pdev, thermal,
@@ -898,9 +1034,6 @@ static int rockchip_thermal_probe(struct platform_device *pdev)
                        dev_err(&pdev->dev,
                                "failed to register sensor[%d] : error = %d\n",
                                i, error);
-                       for (j = 0; j < i; j++)
-                               thermal_zone_of_sensor_unregister(&pdev->dev,
-                                               thermal->sensors[j].tzd);
                        goto err_disable_pclk;
                }
        }
@@ -912,7 +1045,7 @@ static int rockchip_thermal_probe(struct platform_device *pdev)
        if (error) {
                dev_err(&pdev->dev,
                        "failed to request tsadc irq: %d\n", error);
-               goto err_unregister_sensor;
+               goto err_disable_pclk;
        }
 
        thermal->chip->control(thermal->regs, true);
@@ -924,11 +1057,6 @@ static int rockchip_thermal_probe(struct platform_device *pdev)
 
        return 0;
 
-err_unregister_sensor:
-       while (i--)
-               thermal_zone_of_sensor_unregister(&pdev->dev,
-                                                 thermal->sensors[i].tzd);
-
 err_disable_pclk:
        clk_disable_unprepare(thermal->pclk);
 err_disable_clk:
@@ -946,7 +1074,6 @@ static int rockchip_thermal_remove(struct platform_device *pdev)
                struct rockchip_thermal_sensor *sensor = &thermal->sensors[i];
 
                rockchip_thermal_toggle_sensor(sensor, false);
-               thermal_zone_of_sensor_unregister(&pdev->dev, sensor->tzd);
        }
 
        thermal->chip->control(thermal->regs, false);
@@ -988,12 +1115,15 @@ static int __maybe_unused rockchip_thermal_resume(struct device *dev)
                return error;
 
        error = clk_enable(thermal->pclk);
-       if (error)
+       if (error) {
+               clk_disable(thermal->clk);
                return error;
+       }
 
        rockchip_thermal_reset_controller(thermal->reset);
 
-       thermal->chip->initialize(thermal->regs, thermal->tshut_polarity);
+       thermal->chip->initialize(thermal->grf, thermal->regs,
+                                 thermal->tshut_polarity);
 
        for (i = 0; i < thermal->chip->chn_num; i++) {
                int id = thermal->sensors[i].id;
diff --git a/drivers/thermal/tango_thermal.c b/drivers/thermal/tango_thermal.c
new file mode 100644 (file)
index 0000000..70e0d9f
--- /dev/null
@@ -0,0 +1,109 @@
+#include <linux/io.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/thermal.h>
+#include <linux/platform_device.h>
+
+/*
+ * According to a data sheet draft, "this temperature sensor uses a bandgap
+ * type of circuit to compare a voltage which has a negative temperature
+ * coefficient with a voltage that is proportional to absolute temperature.
+ * A resistor bank allows 41 different temperature thresholds to be selected
+ * and the logic output will then indicate whether the actual die temperature
+ * lies above or below the selected threshold."
+ */
+
+#define TEMPSI_CMD     0
+#define TEMPSI_RES     4
+#define TEMPSI_CFG     8
+
+#define CMD_OFF                0
+#define CMD_ON         1
+#define CMD_READ       2
+
+#define IDX_MIN                15
+#define IDX_MAX                40
+
+struct tango_thermal_priv {
+       void __iomem *base;
+       int thresh_idx;
+};
+
+static bool temp_above_thresh(void __iomem *base, int thresh_idx)
+{
+       writel(CMD_READ | thresh_idx << 8, base + TEMPSI_CMD);
+       usleep_range(10, 20);
+       writel(CMD_READ | thresh_idx << 8, base + TEMPSI_CMD);
+
+       return readl(base + TEMPSI_RES);
+}
+
+static int tango_get_temp(void *arg, int *res)
+{
+       struct tango_thermal_priv *priv = arg;
+       int idx = priv->thresh_idx;
+
+       if (temp_above_thresh(priv->base, idx)) {
+               /* Search upward by incrementing thresh_idx */
+               while (idx < IDX_MAX && temp_above_thresh(priv->base, ++idx))
+                       cpu_relax();
+               idx = idx - 1; /* always return lower bound */
+       } else {
+               /* Search downward by decrementing thresh_idx */
+               while (idx > IDX_MIN && !temp_above_thresh(priv->base, --idx))
+                       cpu_relax();
+       }
+
+       *res = (idx * 9 / 2 - 38) * 1000; /* millidegrees Celsius */
+       priv->thresh_idx = idx;
+
+       return 0;
+}
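To make the index-to-temperature mapping concrete, the (idx * 9 / 2 - 38) * 1000 formula above (integer arithmetic throughout) yields, for a few thresholds:

    /* idx = IDX_MIN (15): (15 * 9 / 2 - 38) * 1000 = (67 - 38) * 1000  =  29000 mC */
    /* idx = 27:           (27 * 9 / 2 - 38) * 1000 = (121 - 38) * 1000 =  83000 mC */
    /* idx = IDX_MAX (40): (40 * 9 / 2 - 38) * 1000 = (180 - 38) * 1000 = 142000 mC */

so each threshold step corresponds to roughly 4.5 degrees C.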
+
+static const struct thermal_zone_of_device_ops ops = {
+       .get_temp       = tango_get_temp,
+};
+
+static int tango_thermal_probe(struct platform_device *pdev)
+{
+       struct resource *res;
+       struct tango_thermal_priv *priv;
+       struct thermal_zone_device *tzdev;
+
+       priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
+       if (!priv)
+               return -ENOMEM;
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       priv->base = devm_ioremap_resource(&pdev->dev, res);
+       if (IS_ERR(priv->base))
+               return PTR_ERR(priv->base);
+
+       priv->thresh_idx = IDX_MIN;
+       writel(0, priv->base + TEMPSI_CFG);
+       writel(CMD_ON, priv->base + TEMPSI_CMD);
+
+       tzdev = devm_thermal_zone_of_sensor_register(&pdev->dev, 0, priv, &ops);
+       return PTR_ERR_OR_ZERO(tzdev);
+}
+
+static const struct of_device_id tango_sensor_ids[] = {
+       {
+               .compatible = "sigma,smp8758-thermal",
+       },
+       { /* sentinel */ }
+};
+
+static struct platform_driver tango_thermal_driver = {
+       .probe  = tango_thermal_probe,
+       .driver = {
+               .name           = "tango-thermal",
+               .of_match_table = tango_sensor_ids,
+       },
+};
+
+module_platform_driver(tango_thermal_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Sigma Designs");
+MODULE_DESCRIPTION("Tango temperature sensor");
diff --git a/drivers/thermal/tegra/Kconfig b/drivers/thermal/tegra/Kconfig
new file mode 100644 (file)
index 0000000..cec586e
--- /dev/null
@@ -0,0 +1,13 @@
+menu "NVIDIA Tegra thermal drivers"
+depends on ARCH_TEGRA
+
+config TEGRA_SOCTHERM
+       tristate "Tegra SOCTHERM thermal management"
+       help
+         Enable this option for integrated thermal management support on NVIDIA
+         Tegra systems-on-chip. The driver supports four thermal zones
+         (CPU, GPU, MEM, PLLX). Cooling devices can be bound to the thermal
+         zones to manage temperatures. This option is also required for the
+         emergency thermal reset (thermtrip) feature to function.
+
+endmenu
diff --git a/drivers/thermal/tegra/Makefile b/drivers/thermal/tegra/Makefile
new file mode 100644 (file)
index 0000000..1ce1af2
--- /dev/null
@@ -0,0 +1,6 @@
+obj-$(CONFIG_TEGRA_SOCTHERM)   += tegra-soctherm.o
+
+tegra-soctherm-y                               := soctherm.o soctherm-fuse.o
+tegra-soctherm-$(CONFIG_ARCH_TEGRA_124_SOC)    += tegra124-soctherm.o
+tegra-soctherm-$(CONFIG_ARCH_TEGRA_132_SOC)    += tegra132-soctherm.o
+tegra-soctherm-$(CONFIG_ARCH_TEGRA_210_SOC)    += tegra210-soctherm.o
diff --git a/drivers/thermal/tegra/soctherm-fuse.c b/drivers/thermal/tegra/soctherm-fuse.c
new file mode 100644 (file)
index 0000000..2996318
--- /dev/null
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2014-2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <soc/tegra/fuse.h>
+
+#include "soctherm.h"
+
+#define NOMINAL_CALIB_FT                       105
+#define NOMINAL_CALIB_CP                       25
+
+#define FUSE_TSENSOR_CALIB_CP_TS_BASE_MASK     0x1fff
+#define FUSE_TSENSOR_CALIB_FT_TS_BASE_MASK     (0x1fff << 13)
+#define FUSE_TSENSOR_CALIB_FT_TS_BASE_SHIFT    13
+
+#define FUSE_TSENSOR_COMMON                    0x180
+
+/*
+ * Tegra210: Layout of bits in FUSE_TSENSOR_COMMON:
+ *    3                   2                   1                   0
+ *  1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |       BASE_FT       |      BASE_CP      | SHFT_FT | SHIFT_CP  |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * Tegra12x, etc:
+ * In chips prior to Tegra210, this fuse was incorrectly sized as 26 bits,
+ * and didn't hold SHIFT_CP in [31:26]. Therefore these missing six bits
+ * were obtained via the FUSE_SPARE_REALIGNMENT_REG register [5:0].
+ *
+ * FUSE_TSENSOR_COMMON:
+ *    3                   2                   1                   0
+ *  1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |-----------| SHFT_FT |       BASE_FT       |      BASE_CP      |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * FUSE_SPARE_REALIGNMENT_REG:
+ *    3                   2                   1                   0
+ *  1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |---------------------------------------------------| SHIFT_CP  |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+
+#define CALIB_COEFFICIENT 1000000LL
+
+/**
+ * div64_s64_precise() - wrapper for div64_s64()
+ * @a:  the dividend
+ * @b:  the divisor
+ *
+ * Implements division with fairly accurate rounding instead of truncation by
+ * shifting the dividend to the left by 16 so that the quotient has a
+ * much higher precision.
+ *
+ * Return: the quotient of a / b.
+ */
+static s64 div64_s64_precise(s64 a, s32 b)
+{
+       s64 r, al;
+
+       /* Scale up for increased precision division */
+       al = a << 16;
+
+       r = div64_s64(al * 2 + 1, 2 * b);
+       return r >> 16;
+}
+
+int tegra_calc_shared_calib(const struct tegra_soctherm_fuse *tfuse,
+                           struct tsensor_shared_calib *shared)
+{
+       u32 val;
+       s32 shifted_cp, shifted_ft;
+       int err;
+
+       err = tegra_fuse_readl(FUSE_TSENSOR_COMMON, &val);
+       if (err)
+               return err;
+
+       shared->base_cp = (val & tfuse->fuse_base_cp_mask) >>
+                         tfuse->fuse_base_cp_shift;
+       shared->base_ft = (val & tfuse->fuse_base_ft_mask) >>
+                         tfuse->fuse_base_ft_shift;
+
+       shifted_ft = (val & tfuse->fuse_shift_ft_mask) >>
+                    tfuse->fuse_shift_ft_shift;
+       shifted_ft = sign_extend32(shifted_ft, 4);
+
+       if (tfuse->fuse_spare_realignment) {
+               err = tegra_fuse_readl(tfuse->fuse_spare_realignment, &val);
+               if (err)
+                       return err;
+       }
+
+       shifted_cp = sign_extend32(val, 5);
+
+       shared->actual_temp_cp = 2 * NOMINAL_CALIB_CP + shifted_cp;
+       shared->actual_temp_ft = 2 * NOMINAL_CALIB_FT + shifted_ft;
+
+       return 0;
+}
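Note that actual_temp_cp and actual_temp_ft end up in half-degree units (2 * nominal + shifted). As a hedged example of the sign-extension path, assuming a raw 6-bit fuse field of 0x3e:

    shifted_cp = sign_extend32(0x3e, 5);                  /* 0b111110 -> -2               */
    actual_temp_cp = 2 * NOMINAL_CALIB_CP + shifted_cp;   /* 2 * 25 - 2 = 48, i.e. 24.0 C */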
+
+int tegra_calc_tsensor_calib(const struct tegra_tsensor *sensor,
+                            const struct tsensor_shared_calib *shared,
+                            u32 *calibration)
+{
+       const struct tegra_tsensor_group *sensor_group;
+       u32 val, calib;
+       s32 actual_tsensor_ft, actual_tsensor_cp;
+       s32 delta_sens, delta_temp;
+       s32 mult, div;
+       s16 therma, thermb;
+       s64 temp;
+       int err;
+
+       sensor_group = sensor->group;
+
+       err = tegra_fuse_readl(sensor->calib_fuse_offset, &val);
+       if (err)
+               return err;
+
+       actual_tsensor_cp = (shared->base_cp * 64) + sign_extend32(val, 12);
+       val = (val & FUSE_TSENSOR_CALIB_FT_TS_BASE_MASK) >>
+             FUSE_TSENSOR_CALIB_FT_TS_BASE_SHIFT;
+       actual_tsensor_ft = (shared->base_ft * 32) + sign_extend32(val, 12);
+
+       delta_sens = actual_tsensor_ft - actual_tsensor_cp;
+       delta_temp = shared->actual_temp_ft - shared->actual_temp_cp;
+
+       mult = sensor_group->pdiv * sensor->config->tsample_ate;
+       div = sensor->config->tsample * sensor_group->pdiv_ate;
+
+       temp = (s64)delta_temp * (1LL << 13) * mult;
+       therma = div64_s64_precise(temp, (s64)delta_sens * div);
+
+       temp = ((s64)actual_tsensor_ft * shared->actual_temp_cp) -
+               ((s64)actual_tsensor_cp * shared->actual_temp_ft);
+       thermb = div64_s64_precise(temp, delta_sens);
+
+       temp = (s64)therma * sensor->fuse_corr_alpha;
+       therma = div64_s64_precise(temp, CALIB_COEFFICIENT);
+
+       temp = (s64)thermb * sensor->fuse_corr_alpha + sensor->fuse_corr_beta;
+       thermb = div64_s64_precise(temp, CALIB_COEFFICIENT);
+
+       calib = ((u16)therma << SENSOR_CONFIG2_THERMA_SHIFT) |
+               ((u16)thermb << SENSOR_CONFIG2_THERMB_SHIFT);
+
+       *calibration = calib;
+
+       return 0;
+}
+
+MODULE_AUTHOR("Wei Ni <wni@nvidia.com>");
+MODULE_DESCRIPTION("Tegra SOCTHERM fuse management");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/thermal/tegra/soctherm.c b/drivers/thermal/tegra/soctherm.c
new file mode 100644 (file)
index 0000000..b865172
--- /dev/null
@@ -0,0 +1,685 @@
+/*
+ * Copyright (c) 2014, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Author:
+ *     Mikko Perttunen <mperttunen@nvidia.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/debugfs.h>
+#include <linux/bitops.h>
+#include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/reset.h>
+#include <linux/thermal.h>
+
+#include <dt-bindings/thermal/tegra124-soctherm.h>
+
+#include "soctherm.h"
+
+#define SENSOR_CONFIG0                         0
+#define SENSOR_CONFIG0_STOP                    BIT(0)
+#define SENSOR_CONFIG0_CPTR_OVER               BIT(2)
+#define SENSOR_CONFIG0_OVER                    BIT(3)
+#define SENSOR_CONFIG0_TCALC_OVER              BIT(4)
+#define SENSOR_CONFIG0_TALL_MASK               (0xfffff << 8)
+#define SENSOR_CONFIG0_TALL_SHIFT              8
+
+#define SENSOR_CONFIG1                         4
+#define SENSOR_CONFIG1_TSAMPLE_MASK            0x3ff
+#define SENSOR_CONFIG1_TSAMPLE_SHIFT           0
+#define SENSOR_CONFIG1_TIDDQ_EN_MASK           (0x3f << 15)
+#define SENSOR_CONFIG1_TIDDQ_EN_SHIFT          15
+#define SENSOR_CONFIG1_TEN_COUNT_MASK          (0x3f << 24)
+#define SENSOR_CONFIG1_TEN_COUNT_SHIFT         24
+#define SENSOR_CONFIG1_TEMP_ENABLE             BIT(31)
+
+/*
+ * SENSOR_CONFIG2 is defined in soctherm.h
+ * because it is used by soctherm-fuse.c
+ */
+
+#define SENSOR_STATUS0                         0xc
+#define SENSOR_STATUS0_VALID_MASK              BIT(31)
+#define SENSOR_STATUS0_CAPTURE_MASK            0xffff
+
+#define SENSOR_STATUS1                         0x10
+#define SENSOR_STATUS1_TEMP_VALID_MASK         BIT(31)
+#define SENSOR_STATUS1_TEMP_MASK               0xffff
+
+#define READBACK_VALUE_MASK                    0xff00
+#define READBACK_VALUE_SHIFT                   8
+#define READBACK_ADD_HALF                      BIT(7)
+#define READBACK_NEGATE                                BIT(0)
+
+/* get val from register(r) mask bits(m) */
+#define REG_GET_MASK(r, m)     (((r) & (m)) >> (ffs(m) - 1))
+/* set val(v) to mask bits(m) of register(r) */
+#define REG_SET_MASK(r, m, v)  (((r) & ~(m)) | \
+                                (((v) & (m >> (ffs(m) - 1))) << (ffs(m) - 1)))
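As a quick illustration of these helpers with arbitrary values (for a mask of 0xff00, ffs(m) - 1 is 8):

    REG_GET_MASK(0xabcd1234, 0xff00);        /* -> 0x12                        */
    REG_SET_MASK(0xabcd1234, 0xff00, 0x56);  /* -> 0xabcd5634 (field replaced) */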
+
+static const int min_low_temp = -127000;
+static const int max_high_temp = 127000;
+
+struct tegra_thermctl_zone {
+       void __iomem *reg;
+       struct device *dev;
+       struct thermal_zone_device *tz;
+       const struct tegra_tsensor_group *sg;
+};
+
+struct tegra_soctherm {
+       struct reset_control *reset;
+       struct clk *clock_tsensor;
+       struct clk *clock_soctherm;
+       void __iomem *regs;
+       struct thermal_zone_device **thermctl_tzs;
+
+       u32 *calib;
+       struct tegra_soctherm_soc *soc;
+
+       struct dentry *debugfs_dir;
+};
+
+static void enable_tsensor(struct tegra_soctherm *tegra, unsigned int i)
+{
+       const struct tegra_tsensor *sensor = &tegra->soc->tsensors[i];
+       void __iomem *base = tegra->regs + sensor->base;
+       unsigned int val;
+
+       val = sensor->config->tall << SENSOR_CONFIG0_TALL_SHIFT;
+       writel(val, base + SENSOR_CONFIG0);
+
+       val  = (sensor->config->tsample - 1) << SENSOR_CONFIG1_TSAMPLE_SHIFT;
+       val |= sensor->config->tiddq_en << SENSOR_CONFIG1_TIDDQ_EN_SHIFT;
+       val |= sensor->config->ten_count << SENSOR_CONFIG1_TEN_COUNT_SHIFT;
+       val |= SENSOR_CONFIG1_TEMP_ENABLE;
+       writel(val, base + SENSOR_CONFIG1);
+
+       writel(tegra->calib[i], base + SENSOR_CONFIG2);
+}
+
+/*
+ * Translate from soctherm readback format to millicelsius.
+ * The soctherm readback format in bits is as follows:
+ *   TTTTTTTT H______N
+ * where T's contain the temperature in Celsius,
+ * H denotes an addition of 0.5 Celsius and N denotes negation
+ * of the final value.
+ */
+static int translate_temp(u16 val)
+{
+       int t;
+
+       t = ((val & READBACK_VALUE_MASK) >> READBACK_VALUE_SHIFT) * 1000;
+       if (val & READBACK_ADD_HALF)
+               t += 500;
+       if (val & READBACK_NEGATE)
+               t *= -1;
+
+       return t;
+}
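For example, with the readback layout described above:

    translate_temp(0x1900);  /* 25 C, no half, no negate ->  25000 mC */
    translate_temp(0x1980);  /* 25 C + 0.5 C             ->  25500 mC */
    translate_temp(0x1981);  /* negate bit set           -> -25500 mC */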
+
+static int tegra_thermctl_get_temp(void *data, int *out_temp)
+{
+       struct tegra_thermctl_zone *zone = data;
+       u32 val;
+
+       val = readl(zone->reg);
+       val = REG_GET_MASK(val, zone->sg->sensor_temp_mask);
+       *out_temp = translate_temp(val);
+
+       return 0;
+}
+
+static int
+thermtrip_program(struct device *dev, const struct tegra_tsensor_group *sg,
+                 int trip_temp);
+
+static int tegra_thermctl_set_trip_temp(void *data, int trip, int temp)
+{
+       struct tegra_thermctl_zone *zone = data;
+       struct thermal_zone_device *tz = zone->tz;
+       const struct tegra_tsensor_group *sg = zone->sg;
+       struct device *dev = zone->dev;
+       enum thermal_trip_type type;
+       int ret;
+
+       if (!tz)
+               return -EINVAL;
+
+       ret = tz->ops->get_trip_type(tz, trip, &type);
+       if (ret)
+               return ret;
+
+       if (type != THERMAL_TRIP_CRITICAL)
+               return 0;
+
+       return thermtrip_program(dev, sg, temp);
+}
+
+static const struct thermal_zone_of_device_ops tegra_of_thermal_ops = {
+       .get_temp = tegra_thermctl_get_temp,
+       .set_trip_temp = tegra_thermctl_set_trip_temp,
+};
+
+/**
+ * enforce_temp_range() - check and enforce temperature range [min, max]
+ * @dev: ptr to the struct device for the SOC_THERM IP block
+ * @trip_temp: the trip temperature to check
+ *
+ * Checks and enforces the permitted temperature range that the SOC_THERM
+ * HW can support. This is done while taking care of precision.
+ *
+ * Return: The precision adjusted capped temperature in millicelsius.
+ */
+static int enforce_temp_range(struct device *dev, int trip_temp)
+{
+       int temp;
+
+       temp = clamp_val(trip_temp, min_low_temp, max_high_temp);
+       if (temp != trip_temp)
+               dev_info(dev, "soctherm: trip temperature %d forced to %d\n",
+                        trip_temp, temp);
+       return temp;
+}
+
+/**
+ * thermtrip_program() - Configures the hardware to shut down the
+ * system if a given sensor group reaches a given temperature
+ * @dev: ptr to the struct device for the SOC_THERM IP block
+ * @sg: pointer to the sensor group to set the thermtrip temperature for
+ * @trip_temp: the temperature in millicelsius to trigger the thermal trip at
+ *
+ * Sets the thermal trip threshold of the given sensor group to be the
+ * @trip_temp.  If this threshold is crossed, the hardware will shut
+ * down.
+ *
+ * Note that, although @trip_temp is specified in millicelsius, the
+ * hardware is programmed in degrees Celsius.
+ *
+ * Return: 0 upon success, or %-EINVAL upon failure.
+ */
+static int thermtrip_program(struct device *dev,
+                            const struct tegra_tsensor_group *sg,
+                            int trip_temp)
+{
+       struct tegra_soctherm *ts = dev_get_drvdata(dev);
+       int temp;
+       u32 r;
+
+       if (!sg || !sg->thermtrip_threshold_mask)
+               return -EINVAL;
+
+       temp = enforce_temp_range(dev, trip_temp) / ts->soc->thresh_grain;
+
+       r = readl(ts->regs + THERMCTL_THERMTRIP_CTL);
+       r = REG_SET_MASK(r, sg->thermtrip_threshold_mask, temp);
+       r = REG_SET_MASK(r, sg->thermtrip_enable_mask, 1);
+       r = REG_SET_MASK(r, sg->thermtrip_any_en_mask, 0);
+       writel(r, ts->regs + THERMCTL_THERMTRIP_CTL);
+
+       return 0;
+}
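For instance, with Tegra124's thresh_grain of 1000 (see tegra124-soctherm.c below), a critical trip of 101 degrees C would be programmed roughly as:

    temp = enforce_temp_range(dev, 101000) / 1000;   /* -> threshold field of 101 */

with anything outside [-127000, 127000] mC clamped by enforce_temp_range() first.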
+
+/**
+ * tegra_soctherm_set_hwtrips() - set HW trip point from DT data
+ * @dev: struct device * of the SOC_THERM instance
+ * @sg: pointer to the sensor group to program the THERMTRIP threshold for
+ * @tz: thermal zone whose "critical" trip temperature is used
+ *
+ * Configure the SOC_THERM HW trip points, setting the "THERMTRIP" trip
+ * point using the "critical" type trip temperature from the thermal zone.
+ * After they have been configured, THERMTRIP will take action
+ * when the configured SoC thermal sensor group reaches a
+ * certain temperature.
+ *
+ * Return: 0 upon success, or a negative error code on failure.
+ * "Success" does not mean that trips was enabled; it could also
+ * mean that no node was found in DT.
+ * THERMTRIP has been enabled successfully when a message similar to
+ * this one appears on the serial console:
+ * "thermtrip: will shut down when sensor group XXX reaches YYYYYY mC"
+ */
+static int tegra_soctherm_set_hwtrips(struct device *dev,
+                                     const struct tegra_tsensor_group *sg,
+                                     struct thermal_zone_device *tz)
+{
+       int temperature;
+       int ret;
+
+       ret = tz->ops->get_crit_temp(tz, &temperature);
+       if (ret) {
+               dev_warn(dev, "thermtrip: %s: missing critical temperature\n",
+                        sg->name);
+               return ret;
+       }
+
+       ret = thermtrip_program(dev, sg, temperature);
+       if (ret) {
+               dev_err(dev, "thermtrip: %s: error during enable\n",
+                       sg->name);
+               return ret;
+       }
+
+       dev_info(dev,
+                "thermtrip: will shut down when %s reaches %d mC\n",
+                sg->name, temperature);
+
+       return 0;
+}
+
+#ifdef CONFIG_DEBUG_FS
+static int regs_show(struct seq_file *s, void *data)
+{
+       struct platform_device *pdev = s->private;
+       struct tegra_soctherm *ts = platform_get_drvdata(pdev);
+       const struct tegra_tsensor *tsensors = ts->soc->tsensors;
+       const struct tegra_tsensor_group **ttgs = ts->soc->ttgs;
+       u32 r, state;
+       int i;
+
+       seq_puts(s, "-----TSENSE (convert HW)-----\n");
+
+       for (i = 0; i < ts->soc->num_tsensors; i++) {
+               r = readl(ts->regs + tsensors[i].base + SENSOR_CONFIG1);
+               state = REG_GET_MASK(r, SENSOR_CONFIG1_TEMP_ENABLE);
+
+               seq_printf(s, "%s: ", tsensors[i].name);
+               seq_printf(s, "En(%d) ", state);
+
+               if (!state) {
+                       seq_puts(s, "\n");
+                       continue;
+               }
+
+               state = REG_GET_MASK(r, SENSOR_CONFIG1_TIDDQ_EN_MASK);
+               seq_printf(s, "tiddq(%d) ", state);
+               state = REG_GET_MASK(r, SENSOR_CONFIG1_TEN_COUNT_MASK);
+               seq_printf(s, "ten_count(%d) ", state);
+               state = REG_GET_MASK(r, SENSOR_CONFIG1_TSAMPLE_MASK);
+               seq_printf(s, "tsample(%d) ", state + 1);
+
+               r = readl(ts->regs + tsensors[i].base + SENSOR_STATUS1);
+               state = REG_GET_MASK(r, SENSOR_STATUS1_TEMP_VALID_MASK);
+               seq_printf(s, "Temp(%d/", state);
+               state = REG_GET_MASK(r, SENSOR_STATUS1_TEMP_MASK);
+               seq_printf(s, "%d) ", translate_temp(state));
+
+               r = readl(ts->regs + tsensors[i].base + SENSOR_STATUS0);
+               state = REG_GET_MASK(r, SENSOR_STATUS0_VALID_MASK);
+               seq_printf(s, "Capture(%d/", state);
+               state = REG_GET_MASK(r, SENSOR_STATUS0_CAPTURE_MASK);
+               seq_printf(s, "%d) ", state);
+
+               r = readl(ts->regs + tsensors[i].base + SENSOR_CONFIG0);
+               state = REG_GET_MASK(r, SENSOR_CONFIG0_STOP);
+               seq_printf(s, "Stop(%d) ", state);
+               state = REG_GET_MASK(r, SENSOR_CONFIG0_TALL_MASK);
+               seq_printf(s, "Tall(%d) ", state);
+               state = REG_GET_MASK(r, SENSOR_CONFIG0_TCALC_OVER);
+               seq_printf(s, "Over(%d/", state);
+               state = REG_GET_MASK(r, SENSOR_CONFIG0_OVER);
+               seq_printf(s, "%d/", state);
+               state = REG_GET_MASK(r, SENSOR_CONFIG0_CPTR_OVER);
+               seq_printf(s, "%d) ", state);
+
+               r = readl(ts->regs + tsensors[i].base + SENSOR_CONFIG2);
+               state = REG_GET_MASK(r, SENSOR_CONFIG2_THERMA_MASK);
+               seq_printf(s, "Therm_A/B(%d/", state);
+               state = REG_GET_MASK(r, SENSOR_CONFIG2_THERMB_MASK);
+               seq_printf(s, "%d)\n", (s16)state);
+       }
+
+       r = readl(ts->regs + SENSOR_PDIV);
+       seq_printf(s, "PDIV: 0x%x\n", r);
+
+       r = readl(ts->regs + SENSOR_HOTSPOT_OFF);
+       seq_printf(s, "HOTSPOT: 0x%x\n", r);
+
+       seq_puts(s, "\n");
+       seq_puts(s, "-----SOC_THERM-----\n");
+
+       r = readl(ts->regs + SENSOR_TEMP1);
+       state = REG_GET_MASK(r, SENSOR_TEMP1_CPU_TEMP_MASK);
+       seq_printf(s, "Temperatures: CPU(%d) ", translate_temp(state));
+       state = REG_GET_MASK(r, SENSOR_TEMP1_GPU_TEMP_MASK);
+       seq_printf(s, " GPU(%d) ", translate_temp(state));
+       r = readl(ts->regs + SENSOR_TEMP2);
+       state = REG_GET_MASK(r, SENSOR_TEMP2_PLLX_TEMP_MASK);
+       seq_printf(s, " PLLX(%d) ", translate_temp(state));
+       state = REG_GET_MASK(r, SENSOR_TEMP2_MEM_TEMP_MASK);
+       seq_printf(s, " MEM(%d)\n", translate_temp(state));
+
+       r = readl(ts->regs + THERMCTL_THERMTRIP_CTL);
+       state = REG_GET_MASK(r, ttgs[0]->thermtrip_any_en_mask);
+       seq_printf(s, "Thermtrip Any En(%d)\n", state);
+       for (i = 0; i < ts->soc->num_ttgs; i++) {
+               state = REG_GET_MASK(r, ttgs[i]->thermtrip_enable_mask);
+               seq_printf(s, "     %s En(%d) ", ttgs[i]->name, state);
+               state = REG_GET_MASK(r, ttgs[i]->thermtrip_threshold_mask);
+               state *= ts->soc->thresh_grain;
+               seq_printf(s, "Thresh(%d)\n", state);
+       }
+
+       return 0;
+}
+
+static int regs_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, regs_show, inode->i_private);
+}
+
+static const struct file_operations regs_fops = {
+       .open           = regs_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+static void soctherm_debug_init(struct platform_device *pdev)
+{
+       struct tegra_soctherm *tegra = platform_get_drvdata(pdev);
+       struct dentry *root, *file;
+
+       root = debugfs_create_dir("soctherm", NULL);
+       if (!root) {
+               dev_err(&pdev->dev, "failed to create debugfs directory\n");
+               return;
+       }
+
+       tegra->debugfs_dir = root;
+
+       file = debugfs_create_file("reg_contents", 0644, root,
+                                  pdev, &regs_fops);
+       if (!file) {
+               dev_err(&pdev->dev, "failed to create debugfs file\n");
+               debugfs_remove_recursive(tegra->debugfs_dir);
+               tegra->debugfs_dir = NULL;
+       }
+}
+#else
+static inline void soctherm_debug_init(struct platform_device *pdev) {}
+#endif
+
+static int soctherm_clk_enable(struct platform_device *pdev, bool enable)
+{
+       struct tegra_soctherm *tegra = platform_get_drvdata(pdev);
+       int err;
+
+       if (!tegra->clock_soctherm || !tegra->clock_tsensor)
+               return -EINVAL;
+
+       reset_control_assert(tegra->reset);
+
+       if (enable) {
+               err = clk_prepare_enable(tegra->clock_soctherm);
+               if (err) {
+                       reset_control_deassert(tegra->reset);
+                       return err;
+               }
+
+               err = clk_prepare_enable(tegra->clock_tsensor);
+               if (err) {
+                       clk_disable_unprepare(tegra->clock_soctherm);
+                       reset_control_deassert(tegra->reset);
+                       return err;
+               }
+       } else {
+               clk_disable_unprepare(tegra->clock_tsensor);
+               clk_disable_unprepare(tegra->clock_soctherm);
+       }
+
+       reset_control_deassert(tegra->reset);
+
+       return 0;
+}
+
+static void soctherm_init(struct platform_device *pdev)
+{
+       struct tegra_soctherm *tegra = platform_get_drvdata(pdev);
+       const struct tegra_tsensor_group **ttgs = tegra->soc->ttgs;
+       int i;
+       u32 pdiv, hotspot;
+
+       /* Initialize raw sensors */
+       for (i = 0; i < tegra->soc->num_tsensors; ++i)
+               enable_tsensor(tegra, i);
+
+       /* program pdiv and hotspot offsets per THERM */
+       pdiv = readl(tegra->regs + SENSOR_PDIV);
+       hotspot = readl(tegra->regs + SENSOR_HOTSPOT_OFF);
+       for (i = 0; i < tegra->soc->num_ttgs; ++i) {
+               pdiv = REG_SET_MASK(pdiv, ttgs[i]->pdiv_mask,
+                                   ttgs[i]->pdiv);
+               /* hotspot offset from PLLX, doesn't need to configure PLLX */
+               if (ttgs[i]->id == TEGRA124_SOCTHERM_SENSOR_PLLX)
+                       continue;
+               hotspot =  REG_SET_MASK(hotspot,
+                                       ttgs[i]->pllx_hotspot_mask,
+                                       ttgs[i]->pllx_hotspot_diff);
+       }
+       writel(pdiv, tegra->regs + SENSOR_PDIV);
+       writel(hotspot, tegra->regs + SENSOR_HOTSPOT_OFF);
+}
+
+static const struct of_device_id tegra_soctherm_of_match[] = {
+#ifdef CONFIG_ARCH_TEGRA_124_SOC
+       {
+               .compatible = "nvidia,tegra124-soctherm",
+               .data = &tegra124_soctherm,
+       },
+#endif
+#ifdef CONFIG_ARCH_TEGRA_132_SOC
+       {
+               .compatible = "nvidia,tegra132-soctherm",
+               .data = &tegra132_soctherm,
+       },
+#endif
+#ifdef CONFIG_ARCH_TEGRA_210_SOC
+       {
+               .compatible = "nvidia,tegra210-soctherm",
+               .data = &tegra210_soctherm,
+       },
+#endif
+       { },
+};
+MODULE_DEVICE_TABLE(of, tegra_soctherm_of_match);
+
+static int tegra_soctherm_probe(struct platform_device *pdev)
+{
+       const struct of_device_id *match;
+       struct tegra_soctherm *tegra;
+       struct thermal_zone_device *z;
+       struct tsensor_shared_calib shared_calib;
+       struct resource *res;
+       struct tegra_soctherm_soc *soc;
+       unsigned int i;
+       int err;
+
+       match = of_match_node(tegra_soctherm_of_match, pdev->dev.of_node);
+       if (!match)
+               return -ENODEV;
+
+       soc = (struct tegra_soctherm_soc *)match->data;
+       if (soc->num_ttgs > TEGRA124_SOCTHERM_SENSOR_NUM)
+               return -EINVAL;
+
+       tegra = devm_kzalloc(&pdev->dev, sizeof(*tegra), GFP_KERNEL);
+       if (!tegra)
+               return -ENOMEM;
+
+       dev_set_drvdata(&pdev->dev, tegra);
+
+       tegra->soc = soc;
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       tegra->regs = devm_ioremap_resource(&pdev->dev, res);
+       if (IS_ERR(tegra->regs))
+               return PTR_ERR(tegra->regs);
+
+       tegra->reset = devm_reset_control_get(&pdev->dev, "soctherm");
+       if (IS_ERR(tegra->reset)) {
+               dev_err(&pdev->dev, "can't get soctherm reset\n");
+               return PTR_ERR(tegra->reset);
+       }
+
+       tegra->clock_tsensor = devm_clk_get(&pdev->dev, "tsensor");
+       if (IS_ERR(tegra->clock_tsensor)) {
+               dev_err(&pdev->dev, "can't get tsensor clock\n");
+               return PTR_ERR(tegra->clock_tsensor);
+       }
+
+       tegra->clock_soctherm = devm_clk_get(&pdev->dev, "soctherm");
+       if (IS_ERR(tegra->clock_soctherm)) {
+               dev_err(&pdev->dev, "can't get soctherm clock\n");
+               return PTR_ERR(tegra->clock_soctherm);
+       }
+
+       tegra->calib = devm_kzalloc(&pdev->dev,
+                                   sizeof(u32) * soc->num_tsensors,
+                                   GFP_KERNEL);
+       if (!tegra->calib)
+               return -ENOMEM;
+
+       /* calculate shared calibration data */
+       err = tegra_calc_shared_calib(soc->tfuse, &shared_calib);
+       if (err)
+               return err;
+
+       /* calculate tsensor calibration data */
+       for (i = 0; i < soc->num_tsensors; ++i) {
+               err = tegra_calc_tsensor_calib(&soc->tsensors[i],
+                                              &shared_calib,
+                                              &tegra->calib[i]);
+               if (err)
+                       return err;
+       }
+
+       tegra->thermctl_tzs = devm_kzalloc(&pdev->dev,
+                                          sizeof(*z) * soc->num_ttgs,
+                                          GFP_KERNEL);
+       if (!tegra->thermctl_tzs)
+               return -ENOMEM;
+
+       err = soctherm_clk_enable(pdev, true);
+       if (err)
+               return err;
+
+       soctherm_init(pdev);
+
+       for (i = 0; i < soc->num_ttgs; ++i) {
+               struct tegra_thermctl_zone *zone =
+                       devm_kzalloc(&pdev->dev, sizeof(*zone), GFP_KERNEL);
+               if (!zone) {
+                       err = -ENOMEM;
+                       goto disable_clocks;
+               }
+
+               zone->reg = tegra->regs + soc->ttgs[i]->sensor_temp_offset;
+               zone->dev = &pdev->dev;
+               zone->sg = soc->ttgs[i];
+
+               z = devm_thermal_zone_of_sensor_register(&pdev->dev,
+                                                        soc->ttgs[i]->id, zone,
+                                                        &tegra_of_thermal_ops);
+               if (IS_ERR(z)) {
+                       err = PTR_ERR(z);
+                       dev_err(&pdev->dev, "failed to register sensor: %d\n",
+                               err);
+                       goto disable_clocks;
+               }
+
+               zone->tz = z;
+               tegra->thermctl_tzs[soc->ttgs[i]->id] = z;
+
+               /* Configure hw trip points */
+               tegra_soctherm_set_hwtrips(&pdev->dev, soc->ttgs[i], z);
+       }
+
+       soctherm_debug_init(pdev);
+
+       return 0;
+
+disable_clocks:
+       soctherm_clk_enable(pdev, false);
+
+       return err;
+}
+
+static int tegra_soctherm_remove(struct platform_device *pdev)
+{
+       struct tegra_soctherm *tegra = platform_get_drvdata(pdev);
+
+       debugfs_remove_recursive(tegra->debugfs_dir);
+
+       soctherm_clk_enable(pdev, false);
+
+       return 0;
+}
+
+static int __maybe_unused soctherm_suspend(struct device *dev)
+{
+       struct platform_device *pdev = to_platform_device(dev);
+
+       soctherm_clk_enable(pdev, false);
+
+       return 0;
+}
+
+static int __maybe_unused soctherm_resume(struct device *dev)
+{
+       struct platform_device *pdev = to_platform_device(dev);
+       struct tegra_soctherm *tegra = platform_get_drvdata(pdev);
+       struct tegra_soctherm_soc *soc = tegra->soc;
+       int err, i;
+
+       err = soctherm_clk_enable(pdev, true);
+       if (err) {
+               dev_err(&pdev->dev,
+                       "Resume failed: enable clocks failed\n");
+               return err;
+       }
+
+       soctherm_init(pdev);
+
+       for (i = 0; i < soc->num_ttgs; ++i) {
+               struct thermal_zone_device *tz;
+
+               tz = tegra->thermctl_tzs[soc->ttgs[i]->id];
+               tegra_soctherm_set_hwtrips(dev, soc->ttgs[i], tz);
+       }
+
+       return 0;
+}
+
+static SIMPLE_DEV_PM_OPS(tegra_soctherm_pm, soctherm_suspend, soctherm_resume);
+
+static struct platform_driver tegra_soctherm_driver = {
+       .probe = tegra_soctherm_probe,
+       .remove = tegra_soctherm_remove,
+       .driver = {
+               .name = "tegra_soctherm",
+               .pm = &tegra_soctherm_pm,
+               .of_match_table = tegra_soctherm_of_match,
+       },
+};
+module_platform_driver(tegra_soctherm_driver);
+
+MODULE_AUTHOR("Mikko Perttunen <mperttunen@nvidia.com>");
+MODULE_DESCRIPTION("NVIDIA Tegra SOCTHERM thermal management driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/thermal/tegra/soctherm.h b/drivers/thermal/tegra/soctherm.h
new file mode 100644 (file)
index 0000000..28e18ec
--- /dev/null
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2014-2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef __DRIVERS_THERMAL_TEGRA_SOCTHERM_H
+#define __DRIVERS_THERMAL_TEGRA_SOCTHERM_H
+
+#define SENSOR_CONFIG2                          8
+#define SENSOR_CONFIG2_THERMA_MASK             (0xffff << 16)
+#define SENSOR_CONFIG2_THERMA_SHIFT            16
+#define SENSOR_CONFIG2_THERMB_MASK             0xffff
+#define SENSOR_CONFIG2_THERMB_SHIFT            0
+
+#define THERMCTL_THERMTRIP_CTL                 0x80
+/* BITs are defined in device file */
+
+#define SENSOR_PDIV                            0x1c0
+#define SENSOR_PDIV_CPU_MASK                   (0xf << 12)
+#define SENSOR_PDIV_GPU_MASK                   (0xf << 8)
+#define SENSOR_PDIV_MEM_MASK                   (0xf << 4)
+#define SENSOR_PDIV_PLLX_MASK                  (0xf << 0)
+
+#define SENSOR_HOTSPOT_OFF                     0x1c4
+#define SENSOR_HOTSPOT_CPU_MASK                        (0xff << 16)
+#define SENSOR_HOTSPOT_GPU_MASK                        (0xff << 8)
+#define SENSOR_HOTSPOT_MEM_MASK                        (0xff << 0)
+
+#define SENSOR_TEMP1                           0x1c8
+#define SENSOR_TEMP1_CPU_TEMP_MASK             (0xffff << 16)
+#define SENSOR_TEMP1_GPU_TEMP_MASK             0xffff
+#define SENSOR_TEMP2                           0x1cc
+#define SENSOR_TEMP2_MEM_TEMP_MASK             (0xffff << 16)
+#define SENSOR_TEMP2_PLLX_TEMP_MASK            0xffff
+
+/**
+ * struct tegra_tsensor_group - SOC_THERM sensor group data
+ * @name: short name of the temperature sensor group
+ * @id: numeric ID of the temperature sensor group
+ * @sensor_temp_offset: offset of the SENSOR_TEMP* register
+ * @sensor_temp_mask: bit mask for this sensor group in SENSOR_TEMP* register
+ * @pdiv: the sensor count post-divider to use during runtime
+ * @pdiv_ate: the sensor count post-divider used during automated test
+ * @pdiv_mask: register bitfield mask for the PDIV field for this sensor
+ * @pllx_hotspot_diff: hotspot offset from the PLLX sensor, must be 0 for
+ *   the PLLX sensor group
+ * @pllx_hotspot_mask: register bitfield mask for the HOTSPOT field
+ * @thermtrip_enable_mask: register bitfield mask to enable THERMTRIP for this group
+ * @thermtrip_any_en_mask: register bitfield mask for the "any sensor" THERMTRIP enable
+ * @thermtrip_threshold_mask: register bitfield mask for this group's THERMTRIP threshold
+ */
+struct tegra_tsensor_group {
+       const char *name;
+       u8 id;
+       u16 sensor_temp_offset;
+       u32 sensor_temp_mask;
+       u32 pdiv, pdiv_ate, pdiv_mask;
+       u32 pllx_hotspot_diff, pllx_hotspot_mask;
+       u32 thermtrip_enable_mask;
+       u32 thermtrip_any_en_mask;
+       u32 thermtrip_threshold_mask;
+};
+
+struct tegra_tsensor_configuration {
+       u32 tall, tiddq_en, ten_count, pdiv, pdiv_ate, tsample, tsample_ate;
+};
+
+struct tegra_tsensor {
+       const char *name;
+       const u32 base;
+       const struct tegra_tsensor_configuration *config;
+       const u32 calib_fuse_offset;
+       /*
+        * Correction values used to modify values read from
+        * calibration fuses
+        */
+       const s32 fuse_corr_alpha, fuse_corr_beta;
+       const struct tegra_tsensor_group *group;
+};
+
+struct tegra_soctherm_fuse {
+       u32 fuse_base_cp_mask, fuse_base_cp_shift;
+       u32 fuse_base_ft_mask, fuse_base_ft_shift;
+       u32 fuse_shift_ft_mask, fuse_shift_ft_shift;
+       u32 fuse_spare_realignment;
+};
+
+struct tsensor_shared_calib {
+       u32 base_cp, base_ft;
+       u32 actual_temp_cp, actual_temp_ft;
+};
+
+struct tegra_soctherm_soc {
+       const struct tegra_tsensor *tsensors;
+       const unsigned int num_tsensors;
+       const struct tegra_tsensor_group **ttgs;
+       const unsigned int num_ttgs;
+       const struct tegra_soctherm_fuse *tfuse;
+       const int thresh_grain;
+};
+
+int tegra_calc_shared_calib(const struct tegra_soctherm_fuse *tfuse,
+                           struct tsensor_shared_calib *shared);
+int tegra_calc_tsensor_calib(const struct tegra_tsensor *sensor,
+                            const struct tsensor_shared_calib *shared,
+                            u32 *calib);
+
+#ifdef CONFIG_ARCH_TEGRA_124_SOC
+extern const struct tegra_soctherm_soc tegra124_soctherm;
+#endif
+
+#ifdef CONFIG_ARCH_TEGRA_132_SOC
+extern const struct tegra_soctherm_soc tegra132_soctherm;
+#endif
+
+#ifdef CONFIG_ARCH_TEGRA_210_SOC
+extern const struct tegra_soctherm_soc tegra210_soctherm;
+#endif
+
+#endif
+
diff --git a/drivers/thermal/tegra/tegra124-soctherm.c b/drivers/thermal/tegra/tegra124-soctherm.c
new file mode 100644 (file)
index 0000000..beb9d36
--- /dev/null
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2014-2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/platform_device.h>
+
+#include <dt-bindings/thermal/tegra124-soctherm.h>
+
+#include "soctherm.h"
+
+#define TEGRA124_THERMTRIP_ANY_EN_MASK         (0x1 << 28)
+#define TEGRA124_THERMTRIP_MEM_EN_MASK         (0x1 << 27)
+#define TEGRA124_THERMTRIP_GPU_EN_MASK         (0x1 << 26)
+#define TEGRA124_THERMTRIP_CPU_EN_MASK         (0x1 << 25)
+#define TEGRA124_THERMTRIP_TSENSE_EN_MASK      (0x1 << 24)
+#define TEGRA124_THERMTRIP_GPUMEM_THRESH_MASK  (0xff << 16)
+#define TEGRA124_THERMTRIP_CPU_THRESH_MASK     (0xff << 8)
+#define TEGRA124_THERMTRIP_TSENSE_THRESH_MASK  0xff
+
+#define TEGRA124_THRESH_GRAIN                  1000
+
+static const struct tegra_tsensor_configuration tegra124_tsensor_config = {
+       .tall = 16300,
+       .tiddq_en = 1,
+       .ten_count = 1,
+       .tsample = 120,
+       .tsample_ate = 480,
+};
+
+static const struct tegra_tsensor_group tegra124_tsensor_group_cpu = {
+       .id = TEGRA124_SOCTHERM_SENSOR_CPU,
+       .name   = "cpu",
+       .sensor_temp_offset     = SENSOR_TEMP1,
+       .sensor_temp_mask       = SENSOR_TEMP1_CPU_TEMP_MASK,
+       .pdiv = 8,
+       .pdiv_ate = 8,
+       .pdiv_mask = SENSOR_PDIV_CPU_MASK,
+       .pllx_hotspot_diff = 10,
+       .pllx_hotspot_mask = SENSOR_HOTSPOT_CPU_MASK,
+       .thermtrip_any_en_mask = TEGRA124_THERMTRIP_ANY_EN_MASK,
+       .thermtrip_enable_mask = TEGRA124_THERMTRIP_CPU_EN_MASK,
+       .thermtrip_threshold_mask = TEGRA124_THERMTRIP_CPU_THRESH_MASK,
+};
+
+static const struct tegra_tsensor_group tegra124_tsensor_group_gpu = {
+       .id = TEGRA124_SOCTHERM_SENSOR_GPU,
+       .name = "gpu",
+       .sensor_temp_offset = SENSOR_TEMP1,
+       .sensor_temp_mask = SENSOR_TEMP1_GPU_TEMP_MASK,
+       .pdiv = 8,
+       .pdiv_ate = 8,
+       .pdiv_mask = SENSOR_PDIV_GPU_MASK,
+       .pllx_hotspot_diff = 5,
+       .pllx_hotspot_mask = SENSOR_HOTSPOT_GPU_MASK,
+       .thermtrip_any_en_mask = TEGRA124_THERMTRIP_ANY_EN_MASK,
+       .thermtrip_enable_mask = TEGRA124_THERMTRIP_GPU_EN_MASK,
+       .thermtrip_threshold_mask = TEGRA124_THERMTRIP_GPUMEM_THRESH_MASK,
+};
+
+static const struct tegra_tsensor_group tegra124_tsensor_group_pll = {
+       .id = TEGRA124_SOCTHERM_SENSOR_PLLX,
+       .name = "pll",
+       .sensor_temp_offset = SENSOR_TEMP2,
+       .sensor_temp_mask = SENSOR_TEMP2_PLLX_TEMP_MASK,
+       .pdiv = 8,
+       .pdiv_ate = 8,
+       .pdiv_mask = SENSOR_PDIV_PLLX_MASK,
+       .thermtrip_any_en_mask = TEGRA124_THERMTRIP_ANY_EN_MASK,
+       .thermtrip_enable_mask = TEGRA124_THERMTRIP_TSENSE_EN_MASK,
+       .thermtrip_threshold_mask = TEGRA124_THERMTRIP_TSENSE_THRESH_MASK,
+};
+
+static const struct tegra_tsensor_group tegra124_tsensor_group_mem = {
+       .id = TEGRA124_SOCTHERM_SENSOR_MEM,
+       .name = "mem",
+       .sensor_temp_offset = SENSOR_TEMP2,
+       .sensor_temp_mask = SENSOR_TEMP2_MEM_TEMP_MASK,
+       .pdiv = 8,
+       .pdiv_ate = 8,
+       .pdiv_mask = SENSOR_PDIV_MEM_MASK,
+       .pllx_hotspot_diff = 0,
+       .pllx_hotspot_mask = SENSOR_HOTSPOT_MEM_MASK,
+       .thermtrip_any_en_mask = TEGRA124_THERMTRIP_ANY_EN_MASK,
+       .thermtrip_enable_mask = TEGRA124_THERMTRIP_MEM_EN_MASK,
+       .thermtrip_threshold_mask = TEGRA124_THERMTRIP_GPUMEM_THRESH_MASK,
+};
+
+static const struct tegra_tsensor_group *tegra124_tsensor_groups[] = {
+       &tegra124_tsensor_group_cpu,
+       &tegra124_tsensor_group_gpu,
+       &tegra124_tsensor_group_pll,
+       &tegra124_tsensor_group_mem,
+};
+
+static const struct tegra_tsensor tegra124_tsensors[] = {
+       {
+               .name = "cpu0",
+               .base = 0xc0,
+               .config = &tegra124_tsensor_config,
+               .calib_fuse_offset = 0x098,
+               .fuse_corr_alpha = 1135400,
+               .fuse_corr_beta = -6266900,
+               .group = &tegra124_tsensor_group_cpu,
+       }, {
+               .name = "cpu1",
+               .base = 0xe0,
+               .config = &tegra124_tsensor_config,
+               .calib_fuse_offset = 0x084,
+               .fuse_corr_alpha = 1122220,
+               .fuse_corr_beta = -5700700,
+               .group = &tegra124_tsensor_group_cpu,
+       }, {
+               .name = "cpu2",
+               .base = 0x100,
+               .config = &tegra124_tsensor_config,
+               .calib_fuse_offset = 0x088,
+               .fuse_corr_alpha = 1127000,
+               .fuse_corr_beta = -6768200,
+               .group = &tegra124_tsensor_group_cpu,
+       }, {
+               .name = "cpu3",
+               .base = 0x120,
+               .config = &tegra124_tsensor_config,
+               .calib_fuse_offset = 0x12c,
+               .fuse_corr_alpha = 1110900,
+               .fuse_corr_beta = -6232000,
+               .group = &tegra124_tsensor_group_cpu,
+       }, {
+               .name = "mem0",
+               .base = 0x140,
+               .config = &tegra124_tsensor_config,
+               .calib_fuse_offset = 0x158,
+               .fuse_corr_alpha = 1122300,
+               .fuse_corr_beta = -5936400,
+               .group = &tegra124_tsensor_group_mem,
+       }, {
+               .name = "mem1",
+               .base = 0x160,
+               .config = &tegra124_tsensor_config,
+               .calib_fuse_offset = 0x15c,
+               .fuse_corr_alpha = 1145700,
+               .fuse_corr_beta = -7124600,
+               .group = &tegra124_tsensor_group_mem,
+       }, {
+               .name = "gpu",
+               .base = 0x180,
+               .config = &tegra124_tsensor_config,
+               .calib_fuse_offset = 0x154,
+               .fuse_corr_alpha = 1120100,
+               .fuse_corr_beta = -6000500,
+               .group = &tegra124_tsensor_group_gpu,
+       }, {
+               .name = "pllx",
+               .base = 0x1a0,
+               .config = &tegra124_tsensor_config,
+               .calib_fuse_offset = 0x160,
+               .fuse_corr_alpha = 1106500,
+               .fuse_corr_beta = -6729300,
+               .group = &tegra124_tsensor_group_pll,
+       },
+};
+
+/*
+ * Mask/shift bits in FUSE_TSENSOR_COMMON, which
+ * are described in
+ * tegra_soctherm_fuse.c
+ */
+static const struct tegra_soctherm_fuse tegra124_soctherm_fuse = {
+       .fuse_base_cp_mask = 0x3ff,
+       .fuse_base_cp_shift = 0,
+       .fuse_base_ft_mask = 0x7ff << 10,
+       .fuse_base_ft_shift = 10,
+       .fuse_shift_ft_mask = 0x1f << 21,
+       .fuse_shift_ft_shift = 21,
+       .fuse_spare_realignment = 0x1fc,
+};
+
+const struct tegra_soctherm_soc tegra124_soctherm = {
+       .tsensors = tegra124_tsensors,
+       .num_tsensors = ARRAY_SIZE(tegra124_tsensors),
+       .ttgs = tegra124_tsensor_groups,
+       .num_ttgs = ARRAY_SIZE(tegra124_tsensor_groups),
+       .tfuse = &tegra124_soctherm_fuse,
+       .thresh_grain = TEGRA124_THRESH_GRAIN,
+};
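
The tegra124_soctherm descriptor above ties each sensor group's THERMTRIP enable/threshold masks to a threshold granularity of 1000 millicelsius (TEGRA124_THRESH_GRAIN), i.e. the 8-bit threshold fields count whole degrees. A minimal sketch of how a trip point could be packed into such a field follows; the helper name and the packing step are assumptions for illustration, since the real programming lives in the shared soctherm core, which is outside this hunk.

#include <linux/bitops.h>

/*
 * Illustrative only: convert a millicelsius trip temperature into the
 * raw THERMTRIP threshold value for one sensor group.  With the
 * Tegra124 grain of 1000, a 101000 millicelsius trip becomes 101 raw
 * steps placed under the group's threshold mask.
 */
static u32 pack_thermtrip_threshold(const struct tegra_tsensor_group *ttg,
				    int trip_mc, int thresh_grain)
{
	u32 raw = trip_mc / thresh_grain;

	return (raw << __ffs(ttg->thermtrip_threshold_mask)) &
	       ttg->thermtrip_threshold_mask;
}
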
diff --git a/drivers/thermal/tegra/tegra132-soctherm.c b/drivers/thermal/tegra/tegra132-soctherm.c
new file mode 100644 (file)
index 0000000..e2aa84e
--- /dev/null
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2014-2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/platform_device.h>
+
+#include <dt-bindings/thermal/tegra124-soctherm.h>
+
+#include "soctherm.h"
+
+#define TEGRA132_THERMTRIP_ANY_EN_MASK         (0x1 << 28)
+#define TEGRA132_THERMTRIP_MEM_EN_MASK         (0x1 << 27)
+#define TEGRA132_THERMTRIP_GPU_EN_MASK         (0x1 << 26)
+#define TEGRA132_THERMTRIP_CPU_EN_MASK         (0x1 << 25)
+#define TEGRA132_THERMTRIP_TSENSE_EN_MASK      (0x1 << 24)
+#define TEGRA132_THERMTRIP_GPUMEM_THRESH_MASK  (0xff << 16)
+#define TEGRA132_THERMTRIP_CPU_THRESH_MASK     (0xff << 8)
+#define TEGRA132_THERMTRIP_TSENSE_THRESH_MASK  0xff
+
+#define TEGRA132_THRESH_GRAIN                  1000
+
+static const struct tegra_tsensor_configuration tegra132_tsensor_config = {
+       .tall = 16300,
+       .tiddq_en = 1,
+       .ten_count = 1,
+       .tsample = 120,
+       .tsample_ate = 480,
+};
+
+static const struct tegra_tsensor_group tegra132_tsensor_group_cpu = {
+       .id = TEGRA124_SOCTHERM_SENSOR_CPU,
+       .name = "cpu",
+       .sensor_temp_offset = SENSOR_TEMP1,
+       .sensor_temp_mask = SENSOR_TEMP1_CPU_TEMP_MASK,
+       .pdiv = 8,
+       .pdiv_ate = 8,
+       .pdiv_mask = SENSOR_PDIV_CPU_MASK,
+       .pllx_hotspot_diff = 10,
+       .pllx_hotspot_mask = SENSOR_HOTSPOT_CPU_MASK,
+       .thermtrip_any_en_mask = TEGRA132_THERMTRIP_ANY_EN_MASK,
+       .thermtrip_enable_mask = TEGRA132_THERMTRIP_CPU_EN_MASK,
+       .thermtrip_threshold_mask = TEGRA132_THERMTRIP_CPU_THRESH_MASK,
+};
+
+static const struct tegra_tsensor_group tegra132_tsensor_group_gpu = {
+       .id = TEGRA124_SOCTHERM_SENSOR_GPU,
+       .name = "gpu",
+       .sensor_temp_offset = SENSOR_TEMP1,
+       .sensor_temp_mask = SENSOR_TEMP1_GPU_TEMP_MASK,
+       .pdiv = 8,
+       .pdiv_ate = 8,
+       .pdiv_mask = SENSOR_PDIV_GPU_MASK,
+       .pllx_hotspot_diff = 5,
+       .pllx_hotspot_mask = SENSOR_HOTSPOT_GPU_MASK,
+       .thermtrip_any_en_mask = TEGRA132_THERMTRIP_ANY_EN_MASK,
+       .thermtrip_enable_mask = TEGRA132_THERMTRIP_GPU_EN_MASK,
+       .thermtrip_threshold_mask = TEGRA132_THERMTRIP_GPUMEM_THRESH_MASK,
+};
+
+static const struct tegra_tsensor_group tegra132_tsensor_group_pll = {
+       .id = TEGRA124_SOCTHERM_SENSOR_PLLX,
+       .name = "pll",
+       .sensor_temp_offset = SENSOR_TEMP2,
+       .sensor_temp_mask = SENSOR_TEMP2_PLLX_TEMP_MASK,
+       .pdiv = 8,
+       .pdiv_ate = 8,
+       .pdiv_mask = SENSOR_PDIV_PLLX_MASK,
+       .thermtrip_any_en_mask = TEGRA132_THERMTRIP_ANY_EN_MASK,
+       .thermtrip_enable_mask = TEGRA132_THERMTRIP_TSENSE_EN_MASK,
+       .thermtrip_threshold_mask = TEGRA132_THERMTRIP_TSENSE_THRESH_MASK,
+};
+
+static const struct tegra_tsensor_group tegra132_tsensor_group_mem = {
+       .id = TEGRA124_SOCTHERM_SENSOR_MEM,
+       .name = "mem",
+       .sensor_temp_offset = SENSOR_TEMP2,
+       .sensor_temp_mask = SENSOR_TEMP2_MEM_TEMP_MASK,
+       .pdiv = 8,
+       .pdiv_ate = 8,
+       .pdiv_mask = SENSOR_PDIV_MEM_MASK,
+       .pllx_hotspot_diff = 0,
+       .pllx_hotspot_mask = SENSOR_HOTSPOT_MEM_MASK,
+       .thermtrip_any_en_mask = TEGRA132_THERMTRIP_ANY_EN_MASK,
+       .thermtrip_enable_mask = TEGRA132_THERMTRIP_MEM_EN_MASK,
+       .thermtrip_threshold_mask = TEGRA132_THERMTRIP_GPUMEM_THRESH_MASK,
+};
+
+static const struct tegra_tsensor_group *tegra132_tsensor_groups[] = {
+       &tegra132_tsensor_group_cpu,
+       &tegra132_tsensor_group_gpu,
+       &tegra132_tsensor_group_pll,
+       &tegra132_tsensor_group_mem,
+};
+
+static struct tegra_tsensor tegra132_tsensors[] = {
+       {
+               .name = "cpu0",
+               .base = 0xc0,
+               .config = &tegra132_tsensor_config,
+               .calib_fuse_offset = 0x098,
+               .fuse_corr_alpha = 1126600,
+               .fuse_corr_beta = -9433500,
+               .group = &tegra132_tsensor_group_cpu,
+       }, {
+               .name = "cpu1",
+               .base = 0xe0,
+               .config = &tegra132_tsensor_config,
+               .calib_fuse_offset = 0x084,
+               .fuse_corr_alpha = 1110800,
+               .fuse_corr_beta = -7383000,
+               .group = &tegra132_tsensor_group_cpu,
+       }, {
+               .name = "cpu2",
+               .base = 0x100,
+               .config = &tegra132_tsensor_config,
+               .calib_fuse_offset = 0x088,
+               .fuse_corr_alpha = 1113800,
+               .fuse_corr_beta = -6215200,
+               .group = &tegra132_tsensor_group_cpu,
+       }, {
+               .name = "cpu3",
+               .base = 0x120,
+               .config = &tegra132_tsensor_config,
+               .calib_fuse_offset = 0x12c,
+               .fuse_corr_alpha = 1129600,
+               .fuse_corr_beta = -8196100,
+               .group = &tegra132_tsensor_group_cpu,
+       }, {
+               .name = "mem0",
+               .base = 0x140,
+               .config = &tegra132_tsensor_config,
+               .calib_fuse_offset = 0x158,
+               .fuse_corr_alpha = 1132900,
+               .fuse_corr_beta = -6755300,
+               .group = &tegra132_tsensor_group_mem,
+       }, {
+               .name = "mem1",
+               .base = 0x160,
+               .config = &tegra132_tsensor_config,
+               .calib_fuse_offset = 0x15c,
+               .fuse_corr_alpha = 1142300,
+               .fuse_corr_beta = -7374200,
+               .group = &tegra132_tsensor_group_mem,
+       }, {
+               .name = "gpu",
+               .base = 0x180,
+               .config = &tegra132_tsensor_config,
+               .calib_fuse_offset = 0x154,
+               .fuse_corr_alpha = 1125100,
+               .fuse_corr_beta = -6350400,
+               .group = &tegra132_tsensor_group_gpu,
+       }, {
+               .name = "pllx",
+               .base = 0x1a0,
+               .config = &tegra132_tsensor_config,
+               .calib_fuse_offset = 0x160,
+               .fuse_corr_alpha = 1118100,
+               .fuse_corr_beta = -8208800,
+               .group = &tegra132_tsensor_group_pll,
+       },
+};
+
+/*
+ * Mask/shift bits in FUSE_TSENSOR_COMMON, which
+ * are described in
+ * tegra_soctherm_fuse.c
+ */
+static const struct tegra_soctherm_fuse tegra132_soctherm_fuse = {
+       .fuse_base_cp_mask = 0x3ff,
+       .fuse_base_cp_shift = 0,
+       .fuse_base_ft_mask = 0x7ff << 10,
+       .fuse_base_ft_shift = 10,
+       .fuse_shift_ft_mask = 0x1f << 21,
+       .fuse_shift_ft_shift = 21,
+       .fuse_spare_realignment = 0x1fc,
+};
+
+const struct tegra_soctherm_soc tegra132_soctherm = {
+       .tsensors = tegra132_tsensors,
+       .num_tsensors = ARRAY_SIZE(tegra132_tsensors),
+       .ttgs = tegra132_tsensor_groups,
+       .num_ttgs = ARRAY_SIZE(tegra132_tsensor_groups),
+       .tfuse = &tegra132_soctherm_fuse,
+       .thresh_grain = TEGRA132_THRESH_GRAIN,
+};
diff --git a/drivers/thermal/tegra/tegra210-soctherm.c b/drivers/thermal/tegra/tegra210-soctherm.c
new file mode 100644 (file)
index 0000000..19cc0ab
--- /dev/null
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2014-2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <soc/tegra/fuse.h>
+
+#include <dt-bindings/thermal/tegra124-soctherm.h>
+
+#include "soctherm.h"
+
+#define TEGRA210_THERMTRIP_ANY_EN_MASK         (0x1 << 31)
+#define TEGRA210_THERMTRIP_MEM_EN_MASK         (0x1 << 30)
+#define TEGRA210_THERMTRIP_GPU_EN_MASK         (0x1 << 29)
+#define TEGRA210_THERMTRIP_CPU_EN_MASK         (0x1 << 28)
+#define TEGRA210_THERMTRIP_TSENSE_EN_MASK      (0x1 << 27)
+#define TEGRA210_THERMTRIP_GPUMEM_THRESH_MASK  (0x1ff << 18)
+#define TEGRA210_THERMTRIP_CPU_THRESH_MASK     (0x1ff << 9)
+#define TEGRA210_THERMTRIP_TSENSE_THRESH_MASK  0x1ff
+
+#define TEGRA210_THRESH_GRAIN                  500
+
+static const struct tegra_tsensor_configuration tegra210_tsensor_config = {
+       .tall = 16300,
+       .tiddq_en = 1,
+       .ten_count = 1,
+       .tsample = 120,
+       .tsample_ate = 480,
+};
+
+static const struct tegra_tsensor_group tegra210_tsensor_group_cpu = {
+       .id = TEGRA124_SOCTHERM_SENSOR_CPU,
+       .name = "cpu",
+       .sensor_temp_offset = SENSOR_TEMP1,
+       .sensor_temp_mask = SENSOR_TEMP1_CPU_TEMP_MASK,
+       .pdiv = 8,
+       .pdiv_ate = 8,
+       .pdiv_mask = SENSOR_PDIV_CPU_MASK,
+       .pllx_hotspot_diff = 10,
+       .pllx_hotspot_mask = SENSOR_HOTSPOT_CPU_MASK,
+       .thermtrip_any_en_mask = TEGRA210_THERMTRIP_ANY_EN_MASK,
+       .thermtrip_enable_mask = TEGRA210_THERMTRIP_CPU_EN_MASK,
+       .thermtrip_threshold_mask = TEGRA210_THERMTRIP_CPU_THRESH_MASK,
+};
+
+static const struct tegra_tsensor_group tegra210_tsensor_group_gpu = {
+       .id = TEGRA124_SOCTHERM_SENSOR_GPU,
+       .name = "gpu",
+       .sensor_temp_offset = SENSOR_TEMP1,
+       .sensor_temp_mask = SENSOR_TEMP1_GPU_TEMP_MASK,
+       .pdiv = 8,
+       .pdiv_ate = 8,
+       .pdiv_mask = SENSOR_PDIV_GPU_MASK,
+       .pllx_hotspot_diff = 5,
+       .pllx_hotspot_mask = SENSOR_HOTSPOT_GPU_MASK,
+       .thermtrip_any_en_mask = TEGRA210_THERMTRIP_ANY_EN_MASK,
+       .thermtrip_enable_mask = TEGRA210_THERMTRIP_GPU_EN_MASK,
+       .thermtrip_threshold_mask = TEGRA210_THERMTRIP_GPUMEM_THRESH_MASK,
+};
+
+static const struct tegra_tsensor_group tegra210_tsensor_group_pll = {
+       .id = TEGRA124_SOCTHERM_SENSOR_PLLX,
+       .name = "pll",
+       .sensor_temp_offset = SENSOR_TEMP2,
+       .sensor_temp_mask = SENSOR_TEMP2_PLLX_TEMP_MASK,
+       .pdiv = 8,
+       .pdiv_ate = 8,
+       .pdiv_mask = SENSOR_PDIV_PLLX_MASK,
+       .thermtrip_any_en_mask = TEGRA210_THERMTRIP_ANY_EN_MASK,
+       .thermtrip_enable_mask = TEGRA210_THERMTRIP_TSENSE_EN_MASK,
+       .thermtrip_threshold_mask = TEGRA210_THERMTRIP_TSENSE_THRESH_MASK,
+};
+
+static const struct tegra_tsensor_group tegra210_tsensor_group_mem = {
+       .id = TEGRA124_SOCTHERM_SENSOR_MEM,
+       .name = "mem",
+       .sensor_temp_offset = SENSOR_TEMP2,
+       .sensor_temp_mask = SENSOR_TEMP2_MEM_TEMP_MASK,
+       .pdiv = 8,
+       .pdiv_ate = 8,
+       .pdiv_mask = SENSOR_PDIV_MEM_MASK,
+       .pllx_hotspot_diff = 0,
+       .pllx_hotspot_mask = SENSOR_HOTSPOT_MEM_MASK,
+       .thermtrip_any_en_mask = TEGRA210_THERMTRIP_ANY_EN_MASK,
+       .thermtrip_enable_mask = TEGRA210_THERMTRIP_MEM_EN_MASK,
+       .thermtrip_threshold_mask = TEGRA210_THERMTRIP_GPUMEM_THRESH_MASK,
+};
+
+static const struct tegra_tsensor_group *tegra210_tsensor_groups[] = {
+       &tegra210_tsensor_group_cpu,
+       &tegra210_tsensor_group_gpu,
+       &tegra210_tsensor_group_pll,
+       &tegra210_tsensor_group_mem,
+};
+
+static const struct tegra_tsensor tegra210_tsensors[] = {
+       {
+               .name = "cpu0",
+               .base = 0xc0,
+               .config = &tegra210_tsensor_config,
+               .calib_fuse_offset = 0x098,
+               .fuse_corr_alpha = 1085000,
+               .fuse_corr_beta = 3244200,
+               .group = &tegra210_tsensor_group_cpu,
+       }, {
+               .name = "cpu1",
+               .base = 0xe0,
+               .config = &tegra210_tsensor_config,
+               .calib_fuse_offset = 0x084,
+               .fuse_corr_alpha = 1126200,
+               .fuse_corr_beta = -67500,
+               .group = &tegra210_tsensor_group_cpu,
+       }, {
+               .name = "cpu2",
+               .base = 0x100,
+               .config = &tegra210_tsensor_config,
+               .calib_fuse_offset = 0x088,
+               .fuse_corr_alpha = 1098400,
+               .fuse_corr_beta = 2251100,
+               .group = &tegra210_tsensor_group_cpu,
+       }, {
+               .name = "cpu3",
+               .base = 0x120,
+               .config = &tegra210_tsensor_config,
+               .calib_fuse_offset = 0x12c,
+               .fuse_corr_alpha = 1108000,
+               .fuse_corr_beta = 602700,
+               .group = &tegra210_tsensor_group_cpu,
+       }, {
+               .name = "mem0",
+               .base = 0x140,
+               .config = &tegra210_tsensor_config,
+               .calib_fuse_offset = 0x158,
+               .fuse_corr_alpha = 1069200,
+               .fuse_corr_beta = 3549900,
+               .group = &tegra210_tsensor_group_mem,
+       }, {
+               .name = "mem1",
+               .base = 0x160,
+               .config = &tegra210_tsensor_config,
+               .calib_fuse_offset = 0x15c,
+               .fuse_corr_alpha = 1173700,
+               .fuse_corr_beta = -6263600,
+               .group = &tegra210_tsensor_group_mem,
+       }, {
+               .name = "gpu",
+               .base = 0x180,
+               .config = &tegra210_tsensor_config,
+               .calib_fuse_offset = 0x154,
+               .fuse_corr_alpha = 1074300,
+               .fuse_corr_beta = 2734900,
+               .group = &tegra210_tsensor_group_gpu,
+       }, {
+               .name = "pllx",
+               .base = 0x1a0,
+               .config = &tegra210_tsensor_config,
+               .calib_fuse_offset = 0x160,
+               .fuse_corr_alpha = 1039700,
+               .fuse_corr_beta = 6829100,
+               .group = &tegra210_tsensor_group_pll,
+       },
+};
+
+/*
+ * Mask/shift bits in FUSE_TSENSOR_COMMON, which
+ * are described in
+ * tegra_soctherm_fuse.c
+ */
+static const struct tegra_soctherm_fuse tegra210_soctherm_fuse = {
+       .fuse_base_cp_mask = 0x3ff << 11,
+       .fuse_base_cp_shift = 11,
+       .fuse_base_ft_mask = 0x7ff << 21,
+       .fuse_base_ft_shift = 21,
+       .fuse_shift_ft_mask = 0x1f << 6,
+       .fuse_shift_ft_shift = 6,
+       .fuse_spare_realignment = 0,
+};
+
+const struct tegra_soctherm_soc tegra210_soctherm = {
+       .tsensors = tegra210_tsensors,
+       .num_tsensors = ARRAY_SIZE(tegra210_tsensors),
+       .ttgs = tegra210_tsensor_groups,
+       .num_ttgs = ARRAY_SIZE(tegra210_tsensor_groups),
+       .tfuse = &tegra210_soctherm_fuse,
+       .thresh_grain = TEGRA210_THRESH_GRAIN,
+};
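
Across the three SoC variants added above, the differences are confined to the THERMTRIP bit positions, the threshold granularity (1000 millicelsius on Tegra124/132, 500 on Tegra210) and where the shared calibration bases sit inside the fuse word; the tegra_soctherm_fuse tables encode the latter (base_cp occupies bits 9:0 on Tegra124/132 but bits 20:11 on Tegra210). A sketch of how those masks and shifts would be applied follows; the helper name is hypothetical, as the real consumer is tegra_soctherm_fuse.c, which is not part of this hunk.

/*
 * Illustrative use of the per-SoC fuse layout: extract the shared
 * CP/FT calibration bases from the raw fuse word.
 */
static void unpack_shared_calib(const struct tegra_soctherm_fuse *tfuse,
				u32 fuse_val, u32 *base_cp, u32 *base_ft)
{
	*base_cp = (fuse_val & tfuse->fuse_base_cp_mask) >>
		   tfuse->fuse_base_cp_shift;
	*base_ft = (fuse_val & tfuse->fuse_base_ft_mask) >>
		   tfuse->fuse_base_ft_shift;
}
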
diff --git a/drivers/thermal/tegra_soctherm.c b/drivers/thermal/tegra_soctherm.c
deleted file mode 100644 (file)
index 1369752..0000000
+++ /dev/null
@@ -1,476 +0,0 @@
-/*
- * Copyright (c) 2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Author:
- *     Mikko Perttunen <mperttunen@nvidia.com>
- *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- */
-
-#include <linux/bitops.h>
-#include <linux/clk.h>
-#include <linux/delay.h>
-#include <linux/err.h>
-#include <linux/interrupt.h>
-#include <linux/io.h>
-#include <linux/module.h>
-#include <linux/of.h>
-#include <linux/platform_device.h>
-#include <linux/reset.h>
-#include <linux/thermal.h>
-
-#include <soc/tegra/fuse.h>
-
-#define SENSOR_CONFIG0                         0
-#define SENSOR_CONFIG0_STOP                    BIT(0)
-#define SENSOR_CONFIG0_TALL_SHIFT              8
-#define SENSOR_CONFIG0_TCALC_OVER              BIT(4)
-#define SENSOR_CONFIG0_OVER                    BIT(3)
-#define SENSOR_CONFIG0_CPTR_OVER               BIT(2)
-
-#define SENSOR_CONFIG1                         4
-#define SENSOR_CONFIG1_TSAMPLE_SHIFT           0
-#define SENSOR_CONFIG1_TIDDQ_EN_SHIFT          15
-#define SENSOR_CONFIG1_TEN_COUNT_SHIFT         24
-#define SENSOR_CONFIG1_TEMP_ENABLE             BIT(31)
-
-#define SENSOR_CONFIG2                         8
-#define SENSOR_CONFIG2_THERMA_SHIFT            16
-#define SENSOR_CONFIG2_THERMB_SHIFT            0
-
-#define SENSOR_PDIV                            0x1c0
-#define SENSOR_PDIV_T124                       0x8888
-#define SENSOR_HOTSPOT_OFF                     0x1c4
-#define SENSOR_HOTSPOT_OFF_T124                        0x00060600
-#define SENSOR_TEMP1                           0x1c8
-#define SENSOR_TEMP2                           0x1cc
-
-#define SENSOR_TEMP_MASK                       0xffff
-#define READBACK_VALUE_MASK                    0xff00
-#define READBACK_VALUE_SHIFT                   8
-#define READBACK_ADD_HALF                      BIT(7)
-#define READBACK_NEGATE                                BIT(0)
-
-#define FUSE_TSENSOR8_CALIB                    0x180
-#define FUSE_SPARE_REALIGNMENT_REG_0           0x1fc
-
-#define FUSE_TSENSOR_CALIB_CP_TS_BASE_MASK     0x1fff
-#define FUSE_TSENSOR_CALIB_FT_TS_BASE_MASK     (0x1fff << 13)
-#define FUSE_TSENSOR_CALIB_FT_TS_BASE_SHIFT    13
-
-#define FUSE_TSENSOR8_CALIB_CP_TS_BASE_MASK    0x3ff
-#define FUSE_TSENSOR8_CALIB_FT_TS_BASE_MASK    (0x7ff << 10)
-#define FUSE_TSENSOR8_CALIB_FT_TS_BASE_SHIFT   10
-
-#define FUSE_SPARE_REALIGNMENT_REG_SHIFT_CP_MASK 0x3f
-#define FUSE_SPARE_REALIGNMENT_REG_SHIFT_FT_MASK (0x1f << 21)
-#define FUSE_SPARE_REALIGNMENT_REG_SHIFT_FT_SHIFT 21
-
-#define NOMINAL_CALIB_FT_T124                  105
-#define NOMINAL_CALIB_CP_T124                  25
-
-struct tegra_tsensor_configuration {
-       u32 tall, tsample, tiddq_en, ten_count, pdiv, tsample_ate, pdiv_ate;
-};
-
-struct tegra_tsensor {
-       const struct tegra_tsensor_configuration *config;
-       u32 base, calib_fuse_offset;
-       /* Correction values used to modify values read from calibration fuses */
-       s32 fuse_corr_alpha, fuse_corr_beta;
-};
-
-struct tegra_thermctl_zone {
-       void __iomem *reg;
-       unsigned int shift;
-};
-
-static const struct tegra_tsensor_configuration t124_tsensor_config = {
-       .tall = 16300,
-       .tsample = 120,
-       .tiddq_en = 1,
-       .ten_count = 1,
-       .pdiv = 8,
-       .tsample_ate = 480,
-       .pdiv_ate = 8
-};
-
-static const struct tegra_tsensor t124_tsensors[] = {
-       {
-               .config = &t124_tsensor_config,
-               .base = 0xc0,
-               .calib_fuse_offset = 0x098,
-               .fuse_corr_alpha = 1135400,
-               .fuse_corr_beta = -6266900,
-       },
-       {
-               .config = &t124_tsensor_config,
-               .base = 0xe0,
-               .calib_fuse_offset = 0x084,
-               .fuse_corr_alpha = 1122220,
-               .fuse_corr_beta = -5700700,
-       },
-       {
-               .config = &t124_tsensor_config,
-               .base = 0x100,
-               .calib_fuse_offset = 0x088,
-               .fuse_corr_alpha = 1127000,
-               .fuse_corr_beta = -6768200,
-       },
-       {
-               .config = &t124_tsensor_config,
-               .base = 0x120,
-               .calib_fuse_offset = 0x12c,
-               .fuse_corr_alpha = 1110900,
-               .fuse_corr_beta = -6232000,
-       },
-       {
-               .config = &t124_tsensor_config,
-               .base = 0x140,
-               .calib_fuse_offset = 0x158,
-               .fuse_corr_alpha = 1122300,
-               .fuse_corr_beta = -5936400,
-       },
-       {
-               .config = &t124_tsensor_config,
-               .base = 0x160,
-               .calib_fuse_offset = 0x15c,
-               .fuse_corr_alpha = 1145700,
-               .fuse_corr_beta = -7124600,
-       },
-       {
-               .config = &t124_tsensor_config,
-               .base = 0x180,
-               .calib_fuse_offset = 0x154,
-               .fuse_corr_alpha = 1120100,
-               .fuse_corr_beta = -6000500,
-       },
-       {
-               .config = &t124_tsensor_config,
-               .base = 0x1a0,
-               .calib_fuse_offset = 0x160,
-               .fuse_corr_alpha = 1106500,
-               .fuse_corr_beta = -6729300,
-       },
-};
-
-struct tegra_soctherm {
-       struct reset_control *reset;
-       struct clk *clock_tsensor;
-       struct clk *clock_soctherm;
-       void __iomem *regs;
-
-       struct thermal_zone_device *thermctl_tzs[4];
-};
-
-struct tsensor_shared_calibration {
-       u32 base_cp, base_ft;
-       u32 actual_temp_cp, actual_temp_ft;
-};
-
-static int calculate_shared_calibration(struct tsensor_shared_calibration *r)
-{
-       u32 val, shifted_cp, shifted_ft;
-       int err;
-
-       err = tegra_fuse_readl(FUSE_TSENSOR8_CALIB, &val);
-       if (err)
-               return err;
-       r->base_cp = val & FUSE_TSENSOR8_CALIB_CP_TS_BASE_MASK;
-       r->base_ft = (val & FUSE_TSENSOR8_CALIB_FT_TS_BASE_MASK)
-               >> FUSE_TSENSOR8_CALIB_FT_TS_BASE_SHIFT;
-       val = ((val & FUSE_SPARE_REALIGNMENT_REG_SHIFT_FT_MASK)
-               >> FUSE_SPARE_REALIGNMENT_REG_SHIFT_FT_SHIFT);
-       shifted_ft = sign_extend32(val, 4);
-
-       err = tegra_fuse_readl(FUSE_SPARE_REALIGNMENT_REG_0, &val);
-       if (err)
-               return err;
-       shifted_cp = sign_extend32(val, 5);
-
-       r->actual_temp_cp = 2 * NOMINAL_CALIB_CP_T124 + shifted_cp;
-       r->actual_temp_ft = 2 * NOMINAL_CALIB_FT_T124 + shifted_ft;
-
-       return 0;
-}
-
-static s64 div64_s64_precise(s64 a, s64 b)
-{
-       s64 r, al;
-
-       /* Scale up for increased precision division */
-       al = a << 16;
-
-       r = div64_s64(al * 2 + 1, 2 * b);
-       return r >> 16;
-}
-
-static int
-calculate_tsensor_calibration(const struct tegra_tsensor *sensor,
-                             const struct tsensor_shared_calibration *shared,
-                             u32 *calib)
-{
-       u32 val;
-       s32 actual_tsensor_ft, actual_tsensor_cp, delta_sens, delta_temp,
-           mult, div;
-       s16 therma, thermb;
-       s64 tmp;
-       int err;
-
-       err = tegra_fuse_readl(sensor->calib_fuse_offset, &val);
-       if (err)
-               return err;
-
-       actual_tsensor_cp = (shared->base_cp * 64) + sign_extend32(val, 12);
-       val = (val & FUSE_TSENSOR_CALIB_FT_TS_BASE_MASK)
-               >> FUSE_TSENSOR_CALIB_FT_TS_BASE_SHIFT;
-       actual_tsensor_ft = (shared->base_ft * 32) + sign_extend32(val, 12);
-
-       delta_sens = actual_tsensor_ft - actual_tsensor_cp;
-       delta_temp = shared->actual_temp_ft - shared->actual_temp_cp;
-
-       mult = sensor->config->pdiv * sensor->config->tsample_ate;
-       div = sensor->config->tsample * sensor->config->pdiv_ate;
-
-       therma = div64_s64_precise((s64) delta_temp * (1LL << 13) * mult,
-                                  (s64) delta_sens * div);
-
-       tmp = (s64)actual_tsensor_ft * shared->actual_temp_cp -
-             (s64)actual_tsensor_cp * shared->actual_temp_ft;
-       thermb = div64_s64_precise(tmp, (s64)delta_sens);
-
-       therma = div64_s64_precise((s64)therma * sensor->fuse_corr_alpha,
-                                  (s64)1000000LL);
-       thermb = div64_s64_precise((s64)thermb * sensor->fuse_corr_alpha +
-                                  sensor->fuse_corr_beta, (s64)1000000LL);
-
-       *calib = ((u16)therma << SENSOR_CONFIG2_THERMA_SHIFT) |
-                ((u16)thermb << SENSOR_CONFIG2_THERMB_SHIFT);
-
-       return 0;
-}
-
-static int enable_tsensor(struct tegra_soctherm *tegra,
-                         const struct tegra_tsensor *sensor,
-                         const struct tsensor_shared_calibration *shared)
-{
-       void __iomem *base = tegra->regs + sensor->base;
-       unsigned int val;
-       u32 calib;
-       int err;
-
-       err = calculate_tsensor_calibration(sensor, shared, &calib);
-       if (err)
-               return err;
-
-       val = sensor->config->tall << SENSOR_CONFIG0_TALL_SHIFT;
-       writel(val, base + SENSOR_CONFIG0);
-
-       val  = (sensor->config->tsample - 1) << SENSOR_CONFIG1_TSAMPLE_SHIFT;
-       val |= sensor->config->tiddq_en << SENSOR_CONFIG1_TIDDQ_EN_SHIFT;
-       val |= sensor->config->ten_count << SENSOR_CONFIG1_TEN_COUNT_SHIFT;
-       val |= SENSOR_CONFIG1_TEMP_ENABLE;
-       writel(val, base + SENSOR_CONFIG1);
-
-       writel(calib, base + SENSOR_CONFIG2);
-
-       return 0;
-}
-
-/*
- * Translate from soctherm readback format to millicelsius.
- * The soctherm readback format in bits is as follows:
- *   TTTTTTTT H______N
- * where T's contain the temperature in Celsius,
- * H denotes an addition of 0.5 Celsius and N denotes negation
- * of the final value.
- */
-static int translate_temp(u16 val)
-{
-       long t;
-
-       t = ((val & READBACK_VALUE_MASK) >> READBACK_VALUE_SHIFT) * 1000;
-       if (val & READBACK_ADD_HALF)
-               t += 500;
-       if (val & READBACK_NEGATE)
-               t *= -1;
-
-       return t;
-}
-
-static int tegra_thermctl_get_temp(void *data, int *out_temp)
-{
-       struct tegra_thermctl_zone *zone = data;
-       u32 val;
-
-       val = (readl(zone->reg) >> zone->shift) & SENSOR_TEMP_MASK;
-       *out_temp = translate_temp(val);
-
-       return 0;
-}
-
-static const struct thermal_zone_of_device_ops tegra_of_thermal_ops = {
-       .get_temp = tegra_thermctl_get_temp,
-};
-
-static const struct of_device_id tegra_soctherm_of_match[] = {
-       { .compatible = "nvidia,tegra124-soctherm" },
-       { },
-};
-MODULE_DEVICE_TABLE(of, tegra_soctherm_of_match);
-
-struct thermctl_zone_desc {
-       unsigned int offset;
-       unsigned int shift;
-};
-
-static const struct thermctl_zone_desc t124_thermctl_temp_zones[] = {
-       { SENSOR_TEMP1, 16 },
-       { SENSOR_TEMP2, 16 },
-       { SENSOR_TEMP1, 0 },
-       { SENSOR_TEMP2, 0 }
-};
-
-static int tegra_soctherm_probe(struct platform_device *pdev)
-{
-       struct tegra_soctherm *tegra;
-       struct thermal_zone_device *tz;
-       struct tsensor_shared_calibration shared_calib;
-       struct resource *res;
-       unsigned int i;
-       int err;
-
-       const struct tegra_tsensor *tsensors = t124_tsensors;
-
-       tegra = devm_kzalloc(&pdev->dev, sizeof(*tegra), GFP_KERNEL);
-       if (!tegra)
-               return -ENOMEM;
-
-       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-       tegra->regs = devm_ioremap_resource(&pdev->dev, res);
-       if (IS_ERR(tegra->regs))
-               return PTR_ERR(tegra->regs);
-
-       tegra->reset = devm_reset_control_get(&pdev->dev, "soctherm");
-       if (IS_ERR(tegra->reset)) {
-               dev_err(&pdev->dev, "can't get soctherm reset\n");
-               return PTR_ERR(tegra->reset);
-       }
-
-       tegra->clock_tsensor = devm_clk_get(&pdev->dev, "tsensor");
-       if (IS_ERR(tegra->clock_tsensor)) {
-               dev_err(&pdev->dev, "can't get tsensor clock\n");
-               return PTR_ERR(tegra->clock_tsensor);
-       }
-
-       tegra->clock_soctherm = devm_clk_get(&pdev->dev, "soctherm");
-       if (IS_ERR(tegra->clock_soctherm)) {
-               dev_err(&pdev->dev, "can't get soctherm clock\n");
-               return PTR_ERR(tegra->clock_soctherm);
-       }
-
-       reset_control_assert(tegra->reset);
-
-       err = clk_prepare_enable(tegra->clock_soctherm);
-       if (err)
-               return err;
-
-       err = clk_prepare_enable(tegra->clock_tsensor);
-       if (err) {
-               clk_disable_unprepare(tegra->clock_soctherm);
-               return err;
-       }
-
-       reset_control_deassert(tegra->reset);
-
-       /* Initialize raw sensors */
-
-       err = calculate_shared_calibration(&shared_calib);
-       if (err)
-               goto disable_clocks;
-
-       for (i = 0; i < ARRAY_SIZE(t124_tsensors); ++i) {
-               err = enable_tsensor(tegra, tsensors + i, &shared_calib);
-               if (err)
-                       goto disable_clocks;
-       }
-
-       writel(SENSOR_PDIV_T124, tegra->regs + SENSOR_PDIV);
-       writel(SENSOR_HOTSPOT_OFF_T124, tegra->regs + SENSOR_HOTSPOT_OFF);
-
-       /* Initialize thermctl sensors */
-
-       for (i = 0; i < ARRAY_SIZE(tegra->thermctl_tzs); ++i) {
-               struct tegra_thermctl_zone *zone =
-                       devm_kzalloc(&pdev->dev, sizeof(*zone), GFP_KERNEL);
-               if (!zone) {
-                       err = -ENOMEM;
-                       goto unregister_tzs;
-               }
-
-               zone->reg = tegra->regs + t124_thermctl_temp_zones[i].offset;
-               zone->shift = t124_thermctl_temp_zones[i].shift;
-
-               tz = thermal_zone_of_sensor_register(&pdev->dev, i, zone,
-                                                    &tegra_of_thermal_ops);
-               if (IS_ERR(tz)) {
-                       err = PTR_ERR(tz);
-                       dev_err(&pdev->dev, "failed to register sensor: %d\n",
-                               err);
-                       goto unregister_tzs;
-               }
-
-               tegra->thermctl_tzs[i] = tz;
-       }
-
-       return 0;
-
-unregister_tzs:
-       while (i--)
-               thermal_zone_of_sensor_unregister(&pdev->dev,
-                                                 tegra->thermctl_tzs[i]);
-
-disable_clocks:
-       clk_disable_unprepare(tegra->clock_tsensor);
-       clk_disable_unprepare(tegra->clock_soctherm);
-
-       return err;
-}
-
-static int tegra_soctherm_remove(struct platform_device *pdev)
-{
-       struct tegra_soctherm *tegra = platform_get_drvdata(pdev);
-       unsigned int i;
-
-       for (i = 0; i < ARRAY_SIZE(tegra->thermctl_tzs); ++i) {
-               thermal_zone_of_sensor_unregister(&pdev->dev,
-                                                 tegra->thermctl_tzs[i]);
-       }
-
-       clk_disable_unprepare(tegra->clock_tsensor);
-       clk_disable_unprepare(tegra->clock_soctherm);
-
-       return 0;
-}
-
-static struct platform_driver tegra_soctherm_driver = {
-       .probe = tegra_soctherm_probe,
-       .remove = tegra_soctherm_remove,
-       .driver = {
-               .name = "tegra-soctherm",
-               .of_match_table = tegra_soctherm_of_match,
-       },
-};
-module_platform_driver(tegra_soctherm_driver);
-
-MODULE_AUTHOR("Mikko Perttunen <mperttunen@nvidia.com>");
-MODULE_DESCRIPTION("NVIDIA Tegra SOCTHERM thermal management driver");
-MODULE_LICENSE("GPL v2");
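
The removed translate_temp() above documents the SOC_THERM readback layout (TTTTTTTT H______N); a worked decode of one value makes the format concrete.

/*
 * Worked example of the readback format described in translate_temp():
 *
 *   val = 0x2d81
 *   T (bits 15:8) = 0x2d  ->  45 * 1000 = 45000
 *   H (bit 7) set         ->  + 500
 *   N (bit 0) set         ->  negate
 *
 *   result: -45500 millicelsius (-45.5 degrees C)
 */
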
diff --git a/drivers/thermal/thermal-generic-adc.c b/drivers/thermal/thermal-generic-adc.c
new file mode 100644 (file)
index 0000000..73f55d6
--- /dev/null
@@ -0,0 +1,182 @@
+/*
+ * Generic ADC thermal driver
+ *
+ * Copyright (C) 2016 NVIDIA CORPORATION. All rights reserved.
+ *
+ * Author: Laxman Dewangan <ldewangan@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/iio/consumer.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/thermal.h>
+
+struct gadc_thermal_info {
+       struct device *dev;
+       struct thermal_zone_device *tz_dev;
+       struct iio_channel *channel;
+       s32 *lookup_table;
+       int nlookup_table;
+};
+
+static int gadc_thermal_adc_to_temp(struct gadc_thermal_info *gti, int val)
+{
+       int temp, adc_hi, adc_lo;
+       int i;
+
+       for (i = 0; i < gti->nlookup_table; i++) {
+               if (val >= gti->lookup_table[2 * i + 1])
+                       break;
+       }
+
+       if (i == 0) {
+               temp = gti->lookup_table[0];
+       } else if (i >= (gti->nlookup_table - 1)) {
+               temp = gti->lookup_table[2 * (gti->nlookup_table - 1)];
+       } else {
+               adc_hi = gti->lookup_table[2 * i - 1];
+               adc_lo = gti->lookup_table[2 * i + 1];
+               temp = gti->lookup_table[2 * i];
+               temp -= ((val - adc_lo) * 1000) / (adc_hi - adc_lo);
+       }
+
+       return temp;
+}
+
+static int gadc_thermal_get_temp(void *data, int *temp)
+{
+       struct gadc_thermal_info *gti = data;
+       int val;
+       int ret;
+
+       ret = iio_read_channel_processed(gti->channel, &val);
+       if (ret < 0) {
+               dev_err(gti->dev, "IIO channel read failed %d\n", ret);
+               return ret;
+       }
+       *temp = gadc_thermal_adc_to_temp(gti, val);
+
+       return 0;
+}
+
+static const struct thermal_zone_of_device_ops gadc_thermal_ops = {
+       .get_temp = gadc_thermal_get_temp,
+};
+
+static int gadc_thermal_read_linear_lookup_table(struct device *dev,
+                                                struct gadc_thermal_info *gti)
+{
+       struct device_node *np = dev->of_node;
+       int ntable;
+       int ret;
+
+       ntable = of_property_count_elems_of_size(np, "temperature-lookup-table",
+                                                sizeof(u32));
+       if (ntable < 0) {
+               dev_err(dev, "Lookup table is not provided\n");
+               return ntable;
+       }
+
+       if (ntable % 2) {
+               dev_err(dev, "Pair of temperature vs ADC read value missing\n");
+               return -EINVAL;
+       }
+
+       gti->lookup_table = devm_kzalloc(dev, sizeof(*gti->lookup_table) *
+                                        ntable, GFP_KERNEL);
+       if (!gti->lookup_table)
+               return -ENOMEM;
+
+       ret = of_property_read_u32_array(np, "temperature-lookup-table",
+                                        (u32 *)gti->lookup_table, ntable);
+       if (ret < 0) {
+               dev_err(dev, "Failed to read temperature lookup table: %d\n",
+                       ret);
+               return ret;
+       }
+
+       gti->nlookup_table = ntable / 2;
+
+       return 0;
+}
+
+static int gadc_thermal_probe(struct platform_device *pdev)
+{
+       struct gadc_thermal_info *gti;
+       int ret;
+
+       if (!pdev->dev.of_node) {
+               dev_err(&pdev->dev, "Only DT based supported\n");
+               return -ENODEV;
+       }
+
+       gti = devm_kzalloc(&pdev->dev, sizeof(*gti), GFP_KERNEL);
+       if (!gti)
+               return -ENOMEM;
+
+       ret = gadc_thermal_read_linear_lookup_table(&pdev->dev, gti);
+       if (ret < 0)
+               return ret;
+
+       gti->dev = &pdev->dev;
+       platform_set_drvdata(pdev, gti);
+
+       gti->channel = iio_channel_get(&pdev->dev, "sensor-channel");
+       if (IS_ERR(gti->channel)) {
+               ret = PTR_ERR(gti->channel);
+               dev_err(&pdev->dev, "IIO channel not found: %d\n", ret);
+               return ret;
+       }
+
+       gti->tz_dev = thermal_zone_of_sensor_register(&pdev->dev, 0,
+                                                     gti, &gadc_thermal_ops);
+       if (IS_ERR(gti->tz_dev)) {
+               ret = PTR_ERR(gti->tz_dev);
+               dev_err(&pdev->dev, "Thermal zone sensor register failed: %d\n",
+                       ret);
+               goto sensor_fail;
+       }
+
+       return 0;
+
+sensor_fail:
+       iio_channel_release(gti->channel);
+
+       return ret;
+}
+
+static int gadc_thermal_remove(struct platform_device *pdev)
+{
+       struct gadc_thermal_info *gti = platform_get_drvdata(pdev);
+
+       thermal_zone_of_sensor_unregister(&pdev->dev, gti->tz_dev);
+       iio_channel_release(gti->channel);
+
+       return 0;
+}
+
+static const struct of_device_id of_adc_thermal_match[] = {
+       { .compatible = "generic-adc-thermal", },
+       {},
+};
+MODULE_DEVICE_TABLE(of, of_adc_thermal_match);
+
+static struct platform_driver gadc_thermal_driver = {
+       .driver = {
+               .name = "generic-adc-thermal",
+               .of_match_table = of_adc_thermal_match,
+       },
+       .probe = gadc_thermal_probe,
+       .remove = gadc_thermal_remove,
+};
+
+module_platform_driver(gadc_thermal_driver);
+
+MODULE_AUTHOR("Laxman Dewangan <ldewangan@nvidia.com>");
+MODULE_DESCRIPTION("Generic ADC thermal driver using IIO framework with DT");
+MODULE_LICENSE("GPL v2");
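
gadc_thermal_adc_to_temp() above scans the DT "temperature-lookup-table" (temperature/ADC pairs, temperatures ascending as ADC readings fall) for the first entry whose ADC value the processed reading meets, then interpolates linearly between the two neighbouring entries; the fixed *1000 step implies adjacent entries are expected to be 1000 millicelsius apart. A worked example under that assumption, with a made-up table fragment:

/*
 * Lookup table fragment (millicelsius, ADC counts):
 *   ... 25000 620,  26000 610,  27000 600 ...
 *
 * For a processed reading val = 615 the scan breaks at the 26000/610
 * entry (615 >= 610), giving adc_hi = 620, adc_lo = 610:
 *
 *   temp = 26000 - ((615 - 610) * 1000) / (620 - 610) = 25500
 */
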
index b213a12..15c0a9a 100644 (file)
@@ -337,7 +337,7 @@ int ti_thermal_expose_sensor(struct ti_bandgap *bgp, int id,
                return -EINVAL;
 
        /* in case this is specified by DT */
-       data->ti_thermal = thermal_zone_of_sensor_register(bgp->dev, id,
+       data->ti_thermal = devm_thermal_zone_of_sensor_register(bgp->dev, id,
                                        data, &ti_of_thermal_ops);
        if (IS_ERR(data->ti_thermal)) {
                /* Create thermal zone */
@@ -368,9 +368,6 @@ int ti_thermal_remove_sensor(struct ti_bandgap *bgp, int id)
        if (data && data->ti_thermal) {
                if (data->our_zone)
                        thermal_zone_device_unregister(data->ti_thermal);
-               else
-                       thermal_zone_of_sensor_unregister(bgp->dev,
-                                                         data->ti_thermal);
        }
 
        return 0;
index 7fc919f..97f0a2b 100644 (file)
@@ -555,7 +555,7 @@ static int pkg_temp_thermal_cpu_callback(struct notifier_block *nfb,
 {
        unsigned int cpu = (unsigned long) hcpu;
 
-       switch (action) {
+       switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_ONLINE:
        case CPU_DOWN_FAILED:
                get_core_online(cpu);
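
Masking off CPU_TASKS_FROZEN in the switch above lets the existing CPU_ONLINE / CPU_DOWN_FAILED / CPU_DOWN_PREPARE cases also match the _FROZEN variants the hotplug notifier delivers across suspend/resume; those variants are simply the base action with the frozen flag ORed in, as in these definitions from include/linux/cpu.h of this era (shown here only for illustration):

#define CPU_TASKS_FROZEN	0x0010
#define CPU_ONLINE_FROZEN	(CPU_ONLINE | CPU_TASKS_FROZEN)
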
index 82c4d2e..9510305 100644 (file)
@@ -120,17 +120,6 @@ config UNIX98_PTYS
          All modern Linux systems use the Unix98 ptys.  Say Y unless
          you're on an embedded system and want to conserve memory.
 
-config DEVPTS_MULTIPLE_INSTANCES
-       bool "Support multiple instances of devpts"
-       depends on UNIX98_PTYS
-       default n
-       ---help---
-         Enable support for multiple instances of devpts filesystem.
-         If you want to have isolated PTY namespaces (eg: in containers),
-         say Y here.  Otherwise, say N. If enabled, each mount of devpts
-         filesystem with the '-o newinstance' option will create an
-         independent PTY namespace.
-
 config LEGACY_PTYS
        bool "Legacy (BSD) PTY support"
        default y
index dd4b841..f856c45 100644 (file)
@@ -668,7 +668,7 @@ static void pty_unix98_remove(struct tty_driver *driver, struct tty_struct *tty)
        else
                fsi = tty->link->driver_data;
        devpts_kill_index(fsi, tty->index);
-       devpts_put_ref(fsi);
+       devpts_release(fsi);
 }
 
 static const struct tty_operations ptm_unix98_ops = {
@@ -733,10 +733,11 @@ static int ptmx_open(struct inode *inode, struct file *filp)
        if (retval)
                return retval;
 
-       fsi = devpts_get_ref(inode, filp);
-       retval = -ENODEV;
-       if (!fsi)
+       fsi = devpts_acquire(filp);
+       if (IS_ERR(fsi)) {
+               retval = PTR_ERR(fsi);
                goto out_free_file;
+       }
 
        /* find a device that is not in use. */
        mutex_lock(&devpts_mutex);
@@ -745,7 +746,7 @@ static int ptmx_open(struct inode *inode, struct file *filp)
 
        retval = index;
        if (index < 0)
-               goto out_put_ref;
+               goto out_put_fsi;
 
 
        mutex_lock(&tty_mutex);
@@ -789,8 +790,8 @@ err_release:
        return retval;
 out:
        devpts_kill_index(fsi, index);
-out_put_ref:
-       devpts_put_ref(fsi);
+out_put_fsi:
+       devpts_release(fsi);
 out_free_file:
        tty_free_file(filp);
        return retval;
index a2aa655..1b7331e 100644 (file)
@@ -2360,7 +2360,7 @@ static int pl011_probe_dt_alias(int index, struct device *dev)
                return ret;
 
        ret = of_alias_get_id(np, "serial");
-       if (IS_ERR_VALUE(ret)) {
+       if (ret < 0) {
                seen_dev_without_alias = true;
                ret = index;
        } else {
index 1897106..699447a 100644 (file)
@@ -654,7 +654,7 @@ static int sprd_probe_dt_alias(int index, struct device *dev)
                return ret;
 
        ret = of_alias_get_id(np, "serial");
-       if (IS_ERR_VALUE(ret))
+       if (ret < 0)
                ret = index;
        else if (ret >= ARRAY_SIZE(sprd_port) || sprd_port[ret] != NULL) {
                dev_warn(dev, "requested serial port %d not available.\n", ret);
index 2ace029..35fe3c8 100644 (file)
@@ -1290,15 +1290,6 @@ static void usbg_release_cmd(struct se_cmd *se_cmd)
        percpu_ida_free(&se_sess->sess_tag_pool, se_cmd->map_tag);
 }
 
-static int usbg_shutdown_session(struct se_session *se_sess)
-{
-       return 0;
-}
-
-static void usbg_close_session(struct se_session *se_sess)
-{
-}
-
 static u32 usbg_sess_get_index(struct se_session *se_sess)
 {
        return 0;
@@ -1735,8 +1726,6 @@ static const struct target_core_fabric_ops usbg_ops = {
        .tpg_check_prod_mode_write_protect = usbg_check_false,
        .tpg_get_inst_index             = usbg_tpg_get_inst_index,
        .release_cmd                    = usbg_release_cmd,
-       .shutdown_session               = usbg_shutdown_session,
-       .close_session                  = usbg_close_session,
        .sess_get_index                 = usbg_sess_get_index,
        .sess_get_initiator_sid         = NULL,
        .write_pending                  = usbg_send_write_request,
index 712a849..188b1ff 100644 (file)
@@ -113,6 +113,35 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)

 static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev);
 static void vfio_pci_disable(struct vfio_pci_device *vdev);
 
+/*
+ * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
+ * _and_ the ability to detect when the device is asserting INTx via PCI_STATUS.
+ * If a device implements the former but not the latter, we would typically
+ * expect broken_intx_masking to be set and require an exclusive interrupt.
+ * However, since we do have control of the device's ability to assert INTx,
+ * we can instead pretend that the device does not implement INTx, virtualizing
+ * the pin register to report zero and maintaining DisINTx set on the host.
+ */
+static bool vfio_pci_nointx(struct pci_dev *pdev)
+{
+       switch (pdev->vendor) {
+       case PCI_VENDOR_ID_INTEL:
+               switch (pdev->device) {
+               /* All i40e (XL710/X710) 10/20/40GbE NICs */
+               case 0x1572:
+               case 0x1574:
+               case 0x1580 ... 0x1581:
+               case 0x1583 ... 0x1589:
+               case 0x37d0 ... 0x37d2:
+                       return true;
+               default:
+                       return false;
+               }
+       }
+
+       return false;
+}
+
 static int vfio_pci_enable(struct vfio_pci_device *vdev)
 {
        struct pci_dev *pdev = vdev->pdev;
@@ -136,23 +165,29 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev)
                pr_debug("%s: Couldn't store %s saved state\n",
                         __func__, dev_name(&pdev->dev));
 
-       ret = vfio_config_init(vdev);
-       if (ret) {
-               kfree(vdev->pci_saved_state);
-               vdev->pci_saved_state = NULL;
-               pci_disable_device(pdev);
-               return ret;
+       if (likely(!nointxmask)) {
+               if (vfio_pci_nointx(pdev)) {
+                       dev_info(&pdev->dev, "Masking broken INTx support\n");
+                       vdev->nointx = true;
+                       pci_intx(pdev, 0);
+               } else
+                       vdev->pci_2_3 = pci_intx_mask_supported(pdev);
        }
 
-       if (likely(!nointxmask))
-               vdev->pci_2_3 = pci_intx_mask_supported(pdev);
-
        pci_read_config_word(pdev, PCI_COMMAND, &cmd);
        if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
                cmd &= ~PCI_COMMAND_INTX_DISABLE;
                pci_write_config_word(pdev, PCI_COMMAND, cmd);
        }
 
+       ret = vfio_config_init(vdev);
+       if (ret) {
+               kfree(vdev->pci_saved_state);
+               vdev->pci_saved_state = NULL;
+               pci_disable_device(pdev);
+               return ret;
+       }
+
        msix_pos = pdev->msix_cap;
        if (msix_pos) {
                u16 flags;
@@ -304,7 +339,7 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
        if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
                u8 pin;
                pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);
-               if (IS_ENABLED(CONFIG_VFIO_PCI_INTX) && pin)
+               if (IS_ENABLED(CONFIG_VFIO_PCI_INTX) && !vdev->nointx && pin)
                        return 1;
 
        } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
index 142c533..688691d 100644 (file)
@@ -408,6 +408,7 @@ static void vfio_bar_restore(struct vfio_pci_device *vdev)
 {
        struct pci_dev *pdev = vdev->pdev;
        u32 *rbar = vdev->rbar;
+       u16 cmd;
        int i;
 
        if (pdev->is_virtfn)
@@ -420,6 +421,12 @@ static void vfio_bar_restore(struct vfio_pci_device *vdev)
                pci_user_write_config_dword(pdev, i, *rbar);
 
        pci_user_write_config_dword(pdev, PCI_ROM_ADDRESS, *rbar);
+
+       if (vdev->nointx) {
+               pci_user_read_config_word(pdev, PCI_COMMAND, &cmd);
+               cmd |= PCI_COMMAND_INTX_DISABLE;
+               pci_user_write_config_word(pdev, PCI_COMMAND, cmd);
+       }
 }
 
 static __le32 vfio_generate_bar_flags(struct pci_dev *pdev, int bar)
@@ -515,6 +522,23 @@ static int vfio_basic_config_read(struct vfio_pci_device *vdev, int pos,
        return count;
 }
 
+/* Test whether BARs match the value we think they should contain */
+static bool vfio_need_bar_restore(struct vfio_pci_device *vdev)
+{
+       int i = 0, pos = PCI_BASE_ADDRESS_0, ret;
+       u32 bar;
+
+       for (; pos <= PCI_BASE_ADDRESS_5; i++, pos += 4) {
+               if (vdev->rbar[i]) {
+                       ret = pci_user_read_config_dword(vdev->pdev, pos, &bar);
+                       if (ret || vdev->rbar[i] != bar)
+                               return true;
+               }
+       }
+
+       return false;
+}
+
 static int vfio_basic_config_write(struct vfio_pci_device *vdev, int pos,
                                   int count, struct perm_bits *perm,
                                   int offset, __le32 val)
@@ -553,7 +577,8 @@ static int vfio_basic_config_write(struct vfio_pci_device *vdev, int pos,
                 * SR-IOV devices will trigger this, but we catch them later
                 */
                if ((new_mem && virt_mem && !phys_mem) ||
-                   (new_io && virt_io && !phys_io))
+                   (new_io && virt_io && !phys_io) ||
+                   vfio_need_bar_restore(vdev))
                        vfio_bar_restore(vdev);
        }
 
@@ -724,7 +749,8 @@ static int vfio_vpd_config_write(struct vfio_pci_device *vdev, int pos,
                if (pci_write_vpd(pdev, addr & ~PCI_VPD_ADDR_F, 4, &data) != 4)
                        return count;
        } else {
-               if (pci_read_vpd(pdev, addr, 4, &data) != 4)
+               data = 0;
+               if (pci_read_vpd(pdev, addr, 4, &data) < 0)
                        return count;
                *pdata = cpu_to_le32(data);
        }
@@ -1124,9 +1150,12 @@ static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos)
                        return pcibios_err_to_errno(ret);
 
                if (PCI_X_CMD_VERSION(word)) {
-                       /* Test for extended capabilities */
-                       pci_read_config_dword(pdev, PCI_CFG_SPACE_SIZE, &dword);
-                       vdev->extended_caps = (dword != 0);
+                       if (pdev->cfg_size > PCI_CFG_SPACE_SIZE) {
+                               /* Test for extended capabilities */
+                               pci_read_config_dword(pdev, PCI_CFG_SPACE_SIZE,
+                                                     &dword);
+                               vdev->extended_caps = (dword != 0);
+                       }
                        return PCI_CAP_PCIX_SIZEOF_V2;
                } else
                        return PCI_CAP_PCIX_SIZEOF_V0;
@@ -1138,9 +1167,11 @@ static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos)
 
                return byte;
        case PCI_CAP_ID_EXP:
-               /* Test for extended capabilities */
-               pci_read_config_dword(pdev, PCI_CFG_SPACE_SIZE, &dword);
-               vdev->extended_caps = (dword != 0);
+               if (pdev->cfg_size > PCI_CFG_SPACE_SIZE) {
+                       /* Test for extended capabilities */
+                       pci_read_config_dword(pdev, PCI_CFG_SPACE_SIZE, &dword);
+                       vdev->extended_caps = (dword != 0);
+               }
 
                /* length based on version */
                if ((pcie_caps_reg(pdev) & PCI_EXP_FLAGS_VERS) == 1)
@@ -1545,7 +1576,7 @@ int vfio_config_init(struct vfio_pci_device *vdev)
                *(__le16 *)&vconfig[PCI_DEVICE_ID] = cpu_to_le16(pdev->device);
        }
 
-       if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX))
+       if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) || vdev->nointx)
                vconfig[PCI_INTERRUPT_PIN] = 0;
 
        ret = vfio_cap_init(vdev);
index e9ea3fe..15ecfc9 100644 (file)
@@ -228,9 +228,9 @@ static int vfio_intx_set_signal(struct vfio_pci_device *vdev, int fd)
 
 static void vfio_intx_disable(struct vfio_pci_device *vdev)
 {
-       vfio_intx_set_signal(vdev, -1);
        vfio_virqfd_disable(&vdev->ctx[0].unmask);
        vfio_virqfd_disable(&vdev->ctx[0].mask);
+       vfio_intx_set_signal(vdev, -1);
        vdev->irq_type = VFIO_PCI_NUM_IRQS;
        vdev->num_ctx = 0;
        kfree(vdev->ctx);
@@ -401,13 +401,13 @@ static void vfio_msi_disable(struct vfio_pci_device *vdev, bool msix)
        struct pci_dev *pdev = vdev->pdev;
        int i;
 
-       vfio_msi_set_block(vdev, 0, vdev->num_ctx, NULL, msix);
-
        for (i = 0; i < vdev->num_ctx; i++) {
                vfio_virqfd_disable(&vdev->ctx[i].unmask);
                vfio_virqfd_disable(&vdev->ctx[i].mask);
        }
 
+       vfio_msi_set_block(vdev, 0, vdev->num_ctx, NULL, msix);
+
        if (msix) {
                pci_disable_msix(vdev->pdev);
                kfree(vdev->msix);
index 8a7d546..016c14a 100644 (file)
@@ -83,6 +83,7 @@ struct vfio_pci_device {
        bool                    bardirty;
        bool                    has_vga;
        bool                    needs_reset;
+       bool                    nointx;
        struct pci_saved_state  *pci_saved_state;
        int                     refcnt;
        struct eventfd_ctx      *err_trigger;
index 3054e3f..80378dd 100644 (file)
@@ -331,14 +331,12 @@ static void tce_iommu_free_table(struct iommu_table *tbl);
 static void tce_iommu_release(void *iommu_data)
 {
        struct tce_container *container = iommu_data;
-       struct iommu_table_group *table_group;
        struct tce_iommu_group *tcegrp;
        long i;
 
        while (tce_groups_attached(container)) {
                tcegrp = list_first_entry(&container->group_list,
                                struct tce_iommu_group, next);
-               table_group = iommu_group_get_iommudata(tcegrp->grp);
                tce_iommu_detach_group(iommu_data, tcegrp->grp);
        }
 
index 15a6582..2ba1942 100644 (file)
@@ -515,7 +515,7 @@ static int map_try_harder(struct vfio_domain *domain, dma_addr_t iova,
                          unsigned long pfn, long npage, int prot)
 {
        long i;
-       int ret;
+       int ret = 0;
 
        for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
                ret = iommu_map(domain->domain, iova,
index 0e6fd55..9d6320e 100644 (file)
@@ -333,16 +333,6 @@ static void vhost_scsi_release_cmd(struct se_cmd *se_cmd)
        percpu_ida_free(&se_sess->sess_tag_pool, se_cmd->map_tag);
 }
 
-static int vhost_scsi_shutdown_session(struct se_session *se_sess)
-{
-       return 0;
-}
-
-static void vhost_scsi_close_session(struct se_session *se_sess)
-{
-       return;
-}
-
 static u32 vhost_scsi_sess_get_index(struct se_session *se_sess)
 {
        return 0;
@@ -2114,8 +2104,6 @@ static struct target_core_fabric_ops vhost_scsi_ops = {
        .tpg_get_inst_index             = vhost_scsi_tpg_get_inst_index,
        .release_cmd                    = vhost_scsi_release_cmd,
        .check_stop_free                = vhost_scsi_check_stop_free,
-       .shutdown_session               = vhost_scsi_shutdown_session,
-       .close_session                  = vhost_scsi_close_session,
        .sess_get_index                 = vhost_scsi_sess_get_index,
        .sess_get_initiator_sid         = NULL,
        .write_pending                  = vhost_scsi_write_pending,
index 35fe482..60d6c2a 100644 (file)
@@ -162,7 +162,7 @@ static int lm3630a_intr_config(struct lm3630a_chip *pchip)
 
 static void lm3630a_pwm_ctrl(struct lm3630a_chip *pchip, int br, int br_max)
 {
-       unsigned int period = pwm_get_period(pchip->pwmd);
+       unsigned int period = pchip->pdata->pwm_period;
        unsigned int duty = br * period / br_max;
 
        pwm_config(pchip->pwmd, duty, period);
@@ -424,8 +424,13 @@ static int lm3630a_probe(struct i2c_client *client,
                        dev_err(&client->dev, "fail : get pwm device\n");
                        return PTR_ERR(pchip->pwmd);
                }
+
+               /*
+                * FIXME: pwm_apply_args() should be removed when switching to
+                * the atomic PWM API.
+                */
+               pwm_apply_args(pchip->pwmd);
        }
-       pchip->pwmd->period = pdata->pwm_period;
 
        /* interrupt enable  : irq 0 is not allowed */
        pchip->irq = client->irq;
index daca9e6..e5b14f5 100644 (file)
@@ -246,6 +246,12 @@ static void lp855x_pwm_ctrl(struct lp855x *lp, int br, int max_br)
                        return;
 
                lp->pwm = pwm;
+
+               /*
+                * FIXME: pwm_apply_args() should be removed when switching to
+                * the atomic PWM API.
+                */
+               pwm_apply_args(pwm);
        }
 
        pwm_config(lp->pwm, duty, period);
index 5d583d7..cf869ec 100644 (file)
@@ -145,6 +145,12 @@ static void lp8788_pwm_ctrl(struct lp8788_bl *bl, int br, int max_br)
                }
 
                bl->pwm = pwm;
+
+               /*
+                * FIXME: pwm_apply_args() should be removed when switching to
+                * the atomic PWM API.
+                */
+               pwm_apply_args(pwm);
        }
 
        pwm_config(bl->pwm, duty, period);
index 64f9e1b..b2b366b 100644 (file)
@@ -201,6 +201,7 @@ static int pwm_backlight_probe(struct platform_device *pdev)
        struct device_node *node = pdev->dev.of_node;
        struct pwm_bl_data *pb;
        int initial_blank = FB_BLANK_UNBLANK;
+       struct pwm_args pargs;
        int ret;
 
        if (!data) {
@@ -306,17 +307,22 @@ static int pwm_backlight_probe(struct platform_device *pdev)
 
        dev_dbg(&pdev->dev, "got pwm for backlight\n");
 
+       /*
+        * FIXME: pwm_apply_args() should be removed when switching to
+        * the atomic PWM API.
+        */
+       pwm_apply_args(pb->pwm);
+
        /*
         * The DT case will set the pwm_period_ns field to 0 and store the
         * period, parsed from the DT, in the PWM device. For the non-DT case,
         * set the period from platform data if it has not already been set
         * via the PWM lookup table.
         */
-       pb->period = pwm_get_period(pb->pwm);
-       if (!pb->period && (data->pwm_period_ns > 0)) {
+       pwm_get_args(pb->pwm, &pargs);
+       pb->period = pargs.period;
+       if (!pb->period && (data->pwm_period_ns > 0))
                pb->period = data->pwm_period_ns;
-               pwm_set_period(pb->pwm, data->pwm_period_ns);
-       }
 
        pb->lth_brightness = data->lth_brightness * (pb->period / pb->scale);
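
The FIXME blocks in these backlight hunks all point at the new atomic PWM API. A rough sketch of what a converted brightness update could look like once pwm_apply_args() is dropped; the function name and its caller are hypothetical, only struct pwm_state, pwm_get_state() and pwm_apply_state() come from the PWM core:

	/* sketch: one atomic update instead of pwm_config() + pwm_enable() */
	static int pwm_bl_update_sketch(struct pwm_device *pwm, int brightness,
					int max)	/* caller ensures max > 0 */
	{
		struct pwm_state state;

		pwm_get_state(pwm, &state);	/* period/polarity already known */
		state.duty_cycle = DIV_ROUND_CLOSEST_ULL((u64)brightness *
							 state.period, max);
		state.enabled = brightness > 0;

		return pwm_apply_state(pwm, &state);
	}
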
 
index d8d583d..c229b1a 100644 (file)
@@ -713,7 +713,7 @@ static int da8xx_fb_config_clk_divider(struct da8xx_fb_par *par,
 
        if (par->lcdc_clk_rate != lcdc_clk_rate) {
                ret = clk_set_rate(par->lcdc_clk, lcdc_clk_rate);
-               if (IS_ERR_VALUE(ret)) {
+               if (ret) {
                        dev_err(par->dev,
                                "unable to set clock rate at %u\n",
                                lcdc_clk_rate);
@@ -784,7 +784,7 @@ static int lcd_init(struct da8xx_fb_par *par, const struct lcd_ctrl_config *cfg,
        int ret = 0;
 
        ret = da8xx_fb_calc_config_clk_divider(par, panel);
-       if (IS_ERR_VALUE(ret)) {
+       if (ret) {
                dev_err(par->dev, "unable to configure clock\n");
                return ret;
        }
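
The IS_ERR_VALUE() removals here, and in the afs, binfmt_aout and binfmt_flat hunks further down, share one rationale: for functions such as clk_set_rate() that return 0 or a negative errno in a plain int, a bare "if (ret)" expresses the check directly, while IS_ERR_VALUE() only tests the top-4095 "pointer error" range and silently stops working if the variable ever becomes an unsigned 32-bit type. For example:

	int ret = clk_set_rate(clk, rate);	/* 0 on success, -errno on failure */

	if (ret)				/* direct, type-safe errno check */
		return ret;

	/*
	 * IS_ERR_VALUE(ret) happens to work for a signed int (sign extension
	 * lands it in the error range) but returns false for the same value
	 * stored in a u32, which is why the idiom is being removed.
	 */
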
index 8ea531d..bbfe7e2 100644 (file)
@@ -51,8 +51,8 @@ static void hdmi_core_ddc_init(struct hdmi_core_data *core)
 {
        void __iomem *base = core->base;
        const unsigned long long iclk = 266000000;      /* DSS L3 ICLK */
-       const unsigned ss_scl_high = 4000;              /* ns */
-       const unsigned ss_scl_low = 4700;               /* ns */
+       const unsigned ss_scl_high = 4600;              /* ns */
+       const unsigned ss_scl_low = 5400;               /* ns */
        const unsigned fs_scl_high = 600;               /* ns */
        const unsigned fs_scl_low = 1300;               /* ns */
        const unsigned sda_hold = 1000;                 /* ns */
@@ -442,7 +442,7 @@ static void hdmi_core_write_avi_infoframe(struct hdmi_core_data *core,
 
        c = (ptr[1] >> 6) & 0x3;
        m = (ptr[1] >> 4) & 0x3;
-       r = (ptr[1] >> 0) & 0x3;
+       r = (ptr[1] >> 0) & 0xf;
 
        itc = (ptr[2] >> 7) & 0x1;
        ec = (ptr[2] >> 4) & 0x7;
index 21dafe5..a9c45c8 100644 (file)
@@ -286,6 +286,7 @@ static int ssd1307fb_init(struct ssd1307fb_par *par)
 {
        int ret;
        u32 precharge, dclk, com_invdir, compins;
+       struct pwm_args pargs;
 
        if (par->device_info->need_pwm) {
                par->pwm = pwm_get(&par->client->dev, NULL);
@@ -294,7 +295,15 @@ static int ssd1307fb_init(struct ssd1307fb_par *par)
                        return PTR_ERR(par->pwm);
                }
 
-               par->pwm_period = pwm_get_period(par->pwm);
+               /*
+                * FIXME: pwm_apply_args() should be removed when switching to
+                * the atomic PWM API.
+                */
+               pwm_apply_args(par->pwm);
+
+               pwm_get_args(par->pwm, &pargs);
+
+               par->pwm_period = pargs.period;
                /* Enable the PWM */
                pwm_config(par->pwm, par->pwm_period / 2, par->pwm_period);
                pwm_enable(par->pwm);
index 7b6d74f..476c0e3 100644 (file)
@@ -75,7 +75,7 @@ struct virtio_balloon {
 
        /* The array of pfns we tell the Host about. */
        unsigned int num_pfns;
-       u32 pfns[VIRTIO_BALLOON_ARRAY_PFNS_MAX];
+       __virtio32 pfns[VIRTIO_BALLOON_ARRAY_PFNS_MAX];
 
        /* Memory statistics */
        struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR];
@@ -127,14 +127,16 @@ static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
 
 }
 
-static void set_page_pfns(u32 pfns[], struct page *page)
+static void set_page_pfns(struct virtio_balloon *vb,
+                         __virtio32 pfns[], struct page *page)
 {
        unsigned int i;
 
        /* Set balloon pfns pointing at this page.
         * Note that the first pfn points at start of the page. */
        for (i = 0; i < VIRTIO_BALLOON_PAGES_PER_PAGE; i++)
-               pfns[i] = page_to_balloon_pfn(page) + i;
+               pfns[i] = cpu_to_virtio32(vb->vdev,
+                                         page_to_balloon_pfn(page) + i);
 }
 
 static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
@@ -158,7 +160,7 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
                        msleep(200);
                        break;
                }
-               set_page_pfns(vb->pfns + vb->num_pfns, page);
+               set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
                vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE;
                if (!virtio_has_feature(vb->vdev,
                                        VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
@@ -177,10 +179,12 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
 static void release_pages_balloon(struct virtio_balloon *vb)
 {
        unsigned int i;
+       struct page *page;
 
        /* Find pfns pointing at start of each page, get pages and free them. */
        for (i = 0; i < vb->num_pfns; i += VIRTIO_BALLOON_PAGES_PER_PAGE) {
-               struct page *page = balloon_pfn_to_page(vb->pfns[i]);
+               page = balloon_pfn_to_page(virtio32_to_cpu(vb->vdev,
+                                                          vb->pfns[i]));
                if (!virtio_has_feature(vb->vdev,
                                        VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
                        adjust_managed_page_count(page, 1);
@@ -203,7 +207,7 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num)
                page = balloon_page_dequeue(vb_dev_info);
                if (!page)
                        break;
-               set_page_pfns(vb->pfns + vb->num_pfns, page);
+               set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
                vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE;
        }
 
@@ -471,13 +475,13 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info,
        __count_vm_event(BALLOON_MIGRATE);
        spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags);
        vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE;
-       set_page_pfns(vb->pfns, newpage);
+       set_page_pfns(vb, vb->pfns, newpage);
        tell_host(vb, vb->inflate_vq);
 
        /* balloon's page migration 2nd step -- deflate "page" */
        balloon_page_delete(page);
        vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE;
-       set_page_pfns(vb->pfns, page);
+       set_page_pfns(vb, vb->pfns, page);
        tell_host(vb, vb->deflate_vq);
 
        mutex_unlock(&vb->balloon_lock);
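
The balloon conversion switches the pfn array to the sparse-checked __virtio32 type and wraps every store and load in cpu_to_virtio32()/virtio32_to_cpu(). Those helpers emit little-endian values when VIRTIO_F_VERSION_1 was negotiated and guest-native byte order for legacy devices, so the array is now laid out correctly on big-endian guests too. A tiny illustration of the round trip (the two helper names are made up for the example):

	static __virtio32 pfn_to_wire(struct virtio_device *vdev, unsigned long pfn)
	{
		/* LE for VIRTIO_F_VERSION_1 devices, native order for legacy */
		return cpu_to_virtio32(vdev, pfn);
	}

	static unsigned long pfn_from_wire(struct virtio_device *vdev, __virtio32 v)
	{
		return virtio32_to_cpu(vdev, v);
	}
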
index 5b45e27..b54f26c 100644 (file)
@@ -661,6 +661,14 @@ config ATLAS7_WATCHDOG
          To compile this driver as a module, choose M here: the
          module will be called atlas7_wdt.
 
+config RENESAS_WDT
+       tristate "Renesas WDT Watchdog"
+       depends on ARCH_RENESAS || COMPILE_TEST
+       select WATCHDOG_CORE
+       help
+         This driver adds watchdog support for the integrated watchdogs in the
+         Renesas R-Car and other SH-Mobile SoCs (usually named RWDT or SWDT).
+
 # AVR32 Architecture
 
 config AT32AP700X_WDT
index 9bde095..a46e7c1 100644 (file)
@@ -73,6 +73,7 @@ obj-$(CONFIG_DIGICOLOR_WATCHDOG) += digicolor_wdt.o
 obj-$(CONFIG_LPC18XX_WATCHDOG) += lpc18xx_wdt.o
 obj-$(CONFIG_BCM7038_WDT) += bcm7038_wdt.o
 obj-$(CONFIG_ATLAS7_WATCHDOG) += atlas7_wdt.o
+obj-$(CONFIG_RENESAS_WDT) += renesas_wdt.o
 
 # AVR32 Architecture
 obj-$(CONFIG_AT32AP700X_WDT) += at32ap700x_wdt.o
index 0200768..71ee079 100644 (file)
@@ -611,9 +611,7 @@ static int cpwd_probe(struct platform_device *op)
        }
 
        if (p->broken) {
-               init_timer(&cpwd_timer);
-               cpwd_timer.function     = cpwd_brokentimer;
-               cpwd_timer.data         = (unsigned long) p;
+               setup_timer(&cpwd_timer, cpwd_brokentimer, (unsigned long)p);
                cpwd_timer.expires      = WD_BTIMEOUT;
 
                pr_info("PLD defect workaround enabled for model %s\n",
index 016bd93..d4ba262 100644 (file)
@@ -38,7 +38,7 @@
 
 #define SIO_F71808FG_LD_WDT    0x07    /* Watchdog timer logical device */
 #define SIO_UNLOCK_KEY         0x87    /* Key to enable Super-I/O */
-#define SIO_LOCK_KEY           0xAA    /* Key to diasble Super-I/O */
+#define SIO_LOCK_KEY           0xAA    /* Key to disable Super-I/O */
 
 #define SIO_REG_LDSEL          0x07    /* Logical device select */
 #define SIO_REG_DEVID          0x20    /* Device ID (2 bytes) */
@@ -59,6 +59,7 @@
 #define SIO_F71869A_ID         0x1007  /* Chipset ID */
 #define SIO_F71882_ID          0x0541  /* Chipset ID */
 #define SIO_F71889_ID          0x0723  /* Chipset ID */
+#define SIO_F81865_ID          0x0704  /* Chipset ID */
 
 #define F71808FG_REG_WDO_CONF          0xf0
 #define F71808FG_REG_WDT_CONF          0xf5
 
 #define F71808FG_FLAG_WDOUT_EN         7
 
-#define F71808FG_FLAG_WDTMOUT_STS      5
+#define F71808FG_FLAG_WDTMOUT_STS      6
 #define F71808FG_FLAG_WD_EN            5
 #define F71808FG_FLAG_WD_PULSE         4
 #define F71808FG_FLAG_WD_UNIT          3
 
+#define F81865_REG_WDO_CONF            0xfa
+#define F81865_FLAG_WDOUT_EN           0
+
 /* Default values */
 #define WATCHDOG_TIMEOUT       60      /* 1 minute default timeout */
 #define WATCHDOG_MAX_TIMEOUT   (60 * 255)
@@ -112,7 +116,7 @@ module_param(start_withtimeout, uint, 0);
 MODULE_PARM_DESC(start_withtimeout, "Start watchdog timer on module load with"
        " given initial timeout. Zero (default) disables this feature.");
 
-enum chips { f71808fg, f71858fg, f71862fg, f71869, f71882fg, f71889fg };
+enum chips { f71808fg, f71858fg, f71862fg, f71869, f71882fg, f71889fg, f81865 };
 
 static const char *f71808e_names[] = {
        "f71808fg",
@@ -121,6 +125,7 @@ static const char *f71808e_names[] = {
        "f71869",
        "f71882fg",
        "f71889fg",
+       "f81865",
 };
 
 /* Super-I/O Function prototypes */
@@ -360,6 +365,11 @@ static int watchdog_start(void)
                        superio_inb(watchdog.sioaddr, SIO_REG_MFUNCT3) & 0xcf);
                break;
 
+       case f81865:
+               /* Set pin 70 to WDTRST# */
+               superio_clear_bit(watchdog.sioaddr, SIO_REG_MFUNCT3, 5);
+               break;
+
        default:
                /*
                 * 'default' label to shut up the compiler and catch
@@ -371,8 +381,13 @@ static int watchdog_start(void)
 
        superio_select(watchdog.sioaddr, SIO_F71808FG_LD_WDT);
        superio_set_bit(watchdog.sioaddr, SIO_REG_ENABLE, 0);
-       superio_set_bit(watchdog.sioaddr, F71808FG_REG_WDO_CONF,
-                       F71808FG_FLAG_WDOUT_EN);
+
+       if (watchdog.type == f81865)
+               superio_set_bit(watchdog.sioaddr, F81865_REG_WDO_CONF,
+                               F81865_FLAG_WDOUT_EN);
+       else
+               superio_set_bit(watchdog.sioaddr, F71808FG_REG_WDO_CONF,
+                               F71808FG_FLAG_WDOUT_EN);
 
        superio_set_bit(watchdog.sioaddr, F71808FG_REG_WDT_CONF,
                        F71808FG_FLAG_WD_EN);
@@ -655,7 +670,7 @@ static int __init watchdog_init(int sioaddr)
        superio_select(watchdog.sioaddr, SIO_F71808FG_LD_WDT);
 
        wdt_conf = superio_inb(sioaddr, F71808FG_REG_WDT_CONF);
-       watchdog.caused_reboot = wdt_conf & F71808FG_FLAG_WDTMOUT_STS;
+       watchdog.caused_reboot = wdt_conf & BIT(F71808FG_FLAG_WDTMOUT_STS);
 
        superio_exit(sioaddr);
 
@@ -770,6 +785,9 @@ static int __init f71808e_find(int sioaddr)
                /* Confirmed (by datasheet) not to have a watchdog. */
                err = -ENODEV;
                goto exit;
+       case SIO_F81865_ID:
+               watchdog.type = f81865;
+               break;
        default:
                pr_info("Unrecognized Fintek device: %04x\n",
                        (unsigned int)devid);
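
Besides the F81865 support, the file also fixes how the "caused reboot" status is read back: F71808FG_FLAG_WDTMOUT_STS is a bit index (now 6), so using it directly as a mask tested the wrong bits. The difference in short:

	#define WDTMOUT_STS	6			/* bit position, not a mask */

	reboot = wdt_conf & WDTMOUT_STS;	/* wrong: tests bits 1 and 2 */
	reboot = wdt_conf & BIT(WDTMOUT_STS);	/* right: tests bit 6 only  */
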
index 331aed8..62f346b 100644 (file)
@@ -37,6 +37,8 @@
 
 #define IMX2_WDT_WCR           0x00            /* Control Register */
 #define IMX2_WDT_WCR_WT                (0xFF << 8)     /* -> Watchdog Timeout Field */
+#define IMX2_WDT_WCR_WDA       (1 << 5)        /* -> External Reset WDOG_B */
+#define IMX2_WDT_WCR_SRS       (1 << 4)        /* -> Software Reset Signal */
 #define IMX2_WDT_WCR_WRE       (1 << 3)        /* -> WDOG Reset Enable */
 #define IMX2_WDT_WCR_WDE       (1 << 2)        /* -> Watchdog Enable */
 #define IMX2_WDT_WCR_WDZST     (1 << 0)        /* -> Watchdog timer Suspend */
@@ -59,6 +61,7 @@ struct imx2_wdt_device {
        struct clk *clk;
        struct regmap *regmap;
        struct watchdog_device wdog;
+       bool ext_reset;
 };
 
 static bool nowayout = WATCHDOG_NOWAYOUT;
@@ -83,6 +86,12 @@ static int imx2_wdt_restart(struct watchdog_device *wdog, unsigned long action,
        struct imx2_wdt_device *wdev = watchdog_get_drvdata(wdog);
        unsigned int wcr_enable = IMX2_WDT_WCR_WDE;
 
+       /* Use internal reset or external - not both */
+       if (wdev->ext_reset)
+               wcr_enable |= IMX2_WDT_WCR_SRS; /* do not assert int reset */
+       else
+               wcr_enable |= IMX2_WDT_WCR_WDA; /* do not assert ext-reset */
+
        /* Assert SRS signal */
        regmap_write(wdev->regmap, IMX2_WDT_WCR, wcr_enable);
        /*
@@ -112,8 +121,12 @@ static inline void imx2_wdt_setup(struct watchdog_device *wdog)
        val |= IMX2_WDT_WCR_WDZST;
        /* Strip the old watchdog Time-Out value */
        val &= ~IMX2_WDT_WCR_WT;
-       /* Generate reset if WDOG times out */
-       val &= ~IMX2_WDT_WCR_WRE;
+       /* Generate internal chip-level reset if WDOG times out */
+       if (!wdev->ext_reset)
+               val &= ~IMX2_WDT_WCR_WRE;
+       /* Or if external-reset assert WDOG_B reset only on time-out */
+       else
+               val |= IMX2_WDT_WCR_WRE;
        /* Keep Watchdog Disabled */
        val &= ~IMX2_WDT_WCR_WDE;
        /* Set the watchdog's Time-Out value */
@@ -230,6 +243,8 @@ static int __init imx2_wdt_probe(struct platform_device *pdev)
        regmap_read(wdev->regmap, IMX2_WDT_WRSR, &val);
        wdog->bootstatus = val & IMX2_WDT_WRSR_TOUT ? WDIOF_CARDRESET : 0;
 
+       wdev->ext_reset = of_property_read_bool(pdev->dev.of_node,
+                                               "fsl,ext-reset-output");
        wdog->timeout = clamp_t(unsigned, timeout, 1, IMX2_WDT_MAX_TIME);
        if (wdog->timeout != timeout)
                dev_warn(&pdev->dev, "Initial timeout out of range! Clamped from %u to %u\n",
index 6a7d5c3..c8d51dd 100644 (file)
@@ -160,10 +160,8 @@ static int jz4740_wdt_probe(struct platform_device *pdev)
 
        drvdata = devm_kzalloc(&pdev->dev, sizeof(struct jz4740_wdt_drvdata),
                               GFP_KERNEL);
-       if (!drvdata) {
-               dev_err(&pdev->dev, "Unable to alloacate watchdog device\n");
+       if (!drvdata)
                return -ENOMEM;
-       }
 
        if (heartbeat < 1 || heartbeat > MAX_HEARTBEAT)
                heartbeat = DEFAULT_HEARTBEAT;
index 14521c8..b55981f 100644 (file)
@@ -431,7 +431,7 @@ static int octeon_wdt_cpu_callback(struct notifier_block *nfb,
 {
        unsigned int cpu = (unsigned long)hcpu;
 
-       switch (action) {
+       switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_DOWN_PREPARE:
                octeon_wdt_disable_interrupt(cpu);
                break;
index 20563cc..a043fa4 100644 (file)
@@ -21,6 +21,7 @@
 
 #define WDT_RST                0x38
 #define WDT_EN         0x40
+#define WDT_STS                0x44
 #define WDT_BITE_TIME  0x5C
 
 struct qcom_wdt {
@@ -108,7 +109,8 @@ static const struct watchdog_ops qcom_wdt_ops = {
 static const struct watchdog_info qcom_wdt_info = {
        .options        = WDIOF_KEEPALIVEPING
                        | WDIOF_MAGICCLOSE
-                       | WDIOF_SETTIMEOUT,
+                       | WDIOF_SETTIMEOUT
+                       | WDIOF_CARDRESET,
        .identity       = KBUILD_MODNAME,
 };
 
@@ -171,6 +173,9 @@ static int qcom_wdt_probe(struct platform_device *pdev)
        wdt->wdd.max_timeout = 0x10000000U / wdt->rate;
        wdt->wdd.parent = &pdev->dev;
 
+       if (readl(wdt->base + WDT_STS) & 1)
+               wdt->wdd.bootstatus = WDIOF_CARDRESET;
+
        /*
         * If 'timeout-sec' unspecified in devicetree, assume a 30 second
         * default, unless the max timeout is less than 30 seconds, then use
diff --git a/drivers/watchdog/renesas_wdt.c b/drivers/watchdog/renesas_wdt.c
new file mode 100644 (file)
index 0000000..cf61c92
--- /dev/null
@@ -0,0 +1,213 @@
+/*
+ * Watchdog driver for Renesas WDT watchdog
+ *
+ * Copyright (C) 2015-16 Wolfram Sang, Sang Engineering <wsa@sang-engineering.com>
+ * Copyright (C) 2015-16 Renesas Electronics Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+#include <linux/bitops.h>
+#include <linux/clk.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
+#include <linux/watchdog.h>
+
+#define RWTCNT         0
+#define RWTCSRA                4
+#define RWTCSRA_WOVF   BIT(4)
+#define RWTCSRA_WRFLG  BIT(5)
+#define RWTCSRA_TME    BIT(7)
+
+#define RWDT_DEFAULT_TIMEOUT 60U
+
+static const unsigned int clk_divs[] = { 1, 4, 16, 32, 64, 128, 1024 };
+
+static bool nowayout = WATCHDOG_NOWAYOUT;
+module_param(nowayout, bool, 0);
+MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default="
+                               __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
+
+struct rwdt_priv {
+       void __iomem *base;
+       struct watchdog_device wdev;
+       struct clk *clk;
+       unsigned int clks_per_sec;
+       u8 cks;
+};
+
+static void rwdt_write(struct rwdt_priv *priv, u32 val, unsigned int reg)
+{
+       if (reg == RWTCNT)
+               val |= 0x5a5a0000;
+       else
+               val |= 0xa5a5a500;
+
+       writel_relaxed(val, priv->base + reg);
+}
+
+static int rwdt_init_timeout(struct watchdog_device *wdev)
+{
+       struct rwdt_priv *priv = watchdog_get_drvdata(wdev);
+
+       rwdt_write(priv, 65536 - wdev->timeout * priv->clks_per_sec, RWTCNT);
+
+       return 0;
+}
+
+static int rwdt_start(struct watchdog_device *wdev)
+{
+       struct rwdt_priv *priv = watchdog_get_drvdata(wdev);
+
+       clk_prepare_enable(priv->clk);
+
+       rwdt_write(priv, priv->cks, RWTCSRA);
+       rwdt_init_timeout(wdev);
+
+       while (readb_relaxed(priv->base + RWTCSRA) & RWTCSRA_WRFLG)
+               cpu_relax();
+
+       rwdt_write(priv, priv->cks | RWTCSRA_TME, RWTCSRA);
+
+       return 0;
+}
+
+static int rwdt_stop(struct watchdog_device *wdev)
+{
+       struct rwdt_priv *priv = watchdog_get_drvdata(wdev);
+
+       rwdt_write(priv, priv->cks, RWTCSRA);
+       clk_disable_unprepare(priv->clk);
+
+       return 0;
+}
+
+static unsigned int rwdt_get_timeleft(struct watchdog_device *wdev)
+{
+       struct rwdt_priv *priv = watchdog_get_drvdata(wdev);
+       u16 val = readw_relaxed(priv->base + RWTCNT);
+
+       return DIV_ROUND_CLOSEST(65536 - val, priv->clks_per_sec);
+}
+
+static const struct watchdog_info rwdt_ident = {
+       .options = WDIOF_MAGICCLOSE | WDIOF_KEEPALIVEPING | WDIOF_SETTIMEOUT,
+       .identity = "Renesas WDT Watchdog",
+};
+
+static const struct watchdog_ops rwdt_ops = {
+       .owner = THIS_MODULE,
+       .start = rwdt_start,
+       .stop = rwdt_stop,
+       .ping = rwdt_init_timeout,
+       .get_timeleft = rwdt_get_timeleft,
+};
+
+static int rwdt_probe(struct platform_device *pdev)
+{
+       struct rwdt_priv *priv;
+       struct resource *res;
+       unsigned long rate;
+       unsigned int clks_per_sec;
+       int ret, i;
+
+       priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
+       if (!priv)
+               return -ENOMEM;
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       priv->base = devm_ioremap_resource(&pdev->dev, res);
+       if (IS_ERR(priv->base))
+               return PTR_ERR(priv->base);
+
+       priv->clk = devm_clk_get(&pdev->dev, NULL);
+       if (IS_ERR(priv->clk))
+               return PTR_ERR(priv->clk);
+
+       rate = clk_get_rate(priv->clk);
+       if (!rate)
+               return -ENOENT;
+
+       for (i = ARRAY_SIZE(clk_divs) - 1; i >= 0; i--) {
+               clks_per_sec = DIV_ROUND_UP(rate, clk_divs[i]);
+               if (clks_per_sec) {
+                       priv->clks_per_sec = clks_per_sec;
+                       priv->cks = i;
+                       break;
+               }
+       }
+
+       if (!clks_per_sec) {
+               dev_err(&pdev->dev, "Can't find suitable clock divider\n");
+               return -ERANGE;
+       }
+
+       pm_runtime_enable(&pdev->dev);
+       pm_runtime_get_sync(&pdev->dev);
+
+       priv->wdev.info = &rwdt_ident,
+       priv->wdev.ops = &rwdt_ops,
+       priv->wdev.parent = &pdev->dev;
+       priv->wdev.min_timeout = 1;
+       priv->wdev.max_timeout = 65536 / clks_per_sec;
+       priv->wdev.timeout = min(priv->wdev.max_timeout, RWDT_DEFAULT_TIMEOUT);
+
+       platform_set_drvdata(pdev, priv);
+       watchdog_set_drvdata(&priv->wdev, priv);
+       watchdog_set_nowayout(&priv->wdev, nowayout);
+
+       /* This overrides the default timeout only if DT configuration was found */
+       ret = watchdog_init_timeout(&priv->wdev, 0, &pdev->dev);
+       if (ret)
+               dev_warn(&pdev->dev, "Specified timeout value invalid, using default\n");
+
+       ret = watchdog_register_device(&priv->wdev);
+       if (ret < 0) {
+               pm_runtime_put(&pdev->dev);
+               pm_runtime_disable(&pdev->dev);
+               return ret;
+       }
+
+       return 0;
+}
+
+static int rwdt_remove(struct platform_device *pdev)
+{
+       struct rwdt_priv *priv = platform_get_drvdata(pdev);
+
+       watchdog_unregister_device(&priv->wdev);
+       pm_runtime_put(&pdev->dev);
+       pm_runtime_disable(&pdev->dev);
+
+       return 0;
+}
+
+/*
+ * This driver does also fit for R-Car Gen2 (r8a779[0-4]) WDT. However, for SMP
+ * to work there, one also needs a RESET (RST) driver which does not exist yet
+ * due to HW issues. This needs to be solved before adding compatibles here.
+ */
+static const struct of_device_id rwdt_ids[] = {
+       { .compatible = "renesas,rcar-gen3-wdt", },
+       { /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, rwdt_ids);
+
+static struct platform_driver rwdt_driver = {
+       .driver = {
+               .name = "renesas_wdt",
+               .of_match_table = rwdt_ids,
+       },
+       .probe = rwdt_probe,
+       .remove = rwdt_remove,
+};
+module_platform_driver(rwdt_driver);
+
+MODULE_DESCRIPTION("Renesas WDT Watchdog Driver");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Wolfram Sang <wsa@sang-engineering.com>");
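
To get a feel for the numbers in the new driver: RWTCNT is a 16-bit up-counter and the probe loop prefers the largest divider in clk_divs[] that still yields a non-zero clks_per_sec. A worked example, where the 32.768 kHz clock rate is only an assumption for illustration and not something this patch specifies:

	/*
	 * Assume clk_get_rate() returns 32768 Hz:
	 *   chosen divider              = 1024 (largest in clk_divs[])
	 *   clks_per_sec                = DIV_ROUND_UP(32768, 1024) = 32
	 *   wdev.max_timeout            = 65536 / 32 = 2048 seconds
	 *   RWTCNT start for 60 s       = 65536 - 60 * 32 = 63616
	 *   time left at RWTCNT = 65000 = DIV_ROUND_CLOSEST(65536 - 65000, 32) = 17 s
	 *
	 * The watchdog fires when the counter overflows past 0xffff, which is
	 * why rwdt_init_timeout() writes "65536 - timeout * clks_per_sec".
	 */
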
index f908121..517a733 100644 (file)
@@ -275,9 +275,7 @@ static int sh_wdt_probe(struct platform_device *pdev)
                return rc;
        }
 
-       init_timer(&wdt->timer);
-       wdt->timer.function     = sh_wdt_ping;
-       wdt->timer.data         = (unsigned long)wdt;
+       setup_timer(&wdt->timer, sh_wdt_ping, (unsigned long)wdt);
        wdt->timer.expires      = next_ping_period(clock_division_ratio);
 
        dev_info(&pdev->dev, "initialized.\n");
index 6467b91..028618c 100644 (file)
@@ -73,6 +73,13 @@ MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started."
 /*
  * Some TCO specific functions
  */
+
+static bool tco_has_sp5100_reg_layout(struct pci_dev *dev)
+{
+       return dev->device == PCI_DEVICE_ID_ATI_SBX00_SMBUS &&
+              dev->revision < 0x40;
+}
+
 static void tco_timer_start(void)
 {
        u32 val;
@@ -129,7 +136,7 @@ static void tco_timer_enable(void)
 {
        int val;
 
-       if (sp5100_tco_pci->revision >= 0x40) {
+       if (!tco_has_sp5100_reg_layout(sp5100_tco_pci)) {
                /* For SB800 or later */
                /* Set the Watchdog timer resolution to 1 sec */
                outb(SB800_PM_WATCHDOG_CONFIG, SB800_IO_PM_INDEX_REG);
@@ -342,8 +349,7 @@ static unsigned char sp5100_tco_setupdevice(void)
        /*
         * Determine type of southbridge chipset.
         */
-       if (sp5100_tco_pci->device == PCI_DEVICE_ID_ATI_SBX00_SMBUS &&
-           sp5100_tco_pci->revision < 0x40) {
+       if (tco_has_sp5100_reg_layout(sp5100_tco_pci)) {
                dev_name = SP5100_DEVNAME;
                index_reg = SP5100_IO_PM_INDEX_REG;
                data_reg = SP5100_IO_PM_DATA_REG;
@@ -388,8 +394,7 @@ static unsigned char sp5100_tco_setupdevice(void)
         * Secondly, Find the watchdog timer MMIO address
         * from SBResource_MMIO register.
         */
-       if (sp5100_tco_pci->device == PCI_DEVICE_ID_ATI_SBX00_SMBUS &&
-           sp5100_tco_pci->revision < 0x40) {
+       if (tco_has_sp5100_reg_layout(sp5100_tco_pci)) {
                /* Read SBResource_MMIO from PCI config(PCI_Reg: 9Ch) */
                pci_read_config_dword(sp5100_tco_pci,
                                      SP5100_SB_RESOURCE_MMIO_BASE, &val);
index 981a668..7c3ba58 100644 (file)
@@ -104,7 +104,7 @@ static void watchdog_check_min_max_timeout(struct watchdog_device *wdd)
  * timeout module parameter (if it is valid value) or the timeout-sec property
  * (only if it is a valid value and the timeout_parm is out of bounds).
  * If none of them are valid then we keep the old value (which should normally
- * be the default timeout value.
+ * be the default timeout value).
  *
  * A zero is returned on success and -EINVAL for failure.
  */
index e2c5abb..3595cff 100644 (file)
@@ -736,7 +736,6 @@ static int watchdog_release(struct inode *inode, struct file *file)
                watchdog_ping(wdd);
        }
 
-       cancel_delayed_work_sync(&wd_data->work);
        watchdog_update_worker(wdd);
 
        /* make sure that /dev/watchdog can be re-opened */
index 9b7a35c..030e91b 100644 (file)
@@ -8,6 +8,7 @@ nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_features.o                      := $(nostackp)
 
 CFLAGS_efi.o                           += -fshort-wchar
+LDFLAGS                                        += $(call ld-option, --no-wchar-size-warning)
 
 dom0-$(CONFIG_PCI) += pci.o
 dom0-$(CONFIG_USB_SUPPORT) += dbgp.o
index cb7138c..71d49a9 100644 (file)
@@ -487,7 +487,8 @@ static void eoi_pirq(struct irq_data *data)
        if (!VALID_EVTCHN(evtchn))
                return;
 
-       if (unlikely(irqd_is_setaffinity_pending(data))) {
+       if (unlikely(irqd_is_setaffinity_pending(data)) &&
+           likely(!irqd_irq_disabled(data))) {
                int masked = test_and_set_mask(evtchn);
 
                clear_evtchn(evtchn);
@@ -1370,7 +1371,8 @@ static void ack_dynirq(struct irq_data *data)
        if (!VALID_EVTCHN(evtchn))
                return;
 
-       if (unlikely(irqd_is_setaffinity_pending(data))) {
+       if (unlikely(irqd_is_setaffinity_pending(data)) &&
+           likely(!irqd_irq_disabled(data))) {
                int masked = test_and_set_mask(evtchn);
 
                clear_evtchn(evtchn);
index dc49538..6793957 100644 (file)
@@ -748,7 +748,7 @@ static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u)
        return rc;
 }
 
-#define GNTDEV_COPY_BATCH 24
+#define GNTDEV_COPY_BATCH 16
 
 struct gntdev_copy_batch {
        struct gnttab_copy ops[GNTDEV_COPY_BATCH];
index ff93262..d6950e0 100644 (file)
@@ -1399,15 +1399,6 @@ static void scsiback_release_cmd(struct se_cmd *se_cmd)
        percpu_ida_free(&se_sess->sess_tag_pool, se_cmd->map_tag);
 }
 
-static int scsiback_shutdown_session(struct se_session *se_sess)
-{
-       return 0;
-}
-
-static void scsiback_close_session(struct se_session *se_sess)
-{
-}
-
 static u32 scsiback_sess_get_index(struct se_session *se_sess)
 {
        return 0;
@@ -1841,8 +1832,6 @@ static const struct target_core_fabric_ops scsiback_ops = {
        .tpg_get_inst_index             = scsiback_tpg_get_inst_index,
        .check_stop_free                = scsiback_check_stop_free,
        .release_cmd                    = scsiback_release_cmd,
-       .shutdown_session               = scsiback_shutdown_session,
-       .close_session                  = scsiback_close_session,
        .sess_get_index                 = scsiback_sess_get_index,
        .sess_get_initiator_sid         = NULL,
        .write_pending                  = scsiback_write_pending,
index eb3589e..0576eae 100644 (file)
@@ -239,13 +239,13 @@ static int v9fs_xattr_get_acl(const struct xattr_handler *handler,
 }
 
 static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
-                             struct dentry *dentry, const char *name,
-                             const void *value, size_t size, int flags)
+                             struct dentry *dentry, struct inode *inode,
+                             const char *name, const void *value,
+                             size_t size, int flags)
 {
        int retval;
        struct posix_acl *acl;
        struct v9fs_session_info *v9ses;
-       struct inode *inode = d_inode(dentry);
 
        v9ses = v9fs_dentry2v9ses(dentry);
        /*
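
The v9fs acl/xattr hunks here and the bad_inode one below track a VFS-wide change to the xattr_handler ->set prototype: the target inode is now passed in explicitly instead of each handler deriving it with d_inode(dentry). A minimal handler skeleton with the new signature, where the examplefs_* helpers are hypothetical:

	static int examplefs_xattr_set(const struct xattr_handler *handler,
				       struct dentry *dentry, struct inode *inode,
				       const char *name, const void *value,
				       size_t size, int flags)
	{
		/* use the inode handed in; no need to re-derive it from dentry */
		if (!value)
			return examplefs_xattr_remove(inode, handler->prefix, name);

		return examplefs_xattr_store(inode, handler->prefix, name,
					     value, size, flags);
	}
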
index 18c62ba..a6bd349 100644 (file)
@@ -147,8 +147,9 @@ static int v9fs_xattr_handler_get(const struct xattr_handler *handler,
 }
 
 static int v9fs_xattr_handler_set(const struct xattr_handler *handler,
-                                 struct dentry *dentry, const char *name,
-                                 const void *value, size_t size, int flags)
+                                 struct dentry *dentry, struct inode *inode,
+                                 const char *name, const void *value,
+                                 size_t size, int flags)
 {
        const char *full_name = xattr_full_name(handler, name);
 
index 6725f59..b8fcb41 100644 (file)
@@ -52,6 +52,7 @@ config FS_DAX_PMD
        depends on FS_DAX
        depends on ZONE_DEVICE
        depends on TRANSPARENT_HUGEPAGE
+       depends on BROKEN
 
 endif # BLOCK
 
index 2a6713b..d638486 100644 (file)
@@ -528,7 +528,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
        char                    *prefix = NULL;
 
        new_opts = kstrdup(data, GFP_KERNEL);
-       if (!new_opts)
+       if (data && !new_opts)
                return -ENOMEM;
 
        pr_debug("%s(flags=0x%x,opts=\"%s\")\n", __func__, *flags, data);
@@ -546,7 +546,8 @@ affs_remount(struct super_block *sb, int *flags, char *data)
        }
 
        flush_delayed_work(&sbi->sb_work);
-       replace_mount_options(sb, new_opts);
+       if (new_opts)
+               replace_mount_options(sb, new_opts);
 
        sbi->s_flags = mount_flags;
        sbi->s_mode  = mode;
index 65de439..14d506e 100644 (file)
@@ -643,10 +643,6 @@ ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from)
                return 0;
 
        result = generic_file_write_iter(iocb, from);
-       if (IS_ERR_VALUE(result)) {
-               _leave(" = %zd", result);
-               return result;
-       }
 
        _leave(" = %zd", result);
        return result;
index 72e35b7..3ba385e 100644 (file)
@@ -100,8 +100,8 @@ static int bad_inode_setattr(struct dentry *direntry, struct iattr *attrs)
        return -EIO;
 }
 
-static int bad_inode_setxattr(struct dentry *dentry, const char *name,
-               const void *value, size_t size, int flags)
+static int bad_inode_setxattr(struct dentry *dentry, struct inode *inode,
+               const char *name, const void *value, size_t size, int flags)
 {
        return -EIO;
 }
index 2fab9f1..ae1b540 100644 (file)
@@ -127,12 +127,8 @@ static int set_brk(unsigned long start, unsigned long end)
 {
        start = PAGE_ALIGN(start);
        end = PAGE_ALIGN(end);
-       if (end > start) {
-               unsigned long addr;
-               addr = vm_brk(start, end - start);
-               if (BAD_ADDR(addr))
-                       return addr;
-       }
+       if (end > start)
+               return vm_brk(start, end - start);
        return 0;
 }
 
@@ -275,7 +271,7 @@ static int load_aout_binary(struct linux_binprm * bprm)
                map_size = ex.a_text+ex.a_data;
 #endif
                error = vm_brk(text_addr & PAGE_MASK, map_size);
-               if (error != (text_addr & PAGE_MASK))
+               if (error)
                        return error;
 
                error = read_code(bprm->file, text_addr, pos,
@@ -298,7 +294,7 @@ static int load_aout_binary(struct linux_binprm * bprm)
 
                if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) {
                        error = vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
-                       if (IS_ERR_VALUE(error))
+                       if (error)
                                return error;
 
                        read_code(bprm->file, N_TXTADDR(ex), fd_offset,
@@ -382,7 +378,7 @@ static int load_aout_library(struct file *file)
                               file);
                }
                retval = vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
-               if (IS_ERR_VALUE(retval))
+               if (retval)
                        goto out;
 
                read_code(file, start_addr, N_TXTOFF(ex),
@@ -402,9 +398,8 @@ static int load_aout_library(struct file *file)
        len = PAGE_ALIGN(ex.a_text + ex.a_data);
        bss = ex.a_text + ex.a_data + ex.a_bss;
        if (bss > len) {
-               error = vm_brk(start_addr + len, bss - len);
-               retval = error;
-               if (error != start_addr + len)
+               retval = vm_brk(start_addr + len, bss - len);
+               if (retval)
                        goto out;
        }
        retval = 0;
index 938fc4e..e158b22 100644 (file)
@@ -96,10 +96,9 @@ static int set_brk(unsigned long start, unsigned long end)
        start = ELF_PAGEALIGN(start);
        end = ELF_PAGEALIGN(end);
        if (end > start) {
-               unsigned long addr;
-               addr = vm_brk(start, end - start);
-               if (BAD_ADDR(addr))
-                       return addr;
+               int error = vm_brk(start, end - start);
+               if (error)
+                       return error;
        }
        current->mm->start_brk = current->mm->brk = end;
        return 0;
@@ -629,7 +628,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 
                /* Map the last of the bss segment */
                error = vm_brk(elf_bss, last_bss - elf_bss);
-               if (BAD_ADDR(error))
+               if (error)
                        goto out;
        }
 
@@ -1178,7 +1177,7 @@ static int load_elf_library(struct file *file)
        bss = eppnt->p_memsz + eppnt->p_vaddr;
        if (bss > len) {
                error = vm_brk(len, bss - len);
-               if (BAD_ADDR(error))
+               if (error)
                        goto out_free_ph;
        }
        error = 0;
index f723cd3..caf9e39 100644 (file)
@@ -337,7 +337,7 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
                                        "(%d != %d)", (unsigned) r, curid, id);
                        goto failed;
                } else if ( ! p->lib_list[id].loaded &&
-                               IS_ERR_VALUE(load_flat_shared_library(id, p))) {
+                               load_flat_shared_library(id, p) < 0) {
                        printk("BINFMT_FLAT: failed to load library %d", id);
                        goto failed;
                }
@@ -837,7 +837,7 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
 
        res = prepare_binprm(&bprm);
 
-       if (!IS_ERR_VALUE(res))
+       if (!res)
                res = load_flat_file(&bprm, libs, id, NULL);
 
        abort_creds(bprm.cred);
@@ -883,7 +883,7 @@ static int load_flat_binary(struct linux_binprm * bprm)
        stack_len += FLAT_STACK_ALIGN - 1;  /* reserve for upcoming alignment */
        
        res = load_flat_file(bprm, &libinfo, 0, &stack_len);
-       if (IS_ERR_VALUE(res))
+       if (res < 0)
                return res;
        
        /* Update data segment pointers for all libraries */
index 1089dbf..71ccab1 100644 (file)
@@ -51,6 +51,18 @@ struct block_device *I_BDEV(struct inode *inode)
 }
 EXPORT_SYMBOL(I_BDEV);
 
+void __vfs_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
+{
+       struct va_format vaf;
+       va_list args;
+
+       va_start(args, fmt);
+       vaf.fmt = fmt;
+       vaf.va = &args;
+       printk_ratelimited("%sVFS (%s): %pV\n", prefix, sb->s_id, &vaf);
+       va_end(args);
+}
+
 static void bdev_write_inode(struct block_device *bdev)
 {
        struct inode *inode = bdev->bd_inode;
@@ -489,7 +501,7 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
        sector += get_start_sect(bdev);
        if (sector % (PAGE_SIZE / 512))
                return -EINVAL;
-       avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn);
+       avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn, size);
        if (!avail)
                return -ERANGE;
        if (avail > 0 && avail & ~PAGE_MASK)
@@ -498,6 +510,75 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
 }
 EXPORT_SYMBOL_GPL(bdev_direct_access);
 
+/**
+ * bdev_dax_supported() - Check if the device supports dax for filesystem
+ * @sb: The superblock of the device
+ * @blocksize: The block size of the device
+ *
+ * This is a library function for filesystems to check if the block device
+ * can be mounted with dax option.
+ *
+ * Return: negative errno if unsupported, 0 if supported.
+ */
+int bdev_dax_supported(struct super_block *sb, int blocksize)
+{
+       struct blk_dax_ctl dax = {
+               .sector = 0,
+               .size = PAGE_SIZE,
+       };
+       int err;
+
+       if (blocksize != PAGE_SIZE) {
+               vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax");
+               return -EINVAL;
+       }
+
+       err = bdev_direct_access(sb->s_bdev, &dax);
+       if (err < 0) {
+               switch (err) {
+               case -EOPNOTSUPP:
+                       vfs_msg(sb, KERN_ERR,
+                               "error: device does not support dax");
+                       break;
+               case -EINVAL:
+                       vfs_msg(sb, KERN_ERR,
+                               "error: unaligned partition for dax");
+                       break;
+               default:
+                       vfs_msg(sb, KERN_ERR,
+                               "error: dax access failed (%d)", err);
+               }
+               return err;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(bdev_dax_supported);
+
+/**
+ * bdev_dax_capable() - Return if the raw device is capable for dax
+ * @bdev: The device for raw block device access
+ */
+bool bdev_dax_capable(struct block_device *bdev)
+{
+       struct blk_dax_ctl dax = {
+               .size = PAGE_SIZE,
+       };
+
+       if (!IS_ENABLED(CONFIG_FS_DAX))
+               return false;
+
+       dax.sector = 0;
+       if (bdev_direct_access(bdev, &dax) < 0)
+               return false;
+
+       dax.sector = bdev->bd_part->nr_sects - (PAGE_SIZE / 512);
+       if (bdev_direct_access(bdev, &dax) < 0)
+               return false;
+
+       return true;
+}
+
 /*
  * pseudo-fs
  */
@@ -1160,33 +1241,6 @@ void bd_set_size(struct block_device *bdev, loff_t size)
 }
 EXPORT_SYMBOL(bd_set_size);
 
-static bool blkdev_dax_capable(struct block_device *bdev)
-{
-       struct gendisk *disk = bdev->bd_disk;
-
-       if (!disk->fops->direct_access || !IS_ENABLED(CONFIG_FS_DAX))
-               return false;
-
-       /*
-        * If the partition is not aligned on a page boundary, we can't
-        * do dax I/O to it.
-        */
-       if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512))
-                       || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
-               return false;
-
-       /*
-        * If the device has known bad blocks, force all I/O through the
-        * driver / page cache.
-        *
-        * TODO: support finer grained dax error handling
-        */
-       if (disk->bb && disk->bb->count)
-               return false;
-
-       return true;
-}
-
 static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
 
 /*
@@ -1266,7 +1320,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 
                        if (!ret) {
                                bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
-                               if (!blkdev_dax_capable(bdev))
+                               if (!bdev_dax_capable(bdev))
                                        bdev->bd_inode->i_flags &= ~S_DAX;
                        }
 
@@ -1303,7 +1357,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
                                goto out_clear;
                        }
                        bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
-                       if (!blkdev_dax_capable(bdev))
+                       if (!bdev_dax_capable(bdev))
                                bdev->bd_inode->i_flags &= ~S_DAX;
                }
        } else {
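
bdev_dax_supported() above is meant to be called by filesystems at mount time in place of their own block-size and alignment probes. A sketch of how a mount path might use it; the examplefs_* names are hypothetical and only bdev_dax_supported() itself comes from this patch:

	static int examplefs_check_dax(struct super_block *sb)
	{
		int err;

		if (!examplefs_dax_requested(sb))	/* hypothetical option check */
			return 0;

		err = bdev_dax_supported(sb, sb->s_blocksize);
		if (err)	/* the helper already logged why dax is unusable */
			return err;

		return 0;
	}
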
index d309018..8bb3509 100644 (file)
@@ -1939,7 +1939,7 @@ static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
  * from ipath->fspath->val[i].
  * when it returns, there are ipath->fspath->elem_cnt number of paths available
  * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the
- * number of missed paths in recored in ipath->fspath->elem_missed, otherwise,
+ * number of missed paths is recorded in ipath->fspath->elem_missed, otherwise,
  * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would
  * have been needed to return all paths.
  */
index 1da5753..4919aed 100644 (file)
@@ -313,7 +313,7 @@ struct btrfs_dio_private {
        struct bio *dio_bio;
 
        /*
-        * The original bio may be splited to several sub-bios, this is
+        * The original bio may be split to several sub-bios, this is
         * done during endio of sub-bios
         */
        int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int);
index 516e19d..b677a6e 100644 (file)
@@ -1939,7 +1939,7 @@ again:
                /*
                 * Clear all references of this block. Do not free
                 * the block itself even if is not referenced anymore
-                * because it still carries valueable information
+                * because it still carries valuable information
                 * like whether it was ever written and IO completed.
                 */
                list_for_each_entry_safe(l, tmp, &block->ref_to_list,
index decd0a3..427c36b 100644 (file)
@@ -156,7 +156,7 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
 
                /*
                 * RCU really hurts here, we could free up the root node because
-                * it was cow'ed but we may not get the new root node yet so do
+                * it was COWed but we may not get the new root node yet so do
                 * the inc_not_zero dance and if it doesn't work then
                 * synchronize_rcu and try again.
                 */
@@ -955,7 +955,7 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,
                              struct extent_buffer *buf)
 {
        /*
-        * Tree blocks not in refernece counted trees and tree roots
+        * Tree blocks not in reference counted trees and tree roots
         * are never shared. If a block was allocated after the last
         * snapshot and the block was not allocated by tree relocation,
         * we know the block is not shared.
@@ -1270,7 +1270,7 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
 
 /*
  * tm is a pointer to the first operation to rewind within eb. then, all
- * previous operations will be rewinded (until we reach something older than
+ * previous operations will be rewound (until we reach something older than
  * time_seq).
  */
 static void
@@ -1345,7 +1345,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
 }
 
 /*
- * Called with eb read locked. If the buffer cannot be rewinded, the same buffer
+ * Called with eb read locked. If the buffer cannot be rewound, the same buffer
  * is returned. If rewind operations happen, a fresh buffer is returned. The
  * returned buffer is always read-locked. If the returned buffer is not the
  * input buffer, the lock on the input buffer is released and the input buffer
@@ -1516,7 +1516,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
         * 3) the root is not forced COW.
         *
         * What is forced COW:
-        *    when we create snapshot during commiting the transaction,
+        *    when we create snapshot during committing the transaction,
         *    after we've finished coping src root, we must COW the shared
         *    block to ensure the metadata consistency.
         */
@@ -1531,7 +1531,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
 
 /*
  * cows a single block, see __btrfs_cow_block for the real work.
- * This version of it has extra checks so that a block isn't cow'd more than
+ * This version of it has extra checks so that a block isn't COWed more than
  * once per transaction, as long as it hasn't been written yet
  */
 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
@@ -2986,7 +2986,7 @@ again:
                btrfs_unlock_up_safe(p, level + 1);
 
                /*
-                * Since we can unwind eb's we want to do a real search every
+                * Since we can unwind ebs we want to do a real search every
                 * time.
                 */
                prev_cmp = -1;
index ddcc58f..101c3cf 100644 (file)
@@ -89,7 +89,7 @@ static const int btrfs_csum_sizes[] = { 4 };
 /* four bytes for CRC32 */
 #define BTRFS_EMPTY_DIR_SIZE 0
 
-/* spefic to btrfs_map_block(), therefore not in include/linux/blk_types.h */
+/* specific to btrfs_map_block(), therefore not in include/linux/blk_types.h */
 #define REQ_GET_READ_MIRRORS   (1 << 30)
 
 /* ioprio of readahead is set to idle */
@@ -431,7 +431,7 @@ struct btrfs_space_info {
         * bytes_pinned does not reflect the bytes that will be pinned once the
         * delayed refs are flushed, so this counter is inc'ed every time we
         * call btrfs_free_extent so it is a realtime count of what will be
-        * freed once the transaction is committed.  It will be zero'ed every
+        * freed once the transaction is committed.  It will be zeroed every
         * time the transaction commits.
         */
        struct percpu_counter total_bytes_pinned;
@@ -1401,7 +1401,7 @@ static inline void btrfs_init_map_token (struct btrfs_map_token *token)
        token->kaddr = NULL;
 }
 
-/* some macros to generate set/get funcs for the struct fields.  This
+/* some macros to generate set/get functions for the struct fields.  This
  * assumes there is a lefoo_to_cpu for every type, so lets make a simple
  * one for u8:
  */
index c24b653..5fca953 100644 (file)
@@ -188,7 +188,7 @@ struct btrfs_delayed_ref_root {
 
        /*
         * To make qgroup to skip given root.
-        * This is for snapshot, as btrfs_qgroup_inherit() will manully
+        * This is for snapshot, as btrfs_qgroup_inherit() will manually
         * modify counters for snapshot and its source, so we should skip
         * the snapshot in new_root/old_roots or it will get calculated twice
         */
index 85f12e6..63ef9cd 100644 (file)
@@ -450,7 +450,7 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_root *root,
 }
 
 /*
- * blocked until all flighting bios are finished.
+ * blocked until all in-flight bios operations are finished.
  */
 static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
 {
index 91d1239..6628fca 100644 (file)
@@ -384,7 +384,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
        /*
         * Things reading via commit roots that don't have normal protection,
         * like send, can have a really old block in cache that may point at a
-        * block that has been free'd and re-allocated.  So don't clear uptodate
+        * block that has been freed and re-allocated.  So don't clear uptodate
         * if we find an eb that is under IO (dirty/writeback) because we could
         * end up reading in the stale data and then writing it back out and
         * making everybody very sad.
@@ -418,7 +418,7 @@ static int btrfs_check_super_csum(char *raw_disk_sb)
                /*
                 * The super_block structure does not span the whole
                 * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space
-                * is filled with zeros and is included in the checkum.
+                * is filled with zeros and is included in the checksum.
                 */
                crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE,
                                crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
@@ -600,7 +600,7 @@ static noinline int check_leaf(struct btrfs_root *root,
 
                /*
                 * Check to make sure that we don't point outside of the leaf,
-                * just incase all the items are consistent to eachother, but
+                * just in case all the items are consistent to each other, but
                 * all point outside of the leaf.
                 */
                if (btrfs_item_end_nr(leaf, slot) >
@@ -3022,7 +3022,7 @@ retry_root_backup:
        }
 
        /*
-        * Mount does not set all options immediatelly, we can do it now and do
+        * Mount does not set all options immediately, we can do it now and do
         * not have to wait for transaction commit
         */
        btrfs_apply_pending_changes(fs_info);
@@ -3255,7 +3255,7 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
                btrfs_warn_rl_in_rcu(device->dev_root->fs_info,
                                "lost page write due to IO error on %s",
                                          rcu_str_deref(device->name));
-               /* note, we dont' set_buffer_write_io_error because we have
+               /* note, we don't set_buffer_write_io_error because we have
                 * our own ways of dealing with the IO errors
                 */
                clear_buffer_uptodate(bh);
@@ -4367,7 +4367,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
                if (ret)
                        break;
 
-               clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
+               clear_extent_bits(dirty_pages, start, end, mark);
                while (start <= end) {
                        eb = btrfs_find_tree_block(root->fs_info, start);
                        start += root->nodesize;
@@ -4402,7 +4402,7 @@ again:
                if (ret)
                        break;
 
-               clear_extent_dirty(unpin, start, end, GFP_NOFS);
+               clear_extent_dirty(unpin, start, end);
                btrfs_error_unpin_extent_range(root, start, end);
                cond_resched();
        }
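
The GFP_NOFS arguments dropped in this hunk and in the extent-tree.c hunks below follow an extent_io API cleanup in which set_extent_bits(), clear_extent_bits() and clear_extent_dirty() pick the allocation mask internally (presumably GFP_NOFS), so callers stop passing one:

	/* before */
	set_extent_bits(tree, start, end, EXTENT_UPTODATE, GFP_NOFS);

	/* after: no gfp argument, the mask is chosen inside the helpers */
	set_extent_bits(tree, start, end, EXTENT_UPTODATE);
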
index 9424864..689d25a 100644 (file)
@@ -231,9 +231,9 @@ static int add_excluded_extent(struct btrfs_root *root,
 {
        u64 end = start + num_bytes - 1;
        set_extent_bits(&root->fs_info->freed_extents[0],
-                       start, end, EXTENT_UPTODATE, GFP_NOFS);
+                       start, end, EXTENT_UPTODATE);
        set_extent_bits(&root->fs_info->freed_extents[1],
-                       start, end, EXTENT_UPTODATE, GFP_NOFS);
+                       start, end, EXTENT_UPTODATE);
        return 0;
 }
 
@@ -246,9 +246,9 @@ static void free_excluded_extents(struct btrfs_root *root,
        end = start + cache->key.offset - 1;
 
        clear_extent_bits(&root->fs_info->freed_extents[0],
-                         start, end, EXTENT_UPTODATE, GFP_NOFS);
+                         start, end, EXTENT_UPTODATE);
        clear_extent_bits(&root->fs_info->freed_extents[1],
-                         start, end, EXTENT_UPTODATE, GFP_NOFS);
+                         start, end, EXTENT_UPTODATE);
 }
 
 static int exclude_super_stripes(struct btrfs_root *root,
@@ -980,7 +980,7 @@ out_free:
  * event that tree block loses its owner tree's reference and do the
  * back refs conversion.
  *
- * When a tree block is COW'd through a tree, there are four cases:
+ * When a tree block is COWed through a tree, there are four cases:
  *
  * The reference count of the block is one and the tree is the block's
  * owner tree. Nothing to do in this case.
@@ -2042,6 +2042,11 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
        struct btrfs_bio *bbio = NULL;
 
 
+       /*
+        * Avoid races with device replace and make sure our bbio has devices
+        * associated to its stripes that don't go away while we are discarding.
+        */
+       btrfs_bio_counter_inc_blocked(root->fs_info);
        /* Tell the block device(s) that the sectors can be discarded */
        ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
                              bytenr, &num_bytes, &bbio, 0);
@@ -2074,6 +2079,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
                }
                btrfs_put_bbio(bbio);
        }
+       btrfs_bio_counter_dec(root->fs_info);
 
        if (actual_bytes)
                *actual_bytes = discarded_bytes;
@@ -2595,7 +2601,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                        }
 
                        /*
-                        * Need to drop our head ref lock and re-aqcuire the
+                        * Need to drop our head ref lock and re-acquire the
                         * delayed ref lock and then re-check to make sure
                         * nobody got added.
                         */
@@ -2747,7 +2753,7 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
 
        /*
         * We don't ever fill up leaves all the way so multiply by 2 just to be
-        * closer to what we're really going to want to ouse.
+        * closer to what we're really going to want to use.
         */
        return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
 }
@@ -2851,7 +2857,7 @@ static void delayed_ref_async_start(struct btrfs_work *work)
        }
 
        /*
-        * trans->sync means that when we call end_transaciton, we won't
+        * trans->sync means that when we call end_transaction, we won't
         * wait on delayed refs
         */
        trans->sync = true;
@@ -4296,7 +4302,7 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
  * Called if we need to clear a data reservation for this inode
  * Normally in a error case.
  *
- * This one will handle the per-indoe data rsv map for accurate reserved
+ * This one will handle the per-inode data rsv map for accurate reserved
  * space framework.
  */
 void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
@@ -4967,7 +4973,7 @@ void btrfs_init_async_reclaim_work(struct work_struct *work)
  * @orig_bytes - the number of bytes we want
  * @flush - whether or not we can flush to make our reservation
  *
- * This will reserve orgi_bytes number of bytes from the space info associated
+ * This will reserve orig_bytes number of bytes from the space info associated
  * with the block_rsv.  If there is not enough space it will make an attempt to
  * flush out space to make room.  It will do this by flushing delalloc if
  * possible or committing the transaction.  If flush is 0 then no attempts to
@@ -5572,7 +5578,7 @@ void btrfs_orphan_release_metadata(struct inode *inode)
  * common file/directory operations, they change two fs/file trees
  * and root tree, the number of items that the qgroup reserves is
  * different with the free space reservation. So we can not use
- * the space reseravtion mechanism in start_transaction().
+ * the space reservation mechanism in start_transaction().
  */
 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
                                     struct btrfs_block_rsv *rsv,
@@ -5621,7 +5627,7 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
 /**
  * drop_outstanding_extent - drop an outstanding extent
  * @inode: the inode we're dropping the extent for
- * @num_bytes: the number of bytes we're relaseing.
+ * @num_bytes: the number of bytes we're releasing.
  *
  * This is called when we are freeing up an outstanding extent, either called
  * after an error or after an extent is written.  This will return the number of
@@ -5647,7 +5653,7 @@ static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
                drop_inode_space = 1;
 
        /*
-        * If we have more or the same amount of outsanding extents than we have
+        * If we have more or the same amount of outstanding extents than we have
         * reserved then we need to leave the reserved extents count alone.
         */
        if (BTRFS_I(inode)->outstanding_extents >=
@@ -5661,8 +5667,8 @@ static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
 }
 
 /**
- * calc_csum_metadata_size - return the amount of metada space that must be
- *     reserved/free'd for the given bytes.
+ * calc_csum_metadata_size - return the amount of metadata space that must be
+ *     reserved/freed for the given bytes.
  * @inode: the inode we're manipulating
  * @num_bytes: the number of bytes in question
  * @reserve: 1 if we are reserving space, 0 if we are freeing space
@@ -5814,7 +5820,7 @@ out_fail:
 
                /*
                 * This is tricky, but first we need to figure out how much we
-                * free'd from any free-ers that occurred during this
+                * freed from any free-ers that occurred during this
                 * reservation, so we reset ->csum_bytes to the csum_bytes
                 * before we dropped our lock, and then call the free for the
                 * number of bytes that were freed while we were trying our
@@ -5836,7 +5842,7 @@ out_fail:
 
                /*
                 * Now reset ->csum_bytes to what it should be.  If bytes is
-                * more than to_free then we would have free'd more space had we
+                * more than to_free then we would have freed more space had we
                 * not had an artificially high ->csum_bytes, so we need to free
                 * the remainder.  If bytes is the same or less then we don't
                 * need to do anything, the other free-ers did the correct
@@ -6515,7 +6521,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
                        ret = btrfs_discard_extent(root, start,
                                                   end + 1 - start, NULL);
 
-               clear_extent_dirty(unpin, start, end, GFP_NOFS);
+               clear_extent_dirty(unpin, start, end);
                unpin_extent_range(root, start, end, true);
                mutex_unlock(&fs_info->unused_bg_unpin_mutex);
                cond_resched();
@@ -7578,7 +7584,7 @@ loop:
                if (loop == LOOP_CACHING_NOWAIT) {
                        /*
                         * We want to skip the LOOP_CACHING_WAIT step if we
-                        * don't have any unached bgs and we've alrelady done a
+                        * don't have any uncached bgs and we've already done a
                         * full search through.
                         */
                        if (orig_have_caching_bg || !full_search)
@@ -7982,7 +7988,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 
        /*
         * Mixed block groups will exclude before processing the log so we only
-        * need to do the exlude dance if this fs isn't mixed.
+        * need to do the exclude dance if this fs isn't mixed.
         */
        if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
                ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
@@ -8032,7 +8038,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                                        buf->start + buf->len - 1, GFP_NOFS);
                else
                        set_extent_new(&root->dirty_log_pages, buf->start,
-                                       buf->start + buf->len - 1, GFP_NOFS);
+                                       buf->start + buf->len - 1);
        } else {
                buf->log_index = -1;
                set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
@@ -9426,7 +9432,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
        u64 free_bytes = 0;
        int factor;
 
-       /* It's df, we don't care if it's racey */
+       /* It's df, we don't care if it's racy */
        if (list_empty(&sinfo->ro_bgs))
                return 0;
 
@@ -10635,14 +10641,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                 */
                mutex_lock(&fs_info->unused_bg_unpin_mutex);
                ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
-                                 EXTENT_DIRTY, GFP_NOFS);
+                                 EXTENT_DIRTY);
                if (ret) {
                        mutex_unlock(&fs_info->unused_bg_unpin_mutex);
                        btrfs_dec_block_group_ro(root, block_group);
                        goto end_trans;
                }
                ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
-                                 EXTENT_DIRTY, GFP_NOFS);
+                                 EXTENT_DIRTY);
                if (ret) {
                        mutex_unlock(&fs_info->unused_bg_unpin_mutex);
                        btrfs_dec_block_group_ro(root, block_group);
index 2f83448..6e953de 100644 (file)
@@ -726,14 +726,6 @@ next:
        start = last_end + 1;
        if (start <= end && state && !need_resched())
                goto hit_next;
-       goto search_again;
-
-out:
-       spin_unlock(&tree->lock);
-       if (prealloc)
-               free_extent_state(prealloc);
-
-       return 0;
 
 search_again:
        if (start > end)
@@ -742,6 +734,14 @@ search_again:
        if (gfpflags_allow_blocking(mask))
                cond_resched();
        goto again;
+
+out:
+       spin_unlock(&tree->lock);
+       if (prealloc)
+               free_extent_state(prealloc);
+
+       return 0;
+
 }
 
 static void wait_on_state(struct extent_io_tree *tree,
@@ -873,8 +873,14 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
        bits |= EXTENT_FIRST_DELALLOC;
 again:
        if (!prealloc && gfpflags_allow_blocking(mask)) {
+               /*
+                * Don't care for allocation failure here because we might end
+                * up not needing the pre-allocated extent state at all, which
+                * is the case if we only have in the tree extent states that
+                * cover our input range and don't cover any other range.
+                * If we end up needing a new extent state we allocate it later.
+                */
                prealloc = alloc_extent_state(mask);
-               BUG_ON(!prealloc);
        }
 
        spin_lock(&tree->lock);
@@ -1037,7 +1043,13 @@ hit_next:
                goto out;
        }
 
-       goto search_again;
+search_again:
+       if (start > end)
+               goto out;
+       spin_unlock(&tree->lock);
+       if (gfpflags_allow_blocking(mask))
+               cond_resched();
+       goto again;
 
 out:
        spin_unlock(&tree->lock);
@@ -1046,13 +1058,6 @@ out:
 
        return err;
 
-search_again:
-       if (start > end)
-               goto out;
-       spin_unlock(&tree->lock);
-       if (gfpflags_allow_blocking(mask))
-               cond_resched();
-       goto again;
 }
 
 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
@@ -1073,17 +1078,18 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
  * @bits:      the bits to set in this range
  * @clear_bits:        the bits to clear in this range
  * @cached_state:      state that we're going to cache
- * @mask:      the allocation mask
  *
  * This will go through and set bits for the given range.  If any states exist
  * already in this range they are set with the given bit and cleared of the
  * clear_bits.  This is only meant to be used by things that are mergeable, ie
  * converting from say DELALLOC to DIRTY.  This is not meant to be used with
  * boundary bits like LOCK.
+ *
+ * All allocations are done with GFP_NOFS.
  */
 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                       unsigned bits, unsigned clear_bits,
-                      struct extent_state **cached_state, gfp_t mask)
+                      struct extent_state **cached_state)
 {
        struct extent_state *state;
        struct extent_state *prealloc = NULL;
@@ -1098,7 +1104,7 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
        btrfs_debug_check_extent_io_range(tree, start, end);
 
 again:
-       if (!prealloc && gfpflags_allow_blocking(mask)) {
+       if (!prealloc) {
                /*
                 * Best effort, don't worry if extent state allocation fails
                 * here for the first iteration. We might have a cached state
@@ -1106,7 +1112,7 @@ again:
                 * extent state allocations are needed. We'll only know this
                 * after locking the tree.
                 */
-               prealloc = alloc_extent_state(mask);
+               prealloc = alloc_extent_state(GFP_NOFS);
                if (!prealloc && !first_iteration)
                        return -ENOMEM;
        }
@@ -1263,7 +1269,13 @@ hit_next:
                goto out;
        }
 
-       goto search_again;
+search_again:
+       if (start > end)
+               goto out;
+       spin_unlock(&tree->lock);
+       cond_resched();
+       first_iteration = false;
+       goto again;
 
 out:
        spin_unlock(&tree->lock);
@@ -1271,21 +1283,11 @@ out:
                free_extent_state(prealloc);
 
        return err;
-
-search_again:
-       if (start > end)
-               goto out;
-       spin_unlock(&tree->lock);
-       if (gfpflags_allow_blocking(mask))
-               cond_resched();
-       first_iteration = false;
-       goto again;
 }
 
 /* wrappers around set/clear extent bit */
 int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-                          unsigned bits, gfp_t mask,
-                          struct extent_changeset *changeset)
+                          unsigned bits, struct extent_changeset *changeset)
 {
        /*
         * We don't support EXTENT_LOCKED yet, as current changeset will
@@ -1295,7 +1297,7 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
         */
        BUG_ON(bits & EXTENT_LOCKED);
 
-       return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, mask,
+       return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
                                changeset);
 }
 
@@ -1308,8 +1310,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 }
 
 int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-                            unsigned bits, gfp_t mask,
-                            struct extent_changeset *changeset)
+               unsigned bits, struct extent_changeset *changeset)
 {
        /*
         * Don't support EXTENT_LOCKED case, same reason as
@@ -1317,7 +1318,7 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
         */
        BUG_ON(bits & EXTENT_LOCKED);
 
-       return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask,
+       return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS,
                                  changeset);
 }
 
@@ -1975,13 +1976,13 @@ int free_io_failure(struct inode *inode, struct io_failure_record *rec)
        set_state_failrec(failure_tree, rec->start, NULL);
        ret = clear_extent_bits(failure_tree, rec->start,
                                rec->start + rec->len - 1,
-                               EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+                               EXTENT_LOCKED | EXTENT_DIRTY);
        if (ret)
                err = ret;
 
        ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
                                rec->start + rec->len - 1,
-                               EXTENT_DAMAGED, GFP_NOFS);
+                               EXTENT_DAMAGED);
        if (ret && !err)
                err = ret;
 
@@ -2024,9 +2025,16 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
        bio->bi_iter.bi_size = 0;
        map_length = length;
 
+       /*
+        * Avoid races with device replace and make sure our bbio has devices
+        * associated to its stripes that don't go away while we are doing the
+        * read repair operation.
+        */
+       btrfs_bio_counter_inc_blocked(fs_info);
        ret = btrfs_map_block(fs_info, WRITE, logical,
                              &map_length, &bbio, mirror_num);
        if (ret) {
+               btrfs_bio_counter_dec(fs_info);
                bio_put(bio);
                return -EIO;
        }
@@ -2036,6 +2044,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
        dev = bbio->stripes[mirror_num-1].dev;
        btrfs_put_bbio(bbio);
        if (!dev || !dev->bdev || !dev->writeable) {
+               btrfs_bio_counter_dec(fs_info);
                bio_put(bio);
                return -EIO;
        }
@@ -2044,6 +2053,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
 
        if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) {
                /* try to remap that extent elsewhere? */
+               btrfs_bio_counter_dec(fs_info);
                bio_put(bio);
                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
                return -EIO;
@@ -2053,6 +2063,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
                "read error corrected: ino %llu off %llu (dev %s sector %llu)",
                                  btrfs_ino(inode), start,
                                  rcu_str_deref(dev->name), sector);
+       btrfs_bio_counter_dec(fs_info);
        bio_put(bio);
        return 0;
 }
@@ -2232,13 +2243,12 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
 
                /* set the bits in the private failure tree */
                ret = set_extent_bits(failure_tree, start, end,
-                                       EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+                                       EXTENT_LOCKED | EXTENT_DIRTY);
                if (ret >= 0)
                        ret = set_state_failrec(failure_tree, start, failrec);
                /* set the bits in the inode's tree */
                if (ret >= 0)
-                       ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
-                                               GFP_NOFS);
+                       ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED);
                if (ret < 0) {
                        kfree(failrec);
                        return ret;
@@ -4389,8 +4399,12 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
        if (ret < 0) {
                btrfs_free_path(path);
                return ret;
+       } else {
+               WARN_ON(!ret);
+               if (ret == 1)
+                       ret = 0;
        }
-       WARN_ON(!ret);
+
        path->slots[0]--;
        btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
        found_type = found_key.type;
@@ -4601,7 +4615,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
                if (mapped)
                        spin_unlock(&page->mapping->private_lock);
 
-               /* One for when we alloced the page */
+               /* One for when we allocated the page */
                put_page(page);
        } while (index != 0);
 }
@@ -5761,7 +5775,7 @@ int try_release_extent_buffer(struct page *page)
        struct extent_buffer *eb;
 
        /*
-        * We need to make sure noboody is attaching this page to an eb right
+        * We need to make sure nobody is attaching this page to an eb right
         * now.
         */
        spin_lock(&page->mapping->private_lock);
index 981f402..1baf19c 100644 (file)
@@ -220,8 +220,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
                   unsigned bits, int filled,
                   struct extent_state *cached_state);
 int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-                            unsigned bits, gfp_t mask,
-                            struct extent_changeset *changeset);
+               unsigned bits, struct extent_changeset *changeset);
 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                     unsigned bits, int wake, int delete,
                     struct extent_state **cached, gfp_t mask);
@@ -240,27 +239,27 @@ static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start,
 }
 
 static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
-               u64 end, unsigned bits, gfp_t mask)
+               u64 end, unsigned bits)
 {
        int wake = 0;
 
        if (bits & EXTENT_LOCKED)
                wake = 1;
 
-       return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask);
+       return clear_extent_bit(tree, start, end, bits, wake, 0, NULL,
+                       GFP_NOFS);
 }
 
 int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-                          unsigned bits, gfp_t mask,
-                          struct extent_changeset *changeset);
+                          unsigned bits, struct extent_changeset *changeset);
 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                   unsigned bits, u64 *failed_start,
                   struct extent_state **cached_state, gfp_t mask);
 
 static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
-               u64 end, unsigned bits, gfp_t mask)
+               u64 end, unsigned bits)
 {
-       return set_extent_bit(tree, start, end, bits, NULL, NULL, mask);
+       return set_extent_bit(tree, start, end, bits, NULL, NULL, GFP_NOFS);
 }
 
 static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
@@ -278,37 +277,38 @@ static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
 }
 
 static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
-               u64 end, gfp_t mask)
+               u64 end)
 {
        return clear_extent_bit(tree, start, end,
                                EXTENT_DIRTY | EXTENT_DELALLOC |
-                               EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
+                               EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
 }
 
 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                       unsigned bits, unsigned clear_bits,
-                      struct extent_state **cached_state, gfp_t mask);
+                      struct extent_state **cached_state);
 
 static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start,
-               u64 end, struct extent_state **cached_state, gfp_t mask)
+               u64 end, struct extent_state **cached_state)
 {
        return set_extent_bit(tree, start, end,
                              EXTENT_DELALLOC | EXTENT_UPTODATE,
-                             NULL, cached_state, mask);
+                             NULL, cached_state, GFP_NOFS);
 }
 
 static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start,
-               u64 end, struct extent_state **cached_state, gfp_t mask)
+               u64 end, struct extent_state **cached_state)
 {
        return set_extent_bit(tree, start, end,
                              EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
-                             NULL, cached_state, mask);
+                             NULL, cached_state, GFP_NOFS);
 }
 
 static inline int set_extent_new(struct extent_io_tree *tree, u64 start,
-               u64 end, gfp_t mask)
+               u64 end)
 {
-       return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, mask);
+       return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL,
+                       GFP_NOFS);
 }
 
 static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start,
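
Taken together, the header changes above drop the gfp_t parameter from these wrappers and hardcode GFP_NOFS inside them, which is what the call-site hunks elsewhere in this merge rely on. A before/after sketch of a typical call site (io_tree, start and end stand for whatever the caller already has in hand):

	/* before this series: every caller passed the allocation mask */
	clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, GFP_NOFS);
	set_extent_bits(io_tree, start, end, EXTENT_DAMAGED, GFP_NOFS);

	/* after: the wrappers use GFP_NOFS internally */
	clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
	set_extent_bits(io_tree, start, end, EXTENT_DAMAGED);
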
index 318b048..e0715fc 100644 (file)
@@ -62,7 +62,7 @@ struct extent_map *alloc_extent_map(void)
 
 /**
  * free_extent_map - drop reference count of an extent_map
- * @em:                extent map being releasead
+ * @em:                extent map being released
  *
  * Drops the reference out on @em by one and free the structure
  * if the reference count hits zero.
index 7a7d6e2..62a81ee 100644 (file)
@@ -248,7 +248,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
                                    BTRFS_DATA_RELOC_TREE_OBJECTID) {
                                        set_extent_bits(io_tree, offset,
                                                offset + root->sectorsize - 1,
-                                               EXTENT_NODATASUM, GFP_NOFS);
+                                               EXTENT_NODATASUM);
                                } else {
                                        btrfs_info(BTRFS_I(inode)->root->fs_info,
                                                   "no csum found for inode %llu start %llu",
index c98805c..e0c9bd3 100644 (file)
@@ -1596,6 +1596,13 @@ again:
 
                copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
 
+               num_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
+                                               reserve_bytes);
+               dirty_sectors = round_up(copied + sector_offset,
+                                       root->sectorsize);
+               dirty_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
+                                               dirty_sectors);
+
                /*
                 * if we have trouble faulting in the pages, fall
                 * back to one page at a time
@@ -1605,6 +1612,7 @@ again:
 
                if (copied == 0) {
                        force_page_uptodate = true;
+                       dirty_sectors = 0;
                        dirty_pages = 0;
                } else {
                        force_page_uptodate = false;
@@ -1615,20 +1623,19 @@ again:
                /*
                 * If we had a short copy we need to release the excess delaloc
                 * bytes we reserved.  We need to increment outstanding_extents
-                * because btrfs_delalloc_release_space will decrement it, but
+                * because btrfs_delalloc_release_space and
+                * btrfs_delalloc_release_metadata will decrement it, but
                 * we still have an outstanding extent for the chunk we actually
                 * managed to copy.
                 */
-               num_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
-                                               reserve_bytes);
-               dirty_sectors = round_up(copied + sector_offset,
-                                       root->sectorsize);
-               dirty_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
-                                               dirty_sectors);
-
                if (num_sectors > dirty_sectors) {
-                       release_bytes = (write_bytes - copied)
-                               & ~((u64)root->sectorsize - 1);
+                       /*
+                        * we round down because we don't want to count
+                        * any partial blocks actually sent through the
+                        * IO machines
+                        */
+                       release_bytes = round_down(release_bytes - copied,
+                                     root->sectorsize);
                        if (copied > 0) {
                                spin_lock(&BTRFS_I(inode)->lock);
                                BTRFS_I(inode)->outstanding_extents++;
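
As a rough worked example of the release computation in the hunk above (assuming a 4096-byte sectorsize, sector_offset of 0, a short copy of 5000 bytes, and release_bytes still holding the full 16384-byte reservation at this point): dirty_sectors = round_up(5000, 4096) / 4096 = 2 while num_sectors = 16384 / 4096 = 4, so release_bytes = round_down(16384 - 5000, 4096) = 8192, i.e. two of the four reserved sectors are handed back and the two sectors the copy actually dirtied stay reserved.
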
@@ -2022,7 +2029,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
             BTRFS_I(inode)->last_trans
             <= root->fs_info->last_trans_committed)) {
                /*
-                * We'v had everything committed since the last time we were
+                * We've had everything committed since the last time we were
                 * modified so clear this flag in case it was set for whatever
                 * reason, it's no longer relevant.
                 */
@@ -2370,7 +2377,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 
        /* Check the aligned pages after the first unaligned page,
         * if offset != orig_start, which means the first unaligned page
-        * including serveral following pages are already in holes,
+        * including several following pages are already in holes,
         * the extra check can be skipped */
        if (offset == orig_start) {
                /* after truncate page, check hole again */
index 5e6062c..c6dc118 100644 (file)
@@ -1983,7 +1983,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
                /*
                 * If this block group has some small extents we don't want to
                 * use up all of our free slots in the cache with them, we want
-                * to reserve them to larger extents, however if we have plent
+                * to reserve them to larger extents, however if we have plenty
                 * of cache left then go ahead an dadd them, no sense in adding
                 * the overhead of a bitmap if we don't have to.
                 */
index 33178c4..3af651c 100644 (file)
@@ -123,7 +123,7 @@ int btrfs_return_cluster_to_free_space(
 int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
                           u64 *trimmed, u64 start, u64 end, u64 minlen);
 
-/* Support functions for runnint our sanity tests */
+/* Support functions for running our sanity tests */
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 int test_add_free_space_entry(struct btrfs_block_group_cache *cache,
                              u64 offset, u64 bytes, bool bitmap);
index 91419ef..8b1212e 100644 (file)
@@ -455,7 +455,7 @@ again:
 
        /*
         * skip compression for a small file range(<=blocksize) that
-        * isn't an inline extent, since it dosen't save disk space at all.
+        * isn't an inline extent, since it doesn't save disk space at all.
         */
        if (total_compressed <= blocksize &&
           (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
@@ -1978,7 +1978,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
 {
        WARN_ON((end & (PAGE_SIZE - 1)) == 0);
        return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
-                                  cached_state, GFP_NOFS);
+                                  cached_state);
 }
 
 /* see btrfs_writepage_start_hook for details on why this is required */
@@ -3119,8 +3119,7 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
 
        if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
            test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
-               clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
-                                 GFP_NOFS);
+               clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
                return 0;
        }
 
@@ -3722,7 +3721,7 @@ cache_index:
         * and doesn't have an inode ref with the name "bar" anymore.
         *
         * Setting last_unlink_trans to last_trans is a pessimistic approach,
-        * but it guarantees correctness at the expense of ocassional full
+        * but it guarantees correctness at the expense of occasional full
         * transaction commits on fsync if our inode is a directory, or if our
         * inode is not a directory, logging its parent unnecessarily.
         */
@@ -4978,7 +4977,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
                 * be instantly completed which will give us extents that need
                 * to be truncated.  If we fail to get an orphan inode down we
                 * could have left over extents that were never meant to live,
-                * so we need to garuntee from this point on that everything
+                * so we need to guarantee from this point on that everything
                 * will be consistent.
                 */
                ret = btrfs_orphan_add(trans, inode);
@@ -5248,7 +5247,7 @@ void btrfs_evict_inode(struct inode *inode)
                }
 
                /*
-                * We can't just steal from the global reserve, we need tomake
+                * We can't just steal from the global reserve, we need to make
                 * sure there is room to do it, if not we need to commit and try
                 * again.
                 */
@@ -6980,7 +6979,18 @@ insert:
                 * existing will always be non-NULL, since there must be
                 * extent causing the -EEXIST.
                 */
-               if (start >= extent_map_end(existing) ||
+               if (existing->start == em->start &&
+                   extent_map_end(existing) == extent_map_end(em) &&
+                   em->block_start == existing->block_start) {
+                       /*
+                        * these two extents are the same, it happens
+                        * with inlines especially
+                        */
+                       free_extent_map(em);
+                       em = existing;
+                       err = 0;
+
+               } else if (start >= extent_map_end(existing) ||
                    start <= existing->start) {
                        /*
                         * The existing extent map is the one nearest to
@@ -7433,7 +7443,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
                                 cached_state);
                /*
                 * We're concerned with the entire range that we're going to be
-                * doing DIO to, so we need to make sure theres no ordered
+                * doing DIO to, so we need to make sure there's no ordered
                 * extents in this range.
                 */
                ordered = btrfs_lookup_ordered_range(inode, lockstart,
@@ -7595,7 +7605,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
        if (current->journal_info) {
                /*
                 * Need to pull our outstanding extents and set journal_info to NULL so
-                * that anything that needs to check if there's a transction doesn't get
+                * that anything that needs to check if there's a transaction doesn't get
                 * confused.
                 */
                dio_data = current->journal_info;
@@ -7628,7 +7638,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
         * decompress it, so there will be buffering required no matter what we
         * do, so go ahead and fallback to buffered.
         *
-        * We return -ENOTBLK because thats what makes DIO go ahead and go back
+        * We return -ENOTBLK because that's what makes DIO go ahead and go back
         * to buffered IO.  Don't blame me, this is the price we pay for using
         * the generic code.
         */
@@ -9041,7 +9051,7 @@ static int btrfs_truncate(struct inode *inode)
                return ret;
 
        /*
-        * Yes ladies and gentelment, this is indeed ugly.  The fact is we have
+        * Yes ladies and gentlemen, this is indeed ugly.  The fact is we have
         * 3 things going on here
         *
         * 1) We need to reserve space for our orphan item and the space to
@@ -9055,15 +9065,15 @@ static int btrfs_truncate(struct inode *inode)
         * space reserved in case it uses space during the truncate (thank you
         * very much snapshotting).
         *
-        * And we need these to all be seperate.  The fact is we can use alot of
+        * And we need these to all be separate.  The fact is we can use a lot of
         * space doing the truncate, and we have no earthly idea how much space
-        * we will use, so we need the truncate reservation to be seperate so it
+        * we will use, so we need the truncate reservation to be separate so it
         * doesn't end up using space reserved for updating the inode or
         * removing the orphan item.  We also need to be able to stop the
         * transaction and start a new one, which means we need to be able to
         * update the inode several times, and we have no idea of knowing how
         * many times that will be, so we can't just reserve 1 item for the
-        * entirety of the opration, so that has to be done seperately as well.
+        * entirety of the operation, so that has to be done separately as well.
         * Then there is the orphan item, which does indeed need to be held on
         * to for the whole operation, and we need nobody to touch this reserved
         * space except the orphan code.
index 4e70069..0517356 100644 (file)
@@ -296,7 +296,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
                }
        } else {
                /*
-                * Revert back under same assuptions as above
+                * Revert back under same assumptions as above
                 */
                if (S_ISREG(mode)) {
                        if (inode->i_size == 0)
@@ -465,7 +465,7 @@ static noinline int create_subvol(struct inode *dir,
 
        /*
         * Don't create subvolume whose level is not zero. Or qgroup will be
-        * screwed up since it assume subvolme qgroup's level to be 0.
+        * screwed up since it assumes subvolume qgroup's level to be 0.
         */
        if (btrfs_qgroup_level(objectid)) {
                ret = -ENOSPC;
@@ -780,7 +780,7 @@ free_pending:
  *     a. be owner of dir, or
  *     b. be owner of victim, or
  *     c. have CAP_FOWNER capability
- *  6. If the victim is append-only or immutable we can't do antyhing with
+ *  6. If the victim is append-only or immutable we can't do anything with
  *     links pointing to it.
  *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
  *  8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
@@ -846,11 +846,9 @@ static noinline int btrfs_mksubvol(struct path *parent,
        struct dentry *dentry;
        int error;
 
-       inode_lock_nested(dir, I_MUTEX_PARENT);
-       // XXX: should've been
-       // mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
-       // if (error == -EINTR)
-       //      return error;
+       error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
+       if (error == -EINTR)
+               return error;
 
        dentry = lookup_one_len(name, parent->dentry, namelen);
        error = PTR_ERR(dentry);
@@ -1239,7 +1237,7 @@ again:
 
 
        set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1,
-                         &cached_state, GFP_NOFS);
+                         &cached_state);
 
        unlock_extent_cached(&BTRFS_I(inode)->io_tree,
                             page_start, page_end - 1, &cached_state,
@@ -2377,11 +2375,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
                goto out;
 
 
-       inode_lock_nested(dir, I_MUTEX_PARENT);
-       // XXX: should've been
-       // err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
-       // if (err == -EINTR)
-       //      goto out_drop_write;
+       err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
+       if (err == -EINTR)
+               goto out_drop_write;
        dentry = lookup_one_len(vol_args->name, parent, namelen);
        if (IS_ERR(dentry)) {
                err = PTR_ERR(dentry);
@@ -2571,7 +2567,7 @@ out_dput:
        dput(dentry);
 out_unlock_dir:
        inode_unlock(dir);
-//out_drop_write:
+out_drop_write:
        mnt_drop_write_file(file);
 out:
        kfree(vol_args);
@@ -4654,7 +4650,7 @@ again:
        }
 
        /*
-        * mut. excl. ops lock is locked.  Three possibilites:
+        * mut. excl. ops lock is locked.  Three possibilities:
         *   (1) some other op is running
         *   (2) balance is running
         *   (3) balance is paused -- special case (think resume)
@@ -5571,7 +5567,7 @@ long btrfs_ioctl(struct file *file, unsigned int
                ret = btrfs_sync_fs(file_inode(file)->i_sb, 1);
                /*
                 * The transaction thread may want to do more work,
-                * namely it pokes the cleaner ktread that will start
+                * namely it pokes the cleaner kthread that will start
                 * processing uncleaned subvols.
                 */
                wake_up_process(root->fs_info->transaction_kthread);
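
Both ioctl hunks above replace the unconditional inode_lock_nested() with a killable acquisition of the directory's i_rwsem and reinstate the out_drop_write error path, so a fatal signal aborts the ioctl instead of blocking. The shape of the pattern, as a sketch (what the labels undo depends on the caller):

	err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
	if (err == -EINTR)
		goto out_drop_write;	/* undo mnt_want_write_file() and return */

	dentry = lookup_one_len(name, parent, namelen);
	/* ... the actual subvolume/snapshot work runs here ... */

	inode_unlock(dir);		/* drops the i_rwsem taken above */
out_drop_write:
	mnt_drop_write_file(file);
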
index 5591704..e96634a 100644 (file)
@@ -718,12 +718,13 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr,
        return count;
 }
 
-void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr,
+int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr,
                              const u64 range_start, const u64 range_len)
 {
        struct btrfs_root *root;
        struct list_head splice;
        int done;
+       int total_done = 0;
 
        INIT_LIST_HEAD(&splice);
 
@@ -742,6 +743,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr,
                done = btrfs_wait_ordered_extents(root, nr,
                                                  range_start, range_len);
                btrfs_put_fs_root(root);
+               total_done += done;
 
                spin_lock(&fs_info->ordered_root_lock);
                if (nr != -1) {
@@ -752,6 +754,8 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr,
        list_splice_tail(&splice, &fs_info->ordered_roots);
        spin_unlock(&fs_info->ordered_root_lock);
        mutex_unlock(&fs_info->ordered_operations_mutex);
+
+       return total_done;
 }
 
 /*
index 8ef1262..4515077 100644 (file)
@@ -58,7 +58,7 @@ struct btrfs_ordered_sum {
 
 #define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */
 
-#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
+#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to preallocated extent */
 
 #define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
 
@@ -199,7 +199,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
                           u32 *sum, int len);
 int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr,
                               const u64 range_start, const u64 range_len);
-void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr,
+int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr,
                              const u64 range_start, const u64 range_len);
 void btrfs_get_logged_extents(struct inode *inode,
                              struct list_head *logged_list,
index 9e11955..9d4c05b 100644 (file)
@@ -85,7 +85,7 @@ struct btrfs_qgroup {
 
        /*
         * temp variables for accounting operations
-        * Refer to qgroup_shared_accouting() for details.
+        * Refer to qgroup_shared_accounting() for details.
         */
        u64 old_refcnt;
        u64 new_refcnt;
@@ -499,7 +499,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
        }
        /*
         * we call btrfs_free_qgroup_config() when umounting
-        * filesystem and disabling quota, so we set qgroup_ulit
+        * filesystem and disabling quota, so we set qgroup_ulist
         * to be null here to avoid double free.
         */
        ulist_free(fs_info->qgroup_ulist);
@@ -1036,7 +1036,7 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info,
 
 /*
  * The easy accounting, if we are adding/removing the only ref for an extent
- * then this qgroup and all of the parent qgroups get their refrence and
+ * then this qgroup and all of the parent qgroups get their reference and
  * exclusive counts adjusted.
  *
  * Caller should hold fs_info->qgroup_lock.
@@ -1436,7 +1436,7 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
 
        /*
         * No need to do lock, since this function will only be called in
-        * btrfs_commmit_transaction().
+        * btrfs_commit_transaction().
         */
        node = rb_first(&delayed_refs->dirty_extent_root);
        while (node) {
@@ -1557,7 +1557,7 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
  * A:  cur_old_roots < nr_old_roots    (not exclusive before)
  * !A: cur_old_roots == nr_old_roots   (possible exclusive before)
  * B:  cur_new_roots < nr_new_roots    (not exclusive now)
- * !B: cur_new_roots == nr_new_roots   (possible exclsuive now)
+ * !B: cur_new_roots == nr_new_roots   (possible exclusive now)
  *
  * Results:
  * +: Possible sharing -> exclusive    -: Possible exclusive -> sharing
@@ -1851,7 +1851,7 @@ out:
 }
 
 /*
- * Copy the acounting information between qgroups. This is necessary
+ * Copy the accounting information between qgroups. This is necessary
  * when a snapshot or a subvolume is created. Throwing an error will
  * cause a transaction abort so we take extra care here to only error
  * when a readonly fs is a reasonable outcome.
@@ -2340,7 +2340,7 @@ out:
        mutex_unlock(&fs_info->qgroup_rescan_lock);
 
        /*
-        * only update status, since the previous part has alreay updated the
+        * only update status, since the previous part has already updated the
         * qgroup info.
         */
        trans = btrfs_start_transaction(fs_info->quota_root, 1);
@@ -2542,8 +2542,7 @@ int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len)
        changeset.bytes_changed = 0;
        changeset.range_changed = ulist_alloc(GFP_NOFS);
        ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
-                       start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS,
-                       &changeset);
+                       start + len -1, EXTENT_QGROUP_RESERVED, &changeset);
        trace_btrfs_qgroup_reserve_data(inode, start, len,
                                        changeset.bytes_changed,
                                        QGROUP_RESERVE);
@@ -2580,8 +2579,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len,
                return -ENOMEM;
 
        ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start, 
-                       start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS,
-                       &changeset);
+                       start + len -1, EXTENT_QGROUP_RESERVED, &changeset);
        if (ret < 0)
                goto out;
 
@@ -2672,7 +2670,7 @@ void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes)
 }
 
 /*
- * Check qgroup reserved space leaking, normally at destory inode
+ * Check qgroup reserved space leaking, normally at destroy inode
  * time
  */
 void btrfs_qgroup_check_reserved_leak(struct inode *inode)
@@ -2688,7 +2686,7 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
                return;
 
        ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
-                       EXTENT_QGROUP_RESERVED, GFP_NOFS, &changeset);
+                       EXTENT_QGROUP_RESERVED, &changeset);
 
        WARN_ON(ret < 0);
        if (WARN_ON(changeset.bytes_changed)) {
index 0b7792e..f8b6d41 100644 (file)
@@ -576,7 +576,7 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
         * we can't merge with cached rbios, since the
         * idea is that when we merge the destination
         * rbio is going to run our IO for us.  We can
-        * steal from cached rbio's though, other functions
+        * steal from cached rbios though, other functions
         * handle that.
         */
        if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
@@ -2368,7 +2368,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
                        run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
                }
 
-               /* Check scrubbing pairty and repair it */
+               /* Check scrubbing parity and repair it */
                p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
                parity = kmap(p);
                if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
@@ -2493,7 +2493,7 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
                /*
                 * Here means we got one corrupted data stripe and one
                 * corrupted parity on RAID6, if the corrupted parity
-                * is scrubbing parity, luckly, use the other one to repair
+                * is scrubbing parity, luckily, use the other one to repair
                 * the data, or we can not repair the data stripe.
                 */
                if (failp != rbio->scrubp)
index 298631e..8428db7 100644 (file)
@@ -761,12 +761,14 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info)
 
        do {
                enqueued = 0;
+               mutex_lock(&fs_devices->device_list_mutex);
                list_for_each_entry(device, &fs_devices->devices, dev_list) {
                        if (atomic_read(&device->reada_in_flight) <
                            MAX_IN_FLIGHT)
                                enqueued += reada_start_machine_dev(fs_info,
                                                                    device);
                }
+               mutex_unlock(&fs_devices->device_list_mutex);
                total += enqueued;
        } while (enqueued && total < 10000);
 
index 1cfd35c..0477dca 100644 (file)
@@ -668,8 +668,8 @@ int find_inline_backref(struct extent_buffer *leaf, int slot,
  * roots of b-trees that reference the tree block.
  *
  * the basic idea of this function is check backrefs of a given block
- * to find upper level blocks that refernece the block, and then check
- * bakcrefs of these upper level blocks recursively. the recursion stop
+ * to find upper level blocks that reference the block, and then check
+ * backrefs of these upper level blocks recursively. the recursion stop
  * when tree root is reached or backrefs for the block is cached.
  *
  * NOTE: if we find backrefs for a block are cached, we know backrefs
@@ -1160,7 +1160,7 @@ out:
                        if (!RB_EMPTY_NODE(&upper->rb_node))
                                continue;
 
-                       /* Add this guy's upper edges to the list to proces */
+                       /* Add this guy's upper edges to the list to process */
                        list_for_each_entry(edge, &upper->upper, list[LOWER])
                                list_add_tail(&edge->list[UPPER], &list);
                        if (list_empty(&upper->upper))
@@ -2396,7 +2396,7 @@ again:
                }
 
                /*
-                * we keep the old last snapshod transid in rtranid when we
+                * we keep the old last snapshot transid in rtranid when we
                 * created the relocation tree.
                 */
                last_snap = btrfs_root_rtransid(&reloc_root->root_item);
@@ -2616,7 +2616,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
                         * only one thread can access block_rsv at this point,
                         * so we don't need hold lock to protect block_rsv.
                         * we expand more reservation size here to allow enough
-                        * space for relocation and we will return eailer in
+                        * space for relocation and we will return earlier in
                         * enospc case.
                         */
                        rc->block_rsv->size = tmp + rc->extent_root->nodesize *
@@ -2814,7 +2814,7 @@ static void mark_block_processed(struct reloc_control *rc,
                                 u64 bytenr, u32 blocksize)
 {
        set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
-                       EXTENT_DIRTY, GFP_NOFS);
+                       EXTENT_DIRTY);
 }
 
 static void __mark_block_processed(struct reloc_control *rc,
@@ -3182,7 +3182,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
                    page_start + offset == cluster->boundary[nr]) {
                        set_extent_bits(&BTRFS_I(inode)->io_tree,
                                        page_start, page_end,
-                                       EXTENT_BOUNDARY, GFP_NOFS);
+                                       EXTENT_BOUNDARY);
                        nr++;
                }
 
@@ -4059,8 +4059,7 @@ restart:
        }
 
        btrfs_release_path(path);
-       clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
-                         GFP_NOFS);
+       clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY);
 
        if (trans) {
                btrfs_end_transaction_throttle(trans, rc->extent_root);
@@ -4591,7 +4590,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
 
 /*
  * called before creating snapshot. it calculates metadata reservation
- * requried for relocating tree blocks in the snapshot
+ * required for relocating tree blocks in the snapshot
  */
 void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
                              u64 *bytes_to_reserve)
index b2b14e7..f1c3086 100644 (file)
@@ -71,9 +71,9 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
  * search_key: the key to search
  * path: the path we search
  * root_item: the root item of the tree we look for
- * root_key: the reak key of the tree we look for
+ * root_key: the root key of the tree we look for
  *
- * If ->offset of 'seach_key' is -1ULL, it means we are not sure the offset
+ * If ->offset of 'search_key' is -1ULL, it means we are not sure the offset
  * of the search key, just lookup the root with the highest offset for a
  * given objectid.
  *
index fa35cdc..70427ef 100644 (file)
@@ -745,7 +745,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
                 * sure we read the bad mirror.
                 */
                ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
-                                       EXTENT_DAMAGED, GFP_NOFS);
+                                       EXTENT_DAMAGED);
                if (ret) {
                        /* set_extent_bits should give proper error */
                        WARN_ON(ret > 0);
@@ -763,7 +763,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
                                                end, EXTENT_DAMAGED, 0, NULL);
                if (!corrected)
                        clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
-                                               EXTENT_DAMAGED, GFP_NOFS);
+                                               EXTENT_DAMAGED);
        }
 
 out:
@@ -1044,7 +1044,7 @@ nodatasum_case:
 
                /*
                 * !is_metadata and !have_csum, this means that the data
-                * might not be COW'ed, that it might be modified
+                * might not be COWed, that it might be modified
                 * concurrently. The general strategy to work on the
                 * commit root does not help in the case when COW is not
                 * used.
@@ -1125,7 +1125,7 @@ nodatasum_case:
         * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
         * of mirror #2 is readable but the final checksum test fails,
         * then the 2nd page of mirror #3 could be tried, whether now
-        * the final checksum succeedes. But this would be a rare
+        * the final checksum succeeds. But this would be a rare
         * exception and is therefore not implemented. At least it is
         * avoided that the good copy is overwritten.
         * A more useful improvement would be to pick the sectors
@@ -2181,7 +2181,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
        struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
        u64 length = sblock->page_count * PAGE_SIZE;
        u64 logical = sblock->pagev[0]->logical;
-       struct btrfs_bio *bbio;
+       struct btrfs_bio *bbio = NULL;
        struct bio *bio;
        struct btrfs_raid_bio *rbio;
        int ret;
@@ -2982,6 +2982,7 @@ again:
                                                       extent_len);
 
                        mapped_length = extent_len;
+                       bbio = NULL;
                        ret = btrfs_map_block(fs_info, READ, extent_logical,
                                              &mapped_length, &bbio, 0);
                        if (!ret) {
@@ -3581,6 +3582,46 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
                 */
                scrub_pause_on(fs_info);
                ret = btrfs_inc_block_group_ro(root, cache);
+               if (!ret && is_dev_replace) {
+                       /*
+                        * If we are doing a device replace wait for any tasks
+                        * that started delalloc right before we set the block
+                        * group to RO mode, as they might have just allocated
+                        * an extent from it or decided they could do a nocow
+                        * write. And if any such tasks did that, wait for their
+                        * ordered extents to complete and then commit the
+                        * current transaction, so that we can later see the new
+                        * extent items in the extent tree - the ordered extents
+                        * create delayed data references (for cow writes) when
+                        * they complete, which will be run and insert the
+                        * corresponding extent items into the extent tree when
+                        * we commit the transaction they used when running
+                        * inode.c:btrfs_finish_ordered_io(). We later use
+                        * the commit root of the extent tree to find extents
+                        * to copy from the srcdev into the tgtdev, and we don't
+                        * want to miss any new extents.
+                        */
+                       btrfs_wait_block_group_reservations(cache);
+                       btrfs_wait_nocow_writers(cache);
+                       ret = btrfs_wait_ordered_roots(fs_info, -1,
+                                                      cache->key.objectid,
+                                                      cache->key.offset);
+                       if (ret > 0) {
+                               struct btrfs_trans_handle *trans;
+
+                               trans = btrfs_join_transaction(root);
+                               if (IS_ERR(trans))
+                                       ret = PTR_ERR(trans);
+                               else
+                                       ret = btrfs_commit_transaction(trans,
+                                                                      root);
+                               if (ret) {
+                                       scrub_pause_off(fs_info);
+                                       btrfs_put_block_group(cache);
+                                       break;
+                               }
+                       }
+               }
                scrub_pause_off(fs_info);
 
                if (ret == 0) {
@@ -3601,9 +3642,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
                        break;
                }
 
+               btrfs_dev_replace_lock(&fs_info->dev_replace, 1);
                dev_replace->cursor_right = found_key.offset + length;
                dev_replace->cursor_left = found_key.offset;
                dev_replace->item_needs_writeback = 1;
+               btrfs_dev_replace_unlock(&fs_info->dev_replace, 1);
                ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
                                  found_key.offset, cache, is_dev_replace);
 
@@ -3639,6 +3682,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 
                scrub_pause_off(fs_info);
 
+               btrfs_dev_replace_lock(&fs_info->dev_replace, 1);
+               dev_replace->cursor_left = dev_replace->cursor_right;
+               dev_replace->item_needs_writeback = 1;
+               btrfs_dev_replace_unlock(&fs_info->dev_replace, 1);
+
                if (ro_set)
                        btrfs_dec_block_group_ro(root, cache);
 
@@ -3676,9 +3724,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
                        ret = -ENOMEM;
                        break;
                }
-
-               dev_replace->cursor_left = dev_replace->cursor_right;
-               dev_replace->item_needs_writeback = 1;
 skip:
                key.offset = found_key.offset + length;
                btrfs_release_path(path);
index 6a8c860..b71dd29 100644 (file)
@@ -1831,7 +1831,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
 
        /*
         * If we have a parent root we need to verify that the parent dir was
-        * not delted and then re-created, if it was then we have no overwrite
+        * not deleted and then re-created, if it was then we have no overwrite
         * and we can just unlink this entry.
         */
        if (sctx->parent_root) {
@@ -4192,9 +4192,9 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
                return -ENOMEM;
 
        /*
-        * This hack is needed because empty acl's are stored as zero byte
+        * This hack is needed because empty acls are stored as zero byte
         * data in xattrs. Problem with that is, that receiving these zero byte
-        * acl's will fail later. To fix this, we send a dummy acl list that
+        * acls will fail later. To fix this, we send a dummy acl list that
         * only contains the version number and no entries.
         */
        if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) ||
index e05619f..875c757 100644 (file)
@@ -36,7 +36,7 @@ static inline void put_unaligned_le8(u8 val, void *p)
  *
  * The end result is that anyone who #includes ctree.h gets a
  * declaration for the btrfs_set_foo functions and btrfs_foo functions,
- * which are wappers of btrfs_set_token_#bits functions and
+ * which are wrappers of btrfs_set_token_#bits functions and
  * btrfs_get_token_#bits functions, which are defined in this file.
  *
  * These setget functions do all the extent_buffer related mapping
index bf71071..4e59a91 100644 (file)
@@ -112,7 +112,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
                 * Note that a running device replace operation is not
                 * canceled here although there is no way to update
                 * the progress. It would add the risk of a deadlock,
-                * therefore the canceling is ommited. The only penalty
+                * therefore the canceling is omitted. The only penalty
                 * is that some I/O remains active until the procedure
                 * completes. The next time when the filesystem is
                 * mounted writeable again, the device replace
@@ -1877,7 +1877,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
        int ret;
 
        /*
-        * We aren't under the device list lock, so this is racey-ish, but good
+        * We aren't under the device list lock, so this is racy-ish, but good
         * enough for our purposes.
         */
        nr_devices = fs_info->fs_devices->open_devices;
@@ -1896,7 +1896,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
        if (!devices_info)
                return -ENOMEM;
 
-       /* calc min stripe number for data space alloction */
+       /* calc min stripe number for data space allocation */
        type = btrfs_get_alloc_profile(root, 1);
        if (type & BTRFS_BLOCK_GROUP_RAID0) {
                min_stripes = 2;
@@ -1932,7 +1932,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
                avail_space *= BTRFS_STRIPE_LEN;
 
                /*
-                * In order to avoid overwritting the superblock on the drive,
+                * In order to avoid overwriting the superblock on the drive,
                 * btrfs starts at an offset of at least 1MB when doing chunk
                 * allocation.
                 */
index 70948b1..5572460 100644 (file)
@@ -113,7 +113,7 @@ static int test_find_delalloc(void)
         * |--- delalloc ---|
         * |---  search  ---|
         */
-       set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_KERNEL);
+       set_extent_delalloc(&tmp, 0, 4095, NULL);
        start = 0;
        end = 0;
        found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
@@ -144,7 +144,7 @@ static int test_find_delalloc(void)
                test_msg("Couldn't find the locked page\n");
                goto out_bits;
        }
-       set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_KERNEL);
+       set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL);
        start = test_start;
        end = 0;
        found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
@@ -176,7 +176,7 @@ static int test_find_delalloc(void)
        locked_page = find_lock_page(inode->i_mapping, test_start >>
                                     PAGE_SHIFT);
        if (!locked_page) {
-               test_msg("Could'nt find the locked page\n");
+               test_msg("Couldn't find the locked page\n");
                goto out_bits;
        }
        start = test_start;
@@ -199,7 +199,7 @@ static int test_find_delalloc(void)
         *
         * We are re-using our test_start from above since it works out well.
         */
-       set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_KERNEL);
+       set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL);
        start = test_start;
        end = 0;
        found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
@@ -262,7 +262,7 @@ static int test_find_delalloc(void)
        }
        ret = 0;
 out_bits:
-       clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_KERNEL);
+       clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1);
 out:
        if (locked_page)
                put_page(locked_page);
index 5142475..0eeb8f3 100644 (file)
@@ -25,7 +25,7 @@
 #define BITS_PER_BITMAP                (PAGE_SIZE * 8)
 
 /*
- * This test just does basic sanity checking, making sure we can add an exten
+ * This test just does basic sanity checking, making sure we can add an extent
  * entry and remove space from either end and the middle, and make sure we can
  * remove space that covers adjacent extent entries.
  */
@@ -396,8 +396,9 @@ static int check_cache_empty(struct btrfs_block_group_cache *cache)
  * wasn't optimal as they could be spread all over the block group while under
  * concurrency (extra overhead and fragmentation).
  *
- * This stealing approach is benefical, since we always prefer to allocate from
- * extent entries, both for clustered and non-clustered allocation requests.
+ * This stealing approach is beneficial, since we always prefer to allocate
+ * from extent entries, both for clustered and non-clustered allocation
+ * requests.
  */
 static int
 test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
index 863a6a3..8a25fe8 100644 (file)
@@ -264,7 +264,7 @@ static noinline int test_btrfs_get_extent(void)
 
        /*
         * We will just free a dummy node if it's ref count is 2 so we need an
-        * extra ref so our searches don't accidently release our page.
+        * extra ref so our searches don't accidentally release our page.
         */
        extent_buffer_get(root->node);
        btrfs_set_header_nritems(root->node, 0);
index 8ea5d34..8aa4ded 100644 (file)
@@ -234,7 +234,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
        }
 
        /*
-        * Since the test trans doesn't havee the complicated delayed refs,
+        * Since the test trans doesn't have the complicated delayed refs,
         * we can only call btrfs_qgroup_account_extent() directly to test
         * quota.
         */
index 5b0b758..f6e24cb 100644 (file)
@@ -944,7 +944,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
 
                err = convert_extent_bit(dirty_pages, start, end,
                                         EXTENT_NEED_WAIT,
-                                        mark, &cached_state, GFP_NOFS);
+                                        mark, &cached_state);
                /*
                 * convert_extent_bit can return -ENOMEM, which is most of the
                 * time a temporary error. So when it happens, ignore the error
index 72be51f..9fe0ec2 100644 (file)
@@ -144,7 +144,7 @@ struct btrfs_pending_snapshot {
        /* block reservation for the operation */
        struct btrfs_block_rsv block_rsv;
        u64 qgroup_reserved;
-       /* extra metadata reseration for relocation */
+       /* extra metadata reservation for relocation */
        int error;
        bool readonly;
        struct list_head list;
index 8aaca5c..b7665af 100644 (file)
@@ -2330,7 +2330,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
                                break;
 
                        /* for regular files, make sure corresponding
-                        * orhpan item exist. extents past the new EOF
+                        * orphan item exist. extents past the new EOF
                         * will be truncated later by orphan cleanup.
                         */
                        if (S_ISREG(mode)) {
@@ -3001,7 +3001,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
                        break;
 
                clear_extent_bits(&log->dirty_log_pages, start, end,
-                                 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
+                                 EXTENT_DIRTY | EXTENT_NEW);
        }
 
        /*
@@ -4914,7 +4914,7 @@ out_unlock:
  * the actual unlink operation, so if we do this check before a concurrent task
  * sets last_unlink_trans it means we've logged a consistent version/state of
  * all the inode items, otherwise we are not sure and must do a transaction
- * commit (the concurrent task migth have only updated last_unlink_trans before
+ * commit (the concurrent task might have only updated last_unlink_trans before
  * we logged the inode or it might have also done the unlink).
  */
 static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
@@ -4973,7 +4973,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
        while (1) {
                /*
                 * If we are logging a directory then we start with our inode,
-                * not our parents inode, so we need to skipp setting the
+                * not our parent's inode, so we need to skip setting the
                 * logged_trans so that further down in the log code we don't
                 * think this inode has already been logged.
                 */
@@ -5357,7 +5357,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
                log_dentries = true;
 
        /*
-        * On unlink we must make sure all our current and old parent directores
+        * On unlink we must make sure all our current and old parent directory
         * inodes are fully logged. This is to prevent leaving dangling
         * directory index entries in directories that were our parents but are
         * not anymore. Not doing this results in old parent directory being
index 91feb2b..b1434bb 100644 (file)
@@ -28,7 +28,7 @@
  * }
  * ulist_free(ulist);
  *
- * This assumes the graph nodes are adressable by u64. This stems from the
+ * This assumes the graph nodes are addressable by u64. This stems from the
  * usage for tree enumeration in btrfs, where the logical addresses are
  * 64 bit.
  *
index 2b88127..da9e003 100644 (file)
@@ -2190,7 +2190,7 @@ static int btrfs_prepare_sprout(struct btrfs_root *root)
 }
 
 /*
- * strore the expected generation for seed devices in device items.
+ * Store the expected generation for seed devices in device items.
  */
 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root)
@@ -2761,6 +2761,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
        u64 dev_extent_len = 0;
        u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
        int i, ret = 0;
+       struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
 
        /* Just in case */
        root = root->fs_info->chunk_root;
@@ -2787,12 +2788,19 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
        check_system_chunk(trans, extent_root, map->type);
        unlock_chunks(root->fs_info->chunk_root);
 
+       /*
+        * Take the device list mutex to prevent races with the final phase of
+        * a device replace operation that replaces the device object associated
+        * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
+        */
+       mutex_lock(&fs_devices->device_list_mutex);
        for (i = 0; i < map->num_stripes; i++) {
                struct btrfs_device *device = map->stripes[i].dev;
                ret = btrfs_free_dev_extent(trans, device,
                                            map->stripes[i].physical,
                                            &dev_extent_len);
                if (ret) {
+                       mutex_unlock(&fs_devices->device_list_mutex);
                        btrfs_abort_transaction(trans, root, ret);
                        goto out;
                }
@@ -2811,11 +2819,14 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
                if (map->stripes[i].dev) {
                        ret = btrfs_update_device(trans, map->stripes[i].dev);
                        if (ret) {
+                               mutex_unlock(&fs_devices->device_list_mutex);
                                btrfs_abort_transaction(trans, root, ret);
                                goto out;
                        }
                }
        }
+       mutex_unlock(&fs_devices->device_list_mutex);
+
        ret = btrfs_free_chunk(trans, root, chunk_objectid, chunk_offset);
        if (ret) {
                btrfs_abort_transaction(trans, root, ret);
@@ -3387,7 +3398,7 @@ static int should_balance_chunk(struct btrfs_root *root,
        } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
                /*
                 * Same logic as the 'limit' filter; the minimum cannot be
-                * determined here because we do not have the global informatoin
+                * determined here because we do not have the global information
                 * about the count of all chunks that satisfy the filters.
                 */
                if (bargs->limit_max == 0)
@@ -5762,20 +5773,17 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                        }
                }
                if (found) {
-                       if (physical_of_found + map->stripe_len <=
-                           dev_replace->cursor_left) {
-                               struct btrfs_bio_stripe *tgtdev_stripe =
-                                       bbio->stripes + num_stripes;
+                       struct btrfs_bio_stripe *tgtdev_stripe =
+                               bbio->stripes + num_stripes;
 
-                               tgtdev_stripe->physical = physical_of_found;
-                               tgtdev_stripe->length =
-                                       bbio->stripes[index_srcdev].length;
-                               tgtdev_stripe->dev = dev_replace->tgtdev;
-                               bbio->tgtdev_map[index_srcdev] = num_stripes;
+                       tgtdev_stripe->physical = physical_of_found;
+                       tgtdev_stripe->length =
+                               bbio->stripes[index_srcdev].length;
+                       tgtdev_stripe->dev = dev_replace->tgtdev;
+                       bbio->tgtdev_map[index_srcdev] = num_stripes;
 
-                               tgtdev_indexes++;
-                               num_stripes++;
-                       }
+                       tgtdev_indexes++;
+                       num_stripes++;
                }
        }
 
@@ -6076,7 +6084,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
 {
        atomic_inc(&bbio->error);
        if (atomic_dec_and_test(&bbio->stripes_pending)) {
-               /* Shoud be the original bio. */
+               /* Should be the original bio. */
                WARN_ON(bio != bbio->orig_bio);
 
                btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
@@ -6560,7 +6568,7 @@ int btrfs_read_sys_array(struct btrfs_root *root)
        set_extent_buffer_uptodate(sb);
        btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
        /*
-        * The sb extent buffer is artifical and just used to read the system array.
+        * The sb extent buffer is artificial and just used to read the system array.
         * set_extent_buffer_uptodate() call does not properly mark all it's
         * pages up-to-date when the page is larger: extent does not cover the
         * whole page and consequently check_page_uptodate does not find all
@@ -6630,13 +6638,13 @@ int btrfs_read_sys_array(struct btrfs_root *root)
                sb_array_offset += len;
                cur_offset += len;
        }
-       free_extent_buffer(sb);
+       free_extent_buffer_stale(sb);
        return ret;
 
 out_short_read:
        printk(KERN_ERR "BTRFS: sys_array too short to read %u bytes at offset %u\n",
                        len, cur_offset);
-       free_extent_buffer(sb);
+       free_extent_buffer_stale(sb);
        return -EIO;
 }
 
index 3bfb252..d1a177a 100644 (file)
@@ -380,23 +380,21 @@ static int btrfs_xattr_handler_get(const struct xattr_handler *handler,
 }
 
 static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
-                                  struct dentry *dentry, const char *name,
-                                  const void *buffer, size_t size,
-                                  int flags)
+                                  struct dentry *unused, struct inode *inode,
+                                  const char *name, const void *buffer,
+                                  size_t size, int flags)
 {
-       struct inode *inode = d_inode(dentry);
-
        name = xattr_full_name(handler, name);
        return __btrfs_setxattr(NULL, inode, name, buffer, size, flags);
 }
 
 static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
-                                       struct dentry *dentry,
+                                       struct dentry *unused, struct inode *inode,
                                        const char *name, const void *value,
                                        size_t size, int flags)
 {
        name = xattr_full_name(handler, name);
-       return btrfs_set_prop(d_inode(dentry), name, value, size, flags);
+       return btrfs_set_prop(inode, name, value, size, flags);
 }
 
 static const struct xattr_handler btrfs_security_xattr_handler = {
index 861d611..ce5f345 100644 (file)
@@ -380,7 +380,7 @@ static void cachefiles_sync_cache(struct fscache_cache *_cache)
  * check if the backing cache is updated to FS-Cache
  * - called by FS-Cache when evaluates if need to invalidate the cache
  */
-static bool cachefiles_check_consistency(struct fscache_operation *op)
+static int cachefiles_check_consistency(struct fscache_operation *op)
 {
        struct cachefiles_object *object;
        struct cachefiles_cache *cache;
index 43098cd..26a9d10 100644 (file)
@@ -257,12 +257,12 @@ static int ceph_readpage(struct file *filp, struct page *page)
 /*
  * Finish an async read(ahead) op.
  */
-static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
+static void finish_read(struct ceph_osd_request *req)
 {
        struct inode *inode = req->r_inode;
        struct ceph_osd_data *osd_data;
-       int rc = req->r_result;
-       int bytes = le32_to_cpu(msg->hdr.data_len);
+       int rc = req->r_result <= 0 ? req->r_result : 0;
+       int bytes = req->r_result >= 0 ? req->r_result : 0;
        int num_pages;
        int i;
 
@@ -276,8 +276,10 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
        for (i = 0; i < num_pages; i++) {
                struct page *page = osd_data->pages[i];
 
-               if (rc < 0 && rc != -ENOENT)
+               if (rc < 0 && rc != -ENOENT) {
+                       ceph_fscache_readpage_cancel(inode, page);
                        goto unlock;
+               }
                if (bytes < (int)PAGE_SIZE) {
                        /* zero (remainder of) page */
                        int s = bytes < 0 ? 0 : bytes;
@@ -376,8 +378,6 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
        req->r_callback = finish_read;
        req->r_inode = inode;
 
-       ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
-
        dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
        ret = ceph_osdc_start_request(osdc, req, false);
        if (ret < 0)
@@ -537,8 +537,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
            CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
                set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
 
-       ceph_readpage_to_fscache(inode, page);
-
        set_page_writeback(page);
        err = ceph_osdc_writepages(osdc, ceph_vino(inode),
                                   &ci->i_layout, snapc,
@@ -546,11 +544,21 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
                                   truncate_seq, truncate_size,
                                   &inode->i_mtime, &page, 1);
        if (err < 0) {
-               dout("writepage setting page/mapping error %d %p\n", err, page);
+               struct writeback_control tmp_wbc;
+               if (!wbc)
+                       wbc = &tmp_wbc;
+               if (err == -ERESTARTSYS) {
+                       /* killed by SIGKILL */
+                       dout("writepage interrupted page %p\n", page);
+                       redirty_page_for_writepage(wbc, page);
+                       end_page_writeback(page);
+                       goto out;
+               }
+               dout("writepage setting page/mapping error %d %p\n",
+                    err, page);
                SetPageError(page);
                mapping_set_error(&inode->i_data, err);
-               if (wbc)
-                       wbc->pages_skipped++;
+               wbc->pages_skipped++;
        } else {
                dout("writepage cleaned page %p\n", page);
                err = 0;  /* vfs expects us to return 0 */
@@ -571,12 +579,16 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc)
        BUG_ON(!inode);
        ihold(inode);
        err = writepage_nounlock(page, wbc);
+       if (err == -ERESTARTSYS) {
+               /* direct memory reclaimer was killed by SIGKILL. return 0
+                * to prevent caller from setting mapping/page error */
+               err = 0;
+       }
        unlock_page(page);
        iput(inode);
        return err;
 }
 
-
 /*
  * lame release_pages helper.  release_pages() isn't exported to
  * modules.
@@ -600,8 +612,7 @@ static void ceph_release_pages(struct page **pages, int num)
  * If we get an error, set the mapping error bit, but not the individual
  * page error bits.
  */
-static void writepages_finish(struct ceph_osd_request *req,
-                             struct ceph_msg *msg)
+static void writepages_finish(struct ceph_osd_request *req)
 {
        struct inode *inode = req->r_inode;
        struct ceph_inode_info *ci = ceph_inode(inode);
@@ -615,7 +626,6 @@ static void writepages_finish(struct ceph_osd_request *req,
        struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
        bool remove_page;
 
-
        dout("writepages_finish %p rc %d\n", inode, rc);
        if (rc < 0)
                mapping_set_error(mapping, rc);
@@ -650,6 +660,9 @@ static void writepages_finish(struct ceph_osd_request *req,
                                clear_bdi_congested(&fsc->backing_dev_info,
                                                    BLK_RW_ASYNC);
 
+                       if (rc < 0)
+                               SetPageError(page);
+
                        ceph_put_snap_context(page_snap_context(page));
                        page->private = 0;
                        ClearPagePrivate(page);
@@ -718,8 +731,11 @@ static int ceph_writepages_start(struct address_space *mapping,
             (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
 
        if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
-               pr_warn("writepage_start %p on forced umount\n", inode);
-               truncate_pagecache(inode, 0);
+               if (ci->i_wrbuffer_ref > 0) {
+                       pr_warn_ratelimited(
+                               "writepage_start %p %lld forced umount\n",
+                               inode, ceph_ino(inode));
+               }
                mapping_set_error(mapping, -EIO);
                return -EIO; /* we're in a forced umount, don't write! */
        }
@@ -1063,10 +1079,7 @@ new_request:
                        pages = NULL;
                }
 
-               vino = ceph_vino(inode);
-               ceph_osdc_build_request(req, offset, snapc, vino.snap,
-                                       &inode->i_mtime);
-
+               req->r_mtime = inode->i_mtime;
                rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
                BUG_ON(rc);
                req = NULL;
@@ -1099,8 +1112,7 @@ release_pvec_pages:
                mapping->writeback_index = index;
 
 out:
-       if (req)
-               ceph_osdc_put_request(req);
+       ceph_osdc_put_request(req);
        ceph_put_snap_context(snapc);
        dout("writepages done, rc = %d\n", rc);
        return rc;
@@ -1134,6 +1146,7 @@ static int ceph_update_writeable_page(struct file *file,
                            struct page *page)
 {
        struct inode *inode = file_inode(file);
+       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
        struct ceph_inode_info *ci = ceph_inode(inode);
        loff_t page_off = pos & PAGE_MASK;
        int pos_in_page = pos & ~PAGE_MASK;
@@ -1142,6 +1155,12 @@ static int ceph_update_writeable_page(struct file *file,
        int r;
        struct ceph_snap_context *snapc, *oldest;
 
+       if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+               dout(" page %p forced umount\n", page);
+               unlock_page(page);
+               return -EIO;
+       }
+
 retry_locked:
        /* writepages currently holds page lock, but if we change that later, */
        wait_on_page_writeback(page);
@@ -1165,7 +1184,7 @@ retry_locked:
                        snapc = ceph_get_snap_context(snapc);
                        unlock_page(page);
                        ceph_queue_writeback(inode);
-                       r = wait_event_interruptible(ci->i_cap_wq,
+                       r = wait_event_killable(ci->i_cap_wq,
                               context_is_writeable_or_written(inode, snapc));
                        ceph_put_snap_context(snapc);
                        if (r == -ERESTARTSYS)
@@ -1311,6 +1330,17 @@ const struct address_space_operations ceph_aops = {
        .direct_IO = ceph_direct_io,
 };
 
+static void ceph_block_sigs(sigset_t *oldset)
+{
+       sigset_t mask;
+       siginitsetinv(&mask, sigmask(SIGKILL));
+       sigprocmask(SIG_BLOCK, &mask, oldset);
+}
+
+static void ceph_restore_sigs(sigset_t *oldset)
+{
+       sigprocmask(SIG_SETMASK, oldset, NULL);
+}
 
 /*
  * vm ops
@@ -1323,6 +1353,9 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        struct page *pinned_page = NULL;
        loff_t off = vmf->pgoff << PAGE_SHIFT;
        int want, got, ret;
+       sigset_t oldset;
+
+       ceph_block_sigs(&oldset);
 
        dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n",
             inode, ceph_vinop(inode), off, (size_t)PAGE_SIZE);
@@ -1330,17 +1363,12 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
                want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
        else
                want = CEPH_CAP_FILE_CACHE;
-       while (1) {
-               got = 0;
-               ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want,
-                                   -1, &got, &pinned_page);
-               if (ret == 0)
-                       break;
-               if (ret != -ERESTARTSYS) {
-                       WARN_ON(1);
-                       return VM_FAULT_SIGBUS;
-               }
-       }
+
+       got = 0;
+       ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page);
+       if (ret < 0)
+               goto out_restore;
+
        dout("filemap_fault %p %llu~%zd got cap refs on %s\n",
             inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got));
 
@@ -1357,7 +1385,7 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        ceph_put_cap_refs(ci, got);
 
        if (ret != -EAGAIN)
-               return ret;
+               goto out_restore;
 
        /* read inline data */
        if (off >= PAGE_SIZE) {
@@ -1371,15 +1399,18 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
                                                ~__GFP_FS));
                if (!page) {
                        ret = VM_FAULT_OOM;
-                       goto out;
+                       goto out_inline;
                }
                ret1 = __ceph_do_getattr(inode, page,
                                         CEPH_STAT_CAP_INLINE_DATA, true);
                if (ret1 < 0 || off >= i_size_read(inode)) {
                        unlock_page(page);
                        put_page(page);
-                       ret = VM_FAULT_SIGBUS;
-                       goto out;
+                       if (ret1 < 0)
+                               ret = ret1;
+                       else
+                               ret = VM_FAULT_SIGBUS;
+                       goto out_inline;
                }
                if (ret1 < PAGE_SIZE)
                        zero_user_segment(page, ret1, PAGE_SIZE);
@@ -1388,10 +1419,15 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
                SetPageUptodate(page);
                vmf->page = page;
                ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
+out_inline:
+               dout("filemap_fault %p %llu~%zd read inline data ret %d\n",
+                    inode, off, (size_t)PAGE_SIZE, ret);
        }
-out:
-       dout("filemap_fault %p %llu~%zd read inline data ret %d\n",
-            inode, off, (size_t)PAGE_SIZE, ret);
+out_restore:
+       ceph_restore_sigs(&oldset);
+       if (ret < 0)
+               ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
+
        return ret;
 }
 
@@ -1409,10 +1445,13 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        loff_t size = i_size_read(inode);
        size_t len;
        int want, got, ret;
+       sigset_t oldset;
 
        prealloc_cf = ceph_alloc_cap_flush();
        if (!prealloc_cf)
-               return VM_FAULT_SIGBUS;
+               return VM_FAULT_OOM;
+
+       ceph_block_sigs(&oldset);
 
        if (ci->i_inline_version != CEPH_INLINE_NONE) {
                struct page *locked_page = NULL;
@@ -1423,10 +1462,8 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
                ret = ceph_uninline_data(vma->vm_file, locked_page);
                if (locked_page)
                        unlock_page(locked_page);
-               if (ret < 0) {
-                       ret = VM_FAULT_SIGBUS;
+               if (ret < 0)
                        goto out_free;
-               }
        }
 
        if (off + PAGE_SIZE <= size)
@@ -1440,45 +1477,36 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
                want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
        else
                want = CEPH_CAP_FILE_BUFFER;
-       while (1) {
-               got = 0;
-               ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len,
-                                   &got, NULL);
-               if (ret == 0)
-                       break;
-               if (ret != -ERESTARTSYS) {
-                       WARN_ON(1);
-                       ret = VM_FAULT_SIGBUS;
-                       goto out_free;
-               }
-       }
+
+       got = 0;
+       ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len,
+                           &got, NULL);
+       if (ret < 0)
+               goto out_free;
+
        dout("page_mkwrite %p %llu~%zd got cap refs on %s\n",
             inode, off, len, ceph_cap_string(got));
 
        /* Update time before taking page lock */
        file_update_time(vma->vm_file);
 
-       lock_page(page);
+       do {
+               lock_page(page);
 
-       ret = VM_FAULT_NOPAGE;
-       if ((off > size) ||
-           (page->mapping != inode->i_mapping)) {
-               unlock_page(page);
-               goto out;
-       }
+               if ((off > size) || (page->mapping != inode->i_mapping)) {
+                       unlock_page(page);
+                       ret = VM_FAULT_NOPAGE;
+                       break;
+               }
+
+               ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
+               if (ret >= 0) {
+                       /* success.  we'll keep the page locked. */
+                       set_page_dirty(page);
+                       ret = VM_FAULT_LOCKED;
+               }
+       } while (ret == -EAGAIN);
 
-       ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
-       if (ret >= 0) {
-               /* success.  we'll keep the page locked. */
-               set_page_dirty(page);
-               ret = VM_FAULT_LOCKED;
-       } else {
-               if (ret == -ENOMEM)
-                       ret = VM_FAULT_OOM;
-               else
-                       ret = VM_FAULT_SIGBUS;
-       }
-out:
        if (ret == VM_FAULT_LOCKED ||
            ci->i_inline_version != CEPH_INLINE_NONE) {
                int dirty;
@@ -1495,8 +1523,10 @@ out:
             inode, off, len, ceph_cap_string(got), ret);
        ceph_put_cap_refs(ci, got);
 out_free:
+       ceph_restore_sigs(&oldset);
        ceph_free_cap_flush(prealloc_cf);
-
+       if (ret < 0)
+               ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
        return ret;
 }
 
@@ -1614,7 +1644,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
                goto out;
        }
 
-       ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime);
+       req->r_mtime = inode->i_mtime;
        err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
        if (!err)
                err = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1657,7 +1687,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
                        goto out_put;
        }
 
-       ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime);
+       req->r_mtime = inode->i_mtime;
        err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
        if (!err)
                err = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1758,9 +1788,11 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
        rd_req->r_flags = CEPH_OSD_FLAG_READ;
        osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0);
        rd_req->r_base_oloc.pool = pool;
-       snprintf(rd_req->r_base_oid.name, sizeof(rd_req->r_base_oid.name),
-                "%llx.00000000", ci->i_vino.ino);
-       rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name);
+       ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino);
+
+       err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS);
+       if (err)
+               goto out_unlock;
 
        wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
                                         1, false, GFP_NOFS);
@@ -1769,11 +1801,14 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
                goto out_unlock;
        }
 
-       wr_req->r_flags = CEPH_OSD_FLAG_WRITE |
-                         CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
+       wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK;
        osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
-       wr_req->r_base_oloc.pool = pool;
-       wr_req->r_base_oid = rd_req->r_base_oid;
+       ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc);
+       ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid);
+
+       err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS);
+       if (err)
+               goto out_unlock;
 
        /* one page should be large enough for STAT data */
        pages = ceph_alloc_page_vector(1, GFP_KERNEL);
@@ -1784,12 +1819,9 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 
        osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE,
                                     0, false, true);
-       ceph_osdc_build_request(rd_req, 0, NULL, CEPH_NOSNAP,
-                               &ci->vfs_inode.i_mtime);
        err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
 
-       ceph_osdc_build_request(wr_req, 0, NULL, CEPH_NOSNAP,
-                               &ci->vfs_inode.i_mtime);
+       wr_req->r_mtime = ci->vfs_inode.i_mtime;
        err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
 
        if (!err)
@@ -1823,10 +1855,8 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 out_unlock:
        up_write(&mdsc->pool_perm_rwsem);
 
-       if (rd_req)
-               ceph_osdc_put_request(rd_req);
-       if (wr_req)
-               ceph_osdc_put_request(wr_req);
+       ceph_osdc_put_request(rd_req);
+       ceph_osdc_put_request(wr_req);
 out:
        if (!err)
                err = have;
index a351480..238c55b 100644 (file)
@@ -25,6 +25,7 @@
 #include "cache.h"
 
 struct ceph_aux_inode {
+       u64             version;
        struct timespec mtime;
        loff_t          size;
 };
@@ -69,15 +70,8 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
        fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index,
                                              &ceph_fscache_fsid_object_def,
                                              fsc, true);
-
-       if (fsc->fscache == NULL) {
+       if (!fsc->fscache)
                pr_err("Unable to resgister fsid: %p fscache cookie", fsc);
-               return 0;
-       }
-
-       fsc->revalidate_wq = alloc_workqueue("ceph-revalidate", 0, 1);
-       if (fsc->revalidate_wq == NULL)
-               return -ENOMEM;
 
        return 0;
 }
@@ -105,6 +99,7 @@ static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data,
        const struct inode* inode = &ci->vfs_inode;
 
        memset(&aux, 0, sizeof(aux));
+       aux.version = ci->i_version;
        aux.mtime = inode->i_mtime;
        aux.size = i_size_read(inode);
 
@@ -131,6 +126,7 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux(
                return FSCACHE_CHECKAUX_OBSOLETE;
 
        memset(&aux, 0, sizeof(aux));
+       aux.version = ci->i_version;
        aux.mtime = inode->i_mtime;
        aux.size = i_size_read(inode);
 
@@ -181,32 +177,26 @@ static const struct fscache_cookie_def ceph_fscache_inode_object_def = {
        .now_uncached   = ceph_fscache_inode_now_uncached,
 };
 
-void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
-                                       struct ceph_inode_info* ci)
+void ceph_fscache_register_inode_cookie(struct inode *inode)
 {
-       struct inode* inode = &ci->vfs_inode;
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 
        /* No caching for filesystem */
        if (fsc->fscache == NULL)
                return;
 
        /* Only cache for regular files that are read only */
-       if ((ci->vfs_inode.i_mode & S_IFREG) == 0)
+       if (!S_ISREG(inode->i_mode))
                return;
 
-       /* Avoid multiple racing open requests */
-       inode_lock(inode);
-
-       if (ci->fscache)
-               goto done;
-
-       ci->fscache = fscache_acquire_cookie(fsc->fscache,
-                                            &ceph_fscache_inode_object_def,
-                                            ci, true);
-       fscache_check_consistency(ci->fscache);
-done:
+       inode_lock_nested(inode, I_MUTEX_CHILD);
+       if (!ci->fscache) {
+               ci->fscache = fscache_acquire_cookie(fsc->fscache,
+                                       &ceph_fscache_inode_object_def,
+                                       ci, false);
+       }
        inode_unlock(inode);
-
 }
 
 void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
@@ -222,6 +212,34 @@ void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
        fscache_relinquish_cookie(cookie, 0);
 }
 
+static bool ceph_fscache_can_enable(void *data)
+{
+       struct inode *inode = data;
+       return !inode_is_open_for_write(inode);
+}
+
+void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp)
+{
+       struct ceph_inode_info *ci = ceph_inode(inode);
+
+       if (!fscache_cookie_valid(ci->fscache))
+               return;
+
+       if (inode_is_open_for_write(inode)) {
+               dout("fscache_file_set_cookie %p %p disabling cache\n",
+                    inode, filp);
+               fscache_disable_cookie(ci->fscache, false);
+               fscache_uncache_all_inode_pages(ci->fscache, inode);
+       } else {
+               fscache_enable_cookie(ci->fscache, ceph_fscache_can_enable,
+                               inode);
+               if (fscache_cookie_enabled(ci->fscache)) {
+                       dout("fscache_file_set_cookie %p %p enabling cache\n",
+                            inode, filp);
+               }
+       }
+}
+
 static void ceph_vfs_readpage_complete(struct page *page, void *data, int error)
 {
        if (!error)
@@ -236,10 +254,9 @@ static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int
        unlock_page(page);
 }
 
-static inline int cache_valid(struct ceph_inode_info *ci)
+static inline bool cache_valid(struct ceph_inode_info *ci)
 {
-       return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) &&
-               (ci->i_fscache_gen == ci->i_rdcache_gen));
+       return ci->i_fscache_gen == ci->i_rdcache_gen;
 }
 
 
@@ -332,69 +349,27 @@ void ceph_invalidate_fscache_page(struct inode* inode, struct page *page)
 
 void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
 {
-       if (fsc->revalidate_wq)
-               destroy_workqueue(fsc->revalidate_wq);
-
        fscache_relinquish_cookie(fsc->fscache, 0);
        fsc->fscache = NULL;
 }
 
-static void ceph_revalidate_work(struct work_struct *work)
-{
-       int issued;
-       u32 orig_gen;
-       struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
-                                                 i_revalidate_work);
-       struct inode *inode = &ci->vfs_inode;
-
-       spin_lock(&ci->i_ceph_lock);
-       issued = __ceph_caps_issued(ci, NULL);
-       orig_gen = ci->i_rdcache_gen;
-       spin_unlock(&ci->i_ceph_lock);
-
-       if (!(issued & CEPH_CAP_FILE_CACHE)) {
-               dout("revalidate_work lost cache before validation %p\n",
-                    inode);
-               goto out;
-       }
-
-       if (!fscache_check_consistency(ci->fscache))
-               fscache_invalidate(ci->fscache);
-
-       spin_lock(&ci->i_ceph_lock);
-       /* Update the new valid generation (backwards sanity check too) */
-       if (orig_gen > ci->i_fscache_gen) {
-               ci->i_fscache_gen = orig_gen;
-       }
-       spin_unlock(&ci->i_ceph_lock);
-
-out:
-       iput(&ci->vfs_inode);
-}
-
-void ceph_queue_revalidate(struct inode *inode)
+/*
+ * caller should hold CEPH_CAP_FILE_{RD,CACHE}
+ */
+void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci)
 {
-       struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
-       struct ceph_inode_info *ci = ceph_inode(inode);
-
-       if (fsc->revalidate_wq == NULL || ci->fscache == NULL)
+       if (cache_valid(ci))
                return;
 
-       ihold(inode);
-
-       if (queue_work(ceph_sb_to_client(inode->i_sb)->revalidate_wq,
-                      &ci->i_revalidate_work)) {
-               dout("ceph_queue_revalidate %p\n", inode);
-       } else {
-               dout("ceph_queue_revalidate %p failed\n)", inode);
-               iput(inode);
+       /* reuse i_truncate_mutex. There should be no pending
+        * truncate while the caller holds CEPH_CAP_FILE_RD */
+       mutex_lock(&ci->i_truncate_mutex);
+       if (!cache_valid(ci)) {
+               if (fscache_check_consistency(ci->fscache))
+                       fscache_invalidate(ci->fscache);
+               spin_lock(&ci->i_ceph_lock);
+               ci->i_fscache_gen = ci->i_rdcache_gen;
+               spin_unlock(&ci->i_ceph_lock);
        }
-}
-
-void ceph_fscache_inode_init(struct ceph_inode_info *ci)
-{
-       ci->fscache = NULL;
-       /* The first load is verifed cookie open time */
-       ci->i_fscache_gen = 1;
-       INIT_WORK(&ci->i_revalidate_work, ceph_revalidate_work);
+       mutex_unlock(&ci->i_truncate_mutex);
 }
index 5ac591b..7e72c75 100644 (file)
@@ -34,10 +34,10 @@ void ceph_fscache_unregister(void);
 int ceph_fscache_register_fs(struct ceph_fs_client* fsc);
 void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc);
 
-void ceph_fscache_inode_init(struct ceph_inode_info *ci);
-void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
-                                       struct ceph_inode_info* ci);
+void ceph_fscache_register_inode_cookie(struct inode *inode);
 void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci);
+void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp);
+void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci);
 
 int ceph_readpage_from_fscache(struct inode *inode, struct page *page);
 int ceph_readpages_from_fscache(struct inode *inode,
@@ -46,12 +46,11 @@ int ceph_readpages_from_fscache(struct inode *inode,
                                unsigned *nr_pages);
 void ceph_readpage_to_fscache(struct inode *inode, struct page *page);
 void ceph_invalidate_fscache_page(struct inode* inode, struct page *page);
-void ceph_queue_revalidate(struct inode *inode);
 
-static inline void ceph_fscache_update_objectsize(struct inode *inode)
+static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci)
 {
-       struct ceph_inode_info *ci = ceph_inode(inode);
-       fscache_attr_changed(ci->fscache);
+       ci->fscache = NULL;
+       ci->i_fscache_gen = 0;
 }
 
 static inline void ceph_fscache_invalidate(struct inode *inode)
@@ -88,6 +87,11 @@ static inline void ceph_fscache_readpages_cancel(struct inode *inode,
        return fscache_readpages_cancel(ci->fscache, pages);
 }
 
+static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci)
+{
+       ci->i_fscache_gen = ci->i_rdcache_gen - 1;
+}
+
 #else
 
 static inline int ceph_fscache_register(void)
@@ -112,8 +116,20 @@ static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci)
 {
 }
 
-static inline void ceph_fscache_register_inode_cookie(struct ceph_fs_client* parent_fsc,
-                                                     struct ceph_inode_info* ci)
+static inline void ceph_fscache_register_inode_cookie(struct inode *inode)
+{
+}
+
+static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
+{
+}
+
+static inline void ceph_fscache_file_set_cookie(struct inode *inode,
+                                               struct file *filp)
+{
+}
+
+static inline void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci)
 {
 }
 
@@ -141,10 +157,6 @@ static inline void ceph_readpage_to_fscache(struct inode *inode,
 {
 }
 
-static inline void ceph_fscache_update_objectsize(struct inode *inode)
-{
-}
-
 static inline void ceph_fscache_invalidate(struct inode *inode)
 {
 }
@@ -154,10 +166,6 @@ static inline void ceph_invalidate_fscache_page(struct inode *inode,
 {
 }
 
-static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
-{
-}
-
 static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
 {
        return 1;
@@ -173,7 +181,7 @@ static inline void ceph_fscache_readpages_cancel(struct inode *inode,
 {
 }
 
-static inline void ceph_queue_revalidate(struct inode *inode)
+static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci)
 {
 }
 
index cfaeef1..6f60d0a 100644 (file)
@@ -1656,7 +1656,7 @@ retry_locked:
         */
        if ((!is_delayed || mdsc->stopping) &&
            !S_ISDIR(inode->i_mode) &&          /* ignore readdir cache */
-           ci->i_wrbuffer_ref == 0 &&          /* no dirty pages... */
+           !(ci->i_wb_ref || ci->i_wrbuffer_ref) &&   /* no dirty pages... */
            inode->i_data.nrpages &&            /* have cached pages */
            (revoking & (CEPH_CAP_FILE_CACHE|
                         CEPH_CAP_FILE_LAZYIO)) && /*  or revoking cache */
@@ -1698,8 +1698,8 @@ retry_locked:
 
                revoking = cap->implemented & ~cap->issued;
                dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
-                    cap->mds, cap, ceph_cap_string(cap->issued),
-                    ceph_cap_string(cap_used),
+                    cap->mds, cap, ceph_cap_string(cap_used),
+                    ceph_cap_string(cap->issued),
                     ceph_cap_string(cap->implemented),
                     ceph_cap_string(revoking));
 
@@ -2317,7 +2317,7 @@ again:
 
        /* make sure file is actually open */
        file_wanted = __ceph_caps_file_wanted(ci);
-       if ((file_wanted & need) == 0) {
+       if ((file_wanted & need) != need) {
                dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
                     ceph_cap_string(need), ceph_cap_string(file_wanted));
                *err = -EBADF;
@@ -2393,6 +2393,9 @@ again:
                                snap_rwsem_locked = true;
                        }
                        *got = need | (have & want);
+                       if ((need & CEPH_CAP_FILE_RD) &&
+                           !(*got & CEPH_CAP_FILE_CACHE))
+                               ceph_disable_fscache_readpage(ci);
                        __take_cap_refs(ci, *got, true);
                        ret = 1;
                }
@@ -2412,12 +2415,26 @@ again:
                        goto out_unlock;
                }
 
-               if (!__ceph_is_any_caps(ci) &&
-                   ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
-                       dout("get_cap_refs %p forced umount\n", inode);
-                       *err = -EIO;
-                       ret = 1;
-                       goto out_unlock;
+               if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) {
+                       int mds_wanted;
+                       if (ACCESS_ONCE(mdsc->fsc->mount_state) ==
+                           CEPH_MOUNT_SHUTDOWN) {
+                               dout("get_cap_refs %p forced umount\n", inode);
+                               *err = -EIO;
+                               ret = 1;
+                               goto out_unlock;
+                       }
+                       mds_wanted = __ceph_caps_mds_wanted(ci);
+                       if ((mds_wanted & need) != need) {
+                               dout("get_cap_refs %p caps were dropped"
+                                    " (session killed?)\n", inode);
+                               *err = -ESTALE;
+                               ret = 1;
+                               goto out_unlock;
+                       }
+                       if ((mds_wanted & file_wanted) ==
+                           (file_wanted & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
+                               ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED;
                }
 
                dout("get_cap_refs %p have %s needed %s\n", inode,
@@ -2487,7 +2504,7 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
                        if (err == -EAGAIN)
                                continue;
                        if (err < 0)
-                               return err;
+                               ret = err;
                } else {
                        ret = wait_event_interruptible(ci->i_cap_wq,
                                        try_get_cap_refs(ci, need, want, endoff,
@@ -2496,8 +2513,15 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
                                continue;
                        if (err < 0)
                                ret = err;
-                       if (ret < 0)
-                               return ret;
+               }
+               if (ret < 0) {
+                       if (err == -ESTALE) {
+                               /* session was killed, try renew caps */
+                               ret = ceph_renew_caps(&ci->vfs_inode);
+                               if (ret == 0)
+                                       continue;
+                       }
+                       return ret;
                }
 
                if (ci->i_inline_version != CEPH_INLINE_NONE &&
@@ -2533,6 +2557,9 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
                break;
        }
 
+       if ((_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE))
+               ceph_fscache_revalidate_cookie(ci);
+
        *got = _got;
        return 0;
 }
@@ -2774,7 +2801,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
        bool writeback = false;
        bool queue_trunc = false;
        bool queue_invalidate = false;
-       bool queue_revalidate = false;
        bool deleted_inode = false;
        bool fill_inline = false;
 
@@ -2807,7 +2833,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
        if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */
            ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
            (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
-           !ci->i_wrbuffer_ref) {
+           !(ci->i_wrbuffer_ref || ci->i_wb_ref)) {
                if (try_nonblocking_invalidate(inode)) {
                        /* there were locked pages.. invalidate later
                           in a separate thread. */
@@ -2816,8 +2842,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
                                ci->i_rdcache_revoking = ci->i_rdcache_gen;
                        }
                }
-
-               ceph_fscache_invalidate(inode);
        }
 
        /* side effects now are allowed */
@@ -2859,11 +2883,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
                }
        }
 
-       /* Do we need to revalidate our fscache cookie. Don't bother on the
-        * first cache cap as we already validate at cookie creation time. */
-       if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1)
-               queue_revalidate = true;
-
        if (newcaps & CEPH_CAP_ANY_RD) {
                /* ctime/mtime/atime? */
                ceph_decode_timespec(&mtime, &grant->mtime);
@@ -2972,11 +2991,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
        if (fill_inline)
                ceph_fill_inline_data(inode, NULL, inline_data, inline_len);
 
-       if (queue_trunc) {
+       if (queue_trunc)
                ceph_queue_vmtruncate(inode);
-               ceph_queue_revalidate(inode);
-       } else if (queue_revalidate)
-               ceph_queue_revalidate(inode);
 
        if (writeback)
                /*
@@ -3178,10 +3194,8 @@ static void handle_cap_trunc(struct inode *inode,
                                          truncate_seq, truncate_size, size);
        spin_unlock(&ci->i_ceph_lock);
 
-       if (queue_trunc) {
+       if (queue_trunc)
                ceph_queue_vmtruncate(inode);
-               ceph_fscache_invalidate(inode);
-       }
 }
 
 /*
@@ -3226,6 +3240,8 @@ retry:
 
        if (target < 0) {
                __ceph_remove_cap(cap, false);
+               if (!ci->i_auth_cap)
+                       ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
                goto out_unlock;
        }
 
index 31f8314..39ff678 100644 (file)
@@ -109,7 +109,7 @@ static int mdsc_show(struct seq_file *s, void *p)
                                   path ? path : "");
                        spin_unlock(&req->r_old_dentry->d_lock);
                        kfree(path);
-               } else if (req->r_path2) {
+               } else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) {
                        if (req->r_ino2.ino)
                                seq_printf(s, " #%llx/%s", req->r_ino2.ino,
                                           req->r_path2);
index 3ab1192..6e0fedf 100644 (file)
@@ -70,16 +70,42 @@ out_unlock:
 }
 
 /*
- * for readdir, we encode the directory frag and offset within that
- * frag into f_pos.
+ * f_pos encoding for readdir:
+ * - hash order:
+ *     (0xff << 52) | ((24 bits hash) << 28) |
+ *     (the nth entry among entries sharing that hash);
+ * - frag+name order:
+ *     ((frag value) << 28) | (the nth entry in the frag);
  */
+#define OFFSET_BITS    28
+#define OFFSET_MASK    ((1 << OFFSET_BITS) - 1)
+#define HASH_ORDER     (0xffull << (OFFSET_BITS + 24))
+loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order)
+{
+       loff_t fpos = ((loff_t)high << 28) | (loff_t)off;
+       if (hash_order)
+               fpos |= HASH_ORDER;
+       return fpos;
+}
+
+static bool is_hash_order(loff_t p)
+{
+       return (p & HASH_ORDER) == HASH_ORDER;
+}
+
 static unsigned fpos_frag(loff_t p)
 {
-       return p >> 32;
+       return p >> OFFSET_BITS;
 }
+
+static unsigned fpos_hash(loff_t p)
+{
+       return ceph_frag_value(fpos_frag(p));
+}
+
 static unsigned fpos_off(loff_t p)
 {
-       return p & 0xffffffff;
+       return p & OFFSET_MASK;
 }
 
 static int fpos_cmp(loff_t l, loff_t r)
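
A note on the new offset encoding used by the helpers above: they replace the
old "frag << 32 | off" packing. The low 28 bits of f_pos now hold a per-entry
offset, the bits above them hold either the frag value or a 24-bit name hash,
and the 0xff byte at bit 52 flags hash order. A minimal standalone sketch of
the same bit layout (the names and example values below are local to this
illustration, not kernel API):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define EX_OFFSET_BITS 28
    #define EX_OFFSET_MASK ((1ULL << EX_OFFSET_BITS) - 1)
    #define EX_HASH_ORDER  (0xffULL << (EX_OFFSET_BITS + 24))

    /* pack a frag value (or 24-bit name hash) and a per-entry offset,
     * mirroring ceph_make_fpos() in the hunk above */
    static uint64_t ex_make_fpos(unsigned high, unsigned off, bool hash_order)
    {
            uint64_t fpos = ((uint64_t)high << EX_OFFSET_BITS) | (uint64_t)off;
            if (hash_order)
                    fpos |= EX_HASH_ORDER;
            return fpos;
    }

    int main(void)
    {
            uint64_t pos = ex_make_fpos(0x123456, 7, true);

            printf("hash order : %d\n", (pos & EX_HASH_ORDER) == EX_HASH_ORDER);
            printf("hash value : 0x%x\n",
                   (unsigned)((pos >> EX_OFFSET_BITS) & 0xffffff));
            printf("entry index: %u\n", (unsigned)(pos & EX_OFFSET_MASK));
            return 0;
    }
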
@@ -111,6 +137,50 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name,
        return 0;
 }
 
+
+static struct dentry *
+__dcache_find_get_entry(struct dentry *parent, u64 idx,
+                       struct ceph_readdir_cache_control *cache_ctl)
+{
+       struct inode *dir = d_inode(parent);
+       struct dentry *dentry;
+       unsigned idx_mask = (PAGE_SIZE / sizeof(struct dentry *)) - 1;
+       loff_t ptr_pos = idx * sizeof(struct dentry *);
+       pgoff_t ptr_pgoff = ptr_pos >> PAGE_SHIFT;
+
+       if (ptr_pos >= i_size_read(dir))
+               return NULL;
+
+       if (!cache_ctl->page || ptr_pgoff != page_index(cache_ctl->page)) {
+               ceph_readdir_cache_release(cache_ctl);
+               cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff);
+               if (!cache_ctl->page) {
+                       dout(" page %lu not found\n", ptr_pgoff);
+                       return ERR_PTR(-EAGAIN);
+               }
+               /* reading/filling the cache are serialized by
+                  i_mutex, no need to use page lock */
+               unlock_page(cache_ctl->page);
+               cache_ctl->dentries = kmap(cache_ctl->page);
+       }
+
+       cache_ctl->index = idx & idx_mask;
+
+       rcu_read_lock();
+       spin_lock(&parent->d_lock);
+       /* check i_size again here, because empty directory can be
+        * marked as complete while not holding the i_mutex. */
+       if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir))
+               dentry = cache_ctl->dentries[cache_ctl->index];
+       else
+               dentry = NULL;
+       spin_unlock(&parent->d_lock);
+       if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
+               dentry = NULL;
+       rcu_read_unlock();
+       return dentry ? : ERR_PTR(-EAGAIN);
+}
+
 /*
  * When possible, we try to satisfy a readdir by peeking at the
  * dcache.  We make this work by carefully ordering dentries on
@@ -130,75 +200,68 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
        struct inode *dir = d_inode(parent);
        struct dentry *dentry, *last = NULL;
        struct ceph_dentry_info *di;
-       unsigned nsize = PAGE_SIZE / sizeof(struct dentry *);
-       int err = 0;
-       loff_t ptr_pos = 0;
        struct ceph_readdir_cache_control cache_ctl = {};
+       u64 idx = 0;
+       int err = 0;
 
-       dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos);
+       dout("__dcache_readdir %p v%u at %llx\n", dir, shared_gen, ctx->pos);
+
+       /* search start position */
+       if (ctx->pos > 2) {
+               u64 count = div_u64(i_size_read(dir), sizeof(struct dentry *));
+               while (count > 0) {
+                       u64 step = count >> 1;
+                       dentry = __dcache_find_get_entry(parent, idx + step,
+                                                        &cache_ctl);
+                       if (!dentry) {
+                               /* use linear search */
+                               idx = 0;
+                               break;
+                       }
+                       if (IS_ERR(dentry)) {
+                               err = PTR_ERR(dentry);
+                               goto out;
+                       }
+                       di = ceph_dentry(dentry);
+                       spin_lock(&dentry->d_lock);
+                       if (fpos_cmp(di->offset, ctx->pos) < 0) {
+                               idx += step + 1;
+                               count -= step + 1;
+                       } else {
+                               count = step;
+                       }
+                       spin_unlock(&dentry->d_lock);
+                       dput(dentry);
+               }
 
-       /* we can calculate cache index for the first dirfrag */
-       if (ceph_frag_is_leftmost(fpos_frag(ctx->pos))) {
-               cache_ctl.index = fpos_off(ctx->pos) - 2;
-               BUG_ON(cache_ctl.index < 0);
-               ptr_pos = cache_ctl.index * sizeof(struct dentry *);
+               dout("__dcache_readdir %p cache idx %llu\n", dir, idx);
        }
 
-       while (true) {
-               pgoff_t pgoff;
-               bool emit_dentry;
 
-               if (ptr_pos >= i_size_read(dir)) {
+       for (;;) {
+               bool emit_dentry = false;
+               dentry = __dcache_find_get_entry(parent, idx++, &cache_ctl);
+               if (!dentry) {
                        fi->flags |= CEPH_F_ATEND;
                        err = 0;
                        break;
                }
-
-               err = -EAGAIN;
-               pgoff = ptr_pos >> PAGE_SHIFT;
-               if (!cache_ctl.page || pgoff != page_index(cache_ctl.page)) {
-                       ceph_readdir_cache_release(&cache_ctl);
-                       cache_ctl.page = find_lock_page(&dir->i_data, pgoff);
-                       if (!cache_ctl.page) {
-                               dout(" page %lu not found\n", pgoff);
-                               break;
-                       }
-                       /* reading/filling the cache are serialized by
-                        * i_mutex, no need to use page lock */
-                       unlock_page(cache_ctl.page);
-                       cache_ctl.dentries = kmap(cache_ctl.page);
+               if (IS_ERR(dentry)) {
+                       err = PTR_ERR(dentry);
+                       goto out;
                }
 
-               rcu_read_lock();
-               spin_lock(&parent->d_lock);
-               /* check i_size again here, because empty directory can be
-                * marked as complete while not holding the i_mutex. */
-               if (ceph_dir_is_complete_ordered(dir) &&
-                   ptr_pos < i_size_read(dir))
-                       dentry = cache_ctl.dentries[cache_ctl.index % nsize];
-               else
-                       dentry = NULL;
-               spin_unlock(&parent->d_lock);
-               if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
-                       dentry = NULL;
-               rcu_read_unlock();
-               if (!dentry)
-                       break;
-
-               emit_dentry = false;
                di = ceph_dentry(dentry);
                spin_lock(&dentry->d_lock);
                if (di->lease_shared_gen == shared_gen &&
                    d_really_is_positive(dentry) &&
-                   ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR &&
-                   ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH &&
                    fpos_cmp(ctx->pos, di->offset) <= 0) {
                        emit_dentry = true;
                }
                spin_unlock(&dentry->d_lock);
 
                if (emit_dentry) {
-                       dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos,
+                       dout(" %llx dentry %p %pd %p\n", di->offset,
                             dentry, dentry, d_inode(dentry));
                        ctx->pos = di->offset;
                        if (!dir_emit(ctx, dentry->d_name.name,
@@ -218,10 +281,8 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
                } else {
                        dput(dentry);
                }
-
-               cache_ctl.index++;
-               ptr_pos += sizeof(struct dentry *);
        }
+out:
        ceph_readdir_cache_release(&cache_ctl);
        if (last) {
                int ret;
@@ -235,6 +296,16 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
        return err;
 }
 
+static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos)
+{
+       if (!fi->last_readdir)
+               return true;
+       if (is_hash_order(pos))
+               return !ceph_frag_contains_value(fi->frag, fpos_hash(pos));
+       else
+               return fi->frag != fpos_frag(pos);
+}
+
 static int ceph_readdir(struct file *file, struct dir_context *ctx)
 {
        struct ceph_file_info *fi = file->private_data;
@@ -242,13 +313,12 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
        struct ceph_mds_client *mdsc = fsc->mdsc;
-       unsigned frag = fpos_frag(ctx->pos);
-       int off = fpos_off(ctx->pos);
+       int i;
        int err;
        u32 ftype;
        struct ceph_mds_reply_info_parsed *rinfo;
 
-       dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off);
+       dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos);
        if (fi->flags & CEPH_F_ATEND)
                return 0;
 
@@ -260,7 +330,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
                            inode->i_mode >> 12))
                        return 0;
                ctx->pos = 1;
-               off = 1;
        }
        if (ctx->pos == 1) {
                ino_t ino = parent_ino(file->f_path.dentry);
@@ -270,7 +339,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
                            inode->i_mode >> 12))
                        return 0;
                ctx->pos = 2;
-               off = 2;
        }
 
        /* can we use the dcache? */
@@ -285,8 +353,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
                err = __dcache_readdir(file, ctx, shared_gen);
                if (err != -EAGAIN)
                        return err;
-               frag = fpos_frag(ctx->pos);
-               off = fpos_off(ctx->pos);
        } else {
                spin_unlock(&ci->i_ceph_lock);
        }
@@ -294,8 +360,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
        /* proceed with a normal readdir */
 more:
        /* do we have the correct frag content buffered? */
-       if (fi->frag != frag || fi->last_readdir == NULL) {
+       if (need_send_readdir(fi, ctx->pos)) {
                struct ceph_mds_request *req;
+               unsigned frag;
                int op = ceph_snap(inode) == CEPH_SNAPDIR ?
                        CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
 
@@ -305,6 +372,13 @@ more:
                        fi->last_readdir = NULL;
                }
 
+               if (is_hash_order(ctx->pos)) {
+                       frag = ceph_choose_frag(ci, fpos_hash(ctx->pos),
+                                               NULL, NULL);
+               } else {
+                       frag = fpos_frag(ctx->pos);
+               }
+
                dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
                     ceph_vinop(inode), frag, fi->last_name);
                req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
@@ -331,6 +405,8 @@ more:
                req->r_readdir_cache_idx = fi->readdir_cache_idx;
                req->r_readdir_offset = fi->next_offset;
                req->r_args.readdir.frag = cpu_to_le32(frag);
+               req->r_args.readdir.flags =
+                               cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS);
 
                req->r_inode = inode;
                ihold(inode);
@@ -340,22 +416,26 @@ more:
                        ceph_mdsc_put_request(req);
                        return err;
                }
-               dout("readdir got and parsed readdir result=%d"
-                    " on frag %x, end=%d, complete=%d\n", err, frag,
+               dout("readdir got and parsed readdir result=%d on "
+                    "frag %x, end=%d, complete=%d, hash_order=%d\n",
+                    err, frag,
                     (int)req->r_reply_info.dir_end,
-                    (int)req->r_reply_info.dir_complete);
-
+                    (int)req->r_reply_info.dir_complete,
+                    (int)req->r_reply_info.hash_order);
 
-               /* note next offset and last dentry name */
                rinfo = &req->r_reply_info;
                if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
                        frag = le32_to_cpu(rinfo->dir_dir->frag);
-                       off = req->r_readdir_offset;
-                       fi->next_offset = off;
+                       if (!rinfo->hash_order) {
+                               fi->next_offset = req->r_readdir_offset;
+                               /* adjust ctx->pos to beginning of frag */
+                               ctx->pos = ceph_make_fpos(frag,
+                                                         fi->next_offset,
+                                                         false);
+                       }
                }
 
                fi->frag = frag;
-               fi->offset = fi->next_offset;
                fi->last_readdir = req;
 
                if (req->r_did_prepopulate) {
@@ -363,7 +443,8 @@ more:
                        if (fi->readdir_cache_idx < 0) {
                                /* preclude from marking dir ordered */
                                fi->dir_ordered_count = 0;
-                       } else if (ceph_frag_is_leftmost(frag) && off == 2) {
+                       } else if (ceph_frag_is_leftmost(frag) &&
+                                  fi->next_offset == 2) {
                                /* note dir version at start of readdir so
                                 * we can tell if any dentries get dropped */
                                fi->dir_release_count = req->r_dir_release_cnt;
@@ -377,65 +458,87 @@ more:
                        fi->dir_release_count = 0;
                }
 
-               if (req->r_reply_info.dir_end) {
-                       kfree(fi->last_name);
-                       fi->last_name = NULL;
-                       if (ceph_frag_is_rightmost(frag))
-                               fi->next_offset = 2;
-                       else
-                               fi->next_offset = 0;
-               } else {
-                       err = note_last_dentry(fi,
-                                      rinfo->dir_dname[rinfo->dir_nr-1],
-                                      rinfo->dir_dname_len[rinfo->dir_nr-1],
-                                      fi->next_offset + rinfo->dir_nr);
+               /* note next offset and last dentry name */
+               if (rinfo->dir_nr > 0) {
+                       struct ceph_mds_reply_dir_entry *rde =
+                                       rinfo->dir_entries + (rinfo->dir_nr-1);
+                       unsigned next_offset = req->r_reply_info.dir_end ?
+                                       2 : (fpos_off(rde->offset) + 1);
+                       err = note_last_dentry(fi, rde->name, rde->name_len,
+                                              next_offset);
                        if (err)
                                return err;
+               } else if (req->r_reply_info.dir_end) {
+                       fi->next_offset = 2;
+                       /* keep last name */
                }
        }
 
        rinfo = &fi->last_readdir->r_reply_info;
-       dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
-            rinfo->dir_nr, off, fi->offset);
-
-       ctx->pos = ceph_make_fpos(frag, off);
-       while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) {
-               struct ceph_mds_reply_inode *in =
-                       rinfo->dir_in[off - fi->offset].in;
+       dout("readdir frag %x num %d pos %llx chunk first %llx\n",
+            fi->frag, rinfo->dir_nr, ctx->pos,
+            rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL);
+
+       i = 0;
+       /* search start position */
+       if (rinfo->dir_nr > 0) {
+               int step, nr = rinfo->dir_nr;
+               while (nr > 0) {
+                       step = nr >> 1;
+                       if (rinfo->dir_entries[i + step].offset < ctx->pos) {
+                       i += step + 1;
+                               nr -= step + 1;
+                       } else {
+                               nr = step;
+                       }
+               }
+       }
+       for (; i < rinfo->dir_nr; i++) {
+               struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
                struct ceph_vino vino;
                ino_t ino;
 
-               dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
-                    off, off - fi->offset, rinfo->dir_nr, ctx->pos,
-                    rinfo->dir_dname_len[off - fi->offset],
-                    rinfo->dir_dname[off - fi->offset], in);
-               BUG_ON(!in);
-               ftype = le32_to_cpu(in->mode) >> 12;
-               vino.ino = le64_to_cpu(in->ino);
-               vino.snap = le64_to_cpu(in->snapid);
+               BUG_ON(rde->offset < ctx->pos);
+
+               ctx->pos = rde->offset;
+               dout("readdir (%d/%d) -> %llx '%.*s' %p\n",
+                    i, rinfo->dir_nr, ctx->pos,
+                    rde->name_len, rde->name, &rde->inode.in);
+
+               BUG_ON(!rde->inode.in);
+               ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
+               vino.ino = le64_to_cpu(rde->inode.in->ino);
+               vino.snap = le64_to_cpu(rde->inode.in->snapid);
                ino = ceph_vino_to_ino(vino);
-               if (!dir_emit(ctx,
-                           rinfo->dir_dname[off - fi->offset],
-                           rinfo->dir_dname_len[off - fi->offset],
-                           ceph_translate_ino(inode->i_sb, ino), ftype)) {
+
+               if (!dir_emit(ctx, rde->name, rde->name_len,
+                             ceph_translate_ino(inode->i_sb, ino), ftype)) {
                        dout("filldir stopping us...\n");
                        return 0;
                }
-               off++;
                ctx->pos++;
        }
 
-       if (fi->last_name) {
+       if (fi->next_offset > 2) {
                ceph_mdsc_put_request(fi->last_readdir);
                fi->last_readdir = NULL;
                goto more;
        }
 
        /* more frags? */
-       if (!ceph_frag_is_rightmost(frag)) {
-               frag = ceph_frag_next(frag);
-               off = 0;
-               ctx->pos = ceph_make_fpos(frag, off);
+       if (!ceph_frag_is_rightmost(fi->frag)) {
+               unsigned frag = ceph_frag_next(fi->frag);
+               if (is_hash_order(ctx->pos)) {
+                       loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag),
+                                                       fi->next_offset, true);
+                       if (new_pos > ctx->pos)
+                               ctx->pos = new_pos;
+                       /* keep last_name */
+               } else {
+                       ctx->pos = ceph_make_fpos(frag, fi->next_offset, false);
+                       kfree(fi->last_name);
+                       fi->last_name = NULL;
+               }
                dout("readdir next frag is %x\n", frag);
                goto more;
        }
@@ -467,7 +570,7 @@ more:
        return 0;
 }
 
-static void reset_readdir(struct ceph_file_info *fi, unsigned frag)
+static void reset_readdir(struct ceph_file_info *fi)
 {
        if (fi->last_readdir) {
                ceph_mdsc_put_request(fi->last_readdir);
@@ -477,18 +580,38 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag)
        fi->last_name = NULL;
        fi->dir_release_count = 0;
        fi->readdir_cache_idx = -1;
-       if (ceph_frag_is_leftmost(frag))
-               fi->next_offset = 2;  /* compensate for . and .. */
-       else
-               fi->next_offset = 0;
+       fi->next_offset = 2;  /* compensate for . and .. */
        fi->flags &= ~CEPH_F_ATEND;
 }
 
+/*
+ * discard buffered readdir content on seekdir(0), or seek to new frag,
+ * or seek prior to current chunk
+ */
+static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos)
+{
+       struct ceph_mds_reply_info_parsed *rinfo;
+       loff_t chunk_offset;
+       if (new_pos == 0)
+               return true;
+       if (is_hash_order(new_pos)) {
+               /* no need to reset last_name for a forward seek when
+                * dentries are sorted in hash order */
+       } else if (fi->frag != fpos_frag(new_pos)) {
+               return true;
+       }
+       rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL;
+       if (!rinfo || !rinfo->dir_nr)
+               return true;
+       chunk_offset = rinfo->dir_entries[0].offset;
+       return new_pos < chunk_offset ||
+              is_hash_order(new_pos) != is_hash_order(chunk_offset);
+}
+
 static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
 {
        struct ceph_file_info *fi = file->private_data;
        struct inode *inode = file->f_mapping->host;
-       loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset);
        loff_t retval;
 
        inode_lock(inode);
@@ -505,25 +628,22 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
        }
 
        if (offset >= 0) {
+               if (need_reset_readdir(fi, offset)) {
+                       dout("dir_llseek dropping %p content\n", file);
+                       reset_readdir(fi);
+               } else if (is_hash_order(offset) && offset > file->f_pos) {
+                       /* for hash offset, we don't know if a forward seek
+                        * is within the same frag */
+                       fi->dir_release_count = 0;
+                       fi->readdir_cache_idx = -1;
+               }
+
                if (offset != file->f_pos) {
                        file->f_pos = offset;
                        file->f_version = 0;
                        fi->flags &= ~CEPH_F_ATEND;
                }
                retval = offset;
-
-               if (offset == 0 ||
-                   fpos_frag(offset) != fi->frag ||
-                   fpos_off(offset) < fi->offset) {
-                       /* discard buffered readdir content on seekdir(0), or
-                        * seek to new frag, or seek prior to current chunk */
-                       dout("dir_llseek dropping %p content\n", file);
-                       reset_readdir(fi, fpos_frag(offset));
-               } else if (fpos_cmp(offset, old_offset) > 0) {
-                       /* reset dir_release_count if we did a forward seek */
-                       fi->dir_release_count = 0;
-                       fi->readdir_cache_idx = -1;
-               }
        }
 out:
        inode_unlock(inode);
@@ -591,7 +711,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
        return dentry;
 }
 
-static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
+static bool is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
 {
        return ceph_ino(inode) == CEPH_INO_ROOT &&
                strncmp(dentry->d_name.name, ".ceph", 5) == 0;
index 4f1dc71..ce2f579 100644 (file)
@@ -137,23 +137,11 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
 {
        struct ceph_file_info *cf;
        int ret = 0;
-       struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
-       struct ceph_mds_client *mdsc = fsc->mdsc;
 
        switch (inode->i_mode & S_IFMT) {
        case S_IFREG:
-               /* First file open request creates the cookie, we want to keep
-                * this cookie around for the filetime of the inode as not to
-                * have to worry about fscache register / revoke / operation
-                * races.
-                *
-                * Also, if we know the operation is going to invalidate data
-                * (non readonly) just nuke the cache right away.
-                */
-               ceph_fscache_register_inode_cookie(mdsc->fsc, ci);
-               if ((fmode & CEPH_FILE_MODE_WR))
-                       ceph_fscache_invalidate(inode);
+               ceph_fscache_register_inode_cookie(inode);
+               ceph_fscache_file_set_cookie(inode, file);
        case S_IFDIR:
                dout("init_file %p %p 0%o (regular)\n", inode, file,
                     inode->i_mode);
@@ -191,6 +179,59 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
        return ret;
 }
 
+/*
+ * try renew caps after session gets killed.
+ */
+int ceph_renew_caps(struct inode *inode)
+{
+       struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       struct ceph_mds_request *req;
+       int err, flags, wanted;
+
+       spin_lock(&ci->i_ceph_lock);
+       wanted = __ceph_caps_file_wanted(ci);
+       if (__ceph_is_any_real_caps(ci) &&
+           (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
+               int issued = __ceph_caps_issued(ci, NULL);
+               spin_unlock(&ci->i_ceph_lock);
+               dout("renew caps %p want %s issued %s updating mds_wanted\n",
+                    inode, ceph_cap_string(wanted), ceph_cap_string(issued));
+               ceph_check_caps(ci, 0, NULL);
+               return 0;
+       }
+       spin_unlock(&ci->i_ceph_lock);
+
+       flags = 0;
+       if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
+               flags = O_RDWR;
+       else if (wanted & CEPH_CAP_FILE_RD)
+               flags = O_RDONLY;
+       else if (wanted & CEPH_CAP_FILE_WR)
+               flags = O_WRONLY;
+#ifdef O_LAZY
+       if (wanted & CEPH_CAP_FILE_LAZYIO)
+               flags |= O_LAZY;
+#endif
+
+       req = prepare_open_request(inode->i_sb, flags, 0);
+       if (IS_ERR(req)) {
+               err = PTR_ERR(req);
+               goto out;
+       }
+
+       req->r_inode = inode;
+       ihold(inode);
+       req->r_num_caps = 1;
+       req->r_fmode = -1;
+
+       err = ceph_mdsc_do_request(mdsc, NULL, req);
+       ceph_mdsc_put_request(req);
+out:
+       dout("renew caps %p open result=%d\n", inode, err);
+       return err < 0 ? err : 0;
+}
+
 /*
  * If we already have the requisite capabilities, we can satisfy
  * the open request locally (no need to request new caps from the
@@ -616,8 +657,7 @@ static void ceph_aio_complete(struct inode *inode,
        kfree(aio_req);
 }
 
-static void ceph_aio_complete_req(struct ceph_osd_request *req,
-                                 struct ceph_msg *msg)
+static void ceph_aio_complete_req(struct ceph_osd_request *req)
 {
        int rc = req->r_result;
        struct inode *inode = req->r_inode;
@@ -714,14 +754,21 @@ static void ceph_aio_retry_work(struct work_struct *work)
        req->r_flags =  CEPH_OSD_FLAG_ORDERSNAP |
                        CEPH_OSD_FLAG_ONDISK |
                        CEPH_OSD_FLAG_WRITE;
-       req->r_base_oloc = orig_req->r_base_oloc;
-       req->r_base_oid = orig_req->r_base_oid;
+       ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
+       ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);
+
+       ret = ceph_osdc_alloc_messages(req, GFP_NOFS);
+       if (ret) {
+               ceph_osdc_put_request(req);
+               req = orig_req;
+               goto out;
+       }
 
        req->r_ops[0] = orig_req->r_ops[0];
        osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
 
-       ceph_osdc_build_request(req, req->r_ops[0].extent.offset,
-                               snapc, CEPH_NOSNAP, &aio_req->mtime);
+       req->r_mtime = aio_req->mtime;
+       req->r_data_offset = req->r_ops[0].extent.offset;
 
        ceph_osdc_put_request(orig_req);
 
@@ -733,7 +780,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
 out:
        if (ret < 0) {
                req->r_result = ret;
-               ceph_aio_complete_req(req, NULL);
+               ceph_aio_complete_req(req);
        }
 
        ceph_put_snap_context(snapc);
@@ -764,6 +811,8 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
                list_add_tail(&req->r_unsafe_item,
                              &ci->i_unsafe_writes);
                spin_unlock(&ci->i_unsafe_lock);
+
+               complete_all(&req->r_completion);
        } else {
                spin_lock(&ci->i_unsafe_lock);
                list_del_init(&req->r_unsafe_item);
@@ -875,14 +924,12 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
                                        (pos+len) | (PAGE_SIZE - 1));
 
                        osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
+                       req->r_mtime = mtime;
                }
 
-
                osd_req_op_extent_osd_data_pages(req, 0, pages, len, start,
                                                 false, false);
 
-               ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
-
                if (aio_req) {
                        aio_req->total_len += len;
                        aio_req->num_reqs++;
@@ -956,7 +1003,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
                                                              req, false);
                        if (ret < 0) {
                                req->r_result = ret;
-                               ceph_aio_complete_req(req, NULL);
+                               ceph_aio_complete_req(req);
                        }
                }
                return -EIOCBQUEUED;
@@ -1067,9 +1114,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
                osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
                                                false, true);
 
-               /* BUG_ON(vino.snap != CEPH_NOSNAP); */
-               ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
-
+               req->r_mtime = mtime;
                ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
                if (!ret)
                        ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1292,7 +1337,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
        }
 
 retry_snap:
-       if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) {
+       if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL)) {
                err = -ENOSPC;
                goto out;
        }
@@ -1350,7 +1395,6 @@ retry_snap:
                        iov_iter_advance(from, written);
                ceph_put_snap_context(snapc);
        } else {
-               loff_t old_size = i_size_read(inode);
                /*
                 * No need to acquire the i_truncate_mutex. Because
                 * the MDS revokes Fwb caps before sending truncate
@@ -1361,8 +1405,6 @@ retry_snap:
                written = generic_perform_write(file, from, pos);
                if (likely(written >= 0))
                        iocb->ki_pos = pos + written;
-               if (i_size_read(inode) > old_size)
-                       ceph_fscache_update_objectsize(inode);
                inode_unlock(inode);
        }
 
@@ -1383,7 +1425,7 @@ retry_snap:
        ceph_put_cap_refs(ci, got);
 
        if (written >= 0) {
-               if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))
+               if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_NEARFULL))
                        iocb->ki_flags |= IOCB_DSYNC;
 
                written = generic_write_sync(iocb, written);
@@ -1524,9 +1566,7 @@ static int ceph_zero_partial_object(struct inode *inode,
                goto out;
        }
 
-       ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap,
-                               &inode->i_mtime);
-
+       req->r_mtime = inode->i_mtime;
        ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
        if (!ret) {
                ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1617,8 +1657,8 @@ static long ceph_fallocate(struct file *file, int mode,
                goto unlock;
        }
 
-       if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) &&
-               !(mode & FALLOC_FL_PUNCH_HOLE)) {
+       if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) &&
+           !(mode & FALLOC_FL_PUNCH_HOLE)) {
                ret = -ENOSPC;
                goto unlock;
        }
index e669cfa..f059b59 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/random.h>
+#include <linux/sort.h>
 
 #include "super.h"
 #include "mds_client.h"
@@ -254,6 +255,9 @@ static int ceph_fill_dirfrag(struct inode *inode,
                diri_auth = ci->i_auth_cap->mds;
        spin_unlock(&ci->i_ceph_lock);
 
+       if (mds == -1) /* CDIR_AUTH_PARENT */
+               mds = diri_auth;
+
        mutex_lock(&ci->i_fragtree_mutex);
        if (ndist == 0 && mds == diri_auth) {
                /* no delegation info needed. */
@@ -300,20 +304,38 @@ out:
        return err;
 }
 
+static int frag_tree_split_cmp(const void *l, const void *r)
+{
+       struct ceph_frag_tree_split *ls = (struct ceph_frag_tree_split*)l;
+       struct ceph_frag_tree_split *rs = (struct ceph_frag_tree_split*)r;
+       return ceph_frag_compare(ls->frag, rs->frag);
+}
+
+static bool is_frag_child(u32 f, struct ceph_inode_frag *frag)
+{
+       if (!frag)
+               return f == ceph_frag_make(0, 0);
+       if (ceph_frag_bits(f) != ceph_frag_bits(frag->frag) + frag->split_by)
+               return false;
+       return ceph_frag_contains_value(frag->frag, ceph_frag_value(f));
+}
+
 static int ceph_fill_fragtree(struct inode *inode,
                              struct ceph_frag_tree_head *fragtree,
                              struct ceph_mds_reply_dirfrag *dirinfo)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_inode_frag *frag;
+       struct ceph_inode_frag *frag, *prev_frag = NULL;
        struct rb_node *rb_node;
-       int i;
-       u32 id, nsplits;
+       unsigned i, split_by, nsplits;
+       u32 id;
        bool update = false;
 
        mutex_lock(&ci->i_fragtree_mutex);
        nsplits = le32_to_cpu(fragtree->nsplits);
-       if (nsplits) {
+       if (nsplits != ci->i_fragtree_nsplits) {
+               update = true;
+       } else if (nsplits) {
                i = prandom_u32() % nsplits;
                id = le32_to_cpu(fragtree->splits[i].frag);
                if (!__ceph_find_frag(ci, id))
@@ -332,10 +354,22 @@ static int ceph_fill_fragtree(struct inode *inode,
        if (!update)
                goto out_unlock;
 
+       if (nsplits > 1) {
+               sort(fragtree->splits, nsplits, sizeof(fragtree->splits[0]),
+                    frag_tree_split_cmp, NULL);
+       }
+
        dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode));
        rb_node = rb_first(&ci->i_fragtree);
        for (i = 0; i < nsplits; i++) {
                id = le32_to_cpu(fragtree->splits[i].frag);
+               split_by = le32_to_cpu(fragtree->splits[i].by);
+               if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) {
+                       pr_err("fill_fragtree %llx.%llx invalid split %d/%u, "
+                              "frag %x split by %d\n", ceph_vinop(inode),
+                              i, nsplits, id, split_by);
+                       continue;
+               }
                frag = NULL;
                while (rb_node) {
                        frag = rb_entry(rb_node, struct ceph_inode_frag, node);
@@ -347,8 +381,14 @@ static int ceph_fill_fragtree(struct inode *inode,
                                break;
                        }
                        rb_node = rb_next(rb_node);
-                       rb_erase(&frag->node, &ci->i_fragtree);
-                       kfree(frag);
+                       /* delete stale split/leaf node */
+                       if (frag->split_by > 0 ||
+                           !is_frag_child(frag->frag, prev_frag)) {
+                               rb_erase(&frag->node, &ci->i_fragtree);
+                               if (frag->split_by > 0)
+                                       ci->i_fragtree_nsplits--;
+                               kfree(frag);
+                       }
                        frag = NULL;
                }
                if (!frag) {
@@ -356,14 +396,23 @@ static int ceph_fill_fragtree(struct inode *inode,
                        if (IS_ERR(frag))
                                continue;
                }
-               frag->split_by = le32_to_cpu(fragtree->splits[i].by);
+               if (frag->split_by == 0)
+                       ci->i_fragtree_nsplits++;
+               frag->split_by = split_by;
                dout(" frag %x split by %d\n", frag->frag, frag->split_by);
+               prev_frag = frag;
        }
        while (rb_node) {
                frag = rb_entry(rb_node, struct ceph_inode_frag, node);
                rb_node = rb_next(rb_node);
-               rb_erase(&frag->node, &ci->i_fragtree);
-               kfree(frag);
+               /* delete stale split/leaf node */
+               if (frag->split_by > 0 ||
+                   !is_frag_child(frag->frag, prev_frag)) {
+                       rb_erase(&frag->node, &ci->i_fragtree);
+                       if (frag->split_by > 0)
+                               ci->i_fragtree_nsplits--;
+                       kfree(frag);
+               }
        }
 out_unlock:
        mutex_unlock(&ci->i_fragtree_mutex);
@@ -513,6 +562,7 @@ void ceph_destroy_inode(struct inode *inode)
                rb_erase(n, &ci->i_fragtree);
                kfree(frag);
        }
+       ci->i_fragtree_nsplits = 0;
 
        __ceph_destroy_xattrs(ci);
        if (ci->i_xattrs.blob)
@@ -533,6 +583,11 @@ int ceph_drop_inode(struct inode *inode)
        return 1;
 }
 
+static inline blkcnt_t calc_inode_blocks(u64 size)
+{
+       return (size + (1<<9) - 1) >> 9;
+}
+
 /*
  * Helpers to fill in size, ctime, mtime, and atime.  We have to be
  * careful because either the client or MDS may have more up to date
@@ -555,7 +610,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
                        size = 0;
                }
                i_size_write(inode, size);
-               inode->i_blocks = (size + (1<<9) - 1) >> 9;
+               inode->i_blocks = calc_inode_blocks(size);
                ci->i_reported_size = size;
                if (truncate_seq != ci->i_truncate_seq) {
                        dout("truncate_seq %u -> %u\n",
@@ -814,9 +869,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 
                        spin_unlock(&ci->i_ceph_lock);
 
-                       err = -EINVAL;
-                       if (WARN_ON(symlen != i_size_read(inode)))
-                               goto out;
+                       if (symlen != i_size_read(inode)) {
+                               pr_err("fill_inode %llx.%llx BAD symlink "
+                                       "size %lld\n", ceph_vinop(inode),
+                                       i_size_read(inode));
+                               i_size_write(inode, symlen);
+                               inode->i_blocks = calc_inode_blocks(symlen);
+                       }
 
                        err = -ENOMEM;
                        sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
@@ -1309,12 +1368,13 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
        int i, err = 0;
 
        for (i = 0; i < rinfo->dir_nr; i++) {
+               struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
                struct ceph_vino vino;
                struct inode *in;
                int rc;
 
-               vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
-               vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
+               vino.ino = le64_to_cpu(rde->inode.in->ino);
+               vino.snap = le64_to_cpu(rde->inode.in->snapid);
 
                in = ceph_get_inode(req->r_dentry->d_sb, vino);
                if (IS_ERR(in)) {
@@ -1322,14 +1382,14 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
                        dout("new_inode badness got %d\n", err);
                        continue;
                }
-               rc = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session,
+               rc = fill_inode(in, NULL, &rde->inode, NULL, session,
                                req->r_request_started, -1,
                                &req->r_caps_reservation);
                if (rc < 0) {
                        pr_err("fill_inode badness on %p got %d\n", in, rc);
                        err = rc;
-                       continue;
                }
+               iput(in);
        }
 
        return err;
@@ -1387,6 +1447,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
                             struct ceph_mds_session *session)
 {
        struct dentry *parent = req->r_dentry;
+       struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
        struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
        struct qstr dname;
        struct dentry *dn;
@@ -1394,22 +1455,27 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
        int err = 0, skipped = 0, ret, i;
        struct inode *snapdir = NULL;
        struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
-       struct ceph_dentry_info *di;
        u32 frag = le32_to_cpu(rhead->args.readdir.frag);
+       u32 last_hash = 0;
+       u32 fpos_offset;
        struct ceph_readdir_cache_control cache_ctl = {};
 
        if (req->r_aborted)
                return readdir_prepopulate_inodes_only(req, session);
 
+       if (rinfo->hash_order && req->r_path2) {
+               last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
+                                         req->r_path2, strlen(req->r_path2));
+               last_hash = ceph_frag_value(last_hash);
+       }
+
        if (rinfo->dir_dir &&
            le32_to_cpu(rinfo->dir_dir->frag) != frag) {
                dout("readdir_prepopulate got new frag %x -> %x\n",
                     frag, le32_to_cpu(rinfo->dir_dir->frag));
                frag = le32_to_cpu(rinfo->dir_dir->frag);
-               if (ceph_frag_is_leftmost(frag))
+               if (!rinfo->hash_order)
                        req->r_readdir_offset = 2;
-               else
-                       req->r_readdir_offset = 0;
        }
 
        if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
@@ -1427,24 +1493,37 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
        if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) {
                /* note dir version at start of readdir so we can tell
                 * if any dentries get dropped */
-               struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
                req->r_dir_release_cnt = atomic64_read(&ci->i_release_count);
                req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count);
                req->r_readdir_cache_idx = 0;
        }
 
        cache_ctl.index = req->r_readdir_cache_idx;
+       fpos_offset = req->r_readdir_offset;
 
        /* FIXME: release caps/leases if error occurs */
        for (i = 0; i < rinfo->dir_nr; i++) {
+               struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
                struct ceph_vino vino;
 
-               dname.name = rinfo->dir_dname[i];
-               dname.len = rinfo->dir_dname_len[i];
+               dname.name = rde->name;
+               dname.len = rde->name_len;
                dname.hash = full_name_hash(dname.name, dname.len);
 
-               vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
-               vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
+               vino.ino = le64_to_cpu(rde->inode.in->ino);
+               vino.snap = le64_to_cpu(rde->inode.in->snapid);
+
+               if (rinfo->hash_order) {
+                       u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
+                                                rde->name, rde->name_len);
+                       hash = ceph_frag_value(hash);
+                       if (hash != last_hash)
+                               fpos_offset = 2;
+                       last_hash = hash;
+                       rde->offset = ceph_make_fpos(hash, fpos_offset++, true);
+               } else {
+                       rde->offset = ceph_make_fpos(frag, fpos_offset++, false);
+               }
 
 retry_lookup:
                dn = d_lookup(parent, &dname);
@@ -1490,7 +1569,7 @@ retry_lookup:
                        }
                }
 
-               ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session,
+               ret = fill_inode(in, NULL, &rde->inode, NULL, session,
                                 req->r_request_started, -1,
                                 &req->r_caps_reservation);
                if (ret < 0) {
@@ -1523,11 +1602,9 @@ retry_lookup:
                        dn = realdn;
                }
 
-               di = dn->d_fsdata;
-               di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
+               ceph_dentry(dn)->offset = rde->offset;
 
-               update_dentry_lease(dn, rinfo->dir_dlease[i],
-                                   req->r_session,
+               update_dentry_lease(dn, rde->lease, req->r_session,
                                    req->r_request_started);
 
                if (err == 0 && skipped == 0 && cache_ctl.index >= 0) {
@@ -1562,7 +1639,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size)
        spin_lock(&ci->i_ceph_lock);
        dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
        i_size_write(inode, size);
-       inode->i_blocks = (size + (1 << 9) - 1) >> 9;
+       inode->i_blocks = calc_inode_blocks(size);
 
        /* tell the MDS if we are approaching max_size */
        if ((size << 1) >= ci->i_max_size &&
@@ -1624,10 +1701,21 @@ static void ceph_invalidate_work(struct work_struct *work)
        struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
                                                  i_pg_inv_work);
        struct inode *inode = &ci->vfs_inode;
+       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
        u32 orig_gen;
        int check = 0;
 
        mutex_lock(&ci->i_truncate_mutex);
+
+       if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+               pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n",
+                                   inode, ceph_ino(inode));
+               mapping_set_error(inode->i_mapping, -EIO);
+               truncate_pagecache(inode, 0);
+               mutex_unlock(&ci->i_truncate_mutex);
+               goto out;
+       }
+
        spin_lock(&ci->i_ceph_lock);
        dout("invalidate_pages %p gen %d revoking %d\n", inode,
             ci->i_rdcache_gen, ci->i_rdcache_revoking);
@@ -1641,7 +1729,9 @@ static void ceph_invalidate_work(struct work_struct *work)
        orig_gen = ci->i_rdcache_gen;
        spin_unlock(&ci->i_ceph_lock);
 
-       truncate_pagecache(inode, 0);
+       if (invalidate_inode_pages2(inode->i_mapping) < 0) {
+               pr_err("invalidate_pages %p fails\n", inode);
+       }
 
        spin_lock(&ci->i_ceph_lock);
        if (orig_gen == ci->i_rdcache_gen &&
@@ -1920,8 +2010,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
                if ((issued & CEPH_CAP_FILE_EXCL) &&
                    attr->ia_size > inode->i_size) {
                        i_size_write(inode, attr->ia_size);
-                       inode->i_blocks =
-                               (attr->ia_size + (1 << 9) - 1) >> 9;
+                       inode->i_blocks = calc_inode_blocks(attr->ia_size);
                        inode->i_ctime = attr->ia_ctime;
                        ci->i_reported_size = attr->ia_size;
                        dirtied |= CEPH_CAP_FILE_EXCL;
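
calc_inode_blocks(), introduced above, centralizes the three open-coded
(size + (1<<9) - 1) >> 9 computations: it rounds a byte size up to 512-byte
blocks for inode->i_blocks. A quick standalone check of that arithmetic (the
helper name is local to this example):

    #include <stdint.h>
    #include <stdio.h>

    /* round a byte count up to 512-byte blocks, as calc_inode_blocks() does */
    static uint64_t ex_calc_blocks(uint64_t size)
    {
            return (size + (1 << 9) - 1) >> 9;
    }

    int main(void)
    {
            /* 0 -> 0, 1 -> 1, 512 -> 1, 513 -> 2 blocks */
            printf("%llu %llu %llu %llu\n",
                   (unsigned long long)ex_calc_blocks(0),
                   (unsigned long long)ex_calc_blocks(1),
                   (unsigned long long)ex_calc_blocks(512),
                   (unsigned long long)ex_calc_blocks(513));
            return 0;
    }
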
index f851d8d..be6b165 100644 (file)
@@ -193,12 +193,12 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
        if (copy_from_user(&dl, arg, sizeof(dl)))
                return -EFAULT;
 
-       down_read(&osdc->map_sem);
+       down_read(&osdc->lock);
        r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len,
                                          &dl.object_no, &dl.object_offset,
                                          &olen);
        if (r < 0) {
-               up_read(&osdc->map_sem);
+               up_read(&osdc->lock);
                return -EIO;
        }
        dl.file_offset -= dl.object_offset;
@@ -213,15 +213,15 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
                 ceph_ino(inode), dl.object_no);
 
        oloc.pool = ceph_file_layout_pg_pool(ci->i_layout);
-       ceph_oid_set_name(&oid, dl.object_name);
+       ceph_oid_printf(&oid, "%s", dl.object_name);
 
-       r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid);
+       r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid);
        if (r < 0) {
-               up_read(&osdc->map_sem);
+               up_read(&osdc->lock);
                return r;
        }
 
-       dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
+       dl.osd = ceph_pg_to_acting_primary(osdc->osdmap, &pgid);
        if (dl.osd >= 0) {
                struct ceph_entity_addr *a =
                        ceph_osd_addr(osdc->osdmap, dl.osd);
@@ -230,7 +230,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
        } else {
                memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
        }
-       up_read(&osdc->map_sem);
+       up_read(&osdc->lock);
 
        /* send result back to user */
        if (copy_to_user(arg, &dl, sizeof(dl)))
index 85b8517..2103b82 100644 (file)
@@ -181,17 +181,18 @@ static int parse_reply_info_dir(void **p, void *end,
 
        ceph_decode_need(p, end, sizeof(num) + 2, bad);
        num = ceph_decode_32(p);
-       info->dir_end = ceph_decode_8(p);
-       info->dir_complete = ceph_decode_8(p);
+       {
+               u16 flags = ceph_decode_16(p);
+               info->dir_end = !!(flags & CEPH_READDIR_FRAG_END);
+               info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE);
+               info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER);
+       }
        if (num == 0)
                goto done;
 
-       BUG_ON(!info->dir_in);
-       info->dir_dname = (void *)(info->dir_in + num);
-       info->dir_dname_len = (void *)(info->dir_dname + num);
-       info->dir_dlease = (void *)(info->dir_dname_len + num);
-       if ((unsigned long)(info->dir_dlease + num) >
-           (unsigned long)info->dir_in + info->dir_buf_size) {
+       BUG_ON(!info->dir_entries);
+       if ((unsigned long)(info->dir_entries + num) >
+           (unsigned long)info->dir_entries + info->dir_buf_size) {
                pr_err("dir contents are larger than expected\n");
                WARN_ON(1);
                goto bad;
@@ -199,21 +200,23 @@ static int parse_reply_info_dir(void **p, void *end,
 
        info->dir_nr = num;
        while (num) {
+               struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;
                /* dentry */
                ceph_decode_need(p, end, sizeof(u32)*2, bad);
-               info->dir_dname_len[i] = ceph_decode_32(p);
-               ceph_decode_need(p, end, info->dir_dname_len[i], bad);
-               info->dir_dname[i] = *p;
-               *p += info->dir_dname_len[i];
-               dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i],
-                    info->dir_dname[i]);
-               info->dir_dlease[i] = *p;
+               rde->name_len = ceph_decode_32(p);
+               ceph_decode_need(p, end, rde->name_len, bad);
+               rde->name = *p;
+               *p += rde->name_len;
+               dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name);
+               rde->lease = *p;
                *p += sizeof(struct ceph_mds_reply_lease);
 
                /* inode */
-               err = parse_reply_info_in(p, end, &info->dir_in[i], features);
+               err = parse_reply_info_in(p, end, &rde->inode, features);
                if (err < 0)
                        goto out_bad;
+               /* ceph_readdir_prepopulate() will update it */
+               rde->offset = 0;
                i++;
                num--;
        }
@@ -345,9 +348,9 @@ out_bad:
 
 static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
 {
-       if (!info->dir_in)
+       if (!info->dir_entries)
                return;
-       free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size));
+       free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size));
 }
 
 
@@ -567,51 +570,23 @@ void ceph_mdsc_release_request(struct kref *kref)
        kfree(req);
 }
 
+DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node)
+
 /*
  * lookup session, bump ref if found.
  *
  * called under mdsc->mutex.
  */
-static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc,
-                                            u64 tid)
+static struct ceph_mds_request *
+lookup_get_request(struct ceph_mds_client *mdsc, u64 tid)
 {
        struct ceph_mds_request *req;
-       struct rb_node *n = mdsc->request_tree.rb_node;
-
-       while (n) {
-               req = rb_entry(n, struct ceph_mds_request, r_node);
-               if (tid < req->r_tid)
-                       n = n->rb_left;
-               else if (tid > req->r_tid)
-                       n = n->rb_right;
-               else {
-                       ceph_mdsc_get_request(req);
-                       return req;
-               }
-       }
-       return NULL;
-}
 
-static void __insert_request(struct ceph_mds_client *mdsc,
-                            struct ceph_mds_request *new)
-{
-       struct rb_node **p = &mdsc->request_tree.rb_node;
-       struct rb_node *parent = NULL;
-       struct ceph_mds_request *req = NULL;
+       req = lookup_request(&mdsc->request_tree, tid);
+       if (req)
+               ceph_mdsc_get_request(req);
 
-       while (*p) {
-               parent = *p;
-               req = rb_entry(parent, struct ceph_mds_request, r_node);
-               if (new->r_tid < req->r_tid)
-                       p = &(*p)->rb_left;
-               else if (new->r_tid > req->r_tid)
-                       p = &(*p)->rb_right;
-               else
-                       BUG();
-       }
-
-       rb_link_node(&new->r_node, parent, p);
-       rb_insert_color(&new->r_node, &mdsc->request_tree);
+       return req;
 }
 
 /*
@@ -630,7 +605,7 @@ static void __register_request(struct ceph_mds_client *mdsc,
                                  req->r_num_caps);
        dout("__register_request %p tid %lld\n", req, req->r_tid);
        ceph_mdsc_get_request(req);
-       __insert_request(mdsc, req);
+       insert_request(&mdsc->request_tree, req);
 
        req->r_uid = current_fsuid();
        req->r_gid = current_fsgid();
@@ -663,8 +638,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
                }
        }
 
-       rb_erase(&req->r_node, &mdsc->request_tree);
-       RB_CLEAR_NODE(&req->r_node);
+       erase_request(&mdsc->request_tree, req);
 
        if (req->r_unsafe_dir && req->r_got_unsafe) {
                struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
@@ -868,12 +842,14 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
        int metadata_bytes = 0;
        int metadata_key_count = 0;
        struct ceph_options *opt = mdsc->fsc->client->options;
+       struct ceph_mount_options *fsopt = mdsc->fsc->mount_options;
        void *p;
 
        const char* metadata[][2] = {
                {"hostname", utsname()->nodename},
                {"kernel_version", utsname()->release},
-               {"entity_id", opt->name ? opt->name : ""},
+               {"entity_id", opt->name ? : ""},
+               {"root", fsopt->server_path ? : "/"},
                {NULL, NULL}
        };
 
@@ -1149,9 +1125,11 @@ out:
 static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
                                  void *arg)
 {
+       struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg;
        struct ceph_inode_info *ci = ceph_inode(inode);
        LIST_HEAD(to_remove);
-       int drop = 0;
+       bool drop = false;
+       bool invalidate = false;
 
        dout("removing cap %p, ci is %p, inode is %p\n",
             cap, ci, &ci->vfs_inode);
@@ -1159,8 +1137,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
        __ceph_remove_cap(cap, false);
        if (!ci->i_auth_cap) {
                struct ceph_cap_flush *cf;
-               struct ceph_mds_client *mdsc =
-                       ceph_sb_to_client(inode->i_sb)->mdsc;
+               struct ceph_mds_client *mdsc = fsc->mdsc;
+
+               ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
+
+               if (ci->i_wrbuffer_ref > 0 &&
+                   ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
+                       invalidate = true;
 
                while (true) {
                        struct rb_node *n = rb_first(&ci->i_cap_flush_tree);
@@ -1183,7 +1166,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
                                inode, ceph_ino(inode));
                        ci->i_dirty_caps = 0;
                        list_del_init(&ci->i_dirty_item);
-                       drop = 1;
+                       drop = true;
                }
                if (!list_empty(&ci->i_flushing_item)) {
                        pr_warn_ratelimited(
@@ -1193,7 +1176,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
                        ci->i_flushing_caps = 0;
                        list_del_init(&ci->i_flushing_item);
                        mdsc->num_cap_flushing--;
-                       drop = 1;
+                       drop = true;
                }
                spin_unlock(&mdsc->cap_dirty_lock);
 
@@ -1210,7 +1193,11 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
                list_del(&cf->list);
                ceph_free_cap_flush(cf);
        }
-       while (drop--)
+
+       wake_up_all(&ci->i_cap_wq);
+       if (invalidate)
+               ceph_queue_invalidate(inode);
+       if (drop)
                iput(inode);
        return 0;
 }
@@ -1220,12 +1207,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
  */
 static void remove_session_caps(struct ceph_mds_session *session)
 {
+       struct ceph_fs_client *fsc = session->s_mdsc->fsc;
+       struct super_block *sb = fsc->sb;
        dout("remove_session_caps on %p\n", session);
-       iterate_session_caps(session, remove_session_caps_cb, NULL);
+       iterate_session_caps(session, remove_session_caps_cb, fsc);
 
        spin_lock(&session->s_cap_lock);
        if (session->s_nr_caps > 0) {
-               struct super_block *sb = session->s_mdsc->fsc->sb;
                struct inode *inode;
                struct ceph_cap *cap, *prev = NULL;
                struct ceph_vino vino;
@@ -1270,13 +1258,13 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
 
-       wake_up_all(&ci->i_cap_wq);
        if (arg) {
                spin_lock(&ci->i_ceph_lock);
                ci->i_wanted_max_size = 0;
                ci->i_requested_max_size = 0;
                spin_unlock(&ci->i_ceph_lock);
        }
+       wake_up_all(&ci->i_cap_wq);
        return 0;
 }
 
@@ -1671,8 +1659,7 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
        struct ceph_inode_info *ci = ceph_inode(dir);
        struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
        struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options;
-       size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) +
-                     sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease);
+       size_t size = sizeof(struct ceph_mds_reply_dir_entry);
        int order, num_entries;
 
        spin_lock(&ci->i_ceph_lock);
@@ -1683,14 +1670,14 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
 
        order = get_order(size * num_entries);
        while (order >= 0) {
-               rinfo->dir_in = (void*)__get_free_pages(GFP_KERNEL |
-                                                       __GFP_NOWARN,
-                                                       order);
-               if (rinfo->dir_in)
+               rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL |
+                                                            __GFP_NOWARN,
+                                                            order);
+               if (rinfo->dir_entries)
                        break;
                order--;
        }
-       if (!rinfo->dir_in)
+       if (!rinfo->dir_entries)
                return -ENOMEM;
 
        num_entries = (PAGE_SIZE << order) / size;
@@ -1722,6 +1709,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
        INIT_LIST_HEAD(&req->r_unsafe_target_item);
        req->r_fmode = -1;
        kref_init(&req->r_kref);
+       RB_CLEAR_NODE(&req->r_node);
        INIT_LIST_HEAD(&req->r_wait);
        init_completion(&req->r_completion);
        init_completion(&req->r_safe_completion);
@@ -2414,7 +2402,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
        /* get request, session */
        tid = le64_to_cpu(msg->hdr.tid);
        mutex_lock(&mdsc->mutex);
-       req = __lookup_request(mdsc, tid);
+       req = lookup_get_request(mdsc, tid);
        if (!req) {
                dout("handle_reply on unknown tid %llu\n", tid);
                mutex_unlock(&mdsc->mutex);
@@ -2604,7 +2592,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
        fwd_seq = ceph_decode_32(&p);
 
        mutex_lock(&mdsc->mutex);
-       req = __lookup_request(mdsc, tid);
+       req = lookup_get_request(mdsc, tid);
        if (!req) {
                dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
                goto out;  /* dup reply? */
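
The removed __lookup_request()/__insert_request() walkers above are superseded by helpers generated from the new DEFINE_RB_FUNCS() macro (defined outside this hunk), keyed on r_tid/r_node. As a rough sketch, the generated lookup_request()/insert_request() pair is expected to look much like the open-coded versions being deleted, with erase_request() wrapping rb_erase() plus RB_CLEAR_NODE():

    /*
     * Illustrative sketch only: tid-keyed rbtree helpers in the shape a
     * DEFINE_RB_FUNCS()-style macro would generate for struct ceph_mds_request.
     */
    static struct ceph_mds_request *lookup_request(struct rb_root *root, u64 tid)
    {
            struct rb_node *n = root->rb_node;

            while (n) {
                    struct ceph_mds_request *req =
                            rb_entry(n, struct ceph_mds_request, r_node);

                    if (tid < req->r_tid)
                            n = n->rb_left;
                    else if (tid > req->r_tid)
                            n = n->rb_right;
                    else
                            return req;
            }
            return NULL;
    }

    static void insert_request(struct rb_root *root, struct ceph_mds_request *new)
    {
            struct rb_node **p = &root->rb_node, *parent = NULL;

            while (*p) {
                    struct ceph_mds_request *req;

                    parent = *p;
                    req = rb_entry(parent, struct ceph_mds_request, r_node);
                    if (new->r_tid < req->r_tid)
                            p = &(*p)->rb_left;
                    else if (new->r_tid > req->r_tid)
                            p = &(*p)->rb_right;
                    else
                            BUG();  /* duplicate tid */
            }
            rb_link_node(&new->r_node, parent, p);
            rb_insert_color(&new->r_node, root);
    }
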
index ee69a53..e7d38aa 100644 (file)
@@ -47,6 +47,14 @@ struct ceph_mds_reply_info_in {
        u32 pool_ns_len;
 };
 
+struct ceph_mds_reply_dir_entry {
+       char                          *name;
+       u32                           name_len;
+       struct ceph_mds_reply_lease   *lease;
+       struct ceph_mds_reply_info_in inode;
+       loff_t                        offset;
+};
+
 /*
  * parsed info about an mds reply, including information about
  * either: 1) the target inode and/or its parent directory and dentry,
@@ -73,11 +81,10 @@ struct ceph_mds_reply_info_parsed {
                        struct ceph_mds_reply_dirfrag *dir_dir;
                        size_t                        dir_buf_size;
                        int                           dir_nr;
-                       char                          **dir_dname;
-                       u32                           *dir_dname_len;
-                       struct ceph_mds_reply_lease   **dir_dlease;
-                       struct ceph_mds_reply_info_in *dir_in;
-                       u8                            dir_complete, dir_end;
+                       bool                          dir_complete;
+                       bool                          dir_end;
+                       bool                          hash_order;
+                       struct ceph_mds_reply_dir_entry  *dir_entries;
                };
 
                /* for create results */
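
With the header change above, the four parallel readdir arrays (dir_dname, dir_dname_len, dir_dlease, dir_in) collapse into a single dir_entries array of struct ceph_mds_reply_dir_entry. A hedged sketch of how a consumer walks the parsed entries (the function and its printing are illustrative, not the kernel's readdir code):

    /* Illustrative only: walking the consolidated readdir reply entries. */
    static void dump_readdir_reply(struct ceph_mds_reply_info_parsed *rinfo)
    {
            int i;

            for (i = 0; i < rinfo->dir_nr; i++) {
                    struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;

                    /* rde->name is not NUL-terminated; use the explicit length */
                    pr_debug("entry %d: '%.*s' offset %lld\n",
                             i, rde->name_len, rde->name, rde->offset);
            }
    }
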
index 261531e..8c3591a 100644 (file)
@@ -54,16 +54,21 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
        const void *start = *p;
        int i, j, n;
        int err = -EINVAL;
-       u16 version;
+       u8 mdsmap_v, mdsmap_cv;
 
        m = kzalloc(sizeof(*m), GFP_NOFS);
        if (m == NULL)
                return ERR_PTR(-ENOMEM);
 
-       ceph_decode_16_safe(p, end, version, bad);
-       if (version > 3) {
-               pr_warn("got mdsmap version %d > 3, failing", version);
-               goto bad;
+       ceph_decode_need(p, end, 1 + 1, bad);
+       mdsmap_v = ceph_decode_8(p);
+       mdsmap_cv = ceph_decode_8(p);
+       if (mdsmap_v >= 4) {
+              u32 mdsmap_len;
+              ceph_decode_32_safe(p, end, mdsmap_len, bad);
+              if (end < *p + mdsmap_len)
+                      goto bad;
+              end = *p + mdsmap_len;
        }
 
        ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
@@ -87,16 +92,29 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
                u32 namelen;
                s32 mds, inc, state;
                u64 state_seq;
-               u8 infoversion;
+               u8 info_v;
+               void *info_end = NULL;
                struct ceph_entity_addr addr;
                u32 num_export_targets;
                void *pexport_targets = NULL;
                struct ceph_timespec laggy_since;
                struct ceph_mds_info *info;
 
-               ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
+               ceph_decode_need(p, end, sizeof(u64) + 1, bad);
                global_id = ceph_decode_64(p);
-               infoversion = ceph_decode_8(p);
+               info_v = ceph_decode_8(p);
+               if (info_v >= 4) {
+                       u32 info_len;
+                       u8 info_cv;
+                       ceph_decode_need(p, end, 1 + sizeof(u32), bad);
+                       info_cv = ceph_decode_8(p);
+                       info_len = ceph_decode_32(p);
+                       info_end = *p + info_len;
+                       if (info_end > end)
+                               goto bad;
+               }
+
+               ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
                *p += sizeof(u64);
                namelen = ceph_decode_32(p);  /* skip mds name */
                *p += namelen;
@@ -115,7 +133,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
                *p += sizeof(u32);
                ceph_decode_32_safe(p, end, namelen, bad);
                *p += namelen;
-               if (infoversion >= 2) {
+               if (info_v >= 2) {
                        ceph_decode_32_safe(p, end, num_export_targets, bad);
                        pexport_targets = *p;
                        *p += num_export_targets * sizeof(u32);
@@ -123,6 +141,12 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
                        num_export_targets = 0;
                }
 
+               if (info_end && *p != info_end) {
+                       if (*p > info_end)
+                               goto bad;
+                       *p = info_end;
+               }
+
                dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
                     i+1, n, global_id, mds, inc,
                     ceph_pr_addr(&addr.in_addr),
@@ -163,6 +187,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
        m->m_cas_pg_pool = ceph_decode_64(p);
 
        /* ok, we don't care about the rest. */
+       *p = end;
        dout("mdsmap_decode success epoch %u\n", m->m_epoch);
        return m;
 
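
The reworked decoder above copes with newer, self-describing mdsmap encodings: each blob starts with a version byte, a compat-version byte and (from v4 on) a length, so fields this client does not understand can be skipped by clamping end or jumping to the recorded boundary. A standalone sketch of that (version, compat, length) pattern, with the function name and error handling purely illustrative:

    /* Illustrative sketch of length-bounded, versioned decoding as used above. */
    static int decode_versioned_blob(void **p, void *end)
    {
            void *blob_end = end;
            u8 struct_v, struct_cv;

            if (*p + 2 > end)
                    return -EINVAL;
            struct_v = ceph_decode_8(p);
            struct_cv = ceph_decode_8(p);   /* a real decoder checks this too */
            pr_debug("blob v%u compat v%u\n", struct_v, struct_cv);

            if (struct_v >= 4) {
                    u32 len;

                    if (*p + sizeof(u32) > end)
                            return -EINVAL;
                    len = ceph_decode_32(p);
                    if (*p + len > end)
                            return -EINVAL;
                    blob_end = *p + len;    /* only trust bytes inside the blob */
            }

            /* ... decode the fields this client understands, up to blob_end ... */

            *p = blob_end;                  /* skip anything newer we don't parse */
            return 0;
    }
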
index f12d5e2..91e0248 100644 (file)
@@ -108,6 +108,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
  * mount options
  */
 enum {
+       Opt_mds_namespace,
        Opt_wsize,
        Opt_rsize,
        Opt_rasize,
@@ -143,6 +144,7 @@ enum {
 };
 
 static match_table_t fsopt_tokens = {
+       {Opt_mds_namespace, "mds_namespace=%d"},
        {Opt_wsize, "wsize=%d"},
        {Opt_rsize, "rsize=%d"},
        {Opt_rasize, "rasize=%d"},
@@ -212,6 +214,9 @@ static int parse_fsopt_token(char *c, void *private)
                break;
 
                /* misc */
+       case Opt_mds_namespace:
+               fsopt->mds_namespace = intval;
+               break;
        case Opt_wsize:
                fsopt->wsize = intval;
                break;
@@ -297,6 +302,7 @@ static void destroy_mount_options(struct ceph_mount_options *args)
 {
        dout("destroy_mount_options %p\n", args);
        kfree(args->snapdir_name);
+       kfree(args->server_path);
        kfree(args);
 }
 
@@ -328,14 +334,17 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
        if (ret)
                return ret;
 
+       ret = strcmp_null(fsopt1->server_path, fsopt2->server_path);
+       if (ret)
+               return ret;
+
        return ceph_compare_options(new_opt, fsc->client);
 }
 
 static int parse_mount_options(struct ceph_mount_options **pfsopt,
                               struct ceph_options **popt,
                               int flags, char *options,
-                              const char *dev_name,
-                              const char **path)
+                              const char *dev_name)
 {
        struct ceph_mount_options *fsopt;
        const char *dev_name_end;
@@ -367,6 +376,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
        fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
        fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
        fsopt->congestion_kb = default_congestion_kb();
+       fsopt->mds_namespace = CEPH_FS_CLUSTER_ID_NONE;
 
        /*
         * Distinguish the server list from the path in "dev_name".
@@ -380,12 +390,13 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
         */
        dev_name_end = strchr(dev_name, '/');
        if (dev_name_end) {
-               /* skip over leading '/' for path */
-               *path = dev_name_end + 1;
+               fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL);
+               if (!fsopt->server_path) {
+                       err = -ENOMEM;
+                       goto out;
+               }
        } else {
-               /* path is empty */
                dev_name_end = dev_name + strlen(dev_name);
-               *path = dev_name_end;
        }
        err = -EINVAL;
        dev_name_end--;         /* back up to ':' separator */
@@ -395,7 +406,8 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
                goto out;
        }
        dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
-       dout("server path '%s'\n", *path);
+       if (fsopt->server_path)
+               dout("server path '%s'\n", fsopt->server_path);
 
        *popt = ceph_parse_options(options, dev_name, dev_name_end,
                                 parse_fsopt_token, (void *)fsopt);
@@ -457,6 +469,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
                seq_puts(m, ",noacl");
 #endif
 
+       if (fsopt->mds_namespace != CEPH_FS_CLUSTER_ID_NONE)
+               seq_printf(m, ",mds_namespace=%d", fsopt->mds_namespace);
        if (fsopt->wsize)
                seq_printf(m, ",wsize=%d", fsopt->wsize);
        if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
@@ -511,9 +525,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
 {
        struct ceph_fs_client *fsc;
        const u64 supported_features =
-               CEPH_FEATURE_FLOCK |
-               CEPH_FEATURE_DIRLAYOUTHASH |
-               CEPH_FEATURE_MDS_INLINE_DATA;
+               CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH |
+               CEPH_FEATURE_MDSENC | CEPH_FEATURE_MDS_INLINE_DATA;
        const u64 required_features = 0;
        int page_count;
        size_t size;
@@ -530,6 +543,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
                goto fail;
        }
        fsc->client->extra_mon_dispatch = extra_mon_dispatch;
+       fsc->client->monc.fs_cluster_id = fsopt->mds_namespace;
        ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true);
 
        fsc->mount_options = fsopt;
@@ -785,8 +799,7 @@ out:
 /*
  * mount: join the ceph cluster, and open root directory.
  */
-static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
-                     const char *path)
+static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc)
 {
        int err;
        unsigned long started = jiffies;  /* note the start time */
@@ -815,11 +828,12 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
                        goto fail;
        }
 
-       if (path[0] == 0) {
+       if (!fsc->mount_options->server_path) {
                root = fsc->sb->s_root;
                dget(root);
        } else {
-               dout("mount opening base mountpoint\n");
+               const char *path = fsc->mount_options->server_path + 1;
+               dout("mount opening path %s\n", path);
                root = open_root_dentry(fsc, path, started);
                if (IS_ERR(root)) {
                        err = PTR_ERR(root);
@@ -935,7 +949,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
        struct dentry *res;
        int err;
        int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
-       const char *path = NULL;
        struct ceph_mount_options *fsopt = NULL;
        struct ceph_options *opt = NULL;
 
@@ -944,7 +957,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
 #ifdef CONFIG_CEPH_FS_POSIX_ACL
        flags |= MS_POSIXACL;
 #endif
-       err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
+       err = parse_mount_options(&fsopt, &opt, flags, data, dev_name);
        if (err < 0) {
                res = ERR_PTR(err);
                goto out_final;
@@ -987,7 +1000,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
                }
        }
 
-       res = ceph_real_mount(fsc, path);
+       res = ceph_real_mount(fsc);
        if (IS_ERR(res))
                goto out_splat;
        dout("root %p inode %p ino %llx.%llx\n", res,
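
Rather than returning the path through an out parameter, parse_mount_options() now stores everything after the first '/' of the device string (including the '/') in fsopt->server_path, and ceph_real_mount() later skips that leading '/' when opening the root. A toy sketch of the split, with the helper name illustrative:

    /* Illustrative: split "mon1,mon2:/some/dir" into monitors and server path. */
    static int split_device_name(const char *dev_name, char **server_path)
    {
            const char *slash = strchr(dev_name, '/');

            *server_path = NULL;
            if (slash) {
                    /* keep the leading '/', exactly as the diff above does */
                    *server_path = kstrdup(slash, GFP_KERNEL);
                    if (!*server_path)
                            return -ENOMEM;
            }
            /* bytes before 'slash' (or the whole string) name the monitors */
            return 0;
    }
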
index 7b99eb7..0168b49 100644 (file)
@@ -62,6 +62,7 @@ struct ceph_mount_options {
        int cap_release_safety;
        int max_readdir;       /* max readdir result (entries) */
        int max_readdir_bytes; /* max readdir result (bytes) */
+       int mds_namespace;
 
        /*
         * everything above this point can be memcmp'd; everything below
@@ -69,6 +70,7 @@ struct ceph_mount_options {
         */
 
        char *snapdir_name;   /* default ".snap" */
+       char *server_path;    /* default  "/" */
 };
 
 struct ceph_fs_client {
@@ -101,7 +103,6 @@ struct ceph_fs_client {
 
 #ifdef CONFIG_CEPH_FSCACHE
        struct fscache_cookie *fscache;
-       struct workqueue_struct *revalidate_wq;
 #endif
 };
 
@@ -295,6 +296,7 @@ struct ceph_inode_info {
        u64 i_files, i_subdirs;
 
        struct rb_root i_fragtree;
+       int i_fragtree_nsplits;
        struct mutex i_fragtree_mutex;
 
        struct ceph_inode_xattrs_info i_xattrs;
@@ -357,8 +359,7 @@ struct ceph_inode_info {
 
 #ifdef CONFIG_CEPH_FSCACHE
        struct fscache_cookie *fscache;
-       u32 i_fscache_gen; /* sequence, for delayed fscache validate */
-       struct work_struct i_revalidate_work;
+       u32 i_fscache_gen;
 #endif
        struct inode vfs_inode; /* at end */
 };
@@ -469,6 +470,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
 #define CEPH_I_POOL_RD         (1 << 5)  /* can read from pool */
 #define CEPH_I_POOL_WR         (1 << 6)  /* can write to pool */
 #define CEPH_I_SEC_INITED      (1 << 7)  /* security initialized */
+#define CEPH_I_CAP_DROPPED     (1 << 8)  /* caps were forcibly dropped */
 
 static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
                                           long long release_count,
@@ -537,11 +539,6 @@ static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
        return (struct ceph_dentry_info *)dentry->d_fsdata;
 }
 
-static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
-{
-       return ((loff_t)frag << 32) | (loff_t)off;
-}
-
 /*
  * caps helpers
  */
@@ -632,7 +629,6 @@ struct ceph_file_info {
        struct ceph_mds_request *last_readdir;
 
        /* readdir: position within a frag */
-       unsigned offset;       /* offset of last chunk, adjusted for . and .. */
        unsigned next_offset;  /* offset of next chunk (last_name's + 1) */
        char *last_name;       /* last entry in previous chunk */
        long long dir_release_count;
@@ -927,6 +923,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
 /* file.c */
 extern const struct file_operations ceph_file_fops;
 
+extern int ceph_renew_caps(struct inode *inode);
 extern int ceph_open(struct inode *inode, struct file *file);
 extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
                            struct file *file, unsigned flags, umode_t mode,
@@ -942,6 +939,7 @@ extern const struct inode_operations ceph_snapdir_iops;
 extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
        ceph_snapdir_dentry_ops;
 
+extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order);
 extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
 extern int ceph_handle_snapdir(struct ceph_mds_request *req,
                               struct dentry *dentry, int err);
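
The struct keeps its flat integer options (now including mds_namespace) ahead of the "memcmp'd" marker comment, so remount comparison can memcmp() that leading region and then compare string options such as snapdir_name and the new server_path individually, as the strcmp_null() hunk in super.c does. A hedged sketch of that comparison pattern (return convention and function name are illustrative):

    /* Illustrative: compare the flat option region, then the string options. */
    static bool mount_options_equal(struct ceph_mount_options *a,
                                    struct ceph_mount_options *b)
    {
            size_t flat = offsetof(struct ceph_mount_options, snapdir_name);

            if (memcmp(a, b, flat))
                    return false;
            if (strcmp_null(a->snapdir_name, b->snapdir_name))
                    return false;
            if (strcmp_null(a->server_path, b->server_path))
                    return false;
            return true;
    }
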
index 0d66722..4870b29 100644 (file)
@@ -77,7 +77,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
        char buf[128];
 
        dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
-       down_read(&osdc->map_sem);
+       down_read(&osdc->lock);
        pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
        if (pool_name) {
                size_t len = strlen(pool_name);
@@ -109,7 +109,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
                                ret = -ERANGE;
                }
        }
-       up_read(&osdc->map_sem);
+       up_read(&osdc->lock);
        return ret;
 }
 
@@ -143,13 +143,13 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
        s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
        const char *pool_name;
 
-       down_read(&osdc->map_sem);
+       down_read(&osdc->lock);
        pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
        if (pool_name)
                ret = snprintf(val, size, "%s", pool_name);
        else
                ret = snprintf(val, size, "%lld", (unsigned long long)pool);
-       up_read(&osdc->map_sem);
+       up_read(&osdc->lock);
        return ret;
 }
 
@@ -862,6 +862,7 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
        struct ceph_mds_request *req;
        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct ceph_pagelist *pagelist = NULL;
+       int op = CEPH_MDS_OP_SETXATTR;
        int err;
 
        if (size > 0) {
@@ -875,20 +876,21 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
                if (err)
                        goto out;
        } else if (!value) {
-               flags |= CEPH_XATTR_REMOVE;
+               if (flags & CEPH_XATTR_REPLACE)
+                       op = CEPH_MDS_OP_RMXATTR;
+               else
+                       flags |= CEPH_XATTR_REMOVE;
        }
 
        dout("setxattr value=%.*s\n", (int)size, value);
 
        /* do request */
-       req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
-                                      USE_AUTH_MDS);
+       req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
        if (IS_ERR(req)) {
                err = PTR_ERR(req);
                goto out;
        }
 
-       req->r_args.setxattr.flags = cpu_to_le32(flags);
        req->r_path2 = kstrdup(name, GFP_NOFS);
        if (!req->r_path2) {
                ceph_mdsc_put_request(req);
@@ -896,8 +898,11 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
                goto out;
        }
 
-       req->r_pagelist = pagelist;
-       pagelist = NULL;
+       if (op == CEPH_MDS_OP_SETXATTR) {
+               req->r_args.setxattr.flags = cpu_to_le32(flags);
+               req->r_pagelist = pagelist;
+               pagelist = NULL;
+       }
 
        req->r_inode = inode;
        ihold(inode);
@@ -1051,12 +1056,13 @@ static int ceph_get_xattr_handler(const struct xattr_handler *handler,
 }
 
 static int ceph_set_xattr_handler(const struct xattr_handler *handler,
-                                 struct dentry *dentry, const char *name,
-                                 const void *value, size_t size, int flags)
+                                 struct dentry *unused, struct inode *inode,
+                                 const char *name, const void *value,
+                                 size_t size, int flags)
 {
        if (!ceph_is_valid_xattr(name))
                return -EOPNOTSUPP;
-       return __ceph_setxattr(d_inode(dentry), name, value, size, flags);
+       return __ceph_setxattr(inode, name, value, size, flags);
 }
 
 const struct xattr_handler ceph_other_xattr_handler = {
index c8b77aa..5e23f64 100644 (file)
@@ -39,8 +39,9 @@
 enum { XATTR_USER, XATTR_CIFS_ACL, XATTR_ACL_ACCESS, XATTR_ACL_DEFAULT };
 
 static int cifs_xattr_set(const struct xattr_handler *handler,
-                         struct dentry *dentry, const char *name,
-                         const void *value, size_t size, int flags)
+                         struct dentry *dentry, struct inode *inode,
+                         const char *name, const void *value,
+                         size_t size, int flags)
 {
        int rc = -EOPNOTSUPP;
        unsigned int xid;
@@ -99,12 +100,12 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
                        if (value &&
                            pTcon->ses->server->ops->set_acl)
                                rc = pTcon->ses->server->ops->set_acl(pacl,
-                                               size, d_inode(dentry),
+                                               size, inode,
                                                full_path, CIFS_ACL_DACL);
                        else
                                rc = -EOPNOTSUPP;
                        if (rc == 0) /* force revalidate of the inode */
-                               CIFS_I(d_inode(dentry))->time = 0;
+                               CIFS_I(inode)->time = 0;
                        kfree(pacl);
                }
 #endif /* CONFIG_CIFS_ACL */
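
The ceph and cifs hunks above both track a VFS-wide change: the xattr ->set handler now receives the target inode explicitly next to the (possibly irrelevant) dentry, so handlers stop calling d_inode(dentry). A minimal sketch of a handler with the new signature; the handler body, names and "user." prefix are illustrative:

    /* Illustrative xattr .set handler using the inode that is now passed in. */
    static int example_xattr_set(const struct xattr_handler *handler,
                                 struct dentry *unused, struct inode *inode,
                                 const char *name, const void *value,
                                 size_t size, int flags)
    {
            /* operate on 'inode' directly instead of d_inode(dentry) */
            if (!value)
                    return -EOPNOTSUPP;     /* illustrative: no remove support */
            return 0;
    }

    static const struct xattr_handler example_xattr_handler = {
            .prefix = "user.",
            .set    = example_xattr_set,
    };
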
index 8754e9a..be6e48b 100644 (file)
@@ -936,6 +936,8 @@ static int compat_filldir(struct dir_context *ctx, const char *name, int namlen,
        }
        dirent = buf->previous;
        if (dirent) {
+               if (signal_pending(current))
+                       return -EINTR;
                if (__put_user(offset, &dirent->d_off))
                        goto efault;
        }
@@ -1020,6 +1022,8 @@ static int compat_filldir64(struct dir_context *ctx, const char *name,
        dirent = buf->previous;
 
        if (dirent) {
+               if (signal_pending(current))
+                       return -EINTR;
                if (__put_user_unaligned(offset, &dirent->d_off))
                        goto efault;
        }
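
The two compat getdents callbacks above now return -EINTR when a signal is pending before writing the previous entry's offset, so reading a huge directory can be interrupted instead of looping to completion. Sketch of the shape of such a check in a filldir-style actor (purely illustrative):

    /* Illustrative: an -EINTR bail-out at the top of a filldir-style actor. */
    static int example_filldir(struct dir_context *ctx, const char *name,
                               int namlen, loff_t offset, u64 ino,
                               unsigned int d_type)
    {
            if (signal_pending(current))
                    return -EINTR;  /* getdents returns what it has so far */
            /* ... copy this entry out to the user buffer ... */
            return 0;
    }
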
index a345c16..761495b 100644 (file)
--- a/fs/dax.c
+++ b/fs/dax.c
 #include <linux/pfn_t.h>
 #include <linux/sizes.h>
 
-#define RADIX_DAX_MASK 0xf
-#define RADIX_DAX_SHIFT        4
-#define RADIX_DAX_PTE  (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY)
-#define RADIX_DAX_PMD  (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY)
-#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK)
+/*
+ * We use the lowest available bit in an exceptional entry for locking and the
+ * other two bits to determine the entry type: three special bits in total.
+ */
+#define RADIX_DAX_SHIFT        (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)
+#define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
+#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
+#define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD)
+#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK)
 #define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
 #define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
-               RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE)))
+               RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \
+               RADIX_TREE_EXCEPTIONAL_ENTRY))
+
+/* We choose 4096 entries - same as per-zone page wait tables */
+#define DAX_WAIT_TABLE_BITS 12
+#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
+
+wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
+
+static int __init init_dax_wait_table(void)
+{
+       int i;
+
+       for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
+               init_waitqueue_head(wait_table + i);
+       return 0;
+}
+fs_initcall(init_dax_wait_table);
+
+static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
+                                             pgoff_t index)
+{
+       unsigned long hash = hash_long((unsigned long)mapping ^ index,
+                                      DAX_WAIT_TABLE_BITS);
+       return wait_table + hash;
+}
 
 static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
 {
@@ -87,50 +116,6 @@ struct page *read_dax_sector(struct block_device *bdev, sector_t n)
        return page;
 }
 
-/*
- * dax_clear_sectors() is called from within transaction context from XFS,
- * and hence this means the stack from this point must follow GFP_NOFS
- * semantics for all operations.
- */
-int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size)
-{
-       struct blk_dax_ctl dax = {
-               .sector = _sector,
-               .size = _size,
-       };
-
-       might_sleep();
-       do {
-               long count, sz;
-
-               count = dax_map_atomic(bdev, &dax);
-               if (count < 0)
-                       return count;
-               sz = min_t(long, count, SZ_128K);
-               clear_pmem(dax.addr, sz);
-               dax.size -= sz;
-               dax.sector += sz / 512;
-               dax_unmap_atomic(bdev, &dax);
-               cond_resched();
-       } while (dax.size);
-
-       wmb_pmem();
-       return 0;
-}
-EXPORT_SYMBOL_GPL(dax_clear_sectors);
-
-/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
-static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
-               loff_t pos, loff_t end)
-{
-       loff_t final = end - pos + first; /* The final byte of the buffer */
-
-       if (first > 0)
-               clear_pmem(addr, first);
-       if (final < size)
-               clear_pmem(addr + final, size - final);
-}
-
 static bool buffer_written(struct buffer_head *bh)
 {
        return buffer_mapped(bh) && !buffer_unwritten(bh);
@@ -169,6 +154,9 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
        struct blk_dax_ctl dax = {
                .addr = (void __pmem *) ERR_PTR(-EIO),
        };
+       unsigned blkbits = inode->i_blkbits;
+       sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
+                                                               >> blkbits;
 
        if (rw == READ)
                end = min(end, i_size_read(inode));
@@ -176,7 +164,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
        while (pos < end) {
                size_t len;
                if (pos == max) {
-                       unsigned blkbits = inode->i_blkbits;
                        long page = pos >> PAGE_SHIFT;
                        sector_t block = page << (PAGE_SHIFT - blkbits);
                        unsigned first = pos - (block << blkbits);
@@ -192,6 +179,13 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
                                        bh->b_size = 1 << blkbits;
                                bh_max = pos - first + bh->b_size;
                                bdev = bh->b_bdev;
+                               /*
+                                * We allow uninitialized buffers for writes
+                                * beyond EOF as those cannot race with faults
+                                */
+                               WARN_ON_ONCE(
+                                       (buffer_new(bh) && block < file_blks) ||
+                                       (rw == WRITE && buffer_unwritten(bh)));
                        } else {
                                unsigned done = bh->b_size -
                                                (bh_max - (pos - first));
@@ -211,11 +205,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
                                        rc = map_len;
                                        break;
                                }
-                               if (buffer_unwritten(bh) || buffer_new(bh)) {
-                                       dax_new_buf(dax.addr, map_len, first,
-                                                       pos, end);
-                                       need_wmb = true;
-                               }
                                dax.addr += first;
                                size = map_len - first;
                        }
@@ -276,15 +265,8 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
        memset(&bh, 0, sizeof(bh));
        bh.b_bdev = inode->i_sb->s_bdev;
 
-       if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
-               struct address_space *mapping = inode->i_mapping;
+       if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
                inode_lock(inode);
-               retval = filemap_write_and_wait_range(mapping, pos, end - 1);
-               if (retval) {
-                       inode_unlock(inode);
-                       goto out;
-               }
-       }
 
        /* Protects against truncate */
        if (!(flags & DIO_SKIP_DIO_COUNT))
@@ -305,11 +287,267 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
 
        if (!(flags & DIO_SKIP_DIO_COUNT))
                inode_dio_end(inode);
- out:
        return retval;
 }
 EXPORT_SYMBOL_GPL(dax_do_io);
 
+/*
+ * DAX radix tree locking
+ */
+struct exceptional_entry_key {
+       struct address_space *mapping;
+       unsigned long index;
+};
+
+struct wait_exceptional_entry_queue {
+       wait_queue_t wait;
+       struct exceptional_entry_key key;
+};
+
+static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
+                                      int sync, void *keyp)
+{
+       struct exceptional_entry_key *key = keyp;
+       struct wait_exceptional_entry_queue *ewait =
+               container_of(wait, struct wait_exceptional_entry_queue, wait);
+
+       if (key->mapping != ewait->key.mapping ||
+           key->index != ewait->key.index)
+               return 0;
+       return autoremove_wake_function(wait, mode, sync, NULL);
+}
+
+/*
+ * Check whether the given slot is locked. The function must be called with
+ * mapping->tree_lock held
+ */
+static inline int slot_locked(struct address_space *mapping, void **slot)
+{
+       unsigned long entry = (unsigned long)
+               radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+       return entry & RADIX_DAX_ENTRY_LOCK;
+}
+
+/*
+ * Mark the given slot as locked. The function must be called with
+ * mapping->tree_lock held
+ */
+static inline void *lock_slot(struct address_space *mapping, void **slot)
+{
+       unsigned long entry = (unsigned long)
+               radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+
+       entry |= RADIX_DAX_ENTRY_LOCK;
+       radix_tree_replace_slot(slot, (void *)entry);
+       return (void *)entry;
+}
+
+/*
+ * Mark the given slot as unlocked. The function must be called with
+ * mapping->tree_lock held
+ */
+static inline void *unlock_slot(struct address_space *mapping, void **slot)
+{
+       unsigned long entry = (unsigned long)
+               radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+
+       entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
+       radix_tree_replace_slot(slot, (void *)entry);
+       return (void *)entry;
+}
+
+/*
+ * Look up an entry in the radix tree, wait for it to become unlocked if it
+ * is an exceptional entry, and return it. The caller must call
+ * put_unlocked_mapping_entry() if it decides not to lock the entry, or
+ * put_locked_mapping_entry() once it has locked the entry and wants to
+ * unlock it.
+ *
+ * The function must be called with mapping->tree_lock held.
+ */
+static void *get_unlocked_mapping_entry(struct address_space *mapping,
+                                       pgoff_t index, void ***slotp)
+{
+       void *ret, **slot;
+       struct wait_exceptional_entry_queue ewait;
+       wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
+
+       init_wait(&ewait.wait);
+       ewait.wait.func = wake_exceptional_entry_func;
+       ewait.key.mapping = mapping;
+       ewait.key.index = index;
+
+       for (;;) {
+               ret = __radix_tree_lookup(&mapping->page_tree, index, NULL,
+                                         &slot);
+               if (!ret || !radix_tree_exceptional_entry(ret) ||
+                   !slot_locked(mapping, slot)) {
+                       if (slotp)
+                               *slotp = slot;
+                       return ret;
+               }
+               prepare_to_wait_exclusive(wq, &ewait.wait,
+                                         TASK_UNINTERRUPTIBLE);
+               spin_unlock_irq(&mapping->tree_lock);
+               schedule();
+               finish_wait(wq, &ewait.wait);
+               spin_lock_irq(&mapping->tree_lock);
+       }
+}
+
+/*
+ * Find the radix tree entry at the given index. If it points to a page,
+ * return with the page locked. If it points to an exceptional entry, return
+ * with the radix tree entry locked. If the radix tree doesn't contain the
+ * given index, create an empty exceptional entry for the index and return
+ * with it locked.
+ *
+ * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
+ * persistent memory the benefit is doubtful. We can add that later if we can
+ * show it helps.
+ */
+static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index)
+{
+       void *ret, **slot;
+
+restart:
+       spin_lock_irq(&mapping->tree_lock);
+       ret = get_unlocked_mapping_entry(mapping, index, &slot);
+       /* No entry for given index? Make sure radix tree is big enough. */
+       if (!ret) {
+               int err;
+
+               spin_unlock_irq(&mapping->tree_lock);
+               err = radix_tree_preload(
+                               mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
+               if (err)
+                       return ERR_PTR(err);
+               ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
+                              RADIX_DAX_ENTRY_LOCK);
+               spin_lock_irq(&mapping->tree_lock);
+               err = radix_tree_insert(&mapping->page_tree, index, ret);
+               radix_tree_preload_end();
+               if (err) {
+                       spin_unlock_irq(&mapping->tree_lock);
+                       /* Someone already created the entry? */
+                       if (err == -EEXIST)
+                               goto restart;
+                       return ERR_PTR(err);
+               }
+               /* Good, we have inserted empty locked entry into the tree. */
+               mapping->nrexceptional++;
+               spin_unlock_irq(&mapping->tree_lock);
+               return ret;
+       }
+       /* Normal page in radix tree? */
+       if (!radix_tree_exceptional_entry(ret)) {
+               struct page *page = ret;
+
+               get_page(page);
+               spin_unlock_irq(&mapping->tree_lock);
+               lock_page(page);
+               /* Page got truncated? Retry... */
+               if (unlikely(page->mapping != mapping)) {
+                       unlock_page(page);
+                       put_page(page);
+                       goto restart;
+               }
+               return page;
+       }
+       ret = lock_slot(mapping, slot);
+       spin_unlock_irq(&mapping->tree_lock);
+       return ret;
+}
+
+void dax_wake_mapping_entry_waiter(struct address_space *mapping,
+                                  pgoff_t index, bool wake_all)
+{
+       wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
+
+       /*
+        * Checking for locked entry and prepare_to_wait_exclusive() happens
+        * under mapping->tree_lock, ditto for entry handling in our callers.
+        * So at this point all tasks that could have seen our entry locked
+        * must be in the waitqueue and the following check will see them.
+        */
+       if (waitqueue_active(wq)) {
+               struct exceptional_entry_key key;
+
+               key.mapping = mapping;
+               key.index = index;
+               __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
+       }
+}
+
+void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
+{
+       void *ret, **slot;
+
+       spin_lock_irq(&mapping->tree_lock);
+       ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
+       if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) ||
+                        !slot_locked(mapping, slot))) {
+               spin_unlock_irq(&mapping->tree_lock);
+               return;
+       }
+       unlock_slot(mapping, slot);
+       spin_unlock_irq(&mapping->tree_lock);
+       dax_wake_mapping_entry_waiter(mapping, index, false);
+}
+
+static void put_locked_mapping_entry(struct address_space *mapping,
+                                    pgoff_t index, void *entry)
+{
+       if (!radix_tree_exceptional_entry(entry)) {
+               unlock_page(entry);
+               put_page(entry);
+       } else {
+               dax_unlock_mapping_entry(mapping, index);
+       }
+}
+
+/*
+ * Called when we are done with a radix tree entry we looked up via
+ * get_unlocked_mapping_entry() and did not lock in the end.
+ */
+static void put_unlocked_mapping_entry(struct address_space *mapping,
+                                      pgoff_t index, void *entry)
+{
+       if (!radix_tree_exceptional_entry(entry))
+               return;
+
+       /* We have to wake up next waiter for the radix tree entry lock */
+       dax_wake_mapping_entry_waiter(mapping, index, false);
+}
+
+/*
+ * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
+ * entry to get unlocked before deleting it.
+ */
+int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
+{
+       void *entry;
+
+       spin_lock_irq(&mapping->tree_lock);
+       entry = get_unlocked_mapping_entry(mapping, index, NULL);
+       /*
+        * This gets called from truncate / punch_hole path. As such, the caller
+        * must hold locks protecting against concurrent modifications of the
+        * radix tree (usually fs-private i_mmap_sem for writing). Since the
+        * caller has seen exceptional entry for this index, we better find it
+        * at that index as well...
+        */
+       if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
+               spin_unlock_irq(&mapping->tree_lock);
+               return 0;
+       }
+       radix_tree_delete(&mapping->page_tree, index);
+       mapping->nrexceptional--;
+       spin_unlock_irq(&mapping->tree_lock);
+       dax_wake_mapping_entry_waiter(mapping, index, true);
+
+       return 1;
+}
+
 /*
  * The user has performed a load from a hole in the file.  Allocating
  * a new page in the file would cause excessive storage usage for
@@ -318,24 +556,24 @@ EXPORT_SYMBOL_GPL(dax_do_io);
  * otherwise it will simply fall out of the page cache under memory
  * pressure without ever having been dirtied.
  */
-static int dax_load_hole(struct address_space *mapping, struct page *page,
-                                                       struct vm_fault *vmf)
+static int dax_load_hole(struct address_space *mapping, void *entry,
+                        struct vm_fault *vmf)
 {
-       unsigned long size;
-       struct inode *inode = mapping->host;
-       if (!page)
-               page = find_or_create_page(mapping, vmf->pgoff,
-                                               GFP_KERNEL | __GFP_ZERO);
-       if (!page)
-               return VM_FAULT_OOM;
-       /* Recheck i_size under page lock to avoid truncate race */
-       size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-       if (vmf->pgoff >= size) {
-               unlock_page(page);
-               put_page(page);
-               return VM_FAULT_SIGBUS;
+       struct page *page;
+
+       /* Hole page already exists? Return it...  */
+       if (!radix_tree_exceptional_entry(entry)) {
+               vmf->page = entry;
+               return VM_FAULT_LOCKED;
        }
 
+       /* This will replace locked radix tree entry with a hole page */
+       page = find_or_create_page(mapping, vmf->pgoff,
+                                  vmf->gfp_mask | __GFP_ZERO);
+       if (!page) {
+               put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+               return VM_FAULT_OOM;
+       }
        vmf->page = page;
        return VM_FAULT_LOCKED;
 }
@@ -359,77 +597,72 @@ static int copy_user_bh(struct page *to, struct inode *inode,
        return 0;
 }
 
-#define NO_SECTOR -1
 #define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT))
 
-static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
-               sector_t sector, bool pmd_entry, bool dirty)
+static void *dax_insert_mapping_entry(struct address_space *mapping,
+                                     struct vm_fault *vmf,
+                                     void *entry, sector_t sector)
 {
        struct radix_tree_root *page_tree = &mapping->page_tree;
-       pgoff_t pmd_index = DAX_PMD_INDEX(index);
-       int type, error = 0;
-       void *entry;
+       int error = 0;
+       bool hole_fill = false;
+       void *new_entry;
+       pgoff_t index = vmf->pgoff;
 
-       WARN_ON_ONCE(pmd_entry && !dirty);
-       if (dirty)
+       if (vmf->flags & FAULT_FLAG_WRITE)
                __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 
-       spin_lock_irq(&mapping->tree_lock);
-
-       entry = radix_tree_lookup(page_tree, pmd_index);
-       if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) {
-               index = pmd_index;
-               goto dirty;
+       /* Replacing hole page with block mapping? */
+       if (!radix_tree_exceptional_entry(entry)) {
+               hole_fill = true;
+               /*
+                * Unmap the page now before we remove it from page cache below.
+                * The page is locked so it cannot be faulted in again.
+                */
+               unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
+                                   PAGE_SIZE, 0);
+               error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
+               if (error)
+                       return ERR_PTR(error);
        }
 
-       entry = radix_tree_lookup(page_tree, index);
-       if (entry) {
-               type = RADIX_DAX_TYPE(entry);
-               if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
-                                       type != RADIX_DAX_PMD)) {
-                       error = -EIO;
+       spin_lock_irq(&mapping->tree_lock);
+       new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) |
+                      RADIX_DAX_ENTRY_LOCK);
+       if (hole_fill) {
+               __delete_from_page_cache(entry, NULL);
+               /* Drop pagecache reference */
+               put_page(entry);
+               error = radix_tree_insert(page_tree, index, new_entry);
+               if (error) {
+                       new_entry = ERR_PTR(error);
                        goto unlock;
                }
+               mapping->nrexceptional++;
+       } else {
+               void **slot;
+               void *ret;
 
-               if (!pmd_entry || type == RADIX_DAX_PMD)
-                       goto dirty;
-
-               /*
-                * We only insert dirty PMD entries into the radix tree.  This
-                * means we don't need to worry about removing a dirty PTE
-                * entry and inserting a clean PMD entry, thus reducing the
-                * range we would flush with a follow-up fsync/msync call.
-                */
-               radix_tree_delete(&mapping->page_tree, index);
-               mapping->nrexceptional--;
-       }
-
-       if (sector == NO_SECTOR) {
-               /*
-                * This can happen during correct operation if our pfn_mkwrite
-                * fault raced against a hole punch operation.  If this
-                * happens the pte that was hole punched will have been
-                * unmapped and the radix tree entry will have been removed by
-                * the time we are called, but the call will still happen.  We
-                * will return all the way up to wp_pfn_shared(), where the
-                * pte_same() check will fail, eventually causing page fault
-                * to be retried by the CPU.
-                */
-               goto unlock;
+               ret = __radix_tree_lookup(page_tree, index, NULL, &slot);
+               WARN_ON_ONCE(ret != entry);
+               radix_tree_replace_slot(slot, new_entry);
        }
-
-       error = radix_tree_insert(page_tree, index,
-                       RADIX_DAX_ENTRY(sector, pmd_entry));
-       if (error)
-               goto unlock;
-
-       mapping->nrexceptional++;
- dirty:
-       if (dirty)
+       if (vmf->flags & FAULT_FLAG_WRITE)
                radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
  unlock:
        spin_unlock_irq(&mapping->tree_lock);
-       return error;
+       if (hole_fill) {
+               radix_tree_preload_end();
+               /*
+                * We don't need hole page anymore, it has been replaced with
+                * locked radix tree entry now.
+                */
+               if (mapping->a_ops->freepage)
+                       mapping->a_ops->freepage(entry);
+               unlock_page(entry);
+               put_page(entry);
+       }
+       return new_entry;
 }
 
 static int dax_writeback_one(struct block_device *bdev,
@@ -555,56 +788,29 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 }
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
-static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
+static int dax_insert_mapping(struct address_space *mapping,
+                       struct buffer_head *bh, void **entryp,
                        struct vm_area_struct *vma, struct vm_fault *vmf)
 {
        unsigned long vaddr = (unsigned long)vmf->virtual_address;
-       struct address_space *mapping = inode->i_mapping;
        struct block_device *bdev = bh->b_bdev;
        struct blk_dax_ctl dax = {
-               .sector = to_sector(bh, inode),
+               .sector = to_sector(bh, mapping->host),
                .size = bh->b_size,
        };
-       pgoff_t size;
-       int error;
-
-       i_mmap_lock_read(mapping);
-
-       /*
-        * Check truncate didn't happen while we were allocating a block.
-        * If it did, this block may or may not be still allocated to the
-        * file.  We can't tell the filesystem to free it because we can't
-        * take i_mutex here.  In the worst case, the file still has blocks
-        * allocated past the end of the file.
-        */
-       size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-       if (unlikely(vmf->pgoff >= size)) {
-               error = -EIO;
-               goto out;
-       }
+       void *ret;
+       void *entry = *entryp;
 
-       if (dax_map_atomic(bdev, &dax) < 0) {
-               error = PTR_ERR(dax.addr);
-               goto out;
-       }
-
-       if (buffer_unwritten(bh) || buffer_new(bh)) {
-               clear_pmem(dax.addr, PAGE_SIZE);
-               wmb_pmem();
-       }
+       if (dax_map_atomic(bdev, &dax) < 0)
+               return PTR_ERR(dax.addr);
        dax_unmap_atomic(bdev, &dax);
 
-       error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
-                       vmf->flags & FAULT_FLAG_WRITE);
-       if (error)
-               goto out;
-
-       error = vm_insert_mixed(vma, vaddr, dax.pfn);
+       ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector);
+       if (IS_ERR(ret))
+               return PTR_ERR(ret);
+       *entryp = ret;
 
- out:
-       i_mmap_unlock_read(mapping);
-
-       return error;
+       return vm_insert_mixed(vma, vaddr, dax.pfn);
 }
 
 /**
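
The DAX rework above replaces the old dirty-tracking-only radix entries with per-index entries that double as a lock: the sector sits above three low flag bits (the exceptional marker, a PTE/PMD type bit and a lock bit, with RADIX_DAX_ENTRY_LOCK defined outside these hunks), and fault paths serialize on that bit via grab_mapping_entry()/put_locked_mapping_entry() instead of taking i_mmap_lock_read(). A rough sketch of the calling pattern, not the literal __dax_fault() body, with error handling trimmed:

    /* Illustrative calling pattern for the new per-entry DAX lock. */
    static int dax_fault_sketch(struct address_space *mapping,
                                struct vm_area_struct *vma, struct vm_fault *vmf,
                                struct buffer_head *bh)
    {
            void *entry;
            int error;

            entry = grab_mapping_entry(mapping, vmf->pgoff); /* returns locked */
            if (IS_ERR(entry))
                    return VM_FAULT_SIGBUS;

            /* installs the block mapping and swaps the (hole) entry in place */
            error = dax_insert_mapping(mapping, bh, &entry, vma, vmf);

            put_locked_mapping_entry(mapping, vmf->pgoff, entry); /* wakes waiters */
            return error ? VM_FAULT_SIGBUS : VM_FAULT_NOPAGE;
    }
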
@@ -612,24 +818,18 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
  * @vma: The virtual memory area where the fault occurred
  * @vmf: The description of the fault
  * @get_block: The filesystem method used to translate file offsets to blocks
- * @complete_unwritten: The filesystem method used to convert unwritten blocks
- *     to written so the data written to them is exposed. This is required for
- *     required by write faults for filesystems that will return unwritten
- *     extent mappings from @get_block, but it is optional for reads as
- *     dax_insert_mapping() will always zero unwritten blocks. If the fs does
- *     not support unwritten extents, the it should pass NULL.
  *
  * When a page fault occurs, filesystems may call this helper in their
  * fault handler for DAX files. __dax_fault() assumes the caller has done all
  * the necessary locking for the page fault to proceed successfully.
  */
 int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-                       get_block_t get_block, dax_iodone_t complete_unwritten)
+                       get_block_t get_block)
 {
        struct file *file = vma->vm_file;
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;
-       struct page *page;
+       void *entry;
        struct buffer_head bh;
        unsigned long vaddr = (unsigned long)vmf->virtual_address;
        unsigned blkbits = inode->i_blkbits;
@@ -638,6 +838,11 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
        int error;
        int major = 0;
 
+       /*
+        * Check whether offset isn't beyond end of file now. Caller is supposed
+        * to hold locks serializing us with truncate / punch hole so this is
+        * a reliable test.
+        */
        size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
        if (vmf->pgoff >= size)
                return VM_FAULT_SIGBUS;
@@ -647,49 +852,17 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
        bh.b_bdev = inode->i_sb->s_bdev;
        bh.b_size = PAGE_SIZE;
 
- repeat:
-       page = find_get_page(mapping, vmf->pgoff);
-       if (page) {
-               if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
-                       put_page(page);
-                       return VM_FAULT_RETRY;
-               }
-               if (unlikely(page->mapping != mapping)) {
-                       unlock_page(page);
-                       put_page(page);
-                       goto repeat;
-               }
-               size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-               if (unlikely(vmf->pgoff >= size)) {
-                       /*
-                        * We have a struct page covering a hole in the file
-                        * from a read fault and we've raced with a truncate
-                        */
-                       error = -EIO;
-                       goto unlock_page;
-               }
+       entry = grab_mapping_entry(mapping, vmf->pgoff);
+       if (IS_ERR(entry)) {
+               error = PTR_ERR(entry);
+               goto out;
        }
 
        error = get_block(inode, block, &bh, 0);
        if (!error && (bh.b_size < PAGE_SIZE))
                error = -EIO;           /* fs corruption? */
        if (error)
-               goto unlock_page;
-
-       if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
-               if (vmf->flags & FAULT_FLAG_WRITE) {
-                       error = get_block(inode, block, &bh, 1);
-                       count_vm_event(PGMAJFAULT);
-                       mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
-                       major = VM_FAULT_MAJOR;
-                       if (!error && (bh.b_size < PAGE_SIZE))
-                               error = -EIO;
-                       if (error)
-                               goto unlock_page;
-               } else {
-                       return dax_load_hole(mapping, page, vmf);
-               }
-       }
+               goto unlock_entry;
 
        if (vmf->cow_page) {
                struct page *new_page = vmf->cow_page;
@@ -698,53 +871,35 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                else
                        clear_user_highpage(new_page, vaddr);
                if (error)
-                       goto unlock_page;
-               vmf->page = page;
-               if (!page) {
-                       i_mmap_lock_read(mapping);
-                       /* Check we didn't race with truncate */
-                       size = (i_size_read(inode) + PAGE_SIZE - 1) >>
-                                                               PAGE_SHIFT;
-                       if (vmf->pgoff >= size) {
-                               i_mmap_unlock_read(mapping);
-                               error = -EIO;
-                               goto out;
-                       }
+                       goto unlock_entry;
+               if (!radix_tree_exceptional_entry(entry)) {
+                       vmf->page = entry;
+                       return VM_FAULT_LOCKED;
                }
-               return VM_FAULT_LOCKED;
+               vmf->entry = entry;
+               return VM_FAULT_DAX_LOCKED;
        }
 
-       /* Check we didn't race with a read fault installing a new page */
-       if (!page && major)
-               page = find_lock_page(mapping, vmf->pgoff);
-
-       if (page) {
-               unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
-                                                       PAGE_SIZE, 0);
-               delete_from_page_cache(page);
-               unlock_page(page);
-               put_page(page);
-               page = NULL;
-       }
-
-       /*
-        * If we successfully insert the new mapping over an unwritten extent,
-        * we need to ensure we convert the unwritten extent. If there is an
-        * error inserting the mapping, the filesystem needs to leave it as
-        * unwritten to prevent exposure of the stale underlying data to
-        * userspace, but we still need to call the completion function so
-        * the private resources on the mapping buffer can be released. We
-        * indicate what the callback should do via the uptodate variable, same
-        * as for normal BH based IO completions.
-        */
-       error = dax_insert_mapping(inode, &bh, vma, vmf);
-       if (buffer_unwritten(&bh)) {
-               if (complete_unwritten)
-                       complete_unwritten(&bh, !error);
-               else
-                       WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
+       if (!buffer_mapped(&bh)) {
+               if (vmf->flags & FAULT_FLAG_WRITE) {
+                       error = get_block(inode, block, &bh, 1);
+                       count_vm_event(PGMAJFAULT);
+                       mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+                       major = VM_FAULT_MAJOR;
+                       if (!error && (bh.b_size < PAGE_SIZE))
+                               error = -EIO;
+                       if (error)
+                               goto unlock_entry;
+               } else {
+                       return dax_load_hole(mapping, entry, vmf);
+               }
        }
 
+       /* Filesystem should not return unwritten buffers to us! */
+       WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
+       error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf);
+ unlock_entry:
+       put_locked_mapping_entry(mapping, vmf->pgoff, entry);
  out:
        if (error == -ENOMEM)
                return VM_FAULT_OOM | major;
@@ -752,13 +907,6 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
        if ((error < 0) && (error != -EBUSY))
                return VM_FAULT_SIGBUS | major;
        return VM_FAULT_NOPAGE | major;
-
- unlock_page:
-       if (page) {
-               unlock_page(page);
-               put_page(page);
-       }
-       goto out;
 }
 EXPORT_SYMBOL(__dax_fault);
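
With complete_unwritten gone from the signature above, a caller now passes only its
block-mapping routine. The following is a minimal, illustrative sketch of a filesystem
fault handler using the new signature; foo_dax_fault(), foo_get_block() and foo_dax_sem
are hypothetical names, and the real callers are updated in the fs/ext2 and fs/ext4
hunks further down in this changeset.

static int foo_get_block(struct inode *inode, sector_t iblock,
			 struct buffer_head *bh_result, int create);

static DECLARE_RWSEM(foo_dax_sem);	/* stand-in for the fs's truncate lock */

static int foo_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	int ret;

	/*
	 * __dax_fault() expects the caller to hold whatever lock
	 * serializes faults against truncate (ext2 uses ei->dax_sem).
	 */
	down_read(&foo_dax_sem);
	ret = __dax_fault(vma, vmf, foo_get_block);
	up_read(&foo_dax_sem);
	return ret;
}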
 
@@ -772,7 +920,7 @@ EXPORT_SYMBOL(__dax_fault);
  * fault handler for DAX files.
  */
 int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-             get_block_t get_block, dax_iodone_t complete_unwritten)
+             get_block_t get_block)
 {
        int result;
        struct super_block *sb = file_inode(vma->vm_file)->i_sb;
@@ -781,7 +929,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                sb_start_pagefault(sb);
                file_update_time(vma->vm_file);
        }
-       result = __dax_fault(vma, vmf, get_block, complete_unwritten);
+       result = __dax_fault(vma, vmf, get_block);
        if (vmf->flags & FAULT_FLAG_WRITE)
                sb_end_pagefault(sb);
 
@@ -789,7 +937,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 }
 EXPORT_SYMBOL_GPL(dax_fault);
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
 /*
  * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
  * more often than one might expect in the below function.
@@ -815,8 +963,7 @@ static void __dax_dbg(struct buffer_head *bh, unsigned long address,
 #define dax_pmd_dbg(bh, address, reason)       __dax_dbg(bh, address, reason, "dax_pmd")
 
 int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
-               pmd_t *pmd, unsigned int flags, get_block_t get_block,
-               dax_iodone_t complete_unwritten)
+               pmd_t *pmd, unsigned int flags, get_block_t get_block)
 {
        struct file *file = vma->vm_file;
        struct address_space *mapping = file->f_mapping;
@@ -828,7 +975,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
        struct block_device *bdev;
        pgoff_t size, pgoff;
        sector_t block;
-       int error, result = 0;
+       int result = 0;
        bool alloc = false;
 
        /* dax pmd mappings require pfn_t_devmap() */
@@ -875,6 +1022,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
                if (get_block(inode, block, &bh, 1) != 0)
                        return VM_FAULT_SIGBUS;
                alloc = true;
+               WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
        }
 
        bdev = bh.b_bdev;
@@ -900,26 +1048,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
                truncate_pagecache_range(inode, lstart, lend);
        }
 
-       i_mmap_lock_read(mapping);
-
-       /*
-        * If a truncate happened while we were allocating blocks, we may
-        * leave blocks allocated to the file that are beyond EOF.  We can't
-        * take i_mutex here, so just leave them hanging; they'll be freed
-        * when the file is deleted.
-        */
-       size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-       if (pgoff >= size) {
-               result = VM_FAULT_SIGBUS;
-               goto out;
-       }
-       if ((pgoff | PG_PMD_COLOUR) >= size) {
-               dax_pmd_dbg(&bh, address,
-                               "offset + huge page size > file size");
-               goto fallback;
-       }
-
-       if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
+       if (!write && !buffer_mapped(&bh)) {
                spinlock_t *ptl;
                pmd_t entry;
                struct page *zero_page = get_huge_zero_page();
@@ -954,8 +1083,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
                long length = dax_map_atomic(bdev, &dax);
 
                if (length < 0) {
-                       result = VM_FAULT_SIGBUS;
-                       goto out;
+                       dax_pmd_dbg(&bh, address, "dax-error fallback");
+                       goto fallback;
                }
                if (length < PMD_SIZE) {
                        dax_pmd_dbg(&bh, address, "dax-length too small");
@@ -973,14 +1102,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
                        dax_pmd_dbg(&bh, address, "pfn not in memmap");
                        goto fallback;
                }
-
-               if (buffer_unwritten(&bh) || buffer_new(&bh)) {
-                       clear_pmem(dax.addr, PMD_SIZE);
-                       wmb_pmem();
-                       count_vm_event(PGMAJFAULT);
-                       mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
-                       result |= VM_FAULT_MAJOR;
-               }
                dax_unmap_atomic(bdev, &dax);
 
                /*
@@ -999,13 +1120,10 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
                 * the write to insert a dirty entry.
                 */
                if (write) {
-                       error = dax_radix_entry(mapping, pgoff, dax.sector,
-                                       true, true);
-                       if (error) {
-                               dax_pmd_dbg(&bh, address,
-                                               "PMD radix insertion failed");
-                               goto fallback;
-                       }
+                       /*
+                        * We should insert radix-tree entry and dirty it here.
+                        * For now this is broken...
+                        */
                }
 
                dev_dbg(part_to_dev(bdev->bd_part),
@@ -1018,11 +1136,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
        }
 
  out:
-       i_mmap_unlock_read(mapping);
-
-       if (buffer_unwritten(&bh))
-               complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
-
        return result;
 
  fallback:
@@ -1042,8 +1155,7 @@ EXPORT_SYMBOL_GPL(__dax_pmd_fault);
  * pmd_fault handler for DAX files.
  */
 int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
-                       pmd_t *pmd, unsigned int flags, get_block_t get_block,
-                       dax_iodone_t complete_unwritten)
+                       pmd_t *pmd, unsigned int flags, get_block_t get_block)
 {
        int result;
        struct super_block *sb = file_inode(vma->vm_file)->i_sb;
@@ -1052,8 +1164,7 @@ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
                sb_start_pagefault(sb);
                file_update_time(vma->vm_file);
        }
-       result = __dax_pmd_fault(vma, address, pmd, flags, get_block,
-                               complete_unwritten);
+       result = __dax_pmd_fault(vma, address, pmd, flags, get_block);
        if (flags & FAULT_FLAG_WRITE)
                sb_end_pagefault(sb);
 
@@ -1070,27 +1181,59 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault);
 int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
        struct file *file = vma->vm_file;
-       int error;
-
-       /*
-        * We pass NO_SECTOR to dax_radix_entry() because we expect that a
-        * RADIX_DAX_PTE entry already exists in the radix tree from a
-        * previous call to __dax_fault().  We just want to look up that PTE
-        * entry using vmf->pgoff and make sure the dirty tag is set.  This
-        * saves us from having to make a call to get_block() here to look
-        * up the sector.
-        */
-       error = dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false,
-                       true);
+       struct address_space *mapping = file->f_mapping;
+       void *entry;
+       pgoff_t index = vmf->pgoff;
 
-       if (error == -ENOMEM)
-               return VM_FAULT_OOM;
-       if (error)
-               return VM_FAULT_SIGBUS;
+       spin_lock_irq(&mapping->tree_lock);
+       entry = get_unlocked_mapping_entry(mapping, index, NULL);
+       if (!entry || !radix_tree_exceptional_entry(entry))
+               goto out;
+       radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
+       put_unlocked_mapping_entry(mapping, index, entry);
+out:
+       spin_unlock_irq(&mapping->tree_lock);
        return VM_FAULT_NOPAGE;
 }
 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
 
+static bool dax_range_is_aligned(struct block_device *bdev,
+                                unsigned int offset, unsigned int length)
+{
+       unsigned short sector_size = bdev_logical_block_size(bdev);
+
+       if (!IS_ALIGNED(offset, sector_size))
+               return false;
+       if (!IS_ALIGNED(length, sector_size))
+               return false;
+
+       return true;
+}
+
+int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
+               unsigned int offset, unsigned int length)
+{
+       struct blk_dax_ctl dax = {
+               .sector         = sector,
+               .size           = PAGE_SIZE,
+       };
+
+       if (dax_range_is_aligned(bdev, offset, length)) {
+               sector_t start_sector = dax.sector + (offset >> 9);
+
+               return blkdev_issue_zeroout(bdev, start_sector,
+                               length >> 9, GFP_NOFS, true);
+       } else {
+               if (dax_map_atomic(bdev, &dax) < 0)
+                       return PTR_ERR(dax.addr);
+               clear_pmem(dax.addr + offset, length);
+               wmb_pmem();
+               dax_unmap_atomic(bdev, &dax);
+       }
+       return 0;
+}
+EXPORT_SYMBOL_GPL(__dax_zero_page_range);
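
A brief usage sketch for the new helper above: when offset and length are both multiples
of the device's logical block size, zeroing is handed to blkdev_issue_zeroout(); otherwise
the page is mapped and cleared through clear_pmem(). The function name foo_zero_tail and
the numeric values are invented purely for illustration.

static int foo_zero_tail(struct block_device *bdev, sector_t sector)
{
	/*
	 * With a 512-byte logical block size, offset 512 and length 1024
	 * are both aligned, so this call becomes
	 *   blkdev_issue_zeroout(bdev, sector + 1, 2, GFP_NOFS, true);
	 * an unaligned request (say offset 100, length 300) would take
	 * the clear_pmem() path on the mapped page instead.
	 */
	return __dax_zero_page_range(bdev, sector, 512, 1024);
}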
+
 /**
  * dax_zero_page_range - zero a range within a page of a DAX file
  * @inode: The file being truncated
@@ -1102,12 +1245,6 @@ EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
  * page in a DAX file.  This is intended for hole-punch operations.  If
  * you are truncating a file, the helper function dax_truncate_page() may be
  * more convenient.
- *
- * We work in terms of PAGE_SIZE here for commonality with
- * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
- * took care of disposing of the unnecessary blocks.  Even if the filesystem
- * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
- * since the file might be mmapped.
  */
 int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
                                                        get_block_t get_block)
@@ -1126,23 +1263,11 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
        bh.b_bdev = inode->i_sb->s_bdev;
        bh.b_size = PAGE_SIZE;
        err = get_block(inode, index, &bh, 0);
-       if (err < 0)
+       if (err < 0 || !buffer_written(&bh))
                return err;
-       if (buffer_written(&bh)) {
-               struct block_device *bdev = bh.b_bdev;
-               struct blk_dax_ctl dax = {
-                       .sector = to_sector(&bh, inode),
-                       .size = PAGE_SIZE,
-               };
 
-               if (dax_map_atomic(bdev, &dax) < 0)
-                       return PTR_ERR(dax.addr);
-               clear_pmem(dax.addr + offset, length);
-               wmb_pmem();
-               dax_unmap_atomic(bdev, &dax);
-       }
-
-       return 0;
+       return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
+                       offset, length);
 }
 EXPORT_SYMBOL_GPL(dax_zero_page_range);
 
@@ -1154,12 +1279,6 @@ EXPORT_SYMBOL_GPL(dax_zero_page_range);
  *
  * Similar to block_truncate_page(), this function can be called by a
  * filesystem when it is truncating a DAX file to handle the partial page.
- *
- * We work in terms of PAGE_SIZE here for commonality with
- * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
- * took care of disposing of the unnecessary blocks.  Even if the filesystem
- * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
- * since the file might be mmapped.
  */
 int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
 {
index c622872..ad4a542 100644 (file)
@@ -1670,8 +1670,7 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name)
        struct qstr q;
 
        q.name = name;
-       q.len = strlen(name);
-       q.hash = full_name_hash(q.name, q.len);
+       q.hash_len = hashlen_string(name);
        return d_alloc(parent, &q);
 }
 EXPORT_SYMBOL(d_alloc_name);
index 0b2954d..37c134a 100644 (file)
@@ -95,8 +95,6 @@ static struct ctl_table pty_root_table[] = {
 
 static DEFINE_MUTEX(allocated_ptys_lock);
 
-static struct vfsmount *devpts_mnt;
-
 struct pts_mount_opts {
        int setuid;
        int setgid;
@@ -104,7 +102,7 @@ struct pts_mount_opts {
        kgid_t   gid;
        umode_t mode;
        umode_t ptmxmode;
-       int newinstance;
+       int reserve;
        int max;
 };
 
@@ -117,11 +115,9 @@ static const match_table_t tokens = {
        {Opt_uid, "uid=%u"},
        {Opt_gid, "gid=%u"},
        {Opt_mode, "mode=%o"},
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
        {Opt_ptmxmode, "ptmxmode=%o"},
        {Opt_newinstance, "newinstance"},
        {Opt_max, "max=%d"},
-#endif
        {Opt_err, NULL}
 };
 
@@ -137,15 +133,48 @@ static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb)
        return sb->s_fs_info;
 }
 
-static inline struct super_block *pts_sb_from_inode(struct inode *inode)
+struct pts_fs_info *devpts_acquire(struct file *filp)
 {
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
-       if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
-               return inode->i_sb;
-#endif
-       if (!devpts_mnt)
-               return NULL;
-       return devpts_mnt->mnt_sb;
+       struct pts_fs_info *result;
+       struct path path;
+       struct super_block *sb;
+       int err;
+
+       path = filp->f_path;
+       path_get(&path);
+
+       /* Has the devpts filesystem already been found? */
+       sb = path.mnt->mnt_sb;
+       if (sb->s_magic != DEVPTS_SUPER_MAGIC) {
+               /* Is a devpts filesystem at "pts" in the same directory? */
+               err = path_pts(&path);
+               if (err) {
+                       result = ERR_PTR(err);
+                       goto out;
+               }
+
+               /* Is the path the root of a devpts filesystem? */
+               result = ERR_PTR(-ENODEV);
+               sb = path.mnt->mnt_sb;
+               if ((sb->s_magic != DEVPTS_SUPER_MAGIC) ||
+                   (path.mnt->mnt_root != sb->s_root))
+                       goto out;
+       }
+
+       /*
+        * pty code needs to hold extra references in case of last /dev/tty close
+        */
+       atomic_inc(&sb->s_active);
+       result = DEVPTS_SB(sb);
+
+out:
+       path_put(&path);
+       return result;
+}
+
+void devpts_release(struct pts_fs_info *fsi)
+{
+       deactivate_super(fsi->sb);
 }
 
 #define PARSE_MOUNT    0
@@ -154,9 +183,7 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode)
 /*
  * parse_mount_options():
  *     Set @opts to mount options specified in @data. If an option is not
- *     specified in @data, set it to its default value. The exception is
- *     'newinstance' option which can only be set/cleared on a mount (i.e.
- *     cannot be changed during remount).
+ *     specified in @data, set it to its default value.
  *
  * Note: @data may be NULL (in which case all options are set to default).
  */
@@ -174,9 +201,12 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
        opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
        opts->max     = NR_UNIX98_PTY_MAX;
 
-       /* newinstance makes sense only on initial mount */
+       /* Only allow instances mounted from the initial mount
+        * namespace to tap the reserve pool of ptys.
+        */
        if (op == PARSE_MOUNT)
-               opts->newinstance = 0;
+               opts->reserve =
+                       (current->nsproxy->mnt_ns == init_task.nsproxy->mnt_ns);
 
        while ((p = strsep(&data, ",")) != NULL) {
                substring_t args[MAX_OPT_ARGS];
@@ -211,16 +241,12 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
                                return -EINVAL;
                        opts->mode = option & S_IALLUGO;
                        break;
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
                case Opt_ptmxmode:
                        if (match_octal(&args[0], &option))
                                return -EINVAL;
                        opts->ptmxmode = option & S_IALLUGO;
                        break;
                case Opt_newinstance:
-                       /* newinstance makes sense only on initial mount */
-                       if (op == PARSE_MOUNT)
-                               opts->newinstance = 1;
                        break;
                case Opt_max:
                        if (match_int(&args[0], &option) ||
@@ -228,7 +254,6 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
                                return -EINVAL;
                        opts->max = option;
                        break;
-#endif
                default:
                        pr_err("called with bogus options\n");
                        return -EINVAL;
@@ -238,7 +263,6 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
        return 0;
 }
 
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
 static int mknod_ptmx(struct super_block *sb)
 {
        int mode;
@@ -305,12 +329,6 @@ static void update_ptmx_mode(struct pts_fs_info *fsi)
                inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode;
        }
 }
-#else
-static inline void update_ptmx_mode(struct pts_fs_info *fsi)
-{
-       return;
-}
-#endif
 
 static int devpts_remount(struct super_block *sb, int *flags, char *data)
 {
@@ -344,11 +362,9 @@ static int devpts_show_options(struct seq_file *seq, struct dentry *root)
                seq_printf(seq, ",gid=%u",
                           from_kgid_munged(&init_user_ns, opts->gid));
        seq_printf(seq, ",mode=%03o", opts->mode);
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
        seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode);
        if (opts->max < NR_UNIX98_PTY_MAX)
                seq_printf(seq, ",max=%d", opts->max);
-#endif
 
        return 0;
 }
@@ -410,40 +426,11 @@ fail:
        return -ENOMEM;
 }
 
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
-static int compare_init_pts_sb(struct super_block *s, void *p)
-{
-       if (devpts_mnt)
-               return devpts_mnt->mnt_sb == s;
-       return 0;
-}
-
 /*
  * devpts_mount()
  *
- *     If the '-o newinstance' mount option was specified, mount a new
- *     (private) instance of devpts.  PTYs created in this instance are
- *     independent of the PTYs in other devpts instances.
- *
- *     If the '-o newinstance' option was not specified, mount/remount the
- *     initial kernel mount of devpts.  This type of mount gives the
- *     legacy, single-instance semantics.
- *
- *     The 'newinstance' option is needed to support multiple namespace
- *     semantics in devpts while preserving backward compatibility of the
- *     current 'single-namespace' semantics. i.e all mounts of devpts
- *     without the 'newinstance' mount option should bind to the initial
- *     kernel mount, like mount_single().
- *
- *     Mounts with 'newinstance' option create a new, private namespace.
- *
- *     NOTE:
- *
- *     For single-mount semantics, devpts cannot use mount_single(),
- *     because mount_single()/sget() find and use the super-block from
- *     the most recent mount of devpts. But that recent mount may be a
- *     'newinstance' mount and mount_single() would pick the newinstance
- *     super-block instead of the initial super-block.
+ *     Mount a new (private) instance of devpts.  PTYs created in this
+ *     instance are independent of the PTYs in other devpts instances.
  */
 static struct dentry *devpts_mount(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data)
@@ -456,18 +443,7 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type,
        if (error)
                return ERR_PTR(error);
 
-       /* Require newinstance for all user namespace mounts to ensure
-        * the mount options are not changed.
-        */
-       if ((current_user_ns() != &init_user_ns) && !opts.newinstance)
-               return ERR_PTR(-EINVAL);
-
-       if (opts.newinstance)
-               s = sget(fs_type, NULL, set_anon_super, flags, NULL);
-       else
-               s = sget(fs_type, compare_init_pts_sb, set_anon_super, flags,
-                        NULL);
-
+       s = sget(fs_type, NULL, set_anon_super, flags, NULL);
        if (IS_ERR(s))
                return ERR_CAST(s);
 
@@ -491,18 +467,6 @@ out_undo_sget:
        return ERR_PTR(error);
 }
 
-#else
-/*
- * This supports only the legacy single-instance semantics (no
- * multiple-instance semantics)
- */
-static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags,
-               const char *dev_name, void *data)
-{
-       return mount_single(fs_type, flags, data, devpts_fill_super);
-}
-#endif
-
 static void devpts_kill_sb(struct super_block *sb)
 {
        struct pts_fs_info *fsi = DEVPTS_SB(sb);
@@ -516,9 +480,7 @@ static struct file_system_type devpts_fs_type = {
        .name           = "devpts",
        .mount          = devpts_mount,
        .kill_sb        = devpts_kill_sb,
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
        .fs_flags       = FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT,
-#endif
 };
 
 /*
@@ -531,16 +493,13 @@ int devpts_new_index(struct pts_fs_info *fsi)
        int index;
        int ida_ret;
 
-       if (!fsi)
-               return -ENODEV;
-
 retry:
        if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL))
                return -ENOMEM;
 
        mutex_lock(&allocated_ptys_lock);
-       if (pty_count >= pty_limit -
-                       (fsi->mount_opts.newinstance ? pty_reserve : 0)) {
+       if (pty_count >= (pty_limit -
+                         (fsi->mount_opts.reserve ? 0 : pty_reserve))) {
                mutex_unlock(&allocated_ptys_lock);
                return -ENOSPC;
        }
@@ -571,30 +530,6 @@ void devpts_kill_index(struct pts_fs_info *fsi, int idx)
        mutex_unlock(&allocated_ptys_lock);
 }
 
-/*
- * pty code needs to hold extra references in case of last /dev/tty close
- */
-struct pts_fs_info *devpts_get_ref(struct inode *ptmx_inode, struct file *file)
-{
-       struct super_block *sb;
-       struct pts_fs_info *fsi;
-
-       sb = pts_sb_from_inode(ptmx_inode);
-       if (!sb)
-               return NULL;
-       fsi = DEVPTS_SB(sb);
-       if (!fsi)
-               return NULL;
-
-       atomic_inc(&sb->s_active);
-       return fsi;
-}
-
-void devpts_put_ref(struct pts_fs_info *fsi)
-{
-       deactivate_super(fsi->sb);
-}
-
 /**
  * devpts_pty_new -- create a new inode in /dev/pts/
  * @ptmx_inode: inode of the master
@@ -607,16 +542,12 @@ void devpts_put_ref(struct pts_fs_info *fsi)
 struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv)
 {
        struct dentry *dentry;
-       struct super_block *sb;
+       struct super_block *sb = fsi->sb;
        struct inode *inode;
        struct dentry *root;
        struct pts_mount_opts *opts;
        char s[12];
 
-       if (!fsi)
-               return ERR_PTR(-ENODEV);
-
-       sb = fsi->sb;
        root = sb->s_root;
        opts = &fsi->mount_opts;
 
@@ -676,20 +607,8 @@ void devpts_pty_kill(struct dentry *dentry)
 static int __init init_devpts_fs(void)
 {
        int err = register_filesystem(&devpts_fs_type);
-       struct ctl_table_header *table;
-
        if (!err) {
-               struct vfsmount *mnt;
-
-               table = register_sysctl_table(pty_root_table);
-               mnt = kern_mount(&devpts_fs_type);
-               if (IS_ERR(mnt)) {
-                       err = PTR_ERR(mnt);
-                       unregister_filesystem(&devpts_fs_type);
-                       unregister_sysctl_table(table);
-               } else {
-                       devpts_mnt = mnt;
-               }
+               register_sysctl_table(pty_root_table);
        }
        return err;
 }
index 3bf3f20..f3b4408 100644 (file)
@@ -628,11 +628,11 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
                map_bh->b_size = fs_count << i_blkbits;
 
                /*
-                * For writes inside i_size on a DIO_SKIP_HOLES filesystem we
-                * forbid block creations: only overwrites are permitted.
-                * We will return early to the caller once we see an
-                * unmapped buffer head returned, and the caller will fall
-                * back to buffered I/O.
+                * For writes that could fill holes inside i_size on a
+                * DIO_SKIP_HOLES filesystem we forbid block creations: only
+                * overwrites are permitted. We will return early to the caller
+                * once we see an unmapped buffer head returned, and the caller
+                * will fall back to buffered I/O.
                 *
                 * Otherwise the decision is left to the get_blocks method,
                 * which may decide to handle it or also return an unmapped
@@ -640,8 +640,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
                 */
                create = dio->rw & WRITE;
                if (dio->flags & DIO_SKIP_HOLES) {
-                       if (sdio->block_in_file < (i_size_read(dio->inode) >>
-                                                       sdio->blkbits))
+                       if (fs_startblk <= ((i_size_read(dio->inode) - 1) >>
+                                                       i_blkbits))
                                create = 0;
                }
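
A worked example of the rewritten check, using invented values (fs_startblk and
i_blkbits are set earlier in get_more_blocks() and are not shown in this hunk):

	i_blkbits = 12, i_size_read(dio->inode) = 6000
	last block inside i_size = (6000 - 1) >> 12 = 1
	fs_startblk 0 or 1  ->  create = 0 (overwrite only)
	fs_startblk 2       ->  entirely beyond EOF, block allocation still allowed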
 
index ebd40f4..0d8eb34 100644 (file)
@@ -1141,12 +1141,13 @@ ecryptfs_write_metadata_to_contents(struct inode *ecryptfs_inode,
 
 static int
 ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry,
+                                struct inode *ecryptfs_inode,
                                 char *page_virt, size_t size)
 {
        int rc;
 
-       rc = ecryptfs_setxattr(ecryptfs_dentry, ECRYPTFS_XATTR_NAME, page_virt,
-                              size, 0);
+       rc = ecryptfs_setxattr(ecryptfs_dentry, ecryptfs_inode,
+                              ECRYPTFS_XATTR_NAME, page_virt, size, 0);
        return rc;
 }
 
@@ -1215,8 +1216,8 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry,
                goto out_free;
        }
        if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
-               rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt,
-                                                     size);
+               rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, ecryptfs_inode,
+                                                     virt, size);
        else
                rc = ecryptfs_write_metadata_to_contents(ecryptfs_inode, virt,
                                                         virt_len);
index 3ec495d..4ba1547 100644 (file)
@@ -609,8 +609,8 @@ ssize_t
 ecryptfs_getxattr_lower(struct dentry *lower_dentry, struct inode *lower_inode,
                        const char *name, void *value, size_t size);
 int
-ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
-                 size_t size, int flags);
+ecryptfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name,
+                 const void *value, size_t size, int flags);
 int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode);
 #ifdef CONFIG_ECRYPT_FS_MESSAGING
 int ecryptfs_process_response(struct ecryptfs_daemon *daemon,
index 318b046..9d153b6 100644 (file)
@@ -1001,7 +1001,8 @@ static int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 }
 
 int
-ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+ecryptfs_setxattr(struct dentry *dentry, struct inode *inode,
+                 const char *name, const void *value,
                  size_t size, int flags)
 {
        int rc = 0;
@@ -1014,8 +1015,8 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
        }
 
        rc = vfs_setxattr(lower_dentry, name, value, size, flags);
-       if (!rc && d_really_is_positive(dentry))
-               fsstack_copy_attr_all(d_inode(dentry), d_inode(lower_dentry));
+       if (!rc && inode)
+               fsstack_copy_attr_all(inode, d_inode(lower_dentry));
 out:
        return rc;
 }
index 148d11b..9c3437c 100644 (file)
@@ -442,7 +442,8 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode)
        if (size < 0)
                size = 8;
        put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt);
-       rc = lower_inode->i_op->setxattr(lower_dentry, ECRYPTFS_XATTR_NAME,
+       rc = lower_inode->i_op->setxattr(lower_dentry, lower_inode,
+                                        ECRYPTFS_XATTR_NAME,
                                         xattr_virt, size, 0);
        inode_unlock(lower_inode);
        if (rc)
index c1400b1..868c023 100644 (file)
@@ -51,7 +51,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        }
        down_read(&ei->dax_sem);
 
-       ret = __dax_fault(vma, vmf, ext2_get_block, NULL);
+       ret = __dax_fault(vma, vmf, ext2_get_block);
 
        up_read(&ei->dax_sem);
        if (vmf->flags & FAULT_FLAG_WRITE)
@@ -72,7 +72,7 @@ static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
        }
        down_read(&ei->dax_sem);
 
-       ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL);
+       ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block);
 
        up_read(&ei->dax_sem);
        if (flags & FAULT_FLAG_WRITE)
index b675610..fcbe586 100644 (file)
@@ -26,6 +26,7 @@
 #include <linux/highuid.h>
 #include <linux/pagemap.h>
 #include <linux/dax.h>
+#include <linux/blkdev.h>
 #include <linux/quotaops.h>
 #include <linux/writeback.h>
 #include <linux/buffer_head.h>
@@ -737,19 +738,18 @@ static int ext2_get_blocks(struct inode *inode,
                 * so that it's not found by another thread before it's
                 * initialised
                 */
-               err = dax_clear_sectors(inode->i_sb->s_bdev,
-                               le32_to_cpu(chain[depth-1].key) <<
-                               (inode->i_blkbits - 9),
-                               1 << inode->i_blkbits);
+               err = sb_issue_zeroout(inode->i_sb,
+                               le32_to_cpu(chain[depth-1].key), count,
+                               GFP_NOFS);
                if (err) {
                        mutex_unlock(&ei->truncate_mutex);
                        goto cleanup;
                }
-       }
+       } else
+               set_buffer_new(bh_result);
 
        ext2_splice_branch(inode, iblock, partial, indirect_blks, count);
        mutex_unlock(&ei->truncate_mutex);
-       set_buffer_new(bh_result);
 got_it:
        map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
        if (count > blocks_to_boundary)
index b78caf2..1d93795 100644 (file)
@@ -922,16 +922,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
        blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
 
        if (sbi->s_mount_opt & EXT2_MOUNT_DAX) {
-               if (blocksize != PAGE_SIZE) {
-                       ext2_msg(sb, KERN_ERR,
-                                       "error: unsupported blocksize for dax");
+               err = bdev_dax_supported(sb, blocksize);
+               if (err)
                        goto failed_mount;
-               }
-               if (!sb->s_bdev->bd_disk->fops->direct_access) {
-                       ext2_msg(sb, KERN_ERR,
-                                       "error: device does not support dax");
-                       goto failed_mount;
-               }
        }
 
        /* If the blocksize doesn't match, re-read the thing.. */
index 7fd3b86..7b9e9c1 100644 (file)
@@ -18,10 +18,11 @@ ext2_xattr_security_get(const struct xattr_handler *handler,
 
 static int
 ext2_xattr_security_set(const struct xattr_handler *handler,
-                       struct dentry *dentry, const char *name,
-                       const void *value, size_t size, int flags)
+                       struct dentry *unused, struct inode *inode,
+                       const char *name, const void *value,
+                       size_t size, int flags)
 {
-       return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name,
+       return ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY, name,
                              value, size, flags);
 }
 
index 0f85705..65049b7 100644 (file)
@@ -25,10 +25,11 @@ ext2_xattr_trusted_get(const struct xattr_handler *handler,
 
 static int
 ext2_xattr_trusted_set(const struct xattr_handler *handler,
-                      struct dentry *dentry, const char *name,
-                      const void *value, size_t size, int flags)
+                      struct dentry *unused, struct inode *inode,
+                      const char *name, const void *value,
+                      size_t size, int flags)
 {
-       return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name,
+       return ext2_xattr_set(inode, EXT2_XATTR_INDEX_TRUSTED, name,
                              value, size, flags);
 }
 
index 1fafd27..fb2f992 100644 (file)
@@ -29,13 +29,14 @@ ext2_xattr_user_get(const struct xattr_handler *handler,
 
 static int
 ext2_xattr_user_set(const struct xattr_handler *handler,
-                   struct dentry *dentry, const char *name,
-                   const void *value, size_t size, int flags)
+                   struct dentry *unused, struct inode *inode,
+                   const char *name, const void *value,
+                   size_t size, int flags)
 {
-       if (!test_opt(dentry->d_sb, XATTR_USER))
+       if (!test_opt(inode->i_sb, XATTR_USER))
                return -EOPNOTSUPP;
 
-       return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_USER,
+       return ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER,
                              name, value, size, flags);
 }
 
index fe1f50f..3020fd7 100644 (file)
@@ -610,7 +610,8 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 
        jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
 
-       return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
+       jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
+       return 1;
 }
 
 /*
index 5d00bf0..68323e3 100644 (file)
@@ -150,6 +150,11 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
        while (ctx->pos < inode->i_size) {
                struct ext4_map_blocks map;
 
+               if (fatal_signal_pending(current)) {
+                       err = -ERESTARTSYS;
+                       goto errout;
+               }
+               cond_resched();
                map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb);
                map.m_len = 1;
                err = ext4_map_blocks(NULL, inode, &map, 0);
index 72f4c9e..b84aa1c 100644 (file)
@@ -33,6 +33,7 @@
 #include <linux/ratelimit.h>
 #include <crypto/hash.h>
 #include <linux/falloc.h>
+#include <linux/percpu-rwsem.h>
 #ifdef __KERNEL__
 #include <linux/compat.h>
 #endif
@@ -581,6 +582,9 @@ enum {
 #define EXT4_GET_BLOCKS_ZERO                   0x0200
 #define EXT4_GET_BLOCKS_CREATE_ZERO            (EXT4_GET_BLOCKS_CREATE |\
                                        EXT4_GET_BLOCKS_ZERO)
+       /* Caller will submit data before dropping transaction handle. This
+        * allows jbd2 to avoid submitting data before commit. */
+#define EXT4_GET_BLOCKS_IO_SUBMIT              0x0400
 
 /*
  * The bit position of these flags must not overlap with any of the
@@ -1505,6 +1509,9 @@ struct ext4_sb_info {
        struct ratelimit_state s_err_ratelimit_state;
        struct ratelimit_state s_warning_ratelimit_state;
        struct ratelimit_state s_msg_ratelimit_state;
+
+       /* Barrier between changing inodes' journal flags and writepages ops. */
+       struct percpu_rw_semaphore s_journal_flag_rwsem;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1549,7 +1556,6 @@ enum {
        EXT4_STATE_DIOREAD_LOCK,        /* Disable support for dio read
                                           nolocking */
        EXT4_STATE_MAY_INLINE_DATA,     /* may have in-inode data */
-       EXT4_STATE_ORDERED_MODE,        /* data=ordered mode */
        EXT4_STATE_EXT_PRECACHED,       /* extents have been precached */
 };
 
@@ -2521,8 +2527,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
 struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
 int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
                             struct buffer_head *bh_result, int create);
-int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
-                           struct buffer_head *bh_result, int create);
+int ext4_dax_get_block(struct inode *inode, sector_t iblock,
+                      struct buffer_head *bh_result, int create);
 int ext4_get_block(struct inode *inode, sector_t iblock,
                   struct buffer_head *bh_result, int create);
 int ext4_dio_get_block(struct inode *inode, sector_t iblock,
@@ -2581,7 +2587,6 @@ extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
 /* indirect.c */
 extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
                                struct ext4_map_blocks *map, int flags);
-extern ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
 extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
 extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
 extern void ext4_ind_truncate(handle_t *, struct inode *inode);
@@ -3329,6 +3334,13 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
        }
 }
 
+static inline bool ext4_aligned_io(struct inode *inode, loff_t off, loff_t len)
+{
+       int blksize = 1 << inode->i_blkbits;
+
+       return IS_ALIGNED(off, blksize) && IS_ALIGNED(len, blksize);
+}
+
 #endif /* __KERNEL__ */
 
 #define EFSBADCRC      EBADMSG         /* Bad CRC detected */
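
A quick worked example for the new ext4_aligned_io() helper above, with invented values:

	inode->i_blkbits = 12  ->  blksize = 4096
	ext4_aligned_io(inode, 8192, 4096)  ->  true  (offset and length are multiples of 4096)
	ext4_aligned_io(inode, 4096, 6000)  ->  false (6000 is not a multiple of 4096)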
index 5f58462..09c1ef3 100644 (file)
@@ -359,10 +359,21 @@ static inline int ext4_journal_force_commit(journal_t *journal)
        return 0;
 }
 
-static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
+static inline int ext4_jbd2_inode_add_write(handle_t *handle,
+                                           struct inode *inode)
 {
        if (ext4_handle_valid(handle))
-               return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode);
+               return jbd2_journal_inode_add_write(handle,
+                                                   EXT4_I(inode)->jinode);
+       return 0;
+}
+
+static inline int ext4_jbd2_inode_add_wait(handle_t *handle,
+                                          struct inode *inode)
+{
+       if (ext4_handle_valid(handle))
+               return jbd2_journal_inode_add_wait(handle,
+                                                  EXT4_I(inode)->jinode);
        return 0;
 }
 
index 95bf467..2a2eef9 100644 (file)
@@ -120,9 +120,14 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle,
 
        if (!ext4_handle_valid(handle))
                return 0;
-       if (handle->h_buffer_credits > needed)
+       if (handle->h_buffer_credits >= needed)
                return 0;
-       err = ext4_journal_extend(handle, needed);
+       /*
+        * If we need to extend the journal get a few extra blocks
+        * while we're at it for efficiency's sake.
+        */
+       needed += 3;
+       err = ext4_journal_extend(handle, needed - handle->h_buffer_credits);
        if (err <= 0)
                return err;
        err = ext4_truncate_restart_trans(handle, inode, needed);
@@ -907,13 +912,6 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
 
                eh = ext_block_hdr(bh);
                ppos++;
-               if (unlikely(ppos > depth)) {
-                       put_bh(bh);
-                       EXT4_ERROR_INODE(inode,
-                                        "ppos %d > depth %d", ppos, depth);
-                       ret = -EFSCORRUPTED;
-                       goto err;
-               }
                path[ppos].p_bh = bh;
                path[ppos].p_hdr = eh;
        }
@@ -2583,7 +2581,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
                }
        } else
                ext4_error(sbi->s_sb, "strange request: removal(2) "
-                          "%u-%u from %u:%u\n",
+                          "%u-%u from %u:%u",
                           from, to, le32_to_cpu(ex->ee_block), ee_len);
        return 0;
 }
@@ -3738,7 +3736,7 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
        if (ee_block != map->m_lblk || ee_len > map->m_len) {
 #ifdef EXT4_DEBUG
                ext4_warning("Inode (%ld) finished: extent logical block %llu,"
-                            " len %u; IO logical block %llu, len %u\n",
+                            " len %u; IO logical block %llu, len %u",
                             inode->i_ino, (unsigned long long)ee_block, ee_len,
                             (unsigned long long)map->m_lblk, map->m_len);
 #endif
index e38b987..37e0592 100644 (file)
@@ -707,7 +707,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
            (status & EXTENT_STATUS_WRITTEN)) {
                ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as "
                                " delayed and written which can potentially "
-                               " cause data loss.\n", lblk, len);
+                               " cause data loss.", lblk, len);
                WARN_ON(1);
        }
 
index 00ff691..df44c87 100644 (file)
@@ -202,7 +202,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        if (IS_ERR(handle))
                result = VM_FAULT_SIGBUS;
        else
-               result = __dax_fault(vma, vmf, ext4_dax_mmap_get_block, NULL);
+               result = __dax_fault(vma, vmf, ext4_dax_get_block);
 
        if (write) {
                if (!IS_ERR(handle))
@@ -238,7 +238,7 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
                result = VM_FAULT_SIGBUS;
        else
                result = __dax_pmd_fault(vma, addr, pmd, flags,
-                               ext4_dax_mmap_get_block, NULL);
+                                        ext4_dax_get_block);
 
        if (write) {
                if (!IS_ERR(handle))
@@ -373,7 +373,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
        if (ext4_encrypted_inode(d_inode(dir)) &&
            !ext4_is_child_context_consistent_with_parent(d_inode(dir), inode)) {
                ext4_warning(inode->i_sb,
-                            "Inconsistent encryption contexts: %lu/%lu\n",
+                            "Inconsistent encryption contexts: %lu/%lu",
                             (unsigned long) d_inode(dir)->i_ino,
                             (unsigned long) inode->i_ino);
                dput(dir);
index 237b877..3da4cf8 100644 (file)
@@ -1150,25 +1150,20 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
        unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
        ext4_group_t block_group;
        int bit;
-       struct buffer_head *bitmap_bh;
+       struct buffer_head *bitmap_bh = NULL;
        struct inode *inode = NULL;
-       long err = -EIO;
+       int err = -EFSCORRUPTED;
 
-       /* Error cases - e2fsck has already cleaned up for us */
-       if (ino > max_ino) {
-               ext4_warning(sb, "bad orphan ino %lu!  e2fsck was run?", ino);
-               err = -EFSCORRUPTED;
-               goto error;
-       }
+       if (ino < EXT4_FIRST_INO(sb) || ino > max_ino)
+               goto bad_orphan;
 
        block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
        bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
        bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
        if (IS_ERR(bitmap_bh)) {
-               err = PTR_ERR(bitmap_bh);
-               ext4_warning(sb, "inode bitmap error %ld for orphan %lu",
-                            ino, err);
-               goto error;
+               ext4_error(sb, "inode bitmap error %ld for orphan %lu",
+                          ino, PTR_ERR(bitmap_bh));
+               return (struct inode *) bitmap_bh;
        }
 
        /* Having the inode bit set should be a 100% indicator that this
@@ -1179,15 +1174,21 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
                goto bad_orphan;
 
        inode = ext4_iget(sb, ino);
-       if (IS_ERR(inode))
-               goto iget_failed;
+       if (IS_ERR(inode)) {
+               err = PTR_ERR(inode);
+               ext4_error(sb, "couldn't read orphan inode %lu (err %d)",
+                          ino, err);
+               return inode;
+       }
 
        /*
-        * If the orphans has i_nlinks > 0 then it should be able to be
-        * truncated, otherwise it won't be removed from the orphan list
-        * during processing and an infinite loop will result.
+        * If the orphans has i_nlinks > 0 then it should be able to
+        * be truncated, otherwise it won't be removed from the orphan
+        * list during processing and an infinite loop will result.
+        * Similarly, it must not be a bad inode.
         */
-       if (inode->i_nlink && !ext4_can_truncate(inode))
+       if ((inode->i_nlink && !ext4_can_truncate(inode)) ||
+           is_bad_inode(inode))
                goto bad_orphan;
 
        if (NEXT_ORPHAN(inode) > max_ino)
@@ -1195,29 +1196,25 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
        brelse(bitmap_bh);
        return inode;
 
-iget_failed:
-       err = PTR_ERR(inode);
-       inode = NULL;
 bad_orphan:
-       ext4_warning(sb, "bad orphan inode %lu!  e2fsck was run?", ino);
-       printk(KERN_WARNING "ext4_test_bit(bit=%d, block=%llu) = %d\n",
-              bit, (unsigned long long)bitmap_bh->b_blocknr,
-              ext4_test_bit(bit, bitmap_bh->b_data));
-       printk(KERN_WARNING "inode=%p\n", inode);
+       ext4_error(sb, "bad orphan inode %lu", ino);
+       if (bitmap_bh)
+               printk(KERN_ERR "ext4_test_bit(bit=%d, block=%llu) = %d\n",
+                      bit, (unsigned long long)bitmap_bh->b_blocknr,
+                      ext4_test_bit(bit, bitmap_bh->b_data));
        if (inode) {
-               printk(KERN_WARNING "is_bad_inode(inode)=%d\n",
+               printk(KERN_ERR "is_bad_inode(inode)=%d\n",
                       is_bad_inode(inode));
-               printk(KERN_WARNING "NEXT_ORPHAN(inode)=%u\n",
+               printk(KERN_ERR "NEXT_ORPHAN(inode)=%u\n",
                       NEXT_ORPHAN(inode));
-               printk(KERN_WARNING "max_ino=%lu\n", max_ino);
-               printk(KERN_WARNING "i_nlink=%u\n", inode->i_nlink);
+               printk(KERN_ERR "max_ino=%lu\n", max_ino);
+               printk(KERN_ERR "i_nlink=%u\n", inode->i_nlink);
                /* Avoid freeing blocks if we got a bad deleted inode */
                if (inode->i_nlink == 0)
                        inode->i_blocks = 0;
                iput(inode);
        }
        brelse(bitmap_bh);
-error:
        return ERR_PTR(err);
 }
 
index 627b7e8..bc15c2c 100644 (file)
@@ -648,133 +648,6 @@ out:
        return err;
 }
 
-/*
- * O_DIRECT for ext3 (or indirect map) based files
- *
- * If the O_DIRECT write will extend the file then add this inode to the
- * orphan list.  So recovery will truncate it back to the original size
- * if the machine crashes during the write.
- *
- * If the O_DIRECT write is intantiating holes inside i_size and the machine
- * crashes then stale disk data _may_ be exposed inside the file. But current
- * VFS code falls back into buffered path in that case so we are safe.
- */
-ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
-{
-       struct file *file = iocb->ki_filp;
-       struct inode *inode = file->f_mapping->host;
-       struct ext4_inode_info *ei = EXT4_I(inode);
-       loff_t offset = iocb->ki_pos;
-       handle_t *handle;
-       ssize_t ret;
-       int orphan = 0;
-       size_t count = iov_iter_count(iter);
-       int retries = 0;
-
-       if (iov_iter_rw(iter) == WRITE) {
-               loff_t final_size = offset + count;
-
-               if (final_size > inode->i_size) {
-                       /* Credits for sb + inode write */
-                       handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-                       if (IS_ERR(handle)) {
-                               ret = PTR_ERR(handle);
-                               goto out;
-                       }
-                       ret = ext4_orphan_add(handle, inode);
-                       if (ret) {
-                               ext4_journal_stop(handle);
-                               goto out;
-                       }
-                       orphan = 1;
-                       ei->i_disksize = inode->i_size;
-                       ext4_journal_stop(handle);
-               }
-       }
-
-retry:
-       if (iov_iter_rw(iter) == READ && ext4_should_dioread_nolock(inode)) {
-               /*
-                * Nolock dioread optimization may be dynamically disabled
-                * via ext4_inode_block_unlocked_dio(). Check inode's state
-                * while holding extra i_dio_count ref.
-                */
-               inode_dio_begin(inode);
-               smp_mb();
-               if (unlikely(ext4_test_inode_state(inode,
-                                                   EXT4_STATE_DIOREAD_LOCK))) {
-                       inode_dio_end(inode);
-                       goto locked;
-               }
-               if (IS_DAX(inode))
-                       ret = dax_do_io(iocb, inode, iter,
-                                       ext4_dio_get_block, NULL, 0);
-               else
-                       ret = __blockdev_direct_IO(iocb, inode,
-                                                  inode->i_sb->s_bdev, iter,
-                                                  ext4_dio_get_block,
-                                                  NULL, NULL, 0);
-               inode_dio_end(inode);
-       } else {
-locked:
-               if (IS_DAX(inode))
-                       ret = dax_do_io(iocb, inode, iter,
-                                       ext4_dio_get_block, NULL, DIO_LOCKING);
-               else
-                       ret = blockdev_direct_IO(iocb, inode, iter,
-                                                ext4_dio_get_block);
-
-               if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
-                       loff_t isize = i_size_read(inode);
-                       loff_t end = offset + count;
-
-                       if (end > isize)
-                               ext4_truncate_failed_write(inode);
-               }
-       }
-       if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
-               goto retry;
-
-       if (orphan) {
-               int err;
-
-               /* Credits for sb + inode write */
-               handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-               if (IS_ERR(handle)) {
-                       /* This is really bad luck. We've written the data
-                        * but cannot extend i_size. Bail out and pretend
-                        * the write failed... */
-                       ret = PTR_ERR(handle);
-                       if (inode->i_nlink)
-                               ext4_orphan_del(NULL, inode);
-
-                       goto out;
-               }
-               if (inode->i_nlink)
-                       ext4_orphan_del(handle, inode);
-               if (ret > 0) {
-                       loff_t end = offset + ret;
-                       if (end > inode->i_size) {
-                               ei->i_disksize = end;
-                               i_size_write(inode, end);
-                               /*
-                                * We're going to return a positive `ret'
-                                * here due to non-zero-length I/O, so there's
-                                * no way of reporting error returns from
-                                * ext4_mark_inode_dirty() to userspace.  So
-                                * ignore it.
-                                */
-                               ext4_mark_inode_dirty(handle, inode);
-                       }
-               }
-               err = ext4_journal_stop(handle);
-               if (ret == 0)
-                       ret = err;
-       }
-out:
-       return ret;
-}
-
 /*
  * Calculate the number of metadata blocks need to reserve
  * to allocate a new block at @lblocks for non extent file based file
index 7bc6c85..ff7538c 100644 (file)
@@ -1780,7 +1780,7 @@ int empty_inline_dir(struct inode *dir, int *has_inline_data)
                        ext4_warning(dir->i_sb,
                                     "bad inline directory (dir #%lu) - "
                                     "inode %u, rec_len %u, name_len %d"
-                                    "inline size %d\n",
+                                    "inline size %d",
                                     dir->i_ino, le32_to_cpu(de->inode),
                                     le16_to_cpu(de->rec_len), de->name_len,
                                     inline_size);
index 79b298d..f7140ca 100644 (file)
@@ -684,6 +684,24 @@ out_sem:
                ret = check_block_validity(inode, map);
                if (ret != 0)
                        return ret;
+
+               /*
+                * Inodes with freshly allocated blocks where contents will be
+                * visible after transaction commit must be on transaction's
+                * ordered data list.
+                */
+               if (map->m_flags & EXT4_MAP_NEW &&
+                   !(map->m_flags & EXT4_MAP_UNWRITTEN) &&
+                   !(flags & EXT4_GET_BLOCKS_ZERO) &&
+                   !IS_NOQUOTA(inode) &&
+                   ext4_should_order_data(inode)) {
+                       if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
+                               ret = ext4_jbd2_inode_add_wait(handle, inode);
+                       else
+                               ret = ext4_jbd2_inode_add_write(handle, inode);
+                       if (ret)
+                               return ret;
+               }
        }
        return retval;
 }
@@ -1289,15 +1307,6 @@ static int ext4_write_end(struct file *file,
        int i_size_changed = 0;
 
        trace_ext4_write_end(inode, pos, len, copied);
-       if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) {
-               ret = ext4_jbd2_file_inode(handle, inode);
-               if (ret) {
-                       unlock_page(page);
-                       put_page(page);
-                       goto errout;
-               }
-       }
-
        if (ext4_has_inline_data(inode)) {
                ret = ext4_write_inline_data_end(inode, pos, len,
                                                 copied, page);
@@ -2313,7 +2322,8 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
         * the data was copied into the page cache.
         */
        get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
-                          EXT4_GET_BLOCKS_METADATA_NOFAIL;
+                          EXT4_GET_BLOCKS_METADATA_NOFAIL |
+                          EXT4_GET_BLOCKS_IO_SUBMIT;
        dioread_nolock = ext4_should_dioread_nolock(inode);
        if (dioread_nolock)
                get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
@@ -2602,11 +2612,14 @@ static int ext4_writepages(struct address_space *mapping,
        struct blk_plug plug;
        bool give_up_on_write = false;
 
+       percpu_down_read(&sbi->s_journal_flag_rwsem);
        trace_ext4_writepages(inode, wbc);
 
-       if (dax_mapping(mapping))
-               return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
-                                                  wbc);
+       if (dax_mapping(mapping)) {
+               ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
+                                                 wbc);
+               goto out_writepages;
+       }
 
        /*
         * No pages to write? This is mainly a kludge to avoid starting
@@ -2776,6 +2789,7 @@ retry:
 out_writepages:
        trace_ext4_writepages_result(inode, wbc, ret,
                                     nr_to_write - wbc->nr_to_write);
+       percpu_up_read(&sbi->s_journal_flag_rwsem);
        return ret;
 }
 
@@ -3215,75 +3229,52 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
 }
 
 #ifdef CONFIG_FS_DAX
-int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
-                           struct buffer_head *bh_result, int create)
+/*
+ * Get block function for DAX IO and mmap faults. It takes care of converting
+ * unwritten extents to written ones and initializes new / converted blocks
+ * to zeros.
+ */
+int ext4_dax_get_block(struct inode *inode, sector_t iblock,
+                      struct buffer_head *bh_result, int create)
 {
-       int ret, err;
-       int credits;
-       struct ext4_map_blocks map;
-       handle_t *handle = NULL;
-       int flags = 0;
-
-       ext4_debug("ext4_dax_mmap_get_block: inode %lu, create flag %d\n",
-                  inode->i_ino, create);
-       map.m_lblk = iblock;
-       map.m_len = bh_result->b_size >> inode->i_blkbits;
-       credits = ext4_chunk_trans_blocks(inode, map.m_len);
-       if (create) {
-               flags |= EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_CREATE_ZERO;
-               handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
-               if (IS_ERR(handle)) {
-                       ret = PTR_ERR(handle);
-                       return ret;
-               }
-       }
+       int ret;
 
-       ret = ext4_map_blocks(handle, inode, &map, flags);
-       if (create) {
-               err = ext4_journal_stop(handle);
-               if (ret >= 0 && err < 0)
-                       ret = err;
-       }
-       if (ret <= 0)
-               goto out;
-       if (map.m_flags & EXT4_MAP_UNWRITTEN) {
-               int err2;
+       ext4_debug("inode %lu, create flag %d\n", inode->i_ino, create);
+       if (!create)
+               return _ext4_get_block(inode, iblock, bh_result, 0);
 
-               /*
-                * We are protected by i_mmap_sem so we know block cannot go
-                * away from under us even though we dropped i_data_sem.
-                * Convert extent to written and write zeros there.
-                *
-                * Note: We may get here even when create == 0.
-                */
-               handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
-               if (IS_ERR(handle)) {
-                       ret = PTR_ERR(handle);
-                       goto out;
-               }
+       ret = ext4_get_block_trans(inode, iblock, bh_result,
+                                  EXT4_GET_BLOCKS_PRE_IO |
+                                  EXT4_GET_BLOCKS_CREATE_ZERO);
+       if (ret < 0)
+               return ret;
 
-               err = ext4_map_blocks(handle, inode, &map,
-                     EXT4_GET_BLOCKS_CONVERT | EXT4_GET_BLOCKS_CREATE_ZERO);
-               if (err < 0)
-                       ret = err;
-               err2 = ext4_journal_stop(handle);
-               if (err2 < 0 && ret > 0)
-                       ret = err2;
-       }
-out:
-       WARN_ON_ONCE(ret == 0 && create);
-       if (ret > 0) {
-               map_bh(bh_result, inode->i_sb, map.m_pblk);
+       if (buffer_unwritten(bh_result)) {
                /*
-                * At least for now we have to clear BH_New so that DAX code
-                * doesn't attempt to zero blocks again in a racy way.
+                * We are protected by i_mmap_sem or i_mutex so we know block
+                * cannot go away from under us even though we dropped
+                * i_data_sem. Convert extent to written and write zeros there.
                 */
-               map.m_flags &= ~EXT4_MAP_NEW;
-               ext4_update_bh_state(bh_result, map.m_flags);
-               bh_result->b_size = map.m_len << inode->i_blkbits;
-               ret = 0;
+               ret = ext4_get_block_trans(inode, iblock, bh_result,
+                                          EXT4_GET_BLOCKS_CONVERT |
+                                          EXT4_GET_BLOCKS_CREATE_ZERO);
+               if (ret < 0)
+                       return ret;
        }
-       return ret;
+       /*
+        * At least for now we have to clear BH_New so that DAX code
+        * doesn't attempt to zero blocks again in a racy way.
+        */
+       clear_buffer_new(bh_result);
+       return 0;
+}
+#else
+/* Just define empty function, it will never get called. */
+int ext4_dax_get_block(struct inode *inode, sector_t iblock,
+                      struct buffer_head *bh_result, int create)
+{
+       BUG();
+       return 0;
 }
 #endif
 
@@ -3316,7 +3307,9 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 }
 
 /*
- * For ext4 extent files, ext4 will do direct-io write to holes,
+ * Handling of direct IO writes.
+ *
+ * For ext4 extent files, ext4 will do direct-io write even to holes,
  * preallocated extents, and those write extend the file, no need to
  * fall back to buffered IO.
  *
@@ -3334,10 +3327,11 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
  * if the machine crashes during the write.
  *
  */
-static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
 {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
+       struct ext4_inode_info *ei = EXT4_I(inode);
        ssize_t ret;
        loff_t offset = iocb->ki_pos;
        size_t count = iov_iter_count(iter);
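
The comment above carries over the old description of ext4's O_DIRECT write handling: writes go straight to allocated (or freshly allocated) blocks, unwritten extents keep stale data from becoming visible before the IO completes, and extending writes are protected by the orphan list. From user space the visible contract is simply the O_DIRECT alignment rule. A minimal sketch of such a write; the 4096-byte alignment and the file name are assumptions, the real requirement is the logical block size of the device and filesystem:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    const size_t len = 4096;                /* one assumed-aligned block */
    void *buf;
    int fd;

    /* O_DIRECT requires the user buffer itself to be aligned. */
    if (posix_memalign(&buf, 4096, len)) {
        perror("posix_memalign");
        return 1;
    }
    memset(buf, 'x', len);

    fd = open("testfile", O_WRONLY | O_CREAT | O_DIRECT, 0644);
    if (fd < 0) {
        perror("open");
        return 1;
    }
    /* Offset and length are block aligned, so the kernel can send the
     * write straight to the underlying blocks instead of staging it in
     * the page cache. */
    if (pwrite(fd, buf, len, 0) != (ssize_t)len)
        perror("pwrite");

    close(fd);
    free(buf);
    return 0;
}
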
@@ -3345,10 +3339,25 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
        get_block_t *get_block_func = NULL;
        int dio_flags = 0;
        loff_t final_size = offset + count;
+       int orphan = 0;
+       handle_t *handle;
 
-       /* Use the old path for reads and writes beyond i_size. */
-       if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size)
-               return ext4_ind_direct_IO(iocb, iter);
+       if (final_size > inode->i_size) {
+               /* Credits for sb + inode write */
+               handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+               if (IS_ERR(handle)) {
+                       ret = PTR_ERR(handle);
+                       goto out;
+               }
+               ret = ext4_orphan_add(handle, inode);
+               if (ret) {
+                       ext4_journal_stop(handle);
+                       goto out;
+               }
+               orphan = 1;
+               ei->i_disksize = inode->i_size;
+               ext4_journal_stop(handle);
+       }
 
        BUG_ON(iocb->private == NULL);
 
@@ -3357,8 +3366,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
         * conversion. This also disallows race between truncate() and
         * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
         */
-       if (iov_iter_rw(iter) == WRITE)
-               inode_dio_begin(inode);
+       inode_dio_begin(inode);
 
        /* If we do a overwrite dio, i_mutex locking can be released */
        overwrite = *((int *)iocb->private);
@@ -3367,7 +3375,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
                inode_unlock(inode);
 
        /*
-        * We could direct write to holes and fallocate.
+        * For extent mapped files we could direct write to holes and fallocate.
         *
         * Allocated blocks to fill the hole are marked as unwritten to prevent
         * parallel buffered read to expose the stale data before DIO complete
@@ -3389,7 +3397,23 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
        iocb->private = NULL;
        if (overwrite)
                get_block_func = ext4_dio_get_block_overwrite;
-       else if (is_sync_kiocb(iocb)) {
+       else if (IS_DAX(inode)) {
+               /*
+                * We can avoid zeroing for aligned DAX writes beyond EOF. Other
+                * writes need zeroing either because they can race with page
+                * faults or because they use partial blocks.
+                */
+               if (round_down(offset, 1<<inode->i_blkbits) >= inode->i_size &&
+                   ext4_aligned_io(inode, offset, count))
+                       get_block_func = ext4_dio_get_block;
+               else
+                       get_block_func = ext4_dax_get_block;
+               dio_flags = DIO_LOCKING;
+       } else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
+                  round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) {
+               get_block_func = ext4_dio_get_block;
+               dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
+       } else if (is_sync_kiocb(iocb)) {
                get_block_func = ext4_dio_get_block_unwritten_sync;
                dio_flags = DIO_LOCKING;
        } else {
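
The new DAX branch above skips zeroing only when the write is block aligned and starts at or beyond the current EOF block (assuming ext4_aligned_io() checks block alignment of offset and length, as its use here suggests); any other write goes through ext4_dax_get_block() so newly allocated blocks are zeroed before they can be observed. A stand-alone restatement of that predicate with plain integer arguments, names illustrative:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Equivalent of the kernel's round_down() for power-of-two alignments. */
static uint64_t round_down_p2(uint64_t x, uint64_t align)
{
    return x & ~(align - 1);
}

/*
 * Zeroing can be skipped only when the write is block aligned and no byte
 * of the affected blocks lies below the current i_size, i.e. nothing that
 * is already visible to readers could be exposed.
 */
static bool dax_write_can_skip_zeroing(uint64_t offset, uint64_t count,
                                       uint64_t i_size, unsigned blkbits)
{
    uint64_t blocksize = 1ULL << blkbits;

    return round_down_p2(offset, blocksize) >= i_size &&
           (offset & (blocksize - 1)) == 0 &&
           (count & (blocksize - 1)) == 0;
}

int main(void)
{
    /* 4 KiB blocks (blkbits == 12), file currently 8192 bytes long. */
    printf("%d\n", dax_write_can_skip_zeroing(8192, 4096, 8192, 12)); /* 1 */
    printf("%d\n", dax_write_can_skip_zeroing(6144, 4096, 8192, 12)); /* 0 */
    return 0;
}
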
@@ -3399,10 +3423,10 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
        BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode));
 #endif
-       if (IS_DAX(inode))
+       if (IS_DAX(inode)) {
                ret = dax_do_io(iocb, inode, iter, get_block_func,
                                ext4_end_io_dio, dio_flags);
-       else
+       } else
                ret = __blockdev_direct_IO(iocb, inode,
                                           inode->i_sb->s_bdev, iter,
                                           get_block_func,
@@ -3422,12 +3446,86 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
                ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
        }
 
-       if (iov_iter_rw(iter) == WRITE)
-               inode_dio_end(inode);
+       inode_dio_end(inode);
        /* take i_mutex locking again if we do a ovewrite dio */
        if (overwrite)
                inode_lock(inode);
 
+       if (ret < 0 && final_size > inode->i_size)
+               ext4_truncate_failed_write(inode);
+
+       /* Handle extending of i_size after direct IO write */
+       if (orphan) {
+               int err;
+
+               /* Credits for sb + inode write */
+               handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+               if (IS_ERR(handle)) {
+                       /* This is really bad luck. We've written the data
+                        * but cannot extend i_size. Bail out and pretend
+                        * the write failed... */
+                       ret = PTR_ERR(handle);
+                       if (inode->i_nlink)
+                               ext4_orphan_del(NULL, inode);
+
+                       goto out;
+               }
+               if (inode->i_nlink)
+                       ext4_orphan_del(handle, inode);
+               if (ret > 0) {
+                       loff_t end = offset + ret;
+                       if (end > inode->i_size) {
+                               ei->i_disksize = end;
+                               i_size_write(inode, end);
+                               /*
+                                * We're going to return a positive `ret'
+                                * here due to non-zero-length I/O, so there's
+                                * no way of reporting error returns from
+                                * ext4_mark_inode_dirty() to userspace.  So
+                                * ignore it.
+                                */
+                               ext4_mark_inode_dirty(handle, inode);
+                       }
+               }
+               err = ext4_journal_stop(handle);
+               if (ret == 0)
+                       ret = err;
+       }
+out:
+       return ret;
+}
+
+static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
+{
+       int unlocked = 0;
+       struct inode *inode = iocb->ki_filp->f_mapping->host;
+       ssize_t ret;
+
+       if (ext4_should_dioread_nolock(inode)) {
+               /*
+                * Nolock dioread optimization may be dynamically disabled
+                * via ext4_inode_block_unlocked_dio(). Check inode's state
+                * while holding extra i_dio_count ref.
+                */
+               inode_dio_begin(inode);
+               smp_mb();
+               if (unlikely(ext4_test_inode_state(inode,
+                                                   EXT4_STATE_DIOREAD_LOCK)))
+                       inode_dio_end(inode);
+               else
+                       unlocked = 1;
+       }
+       if (IS_DAX(inode)) {
+               ret = dax_do_io(iocb, inode, iter, ext4_dio_get_block,
+                               NULL, unlocked ? 0 : DIO_LOCKING);
+       } else {
+               ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
+                                          iter, ext4_dio_get_block,
+                                          NULL, NULL,
+                                          unlocked ? 0 : DIO_LOCKING);
+       }
+       if (unlocked)
+               inode_dio_end(inode);
        return ret;
 }
 
@@ -3455,10 +3553,10 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
                return 0;
 
        trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
-       if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
-               ret = ext4_ext_direct_IO(iocb, iter);
+       if (iov_iter_rw(iter) == READ)
+               ret = ext4_direct_IO_read(iocb, iter);
        else
-               ret = ext4_ind_direct_IO(iocb, iter);
+               ret = ext4_direct_IO_write(iocb, iter);
        trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
        return ret;
 }
@@ -3534,10 +3632,7 @@ void ext4_set_aops(struct inode *inode)
 {
        switch (ext4_inode_journal_mode(inode)) {
        case EXT4_INODE_ORDERED_DATA_MODE:
-               ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE);
-               break;
        case EXT4_INODE_WRITEBACK_DATA_MODE:
-               ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE);
                break;
        case EXT4_INODE_JOURNAL_DATA_MODE:
                inode->i_mapping->a_ops = &ext4_journalled_aops;
@@ -3630,8 +3725,8 @@ static int __ext4_block_zero_page_range(handle_t *handle,
        } else {
                err = 0;
                mark_buffer_dirty(bh);
-               if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
-                       err = ext4_jbd2_file_inode(handle, inode);
+               if (ext4_should_order_data(inode))
+                       err = ext4_jbd2_inode_add_write(handle, inode);
        }
 
 unlock:
@@ -5429,6 +5524,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
        journal_t *journal;
        handle_t *handle;
        int err;
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 
        /*
         * We have to be very careful here: changing a data block's
@@ -5445,22 +5541,30 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
                return 0;
        if (is_journal_aborted(journal))
                return -EROFS;
-       /* We have to allocate physical blocks for delalloc blocks
-        * before flushing journal. otherwise delalloc blocks can not
-        * be allocated any more. even more truncate on delalloc blocks
-        * could trigger BUG by flushing delalloc blocks in journal.
-        * There is no delalloc block in non-journal data mode.
-        */
-       if (val && test_opt(inode->i_sb, DELALLOC)) {
-               err = ext4_alloc_da_blocks(inode);
-               if (err < 0)
-                       return err;
-       }
 
        /* Wait for all existing dio workers */
        ext4_inode_block_unlocked_dio(inode);
        inode_dio_wait(inode);
 
+       /*
+        * Before flushing the journal and switching inode's aops, we have
+        * to flush all dirty data the inode has. There can be outstanding
+        * delayed allocations, there can be unwritten extents created by
+        * fallocate or buffered writes in dioread_nolock mode covered by
+        * dirty data which can be converted only after flushing the dirty
+        * data (and journalled aops don't know how to handle these cases).
+        */
+       if (val) {
+               down_write(&EXT4_I(inode)->i_mmap_sem);
+               err = filemap_write_and_wait(inode->i_mapping);
+               if (err < 0) {
+                       up_write(&EXT4_I(inode)->i_mmap_sem);
+                       ext4_inode_resume_unlocked_dio(inode);
+                       return err;
+               }
+       }
+
+       percpu_down_write(&sbi->s_journal_flag_rwsem);
        jbd2_journal_lock_updates(journal);
 
        /*
@@ -5477,6 +5581,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
                err = jbd2_journal_flush(journal);
                if (err < 0) {
                        jbd2_journal_unlock_updates(journal);
+                       percpu_up_write(&sbi->s_journal_flag_rwsem);
                        ext4_inode_resume_unlocked_dio(inode);
                        return err;
                }
@@ -5485,6 +5590,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
        ext4_set_aops(inode);
 
        jbd2_journal_unlock_updates(journal);
+       percpu_up_write(&sbi->s_journal_flag_rwsem);
+
+       if (val)
+               up_write(&EXT4_I(inode)->i_mmap_sem);
        ext4_inode_resume_unlocked_dio(inode);
 
        /* Finally we can mark the inode as dirty. */
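
Taken together, the s_journal_flag_rwsem hunks in this file make ext4_writepages() hold the new per-CPU rwsem for read while ext4_change_inode_journal_flag() holds it for write, so the inode's address_space operations cannot be switched underneath writeback that is already in flight. A rough user-space analogue of the pattern with an ordinary rwlock (the kernel uses percpu_rw_semaphore to keep the read side cheap; names and the busy loop are illustrative, compile with -pthread):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t journal_flag_lock = PTHREAD_RWLOCK_INITIALIZER;
static int journalled;   /* stands in for the inode's journalling mode */

/* Frequent path: analogous to ext4_writepages() taking the rwsem for read. */
static void *writeback_worker(void *arg)
{
    (void)arg;
    for (int i = 0; i < 1000; i++) {
        pthread_rwlock_rdlock(&journal_flag_lock);
        /* ... write pages back using whichever mode is current ... */
        pthread_rwlock_unlock(&journal_flag_lock);
    }
    return NULL;
}

/* Rare path: analogous to ext4_change_inode_journal_flag(). */
static void switch_journal_mode(int val)
{
    pthread_rwlock_wrlock(&journal_flag_lock);   /* excludes all writeback */
    journalled = val;   /* the mode (and, in the kernel, the aops) changes
                         * atomically with respect to the readers */
    pthread_rwlock_unlock(&journal_flag_lock);
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, writeback_worker, NULL);
    switch_journal_mode(1);
    pthread_join(t, NULL);
    printf("mode=%d\n", journalled);
    return 0;
}
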
index 7497f50..28cc412 100644 (file)
@@ -365,7 +365,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
                struct dquot *transfer_to[MAXQUOTAS] = { };
 
                transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid));
-               if (transfer_to[PRJQUOTA]) {
+               if (!IS_ERR(transfer_to[PRJQUOTA])) {
                        err = __dquot_transfer(inode, transfer_to);
                        dqput(transfer_to[PRJQUOTA]);
                        if (err)
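
The one-line fix above swaps a NULL test for IS_ERR(): dqget() reports failure through an ERR_PTR-encoded pointer, which is non-NULL, so the old check treated an error as a usable dquot. (The gfs2 hunks further down rely on the same convention via PTR_ERR_OR_ZERO().) A user-space re-creation of the convention, assuming the kernel's usual MAX_ERRNO of 4095, showing why the NULL check passes on an error pointer:

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

/* Simplified equivalents of the kernel's ERR_PTR()/PTR_ERR()/IS_ERR(). */
static void *ERR_PTR(long error)        { return (void *)error; }
static long  PTR_ERR(const void *ptr)   { return (long)ptr; }
static int   IS_ERR(const void *ptr)
{
    return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Stand-in for dqget(): fails with an encoded errno, never returns NULL. */
static void *fake_dqget(void)
{
    return ERR_PTR(-ESRCH);
}

int main(void)
{
    void *dq = fake_dqget();

    if (dq)            /* old, buggy check: error pointers are non-NULL */
        printf("NULL check: looks valid (wrong)\n");
    if (IS_ERR(dq))    /* fixed check */
        printf("IS_ERR check: error %ld\n", PTR_ERR(dq));
    return 0;
}
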
index eeeade7..c1ab3ec 100644 (file)
@@ -1266,6 +1266,7 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
 static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
 {
        int order = 1;
+       int bb_incr = 1 << (e4b->bd_blkbits - 1);
        void *bb;
 
        BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
@@ -1278,7 +1279,8 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
                        /* this block is part of buddy of order 'order' */
                        return order;
                }
-               bb += 1 << (e4b->bd_blkbits - order);
+               bb += bb_incr;
+               bb_incr >>= 1;
                order++;
        }
        return 0;
@@ -2583,7 +2585,7 @@ int ext4_mb_init(struct super_block *sb)
 {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        unsigned i, j;
-       unsigned offset;
+       unsigned offset, offset_incr;
        unsigned max;
        int ret;
 
@@ -2612,11 +2614,13 @@ int ext4_mb_init(struct super_block *sb)
 
        i = 1;
        offset = 0;
+       offset_incr = 1 << (sb->s_blocksize_bits - 1);
        max = sb->s_blocksize << 2;
        do {
                sbi->s_mb_offsets[i] = offset;
                sbi->s_mb_maxs[i] = max;
-               offset += 1 << (sb->s_blocksize_bits - i);
+               offset += offset_incr;
+               offset_incr = offset_incr >> 1;
                max = max >> 1;
                i++;
        } while (i <= sb->s_blocksize_bits + 1);
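
Both mballoc hunks above replace an expression of the form 1 << (bits - i) with an increment that starts at 1 << (bits - 1) and is halved on every pass. The two produce the same offsets for every useful iteration; the apparent point of the rewrite (presumably an undefined-behaviour report, though the hunks alone do not say) is that the old form ends up shifting by a negative count once i exceeds bits, which is undefined in C, while the new form simply decays to zero. A small demonstration:

#include <stdio.h>

int main(void)
{
    const unsigned bits = 12;             /* e.g. 4 KiB blocks => blkbits == 12 */
    unsigned incr = 1u << (bits - 1);

    for (unsigned i = 1; i <= bits + 1; i++) {
        unsigned new_way = incr;
        incr >>= 1;

        if (i <= bits) {
            unsigned old_way = 1u << (bits - i);   /* fine while bits - i >= 0 */
            printf("i=%2u old=%4u new=%4u%s\n", i, old_way, new_way,
                   old_way == new_way ? "" : "  MISMATCH");
        } else {
            /* The old code would evaluate 1 << (bits - i) here, a shift by
             * a negative count and therefore undefined behaviour; the new
             * code just keeps halving and reaches 0. */
            printf("i=%2u old=UB   new=%4u\n", i, new_way);
        }
    }
    return 0;
}
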
@@ -4935,7 +4939,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
         * boundary.
         */
        if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
-               ext4_warning(sb, "too much blocks added to group %u\n",
+               ext4_warning(sb, "too much blocks added to group %u",
                             block_group);
                err = -EINVAL;
                goto error_return;
index 2444527..23d436d 100644 (file)
@@ -121,7 +121,7 @@ void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
        __ext4_warning(sb, function, line, "%s", msg);
        __ext4_warning(sb, function, line,
                       "MMP failure info: last update time: %llu, last update "
-                      "node: %s, last update device: %s\n",
+                      "node: %s, last update device: %s",
                       (long long unsigned int) le64_to_cpu(mmp->mmp_time),
                       mmp->mmp_nodename, mmp->mmp_bdevname);
 }
@@ -353,7 +353,7 @@ skip:
         * wait for MMP interval and check mmp_seq.
         */
        if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
-               ext4_warning(sb, "MMP startup interrupted, failing mount\n");
+               ext4_warning(sb, "MMP startup interrupted, failing mount");
                goto failed;
        }
 
index 325cef4..a920c5d 100644 (file)
@@ -400,7 +400,7 @@ data_copy:
 
        /* Even in case of data=writeback it is reasonable to pin
         * inode to transaction, to prevent unexpected data loss */
-       *err = ext4_jbd2_file_inode(handle, orig_inode);
+       *err = ext4_jbd2_inode_add_write(handle, orig_inode);
 
 unlock_pages:
        unlock_page(pagep[0]);
index 5611ec9..ec4c399 100644 (file)
@@ -1107,6 +1107,11 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
        }
 
        while (1) {
+               if (fatal_signal_pending(current)) {
+                       err = -ERESTARTSYS;
+                       goto errout;
+               }
+               cond_resched();
                block = dx_get_block(frame->at);
                ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
                                             start_hash, start_minor_hash);
@@ -1613,7 +1618,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
                        if (nokey)
                                return ERR_PTR(-ENOKEY);
                        ext4_warning(inode->i_sb,
-                                    "Inconsistent encryption contexts: %lu/%lu\n",
+                                    "Inconsistent encryption contexts: %lu/%lu",
                                     (unsigned long) dir->i_ino,
                                     (unsigned long) inode->i_ino);
                        return ERR_PTR(-EPERM);
@@ -2828,7 +2833,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
                         * list entries can cause panics at unmount time.
                         */
                        mutex_lock(&sbi->s_orphan_lock);
-                       list_del(&EXT4_I(inode)->i_orphan);
+                       list_del_init(&EXT4_I(inode)->i_orphan);
                        mutex_unlock(&sbi->s_orphan_lock);
                }
        }
index e4fc8ea..2a01df9 100644 (file)
@@ -342,9 +342,7 @@ void ext4_io_submit(struct ext4_io_submit *io)
        if (bio) {
                int io_op = io->io_wbc->sync_mode == WB_SYNC_ALL ?
                            WRITE_SYNC : WRITE;
-               bio_get(io->io_bio);
                submit_bio(io_op, io->io_bio);
-               bio_put(io->io_bio);
        }
        io->io_bio = NULL;
 }
index 34038e3..cf68100 100644 (file)
@@ -41,7 +41,7 @@ int ext4_resize_begin(struct super_block *sb)
         */
        if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
                ext4_warning(sb, "There are errors in the filesystem, "
-                            "so online resizing is not allowed\n");
+                            "so online resizing is not allowed");
                return -EPERM;
        }
 
index 304c712..3822a5a 100644 (file)
@@ -859,6 +859,7 @@ static void ext4_put_super(struct super_block *sb)
        percpu_counter_destroy(&sbi->s_freeinodes_counter);
        percpu_counter_destroy(&sbi->s_dirs_counter);
        percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+       percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
        brelse(sbi->s_sbh);
 #ifdef CONFIG_QUOTA
        for (i = 0; i < EXT4_MAXQUOTAS; i++)
@@ -3416,16 +3417,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        }
 
        if (sbi->s_mount_opt & EXT4_MOUNT_DAX) {
-               if (blocksize != PAGE_SIZE) {
-                       ext4_msg(sb, KERN_ERR,
-                                       "error: unsupported blocksize for dax");
-                       goto failed_mount;
-               }
-               if (!sb->s_bdev->bd_disk->fops->direct_access) {
-                       ext4_msg(sb, KERN_ERR,
-                                       "error: device does not support dax");
+               err = bdev_dax_supported(sb, blocksize);
+               if (err)
                        goto failed_mount;
-               }
        }
 
        if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) {
@@ -3930,6 +3924,9 @@ no_journal:
        if (!err)
                err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
                                          GFP_KERNEL);
+       if (!err)
+               err = percpu_init_rwsem(&sbi->s_journal_flag_rwsem);
+
        if (err) {
                ext4_msg(sb, KERN_ERR, "insufficient memory");
                goto failed_mount6;
index 123a7d0..a892111 100644 (file)
@@ -22,10 +22,11 @@ ext4_xattr_security_get(const struct xattr_handler *handler,
 
 static int
 ext4_xattr_security_set(const struct xattr_handler *handler,
-                       struct dentry *dentry, const char *name,
-                       const void *value, size_t size, int flags)
+                       struct dentry *unused, struct inode *inode,
+                       const char *name, const void *value,
+                       size_t size, int flags)
 {
-       return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY,
+       return ext4_xattr_set(inode, EXT4_XATTR_INDEX_SECURITY,
                              name, value, size, flags);
 }
 
index 60652fa..c7765c7 100644 (file)
@@ -29,10 +29,11 @@ ext4_xattr_trusted_get(const struct xattr_handler *handler,
 
 static int
 ext4_xattr_trusted_set(const struct xattr_handler *handler,
-                      struct dentry *dentry, const char *name,
-                      const void *value, size_t size, int flags)
+                      struct dentry *unused, struct inode *inode,
+                      const char *name, const void *value,
+                      size_t size, int flags)
 {
-       return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED,
+       return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED,
                              name, value, size, flags);
 }
 
index 17a446f..ca20e42 100644 (file)
@@ -30,12 +30,13 @@ ext4_xattr_user_get(const struct xattr_handler *handler,
 
 static int
 ext4_xattr_user_set(const struct xattr_handler *handler,
-                   struct dentry *dentry, const char *name,
-                   const void *value, size_t size, int flags)
+                   struct dentry *unused, struct inode *inode,
+                   const char *name, const void *value,
+                   size_t size, int flags)
 {
-       if (!test_opt(dentry->d_sb, XATTR_USER))
+       if (!test_opt(inode->i_sb, XATTR_USER))
                return -EOPNOTSUPP;
-       return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_USER,
+       return ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER,
                              name, value, size, flags);
 }
 
index 00ea567..e3decae 100644 (file)
@@ -50,10 +50,11 @@ static int f2fs_xattr_generic_get(const struct xattr_handler *handler,
 }
 
 static int f2fs_xattr_generic_set(const struct xattr_handler *handler,
-               struct dentry *dentry, const char *name, const void *value,
+               struct dentry *unused, struct inode *inode,
+               const char *name, const void *value,
                size_t size, int flags)
 {
-       struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
+       struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 
        switch (handler->flags) {
        case F2FS_XATTR_INDEX_USER:
@@ -69,7 +70,7 @@ static int f2fs_xattr_generic_set(const struct xattr_handler *handler,
        default:
                return -EINVAL;
        }
-       return f2fs_setxattr(d_inode(dentry), handler->flags, name,
+       return f2fs_setxattr(inode, handler->flags, name,
                                        value, size, NULL, flags);
 }
 
@@ -95,11 +96,10 @@ static int f2fs_xattr_advise_get(const struct xattr_handler *handler,
 }
 
 static int f2fs_xattr_advise_set(const struct xattr_handler *handler,
-               struct dentry *dentry, const char *name, const void *value,
+               struct dentry *unused, struct inode *inode,
+               const char *name, const void *value,
                size_t size, int flags)
 {
-       struct inode *inode = d_inode(dentry);
-
        if (!inode_owner_or_capable(inode))
                return -EPERM;
        if (value == NULL)
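
The xattr hunks above (ext4's security/trusted/user handlers and f2fs's) and most of the per-filesystem hunks that follow are a mechanical conversion: set handlers now receive the inode directly instead of deriving it from a dentry, and the dentry argument is kept but unused. Nothing changes for user space, which still reaches these handlers through the xattr syscalls. A small exercise of that interface via setxattr(2)/getxattr(2); the file name is a placeholder, and user.* attributes need the filesystem's user_xattr support (cf. the XATTR_USER check in the ext4 handler above):

#include <stdio.h>
#include <sys/types.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
    const char *path = argc > 1 ? argv[1] : "testfile";
    const char *name = "user.comment";    /* served by the user.* xattr handler */
    char buf[64];
    ssize_t len;

    if (setxattr(path, name, "hello", 5, 0) != 0) {
        perror("setxattr");
        return 1;
    }
    len = getxattr(path, name, buf, sizeof(buf) - 1);
    if (len < 0) {
        perror("getxattr");
        return 1;
    }
    buf[len] = '\0';
    printf("%s = %s\n", name, buf);
    return 0;
}
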
index 3078b67..c8c4f79 100644 (file)
@@ -887,6 +887,8 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
                        put_page(results[i]);
        }
 
+       wake_up_bit(&cookie->flags, 0);
+
        _leave("");
 }
 
index b941905..ccd4971 100644 (file)
@@ -1719,10 +1719,10 @@ static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry,
        return fuse_update_attributes(inode, stat, NULL, NULL);
 }
 
-static int fuse_setxattr(struct dentry *entry, const char *name,
-                        const void *value, size_t size, int flags)
+static int fuse_setxattr(struct dentry *unused, struct inode *inode,
+                        const char *name, const void *value,
+                        size_t size, int flags)
 {
-       struct inode *inode = d_inode(entry);
        struct fuse_conn *fc = get_fuse_conn(inode);
        FUSE_ARGS(args);
        struct fuse_setxattr_in inarg;
index 4a01f30..271d939 100644 (file)
@@ -783,12 +783,15 @@ static int get_leaf_nr(struct gfs2_inode *dip, u32 index,
                       u64 *leaf_out)
 {
        __be64 *hash;
+       int error;
 
        hash = gfs2_dir_get_hash_table(dip);
-       if (IS_ERR(hash))
-               return PTR_ERR(hash);
-       *leaf_out = be64_to_cpu(*(hash + index));
-       return 0;
+       error = PTR_ERR_OR_ZERO(hash);
+
+       if (!error)
+               *leaf_out = be64_to_cpu(*(hash + index));
+
+       return error;
 }
 
 static int get_first_leaf(struct gfs2_inode *dip, u32 index,
@@ -798,7 +801,7 @@ static int get_first_leaf(struct gfs2_inode *dip, u32 index,
        int error;
 
        error = get_leaf_nr(dip, index, &leaf_no);
-       if (!IS_ERR_VALUE(error))
+       if (!error)
                error = get_leaf(dip, leaf_no, bh_out);
 
        return error;
@@ -1014,7 +1017,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
 
        index = name->hash >> (32 - dip->i_depth);
        error = get_leaf_nr(dip, index, &leaf_no);
-       if (IS_ERR_VALUE(error))
+       if (error)
                return error;
 
        /*  Get the old leaf block  */
index f42ab53..3a28535 100644 (file)
@@ -1251,10 +1251,10 @@ int __gfs2_xattr_set(struct inode *inode, const char *name,
 }
 
 static int gfs2_xattr_set(const struct xattr_handler *handler,
-                         struct dentry *dentry, const char *name,
-                         const void *value, size_t size, int flags)
+                         struct dentry *unused, struct inode *inode,
+                         const char *name, const void *value,
+                         size_t size, int flags)
 {
-       struct inode *inode = d_inode(dentry);
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_holder gh;
        int ret;
index 064f92f..d9a8691 100644 (file)
 #include "hfs_fs.h"
 #include "btree.h"
 
-int hfs_setxattr(struct dentry *dentry, const char *name,
-                const void *value, size_t size, int flags)
+int hfs_setxattr(struct dentry *unused, struct inode *inode,
+                const char *name, const void *value,
+                size_t size, int flags)
 {
-       struct inode *inode = d_inode(dentry);
        struct hfs_find_data fd;
        hfs_cat_rec rec;
        struct hfs_cat_file *file;
index fa3eed8..ee2f385 100644 (file)
@@ -212,7 +212,7 @@ extern void hfs_evict_inode(struct inode *);
 extern void hfs_delete_inode(struct inode *);
 
 /* attr.c */
-extern int hfs_setxattr(struct dentry *dentry, const char *name,
+extern int hfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name,
                        const void *value, size_t size, int flags);
 extern ssize_t hfs_getxattr(struct dentry *dentry, struct inode *inode,
                            const char *name, void *value, size_t size);
index 4f118d2..d37bb88 100644 (file)
@@ -424,7 +424,7 @@ static int copy_name(char *buffer, const char *xattr_name, int name_len)
        return len;
 }
 
-int hfsplus_setxattr(struct dentry *dentry, const char *name,
+int hfsplus_setxattr(struct inode *inode, const char *name,
                     const void *value, size_t size, int flags,
                     const char *prefix, size_t prefixlen)
 {
@@ -437,8 +437,7 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name,
                return -ENOMEM;
        strcpy(xattr_name, prefix);
        strcpy(xattr_name + prefixlen, name);
-       res = __hfsplus_setxattr(d_inode(dentry), xattr_name, value, size,
-                                flags);
+       res = __hfsplus_setxattr(inode, xattr_name, value, size, flags);
        kfree(xattr_name);
        return res;
 }
@@ -864,8 +863,9 @@ static int hfsplus_osx_getxattr(const struct xattr_handler *handler,
 }
 
 static int hfsplus_osx_setxattr(const struct xattr_handler *handler,
-                               struct dentry *dentry, const char *name,
-                               const void *buffer, size_t size, int flags)
+                               struct dentry *unused, struct inode *inode,
+                               const char *name, const void *buffer,
+                               size_t size, int flags)
 {
        /*
         * Don't allow setting properly prefixed attributes
@@ -880,7 +880,7 @@ static int hfsplus_osx_setxattr(const struct xattr_handler *handler,
         * creates), so we pass the name through unmodified (after
         * ensuring it doesn't conflict with another namespace).
         */
-       return __hfsplus_setxattr(d_inode(dentry), name, buffer, size, flags);
+       return __hfsplus_setxattr(inode, name, buffer, size, flags);
 }
 
 const struct xattr_handler hfsplus_xattr_osx_handler = {
index d04ba6f..68f6b53 100644 (file)
@@ -21,7 +21,7 @@ extern const struct xattr_handler *hfsplus_xattr_handlers[];
 int __hfsplus_setxattr(struct inode *inode, const char *name,
                        const void *value, size_t size, int flags);
 
-int hfsplus_setxattr(struct dentry *dentry, const char *name,
+int hfsplus_setxattr(struct inode *inode, const char *name,
                                   const void *value, size_t size, int flags,
                                   const char *prefix, size_t prefixlen);
 
index ae2ca8c..37b3efa 100644 (file)
@@ -23,10 +23,11 @@ static int hfsplus_security_getxattr(const struct xattr_handler *handler,
 }
 
 static int hfsplus_security_setxattr(const struct xattr_handler *handler,
-                                    struct dentry *dentry, const char *name,
-                                    const void *buffer, size_t size, int flags)
+                                    struct dentry *unused, struct inode *inode,
+                                    const char *name, const void *buffer,
+                                    size_t size, int flags)
 {
-       return hfsplus_setxattr(dentry, name, buffer, size, flags,
+       return hfsplus_setxattr(inode, name, buffer, size, flags,
                                XATTR_SECURITY_PREFIX,
                                XATTR_SECURITY_PREFIX_LEN);
 }
index eae2947..94519d6 100644 (file)
@@ -21,10 +21,11 @@ static int hfsplus_trusted_getxattr(const struct xattr_handler *handler,
 }
 
 static int hfsplus_trusted_setxattr(const struct xattr_handler *handler,
-                                   struct dentry *dentry, const char *name,
-                                   const void *buffer, size_t size, int flags)
+                                   struct dentry *unused, struct inode *inode,
+                                   const char *name, const void *buffer,
+                                   size_t size, int flags)
 {
-       return hfsplus_setxattr(dentry, name, buffer, size, flags,
+       return hfsplus_setxattr(inode, name, buffer, size, flags,
                                XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
 }
 
index 3c9eec3..fae6c0e 100644 (file)
@@ -21,10 +21,11 @@ static int hfsplus_user_getxattr(const struct xattr_handler *handler,
 }
 
 static int hfsplus_user_setxattr(const struct xattr_handler *handler,
-                                struct dentry *dentry, const char *name,
-                                const void *buffer, size_t size, int flags)
+                                struct dentry *unused, struct inode *inode,
+                                const char *name, const void *buffer,
+                                size_t size, int flags)
 {
-       return hfsplus_setxattr(dentry, name, buffer, size, flags,
+       return hfsplus_setxattr(inode, name, buffer, size, flags,
                                XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
 }
 
index 458cf46..82067ca 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/sched.h>
 #include <linux/bitmap.h>
 #include <linux/slab.h>
+#include <linux/seq_file.h>
 
 /* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */
 
@@ -453,10 +454,6 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
        int lowercase, eas, chk, errs, chkdsk, timeshift;
        int o;
        struct hpfs_sb_info *sbi = hpfs_sb(s);
-       char *new_opts = kstrdup(data, GFP_KERNEL);
-
-       if (!new_opts)
-               return -ENOMEM;
 
        sync_filesystem(s);
 
@@ -493,17 +490,44 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
 
        if (!(*flags & MS_RDONLY)) mark_dirty(s, 1);
 
-       replace_mount_options(s, new_opts);
-
        hpfs_unlock(s);
        return 0;
 
 out_err:
        hpfs_unlock(s);
-       kfree(new_opts);
        return -EINVAL;
 }
 
+static int hpfs_show_options(struct seq_file *seq, struct dentry *root)
+{
+       struct hpfs_sb_info *sbi = hpfs_sb(root->d_sb);
+
+       seq_printf(seq, ",uid=%u", from_kuid_munged(&init_user_ns, sbi->sb_uid));
+       seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, sbi->sb_gid));
+       seq_printf(seq, ",umask=%03o", (~sbi->sb_mode & 0777));
+       if (sbi->sb_lowercase)
+               seq_printf(seq, ",case=lower");
+       if (!sbi->sb_chk)
+               seq_printf(seq, ",check=none");
+       if (sbi->sb_chk == 2)
+               seq_printf(seq, ",check=strict");
+       if (!sbi->sb_err)
+               seq_printf(seq, ",errors=continue");
+       if (sbi->sb_err == 2)
+               seq_printf(seq, ",errors=panic");
+       if (!sbi->sb_chkdsk)
+               seq_printf(seq, ",chkdsk=no");
+       if (sbi->sb_chkdsk == 2)
+               seq_printf(seq, ",chkdsk=always");
+       if (!sbi->sb_eas)
+               seq_printf(seq, ",eas=no");
+       if (sbi->sb_eas == 1)
+               seq_printf(seq, ",eas=ro");
+       if (sbi->sb_timeshift)
+               seq_printf(seq, ",timeshift=%d", sbi->sb_timeshift);
+       return 0;
+}
+
 /* Super operations */
 
 static const struct super_operations hpfs_sops =
@@ -514,7 +538,7 @@ static const struct super_operations hpfs_sops =
        .put_super      = hpfs_put_super,
        .statfs         = hpfs_statfs,
        .remount_fs     = hpfs_remount_fs,
-       .show_options   = generic_show_options,
+       .show_options   = hpfs_show_options,
 };
 
 static int hpfs_fill_super(struct super_block *s, void *options, int silent)
@@ -537,8 +561,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
 
        int o;
 
-       save_mount_options(s, options);
-
        sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
        if (!sbi) {
                return -ENOMEM;
index 2ad98d6..7007809 100644 (file)
@@ -219,6 +219,8 @@ static int journal_submit_data_buffers(journal_t *journal,
 
        spin_lock(&journal->j_list_lock);
        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+               if (!(jinode->i_flags & JI_WRITE_DATA))
+                       continue;
                mapping = jinode->i_vfs_inode->i_mapping;
                jinode->i_flags |= JI_COMMIT_RUNNING;
                spin_unlock(&journal->j_list_lock);
@@ -256,6 +258,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
        /* For locking, see the comment in journal_submit_data_buffers() */
        spin_lock(&journal->j_list_lock);
        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+               if (!(jinode->i_flags & JI_WAIT_DATA))
+                       continue;
                jinode->i_flags |= JI_COMMIT_RUNNING;
                spin_unlock(&journal->j_list_lock);
                err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
index 435f0b2..b31852f 100644 (file)
@@ -94,7 +94,8 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
 EXPORT_SYMBOL(jbd2_journal_invalidatepage);
 EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
 EXPORT_SYMBOL(jbd2_journal_force_commit);
-EXPORT_SYMBOL(jbd2_journal_file_inode);
+EXPORT_SYMBOL(jbd2_journal_inode_add_write);
+EXPORT_SYMBOL(jbd2_journal_inode_add_wait);
 EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
 EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
 EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
index 2c56c3e..1749519 100644 (file)
@@ -2462,7 +2462,8 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
 /*
  * File inode in the inode list of the handle's transaction
  */
-int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
+static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode,
+                                  unsigned long flags)
 {
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal;
@@ -2487,12 +2488,14 @@ int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
         * and if jinode->i_next_transaction == transaction, commit code
         * will only file the inode where we want it.
         */
-       if (jinode->i_transaction == transaction ||
-           jinode->i_next_transaction == transaction)
+       if ((jinode->i_transaction == transaction ||
+           jinode->i_next_transaction == transaction) &&
+           (jinode->i_flags & flags) == flags)
                return 0;
 
        spin_lock(&journal->j_list_lock);
-
+       jinode->i_flags |= flags;
+       /* Is inode already attached where we need it? */
        if (jinode->i_transaction == transaction ||
            jinode->i_next_transaction == transaction)
                goto done;
@@ -2523,6 +2526,17 @@ done:
        return 0;
 }
 
+int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *jinode)
+{
+       return jbd2_journal_file_inode(handle, jinode,
+                                      JI_WRITE_DATA | JI_WAIT_DATA);
+}
+
+int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *jinode)
+{
+       return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA);
+}
+
 /*
  * File truncate and transaction commit interact with each other in a
  * non-trivial way.  If a transaction writing data block A is
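
The two new wrappers differ only in the JI_* flags they file the inode with: jbd2_journal_inode_add_write() asks the committing transaction both to submit and to wait on the inode's data, while jbd2_journal_inode_add_wait() asks it only to wait, for the case where the filesystem has already submitted the data itself (the EXT4_GET_BLOCKS_IO_SUBMIT path earlier in this series). A toy model of how the two commit-time loops filter on those flags; flag values and names here are illustrative, not the kernel's:

#include <stdio.h>

#define JI_WRITE_DATA  (1 << 0)
#define JI_WAIT_DATA   (1 << 1)

struct jinode {
    const char *name;
    unsigned flags;
};

int main(void)
{
    /* One inode filed via add_write(), one via add_wait() only. */
    struct jinode list[] = {
        { "inode A (add_write)", JI_WRITE_DATA | JI_WAIT_DATA },
        { "inode B (add_wait)",  JI_WAIT_DATA },
    };
    const unsigned n = sizeof(list) / sizeof(list[0]);

    /* Phase 1: submit data buffers, as in journal_submit_data_buffers(). */
    for (unsigned i = 0; i < n; i++) {
        if (!(list[i].flags & JI_WRITE_DATA))
            continue;   /* B is skipped: its data was written by the fs itself */
        printf("submit data for %s\n", list[i].name);
    }

    /* Phase 2: wait for the data, as in journal_finish_inode_data_buffers(). */
    for (unsigned i = 0; i < n; i++) {
        if (!(list[i].flags & JI_WAIT_DATA))
            continue;
        printf("wait for data of %s\n", list[i].name);
    }
    return 0;
}
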
index 3ed9a4b..c2332e3 100644 (file)
@@ -57,10 +57,11 @@ static int jffs2_security_getxattr(const struct xattr_handler *handler,
 }
 
 static int jffs2_security_setxattr(const struct xattr_handler *handler,
-                                  struct dentry *dentry, const char *name,
-                                  const void *buffer, size_t size, int flags)
+                                  struct dentry *unused, struct inode *inode,
+                                  const char *name, const void *buffer,
+                                  size_t size, int flags)
 {
-       return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY,
+       return do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY,
                                 name, buffer, size, flags);
 }
 
index 4ebecff..5d60308 100644 (file)
@@ -25,10 +25,11 @@ static int jffs2_trusted_getxattr(const struct xattr_handler *handler,
 }
 
 static int jffs2_trusted_setxattr(const struct xattr_handler *handler,
-                                 struct dentry *dentry, const char *name,
-                                 const void *buffer, size_t size, int flags)
+                                 struct dentry *unused, struct inode *inode,
+                                 const char *name, const void *buffer,
+                                 size_t size, int flags)
 {
-       return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED,
+       return do_jffs2_setxattr(inode, JFFS2_XPREFIX_TRUSTED,
                                 name, buffer, size, flags);
 }
 
index bce249e..9d027b4 100644 (file)
@@ -25,10 +25,11 @@ static int jffs2_user_getxattr(const struct xattr_handler *handler,
 }
 
 static int jffs2_user_setxattr(const struct xattr_handler *handler,
-                              struct dentry *dentry, const char *name,
-                              const void *buffer, size_t size, int flags)
+                              struct dentry *unused, struct inode *inode,
+                              const char *name, const void *buffer,
+                              size_t size, int flags)
 {
-       return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_USER,
+       return do_jffs2_setxattr(inode, JFFS2_XPREFIX_USER,
                                 name, buffer, size, flags);
 }
 
index beb182b..0bf3c33 100644 (file)
@@ -943,11 +943,10 @@ static int jfs_xattr_get(const struct xattr_handler *handler,
 }
 
 static int jfs_xattr_set(const struct xattr_handler *handler,
-                        struct dentry *dentry, const char *name,
-                        const void *value, size_t size, int flags)
+                        struct dentry *unused, struct inode *inode,
+                        const char *name, const void *value,
+                        size_t size, int flags)
 {
-       struct inode *inode = d_inode(dentry);
-
        name = xattr_full_name(handler, name);
        return __jfs_xattr_set(inode, name, value, size, flags);
 }
@@ -962,11 +961,10 @@ static int jfs_xattr_get_os2(const struct xattr_handler *handler,
 }
 
 static int jfs_xattr_set_os2(const struct xattr_handler *handler,
-                            struct dentry *dentry, const char *name,
-                            const void *value, size_t size, int flags)
+                            struct dentry *unused, struct inode *inode,
+                            const char *name, const void *value,
+                            size_t size, int flags)
 {
-       struct inode *inode = d_inode(dentry);
-
        if (is_known_namespace(name))
                return -EOPNOTSUPP;
        return __jfs_xattr_set(inode, name, value, size, flags);
index 1719649..63b925d 100644 (file)
@@ -160,10 +160,11 @@ static int kernfs_node_setsecdata(struct kernfs_node *kn, void **secdata,
        return 0;
 }
 
-int kernfs_iop_setxattr(struct dentry *dentry, const char *name,
-                       const void *value, size_t size, int flags)
+int kernfs_iop_setxattr(struct dentry *unused, struct inode *inode,
+                       const char *name, const void *value,
+                       size_t size, int flags)
 {
-       struct kernfs_node *kn = dentry->d_fsdata;
+       struct kernfs_node *kn = inode->i_private;
        struct kernfs_iattrs *attrs;
        void *secdata;
        int error;
@@ -175,11 +176,11 @@ int kernfs_iop_setxattr(struct dentry *dentry, const char *name,
 
        if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) {
                const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
-               error = security_inode_setsecurity(d_inode(dentry), suffix,
+               error = security_inode_setsecurity(inode, suffix,
                                                value, size, flags);
                if (error)
                        return error;
-               error = security_inode_getsecctx(d_inode(dentry),
+               error = security_inode_getsecctx(inode,
                                                &secdata, &secdata_len);
                if (error)
                        return error;
index 45c9192..3715923 100644 (file)
@@ -81,7 +81,8 @@ int kernfs_iop_permission(struct inode *inode, int mask);
 int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr);
 int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry,
                       struct kstat *stat);
-int kernfs_iop_setxattr(struct dentry *dentry, const char *name, const void *value,
+int kernfs_iop_setxattr(struct dentry *dentry, struct inode *inode,
+                       const char *name, const void *value,
                        size_t size, int flags);
 int kernfs_iop_removexattr(struct dentry *dentry, const char *name);
 ssize_t kernfs_iop_getxattr(struct dentry *dentry, struct inode *inode,
index 8765ff1..3db2721 100644 (file)
@@ -1118,8 +1118,9 @@ static int empty_dir_setattr(struct dentry *dentry, struct iattr *attr)
        return -EPERM;
 }
 
-static int empty_dir_setxattr(struct dentry *dentry, const char *name,
-                             const void *value, size_t size, int flags)
+static int empty_dir_setxattr(struct dentry *dentry, struct inode *inode,
+                             const char *name, const void *value,
+                             size_t size, int flags)
 {
        return -EOPNOTSUPP;
 }
index 5375571..6a82fb7 100644 (file)
@@ -35,6 +35,7 @@
 #include <linux/fs_struct.h>
 #include <linux/posix_acl.h>
 #include <linux/hash.h>
+#include <linux/bitops.h>
 #include <asm/uaccess.h>
 
 #include "internal.h"
@@ -1415,21 +1416,28 @@ static void follow_mount(struct path *path)
        }
 }
 
+static int path_parent_directory(struct path *path)
+{
+       struct dentry *old = path->dentry;
+       /* rare case of legitimate dget_parent()... */
+       path->dentry = dget_parent(path->dentry);
+       dput(old);
+       if (unlikely(!path_connected(path)))
+               return -ENOENT;
+       return 0;
+}
+
 static int follow_dotdot(struct nameidata *nd)
 {
        while(1) {
-               struct dentry *old = nd->path.dentry;
-
                if (nd->path.dentry == nd->root.dentry &&
                    nd->path.mnt == nd->root.mnt) {
                        break;
                }
                if (nd->path.dentry != nd->path.mnt->mnt_root) {
-                       /* rare case of legitimate dget_parent()... */
-                       nd->path.dentry = dget_parent(nd->path.dentry);
-                       dput(old);
-                       if (unlikely(!path_connected(&nd->path)))
-                               return -ENOENT;
+                       int ret = path_parent_directory(&nd->path);
+                       if (ret)
+                               return ret;
                        break;
                }
                if (!follow_up(&nd->path))
@@ -1797,74 +1805,144 @@ static int walk_component(struct nameidata *nd, int flags)
 
 #include <asm/word-at-a-time.h>
 
-#ifdef CONFIG_64BIT
+#ifdef HASH_MIX
 
-static inline unsigned int fold_hash(unsigned long hash)
-{
-       return hash_64(hash, 32);
-}
+/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */
+
+#elif defined(CONFIG_64BIT)
+/*
+ * Register pressure in the mixing function is an issue, particularly
+ * on 32-bit x86, but almost any function requires one state value and
+ * one temporary.  Instead, use a function designed for two state values
+ * and no temporaries.
+ *
+ * This function cannot create a collision in only two iterations, so
+ * we have two iterations to achieve avalanche.  In those two iterations,
+ * we have six layers of mixing, which is enough to spread one bit's
+ * influence out to 2^6 = 64 state bits.
+ *
+ * Rotate constants are scored by considering either 64 one-bit input
+ * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
+ * probability of that delta causing a change to each of the 128 output
+ * bits, using a sample of random initial states.
+ *
+ * The Shannon entropy of the computed probabilities is then summed
+ * to produce a score.  Ideally, any input change has a 50% chance of
+ * toggling any given output bit.
+ *
+ * Mixing scores (in bits) for (12,45):
+ * Input delta: 1-bit      2-bit
+ * 1 round:     713.3    42542.6
+ * 2 rounds:   2753.7   140389.8
+ * 3 rounds:   5954.1   233458.2
+ * 4 rounds:   7862.6   256672.2
+ * Perfect:    8192     258048
+ *            (64*128) (64*63/2 * 128)
+ */
+#define HASH_MIX(x, y, a)      \
+       (       x ^= (a),       \
+       y ^= x, x = rol64(x,12),\
+       x += y, y = rol64(y,45),\
+       y *= 9                  )
 
 /*
- * This is George Marsaglia's XORSHIFT generator.
- * It implements a maximum-period LFSR in only a few
- * instructions.  It also has the property (required
- * by hash_name()) that mix_hash(0) = 0.
+ * Fold two longs into one 32-bit hash value.  This must be fast, but
+ * latency isn't quite as critical, as there is a fair bit of additional
+ * work done before the hash value is used.
  */
-static inline unsigned long mix_hash(unsigned long hash)
+static inline unsigned int fold_hash(unsigned long x, unsigned long y)
 {
-       hash ^= hash << 13;
-       hash ^= hash >> 7;
-       hash ^= hash << 17;
-       return hash;
+       y ^= x * GOLDEN_RATIO_64;
+       y *= GOLDEN_RATIO_64;
+       return y >> 32;
 }
 
 #else  /* 32-bit case */
 
-#define fold_hash(x) (x)
+/*
+ * Mixing scores (in bits) for (7,20):
+ * Input delta: 1-bit      2-bit
+ * 1 round:     330.3     9201.6
+ * 2 rounds:   1246.4    25475.4
+ * 3 rounds:   1907.1    31295.1
+ * 4 rounds:   2042.3    31718.6
+ * Perfect:    2048      31744
+ *            (32*64)   (32*31/2 * 64)
+ */
+#define HASH_MIX(x, y, a)      \
+       (       x ^= (a),       \
+       y ^= x, x = rol32(x, 7),\
+       x += y, y = rol32(y,20),\
+       y *= 9                  )
 
-static inline unsigned long mix_hash(unsigned long hash)
+static inline unsigned int fold_hash(unsigned long x, unsigned long y)
 {
-       hash ^= hash << 13;
-       hash ^= hash >> 17;
-       hash ^= hash << 5;
-       return hash;
+       /* Use arch-optimized multiply if one exists */
+       return __hash_32(y ^ __hash_32(x));
 }
 
 #endif
 
-unsigned int full_name_hash(const unsigned char *name, unsigned int len)
+/*
+ * Return the hash of a string of known length.  This is carefully
+ * designed to match hash_name(), which is the more critical function.
+ * In particular, we must end by hashing a final word containing 0..7
+ * payload bytes, to match the way that hash_name() iterates until it
+ * finds the delimiter after the name.
+ */
+unsigned int full_name_hash(const char *name, unsigned int len)
 {
-       unsigned long a, hash = 0;
+       unsigned long a, x = 0, y = 0;
 
        for (;;) {
+               if (!len)
+                       goto done;
                a = load_unaligned_zeropad(name);
                if (len < sizeof(unsigned long))
                        break;
-               hash = mix_hash(hash + a);
+               HASH_MIX(x, y, a);
                name += sizeof(unsigned long);
                len -= sizeof(unsigned long);
-               if (!len)
-                       goto done;
        }
-       hash += a & bytemask_from_count(len);
+       x ^= a & bytemask_from_count(len);
 done:
-       return fold_hash(hash);
+       return fold_hash(x, y);
 }
 EXPORT_SYMBOL(full_name_hash);
 
+/* Return the "hash_len" (hash and length) of a null-terminated string */
+u64 hashlen_string(const char *name)
+{
+       unsigned long a = 0, x = 0, y = 0, adata, mask, len;
+       const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
+
+       len = -sizeof(unsigned long);
+       do {
+               HASH_MIX(x, y, a);
+               len += sizeof(unsigned long);
+               a = load_unaligned_zeropad(name+len);
+       } while (!has_zero(a, &adata, &constants));
+
+       adata = prep_zero_mask(a, adata, &constants);
+       mask = create_zero_mask(adata);
+       x ^= a & zero_bytemask(mask);
+
+       return hashlen_create(fold_hash(x, y), len + find_zero(mask));
+}
+EXPORT_SYMBOL(hashlen_string);
+
 /*
  * Calculate the length and hash of the path component, and
  * return the "hash_len" as the result.
  */
 static inline u64 hash_name(const char *name)
 {
-       unsigned long a, b, adata, bdata, mask, hash, len;
+       unsigned long a = 0, b, x = 0, y = 0, adata, bdata, mask, len;
        const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
 
-       hash = a = 0;
        len = -sizeof(unsigned long);
        do {
-               hash = mix_hash(hash + a);
+               HASH_MIX(x, y, a);
                len += sizeof(unsigned long);
                a = load_unaligned_zeropad(name+len);
                b = a ^ REPEAT_BYTE('/');
@@ -1872,25 +1950,40 @@ static inline u64 hash_name(const char *name)
 
        adata = prep_zero_mask(a, adata, &constants);
        bdata = prep_zero_mask(b, bdata, &constants);
-
        mask = create_zero_mask(adata | bdata);
+       x ^= a & zero_bytemask(mask);
 
-       hash += a & zero_bytemask(mask);
-       len += find_zero(mask);
-       return hashlen_create(fold_hash(hash), len);
+       return hashlen_create(fold_hash(x, y), len + find_zero(mask));
 }
 
-#else
+#else  /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
 
-unsigned int full_name_hash(const unsigned char *name, unsigned int len)
+/* Return the hash of a string of known length */
+unsigned int full_name_hash(const char *name, unsigned int len)
 {
        unsigned long hash = init_name_hash();
        while (len--)
-               hash = partial_name_hash(*name++, hash);
+               hash = partial_name_hash((unsigned char)*name++, hash);
        return end_name_hash(hash);
 }
 EXPORT_SYMBOL(full_name_hash);
 
+/* Return the "hash_len" (hash and length) of a null-terminated string */
+u64 hashlen_string(const char *name)
+{
+       unsigned long hash = init_name_hash();
+       unsigned long len = 0, c;
+
+       c = (unsigned char)*name;
+       while (c) {
+               len++;
+               hash = partial_name_hash(c, hash);
+               c = (unsigned char)name[len];
+       }
+       return hashlen_create(end_name_hash(hash), len);
+}
+EXPORT_SYMBOL(hashlen_string);
+
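
For readers following the mixing-function comments above, here is a minimal userspace sketch of the same two-register scheme: one HASH_MIX round per 64-bit word, with a fold at the end. It is illustrative only; the GOLDEN_RATIO_64 value is assumed to be the kernel's 0x61C8864680B583EBull, and memcpy-based loads stand in for load_unaligned_zeropad() (little-endian byte order assumed).

#include <stdint.h>
#include <string.h>

#define GOLDEN_RATIO_64 0x61C8864680B583EBull
#define ROL64(v, n)     (((v) << (n)) | ((v) >> (64 - (n))))
#define HASH_MIX(x, y, a) \
        (x ^= (a), y ^= x, x = ROL64(x, 12), x += y, y = ROL64(y, 45), y *= 9)

static uint32_t fold_hash(uint64_t x, uint64_t y)
{
        y ^= x * GOLDEN_RATIO_64;
        y *= GOLDEN_RATIO_64;
        return y >> 32;
}

/* Hash "len" bytes, one 64-bit word per HASH_MIX round. */
uint32_t example_name_hash(const char *name, unsigned int len)
{
        uint64_t a = 0, x = 0, y = 0;

        while (len >= sizeof(a)) {
                memcpy(&a, name, sizeof(a));
                HASH_MIX(x, y, a);
                name += sizeof(a);
                len -= sizeof(a);
        }
        a = 0;
        memcpy(&a, name, len);  /* final 0..7 payload bytes */
        x ^= a;
        return fold_hash(x, y);
}
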
 /*
  * We know there's a real path component here of at least
  * one character.
@@ -1934,7 +2027,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
                int type;
 
                err = may_lookup(nd);
-               if (err)
+               if (err)
                        return err;
 
                hash_len = hash_name(name);
@@ -2428,6 +2521,34 @@ struct dentry *lookup_one_len_unlocked(const char *name,
 }
 EXPORT_SYMBOL(lookup_one_len_unlocked);
 
+#ifdef CONFIG_UNIX98_PTYS
+int path_pts(struct path *path)
+{
+       /* Find something mounted on "pts" in the same directory as
+        * the input path.
+        */
+       struct dentry *child, *parent;
+       struct qstr this;
+       int ret;
+
+       ret = path_parent_directory(path);
+       if (ret)
+               return ret;
+
+       parent = path->dentry;
+       this.name = "pts";
+       this.len = 3;
+       child = d_hash_and_lookup(parent, &this);
+       if (!child)
+               return -ENOENT;
+
+       path->dentry = child;
+       dput(parent);
+       follow_mount(path);
+       return 0;
+}
+#endif
+
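
path_pts() is meant to be handed a path that already sits next to a pts mount, presumably for the devpts/ptmx rework elsewhere in this merge. A hedged caller sketch, with an invented function name, might look like this:

static int example_find_pts_mount(struct file *ptmx_filp, struct path *pts_path)
{
        int err;

        *pts_path = ptmx_filp->f_path;
        path_get(pts_path);

        err = path_pts(pts_path);  /* ".../ptmx" -> whatever is mounted on ".../pts" */
        if (err)
                path_put(pts_path);
        return err;
}
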
 int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
                 struct path *path, int *empty)
 {
@@ -4542,7 +4663,6 @@ int readlink_copy(char __user *buffer, int buflen, const char *link)
 out:
        return len;
 }
-EXPORT_SYMBOL(readlink_copy);
 
 /*
  * A helper for ->readlink().  This should be used *ONLY* for symlinks that
index 618ced3..aaa2e8d 100644 (file)
@@ -217,7 +217,8 @@ static u32 initiate_file_draining(struct nfs_client *clp,
        }
 
        if (pnfs_mark_matching_lsegs_return(lo, &free_me_list,
-                                       &args->cbl_range)) {
+                               &args->cbl_range,
+                               be32_to_cpu(args->cbl_stateid.seqid))) {
                rv = NFS4_OK;
                goto unlock;
        }
@@ -500,8 +501,10 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
        cps->slot = slot;
 
        /* The ca_maxresponsesize_cached is 0 with no DRC */
-       if (args->csa_cachethis != 0)
-               return htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE);
+       if (args->csa_cachethis != 0) {
+               status = htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE);
+               goto out_unlock;
+       }
 
        /*
         * Check for pending referring calls.  If a match is found, a
index 976c906..d81f96a 100644 (file)
@@ -146,10 +146,16 @@ static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
        p = read_buf(xdr, NFS4_STATEID_SIZE);
        if (unlikely(p == NULL))
                return htonl(NFS4ERR_RESOURCE);
-       memcpy(stateid, p, NFS4_STATEID_SIZE);
+       memcpy(stateid->data, p, NFS4_STATEID_SIZE);
        return 0;
 }
 
+static __be32 decode_delegation_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+       stateid->type = NFS4_DELEGATION_STATEID_TYPE;
+       return decode_stateid(xdr, stateid);
+}
+
 static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr)
 {
        __be32 *p;
@@ -211,7 +217,7 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr,
        __be32 *p;
        __be32 status;
 
-       status = decode_stateid(xdr, &args->stateid);
+       status = decode_delegation_stateid(xdr, &args->stateid);
        if (unlikely(status != 0))
                goto out;
        p = read_buf(xdr, 4);
@@ -227,6 +233,11 @@ out:
 }
 
 #if defined(CONFIG_NFS_V4_1)
+static __be32 decode_layout_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+       stateid->type = NFS4_LAYOUT_STATEID_TYPE;
+       return decode_stateid(xdr, stateid);
+}
 
 static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
                                       struct xdr_stream *xdr,
@@ -263,7 +274,7 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
                }
                p = xdr_decode_hyper(p, &args->cbl_range.offset);
                p = xdr_decode_hyper(p, &args->cbl_range.length);
-               status = decode_stateid(xdr, &args->cbl_stateid);
+               status = decode_layout_stateid(xdr, &args->cbl_stateid);
                if (unlikely(status != 0))
                        goto out;
        } else if (args->cbl_recall_type == RETURN_FSID) {
index 5166adc..322c258 100644 (file)
@@ -875,15 +875,16 @@ int nfs_delegations_present(struct nfs_client *clp)
 
 /**
  * nfs4_copy_delegation_stateid - Copy inode's state ID information
- * @dst: stateid data structure to fill in
  * @inode: inode to check
  * @flags: delegation type requirement
+ * @dst: stateid data structure to fill in
+ * @cred: optional argument to retrieve credential
  *
 * Returns "true" and fills in "dst->data" if inode had a delegation,
  * otherwise "false" is returned.
  */
-bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode,
-               fmode_t flags)
+bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags,
+               nfs4_stateid *dst, struct rpc_cred **cred)
 {
        struct nfs_inode *nfsi = NFS_I(inode);
        struct nfs_delegation *delegation;
@@ -896,6 +897,8 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode,
        if (ret) {
                nfs4_stateid_copy(dst, &delegation->stateid);
                nfs_mark_delegation_referenced(delegation);
+               if (cred)
+                       *cred = get_rpccred(delegation->cred);
        }
        rcu_read_unlock();
        return ret;
index 333063e..64724d2 100644 (file)
@@ -56,7 +56,7 @@ void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
 int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync);
 int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid, fmode_t type);
 int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid);
-bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_t flags);
+bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, nfs4_stateid *dst, struct rpc_cred **cred);
 
 void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
 int nfs4_have_delegation(struct inode *inode, fmode_t flags);
index 741a92c..979b3c4 100644 (file)
@@ -87,6 +87,7 @@ struct nfs_direct_req {
        int                     mirror_count;
 
        ssize_t                 count,          /* bytes actually processed */
+                               max_count,      /* max expected count */
                                bytes_left,     /* bytes left to be sent */
                                io_start,       /* start of IO */
                                error;          /* any reported error */
@@ -123,6 +124,8 @@ nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr)
        int i;
        ssize_t count;
 
+       WARN_ON_ONCE(dreq->count >= dreq->max_count);
+
        if (dreq->mirror_count == 1) {
                dreq->mirrors[hdr->pgio_mirror_idx].count += hdr->good_bytes;
                dreq->count += hdr->good_bytes;
@@ -275,7 +278,7 @@ static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
 void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
                              struct nfs_direct_req *dreq)
 {
-       cinfo->lock = &dreq->inode->i_lock;
+       cinfo->inode = dreq->inode;
        cinfo->mds = &dreq->mds_cinfo;
        cinfo->ds = &dreq->ds_cinfo;
        cinfo->dreq = dreq;
@@ -591,7 +594,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
                goto out_unlock;
 
        dreq->inode = inode;
-       dreq->bytes_left = count;
+       dreq->bytes_left = dreq->max_count = count;
        dreq->io_start = iocb->ki_pos;
        dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
        l_ctx = nfs_get_lock_context(dreq->ctx);
@@ -630,13 +633,13 @@ nfs_direct_write_scan_commit_list(struct inode *inode,
                                  struct list_head *list,
                                  struct nfs_commit_info *cinfo)
 {
-       spin_lock(cinfo->lock);
+       spin_lock(&cinfo->inode->i_lock);
 #ifdef CONFIG_NFS_V4_1
        if (cinfo->ds != NULL && cinfo->ds->nwritten != 0)
                NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
 #endif
        nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0);
-       spin_unlock(cinfo->lock);
+       spin_unlock(&cinfo->inode->i_lock);
 }
 
 static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
@@ -671,13 +674,13 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
                if (!nfs_pageio_add_request(&desc, req)) {
                        nfs_list_remove_request(req);
                        nfs_list_add_request(req, &failed);
-                       spin_lock(cinfo.lock);
+                       spin_lock(&cinfo.inode->i_lock);
                        dreq->flags = 0;
                        if (desc.pg_error < 0)
                                dreq->error = desc.pg_error;
                        else
                                dreq->error = -EIO;
-                       spin_unlock(cinfo.lock);
+                       spin_unlock(&cinfo.inode->i_lock);
                }
                nfs_release_request(req);
        }
@@ -1023,7 +1026,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
                goto out_unlock;
 
        dreq->inode = inode;
-       dreq->bytes_left = iov_iter_count(iter);
+       dreq->bytes_left = dreq->max_count = iov_iter_count(iter);
        dreq->io_start = pos;
        dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
        l_ctx = nfs_get_lock_context(dreq->ctx);
index 3384dc8..aa59757 100644 (file)
@@ -795,7 +795,7 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg,
                buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW;
        }
 
-       spin_lock(cinfo->lock);
+       spin_lock(&cinfo->inode->i_lock);
        if (cinfo->ds->nbuckets >= size)
                goto out;
        for (i = 0; i < cinfo->ds->nbuckets; i++) {
@@ -811,7 +811,7 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg,
        swap(cinfo->ds->buckets, buckets);
        cinfo->ds->nbuckets = size;
 out:
-       spin_unlock(cinfo->lock);
+       spin_unlock(&cinfo->inode->i_lock);
        kfree(buckets);
        return 0;
 }
@@ -890,6 +890,7 @@ filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
                                           0,
                                           NFS4_MAX_UINT64,
                                           IOMODE_READ,
+                                          false,
                                           GFP_KERNEL);
                if (IS_ERR(pgio->pg_lseg)) {
                        pgio->pg_error = PTR_ERR(pgio->pg_lseg);
@@ -915,6 +916,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
                                           0,
                                           NFS4_MAX_UINT64,
                                           IOMODE_RW,
+                                          false,
                                           GFP_NOFS);
                if (IS_ERR(pgio->pg_lseg)) {
                        pgio->pg_error = PTR_ERR(pgio->pg_lseg);
index 0cb1abd..0e8018b 100644 (file)
@@ -26,6 +26,8 @@
 
 #define FF_LAYOUT_POLL_RETRY_MAX     (15*HZ)
 
+static struct group_info       *ff_zero_group;
+
 static struct pnfs_layout_hdr *
 ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
 {
@@ -53,14 +55,15 @@ ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo)
        kfree(FF_LAYOUT_FROM_HDR(lo));
 }
 
-static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+static int decode_pnfs_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
 {
        __be32 *p;
 
        p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE);
        if (unlikely(p == NULL))
                return -ENOBUFS;
-       memcpy(stateid, p, NFS4_STATEID_SIZE);
+       stateid->type = NFS4_PNFS_DS_STATEID_TYPE;
+       memcpy(stateid->data, p, NFS4_STATEID_SIZE);
        dprintk("%s: stateid id= [%x%x%x%x]\n", __func__,
                p[0], p[1], p[2], p[3]);
        return 0;
@@ -211,10 +214,16 @@ static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
 
 static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror)
 {
+       struct rpc_cred *cred;
+
        ff_layout_remove_mirror(mirror);
        kfree(mirror->fh_versions);
-       if (mirror->cred)
-               put_rpccred(mirror->cred);
+       cred = rcu_access_pointer(mirror->ro_cred);
+       if (cred)
+               put_rpccred(cred);
+       cred = rcu_access_pointer(mirror->rw_cred);
+       if (cred)
+               put_rpccred(cred);
        nfs4_ff_layout_put_deviceid(mirror->mirror_ds);
        kfree(mirror);
 }
@@ -290,6 +299,8 @@ ff_lseg_merge(struct pnfs_layout_segment *new,
 {
        u64 new_end, old_end;
 
+       if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags))
+               return false;
        if (new->pls_range.iomode != old->pls_range.iomode)
                return false;
        old_end = pnfs_calc_offset_end(old->pls_range.offset,
@@ -310,8 +321,6 @@ ff_lseg_merge(struct pnfs_layout_segment *new,
                        new_end);
        if (test_bit(NFS_LSEG_ROC, &old->pls_flags))
                set_bit(NFS_LSEG_ROC, &new->pls_flags);
-       if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags))
-               set_bit(NFS_LSEG_LAYOUTRETURN, &new->pls_flags);
        return true;
 }
 
@@ -407,8 +416,9 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
                struct nfs4_ff_layout_mirror *mirror;
                struct nfs4_deviceid devid;
                struct nfs4_deviceid_node *idnode;
-               u32 ds_count;
-               u32 fh_count;
+               struct auth_cred acred = { .group_info = ff_zero_group };
+               struct rpc_cred __rcu *cred;
+               u32 ds_count, fh_count, id;
                int j;
 
                rc = -EIO;
@@ -456,7 +466,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
                fls->mirror_array[i]->efficiency = be32_to_cpup(p);
 
                /* stateid */
-               rc = decode_stateid(&stream, &fls->mirror_array[i]->stateid);
+               rc = decode_pnfs_stateid(&stream, &fls->mirror_array[i]->stateid);
                if (rc)
                        goto out_err_free;
 
@@ -484,24 +494,49 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
                fls->mirror_array[i]->fh_versions_cnt = fh_count;
 
                /* user */
-               rc = decode_name(&stream, &fls->mirror_array[i]->uid);
+               rc = decode_name(&stream, &id);
                if (rc)
                        goto out_err_free;
 
+               acred.uid = make_kuid(&init_user_ns, id);
+
                /* group */
-               rc = decode_name(&stream, &fls->mirror_array[i]->gid);
+               rc = decode_name(&stream, &id);
                if (rc)
                        goto out_err_free;
 
+               acred.gid = make_kgid(&init_user_ns, id);
+
+               /* find the cred for it */
+               rcu_assign_pointer(cred, rpc_lookup_generic_cred(&acred, 0, gfp_flags));
+               if (IS_ERR(cred)) {
+                       rc = PTR_ERR(cred);
+                       goto out_err_free;
+               }
+
+               if (lgr->range.iomode == IOMODE_READ)
+                       rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred);
+               else
+                       rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred);
+
                mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]);
                if (mirror != fls->mirror_array[i]) {
+                       /* swap cred ptrs so free_mirror will clean up old */
+                       if (lgr->range.iomode == IOMODE_READ) {
+                               cred = xchg(&mirror->ro_cred, cred);
+                               rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred);
+                       } else {
+                               cred = xchg(&mirror->rw_cred, cred);
+                               rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred);
+                       }
                        ff_layout_free_mirror(fls->mirror_array[i]);
                        fls->mirror_array[i] = mirror;
                }
 
-               dprintk("%s: uid %d gid %d\n", __func__,
-                       fls->mirror_array[i]->uid,
-                       fls->mirror_array[i]->gid);
+               dprintk("%s: iomode %s uid %u gid %u\n", __func__,
+                       lgr->range.iomode == IOMODE_READ ? "READ" : "RW",
+                       from_kuid(&init_user_ns, acred.uid),
+                       from_kgid(&init_user_ns, acred.gid));
        }
 
        p = xdr_inline_decode(&stream, 4);
@@ -745,7 +780,7 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
        else {
                int i;
 
-               spin_lock(cinfo->lock);
+               spin_lock(&cinfo->inode->i_lock);
                if (cinfo->ds->nbuckets != 0)
                        kfree(buckets);
                else {
@@ -759,7 +794,7 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
                                        NFS_INVALID_STABLE_HOW;
                        }
                }
-               spin_unlock(cinfo->lock);
+               spin_unlock(&cinfo->inode->i_lock);
                return 0;
        }
 }
@@ -785,6 +820,36 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
        return NULL;
 }
 
+static void
+ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
+                     struct nfs_page *req,
+                     bool strict_iomode)
+{
+retry_strict:
+       pnfs_put_lseg(pgio->pg_lseg);
+       pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+                                          req->wb_context,
+                                          0,
+                                          NFS4_MAX_UINT64,
+                                          IOMODE_READ,
+                                          strict_iomode,
+                                          GFP_KERNEL);
+       if (IS_ERR(pgio->pg_lseg)) {
+               pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+               pgio->pg_lseg = NULL;
+       }
+
+       /* If we weren't checking the iomode, did get an IOMODE_RW
+        * segment, and the server wants to avoid READs there,
+        * then retry with strict_iomode set.
+        */
+       if (pgio->pg_lseg && !strict_iomode &&
+           ff_layout_avoid_read_on_rw(pgio->pg_lseg)) {
+               strict_iomode = true;
+               goto retry_strict;
+       }
+}
+
 static void
 ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
                        struct nfs_page *req)
@@ -795,26 +860,23 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
        int ds_idx;
 
        /* Use full layout for now */
-       if (!pgio->pg_lseg) {
-               pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
-                                                  req->wb_context,
-                                                  0,
-                                                  NFS4_MAX_UINT64,
-                                                  IOMODE_READ,
-                                                  GFP_KERNEL);
-               if (IS_ERR(pgio->pg_lseg)) {
-                       pgio->pg_error = PTR_ERR(pgio->pg_lseg);
-                       pgio->pg_lseg = NULL;
-                       return;
-               }
-       }
+       if (!pgio->pg_lseg)
+               ff_layout_pg_get_read(pgio, req, false);
+       else if (ff_layout_avoid_read_on_rw(pgio->pg_lseg))
+               ff_layout_pg_get_read(pgio, req, true);
+
        /* If no lseg, fall back to read through mds */
        if (pgio->pg_lseg == NULL)
                goto out_mds;
 
        ds = ff_layout_choose_best_ds_for_read(pgio->pg_lseg, 0, &ds_idx);
-       if (!ds)
-               goto out_mds;
+       if (!ds) {
+               if (ff_layout_no_fallback_to_mds(pgio->pg_lseg))
+                       goto out_pnfs;
+               else
+                       goto out_mds;
+       }
+
        mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
 
        pgio->pg_mirror_idx = ds_idx;
@@ -828,6 +890,12 @@ out_mds:
        pnfs_put_lseg(pgio->pg_lseg);
        pgio->pg_lseg = NULL;
        nfs_pageio_reset_read_mds(pgio);
+       return;
+
+out_pnfs:
+       pnfs_set_lo_fail(pgio->pg_lseg);
+       pnfs_put_lseg(pgio->pg_lseg);
+       pgio->pg_lseg = NULL;
 }
 
 static void
@@ -847,6 +915,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
                                                   0,
                                                   NFS4_MAX_UINT64,
                                                   IOMODE_RW,
+                                                  false,
                                                   GFP_NOFS);
                if (IS_ERR(pgio->pg_lseg)) {
                        pgio->pg_error = PTR_ERR(pgio->pg_lseg);
@@ -870,8 +939,12 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
 
        for (i = 0; i < pgio->pg_mirror_count; i++) {
                ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true);
-               if (!ds)
-                       goto out_mds;
+               if (!ds) {
+                       if (ff_layout_no_fallback_to_mds(pgio->pg_lseg))
+                               goto out_pnfs;
+                       else
+                               goto out_mds;
+               }
                pgm = &pgio->pg_mirrors[i];
                mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
                pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize;
@@ -883,6 +956,12 @@ out_mds:
        pnfs_put_lseg(pgio->pg_lseg);
        pgio->pg_lseg = NULL;
        nfs_pageio_reset_write_mds(pgio);
+       return;
+
+out_pnfs:
+       pnfs_set_lo_fail(pgio->pg_lseg);
+       pnfs_put_lseg(pgio->pg_lseg);
+       pgio->pg_lseg = NULL;
 }
 
 static unsigned int
@@ -895,6 +974,7 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
                                                   0,
                                                   NFS4_MAX_UINT64,
                                                   IOMODE_RW,
+                                                  false,
                                                   GFP_NOFS);
                if (IS_ERR(pgio->pg_lseg)) {
                        pgio->pg_error = PTR_ERR(pgio->pg_lseg);
@@ -1067,8 +1147,7 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
                rpc_wake_up(&tbl->slot_tbl_waitq);
                /* fall through */
        default:
-               if (ff_layout_no_fallback_to_mds(lseg) ||
-                   ff_layout_has_available_ds(lseg))
+               if (ff_layout_avoid_mds_available_ds(lseg))
                        return -NFS4ERR_RESET_TO_PNFS;
 reset:
                dprintk("%s Retry through MDS. Error %d\n", __func__,
@@ -1215,8 +1294,6 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
                                        hdr->pgio_mirror_idx + 1,
                                        &hdr->pgio_mirror_idx))
                        goto out_eagain;
-               set_bit(NFS_LAYOUT_RETURN_REQUESTED,
-                       &hdr->lseg->pls_layout->plh_flags);
                pnfs_read_resend_pnfs(hdr);
                return task->tk_status;
        case -NFS4ERR_RESET_TO_MDS:
@@ -1260,7 +1337,7 @@ ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr)
 }
 
 static bool
-ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
+ff_layout_device_unavailable(struct pnfs_layout_segment *lseg, int idx)
 {
        /* No mirroring for now */
        struct nfs4_deviceid_node *node = FF_LAYOUT_DEVID_NODE(lseg, idx);
@@ -1297,16 +1374,10 @@ static int ff_layout_read_prepare_common(struct rpc_task *task,
                rpc_exit(task, -EIO);
                return -EIO;
        }
-       if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) {
-               dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
-               if (ff_layout_has_available_ds(hdr->lseg))
-                       pnfs_read_resend_pnfs(hdr);
-               else
-                       ff_layout_reset_read(hdr);
-               rpc_exit(task, 0);
+       if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) {
+               rpc_exit(task, -EHOSTDOWN);
                return -EAGAIN;
        }
-       hdr->pgio_done_cb = ff_layout_read_done_cb;
 
        ff_layout_read_record_layoutstats_start(task, hdr);
        return 0;
@@ -1496,14 +1567,8 @@ static int ff_layout_write_prepare_common(struct rpc_task *task,
                return -EIO;
        }
 
-       if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) {
-               bool retry_pnfs;
-
-               retry_pnfs = ff_layout_has_available_ds(hdr->lseg);
-               dprintk("%s task %u reset io to %s\n", __func__,
-                       task->tk_pid, retry_pnfs ? "pNFS" : "MDS");
-               ff_layout_reset_write(hdr, retry_pnfs);
-               rpc_exit(task, 0);
+       if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) {
+               rpc_exit(task, -EHOSTDOWN);
                return -EAGAIN;
        }
 
@@ -1712,7 +1777,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
                goto out_failed;
 
        ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
-       if (IS_ERR(ds_cred))
+       if (!ds_cred)
                goto out_failed;
 
        vers = nfs4_ff_layout_ds_version(lseg, idx);
@@ -1720,6 +1785,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
        dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__,
                ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), vers);
 
+       hdr->pgio_done_cb = ff_layout_read_done_cb;
        atomic_inc(&ds->ds_clp->cl_count);
        hdr->ds_clp = ds->ds_clp;
        fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
@@ -1737,11 +1803,11 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
                          vers == 3 ? &ff_layout_read_call_ops_v3 :
                                      &ff_layout_read_call_ops_v4,
                          0, RPC_TASK_SOFTCONN);
-
+       put_rpccred(ds_cred);
        return PNFS_ATTEMPTED;
 
 out_failed:
-       if (ff_layout_has_available_ds(lseg))
+       if (ff_layout_avoid_mds_available_ds(lseg))
                return PNFS_TRY_AGAIN;
        return PNFS_NOT_ATTEMPTED;
 }
@@ -1769,7 +1835,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
                return PNFS_NOT_ATTEMPTED;
 
        ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
-       if (IS_ERR(ds_cred))
+       if (!ds_cred)
                return PNFS_NOT_ATTEMPTED;
 
        vers = nfs4_ff_layout_ds_version(lseg, idx);
@@ -1798,6 +1864,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
                          vers == 3 ? &ff_layout_write_call_ops_v3 :
                                      &ff_layout_write_call_ops_v4,
                          sync, RPC_TASK_SOFTCONN);
+       put_rpccred(ds_cred);
        return PNFS_ATTEMPTED;
 }
 
@@ -1824,7 +1891,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
        struct rpc_clnt *ds_clnt;
        struct rpc_cred *ds_cred;
        u32 idx;
-       int vers;
+       int vers, ret;
        struct nfs_fh *fh;
 
        idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
@@ -1838,7 +1905,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
                goto out_err;
 
        ds_cred = ff_layout_get_ds_cred(lseg, idx, data->cred);
-       if (IS_ERR(ds_cred))
+       if (!ds_cred)
                goto out_err;
 
        vers = nfs4_ff_layout_ds_version(lseg, idx);
@@ -1854,10 +1921,12 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
        if (fh)
                data->args.fh = fh;
 
-       return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops,
+       ret = nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops,
                                   vers == 3 ? &ff_layout_commit_call_ops_v3 :
                                               &ff_layout_commit_call_ops_v4,
                                   how, RPC_TASK_SOFTCONN);
+       put_rpccred(ds_cred);
+       return ret;
 out_err:
        pnfs_generic_prepare_to_resend_writes(data);
        pnfs_generic_commit_release(data);
@@ -2223,6 +2292,11 @@ static int __init nfs4flexfilelayout_init(void)
 {
        printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Registering...\n",
               __func__);
+       if (!ff_zero_group) {
+               ff_zero_group = groups_alloc(0);
+               if (!ff_zero_group)
+                       return -ENOMEM;
+       }
        return pnfs_register_layoutdriver(&flexfilelayout_type);
 }
 
@@ -2231,6 +2305,10 @@ static void __exit nfs4flexfilelayout_exit(void)
        printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Unregistering...\n",
               __func__);
        pnfs_unregister_layoutdriver(&flexfilelayout_type);
+       if (ff_zero_group) {
+               put_group_info(ff_zero_group);
+               ff_zero_group = NULL;
+       }
 }
 
 MODULE_ALIAS("nfs-layouttype4-4");
index dd353bb..1bcdb15 100644 (file)
@@ -10,7 +10,8 @@
 #define FS_NFS_NFS4FLEXFILELAYOUT_H
 
 #define FF_FLAGS_NO_LAYOUTCOMMIT 1
-#define FF_FLAGS_NO_IO_THRU_MDS 2
+#define FF_FLAGS_NO_IO_THRU_MDS  2
+#define FF_FLAGS_NO_READ_IO      4
 
 #include "../pnfs.h"
 
@@ -76,9 +77,8 @@ struct nfs4_ff_layout_mirror {
        u32                             fh_versions_cnt;
        struct nfs_fh                   *fh_versions;
        nfs4_stateid                    stateid;
-       u32                             uid;
-       u32                             gid;
-       struct rpc_cred                 *cred;
+       struct rpc_cred __rcu           *ro_cred;
+       struct rpc_cred __rcu           *rw_cred;
        atomic_t                        ref;
        spinlock_t                      lock;
        struct nfs4_ff_layoutstat       read_stat;
@@ -153,6 +153,12 @@ ff_layout_no_fallback_to_mds(struct pnfs_layout_segment *lseg)
        return FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_IO_THRU_MDS;
 }
 
+static inline bool
+ff_layout_no_read_on_rw(struct pnfs_layout_segment *lseg)
+{
+       return FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_READ_IO;
+}
+
 static inline bool
 ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node)
 {
@@ -192,4 +198,7 @@ nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg,
 struct rpc_cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg,
                                       u32 ds_idx, struct rpc_cred *mdscred);
 bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg);
+bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg);
+bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg);
+
 #endif /* FS_NFS_NFS4FLEXFILELAYOUT_H */
index add0e5a..0aa36be 100644 (file)
@@ -228,7 +228,8 @@ ff_ds_error_match(const struct nfs4_ff_layout_ds_err *e1,
                return e1->opnum < e2->opnum ? -1 : 1;
        if (e1->status != e2->status)
                return e1->status < e2->status ? -1 : 1;
-       ret = memcmp(&e1->stateid, &e2->stateid, sizeof(e1->stateid));
+       ret = memcmp(e1->stateid.data, e2->stateid.data,
+                       sizeof(e1->stateid.data));
        if (ret != 0)
                return ret;
        ret = memcmp(&e1->deviceid, &e2->deviceid, sizeof(e1->deviceid));
@@ -302,40 +303,26 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
        return 0;
 }
 
-/* currently we only support AUTH_NONE and AUTH_SYS */
-static rpc_authflavor_t
-nfs4_ff_layout_choose_authflavor(struct nfs4_ff_layout_mirror *mirror)
+static struct rpc_cred *
+ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode)
 {
-       if (mirror->uid == (u32)-1)
-               return RPC_AUTH_NULL;
-       return RPC_AUTH_UNIX;
-}
+       struct rpc_cred *cred, __rcu **pcred;
 
-/* fetch cred for NFSv3 DS */
-static int ff_layout_update_mirror_cred(struct nfs4_ff_layout_mirror *mirror,
-                                     struct nfs4_pnfs_ds *ds)
-{
-       if (ds->ds_clp && !mirror->cred &&
-           mirror->mirror_ds->ds_versions[0].version == 3) {
-               struct rpc_auth *auth = ds->ds_clp->cl_rpcclient->cl_auth;
-               struct rpc_cred *cred;
-               struct auth_cred acred = {
-                       .uid = make_kuid(&init_user_ns, mirror->uid),
-                       .gid = make_kgid(&init_user_ns, mirror->gid),
-               };
-
-               /* AUTH_NULL ignores acred */
-               cred = auth->au_ops->lookup_cred(auth, &acred, 0);
-               if (IS_ERR(cred)) {
-                       dprintk("%s: lookup_cred failed with %ld\n",
-                               __func__, PTR_ERR(cred));
-                       return PTR_ERR(cred);
-               } else {
-                       if (cmpxchg(&mirror->cred, NULL, cred))
-                               put_rpccred(cred);
-               }
-       }
-       return 0;
+       if (iomode == IOMODE_READ)
+               pcred = &mirror->ro_cred;
+       else
+               pcred = &mirror->rw_cred;
+
+       rcu_read_lock();
+       do {
+               cred = rcu_dereference(*pcred);
+               if (!cred)
+                       break;
+
+               cred = get_rpccred_rcu(cred);
+       } while (!cred);
+       rcu_read_unlock();
+       return cred;
 }
 
 struct nfs_fh *
@@ -356,7 +343,23 @@ out:
        return fh;
 }
 
-/* Upon return, either ds is connected, or ds is NULL */
+/**
+ * nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call
+ * @lseg: the layout segment we're operating on
+ * @ds_idx: index of the DS to use
+ * @fail_return: return layout on connect failure?
+ *
+ * Try to prepare a DS connection to accept an RPC call. This involves
+ * selecting a mirror to use and connecting the client to it if it's not
+ * already connected.
+ *
+ * Since we only need a single functioning mirror to satisfy a read, we don't
+ * want to return the layout as long as one is still available. For writes,
+ * though, any down mirror should result in a LAYOUTRETURN. @fail_return is
+ * how we distinguish between the two cases.
+ *
+ * Returns a pointer to a connected DS object on success or NULL on failure.
+ */
 struct nfs4_pnfs_ds *
 nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
                          bool fail_return)
@@ -367,7 +370,6 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
        struct inode *ino = lseg->pls_layout->plh_inode;
        struct nfs_server *s = NFS_SERVER(ino);
        unsigned int max_payload;
-       rpc_authflavor_t flavor;
 
        if (!ff_layout_mirror_valid(lseg, mirror)) {
                pr_err_ratelimited("NFS: %s: No data server for offset index %d\n",
@@ -383,9 +385,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
        /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
        smp_rmb();
        if (ds->ds_clp)
-               goto out_update_creds;
-
-       flavor = nfs4_ff_layout_choose_authflavor(mirror);
+               goto out;
 
        /* FIXME: For now we assume the server sent only one version of NFS
         * to use for the DS.
@@ -394,7 +394,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
                             dataserver_retrans,
                             mirror->mirror_ds->ds_versions[0].version,
                             mirror->mirror_ds->ds_versions[0].minor_version,
-                            flavor);
+                            RPC_AUTH_UNIX);
 
        /* connect success, check rsize/wsize limit */
        if (ds->ds_clp) {
@@ -410,20 +410,10 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
                                         mirror, lseg->pls_range.offset,
                                         lseg->pls_range.length, NFS4ERR_NXIO,
                                         OP_ILLEGAL, GFP_NOIO);
-               if (!fail_return) {
-                       if (ff_layout_has_available_ds(lseg))
-                               set_bit(NFS_LAYOUT_RETURN_REQUESTED,
-                                       &lseg->pls_layout->plh_flags);
-                       else
-                               pnfs_error_mark_layout_for_return(ino, lseg);
-               } else
+               if (fail_return || !ff_layout_has_available_ds(lseg))
                        pnfs_error_mark_layout_for_return(ino, lseg);
                ds = NULL;
-               goto out;
        }
-out_update_creds:
-       if (ff_layout_update_mirror_cred(mirror, ds))
-               ds = NULL;
 out:
        return ds;
 }
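
Tying the kernel-doc above to the callers earlier in this diff: the write setup path passes fail_return as true for every mirror, while the read path (through ff_layout_choose_best_ds_for_read()) is expected to pass false so one dead mirror does not force a LAYOUTRETURN. A condensed, illustrative sketch of that calling pattern (not code added by this patch):

static void example_prepare_mirrors(struct nfs_pageio_descriptor *pgio, u32 read_idx)
{
        struct nfs4_pnfs_ds *ds;
        unsigned int i;

        /* Read: any one reachable mirror will do; keep the layout on failure. */
        ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, read_idx, false);

        /* Write: every mirror must be reachable; a dead one returns the layout. */
        for (i = 0; i < pgio->pg_mirror_count; i++) {
                ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true);
                if (!ds)
                        break;  /* caller falls back to the MDS or fails the pNFS attempt */
        }
}
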
@@ -433,16 +423,15 @@ ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx,
                      struct rpc_cred *mdscred)
 {
        struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
-       struct rpc_cred *cred = ERR_PTR(-EINVAL);
-
-       if (!nfs4_ff_layout_prepare_ds(lseg, ds_idx, true))
-               goto out;
+       struct rpc_cred *cred;
 
-       if (mirror && mirror->cred)
-               cred = mirror->cred;
-       else
-               cred = mdscred;
-out:
+       if (mirror) {
+               cred = ff_layout_get_mirror_cred(mirror, lseg->pls_range.iomode);
+               if (!cred)
+                       cred = get_rpccred(mdscred);
+       } else {
+               cred = get_rpccred(mdscred);
+       }
        return cred;
 }
 
@@ -562,6 +551,18 @@ bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
        return ff_rw_layout_has_available_ds(lseg);
 }
 
+bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg)
+{
+       return ff_layout_no_fallback_to_mds(lseg) ||
+              ff_layout_has_available_ds(lseg);
+}
+
+bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg)
+{
+       return lseg->pls_range.iomode == IOMODE_RW &&
+              ff_layout_no_read_on_rw(lseg);
+}
+
 module_param(dataserver_retrans, uint, 0644);
 MODULE_PARM_DESC(dataserver_retrans, "The  number of times the NFSv4.1 client "
                        "retries a request before it attempts further "
index f1d1d2c..5154fa6 100644 (file)
@@ -477,6 +477,7 @@ void nfs_mark_request_commit(struct nfs_page *req,
                             u32 ds_commit_idx);
 int nfs_write_need_commit(struct nfs_pgio_header *);
 void nfs_writeback_update_inode(struct nfs_pgio_header *hdr);
+int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf);
 int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
                            int how, struct nfs_commit_info *cinfo);
 void nfs_retry_commit(struct list_head *page_list,
index b587ccd..b6cd153 100644 (file)
@@ -13,6 +13,7 @@
 
 /* nfs4.2proc.c */
 int nfs42_proc_allocate(struct file *, loff_t, loff_t);
+ssize_t nfs42_proc_copy(struct file *, loff_t, struct file *, loff_t, size_t);
 int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
 loff_t nfs42_proc_llseek(struct file *, loff_t, int);
 int nfs42_proc_layoutstats_generic(struct nfs_server *,
index dff8346..aa03ed0 100644 (file)
@@ -126,6 +126,111 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
        return err;
 }
 
+static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src,
+                               struct nfs_lock_context *src_lock,
+                               struct file *dst, loff_t pos_dst,
+                               struct nfs_lock_context *dst_lock,
+                               size_t count)
+{
+       struct nfs42_copy_args args = {
+               .src_fh         = NFS_FH(file_inode(src)),
+               .src_pos        = pos_src,
+               .dst_fh         = NFS_FH(file_inode(dst)),
+               .dst_pos        = pos_dst,
+               .count          = count,
+       };
+       struct nfs42_copy_res res;
+       struct rpc_message msg = {
+               .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COPY],
+               .rpc_argp = &args,
+               .rpc_resp = &res,
+       };
+       struct inode *dst_inode = file_inode(dst);
+       struct nfs_server *server = NFS_SERVER(dst_inode);
+       int status;
+
+       status = nfs4_set_rw_stateid(&args.src_stateid, src_lock->open_context,
+                                    src_lock, FMODE_READ);
+       if (status)
+               return status;
+
+       status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context,
+                                    dst_lock, FMODE_WRITE);
+       if (status)
+               return status;
+
+       status = nfs4_call_sync(server->client, server, &msg,
+                               &args.seq_args, &res.seq_res, 0);
+       if (status == -ENOTSUPP)
+               server->caps &= ~NFS_CAP_COPY;
+       if (status)
+               return status;
+
+       if (res.write_res.verifier.committed != NFS_FILE_SYNC) {
+               status = nfs_commit_file(dst, &res.write_res.verifier.verifier);
+               if (status)
+                       return status;
+       }
+
+       truncate_pagecache_range(dst_inode, pos_dst,
+                                pos_dst + res.write_res.count);
+
+       return res.write_res.count;
+}
+
+ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src,
+                       struct file *dst, loff_t pos_dst,
+                       size_t count)
+{
+       struct nfs_server *server = NFS_SERVER(file_inode(dst));
+       struct nfs_lock_context *src_lock;
+       struct nfs_lock_context *dst_lock;
+       struct nfs4_exception src_exception = { };
+       struct nfs4_exception dst_exception = { };
+       ssize_t err, err2;
+
+       if (!nfs_server_capable(file_inode(dst), NFS_CAP_COPY))
+               return -EOPNOTSUPP;
+
+       src_lock = nfs_get_lock_context(nfs_file_open_context(src));
+       if (IS_ERR(src_lock))
+               return PTR_ERR(src_lock);
+
+       src_exception.inode = file_inode(src);
+       src_exception.state = src_lock->open_context->state;
+
+       dst_lock = nfs_get_lock_context(nfs_file_open_context(dst));
+       if (IS_ERR(dst_lock)) {
+               err = PTR_ERR(dst_lock);
+               goto out_put_src_lock;
+       }
+
+       dst_exception.inode = file_inode(dst);
+       dst_exception.state = dst_lock->open_context->state;
+
+       do {
+               inode_lock(file_inode(dst));
+               err = _nfs42_proc_copy(src, pos_src, src_lock,
+                                      dst, pos_dst, dst_lock, count);
+               inode_unlock(file_inode(dst));
+
+               if (err == -ENOTSUPP) {
+                       err = -EOPNOTSUPP;
+                       break;
+               }
+
+               err2 = nfs4_handle_exception(server, err, &src_exception);
+               err  = nfs4_handle_exception(server, err, &dst_exception);
+               if (!err)
+                       err = err2;
+       } while (src_exception.retry || dst_exception.retry);
+
+       nfs_put_lock_context(dst_lock);
+out_put_src_lock:
+       nfs_put_lock_context(src_lock);
+       return err;
+}
+
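
nfs42_proc_copy() is shaped to be driven from the VFS copy_file_range() path; a hedged sketch of such a wrapper (the function name and the flags handling are illustrative assumptions, not taken from this diff):

static ssize_t example_copy_file_range(struct file *file_in, loff_t pos_in,
                                       struct file *file_out, loff_t pos_out,
                                       size_t count, unsigned int flags)
{
        if (flags)
                return -EINVAL;
        return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count);
}
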
 static loff_t _nfs42_proc_llseek(struct file *filep,
                struct nfs_lock_context *lock, loff_t offset, int whence)
 {
@@ -232,7 +337,7 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
                         * with the current stateid.
                         */
                        set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
-                       pnfs_mark_matching_lsegs_invalid(lo, &head, NULL);
+                       pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0);
                        spin_unlock(&inode->i_lock);
                        pnfs_free_lseg_list(&head);
                } else
index 0ca482a..6dc6f2a 100644 (file)
@@ -9,9 +9,22 @@
 #define encode_fallocate_maxsz         (encode_stateid_maxsz + \
                                         2 /* offset */ + \
                                         2 /* length */)
+#define NFS42_WRITE_RES_SIZE           (1 /* wr_callback_id size */ +\
+                                        XDR_QUADLEN(NFS4_STATEID_SIZE) + \
+                                        2 /* wr_count */ + \
+                                        1 /* wr_committed */ + \
+                                        XDR_QUADLEN(NFS4_VERIFIER_SIZE))
 #define encode_allocate_maxsz          (op_encode_hdr_maxsz + \
                                         encode_fallocate_maxsz)
 #define decode_allocate_maxsz          (op_decode_hdr_maxsz)
+#define encode_copy_maxsz              (op_encode_hdr_maxsz +          \
+                                        XDR_QUADLEN(NFS4_STATEID_SIZE) + \
+                                        XDR_QUADLEN(NFS4_STATEID_SIZE) + \
+                                        2 + 2 + 2 + 1 + 1 + 1)
+#define decode_copy_maxsz              (op_decode_hdr_maxsz + \
+                                        NFS42_WRITE_RES_SIZE + \
+                                        1 /* cr_consecutive */ + \
+                                        1 /* cr_synchronous */)
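
(Assuming the usual NFS4_STATEID_SIZE of 16 bytes and NFS4_VERIFIER_SIZE of 8 bytes, NFS42_WRITE_RES_SIZE works out to 1 + 4 + 2 + 1 + 2 = 10 XDR words, so decode_copy_maxsz budgets op_decode_hdr_maxsz plus 12 words for the COPY reply.)
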
 #define encode_deallocate_maxsz                (op_encode_hdr_maxsz + \
                                         encode_fallocate_maxsz)
 #define decode_deallocate_maxsz                (op_decode_hdr_maxsz)
                                         decode_putfh_maxsz + \
                                         decode_allocate_maxsz + \
                                         decode_getattr_maxsz)
+#define NFS4_enc_copy_sz               (compound_encode_hdr_maxsz + \
+                                        encode_putfh_maxsz + \
+                                        encode_savefh_maxsz + \
+                                        encode_putfh_maxsz + \
+                                        encode_copy_maxsz)
+#define NFS4_dec_copy_sz               (compound_decode_hdr_maxsz + \
+                                        decode_putfh_maxsz + \
+                                        decode_savefh_maxsz + \
+                                        decode_putfh_maxsz + \
+                                        decode_copy_maxsz)
 #define NFS4_enc_deallocate_sz         (compound_encode_hdr_maxsz + \
                                         encode_putfh_maxsz + \
                                         encode_deallocate_maxsz + \
@@ -102,6 +125,23 @@ static void encode_allocate(struct xdr_stream *xdr,
        encode_fallocate(xdr, args);
 }
 
+static void encode_copy(struct xdr_stream *xdr,
+                       struct nfs42_copy_args *args,
+                       struct compound_hdr *hdr)
+{
+       encode_op_hdr(xdr, OP_COPY, decode_copy_maxsz, hdr);
+       encode_nfs4_stateid(xdr, &args->src_stateid);
+       encode_nfs4_stateid(xdr, &args->dst_stateid);
+
+       encode_uint64(xdr, args->src_pos);
+       encode_uint64(xdr, args->dst_pos);
+       encode_uint64(xdr, args->count);
+
+       encode_uint32(xdr, 1); /* consecutive = true */
+       encode_uint32(xdr, 1); /* synchronous = true */
+       encode_uint32(xdr, 0); /* src server list */
+}
+
 static void encode_deallocate(struct xdr_stream *xdr,
                              struct nfs42_falloc_args *args,
                              struct compound_hdr *hdr)
@@ -181,6 +221,26 @@ static void nfs4_xdr_enc_allocate(struct rpc_rqst *req,
        encode_nops(&hdr);
 }
 
+/*
+ * Encode COPY request
+ */
+static void nfs4_xdr_enc_copy(struct rpc_rqst *req,
+                             struct xdr_stream *xdr,
+                             struct nfs42_copy_args *args)
+{
+       struct compound_hdr hdr = {
+               .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+       };
+
+       encode_compound_hdr(xdr, req, &hdr);
+       encode_sequence(xdr, &args->seq_args, &hdr);
+       encode_putfh(xdr, args->src_fh, &hdr);
+       encode_savefh(xdr, &hdr);
+       encode_putfh(xdr, args->dst_fh, &hdr);
+       encode_copy(xdr, args, &hdr);
+       encode_nops(&hdr);
+}
+
 /*
  * Encode DEALLOCATE request
  */
@@ -266,6 +326,62 @@ static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
        return decode_op_hdr(xdr, OP_ALLOCATE);
 }
 
+static int decode_write_response(struct xdr_stream *xdr,
+                                struct nfs42_write_res *res)
+{
+       __be32 *p;
+       int stateids;
+
+       p = xdr_inline_decode(xdr, 4 + 8 + 4);
+       if (unlikely(!p))
+               goto out_overflow;
+
+       stateids = be32_to_cpup(p++);
+       p = xdr_decode_hyper(p, &res->count);
+       res->verifier.committed = be32_to_cpup(p);
+       return decode_verifier(xdr, &res->verifier.verifier);
+
+out_overflow:
+       print_overflow_msg(__func__, xdr);
+       return -EIO;
+}
+
+static int decode_copy_requirements(struct xdr_stream *xdr,
+                                   struct nfs42_copy_res *res)
+{
+       __be32 *p;
+
+       p = xdr_inline_decode(xdr, 4 + 4);
+       if (unlikely(!p))
+               goto out_overflow;
+
+       res->consecutive = be32_to_cpup(p++);
+       res->synchronous = be32_to_cpup(p++);
+       return 0;
+out_overflow:
+       print_overflow_msg(__func__, xdr);
+       return -EIO;
+}
+
+static int decode_copy(struct xdr_stream *xdr, struct nfs42_copy_res *res)
+{
+       int status;
+
+       status = decode_op_hdr(xdr, OP_COPY);
+       if (status == NFS4ERR_OFFLOAD_NO_REQS) {
+               status = decode_copy_requirements(xdr, res);
+               if (status)
+                       return status;
+               return NFS4ERR_OFFLOAD_NO_REQS;
+       } else if (status)
+               return status;
+
+       status = decode_write_response(xdr, &res->write_res);
+       if (status)
+               return status;
+
+       return decode_copy_requirements(xdr, res);
+}
+
 static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
 {
        return decode_op_hdr(xdr, OP_DEALLOCATE);
@@ -330,6 +446,36 @@ out:
        return status;
 }
 
+/*
+ * Decode COPY response
+ */
+static int nfs4_xdr_dec_copy(struct rpc_rqst *rqstp,
+                            struct xdr_stream *xdr,
+                            struct nfs42_copy_res *res)
+{
+       struct compound_hdr hdr;
+       int status;
+
+       status = decode_compound_hdr(xdr, &hdr);
+       if (status)
+               goto out;
+       status = decode_sequence(xdr, &res->seq_res, rqstp);
+       if (status)
+               goto out;
+       status = decode_putfh(xdr);
+       if (status)
+               goto out;
+       status = decode_savefh(xdr);
+       if (status)
+               goto out;
+       status = decode_putfh(xdr);
+       if (status)
+               goto out;
+       status = decode_copy(xdr, res);
+out:
+       return status;
+}
+
 /*
  * Decode DEALLOCATE request
  */
index 4afdee4..768456f 100644 (file)
@@ -438,8 +438,9 @@ extern void nfs41_handle_server_scope(struct nfs_client *,
                                      struct nfs41_server_scope **);
 extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
 extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
-extern int nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *,
-               fmode_t, const struct nfs_lockowner *);
+extern int nfs4_select_rw_stateid(struct nfs4_state *, fmode_t,
+               const struct nfs_lockowner *, nfs4_stateid *,
+               struct rpc_cred **);
 
 extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask);
 extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task);
@@ -496,12 +497,15 @@ extern struct svc_version nfs4_callback_version4;
 
 static inline void nfs4_stateid_copy(nfs4_stateid *dst, const nfs4_stateid *src)
 {
-       memcpy(dst, src, sizeof(*dst));
+       memcpy(dst->data, src->data, sizeof(dst->data));
+       dst->type = src->type;
 }
 
 static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_stateid *src)
 {
-       return memcmp(dst, src, sizeof(*dst)) == 0;
+       if (dst->type != src->type)
+               return false;
+       return memcmp(dst->data, src->data, sizeof(dst->data)) == 0;
 }
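The net effect of carrying a type alongside the opaque data is that two stateids with identical bytes but different origins no longer compare equal. A minimal user-space sketch of the same comparison (hypothetical types, not the kernel's structures):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

enum stateid_type { OPEN_STATEID, LOCK_STATEID };

struct stateid {
        enum stateid_type type;
        unsigned char data[16];
};

static bool stateid_match(const struct stateid *a, const struct stateid *b)
{
        if (a->type != b->type)
                return false;
        return memcmp(a->data, b->data, sizeof(a->data)) == 0;
}

int main(void)
{
        struct stateid open_sid = { .type = OPEN_STATEID, .data = "same-bytes" };
        struct stateid lock_sid = { .type = LOCK_STATEID, .data = "same-bytes" };

        printf("%d\n", stateid_match(&open_sid, &lock_sid)); /* 0: types differ */
        printf("%d\n", stateid_match(&open_sid, &open_sid)); /* 1: same stateid */
        return 0;
}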
 
 static inline bool nfs4_stateid_match_other(const nfs4_stateid *dst, const nfs4_stateid *src)
index d039051..014b0e4 100644 (file)
@@ -129,6 +129,28 @@ nfs4_file_flush(struct file *file, fl_owner_t id)
 }
 
 #ifdef CONFIG_NFS_V4_2
+static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
+                                   struct file *file_out, loff_t pos_out,
+                                   size_t count, unsigned int flags)
+{
+       struct inode *in_inode = file_inode(file_in);
+       struct inode *out_inode = file_inode(file_out);
+       int ret;
+
+       if (in_inode == out_inode)
+               return -EINVAL;
+
+       /* flush any pending writes */
+       ret = nfs_sync_inode(in_inode);
+       if (ret)
+               return ret;
+       ret = nfs_sync_inode(out_inode);
+       if (ret)
+               return ret;
+
+       return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count);
+}
+
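To exercise this path from user space (illustration only, not part of the patch): with both files on an NFSv4.2 mount, a copy_file_range(2) call reaches nfs4_copy_file_range() above and is sent to the server as a COPY compound. The glibc wrapper assumed here needs glibc 2.27 or later; older systems would have to go through syscall(2).

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        if (argc != 3) {
                fprintf(stderr, "usage: %s <src> <dst>\n", argv[0]);
                return 1;
        }

        int in = open(argv[1], O_RDONLY);
        int out = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
        if (in < 0 || out < 0) {
                perror("open");
                return 1;
        }

        loff_t off_in = 0, off_out = 0;
        ssize_t n = copy_file_range(in, &off_in, out, &off_out, 1 << 20, 0);
        if (n < 0)
                perror("copy_file_range");
        else
                printf("copied %zd bytes\n", n);

        close(in);
        close(out);
        return n < 0;
}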
 static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence)
 {
        loff_t ret;
@@ -243,6 +265,7 @@ const struct file_operations nfs4_file_operations = {
        .check_flags    = nfs_check_flags,
        .setlease       = simple_nosetlease,
 #ifdef CONFIG_NFS_V4_2
+       .copy_file_range = nfs4_copy_file_range,
        .llseek         = nfs4_file_llseek,
        .fallocate      = nfs42_fallocate,
        .clone_file_range = nfs42_clone_file_range,
index 084e857..de97567 100644 (file)
 #define NFS4_POLL_RETRY_MIN    (HZ/10)
 #define NFS4_POLL_RETRY_MAX    (15*HZ)
 
+/* file attributes which can be mapped to nfs attributes */
+#define NFS4_VALID_ATTRS (ATTR_MODE \
+       | ATTR_UID \
+       | ATTR_GID \
+       | ATTR_SIZE \
+       | ATTR_ATIME \
+       | ATTR_MTIME \
+       | ATTR_CTIME \
+       | ATTR_ATIME_SET \
+       | ATTR_MTIME_SET)
+
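This mask is what the exclusive-create path below tests before issuing a follow-up SETATTR: if none of the requested attributes can be expressed over NFS, the extra round trip is skipped. A hypothetical user-space sketch of that gate (bit values are illustrative, not taken from the kernel headers):

#include <stdbool.h>
#include <stdio.h>

#define ATTR_MODE (1u << 0)
#define ATTR_UID  (1u << 1)
#define ATTR_GID  (1u << 2)
#define ATTR_SIZE (1u << 3)
#define ATTR_OPEN (1u << 15)   /* set by the VFS on open(); not mappable to NFS */

#define VALID_ATTRS (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_SIZE)

static bool needs_extra_setattr(unsigned int ia_valid)
{
        return (ia_valid & VALID_ATTRS) != 0;
}

int main(void)
{
        printf("%d\n", needs_extra_setattr(ATTR_MODE | ATTR_OPEN)); /* 1: send SETATTR */
        printf("%d\n", needs_extra_setattr(ATTR_OPEN));             /* 0: skip it */
        return 0;
}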
 struct nfs4_opendata;
 static int _nfs4_proc_open(struct nfs4_opendata *data);
 static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
@@ -416,6 +427,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
                case -NFS4ERR_DELAY:
                        nfs_inc_server_stats(server, NFSIOS_DELAY);
                case -NFS4ERR_GRACE:
+               case -NFS4ERR_RECALLCONFLICT:
                        exception->delay = 1;
                        return 0;
 
@@ -2558,15 +2570,20 @@ static int _nfs4_do_open(struct inode *dir,
        if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL) &&
            (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) {
                nfs4_exclusive_attrset(opendata, sattr, &label);
-
-               nfs_fattr_init(opendata->o_res.f_attr);
-               status = nfs4_do_setattr(state->inode, cred,
-                               opendata->o_res.f_attr, sattr,
-                               state, label, olabel);
-               if (status == 0) {
-                       nfs_setattr_update_inode(state->inode, sattr,
-                                       opendata->o_res.f_attr);
-                       nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
+               /*
+                * send any create attributes that were not set by the
+                * open call with an extra setattr.
+                */
+               if (sattr->ia_valid & NFS4_VALID_ATTRS) {
+                       nfs_fattr_init(opendata->o_res.f_attr);
+                       status = nfs4_do_setattr(state->inode, cred,
+                                       opendata->o_res.f_attr, sattr,
+                                       state, label, olabel);
+                       if (status == 0) {
+                               nfs_setattr_update_inode(state->inode, sattr,
+                                               opendata->o_res.f_attr);
+                               nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
+                       }
                }
        }
        if (opened && opendata->file_created)
@@ -2676,6 +2693,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
                .rpc_resp       = &res,
                .rpc_cred       = cred,
         };
+       struct rpc_cred *delegation_cred = NULL;
        unsigned long timestamp = jiffies;
        fmode_t fmode;
        bool truncate;
@@ -2691,7 +2709,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
        truncate = (sattr->ia_valid & ATTR_SIZE) ? true : false;
        fmode = truncate ? FMODE_WRITE : FMODE_READ;
 
-       if (nfs4_copy_delegation_stateid(&arg.stateid, inode, fmode)) {
+       if (nfs4_copy_delegation_stateid(inode, fmode, &arg.stateid, &delegation_cred)) {
                /* Use that stateid */
        } else if (truncate && state != NULL) {
                struct nfs_lockowner lockowner = {
@@ -2700,13 +2718,17 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
                };
                if (!nfs4_valid_open_stateid(state))
                        return -EBADF;
-               if (nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE,
-                               &lockowner) == -EIO)
+               if (nfs4_select_rw_stateid(state, FMODE_WRITE, &lockowner,
+                               &arg.stateid, &delegation_cred) == -EIO)
                        return -EBADF;
        } else
                nfs4_stateid_copy(&arg.stateid, &zero_stateid);
+       if (delegation_cred)
+               msg.rpc_cred = delegation_cred;
 
        status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
+
+       put_rpccred(delegation_cred);
        if (status == 0 && state != NULL)
                renew_lease(server, timestamp);
        trace_nfs4_setattr(inode, &arg.stateid, status);
@@ -4285,7 +4307,7 @@ int nfs4_set_rw_stateid(nfs4_stateid *stateid,
 
        if (l_ctx != NULL)
                lockowner = &l_ctx->lockowner;
-       return nfs4_select_rw_stateid(stateid, ctx->state, fmode, lockowner);
+       return nfs4_select_rw_stateid(ctx->state, fmode, lockowner, stateid, NULL);
 }
 EXPORT_SYMBOL_GPL(nfs4_set_rw_stateid);
 
@@ -4993,12 +5015,11 @@ static int nfs4_do_set_security_label(struct inode *inode,
 }
 
 static int
-nfs4_set_security_label(struct dentry *dentry, const void *buf, size_t buflen)
+nfs4_set_security_label(struct inode *inode, const void *buf, size_t buflen)
 {
        struct nfs4_label ilabel, *olabel = NULL;
        struct nfs_fattr fattr;
        struct rpc_cred *cred;
-       struct inode *inode = d_inode(dentry);
        int status;
 
        if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
@@ -6054,6 +6075,7 @@ static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *reques
 static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
 {
        struct nfs_inode *nfsi = NFS_I(state->inode);
+       struct nfs4_state_owner *sp = state->owner;
        unsigned char fl_flags = request->fl_flags;
        int status = -ENOLCK;
 
@@ -6068,6 +6090,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
        status = do_vfs_lock(state->inode, request);
        if (status < 0)
                goto out;
+       mutex_lock(&sp->so_delegreturn_mutex);
        down_read(&nfsi->rwsem);
        if (test_bit(NFS_DELEGATED_STATE, &state->flags)) {
                /* Yes: cache locks! */
@@ -6075,9 +6098,11 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
                request->fl_flags = fl_flags & ~FL_SLEEP;
                status = do_vfs_lock(state->inode, request);
                up_read(&nfsi->rwsem);
+               mutex_unlock(&sp->so_delegreturn_mutex);
                goto out;
        }
        up_read(&nfsi->rwsem);
+       mutex_unlock(&sp->so_delegreturn_mutex);
        status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW);
 out:
        request->fl_flags = fl_flags;
@@ -6255,11 +6280,11 @@ nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
 #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
 
 static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler,
-                                  struct dentry *dentry, const char *key,
-                                  const void *buf, size_t buflen,
-                                  int flags)
+                                  struct dentry *unused, struct inode *inode,
+                                  const char *key, const void *buf,
+                                  size_t buflen, int flags)
 {
-       return nfs4_proc_set_acl(d_inode(dentry), buf, buflen);
+       return nfs4_proc_set_acl(inode, buf, buflen);
 }
 
 static int nfs4_xattr_get_nfs4_acl(const struct xattr_handler *handler,
@@ -6277,12 +6302,12 @@ static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry)
 #ifdef CONFIG_NFS_V4_SECURITY_LABEL
 
 static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler,
-                                    struct dentry *dentry, const char *key,
-                                    const void *buf, size_t buflen,
-                                    int flags)
+                                    struct dentry *unused, struct inode *inode,
+                                    const char *key, const void *buf,
+                                    size_t buflen, int flags)
 {
        if (security_ismaclabel(key))
-               return nfs4_set_security_label(dentry, buf, buflen);
+               return nfs4_set_security_label(inode, buf, buflen);
 
        return -EOPNOTSUPP;
 }
@@ -7351,9 +7376,11 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
  * always set csa_cachethis to FALSE because the current implementation
  * of the back channel DRC only supports caching the CB_SEQUENCE operation.
  */
-static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
+static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args,
+                                   struct rpc_clnt *clnt)
 {
        unsigned int max_rqst_sz, max_resp_sz;
+       unsigned int max_bc_payload = rpc_max_bc_payload(clnt);
 
        max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead;
        max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead;
@@ -7371,8 +7398,8 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
                args->fc_attrs.max_ops, args->fc_attrs.max_reqs);
 
        /* Back channel attributes */
-       args->bc_attrs.max_rqst_sz = PAGE_SIZE;
-       args->bc_attrs.max_resp_sz = PAGE_SIZE;
+       args->bc_attrs.max_rqst_sz = max_bc_payload;
+       args->bc_attrs.max_resp_sz = max_bc_payload;
        args->bc_attrs.max_resp_sz_cached = 0;
        args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS;
        args->bc_attrs.max_reqs = NFS41_BC_MAX_CALLBACKS;
@@ -7476,7 +7503,7 @@ static int _nfs4_proc_create_session(struct nfs_client *clp,
        };
        int status;
 
-       nfs4_init_channel_attrs(&args);
+       nfs4_init_channel_attrs(&args, clp->cl_rpcclient);
        args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN);
 
        status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
@@ -7820,40 +7847,34 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
        struct nfs4_layoutget *lgp = calldata;
        struct nfs_server *server = NFS_SERVER(lgp->args.inode);
        struct nfs4_session *session = nfs4_get_session(server);
-       int ret;
 
        dprintk("--> %s\n", __func__);
-       /* Note the is a race here, where a CB_LAYOUTRECALL can come in
-        * right now covering the LAYOUTGET we are about to send.
-        * However, that is not so catastrophic, and there seems
-        * to be no way to prevent it completely.
-        */
-       if (nfs41_setup_sequence(session, &lgp->args.seq_args,
-                               &lgp->res.seq_res, task))
-               return;
-       ret = pnfs_choose_layoutget_stateid(&lgp->args.stateid,
-                                         NFS_I(lgp->args.inode)->layout,
-                                         &lgp->args.range,
-                                         lgp->args.ctx->state);
-       if (ret < 0)
-               rpc_exit(task, ret);
+       nfs41_setup_sequence(session, &lgp->args.seq_args,
+                               &lgp->res.seq_res, task);
+       dprintk("<-- %s\n", __func__);
 }
 
 static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
 {
        struct nfs4_layoutget *lgp = calldata;
+
+       dprintk("--> %s\n", __func__);
+       nfs41_sequence_done(task, &lgp->res.seq_res);
+       dprintk("<-- %s\n", __func__);
+}
+
+static int
+nfs4_layoutget_handle_exception(struct rpc_task *task,
+               struct nfs4_layoutget *lgp, struct nfs4_exception *exception)
+{
        struct inode *inode = lgp->args.inode;
        struct nfs_server *server = NFS_SERVER(inode);
        struct pnfs_layout_hdr *lo;
-       struct nfs4_state *state = NULL;
-       unsigned long timeo, now, giveup;
+       int status = task->tk_status;
 
        dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status);
 
-       if (!nfs41_sequence_done(task, &lgp->res.seq_res))
-               goto out;
-
-       switch (task->tk_status) {
+       switch (status) {
        case 0:
                goto out;
 
@@ -7863,57 +7884,43 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
         * retry go inband.
         */
        case -NFS4ERR_LAYOUTUNAVAILABLE:
-               task->tk_status = -ENODATA;
+               status = -ENODATA;
                goto out;
        /*
         * NFS4ERR_BADLAYOUT means the MDS cannot return a layout of
         * length lgp->args.minlength != 0 (see RFC5661 section 18.43.3).
         */
        case -NFS4ERR_BADLAYOUT:
-               goto out_overflow;
+               status = -EOVERFLOW;
+               goto out;
        /*
         * NFS4ERR_LAYOUTTRYLATER is a conflict with another client
         * (or clients) writing to the same RAID stripe except when
         * the minlength argument is 0 (see RFC5661 section 18.43.3).
+        *
+        * Treat it like we would RECALLCONFLICT -- we retry for a little
+        * while, and then eventually give up.
         */
        case -NFS4ERR_LAYOUTTRYLATER:
-               if (lgp->args.minlength == 0)
-                       goto out_overflow;
-       /*
-        * NFS4ERR_RECALLCONFLICT is when conflict with self (must recall
-        * existing layout before getting a new one).
-        */
-       case -NFS4ERR_RECALLCONFLICT:
-               timeo = rpc_get_timeout(task->tk_client);
-               giveup = lgp->args.timestamp + timeo;
-               now = jiffies;
-               if (time_after(giveup, now)) {
-                       unsigned long delay;
-
-                       /* Delay for:
-                        * - Not less then NFS4_POLL_RETRY_MIN.
-                        * - One last time a jiffie before we give up
-                        * - exponential backoff (time_now minus start_attempt)
-                        */
-                       delay = max_t(unsigned long, NFS4_POLL_RETRY_MIN,
-                                   min((giveup - now - 1),
-                                       now - lgp->args.timestamp));
-
-                       dprintk("%s: NFS4ERR_RECALLCONFLICT waiting %lu\n",
-                               __func__, delay);
-                       rpc_delay(task, delay);
-                       /* Do not call nfs4_async_handle_error() */
-                       goto out_restart;
+               if (lgp->args.minlength == 0) {
+                       status = -EOVERFLOW;
+                       goto out;
                }
-               break;
+               /* Fallthrough */
+       case -NFS4ERR_RECALLCONFLICT:
+               nfs4_handle_exception(server, -NFS4ERR_RECALLCONFLICT,
+                                       exception);
+               status = -ERECALLCONFLICT;
+               goto out;
        case -NFS4ERR_EXPIRED:
        case -NFS4ERR_BAD_STATEID:
+               exception->timeout = 0;
                spin_lock(&inode->i_lock);
                if (nfs4_stateid_match(&lgp->args.stateid,
                                        &lgp->args.ctx->state->stateid)) {
                        spin_unlock(&inode->i_lock);
                        /* If the open stateid was bad, then recover it. */
-                       state = lgp->args.ctx->state;
+                       exception->state = lgp->args.ctx->state;
                        break;
                }
                lo = NFS_I(inode)->layout;
@@ -7926,25 +7933,21 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
                         * with the current stateid.
                         */
                        set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
-                       pnfs_mark_matching_lsegs_invalid(lo, &head, NULL);
+                       pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0);
                        spin_unlock(&inode->i_lock);
                        pnfs_free_lseg_list(&head);
                } else
                        spin_unlock(&inode->i_lock);
-               goto out_restart;
+               status = -EAGAIN;
+               goto out;
        }
-       if (nfs4_async_handle_error(task, server, state, &lgp->timeout) == -EAGAIN)
-               goto out_restart;
+
+       status = nfs4_handle_exception(server, status, exception);
+       if (exception->retry)
+               status = -EAGAIN;
 out:
        dprintk("<-- %s\n", __func__);
-       return;
-out_restart:
-       task->tk_status = 0;
-       rpc_restart_call_prepare(task);
-       return;
-out_overflow:
-       task->tk_status = -EOVERFLOW;
-       goto out;
+       return status;
 }
 
 static size_t max_response_pages(struct nfs_server *server)
@@ -8013,7 +8016,7 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = {
 };
 
 struct pnfs_layout_segment *
-nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
+nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags)
 {
        struct inode *inode = lgp->args.inode;
        struct nfs_server *server = NFS_SERVER(inode);
@@ -8033,6 +8036,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
                .flags = RPC_TASK_ASYNC,
        };
        struct pnfs_layout_segment *lseg = NULL;
+       struct nfs4_exception exception = { .timeout = *timeout };
        int status = 0;
 
        dprintk("--> %s\n", __func__);
@@ -8046,7 +8050,6 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
                return ERR_PTR(-ENOMEM);
        }
        lgp->args.layout.pglen = max_pages * PAGE_SIZE;
-       lgp->args.timestamp = jiffies;
 
        lgp->res.layoutp = &lgp->args.layout;
        lgp->res.seq_res.sr_slot = NULL;
@@ -8056,13 +8059,17 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
        if (IS_ERR(task))
                return ERR_CAST(task);
        status = nfs4_wait_for_completion_rpc_task(task);
-       if (status == 0)
-               status = task->tk_status;
+       if (status == 0) {
+               status = nfs4_layoutget_handle_exception(task, lgp, &exception);
+               *timeout = exception.timeout;
+       }
+
        trace_nfs4_layoutget(lgp->args.ctx,
                        &lgp->args.range,
                        &lgp->res.range,
                        &lgp->res.stateid,
                        status);
+
        /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */
        if (status == 0 && lgp->res.layoutp->len)
                lseg = pnfs_layout_process(lgp);
@@ -8118,7 +8125,8 @@ static void nfs4_layoutreturn_release(void *calldata)
 
        dprintk("--> %s\n", __func__);
        spin_lock(&lo->plh_inode->i_lock);
-       pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range);
+       pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range,
+                       be32_to_cpu(lrp->args.stateid.seqid));
        pnfs_mark_layout_returned_if_empty(lo);
        if (lrp->res.lrs_present)
                pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
@@ -8653,6 +8661,9 @@ nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
 static bool nfs41_match_stateid(const nfs4_stateid *s1,
                const nfs4_stateid *s2)
 {
+       if (s1->type != s2->type)
+               return false;
+
        if (memcmp(s1->other, s2->other, sizeof(s1->other)) != 0)
                return false;
 
@@ -8793,6 +8804,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
                | NFS_CAP_STATEID_NFSV41
                | NFS_CAP_ATOMIC_OPEN_V1
                | NFS_CAP_ALLOCATE
+               | NFS_CAP_COPY
                | NFS_CAP_DEALLOCATE
                | NFS_CAP_SEEK
                | NFS_CAP_LAYOUTSTATS
index d854693..9679f47 100644 (file)
 
 #define OPENOWNER_POOL_SIZE    8
 
-const nfs4_stateid zero_stateid;
+const nfs4_stateid zero_stateid = {
+       { .data = { 0 } },
+       .type = NFS4_SPECIAL_STATEID_TYPE,
+};
 static DEFINE_MUTEX(nfs_clid_init_mutex);
 
 int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
@@ -985,15 +988,20 @@ static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
  * Byte-range lock aware utility to initialize the stateid of read/write
  * requests.
  */
-int nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state,
-               fmode_t fmode, const struct nfs_lockowner *lockowner)
+int nfs4_select_rw_stateid(struct nfs4_state *state,
+               fmode_t fmode, const struct nfs_lockowner *lockowner,
+               nfs4_stateid *dst, struct rpc_cred **cred)
 {
-       int ret = nfs4_copy_lock_stateid(dst, state, lockowner);
+       int ret;
+
+       if (cred != NULL)
+               *cred = NULL;
+       ret = nfs4_copy_lock_stateid(dst, state, lockowner);
        if (ret == -EIO)
                /* A lost lock - don't even consider delegations */
                goto out;
        /* returns true if delegation stateid found and copied */
-       if (nfs4_copy_delegation_stateid(dst, state->inode, fmode)) {
+       if (nfs4_copy_delegation_stateid(state->inode, fmode, dst, cred)) {
                ret = 0;
                goto out;
        }
index 2c8d05d..9c150b1 100644 (file)
@@ -1520,6 +1520,8 @@ DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close);
                { PNFS_UPDATE_LAYOUT_FOUND_CACHED, "found cached" },    \
                { PNFS_UPDATE_LAYOUT_RETURN, "layoutreturn" },          \
                { PNFS_UPDATE_LAYOUT_BLOCKED, "layouts blocked" },      \
+               { PNFS_UPDATE_LAYOUT_INVALID_OPEN, "invalid open" },    \
+               { PNFS_UPDATE_LAYOUT_RETRY, "retrying" },       \
                { PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" })
 
 TRACE_EVENT(pnfs_update_layout,
@@ -1528,9 +1530,10 @@ TRACE_EVENT(pnfs_update_layout,
                        u64 count,
                        enum pnfs_iomode iomode,
                        struct pnfs_layout_hdr *lo,
+                       struct pnfs_layout_segment *lseg,
                        enum pnfs_update_layout_reason reason
                ),
-               TP_ARGS(inode, pos, count, iomode, lo, reason),
+               TP_ARGS(inode, pos, count, iomode, lo, lseg, reason),
                TP_STRUCT__entry(
                        __field(dev_t, dev)
                        __field(u64, fileid)
@@ -1540,6 +1543,7 @@ TRACE_EVENT(pnfs_update_layout,
                        __field(enum pnfs_iomode, iomode)
                        __field(int, layoutstateid_seq)
                        __field(u32, layoutstateid_hash)
+                       __field(long, lseg)
                        __field(enum pnfs_update_layout_reason, reason)
                ),
                TP_fast_assign(
@@ -1559,11 +1563,12 @@ TRACE_EVENT(pnfs_update_layout,
                                __entry->layoutstateid_seq = 0;
                                __entry->layoutstateid_hash = 0;
                        }
+                       __entry->lseg = (long)lseg;
                ),
                TP_printk(
                        "fileid=%02x:%02x:%llu fhandle=0x%08x "
                        "iomode=%s pos=%llu count=%llu "
-                       "layoutstateid=%d:0x%08x (%s)",
+                       "layoutstateid=%d:0x%08x lseg=0x%lx (%s)",
                        MAJOR(__entry->dev), MINOR(__entry->dev),
                        (unsigned long long)__entry->fileid,
                        __entry->fhandle,
@@ -1571,6 +1576,7 @@ TRACE_EVENT(pnfs_update_layout,
                        (unsigned long long)__entry->pos,
                        (unsigned long long)__entry->count,
                        __entry->layoutstateid_seq, __entry->layoutstateid_hash,
+                       __entry->lseg,
                        show_pnfs_update_layout_reason(__entry->reason)
                )
 );
index 88474a4..661e753 100644 (file)
@@ -4270,6 +4270,24 @@ static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
        return decode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE);
 }
 
+static int decode_open_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+       stateid->type = NFS4_OPEN_STATEID_TYPE;
+       return decode_stateid(xdr, stateid);
+}
+
+static int decode_lock_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+       stateid->type = NFS4_LOCK_STATEID_TYPE;
+       return decode_stateid(xdr, stateid);
+}
+
+static int decode_delegation_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+       stateid->type = NFS4_DELEGATION_STATEID_TYPE;
+       return decode_stateid(xdr, stateid);
+}
+
 static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
 {
        int status;
@@ -4278,7 +4296,7 @@ static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
        if (status != -EIO)
                nfs_increment_open_seqid(status, res->seqid);
        if (!status)
-               status = decode_stateid(xdr, &res->stateid);
+               status = decode_open_stateid(xdr, &res->stateid);
        return status;
 }
 
@@ -4937,7 +4955,7 @@ static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res)
        if (status == -EIO)
                goto out;
        if (status == 0) {
-               status = decode_stateid(xdr, &res->stateid);
+               status = decode_lock_stateid(xdr, &res->stateid);
                if (unlikely(status))
                        goto out;
        } else if (status == -NFS4ERR_DENIED)
@@ -4966,7 +4984,7 @@ static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res)
        if (status != -EIO)
                nfs_increment_lock_seqid(status, res->seqid);
        if (status == 0)
-               status = decode_stateid(xdr, &res->stateid);
+               status = decode_lock_stateid(xdr, &res->stateid);
        return status;
 }
 
@@ -5016,7 +5034,7 @@ static int decode_rw_delegation(struct xdr_stream *xdr,
        __be32 *p;
        int status;
 
-       status = decode_stateid(xdr, &res->delegation);
+       status = decode_delegation_stateid(xdr, &res->delegation);
        if (unlikely(status))
                return status;
        p = xdr_inline_decode(xdr, 4);
@@ -5096,7 +5114,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
        nfs_increment_open_seqid(status, res->seqid);
        if (status)
                return status;
-       status = decode_stateid(xdr, &res->stateid);
+       status = decode_open_stateid(xdr, &res->stateid);
        if (unlikely(status))
                return status;
 
@@ -5136,7 +5154,7 @@ static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmre
        if (status != -EIO)
                nfs_increment_open_seqid(status, res->seqid);
        if (!status)
-               status = decode_stateid(xdr, &res->stateid);
+               status = decode_open_stateid(xdr, &res->stateid);
        return status;
 }
 
@@ -5148,7 +5166,7 @@ static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *re
        if (status != -EIO)
                nfs_increment_open_seqid(status, res->seqid);
        if (!status)
-               status = decode_stateid(xdr, &res->stateid);
+               status = decode_open_stateid(xdr, &res->stateid);
        return status;
 }
 
@@ -5838,6 +5856,12 @@ out_overflow:
 }
 
 #if defined(CONFIG_NFS_V4_1)
+static int decode_layout_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+       stateid->type = NFS4_LAYOUT_STATEID_TYPE;
+       return decode_stateid(xdr, stateid);
+}
+
 static int decode_getdeviceinfo(struct xdr_stream *xdr,
                                struct nfs4_getdeviceinfo_res *res)
 {
@@ -5919,7 +5943,7 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
        if (unlikely(!p))
                goto out_overflow;
        res->return_on_close = be32_to_cpup(p);
-       decode_stateid(xdr, &res->stateid);
+       decode_layout_stateid(xdr, &res->stateid);
        p = xdr_inline_decode(xdr, 4);
        if (unlikely(!p))
                goto out_overflow;
@@ -5985,7 +6009,7 @@ static int decode_layoutreturn(struct xdr_stream *xdr,
                goto out_overflow;
        res->lrs_present = be32_to_cpup(p);
        if (res->lrs_present)
-               status = decode_stateid(xdr, &res->stateid);
+               status = decode_layout_stateid(xdr, &res->stateid);
        return status;
 out_overflow:
        print_overflow_msg(__func__, xdr);
@@ -7515,6 +7539,7 @@ struct rpc_procinfo       nfs4_procedures[] = {
        PROC(DEALLOCATE,        enc_deallocate,         dec_deallocate),
        PROC(LAYOUTSTATS,       enc_layoutstats,        dec_layoutstats),
        PROC(CLONE,             enc_clone,              dec_clone),
+       PROC(COPY,              enc_copy,               dec_copy),
 #endif /* CONFIG_NFS_V4_2 */
 };
 
index 1f6db42..174dd4c 100644 (file)
@@ -341,8 +341,10 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
         * long write-back delay. This will be adjusted in
         * update_nfs_request below if the region is not locked. */
        req->wb_page    = page;
-       req->wb_index   = page_file_index(page);
-       get_page(page);
+       if (page) {
+               req->wb_index = page_file_index(page);
+               get_page(page);
+       }
        req->wb_offset  = offset;
        req->wb_pgbase  = offset;
        req->wb_bytes   = count;
index 89a5ef4..0c7e0d4 100644 (file)
@@ -270,7 +270,7 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
        };
 
        set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
-       return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range);
+       return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range, 0);
 }
 
 static int
@@ -308,7 +308,7 @@ pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode)
 
        spin_lock(&inode->i_lock);
        pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
-       pnfs_mark_matching_lsegs_invalid(lo, &head, &range);
+       pnfs_mark_matching_lsegs_invalid(lo, &head, &range, 0);
        spin_unlock(&inode->i_lock);
        pnfs_free_lseg_list(&head);
        dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__,
@@ -522,13 +522,35 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
        return rv;
 }
 
-/* Returns count of number of matching invalid lsegs remaining in list
- * after call.
+/*
+ * Compare 2 layout stateid sequence ids, to see which is newer,
+ * taking into account wraparound issues.
+ */
+static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
+{
+       return (s32)(s1 - s2) > 0;
+}
+
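A standalone illustration (not kernel code) of why the signed cast makes this comparison wraparound-safe: a sequence id that has just wrapped past zero still counts as newer than one sitting close to UINT32_MAX.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool seqid_is_newer(uint32_t s1, uint32_t s2)
{
        return (int32_t)(s1 - s2) > 0;
}

int main(void)
{
        printf("%d\n", seqid_is_newer(5, 4));          /* 1: plainly newer */
        printf("%d\n", seqid_is_newer(2, 0xfffffffe)); /* 1: newer across the wrap */
        printf("%d\n", seqid_is_newer(0xfffffffe, 2)); /* 0: older across the wrap */
        return 0;
}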
+/**
+ * pnfs_mark_matching_lsegs_invalid - tear down lsegs or mark them for later
+ * @lo: layout header containing the lsegs
+ * @tmp_list: list head where doomed lsegs should go
+ * @recall_range: optional recall range argument to match (may be NULL)
+ * @seq: only invalidate lsegs obtained prior to this sequence (may be 0)
+ *
+ * Walk the list of lsegs in the layout header, and tear down any that should
+ * be destroyed. If "recall_range" is specified then the segment must match
+ * that range. If "seq" is non-zero, then only match segments that were handed
+ * out at or before that sequence.
+ *
+ * Returns number of matching invalid lsegs remaining in list after scanning
+ * it and purging them.
  */
 int
 pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
                            struct list_head *tmp_list,
-                           const struct pnfs_layout_range *recall_range)
+                           const struct pnfs_layout_range *recall_range,
+                           u32 seq)
 {
        struct pnfs_layout_segment *lseg, *next;
        int remaining = 0;
@@ -540,10 +562,12 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
        list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
                if (!recall_range ||
                    should_free_lseg(&lseg->pls_range, recall_range)) {
-                       dprintk("%s: freeing lseg %p iomode %d "
+                       if (seq && pnfs_seqid_is_newer(lseg->pls_seq, seq))
+                               continue;
+                       dprintk("%s: freeing lseg %p iomode %d seq %u "
                                "offset %llu length %llu\n", __func__,
-                               lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
-                               lseg->pls_range.length);
+                               lseg, lseg->pls_range.iomode, lseg->pls_seq,
+                               lseg->pls_range.offset, lseg->pls_range.length);
                        if (!mark_lseg_invalid(lseg, tmp_list))
                                remaining++;
                }
@@ -730,15 +754,6 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
        pnfs_destroy_layouts_byclid(clp, false);
 }
 
-/*
- * Compare 2 layout stateid sequence ids, to see which is newer,
- * taking into account wraparound issues.
- */
-static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
-{
-       return (s32)(s1 - s2) > 0;
-}
-
 /* update lo->plh_stateid with new if is more recent */
 void
 pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
@@ -781,50 +796,22 @@ pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo)
                test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
 }
 
-int
-pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
-                             const struct pnfs_layout_range *range,
-                             struct nfs4_state *open_state)
-{
-       int status = 0;
-
-       dprintk("--> %s\n", __func__);
-       spin_lock(&lo->plh_inode->i_lock);
-       if (pnfs_layoutgets_blocked(lo)) {
-               status = -EAGAIN;
-       } else if (!nfs4_valid_open_stateid(open_state)) {
-               status = -EBADF;
-       } else if (list_empty(&lo->plh_segs) ||
-                  test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
-               int seq;
-
-               do {
-                       seq = read_seqbegin(&open_state->seqlock);
-                       nfs4_stateid_copy(dst, &open_state->stateid);
-               } while (read_seqretry(&open_state->seqlock, seq));
-       } else
-               nfs4_stateid_copy(dst, &lo->plh_stateid);
-       spin_unlock(&lo->plh_inode->i_lock);
-       dprintk("<-- %s\n", __func__);
-       return status;
-}
-
 /*
-* Get layout from server.
-*    for now, assume that whole file layouts are requested.
-*    arg->offset: 0
-*    arg->length: all ones
-*/
+ * Get layout from server.
+ *    for now, assume that whole file layouts are requested.
+ *    arg->offset: 0
+ *    arg->length: all ones
+ */
 static struct pnfs_layout_segment *
 send_layoutget(struct pnfs_layout_hdr *lo,
           struct nfs_open_context *ctx,
+          nfs4_stateid *stateid,
           const struct pnfs_layout_range *range,
-          gfp_t gfp_flags)
+          long *timeout, gfp_t gfp_flags)
 {
        struct inode *ino = lo->plh_inode;
        struct nfs_server *server = NFS_SERVER(ino);
        struct nfs4_layoutget *lgp;
-       struct pnfs_layout_segment *lseg;
        loff_t i_size;
 
        dprintk("--> %s\n", __func__);
@@ -834,40 +821,31 @@ send_layoutget(struct pnfs_layout_hdr *lo,
         * store in lseg. If we race with a concurrent seqid morphing
         * op, then re-send the LAYOUTGET.
         */
-       do {
-               lgp = kzalloc(sizeof(*lgp), gfp_flags);
-               if (lgp == NULL)
-                       return NULL;
-
-               i_size = i_size_read(ino);
-
-               lgp->args.minlength = PAGE_SIZE;
-               if (lgp->args.minlength > range->length)
-                       lgp->args.minlength = range->length;
-               if (range->iomode == IOMODE_READ) {
-                       if (range->offset >= i_size)
-                               lgp->args.minlength = 0;
-                       else if (i_size - range->offset < lgp->args.minlength)
-                               lgp->args.minlength = i_size - range->offset;
-               }
-               lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
-               pnfs_copy_range(&lgp->args.range, range);
-               lgp->args.type = server->pnfs_curr_ld->id;
-               lgp->args.inode = ino;
-               lgp->args.ctx = get_nfs_open_context(ctx);
-               lgp->gfp_flags = gfp_flags;
-               lgp->cred = lo->plh_lc_cred;
-
-               lseg = nfs4_proc_layoutget(lgp, gfp_flags);
-       } while (lseg == ERR_PTR(-EAGAIN));
-
-       if (IS_ERR(lseg) && !nfs_error_is_fatal(PTR_ERR(lseg)))
-               lseg = NULL;
-       else
-               pnfs_layout_clear_fail_bit(lo,
-                               pnfs_iomode_to_fail_bit(range->iomode));
+       lgp = kzalloc(sizeof(*lgp), gfp_flags);
+       if (lgp == NULL)
+               return ERR_PTR(-ENOMEM);
 
-       return lseg;
+       i_size = i_size_read(ino);
+
+       lgp->args.minlength = PAGE_SIZE;
+       if (lgp->args.minlength > range->length)
+               lgp->args.minlength = range->length;
+       if (range->iomode == IOMODE_READ) {
+               if (range->offset >= i_size)
+                       lgp->args.minlength = 0;
+               else if (i_size - range->offset < lgp->args.minlength)
+                       lgp->args.minlength = i_size - range->offset;
+       }
+       lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
+       pnfs_copy_range(&lgp->args.range, range);
+       lgp->args.type = server->pnfs_curr_ld->id;
+       lgp->args.inode = ino;
+       lgp->args.ctx = get_nfs_open_context(ctx);
+       nfs4_stateid_copy(&lgp->args.stateid, stateid);
+       lgp->gfp_flags = gfp_flags;
+       lgp->cred = lo->plh_lc_cred;
+
+       return nfs4_proc_layoutget(lgp, timeout, gfp_flags);
 }
 
 static void pnfs_clear_layoutcommit(struct inode *inode,
@@ -899,6 +877,7 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo)
        if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
                return false;
        lo->plh_return_iomode = 0;
+       lo->plh_return_seq = 0;
        pnfs_get_layout_hdr(lo);
        clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
        return true;
@@ -969,6 +948,7 @@ static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
                bool send;
 
                nfs4_stateid_copy(&stateid, &lo->plh_stateid);
+               stateid.seqid = cpu_to_be32(lo->plh_return_seq);
                iomode = lo->plh_return_iomode;
                send = pnfs_prepare_layoutreturn(lo);
                spin_unlock(&inode->i_lock);
@@ -1012,7 +992,7 @@ _pnfs_return_layout(struct inode *ino)
        pnfs_get_layout_hdr(lo);
        empty = list_empty(&lo->plh_segs);
        pnfs_clear_layoutcommit(ino, &tmp_list);
-       pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
+       pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL, 0);
 
        if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
                struct pnfs_layout_range range = {
@@ -1341,23 +1321,28 @@ out_existing:
 
 /*
  * iomode matching rules:
- * iomode      lseg    match
- * -----       -----   -----
- * ANY         READ    true
- * ANY         RW      true
- * RW          READ    false
- * RW          RW      true
- * READ                READ    true
- * READ                RW      true
+ * iomode      lseg    strict_iomode   match
+ * ------      ----    -------------   -----
+ * ANY         READ    N/A             true
+ * ANY         RW      N/A             true
+ * RW          READ    N/A             false
+ * RW          RW      N/A             true
+ * READ        READ    N/A             true
+ * READ        RW      true            false
+ * READ        RW      false           true
  */
 static bool
 pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
-                const struct pnfs_layout_range *range)
+                const struct pnfs_layout_range *range,
+                bool strict_iomode)
 {
        struct pnfs_layout_range range1;
 
        if ((range->iomode == IOMODE_RW &&
             ls_range->iomode != IOMODE_RW) ||
+           (strict_iomode && range->iomode != ls_range->iomode) ||
            !pnfs_lseg_range_intersecting(ls_range, range))
                return 0;
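The strict_iomode column in the table above boils down to one extra rule for READ/RW requests; a toy user-space sketch (hypothetical helper, not the kernel function) of how it changes the outcome:

#include <stdbool.h>
#include <stdio.h>

enum iomode { IOMODE_READ, IOMODE_RW };

static bool iomode_matches(enum iomode lseg, enum iomode want, bool strict)
{
        if (want == IOMODE_RW && lseg != IOMODE_RW)
                return false;   /* an RW request never matches a READ lseg */
        if (strict && want != lseg)
                return false;   /* strict: iomodes must be identical */
        return true;
}

int main(void)
{
        printf("%d\n", iomode_matches(IOMODE_RW, IOMODE_READ, false)); /* 1 */
        printf("%d\n", iomode_matches(IOMODE_RW, IOMODE_READ, true));  /* 0 */
        printf("%d\n", iomode_matches(IOMODE_READ, IOMODE_RW, true));  /* 0 */
        return 0;
}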
 
@@ -1372,7 +1357,8 @@ pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
  */
 static struct pnfs_layout_segment *
 pnfs_find_lseg(struct pnfs_layout_hdr *lo,
-               struct pnfs_layout_range *range)
+               struct pnfs_layout_range *range,
+               bool strict_iomode)
 {
        struct pnfs_layout_segment *lseg, *ret = NULL;
 
@@ -1381,7 +1367,8 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
        list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
                if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
                    !test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) &&
-                   pnfs_lseg_range_match(&lseg->pls_range, range)) {
+                   pnfs_lseg_range_match(&lseg->pls_range, range,
+                                         strict_iomode)) {
                        ret = pnfs_get_lseg(lseg);
                        break;
                }
@@ -1498,6 +1485,7 @@ pnfs_update_layout(struct inode *ino,
                   loff_t pos,
                   u64 count,
                   enum pnfs_iomode iomode,
+                  bool strict_iomode,
                   gfp_t gfp_flags)
 {
        struct pnfs_layout_range arg = {
@@ -1505,27 +1493,30 @@ pnfs_update_layout(struct inode *ino,
                .offset = pos,
                .length = count,
        };
-       unsigned pg_offset;
+       unsigned pg_offset, seq;
        struct nfs_server *server = NFS_SERVER(ino);
        struct nfs_client *clp = server->nfs_client;
-       struct pnfs_layout_hdr *lo;
+       struct pnfs_layout_hdr *lo = NULL;
        struct pnfs_layout_segment *lseg = NULL;
+       nfs4_stateid stateid;
+       long timeout = 0;
+       unsigned long giveup = jiffies + rpc_get_timeout(server->client);
        bool first;
 
        if (!pnfs_enabled_sb(NFS_SERVER(ino))) {
-               trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+               trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
                                 PNFS_UPDATE_LAYOUT_NO_PNFS);
                goto out;
        }
 
        if (iomode == IOMODE_READ && i_size_read(ino) == 0) {
-               trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+               trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
                                 PNFS_UPDATE_LAYOUT_RD_ZEROLEN);
                goto out;
        }
 
        if (pnfs_within_mdsthreshold(ctx, ino, iomode)) {
-               trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+               trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
                                 PNFS_UPDATE_LAYOUT_MDSTHRESH);
                goto out;
        }
@@ -1536,14 +1527,14 @@ lookup_again:
        lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
        if (lo == NULL) {
                spin_unlock(&ino->i_lock);
-               trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+               trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
                                 PNFS_UPDATE_LAYOUT_NOMEM);
                goto out;
        }
 
        /* Do we even need to bother with this? */
        if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
-               trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+               trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
                                 PNFS_UPDATE_LAYOUT_BULK_RECALL);
                dprintk("%s matches recall, use MDS\n", __func__);
                goto out_unlock;
@@ -1551,14 +1542,34 @@ lookup_again:
 
        /* if LAYOUTGET already failed once we don't try again */
        if (pnfs_layout_io_test_failed(lo, iomode)) {
-               trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+               trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
                                 PNFS_UPDATE_LAYOUT_IO_TEST_FAIL);
                goto out_unlock;
        }
 
-       first = list_empty(&lo->plh_segs);
-       if (first) {
-               /* The first layoutget for the file. Need to serialize per
+       lseg = pnfs_find_lseg(lo, &arg, strict_iomode);
+       if (lseg) {
+               trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+                               PNFS_UPDATE_LAYOUT_FOUND_CACHED);
+               goto out_unlock;
+       }
+
+       if (!nfs4_valid_open_stateid(ctx->state)) {
+               trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+                               PNFS_UPDATE_LAYOUT_INVALID_OPEN);
+               goto out_unlock;
+       }
+
+       /*
+        * Choose a stateid for the LAYOUTGET. If we don't have a layout
+        * stateid, or it has been invalidated, then we must use the open
+        * stateid.
+        */
+       if (lo->plh_stateid.seqid == 0 ||
+           test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
+
+               /*
+                * The first layoutget for the file. Need to serialize per
                 * RFC 5661 Errata 3208.
                 */
                if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET,
@@ -1567,18 +1578,17 @@ lookup_again:
                        wait_on_bit(&lo->plh_flags, NFS_LAYOUT_FIRST_LAYOUTGET,
                                    TASK_UNINTERRUPTIBLE);
                        pnfs_put_layout_hdr(lo);
+                       dprintk("%s retrying\n", __func__);
                        goto lookup_again;
                }
+
+               first = true;
+               do {
+                       seq = read_seqbegin(&ctx->state->seqlock);
+                       nfs4_stateid_copy(&stateid, &ctx->state->stateid);
+               } while (read_seqretry(&ctx->state->seqlock, seq));
        } else {
-               /* Check to see if the layout for the given range
-                * already exists
-                */
-               lseg = pnfs_find_lseg(lo, &arg);
-               if (lseg) {
-                       trace_pnfs_update_layout(ino, pos, count, iomode, lo,
-                                       PNFS_UPDATE_LAYOUT_FOUND_CACHED);
-                       goto out_unlock;
-               }
+               nfs4_stateid_copy(&stateid, &lo->plh_stateid);
        }
 
        /*
@@ -1593,15 +1603,17 @@ lookup_again:
                                pnfs_clear_first_layoutget(lo);
                        pnfs_put_layout_hdr(lo);
                        dprintk("%s retrying\n", __func__);
+                       trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+                                       lseg, PNFS_UPDATE_LAYOUT_RETRY);
                        goto lookup_again;
                }
-               trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+               trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
                                PNFS_UPDATE_LAYOUT_RETURN);
                goto out_put_layout_hdr;
        }
 
        if (pnfs_layoutgets_blocked(lo)) {
-               trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+               trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
                                PNFS_UPDATE_LAYOUT_BLOCKED);
                goto out_unlock;
        }
@@ -1626,10 +1638,36 @@ lookup_again:
        if (arg.length != NFS4_MAX_UINT64)
                arg.length = PAGE_ALIGN(arg.length);
 
-       lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
-       atomic_dec(&lo->plh_outstanding);
-       trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+       lseg = send_layoutget(lo, ctx, &stateid, &arg, &timeout, gfp_flags);
+       trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
                                 PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
+       if (IS_ERR(lseg)) {
+               switch (PTR_ERR(lseg)) {
+               case -ERECALLCONFLICT:
+                       if (time_after(jiffies, giveup))
+                               lseg = NULL;
+                       /* Fallthrough */
+               case -EAGAIN:
+                       pnfs_put_layout_hdr(lo);
+                       if (first)
+                               pnfs_clear_first_layoutget(lo);
+                       if (lseg) {
+                               trace_pnfs_update_layout(ino, pos, count,
+                                       iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY);
+                               goto lookup_again;
+                       }
+                       /* Fallthrough */
+               default:
+                       if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
+                               pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
+                               lseg = NULL;
+                       }
+               }
+       } else {
+               pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
+       }
+
+       atomic_dec(&lo->plh_outstanding);
 out_put_layout_hdr:
        if (first)
                pnfs_clear_first_layoutget(lo);
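The retry handling above loops back to lookup_again on -ERECALLCONFLICT until the deadline derived from rpc_get_timeout() passes, after which the request gives up and lets the caller fall back. A hypothetical user-space sketch of the same bounded-retry shape, with time() standing in for jiffies:

#include <stdio.h>
#include <time.h>
#include <unistd.h>

static int try_get_layout(int attempt)
{
        return attempt < 3 ? -1 : 0;   /* pretend the first two attempts conflict */
}

int main(void)
{
        time_t giveup = time(NULL) + 10;   /* stand-in for jiffies + rpc_get_timeout() */
        int attempt = 0;

        while (try_get_layout(++attempt) < 0) {
                if (time(NULL) > giveup) {
                        fprintf(stderr, "giving up after %d attempts\n", attempt);
                        return 1;
                }
                sleep(1);                  /* stand-in for the poll/backoff delay */
        }
        printf("layout granted on attempt %d\n", attempt);
        return 0;
}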
@@ -1678,38 +1716,36 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
        struct pnfs_layout_segment *lseg;
        struct inode *ino = lo->plh_inode;
        LIST_HEAD(free_me);
-       int status = -EINVAL;
 
        if (!pnfs_sanity_check_layout_range(&res->range))
-               goto out;
+               return ERR_PTR(-EINVAL);
 
        /* Inject layout blob into I/O device driver */
        lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
-       if (!lseg || IS_ERR(lseg)) {
+       if (IS_ERR_OR_NULL(lseg)) {
                if (!lseg)
-                       status = -ENOMEM;
-               else
-                       status = PTR_ERR(lseg);
-               dprintk("%s: Could not allocate layout: error %d\n",
-                      __func__, status);
-               goto out;
+                       lseg = ERR_PTR(-ENOMEM);
+
+               dprintk("%s: Could not allocate layout: error %ld\n",
+                      __func__, PTR_ERR(lseg));
+               return lseg;
        }
 
        init_lseg(lo, lseg);
        lseg->pls_range = res->range;
+       lseg->pls_seq = be32_to_cpu(res->stateid.seqid);
 
        spin_lock(&ino->i_lock);
        if (pnfs_layoutgets_blocked(lo)) {
                dprintk("%s forget reply due to state\n", __func__);
-               goto out_forget_reply;
+               goto out_forget;
        }
 
        if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
                /* existing state ID, make sure the sequence number matches. */
                if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
                        dprintk("%s forget reply due to sequence\n", __func__);
-                       status = -EAGAIN;
-                       goto out_forget_reply;
+                       goto out_forget;
                }
                pnfs_set_layout_stateid(lo, &res->stateid, false);
        } else {
@@ -1718,7 +1754,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
                 * inode invalid, and don't bother validating the stateid
                 * sequence number.
                 */
-               pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL);
+               pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL, 0);
 
                nfs4_stateid_copy(&lo->plh_stateid, &res->stateid);
                lo->plh_barrier = be32_to_cpu(res->stateid.seqid);
@@ -1735,18 +1771,17 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
        spin_unlock(&ino->i_lock);
        pnfs_free_lseg_list(&free_me);
        return lseg;
-out:
-       return ERR_PTR(status);
 
-out_forget_reply:
+out_forget:
        spin_unlock(&ino->i_lock);
        lseg->pls_layout = lo;
        NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
-       goto out;
+       return ERR_PTR(-EAGAIN);
 }
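pnfs_layout_process() now reports failure through its return value alone: the ERR_PTR()/IS_ERR()/PTR_ERR() idiom encodes a small negative errno in an otherwise invalid pointer, so the separate "status" out-variable can go away. A caller-side sketch (only the idiom is the point, the surrounding code is assumed):

	struct pnfs_layout_segment *lseg;

	lseg = pnfs_layout_process(lgp);
	if (IS_ERR(lseg))
		return PTR_ERR(lseg);	/* -EINVAL, -ENOMEM or -EAGAIN per the paths above */
	/* ... use the valid lseg ... */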
 
 static void
-pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode)
+pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
+                        u32 seq)
 {
        if (lo->plh_return_iomode == iomode)
                return;
@@ -1754,6 +1789,8 @@ pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode)
                iomode = IOMODE_ANY;
        lo->plh_return_iomode = iomode;
        set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+       if (!lo->plh_return_seq || pnfs_seqid_is_newer(seq, lo->plh_return_seq))
+               lo->plh_return_seq = seq;
 }
 
 /**
@@ -1769,7 +1806,8 @@ pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode)
 int
 pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
                                struct list_head *tmp_list,
-                               const struct pnfs_layout_range *return_range)
+                               const struct pnfs_layout_range *return_range,
+                               u32 seq)
 {
        struct pnfs_layout_segment *lseg, *next;
        int remaining = 0;
@@ -1792,8 +1830,11 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
                                continue;
                        remaining++;
                        set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
-                       pnfs_set_plh_return_iomode(lo, return_range->iomode);
                }
+
+       if (remaining)
+               pnfs_set_plh_return_info(lo, return_range->iomode, seq);
+
        return remaining;
 }
 
@@ -1810,13 +1851,14 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
        bool return_now = false;
 
        spin_lock(&inode->i_lock);
-       pnfs_set_plh_return_iomode(lo, range.iomode);
+       pnfs_set_plh_return_info(lo, range.iomode, lseg->pls_seq);
        /*
         * mark all matching lsegs so that we are sure to have no live
         * segments at hand when sending layoutreturn. See pnfs_put_lseg()
         * for how it works.
         */
-       if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range)) {
+       if (!pnfs_mark_matching_lsegs_return(lo, &free_me,
+                                               &range, lseg->pls_seq)) {
                nfs4_stateid stateid;
                enum pnfs_iomode iomode = lo->plh_return_iomode;
 
@@ -1849,6 +1891,7 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r
                                                   req_offset(req),
                                                   rd_size,
                                                   IOMODE_READ,
+                                                  false,
                                                   GFP_KERNEL);
                if (IS_ERR(pgio->pg_lseg)) {
                        pgio->pg_error = PTR_ERR(pgio->pg_lseg);
@@ -1873,6 +1916,7 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
                                                   req_offset(req),
                                                   wb_size,
                                                   IOMODE_RW,
+                                                  false,
                                                   GFP_NOFS);
                if (IS_ERR(pgio->pg_lseg)) {
                        pgio->pg_error = PTR_ERR(pgio->pg_lseg);
@@ -2143,12 +2187,15 @@ pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
 }
 
 /* Resend all requests through pnfs. */
-int pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr)
+void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr)
 {
        struct nfs_pageio_descriptor pgio;
 
-       nfs_pageio_init_read(&pgio, hdr->inode, false, hdr->completion_ops);
-       return nfs_pageio_resend(&pgio, hdr);
+       if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+               nfs_pageio_init_read(&pgio, hdr->inode, false,
+                                       hdr->completion_ops);
+               hdr->task.tk_status = nfs_pageio_resend(&pgio, hdr);
+       }
 }
 EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs);
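The rewritten resend helper is idempotent per header: test_and_set_bit() returns the bit's previous value, so only the first caller actually queues the resend, and the outcome now travels in hdr->task.tk_status rather than a return code. The guard in isolation (do_resend() is a placeholder):

	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
		do_resend(hdr);		/* first caller wins; later calls are no-ops */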
 
@@ -2158,12 +2205,11 @@ pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
        const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
        struct pnfs_layout_segment *lseg = desc->pg_lseg;
        enum pnfs_try_status trypnfs;
-       int err = 0;
 
        trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
        if (trypnfs == PNFS_TRY_AGAIN)
-               err = pnfs_read_resend_pnfs(hdr);
-       if (trypnfs == PNFS_NOT_ATTEMPTED || err)
+               pnfs_read_resend_pnfs(hdr);
+       if (trypnfs == PNFS_NOT_ATTEMPTED || hdr->task.tk_status)
                pnfs_read_through_mds(desc, hdr);
 }
 
@@ -2405,7 +2451,7 @@ pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
        spin_lock(&inode->i_lock);
        if (!NFS_I(inode)->layout) {
                spin_unlock(&inode->i_lock);
-               goto out;
+               goto out_clear_layoutstats;
        }
        hdr = NFS_I(inode)->layout;
        pnfs_get_layout_hdr(hdr);
@@ -2434,6 +2480,7 @@ out_free:
        kfree(data);
 out_put:
        pnfs_put_layout_hdr(hdr);
+out_clear_layoutstats:
        smp_mb__before_atomic();
        clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags);
        smp_mb__after_atomic();
index 1ac1db5..b21bd0b 100644 (file)
@@ -64,6 +64,7 @@ struct pnfs_layout_segment {
        struct list_head pls_lc_list;
        struct pnfs_layout_range pls_range;
        atomic_t pls_refcount;
+       u32 pls_seq;
        unsigned long pls_flags;
        struct pnfs_layout_hdr *pls_layout;
        struct work_struct pls_work;
@@ -194,6 +195,7 @@ struct pnfs_layout_hdr {
        unsigned long           plh_flags;
        nfs4_stateid            plh_stateid;
        u32                     plh_barrier; /* ignore lower seqids */
+       u32                     plh_return_seq;
        enum pnfs_iomode        plh_return_iomode;
        loff_t                  plh_lwb; /* last write byte for layoutcommit */
        struct rpc_cred         *plh_lc_cred; /* layoutcommit cred */
@@ -226,7 +228,7 @@ extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
 extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
                                   struct pnfs_device *dev,
                                   struct rpc_cred *cred);
-extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags);
+extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags);
 extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync);
 
 /* pnfs.c */
@@ -258,16 +260,14 @@ void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo);
 void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
                             const nfs4_stateid *new,
                             bool update_barrier);
-int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
-                                 struct pnfs_layout_hdr *lo,
-                                 const struct pnfs_layout_range *range,
-                                 struct nfs4_state *open_state);
 int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
                                struct list_head *tmp_list,
-                               const struct pnfs_layout_range *recall_range);
+                               const struct pnfs_layout_range *recall_range,
+                               u32 seq);
 int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
                                struct list_head *tmp_list,
-                               const struct pnfs_layout_range *recall_range);
+                               const struct pnfs_layout_range *recall_range,
+                               u32 seq);
 bool pnfs_roc(struct inode *ino);
 void pnfs_roc_release(struct inode *ino);
 void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
@@ -282,12 +282,13 @@ int _pnfs_return_layout(struct inode *);
 int pnfs_commit_and_return_layout(struct inode *);
 void pnfs_ld_write_done(struct nfs_pgio_header *);
 void pnfs_ld_read_done(struct nfs_pgio_header *);
-int pnfs_read_resend_pnfs(struct nfs_pgio_header *);
+void pnfs_read_resend_pnfs(struct nfs_pgio_header *);
 struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
                                               struct nfs_open_context *ctx,
                                               loff_t pos,
                                               u64 count,
                                               enum pnfs_iomode iomode,
+                                              bool strict_iomode,
                                               gfp_t gfp_flags);
 void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo);
 
index 4aaed89..0dfc476 100644 (file)
@@ -61,7 +61,7 @@ EXPORT_SYMBOL_GPL(pnfs_generic_commit_release);
 
 /* The generic layer is about to remove the req from the commit list.
  * If this will make the bucket empty, it will need to put the lseg reference.
- * Note this must be called holding the inode (/cinfo) lock
+ * Note this must be called holding i_lock
  */
 void
 pnfs_generic_clear_request_commit(struct nfs_page *req,
@@ -98,7 +98,7 @@ pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst,
                if (!nfs_lock_request(req))
                        continue;
                kref_get(&req->wb_kref);
-               if (cond_resched_lock(cinfo->lock))
+               if (cond_resched_lock(&cinfo->inode->i_lock))
                        list_safe_reset_next(req, tmp, wb_list);
                nfs_request_remove_commit_list(req, cinfo);
                clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
@@ -119,7 +119,7 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
        struct list_head *dst = &bucket->committing;
        int ret;
 
-       lockdep_assert_held(cinfo->lock);
+       lockdep_assert_held(&cinfo->inode->i_lock);
        ret = pnfs_generic_transfer_commit_list(src, dst, cinfo, max);
        if (ret) {
                cinfo->ds->nwritten -= ret;
@@ -142,7 +142,7 @@ int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo,
 {
        int i, rv = 0, cnt;
 
-       lockdep_assert_held(cinfo->lock);
+       lockdep_assert_held(&cinfo->inode->i_lock);
        for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) {
                cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i],
                                                       cinfo, max);
@@ -161,16 +161,16 @@ void pnfs_generic_recover_commit_reqs(struct list_head *dst,
        struct pnfs_layout_segment *freeme;
        int i;
 
-       lockdep_assert_held(cinfo->lock);
+       lockdep_assert_held(&cinfo->inode->i_lock);
 restart:
        for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
                if (pnfs_generic_transfer_commit_list(&b->written, dst,
                                                      cinfo, 0)) {
                        freeme = b->wlseg;
                        b->wlseg = NULL;
-                       spin_unlock(cinfo->lock);
+                       spin_unlock(&cinfo->inode->i_lock);
                        pnfs_put_lseg(freeme);
-                       spin_lock(cinfo->lock);
+                       spin_lock(&cinfo->inode->i_lock);
                        goto restart;
                }
        }
@@ -186,7 +186,7 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx)
        LIST_HEAD(pages);
        int i;
 
-       spin_lock(cinfo->lock);
+       spin_lock(&cinfo->inode->i_lock);
        for (i = idx; i < fl_cinfo->nbuckets; i++) {
                bucket = &fl_cinfo->buckets[i];
                if (list_empty(&bucket->committing))
@@ -194,12 +194,12 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx)
                freeme = bucket->clseg;
                bucket->clseg = NULL;
                list_splice_init(&bucket->committing, &pages);
-               spin_unlock(cinfo->lock);
+               spin_unlock(&cinfo->inode->i_lock);
                nfs_retry_commit(&pages, freeme, cinfo, i);
                pnfs_put_lseg(freeme);
-               spin_lock(cinfo->lock);
+               spin_lock(&cinfo->inode->i_lock);
        }
-       spin_unlock(cinfo->lock);
+       spin_unlock(&cinfo->inode->i_lock);
 }
 
 static unsigned int
@@ -238,14 +238,31 @@ void pnfs_fetch_commit_bucket_list(struct list_head *pages,
        struct pnfs_commit_bucket *bucket;
 
        bucket = &cinfo->ds->buckets[data->ds_commit_index];
-       spin_lock(cinfo->lock);
+       spin_lock(&cinfo->inode->i_lock);
        list_splice_init(&bucket->committing, pages);
        data->lseg = bucket->clseg;
        bucket->clseg = NULL;
-       spin_unlock(cinfo->lock);
+       spin_unlock(&cinfo->inode->i_lock);
 
 }
 
+/* Helper function for pnfs_generic_commit_pagelist to catch an empty
+ * page list. This can happen when two commits race. */
+static bool
+pnfs_generic_commit_cancel_empty_pagelist(struct list_head *pages,
+                                         struct nfs_commit_data *data,
+                                         struct nfs_commit_info *cinfo)
+{
+       if (list_empty(pages)) {
+               if (atomic_dec_and_test(&cinfo->mds->rpcs_out))
+                       wake_up_atomic_t(&cinfo->mds->rpcs_out);
+               nfs_commitdata_release(data);
+               return true;
+       }
+
+       return false;
+}
+
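When two commits race, one of them can find its bucket already drained; the helper above then has to release the commit data and drop its rpcs_out reference so that a task waiting for all outstanding commits is not left sleeping. The wake-up pairing, shown generically (the counter name and the wait-side action callback are placeholders, not from this patch):

	/* completion side: drop a reference, wake waiters when it hits zero */
	if (atomic_dec_and_test(&outstanding))
		wake_up_atomic_t(&outstanding);

	/* waiting side, elsewhere: sleep until the counter reaches zero */
	wait_on_atomic_t(&outstanding, my_wait_action, TASK_UNINTERRUPTIBLE);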
 /* This follows nfs_commit_list pretty closely */
 int
 pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
@@ -280,6 +297,11 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
        list_for_each_entry_safe(data, tmp, &list, pages) {
                list_del_init(&data->pages);
                if (data->ds_commit_index < 0) {
+                       /* another commit raced with us */
+                       if (pnfs_generic_commit_cancel_empty_pagelist(mds_pages,
+                               data, cinfo))
+                               continue;
+
                        nfs_init_commit(data, mds_pages, NULL, cinfo);
                        nfs_initiate_commit(NFS_CLIENT(inode), data,
                                            NFS_PROTO(data->inode),
@@ -288,6 +310,12 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
                        LIST_HEAD(pages);
 
                        pnfs_fetch_commit_bucket_list(&pages, data, cinfo);
+
+                       /* another commit raced with us */
+                       if (pnfs_generic_commit_cancel_empty_pagelist(&pages,
+                               data, cinfo))
+                               continue;
+
                        nfs_init_commit(data, &pages, data->lseg, cinfo);
                        initiate_commit(data, how);
                }
@@ -874,12 +902,12 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
        struct list_head *list;
        struct pnfs_commit_bucket *buckets;
 
-       spin_lock(cinfo->lock);
+       spin_lock(&cinfo->inode->i_lock);
        buckets = cinfo->ds->buckets;
        list = &buckets[ds_commit_idx].written;
        if (list_empty(list)) {
                if (!pnfs_is_valid_lseg(lseg)) {
-                       spin_unlock(cinfo->lock);
+                       spin_unlock(&cinfo->inode->i_lock);
                        cinfo->completion_ops->resched_write(cinfo, req);
                        return;
                }
@@ -896,7 +924,7 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
        cinfo->ds->nwritten++;
 
        nfs_request_add_commit_list_locked(req, list, cinfo);
-       spin_unlock(cinfo->lock);
+       spin_unlock(&cinfo->inode->i_lock);
        nfs_mark_page_unstable(req->wb_page, cinfo);
 }
 EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
index f126828..2137e02 100644 (file)
@@ -191,6 +191,7 @@ static const match_table_t nfs_mount_option_tokens = {
 
 enum {
        Opt_xprt_udp, Opt_xprt_udp6, Opt_xprt_tcp, Opt_xprt_tcp6, Opt_xprt_rdma,
+       Opt_xprt_rdma6,
 
        Opt_xprt_err
 };
@@ -201,6 +202,7 @@ static const match_table_t nfs_xprt_protocol_tokens = {
        { Opt_xprt_tcp, "tcp" },
        { Opt_xprt_tcp6, "tcp6" },
        { Opt_xprt_rdma, "rdma" },
+       { Opt_xprt_rdma6, "rdma6" },
 
        { Opt_xprt_err, NULL }
 };
@@ -1456,6 +1458,8 @@ static int nfs_parse_mount_options(char *raw,
                                mnt->flags |= NFS_MOUNT_TCP;
                                mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
                                break;
+                       case Opt_xprt_rdma6:
+                               protofamily = AF_INET6;
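+                               /* fall through: rdma6 shares the rdma handling below */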
                        case Opt_xprt_rdma:
                                /* vector side protocols to TCP */
                                mnt->flags |= NFS_MOUNT_TCP;
@@ -2408,6 +2412,11 @@ static int nfs_compare_super_address(struct nfs_server *server1,
                                     struct nfs_server *server2)
 {
        struct sockaddr *sap1, *sap2;
+       struct rpc_xprt *xprt1 = server1->client->cl_xprt;
+       struct rpc_xprt *xprt2 = server2->client->cl_xprt;
+
+       if (!net_eq(xprt1->xprt_net, xprt2->xprt_net))
+               return 0;
 
        sap1 = (struct sockaddr *)&server1->nfs_client->cl_addr;
        sap2 = (struct sockaddr *)&server2->nfs_client->cl_addr;
index 5f4fd53..e1c74d3 100644 (file)
@@ -245,8 +245,7 @@ static void nfs_mark_uptodate(struct nfs_page *req)
 static int wb_priority(struct writeback_control *wbc)
 {
        int ret = 0;
-       if (wbc->for_reclaim)
-               return FLUSH_HIGHPRI | FLUSH_COND_STABLE;
+
        if (wbc->sync_mode == WB_SYNC_ALL)
                ret = FLUSH_COND_STABLE;
        return ret;
@@ -737,7 +736,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
                head = req->wb_head;
 
                spin_lock(&inode->i_lock);
-               if (likely(!PageSwapCache(head->wb_page))) {
+               if (likely(head->wb_page && !PageSwapCache(head->wb_page))) {
                        set_page_private(head->wb_page, 0);
                        ClearPagePrivate(head->wb_page);
                        smp_mb__after_atomic();
@@ -759,7 +758,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
 static void
 nfs_mark_request_dirty(struct nfs_page *req)
 {
-       __set_page_dirty_nobuffers(req->wb_page);
+       if (req->wb_page)
+               __set_page_dirty_nobuffers(req->wb_page);
 }
 
 /*
@@ -804,7 +804,7 @@ nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
  * number of outstanding requests requiring a commit as well as
  * the MM page stats.
  *
- * The caller must hold the cinfo->lock, and the nfs_page lock.
+ * The caller must hold cinfo->inode->i_lock, and the nfs_page lock.
  */
 void
 nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst,
@@ -832,10 +832,11 @@ EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked);
 void
 nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo)
 {
-       spin_lock(cinfo->lock);
+       spin_lock(&cinfo->inode->i_lock);
        nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo);
-       spin_unlock(cinfo->lock);
-       nfs_mark_page_unstable(req->wb_page, cinfo);
+       spin_unlock(&cinfo->inode->i_lock);
+       if (req->wb_page)
+               nfs_mark_page_unstable(req->wb_page, cinfo);
 }
 EXPORT_SYMBOL_GPL(nfs_request_add_commit_list);
 
@@ -864,7 +865,7 @@ EXPORT_SYMBOL_GPL(nfs_request_remove_commit_list);
 static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
                                      struct inode *inode)
 {
-       cinfo->lock = &inode->i_lock;
+       cinfo->inode = inode;
        cinfo->mds = &NFS_I(inode)->commit_info;
        cinfo->ds = pnfs_get_ds_info(inode);
        cinfo->dreq = NULL;
@@ -967,7 +968,7 @@ nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
        return cinfo->mds->ncommit;
 }
 
-/* cinfo->lock held by caller */
+/* cinfo->inode->i_lock held by caller */
 int
 nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
                     struct nfs_commit_info *cinfo, int max)
@@ -979,7 +980,7 @@ nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
                if (!nfs_lock_request(req))
                        continue;
                kref_get(&req->wb_kref);
-               if (cond_resched_lock(cinfo->lock))
+               if (cond_resched_lock(&cinfo->inode->i_lock))
                        list_safe_reset_next(req, tmp, wb_list);
                nfs_request_remove_commit_list(req, cinfo);
                nfs_list_add_request(req, dst);
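Several of the scan loops above now hold the inode's i_lock across a potentially long list walk. cond_resched_lock() drops and re-takes the lock if a reschedule is needed and returns nonzero when it did, which is why the loop refreshes its lookahead pointer with list_safe_reset_next() afterwards. The pattern in isolation:

	list_for_each_entry_safe(req, tmp, src, wb_list) {
		if (cond_resched_lock(&cinfo->inode->i_lock))
			/* the lock was dropped; 'tmp' may be stale, recompute it */
			list_safe_reset_next(req, tmp, wb_list);
		/* ... move req onto the destination list ... */
	}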
@@ -1005,7 +1006,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,
 {
        int ret = 0;
 
-       spin_lock(cinfo->lock);
+       spin_lock(&cinfo->inode->i_lock);
        if (cinfo->mds->ncommit > 0) {
                const int max = INT_MAX;
 
@@ -1013,7 +1014,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,
                                           cinfo, max);
                ret += pnfs_scan_commit_lists(inode, cinfo, max - ret);
        }
-       spin_unlock(cinfo->lock);
+       spin_unlock(&cinfo->inode->i_lock);
        return ret;
 }
 
@@ -1709,6 +1710,10 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
 {
        struct nfs_commit_data  *data;
 
+       /* another commit raced with us */
+       if (list_empty(head))
+               return 0;
+
        data = nfs_commitdata_alloc();
 
        if (!data)
@@ -1724,6 +1729,36 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
        return -ENOMEM;
 }
 
+int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf)
+{
+       struct inode *inode = file_inode(file);
+       struct nfs_open_context *open;
+       struct nfs_commit_info cinfo;
+       struct nfs_page *req;
+       int ret;
+
+       open = get_nfs_open_context(nfs_file_open_context(file));
+       req  = nfs_create_request(open, NULL, NULL, 0, i_size_read(inode));
+       if (IS_ERR(req)) {
+               ret = PTR_ERR(req);
+               goto out_put;
+       }
+
+       nfs_init_cinfo_from_inode(&cinfo, inode);
+
+       memcpy(&req->wb_verf, verf, sizeof(struct nfs_write_verifier));
+       nfs_request_add_commit_list(req, &cinfo);
+       ret = nfs_commit_inode(inode, FLUSH_SYNC);
+       if (ret > 0)
+               ret = 0;
+
+       nfs_free_request(req);
+out_put:
+       put_nfs_open_context(open);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_commit_file);
+
 /*
  * COMMIT call returned
  */
@@ -1748,7 +1783,8 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
        while (!list_empty(&data->pages)) {
                req = nfs_list_entry(data->pages.next);
                nfs_list_remove_request(req);
-               nfs_clear_page_commit(req->wb_page);
+               if (req->wb_page)
+                       nfs_clear_page_commit(req->wb_page);
 
                dprintk("NFS:       commit (%s/%llu %d@%lld)",
                        req->wb_context->dentry->d_sb->s_id,
index 93d5853..dba2ff8 100644 (file)
@@ -379,7 +379,7 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
         */
        hdr = (void*)p - rqstp->rq_arg.head[0].iov_base;
        dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len
-               - hdr;
+               + rqstp->rq_arg.tail[0].iov_len - hdr;
        /*
         * Round the length of the data which was specified up to
         * the next multiple of XDR units and then compare that
index 825c7bc..953c075 100644 (file)
@@ -289,7 +289,7 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
 
                status = nfserr_bad_stateid;
                mutex_lock(&ls->ls_mutex);
-               if (stateid->si_generation > stid->sc_stateid.si_generation)
+               if (nfsd4_stateid_generation_after(stateid, &stid->sc_stateid))
                        goto out_unlock_stid;
                if (layout_type != ls->ls_layout_type)
                        goto out_unlock_stid;
index 0462eed..f5f82e1 100644 (file)
@@ -4651,12 +4651,6 @@ grace_disallows_io(struct net *net, struct inode *inode)
        return opens_in_grace(net) && mandatory_lock(inode);
 }
 
-/* Returns true iff a is later than b: */
-static bool stateid_generation_after(stateid_t *a, stateid_t *b)
-{
-       return (s32)(a->si_generation - b->si_generation) > 0;
-}
-
 static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session)
 {
        /*
@@ -4670,7 +4664,7 @@ static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_s
                return nfs_ok;
 
        /* If the client sends us a stateid from the future, it's buggy: */
-       if (stateid_generation_after(in, ref))
+       if (nfsd4_stateid_generation_after(in, ref))
                return nfserr_bad_stateid;
        /*
         * However, we could see a stateid from the past, even from a
index c050c53..986e51e 100644 (file)
@@ -573,6 +573,11 @@ enum nfsd4_cb_op {
        NFSPROC4_CLNT_CB_SEQUENCE,
 };
 
+/* Returns true iff a is later than b: */
+static inline bool nfsd4_stateid_generation_after(stateid_t *a, stateid_t *b)
+{
+       return (s32)(a->si_generation - b->si_generation) > 0;
+}
 
 struct nfsd4_compound_state;
 struct nfsd_net;
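nfsd4_stateid_generation_after() is the usual wrap-safe sequence comparison: subtracting in unsigned arithmetic and interpreting the result as s32 stays correct across a 32-bit wrap, provided the two generations are within 2^31 of each other. A small worked example:

	u32 older = 0xffffffff;		/* generation just before the wrap */
	u32 newer = 2;			/* three increments later, after the wrap */

	/* newer - older == 3 (mod 2^32), and (s32)3 > 0, so newer is "later"
	 * even though it compares smaller as a plain u32. */
	bool later = (s32)(newer - older) > 0;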
index a8d15be..6aaf3e3 100644 (file)
@@ -272,10 +272,21 @@ struct o2hb_region {
        struct delayed_work     hr_write_timeout_work;
        unsigned long           hr_last_timeout_start;
 
+       /* negotiate timer, used to negotiate extending hb timeout. */
+       struct delayed_work     hr_nego_timeout_work;
+       unsigned long           hr_nego_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+
        /* Used during o2hb_check_slot to hold a copy of the block
         * being checked because we temporarily have to zero out the
         * crc field. */
        struct o2hb_disk_heartbeat_block *hr_tmp_block;
+
+       /* Message key for negotiate timeout message. */
+       unsigned int            hr_key;
+       struct list_head        hr_handler_list;
+
+       /* last hb status, 0 for success, other value for error. */
+       int                     hr_last_hb_status;
 };
 
 struct o2hb_bio_wait_ctxt {
@@ -284,6 +295,17 @@ struct o2hb_bio_wait_ctxt {
        int               wc_error;
 };
 
+#define O2HB_NEGO_TIMEOUT_MS (O2HB_MAX_WRITE_TIMEOUT_MS / 2)
+
+enum {
+       O2HB_NEGO_TIMEOUT_MSG = 1,
+       O2HB_NEGO_APPROVE_MSG = 2,
+};
+
+struct o2hb_nego_msg {
+       u8 node_num;
+};
+
 static void o2hb_write_timeout(struct work_struct *work)
 {
        int failed, quorum;
@@ -319,7 +341,7 @@ static void o2hb_write_timeout(struct work_struct *work)
        o2quo_disk_timeout();
 }
 
-static void o2hb_arm_write_timeout(struct o2hb_region *reg)
+static void o2hb_arm_timeout(struct o2hb_region *reg)
 {
        /* Arm writeout only after thread reaches steady state */
        if (atomic_read(&reg->hr_steady_iterations) != 0)
@@ -334,14 +356,132 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg)
                spin_unlock(&o2hb_live_lock);
        }
        cancel_delayed_work(&reg->hr_write_timeout_work);
-       reg->hr_last_timeout_start = jiffies;
        schedule_delayed_work(&reg->hr_write_timeout_work,
                              msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS));
+
+       cancel_delayed_work(&reg->hr_nego_timeout_work);
+       /* negotiate timeout must be less than write timeout. */
+       schedule_delayed_work(&reg->hr_nego_timeout_work,
+                             msecs_to_jiffies(O2HB_NEGO_TIMEOUT_MS));
+       memset(reg->hr_nego_node_bitmap, 0, sizeof(reg->hr_nego_node_bitmap));
 }
 
-static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
+static void o2hb_disarm_timeout(struct o2hb_region *reg)
 {
        cancel_delayed_work_sync(&reg->hr_write_timeout_work);
+       cancel_delayed_work_sync(&reg->hr_nego_timeout_work);
+}
+
+static int o2hb_send_nego_msg(int key, int type, u8 target)
+{
+       struct o2hb_nego_msg msg;
+       int status, ret;
+
+       msg.node_num = o2nm_this_node();
+again:
+       ret = o2net_send_message(type, key, &msg, sizeof(msg),
+                       target, &status);
+
+       if (ret == -EAGAIN || ret == -ENOMEM) {
+               msleep(100);
+               goto again;
+       }
+
+       return ret;
+}
+
+static void o2hb_nego_timeout(struct work_struct *work)
+{
+       unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+       int master_node, i, ret;
+       struct o2hb_region *reg;
+
+       reg = container_of(work, struct o2hb_region, hr_nego_timeout_work.work);
+       /* don't negotiate a new timeout if the last heartbeat failed, since
+        * the io itself has most likely failed; let the write timeout fence
+        * this node instead.
+        */
+       if (reg->hr_last_hb_status)
+               return;
+
+       o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
+       /* the lowest live node acts as master for the negotiation decision. */
+       master_node = find_next_bit(live_node_bitmap, O2NM_MAX_NODES, 0);
+
+       if (master_node == o2nm_this_node()) {
+               if (!test_bit(master_node, reg->hr_nego_node_bitmap)) {
+                       printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%s).\n",
+                               o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000,
+                               config_item_name(&reg->hr_item), reg->hr_dev_name);
+                       set_bit(master_node, reg->hr_nego_node_bitmap);
+               }
+               if (memcmp(reg->hr_nego_node_bitmap, live_node_bitmap,
+                               sizeof(reg->hr_nego_node_bitmap))) {
+                       /* not every live node has asked yet: re-check the
+                        * negotiation bitmap every second before approving
+                        * the timeout extension.
+                        */
+                       schedule_delayed_work(&reg->hr_nego_timeout_work,
+                               msecs_to_jiffies(1000));
+
+                       return;
+               }
+
+               printk(KERN_NOTICE "o2hb: all nodes hb write hung, maybe region %s (%s) is down.\n",
+                       config_item_name(&reg->hr_item), reg->hr_dev_name);
+               /* approve negotiate timeout request. */
+               o2hb_arm_timeout(reg);
+
+               i = -1;
+               while ((i = find_next_bit(live_node_bitmap,
+                               O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
+                       if (i == master_node)
+                               continue;
+
+                       mlog(ML_HEARTBEAT, "send NEGO_APPROVE msg to node %d\n", i);
+                       ret = o2hb_send_nego_msg(reg->hr_key,
+                                       O2HB_NEGO_APPROVE_MSG, i);
+                       if (ret)
+                               mlog(ML_ERROR, "send NEGO_APPROVE msg to node %d fail %d\n",
+                                       i, ret);
+               }
+       } else {
+               /* negotiate timeout with master node. */
+               printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%s), negotiate timeout with node %d.\n",
+                       o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, config_item_name(&reg->hr_item),
+                       reg->hr_dev_name, master_node);
+               ret = o2hb_send_nego_msg(reg->hr_key, O2HB_NEGO_TIMEOUT_MSG,
+                               master_node);
+               if (ret)
+                       mlog(ML_ERROR, "send NEGO_TIMEOUT msg to node %d fail %d\n",
+                               master_node, ret);
+       }
+}
+
+static int o2hb_nego_timeout_handler(struct o2net_msg *msg, u32 len, void *data,
+                               void **ret_data)
+{
+       struct o2hb_region *reg = data;
+       struct o2hb_nego_msg *nego_msg;
+
+       nego_msg = (struct o2hb_nego_msg *)msg->buf;
+       printk(KERN_NOTICE "o2hb: receive negotiate timeout message from node %d on region %s (%s).\n",
+               nego_msg->node_num, config_item_name(&reg->hr_item), reg->hr_dev_name);
+       if (nego_msg->node_num < O2NM_MAX_NODES)
+               set_bit(nego_msg->node_num, reg->hr_nego_node_bitmap);
+       else
+               mlog(ML_ERROR, "got nego timeout message from bad node.\n");
+
+       return 0;
+}
+
+static int o2hb_nego_approve_handler(struct o2net_msg *msg, u32 len, void *data,
+                               void **ret_data)
+{
+       struct o2hb_region *reg = data;
+
+       printk(KERN_NOTICE "o2hb: negotiate timeout approved by master node on region %s (%s).\n",
+               config_item_name(&reg->hr_item), reg->hr_dev_name);
+       o2hb_arm_timeout(reg);
+       return 0;
 }
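Taken together, the two message handlers implement a small two-step handshake. A node whose heartbeat write has stalled sends O2HB_NEGO_TIMEOUT_MSG to the lowest-numbered live node (the master); the master records each request in hr_nego_node_bitmap and, only once every live node has asked (i.e. the storage looks dead to everyone), re-arms its own timers and broadcasts O2HB_NEGO_APPROVE_MSG so the other nodes do the same. The master's decision reduces to a bitmap comparison, roughly:

	/* sketch of the decision in o2hb_nego_timeout() above */
	if (!memcmp(reg->hr_nego_node_bitmap, live_node_bitmap,
		    sizeof(reg->hr_nego_node_bitmap))) {
		o2hb_arm_timeout(reg);		/* extend our own deadline first */
		/* ... then send O2HB_NEGO_APPROVE_MSG to every other live node ... */
	} else {
		/* not everyone has asked yet: poll the bitmap again in a second */
		schedule_delayed_work(&reg->hr_nego_timeout_work,
				      msecs_to_jiffies(1000));
	}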
 
 static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc)
@@ -1032,7 +1172,8 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
        /* Skip disarming the timeout if own slot has stale/bad data */
        if (own_slot_ok) {
                o2hb_set_quorum_device(reg);
-               o2hb_arm_write_timeout(reg);
+               o2hb_arm_timeout(reg);
+               reg->hr_last_timeout_start = jiffies;
        }
 
 bail:
@@ -1096,6 +1237,7 @@ static int o2hb_thread(void *data)
                before_hb = ktime_get_real();
 
                ret = o2hb_do_disk_heartbeat(reg);
+               reg->hr_last_hb_status = ret;
 
                after_hb = ktime_get_real();
 
@@ -1114,7 +1256,7 @@ static int o2hb_thread(void *data)
                }
        }
 
-       o2hb_disarm_write_timeout(reg);
+       o2hb_disarm_timeout(reg);
 
        /* unclean stop is only used in very bad situation */
        for(i = 0; !reg->hr_unclean_stop && i < reg->hr_blocks; i++)
@@ -1451,6 +1593,7 @@ static void o2hb_region_release(struct config_item *item)
        list_del(&reg->hr_all_item);
        spin_unlock(&o2hb_live_lock);
 
+       o2net_unregister_handler_list(&reg->hr_handler_list);
        kfree(reg);
 }
 
@@ -1762,6 +1905,7 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
        }
 
        INIT_DELAYED_WORK(&reg->hr_write_timeout_work, o2hb_write_timeout);
+       INIT_DELAYED_WORK(&reg->hr_nego_timeout_work, o2hb_nego_timeout);
 
        /*
         * A node is considered live after it has beat LIVE_THRESHOLD
@@ -1995,13 +2139,37 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
 
        config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
 
+       /* generate the message key the same way the dlm does; for local
+        * heartbeat the name is also the same, so use a different initial
+        * crc value to avoid a message key conflict.
+        */
+       reg->hr_key = crc32_le(reg->hr_region_num + O2NM_MAX_REGIONS,
+               name, strlen(name));
+       INIT_LIST_HEAD(&reg->hr_handler_list);
+       ret = o2net_register_handler(O2HB_NEGO_TIMEOUT_MSG, reg->hr_key,
+                       sizeof(struct o2hb_nego_msg),
+                       o2hb_nego_timeout_handler,
+                       reg, NULL, &reg->hr_handler_list);
+       if (ret)
+               goto free;
+
+       ret = o2net_register_handler(O2HB_NEGO_APPROVE_MSG, reg->hr_key,
+                       sizeof(struct o2hb_nego_msg),
+                       o2hb_nego_approve_handler,
+                       reg, NULL, &reg->hr_handler_list);
+       if (ret)
+               goto unregister_handler;
+
        ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
        if (ret) {
                config_item_put(&reg->hr_item);
-               goto free;
+               goto unregister_handler;
        }
 
        return &reg->hr_item;
+
+unregister_handler:
+       o2net_unregister_handler_list(&reg->hr_handler_list);
 free:
        kfree(reg);
        return ERR_PTR(ret);
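The key-generation comment is worth unpacking: the dlm derives its o2net message keys from a crc of the domain name seeded with zero, and for local heartbeat the region name can be identical, so the heartbeat code seeds its crc with the region number offset by O2NM_MAX_REGIONS to keep the two key spaces disjoint. Schematically (the dlm line is an assumption based on dlmdomain.c, not something in this patch):

	u32 dlm_key = crc32_le(0, name, strlen(name));
	u32 hb_key  = crc32_le(reg->hr_region_num + O2NM_MAX_REGIONS,
			       name, strlen(name));	/* as registered above */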
index b95e7df..94b1836 100644 (file)
@@ -44,6 +44,9 @@
  * version here in tcp_internal.h should not need to be bumped for
  * filesystem locking changes.
  *
+ * New in version 12
+ *     - Negotiate hb timeout when storage is down.
+ *
  * New in version 11
  *     - Negotiation of filesystem locking in the dlm join.
  *
@@ -75,7 +78,7 @@
  *     - full 64 bit i_size in the metadata lock lvbs
  *     - introduction of "rw" lock and pushing meta/data locking down
  */
-#define O2NET_PROTOCOL_VERSION 11ULL
+#define O2NET_PROTOCOL_VERSION 12ULL
 struct o2net_handshake {
        __be64  protocol_version;
        __be64  connector_id;
index 0748777..c56a767 100644 (file)
@@ -176,12 +176,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
        }
        if (is_bad_inode(inode)) {
                iput(inode);
-               if ((flags & OCFS2_FI_FLAG_FILECHECK_CHK) ||
-                   (flags & OCFS2_FI_FLAG_FILECHECK_FIX))
-                       /* Return OCFS2_FILECHECK_ERR_XXX related errno */
-                       inode = ERR_PTR(rc);
-               else
-                       inode = ERR_PTR(-ESTALE);
+               inode = ERR_PTR(rc);
                goto bail;
        }
 
index f4cd3c3..497a417 100644 (file)
@@ -619,7 +619,7 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
 
 static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
 {
-       return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode);
+       return jbd2_journal_inode_add_write(handle, &OCFS2_I(inode)->ip_jinode);
 }
 
 static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
index ad16995..d205385 100644 (file)
@@ -7254,10 +7254,11 @@ static int ocfs2_xattr_security_get(const struct xattr_handler *handler,
 }
 
 static int ocfs2_xattr_security_set(const struct xattr_handler *handler,
-                                   struct dentry *dentry, const char *name,
-                                   const void *value, size_t size, int flags)
+                                   struct dentry *unused, struct inode *inode,
+                                   const char *name, const void *value,
+                                   size_t size, int flags)
 {
-       return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY,
+       return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY,
                               name, value, size, flags);
 }
 
@@ -7325,10 +7326,11 @@ static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler,
 }
 
 static int ocfs2_xattr_trusted_set(const struct xattr_handler *handler,
-                                  struct dentry *dentry, const char *name,
-                                  const void *value, size_t size, int flags)
+                                  struct dentry *unused, struct inode *inode,
+                                  const char *name, const void *value,
+                                  size_t size, int flags)
 {
-       return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED,
+       return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED,
                               name, value, size, flags);
 }
 
@@ -7354,15 +7356,16 @@ static int ocfs2_xattr_user_get(const struct xattr_handler *handler,
 }
 
 static int ocfs2_xattr_user_set(const struct xattr_handler *handler,
-                               struct dentry *dentry, const char *name,
-                               const void *value, size_t size, int flags)
+                               struct dentry *unused, struct inode *inode,
+                               const char *name, const void *value,
+                               size_t size, int flags)
 {
-       struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
        if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
                return -EOPNOTSUPP;
 
-       return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_USER,
+       return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER,
                               name, value, size, flags);
 }
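All of the ->set handlers converted in this series move to the same new prototype: the VFS now passes the target inode explicitly and keeps the dentry argument only for compatibility (hence "unused"). A minimal handler under the new convention might look as follows; example_setxattr() stands in for a filesystem's own helper and is not a real function:

static int example_xattr_set(const struct xattr_handler *handler,
			     struct dentry *unused, struct inode *inode,
			     const char *name, const void *value,
			     size_t size, int flags)
{
	if (IS_PRIVATE(inode))
		return -EPERM;

	return example_setxattr(inode, xattr_full_name(handler, name),
				value, size, flags);
}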
 
index 99c1954..5893ddd 100644 (file)
@@ -448,13 +448,14 @@ out_unlock:
 }
 
 static int orangefs_xattr_set_default(const struct xattr_handler *handler,
-                                     struct dentry *dentry,
+                                     struct dentry *unused,
+                                     struct inode *inode,
                                      const char *name,
                                      const void *buffer,
                                      size_t size,
                                      int flags)
 {
-       return orangefs_inode_setxattr(dentry->d_inode,
+       return orangefs_inode_setxattr(inode,
                                    ORANGEFS_XATTR_NAME_DEFAULT_PREFIX,
                                    name,
                                    buffer,
@@ -478,13 +479,14 @@ static int orangefs_xattr_get_default(const struct xattr_handler *handler,
 }
 
 static int orangefs_xattr_set_trusted(const struct xattr_handler *handler,
-                                    struct dentry *dentry,
+                                    struct dentry *unused,
+                                    struct inode *inode,
                                     const char *name,
                                     const void *buffer,
                                     size_t size,
                                     int flags)
 {
-       return orangefs_inode_setxattr(dentry->d_inode,
+       return orangefs_inode_setxattr(inode,
                                    ORANGEFS_XATTR_NAME_TRUSTED_PREFIX,
                                    name,
                                    buffer,
index cc514da..80aa6f1 100644 (file)
@@ -336,7 +336,6 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
        struct dentry *upperdir;
        struct dentry *upperdentry;
        const struct cred *old_cred;
-       struct cred *override_cred;
        char *link = NULL;
 
        if (WARN_ON(!workdir))
@@ -357,28 +356,7 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
                        return PTR_ERR(link);
        }
 
-       err = -ENOMEM;
-       override_cred = prepare_creds();
-       if (!override_cred)
-               goto out_free_link;
-
-       override_cred->fsuid = stat->uid;
-       override_cred->fsgid = stat->gid;
-       /*
-        * CAP_SYS_ADMIN for copying up extended attributes
-        * CAP_DAC_OVERRIDE for create
-        * CAP_FOWNER for chmod, timestamp update
-        * CAP_FSETID for chmod
-        * CAP_CHOWN for chown
-        * CAP_MKNOD for mknod
-        */
-       cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
-       cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
-       cap_raise(override_cred->cap_effective, CAP_FOWNER);
-       cap_raise(override_cred->cap_effective, CAP_FSETID);
-       cap_raise(override_cred->cap_effective, CAP_CHOWN);
-       cap_raise(override_cred->cap_effective, CAP_MKNOD);
-       old_cred = override_creds(override_cred);
+       old_cred = ovl_override_creds(dentry->d_sb);
 
        err = -EIO;
        if (lock_rename(workdir, upperdir) != NULL) {
@@ -401,9 +379,7 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
 out_unlock:
        unlock_rename(workdir, upperdir);
        revert_creds(old_cred);
-       put_cred(override_cred);
 
-out_free_link:
        if (link)
                free_page((unsigned long) link);
 
index b3fc0a3..22f0253 100644 (file)
@@ -405,28 +405,13 @@ static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
                err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
        } else {
                const struct cred *old_cred;
-               struct cred *override_cred;
 
-               err = -ENOMEM;
-               override_cred = prepare_creds();
-               if (!override_cred)
-                       goto out_iput;
-
-               /*
-                * CAP_SYS_ADMIN for setting opaque xattr
-                * CAP_DAC_OVERRIDE for create in workdir, rename
-                * CAP_FOWNER for removing whiteout from sticky dir
-                */
-               cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
-               cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
-               cap_raise(override_cred->cap_effective, CAP_FOWNER);
-               old_cred = override_creds(override_cred);
+               old_cred = ovl_override_creds(dentry->d_sb);
 
                err = ovl_create_over_whiteout(dentry, inode, &stat, link,
                                               hardlink);
 
                revert_creds(old_cred);
-               put_cred(override_cred);
        }
 
        if (!err)
@@ -662,32 +647,11 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir)
        if (OVL_TYPE_PURE_UPPER(type)) {
                err = ovl_remove_upper(dentry, is_dir);
        } else {
-               const struct cred *old_cred;
-               struct cred *override_cred;
-
-               err = -ENOMEM;
-               override_cred = prepare_creds();
-               if (!override_cred)
-                       goto out_drop_write;
-
-               /*
-                * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
-                * CAP_DAC_OVERRIDE for create in workdir, rename
-                * CAP_FOWNER for removing whiteout from sticky dir
-                * CAP_FSETID for chmod of opaque dir
-                * CAP_CHOWN for chown of opaque dir
-                */
-               cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
-               cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
-               cap_raise(override_cred->cap_effective, CAP_FOWNER);
-               cap_raise(override_cred->cap_effective, CAP_FSETID);
-               cap_raise(override_cred->cap_effective, CAP_CHOWN);
-               old_cred = override_creds(override_cred);
+               const struct cred *old_cred = ovl_override_creds(dentry->d_sb);
 
                err = ovl_remove_and_whiteout(dentry, is_dir);
 
                revert_creds(old_cred);
-               put_cred(override_cred);
        }
 out_drop_write:
        ovl_drop_write(dentry);
@@ -725,7 +689,6 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
        bool new_is_dir = false;
        struct dentry *opaquedir = NULL;
        const struct cred *old_cred = NULL;
-       struct cred *override_cred = NULL;
 
        err = -EINVAL;
        if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
@@ -794,26 +757,8 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
        old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
        new_opaque = !OVL_TYPE_PURE_UPPER(new_type);
 
-       if (old_opaque || new_opaque) {
-               err = -ENOMEM;
-               override_cred = prepare_creds();
-               if (!override_cred)
-                       goto out_drop_write;
-
-               /*
-                * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
-                * CAP_DAC_OVERRIDE for create in workdir
-                * CAP_FOWNER for removing whiteout from sticky dir
-                * CAP_FSETID for chmod of opaque dir
-                * CAP_CHOWN for chown of opaque dir
-                */
-               cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
-               cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
-               cap_raise(override_cred->cap_effective, CAP_FOWNER);
-               cap_raise(override_cred->cap_effective, CAP_FSETID);
-               cap_raise(override_cred->cap_effective, CAP_CHOWN);
-               old_cred = override_creds(override_cred);
-       }
+       if (old_opaque || new_opaque)
+               old_cred = ovl_override_creds(old->d_sb);
 
        if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
                opaquedir = ovl_check_empty_and_clear(new);
@@ -943,10 +888,8 @@ out_dput_old:
 out_unlock:
        unlock_rename(new_upperdir, old_upperdir);
 out_revert_creds:
-       if (old_opaque || new_opaque) {
+       if (old_opaque || new_opaque)
                revert_creds(old_cred);
-               put_cred(override_cred);
-       }
 out_drop_write:
        ovl_drop_write(old);
 out:
index c7b31a0..0ed7c40 100644 (file)
@@ -210,8 +210,9 @@ static bool ovl_is_private_xattr(const char *name)
        return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0;
 }
 
-int ovl_setxattr(struct dentry *dentry, const char *name,
-                const void *value, size_t size, int flags)
+int ovl_setxattr(struct dentry *dentry, struct inode *inode,
+                const char *name, const void *value,
+                size_t size, int flags)
 {
        int err;
        struct dentry *upperdentry;
index 99ec4b0..4bd9b5b 100644 (file)
@@ -153,6 +153,7 @@ void ovl_drop_write(struct dentry *dentry);
 bool ovl_dentry_is_opaque(struct dentry *dentry);
 void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
 bool ovl_is_whiteout(struct dentry *dentry);
+const struct cred *ovl_override_creds(struct super_block *sb);
 void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
 struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
                          unsigned int flags);
@@ -171,8 +172,9 @@ int ovl_check_d_type_supported(struct path *realpath);
 /* inode.c */
 int ovl_setattr(struct dentry *dentry, struct iattr *attr);
 int ovl_permission(struct inode *inode, int mask);
-int ovl_setxattr(struct dentry *dentry, const char *name,
-                const void *value, size_t size, int flags);
+int ovl_setxattr(struct dentry *dentry, struct inode *inode,
+                const char *name, const void *value,
+                size_t size, int flags);
 ssize_t ovl_getxattr(struct dentry *dentry, struct inode *inode,
                     const char *name, void *value, size_t size);
 ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
index da186ee..cf37fc7 100644 (file)
@@ -36,6 +36,7 @@ struct ovl_dir_cache {
 
 struct ovl_readdir_data {
        struct dir_context ctx;
+       struct dentry *dentry;
        bool is_lowest;
        struct rb_root root;
        struct list_head *list;
@@ -206,21 +207,10 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
        struct ovl_cache_entry *p;
        struct dentry *dentry;
        const struct cred *old_cred;
-       struct cred *override_cred;
-
-       override_cred = prepare_creds();
-       if (!override_cred)
-               return -ENOMEM;
 
-       /*
-        * CAP_DAC_OVERRIDE for lookup
-        */
-       cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
-       old_cred = override_creds(override_cred);
+       old_cred = ovl_override_creds(rdd->dentry->d_sb);
 
-       inode_lock(dir->d_inode);
-       err = 0;
-       // XXX: err = mutex_lock_killable(&dir->d_inode->i_mutex);
+       err = down_write_killable(&dir->d_inode->i_rwsem);
        if (!err) {
                while (rdd->first_maybe_whiteout) {
                        p = rdd->first_maybe_whiteout;
@@ -234,7 +224,6 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
                inode_unlock(dir->d_inode);
        }
        revert_creds(old_cred);
-       put_cred(override_cred);
 
        return err;
 }
@@ -290,6 +279,7 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
        struct path realpath;
        struct ovl_readdir_data rdd = {
                .ctx.actor = ovl_fill_merge,
+               .dentry = dentry,
                .list = list,
                .root = RB_ROOT,
                .is_lowest = false,
index ed53ae0..ce02f46 100644 (file)
@@ -42,6 +42,8 @@ struct ovl_fs {
        long lower_namelen;
        /* pathnames of lower and upper dirs, for show_options */
        struct ovl_config config;
+       /* creds of process who forced instantiation of super block */
+       const struct cred *creator_cred;
 };
 
 struct ovl_dir_cache;
@@ -265,6 +267,13 @@ bool ovl_is_whiteout(struct dentry *dentry)
        return inode && IS_WHITEOUT(inode);
 }
 
+const struct cred *ovl_override_creds(struct super_block *sb)
+{
+       struct ovl_fs *ofs = sb->s_fs_info;
+
+       return override_creds(ofs->creator_cred);
+}
+
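With the mounter's credentials captured once at mount time (creator_cred), every privileged internal operation in the series collapses to the same bracket instead of building a one-off override_cred with hand-raised capabilities, as seen in the copy-up, create, remove and rename hunks above:

	const struct cred *old_cred = ovl_override_creds(dentry->d_sb);

	/* ... act on the upper layer with the mounter's permissions ... */

	revert_creds(old_cred);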
 static bool ovl_is_opaquedir(struct dentry *dentry)
 {
        int res;
@@ -603,6 +612,7 @@ static void ovl_put_super(struct super_block *sb)
        kfree(ufs->config.lowerdir);
        kfree(ufs->config.upperdir);
        kfree(ufs->config.workdir);
+       put_cred(ufs->creator_cred);
        kfree(ufs);
 }
 
@@ -1064,16 +1074,19 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
                /*
                 * Upper should support d_type, else whiteouts are visible.
                 * Given workdir and upper are on same fs, we can do
-                * iterate_dir() on workdir.
+                * iterate_dir() on workdir. This check requires successful
+                * creation of workdir in the previous step.
                 */
-               err = ovl_check_d_type_supported(&workpath);
-               if (err < 0)
-                       goto out_put_workdir;
+               if (ufs->workdir) {
+                       err = ovl_check_d_type_supported(&workpath);
+                       if (err < 0)
+                               goto out_put_workdir;
 
-               if (!err) {
-                       pr_err("overlayfs: upper fs needs to support d_type.\n");
-                       err = -EINVAL;
-                       goto out_put_workdir;
+                       if (!err) {
+                               pr_err("overlayfs: upper fs needs to support d_type.\n");
+                               err = -EINVAL;
+                               goto out_put_workdir;
+                       }
                }
        }
 
@@ -1108,10 +1121,14 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
        else
                sb->s_d_op = &ovl_dentry_operations;
 
+       ufs->creator_cred = prepare_creds();
+       if (!ufs->creator_cred)
+               goto out_put_lower_mnt;
+
        err = -ENOMEM;
        oe = ovl_alloc_entry(numlower);
        if (!oe)
-               goto out_put_lower_mnt;
+               goto out_put_cred;
 
        root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, oe));
        if (!root_dentry)
@@ -1144,6 +1161,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 
 out_free_oe:
        kfree(oe);
+out_put_cred:
+       put_cred(ufs->creator_cred);
 out_put_lower_mnt:
        for (i = 0; i < ufs->numlower; i++)
                mntput(ufs->lower_mnt[i]);
index 2c60f17..8a4a266 100644 (file)
@@ -822,10 +822,10 @@ posix_acl_xattr_get(const struct xattr_handler *handler,
 
 static int
 posix_acl_xattr_set(const struct xattr_handler *handler,
-                   struct dentry *dentry, const char *name,
-                   const void *value, size_t size, int flags)
+                   struct dentry *unused, struct inode *inode,
+                   const char *name, const void *value,
+                   size_t size, int flags)
 {
-       struct inode *inode = d_backing_inode(dentry);
        struct posix_acl *acl = NULL;
        int ret;
 
index a86c6c0..9d0212c 100644 (file)
@@ -35,13 +35,13 @@ int iterate_dir(struct file *file, struct dir_context *ctx)
        if (res)
                goto out;
 
-       if (shared)
+       if (shared) {
                inode_lock_shared(inode);
-       else
-               inode_lock(inode);
-       // res = mutex_lock_killable(&inode->i_mutex);
-       // if (res)
-       //      goto out;
+       } else {
+               res = down_write_killable(&inode->i_rwsem);
+               if (res)
+                       goto out;
+       }
 
        res = -ENOENT;
        if (!IS_DEADDIR(inode)) {
@@ -182,6 +182,8 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen,
        }
        dirent = buf->previous;
        if (dirent) {
+               if (signal_pending(current))
+                       return -EINTR;
                if (__put_user(offset, &dirent->d_off))
                        goto efault;
        }
@@ -261,6 +263,8 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen,
                return -EINVAL;
        dirent = buf->previous;
        if (dirent) {
+               if (signal_pending(current))
+                       return -EINTR;
                if (__put_user(offset, &dirent->d_off))
                        goto efault;
        }
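The iterate_dir()/filldir() hunks make long directory reads interruptible: the exclusive inode lock is now taken killably, and a pending signal aborts copying between entries. A loose userspace analogue of the per-entry signal check (illustrative names, not the VFS code):

#include <errno.h>
#include <signal.h>
#include <stdio.h>

static volatile sig_atomic_t pending;          /* stand-in for signal_pending() */
static void on_int(int sig) { (void)sig; pending = 1; }

/* Copy up to n entries, but give up between entries once a signal is
 * pending, reporting what was already copied if anything was. */
static int fill_entries(const int *src, int *dst, int n)
{
	int copied = 0;

	for (int i = 0; i < n; i++) {
		if (pending)
			return copied ? copied : -EINTR;
		dst[i] = src[i];
		copied++;
	}
	return copied;
}

int main(void)
{
	int src[4] = { 1, 2, 3, 4 }, dst[4];

	signal(SIGINT, on_int);
	printf("copied %d entries\n", fill_entries(src, dst, 4));
	return 0;
}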
index 86aeb9d..e4cbb77 100644 (file)
@@ -20,13 +20,14 @@ security_get(const struct xattr_handler *handler, struct dentry *unused,
 }
 
 static int
-security_set(const struct xattr_handler *handler, struct dentry *dentry,
-            const char *name, const void *buffer, size_t size, int flags)
+security_set(const struct xattr_handler *handler, struct dentry *unused,
+            struct inode *inode, const char *name, const void *buffer,
+            size_t size, int flags)
 {
-       if (IS_PRIVATE(d_inode(dentry)))
+       if (IS_PRIVATE(inode))
                return -EPERM;
 
-       return reiserfs_xattr_set(d_inode(dentry),
+       return reiserfs_xattr_set(inode,
                                  xattr_full_name(handler, name),
                                  buffer, size, flags);
 }
index 31837f0..f15a5f9 100644 (file)
@@ -19,13 +19,14 @@ trusted_get(const struct xattr_handler *handler, struct dentry *unused,
 }
 
 static int
-trusted_set(const struct xattr_handler *handler, struct dentry *dentry,
-           const char *name, const void *buffer, size_t size, int flags)
+trusted_set(const struct xattr_handler *handler, struct dentry *unused,
+           struct inode *inode, const char *name, const void *buffer,
+           size_t size, int flags)
 {
-       if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry)))
+       if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))
                return -EPERM;
 
-       return reiserfs_xattr_set(d_inode(dentry),
+       return reiserfs_xattr_set(inode,
                                  xattr_full_name(handler, name),
                                  buffer, size, flags);
 }
index f7c3973..dc59df4 100644 (file)
@@ -17,12 +17,13 @@ user_get(const struct xattr_handler *handler, struct dentry *unused,
 }
 
 static int
-user_set(const struct xattr_handler *handler, struct dentry *dentry,
-        const char *name, const void *buffer, size_t size, int flags)
+user_set(const struct xattr_handler *handler, struct dentry *unused,
+        struct inode *inode, const char *name, const void *buffer,
+        size_t size, int flags)
 {
-       if (!reiserfs_xattrs_user(dentry->d_sb))
+       if (!reiserfs_xattrs_user(inode->i_sb))
                return -EOPNOTSUPP;
-       return reiserfs_xattr_set(d_inode(dentry),
+       return reiserfs_xattr_set(inode,
                                  xattr_full_name(handler, name),
                                  buffer, size, flags);
 }
index 595ca0d..69e287e 100644 (file)
@@ -260,7 +260,7 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode)
        pr_err("\txattr_names    %u\n", ui->xattr_names);
        pr_err("\tdirty          %u\n", ui->dirty);
        pr_err("\txattr          %u\n", ui->xattr);
-       pr_err("\tbulk_read      %u\n", ui->xattr);
+       pr_err("\tbulk_read      %u\n", ui->bulk_read);
        pr_err("\tsynced_i_size  %llu\n",
               (unsigned long long)ui->synced_i_size);
        pr_err("\tui_size        %llu\n",
index 6c277eb..b5fc279 100644 (file)
@@ -579,11 +579,10 @@ static int ubifs_xattr_get(const struct xattr_handler *handler,
 }
 
 static int ubifs_xattr_set(const struct xattr_handler *handler,
-                          struct dentry *dentry, const char *name,
-                          const void *value, size_t size, int flags)
+                          struct dentry *dentry, struct inode *inode,
+                          const char *name, const void *value,
+                          size_t size, int flags)
 {
-       struct inode *inode = d_inode(dentry);
-
        dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd",
                name, inode->i_ino, dentry, size);
 
index b11945e..4beafc4 100644 (file)
@@ -100,7 +100,7 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
        if (issec)
                inode->i_flags &= ~S_NOSEC;
        if (inode->i_op->setxattr) {
-               error = inode->i_op->setxattr(dentry, name, value, size, flags);
+               error = inode->i_op->setxattr(dentry, inode, name, value, size, flags);
                if (!error) {
                        fsnotify_xattr(dentry);
                        security_inode_post_setxattr(dentry, name, value,
@@ -655,6 +655,7 @@ strcmp_prefix(const char *a, const char *a_prefix)
  * operations to the correct xattr_handler.
  */
 #define for_each_xattr_handler(handlers, handler)              \
+       if (handlers)                                           \
                for ((handler) = *(handlers)++;                 \
                        (handler) != NULL;                      \
                        (handler) = *(handlers)++)
@@ -668,7 +669,7 @@ xattr_resolve_name(const struct xattr_handler **handlers, const char **name)
        const struct xattr_handler *handler;
 
        if (!*name)
-               return NULL;
+               return ERR_PTR(-EINVAL);
 
        for_each_xattr_handler(handlers, handler) {
                const char *n;
@@ -744,7 +745,8 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
  * Find the handler for the prefix and dispatch its set() operation.
  */
 int
-generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags)
+generic_setxattr(struct dentry *dentry, struct inode *inode, const char *name,
+                const void *value, size_t size, int flags)
 {
        const struct xattr_handler *handler;
 
@@ -753,7 +755,7 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags)
        handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
        if (IS_ERR(handler))
                return PTR_ERR(handler);
-       return handler->set(handler, dentry, name, value, size, flags);
+       return handler->set(handler, dentry, inode, name, value, size, flags);
 }
 
 /*
@@ -768,7 +770,8 @@ generic_removexattr(struct dentry *dentry, const char *name)
        handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
        if (IS_ERR(handler))
                return PTR_ERR(handler);
-       return handler->set(handler, dentry, name, NULL, 0, XATTR_REPLACE);
+       return handler->set(handler, dentry, d_inode(dentry), name, NULL,
+                           0, XATTR_REPLACE);
 }
 
 EXPORT_SYMBOL(generic_getxattr);
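The xattr hunks change the handler ->set() hook to take an explicit inode, make for_each_xattr_handler() tolerate a NULL handler table, and reject an empty attribute name with -EINVAL. A self-contained sketch of that prefix-dispatch pattern with simplified types (the dentry argument is dropped here for brevity; not the VFS implementation):

#include <errno.h>
#include <stdio.h>
#include <string.h>

struct inode { int ino; };

/* Handlers are matched on a name prefix, like the xattr_handler table.
 * The set() hook takes the inode explicitly, mirroring the new signature. */
struct handler {
	const char *prefix;
	int (*set)(const struct handler *h, struct inode *inode,
		   const char *suffix, const void *value, size_t size);
};

static int user_set(const struct handler *h, struct inode *inode,
		    const char *suffix, const void *value, size_t size)
{
	(void)h; (void)value;
	printf("ino %d: set user.%s (%zu bytes)\n", inode->ino, suffix, size);
	return 0;
}

static const struct handler user_handler = { "user.", user_set };
static const struct handler *handlers[] = { &user_handler, NULL };

/* NULL-tolerant iteration, like the reworked for_each_xattr_handler(). */
#define for_each_handler(hs, h) \
	if (hs) \
		for ((h) = *(hs)++; (h) != NULL; (h) = *(hs)++)

static int setxattr(struct inode *inode, const char *name,
		    const void *value, size_t size)
{
	const struct handler **hs = handlers, *h;

	if (!*name)
		return -EINVAL;		/* empty name is rejected up front */

	for_each_handler(hs, h) {
		size_t plen = strlen(h->prefix);

		if (!strncmp(name, h->prefix, plen))
			return h->set(h, inode, name + plen, value, size);
	}
	return -EOPNOTSUPP;
}

int main(void)
{
	struct inode ino = { .ino = 42 };

	return setxattr(&ino, "user.comment", "hi", 2) ? 1 : 0;
}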
index 686ba6f..339c696 100644 (file)
@@ -93,19 +93,23 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
 }
 
 void *
-kmem_realloc(const void *ptr, size_t newsize, size_t oldsize,
-            xfs_km_flags_t flags)
+kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags)
 {
-       void    *new;
+       int     retries = 0;
+       gfp_t   lflags = kmem_flags_convert(flags);
+       void    *ptr;
 
-       new = kmem_alloc(newsize, flags);
-       if (ptr) {
-               if (new)
-                       memcpy(new, ptr,
-                               ((oldsize < newsize) ? oldsize : newsize));
-               kmem_free(ptr);
-       }
-       return new;
+       do {
+               ptr = krealloc(old, newsize, lflags);
+               if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+                       return ptr;
+               if (!(++retries % 100))
+                       xfs_err(NULL,
+       "%s(%u) possible memory allocation deadlock size %zu in %s (mode:0x%x)",
+                               current->comm, current->pid,
+                               newsize, __func__, lflags);
+               congestion_wait(BLK_RW_ASYNC, HZ/50);
+       } while (1);
 }
 
 void *
index d1c66e4..689f746 100644 (file)
@@ -62,7 +62,7 @@ kmem_flags_convert(xfs_km_flags_t flags)
 
 extern void *kmem_alloc(size_t, xfs_km_flags_t);
 extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t);
-extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t);
+extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t);
 static inline void  kmem_free(const void *ptr)
 {
        kvfree(ptr);
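kmem_realloc() now wraps krealloc(), so the old-size argument goes away and the allocation is retried with a short back-off unless the caller allows failure. A userspace sketch of the same retry loop (names and timings are illustrative, not the XFS API):

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/* Retry a reallocation until it succeeds, warning periodically, unless the
 * caller asked for a may-fail allocation.  realloc() preserves the old
 * contents, which is why no old-size parameter is needed. */
static void *realloc_retry(void *old, size_t newsize, int may_fail)
{
	unsigned int retries = 0;

	for (;;) {
		void *ptr = realloc(old, newsize);

		if (ptr || may_fail)
			return ptr;
		if (!(++retries % 100))
			fprintf(stderr,
				"possible allocation deadlock, size %zu\n",
				newsize);
		usleep(20000);	/* rough analogue of the HZ/50 back-off */
	}
}

int main(void)
{
	char *buf = realloc_retry(NULL, 16, 0);

	buf = realloc_retry(buf, 64, 0);
	free(buf);
	return 0;
}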
index fa3b948..4e126f4 100644 (file)
@@ -242,37 +242,21 @@ xfs_attr_set(
                        return error;
        }
 
-       /*
-        * Start our first transaction of the day.
-        *
-        * All future transactions during this code must be "chained" off
-        * this one via the trans_dup() call.  All transactions will contain
-        * the inode, and the inode will always be marked with trans_ihold().
-        * Since the inode will be locked in all transactions, we must log
-        * the inode in every transaction to let it float upward through
-        * the log.
-        */
-       args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_SET);
+       tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
+                        M_RES(mp)->tr_attrsetrt.tr_logres * args.total;
+       tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
+       tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
 
        /*
         * Root fork attributes can use reserved data blocks for this
         * operation if necessary
         */
-
-       if (rsvd)
-               args.trans->t_flags |= XFS_TRANS_RESERVE;
-
-       tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
-                        M_RES(mp)->tr_attrsetrt.tr_logres * args.total;
-       tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
-       tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
-       error = xfs_trans_reserve(args.trans, &tres, args.total, 0);
-       if (error) {
-               xfs_trans_cancel(args.trans);
+       error = xfs_trans_alloc(mp, &tres, args.total, 0,
+                       rsvd ? XFS_TRANS_RESERVE : 0, &args.trans);
+       if (error)
                return error;
-       }
-       xfs_ilock(dp, XFS_ILOCK_EXCL);
 
+       xfs_ilock(dp, XFS_ILOCK_EXCL);
        error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0,
                                rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
                                       XFS_QMOPT_RES_REGBLKS);
@@ -428,32 +412,16 @@ xfs_attr_remove(
        if (error)
                return error;
 
-       /*
-        * Start our first transaction of the day.
-        *
-        * All future transactions during this code must be "chained" off
-        * this one via the trans_dup() call.  All transactions will contain
-        * the inode, and the inode will always be marked with trans_ihold().
-        * Since the inode will be locked in all transactions, we must log
-        * the inode in every transaction to let it float upward through
-        * the log.
-        */
-       args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_RM);
-
        /*
         * Root fork attributes can use reserved data blocks for this
         * operation if necessary
         */
-
-       if (flags & ATTR_ROOT)
-               args.trans->t_flags |= XFS_TRANS_RESERVE;
-
-       error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm,
-                                 XFS_ATTRRM_SPACE_RES(mp), 0);
-       if (error) {
-               xfs_trans_cancel(args.trans);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_attrrm,
+                       XFS_ATTRRM_SPACE_RES(mp), 0,
+                       (flags & ATTR_ROOT) ? XFS_TRANS_RESERVE : 0,
+                       &args.trans);
+       if (error)
                return error;
-       }
 
        xfs_ilock(dp, XFS_ILOCK_EXCL);
        /*
index ce41d7f..932381c 100644 (file)
@@ -1121,15 +1121,14 @@ xfs_bmap_add_attrfork(
 
        mp = ip->i_mount;
        ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
-       tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK);
+
        blks = XFS_ADDAFORK_SPACE_RES(mp);
-       if (rsvd)
-               tp->t_flags |= XFS_TRANS_RESERVE;
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0);
-       if (error) {
-               xfs_trans_cancel(tp);
+
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_addafork, blks, 0,
+                       rsvd ? XFS_TRANS_RESERVE : 0, &tp);
+       if (error)
                return error;
-       }
+
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
                        XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
@@ -6026,13 +6025,10 @@ xfs_bmap_split_extent(
        xfs_fsblock_t           firstfsb;
        int                     error;
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
-                       XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
-       if (error) {
-               xfs_trans_cancel(tp);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
+                       XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
+       if (error)
                return error;
-       }
 
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
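These hunks convert open-coded xfs_trans_alloc()/xfs_trans_reserve()/xfs_trans_cancel() sequences into a single allocate-and-reserve call that cleans up after itself on failure. A hedged sketch of that pattern with stand-in types and limits (not the XFS implementation):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct trans { unsigned int flags; unsigned int blocks; };

static int reserve_space(struct trans *tp, unsigned int blocks)
{
	if (blocks > 1024)		/* pretend we ran out of space */
		return -ENOSPC;
	tp->blocks = blocks;
	return 0;
}

/* Allocate a transaction, apply flags, reserve space; on failure the helper
 * frees the transaction itself so callers lose the cancel boilerplate. */
static int trans_alloc(unsigned int blocks, unsigned int flags,
		       struct trans **tpp)
{
	struct trans *tp = calloc(1, sizeof(*tp));
	int error;

	if (!tp)
		return -ENOMEM;
	tp->flags = flags;

	error = reserve_space(tp, blocks);
	if (error) {
		free(tp);		/* previously every caller did this */
		return error;
	}
	*tpp = tp;
	return 0;
}

int main(void)
{
	struct trans *tp;

	if (trans_alloc(128, 0, &tp))
		return 1;
	printf("reserved %u blocks\n", tp->blocks);
	free(tp);
	return 0;
}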
index 974d62e..e5bb9cc 100644 (file)
@@ -257,15 +257,12 @@ xfs_dir2_block_to_sf(
         *
         * Convert the inode to local format and copy the data in.
         */
-       dp->i_df.if_flags &= ~XFS_IFEXTENTS;
-       dp->i_df.if_flags |= XFS_IFINLINE;
-       dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
        ASSERT(dp->i_df.if_bytes == 0);
-       xfs_idata_realloc(dp, size, XFS_DATA_FORK);
+       xfs_init_local_fork(dp, XFS_DATA_FORK, dst, size);
+       dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+       dp->i_d.di_size = size;
 
        logflags |= XFS_ILOG_DDATA;
-       memcpy(dp->i_df.if_u1.if_data, dst, size);
-       dp->i_d.di_size = size;
        xfs_dir2_sf_check(args);
 out:
        xfs_trans_log_inode(args->trans, dp, logflags);
index 11faf7d..bbcc8c7 100644 (file)
@@ -231,6 +231,48 @@ xfs_iformat_fork(
        return error;
 }
 
+void
+xfs_init_local_fork(
+       struct xfs_inode        *ip,
+       int                     whichfork,
+       const void              *data,
+       int                     size)
+{
+       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
+       int                     mem_size = size, real_size = 0;
+       bool                    zero_terminate;
+
+       /*
+        * If we are using the local fork to store a symlink body we need to
+        * zero-terminate it so that we can pass it back to the VFS directly.
+        * Overallocate the in-memory fork by one for that and add a zero
+        * to terminate it below.
+        */
+       zero_terminate = S_ISLNK(VFS_I(ip)->i_mode);
+       if (zero_terminate)
+               mem_size++;
+
+       if (size == 0)
+               ifp->if_u1.if_data = NULL;
+       else if (mem_size <= sizeof(ifp->if_u2.if_inline_data))
+               ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+       else {
+               real_size = roundup(mem_size, 4);
+               ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
+       }
+
+       if (size) {
+               memcpy(ifp->if_u1.if_data, data, size);
+               if (zero_terminate)
+                       ifp->if_u1.if_data[size] = '\0';
+       }
+
+       ifp->if_bytes = size;
+       ifp->if_real_bytes = real_size;
+       ifp->if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
+       ifp->if_flags |= XFS_IFINLINE;
+}
+
 /*
  * The file is in-lined in the on-disk inode.
  * If it fits into if_inline_data, then copy
@@ -248,8 +290,6 @@ xfs_iformat_local(
        int             whichfork,
        int             size)
 {
-       xfs_ifork_t     *ifp;
-       int             real_size;
 
        /*
         * If the size is unreasonable, then something
@@ -265,22 +305,8 @@ xfs_iformat_local(
                                     ip->i_mount, dip);
                return -EFSCORRUPTED;
        }
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       real_size = 0;
-       if (size == 0)
-               ifp->if_u1.if_data = NULL;
-       else if (size <= sizeof(ifp->if_u2.if_inline_data))
-               ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
-       else {
-               real_size = roundup(size, 4);
-               ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
-       }
-       ifp->if_bytes = size;
-       ifp->if_real_bytes = real_size;
-       if (size)
-               memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
-       ifp->if_flags &= ~XFS_IFEXTENTS;
-       ifp->if_flags |= XFS_IFINLINE;
+
+       xfs_init_local_fork(ip, whichfork, XFS_DFORK_PTR(dip, whichfork), size);
        return 0;
 }
 
@@ -516,7 +542,6 @@ xfs_iroot_realloc(
                new_max = cur_max + rec_diff;
                new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
                ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
-                               XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max),
                                KM_SLEEP | KM_NOFS);
                op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
                                                     ifp->if_broot_bytes);
@@ -660,7 +685,6 @@ xfs_idata_realloc(
                                ifp->if_u1.if_data =
                                        kmem_realloc(ifp->if_u1.if_data,
                                                        real_size,
-                                                       ifp->if_real_bytes,
                                                        KM_SLEEP | KM_NOFS);
                        }
                } else {
@@ -1376,8 +1400,7 @@ xfs_iext_realloc_direct(
                if (rnew_size != ifp->if_real_bytes) {
                        ifp->if_u1.if_extents =
                                kmem_realloc(ifp->if_u1.if_extents,
-                                               rnew_size,
-                                               ifp->if_real_bytes, KM_NOFS);
+                                               rnew_size, KM_NOFS);
                }
                if (rnew_size > ifp->if_real_bytes) {
                        memset(&ifp->if_u1.if_extents[ifp->if_bytes /
@@ -1461,9 +1484,8 @@ xfs_iext_realloc_indirect(
        if (new_size == 0) {
                xfs_iext_destroy(ifp);
        } else {
-               ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
-                       kmem_realloc(ifp->if_u1.if_ext_irec,
-                               new_size, size, KM_NOFS);
+               ifp->if_u1.if_ext_irec =
+                       kmem_realloc(ifp->if_u1.if_ext_irec, new_size, KM_NOFS);
        }
 }
 
@@ -1496,6 +1518,24 @@ xfs_iext_indirect_to_direct(
        }
 }
 
+/*
+ * Remove all records from the indirection array.
+ */
+STATIC void
+xfs_iext_irec_remove_all(
+       struct xfs_ifork *ifp)
+{
+       int             nlists;
+       int             i;
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+       for (i = 0; i < nlists; i++)
+               kmem_free(ifp->if_u1.if_ext_irec[i].er_extbuf);
+       kmem_free(ifp->if_u1.if_ext_irec);
+       ifp->if_flags &= ~XFS_IFEXTIREC;
+}
+
 /*
  * Free incore file extents.
  */
@@ -1504,14 +1544,7 @@ xfs_iext_destroy(
        xfs_ifork_t     *ifp)           /* inode fork pointer */
 {
        if (ifp->if_flags & XFS_IFEXTIREC) {
-               int     erp_idx;
-               int     nlists;
-
-               nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-               for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) {
-                       xfs_iext_irec_remove(ifp, erp_idx);
-               }
-               ifp->if_flags &= ~XFS_IFEXTIREC;
+               xfs_iext_irec_remove_all(ifp);
        } else if (ifp->if_real_bytes) {
                kmem_free(ifp->if_u1.if_extents);
        } else if (ifp->if_bytes) {
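xfs_init_local_fork() centralises local-format fork setup: small payloads use the inline buffer, larger ones get a rounded-up heap allocation, and symlink bodies receive a terminating NUL so they can be handed back as C strings. A userspace sketch under those assumptions (structure and names are stand-ins, not the on-disk format):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct fork {
	char   *data;		/* points at inline_buf or a heap buffer */
	char    inline_buf[32];
	size_t  bytes;
	size_t  real_bytes;	/* 0 when the inline buffer is used */
};

static void init_local_fork(struct fork *ifp, const void *data, size_t size,
			    int is_symlink)
{
	size_t mem_size = size + (is_symlink ? 1 : 0);

	if (size == 0) {
		ifp->data = NULL;
		ifp->real_bytes = 0;
	} else if (mem_size <= sizeof(ifp->inline_buf)) {
		ifp->data = ifp->inline_buf;
		ifp->real_bytes = 0;
	} else {
		ifp->real_bytes = (mem_size + 3) & ~(size_t)3;	/* roundup(.., 4) */
		ifp->data = malloc(ifp->real_bytes);
	}

	if (size) {
		memcpy(ifp->data, data, size);
		if (is_symlink)
			ifp->data[size] = '\0';
	}
	ifp->bytes = size;
}

int main(void)
{
	struct fork f;

	init_local_fork(&f, "target/of/link", 14, 1);
	printf("fork holds \"%s\" (%zu bytes)\n", f.data, f.bytes);
	if (f.real_bytes)
		free(f.data);
	return 0;
}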
index 7d3b1ed..f95e072 100644 (file)
@@ -134,6 +134,7 @@ void                xfs_iroot_realloc(struct xfs_inode *, int, int);
 int            xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
 int            xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *,
                                  int);
+void           xfs_init_local_fork(struct xfs_inode *, int, const void *, int);
 
 struct xfs_bmbt_rec_host *
                xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t);
index d54a801..e8f49c0 100644 (file)
@@ -211,6 +211,11 @@ typedef struct xfs_trans_header {
 
 #define        XFS_TRANS_HEADER_MAGIC  0x5452414e      /* TRAN */
 
+/*
+ * The only type valid for th_type in CIL-enabled file system logs:
+ */
+#define XFS_TRANS_CHECKPOINT   40
+
 /*
  * Log item types.
  */
index 8a53eaa..12ca867 100644 (file)
@@ -838,12 +838,10 @@ xfs_sync_sb(
        struct xfs_trans        *tp;
        int                     error;
 
-       tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE, KM_SLEEP);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
-       if (error) {
-               xfs_trans_cancel(tp);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0,
+                       XFS_TRANS_NO_WRITECOUNT, &tp);
+       if (error)
                return error;
-       }
 
        xfs_log_sb(tp);
        if (wait)
index 81ac870..16002b5 100644 (file)
@@ -55,103 +55,6 @@ extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
 extern const struct xfs_buf_ops xfs_symlink_buf_ops;
 extern const struct xfs_buf_ops xfs_rtbuf_ops;
 
-/*
- * Transaction types.  Used to distinguish types of buffers. These never reach
- * the log.
- */
-#define XFS_TRANS_SETATTR_NOT_SIZE     1
-#define XFS_TRANS_SETATTR_SIZE         2
-#define XFS_TRANS_INACTIVE             3
-#define XFS_TRANS_CREATE               4
-#define XFS_TRANS_CREATE_TRUNC         5
-#define XFS_TRANS_TRUNCATE_FILE                6
-#define XFS_TRANS_REMOVE               7
-#define XFS_TRANS_LINK                 8
-#define XFS_TRANS_RENAME               9
-#define XFS_TRANS_MKDIR                        10
-#define XFS_TRANS_RMDIR                        11
-#define XFS_TRANS_SYMLINK              12
-#define XFS_TRANS_SET_DMATTRS          13
-#define XFS_TRANS_GROWFS               14
-#define XFS_TRANS_STRAT_WRITE          15
-#define XFS_TRANS_DIOSTRAT             16
-/* 17 was XFS_TRANS_WRITE_SYNC */
-#define        XFS_TRANS_WRITEID               18
-#define        XFS_TRANS_ADDAFORK              19
-#define        XFS_TRANS_ATTRINVAL             20
-#define        XFS_TRANS_ATRUNCATE             21
-#define        XFS_TRANS_ATTR_SET              22
-#define        XFS_TRANS_ATTR_RM               23
-#define        XFS_TRANS_ATTR_FLAG             24
-#define        XFS_TRANS_CLEAR_AGI_BUCKET      25
-#define XFS_TRANS_SB_CHANGE            26
-/*
- * Dummy entries since we use the transaction type to index into the
- * trans_type[] in xlog_recover_print_trans_head()
- */
-#define XFS_TRANS_DUMMY1               27
-#define XFS_TRANS_DUMMY2               28
-#define XFS_TRANS_QM_QUOTAOFF          29
-#define XFS_TRANS_QM_DQALLOC           30
-#define XFS_TRANS_QM_SETQLIM           31
-#define XFS_TRANS_QM_DQCLUSTER         32
-#define XFS_TRANS_QM_QINOCREATE                33
-#define XFS_TRANS_QM_QUOTAOFF_END      34
-#define XFS_TRANS_FSYNC_TS             35
-#define        XFS_TRANS_GROWFSRT_ALLOC        36
-#define        XFS_TRANS_GROWFSRT_ZERO         37
-#define        XFS_TRANS_GROWFSRT_FREE         38
-#define        XFS_TRANS_SWAPEXT               39
-#define        XFS_TRANS_CHECKPOINT            40
-#define        XFS_TRANS_ICREATE               41
-#define        XFS_TRANS_CREATE_TMPFILE        42
-#define        XFS_TRANS_TYPE_MAX              43
-/* new transaction types need to be reflected in xfs_logprint(8) */
-
-#define XFS_TRANS_TYPES \
-       { XFS_TRANS_SETATTR_NOT_SIZE,   "SETATTR_NOT_SIZE" }, \
-       { XFS_TRANS_SETATTR_SIZE,       "SETATTR_SIZE" }, \
-       { XFS_TRANS_INACTIVE,           "INACTIVE" }, \
-       { XFS_TRANS_CREATE,             "CREATE" }, \
-       { XFS_TRANS_CREATE_TRUNC,       "CREATE_TRUNC" }, \
-       { XFS_TRANS_TRUNCATE_FILE,      "TRUNCATE_FILE" }, \
-       { XFS_TRANS_REMOVE,             "REMOVE" }, \
-       { XFS_TRANS_LINK,               "LINK" }, \
-       { XFS_TRANS_RENAME,             "RENAME" }, \
-       { XFS_TRANS_MKDIR,              "MKDIR" }, \
-       { XFS_TRANS_RMDIR,              "RMDIR" }, \
-       { XFS_TRANS_SYMLINK,            "SYMLINK" }, \
-       { XFS_TRANS_SET_DMATTRS,        "SET_DMATTRS" }, \
-       { XFS_TRANS_GROWFS,             "GROWFS" }, \
-       { XFS_TRANS_STRAT_WRITE,        "STRAT_WRITE" }, \
-       { XFS_TRANS_DIOSTRAT,           "DIOSTRAT" }, \
-       { XFS_TRANS_WRITEID,            "WRITEID" }, \
-       { XFS_TRANS_ADDAFORK,           "ADDAFORK" }, \
-       { XFS_TRANS_ATTRINVAL,          "ATTRINVAL" }, \
-       { XFS_TRANS_ATRUNCATE,          "ATRUNCATE" }, \
-       { XFS_TRANS_ATTR_SET,           "ATTR_SET" }, \
-       { XFS_TRANS_ATTR_RM,            "ATTR_RM" }, \
-       { XFS_TRANS_ATTR_FLAG,          "ATTR_FLAG" }, \
-       { XFS_TRANS_CLEAR_AGI_BUCKET,   "CLEAR_AGI_BUCKET" }, \
-       { XFS_TRANS_SB_CHANGE,          "SBCHANGE" }, \
-       { XFS_TRANS_DUMMY1,             "DUMMY1" }, \
-       { XFS_TRANS_DUMMY2,             "DUMMY2" }, \
-       { XFS_TRANS_QM_QUOTAOFF,        "QM_QUOTAOFF" }, \
-       { XFS_TRANS_QM_DQALLOC,         "QM_DQALLOC" }, \
-       { XFS_TRANS_QM_SETQLIM,         "QM_SETQLIM" }, \
-       { XFS_TRANS_QM_DQCLUSTER,       "QM_DQCLUSTER" }, \
-       { XFS_TRANS_QM_QINOCREATE,      "QM_QINOCREATE" }, \
-       { XFS_TRANS_QM_QUOTAOFF_END,    "QM_QOFF_END" }, \
-       { XFS_TRANS_FSYNC_TS,           "FSYNC_TS" }, \
-       { XFS_TRANS_GROWFSRT_ALLOC,     "GROWFSRT_ALLOC" }, \
-       { XFS_TRANS_GROWFSRT_ZERO,      "GROWFSRT_ZERO" }, \
-       { XFS_TRANS_GROWFSRT_FREE,      "GROWFSRT_FREE" }, \
-       { XFS_TRANS_SWAPEXT,            "SWAPEXT" }, \
-       { XFS_TRANS_CHECKPOINT,         "CHECKPOINT" }, \
-       { XFS_TRANS_ICREATE,            "ICREATE" }, \
-       { XFS_TRANS_CREATE_TMPFILE,     "CREATE_TMPFILE" }, \
-       { XLOG_UNMOUNT_REC_TYPE,        "UNMOUNT" }
-
 /*
  * This structure is used to track log items associated with
  * a transaction.  It points to the log item and keeps some
@@ -181,8 +84,9 @@ int  xfs_log_calc_minimum_size(struct xfs_mount *);
 #define        XFS_TRANS_SYNC          0x08    /* make commit synchronous */
 #define XFS_TRANS_DQ_DIRTY     0x10    /* at least one dquot in trx dirty */
 #define XFS_TRANS_RESERVE      0x20    /* OK to use reserved data blocks */
-#define XFS_TRANS_FREEZE_PROT  0x40    /* Transaction has elevated writer
-                                          count in superblock */
+#define XFS_TRANS_NO_WRITECOUNT 0x40   /* do not elevate SB writecount */
+#define XFS_TRANS_NOFS         0x80    /* pass KM_NOFS to kmem_alloc */
+
 /*
  * Field values for xfs_trans_mod_sb.
  */
index c535887..4c463b9 100644 (file)
@@ -84,23 +84,71 @@ xfs_find_bdev_for_inode(
 }
 
 /*
- * We're now finished for good with this ioend structure.
- * Update the page state via the associated buffer_heads,
- * release holds on the inode and bio, and finally free
- * up memory.  Do not use the ioend after this.
+ * We're now finished for good with this page.  Update the page state via the
+ * associated buffer_heads, paying attention to the start and end offsets that
+ * we need to process on the page.
+ */
+static void
+xfs_finish_page_writeback(
+       struct inode            *inode,
+       struct bio_vec          *bvec,
+       int                     error)
+{
+       unsigned int            end = bvec->bv_offset + bvec->bv_len - 1;
+       struct buffer_head      *head, *bh;
+       unsigned int            off = 0;
+
+       ASSERT(bvec->bv_offset < PAGE_SIZE);
+       ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0);
+       ASSERT(end < PAGE_SIZE);
+       ASSERT((bvec->bv_len & ((1 << inode->i_blkbits) - 1)) == 0);
+
+       bh = head = page_buffers(bvec->bv_page);
+
+       do {
+               if (off < bvec->bv_offset)
+                       goto next_bh;
+               if (off > end)
+                       break;
+               bh->b_end_io(bh, !error);
+next_bh:
+               off += bh->b_size;
+       } while ((bh = bh->b_this_page) != head);
+}
+
+/*
+ * We're now finished for good with this ioend structure.  Update the page
+ * state, release holds on bios, and finally free up memory.  Do not use the
+ * ioend after this.
  */
 STATIC void
 xfs_destroy_ioend(
-       xfs_ioend_t             *ioend)
+       struct xfs_ioend        *ioend,
+       int                     error)
 {
-       struct buffer_head      *bh, *next;
+       struct inode            *inode = ioend->io_inode;
+       struct bio              *last = ioend->io_bio;
+       struct bio              *bio, *next;
 
-       for (bh = ioend->io_buffer_head; bh; bh = next) {
-               next = bh->b_private;
-               bh->b_end_io(bh, !ioend->io_error);
-       }
+       for (bio = &ioend->io_inline_bio; bio; bio = next) {
+               struct bio_vec  *bvec;
+               int             i;
+
+               /*
+                * For the last bio, bi_private points to the ioend, so we
+                * need to explicitly end the iteration here.
+                */
+               if (bio == last)
+                       next = NULL;
+               else
+                       next = bio->bi_private;
 
-       mempool_free(ioend, xfs_ioend_pool);
+               /* walk each page on bio, ending page IO on them */
+               bio_for_each_segment_all(bvec, bio, i)
+                       xfs_finish_page_writeback(inode, bvec, error);
+
+               bio_put(bio);
+       }
 }
 
 /*
@@ -120,13 +168,9 @@ xfs_setfilesize_trans_alloc(
        struct xfs_trans        *tp;
        int                     error;
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
-
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
-       if (error) {
-               xfs_trans_cancel(tp);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
+       if (error)
                return error;
-       }
 
        ioend->io_append_trans = tp;
 
@@ -174,7 +218,8 @@ xfs_setfilesize(
 
 STATIC int
 xfs_setfilesize_ioend(
-       struct xfs_ioend        *ioend)
+       struct xfs_ioend        *ioend,
+       int                     error)
 {
        struct xfs_inode        *ip = XFS_I(ioend->io_inode);
        struct xfs_trans        *tp = ioend->io_append_trans;
@@ -188,36 +233,14 @@ xfs_setfilesize_ioend(
        __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
 
        /* we abort the update if there was an IO error */
-       if (ioend->io_error) {
+       if (error) {
                xfs_trans_cancel(tp);
-               return ioend->io_error;
+               return error;
        }
 
        return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
 }
 
-/*
- * Schedule IO completion handling on the final put of an ioend.
- *
- * If there is no work to do we might as well call it a day and free the
- * ioend right now.
- */
-STATIC void
-xfs_finish_ioend(
-       struct xfs_ioend        *ioend)
-{
-       if (atomic_dec_and_test(&ioend->io_remaining)) {
-               struct xfs_mount        *mp = XFS_I(ioend->io_inode)->i_mount;
-
-               if (ioend->io_type == XFS_IO_UNWRITTEN)
-                       queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
-               else if (ioend->io_append_trans)
-                       queue_work(mp->m_data_workqueue, &ioend->io_work);
-               else
-                       xfs_destroy_ioend(ioend);
-       }
-}
-
 /*
  * IO write completion.
  */
@@ -225,16 +248,17 @@ STATIC void
 xfs_end_io(
        struct work_struct *work)
 {
-       xfs_ioend_t     *ioend = container_of(work, xfs_ioend_t, io_work);
-       struct xfs_inode *ip = XFS_I(ioend->io_inode);
-       int             error = 0;
+       struct xfs_ioend        *ioend =
+               container_of(work, struct xfs_ioend, io_work);
+       struct xfs_inode        *ip = XFS_I(ioend->io_inode);
+       int                     error = ioend->io_bio->bi_error;
 
        /*
         * Set an error if the mount has shut down and proceed with end I/O
         * processing so it can perform whatever cleanups are necessary.
         */
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-               ioend->io_error = -EIO;
+               error = -EIO;
 
        /*
         * For unwritten extents we need to issue transactions to convert a
@@ -244,55 +268,33 @@ xfs_end_io(
         * on error.
         */
        if (ioend->io_type == XFS_IO_UNWRITTEN) {
-               if (ioend->io_error)
+               if (error)
                        goto done;
                error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
                                                  ioend->io_size);
        } else if (ioend->io_append_trans) {
-               error = xfs_setfilesize_ioend(ioend);
+               error = xfs_setfilesize_ioend(ioend, error);
        } else {
                ASSERT(!xfs_ioend_is_append(ioend));
        }
 
 done:
-       if (error)
-               ioend->io_error = error;
-       xfs_destroy_ioend(ioend);
+       xfs_destroy_ioend(ioend, error);
 }
 
-/*
- * Allocate and initialise an IO completion structure.
- * We need to track unwritten extent write completion here initially.
- * We'll need to extend this for updating the ondisk inode size later
- * (vs. incore size).
- */
-STATIC xfs_ioend_t *
-xfs_alloc_ioend(
-       struct inode            *inode,
-       unsigned int            type)
+STATIC void
+xfs_end_bio(
+       struct bio              *bio)
 {
-       xfs_ioend_t             *ioend;
-
-       ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);
-
-       /*
-        * Set the count to 1 initially, which will prevent an I/O
-        * completion callback from happening before we have started
-        * all the I/O from calling the completion routine too early.
-        */
-       atomic_set(&ioend->io_remaining, 1);
-       ioend->io_error = 0;
-       INIT_LIST_HEAD(&ioend->io_list);
-       ioend->io_type = type;
-       ioend->io_inode = inode;
-       ioend->io_buffer_head = NULL;
-       ioend->io_buffer_tail = NULL;
-       ioend->io_offset = 0;
-       ioend->io_size = 0;
-       ioend->io_append_trans = NULL;
+       struct xfs_ioend        *ioend = bio->bi_private;
+       struct xfs_mount        *mp = XFS_I(ioend->io_inode)->i_mount;
 
-       INIT_WORK(&ioend->io_work, xfs_end_io);
-       return ioend;
+       if (ioend->io_type == XFS_IO_UNWRITTEN)
+               queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
+       else if (ioend->io_append_trans)
+               queue_work(mp->m_data_workqueue, &ioend->io_work);
+       else
+               xfs_destroy_ioend(ioend, bio->bi_error);
 }
 
 STATIC int
@@ -364,50 +366,6 @@ xfs_imap_valid(
                offset < imap->br_startoff + imap->br_blockcount;
 }
 
-/*
- * BIO completion handler for buffered IO.
- */
-STATIC void
-xfs_end_bio(
-       struct bio              *bio)
-{
-       xfs_ioend_t             *ioend = bio->bi_private;
-
-       if (!ioend->io_error)
-               ioend->io_error = bio->bi_error;
-
-       /* Toss bio and pass work off to an xfsdatad thread */
-       bio->bi_private = NULL;
-       bio->bi_end_io = NULL;
-       bio_put(bio);
-
-       xfs_finish_ioend(ioend);
-}
-
-STATIC void
-xfs_submit_ioend_bio(
-       struct writeback_control *wbc,
-       xfs_ioend_t             *ioend,
-       struct bio              *bio)
-{
-       atomic_inc(&ioend->io_remaining);
-       bio->bi_private = ioend;
-       bio->bi_end_io = xfs_end_bio;
-       submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
-}
-
-STATIC struct bio *
-xfs_alloc_ioend_bio(
-       struct buffer_head      *bh)
-{
-       struct bio              *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
-
-       ASSERT(bio->bi_private == NULL);
-       bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
-       bio->bi_bdev = bh->b_bdev;
-       return bio;
-}
-
 STATIC void
 xfs_start_buffer_writeback(
        struct buffer_head      *bh)
@@ -452,28 +410,35 @@ static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
 }
 
 /*
- * Submit all of the bios for an ioend. We are only passed a single ioend at a
- * time; the caller is responsible for chaining prior to submission.
+ * Submit the bio for an ioend. We are passed an ioend with a bio attached to
+ * it, and we submit that bio. The ioend may be used for multiple bio
+ * submissions, so we only want to allocate an append transaction for the ioend
+ * once. In the case of multiple bio submission, each bio will take an IO
+ * reference to the ioend to ensure that the ioend completion is only done once
+ * all bios have been submitted and the ioend is really done.
  *
  * If @fail is non-zero, it means that we have a situation where some part of
 * the submission process has failed after we have marked pages for writeback
- * and unlocked them. In this situation, we need to fail the ioend chain rather
- * than submit it to IO. This typically only happens on a filesystem shutdown.
+ * and unlocked them. In this situation, we need to fail the bio and ioend
+ * rather than submit it to IO. This typically only happens on a filesystem
+ * shutdown.
  */
 STATIC int
 xfs_submit_ioend(
        struct writeback_control *wbc,
-       xfs_ioend_t             *ioend,
+       struct xfs_ioend        *ioend,
        int                     status)
 {
-       struct buffer_head      *bh;
-       struct bio              *bio;
-       sector_t                lastblock = 0;
-
        /* Reserve log space if we might write beyond the on-disk inode size. */
        if (!status &&
-            ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
+           ioend->io_type != XFS_IO_UNWRITTEN &&
+           xfs_ioend_is_append(ioend) &&
+           !ioend->io_append_trans)
                status = xfs_setfilesize_trans_alloc(ioend);
+
+       ioend->io_bio->bi_private = ioend;
+       ioend->io_bio->bi_end_io = xfs_end_bio;
+
        /*
         * If we are failing the IO now, just mark the ioend with an
         * error and finish it. This will run IO completion immediately
@@ -481,33 +446,73 @@ xfs_submit_ioend(
         * time.
         */
        if (status) {
-               ioend->io_error = status;
-               xfs_finish_ioend(ioend);
+               ioend->io_bio->bi_error = status;
+               bio_endio(ioend->io_bio);
                return status;
        }
 
-       bio = NULL;
-       for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
+       submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE,
+                  ioend->io_bio);
+       return 0;
+}
 
-               if (!bio) {
-retry:
-                       bio = xfs_alloc_ioend_bio(bh);
-               } else if (bh->b_blocknr != lastblock + 1) {
-                       xfs_submit_ioend_bio(wbc, ioend, bio);
-                       goto retry;
-               }
+static void
+xfs_init_bio_from_bh(
+       struct bio              *bio,
+       struct buffer_head      *bh)
+{
+       bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+       bio->bi_bdev = bh->b_bdev;
+}
 
-               if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
-                       xfs_submit_ioend_bio(wbc, ioend, bio);
-                       goto retry;
-               }
+static struct xfs_ioend *
+xfs_alloc_ioend(
+       struct inode            *inode,
+       unsigned int            type,
+       xfs_off_t               offset,
+       struct buffer_head      *bh)
+{
+       struct xfs_ioend        *ioend;
+       struct bio              *bio;
 
-               lastblock = bh->b_blocknr;
-       }
-       if (bio)
-               xfs_submit_ioend_bio(wbc, ioend, bio);
-       xfs_finish_ioend(ioend);
-       return 0;
+       bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, xfs_ioend_bioset);
+       xfs_init_bio_from_bh(bio, bh);
+
+       ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
+       INIT_LIST_HEAD(&ioend->io_list);
+       ioend->io_type = type;
+       ioend->io_inode = inode;
+       ioend->io_size = 0;
+       ioend->io_offset = offset;
+       INIT_WORK(&ioend->io_work, xfs_end_io);
+       ioend->io_append_trans = NULL;
+       ioend->io_bio = bio;
+       return ioend;
+}
+
+/*
+ * Allocate a new bio, and chain the old bio to the new one.
+ *
+ * Note that we have to perform the chaining in this unintuitive order
+ * so that the bi_private linkage is set up in the right direction for the
+ * traversal in xfs_destroy_ioend().
+ */
+static void
+xfs_chain_bio(
+       struct xfs_ioend        *ioend,
+       struct writeback_control *wbc,
+       struct buffer_head      *bh)
+{
+       struct bio *new;
+
+       new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
+       xfs_init_bio_from_bh(new, bh);
+
+       bio_chain(ioend->io_bio, new);
+       bio_get(ioend->io_bio);         /* for xfs_destroy_ioend */
+       submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE,
+                  ioend->io_bio);
+       ioend->io_bio = new;
 }
 
 /*
@@ -523,27 +528,24 @@ xfs_add_to_ioend(
        struct buffer_head      *bh,
        xfs_off_t               offset,
        struct xfs_writepage_ctx *wpc,
+       struct writeback_control *wbc,
        struct list_head        *iolist)
 {
        if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
            bh->b_blocknr != wpc->last_block + 1 ||
            offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
-               struct xfs_ioend        *new;
-
                if (wpc->ioend)
                        list_add(&wpc->ioend->io_list, iolist);
-
-               new = xfs_alloc_ioend(inode, wpc->io_type);
-               new->io_offset = offset;
-               new->io_buffer_head = bh;
-               new->io_buffer_tail = bh;
-               wpc->ioend = new;
-       } else {
-               wpc->ioend->io_buffer_tail->b_private = bh;
-               wpc->ioend->io_buffer_tail = bh;
+               wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset, bh);
        }
 
-       bh->b_private = NULL;
+       /*
+        * If the buffer doesn't fit into the bio we need to allocate a new
+        * one.  This shouldn't happen more than once for a given buffer.
+        */
+       while (xfs_bio_add_buffer(wpc->ioend->io_bio, bh) != bh->b_size)
+               xfs_chain_bio(wpc->ioend, wbc, bh);
+
        wpc->ioend->io_size += bh->b_size;
        wpc->last_block = bh->b_blocknr;
        xfs_start_buffer_writeback(bh);
@@ -803,7 +805,7 @@ xfs_writepage_map(
                        lock_buffer(bh);
                        if (wpc->io_type != XFS_IO_OVERWRITE)
                                xfs_map_at_offset(inode, bh, &wpc->imap, offset);
-                       xfs_add_to_ioend(inode, bh, offset, wpc, &submit_list);
+                       xfs_add_to_ioend(inode, bh, offset, wpc, wbc, &submit_list);
                        count++;
                }
 
@@ -1391,13 +1393,10 @@ xfs_end_io_direct_write(
 
                trace_xfs_end_io_direct_write_append(ip, offset, size);
 
-               tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
-               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
-               if (error) {
-                       xfs_trans_cancel(tp);
-                       return error;
-               }
-               error = xfs_setfilesize(ip, tp, offset, size);
+               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0,
+                               &tp);
+               if (!error)
+                       error = xfs_setfilesize(ip, tp, offset, size);
        }
 
        return error;
index b442117..814aab7 100644 (file)
@@ -18,7 +18,7 @@
 #ifndef __XFS_AOPS_H__
 #define __XFS_AOPS_H__
 
-extern mempool_t *xfs_ioend_pool;
+extern struct bio_set *xfs_ioend_bioset;
 
 /*
  * Types of I/O for bmap clustering and I/O completion tracking.
@@ -37,22 +37,19 @@ enum {
        { XFS_IO_OVERWRITE,             "overwrite" }
 
 /*
- * xfs_ioend struct manages large extent writes for XFS.
- * It can manage several multi-page bio's at once.
+ * Structure for buffered I/O completions.
  */
-typedef struct xfs_ioend {
+struct xfs_ioend {
        struct list_head        io_list;        /* next ioend in chain */
        unsigned int            io_type;        /* delalloc / unwritten */
-       int                     io_error;       /* I/O error code */
-       atomic_t                io_remaining;   /* hold count */
        struct inode            *io_inode;      /* file being written to */
-       struct buffer_head      *io_buffer_head;/* buffer linked list head */
-       struct buffer_head      *io_buffer_tail;/* buffer linked list tail */
        size_t                  io_size;        /* size of the extent */
        xfs_off_t               io_offset;      /* offset in the file */
        struct work_struct      io_work;        /* xfsdatad work queue */
        struct xfs_trans        *io_append_trans;/* xact. for size update */
-} xfs_ioend_t;
+       struct bio              *io_bio;        /* bio being built */
+       struct bio              io_inline_bio;  /* MUST BE LAST! */
+};
 
 extern const struct address_space_operations xfs_address_space_operations;
 
index dd48245..e3da5d4 100644 (file)
@@ -112,8 +112,9 @@ typedef struct attrlist_cursor_kern {
  *========================================================================*/
 
 
+/* Return 0 on success, or -errno; other state communicated via *context */
 typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int,
-                             unsigned char *, int, int, unsigned char *);
+                             unsigned char *, int, int);
 
 typedef struct xfs_attr_list_context {
        struct xfs_inode                *dp;            /* inode */
@@ -126,7 +127,6 @@ typedef struct xfs_attr_list_context {
        int                             firstu;         /* first used byte in buffer */
        int                             flags;          /* from VOP call */
        int                             resynch;        /* T/F: resynch with cursor */
-       int                             put_value;      /* T/F: need value for listent */
        put_listent_func_t              put_listent;    /* list output fmt function */
        int                             index;          /* index into output buffer */
 } xfs_attr_list_context_t;
index 2bb959a..55d2149 100644 (file)
@@ -405,21 +405,11 @@ xfs_attr_inactive(
                goto out_destroy_fork;
        xfs_iunlock(dp, lock_mode);
 
-       /*
-        * Start our first transaction of the day.
-        *
-        * All future transactions during this code must be "chained" off
-        * this one via the trans_dup() call.  All transactions will contain
-        * the inode, and the inode will always be marked with trans_ihold().
-        * Since the inode will be locked in all transactions, we must log
-        * the inode in every transaction to let it float upward through
-        * the log.
-        */
        lock_mode = 0;
-       trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL);
-       error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0);
+
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_attrinval, 0, 0, 0, &trans);
        if (error)
-               goto out_cancel;
+               goto out_destroy_fork;
 
        lock_mode = XFS_ILOCK_EXCL;
        xfs_ilock(dp, lock_mode);
index 4fa1482..d25f26b 100644 (file)
@@ -106,18 +106,15 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
                                           sfe->flags,
                                           sfe->nameval,
                                           (int)sfe->namelen,
-                                          (int)sfe->valuelen,
-                                          &sfe->nameval[sfe->namelen]);
-
+                                          (int)sfe->valuelen);
+                       if (error)
+                               return error;
                        /*
                         * Either search callback finished early or
                         * didn't fit it all in the buffer after all.
                         */
                        if (context->seen_enough)
                                break;
-
-                       if (error)
-                               return error;
                        sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
                }
                trace_xfs_attr_list_sf_all(context);
@@ -200,8 +197,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
                                        sbp->flags,
                                        sbp->name,
                                        sbp->namelen,
-                                       sbp->valuelen,
-                                       &sbp->name[sbp->namelen]);
+                                       sbp->valuelen);
                if (error) {
                        kmem_free(sbuf);
                        return error;
@@ -416,6 +412,9 @@ xfs_attr3_leaf_list_int(
         */
        retval = 0;
        for (; i < ichdr.count; entry++, i++) {
+               char *name;
+               int namelen, valuelen;
+
                if (be32_to_cpu(entry->hashval) != cursor->hashval) {
                        cursor->hashval = be32_to_cpu(entry->hashval);
                        cursor->offset = 0;
@@ -425,56 +424,25 @@ xfs_attr3_leaf_list_int(
                        continue;               /* skip incomplete entries */
 
                if (entry->flags & XFS_ATTR_LOCAL) {
-                       xfs_attr_leaf_name_local_t *name_loc =
-                               xfs_attr3_leaf_name_local(leaf, i);
-
-                       retval = context->put_listent(context,
-                                               entry->flags,
-                                               name_loc->nameval,
-                                               (int)name_loc->namelen,
-                                               be16_to_cpu(name_loc->valuelen),
-                                               &name_loc->nameval[name_loc->namelen]);
-                       if (retval)
-                               return retval;
+                       xfs_attr_leaf_name_local_t *name_loc;
+
+                       name_loc = xfs_attr3_leaf_name_local(leaf, i);
+                       name = name_loc->nameval;
+                       namelen = name_loc->namelen;
+                       valuelen = be16_to_cpu(name_loc->valuelen);
                } else {
-                       xfs_attr_leaf_name_remote_t *name_rmt =
-                               xfs_attr3_leaf_name_remote(leaf, i);
-
-                       int valuelen = be32_to_cpu(name_rmt->valuelen);
-
-                       if (context->put_value) {
-                               xfs_da_args_t args;
-
-                               memset((char *)&args, 0, sizeof(args));
-                               args.geo = context->dp->i_mount->m_attr_geo;
-                               args.dp = context->dp;
-                               args.whichfork = XFS_ATTR_FORK;
-                               args.valuelen = valuelen;
-                               args.rmtvaluelen = valuelen;
-                               args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
-                               args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
-                               args.rmtblkcnt = xfs_attr3_rmt_blocks(
-                                                       args.dp->i_mount, valuelen);
-                               retval = xfs_attr_rmtval_get(&args);
-                               if (!retval)
-                                       retval = context->put_listent(context,
-                                                       entry->flags,
-                                                       name_rmt->name,
-                                                       (int)name_rmt->namelen,
-                                                       valuelen,
-                                                       args.value);
-                               kmem_free(args.value);
-                       } else {
-                               retval = context->put_listent(context,
-                                               entry->flags,
-                                               name_rmt->name,
-                                               (int)name_rmt->namelen,
-                                               valuelen,
-                                               NULL);
-                       }
-                       if (retval)
-                               return retval;
+                       xfs_attr_leaf_name_remote_t *name_rmt;
+
+                       name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
+                       name = name_rmt->name;
+                       namelen = name_rmt->namelen;
+                       valuelen = be32_to_cpu(name_rmt->valuelen);
                }
+
+               retval = context->put_listent(context, entry->flags,
+                                             name, namelen, valuelen);
+               if (retval)
+                       break;
                if (context->seen_enough)
                        break;
                cursor->offset++;
@@ -551,8 +519,7 @@ xfs_attr_put_listent(
        int             flags,
        unsigned char   *name,
        int             namelen,
-       int             valuelen,
-       unsigned char   *value)
+       int             valuelen)
 {
        struct attrlist *alist = (struct attrlist *)context->alist;
        attrlist_ent_t *aep;
@@ -581,7 +548,7 @@ xfs_attr_put_listent(
                trace_xfs_attr_list_full(context);
                alist->al_more = 1;
                context->seen_enough = 1;
-               return 1;
+               return 0;
        }
 
        aep = (attrlist_ent_t *)&context->alist[context->firstu];
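The put_listent callbacks now return 0 or a negative errno, and "stop, the output buffer is full" is reported through context->seen_enough rather than a special return value. A self-contained sketch of that convention (context layout and names are illustrative only):

#include <errno.h>
#include <stdio.h>
#include <string.h>

struct list_ctx {
	char   buf[16];
	size_t used;
	int    seen_enough;
};

/* Store one name; a full buffer sets seen_enough but is not an error. */
static int put_listent(struct list_ctx *ctx, const char *name, size_t namelen)
{
	if (ctx->used + namelen + 1 > sizeof(ctx->buf)) {
		ctx->seen_enough = 1;
		return 0;
	}
	memcpy(ctx->buf + ctx->used, name, namelen);
	ctx->buf[ctx->used + namelen] = '\0';
	ctx->used += namelen + 1;
	return 0;
}

static int list_attrs(struct list_ctx *ctx, const char *const *names, int n)
{
	for (int i = 0; i < n; i++) {
		int error = put_listent(ctx, names[i], strlen(names[i]));

		if (error)
			return error;		/* hard failure */
		if (ctx->seen_enough)
			break;			/* buffer full, still success */
	}
	return 0;
}

int main(void)
{
	const char *names[] = { "user.a", "user.b", "user.longname" };
	struct list_ctx ctx = { .used = 0, .seen_enough = 0 };

	if (list_attrs(&ctx, names, 3))
		return 1;
	printf("stored %zu bytes, full=%d\n", ctx.used, ctx.seen_enough);
	return 0;
}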
index 3b63098..586bb64 100644 (file)
@@ -72,18 +72,11 @@ xfs_zero_extent(
        struct xfs_mount *mp = ip->i_mount;
        xfs_daddr_t     sector = xfs_fsb_to_db(ip, start_fsb);
        sector_t        block = XFS_BB_TO_FSBT(mp, sector);
-       ssize_t         size = XFS_FSB_TO_B(mp, count_fsb);
-
-       if (IS_DAX(VFS_I(ip)))
-               return dax_clear_sectors(xfs_find_bdev_for_inode(VFS_I(ip)),
-                               sector, size);
-
-       /*
-        * let the block layer decide on the fastest method of
-        * implementing the zeroing.
-        */
-       return sb_issue_zeroout(mp->m_super, block, count_fsb, GFP_NOFS);
 
+       return blkdev_issue_zeroout(xfs_find_bdev_for_inode(VFS_I(ip)),
+               block << (mp->m_super->s_blocksize_bits - 9),
+               count_fsb << (mp->m_super->s_blocksize_bits - 9),
+               GFP_NOFS, true);
 }
 
 /*
@@ -900,19 +893,15 @@ xfs_free_eofblocks(
                 * Free them up now by truncating the file to
                 * its current size.
                 */
-               tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
-
                if (need_iolock) {
-                       if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
-                               xfs_trans_cancel(tp);
+                       if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
                                return -EAGAIN;
-                       }
                }
 
-               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0,
+                               &tp);
                if (error) {
                        ASSERT(XFS_FORCED_SHUTDOWN(mp));
-                       xfs_trans_cancel(tp);
                        if (need_iolock)
                                xfs_iunlock(ip, XFS_IOLOCK_EXCL);
                        return error;
@@ -1037,9 +1026,9 @@ xfs_alloc_file_space(
                /*
                 * Allocate and setup the transaction.
                 */
-               tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
-               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
-                                         resblks, resrtextents);
+               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks,
+                               resrtextents, 0, &tp);
+
                /*
                 * Check for running out of space
                 */
@@ -1048,7 +1037,6 @@ xfs_alloc_file_space(
                         * Free the transaction structure.
                         */
                        ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
-                       xfs_trans_cancel(tp);
                        break;
                }
                xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -1311,18 +1299,10 @@ xfs_free_file_space(
                 * transaction to dip into the reserve blocks to ensure
                 * the freeing of the space succeeds at ENOSPC.
                 */
-               tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
-               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0);
-
-               /*
-                * check for running out of space
-                */
+               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0,
+                               &tp);
                if (error) {
-                       /*
-                        * Free the transaction structure.
-                        */
                        ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
-                       xfs_trans_cancel(tp);
                        break;
                }
                xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -1482,19 +1462,16 @@ xfs_shift_file_space(
        }
 
        while (!error && !done) {
-               tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
                /*
                 * We would need to reserve permanent block for transaction.
                 * This will come into picture when after shifting extent into
                 * hole we found that adjacent extents can be merged which
                 * may lead to freeing of a block during record update.
                 */
-               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
-                               XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
-               if (error) {
-                       xfs_trans_cancel(tp);
+               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
+                               XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
+               if (error)
                        break;
-               }
 
                xfs_ilock(ip, XFS_ILOCK_EXCL);
                error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
@@ -1747,12 +1724,9 @@ xfs_swap_extents(
        if (error)
                goto out_unlock;
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
-       if (error) {
-               xfs_trans_cancel(tp);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+       if (error)
                goto out_unlock;
-       }
 
        /*
         * Lock and join the inodes to the tansaction so that transaction commit
index 9a2191b..e71cfbd 100644
@@ -1100,22 +1100,18 @@ xfs_bwrite(
        return error;
 }
 
-STATIC void
+static void
 xfs_buf_bio_end_io(
        struct bio              *bio)
 {
-       xfs_buf_t               *bp = (xfs_buf_t *)bio->bi_private;
+       struct xfs_buf          *bp = (struct xfs_buf *)bio->bi_private;
 
        /*
         * don't overwrite existing errors - otherwise we can lose errors on
         * buffers that require multiple bios to complete.
         */
-       if (bio->bi_error) {
-               spin_lock(&bp->b_lock);
-               if (!bp->b_io_error)
-                       bp->b_io_error = bio->bi_error;
-               spin_unlock(&bp->b_lock);
-       }
+       if (bio->bi_error)
+               cmpxchg(&bp->b_io_error, 0, bio->bi_error);
 
        if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
                invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
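The spinlocked first-error recording is replaced above by a single cmpxchg(): the store only happens if b_io_error is still zero, so when several bios for the same buffer fail concurrently exactly one error is kept and later ones are ignored. That is the same first-error-wins behaviour as before, but without taking b_lock in the completion path. The idiom in isolation, as a sketch (record_first_error() is an illustrative name):

        /* keep only the first non-zero error, lock-free */
        static void record_first_error(int *slot, int err)
        {
                if (err)
                        cmpxchg(slot, 0, err);  /* stores err only if *slot is still 0 */
        }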
index 4eb89bd..8bfb974 100644
@@ -183,6 +183,26 @@ typedef struct xfs_buf {
        unsigned int            b_page_count;   /* size of page array */
        unsigned int            b_offset;       /* page offset in first page */
        int                     b_error;        /* error code on I/O */
+
+       /*
+        * async write failure retry count. Initialised to zero on the first
+        * failure, then when it exceeds the maximum configured without a
+        * success the write is considered to be failed permanently and the
+        * iodone handler will take appropriate action.
+        *
+        * For retry timeouts, we record the jiffie of the first failure. This
+        * means that we can change the retry timeout for buffers already under
+        * I/O and thus avoid getting stuck in a retry loop with a long timeout.
+        *
+        * last_error is used to ensure that we are getting repeated errors, not
+        * different errors. e.g. a block device might change ENOSPC to EIO when
+        * a failure timeout occurs, so we want to re-initialise the error
+        * retry behaviour appropriately when that happens.
+        */
+       int                     b_retries;
+       unsigned long           b_first_retry_time; /* in jiffies */
+       int                     b_last_error;
+
        const struct xfs_buf_ops        *b_ops;
 
 #ifdef XFS_BUF_LOCK_TRACKING
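These three fields carry the per-buffer retry state described in the comment above; the code that consumes them is the xfs_buf_iodone_callback_error() rewrite further down. Condensed, the decision it makes on a repeated async write failure looks roughly like the sketch below (buf_error_is_permanent() is an illustrative name; the cfg fields are the ones used in that hunk):

        /* sketch: has this buffer exhausted the configured error policy? */
        static bool buf_error_is_permanent(struct xfs_buf *bp,
                                           struct xfs_error_cfg *cfg)
        {
                if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
                    ++bp->b_retries > cfg->max_retries)
                        return true;    /* too many failures in a row */
                if (cfg->retry_timeout &&
                    time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
                        return true;    /* failing for too long */
                return false;           /* still a transient error */
        }

A permanent failure forces a filesystem shutdown before the item callbacks run; a transient one clears b_error and releases the buffer so that higher layers can retry it.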
index 99e91a0..3425799 100644
@@ -1042,35 +1042,22 @@ xfs_buf_do_callbacks(
        }
 }
 
-/*
- * This is the iodone() function for buffers which have had callbacks
- * attached to them by xfs_buf_attach_iodone().  It should remove each
- * log item from the buffer's list and call the callback of each in turn.
- * When done, the buffer's fsprivate field is set to NULL and the buffer
- * is unlocked with a call to iodone().
- */
-void
-xfs_buf_iodone_callbacks(
+static bool
+xfs_buf_iodone_callback_error(
        struct xfs_buf          *bp)
 {
        struct xfs_log_item     *lip = bp->b_fspriv;
        struct xfs_mount        *mp = lip->li_mountp;
        static ulong            lasttime;
        static xfs_buftarg_t    *lasttarg;
-
-       if (likely(!bp->b_error))
-               goto do_callbacks;
+       struct xfs_error_cfg    *cfg;
 
        /*
         * If we've already decided to shutdown the filesystem because of
         * I/O errors, there's no point in giving this a retry.
         */
-       if (XFS_FORCED_SHUTDOWN(mp)) {
-               xfs_buf_stale(bp);
-               bp->b_flags |= XBF_DONE;
-               trace_xfs_buf_item_iodone(bp, _RET_IP_);
-               goto do_callbacks;
-       }
+       if (XFS_FORCED_SHUTDOWN(mp))
+               goto out_stale;
 
        if (bp->b_target != lasttarg ||
            time_after(jiffies, (lasttime + 5*HZ))) {
@@ -1079,45 +1066,93 @@ xfs_buf_iodone_callbacks(
        }
        lasttarg = bp->b_target;
 
+       /* synchronous writes will have callers process the error */
+       if (!(bp->b_flags & XBF_ASYNC))
+               goto out_stale;
+
+       trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
+       ASSERT(bp->b_iodone != NULL);
+
        /*
         * If the write was asynchronous then no one will be looking for the
-        * error.  Clear the error state and write the buffer out again.
-        *
-        * XXX: This helps against transient write errors, but we need to find
-        * a way to shut the filesystem down if the writes keep failing.
-        *
-        * In practice we'll shut the filesystem down soon as non-transient
-        * errors tend to affect the whole device and a failing log write
-        * will make us give up.  But we really ought to do better here.
+        * error.  If this is the first failure of this type, clear the error
+        * state and write the buffer out again. This means we always retry an
+        * async write failure at least once, but we also need to set the buffer
+        * up to behave correctly now for repeated failures.
         */
-       if (bp->b_flags & XBF_ASYNC) {
-               ASSERT(bp->b_iodone != NULL);
+       if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL)) ||
+            bp->b_last_error != bp->b_error) {
+               bp->b_flags |= (XBF_WRITE | XBF_ASYNC |
+                               XBF_DONE | XBF_WRITE_FAIL);
+               bp->b_last_error = bp->b_error;
+               bp->b_retries = 0;
+               bp->b_first_retry_time = jiffies;
+
+               xfs_buf_ioerror(bp, 0);
+               xfs_buf_submit(bp);
+               return true;
+       }
 
-               trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
+       /*
+        * Repeated failure on an async write. Take action according to the
+        * error configuration we have been set up to use.
+        */
+       cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
 
-               xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */
+       if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
+           ++bp->b_retries > cfg->max_retries)
+                       goto permanent_error;
+       if (cfg->retry_timeout &&
+           time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
+                       goto permanent_error;
 
-               if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) {
-                       bp->b_flags |= XBF_WRITE | XBF_ASYNC |
-                                      XBF_DONE | XBF_WRITE_FAIL;
-                       xfs_buf_submit(bp);
-               } else {
-                       xfs_buf_relse(bp);
-               }
+       /* At unmount we may treat errors differently */
+       if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount)
+               goto permanent_error;
 
-               return;
-       }
+       /* still a transient error, higher layers will retry */
+       xfs_buf_ioerror(bp, 0);
+       xfs_buf_relse(bp);
+       return true;
 
        /*
-        * If the write of the buffer was synchronous, we want to make
-        * sure to return the error to the caller of xfs_bwrite().
+        * Permanent error - we need to trigger a shutdown if we haven't already
+        * to indicate that inconsistency will result from this action.
         */
+permanent_error:
+       xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+out_stale:
        xfs_buf_stale(bp);
        bp->b_flags |= XBF_DONE;
-
        trace_xfs_buf_error_relse(bp, _RET_IP_);
+       return false;
+}
+
+/*
+ * This is the iodone() function for buffers which have had callbacks attached
+ * to them by xfs_buf_attach_iodone(). We need to iterate the items on the
+ * callback list, mark the buffer as having no more callbacks and then push the
+ * buffer through IO completion processing.
+ */
+void
+xfs_buf_iodone_callbacks(
+       struct xfs_buf          *bp)
+{
+       /*
+        * If there is an error, process it. Some errors require us
+        * to run callbacks after failure processing is done so we
+        * detect that and take appropriate action.
+        */
+       if (bp->b_error && xfs_buf_iodone_callback_error(bp))
+               return;
+
+       /*
+        * Successful IO or permanent error. Either way, we can clear the
+        * retry state here in preparation for the next error that may occur.
+        */
+       bp->b_last_error = 0;
+       bp->b_retries = 0;
 
-do_callbacks:
        xfs_buf_do_callbacks(bp);
        bp->b_fspriv = NULL;
        bp->b_iodone = NULL;
index 316b2a1..e064665 100644
@@ -614,11 +614,10 @@ xfs_qm_dqread(
        trace_xfs_dqread(dqp);
 
        if (flags & XFS_QMOPT_DQALLOC) {
-               tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
-               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_dqalloc,
-                                         XFS_QM_DQALLOC_SPACE_RES(mp), 0);
+               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc,
+                               XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp);
                if (error)
-                       goto error1;
+                       goto error0;
        }
 
        /*
@@ -692,7 +691,7 @@ error0:
  * end of the chunk, skip ahead to first id in next allocated chunk
  * using the SEEK_DATA interface.
  */
-int
+static int
 xfs_dq_get_next_id(
        xfs_mount_t             *mp,
        uint                    type,
index 85ce303..47fc632 100644
@@ -145,12 +145,10 @@ xfs_update_prealloc_flags(
        struct xfs_trans        *tp;
        int                     error;
 
-       tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
-       error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
-       if (error) {
-               xfs_trans_cancel(tp);
+       error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
+                       0, 0, 0, &tp);
+       if (error)
                return error;
-       }
 
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
@@ -1553,7 +1551,7 @@ xfs_filemap_page_mkwrite(
        xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
        if (IS_DAX(inode)) {
-               ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL);
+               ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
        } else {
                ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
                ret = block_page_mkwrite_return(ret);
@@ -1587,7 +1585,7 @@ xfs_filemap_fault(
                 * changes to xfs_get_blocks_direct() to map unwritten extent
                 * ioend for conversion on read-only mappings.
                 */
-               ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL);
+               ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault);
        } else
                ret = filemap_fault(vma, vmf);
        xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1624,8 +1622,7 @@ xfs_filemap_pmd_fault(
        }
 
        xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-       ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault,
-                             NULL);
+       ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault);
        xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
        if (flags & FAULT_FLAG_WRITE)
index ee3aaa0..b4d7582 100644
@@ -198,14 +198,10 @@ xfs_growfs_data_private(
                        return error;
        }
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
-       tp->t_flags |= XFS_TRANS_RESERVE;
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata,
-                                 XFS_GROWFS_SPACE_RES(mp), 0);
-       if (error) {
-               xfs_trans_cancel(tp);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
+                       XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
+       if (error)
                return error;
-       }
 
        /*
         * Write new AG headers to disk. Non-transactional, but written
@@ -243,8 +239,8 @@ xfs_growfs_data_private(
                agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp));
                agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1);
                agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1);
-               agf->agf_flfirst = 0;
-               agf->agf_fllast = cpu_to_be32(XFS_AGFL_SIZE(mp) - 1);
+               agf->agf_flfirst = cpu_to_be32(1);
+               agf->agf_fllast = 0;
                agf->agf_flcount = 0;
                tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp);
                agf->agf_freeblks = cpu_to_be32(tmpsize);
index bf2d607..99ee6ee 100644
@@ -37,9 +37,6 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 
-STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp,
-                               struct xfs_perag *pag, struct xfs_inode *ip);
-
 /*
  * Allocate and initialise an xfs_inode.
  */
@@ -94,13 +91,6 @@ xfs_inode_free_callback(
        struct inode            *inode = container_of(head, struct inode, i_rcu);
        struct xfs_inode        *ip = XFS_I(inode);
 
-       kmem_zone_free(xfs_inode_zone, ip);
-}
-
-void
-xfs_inode_free(
-       struct xfs_inode        *ip)
-{
        switch (VFS_I(ip)->i_mode & S_IFMT) {
        case S_IFREG:
        case S_IFDIR:
@@ -118,6 +108,25 @@ xfs_inode_free(
                ip->i_itemp = NULL;
        }
 
+       kmem_zone_free(xfs_inode_zone, ip);
+}
+
+static void
+__xfs_inode_free(
+       struct xfs_inode        *ip)
+{
+       /* asserts to verify all state is correct here */
+       ASSERT(atomic_read(&ip->i_pincount) == 0);
+       ASSERT(!xfs_isiflocked(ip));
+       XFS_STATS_DEC(ip->i_mount, vn_active);
+
+       call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
+}
+
+void
+xfs_inode_free(
+       struct xfs_inode        *ip)
+{
        /*
         * Because we use RCU freeing we need to ensure the inode always
         * appears to be reclaimed with an invalid inode number when in the
@@ -129,12 +138,123 @@ xfs_inode_free(
        ip->i_ino = 0;
        spin_unlock(&ip->i_flags_lock);
 
-       /* asserts to verify all state is correct here */
-       ASSERT(atomic_read(&ip->i_pincount) == 0);
-       ASSERT(!xfs_isiflocked(ip));
-       XFS_STATS_DEC(ip->i_mount, vn_active);
+       __xfs_inode_free(ip);
+}
 
-       call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
+/*
+ * Queue a new inode reclaim pass if there are reclaimable inodes and there
+ * isn't a reclaim pass already in progress. By default it runs every 5s based
+ * on the xfs periodic sync default of 30s. Perhaps this should have its own
+ * tunable, but that can be done if this method proves to be ineffective or too
+ * aggressive.
+ */
+static void
+xfs_reclaim_work_queue(
+       struct xfs_mount        *mp)
+{
+
+       rcu_read_lock();
+       if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+               queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
+                       msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
+       }
+       rcu_read_unlock();
+}
+
+/*
+ * This is a fast pass over the inode cache to try to get reclaim moving on as
+ * many inodes as possible in a short period of time. It kicks itself every few
+ * seconds, as well as being kicked by the inode cache shrinker when memory
+ * goes low. It scans as quickly as possible avoiding locked inodes or those
+ * already being flushed, and once done schedules a future pass.
+ */
+void
+xfs_reclaim_worker(
+       struct work_struct *work)
+{
+       struct xfs_mount *mp = container_of(to_delayed_work(work),
+                                       struct xfs_mount, m_reclaim_work);
+
+       xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
+       xfs_reclaim_work_queue(mp);
+}
+
+static void
+xfs_perag_set_reclaim_tag(
+       struct xfs_perag        *pag)
+{
+       struct xfs_mount        *mp = pag->pag_mount;
+
+       ASSERT(spin_is_locked(&pag->pag_ici_lock));
+       if (pag->pag_ici_reclaimable++)
+               return;
+
+       /* propagate the reclaim tag up into the perag radix tree */
+       spin_lock(&mp->m_perag_lock);
+       radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno,
+                          XFS_ICI_RECLAIM_TAG);
+       spin_unlock(&mp->m_perag_lock);
+
+       /* schedule periodic background inode reclaim */
+       xfs_reclaim_work_queue(mp);
+
+       trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
+}
+
+static void
+xfs_perag_clear_reclaim_tag(
+       struct xfs_perag        *pag)
+{
+       struct xfs_mount        *mp = pag->pag_mount;
+
+       ASSERT(spin_is_locked(&pag->pag_ici_lock));
+       if (--pag->pag_ici_reclaimable)
+               return;
+
+       /* clear the reclaim tag from the perag radix tree */
+       spin_lock(&mp->m_perag_lock);
+       radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno,
+                            XFS_ICI_RECLAIM_TAG);
+       spin_unlock(&mp->m_perag_lock);
+       trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
+}
+
+
+/*
+ * We set the inode flag atomically with the radix tree tag.
+ * Once we get tag lookups on the radix tree, this inode flag
+ * can go away.
+ */
+void
+xfs_inode_set_reclaim_tag(
+       struct xfs_inode        *ip)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_perag        *pag;
+
+       pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+       spin_lock(&pag->pag_ici_lock);
+       spin_lock(&ip->i_flags_lock);
+
+       radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino),
+                          XFS_ICI_RECLAIM_TAG);
+       xfs_perag_set_reclaim_tag(pag);
+       __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
+
+       spin_unlock(&ip->i_flags_lock);
+       spin_unlock(&pag->pag_ici_lock);
+       xfs_perag_put(pag);
+}
+
+STATIC void
+xfs_inode_clear_reclaim_tag(
+       struct xfs_perag        *pag,
+       xfs_ino_t               ino)
+{
+       radix_tree_tag_clear(&pag->pag_ici_root,
+                            XFS_INO_TO_AGINO(pag->pag_mount, ino),
+                            XFS_ICI_RECLAIM_TAG);
+       xfs_perag_clear_reclaim_tag(pag);
 }
 
 /*
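The two new helpers keep the per-AG radix-tree tag in step with a plain counter: only the 0 to 1 transition of pag_ici_reclaimable propagates XFS_ICI_RECLAIM_TAG up into the per-mount tree (and queues the background reclaim worker), and only the 1 to 0 transition clears it again. The post- and pre-increment tests used above, spelled out:

        /* set side: tests the value before the increment */
        if (pag->pag_ici_reclaimable++)
                return;         /* tag was already set for this AG */

        /* clear side: tests the value after the decrement */
        if (--pag->pag_ici_reclaimable)
                return;         /* other reclaimable inodes remain in this AG */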
@@ -264,7 +384,7 @@ xfs_iget_cache_hit(
                 */
                ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
                ip->i_flags |= XFS_INEW;
-               __xfs_inode_clear_reclaim_tag(mp, pag, ip);
+               xfs_inode_clear_reclaim_tag(pag, ip->i_ino);
                inode->i_state = I_NEW;
 
                ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
@@ -722,121 +842,6 @@ xfs_inode_ag_iterator_tag(
        return last_error;
 }
 
-/*
- * Queue a new inode reclaim pass if there are reclaimable inodes and there
- * isn't a reclaim pass already in progress. By default it runs every 5s based
- * on the xfs periodic sync default of 30s. Perhaps this should have it's own
- * tunable, but that can be done if this method proves to be ineffective or too
- * aggressive.
- */
-static void
-xfs_reclaim_work_queue(
-       struct xfs_mount        *mp)
-{
-
-       rcu_read_lock();
-       if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
-               queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
-                       msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
-       }
-       rcu_read_unlock();
-}
-
-/*
- * This is a fast pass over the inode cache to try to get reclaim moving on as
- * many inodes as possible in a short period of time. It kicks itself every few
- * seconds, as well as being kicked by the inode cache shrinker when memory
- * goes low. It scans as quickly as possible avoiding locked inodes or those
- * already being flushed, and once done schedules a future pass.
- */
-void
-xfs_reclaim_worker(
-       struct work_struct *work)
-{
-       struct xfs_mount *mp = container_of(to_delayed_work(work),
-                                       struct xfs_mount, m_reclaim_work);
-
-       xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
-       xfs_reclaim_work_queue(mp);
-}
-
-static void
-__xfs_inode_set_reclaim_tag(
-       struct xfs_perag        *pag,
-       struct xfs_inode        *ip)
-{
-       radix_tree_tag_set(&pag->pag_ici_root,
-                          XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
-                          XFS_ICI_RECLAIM_TAG);
-
-       if (!pag->pag_ici_reclaimable) {
-               /* propagate the reclaim tag up into the perag radix tree */
-               spin_lock(&ip->i_mount->m_perag_lock);
-               radix_tree_tag_set(&ip->i_mount->m_perag_tree,
-                               XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
-                               XFS_ICI_RECLAIM_TAG);
-               spin_unlock(&ip->i_mount->m_perag_lock);
-
-               /* schedule periodic background inode reclaim */
-               xfs_reclaim_work_queue(ip->i_mount);
-
-               trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
-                                                       -1, _RET_IP_);
-       }
-       pag->pag_ici_reclaimable++;
-}
-
-/*
- * We set the inode flag atomically with the radix tree tag.
- * Once we get tag lookups on the radix tree, this inode flag
- * can go away.
- */
-void
-xfs_inode_set_reclaim_tag(
-       xfs_inode_t     *ip)
-{
-       struct xfs_mount *mp = ip->i_mount;
-       struct xfs_perag *pag;
-
-       pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-       spin_lock(&pag->pag_ici_lock);
-       spin_lock(&ip->i_flags_lock);
-       __xfs_inode_set_reclaim_tag(pag, ip);
-       __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
-       spin_unlock(&ip->i_flags_lock);
-       spin_unlock(&pag->pag_ici_lock);
-       xfs_perag_put(pag);
-}
-
-STATIC void
-__xfs_inode_clear_reclaim(
-       xfs_perag_t     *pag,
-       xfs_inode_t     *ip)
-{
-       pag->pag_ici_reclaimable--;
-       if (!pag->pag_ici_reclaimable) {
-               /* clear the reclaim tag from the perag radix tree */
-               spin_lock(&ip->i_mount->m_perag_lock);
-               radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
-                               XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
-                               XFS_ICI_RECLAIM_TAG);
-               spin_unlock(&ip->i_mount->m_perag_lock);
-               trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
-                                                       -1, _RET_IP_);
-       }
-}
-
-STATIC void
-__xfs_inode_clear_reclaim_tag(
-       xfs_mount_t     *mp,
-       xfs_perag_t     *pag,
-       xfs_inode_t     *ip)
-{
-       radix_tree_tag_clear(&pag->pag_ici_root,
-                       XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
-       __xfs_inode_clear_reclaim(pag, ip);
-}
-
 /*
  * Grab the inode for reclaim exclusively.
  * Return 0 if we grabbed it, non-zero otherwise.
@@ -929,6 +934,7 @@ xfs_reclaim_inode(
        int                     sync_mode)
 {
        struct xfs_buf          *bp = NULL;
+       xfs_ino_t               ino = ip->i_ino; /* for radix_tree_delete */
        int                     error;
 
 restart:
@@ -993,6 +999,22 @@ restart:
 
        xfs_iflock(ip);
 reclaim:
+       /*
+        * Because we use RCU freeing we need to ensure the inode always appears
+        * to be reclaimed with an invalid inode number when in the free state.
+        * We do this as early as possible under the ILOCK and flush lock so
+        * that xfs_iflush_cluster() can be guaranteed to detect races with us
+        * here. By doing this, we guarantee that once xfs_iflush_cluster has
+        * locked both the XFS_ILOCK and the flush lock that it will see either
+        * a valid, flushable inode that will serialise correctly against the
+        * locks below, or it will see a clean (and invalid) inode that it can
+        * skip.
+        */
+       spin_lock(&ip->i_flags_lock);
+       ip->i_flags = XFS_IRECLAIM;
+       ip->i_ino = 0;
+       spin_unlock(&ip->i_flags_lock);
+
        xfs_ifunlock(ip);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
 
@@ -1006,9 +1028,9 @@ reclaim:
         */
        spin_lock(&pag->pag_ici_lock);
        if (!radix_tree_delete(&pag->pag_ici_root,
-                               XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
+                               XFS_INO_TO_AGINO(ip->i_mount, ino)))
                ASSERT(0);
-       __xfs_inode_clear_reclaim(pag, ip);
+       xfs_perag_clear_reclaim_tag(pag);
        spin_unlock(&pag->pag_ici_lock);
 
        /*
@@ -1023,7 +1045,7 @@ reclaim:
        xfs_qm_dqdetach(ip);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
 
-       xfs_inode_free(ip);
+       __xfs_inode_free(ip);
        return error;
 
 out_ifunlock:
index 96f606d..ee6799e 100644
@@ -1030,7 +1030,7 @@ xfs_dir_ialloc(
                        tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
                }
 
-               code = xfs_trans_roll(&tp, 0);
+               code = xfs_trans_roll(&tp, NULL);
                if (committed != NULL)
                        *committed = 1;
 
@@ -1161,11 +1161,9 @@ xfs_create(
                rdev = 0;
                resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
                tres = &M_RES(mp)->tr_mkdir;
-               tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
        } else {
                resblks = XFS_CREATE_SPACE_RES(mp, name->len);
                tres = &M_RES(mp)->tr_create;
-               tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
        }
 
        /*
@@ -1174,20 +1172,19 @@ xfs_create(
         * the case we'll drop the one we have and get a more
         * appropriate transaction later.
         */
-       error = xfs_trans_reserve(tp, tres, resblks, 0);
+       error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
        if (error == -ENOSPC) {
                /* flush outstanding delalloc blocks and retry */
                xfs_flush_inodes(mp);
-               error = xfs_trans_reserve(tp, tres, resblks, 0);
+               error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
        }
        if (error == -ENOSPC) {
                /* No space at all so try a "no-allocation" reservation */
                resblks = 0;
-               error = xfs_trans_reserve(tp, tres, 0, 0);
+               error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp);
        }
        if (error)
-               goto out_trans_cancel;
-
+               goto out_release_inode;
 
        xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL |
                      XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
@@ -1337,17 +1334,16 @@ xfs_create_tmpfile(
                return error;
 
        resblks = XFS_IALLOC_SPACE_RES(mp);
-       tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE_TMPFILE);
-
        tres = &M_RES(mp)->tr_create_tmpfile;
-       error = xfs_trans_reserve(tp, tres, resblks, 0);
+
+       error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
        if (error == -ENOSPC) {
                /* No space at all so try a "no-allocation" reservation */
                resblks = 0;
-               error = xfs_trans_reserve(tp, tres, 0, 0);
+               error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp);
        }
        if (error)
-               goto out_trans_cancel;
+               goto out_release_inode;
 
        error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
                                                pdqp, resblks, 1, 0);
@@ -1432,15 +1428,14 @@ xfs_link(
        if (error)
                goto std_return;
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
        resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, resblks, 0, 0, &tp);
        if (error == -ENOSPC) {
                resblks = 0;
-               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0);
+               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &tp);
        }
        if (error)
-               goto error_return;
+               goto std_return;
 
        xfs_ilock(tdp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
        xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
@@ -1710,11 +1705,9 @@ xfs_inactive_truncate(
        struct xfs_trans        *tp;
        int                     error;
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
        if (error) {
                ASSERT(XFS_FORCED_SHUTDOWN(mp));
-               xfs_trans_cancel(tp);
                return error;
        }
 
@@ -1764,8 +1757,6 @@ xfs_inactive_ifree(
        struct xfs_trans        *tp;
        int                     error;
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
-
        /*
         * The ifree transaction might need to allocate blocks for record
         * insertion to the finobt. We don't want to fail here at ENOSPC, so
@@ -1781,9 +1772,8 @@ xfs_inactive_ifree(
         * now remains allocated and sits on the unlinked list until the fs is
         * repaired.
         */
-       tp->t_flags |= XFS_TRANS_RESERVE;
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree,
-                                 XFS_IFREE_SPACE_RES(mp), 0);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
+                       XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
        if (error) {
                if (error == -ENOSPC) {
                        xfs_warn_ratelimited(mp,
@@ -1792,7 +1782,6 @@ xfs_inactive_ifree(
                } else {
                        ASSERT(XFS_FORCED_SHUTDOWN(mp));
                }
-               xfs_trans_cancel(tp);
                return error;
        }
 
@@ -2525,11 +2514,6 @@ xfs_remove(
        if (error)
                goto std_return;
 
-       if (is_dir)
-               tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
-       else
-               tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
-
        /*
         * We try to get the real space reservation first,
         * allowing for directory btree deletion(s) implying
@@ -2540,14 +2524,15 @@ xfs_remove(
         * block from the directory.
         */
        resblks = XFS_REMOVE_SPACE_RES(mp);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, resblks, 0, 0, &tp);
        if (error == -ENOSPC) {
                resblks = 0;
-               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0);
+               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, 0, 0, 0,
+                               &tp);
        }
        if (error) {
                ASSERT(error != -ENOSPC);
-               goto out_trans_cancel;
+               goto std_return;
        }
 
        xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
@@ -2855,6 +2840,7 @@ xfs_rename_alloc_whiteout(
         * and flag it as linkable.
         */
        drop_nlink(VFS_I(tmpfile));
+       xfs_setup_iops(tmpfile);
        xfs_finish_inode_setup(tmpfile);
        VFS_I(tmpfile)->i_state |= I_LINKABLE;
 
@@ -2910,15 +2896,15 @@ xfs_rename(
        xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
                                inodes, &num_inodes);
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
        spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp);
        if (error == -ENOSPC) {
                spaceres = 0;
-               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0);
+               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0,
+                               &tp);
        }
        if (error)
-               goto out_trans_cancel;
+               goto out_release_wip;
 
        /*
         * Attach the dquots to the inodes
@@ -3155,6 +3141,7 @@ out_bmap_cancel:
        xfs_bmap_cancel(&free_list);
 out_trans_cancel:
        xfs_trans_cancel(tp);
+out_release_wip:
        if (wip)
                IRELE(wip);
        return error;
@@ -3162,16 +3149,16 @@ out_trans_cancel:
 
 STATIC int
 xfs_iflush_cluster(
-       xfs_inode_t     *ip,
-       xfs_buf_t       *bp)
+       struct xfs_inode        *ip,
+       struct xfs_buf          *bp)
 {
-       xfs_mount_t             *mp = ip->i_mount;
+       struct xfs_mount        *mp = ip->i_mount;
        struct xfs_perag        *pag;
        unsigned long           first_index, mask;
        unsigned long           inodes_per_cluster;
-       int                     ilist_size;
-       xfs_inode_t             **ilist;
-       xfs_inode_t             *iq;
+       int                     cilist_size;
+       struct xfs_inode        **cilist;
+       struct xfs_inode        *cip;
        int                     nr_found;
        int                     clcount = 0;
        int                     bufwasdelwri;
@@ -3180,23 +3167,23 @@ xfs_iflush_cluster(
        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
 
        inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
-       ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
-       ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
-       if (!ilist)
+       cilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
+       cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS);
+       if (!cilist)
                goto out_put;
 
        mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1);
        first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
        rcu_read_lock();
        /* really need a gang lookup range call here */
-       nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
+       nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist,
                                        first_index, inodes_per_cluster);
        if (nr_found == 0)
                goto out_free;
 
        for (i = 0; i < nr_found; i++) {
-               iq = ilist[i];
-               if (iq == ip)
+               cip = cilist[i];
+               if (cip == ip)
                        continue;
 
                /*
@@ -3205,20 +3192,30 @@ xfs_iflush_cluster(
                 * We need to check under the i_flags_lock for a valid inode
                 * here. Skip it if it is not valid or the wrong inode.
                 */
-               spin_lock(&ip->i_flags_lock);
-               if (!ip->i_ino ||
-                   (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
-                       spin_unlock(&ip->i_flags_lock);
+               spin_lock(&cip->i_flags_lock);
+               if (!cip->i_ino ||
+                   __xfs_iflags_test(cip, XFS_ISTALE)) {
+                       spin_unlock(&cip->i_flags_lock);
                        continue;
                }
-               spin_unlock(&ip->i_flags_lock);
+
+               /*
+                * Once we fall off the end of the cluster, no point checking
+                * any more inodes in the list because they will also all be
+                * outside the cluster.
+                */
+               if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) {
+                       spin_unlock(&cip->i_flags_lock);
+                       break;
+               }
+               spin_unlock(&cip->i_flags_lock);
 
                /*
                 * Do an un-protected check to see if the inode is dirty and
                 * is a candidate for flushing.  These checks will be repeated
                 * later after the appropriate locks are acquired.
                 */
-               if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0)
+               if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0)
                        continue;
 
                /*
@@ -3226,15 +3223,28 @@ xfs_iflush_cluster(
                 * then this inode cannot be flushed and is skipped.
                 */
 
-               if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED))
+               if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED))
+                       continue;
+               if (!xfs_iflock_nowait(cip)) {
+                       xfs_iunlock(cip, XFS_ILOCK_SHARED);
                        continue;
-               if (!xfs_iflock_nowait(iq)) {
-                       xfs_iunlock(iq, XFS_ILOCK_SHARED);
+               }
+               if (xfs_ipincount(cip)) {
+                       xfs_ifunlock(cip);
+                       xfs_iunlock(cip, XFS_ILOCK_SHARED);
                        continue;
                }
-               if (xfs_ipincount(iq)) {
-                       xfs_ifunlock(iq);
-                       xfs_iunlock(iq, XFS_ILOCK_SHARED);
+
+
+               /*
+                * Check the inode number again, just to be certain we are not
+                * racing with freeing in xfs_reclaim_inode(). See the comments
+                * in that function for more information as to why the initial
+                * check is not sufficient.
+                */
+               if (!cip->i_ino) {
+                       xfs_ifunlock(cip);
+                       xfs_iunlock(cip, XFS_ILOCK_SHARED);
                        continue;
                }
 
@@ -3242,18 +3252,18 @@ xfs_iflush_cluster(
                 * arriving here means that this inode can be flushed.  First
                 * re-check that it's dirty before flushing.
                 */
-               if (!xfs_inode_clean(iq)) {
+               if (!xfs_inode_clean(cip)) {
                        int     error;
-                       error = xfs_iflush_int(iq, bp);
+                       error = xfs_iflush_int(cip, bp);
                        if (error) {
-                               xfs_iunlock(iq, XFS_ILOCK_SHARED);
+                               xfs_iunlock(cip, XFS_ILOCK_SHARED);
                                goto cluster_corrupt_out;
                        }
                        clcount++;
                } else {
-                       xfs_ifunlock(iq);
+                       xfs_ifunlock(cip);
                }
-               xfs_iunlock(iq, XFS_ILOCK_SHARED);
+               xfs_iunlock(cip, XFS_ILOCK_SHARED);
        }
 
        if (clcount) {
@@ -3263,7 +3273,7 @@ xfs_iflush_cluster(
 
 out_free:
        rcu_read_unlock();
-       kmem_free(ilist);
+       kmem_free(cilist);
 out_put:
        xfs_perag_put(pag);
        return 0;
@@ -3306,8 +3316,8 @@ cluster_corrupt_out:
        /*
         * Unlocks the flush lock
         */
-       xfs_iflush_abort(iq, false);
-       kmem_free(ilist);
+       xfs_iflush_abort(cip, false);
+       kmem_free(cilist);
        xfs_perag_put(pag);
        return -EFSCORRUPTED;
 }
@@ -3327,7 +3337,7 @@ xfs_iflush(
        struct xfs_buf          **bpp)
 {
        struct xfs_mount        *mp = ip->i_mount;
-       struct xfs_buf          *bp;
+       struct xfs_buf          *bp = NULL;
        struct xfs_dinode       *dip;
        int                     error;
 
@@ -3369,14 +3379,22 @@ xfs_iflush(
        }
 
        /*
-        * Get the buffer containing the on-disk inode.
+        * Get the buffer containing the on-disk inode. We are doing a try-lock
+        * operation here, so we may get  an EAGAIN error. In that case, we
+        * simply want to return with the inode still dirty.
+        *
+        * If we get any other error, we effectively have a corruption situation
+        * and we cannot flush the inode, so we treat it the same as failing
+        * xfs_iflush_int().
         */
        error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK,
                               0);
-       if (error || !bp) {
+       if (error == -EAGAIN) {
                xfs_ifunlock(ip);
                return error;
        }
+       if (error)
+               goto corrupt_out;
 
        /*
         * First flush out the inode that xfs_iflush was called with.
@@ -3404,7 +3422,8 @@ xfs_iflush(
        return 0;
 
 corrupt_out:
-       xfs_buf_relse(bp);
+       if (bp)
+               xfs_buf_relse(bp);
        xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 cluster_corrupt_out:
        error = -EFSCORRUPTED;
index 43e1d51..e52d7c7 100644
@@ -440,6 +440,9 @@ loff_t      __xfs_seek_hole_data(struct inode *inode, loff_t start,
 
 
 /* from xfs_iops.c */
+extern void xfs_setup_inode(struct xfs_inode *ip);
+extern void xfs_setup_iops(struct xfs_inode *ip);
+
 /*
  * When setting up a newly allocated inode, we need to call
  * xfs_finish_inode_setup() once the inode is fully instantiated at
@@ -447,7 +450,6 @@ loff_t      __xfs_seek_hole_data(struct inode *inode, loff_t start,
  * before we've completed instantiation. Otherwise we can do it
  * the moment the inode lookup is complete.
  */
-extern void xfs_setup_inode(struct xfs_inode *ip);
 static inline void xfs_finish_inode_setup(struct xfs_inode *ip)
 {
        xfs_iflags_clear(ip, XFS_INEW);
@@ -458,6 +460,7 @@ static inline void xfs_finish_inode_setup(struct xfs_inode *ip)
 static inline void xfs_setup_existing_inode(struct xfs_inode *ip)
 {
        xfs_setup_inode(ip);
+       xfs_setup_iops(ip);
        xfs_finish_inode_setup(ip);
 }
 
index c48b5b1..a1b0761 100644
@@ -210,7 +210,7 @@ xfs_inode_item_format_data_fork(
                         */
                        data_bytes = roundup(ip->i_df.if_bytes, 4);
                        ASSERT(ip->i_df.if_real_bytes == 0 ||
-                              ip->i_df.if_real_bytes == data_bytes);
+                              ip->i_df.if_real_bytes >= data_bytes);
                        ASSERT(ip->i_df.if_u1.if_data != NULL);
                        ASSERT(ip->i_d.di_size > 0);
                        xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL,
@@ -305,7 +305,7 @@ xfs_inode_item_format_attr_fork(
                         */
                        data_bytes = roundup(ip->i_afp->if_bytes, 4);
                        ASSERT(ip->i_afp->if_real_bytes == 0 ||
-                              ip->i_afp->if_real_bytes == data_bytes);
+                              ip->i_afp->if_real_bytes >= data_bytes);
                        ASSERT(ip->i_afp->if_u1.if_data != NULL);
                        xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL,
                                        ip->i_afp->if_u1.if_data,
@@ -479,6 +479,8 @@ STATIC uint
 xfs_inode_item_push(
        struct xfs_log_item     *lip,
        struct list_head        *buffer_list)
+               __releases(&lip->li_ailp->xa_lock)
+               __acquires(&lip->li_ailp->xa_lock)
 {
        struct xfs_inode_log_item *iip = INODE_ITEM(lip);
        struct xfs_inode        *ip = iip->ili_inode;
index bcb6c19..dbca737 100644
@@ -277,7 +277,6 @@ xfs_readlink_by_handle(
 {
        struct dentry           *dentry;
        __u32                   olen;
-       void                    *link;
        int                     error;
 
        if (!capable(CAP_SYS_ADMIN))
@@ -288,7 +287,7 @@ xfs_readlink_by_handle(
                return PTR_ERR(dentry);
 
        /* Restrict this handle operation to symlinks only. */
-       if (!d_is_symlink(dentry)) {
+       if (!d_inode(dentry)->i_op->readlink) {
                error = -EINVAL;
                goto out_dput;
        }
@@ -298,21 +297,8 @@ xfs_readlink_by_handle(
                goto out_dput;
        }
 
-       link = kmalloc(MAXPATHLEN+1, GFP_KERNEL);
-       if (!link) {
-               error = -ENOMEM;
-               goto out_dput;
-       }
-
-       error = xfs_readlink(XFS_I(d_inode(dentry)), link);
-       if (error)
-               goto out_kfree;
-       error = readlink_copy(hreq->ohandle, olen, link);
-       if (error)
-               goto out_kfree;
+       error = d_inode(dentry)->i_op->readlink(dentry, hreq->ohandle, olen);
 
- out_kfree:
-       kfree(link);
  out_dput:
        dput(dentry);
        return error;
@@ -334,12 +320,10 @@ xfs_set_dmattrs(
        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
-       if (error) {
-               xfs_trans_cancel(tp);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+       if (error)
                return error;
-       }
+
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 
@@ -1141,10 +1125,9 @@ xfs_ioctl_setattr_get_trans(
        if (XFS_FORCED_SHUTDOWN(mp))
                goto out_unlock;
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
        if (error)
-               goto out_cancel;
+               return ERR_PTR(error);
 
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | join_flags);
index d81bdc0..5839135 100644
@@ -132,6 +132,7 @@ xfs_iomap_write_direct(
        int             error;
        int             lockmode;
        int             bmapi_flags = XFS_BMAPI_PREALLOC;
+       uint            tflags = 0;
 
        rt = XFS_IS_REALTIME_INODE(ip);
        extsz = xfs_get_extsz_hint(ip);
@@ -191,11 +192,6 @@ xfs_iomap_write_direct(
        if (error)
                return error;
 
-       /*
-        * Allocate and setup the transaction
-        */
-       tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
-
        /*
         * For DAX, we do not allocate unwritten extents, but instead we zero
         * the block before we commit the transaction.  Ideally we'd like to do
@@ -209,23 +205,17 @@ xfs_iomap_write_direct(
         * the reserve block pool for bmbt block allocation if there is no space
         * left but we need to do unwritten extent conversion.
         */
-
        if (IS_DAX(VFS_I(ip))) {
                bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
                if (ISUNWRITTEN(imap)) {
-                       tp->t_flags |= XFS_TRANS_RESERVE;
+                       tflags |= XFS_TRANS_RESERVE;
                        resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
                }
        }
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
-                                 resblks, resrtextents);
-       /*
-        * Check for running out of space, note: need lock to return
-        */
-       if (error) {
-               xfs_trans_cancel(tp);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, resrtextents,
+                       tflags, &tp);
+       if (error)
                return error;
-       }
 
        lockmode = XFS_ILOCK_EXCL;
        xfs_ilock(ip, lockmode);
@@ -726,15 +716,13 @@ xfs_iomap_write_allocate(
 
                nimaps = 0;
                while (nimaps == 0) {
-                       tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
-                       tp->t_flags |= XFS_TRANS_RESERVE;
                        nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
-                       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
-                                                 nres, 0);
-                       if (error) {
-                               xfs_trans_cancel(tp);
+
+                       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, nres,
+                                       0, XFS_TRANS_RESERVE, &tp);
+                       if (error)
                                return error;
-                       }
+
                        xfs_ilock(ip, XFS_ILOCK_EXCL);
                        xfs_trans_ijoin(tp, ip, 0);
 
@@ -878,25 +866,18 @@ xfs_iomap_write_unwritten(
 
        do {
                /*
-                * set up a transaction to convert the range of extents
+                * Set up a transaction to convert the range of extents
                 * from unwritten to real. Do allocations in a loop until
                 * we have covered the range passed in.
                 *
-                * Note that we open code the transaction allocation here
-                * to pass KM_NOFS--we can't risk to recursing back into
-                * the filesystem here as we might be asked to write out
-                * the same inode that we complete here and might deadlock
-                * on the iolock.
+                * Note that we can't risk recursing back into the filesystem
+                * here as we might be asked to write out the same inode that we
+                * complete here and might deadlock on the iolock.
                 */
-               sb_start_intwrite(mp->m_super);
-               tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS);
-               tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT;
-               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
-                                         resblks, 0);
-               if (error) {
-                       xfs_trans_cancel(tp);
+               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
+                               XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp);
+               if (error)
                        return error;
-               }
 
                xfs_ilock(ip, XFS_ILOCK_EXCL);
                xfs_trans_ijoin(tp, ip, 0);
index fb7dc61..c5d4eba 100644
@@ -181,6 +181,8 @@ xfs_generic_create(
        }
 #endif
 
+       xfs_setup_iops(ip);
+
        if (tmpfile)
                d_tmpfile(dentry, inode);
        else
@@ -368,6 +370,8 @@ xfs_vn_symlink(
        if (unlikely(error))
                goto out_cleanup_inode;
 
+       xfs_setup_iops(cip);
+
        d_instantiate(dentry, inode);
        xfs_finish_inode_setup(cip);
        return 0;
@@ -442,6 +446,16 @@ xfs_vn_get_link(
        return ERR_PTR(error);
 }
 
+STATIC const char *
+xfs_vn_get_link_inline(
+       struct dentry           *dentry,
+       struct inode            *inode,
+       struct delayed_call     *done)
+{
+       ASSERT(XFS_I(inode)->i_df.if_flags & XFS_IFINLINE);
+       return XFS_I(inode)->i_df.if_u1.if_data;
+}
+
 STATIC int
 xfs_vn_getattr(
        struct vfsmount         *mnt,
@@ -599,12 +613,12 @@ xfs_setattr_nonsize(
                        return error;
        }
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
        if (error)
-               goto out_trans_cancel;
+               goto out_dqrele;
 
        xfs_ilock(ip, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, ip, 0);
 
        /*
         * Change file ownership.  Must be the owner or privileged.
@@ -633,12 +647,10 @@ xfs_setattr_nonsize(
                                                NULL, capable(CAP_FOWNER) ?
                                                XFS_QMOPT_FORCE_RES : 0);
                        if (error)      /* out of quota */
-                               goto out_unlock;
+                               goto out_cancel;
                }
        }
 
-       xfs_trans_ijoin(tp, ip, 0);
-
        /*
         * Change file ownership.  Must be the owner or privileged.
         */
@@ -722,10 +734,9 @@ xfs_setattr_nonsize(
 
        return 0;
 
-out_unlock:
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-out_trans_cancel:
+out_cancel:
        xfs_trans_cancel(tp);
+out_dqrele:
        xfs_qm_dqrele(udqp);
        xfs_qm_dqrele(gdqp);
        return error;
@@ -834,7 +845,7 @@ xfs_setattr_size(
         * We have to do all the page cache truncate work outside the
         * transaction context as the "lock" order is page lock->log space
         * reservation as defined by extent allocation in the writeback path.
-        * Hence a truncate can fail with ENOMEM from xfs_trans_reserve(), but
+        * Hence a truncate can fail with ENOMEM from xfs_trans_alloc(), but
         * having already truncated the in-memory version of the file (i.e. made
         * user visible changes). There's not much we can do about this, except
         * to hope that the caller sees ENOMEM and retries the truncate
@@ -849,10 +860,9 @@ xfs_setattr_size(
                return error;
        truncate_setsize(inode, newsize);
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
        if (error)
-               goto out_trans_cancel;
+               return error;
 
        lock_flags |= XFS_ILOCK_EXCL;
        xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -971,12 +981,9 @@ xfs_vn_update_time(
 
        trace_xfs_update_time(ip);
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
-       if (error) {
-               xfs_trans_cancel(tp);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
+       if (error)
                return error;
-       }
 
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        if (flags & S_CTIME)
@@ -1167,6 +1174,18 @@ static const struct inode_operations xfs_symlink_inode_operations = {
        .update_time            = xfs_vn_update_time,
 };
 
+static const struct inode_operations xfs_inline_symlink_inode_operations = {
+       .readlink               = generic_readlink,
+       .get_link               = xfs_vn_get_link_inline,
+       .getattr                = xfs_vn_getattr,
+       .setattr                = xfs_vn_setattr,
+       .setxattr               = generic_setxattr,
+       .getxattr               = generic_getxattr,
+       .removexattr            = generic_removexattr,
+       .listxattr              = xfs_vn_listxattr,
+       .update_time            = xfs_vn_update_time,
+};
+
 STATIC void
 xfs_diflags_to_iflags(
        struct inode            *inode,
@@ -1193,7 +1212,7 @@ xfs_diflags_to_iflags(
 }
 
 /*
- * Initialize the Linux inode and set up the operation vectors.
+ * Initialize the Linux inode.
  *
  * When reading existing inodes from disk this is called directly from xfs_iget,
  * when creating a new inode it is called from xfs_ialloc after setting up the
@@ -1232,32 +1251,12 @@ xfs_setup_inode(
        i_size_write(inode, ip->i_d.di_size);
        xfs_diflags_to_iflags(inode, ip);
 
-       ip->d_ops = ip->i_mount->m_nondir_inode_ops;
-       lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class);
-       switch (inode->i_mode & S_IFMT) {
-       case S_IFREG:
-               inode->i_op = &xfs_inode_operations;
-               inode->i_fop = &xfs_file_operations;
-               inode->i_mapping->a_ops = &xfs_address_space_operations;
-               break;
-       case S_IFDIR:
+       if (S_ISDIR(inode->i_mode)) {
                lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class);
-               if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
-                       inode->i_op = &xfs_dir_ci_inode_operations;
-               else
-                       inode->i_op = &xfs_dir_inode_operations;
-               inode->i_fop = &xfs_dir_file_operations;
                ip->d_ops = ip->i_mount->m_dir_inode_ops;
-               break;
-       case S_IFLNK:
-               inode->i_op = &xfs_symlink_inode_operations;
-               if (!(ip->i_df.if_flags & XFS_IFINLINE))
-                       inode->i_mapping->a_ops = &xfs_address_space_operations;
-               break;
-       default:
-               inode->i_op = &xfs_inode_operations;
-               init_special_inode(inode, inode->i_mode, inode->i_rdev);
-               break;
+       } else {
+               ip->d_ops = ip->i_mount->m_nondir_inode_ops;
+               lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class);
        }
 
        /*
@@ -1277,3 +1276,35 @@ xfs_setup_inode(
                cache_no_acl(inode);
        }
 }
+
+void
+xfs_setup_iops(
+       struct xfs_inode        *ip)
+{
+       struct inode            *inode = &ip->i_vnode;
+
+       switch (inode->i_mode & S_IFMT) {
+       case S_IFREG:
+               inode->i_op = &xfs_inode_operations;
+               inode->i_fop = &xfs_file_operations;
+               inode->i_mapping->a_ops = &xfs_address_space_operations;
+               break;
+       case S_IFDIR:
+               if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
+                       inode->i_op = &xfs_dir_ci_inode_operations;
+               else
+                       inode->i_op = &xfs_dir_inode_operations;
+               inode->i_fop = &xfs_dir_file_operations;
+               break;
+       case S_IFLNK:
+               if (ip->i_df.if_flags & XFS_IFINLINE)
+                       inode->i_op = &xfs_inline_symlink_inode_operations;
+               else
+                       inode->i_op = &xfs_symlink_inode_operations;
+               break;
+       default:
+               inode->i_op = &xfs_inode_operations;
+               init_special_inode(inode, inode->i_mode, inode->i_rdev);
+               break;
+       }
+}
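
Pulling the operation-vector setup out of xfs_setup_inode() into the new xfs_setup_iops() lets the symlink case above choose between the inline and remote vectors once the data fork format (XFS_IFINLINE) is actually known. A minimal sketch of the intended ordering, assuming the call sites remain in the xfs_iget()/xfs_ialloc() paths as the existing comment describes:

        xfs_setup_inode(ip);    /* VFS inode fields, lockdep class, ip->d_ops */
        /* ... inode fork is read from disk or initialised ... */
        xfs_setup_iops(ip);     /* pick i_op/i_fop/a_ops, incl. inline symlinks */
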
index b49ccf5..bde02f1 100644 (file)
@@ -435,8 +435,7 @@ xfs_log_reserve(
        int                     cnt,
        struct xlog_ticket      **ticp,
        __uint8_t               client,
-       bool                    permanent,
-       uint                    t_type)
+       bool                    permanent)
 {
        struct xlog             *log = mp->m_log;
        struct xlog_ticket      *tic;
@@ -456,7 +455,6 @@ xfs_log_reserve(
        if (!tic)
                return -ENOMEM;
 
-       tic->t_trans_type = t_type;
        *ticp = tic;
 
        xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
@@ -823,8 +821,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
        } while (iclog != first_iclog);
 #endif
        if (! (XLOG_FORCED_SHUTDOWN(log))) {
-               error = xfs_log_reserve(mp, 600, 1, &tic,
-                                       XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE);
+               error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0);
                if (!error) {
                        /* the data section must be 32 bit size aligned */
                        struct {
@@ -2032,58 +2029,8 @@ xlog_print_tic_res(
            REG_TYPE_STR(ICREATE, "inode create")
        };
 #undef REG_TYPE_STR
-#define TRANS_TYPE_STR(type)   [XFS_TRANS_##type] = #type
-       static char *trans_type_str[XFS_TRANS_TYPE_MAX] = {
-           TRANS_TYPE_STR(SETATTR_NOT_SIZE),
-           TRANS_TYPE_STR(SETATTR_SIZE),
-           TRANS_TYPE_STR(INACTIVE),
-           TRANS_TYPE_STR(CREATE),
-           TRANS_TYPE_STR(CREATE_TRUNC),
-           TRANS_TYPE_STR(TRUNCATE_FILE),
-           TRANS_TYPE_STR(REMOVE),
-           TRANS_TYPE_STR(LINK),
-           TRANS_TYPE_STR(RENAME),
-           TRANS_TYPE_STR(MKDIR),
-           TRANS_TYPE_STR(RMDIR),
-           TRANS_TYPE_STR(SYMLINK),
-           TRANS_TYPE_STR(SET_DMATTRS),
-           TRANS_TYPE_STR(GROWFS),
-           TRANS_TYPE_STR(STRAT_WRITE),
-           TRANS_TYPE_STR(DIOSTRAT),
-           TRANS_TYPE_STR(WRITEID),
-           TRANS_TYPE_STR(ADDAFORK),
-           TRANS_TYPE_STR(ATTRINVAL),
-           TRANS_TYPE_STR(ATRUNCATE),
-           TRANS_TYPE_STR(ATTR_SET),
-           TRANS_TYPE_STR(ATTR_RM),
-           TRANS_TYPE_STR(ATTR_FLAG),
-           TRANS_TYPE_STR(CLEAR_AGI_BUCKET),
-           TRANS_TYPE_STR(SB_CHANGE),
-           TRANS_TYPE_STR(DUMMY1),
-           TRANS_TYPE_STR(DUMMY2),
-           TRANS_TYPE_STR(QM_QUOTAOFF),
-           TRANS_TYPE_STR(QM_DQALLOC),
-           TRANS_TYPE_STR(QM_SETQLIM),
-           TRANS_TYPE_STR(QM_DQCLUSTER),
-           TRANS_TYPE_STR(QM_QINOCREATE),
-           TRANS_TYPE_STR(QM_QUOTAOFF_END),
-           TRANS_TYPE_STR(FSYNC_TS),
-           TRANS_TYPE_STR(GROWFSRT_ALLOC),
-           TRANS_TYPE_STR(GROWFSRT_ZERO),
-           TRANS_TYPE_STR(GROWFSRT_FREE),
-           TRANS_TYPE_STR(SWAPEXT),
-           TRANS_TYPE_STR(CHECKPOINT),
-           TRANS_TYPE_STR(ICREATE),
-           TRANS_TYPE_STR(CREATE_TMPFILE)
-       };
-#undef TRANS_TYPE_STR
 
        xfs_warn(mp, "xlog_write: reservation summary:");
-       xfs_warn(mp, "  trans type  = %s (%u)",
-                ((ticket->t_trans_type <= 0 ||
-                  ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
-                 "bad-trans-type" : trans_type_str[ticket->t_trans_type]),
-                ticket->t_trans_type);
        xfs_warn(mp, "  unit res    = %d bytes",
                 ticket->t_unit_res);
        xfs_warn(mp, "  current res = %d bytes",
@@ -3378,7 +3325,7 @@ xfs_log_force(
 {
        int     error;
 
-       trace_xfs_log_force(mp, 0);
+       trace_xfs_log_force(mp, 0, _RET_IP_);
        error = _xfs_log_force(mp, flags, NULL);
        if (error)
                xfs_warn(mp, "%s: error %d returned.", __func__, error);
@@ -3527,7 +3474,7 @@ xfs_log_force_lsn(
 {
        int     error;
 
-       trace_xfs_log_force(mp, lsn);
+       trace_xfs_log_force(mp, lsn, _RET_IP_);
        error = _xfs_log_force_lsn(mp, lsn, flags, NULL);
        if (error)
                xfs_warn(mp, "%s: error %d returned.", __func__, error);
@@ -3709,7 +3656,6 @@ xlog_ticket_alloc(
        tic->t_tid              = prandom_u32();
        tic->t_clientid         = client;
        tic->t_flags            = XLOG_TIC_INITED;
-       tic->t_trans_type       = 0;
        if (permanent)
                tic->t_flags |= XLOG_TIC_PERM_RESERV;
 
index aa533a7..80ba0c0 100644 (file)
@@ -161,8 +161,7 @@ int   xfs_log_reserve(struct xfs_mount *mp,
                          int              count,
                          struct xlog_ticket **ticket,
                          __uint8_t        clientid,
-                         bool             permanent,
-                         uint             t_type);
+                         bool             permanent);
 int      xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
 int      xfs_log_unmount_write(struct xfs_mount *mp);
 void      xfs_log_unmount(struct xfs_mount *mp);
index 4e76493..5e54e79 100644 (file)
@@ -51,7 +51,6 @@ xlog_cil_ticket_alloc(
 
        tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
                                KM_SLEEP|KM_NOFS);
-       tic->t_trans_type = XFS_TRANS_CHECKPOINT;
 
        /*
         * set the current reservation to zero so we know to steal the basic
index ed88963..765f084 100644 (file)
@@ -175,7 +175,6 @@ typedef struct xlog_ticket {
        char               t_cnt;        /* current count                : 1  */
        char               t_clientid;   /* who does this belong to;     : 1  */
        char               t_flags;      /* properties of reservation    : 1  */
-       uint               t_trans_type; /* transaction type             : 4  */
 
         /* reservation array fields */
        uint               t_res_num;                    /* num in array : 4 */
index 396565f..8359978 100644 (file)
@@ -3843,7 +3843,7 @@ xlog_recover_add_to_cont_trans(
        old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
        old_len = item->ri_buf[item->ri_cnt-1].i_len;
 
-       ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP);
+       ptr = kmem_realloc(old_ptr, len + old_len, KM_SLEEP);
        memcpy(&ptr[old_len], dp, len);
        item->ri_buf[item->ri_cnt-1].i_len += len;
        item->ri_buf[item->ri_cnt-1].i_addr = ptr;
@@ -4205,10 +4205,9 @@ xlog_recover_process_efi(
                }
        }
 
-       tp = xfs_trans_alloc(mp, 0);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
        if (error)
-               goto abort_error;
+               return error;
        efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
 
        for (i = 0; i < efip->efi_format.efi_nextents; i++) {
@@ -4355,10 +4354,9 @@ xlog_recover_clear_agi_bucket(
        int             offset;
        int             error;
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_clearagi, 0, 0);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_clearagi, 0, 0, 0, &tp);
        if (error)
-               goto out_abort;
+               goto out_error;
 
        error = xfs_read_agi(mp, tp, agno, &agibp);
        if (error)
index cfd4210..e39b023 100644 (file)
@@ -89,7 +89,6 @@ xfs_uuid_mount(
        if (hole < 0) {
                xfs_uuid_table = kmem_realloc(xfs_uuid_table,
                        (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
-                       xfs_uuid_table_size  * sizeof(*xfs_uuid_table),
                        KM_SLEEP);
                hole = xfs_uuid_table_size++;
        }
@@ -681,6 +680,9 @@ xfs_mountfs(
 
        xfs_set_maxicount(mp);
 
+       /* enable fail_at_unmount as default */
+       mp->m_fail_unmount = 1;
+
        error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname);
        if (error)
                goto out;
@@ -690,10 +692,15 @@ xfs_mountfs(
        if (error)
                goto out_remove_sysfs;
 
-       error = xfs_uuid_mount(mp);
+       error = xfs_error_sysfs_init(mp);
        if (error)
                goto out_del_stats;
 
+
+       error = xfs_uuid_mount(mp);
+       if (error)
+               goto out_remove_error_sysfs;
+
        /*
         * Set the minimum read and write sizes
         */
@@ -957,6 +964,7 @@ xfs_mountfs(
        cancel_delayed_work_sync(&mp->m_reclaim_work);
        xfs_reclaim_inodes(mp, SYNC_WAIT);
  out_log_dealloc:
+       mp->m_flags |= XFS_MOUNT_UNMOUNTING;
        xfs_log_mount_cancel(mp);
  out_fail_wait:
        if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
@@ -968,6 +976,8 @@ xfs_mountfs(
        xfs_da_unmount(mp);
  out_remove_uuid:
        xfs_uuid_unmount(mp);
+ out_remove_error_sysfs:
+       xfs_error_sysfs_del(mp);
  out_del_stats:
        xfs_sysfs_del(&mp->m_stats.xs_kobj);
  out_remove_sysfs:
@@ -1005,6 +1015,14 @@ xfs_unmountfs(
         */
        xfs_log_force(mp, XFS_LOG_SYNC);
 
+       /*
+        * We now need to tell the world we are unmounting. This will allow
+        * us to detect that the filesystem is going away and we should error
+        * out anything that we have been retrying in the background. This will
+        * prevent neverending retries in AIL pushing from hanging the unmount.
+        */
+       mp->m_flags |= XFS_MOUNT_UNMOUNTING;
+
        /*
         * Flush all pending changes from the AIL.
         */
@@ -1056,6 +1074,7 @@ xfs_unmountfs(
 #endif
        xfs_free_perag(mp);
 
+       xfs_error_sysfs_del(mp);
        xfs_sysfs_del(&mp->m_stats.xs_kobj);
        xfs_sysfs_del(&mp->m_kobj);
 }
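
The new XFS_MOUNT_UNMOUNTING flag is raised both in the xfs_mountfs() failure path and at the start of xfs_unmountfs(), so background error retries can notice the filesystem is going away. The consumer is outside this hunk; a hypothetical retry predicate might include a check along these lines:

        /* illustrative only: stop retrying failed metadata I/O once unmount begins */
        if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount)
                return false;   /* give up so unmount can complete */
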
index eafe257..c1b798c 100644 (file)
@@ -37,6 +37,32 @@ enum {
        XFS_LOWSP_MAX,
 };
 
+/*
+ * Error Configuration
+ *
+ * Error classes define the subsystem the configuration belongs to.
+ * Error numbers define the errors that are configurable.
+ */
+enum {
+       XFS_ERR_METADATA,
+       XFS_ERR_CLASS_MAX,
+};
+enum {
+       XFS_ERR_DEFAULT,
+       XFS_ERR_EIO,
+       XFS_ERR_ENOSPC,
+       XFS_ERR_ENODEV,
+       XFS_ERR_ERRNO_MAX,
+};
+
+#define XFS_ERR_RETRY_FOREVER  -1
+
+struct xfs_error_cfg {
+       struct xfs_kobj kobj;
+       int             max_retries;
+       unsigned long   retry_timeout;  /* in jiffies, 0 = no timeout */
+};
+
 typedef struct xfs_mount {
        struct super_block      *m_super;
        xfs_tid_t               m_tid;          /* next unused tid for fs */
@@ -127,6 +153,9 @@ typedef struct xfs_mount {
        int64_t                 m_low_space[XFS_LOWSP_MAX];
                                                /* low free space thresholds */
        struct xfs_kobj         m_kobj;
+       struct xfs_kobj         m_error_kobj;
+       struct xfs_kobj         m_error_meta_kobj;
+       struct xfs_error_cfg    m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX];
        struct xstats           m_stats;        /* per-fs stats */
 
        struct workqueue_struct *m_buf_workqueue;
@@ -148,6 +177,7 @@ typedef struct xfs_mount {
         */
        __uint32_t              m_generation;
 
+       bool                    m_fail_unmount;
 #ifdef DEBUG
        /*
         * DEBUG mode instrumentation to test and/or trigger delayed allocation
@@ -166,6 +196,7 @@ typedef struct xfs_mount {
 #define XFS_MOUNT_WSYNC                (1ULL << 0)     /* for nfs - all metadata ops
                                                   must be synchronous except
                                                   for space allocations */
+#define XFS_MOUNT_UNMOUNTING   (1ULL << 1)     /* filesystem is unmounting */
 #define XFS_MOUNT_WAS_CLEAN    (1ULL << 3)
 #define XFS_MOUNT_FS_SHUTDOWN  (1ULL << 4)     /* atomic stop of all filesystem
                                                   operations, typically for
@@ -364,4 +395,7 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *);
 int    xfs_zero_extent(struct xfs_inode *ip, xfs_fsblock_t start_fsb,
                        xfs_off_t count_fsb);
 
+struct xfs_error_cfg * xfs_error_get_cfg(struct xfs_mount *mp,
+               int error_class, int error);
+
 #endif /* __XFS_MOUNT_H__ */
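
struct xfs_error_cfg encodes per-errno retry policy: a max_retries of XFS_ERR_RETRY_FOREVER (-1) means retry without bound, and a retry_timeout of 0 means no time limit. A hypothetical consumer (the real one is not part of this excerpt, and this helper and its arguments are made up for illustration) might evaluate the config like so:

        static bool
        xfs_err_should_retry(
                struct xfs_mount        *mp,
                int                     error,
                int                     retries,
                unsigned long           first_fail)     /* jiffies of first failure */
        {
                struct xfs_error_cfg    *cfg;

                cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, error);
                if (cfg->retry_timeout &&
                    time_after(jiffies, first_fail + cfg->retry_timeout))
                        return false;                   /* timed out */
                if (cfg->max_retries == XFS_ERR_RETRY_FOREVER)
                        return true;                    /* retry without bound */
                return retries <= cfg->max_retries;     /* bounded retries */
        }

Only xfs_error_get_cfg() and the config fields come from this series; everything else is a sketch.
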
index 51ddaf2..d5b7566 100644 (file)
@@ -308,12 +308,9 @@ xfs_fs_commit_blocks(
                        goto out_drop_iolock;
        }
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
-       if (error) {
-               xfs_trans_cancel(tp);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+       if (error)
                goto out_drop_iolock;
-       }
 
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
index be125e1..a60d9e2 100644 (file)
@@ -783,13 +783,10 @@ xfs_qm_qino_alloc(
                }
        }
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create,
-                                 XFS_QM_QINOCREATE_SPACE_RES(mp), 0);
-       if (error) {
-               xfs_trans_cancel(tp);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_create,
+                       XFS_QM_QINOCREATE_SPACE_RES(mp), 0, 0, &tp);
+       if (error)
                return error;
-       }
 
        if (need_alloc) {
                error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip,
index f4d0e0a..475a388 100644 (file)
@@ -236,10 +236,8 @@ xfs_qm_scall_trunc_qfile(
 
        xfs_ilock(ip, XFS_IOLOCK_EXCL);
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
        if (error) {
-               xfs_trans_cancel(tp);
                xfs_iunlock(ip, XFS_IOLOCK_EXCL);
                goto out_put;
        }
@@ -436,12 +434,9 @@ xfs_qm_scall_setqlim(
        defq = xfs_get_defquota(dqp, q);
        xfs_dqunlock(dqp);
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0);
-       if (error) {
-               xfs_trans_cancel(tp);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_setqlim, 0, 0, 0, &tp);
+       if (error)
                goto out_rele;
-       }
 
        xfs_dqlock(dqp);
        xfs_trans_dqjoin(tp, dqp);
@@ -569,13 +564,9 @@ xfs_qm_log_quotaoff_end(
        int                     error;
        xfs_qoff_logitem_t      *qoffi;
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END);
-
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0);
-       if (error) {
-               xfs_trans_cancel(tp);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_equotaoff, 0, 0, 0, &tp);
+       if (error)
                return error;
-       }
 
        qoffi = xfs_trans_get_qoff_item(tp, startqoff,
                                        flags & XFS_ALL_QUOTA_ACCT);
@@ -603,12 +594,9 @@ xfs_qm_log_quotaoff(
 
        *qoffstartp = NULL;
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0);
-       if (error) {
-               xfs_trans_cancel(tp);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_quotaoff, 0, 0, 0, &tp);
+       if (error)
                goto out;
-       }
 
        qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT);
        xfs_trans_log_quotaoff_item(tp, qoffi);
index abf4443..3938b37 100644 (file)
@@ -780,15 +780,14 @@ xfs_growfs_rt_alloc(
         * Allocate space to the file, as necessary.
         */
        while (oblocks < nblocks) {
-               tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
                resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks);
                /*
                 * Reserve space & log for one extent added to the file.
                 */
-               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtalloc,
-                                         resblks, 0);
+               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtalloc, resblks,
+                               0, 0, &tp);
                if (error)
-                       goto out_trans_cancel;
+                       return error;
                /*
                 * Lock the inode.
                 */
@@ -823,14 +822,13 @@ xfs_growfs_rt_alloc(
                for (bno = map.br_startoff, fsbno = map.br_startblock;
                     bno < map.br_startoff + map.br_blockcount;
                     bno++, fsbno++) {
-                       tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ZERO);
                        /*
                         * Reserve log for one block zeroing.
                         */
-                       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero,
-                                                 0, 0);
+                       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtzero,
+                                       0, 0, 0, &tp);
                        if (error)
-                               goto out_trans_cancel;
+                               return error;
                        /*
                         * Lock the bitmap inode.
                         */
@@ -994,11 +992,10 @@ xfs_growfs_rt(
                /*
                 * Start a transaction, get the log reservation.
                 */
-               tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE);
-               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtfree,
-                                         0, 0);
+               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtfree, 0, 0, 0,
+                               &tp);
                if (error)
-                       goto error_cancel;
+                       break;
                /*
                 * Lock out other callers by grabbing the bitmap inode lock.
                 */
index 187e14b..11ea5d5 100644 (file)
@@ -58,8 +58,7 @@
 #include <linux/parser.h>
 
 static const struct super_operations xfs_super_operations;
-static kmem_zone_t *xfs_ioend_zone;
-mempool_t *xfs_ioend_pool;
+struct bio_set *xfs_ioend_bioset;
 
 static struct kset *xfs_kset;          /* top-level xfs sysfs dir */
 #ifdef DEBUG
@@ -350,6 +349,7 @@ xfs_parseargs(
                case Opt_pqnoenforce:
                        mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
                        mp->m_qflags &= ~XFS_PQUOTA_ENFD;
+                       break;
                case Opt_gquota:
                case Opt_grpquota:
                        mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
@@ -928,7 +928,7 @@ xfs_fs_alloc_inode(
 
 /*
  * Now that the generic code is guaranteed not to be accessing
- * the linux inode, we can reclaim the inode.
+ * the linux inode, we can inactivate and reclaim the inode.
  */
 STATIC void
 xfs_fs_destroy_inode(
@@ -938,9 +938,14 @@ xfs_fs_destroy_inode(
 
        trace_xfs_destroy_inode(ip);
 
-       XFS_STATS_INC(ip->i_mount, vn_reclaim);
+       ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+       XFS_STATS_INC(ip->i_mount, vn_rele);
+       XFS_STATS_INC(ip->i_mount, vn_remove);
+
+       xfs_inactive(ip);
 
        ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
+       XFS_STATS_INC(ip->i_mount, vn_reclaim);
 
        /*
         * We should never get here with one of the reclaim flags already set.
@@ -987,24 +992,6 @@ xfs_fs_inode_init_once(
                     "xfsino", ip->i_ino);
 }
 
-STATIC void
-xfs_fs_evict_inode(
-       struct inode            *inode)
-{
-       xfs_inode_t             *ip = XFS_I(inode);
-
-       ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
-
-       trace_xfs_evict_inode(ip);
-
-       truncate_inode_pages_final(&inode->i_data);
-       clear_inode(inode);
-       XFS_STATS_INC(ip->i_mount, vn_rele);
-       XFS_STATS_INC(ip->i_mount, vn_remove);
-
-       xfs_inactive(ip);
-}
-
 /*
  * We do an unlocked check for XFS_IDONTCACHE here because we are already
  * serialised against cache hits here via the inode->i_lock and igrab() in
@@ -1276,6 +1263,16 @@ xfs_fs_remount(
                        return -EINVAL;
                }
 
+               if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+                   xfs_sb_has_ro_compat_feature(sbp,
+                                       XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
+                       xfs_warn(mp,
+"ro->rw transition prohibited on unknown (0x%x) ro-compat filesystem",
+                               (sbp->sb_features_ro_compat &
+                                       XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
+                       return -EINVAL;
+               }
+
                mp->m_flags &= ~XFS_MOUNT_RDONLY;
 
                /*
@@ -1558,14 +1555,12 @@ xfs_fs_fill_super(
 
        if (mp->m_flags & XFS_MOUNT_DAX) {
                xfs_warn(mp,
-       "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
-               if (sb->s_blocksize != PAGE_SIZE) {
-                       xfs_alert(mp,
-               "Filesystem block size invalid for DAX Turning DAX off.");
-                       mp->m_flags &= ~XFS_MOUNT_DAX;
-               } else if (!sb->s_bdev->bd_disk->fops->direct_access) {
+               "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+
+               error = bdev_dax_supported(sb, sb->s_blocksize);
+               if (error) {
                        xfs_alert(mp,
-               "Block device does not support DAX Turning DAX off.");
+                       "DAX unsupported by block device. Turning off DAX.");
                        mp->m_flags &= ~XFS_MOUNT_DAX;
                }
        }
@@ -1663,7 +1658,6 @@ xfs_fs_free_cached_objects(
 static const struct super_operations xfs_super_operations = {
        .alloc_inode            = xfs_fs_alloc_inode,
        .destroy_inode          = xfs_fs_destroy_inode,
-       .evict_inode            = xfs_fs_evict_inode,
        .drop_inode             = xfs_fs_drop_inode,
        .put_super              = xfs_fs_put_super,
        .sync_fs                = xfs_fs_sync_fs,
@@ -1688,20 +1682,15 @@ MODULE_ALIAS_FS("xfs");
 STATIC int __init
 xfs_init_zones(void)
 {
-
-       xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend");
-       if (!xfs_ioend_zone)
+       xfs_ioend_bioset = bioset_create(4 * MAX_BUF_PER_PAGE,
+                       offsetof(struct xfs_ioend, io_inline_bio));
+       if (!xfs_ioend_bioset)
                goto out;
 
-       xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE,
-                                                 xfs_ioend_zone);
-       if (!xfs_ioend_pool)
-               goto out_destroy_ioend_zone;
-
        xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t),
                                                "xfs_log_ticket");
        if (!xfs_log_ticket_zone)
-               goto out_destroy_ioend_pool;
+               goto out_free_ioend_bioset;
 
        xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t),
                                                "xfs_bmap_free_item");
@@ -1797,10 +1786,8 @@ xfs_init_zones(void)
        kmem_zone_destroy(xfs_bmap_free_item_zone);
  out_destroy_log_ticket_zone:
        kmem_zone_destroy(xfs_log_ticket_zone);
- out_destroy_ioend_pool:
-       mempool_destroy(xfs_ioend_pool);
- out_destroy_ioend_zone:
-       kmem_zone_destroy(xfs_ioend_zone);
+ out_free_ioend_bioset:
+       bioset_free(xfs_ioend_bioset);
  out:
        return -ENOMEM;
 }
@@ -1826,9 +1813,7 @@ xfs_destroy_zones(void)
        kmem_zone_destroy(xfs_btree_cur_zone);
        kmem_zone_destroy(xfs_bmap_free_item_zone);
        kmem_zone_destroy(xfs_log_ticket_zone);
-       mempool_destroy(xfs_ioend_pool);
-       kmem_zone_destroy(xfs_ioend_zone);
-
+       bioset_free(xfs_ioend_bioset);
 }
 
 STATIC int __init
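
Replacing the ioend zone/mempool pair with a bio_set relies on the front_pad trick: the pad is offsetof(struct xfs_ioend, io_inline_bio), so every bio allocated from the set sits at the tail of an xfs_ioend. A sketch of that pattern, assuming io_inline_bio is the last member of struct xfs_ioend (its definition is not in this hunk):

        struct bio              *bio;
        struct xfs_ioend        *ioend;

        bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, xfs_ioend_bioset);
        /* front_pad bytes precede the bio, so the containing ioend is: */
        ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
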
index b44284c..08a46c6 100644 (file)
@@ -131,6 +131,8 @@ xfs_readlink(
 
        trace_xfs_readlink(ip);
 
+       ASSERT(!(ip->i_df.if_flags & XFS_IFINLINE));
+
        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;
 
@@ -150,12 +152,7 @@ xfs_readlink(
        }
 
 
-       if (ip->i_df.if_flags & XFS_IFINLINE) {
-               memcpy(link, ip->i_df.if_u1.if_data, pathlen);
-               link[pathlen] = '\0';
-       } else {
-               error = xfs_readlink_bmap(ip, link);
-       }
+       error = xfs_readlink_bmap(ip, link);
 
  out:
        xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -221,7 +218,6 @@ xfs_symlink(
        if (error)
                return error;
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
        /*
         * The symlink will fit into the inode data fork?
         * There can't be any attributes so we get the whole variable part.
@@ -231,13 +227,15 @@ xfs_symlink(
        else
                fs_blocks = xfs_symlink_blocks(mp, pathlen);
        resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, resblks, 0);
+
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, resblks, 0, 0, &tp);
        if (error == -ENOSPC && fs_blocks == 0) {
                resblks = 0;
-               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0);
+               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, 0, 0, 0,
+                               &tp);
        }
        if (error)
-               goto out_trans_cancel;
+               goto out_release_inode;
 
        xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL |
                      XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
@@ -302,19 +300,11 @@ xfs_symlink(
         * If the symlink will fit into the inode, write it inline.
         */
        if (pathlen <= XFS_IFORK_DSIZE(ip)) {
-               xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
-               memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
-               ip->i_d.di_size = pathlen;
-
-               /*
-                * The inode was initially created in extent format.
-                */
-               ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
-               ip->i_df.if_flags |= XFS_IFINLINE;
+               xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen);
 
+               ip->i_d.di_size = pathlen;
                ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
                xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
-
        } else {
                int     offset;
 
@@ -455,12 +445,9 @@ xfs_inactive_symlink_rmt(
         */
        ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
-       if (error) {
-               xfs_trans_cancel(tp);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+       if (error)
                return error;
-       }
 
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, 0);
index 6ced4f1..4c2c550 100644 (file)
  */
 
 #include "xfs.h"
-#include "xfs_sysfs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
+#include "xfs_sysfs.h"
 #include "xfs_log.h"
 #include "xfs_log_priv.h"
 #include "xfs_stats.h"
@@ -362,3 +363,291 @@ struct kobj_type xfs_log_ktype = {
        .sysfs_ops = &xfs_sysfs_ops,
        .default_attrs = xfs_log_attrs,
 };
+
+/*
+ * Metadata IO error configuration
+ *
+ * The sysfs structure here is:
+ *     ...xfs/<dev>/error/<class>/<errno>/<error_attrs>
+ *
+ * where <class> allows us to discriminate between data IO and metadata IO,
+ * and any other future type of IO (e.g. special inode or directory error
+ * handling) we care to support.
+ */
+static inline struct xfs_error_cfg *
+to_error_cfg(struct kobject *kobject)
+{
+       struct xfs_kobj *kobj = to_kobj(kobject);
+       return container_of(kobj, struct xfs_error_cfg, kobj);
+}
+
+static inline struct xfs_mount *
+err_to_mp(struct kobject *kobject)
+{
+       struct xfs_kobj *kobj = to_kobj(kobject);
+       return container_of(kobj, struct xfs_mount, m_error_kobj);
+}
+
+static ssize_t
+max_retries_show(
+       struct kobject  *kobject,
+       char            *buf)
+{
+       struct xfs_error_cfg *cfg = to_error_cfg(kobject);
+
+       return snprintf(buf, PAGE_SIZE, "%d\n", cfg->max_retries);
+}
+
+static ssize_t
+max_retries_store(
+       struct kobject  *kobject,
+       const char      *buf,
+       size_t          count)
+{
+       struct xfs_error_cfg *cfg = to_error_cfg(kobject);
+       int             ret;
+       int             val;
+
+       ret = kstrtoint(buf, 0, &val);
+       if (ret)
+               return ret;
+
+       if (val < -1)
+               return -EINVAL;
+
+       cfg->max_retries = val;
+       return count;
+}
+XFS_SYSFS_ATTR_RW(max_retries);
+
+static ssize_t
+retry_timeout_seconds_show(
+       struct kobject  *kobject,
+       char            *buf)
+{
+       struct xfs_error_cfg *cfg = to_error_cfg(kobject);
+
+       return snprintf(buf, PAGE_SIZE, "%ld\n",
+                       jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC);
+}
+
+static ssize_t
+retry_timeout_seconds_store(
+       struct kobject  *kobject,
+       const char      *buf,
+       size_t          count)
+{
+       struct xfs_error_cfg *cfg = to_error_cfg(kobject);
+       int             ret;
+       int             val;
+
+       ret = kstrtoint(buf, 0, &val);
+       if (ret)
+               return ret;
+
+       /* 1 day timeout maximum */
+       if (val < 0 || val > 86400)
+               return -EINVAL;
+
+       cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC);
+       return count;
+}
+XFS_SYSFS_ATTR_RW(retry_timeout_seconds);
+
+static ssize_t
+fail_at_unmount_show(
+       struct kobject  *kobject,
+       char            *buf)
+{
+       struct xfs_mount        *mp = err_to_mp(kobject);
+
+       return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_fail_unmount);
+}
+
+static ssize_t
+fail_at_unmount_store(
+       struct kobject  *kobject,
+       const char      *buf,
+       size_t          count)
+{
+       struct xfs_mount        *mp = err_to_mp(kobject);
+       int             ret;
+       int             val;
+
+       ret = kstrtoint(buf, 0, &val);
+       if (ret)
+               return ret;
+
+       if (val < 0 || val > 1)
+               return -EINVAL;
+
+       mp->m_fail_unmount = val;
+       return count;
+}
+XFS_SYSFS_ATTR_RW(fail_at_unmount);
+
+static struct attribute *xfs_error_attrs[] = {
+       ATTR_LIST(max_retries),
+       ATTR_LIST(retry_timeout_seconds),
+       NULL,
+};
+
+
+struct kobj_type xfs_error_cfg_ktype = {
+       .release = xfs_sysfs_release,
+       .sysfs_ops = &xfs_sysfs_ops,
+       .default_attrs = xfs_error_attrs,
+};
+
+struct kobj_type xfs_error_ktype = {
+       .release = xfs_sysfs_release,
+       .sysfs_ops = &xfs_sysfs_ops,
+};
+
+/*
+ * Error initialization tables. These need to be ordered in the same
+ * order as the enums used to index the array. All class init tables need to
+ * define a "default" behaviour as the first entry, all other entries can be
+ * empty.
+ */
+struct xfs_error_init {
+       char            *name;
+       int             max_retries;
+       int             retry_timeout;  /* in seconds */
+};
+
+static const struct xfs_error_init xfs_error_meta_init[XFS_ERR_ERRNO_MAX] = {
+       { .name = "default",
+         .max_retries = XFS_ERR_RETRY_FOREVER,
+         .retry_timeout = 0,
+       },
+       { .name = "EIO",
+         .max_retries = XFS_ERR_RETRY_FOREVER,
+         .retry_timeout = 0,
+       },
+       { .name = "ENOSPC",
+         .max_retries = XFS_ERR_RETRY_FOREVER,
+         .retry_timeout = 0,
+       },
+       { .name = "ENODEV",
+         .max_retries = 0,
+       },
+};
+
+static int
+xfs_error_sysfs_init_class(
+       struct xfs_mount        *mp,
+       int                     class,
+       const char              *parent_name,
+       struct xfs_kobj         *parent_kobj,
+       const struct xfs_error_init init[])
+{
+       struct xfs_error_cfg    *cfg;
+       int                     error;
+       int                     i;
+
+       ASSERT(class < XFS_ERR_CLASS_MAX);
+
+       error = xfs_sysfs_init(parent_kobj, &xfs_error_ktype,
+                               &mp->m_error_kobj, parent_name);
+       if (error)
+               return error;
+
+       for (i = 0; i < XFS_ERR_ERRNO_MAX; i++) {
+               cfg = &mp->m_error_cfg[class][i];
+               error = xfs_sysfs_init(&cfg->kobj, &xfs_error_cfg_ktype,
+                                       parent_kobj, init[i].name);
+               if (error)
+                       goto out_error;
+
+               cfg->max_retries = init[i].max_retries;
+               cfg->retry_timeout = msecs_to_jiffies(
+                                       init[i].retry_timeout * MSEC_PER_SEC);
+       }
+       return 0;
+
+out_error:
+       /* unwind the entries that succeeded */
+       for (i--; i >= 0; i--) {
+               cfg = &mp->m_error_cfg[class][i];
+               xfs_sysfs_del(&cfg->kobj);
+       }
+       xfs_sysfs_del(parent_kobj);
+       return error;
+}
+
+int
+xfs_error_sysfs_init(
+       struct xfs_mount        *mp)
+{
+       int                     error;
+
+       /* .../xfs/<dev>/error/ */
+       error = xfs_sysfs_init(&mp->m_error_kobj, &xfs_error_ktype,
+                               &mp->m_kobj, "error");
+       if (error)
+               return error;
+
+       error = sysfs_create_file(&mp->m_error_kobj.kobject,
+                                 ATTR_LIST(fail_at_unmount));
+
+       if (error)
+               goto out_error;
+
+       /* .../xfs/<dev>/error/metadata/ */
+       error = xfs_error_sysfs_init_class(mp, XFS_ERR_METADATA,
+                               "metadata", &mp->m_error_meta_kobj,
+                               xfs_error_meta_init);
+       if (error)
+               goto out_error;
+
+       return 0;
+
+out_error:
+       xfs_sysfs_del(&mp->m_error_kobj);
+       return error;
+}
+
+void
+xfs_error_sysfs_del(
+       struct xfs_mount        *mp)
+{
+       struct xfs_error_cfg    *cfg;
+       int                     i, j;
+
+       for (i = 0; i < XFS_ERR_CLASS_MAX; i++) {
+               for (j = 0; j < XFS_ERR_ERRNO_MAX; j++) {
+                       cfg = &mp->m_error_cfg[i][j];
+
+                       xfs_sysfs_del(&cfg->kobj);
+               }
+       }
+       xfs_sysfs_del(&mp->m_error_meta_kobj);
+       xfs_sysfs_del(&mp->m_error_kobj);
+}
+
+struct xfs_error_cfg *
+xfs_error_get_cfg(
+       struct xfs_mount        *mp,
+       int                     error_class,
+       int                     error)
+{
+       struct xfs_error_cfg    *cfg;
+
+       switch (error) {
+       case EIO:
+               cfg = &mp->m_error_cfg[error_class][XFS_ERR_EIO];
+               break;
+       case ENOSPC:
+               cfg = &mp->m_error_cfg[error_class][XFS_ERR_ENOSPC];
+               break;
+       case ENODEV:
+               cfg = &mp->m_error_cfg[error_class][XFS_ERR_ENODEV];
+               break;
+       default:
+               cfg = &mp->m_error_cfg[error_class][XFS_ERR_DEFAULT];
+               break;
+       }
+
+       return cfg;
+}
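
With the init table above, and assuming the usual /sys/fs/xfs mount point for the xfs kset, a mounted device ends up with a hierarchy roughly like:

        /sys/fs/xfs/<dev>/error/fail_at_unmount
        /sys/fs/xfs/<dev>/error/metadata/default/max_retries
        /sys/fs/xfs/<dev>/error/metadata/default/retry_timeout_seconds
        /sys/fs/xfs/<dev>/error/metadata/EIO/...
        /sys/fs/xfs/<dev>/error/metadata/ENOSPC/...
        /sys/fs/xfs/<dev>/error/metadata/ENODEV/...

Note that xfs_error_get_cfg() switches on positive errno values (EIO, ENOSPC, ENODEV), so callers presumably pass the error with its sign stripped.
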
index be692e5..d046371 100644 (file)
@@ -58,4 +58,7 @@ xfs_sysfs_del(
        wait_for_completion(&kobj->complete);
 }
 
+int    xfs_error_sysfs_init(struct xfs_mount *mp);
+void   xfs_error_sysfs_del(struct xfs_mount *mp);
+
 #endif /* __XFS_SYSFS_H__ */
index c8d5842..ea94ee0 100644 (file)
@@ -364,7 +364,6 @@ DEFINE_BUF_EVENT(xfs_buf_delwri_split);
 DEFINE_BUF_EVENT(xfs_buf_get_uncached);
 DEFINE_BUF_EVENT(xfs_bdstrat_shut);
 DEFINE_BUF_EVENT(xfs_buf_item_relse);
-DEFINE_BUF_EVENT(xfs_buf_item_iodone);
 DEFINE_BUF_EVENT(xfs_buf_item_iodone_async);
 DEFINE_BUF_EVENT(xfs_buf_error_relse);
 DEFINE_BUF_EVENT(xfs_buf_wait_buftarg);
@@ -944,7 +943,6 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
        TP_ARGS(log, tic),
        TP_STRUCT__entry(
                __field(dev_t, dev)
-               __field(unsigned, trans_type)
                __field(char, ocnt)
                __field(char, cnt)
                __field(int, curr_res)
@@ -962,7 +960,6 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
        ),
        TP_fast_assign(
                __entry->dev = log->l_mp->m_super->s_dev;
-               __entry->trans_type = tic->t_trans_type;
                __entry->ocnt = tic->t_ocnt;
                __entry->cnt = tic->t_cnt;
                __entry->curr_res = tic->t_curr_res;
@@ -980,14 +977,13 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
                __entry->curr_block = log->l_curr_block;
                __entry->tail_lsn = atomic64_read(&log->l_tail_lsn);
        ),
-       TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u "
+       TP_printk("dev %d:%d t_ocnt %u t_cnt %u t_curr_res %u "
                  "t_unit_res %u t_flags %s reserveq %s "
                  "writeq %s grant_reserve_cycle %d "
                  "grant_reserve_bytes %d grant_write_cycle %d "
                  "grant_write_bytes %d curr_cycle %d curr_block %d "
                  "tail_cycle %d tail_block %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __print_symbolic(__entry->trans_type, XFS_TRANS_TYPES),
                  __entry->ocnt,
                  __entry->cnt,
                  __entry->curr_res,
@@ -1053,19 +1049,21 @@ DECLARE_EVENT_CLASS(xfs_log_item_class,
 )
 
 TRACE_EVENT(xfs_log_force,
-       TP_PROTO(struct xfs_mount *mp, xfs_lsn_t lsn),
-       TP_ARGS(mp, lsn),
+       TP_PROTO(struct xfs_mount *mp, xfs_lsn_t lsn, unsigned long caller_ip),
+       TP_ARGS(mp, lsn, caller_ip),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(xfs_lsn_t, lsn)
+               __field(unsigned long, caller_ip)
        ),
        TP_fast_assign(
                __entry->dev = mp->m_super->s_dev;
                __entry->lsn = lsn;
+               __entry->caller_ip = caller_ip;
        ),
-       TP_printk("dev %d:%d lsn 0x%llx",
+       TP_printk("dev %d:%d lsn 0x%llx caller %ps",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->lsn)
+                 __entry->lsn, (void *)__entry->caller_ip)
 )
 
 #define DEFINE_LOG_ITEM_EVENT(name) \
index 20c5366..5f3d33d 100644 (file)
@@ -46,47 +46,6 @@ xfs_trans_init(
        xfs_trans_resv_calc(mp, M_RES(mp));
 }
 
-/*
- * This routine is called to allocate a transaction structure.
- * The type parameter indicates the type of the transaction.  These
- * are enumerated in xfs_trans.h.
- *
- * Dynamically allocate the transaction structure from the transaction
- * zone, initialize it, and return it to the caller.
- */
-xfs_trans_t *
-xfs_trans_alloc(
-       xfs_mount_t     *mp,
-       uint            type)
-{
-       xfs_trans_t     *tp;
-
-       sb_start_intwrite(mp->m_super);
-       tp = _xfs_trans_alloc(mp, type, KM_SLEEP);
-       tp->t_flags |= XFS_TRANS_FREEZE_PROT;
-       return tp;
-}
-
-xfs_trans_t *
-_xfs_trans_alloc(
-       xfs_mount_t     *mp,
-       uint            type,
-       xfs_km_flags_t  memflags)
-{
-       xfs_trans_t     *tp;
-
-       WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
-       atomic_inc(&mp->m_active_trans);
-
-       tp = kmem_zone_zalloc(xfs_trans_zone, memflags);
-       tp->t_magic = XFS_TRANS_HEADER_MAGIC;
-       tp->t_type = type;
-       tp->t_mountp = mp;
-       INIT_LIST_HEAD(&tp->t_items);
-       INIT_LIST_HEAD(&tp->t_busy);
-       return tp;
-}
-
 /*
  * Free the transaction structure.  If there is more clean up
  * to do when the structure is freed, add it here.
@@ -99,7 +58,7 @@ xfs_trans_free(
        xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false);
 
        atomic_dec(&tp->t_mountp->m_active_trans);
-       if (tp->t_flags & XFS_TRANS_FREEZE_PROT)
+       if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT))
                sb_end_intwrite(tp->t_mountp->m_super);
        xfs_trans_free_dqinfo(tp);
        kmem_zone_free(xfs_trans_zone, tp);
@@ -125,7 +84,6 @@ xfs_trans_dup(
         * Initialize the new transaction structure.
         */
        ntp->t_magic = XFS_TRANS_HEADER_MAGIC;
-       ntp->t_type = tp->t_type;
        ntp->t_mountp = tp->t_mountp;
        INIT_LIST_HEAD(&ntp->t_items);
        INIT_LIST_HEAD(&ntp->t_busy);
@@ -135,9 +93,9 @@ xfs_trans_dup(
 
        ntp->t_flags = XFS_TRANS_PERM_LOG_RES |
                       (tp->t_flags & XFS_TRANS_RESERVE) |
-                      (tp->t_flags & XFS_TRANS_FREEZE_PROT);
+                      (tp->t_flags & XFS_TRANS_NO_WRITECOUNT);
        /* We gave our writer reference to the new transaction */
-       tp->t_flags &= ~XFS_TRANS_FREEZE_PROT;
+       tp->t_flags |= XFS_TRANS_NO_WRITECOUNT;
        ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket);
        ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used;
        tp->t_blk_res = tp->t_blk_res_used;
@@ -165,7 +123,7 @@ xfs_trans_dup(
  * This does not do quota reservations. That typically is done by the
  * caller afterwards.
  */
-int
+static int
 xfs_trans_reserve(
        struct xfs_trans        *tp,
        struct xfs_trans_res    *resp,
@@ -219,7 +177,7 @@ xfs_trans_reserve(
                                                resp->tr_logres,
                                                resp->tr_logcount,
                                                &tp->t_ticket, XFS_TRANSACTION,
-                                               permanent, tp->t_type);
+                                               permanent);
                }
 
                if (error)
@@ -268,6 +226,42 @@ undo_blocks:
        return error;
 }
 
+int
+xfs_trans_alloc(
+       struct xfs_mount        *mp,
+       struct xfs_trans_res    *resp,
+       uint                    blocks,
+       uint                    rtextents,
+       uint                    flags,
+       struct xfs_trans        **tpp)
+{
+       struct xfs_trans        *tp;
+       int                     error;
+
+       if (!(flags & XFS_TRANS_NO_WRITECOUNT))
+               sb_start_intwrite(mp->m_super);
+
+       WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
+       atomic_inc(&mp->m_active_trans);
+
+       tp = kmem_zone_zalloc(xfs_trans_zone,
+               (flags & XFS_TRANS_NOFS) ? KM_NOFS : KM_SLEEP);
+       tp->t_magic = XFS_TRANS_HEADER_MAGIC;
+       tp->t_flags = flags;
+       tp->t_mountp = mp;
+       INIT_LIST_HEAD(&tp->t_items);
+       INIT_LIST_HEAD(&tp->t_busy);
+
+       error = xfs_trans_reserve(tp, resp, blocks, rtextents);
+       if (error) {
+               xfs_trans_cancel(tp);
+               return error;
+       }
+
+       *tpp = tp;
+       return 0;
+}
+
 /*
  * Record the indicated change to the given field for application
  * to the file system's superblock when the transaction commits.
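
The new xfs_trans_alloc() folds allocation, freeze protection and log/space reservation into a single call, which is what lets every caller above drop its open-coded cancel-on-failure branch. Taking the xfs_setattr_nonsize() conversion as the model:

        /* before: allocate, then reserve, then cancel by hand on failure */
        tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
        if (error) {
                xfs_trans_cancel(tp);
                return error;
        }

        /* after: one call; no transaction is handed back on failure */
        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
        if (error)
                return error;
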
index e7c49cf..9a462e8 100644 (file)
@@ -90,7 +90,6 @@ void  xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
  */
 typedef struct xfs_trans {
        unsigned int            t_magic;        /* magic number */
-       unsigned int            t_type;         /* transaction type */
        unsigned int            t_log_res;      /* amt of log space resvd */
        unsigned int            t_log_count;    /* count for perm log res */
        unsigned int            t_blk_res;      /* # of blocks resvd */
@@ -148,10 +147,9 @@ typedef struct xfs_trans {
 /*
  * XFS transaction mechanism exported interfaces.
  */
-xfs_trans_t    *xfs_trans_alloc(struct xfs_mount *, uint);
-xfs_trans_t    *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t);
-int            xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *,
-                                 uint, uint);
+int            xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp,
+                       uint blocks, uint rtextents, uint flags,
+                       struct xfs_trans **tpp);
 void           xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
 
 struct xfs_buf *xfs_trans_get_buf_map(struct xfs_trans *tp,
index d111f69..ea62245 100644 (file)
@@ -74,11 +74,12 @@ xfs_forget_acl(
 }
 
 static int
-xfs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry,
-               const char *name, const void *value, size_t size, int flags)
+xfs_xattr_set(const struct xattr_handler *handler, struct dentry *unused,
+               struct inode *inode, const char *name, const void *value,
+               size_t size, int flags)
 {
        int                     xflags = handler->flags;
-       struct xfs_inode        *ip = XFS_I(d_inode(dentry));
+       struct xfs_inode        *ip = XFS_I(inode);
        int                     error;
 
        /* Convert Linux syscall to XFS internal ATTR flags */
@@ -92,7 +93,7 @@ xfs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry,
        error = xfs_attr_set(ip, (unsigned char *)name,
                                (void *)value, size, xflags);
        if (!error)
-               xfs_forget_acl(d_inode(dentry), name, xflags);
+               xfs_forget_acl(inode, name, xflags);
 
        return error;
 }
@@ -146,7 +147,7 @@ __xfs_xattr_put_listent(
        arraytop = context->count + prefix_len + namelen + 1;
        if (arraytop > context->firstu) {
                context->count = -1;    /* insufficient space */
-               return 1;
+               return 0;
        }
        offset = (char *)context->alist + context->count;
        strncpy(offset, prefix, prefix_len);
@@ -166,8 +167,7 @@ xfs_xattr_put_listent(
        int             flags,
        unsigned char   *name,
        int             namelen,
-       int             valuelen,
-       unsigned char   *value)
+       int             valuelen)
 {
        char *prefix;
        int prefix_len;
@@ -221,11 +221,15 @@ xfs_xattr_put_listent(
 }
 
 ssize_t
-xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
+xfs_vn_listxattr(
+       struct dentry   *dentry,
+       char            *data,
+       size_t          size)
 {
        struct xfs_attr_list_context context;
        struct attrlist_cursor_kern cursor = { 0 };
-       struct inode            *inode = d_inode(dentry);
+       struct inode    *inode = d_inode(dentry);
+       int             error;
 
        /*
         * First read the regular on-disk attributes.
@@ -239,7 +243,9 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
        context.firstu = context.bufsize;
        context.put_listent = xfs_xattr_put_listent;
 
-       xfs_attr_list_int(&context);
+       error = xfs_attr_list_int(&context);
+       if (error)
+               return error;
        if (context.count < 0)
                return -ERANGE;
 
index 70a41f7..5731ccb 100644 (file)
@@ -51,7 +51,8 @@ extern void acpi_video_set_dmi_backlight_type(enum acpi_backlight_type type);
  */
 extern bool acpi_video_handles_brightness_key_presses(void);
 extern int acpi_video_get_levels(struct acpi_device *device,
-                                struct acpi_video_device_brightness **dev_br);
+                                struct acpi_video_device_brightness **dev_br,
+                                int *pmax_level);
 #else
 static inline int acpi_video_register(void) { return 0; }
 static inline void acpi_video_unregister(void) { return; }
@@ -72,7 +73,8 @@ static inline bool acpi_video_handles_brightness_key_presses(void)
        return false;
 }
 static inline int acpi_video_get_levels(struct acpi_device *device,
-                       struct acpi_video_device_brightness **dev_br)
+                       struct acpi_video_device_brightness **dev_br,
+                       int *pmax_level)
 {
        return -ENODEV;
 }
index 5d8ffa3..c1cde35 100644 (file)
@@ -7,10 +7,10 @@
 
 static __always_inline int preempt_count(void)
 {
-       return current_thread_info()->preempt_count;
+       return READ_ONCE(current_thread_info()->preempt_count);
 }
 
-static __always_inline int *preempt_count_ptr(void)
+static __always_inline volatile int *preempt_count_ptr(void)
 {
        return &current_thread_info()->preempt_count;
 }
diff --git a/include/drm/drm_dp_dual_mode_helper.h b/include/drm/drm_dp_dual_mode_helper.h
new file mode 100644 (file)
index 0000000..e8a9dfd
--- /dev/null
@@ -0,0 +1,92 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef DRM_DP_DUAL_MODE_HELPER_H
+#define DRM_DP_DUAL_MODE_HELPER_H
+
+#include <linux/types.h>
+
+/*
+ * Optional for type 1 DVI adaptors
+ * Mandatory for type 1 HDMI and type 2 adaptors
+ */
+#define DP_DUAL_MODE_HDMI_ID 0x00 /* 00-0f */
+#define  DP_DUAL_MODE_HDMI_ID_LEN 16
+/*
+ * Optional for type 1 adaptors
+ * Mandatory for type 2 adaptors
+ */
+#define DP_DUAL_MODE_ADAPTOR_ID 0x10
+#define  DP_DUAL_MODE_REV_MASK 0x07
+#define  DP_DUAL_MODE_REV_TYPE2 0x00
+#define  DP_DUAL_MODE_TYPE_MASK 0xf0
+#define  DP_DUAL_MODE_TYPE_TYPE2 0xa0
+#define DP_DUAL_MODE_IEEE_OUI 0x11 /* 11-13*/
+#define  DP_DUAL_IEEE_OUI_LEN 3
+#define DP_DUAL_DEVICE_ID 0x14 /* 14-19 */
+#define  DP_DUAL_DEVICE_ID_LEN 6
+#define DP_DUAL_MODE_HARDWARE_REV 0x1a
+#define DP_DUAL_MODE_FIRMWARE_MAJOR_REV 0x1b
+#define DP_DUAL_MODE_FIRMWARE_MINOR_REV 0x1c
+#define DP_DUAL_MODE_MAX_TMDS_CLOCK 0x1d
+#define DP_DUAL_MODE_I2C_SPEED_CAP 0x1e
+#define DP_DUAL_MODE_TMDS_OEN 0x20
+#define  DP_DUAL_MODE_TMDS_DISABLE 0x01
+#define DP_DUAL_MODE_HDMI_PIN_CTRL 0x21
+#define  DP_DUAL_MODE_CEC_ENABLE 0x01
+#define DP_DUAL_MODE_I2C_SPEED_CTRL 0x22
+
+struct i2c_adapter;
+
+ssize_t drm_dp_dual_mode_read(struct i2c_adapter *adapter,
+                             u8 offset, void *buffer, size_t size);
+ssize_t drm_dp_dual_mode_write(struct i2c_adapter *adapter,
+                              u8 offset, const void *buffer, size_t size);
+
+/**
+ * enum drm_dp_dual_mode_type - Type of the DP dual mode adaptor
+ * @DRM_DP_DUAL_MODE_NONE: No DP dual mode adaptor
+ * @DRM_DP_DUAL_MODE_UNKNOWN: Could be either none or type 1 DVI adaptor
+ * @DRM_DP_DUAL_MODE_TYPE1_DVI: Type 1 DVI adaptor
+ * @DRM_DP_DUAL_MODE_TYPE1_HDMI: Type 1 HDMI adaptor
+ * @DRM_DP_DUAL_MODE_TYPE2_DVI: Type 2 DVI adaptor
+ * @DRM_DP_DUAL_MODE_TYPE2_HDMI: Type 2 HDMI adaptor
+ */
+enum drm_dp_dual_mode_type {
+       DRM_DP_DUAL_MODE_NONE,
+       DRM_DP_DUAL_MODE_UNKNOWN,
+       DRM_DP_DUAL_MODE_TYPE1_DVI,
+       DRM_DP_DUAL_MODE_TYPE1_HDMI,
+       DRM_DP_DUAL_MODE_TYPE2_DVI,
+       DRM_DP_DUAL_MODE_TYPE2_HDMI,
+};
+
+enum drm_dp_dual_mode_type drm_dp_dual_mode_detect(struct i2c_adapter *adapter);
+int drm_dp_dual_mode_max_tmds_clock(enum drm_dp_dual_mode_type type,
+                                   struct i2c_adapter *adapter);
+int drm_dp_dual_mode_get_tmds_output(enum drm_dp_dual_mode_type type,
+                                    struct i2c_adapter *adapter, bool *enabled);
+int drm_dp_dual_mode_set_tmds_output(enum drm_dp_dual_mode_type type,
+                                    struct i2c_adapter *adapter, bool enable);
+const char *drm_dp_get_dual_mode_type_name(enum drm_dp_dual_mode_type type);
+
+#endif
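
The new header exposes a small detect/query/control API over the adaptor's DDC/I2C bus. A hypothetical caller, sketched against the prototypes above only and assuming the usual 0-on-success convention for the accessors:

        enum drm_dp_dual_mode_type type = drm_dp_dual_mode_detect(adapter);
        bool enabled;

        if (type != DRM_DP_DUAL_MODE_NONE &&
            drm_dp_dual_mode_get_tmds_output(type, adapter, &enabled) == 0 &&
            !enabled) {
                /* (re)enable the TMDS output buffers; support varies by adaptor type */
                drm_dp_dual_mode_set_tmds_output(type, adapter, true);
        }
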
index 85aaf66..729ab9f 100644 (file)
@@ -9,5 +9,6 @@
 #define TEGRA124_SOCTHERM_SENSOR_MEM 1
 #define TEGRA124_SOCTHERM_SENSOR_GPU 2
 #define TEGRA124_SOCTHERM_SENSOR_PLLX 3
+#define TEGRA124_SOCTHERM_SENSOR_NUM 4
 
 #endif
index b651aed..dda39d8 100644 (file)
@@ -24,9 +24,6 @@
 #include <linux/workqueue.h>
 
 struct arch_timer_kvm {
-       /* Is the timer enabled */
-       bool                    enabled;
-
        /* Virtual offset */
        cycle_t                 cntvoff;
 };
@@ -53,15 +50,15 @@ struct arch_timer_cpu {
        /* Timer IRQ */
        struct kvm_irq_level            irq;
 
-       /* VGIC mapping */
-       struct irq_phys_map             *map;
-
        /* Active IRQ state caching */
        bool                            active_cleared_last;
+
+       /* Is the timer enabled */
+       bool                    enabled;
 };
 
 int kvm_timer_hyp_init(void);
-void kvm_timer_enable(struct kvm *kvm);
+int kvm_timer_enable(struct kvm_vcpu *vcpu);
 void kvm_timer_init(struct kvm *kvm);
 int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
                         const struct kvm_irq_level *irq);
index be6037a..da0a524 100644 (file)
 #ifndef __ASM_ARM_KVM_VGIC_H
 #define __ASM_ARM_KVM_VGIC_H
 
+#ifdef CONFIG_KVM_NEW_VGIC
+#include <kvm/vgic/vgic.h>
+#else
+
 #include <linux/kernel.h>
 #include <linux/kvm.h>
 #include <linux/irqreturn.h>
@@ -158,7 +162,6 @@ struct vgic_io_device {
 struct irq_phys_map {
        u32                     virt_irq;
        u32                     phys_irq;
-       u32                     irq;
 };
 
 struct irq_phys_map_entry {
@@ -305,9 +308,6 @@ struct vgic_cpu {
        unsigned long   *active_shared;
        unsigned long   *pend_act_shared;
 
-       /* Number of list registers on this CPU */
-       int             nr_lr;
-
        /* CPU vif control registers for world switch */
        union {
                struct vgic_v2_cpu_if   vgic_v2;
@@ -342,17 +342,18 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
 int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
                        bool level);
 int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid,
-                              struct irq_phys_map *map, bool level);
+                              unsigned int virt_irq, bool level);
 void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg);
 int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
-struct irq_phys_map *kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu,
-                                          int virt_irq, int irq);
-int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, struct irq_phys_map *map);
-bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, struct irq_phys_map *map);
+int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, int virt_irq, int phys_irq);
+int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq);
+bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq);
 
 #define irqchip_in_kernel(k)   (!!((k)->arch.vgic.in_kernel))
 #define vgic_initialized(k)    (!!((k)->arch.vgic.nr_cpus))
 #define vgic_ready(k)          ((k)->arch.vgic.ready)
+#define vgic_valid_spi(k, i)   (((i) >= VGIC_NR_PRIVATE_IRQS) && \
+                                ((i) < (k)->arch.vgic.nr_irqs))
 
 int vgic_v2_probe(const struct gic_kvm_info *gic_kvm_info,
                  const struct vgic_ops **ops,
@@ -370,4 +371,5 @@ static inline int vgic_v3_probe(const struct gic_kvm_info *gic_kvm_info,
 }
 #endif
 
+#endif /* old VGIC include */
 #endif
diff --git a/include/kvm/vgic/vgic.h b/include/kvm/vgic/vgic.h
new file mode 100644 (file)
index 0000000..3fbd175
--- /dev/null
@@ -0,0 +1,246 @@
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef __ASM_ARM_KVM_VGIC_VGIC_H
+#define __ASM_ARM_KVM_VGIC_VGIC_H
+
+#include <linux/kernel.h>
+#include <linux/kvm.h>
+#include <linux/irqreturn.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <kvm/iodev.h>
+
+#define VGIC_V3_MAX_CPUS       255
+#define VGIC_V2_MAX_CPUS       8
+#define VGIC_NR_IRQS_LEGACY     256
+#define VGIC_NR_SGIS           16
+#define VGIC_NR_PPIS           16
+#define VGIC_NR_PRIVATE_IRQS   (VGIC_NR_SGIS + VGIC_NR_PPIS)
+#define VGIC_MAX_PRIVATE       (VGIC_NR_PRIVATE_IRQS - 1)
+#define VGIC_MAX_SPI           1019
+#define VGIC_MAX_RESERVED      1023
+#define VGIC_MIN_LPI           8192
+
+enum vgic_type {
+       VGIC_V2,                /* Good ol' GICv2 */
+       VGIC_V3,                /* New fancy GICv3 */
+};
+
+/* same for all guests, as depending only on the _host's_ GIC model */
+struct vgic_global {
+       /* type of the host GIC */
+       enum vgic_type          type;
+
+       /* Physical address of vgic virtual cpu interface */
+       phys_addr_t             vcpu_base;
+
+       /* virtual control interface mapping */
+       void __iomem            *vctrl_base;
+
+       /* Number of implemented list registers */
+       int                     nr_lr;
+
+       /* Maintenance IRQ number */
+       unsigned int            maint_irq;
+
+       /* maximum number of VCPUs allowed (GICv2 limits us to 8) */
+       int                     max_gic_vcpus;
+
+       /* Only needed for the legacy KVM_CREATE_IRQCHIP */
+       bool                    can_emulate_gicv2;
+};
+
+extern struct vgic_global kvm_vgic_global_state;
+
+#define VGIC_V2_MAX_LRS                (1 << 6)
+#define VGIC_V3_MAX_LRS                16
+#define VGIC_V3_LR_INDEX(lr)   (VGIC_V3_MAX_LRS - 1 - lr)
+
+enum vgic_irq_config {
+       VGIC_CONFIG_EDGE = 0,
+       VGIC_CONFIG_LEVEL
+};
+
+struct vgic_irq {
+       spinlock_t irq_lock;            /* Protects the content of the struct */
+       struct list_head ap_list;
+
+       struct kvm_vcpu *vcpu;          /* SGIs and PPIs: The VCPU
+                                        * SPIs and LPIs: The VCPU whose ap_list
+                                        * this is queued on.
+                                        */
+
+       struct kvm_vcpu *target_vcpu;   /* The VCPU that this interrupt should
+                                        * be sent to, as a result of the
+                                        * targets reg (v2) or the
+                                        * affinity reg (v3).
+                                        */
+
+       u32 intid;                      /* Guest visible INTID */
+       bool pending;
+       bool line_level;                /* Level only */
+       bool soft_pending;              /* Level only */
+       bool active;                    /* not used for LPIs */
+       bool enabled;
+       bool hw;                        /* Tied to HW IRQ */
+       u32 hwintid;                    /* HW INTID number */
+       union {
+               u8 targets;                     /* GICv2 target VCPUs mask */
+               u32 mpidr;                      /* GICv3 target VCPU */
+       };
+       u8 source;                      /* GICv2 SGIs only */
+       u8 priority;
+       enum vgic_irq_config config;    /* Level or edge */
+};
+
+struct vgic_register_region;
+
+struct vgic_io_device {
+       gpa_t base_addr;
+       struct kvm_vcpu *redist_vcpu;
+       const struct vgic_register_region *regions;
+       int nr_regions;
+       struct kvm_io_device dev;
+};
+
+struct vgic_dist {
+       bool                    in_kernel;
+       bool                    ready;
+       bool                    initialized;
+
+       /* vGIC model the kernel emulates for the guest (GICv2 or GICv3) */
+       u32                     vgic_model;
+
+       int                     nr_spis;
+
+       /* TODO: Consider moving to global state */
+       /* Virtual control interface mapping */
+       void __iomem            *vctrl_base;
+
+       /* base addresses in guest physical address space: */
+       gpa_t                   vgic_dist_base;         /* distributor */
+       union {
+               /* either a GICv2 CPU interface */
+               gpa_t                   vgic_cpu_base;
+               /* or a number of GICv3 redistributor regions */
+               gpa_t                   vgic_redist_base;
+       };
+
+       /* distributor enabled */
+       bool                    enabled;
+
+       struct vgic_irq         *spis;
+
+       struct vgic_io_device   dist_iodev;
+       struct vgic_io_device   *redist_iodevs;
+};
+
+struct vgic_v2_cpu_if {
+       u32             vgic_hcr;
+       u32             vgic_vmcr;
+       u32             vgic_misr;      /* Saved only */
+       u64             vgic_eisr;      /* Saved only */
+       u64             vgic_elrsr;     /* Saved only */
+       u32             vgic_apr;
+       u32             vgic_lr[VGIC_V2_MAX_LRS];
+};
+
+struct vgic_v3_cpu_if {
+#ifdef CONFIG_KVM_ARM_VGIC_V3
+       u32             vgic_hcr;
+       u32             vgic_vmcr;
+       u32             vgic_sre;       /* Restored only, change ignored */
+       u32             vgic_misr;      /* Saved only */
+       u32             vgic_eisr;      /* Saved only */
+       u32             vgic_elrsr;     /* Saved only */
+       u32             vgic_ap0r[4];
+       u32             vgic_ap1r[4];
+       u64             vgic_lr[VGIC_V3_MAX_LRS];
+#endif
+};
+
+struct vgic_cpu {
+       /* CPU vif control registers for world switch */
+       union {
+               struct vgic_v2_cpu_if   vgic_v2;
+               struct vgic_v3_cpu_if   vgic_v3;
+       };
+
+       unsigned int used_lrs;
+       struct vgic_irq private_irqs[VGIC_NR_PRIVATE_IRQS];
+
+       spinlock_t ap_list_lock;        /* Protects the ap_list */
+
+       /*
+        * List of IRQs that this VCPU should consider because they are either
+        * Active or Pending (hence the name; AP list), or because they recently
+        * were one of the two and need to be migrated off this list to another
+        * VCPU.
+        */
+       struct list_head ap_list_head;
+
+       u64 live_lrs;
+};
+
+int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write);
+void kvm_vgic_early_init(struct kvm *kvm);
+int kvm_vgic_create(struct kvm *kvm, u32 type);
+void kvm_vgic_destroy(struct kvm *kvm);
+void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu);
+void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu);
+int kvm_vgic_map_resources(struct kvm *kvm);
+int kvm_vgic_hyp_init(void);
+
+int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
+                       bool level);
+int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, unsigned int intid,
+                              bool level);
+int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq);
+int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq);
+bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq);
+
+int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
+
+#define irqchip_in_kernel(k)   (!!((k)->arch.vgic.in_kernel))
+#define vgic_initialized(k)    ((k)->arch.vgic.initialized)
+#define vgic_ready(k)          ((k)->arch.vgic.ready)
+#define vgic_valid_spi(k, i)   (((i) >= VGIC_NR_PRIVATE_IRQS) && \
+                       ((i) < (k)->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS))
+
+bool kvm_vcpu_has_pending_irqs(struct kvm_vcpu *vcpu);
+void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
+void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu);
+
+#ifdef CONFIG_KVM_ARM_VGIC_V3
+void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg);
+#else
+static inline void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
+{
+}
+#endif
+
+/**
+ * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW
+ *
+ * The host's GIC naturally limits the maximum number of VCPUs a guest
+ * can use.
+ */
+static inline int kvm_vgic_get_max_vcpus(void)
+{
+       return kvm_vgic_global_state.max_gic_vcpus;
+}
+
+#endif /* __ASM_ARM_KVM_VGIC_VGIC_H */
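As a quick illustration of the new interface (the function name below is invented; the kvm pointer is assumed to refer to a VM whose new-style VGIC has been initialized), a guest-visible SPI can be bounds-checked with vgic_valid_spi() before being raised:

static int demo_raise_spi(struct kvm *kvm, unsigned int intid)
{
	if (!vgic_initialized(kvm) || !vgic_valid_spi(kvm, intid))
		return -EINVAL;

	/* cpuid 0 is passed on the assumption that it only matters for
	 * private (SGI/PPI) interrupts; an SPI's routing comes from the
	 * emulated distributor state. */
	return kvm_vgic_inject_irq(kvm, 0, intid, true);
}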
index 846513c..a5ac2ca 100644 (file)
@@ -587,7 +587,6 @@ struct mtd_info;
 
 struct bcma_sflash {
        bool present;
-       u32 window;
        u32 blocksize;
        u16 numblocks;
        u32 size;
index 1fd8fdf..3d9cf32 100644 (file)
@@ -768,6 +768,17 @@ static inline void rq_flush_dcache_pages(struct request *rq)
 }
 #endif
 
+#ifdef CONFIG_PRINTK
+#define vfs_msg(sb, level, fmt, ...)                           \
+       __vfs_msg(sb, level, fmt, ##__VA_ARGS__)
+#else
+#define vfs_msg(sb, level, fmt, ...)                           \
+do {                                                           \
+       no_printk(fmt, ##__VA_ARGS__);                          \
+       __vfs_msg(sb, "", " ");                                 \
+} while (0)
+#endif
+
 extern int blk_register_queue(struct gendisk *disk);
 extern void blk_unregister_queue(struct gendisk *disk);
 extern blk_qc_t generic_make_request(struct bio *bio);
@@ -1660,7 +1671,7 @@ struct block_device_operations {
        int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
        int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
        long (*direct_access)(struct block_device *, sector_t, void __pmem **,
-                       pfn_t *);
+                       pfn_t *, long);
        unsigned int (*check_events) (struct gendisk *disk,
                                      unsigned int clearing);
        /* ->media_changed() is DEPRECATED, use ->check_events() instead */
@@ -1680,6 +1691,8 @@ extern int bdev_read_page(struct block_device *, sector_t, struct page *);
 extern int bdev_write_page(struct block_device *, sector_t, struct page *,
                                                struct writeback_control *);
 extern long bdev_direct_access(struct block_device *, struct blk_dax_ctl *);
+extern int bdev_dax_supported(struct super_block *, int);
+extern bool bdev_dax_capable(struct block_device *);
 #else /* CONFIG_BLOCK */
 
 struct block_device;
index b827e06..146507d 100644 (file)
@@ -51,11 +51,11 @@ static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
        return ceph_frag_make(newbits,
                         ceph_frag_value(f) | (i << (24 - newbits)));
 }
-static inline int ceph_frag_is_leftmost(__u32 f)
+static inline bool ceph_frag_is_leftmost(__u32 f)
 {
        return ceph_frag_value(f) == 0;
 }
-static inline int ceph_frag_is_rightmost(__u32 f)
+static inline bool ceph_frag_is_rightmost(__u32 f)
 {
        return ceph_frag_value(f) == ceph_frag_mask(f);
 }
index 37f28bf..dfce616 100644 (file)
@@ -153,8 +153,9 @@ struct ceph_dir_layout {
 
 /* watch-notify operations */
 enum {
-  WATCH_NOTIFY                         = 1, /* notifying watcher */
-  WATCH_NOTIFY_COMPLETE                        = 2, /* notifier notified when done */
+       CEPH_WATCH_EVENT_NOTIFY           = 1, /* notifying watcher */
+       CEPH_WATCH_EVENT_NOTIFY_COMPLETE  = 2, /* notifier notified when done */
+       CEPH_WATCH_EVENT_DISCONNECT       = 3, /* we were disconnected */
 };
 
 
@@ -207,6 +208,8 @@ struct ceph_mon_subscribe_ack {
        struct ceph_fsid fsid;
 } __attribute__ ((packed));
 
+#define CEPH_FS_CLUSTER_ID_NONE  -1
+
 /*
  * mdsmap flags
  */
@@ -344,6 +347,18 @@ extern const char *ceph_mds_op_name(int op);
 #define CEPH_XATTR_REPLACE (1 << 1)
 #define CEPH_XATTR_REMOVE  (1 << 31)
 
+/*
+ * readdir request flags.
+ */
+#define CEPH_READDIR_REPLY_BITFLAGS    (1<<0)
+
+/*
+ * readdir reply flags.
+ */
+#define CEPH_READDIR_FRAG_END          (1<<0)
+#define CEPH_READDIR_FRAG_COMPLETE     (1<<8)
+#define CEPH_READDIR_HASH_ORDER                (1<<9)
+
 union ceph_mds_request_args {
        struct {
                __le32 mask;                 /* CEPH_CAP_* */
@@ -361,6 +376,7 @@ union ceph_mds_request_args {
                __le32 frag;                 /* which dir fragment */
                __le32 max_entries;          /* how many dentries to grab */
                __le32 max_bytes;
+               __le16 flags;
        } __attribute__ ((packed)) readdir;
        struct {
                __le32 mode;
index a6ef9cc..19e9932 100644 (file)
@@ -47,7 +47,7 @@ static inline void ceph_decode_copy(void **p, void *pv, size_t n)
 /*
  * bounds check input.
  */
-static inline int ceph_has_room(void **p, void *end, size_t n)
+static inline bool ceph_has_room(void **p, void *end, size_t n)
 {
        return end >= *p && n <= end - *p;
 }
index db92a8d..690985d 100644 (file)
@@ -180,6 +180,63 @@ static inline int calc_pages_for(u64 off, u64 len)
                (off >> PAGE_SHIFT);
 }
 
+/*
+ * These are not meant to be generic - an integer key is assumed.
+ */
+#define DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld)            \
+static void insert_##name(struct rb_root *root, type *t)               \
+{                                                                      \
+       struct rb_node **n = &root->rb_node;                            \
+       struct rb_node *parent = NULL;                                  \
+                                                                       \
+       BUG_ON(!RB_EMPTY_NODE(&t->nodefld));                            \
+                                                                       \
+       while (*n) {                                                    \
+               type *cur = rb_entry(*n, type, nodefld);                \
+                                                                       \
+               parent = *n;                                            \
+               if (t->keyfld < cur->keyfld)                            \
+                       n = &(*n)->rb_left;                             \
+               else if (t->keyfld > cur->keyfld)                       \
+                       n = &(*n)->rb_right;                            \
+               else                                                    \
+                       BUG();                                          \
+       }                                                               \
+                                                                       \
+       rb_link_node(&t->nodefld, parent, n);                           \
+       rb_insert_color(&t->nodefld, root);                             \
+}                                                                      \
+static void erase_##name(struct rb_root *root, type *t)                        \
+{                                                                      \
+       BUG_ON(RB_EMPTY_NODE(&t->nodefld));                             \
+       rb_erase(&t->nodefld, root);                                    \
+       RB_CLEAR_NODE(&t->nodefld);                                     \
+}
+
+#define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld)             \
+static type *lookup_##name(struct rb_root *root,                       \
+                          typeof(((type *)0)->keyfld) key)             \
+{                                                                      \
+       struct rb_node *n = root->rb_node;                              \
+                                                                       \
+       while (n) {                                                     \
+               type *cur = rb_entry(n, type, nodefld);                 \
+                                                                       \
+               if (key < cur->keyfld)                                  \
+                       n = n->rb_left;                                 \
+               else if (key > cur->keyfld)                             \
+                       n = n->rb_right;                                \
+               else                                                    \
+                       return cur;                                     \
+       }                                                               \
+                                                                       \
+       return NULL;                                                    \
+}
+
+#define DEFINE_RB_FUNCS(name, type, keyfld, nodefld)                   \
+DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld)                    \
+DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld)
+
 extern struct kmem_cache *ceph_inode_cachep;
 extern struct kmem_cache *ceph_cap_cachep;
 extern struct kmem_cache *ceph_cap_flush_cachep;
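The three DEFINE_RB_* macros above stamp out static insert_<name>(), erase_<name>() and lookup_<name>() helpers for an rb_root keyed by a single integer field. A hypothetical user (all names invented, linux/rbtree.h and linux/types.h assumed included) looks like this:

struct demo_request {
	u64 tid;                    /* keyfld: unique integer key */
	struct rb_node node;        /* nodefld: embedded rb_node */
};

DEFINE_RB_FUNCS(demo_request, struct demo_request, tid, node)

static void demo_track_request(struct rb_root *root, struct demo_request *req)
{
	RB_CLEAR_NODE(&req->node);          /* insert_*() BUG()s on a non-empty node */
	insert_demo_request(root, req);     /* and on a duplicate tid */

	if (lookup_demo_request(root, req->tid) == req)
		erase_demo_request(root, req);
}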
index e230e7e..e2a92df 100644 (file)
@@ -39,20 +39,31 @@ struct ceph_mon_request {
        ceph_monc_request_func_t do_request;
 };
 
+typedef void (*ceph_monc_callback_t)(struct ceph_mon_generic_request *);
+
 /*
  * ceph_mon_generic_request is being used for the statfs and
  * mon_get_version requests which are being done a bit differently
  * because we need to get data back to the caller
  */
 struct ceph_mon_generic_request {
+       struct ceph_mon_client *monc;
        struct kref kref;
        u64 tid;
        struct rb_node node;
        int result;
-       void *buf;
+
        struct completion completion;
+       ceph_monc_callback_t complete_cb;
+       u64 private_data;          /* r_tid/linger_id */
+
        struct ceph_msg *request;  /* original request */
        struct ceph_msg *reply;    /* and reply */
+
+       union {
+               struct ceph_statfs *st;
+               u64 newest;
+       } u;
 };
 
 struct ceph_mon_client {
@@ -77,7 +88,6 @@ struct ceph_mon_client {
 
        /* pending generic requests */
        struct rb_root generic_request_tree;
-       int num_generic_requests;
        u64 last_tid;
 
        /* subs, indexed with CEPH_SUB_* */
@@ -86,6 +96,7 @@ struct ceph_mon_client {
                bool want;
                u32 have; /* epoch */
        } subs[3];
+       int fs_cluster_id; /* "mdsmap.<id>" sub */
 
 #ifdef CONFIG_DEBUG_FS
        struct dentry *debugfs_file;
@@ -116,16 +127,18 @@ extern const char *ceph_sub_str[];
 bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch,
                        bool continuous);
 void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch);
+void ceph_monc_renew_subs(struct ceph_mon_client *monc);
 
-extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
 extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
                                 unsigned long timeout);
 
 extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
                               struct ceph_statfs *buf);
 
-extern int ceph_monc_do_get_version(struct ceph_mon_client *monc,
-                                   const char *what, u64 *newest);
+int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
+                         u64 *newest);
+int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
+                               ceph_monc_callback_t cb, u64 private_data);
 
 extern int ceph_monc_open_session(struct ceph_mon_client *monc);
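The new ceph_monc_get_version_async() pairs with the ceph_mon_generic_request callback typedef above: the monitor client fills in result and u.newest and then invokes the callback. A sketch of the call pattern (callback and wrapper names are invented; "osdmap" is used as the map name):

static void demo_osdmap_version_cb(struct ceph_mon_generic_request *req)
{
	if (req->result)
		pr_err("get_version failed: %d\n", req->result);
	else
		pr_info("caller tag %llu: newest osdmap epoch %llu\n",
			req->private_data, req->u.newest);
}

static int demo_query_newest_osdmap(struct ceph_mon_client *monc, u64 tag)
{
	/* tag travels in private_data so the callback can identify the caller */
	return ceph_monc_get_version_async(monc, "osdmap",
					   demo_osdmap_version_cb, tag);
}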
 
index cbf4609..1b3b6e1 100644 (file)
@@ -20,10 +20,11 @@ struct ceph_osd_client;
 /*
  * completion callback for async writepages
  */
-typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
-                                    struct ceph_msg *);
+typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *);
 typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool);
 
+#define CEPH_HOMELESS_OSD      -1
+
 /* a given osd we're communicating with */
 struct ceph_osd {
        atomic_t o_ref;
@@ -32,16 +33,15 @@ struct ceph_osd {
        int o_incarnation;
        struct rb_node o_node;
        struct ceph_connection o_con;
-       struct list_head o_requests;
-       struct list_head o_linger_requests;
+       struct rb_root o_requests;
+       struct rb_root o_linger_requests;
        struct list_head o_osd_lru;
        struct ceph_auth_handshake o_auth;
        unsigned long lru_ttl;
-       int o_marked_for_keepalive;
        struct list_head o_keepalive_item;
+       struct mutex lock;
 };
 
-
 #define CEPH_OSD_SLAB_OPS      2
 #define CEPH_OSD_MAX_OPS       16
 
@@ -104,15 +104,21 @@ struct ceph_osd_req_op {
                        struct ceph_osd_data response_data;
                        __u8 class_len;
                        __u8 method_len;
-                       __u8 argc;
+                       u32 indata_len;
                } cls;
                struct {
                        u64 cookie;
-                       u64 ver;
-                       u32 prot_ver;
-                       u32 timeout;
-                       __u8 flag;
+                       __u8 op;           /* CEPH_OSD_WATCH_OP_ */
+                       u32 gen;
                } watch;
+               struct {
+                       struct ceph_osd_data request_data;
+               } notify_ack;
+               struct {
+                       u64 cookie;
+                       struct ceph_osd_data request_data;
+                       struct ceph_osd_data response_data;
+               } notify;
                struct {
                        u64 expected_object_size;
                        u64 expected_write_size;
@@ -120,60 +126,73 @@ struct ceph_osd_req_op {
        };
 };
 
+struct ceph_osd_request_target {
+       struct ceph_object_id base_oid;
+       struct ceph_object_locator base_oloc;
+       struct ceph_object_id target_oid;
+       struct ceph_object_locator target_oloc;
+
+       struct ceph_pg pgid;
+       u32 pg_num;
+       u32 pg_num_mask;
+       struct ceph_osds acting;
+       struct ceph_osds up;
+       int size;
+       int min_size;
+       bool sort_bitwise;
+
+       unsigned int flags;                /* CEPH_OSD_FLAG_* */
+       bool paused;
+
+       int osd;
+};
+
 /* an in-flight request */
 struct ceph_osd_request {
        u64             r_tid;              /* unique for this client */
        struct rb_node  r_node;
-       struct list_head r_req_lru_item;
-       struct list_head r_osd_item;
-       struct list_head r_linger_item;
-       struct list_head r_linger_osd_item;
+       struct rb_node  r_mc_node;          /* map check */
        struct ceph_osd *r_osd;
-       struct ceph_pg   r_pgid;
-       int              r_pg_osds[CEPH_PG_MAX_SIZE];
-       int              r_num_pg_osds;
+
+       struct ceph_osd_request_target r_t;
+#define r_base_oid     r_t.base_oid
+#define r_base_oloc    r_t.base_oloc
+#define r_flags                r_t.flags
 
        struct ceph_msg  *r_request, *r_reply;
-       int               r_flags;     /* any additional flags for the osd */
        u32               r_sent;      /* >0 if r_request is sending/sent */
 
        /* request osd ops array  */
        unsigned int            r_num_ops;
 
-       /* these are updated on each send */
-       __le32           *r_request_osdmap_epoch;
-       __le32           *r_request_flags;
-       __le64           *r_request_pool;
-       void             *r_request_pgid;
-       __le32           *r_request_attempts;
-       bool              r_paused;
-       struct ceph_eversion *r_request_reassert_version;
-
        int               r_result;
-       int               r_got_reply;
-       int               r_linger;
+       bool              r_got_reply;
 
        struct ceph_osd_client *r_osdc;
        struct kref       r_kref;
        bool              r_mempool;
-       struct completion r_completion, r_safe_completion;
+       struct completion r_completion;
+       struct completion r_safe_completion;  /* fsync waiter */
        ceph_osdc_callback_t r_callback;
        ceph_osdc_unsafe_callback_t r_unsafe_callback;
-       struct ceph_eversion r_reassert_version;
        struct list_head  r_unsafe_item;
 
        struct inode *r_inode;                /* for use by callbacks */
        void *r_priv;                         /* ditto */
 
-       struct ceph_object_locator r_base_oloc;
-       struct ceph_object_id r_base_oid;
-       struct ceph_object_locator r_target_oloc;
-       struct ceph_object_id r_target_oid;
-
-       u64               r_snapid;
-       unsigned long     r_stamp;            /* send OR check time */
+       /* set by submitter */
+       u64 r_snapid;                         /* for reads, CEPH_NOSNAP o/w */
+       struct ceph_snap_context *r_snapc;    /* for writes */
+       struct timespec r_mtime;              /* ditto */
+       u64 r_data_offset;                    /* ditto */
+       bool r_linger;                        /* don't resend on failure */
 
-       struct ceph_snap_context *r_snapc;    /* snap context for writes */
+       /* internal */
+       unsigned long r_stamp;                /* jiffies, send or check time */
+       int r_attempts;
+       struct ceph_eversion r_replay_version; /* aka reassert_version */
+       u32 r_last_force_resend;
+       u32 r_map_dne_bound;
 
        struct ceph_osd_req_op r_ops[];
 };
@@ -182,44 +201,70 @@ struct ceph_request_redirect {
        struct ceph_object_locator oloc;
 };
 
-struct ceph_osd_event {
-       u64 cookie;
-       int one_shot;
+typedef void (*rados_watchcb2_t)(void *arg, u64 notify_id, u64 cookie,
+                                u64 notifier_id, void *data, size_t data_len);
+typedef void (*rados_watcherrcb_t)(void *arg, u64 cookie, int err);
+
+struct ceph_osd_linger_request {
        struct ceph_osd_client *osdc;
-       void (*cb)(u64, u64, u8, void *);
-       void *data;
-       struct rb_node node;
-       struct list_head osd_node;
+       u64 linger_id;
+       bool committed;
+       bool is_watch;                  /* watch or notify */
+
+       struct ceph_osd *osd;
+       struct ceph_osd_request *reg_req;
+       struct ceph_osd_request *ping_req;
+       unsigned long ping_sent;
+       unsigned long watch_valid_thru;
+       struct list_head pending_lworks;
+
+       struct ceph_osd_request_target t;
+       u32 last_force_resend;
+       u32 map_dne_bound;
+
+       struct timespec mtime;
+
        struct kref kref;
-};
+       struct mutex lock;
+       struct rb_node node;            /* osd */
+       struct rb_node osdc_node;       /* osdc */
+       struct rb_node mc_node;         /* map check */
+       struct list_head scan_item;
+
+       struct completion reg_commit_wait;
+       struct completion notify_finish_wait;
+       int reg_commit_error;
+       int notify_finish_error;
+       int last_error;
+
+       u32 register_gen;
+       u64 notify_id;
+
+       rados_watchcb2_t wcb;
+       rados_watcherrcb_t errcb;
+       void *data;
 
-struct ceph_osd_event_work {
-       struct work_struct work;
-       struct ceph_osd_event *event;
-        u64 ver;
-        u64 notify_id;
-        u8 opcode;
+       struct page ***preply_pages;
+       size_t *preply_len;
 };
 
 struct ceph_osd_client {
        struct ceph_client     *client;
 
        struct ceph_osdmap     *osdmap;       /* current map */
-       struct rw_semaphore    map_sem;
-       struct completion      map_waiters;
-       u64                    last_requested_map;
+       struct rw_semaphore    lock;
 
-       struct mutex           request_mutex;
        struct rb_root         osds;          /* osds */
        struct list_head       osd_lru;       /* idle osds */
-       u64                    timeout_tid;   /* tid of timeout triggering rq */
-       u64                    last_tid;      /* tid of last request */
-       struct rb_root         requests;      /* pending requests */
-       struct list_head       req_lru;       /* in-flight lru */
-       struct list_head       req_unsent;    /* unsent/need-resend queue */
-       struct list_head       req_notarget;  /* map to no osd */
-       struct list_head       req_linger;    /* lingering requests */
-       int                    num_requests;
+       spinlock_t             osd_lru_lock;
+       struct ceph_osd        homeless_osd;
+       atomic64_t             last_tid;      /* tid of last request */
+       u64                    last_linger_id;
+       struct rb_root         linger_requests; /* lingering requests */
+       struct rb_root         map_checks;
+       struct rb_root         linger_map_checks;
+       atomic_t               num_requests;
+       atomic_t               num_homeless;
        struct delayed_work    timeout_work;
        struct delayed_work    osds_timeout_work;
 #ifdef CONFIG_DEBUG_FS
@@ -231,13 +276,14 @@ struct ceph_osd_client {
        struct ceph_msgpool     msgpool_op;
        struct ceph_msgpool     msgpool_op_reply;
 
-       spinlock_t              event_lock;
-       struct rb_root          event_tree;
-       u64                     event_count;
-
        struct workqueue_struct *notify_wq;
 };
 
+static inline bool ceph_osdmap_flag(struct ceph_osd_client *osdc, int flag)
+{
+       return osdc->osdmap->flags & flag;
+}
+
 extern int ceph_osdc_setup(void);
 extern void ceph_osdc_cleanup(void);
 
@@ -271,9 +317,6 @@ extern void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req,
 extern struct ceph_osd_data *osd_req_op_extent_osd_data(
                                        struct ceph_osd_request *osd_req,
                                        unsigned int which);
-extern struct ceph_osd_data *osd_req_op_cls_response_data(
-                                       struct ceph_osd_request *osd_req,
-                                       unsigned int which);
 
 extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *,
                                        unsigned int which,
@@ -309,9 +352,6 @@ extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req,
 extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
                                 u16 opcode, const char *name, const void *value,
                                 size_t size, u8 cmp_op, u8 cmp_mode);
-extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
-                                       unsigned int which, u16 opcode,
-                                       u64 cookie, u64 version, int flag);
 extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
                                       unsigned int which,
                                       u64 expected_object_size,
@@ -322,11 +362,7 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *
                                               unsigned int num_ops,
                                               bool use_mempool,
                                               gfp_t gfp_flags);
-
-extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
-                                   struct ceph_snap_context *snapc,
-                                   u64 snap_id,
-                                   struct timespec *mtime);
+int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp);
 
 extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
                                      struct ceph_file_layout *layout,
@@ -338,9 +374,6 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
                                      u32 truncate_seq, u64 truncate_size,
                                      bool use_mempool);
 
-extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
-                                        struct ceph_osd_request *req);
-
 extern void ceph_osdc_get_request(struct ceph_osd_request *req);
 extern void ceph_osdc_put_request(struct ceph_osd_request *req);
 
@@ -353,6 +386,7 @@ extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
 extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
 
 extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc);
+void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc);
 
 extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
                               struct ceph_vino vino,
@@ -371,11 +405,33 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
                                struct timespec *mtime,
                                struct page **pages, int nr_pages);
 
-/* watch/notify events */
-extern int ceph_osdc_create_event(struct ceph_osd_client *osdc,
-                                 void (*event_cb)(u64, u64, u8, void *),
-                                 void *data, struct ceph_osd_event **pevent);
-extern void ceph_osdc_cancel_event(struct ceph_osd_event *event);
-extern void ceph_osdc_put_event(struct ceph_osd_event *event);
+/* watch/notify */
+struct ceph_osd_linger_request *
+ceph_osdc_watch(struct ceph_osd_client *osdc,
+               struct ceph_object_id *oid,
+               struct ceph_object_locator *oloc,
+               rados_watchcb2_t wcb,
+               rados_watcherrcb_t errcb,
+               void *data);
+int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
+                     struct ceph_osd_linger_request *lreq);
+
+int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
+                        struct ceph_object_id *oid,
+                        struct ceph_object_locator *oloc,
+                        u64 notify_id,
+                        u64 cookie,
+                        void *payload,
+                        size_t payload_len);
+int ceph_osdc_notify(struct ceph_osd_client *osdc,
+                    struct ceph_object_id *oid,
+                    struct ceph_object_locator *oloc,
+                    void *payload,
+                    size_t payload_len,
+                    u32 timeout,
+                    struct page ***preply_pages,
+                    size_t *preply_len);
+int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
+                         struct ceph_osd_linger_request *lreq);
 #endif
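The event-based watch API is gone; watches and notifies are now long-lived "linger" requests driven by the two callback types declared above. The sketch below shows the expected call pattern; every identifier other than the ceph_osdc_* calls and the ceph types is invented, the oid/oloc are assumed to be initialised elsewhere, and an ERR_PTR-style return from ceph_osdc_watch() is assumed.

struct demo_watcher {
	struct ceph_osd_client *osdc;
	struct ceph_object_id oid;
	struct ceph_object_locator oloc;
	struct ceph_osd_linger_request *handle;
};

static void demo_watch_cb(void *arg, u64 notify_id, u64 cookie,
			  u64 notifier_id, void *data, size_t data_len)
{
	struct demo_watcher *w = arg;

	/* Ack the notify so the notifier's ceph_osdc_notify() can complete. */
	ceph_osdc_notify_ack(w->osdc, &w->oid, &w->oloc, notify_id, cookie,
			     NULL, 0);
}

static void demo_watch_errcb(void *arg, u64 cookie, int err)
{
	/* Watch lost (e.g. connection trouble); a real user would re-arm here. */
}

static int demo_watch_start(struct demo_watcher *w)
{
	w->handle = ceph_osdc_watch(w->osdc, &w->oid, &w->oloc,
				    demo_watch_cb, demo_watch_errcb, w);
	if (IS_ERR(w->handle))
		return PTR_ERR(w->handle);
	return 0;
}

static void demo_watch_stop(struct demo_watcher *w)
{
	ceph_osdc_unwatch(w->osdc, w->handle);
}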
 
index e55c08b..9ccf4db 100644 (file)
@@ -24,21 +24,29 @@ struct ceph_pg {
        uint32_t seed;
 };
 
-#define CEPH_POOL_FLAG_HASHPSPOOL  1
+int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs);
+
+#define CEPH_POOL_FLAG_HASHPSPOOL      (1ULL << 0) /* hash pg seed and pool id
+                                                      together */
+#define CEPH_POOL_FLAG_FULL            (1ULL << 1) /* pool is full */
 
 struct ceph_pg_pool_info {
        struct rb_node node;
        s64 id;
-       u8 type;
+       u8 type; /* CEPH_POOL_TYPE_* */
        u8 size;
+       u8 min_size;
        u8 crush_ruleset;
        u8 object_hash;
+       u32 last_force_request_resend;
        u32 pg_num, pgp_num;
        int pg_num_mask, pgp_num_mask;
        s64 read_tier;
        s64 write_tier; /* wins for read+write ops */
-       u64 flags;
+       u64 flags; /* CEPH_POOL_FLAG_* */
        char *name;
+
+       bool was_full;  /* for handle_one_map() */
 };
 
 static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
@@ -57,6 +65,22 @@ struct ceph_object_locator {
        s64 pool;
 };
 
+static inline void ceph_oloc_init(struct ceph_object_locator *oloc)
+{
+       oloc->pool = -1;
+}
+
+static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc)
+{
+       return oloc->pool == -1;
+}
+
+static inline void ceph_oloc_copy(struct ceph_object_locator *dest,
+                                 const struct ceph_object_locator *src)
+{
+       dest->pool = src->pool;
+}
+
 /*
  * Maximum supported by kernel client object name length
  *
@@ -64,11 +88,47 @@ struct ceph_object_locator {
  */
 #define CEPH_MAX_OID_NAME_LEN 100
 
+/*
+ * 51-char inline_name is long enough for all cephfs and all but one
+ * rbd requests: <imgname> in "<imgname>.rbd"/"rbd_id.<imgname>" can be
+ * arbitrarily long (~PAGE_SIZE).  It's done once during rbd map; all
+ * other rbd requests fit into inline_name.
+ *
+ * Makes ceph_object_id 64 bytes on 64-bit.
+ */
+#define CEPH_OID_INLINE_LEN 52
+
+/*
+ * Both inline and external buffers have space for a NUL-terminator,
+ * which is carried around.  It's not required though - RADOS object
+ * names don't have to be NUL-terminated and may contain NULs.
+ */
 struct ceph_object_id {
-       char name[CEPH_MAX_OID_NAME_LEN];
+       char *name;
+       char inline_name[CEPH_OID_INLINE_LEN];
        int name_len;
 };
 
+static inline void ceph_oid_init(struct ceph_object_id *oid)
+{
+       oid->name = oid->inline_name;
+       oid->name_len = 0;
+}
+
+static inline bool ceph_oid_empty(const struct ceph_object_id *oid)
+{
+       return oid->name == oid->inline_name && !oid->name_len;
+}
+
+void ceph_oid_copy(struct ceph_object_id *dest,
+                  const struct ceph_object_id *src);
+__printf(2, 3)
+void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...);
+__printf(3, 4)
+int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
+                    const char *fmt, ...);
+void ceph_oid_destroy(struct ceph_object_id *oid);
+
 struct ceph_pg_mapping {
        struct rb_node node;
        struct ceph_pg pgid;
@@ -87,7 +147,6 @@ struct ceph_pg_mapping {
 struct ceph_osdmap {
        struct ceph_fsid fsid;
        u32 epoch;
-       u32 mkfs_epoch;
        struct ceph_timespec created, modified;
 
        u32 flags;         /* CEPH_OSDMAP_* */
@@ -113,52 +172,23 @@ struct ceph_osdmap {
        int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3];
 };
 
-static inline void ceph_oid_set_name(struct ceph_object_id *oid,
-                                    const char *name)
-{
-       int len;
-
-       len = strlen(name);
-       if (len > sizeof(oid->name)) {
-               WARN(1, "ceph_oid_set_name '%s' len %d vs %zu, truncating\n",
-                    name, len, sizeof(oid->name));
-               len = sizeof(oid->name);
-       }
-
-       memcpy(oid->name, name, len);
-       oid->name_len = len;
-}
-
-static inline void ceph_oid_copy(struct ceph_object_id *dest,
-                                struct ceph_object_id *src)
-{
-       BUG_ON(src->name_len > sizeof(dest->name));
-       memcpy(dest->name, src->name, src->name_len);
-       dest->name_len = src->name_len;
-}
-
-static inline int ceph_osd_exists(struct ceph_osdmap *map, int osd)
+static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)
 {
        return osd >= 0 && osd < map->max_osd &&
               (map->osd_state[osd] & CEPH_OSD_EXISTS);
 }
 
-static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
+static inline bool ceph_osd_is_up(struct ceph_osdmap *map, int osd)
 {
        return ceph_osd_exists(map, osd) &&
               (map->osd_state[osd] & CEPH_OSD_UP);
 }
 
-static inline int ceph_osd_is_down(struct ceph_osdmap *map, int osd)
+static inline bool ceph_osd_is_down(struct ceph_osdmap *map, int osd)
 {
        return !ceph_osd_is_up(map, osd);
 }
 
-static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
-{
-       return map && (map->flags & flag);
-}
-
 extern char *ceph_osdmap_state_str(char *str, int len, int state);
 extern u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd);
 
@@ -192,28 +222,59 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid)
        return 0;
 }
 
+struct ceph_osdmap *ceph_osdmap_alloc(void);
 extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end);
-extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
-                                           struct ceph_osdmap *map,
-                                           struct ceph_messenger *msgr);
+struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
+                                            struct ceph_osdmap *map);
 extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
 
+struct ceph_osds {
+       int osds[CEPH_PG_MAX_SIZE];
+       int size;
+       int primary; /* id, NOT index */
+};
+
+static inline void ceph_osds_init(struct ceph_osds *set)
+{
+       set->size = 0;
+       set->primary = -1;
+}
+
+void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src);
+
+bool ceph_is_new_interval(const struct ceph_osds *old_acting,
+                         const struct ceph_osds *new_acting,
+                         const struct ceph_osds *old_up,
+                         const struct ceph_osds *new_up,
+                         int old_size,
+                         int new_size,
+                         int old_min_size,
+                         int new_min_size,
+                         u32 old_pg_num,
+                         u32 new_pg_num,
+                         bool old_sort_bitwise,
+                         bool new_sort_bitwise,
+                         const struct ceph_pg *pgid);
+bool ceph_osds_changed(const struct ceph_osds *old_acting,
+                      const struct ceph_osds *new_acting,
+                      bool any_change);
+
 /* calculate mapping of a file extent to an object */
 extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
                                         u64 off, u64 len,
                                         u64 *bno, u64 *oxoff, u64 *oxlen);
 
-/* calculate mapping of object to a placement group */
-extern int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
-                              struct ceph_object_locator *oloc,
-                              struct ceph_object_id *oid,
-                              struct ceph_pg *pg_out);
-
-extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap,
-                              struct ceph_pg pgid,
-                              int *osds, int *primary);
-extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
-                               struct ceph_pg pgid);
+int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
+                             struct ceph_object_id *oid,
+                             struct ceph_object_locator *oloc,
+                             struct ceph_pg *raw_pgid);
+
+void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
+                              const struct ceph_pg *raw_pgid,
+                              struct ceph_osds *up,
+                              struct ceph_osds *acting);
+int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
+                             const struct ceph_pg *raw_pgid);
 
 extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map,
                                                    u64 id);
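struct ceph_object_id now owns its name buffer: ceph_oid_init() points name at the 52-byte inline buffer, ceph_oid_aprintf() falls back to an external allocation only when the formatted name does not fit, and ceph_oid_destroy() frees only that external case. A minimal sketch (function name and name format are arbitrary examples):

static int demo_build_oid(struct ceph_object_id *oid, u64 id)
{
	int ret;

	ceph_oid_init(oid);                /* name = inline_name, name_len = 0 */

	ret = ceph_oid_aprintf(oid, GFP_KERNEL, "demo_obj.%llu", id);
	if (ret)
		return ret;                /* presumably -ENOMEM on allocation failure */

	/* ... hand oid to an osd request ... */

	ceph_oid_destroy(oid);             /* frees only an external name buffer */
	return 0;
}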
index 2f822dc..5c0da61 100644 (file)
@@ -114,8 +114,8 @@ struct ceph_object_layout {
  * compound epoch+version, used by storage layer to serialize mutations
  */
 struct ceph_eversion {
-       __le32 epoch;
        __le64 version;
+       __le32 epoch;
 } __attribute__ ((packed));
 
 /*
@@ -153,6 +153,11 @@ extern const char *ceph_osd_state_name(int s);
 #define CEPH_OSDMAP_NOIN     (1<<8)  /* block osd auto mark-in */
 #define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */
 #define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */
+#define CEPH_OSDMAP_NOSCRUB  (1<<11) /* block periodic scrub */
+#define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block periodic deep-scrub */
+#define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */
+#define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */
+#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */
 
 /*
  * The error code to return when an OSD can't handle a write
@@ -389,6 +394,13 @@ enum {
        CEPH_OSD_FLAG_SKIPRWLOCKS =   0x10000,  /* skip rw locks */
        CEPH_OSD_FLAG_IGNORE_OVERLAY = 0x20000, /* ignore pool overlay */
        CEPH_OSD_FLAG_FLUSH =         0x40000,  /* this is part of flush */
+       CEPH_OSD_FLAG_MAP_SNAP_CLONE = 0x80000,  /* map snap direct to clone id */
+       CEPH_OSD_FLAG_ENFORCE_SNAPC   = 0x100000,  /* use snapc provided even if
+                                                     pool uses pool snaps */
+       CEPH_OSD_FLAG_REDIRECTED   = 0x200000,  /* op has been redirected */
+       CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000,  /* redirect bit is authoritative */
+       CEPH_OSD_FLAG_FULL_TRY =    0x800000,  /* try op despite full flag */
+       CEPH_OSD_FLAG_FULL_FORCE = 0x1000000,  /* force op despite full flag */
 };
 
 enum {
@@ -415,7 +427,17 @@ enum {
        CEPH_OSD_CMPXATTR_MODE_U64    = 2
 };
 
-#define RADOS_NOTIFY_VER       1
+enum {
+       CEPH_OSD_WATCH_OP_UNWATCH = 0,
+       CEPH_OSD_WATCH_OP_LEGACY_WATCH = 1,
+       /* note: use only ODD ids to prevent pre-giant code from
+          interpreting the op as UNWATCH */
+       CEPH_OSD_WATCH_OP_WATCH = 3,
+       CEPH_OSD_WATCH_OP_RECONNECT = 5,
+       CEPH_OSD_WATCH_OP_PING = 7,
+};
+
+const char *ceph_osd_watch_op_name(int o);
 
 /*
  * an individual object operation.  each may be accompanied by some data
@@ -450,9 +472,13 @@ struct ceph_osd_op {
                } __attribute__ ((packed)) snap;
                struct {
                        __le64 cookie;
-                       __le64 ver;
-                       __u8 flag;      /* 0 = unwatch, 1 = watch */
+                       __le64 ver;     /* no longer used */
+                       __u8 op;        /* CEPH_OSD_WATCH_OP_* */
+                       __le32 gen;     /* registration generation */
                } __attribute__ ((packed)) watch;
+               struct {
+                       __le64 cookie;
+               } __attribute__ ((packed)) notify;
                struct {
                        __le64 offset, length;
                        __le64 src_offset;
index 982a6c4..43d5f0b 100644 (file)
@@ -3,45 +3,62 @@
 
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/radix-tree.h>
 #include <asm/pgtable.h>
 
+/* We use lowest available exceptional entry bit for locking */
+#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
+
 ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *,
                  get_block_t, dio_iodone_t, int flags);
-int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size);
 int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
 int dax_truncate_page(struct inode *, loff_t from, get_block_t);
-int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
-               dax_iodone_t);
-int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
-               dax_iodone_t);
+int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
+int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
+int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
+void dax_wake_mapping_entry_waiter(struct address_space *mapping,
+                                  pgoff_t index, bool wake_all);
 
 #ifdef CONFIG_FS_DAX
 struct page *read_dax_sector(struct block_device *bdev, sector_t n);
+void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index);
+int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
+               unsigned int offset, unsigned int length);
 #else
 static inline struct page *read_dax_sector(struct block_device *bdev,
                sector_t n)
 {
        return ERR_PTR(-ENXIO);
 }
+/* Shouldn't ever be called when dax is disabled. */
+static inline void dax_unlock_mapping_entry(struct address_space *mapping,
+                                           pgoff_t index)
+{
+       BUG();
+}
+static inline int __dax_zero_page_range(struct block_device *bdev,
+               sector_t sector, unsigned int offset, unsigned int length)
+{
+       return -ENXIO;
+}
 #endif
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
 int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
-                               unsigned int flags, get_block_t, dax_iodone_t);
+                               unsigned int flags, get_block_t);
 int __dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
-                               unsigned int flags, get_block_t, dax_iodone_t);
+                               unsigned int flags, get_block_t);
 #else
 static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
-                               pmd_t *pmd, unsigned int flags, get_block_t gb,
-                               dax_iodone_t di)
+                               pmd_t *pmd, unsigned int flags, get_block_t gb)
 {
        return VM_FAULT_FALLBACK;
 }
 #define __dax_pmd_fault dax_pmd_fault
 #endif
 int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
-#define dax_mkwrite(vma, vmf, gb, iod)         dax_fault(vma, vmf, gb, iod)
-#define __dax_mkwrite(vma, vmf, gb, iod)       __dax_fault(vma, vmf, gb, iod)
+#define dax_mkwrite(vma, vmf, gb)      dax_fault(vma, vmf, gb)
+#define __dax_mkwrite(vma, vmf, gb)    __dax_fault(vma, vmf, gb)
 
 static inline bool vma_is_dax(struct vm_area_struct *vma)
 {
index f8506e8..484c879 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/cache.h>
 #include <linux/rcupdate.h>
 #include <linux/lockref.h>
+#include <linux/stringhash.h>
 
 struct path;
 struct vfsmount;
@@ -52,9 +53,6 @@ struct qstr {
 };
 
 #define QSTR_INIT(n,l) { { { .len = l } }, .name = n }
-#define hashlen_hash(hashlen) ((u32) (hashlen))
-#define hashlen_len(hashlen)  ((u32)((hashlen) >> 32))
-#define hashlen_create(hash,len) (((u64)(len)<<32)|(u32)(hash))
 
 struct dentry_stat_t {
        long nr_dentry;
@@ -65,29 +63,6 @@ struct dentry_stat_t {
 };
 extern struct dentry_stat_t dentry_stat;
 
-/* Name hashing routines. Initial hash value */
-/* Hash courtesy of the R5 hash in reiserfs modulo sign bits */
-#define init_name_hash()               0
-
-/* partial hash update function. Assume roughly 4 bits per character */
-static inline unsigned long
-partial_name_hash(unsigned long c, unsigned long prevhash)
-{
-       return (prevhash + (c << 4) + (c >> 4)) * 11;
-}
-
-/*
- * Finally: cut down the number of bits to a int value (and try to avoid
- * losing bits)
- */
-static inline unsigned long end_name_hash(unsigned long hash)
-{
-       return (unsigned int) hash;
-}
-
-/* Compute the hash for a name string. */
-extern unsigned int full_name_hash(const unsigned char *, unsigned int);
-
 /*
  * Try to keep struct dentry aligned on 64 byte cachelines (this will
  * give reasonable cacheline footprint with larger lines without the
index 5871f29..277ab9a 100644 (file)
 
 #include <linux/errno.h>
 
-struct pts_fs_info;
-
 #ifdef CONFIG_UNIX98_PTYS
 
-/* Look up a pts fs info and get a ref to it */
-struct pts_fs_info *devpts_get_ref(struct inode *, struct file *);
-void devpts_put_ref(struct pts_fs_info *);
+struct pts_fs_info;
+
+struct pts_fs_info *devpts_acquire(struct file *);
+void devpts_release(struct pts_fs_info *);
 
 int devpts_new_index(struct pts_fs_info *);
 void devpts_kill_index(struct pts_fs_info *, int);
index 3fe90d4..4551c6f 100644 (file)
@@ -112,19 +112,24 @@ struct dma_buf_ops {
  * @file: file pointer used for sharing buffers across, and for refcounting.
  * @attachments: list of dma_buf_attachment that denotes all devices attached.
  * @ops: dma_buf_ops associated with this buffer object.
+ * @lock: used internally to serialize list manipulation, attach/detach and vmap/unmap
+ * @vmapping_counter: used internally to refcnt the vmaps
+ * @vmap_ptr: the current vmap ptr if vmapping_counter > 0
  * @exp_name: name of the exporter; useful for debugging.
  * @owner: pointer to exporter module; used for refcounting when exporter is a
  *         kernel module.
  * @list_node: node for dma_buf accounting and debugging.
  * @priv: exporter specific private data for this buffer object.
  * @resv: reservation object linked to this dma-buf
+ * @poll: for userspace poll support
+ * @cb_excl: for userspace poll support
+ * @cb_shared: for userspace poll support
  */
 struct dma_buf {
        size_t size;
        struct file *file;
        struct list_head attachments;
        const struct dma_buf_ops *ops;
-       /* mutex to serialize list manipulation, attach/detach and vmap/unmap */
        struct mutex lock;
        unsigned vmapping_counter;
        void *vmap_ptr;
@@ -188,9 +193,11 @@ struct dma_buf_export_info {
 
 /**
  * helper macro for exporters; zeros and fills in most common values
+ *
+ * @name: export-info name
  */
-#define DEFINE_DMA_BUF_EXPORT_INFO(a)  \
-       struct dma_buf_export_info a = { .exp_name = KBUILD_MODNAME, \
+#define DEFINE_DMA_BUF_EXPORT_INFO(name)       \
+       struct dma_buf_export_info name = { .exp_name = KBUILD_MODNAME, \
                                         .owner = THIS_MODULE }
 
 /**
index 56762ab..1e35588 100644 (file)
@@ -18,7 +18,7 @@
 
 #ifndef __ASSEMBLY__
 
-#define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO)
+#define IS_ERR_VALUE(x) unlikely((unsigned long)(void *)(x) >= (unsigned long)-MAX_ERRNO)
 
 static inline void * __must_check ERR_PTR(long error)
 {
index 89627b9..7ce9fb1 100644 (file)
@@ -28,5 +28,6 @@
 #define EBADTYPE       527     /* Type not supported by server */
 #define EJUKEBOX       528     /* Request initiated, but will not complete before timeout */
 #define EIOCBQUEUED    529     /* iocb queued, will get completion event */
+#define ERECALLCONFLICT        530     /* conflict with recalled state */
 
 #endif
index 96e45ea..2f9ccbe 100644 (file)
@@ -38,7 +38,7 @@ extern struct module __this_module;
 
 #ifdef CONFIG_MODULES
 
-#ifndef __GENKSYMS__
+#if defined(__KERNEL__) && !defined(__GENKSYMS__)
 #ifdef CONFIG_MODVERSIONS
 /* Mark the CRC weak since genksyms apparently decides not to
  * generate a checksums for some symbols */
@@ -53,7 +53,7 @@ extern struct module __this_module;
 #endif
 
 /* For every exported symbol, place a struct in the __ksymtab section */
-#define __EXPORT_SYMBOL(sym, sec)                              \
+#define ___EXPORT_SYMBOL(sym, sec)                             \
        extern typeof(sym) sym;                                 \
        __CRC_SYMBOL(sym, sec)                                  \
        static const char __kstrtab_##sym[]                     \
@@ -65,6 +65,35 @@ extern struct module __this_module;
        __attribute__((section("___ksymtab" sec "+" #sym), unused))     \
        = { (unsigned long)&sym, __kstrtab_##sym }
 
+#if defined(__KSYM_DEPS__)
+
+/*
+ * For fine grained build dependencies, we want to tell the build system
+ * about each possible exported symbol even if they're not actually exported.
+ * We use a string pattern that is unlikely to be valid code that the build
+ * system filters out from the preprocessor output (see ksym_dep_filter
+ * in scripts/Kbuild.include).
+ */
+#define __EXPORT_SYMBOL(sym, sec)      === __KSYM_##sym ===
+
+#elif defined(CONFIG_TRIM_UNUSED_KSYMS)
+
+#include <linux/kconfig.h>
+#include <generated/autoksyms.h>
+
+#define __EXPORT_SYMBOL(sym, sec)                              \
+       __cond_export_sym(sym, sec, config_enabled(__KSYM_##sym))
+#define __cond_export_sym(sym, sec, conf)                      \
+       ___cond_export_sym(sym, sec, conf)
+#define ___cond_export_sym(sym, sec, enabled)                  \
+       __cond_export_sym_##enabled(sym, sec)
+#define __cond_export_sym_1(sym, sec) ___EXPORT_SYMBOL(sym, sec)
+#define __cond_export_sym_0(sym, sec) /* nothing */
+
+#else
+#define __EXPORT_SYMBOL ___EXPORT_SYMBOL
+#endif
+
 #define EXPORT_SYMBOL(sym)                                     \
        __EXPORT_SYMBOL(sym, "")
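
With CONFIG_TRIM_UNUSED_KSYMS the expansion hinges on whether generated/autoksyms.h defines __KSYM_<symbol>; a hand-expanded sketch of the two outcomes (my_func and other_func are hypothetical):

/* autoksyms.h contains "#define __KSYM_my_func 1"
 *   config_enabled(__KSYM_my_func) -> 1
 *   __cond_export_sym_1(my_func, "") -> ___EXPORT_SYMBOL(my_func, "")
 *   => the usual __kstrtab/__ksymtab entries are emitted
 */
EXPORT_SYMBOL(my_func);

/* autoksyms.h has no "#define __KSYM_other_func"
 *   config_enabled(__KSYM_other_func) -> 0
 *   __cond_export_sym_0(other_func, "") -> nothing
 *   => other_func stays unexported and can be discarded if unused
 */
EXPORT_SYMBOL(other_func);
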
 
index 2b17698..2056e9f 100644 (file)
@@ -49,6 +49,8 @@ struct fence_cb;
  * @timestamp: Timestamp when the fence was signaled.
  * @status: Optional, only valid if < 0, must be set before calling
  * fence_signal, indicates that the fence has completed with an error.
+ * @child_list: list of children fences
+ * @active_list: list of active fences
  *
  * the flags member must be manipulated and read using the appropriate
  * atomic ops (bit_*), so taking the spinlock will not be needed most
index 5f61431..dd28814 100644 (file)
@@ -74,7 +74,6 @@ typedef int (get_block_t)(struct inode *inode, sector_t iblock,
                        struct buffer_head *bh_result, int create);
 typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
                        ssize_t bytes, void *private);
-typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate);
 
 #define MAY_EXEC               0x00000001
 #define MAY_WRITE              0x00000002
@@ -1730,7 +1729,8 @@ struct inode_operations {
                        struct inode *, struct dentry *, unsigned int);
        int (*setattr) (struct dentry *, struct iattr *);
        int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
-       int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
+       int (*setxattr) (struct dentry *, struct inode *,
+                        const char *, const void *, size_t, int);
        ssize_t (*getxattr) (struct dentry *, struct inode *,
                             const char *, void *, size_t);
        ssize_t (*listxattr) (struct dentry *, char *, size_t);
index 604e152..13ba552 100644 (file)
@@ -241,7 +241,7 @@ struct fscache_cache_ops {
 
        /* check the consistency between the backing cache and the FS-Cache
         * cookie */
-       bool (*check_consistency)(struct fscache_operation *op);
+       int (*check_consistency)(struct fscache_operation *op);
 
        /* store the updated auxiliary data on an object */
        void (*update_object)(struct fscache_object *object);
index 0023088..3f9778c 100644 (file)
 #define FSL_IFC_VERSION_MASK   0x0F0F0000
 #define FSL_IFC_VERSION_1_0_0  0x01000000
 #define FSL_IFC_VERSION_1_1_0  0x01010000
+#define FSL_IFC_VERSION_2_0_0  0x02000000
+
+#define PGOFFSET_64K   (64*1024)
+#define PGOFFSET_4K    (4*1024)
 
 /*
  * CSPR - Chip Select Property Register
@@ -723,20 +727,26 @@ struct fsl_ifc_nand {
        __be32 nand_evter_en;
        u32 res17[0x2];
        __be32 nand_evter_intr_en;
-       u32 res18[0x2];
+       __be32 nand_vol_addr_stat;
+       u32 res18;
        __be32 nand_erattr0;
        __be32 nand_erattr1;
        u32 res19[0x10];
        __be32 nand_fsr;
-       u32 res20;
-       __be32 nand_eccstat[4];
-       u32 res21[0x20];
+       u32 res20[0x3];
+       __be32 nand_eccstat[6];
+       u32 res21[0x1c];
        __be32 nanndcr;
        u32 res22[0x2];
        __be32 nand_autoboot_trgr;
        u32 res23;
        __be32 nand_mdr;
-       u32 res24[0x5C];
+       u32 res24[0x1C];
+       __be32 nand_dll_lowcfg0;
+       __be32 nand_dll_lowcfg1;
+       u32 res25;
+       __be32 nand_dll_lowstat;
+       u32 res26[0x3c];
 };
 
 /*
@@ -771,13 +781,12 @@ struct fsl_ifc_gpcm {
        __be32 gpcm_erattr1;
        __be32 gpcm_erattr2;
        __be32 gpcm_stat;
-       u32 res4[0x1F3];
 };
 
 /*
  * IFC Controller Registers
  */
-struct fsl_ifc_regs {
+struct fsl_ifc_global {
        __be32 ifc_rev;
        u32 res1[0x2];
        struct {
@@ -803,21 +812,26 @@ struct fsl_ifc_regs {
        } ftim_cs[FSL_IFC_BANK_COUNT];
        u32 res9[0x30];
        __be32 rb_stat;
-       u32 res10[0x2];
+       __be32 rb_map;
+       __be32 wb_map;
        __be32 ifc_gcr;
-       u32 res11[0x2];
+       u32 res10[0x2];
        __be32 cm_evter_stat;
-       u32 res12[0x2];
+       u32 res11[0x2];
        __be32 cm_evter_en;
-       u32 res13[0x2];
+       u32 res12[0x2];
        __be32 cm_evter_intr_en;
-       u32 res14[0x2];
+       u32 res13[0x2];
        __be32 cm_erattr0;
        __be32 cm_erattr1;
-       u32 res15[0x2];
+       u32 res14[0x2];
        __be32 ifc_ccr;
        __be32 ifc_csr;
-       u32 res16[0x2EB];
+       __be32 ddr_ccr_low;
+};
+
+
+struct fsl_ifc_runtime {
        struct fsl_ifc_nand ifc_nand;
        struct fsl_ifc_nor ifc_nor;
        struct fsl_ifc_gpcm ifc_gpcm;
@@ -831,7 +845,8 @@ extern int fsl_ifc_find(phys_addr_t addr_base);
 struct fsl_ifc_ctrl {
        /* device info */
        struct device                   *dev;
-       struct fsl_ifc_regs __iomem     *regs;
+       struct fsl_ifc_global __iomem   *gregs;
+       struct fsl_ifc_runtime __iomem  *rregs;
        int                             irq;
        int                             nand_irq;
        spinlock_t                      lock;
index 79c52fa..ad6fa21 100644 (file)
@@ -3,92 +3,94 @@
 /* Fast hashing routine for ints,  longs and pointers.
    (C) 2002 Nadia Yvette Chambers, IBM */
 
-/*
- * Knuth recommends primes in approximately golden ratio to the maximum
- * integer representable by a machine word for multiplicative hashing.
- * Chuck Lever verified the effectiveness of this technique:
- * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
- *
- * These primes are chosen to be bit-sparse, that is operations on
- * them can use shifts and additions instead of multiplications for
- * machines where multiplications are slow.
- */
-
 #include <asm/types.h>
 #include <linux/compiler.h>
 
-/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
-#define GOLDEN_RATIO_PRIME_32 0x9e370001UL
-/*  2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
-#define GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001UL
-
+/*
+ * The "GOLDEN_RATIO_PRIME" is used in fs/btrfs/btrfs_inode.h and
+ * fs/inode.c.  It's not actually prime any more (the previous primes
+ * were actively bad for hashing), but the name remains.
+ */
 #if BITS_PER_LONG == 32
-#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_32
+#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_32
 #define hash_long(val, bits) hash_32(val, bits)
 #elif BITS_PER_LONG == 64
 #define hash_long(val, bits) hash_64(val, bits)
-#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_64
+#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_64
 #else
 #error Wordsize not 32 or 64
 #endif
 
 /*
- * The above primes are actively bad for hashing, since they are
- * too sparse. The 32-bit one is mostly ok, the 64-bit one causes
- * real problems. Besides, the "prime" part is pointless for the
- * multiplicative hash.
+ * This hash multiplies the input by a large odd number and takes the
+ * high bits.  Since multiplication propagates changes to the most
+ * significant end only, it is essential that the high bits of the
+ * product be used for the hash value.
+ *
+ * Chuck Lever verified the effectiveness of this technique:
+ * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
  *
  * Although a random odd number will do, it turns out that the golden
  * ratio phi = (sqrt(5)-1)/2, or its negative, has particularly nice
- * properties.
+ * properties.  (See Knuth vol 3, section 6.4, exercise 9.)
  *
- * These are the negative, (1 - phi) = (phi^2) = (3 - sqrt(5))/2.
- * (See Knuth vol 3, section 6.4, exercise 9.)
+ * These are the negative, (1 - phi) = phi**2 = (3 - sqrt(5))/2,
+ * which is very slightly easier to multiply by and makes no
+ * difference to the hash distribution.
  */
 #define GOLDEN_RATIO_32 0x61C88647
 #define GOLDEN_RATIO_64 0x61C8864680B583EBull
 
-static __always_inline u64 hash_64(u64 val, unsigned int bits)
-{
-       u64 hash = val;
+#ifdef CONFIG_HAVE_ARCH_HASH
+/* This header may use the GOLDEN_RATIO_xx constants */
+#include <asm/hash.h>
+#endif
 
-#if BITS_PER_LONG == 64
-       hash = hash * GOLDEN_RATIO_64;
-#else
-       /*  Sigh, gcc can't optimise this alone like it does for 32 bits. */
-       u64 n = hash;
-       n <<= 18;
-       hash -= n;
-       n <<= 33;
-       hash -= n;
-       n <<= 3;
-       hash += n;
-       n <<= 3;
-       hash -= n;
-       n <<= 4;
-       hash += n;
-       n <<= 2;
-       hash += n;
+/*
+ * The _generic versions exist only so lib/test_hash.c can compare
+ * the arch-optimized versions with the generic.
+ *
+ * Note that if you change these, any <asm/hash.h> that aren't updated
+ * to match need to have their HAVE_ARCH_* define values updated so the
+ * self-test will not false-positive.
+ */
+#ifndef HAVE_ARCH__HASH_32
+#define __hash_32 __hash_32_generic
 #endif
+static inline u32 __hash_32_generic(u32 val)
+{
+       return val * GOLDEN_RATIO_32;
+}
 
+#ifndef HAVE_ARCH_HASH_32
+#define hash_32 hash_32_generic
+#endif
+static inline u32 hash_32_generic(u32 val, unsigned int bits)
+{
        /* High bits are more random, so use them. */
-       return hash >> (64 - bits);
+       return __hash_32(val) >> (32 - bits);
 }
 
-static inline u32 hash_32(u32 val, unsigned int bits)
+#ifndef HAVE_ARCH_HASH_64
+#define hash_64 hash_64_generic
+#endif
+static __always_inline u32 hash_64_generic(u64 val, unsigned int bits)
 {
-       /* On some cpus multiply is faster, on others gcc will do shifts */
-       u32 hash = val * GOLDEN_RATIO_PRIME_32;
-
-       /* High bits are more random, so use them. */
-       return hash >> (32 - bits);
+#if BITS_PER_LONG == 64
+       /* 64x64-bit multiply is efficient on all 64-bit processors */
+       return val * GOLDEN_RATIO_64 >> (64 - bits);
+#else
+       /* Hash 64 bits using only 32x32-bit multiply. */
+       return hash_32((u32)val ^ __hash_32(val >> 32), bits);
+#endif
 }
 
-static inline unsigned long hash_ptr(const void *ptr, unsigned int bits)
+static inline u32 hash_ptr(const void *ptr, unsigned int bits)
 {
        return hash_long((unsigned long)ptr, bits);
 }
 
+/* This really should be called fold32_ptr; it does no hashing to speak of. */
 static inline u32 hash32_ptr(const void *ptr)
 {
        unsigned long val = (unsigned long)ptr;
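
Both widths now reduce to "multiply by an odd golden-ratio constant and keep the top bits"; a standalone arithmetic sketch of the 32-bit path, mirroring hash_32_generic() above:

#include <stdint.h>

#define DEMO_GOLDEN_RATIO_32 0x61C88647u

/* sketch: multiplication pushes entropy to the high bits, so keep those */
static inline uint32_t demo_hash_32(uint32_t val, unsigned int bits)
{
	return (val * DEMO_GOLDEN_RATIO_32) >> (32 - bits);
}

/* e.g. demo_hash_32(key, 10) yields a bucket index in [0, 1023] */
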
index 92f7177..f27bb2c 100644 (file)
 /* iova structure */
 struct iova {
        struct rb_node  node;
-       unsigned long   pfn_hi; /* IOMMU dish out addr hi */
-       unsigned long   pfn_lo; /* IOMMU dish out addr lo */
+       unsigned long   pfn_hi; /* Highest allocated pfn */
+       unsigned long   pfn_lo; /* Lowest allocated pfn */
+};
+
+struct iova_magazine;
+struct iova_cpu_rcache;
+
+#define IOVA_RANGE_CACHE_MAX_SIZE 6    /* log of max cached IOVA range size (in pages) */
+#define MAX_GLOBAL_MAGS 32     /* magazines per bin */
+
+struct iova_rcache {
+       spinlock_t lock;
+       unsigned long depot_size;
+       struct iova_magazine *depot[MAX_GLOBAL_MAGS];
+       struct iova_cpu_rcache __percpu *cpu_rcaches;
 };
 
 /* holds all the iova translations for a domain */
@@ -31,6 +44,7 @@ struct iova_domain {
        unsigned long   granule;        /* pfn granularity for this domain */
        unsigned long   start_pfn;      /* Lower limit for this domain */
        unsigned long   dma_32bit_pfn;
+       struct iova_rcache rcaches[IOVA_RANGE_CACHE_MAX_SIZE];  /* IOVA range caches */
 };
 
 static inline unsigned long iova_size(struct iova *iova)
@@ -78,6 +92,10 @@ void __free_iova(struct iova_domain *iovad, struct iova *iova);
 struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
        unsigned long limit_pfn,
        bool size_aligned);
+void free_iova_fast(struct iova_domain *iovad, unsigned long pfn,
+                   unsigned long size);
+unsigned long alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
+                             unsigned long limit_pfn);
 struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
        unsigned long pfn_hi);
 void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to);
@@ -87,5 +105,6 @@ struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn);
 void put_iova_domain(struct iova_domain *iovad);
 struct iova *split_and_remove_iova(struct iova_domain *iovad,
        struct iova *iova, unsigned long pfn_lo, unsigned long pfn_hi);
+void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad);
 
 #endif
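
alloc_iova_fast()/free_iova_fast() sit in front of the rbtree allocator and serve ranges from the per-CPU magazine caches declared above; a hedged caller-side sketch (iovad, nr_pages and dma_limit are assumed caller state):

	/* sketch: allocate an IOVA range through the cached path, then recycle it */
	unsigned long pfn;

	pfn = alloc_iova_fast(iovad, nr_pages, dma_limit >> PAGE_SHIFT);
	if (!pfn)
		return 0;			/* no IOVA space available */

	/* ... map nr_pages starting at (pfn << PAGE_SHIFT) and run the DMA ... */

	free_iova_fast(iovad, pfn, nr_pages);	/* range goes back into the CPU cache */
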
index 9e6fdd3..dc493e0 100644 (file)
 #define ICH_LR_ACTIVE_BIT              (1ULL << 63)
 #define ICH_LR_PHYS_ID_SHIFT           32
 #define ICH_LR_PHYS_ID_MASK            (0x3ffULL << ICH_LR_PHYS_ID_SHIFT)
+#define ICH_LR_PRIORITY_SHIFT          48
+
+/* These are for GICv2 emulation only */
+#define GICH_LR_VIRTUALID              (0x3ffUL << 0)
+#define GICH_LR_PHYSID_CPUID_SHIFT     (10)
+#define GICH_LR_PHYSID_CPUID           (7UL << GICH_LR_PHYSID_CPUID_SHIFT)
 
 #define ICH_MISR_EOI                   (1 << 0)
 #define ICH_MISR_U                     (1 << 1)
 #define ICC_SGI1R_AFFINITY_1_SHIFT     16
 #define ICC_SGI1R_AFFINITY_1_MASK      (0xff << ICC_SGI1R_AFFINITY_1_SHIFT)
 #define ICC_SGI1R_SGI_ID_SHIFT         24
-#define ICC_SGI1R_SGI_ID_MASK          (0xff << ICC_SGI1R_SGI_ID_SHIFT)
+#define ICC_SGI1R_SGI_ID_MASK          (0xfULL << ICC_SGI1R_SGI_ID_SHIFT)
 #define ICC_SGI1R_AFFINITY_2_SHIFT     32
-#define ICC_SGI1R_AFFINITY_2_MASK      (0xffULL << ICC_SGI1R_AFFINITY_1_SHIFT)
+#define ICC_SGI1R_AFFINITY_2_MASK      (0xffULL << ICC_SGI1R_AFFINITY_2_SHIFT)
 #define ICC_SGI1R_IRQ_ROUTING_MODE_BIT 40
 #define ICC_SGI1R_AFFINITY_3_SHIFT     48
-#define ICC_SGI1R_AFFINITY_3_MASK      (0xffULL << ICC_SGI1R_AFFINITY_1_SHIFT)
+#define ICC_SGI1R_AFFINITY_3_MASK      (0xffULL << ICC_SGI1R_AFFINITY_3_SHIFT)
 
 #include <asm/arch_gicv3.h>
 
index 9c94026..fd05185 100644 (file)
@@ -33,6 +33,7 @@
 
 #define GIC_DIST_CTRL                  0x000
 #define GIC_DIST_CTR                   0x004
+#define GIC_DIST_IIDR                  0x008
 #define GIC_DIST_IGROUP                        0x080
 #define GIC_DIST_ENABLE_SET            0x100
 #define GIC_DIST_ENABLE_CLEAR          0x180
@@ -76,6 +77,7 @@
 #define GICH_LR_VIRTUALID              (0x3ff << 0)
 #define GICH_LR_PHYSID_CPUID_SHIFT     (10)
 #define GICH_LR_PHYSID_CPUID           (0x3ff << GICH_LR_PHYSID_CPUID_SHIFT)
+#define GICH_LR_PRIORITY_SHIFT         23
 #define GICH_LR_STATE                  (3 << 28)
 #define GICH_LR_PENDING_BIT            (1 << 28)
 #define GICH_LR_ACTIVE_BIT             (1 << 29)
index fd1083c..efb232c 100644 (file)
@@ -403,11 +403,19 @@ static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
 
 /* Flags in jbd_inode->i_flags */
 #define __JI_COMMIT_RUNNING 0
-/* Commit of the inode data in progress. We use this flag to protect us from
+#define __JI_WRITE_DATA 1
+#define __JI_WAIT_DATA 2
+
+/*
+ * Commit of the inode data in progress. We use this flag to protect us from
  * concurrent deletion of inode. We cannot use reference to inode for this
  * since we cannot afford doing last iput() on behalf of kjournald
  */
 #define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING)
+/* Write allocated dirty buffers in this inode before commit */
+#define JI_WRITE_DATA (1 << __JI_WRITE_DATA)
+/* Wait for outstanding data writes for this inode before commit */
+#define JI_WAIT_DATA (1 << __JI_WAIT_DATA)
 
 /**
  * struct jbd_inode is the structure linking inodes in ordered mode
@@ -781,9 +789,6 @@ jbd2_time_diff(unsigned long start, unsigned long end)
  * @j_wbufsize: maximum number of buffer_heads allowed in j_wbuf, the
  *     number that will fit in j_blocksize
  * @j_last_sync_writer: most recent pid which did a synchronous write
- * @j_history: Buffer storing the transactions statistics history
- * @j_history_max: Maximum number of transactions in the statistics history
- * @j_history_cur: Current number of transactions in the statistics history
  * @j_history_lock: Protect the transactions statistics history
  * @j_proc_entry: procfs entry for the jbd statistics directory
  * @j_stats: Overall statistics
@@ -1270,7 +1275,8 @@ extern int           jbd2_journal_clear_err  (journal_t *);
 extern int        jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *);
 extern int        jbd2_journal_force_commit(journal_t *);
 extern int        jbd2_journal_force_commit_nested(journal_t *);
-extern int        jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode);
+extern int        jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *inode);
+extern int        jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *inode);
 extern int        jbd2_journal_begin_ordered_truncate(journal_t *journal,
                                struct jbd2_inode *inode, loff_t new_size);
 extern void       jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode);
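
The old jbd2_journal_file_inode() is split so ordered-mode callers can request only what they need: add_write queues the inode's dirty data for writeout before commit, add_wait merely waits for writes already in flight. A hedged sketch of the caller-side choice (handle, jinode and newly_allocated are assumed caller state):

	/* sketch: tie ordered-mode data to the running transaction */
	if (newly_allocated)
		err = jbd2_journal_inode_add_write(handle, jinode);	/* write data before commit */
	else
		err = jbd2_journal_inode_add_wait(handle, jinode);	/* only wait for I/O in flight */
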
index b1fa8f1..1c9c973 100644 (file)
@@ -412,6 +412,8 @@ struct kvm {
 #endif
        long tlbs_dirty;
        struct list_head devices;
+       struct dentry *debugfs_dentry;
+       struct kvm_stat_data **debugfs_stat_data;
 };
 
 #define kvm_err(fmt, ...) \
@@ -991,6 +993,11 @@ enum kvm_stat_kind {
        KVM_STAT_VCPU,
 };
 
+struct kvm_stat_data {
+       int offset;
+       struct kvm *kvm;
+};
+
 struct kvm_stats_debugfs_item {
        const char *name;
        int offset;
index 20d8a5d..5145620 100644 (file)
@@ -182,7 +182,7 @@ static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
 #endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
 
 #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
-extern void register_page_bootmem_info_node(struct pglist_data *pgdat);
+extern void __init register_page_bootmem_info_node(struct pglist_data *pgdat);
 #else
 static inline void register_page_bootmem_info_node(struct pglist_data *pgdat)
 {
index a677c2b..64184d2 100644 (file)
@@ -50,9 +50,11 @@ enum {
                                        EC_MSG_TX_TRAILER_BYTES,
        EC_MSG_RX_PROTO_BYTES   = 3,
 
-       /* Max length of messages */
-       EC_MSG_BYTES            = EC_PROTO2_MAX_PARAM_SIZE +
+       /* Max length of messages for proto 2 */
+       EC_PROTO2_MSG_BYTES             = EC_PROTO2_MAX_PARAM_SIZE +
                                        EC_MSG_TX_PROTO_BYTES,
+
+       EC_MAX_MSG_BYTES                = 64 * 1024,
 };
 
 /*
index 8f9fc3d..8e95cd8 100644 (file)
 #define TWL6040_HFDACENA               (1 << 0)
 #define TWL6040_HFPGAENA               (1 << 1)
 #define TWL6040_HFDRVENA               (1 << 4)
+#define TWL6040_HFSWENA                        (1 << 6)
 
 /* VIBCTLL/R (0x18/0x1A) fields */
 
index 2835d59..5df5feb 100644 (file)
@@ -303,6 +303,12 @@ struct vm_fault {
                                         * is set (which is also implied by
                                         * VM_FAULT_ERROR).
                                         */
+       void *entry;                    /* ->fault handler can alternatively
+                                        * return locked DAX entry. In that
+                                        * case handler should return
+                                        * VM_FAULT_DAX_LOCKED and fill in
+                                        * entry here.
+                                        */
        /* for ->map_pages() only */
        pgoff_t max_pgoff;              /* map pages for offset from pgoff till
                                         * max_pgoff inclusive */
@@ -1076,6 +1082,7 @@ static inline void clear_page_pfmemalloc(struct page *page)
 #define VM_FAULT_LOCKED        0x0200  /* ->fault locked the returned page */
 #define VM_FAULT_RETRY 0x0400  /* ->fault blocked, must retry */
 #define VM_FAULT_FALLBACK 0x0800       /* huge page fault failed, fall back to small */
+#define VM_FAULT_DAX_LOCKED 0x1000     /* ->fault has locked DAX entry */
 
 #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
 
@@ -2011,7 +2018,7 @@ static inline void mm_populate(unsigned long addr, unsigned long len) {}
 #endif
 
 /* These take the mm semaphore themselves */
-extern unsigned long __must_check vm_brk(unsigned long, unsigned long);
+extern int __must_check vm_brk(unsigned long, unsigned long);
 extern int vm_munmap(unsigned long, size_t);
 extern unsigned long __must_check vm_mmap(struct file *, unsigned long,
         unsigned long, unsigned long,
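
The new vm_fault->entry field pairs with VM_FAULT_DAX_LOCKED: a DAX-aware ->fault handler can hand back a locked radix-tree entry instead of a locked page. A hedged sketch of the handler side (my_dax_lock_entry is a hypothetical helper):

static int my_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	void *entry = my_dax_lock_entry(vma, vmf);	/* hypothetical: locks the DAX entry */

	if (!entry)
		return VM_FAULT_SIGBUS;

	vmf->entry = entry;		/* caller unlocks the entry when done */
	return VM_FAULT_DAX_LOCKED;	/* instead of VM_FAULT_LOCKED + a page */
}
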
index d553855..ca3e517 100644 (file)
@@ -514,7 +514,9 @@ struct mm_struct {
 #ifdef CONFIG_HUGETLB_PAGE
        atomic_long_t hugetlb_usage;
 #endif
+#ifdef CONFIG_MMU
        struct work_struct async_put_work;
+#endif
 };
 
 static inline void mm_init_cpumask(struct mm_struct *mm)
index 85800b4..45cde8c 100644 (file)
@@ -329,6 +329,7 @@ struct mmc_host {
        unsigned int            can_retune:1;   /* re-tuning can be used */
        unsigned int            doing_retune:1; /* re-tuning in progress */
        unsigned int            retune_now:1;   /* do re-tuning at next req */
+       unsigned int            retune_paused:1; /* re-tuning is temporarily disabled */
 
        int                     rescan_disable; /* disable card detection */
        int                     rescan_entered; /* used with nonremovable devices */
@@ -526,4 +527,7 @@ static inline void mmc_retune_recheck(struct mmc_host *host)
                host->retune_now = 1;
 }
 
+void mmc_retune_pause(struct mmc_host *host);
+void mmc_retune_unpause(struct mmc_host *host);
+
 #endif /* LINUX_MMC_HOST_H */
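
The new retune_paused flag backs mmc_retune_pause()/mmc_retune_unpause(), which let a driver keep re-tuning from firing across a sequence that must not be interrupted; a minimal sketch:

	/* sketch: hold off re-tuning around a tuning-sensitive command sequence */
	mmc_retune_pause(host);

	/* ... issue the commands that must not be interleaved with re-tuning ... */

	mmc_retune_unpause(host);	/* re-tuning may run again at the next request */
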
index c8be32e..ad3c348 100644 (file)
 
 #define FSMC_BUSY_WAIT_TIMEOUT (1 * HZ)
 
-/*
- * There are 13 bytes of ecc for every 512 byte block in FSMC version 8
- * and it has to be read consecutively and immediately after the 512
- * byte data block for hardware to generate the error bit offsets
- * Managing the ecc bytes in the following way is easier. This way is
- * similar to oobfree structure maintained already in u-boot nand driver
- */
-#define MAX_ECCPLACE_ENTRIES   32
-
-struct fsmc_nand_eccplace {
-       uint8_t offset;
-       uint8_t length;
-};
-
-struct fsmc_eccplace {
-       struct fsmc_nand_eccplace eccplace[MAX_ECCPLACE_ENTRIES];
-};
-
 struct fsmc_nand_timings {
        uint8_t tclr;
        uint8_t tar;
index 5e0eb7c..3aa56e3 100644 (file)
 #endif
 
 #ifdef CONFIG_MTD_MAP_BANK_WIDTH_32
-# ifdef map_bankwidth
-#  undef map_bankwidth
-#  define map_bankwidth(map) ((map)->bankwidth)
-#  undef map_bankwidth_is_large
-#  define map_bankwidth_is_large(map) (map_bankwidth(map) > BITS_PER_LONG/8)
-#  undef map_words
-#  define map_words(map) map_calc_words(map)
-# else
-#  define map_bankwidth(map) 32
-#  define map_bankwidth_is_large(map) (1)
-#  define map_words(map) map_calc_words(map)
-# endif
+/* always use indirect access for 256-bit to preserve kernel stack */
+# undef map_bankwidth
+# define map_bankwidth(map) ((map)->bankwidth)
+# undef map_bankwidth_is_large
+# define map_bankwidth_is_large(map) (map_bankwidth(map) > BITS_PER_LONG/8)
+# undef map_words
+# define map_words(map) map_calc_words(map)
 #define map_bankwidth_is_32(map) (map_bankwidth(map) == 32)
 #undef MAX_MAP_BANKWIDTH
 #define MAX_MAP_BANKWIDTH 32
index ef9fea4..29a1706 100644 (file)
@@ -96,16 +96,35 @@ struct mtd_oob_ops {
 
 #define MTD_MAX_OOBFREE_ENTRIES_LARGE  32
 #define MTD_MAX_ECCPOS_ENTRIES_LARGE   640
+/**
+ * struct mtd_oob_region - oob region definition
+ * @offset: region offset
+ * @length: region length
+ *
+ * This structure describes a region of the OOB area, and is used
+ * to retrieve ECC or free bytes sections.
+ * Each section is defined by an offset within the OOB area and a
+ * length.
+ */
+struct mtd_oob_region {
+       u32 offset;
+       u32 length;
+};
+
 /*
- * Internal ECC layout control structure. For historical reasons, there is a
- * similar, smaller struct nand_ecclayout_user (in mtd-abi.h) that is retained
- * for export to user-space via the ECCGETLAYOUT ioctl.
- * nand_ecclayout should be expandable in the future simply by the above macros.
+ * struct mtd_ooblayout_ops - NAND OOB layout operations
+ * @ecc: function returning an ECC region in the OOB area.
+ *      Should return -ERANGE if %section exceeds the total number of
+ *      ECC sections.
+ * @free: function returning a free region in the OOB area.
+ *       Should return -ERANGE if %section exceeds the total number of
+ *       free sections.
  */
-struct nand_ecclayout {
-       __u32 eccbytes;
-       __u32 eccpos[MTD_MAX_ECCPOS_ENTRIES_LARGE];
-       struct nand_oobfree oobfree[MTD_MAX_OOBFREE_ENTRIES_LARGE];
+struct mtd_ooblayout_ops {
+       int (*ecc)(struct mtd_info *mtd, int section,
+                  struct mtd_oob_region *oobecc);
+       int (*free)(struct mtd_info *mtd, int section,
+                   struct mtd_oob_region *oobfree);
 };
 
 struct module; /* only needed for owner field in mtd_info */
@@ -166,8 +185,8 @@ struct mtd_info {
        const char *name;
        int index;
 
-       /* ECC layout structure pointer - read only! */
-       struct nand_ecclayout *ecclayout;
+       /* OOB layout description */
+       const struct mtd_ooblayout_ops *ooblayout;
 
        /* the ecc step size. */
        unsigned int ecc_step_size;
@@ -253,6 +272,30 @@ struct mtd_info {
        int usecount;
 };
 
+int mtd_ooblayout_ecc(struct mtd_info *mtd, int section,
+                     struct mtd_oob_region *oobecc);
+int mtd_ooblayout_find_eccregion(struct mtd_info *mtd, int eccbyte,
+                                int *section,
+                                struct mtd_oob_region *oobregion);
+int mtd_ooblayout_get_eccbytes(struct mtd_info *mtd, u8 *eccbuf,
+                              const u8 *oobbuf, int start, int nbytes);
+int mtd_ooblayout_set_eccbytes(struct mtd_info *mtd, const u8 *eccbuf,
+                              u8 *oobbuf, int start, int nbytes);
+int mtd_ooblayout_free(struct mtd_info *mtd, int section,
+                      struct mtd_oob_region *oobfree);
+int mtd_ooblayout_get_databytes(struct mtd_info *mtd, u8 *databuf,
+                               const u8 *oobbuf, int start, int nbytes);
+int mtd_ooblayout_set_databytes(struct mtd_info *mtd, const u8 *databuf,
+                               u8 *oobbuf, int start, int nbytes);
+int mtd_ooblayout_count_freebytes(struct mtd_info *mtd);
+int mtd_ooblayout_count_eccbytes(struct mtd_info *mtd);
+
+static inline void mtd_set_ooblayout(struct mtd_info *mtd,
+                                    const struct mtd_ooblayout_ops *ooblayout)
+{
+       mtd->ooblayout = ooblayout;
+}
+
 static inline void mtd_set_of_node(struct mtd_info *mtd,
                                   struct device_node *np)
 {
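
Instead of filling fixed eccpos[]/oobfree[] arrays, a NAND driver now describes its OOB area one section at a time through the two callbacks; a hedged sketch for an illustrative layout of four 7-byte ECC chunks each followed by 7 free bytes (all offsets are made up):

static int demo_ooblayout_ecc(struct mtd_info *mtd, int section,
			      struct mtd_oob_region *oobregion)
{
	if (section > 3)
		return -ERANGE;			/* past the last ECC section */

	oobregion->offset = 2 + section * 16;	/* illustrative placement */
	oobregion->length = 7;
	return 0;
}

static int demo_ooblayout_free(struct mtd_info *mtd, int section,
			       struct mtd_oob_region *oobregion)
{
	if (section > 3)
		return -ERANGE;

	oobregion->offset = 9 + section * 16;	/* free bytes after each ECC chunk */
	oobregion->length = 7;
	return 0;
}

static const struct mtd_ooblayout_ops demo_ooblayout_ops = {
	.ecc	= demo_ooblayout_ecc,
	.free	= demo_ooblayout_free,
};

/* in the probe path: mtd_set_ooblayout(mtd, &demo_ooblayout_ops); */
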
index 56574ba..fbe8e16 100644 (file)
@@ -116,9 +116,14 @@ typedef enum {
        NAND_ECC_HW,
        NAND_ECC_HW_SYNDROME,
        NAND_ECC_HW_OOB_FIRST,
-       NAND_ECC_SOFT_BCH,
 } nand_ecc_modes_t;
 
+enum nand_ecc_algo {
+       NAND_ECC_UNKNOWN,
+       NAND_ECC_HAMMING,
+       NAND_ECC_BCH,
+};
+
 /*
  * Constants for Hardware ECC
  */
@@ -458,6 +463,7 @@ struct nand_hw_control {
 /**
  * struct nand_ecc_ctrl - Control structure for ECC
  * @mode:      ECC mode
+ * @algo:      ECC algorithm
  * @steps:     number of ECC steps per page
  * @size:      data bytes per ECC step
  * @bytes:     ECC bytes per step
@@ -466,7 +472,6 @@ struct nand_hw_control {
  * @prepad:    padding information for syndrome based ECC generators
  * @postpad:   padding information for syndrome based ECC generators
  * @options:   ECC specific options (see NAND_ECC_XXX flags defined above)
- * @layout:    ECC layout control struct pointer
  * @priv:      pointer to private ECC control data
  * @hwctl:     function to control hardware ECC generator. Must only
  *             be provided if a hardware ECC is available
@@ -508,6 +513,7 @@ struct nand_hw_control {
  */
 struct nand_ecc_ctrl {
        nand_ecc_modes_t mode;
+       enum nand_ecc_algo algo;
        int steps;
        int size;
        int bytes;
@@ -516,7 +522,6 @@ struct nand_ecc_ctrl {
        int prepad;
        int postpad;
        unsigned int options;
-       struct nand_ecclayout   *layout;
        void *priv;
        void (*hwctl)(struct mtd_info *mtd, int mode);
        int (*calculate)(struct mtd_info *mtd, const uint8_t *dat,
@@ -740,6 +745,9 @@ struct nand_chip {
        void *priv;
 };
 
+extern const struct mtd_ooblayout_ops nand_ooblayout_sp_ops;
+extern const struct mtd_ooblayout_ops nand_ooblayout_lp_ops;
+
 static inline void nand_set_flash_node(struct nand_chip *chip,
                                       struct device_node *np)
 {
@@ -1070,4 +1078,18 @@ int nand_check_erased_ecc_chunk(void *data, int datalen,
                                void *ecc, int ecclen,
                                void *extraoob, int extraooblen,
                                int threshold);
+
+/* Default write_oob implementation */
+int nand_write_oob_std(struct mtd_info *mtd, struct nand_chip *chip, int page);
+
+/* Default write_oob syndrome implementation */
+int nand_write_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
+                           int page);
+
+/* Default read_oob implementation */
+int nand_read_oob_std(struct mtd_info *mtd, struct nand_chip *chip, int page);
+
+/* Default read_oob syndrome implementation */
+int nand_read_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
+                          int page);
 #endif /* __LINUX_MTD_NAND_H */
index 4596503..0aaa98b 100644 (file)
@@ -80,7 +80,6 @@ struct onenand_bufferram {
  * @page_buf:          [INTERN] page main data buffer
  * @oob_buf:           [INTERN] page oob data buffer
  * @subpagesize:       [INTERN] holds the subpagesize
- * @ecclayout:         [REPLACEABLE] the default ecc placement scheme
  * @bbm:               [REPLACEABLE] pointer to Bad Block Management
  * @priv:               [OPTIONAL] pointer to private chip data
  */
@@ -134,7 +133,6 @@ struct onenand_chip {
 #endif
 
        int                     subpagesize;
-       struct nand_ecclayout   *ecclayout;
 
        void                    *bbm;
 
index 25f4d2a..65e91d0 100644 (file)
@@ -14,7 +14,7 @@
 
 struct sharpsl_nand_platform_data {
        struct nand_bbt_descr   *badblock_pattern;
-       struct nand_ecclayout   *ecc_layout;
+       const struct mtd_ooblayout_ops *ecc_layout;
        struct mtd_partition    *partitions;
        unsigned int            nr_partitions;
 };
index 3c36113..7f041bd 100644 (file)
@@ -21,6 +21,7 @@
  * Sometimes these are the same as CFI IDs, but sometimes they aren't.
  */
 #define SNOR_MFR_ATMEL         CFI_MFR_ATMEL
+#define SNOR_MFR_GIGADEVICE    0xc8
 #define SNOR_MFR_INTEL         CFI_MFR_INTEL
 #define SNOR_MFR_MICRON                CFI_MFR_ST /* ST Micro <--> Micron */
 #define SNOR_MFR_MACRONIX      CFI_MFR_MACRONIX
index ec5ec28..d3d0398 100644 (file)
@@ -45,6 +45,8 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
 #define LOOKUP_ROOT            0x2000
 #define LOOKUP_EMPTY           0x4000
 
+extern int path_pts(struct path *path);
+
 extern int user_path_at_empty(int, const char __user *, unsigned, struct path *, int *empty);
 
 static inline int user_path_at(int dfd, const char __user *name, unsigned flags,
index 0114334..bfed6b3 100644 (file)
@@ -50,12 +50,27 @@ struct nfs4_label {
 
 typedef struct { char data[NFS4_VERIFIER_SIZE]; } nfs4_verifier;
 
-struct nfs_stateid4 {
-       __be32 seqid;
-       char other[NFS4_STATEID_OTHER_SIZE];
-} __attribute__ ((packed));
+struct nfs4_stateid_struct {
+       union {
+               char data[NFS4_STATEID_SIZE];
+               struct {
+                       __be32 seqid;
+                       char other[NFS4_STATEID_OTHER_SIZE];
+               } __attribute__ ((packed));
+       };
+
+       enum {
+               NFS4_INVALID_STATEID_TYPE = 0,
+               NFS4_SPECIAL_STATEID_TYPE,
+               NFS4_OPEN_STATEID_TYPE,
+               NFS4_LOCK_STATEID_TYPE,
+               NFS4_DELEGATION_STATEID_TYPE,
+               NFS4_LAYOUT_STATEID_TYPE,
+               NFS4_PNFS_DS_STATEID_TYPE,
+       } type;
+};
 
-typedef struct nfs_stateid4 nfs4_stateid;
+typedef struct nfs4_stateid_struct nfs4_stateid;
 
 enum nfs_opnum4 {
        OP_ACCESS = 3,
@@ -504,6 +519,7 @@ enum {
        NFSPROC4_CLNT_DEALLOCATE,
        NFSPROC4_CLNT_LAYOUTSTATS,
        NFSPROC4_CLNT_CLONE,
+       NFSPROC4_CLNT_COPY,
 };
 
 /* nfs41 types */
@@ -621,7 +637,9 @@ enum pnfs_update_layout_reason {
        PNFS_UPDATE_LAYOUT_IO_TEST_FAIL,
        PNFS_UPDATE_LAYOUT_FOUND_CACHED,
        PNFS_UPDATE_LAYOUT_RETURN,
+       PNFS_UPDATE_LAYOUT_RETRY,
        PNFS_UPDATE_LAYOUT_BLOCKED,
+       PNFS_UPDATE_LAYOUT_INVALID_OPEN,
        PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET,
 };
 
index 7fcc13c..14a762d 100644 (file)
@@ -246,5 +246,6 @@ struct nfs_server {
 #define NFS_CAP_DEALLOCATE     (1U << 21)
 #define NFS_CAP_LAYOUTSTATS    (1U << 22)
 #define NFS_CAP_CLONE          (1U << 23)
+#define NFS_CAP_COPY           (1U << 24)
 
 #endif
index ee8491d..c304a11 100644 (file)
@@ -233,7 +233,6 @@ struct nfs4_layoutget_args {
        struct inode *inode;
        struct nfs_open_context *ctx;
        nfs4_stateid stateid;
-       unsigned long timestamp;
        struct nfs4_layoutdriver_data layout;
 };
 
@@ -251,7 +250,6 @@ struct nfs4_layoutget {
        struct nfs4_layoutget_res res;
        struct rpc_cred *cred;
        gfp_t gfp_flags;
-       long timeout;
 };
 
 struct nfs4_getdeviceinfo_args {
@@ -1343,6 +1341,32 @@ struct nfs42_falloc_res {
        const struct nfs_server         *falloc_server;
 };
 
+struct nfs42_copy_args {
+       struct nfs4_sequence_args       seq_args;
+
+       struct nfs_fh                   *src_fh;
+       nfs4_stateid                    src_stateid;
+       u64                             src_pos;
+
+       struct nfs_fh                   *dst_fh;
+       nfs4_stateid                    dst_stateid;
+       u64                             dst_pos;
+
+       u64                             count;
+};
+
+struct nfs42_write_res {
+       u64                     count;
+       struct nfs_writeverf    verifier;
+};
+
+struct nfs42_copy_res {
+       struct nfs4_sequence_res        seq_res;
+       struct nfs42_write_res          write_res;
+       bool                            consecutive;
+       bool                            synchronous;
+};
+
 struct nfs42_seek_args {
        struct nfs4_sequence_args       seq_args;
 
@@ -1431,7 +1455,7 @@ struct nfs_commit_completion_ops {
 };
 
 struct nfs_commit_info {
-       spinlock_t                      *lock;  /* inode->i_lock */
+       struct inode                    *inode; /* Needed for inode->i_lock */
        struct nfs_mds_commit_info      *mds;
        struct pnfs_ds_commit_info      *ds;
        struct nfs_direct_req           *dreq;  /* O_DIRECT request */
diff --git a/include/linux/of_mtd.h b/include/linux/of_mtd.h
deleted file mode 100644 (file)
index e266caa..0000000
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright 2012 Jean-Christophe PLAGNIOL-VILLARD <plagnioj@jcrosoft.com>
- *
- * OF helpers for mtd.
- *
- * This file is released under the GPLv2
- */
-
-#ifndef __LINUX_OF_MTD_H
-#define __LINUX_OF_MTD_H
-
-#ifdef CONFIG_OF_MTD
-
-#include <linux/of.h>
-int of_get_nand_ecc_mode(struct device_node *np);
-int of_get_nand_ecc_step_size(struct device_node *np);
-int of_get_nand_ecc_strength(struct device_node *np);
-int of_get_nand_bus_width(struct device_node *np);
-bool of_get_nand_on_flash_bbt(struct device_node *np);
-
-#else /* CONFIG_OF_MTD */
-
-static inline int of_get_nand_ecc_mode(struct device_node *np)
-{
-       return -ENOSYS;
-}
-
-static inline int of_get_nand_ecc_step_size(struct device_node *np)
-{
-       return -ENOSYS;
-}
-
-static inline int of_get_nand_ecc_strength(struct device_node *np)
-{
-       return -ENOSYS;
-}
-
-static inline int of_get_nand_bus_width(struct device_node *np)
-{
-       return -ENOSYS;
-}
-
-static inline bool of_get_nand_on_flash_bbt(struct device_node *np)
-{
-       return false;
-}
-
-#endif /* CONFIG_OF_MTD */
-
-#endif /* __LINUX_OF_MTD_H */
index d833eb4..9e9d79e 100644 (file)
  *  option) any later version.
  */
 
-/* Maximum Number of Chip Selects */
-#define GPMC_CS_NUM            8
+#include <linux/platform_data/gpmc-omap.h>
 
 #define GPMC_CONFIG_WP         0x00000005
 
-#define GPMC_IRQ_FIFOEVENTENABLE       0x01
-#define GPMC_IRQ_COUNT_EVENT           0x02
-
-#define GPMC_BURST_4                   4       /* 4 word burst */
-#define GPMC_BURST_8                   8       /* 8 word burst */
-#define GPMC_BURST_16                  16      /* 16 word burst */
-#define GPMC_DEVWIDTH_8BIT             1       /* 8-bit device width */
-#define GPMC_DEVWIDTH_16BIT            2       /* 16-bit device width */
-#define GPMC_MUX_AAD                   1       /* Addr-Addr-Data multiplex */
-#define GPMC_MUX_AD                    2       /* Addr-Data multiplex */
-
-/* bool type time settings */
-struct gpmc_bool_timings {
-       bool cycle2cyclediffcsen;
-       bool cycle2cyclesamecsen;
-       bool we_extra_delay;
-       bool oe_extra_delay;
-       bool adv_extra_delay;
-       bool cs_extra_delay;
-       bool time_para_granularity;
-};
+/* IRQ numbers in GPMC IRQ domain for legacy boot use */
+#define GPMC_IRQ_FIFOEVENTENABLE       0
+#define GPMC_IRQ_COUNT_EVENT           1
 
-/*
- * Note that all values in this struct are in nanoseconds except sync_clk
- * (which is in picoseconds), while the register values are in gpmc_fck cycles.
+/**
+ * gpmc_nand_ops - Interface between NAND and GPMC
+ * @nand_writebuffer_empty: get the NAND write buffer empty status.
  */
-struct gpmc_timings {
-       /* Minimum clock period for synchronous mode (in picoseconds) */
-       u32 sync_clk;
-
-       /* Chip-select signal timings corresponding to GPMC_CS_CONFIG2 */
-       u32 cs_on;              /* Assertion time */
-       u32 cs_rd_off;          /* Read deassertion time */
-       u32 cs_wr_off;          /* Write deassertion time */
-
-       /* ADV signal timings corresponding to GPMC_CONFIG3 */
-       u32 adv_on;             /* Assertion time */
-       u32 adv_rd_off;         /* Read deassertion time */
-       u32 adv_wr_off;         /* Write deassertion time */
-       u32 adv_aad_mux_on;     /* ADV assertion time for AAD */
-       u32 adv_aad_mux_rd_off; /* ADV read deassertion time for AAD */
-       u32 adv_aad_mux_wr_off; /* ADV write deassertion time for AAD */
-
-       /* WE signals timings corresponding to GPMC_CONFIG4 */
-       u32 we_on;              /* WE assertion time */
-       u32 we_off;             /* WE deassertion time */
-
-       /* OE signals timings corresponding to GPMC_CONFIG4 */
-       u32 oe_on;              /* OE assertion time */
-       u32 oe_off;             /* OE deassertion time */
-       u32 oe_aad_mux_on;      /* OE assertion time for AAD */
-       u32 oe_aad_mux_off;     /* OE deassertion time for AAD */
-
-       /* Access time and cycle time timings corresponding to GPMC_CONFIG5 */
-       u32 page_burst_access;  /* Multiple access word delay */
-       u32 access;             /* Start-cycle to first data valid delay */
-       u32 rd_cycle;           /* Total read cycle time */
-       u32 wr_cycle;           /* Total write cycle time */
-
-       u32 bus_turnaround;
-       u32 cycle2cycle_delay;
-
-       u32 wait_monitoring;
-       u32 clk_activation;
-
-       /* The following are only on OMAP3430 */
-       u32 wr_access;          /* WRACCESSTIME */
-       u32 wr_data_mux_bus;    /* WRDATAONADMUXBUS */
-
-       struct gpmc_bool_timings bool_timings;
+struct gpmc_nand_ops {
+       bool (*nand_writebuffer_empty)(void);
 };
 
-/* Device timings in picoseconds */
-struct gpmc_device_timings {
-       u32 t_ceasu;    /* address setup to CS valid */
-       u32 t_avdasu;   /* address setup to ADV valid */
-       /* XXX: try to combine t_avdp_r & t_avdp_w. Issue is
-        * of tusb using these timings even for sync whilst
-        * ideally for adv_rd/(wr)_off it should have considered
-        * t_avdh instead. This indirectly necessitates r/w
-        * variations of t_avdp as it is possible to have one
-        * sync & other async
-        */
-       u32 t_avdp_r;   /* ADV low time (what about t_cer ?) */
-       u32 t_avdp_w;
-       u32 t_aavdh;    /* address hold time */
-       u32 t_oeasu;    /* address setup to OE valid */
-       u32 t_aa;       /* access time from ADV assertion */
-       u32 t_iaa;      /* initial access time */
-       u32 t_oe;       /* access time from OE assertion */
-       u32 t_ce;       /* access time from CS asertion */
-       u32 t_rd_cycle; /* read cycle time */
-       u32 t_cez_r;    /* read CS deassertion to high Z */
-       u32 t_cez_w;    /* write CS deassertion to high Z */
-       u32 t_oez;      /* OE deassertion to high Z */
-       u32 t_weasu;    /* address setup to WE valid */
-       u32 t_wpl;      /* write assertion time */
-       u32 t_wph;      /* write deassertion time */
-       u32 t_wr_cycle; /* write cycle time */
-
-       u32 clk;
-       u32 t_bacc;     /* burst access valid clock to output delay */
-       u32 t_ces;      /* CS setup time to clk */
-       u32 t_avds;     /* ADV setup time to clk */
-       u32 t_avdh;     /* ADV hold time from clk */
-       u32 t_ach;      /* address hold time from clk */
-       u32 t_rdyo;     /* clk to ready valid */
-
-       u32 t_ce_rdyz;  /* XXX: description ?, or use t_cez instead */
-       u32 t_ce_avd;   /* CS on to ADV on delay */
-
-       /* XXX: check the possibility of combining
-        * cyc_aavhd_oe & cyc_aavdh_we
-        */
-       u8 cyc_aavdh_oe;/* read address hold time in cycles */
-       u8 cyc_aavdh_we;/* write address hold time in cycles */
-       u8 cyc_oe;      /* access time from OE assertion in cycles */
-       u8 cyc_wpl;     /* write deassertion time in cycles */
-       u32 cyc_iaa;    /* initial access time in cycles */
-
-       /* extra delays */
-       bool ce_xdelay;
-       bool avd_xdelay;
-       bool oe_xdelay;
-       bool we_xdelay;
-};
+struct gpmc_nand_regs;
 
-struct gpmc_settings {
-       bool burst_wrap;        /* enables wrap bursting */
-       bool burst_read;        /* enables read page/burst mode */
-       bool burst_write;       /* enables write page/burst mode */
-       bool device_nand;       /* device is NAND */
-       bool sync_read;         /* enables synchronous reads */
-       bool sync_write;        /* enables synchronous writes */
-       bool wait_on_read;      /* monitor wait on reads */
-       bool wait_on_write;     /* monitor wait on writes */
-       u32 burst_len;          /* page/burst length */
-       u32 device_width;       /* device bus width (8 or 16 bit) */
-       u32 mux_add_data;       /* multiplex address & data */
-       u32 wait_pin;           /* wait-pin to be used */
-};
+#if IS_ENABLED(CONFIG_OMAP_GPMC)
+struct gpmc_nand_ops *gpmc_omap_get_nand_ops(struct gpmc_nand_regs *regs,
+                                            int cs);
+#else
+static inline struct gpmc_nand_ops *gpmc_omap_get_nand_ops(struct gpmc_nand_regs *regs,
+                                                   int cs)
+{
+       return NULL;
+}
+#endif /* CONFIG_OMAP_GPMC */
+
+/*--------------------------------*/
+
+/* deprecated APIs */
+#if IS_ENABLED(CONFIG_OMAP_GPMC)
+void gpmc_update_nand_reg(struct gpmc_nand_regs *reg, int cs);
+#else
+static inline void gpmc_update_nand_reg(struct gpmc_nand_regs *reg, int cs)
+{
+}
+#endif /* CONFIG_OMAP_GPMC */
+/*--------------------------------*/
 
 extern int gpmc_calc_timings(struct gpmc_timings *gpmc_t,
                             struct gpmc_settings *gpmc_s,
                             struct gpmc_device_timings *dev_t);
 
-struct gpmc_nand_regs;
 struct device_node;
 
-extern void gpmc_update_nand_reg(struct gpmc_nand_regs *reg, int cs);
 extern int gpmc_get_client_irq(unsigned irq_config);
 
 extern unsigned int gpmc_ticks_to_ns(unsigned int ticks);
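
The NAND driver now asks the GPMC driver for a small ops structure instead of touching GPMC registers itself; a hedged consumer-side sketch (info and its fields are assumed driver state):

	/* sketch: fetch the GPMC interface for this chip select at probe time */
	struct gpmc_nand_ops *ops;

	ops = gpmc_omap_get_nand_ops(&info->reg, info->gpmc_cs);
	if (!ops)
		return -ENODEV;		/* GPMC driver not available */

	/* later, e.g. when polling chip readiness */
	if (ops->nand_writebuffer_empty())
		;			/* safe to issue the next program command */
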
index bf268fa..fec4027 100644 (file)
@@ -46,33 +46,62 @@ extern struct page_ext_operations page_idle_ops;
 
 static inline bool page_is_young(struct page *page)
 {
-       return test_bit(PAGE_EXT_YOUNG, &lookup_page_ext(page)->flags);
+       struct page_ext *page_ext = lookup_page_ext(page);
+
+       if (unlikely(!page_ext))
+               return false;
+
+       return test_bit(PAGE_EXT_YOUNG, &page_ext->flags);
 }
 
 static inline void set_page_young(struct page *page)
 {
-       set_bit(PAGE_EXT_YOUNG, &lookup_page_ext(page)->flags);
+       struct page_ext *page_ext = lookup_page_ext(page);
+
+       if (unlikely(!page_ext))
+               return;
+
+       set_bit(PAGE_EXT_YOUNG, &page_ext->flags);
 }
 
 static inline bool test_and_clear_page_young(struct page *page)
 {
-       return test_and_clear_bit(PAGE_EXT_YOUNG,
-                                 &lookup_page_ext(page)->flags);
+       struct page_ext *page_ext = lookup_page_ext(page);
+
+       if (unlikely(!page_ext))
+               return false;
+
+       return test_and_clear_bit(PAGE_EXT_YOUNG, &page_ext->flags);
 }
 
 static inline bool page_is_idle(struct page *page)
 {
-       return test_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags);
+       struct page_ext *page_ext = lookup_page_ext(page);
+
+       if (unlikely(!page_ext))
+               return false;
+
+       return test_bit(PAGE_EXT_IDLE, &page_ext->flags);
 }
 
 static inline void set_page_idle(struct page *page)
 {
-       set_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags);
+       struct page_ext *page_ext = lookup_page_ext(page);
+
+       if (unlikely(!page_ext))
+               return;
+
+       set_bit(PAGE_EXT_IDLE, &page_ext->flags);
 }
 
 static inline void clear_page_idle(struct page *page)
 {
-       clear_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags);
+       struct page_ext *page_ext = lookup_page_ext(page);
+
+       if (unlikely(!page_ext))
+               return;
+
+       clear_bit(PAGE_EXT_IDLE, &page_ext->flags);
 }
 #endif /* CONFIG_64BIT */
 
index 44f3383..1a827ce 100644 (file)
@@ -61,6 +61,14 @@ struct perf_callchain_entry {
        __u64                           ip[0]; /* /proc/sys/kernel/perf_event_max_stack */
 };
 
+struct perf_callchain_entry_ctx {
+       struct perf_callchain_entry *entry;
+       u32                         max_stack;
+       u32                         nr;
+       short                       contexts;
+       bool                        contexts_maxed;
+};
+
 struct perf_raw_record {
        u32                             size;
        void                            *data;
@@ -1061,20 +1069,36 @@ extern void perf_event_fork(struct task_struct *tsk);
 /* Callchains */
 DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);
 
-extern void perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs);
-extern void perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs);
+extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
+extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
 extern struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
-                  bool crosstask, bool add_mark);
+                  u32 max_stack, bool crosstask, bool add_mark);
 extern int get_callchain_buffers(void);
 extern void put_callchain_buffers(void);
 
 extern int sysctl_perf_event_max_stack;
+extern int sysctl_perf_event_max_contexts_per_stack;
+
+static inline int perf_callchain_store_context(struct perf_callchain_entry_ctx *ctx, u64 ip)
+{
+       if (ctx->contexts < sysctl_perf_event_max_contexts_per_stack) {
+               struct perf_callchain_entry *entry = ctx->entry;
+               entry->ip[entry->nr++] = ip;
+               ++ctx->contexts;
+               return 0;
+       } else {
+               ctx->contexts_maxed = true;
+               return -1; /* no more room, stop walking the stack */
+       }
+}
 
-static inline int perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
+static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip)
 {
-       if (entry->nr < sysctl_perf_event_max_stack) {
+       if (ctx->nr < ctx->max_stack && !ctx->contexts_maxed) {
+               struct perf_callchain_entry *entry = ctx->entry;
                entry->ip[entry->nr++] = ip;
+               ++ctx->nr;
                return 0;
        } else {
                return -1; /* no more room, stop walking the stack */
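
perf_callchain_store() now goes through a perf_callchain_entry_ctx, so the per-event max_stack and the separate context budget are enforced at store time; a hedged sketch of the arch-side walker these helpers are written for (demo_next_return_address is hypothetical):

void perf_callchain_kernel(struct perf_callchain_entry_ctx *ctx,
			   struct pt_regs *regs)
{
	unsigned long addr = instruction_pointer(regs);

	/* context markers are counted against their own, separate limit */
	perf_callchain_store_context(ctx, PERF_CONTEXT_KERNEL);

	/* perf_callchain_store() returns -1 once max_stack is reached */
	while (addr && perf_callchain_store(ctx, addr) == 0)
		addr = demo_next_return_address(addr);	/* hypothetical unwinder step */
}
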
index dc9a13e..be830b1 100644 (file)
@@ -26,7 +26,7 @@
  *
  * An example in pseudo code for a setup() callback:
  *
- * void get_mac_addr(struct mvmem_device *nvmem, void *context)
+ * void get_mac_addr(struct nvmem_device *nvmem, void *context)
  * {
  *     u8 *mac_addr = ethernet_pdata->mac_addr;
  *     off_t offset = context;
diff --git a/include/linux/platform_data/gpmc-omap.h b/include/linux/platform_data/gpmc-omap.h
new file mode 100644 (file)
index 0000000..67ccdb0
--- /dev/null
@@ -0,0 +1,172 @@
+/*
+ * OMAP GPMC Platform data
+ *
+ * Copyright (C) 2014 Texas Instruments, Inc. - http://www.ti.com
+ *     Roger Quadros <rogerq@ti.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ */
+
+#ifndef _GPMC_OMAP_H_
+#define _GPMC_OMAP_H_
+
+/* Maximum Number of Chip Selects */
+#define GPMC_CS_NUM            8
+
+/* bool type time settings */
+struct gpmc_bool_timings {
+       bool cycle2cyclediffcsen;
+       bool cycle2cyclesamecsen;
+       bool we_extra_delay;
+       bool oe_extra_delay;
+       bool adv_extra_delay;
+       bool cs_extra_delay;
+       bool time_para_granularity;
+};
+
+/*
+ * Note that all values in this struct are in nanoseconds except sync_clk
+ * (which is in picoseconds), while the register values are in gpmc_fck cycles.
+ */
+struct gpmc_timings {
+       /* Minimum clock period for synchronous mode (in picoseconds) */
+       u32 sync_clk;
+
+       /* Chip-select signal timings corresponding to GPMC_CS_CONFIG2 */
+       u32 cs_on;              /* Assertion time */
+       u32 cs_rd_off;          /* Read deassertion time */
+       u32 cs_wr_off;          /* Write deassertion time */
+
+       /* ADV signal timings corresponding to GPMC_CONFIG3 */
+       u32 adv_on;             /* Assertion time */
+       u32 adv_rd_off;         /* Read deassertion time */
+       u32 adv_wr_off;         /* Write deassertion time */
+       u32 adv_aad_mux_on;     /* ADV assertion time for AAD */
+       u32 adv_aad_mux_rd_off; /* ADV read deassertion time for AAD */
+       u32 adv_aad_mux_wr_off; /* ADV write deassertion time for AAD */
+
+       /* WE signals timings corresponding to GPMC_CONFIG4 */
+       u32 we_on;              /* WE assertion time */
+       u32 we_off;             /* WE deassertion time */
+
+       /* OE signals timings corresponding to GPMC_CONFIG4 */
+       u32 oe_on;              /* OE assertion time */
+       u32 oe_off;             /* OE deassertion time */
+       u32 oe_aad_mux_on;      /* OE assertion time for AAD */
+       u32 oe_aad_mux_off;     /* OE deassertion time for AAD */
+
+       /* Access time and cycle time timings corresponding to GPMC_CONFIG5 */
+       u32 page_burst_access;  /* Multiple access word delay */
+       u32 access;             /* Start-cycle to first data valid delay */
+       u32 rd_cycle;           /* Total read cycle time */
+       u32 wr_cycle;           /* Total write cycle time */
+
+       u32 bus_turnaround;
+       u32 cycle2cycle_delay;
+
+       u32 wait_monitoring;
+       u32 clk_activation;
+
+       /* The following are only on OMAP3430 */
+       u32 wr_access;          /* WRACCESSTIME */
+       u32 wr_data_mux_bus;    /* WRDATAONADMUXBUS */
+
+       struct gpmc_bool_timings bool_timings;
+};
+
+/* Device timings in picoseconds */
+struct gpmc_device_timings {
+       u32 t_ceasu;    /* address setup to CS valid */
+       u32 t_avdasu;   /* address setup to ADV valid */
+       /* XXX: try to combine t_avdp_r & t_avdp_w. Issue is
+        * of tusb using these timings even for sync whilst
+        * ideally for adv_rd/(wr)_off it should have considered
+        * t_avdh instead. This indirectly necessitates r/w
+        * variations of t_avdp as it is possible to have one
+        * sync & other async
+        */
+       u32 t_avdp_r;   /* ADV low time (what about t_cer ?) */
+       u32 t_avdp_w;
+       u32 t_aavdh;    /* address hold time */
+       u32 t_oeasu;    /* address setup to OE valid */
+       u32 t_aa;       /* access time from ADV assertion */
+       u32 t_iaa;      /* initial access time */
+       u32 t_oe;       /* access time from OE assertion */
+       u32 t_ce;       /* access time from CS asertion */
+       u32 t_rd_cycle; /* read cycle time */
+       u32 t_cez_r;    /* read CS deassertion to high Z */
+       u32 t_cez_w;    /* write CS deassertion to high Z */
+       u32 t_oez;      /* OE deassertion to high Z */
+       u32 t_weasu;    /* address setup to WE valid */
+       u32 t_wpl;      /* write assertion time */
+       u32 t_wph;      /* write deassertion time */
+       u32 t_wr_cycle; /* write cycle time */
+
+       u32 clk;
+       u32 t_bacc;     /* burst access valid clock to output delay */
+       u32 t_ces;      /* CS setup time to clk */
+       u32 t_avds;     /* ADV setup time to clk */
+       u32 t_avdh;     /* ADV hold time from clk */
+       u32 t_ach;      /* address hold time from clk */
+       u32 t_rdyo;     /* clk to ready valid */
+
+       u32 t_ce_rdyz;  /* XXX: description ?, or use t_cez instead */
+       u32 t_ce_avd;   /* CS on to ADV on delay */
+
+       /* XXX: check the possibility of combining
+        * cyc_aavhd_oe & cyc_aavdh_we
+        */
+       u8 cyc_aavdh_oe;/* read address hold time in cycles */
+       u8 cyc_aavdh_we;/* write address hold time in cycles */
+       u8 cyc_oe;      /* access time from OE assertion in cycles */
+       u8 cyc_wpl;     /* write deassertion time in cycles */
+       u32 cyc_iaa;    /* initial access time in cycles */
+
+       /* extra delays */
+       bool ce_xdelay;
+       bool avd_xdelay;
+       bool oe_xdelay;
+       bool we_xdelay;
+};
+
+#define GPMC_BURST_4                   4       /* 4 word burst */
+#define GPMC_BURST_8                   8       /* 8 word burst */
+#define GPMC_BURST_16                  16      /* 16 word burst */
+#define GPMC_DEVWIDTH_8BIT             1       /* 8-bit device width */
+#define GPMC_DEVWIDTH_16BIT            2       /* 16-bit device width */
+#define GPMC_MUX_AAD                   1       /* Addr-Addr-Data multiplex */
+#define GPMC_MUX_AD                    2       /* Addr-Data multiplex */
+
+struct gpmc_settings {
+       bool burst_wrap;        /* enables wrap bursting */
+       bool burst_read;        /* enables read page/burst mode */
+       bool burst_write;       /* enables write page/burst mode */
+       bool device_nand;       /* device is NAND */
+       bool sync_read;         /* enables synchronous reads */
+       bool sync_write;        /* enables synchronous writes */
+       bool wait_on_read;      /* monitor wait on reads */
+       bool wait_on_write;     /* monitor wait on writes */
+       u32 burst_len;          /* page/burst length */
+       u32 device_width;       /* device bus width (8 or 16 bit) */
+       u32 mux_add_data;       /* multiplex address & data */
+       u32 wait_pin;           /* wait-pin to be used */
+};
+
+/* Data for each chip select */
+struct gpmc_omap_cs_data {
+       bool valid;                     /* data is valid */
+       bool is_nand;                   /* device within this CS is NAND */
+       struct gpmc_settings *settings;
+       struct gpmc_device_timings *device_timings;
+       struct gpmc_timings *gpmc_timings;
+       struct platform_device *pdev;   /* device within this CS region */
+       unsigned int pdata_size;
+};
+
+struct gpmc_omap_platform_data {
+       struct gpmc_omap_cs_data cs[GPMC_CS_NUM];
+};
+
+#endif /* _GPMC_OMAP_H */
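As a rough illustration of how the structures above fit together, a board file might describe one chip select like this (a hedged sketch: the header path, the NOR device choice, and all field values are assumptions, not taken from this patch):

#include <linux/platform_data/gpmc-omap.h>	/* path assumed for the header above */

static struct gpmc_settings example_nor_settings = {
	.device_width = GPMC_DEVWIDTH_16BIT,	/* 16-bit asynchronous device */
	.mux_add_data = GPMC_MUX_AD,		/* address/data multiplexed bus */
	.wait_pin     = 0,
};

static struct gpmc_omap_platform_data example_gpmc_pdata = {
	.cs[0] = {
		.valid    = true,
		.is_nand  = false,		/* plain NOR, not NAND */
		.settings = &example_nor_settings,
		/* device_timings/gpmc_timings left NULL: driver defaults */
	},
};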
index 090bbab..17d57a1 100644 (file)
@@ -45,7 +45,6 @@ enum omap_ecc {
 };
 
 struct gpmc_nand_regs {
-       void __iomem    *gpmc_status;
        void __iomem    *gpmc_nand_command;
        void __iomem    *gpmc_nand_address;
        void __iomem    *gpmc_nand_data;
@@ -64,21 +63,24 @@ struct gpmc_nand_regs {
        void __iomem    *gpmc_bch_result4[GPMC_BCH_NUM_REMAINDER];
        void __iomem    *gpmc_bch_result5[GPMC_BCH_NUM_REMAINDER];
        void __iomem    *gpmc_bch_result6[GPMC_BCH_NUM_REMAINDER];
+       /* Deprecated. Do not use */
+       void __iomem    *gpmc_status;
 };
 
 struct omap_nand_platform_data {
        int                     cs;
        struct mtd_partition    *parts;
        int                     nr_parts;
-       bool                    dev_ready;
        bool                    flash_bbt;
        enum nand_io            xfer_type;
        int                     devsize;
        enum omap_ecc           ecc_opt;
-       struct gpmc_nand_regs   reg;
 
-       /* for passing the partitions */
-       struct device_node      *of_node;
        struct device_node      *elm_of_node;
+
+       /* deprecated */
+       struct gpmc_nand_regs   reg;
+       struct device_node      *of_node;
+       bool                    dev_ready;
 };
 #endif
index b78d27c..17018f3 100644 (file)
@@ -5,59 +5,7 @@
 #include <linux/mutex.h>
 #include <linux/of.h>
 
-struct pwm_device;
 struct seq_file;
-
-#if IS_ENABLED(CONFIG_PWM)
-/*
- * pwm_request - request a PWM device
- */
-struct pwm_device *pwm_request(int pwm_id, const char *label);
-
-/*
- * pwm_free - free a PWM device
- */
-void pwm_free(struct pwm_device *pwm);
-
-/*
- * pwm_config - change a PWM device configuration
- */
-int pwm_config(struct pwm_device *pwm, int duty_ns, int period_ns);
-
-/*
- * pwm_enable - start a PWM output toggling
- */
-int pwm_enable(struct pwm_device *pwm);
-
-/*
- * pwm_disable - stop a PWM output toggling
- */
-void pwm_disable(struct pwm_device *pwm);
-#else
-static inline struct pwm_device *pwm_request(int pwm_id, const char *label)
-{
-       return ERR_PTR(-ENODEV);
-}
-
-static inline void pwm_free(struct pwm_device *pwm)
-{
-}
-
-static inline int pwm_config(struct pwm_device *pwm, int duty_ns, int period_ns)
-{
-       return -EINVAL;
-}
-
-static inline int pwm_enable(struct pwm_device *pwm)
-{
-       return -EINVAL;
-}
-
-static inline void pwm_disable(struct pwm_device *pwm)
-{
-}
-#endif
-
 struct pwm_chip;
 
 /**
@@ -94,8 +42,21 @@ struct pwm_args {
 
 enum {
        PWMF_REQUESTED = 1 << 0,
-       PWMF_ENABLED = 1 << 1,
-       PWMF_EXPORTED = 1 << 2,
+       PWMF_EXPORTED = 1 << 1,
+};
+
+/*
+ * struct pwm_state - state of a PWM channel
+ * @period: PWM period (in nanoseconds)
+ * @duty_cycle: PWM duty cycle (in nanoseconds)
+ * @polarity: PWM polarity
+ * @enabled: PWM enabled status
+ */
+struct pwm_state {
+       unsigned int period;
+       unsigned int duty_cycle;
+       enum pwm_polarity polarity;
+       bool enabled;
 };
 
 /**
@@ -106,11 +67,8 @@ enum {
  * @pwm: global index of the PWM device
  * @chip: PWM chip providing this PWM device
  * @chip_data: chip-private data associated with the PWM device
- * @lock: used to serialize accesses to the PWM device where necessary
- * @period: period of the PWM signal (in nanoseconds)
- * @duty_cycle: duty cycle of the PWM signal (in nanoseconds)
- * @polarity: polarity of the PWM signal
  * @args: PWM arguments
+ * @state: current PWM channel state
  */
 struct pwm_device {
        const char *label;
@@ -119,50 +77,68 @@ struct pwm_device {
        unsigned int pwm;
        struct pwm_chip *chip;
        void *chip_data;
-       struct mutex lock;
-
-       unsigned int period;
-       unsigned int duty_cycle;
-       enum pwm_polarity polarity;
 
        struct pwm_args args;
+       struct pwm_state state;
 };
 
+/**
+ * pwm_get_state() - retrieve the current PWM state
+ * @pwm: PWM device
+ * @state: state to fill with the current PWM state
+ */
+static inline void pwm_get_state(const struct pwm_device *pwm,
+                                struct pwm_state *state)
+{
+       *state = pwm->state;
+}
+
 static inline bool pwm_is_enabled(const struct pwm_device *pwm)
 {
-       return test_bit(PWMF_ENABLED, &pwm->flags);
+       struct pwm_state state;
+
+       pwm_get_state(pwm, &state);
+
+       return state.enabled;
 }
 
 static inline void pwm_set_period(struct pwm_device *pwm, unsigned int period)
 {
        if (pwm)
-               pwm->period = period;
+               pwm->state.period = period;
 }
 
 static inline unsigned int pwm_get_period(const struct pwm_device *pwm)
 {
-       return pwm ? pwm->period : 0;
+       struct pwm_state state;
+
+       pwm_get_state(pwm, &state);
+
+       return state.period;
 }
 
 static inline void pwm_set_duty_cycle(struct pwm_device *pwm, unsigned int duty)
 {
        if (pwm)
-               pwm->duty_cycle = duty;
+               pwm->state.duty_cycle = duty;
 }
 
 static inline unsigned int pwm_get_duty_cycle(const struct pwm_device *pwm)
 {
-       return pwm ? pwm->duty_cycle : 0;
-}
+       struct pwm_state state;
 
-/*
- * pwm_set_polarity - configure the polarity of a PWM signal
- */
-int pwm_set_polarity(struct pwm_device *pwm, enum pwm_polarity polarity);
+       pwm_get_state(pwm, &state);
+
+       return state.duty_cycle;
+}
 
 static inline enum pwm_polarity pwm_get_polarity(const struct pwm_device *pwm)
 {
-       return pwm ? pwm->polarity : PWM_POLARITY_NORMAL;
+       struct pwm_state state;
+
+       pwm_get_state(pwm, &state);
+
+       return state.polarity;
 }
 
 static inline void pwm_get_args(const struct pwm_device *pwm,
@@ -171,12 +147,6 @@ static inline void pwm_get_args(const struct pwm_device *pwm,
        *args = pwm->args;
 }
 
-static inline void pwm_apply_args(struct pwm_device *pwm)
-{
-       pwm_set_period(pwm, pwm->args.period);
-       pwm_set_polarity(pwm, pwm->args.polarity);
-}
-
 /**
  * struct pwm_ops - PWM controller operations
  * @request: optional hook for requesting a PWM
@@ -185,6 +155,13 @@ static inline void pwm_apply_args(struct pwm_device *pwm)
  * @set_polarity: configure the polarity of this PWM
  * @enable: enable PWM output toggling
  * @disable: disable PWM output toggling
+ * @apply: atomically apply a new PWM config. The state argument
+ *        should be adjusted with the real hardware config (if the
+ *        hardware can only approximate the period or duty_cycle
+ *        value, state should reflect it)
+ * @get_state: get the current PWM state. This function is only
+ *            called once per PWM device when the PWM chip is
+ *            registered.
  * @dbg_show: optional routine to show contents in debugfs
  * @owner: helps prevent removal of modules exporting active PWMs
  */
@@ -197,6 +174,10 @@ struct pwm_ops {
                            enum pwm_polarity polarity);
        int (*enable)(struct pwm_chip *chip, struct pwm_device *pwm);
        void (*disable)(struct pwm_chip *chip, struct pwm_device *pwm);
+       int (*apply)(struct pwm_chip *chip, struct pwm_device *pwm,
+                    struct pwm_state *state);
+       void (*get_state)(struct pwm_chip *chip, struct pwm_device *pwm,
+                         struct pwm_state *state);
 #ifdef CONFIG_DEBUG_FS
        void (*dbg_show)(struct pwm_chip *chip, struct seq_file *s);
 #endif
@@ -232,6 +213,115 @@ struct pwm_chip {
 };
 
 #if IS_ENABLED(CONFIG_PWM)
+/* PWM user APIs */
+struct pwm_device *pwm_request(int pwm_id, const char *label);
+void pwm_free(struct pwm_device *pwm);
+int pwm_apply_state(struct pwm_device *pwm, struct pwm_state *state);
+int pwm_adjust_config(struct pwm_device *pwm);
+
+/**
+ * pwm_config() - change a PWM device configuration
+ * @pwm: PWM device
+ * @duty_ns: "on" time (in nanoseconds)
+ * @period_ns: duration (in nanoseconds) of one cycle
+ *
+ * Returns: 0 on success or a negative error code on failure.
+ */
+static inline int pwm_config(struct pwm_device *pwm, int duty_ns,
+                            int period_ns)
+{
+       struct pwm_state state;
+
+       if (!pwm)
+               return -EINVAL;
+
+       pwm_get_state(pwm, &state);
+       if (state.duty_cycle == duty_ns && state.period == period_ns)
+               return 0;
+
+       state.duty_cycle = duty_ns;
+       state.period = period_ns;
+       return pwm_apply_state(pwm, &state);
+}
+
+/**
+ * pwm_set_polarity() - configure the polarity of a PWM signal
+ * @pwm: PWM device
+ * @polarity: new polarity of the PWM signal
+ *
+ * Note that the polarity cannot be configured while the PWM device is
+ * enabled.
+ *
+ * Returns: 0 on success or a negative error code on failure.
+ */
+static inline int pwm_set_polarity(struct pwm_device *pwm,
+                                  enum pwm_polarity polarity)
+{
+       struct pwm_state state;
+
+       if (!pwm)
+               return -EINVAL;
+
+       pwm_get_state(pwm, &state);
+       if (state.polarity == polarity)
+               return 0;
+
+       /*
+        * Changing the polarity of a running PWM without adjusting the
+        * duty_cycle/period value is a bit risky (can introduce glitches).
+        * Return -EBUSY in this case.
+        * Note that this is allowed when using pwm_apply_state() because
+        * the user specifies all the parameters.
+        */
+       if (state.enabled)
+               return -EBUSY;
+
+       state.polarity = polarity;
+       return pwm_apply_state(pwm, &state);
+}
+
+/**
+ * pwm_enable() - start a PWM output toggling
+ * @pwm: PWM device
+ *
+ * Returns: 0 on success or a negative error code on failure.
+ */
+static inline int pwm_enable(struct pwm_device *pwm)
+{
+       struct pwm_state state;
+
+       if (!pwm)
+               return -EINVAL;
+
+       pwm_get_state(pwm, &state);
+       if (state.enabled)
+               return 0;
+
+       state.enabled = true;
+       return pwm_apply_state(pwm, &state);
+}
+
+/**
+ * pwm_disable() - stop a PWM output toggling
+ * @pwm: PWM device
+ */
+static inline void pwm_disable(struct pwm_device *pwm)
+{
+       struct pwm_state state;
+
+       if (!pwm)
+               return;
+
+       pwm_get_state(pwm, &state);
+       if (!state.enabled)
+               return;
+
+       state.enabled = false;
+       pwm_apply_state(pwm, &state);
+}
+
+
+/* PWM provider APIs */
 int pwm_set_chip_data(struct pwm_device *pwm, void *data);
 void *pwm_get_chip_data(struct pwm_device *pwm);
 
@@ -257,6 +347,47 @@ void devm_pwm_put(struct device *dev, struct pwm_device *pwm);
 
 bool pwm_can_sleep(struct pwm_device *pwm);
 #else
+static inline struct pwm_device *pwm_request(int pwm_id, const char *label)
+{
+       return ERR_PTR(-ENODEV);
+}
+
+static inline void pwm_free(struct pwm_device *pwm)
+{
+}
+
+static inline int pwm_apply_state(struct pwm_device *pwm,
+                                 const struct pwm_state *state)
+{
+       return -ENOTSUPP;
+}
+
+static inline int pwm_adjust_config(struct pwm_device *pwm)
+{
+       return -ENOTSUPP;
+}
+
+static inline int pwm_config(struct pwm_device *pwm, int duty_ns,
+                            int period_ns)
+{
+       return -EINVAL;
+}
+
+static inline int pwm_set_polarity(struct pwm_device *pwm,
+                                  enum pwm_polarity polarity)
+{
+       return -ENOTSUPP;
+}
+
+static inline int pwm_enable(struct pwm_device *pwm)
+{
+       return -EINVAL;
+}
+
+static inline void pwm_disable(struct pwm_device *pwm)
+{
+}
+
 static inline int pwm_set_chip_data(struct pwm_device *pwm, void *data)
 {
        return -EINVAL;
@@ -328,6 +459,34 @@ static inline bool pwm_can_sleep(struct pwm_device *pwm)
 }
 #endif
 
+static inline void pwm_apply_args(struct pwm_device *pwm)
+{
+       /*
+        * PWM users calling pwm_apply_args() expect to have a fresh config
+        * where the polarity and period are set according to pwm_args info.
+        * The problem is, polarity can only be changed when the PWM is
+        * disabled.
+        *
+        * PWM drivers supporting hardware readout may declare the PWM device
+        * as enabled, and prevent polarity setting, which changes from the
+        * existing behavior, where all PWM devices are declared as disabled
+        * at startup (even if they are actually enabled), thus authorizing
+        * polarity setting.
+        *
+        * Instead of setting ->enabled to false, we call pwm_disable()
+        * before pwm_set_polarity() to ensure that everything is configured
+        * as expected, and the PWM is really disabled when the user
+        * requests it.
+        *
+        * Note that PWM users requiring a smooth handover between the
+        * bootloader and the kernel (like critical regulators controlled by
+        * PWM devices) will have to switch to the atomic API and avoid calling
+        * pwm_apply_args().
+        */
+       pwm_disable(pwm);
+       pwm_set_polarity(pwm, pwm->args.polarity);
+}
+
 struct pwm_lookup {
        struct list_head list;
        const char *provider;
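For orientation, a consumer of the new atomic interface declared above might program a PWM like this (a minimal sketch; the 1 kHz backlight values and the function name are hypothetical):

#include <linux/err.h>
#include <linux/device.h>
#include <linux/pwm.h>

static int example_backlight_on(struct device *dev)
{
	struct pwm_device *pwm;
	struct pwm_state state;
	int ret;

	pwm = pwm_get(dev, NULL);		/* consumer lookup */
	if (IS_ERR(pwm))
		return PTR_ERR(pwm);

	pwm_get_state(pwm, &state);		/* start from the current state */
	state.period = 1000000;			/* 1 ms period, in nanoseconds */
	state.duty_cycle = 500000;		/* 50% duty */
	state.polarity = PWM_POLARITY_NORMAL;
	state.enabled = true;

	ret = pwm_apply_state(pwm, &state);	/* single atomic hardware update */
	if (ret)
		pwm_put(pwm);
	return ret;
}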
index 49d0576..b0f305e 100644 (file)
@@ -49,12 +49,27 @@ extern struct ww_class reservation_ww_class;
 extern struct lock_class_key reservation_seqcount_class;
 extern const char reservation_seqcount_string[];
 
+/**
+ * struct reservation_object_list - a list of shared fences
+ * @rcu: for internal use
+ * @shared_count: number of shared fences
+ * @shared_max: for growing shared fence table
+ * @shared: shared fence table
+ */
 struct reservation_object_list {
        struct rcu_head rcu;
        u32 shared_count, shared_max;
        struct fence __rcu *shared[];
 };
 
+/**
+ * struct reservation_object - a reservation object manages fences for a buffer
+ * @lock: update side lock
+ * @seq: sequence count for managing RCU read-side synchronization
+ * @fence_excl: the exclusive fence, if there is one currently
+ * @fence: list of current shared fences
+ * @staged: staged copy of shared fences for RCU updates
+ */
 struct reservation_object {
        struct ww_mutex lock;
        seqcount_t seq;
@@ -68,6 +83,10 @@ struct reservation_object {
 #define reservation_object_assert_held(obj) \
        lockdep_assert_held(&(obj)->lock.base)
 
+/**
+ * reservation_object_init - initialize a reservation object
+ * @obj: the reservation object
+ */
 static inline void
 reservation_object_init(struct reservation_object *obj)
 {
@@ -79,6 +98,10 @@ reservation_object_init(struct reservation_object *obj)
        obj->staged = NULL;
 }
 
+/**
+ * reservation_object_fini - destroys a reservation object
+ * @obj: the reservation object
+ */
 static inline void
 reservation_object_fini(struct reservation_object *obj)
 {
@@ -106,6 +129,14 @@ reservation_object_fini(struct reservation_object *obj)
        ww_mutex_destroy(&obj->lock);
 }
 
+/**
+ * reservation_object_get_list - get the reservation object's
+ * shared fence list, with update-side lock held
+ * @obj: the reservation object
+ *
+ * Returns the shared fence list.  Does NOT take references to
+ * the fence.  The obj->lock must be held.
+ */
 static inline struct reservation_object_list *
 reservation_object_get_list(struct reservation_object *obj)
 {
@@ -113,6 +144,17 @@ reservation_object_get_list(struct reservation_object *obj)
                                         reservation_object_held(obj));
 }
 
+/**
+ * reservation_object_get_excl - get the reservation object's
+ * exclusive fence, with update-side lock held
+ * @obj: the reservation object
+ *
+ * Returns the exclusive fence (if any).  Does NOT take a
+ * reference.  The obj->lock must be held.
+ *
+ * RETURNS
+ * The exclusive fence or NULL
+ */
 static inline struct fence *
 reservation_object_get_excl(struct reservation_object *obj)
 {
@@ -120,6 +162,17 @@ reservation_object_get_excl(struct reservation_object *obj)
                                         reservation_object_held(obj));
 }
 
+/**
+ * reservation_object_get_excl_rcu - get the reservation object's
+ * exclusive fence, without lock held.
+ * @obj: the reservation object
+ *
+ * If there is an exclusive fence, this atomically increments its
+ * reference count and returns it.
+ *
+ * RETURNS
+ * The exclusive fence or NULL if none
+ */
 static inline struct fence *
 reservation_object_get_excl_rcu(struct reservation_object *obj)
 {
index d1c12d1..d37fbb3 100644 (file)
@@ -156,6 +156,7 @@ extern void downgrade_write(struct rw_semaphore *sem);
  */
 extern void down_read_nested(struct rw_semaphore *sem, int subclass);
 extern void down_write_nested(struct rw_semaphore *sem, int subclass);
+extern int down_write_killable_nested(struct rw_semaphore *sem, int subclass);
 extern void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest_lock);
 
 # define down_write_nest_lock(sem, nest_lock)                  \
@@ -176,6 +177,7 @@ extern void up_read_non_owner(struct rw_semaphore *sem);
 # define down_read_nested(sem, subclass)               down_read(sem)
 # define down_write_nest_lock(sem, nest_lock)  down_write(sem)
 # define down_write_nested(sem, subclass)      down_write(sem)
+# define down_write_killable_nested(sem, subclass)     down_write_killable(sem)
 # define down_read_non_owner(sem)              down_read(sem)
 # define up_read_non_owner(sem)                        up_read(sem)
 #endif
index 21c26e7..6e42ada 100644 (file)
@@ -1539,6 +1539,7 @@ struct task_struct {
        unsigned sched_reset_on_fork:1;
        unsigned sched_contributes_to_load:1;
        unsigned sched_migrated:1;
+       unsigned sched_remote_wakeup:1;
        unsigned :0; /* force alignment to the next boundary */
 
        /* unserialized, strictly 'current' */
@@ -2744,10 +2745,12 @@ static inline bool mmget_not_zero(struct mm_struct *mm)
 
 /* mmput gets rid of the mappings and all user-space */
 extern void mmput(struct mm_struct *);
-/* same as above but performs the slow path from the async kontext. Can
+#ifdef CONFIG_MMU
+/* same as above but performs the slow path from the async context. Can
  * be called from the atomic context as well
  */
 extern void mmput_async(struct mm_struct *);
+#endif
 
 /* Grab a reference to a task's mm, if it is not already going away */
 extern struct mm_struct *get_task_mm(struct task_struct *task);
index dacb5e7..de1f643 100644 (file)
@@ -765,6 +765,8 @@ struct sctp_info {
        __u8    sctpi_s_disable_fragments;
        __u8    sctpi_s_v4mapped;
        __u8    sctpi_s_frag_interleave;
+       __u32   sctpi_s_type;
+       __u32   __reserved3;
 };
 
 struct sctp_infox {
index e058210..7973a82 100644 (file)
@@ -277,7 +277,7 @@ static inline void raw_write_seqcount_barrier(seqcount_t *s)
 
 static inline int raw_read_seqcount_latch(seqcount_t *s)
 {
-       return lockless_dereference(s->sequence);
+       return lockless_dereference(s)->sequence;
 }
 
 /**
@@ -331,7 +331,7 @@ static inline int raw_read_seqcount_latch(seqcount_t *s)
  *     unsigned seq, idx;
  *
  *     do {
- *             seq = lockless_dereference(latch->seq);
+ *             seq = lockless_dereference(latch)->seq;
  *
  *             idx = seq & 0x01;
  *             entry = data_query(latch->data[idx], ...);
index 665cd0c..d1faa01 100644 (file)
@@ -111,22 +111,6 @@ static inline void sysfs_slab_remove(struct kmem_cache *s)
 }
 #endif
 
-
-/**
- * virt_to_obj - returns address of the beginning of object.
- * @s: object's kmem_cache
- * @slab_page: address of slab page
- * @x: address within object memory range
- *
- * Returns address of the beginning of object
- */
-static inline void *virt_to_obj(struct kmem_cache *s,
-                               const void *slab_page,
-                               const void *x)
-{
-       return (void *)x - ((x - slab_page) % s->size);
-}
-
 void object_err(struct kmem_cache *s, struct page *page,
                u8 *object, char *reason);
 
index 857a9a1..1f03483 100644 (file)
@@ -372,6 +372,7 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv)
  * @unprepare_message: undo any work done by prepare_message().
  * @spi_flash_read: to support spi-controller hardwares that provide
  *                  accelerated interface to read from flash devices.
+ * @flash_read_supported: spi device supports flash read
  * @cs_gpios: Array of GPIOs to use as chip select lines; one per CS
  *     number. Any individual value may be -ENOENT for CS lines that
  *     are not GPIOs (driven by the SPI controller itself).
@@ -529,6 +530,7 @@ struct spi_master {
                                 struct spi_message *message);
        int (*spi_flash_read)(struct  spi_device *spi,
                              struct spi_flash_read_message *msg);
+       bool (*flash_read_supported)(struct spi_device *spi);
 
        /*
         * These hooks are for drivers that use a generic implementation
@@ -1158,7 +1160,9 @@ struct spi_flash_read_message {
 /* SPI core interface for flash read support */
 static inline bool spi_flash_read_supported(struct spi_device *spi)
 {
-       return spi->master->spi_flash_read ? true : false;
+       return spi->master->spi_flash_read &&
+              (!spi->master->flash_read_supported ||
+              spi->master->flash_read_supported(spi));
 }
 
 int spi_flash_read(struct spi_device *spi,
diff --git a/include/linux/stringhash.h b/include/linux/stringhash.h
new file mode 100644 (file)
index 0000000..451771d
--- /dev/null
@@ -0,0 +1,76 @@
+#ifndef __LINUX_STRINGHASH_H
+#define __LINUX_STRINGHASH_H
+
+#include <linux/compiler.h>    /* For __pure */
+#include <linux/types.h>       /* For u32, u64 */
+
+/*
+ * Routines for hashing strings of bytes to a 32-bit hash value.
+ *
+ * These hash functions are NOT GUARANTEED STABLE between kernel
+ * versions, architectures, or even repeated boots of the same kernel.
+ * (E.g. they may depend on boot-time hardware detection or be
+ * deliberately randomized.)
+ *
+ * They are also not intended to be secure against collisions caused by
+ * malicious inputs; much slower hash functions are required for that.
+ *
+ * They are optimized for pathname components, meaning short strings.
+ * Even if a majority of files have longer names, the dynamic profile of
+ * pathname components skews short due to short directory names.
+ * (E.g. /usr/lib/libsesquipedalianism.so.3.141.)
+ */
+
+/*
+ * Version 1: one byte at a time.  Example of use:
+ *
+ * unsigned long hash = init_name_hash;
+ * while (*p)
+ *     hash = partial_name_hash(tolower(*p++), hash);
+ * hash = end_name_hash(hash);
+ *
+ * Although this is designed for bytes, fs/hfsplus/unicode.c
+ * abuses it to hash 16-bit values.
+ */
+
+/* Hash courtesy of the R5 hash in reiserfs modulo sign bits */
+#define init_name_hash()               0
+
+/* partial hash update function. Assume roughly 4 bits per character */
+static inline unsigned long
+partial_name_hash(unsigned long c, unsigned long prevhash)
+{
+       return (prevhash + (c << 4) + (c >> 4)) * 11;
+}
+
+/*
+ * Finally: cut down the number of bits to an int value (and try to avoid
+ * losing bits)
+ */
+static inline unsigned long end_name_hash(unsigned long hash)
+{
+       return (unsigned int)hash;
+}
+
+/*
+ * Version 2: One word (32 or 64 bits) at a time.
+ * If CONFIG_DCACHE_WORD_ACCESS is defined (meaning <asm/word-at-a-time.h>
+ * exists, which describes major Linux platforms like x86 and ARM), then
+ * this computes a different hash function much faster.
+ *
+ * If not set, this falls back to a wrapper around the preceding.
+ */
+extern unsigned int __pure full_name_hash(const char *, unsigned int);
+
+/*
+ * A hash_len is a u64 with the hash of a string in the low
+ * half and the length in the high half.
+ */
+#define hashlen_hash(hashlen) ((u32)(hashlen))
+#define hashlen_len(hashlen)  ((u32)((hashlen) >> 32))
+#define hashlen_create(hash, len) ((u64)(len)<<32 | (u32)(hash))
+
+/* Return the "hash_len" (hash and length) of a null-terminated string */
+extern u64 __pure hashlen_string(const char *name);
+
+#endif /* __LINUX_STRINGHASH_H */
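A quick sketch of the word-at-a-time helpers declared above; the function name and the pr_debug() usage are illustrative only:

#include <linux/printk.h>
#include <linux/stringhash.h>

static void example_hash_name(const char *name)
{
	u64 hashlen = hashlen_string(name);	/* hash in low 32 bits, length in high */
	u32 hash = hashlen_hash(hashlen);
	u32 len  = hashlen_len(hashlen);

	pr_debug("%s: hash %#x len %u\n", name, hash, len);
}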
index 6a241a2..8997915 100644 (file)
@@ -127,7 +127,7 @@ struct rpc_authops {
        void                    (*destroy)(struct rpc_auth *);
 
        struct rpc_cred *       (*lookup_cred)(struct rpc_auth *, struct auth_cred *, int);
-       struct rpc_cred *       (*crcreate)(struct rpc_auth*, struct auth_cred *, int);
+       struct rpc_cred *       (*crcreate)(struct rpc_auth*, struct auth_cred *, int, gfp_t);
        int                     (*list_pseudoflavors)(rpc_authflavor_t *, int);
        rpc_authflavor_t        (*info2flavor)(struct rpcsec_gss_info *);
        int                     (*flavor2info)(rpc_authflavor_t,
@@ -167,6 +167,7 @@ void                        rpc_destroy_authunix(void);
 
 struct rpc_cred *      rpc_lookup_cred(void);
 struct rpc_cred *      rpc_lookup_cred_nonblock(void);
+struct rpc_cred *      rpc_lookup_generic_cred(struct auth_cred *, int, gfp_t);
 struct rpc_cred *      rpc_lookup_machine_cred(const char *service_name);
 int                    rpcauth_register(const struct rpc_authops *);
 int                    rpcauth_unregister(const struct rpc_authops *);
@@ -178,7 +179,7 @@ rpc_authflavor_t    rpcauth_get_pseudoflavor(rpc_authflavor_t,
 int                    rpcauth_get_gssinfo(rpc_authflavor_t,
                                struct rpcsec_gss_info *);
 int                    rpcauth_list_flavors(rpc_authflavor_t *, int);
-struct rpc_cred *      rpcauth_lookup_credcache(struct rpc_auth *, struct auth_cred *, int);
+struct rpc_cred *      rpcauth_lookup_credcache(struct rpc_auth *, struct auth_cred *, int, gfp_t);
 void                   rpcauth_init_cred(struct rpc_cred *, const struct auth_cred *, struct rpc_auth *, const struct rpc_credops *);
 struct rpc_cred *      rpcauth_lookupcred(struct rpc_auth *, int);
 struct rpc_cred *      rpcauth_generic_bind_cred(struct rpc_task *, struct rpc_cred *, int);
@@ -201,9 +202,28 @@ char *                     rpcauth_stringify_acceptor(struct rpc_cred *);
 static inline
 struct rpc_cred *      get_rpccred(struct rpc_cred *cred)
 {
-       atomic_inc(&cred->cr_count);
+       if (cred != NULL)
+               atomic_inc(&cred->cr_count);
        return cred;
 }
 
+/**
+ * get_rpccred_rcu - get a reference to a cred using rcu-protected pointer
+ * @cred: cred of which to take a reference
+ *
+ * In some cases, we may have a pointer to a credential to which we
+ * want to take a reference, but don't already have one. Because these
+ * objects are freed using RCU, we can access the cr_count while it's
+ * on its way to destruction and only take a reference if it's not already
+ * zero.
+ */
+static inline struct rpc_cred *
+get_rpccred_rcu(struct rpc_cred *cred)
+{
+       if (atomic_inc_not_zero(&cred->cr_count))
+               return cred;
+       return NULL;
+}
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_SUNRPC_AUTH_H */
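To make the intended use of get_rpccred_rcu() above concrete, a caller holding only an RCU-protected pointer might do something like this (a sketch; struct example_ctx and its field are made up):

#include <linux/rcupdate.h>
#include <linux/sunrpc/auth.h>

struct example_ctx {
	struct rpc_cred __rcu *cred;	/* hypothetical RCU-protected pointer */
};

static struct rpc_cred *example_get_cred(struct example_ctx *ctx)
{
	struct rpc_cred *cred;

	rcu_read_lock();
	cred = rcu_dereference(ctx->cred);
	if (cred)
		cred = get_rpccred_rcu(cred);	/* NULL if cr_count already hit zero */
	rcu_read_unlock();

	return cred;	/* caller drops it with put_rpccred() when non-NULL */
}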
index 9a7ddba..19c659d 100644 (file)
@@ -176,6 +176,7 @@ void                rpc_setbufsize(struct rpc_clnt *, unsigned int, unsigned int);
 int            rpc_protocol(struct rpc_clnt *);
 struct net *   rpc_net_ns(struct rpc_clnt *);
 size_t         rpc_max_payload(struct rpc_clnt *);
+size_t         rpc_max_bc_payload(struct rpc_clnt *);
 unsigned long  rpc_get_timeout(struct rpc_clnt *clnt);
 void           rpc_force_rebind(struct rpc_clnt *);
 size_t         rpc_peeraddr(struct rpc_clnt *, struct sockaddr *, size_t);
index 8073713..59cbf16 100644 (file)
@@ -158,9 +158,9 @@ typedef __be32      rpc_fraghdr;
 
 /*
  * Note that RFC 1833 does not put any size restrictions on the
- * netid string, but all currently defined netid's fit in 4 bytes.
+ * netid string, but all currently defined netid's fit in 5 bytes.
  */
-#define RPCBIND_MAXNETIDLEN    (4u)
+#define RPCBIND_MAXNETIDLEN    (5u)
 
 /*
  * Universal addresses are introduced in RFC 1833 and further spelled
index 3081339..d6917b8 100644 (file)
@@ -199,7 +199,7 @@ extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt,
                                    struct xdr_buf *rcvbuf);
 
 /* svc_rdma_marshal.c */
-extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg *, struct svc_rqst *);
+extern int svc_rdma_xdr_decode_req(struct xdr_buf *);
 extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *,
                                     struct rpcrdma_msg *,
                                     enum rpcrdma_errcode, __be32 *);
index c00f53a..91d5a5d 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/sunrpc/cache.h>
 #include <linux/sunrpc/gss_api.h>
 #include <linux/hash.h>
+#include <linux/stringhash.h>
 #include <linux/cred.h>
 
 struct svc_cred {
@@ -165,41 +166,18 @@ extern int svcauth_unix_set_client(struct svc_rqst *rqstp);
 extern int unix_gid_cache_create(struct net *net);
 extern void unix_gid_cache_destroy(struct net *net);
 
-static inline unsigned long hash_str(char *name, int bits)
+/*
+ * The <stringhash.h> functions are good enough that we don't need to
+ * use hash_32() on them; just extracting the high bits is enough.
+ */
+static inline unsigned long hash_str(char const *name, int bits)
 {
-       unsigned long hash = 0;
-       unsigned long l = 0;
-       int len = 0;
-       unsigned char c;
-       do {
-               if (unlikely(!(c = *name++))) {
-                       c = (char)len; len = -1;
-               }
-               l = (l << 8) | c;
-               len++;
-               if ((len & (BITS_PER_LONG/8-1))==0)
-                       hash = hash_long(hash^l, BITS_PER_LONG);
-       } while (len);
-       return hash >> (BITS_PER_LONG - bits);
+       return hashlen_hash(hashlen_string(name)) >> (32 - bits);
 }
 
-static inline unsigned long hash_mem(char *buf, int length, int bits)
+static inline unsigned long hash_mem(char const *buf, int length, int bits)
 {
-       unsigned long hash = 0;
-       unsigned long l = 0;
-       int len = 0;
-       unsigned char c;
-       do {
-               if (len == length) {
-                       c = (char)len; len = -1;
-               } else
-                       c = *buf++;
-               l = (l << 8) | c;
-               len++;
-               if ((len & (BITS_PER_LONG/8-1))==0)
-                       hash = hash_long(hash^l, BITS_PER_LONG);
-       } while (len);
-       return hash >> (BITS_PER_LONG - bits);
+       return full_name_hash(buf, length) >> (32 - bits);
 }
 
 #endif /* __KERNEL__ */
index fb0d212..5aa3834 100644 (file)
@@ -142,6 +142,7 @@ struct rpc_xprt_ops {
        int             (*bc_setup)(struct rpc_xprt *xprt,
                                    unsigned int min_reqs);
        int             (*bc_up)(struct svc_serv *serv, struct net *net);
+       size_t          (*bc_maxpayload)(struct rpc_xprt *xprt);
        void            (*bc_free_rqst)(struct rpc_rqst *rqst);
        void            (*bc_destroy)(struct rpc_xprt *xprt,
                                      unsigned int max_reqs);
index 767190b..39267dc 100644 (file)
@@ -52,7 +52,9 @@
 #define RPCRDMA_DEF_SLOT_TABLE (128U)
 #define RPCRDMA_MAX_SLOT_TABLE (256U)
 
-#define RPCRDMA_DEF_INLINE  (1024)     /* default inline max */
+#define RPCRDMA_MIN_INLINE  (1024)     /* min inline thresh */
+#define RPCRDMA_DEF_INLINE  (1024)     /* default inline thresh */
+#define RPCRDMA_MAX_INLINE  (3068)     /* max inline thresh */
 
 /* Memory registration strategies, by number.
  * This is part of a kernel / user space API. Do not remove. */
index 1b8a5a7..e45abe7 100644 (file)
@@ -340,6 +340,7 @@ struct thermal_zone_of_device_ops {
        int (*get_temp)(void *, int *);
        int (*get_trend)(void *, long *);
        int (*set_emul_temp)(void *, int);
+       int (*set_trip_temp)(void *, int, int);
 };
 
 /**
index 37dbacf..816b754 100644 (file)
@@ -21,6 +21,9 @@ static inline int do_sys_settimeofday(const struct timespec *tv,
        struct timespec64 ts64;
 
        if (!tv)
+               return do_sys_settimeofday64(NULL, tz);
+
+       if (!timespec_valid(tv))
                return -EINVAL;
 
        ts64 = timespec_to_timespec64(*tv);
index 1cc4c57..94079ba 100644 (file)
@@ -33,8 +33,8 @@ struct xattr_handler {
                   struct inode *inode, const char *name, void *buffer,
                   size_t size);
        int (*set)(const struct xattr_handler *, struct dentry *dentry,
-                  const char *name, const void *buffer, size_t size,
-                  int flags);
+                  struct inode *inode, const char *name, const void *buffer,
+                  size_t size, int flags);
 };
 
 const char *xattr_full_name(const struct xattr_handler *, const char *);
@@ -54,7 +54,8 @@ int vfs_removexattr(struct dentry *, const char *);
 
 ssize_t generic_getxattr(struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size);
 ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size);
-int generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags);
+int generic_setxattr(struct dentry *dentry, struct inode *inode,
+                    const char *name, const void *value, size_t size, int flags);
 int generic_removexattr(struct dentry *dentry, const char *name);
 ssize_t vfs_getxattr_alloc(struct dentry *dentry, const char *name,
                           char **xattr_value, size_t size, gfp_t flags);
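A filesystem adopting the new ->set() prototype above now receives the inode explicitly; a hedged sketch (the handler name and prefix are made up):

#include <linux/fs.h>
#include <linux/xattr.h>

static int example_xattr_set(const struct xattr_handler *handler,
			     struct dentry *dentry, struct inode *inode,
			     const char *name, const void *buffer,
			     size_t size, int flags)
{
	/* Operate on @inode directly instead of deriving it from @dentry. */
	return -EOPNOTSUPP;
}

static const struct xattr_handler example_xattr_handler = {
	.prefix = "user.",
	.set    = example_xattr_set,
};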
index d325c81..43a5a0e 100644 (file)
@@ -63,6 +63,8 @@ struct ip6_tnl_encap_ops {
                            u8 *protocol, struct flowi6 *fl6);
 };
 
+#ifdef CONFIG_INET
+
 extern const struct ip6_tnl_encap_ops __rcu *
                ip6tun_encaps[MAX_IPTUN_ENCAP_OPS];
 
@@ -138,7 +140,6 @@ struct net *ip6_tnl_get_link_net(const struct net_device *dev);
 int ip6_tnl_get_iflink(const struct net_device *dev);
 int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu);
 
-#ifdef CONFIG_INET
 static inline void ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb,
                                  struct net_device *dev)
 {
index 401038d..fea53f4 100644 (file)
@@ -61,6 +61,7 @@ psched_tdiff_bounded(psched_time_t tv1, psched_time_t tv2, psched_time_t bound)
 }
 
 struct qdisc_watchdog {
+       u64             last_expires;
        struct hrtimer  timer;
        struct Qdisc    *qdisc;
 };
index 37dd534..c8a773f 100644 (file)
@@ -239,12 +239,15 @@ struct ib_vendor_mad {
 
 #define IB_MGMT_CLASSPORTINFO_ATTR_ID  cpu_to_be16(0x0001)
 
+#define IB_CLASS_PORT_INFO_RESP_TIME_MASK      0x1F
+#define IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE 5
+
 struct ib_class_port_info {
        u8                      base_version;
        u8                      class_version;
        __be16                  capability_mask;
-       u8                      reserved[3];
-       u8                      resp_time_value;
+         /* 27 bits for cap_mask2, 5 bits for resp_time */
+       __be32                  cap_mask2_resp_time;
        u8                      redirect_gid[16];
        __be32                  redirect_tcslfl;
        __be16                  redirect_lid;
@@ -259,6 +262,59 @@ struct ib_class_port_info {
        __be32                  trap_qkey;
 };
 
+/**
+ * ib_get_cpi_resp_time - Returns the resp_time value from
+ * cap_mask2_resp_time in ib_class_port_info.
+ * @cpi: A struct ib_class_port_info mad.
+ */
+static inline u8 ib_get_cpi_resp_time(struct ib_class_port_info *cpi)
+{
+       return (u8)(be32_to_cpu(cpi->cap_mask2_resp_time) &
+                   IB_CLASS_PORT_INFO_RESP_TIME_MASK);
+}
+
+/**
+ * ib_set_cpi_resp_time - Sets the response time in an
+ * ib_class_port_info mad.
+ * @cpi: A struct ib_class_port_info.
+ * @rtime: The response time to set.
+ */
+static inline void ib_set_cpi_resp_time(struct ib_class_port_info *cpi,
+                                       u8 rtime)
+{
+       cpi->cap_mask2_resp_time =
+               (cpi->cap_mask2_resp_time &
+                cpu_to_be32(~IB_CLASS_PORT_INFO_RESP_TIME_MASK)) |
+               cpu_to_be32(rtime & IB_CLASS_PORT_INFO_RESP_TIME_MASK);
+}
+
+/**
+ * ib_get_cpi_capmask2 - Returns the capmask2 value from
+ * cap_mask2_resp_time in ib_class_port_info.
+ * @cpi: A struct ib_class_port_info mad.
+ */
+static inline u32 ib_get_cpi_capmask2(struct ib_class_port_info *cpi)
+{
+       return (be32_to_cpu(cpi->cap_mask2_resp_time) >>
+               IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE);
+}
+
+/**
+ * ib_set_cpi_capmask2 - Sets the capmask2 in an
+ * ib_class_port_info mad.
+ * @cpi: A struct ib_class_port_info.
+ * @capmask2: The capmask2 to set.
+ */
+static inline void ib_set_cpi_capmask2(struct ib_class_port_info *cpi,
+                                      u32 capmask2)
+{
+       cpi->cap_mask2_resp_time =
+               (cpi->cap_mask2_resp_time &
+                cpu_to_be32(IB_CLASS_PORT_INFO_RESP_TIME_MASK)) |
+               cpu_to_be32(capmask2 <<
+                           IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE);
+}
+
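A brief sketch of how the accessors above cooperate on the shared cap_mask2_resp_time word (the values used are hypothetical):

#include <rdma/ib_mad.h>

static void example_fill_cpi(struct ib_class_port_info *cpi)
{
	u8 rtime;

	/* Each helper only touches its own bits of cap_mask2_resp_time. */
	ib_set_cpi_resp_time(cpi, 18);		/* hypothetical 5-bit response time */
	ib_set_cpi_capmask2(cpi, 0x1);		/* hypothetical capability bit */

	rtime = ib_get_cpi_resp_time(cpi);	/* reads back 18 */
	(void)rtime;
}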
 struct ib_mad_notice_attr {
        u8 generic_type;
        u8 prod_type_msb;
index 0f3daae..b13419c 100644 (file)
@@ -103,6 +103,9 @@ enum {
        IB_OPCODE_ATOMIC_ACKNOWLEDGE                = 0x12,
        IB_OPCODE_COMPARE_SWAP                      = 0x13,
        IB_OPCODE_FETCH_ADD                         = 0x14,
+       /* opcode 0x15 is reserved */
+       IB_OPCODE_SEND_LAST_WITH_INVALIDATE         = 0x16,
+       IB_OPCODE_SEND_ONLY_WITH_INVALIDATE         = 0x17,
 
        /* real constants follow -- see comment about above IB_OPCODE()
           macro for more details */
@@ -129,6 +132,8 @@ enum {
        IB_OPCODE(RC, ATOMIC_ACKNOWLEDGE),
        IB_OPCODE(RC, COMPARE_SWAP),
        IB_OPCODE(RC, FETCH_ADD),
+       IB_OPCODE(RC, SEND_LAST_WITH_INVALIDATE),
+       IB_OPCODE(RC, SEND_ONLY_WITH_INVALIDATE),
 
        /* UC */
        IB_OPCODE(UC, SEND_FIRST),
index cdc1c81..3840416 100644 (file)
@@ -94,6 +94,8 @@ enum ib_sa_selector {
        IB_SA_BEST = 3
 };
 
+#define IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT      BIT(12)
+
 /*
  * Structures for SA records are named "struct ib_sa_xxx_rec."  No
  * attempt is made to pack structures to match the physical layout of
@@ -439,4 +441,14 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client,
                              void *context,
                              struct ib_sa_query **sa_query);
 
+/* Support get SA ClassPortInfo */
+int ib_sa_classport_info_rec_query(struct ib_sa_client *client,
+                                  struct ib_device *device, u8 port_num,
+                                  int timeout_ms, gfp_t gfp_mask,
+                                  void (*callback)(int status,
+                                                   struct ib_class_port_info *resp,
+                                                   void *context),
+                                  void *context,
+                                  struct ib_sa_query **sa_query);
+
 #endif /* IB_SA_H */
index fc0320c..432bed5 100644 (file)
@@ -403,56 +403,55 @@ enum ib_port_speed {
        IB_SPEED_EDR    = 32
 };
 
-struct ib_protocol_stats {
-       /* TBD... */
-};
-
-struct iw_protocol_stats {
-       u64     ipInReceives;
-       u64     ipInHdrErrors;
-       u64     ipInTooBigErrors;
-       u64     ipInNoRoutes;
-       u64     ipInAddrErrors;
-       u64     ipInUnknownProtos;
-       u64     ipInTruncatedPkts;
-       u64     ipInDiscards;
-       u64     ipInDelivers;
-       u64     ipOutForwDatagrams;
-       u64     ipOutRequests;
-       u64     ipOutDiscards;
-       u64     ipOutNoRoutes;
-       u64     ipReasmTimeout;
-       u64     ipReasmReqds;
-       u64     ipReasmOKs;
-       u64     ipReasmFails;
-       u64     ipFragOKs;
-       u64     ipFragFails;
-       u64     ipFragCreates;
-       u64     ipInMcastPkts;
-       u64     ipOutMcastPkts;
-       u64     ipInBcastPkts;
-       u64     ipOutBcastPkts;
-
-       u64     tcpRtoAlgorithm;
-       u64     tcpRtoMin;
-       u64     tcpRtoMax;
-       u64     tcpMaxConn;
-       u64     tcpActiveOpens;
-       u64     tcpPassiveOpens;
-       u64     tcpAttemptFails;
-       u64     tcpEstabResets;
-       u64     tcpCurrEstab;
-       u64     tcpInSegs;
-       u64     tcpOutSegs;
-       u64     tcpRetransSegs;
-       u64     tcpInErrs;
-       u64     tcpOutRsts;
-};
-
-union rdma_protocol_stats {
-       struct ib_protocol_stats        ib;
-       struct iw_protocol_stats        iw;
-};
+/**
+ * struct rdma_hw_stats
+ * @timestamp - Used by the core code to track when the last update was
+ * @lifespan - Used by the core code to determine how old the counters
+ *   should be before being updated again.  Stored in jiffies, defaults
+ *   to 10 milliseconds, drivers can override the default by specifying
+ *   their own value during their allocation routine.
+ * @names - Array of pointers to the static names used for the counters
+ *   in the sysfs directory.
+ * @num_counters - How many hardware counters there are.  If names is
+ *   shorter than this number, a kernel oops will result.  Driver authors
+ *   are encouraged to leave BUILD_BUG_ON(ARRAY_SIZE(@names) < num_counters)
+ *   in their code to prevent this.
+ * @value - Array of u64 counters that are accessed by the sysfs code and
+ *   filled in by the driver's get_hw_stats routine
+ */
+struct rdma_hw_stats {
+       unsigned long   timestamp;
+       unsigned long   lifespan;
+       const char * const *names;
+       int             num_counters;
+       u64             value[];
+};
+
+#define RDMA_HW_STATS_DEFAULT_LIFESPAN 10
+/**
+ * rdma_alloc_hw_stats_struct - Helper function to allocate dynamic struct
+ *   for drivers.
+ * @names - Array of static const char *
+ * @num_counters - How many elements in array
+ * @lifespan - How many milliseconds between updates
+ */
+static inline struct rdma_hw_stats *rdma_alloc_hw_stats_struct(
+               const char * const *names, int num_counters,
+               unsigned long lifespan)
+{
+       struct rdma_hw_stats *stats;
+
+       stats = kzalloc(sizeof(*stats) + num_counters * sizeof(u64),
+                       GFP_KERNEL);
+       if (!stats)
+               return NULL;
+       stats->names = names;
+       stats->num_counters = num_counters;
+       stats->lifespan = msecs_to_jiffies(lifespan);
+
+       return stats;
+}
+
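As a sketch of the intended driver-side use (the counter names and the example_ prefix are invented), an alloc_hw_stats() callback could be as small as:

#include <linux/kernel.h>
#include <rdma/ib_verbs.h>

static const char * const example_counter_names[] = {
	"rx_packets",
	"tx_packets",
	"rx_errors",
};

static struct rdma_hw_stats *example_alloc_hw_stats(struct ib_device *ibdev,
						    u8 port_num)
{
	/* Lifespan is in milliseconds; the helper converts it to jiffies. */
	return rdma_alloc_hw_stats_struct(example_counter_names,
					  ARRAY_SIZE(example_counter_names),
					  RDMA_HW_STATS_DEFAULT_LIFESPAN);
}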
 
 /* Define bits for the various functionality this port needs to be supported by
  * the core.
@@ -1707,8 +1706,29 @@ struct ib_device {
 
        struct iw_cm_verbs           *iwcm;
 
-       int                        (*get_protocol_stats)(struct ib_device *device,
-                                                        union rdma_protocol_stats *stats);
+       /**
+        * alloc_hw_stats - Allocate a struct rdma_hw_stats and fill in the
+        *   driver initialized data.  The struct is kfree()'ed by the sysfs
+        *   core when the device is removed.  A lifespan of -1 in the return
+        *   struct tells the core to set a default lifespan.
+        */
+       struct rdma_hw_stats      *(*alloc_hw_stats)(struct ib_device *device,
+                                                    u8 port_num);
+       /**
+        * get_hw_stats - Fill in the counter value(s) in the stats struct.
+        * @index - The index in the value array we wish to have updated, or
+        *   num_counters if we want all stats updated
+        * Return codes -
+        *   < 0 - Error, no counters updated
+        *   index - Updated the single counter pointed to by index
+        *   num_counters - Updated all counters (will reset the timestamp
+        *     and prevent further calls for lifespan milliseconds)
+        * Drivers are allowed to update all counters in lieu of just the
+        *   one given in index at their option
+        */
+       int                        (*get_hw_stats)(struct ib_device *device,
+                                                  struct rdma_hw_stats *stats,
+                                                  u8 port, int index);
        int                        (*query_device)(struct ib_device *device,
                                                   struct ib_device_attr *device_attr,
                                                   struct ib_udata *udata);
@@ -1926,6 +1946,8 @@ struct ib_device {
        u8                           node_type;
        u8                           phys_port_cnt;
        struct ib_device_attr        attrs;
+       struct attribute_group       *hw_stats_ag;
+       struct rdma_hw_stats         *hw_stats;
 
        /**
         * The following mandatory functions are used only at device
index d57ceee..16274e2 100644 (file)
@@ -149,15 +149,15 @@ struct rvt_driver_params {
        int qpn_res_end;
        int nports;
        int npkeys;
-       u8 qos_shift;
        char cq_name[RVT_CQN_MAX];
        int node;
-       int max_rdma_atomic;
        int psn_mask;
        int psn_shift;
        int psn_modify_mask;
        u32 core_cap_flags;
        u32 max_mad_size;
+       u8 qos_shift;
+       u8 max_rdma_atomic;
 };
 
 /* Protection domain */
@@ -425,6 +425,15 @@ static inline unsigned rvt_get_npkeys(struct rvt_dev_info *rdi)
        return rdi->dparms.npkeys;
 }
 
+/*
+ * Return the max atomic suitable for determining
+ * the size of the ack ring buffer in a QP.
+ */
+static inline unsigned int rvt_max_atomic(struct rvt_dev_info *rdi)
+{
+       return rdi->dparms.max_rdma_atomic + 1;
+}
+
 /*
  * Return the indexed PKEY from the port PKEY table.
  */
index 0e1ff2a..6d23b87 100644 (file)
@@ -211,8 +211,6 @@ struct rvt_mmap_info {
        unsigned size;
 };
 
-#define RVT_MAX_RDMA_ATOMIC    16
-
 /*
  * This structure holds the information that the send tasklet needs
  * to send a RDMA read response or atomic operation.
@@ -282,8 +280,7 @@ struct rvt_qp {
        atomic_t refcount ____cacheline_aligned_in_smp;
        wait_queue_head_t wait;
 
-       struct rvt_ack_entry s_ack_queue[RVT_MAX_RDMA_ATOMIC + 1]
-               ____cacheline_aligned_in_smp;
+       struct rvt_ack_entry *s_ack_queue;
        struct rvt_sge_state s_rdma_read_sge;
 
        spinlock_t r_lock ____cacheline_aligned_in_smp;      /* used for APM */
index c3371fa..4ac24f5 100644 (file)
@@ -74,6 +74,7 @@ enum iscsit_transport_type {
        ISCSI_IWARP_TCP                         = 3,
        ISCSI_IWARP_SCTP                        = 4,
        ISCSI_INFINIBAND                        = 5,
+       ISCSI_CXGBIT                            = 6,
 };
 
 /* RFC-3720 7.1.4  Standard Connection State Diagram for a Target */
@@ -890,4 +891,30 @@ static inline u32 session_get_next_ttt(struct iscsi_session *session)
 }
 
 extern struct iscsi_cmd *iscsit_find_cmd_from_itt(struct iscsi_conn *, itt_t);
+
+static inline void iscsit_thread_check_cpumask(
+       struct iscsi_conn *conn,
+       struct task_struct *p,
+       int mode)
+{
+       /*
+        * mode == 1 signals iscsi_target_tx_thread() usage.
+        * mode == 0 signals iscsi_target_rx_thread() usage.
+        */
+       if (mode == 1) {
+               if (!conn->conn_tx_reset_cpumask)
+                       return;
+               conn->conn_tx_reset_cpumask = 0;
+       } else {
+               if (!conn->conn_rx_reset_cpumask)
+                       return;
+               conn->conn_rx_reset_cpumask = 0;
+       }
+       /*
+        * Update the CPU mask for this single kthread so that
+        * both TX and RX kthreads are scheduled to run on the
+        * same CPU.
+        */
+       set_cpus_allowed_ptr(p, conn->conn_cpumask);
+}
 #endif /* ISCSI_TARGET_CORE_H */
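To show where the helper above is meant to be called (a sketch only; the header path and the loop body are assumptions):

#include <linux/kthread.h>
#include <linux/sched.h>
#include <target/iscsi/iscsi_target_core.h>	/* path assumed */

static int example_tx_thread(void *arg)
{
	struct iscsi_conn *conn = arg;

	while (!kthread_should_stop()) {
		/* Re-pin this kthread if the login path updated conn_cpumask;
		 * mode == 1 selects the TX branch of the helper above.
		 */
		iscsit_thread_check_cpumask(conn, current, 1);

		/* ... dequeue and transmit responses here ... */
	}

	return 0;
}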
index 90e37fa..40ac7cd 100644 (file)
@@ -6,6 +6,7 @@ struct iscsit_transport {
 #define ISCSIT_TRANSPORT_NAME  16
        char name[ISCSIT_TRANSPORT_NAME];
        int transport_type;
+       bool rdma_shutdown;
        int priv_size;
        struct module *owner;
        struct list_head t_node;
@@ -22,6 +23,13 @@ struct iscsit_transport {
        int (*iscsit_queue_data_in)(struct iscsi_conn *, struct iscsi_cmd *);
        int (*iscsit_queue_status)(struct iscsi_conn *, struct iscsi_cmd *);
        void (*iscsit_aborted_task)(struct iscsi_conn *, struct iscsi_cmd *);
+       int (*iscsit_xmit_pdu)(struct iscsi_conn *, struct iscsi_cmd *,
+                              struct iscsi_datain_req *, const void *, u32);
+       void (*iscsit_release_cmd)(struct iscsi_conn *, struct iscsi_cmd *);
+       void (*iscsit_get_rx_pdu)(struct iscsi_conn *);
+       int (*iscsit_validate_params)(struct iscsi_conn *);
+       void (*iscsit_get_r2t_ttt)(struct iscsi_conn *, struct iscsi_cmd *,
+                                  struct iscsi_r2t *);
        enum target_prot_op (*iscsit_get_sup_prot_ops)(struct iscsi_conn *);
 };
 
@@ -77,6 +85,18 @@ extern void iscsit_build_reject(struct iscsi_cmd *, struct iscsi_conn *,
 extern int iscsit_build_logout_rsp(struct iscsi_cmd *, struct iscsi_conn *,
                                struct iscsi_logout_rsp *);
 extern int iscsit_logout_post_handler(struct iscsi_cmd *, struct iscsi_conn *);
+extern int iscsit_queue_rsp(struct iscsi_conn *, struct iscsi_cmd *);
+extern void iscsit_aborted_task(struct iscsi_conn *, struct iscsi_cmd *);
+extern int iscsit_add_reject(struct iscsi_conn *, u8, unsigned char *);
+extern int iscsit_reject_cmd(struct iscsi_cmd *, u8, unsigned char *);
+extern int iscsit_handle_snack(struct iscsi_conn *, unsigned char *);
+extern void iscsit_build_datain_pdu(struct iscsi_cmd *, struct iscsi_conn *,
+                                   struct iscsi_datain *,
+                                   struct iscsi_data_rsp *, bool);
+extern int iscsit_build_r2ts_for_cmd(struct iscsi_conn *, struct iscsi_cmd *,
+                                    bool);
+extern int iscsit_immediate_queue(struct iscsi_conn *, struct iscsi_cmd *, int);
+extern int iscsit_response_queue(struct iscsi_conn *, struct iscsi_cmd *, int);
 /*
  * From iscsi_target_device.c
  */
@@ -102,3 +122,24 @@ extern struct iscsi_cmd *iscsit_allocate_cmd(struct iscsi_conn *, int);
 extern int iscsit_sequence_cmd(struct iscsi_conn *, struct iscsi_cmd *,
                               unsigned char *, __be32);
 extern void iscsit_release_cmd(struct iscsi_cmd *);
+extern void iscsit_free_cmd(struct iscsi_cmd *, bool);
+extern void iscsit_add_cmd_to_immediate_queue(struct iscsi_cmd *,
+                                             struct iscsi_conn *, u8);
+
+/*
+ * From iscsi_target_nego.c
+ */
+extern int iscsi_target_check_login_request(struct iscsi_conn *,
+                                           struct iscsi_login *);
+
+/*
+ * From iscsi_target_login.c
+ */
+extern __printf(2, 3) int iscsi_change_param_sprintf(
+       struct iscsi_conn *, const char *, ...);
+
+/*
+ * From iscsi_target_parameters.c
+ */
+extern struct iscsi_param *iscsi_find_param_from_key(
+       char *, struct iscsi_param_list *);
index 3e0dd86..b316b44 100644 (file)
@@ -536,7 +536,6 @@ struct se_node_acl {
        char                    initiatorname[TRANSPORT_IQN_LEN];
        /* Used to signal demo mode created ACL, disabled by default */
        bool                    dynamic_node_acl;
-       bool                    acl_stop:1;
        u32                     queue_depth;
        u32                     acl_index;
        enum target_prot_type   saved_prot_type;
@@ -603,7 +602,6 @@ struct se_session {
        struct list_head        sess_cmd_list;
        struct list_head        sess_wait_list;
        spinlock_t              sess_cmd_lock;
-       struct kref             sess_kref;
        void                    *sess_cmd_map;
        struct percpu_ida       sess_tag_pool;
 };
index 78d88f0..de44462 100644 (file)
@@ -50,10 +50,6 @@ struct target_core_fabric_ops {
         */
        int (*check_stop_free)(struct se_cmd *);
        void (*release_cmd)(struct se_cmd *);
-       /*
-        * Called with spin_lock_bh(struct se_portal_group->session_lock held.
-        */
-       int (*shutdown_session)(struct se_session *);
        void (*close_session)(struct se_session *);
        u32 (*sess_get_index)(struct se_session *);
        /*
@@ -123,8 +119,6 @@ void        __transport_register_session(struct se_portal_group *,
                struct se_node_acl *, struct se_session *, void *);
 void   transport_register_session(struct se_portal_group *,
                struct se_node_acl *, struct se_session *, void *);
-int    target_get_session(struct se_session *);
-void   target_put_session(struct se_session *);
 ssize_t        target_show_dynamic_sessions(struct se_portal_group *, char *);
 void   transport_free_session(struct se_session *);
 void   target_put_nacl(struct se_node_acl *);
index 526fb3d..f28292d 100644 (file)
@@ -108,7 +108,7 @@ TRACE_EVENT(kvm_ioapic_set_irq,
                __entry->coalesced      = coalesced;
        ),
 
-       TP_printk("pin %u dst %x vec=%u (%s|%s|%s%s)%s",
+       TP_printk("pin %u dst %x vec %u (%s|%s|%s%s)%s",
                  __entry->pin, (u8)(__entry->e >> 56), (u8)__entry->e,
                  __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
                  (__entry->e & (1<<11)) ? "logical" : "physical",
@@ -129,7 +129,7 @@ TRACE_EVENT(kvm_ioapic_delayed_eoi_inj,
                __entry->e              = e;
        ),
 
-       TP_printk("dst %x vec=%u (%s|%s|%s%s)",
+       TP_printk("dst %x vec %u (%s|%s|%s%s)",
                  (u8)(__entry->e >> 56), (u8)__entry->e,
                  __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
                  (__entry->e & (1<<11)) ? "logical" : "physical",
@@ -151,7 +151,7 @@ TRACE_EVENT(kvm_msi_set_irq,
                __entry->data           = data;
        ),
 
-       TP_printk("dst %u vec %x (%s|%s|%s%s)",
+       TP_printk("dst %u vec %u (%s|%s|%s%s)",
                  (u8)(__entry->address >> 12), (u8)__entry->data,
                  __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode),
                  (__entry->address & (1<<2)) ? "logical" : "physical",
index c51afb7..a26415b 100644 (file)
@@ -127,8 +127,11 @@ __SYSCALL(__NR_unlinkat, sys_unlinkat)
 __SYSCALL(__NR_symlinkat, sys_symlinkat)
 #define __NR_linkat 37
 __SYSCALL(__NR_linkat, sys_linkat)
+#ifdef __ARCH_WANT_RENAMEAT
+/* renameat is superseded with flags by renameat2 */
 #define __NR_renameat 38
 __SYSCALL(__NR_renameat, sys_renameat)
+#endif /* __ARCH_WANT_RENAMEAT */
 
 /* fs/namespace.c */
 #define __NR_umount2 39
index 9222db8..5f030b4 100644 (file)
@@ -1353,6 +1353,15 @@ enum ethtool_link_mode_bit_indices {
        ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT = 28,
        ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT = 29,
        ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT = 30,
+       ETHTOOL_LINK_MODE_25000baseCR_Full_BIT  = 31,
+       ETHTOOL_LINK_MODE_25000baseKR_Full_BIT  = 32,
+       ETHTOOL_LINK_MODE_25000baseSR_Full_BIT  = 33,
+       ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT = 34,
+       ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT = 35,
+       ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT        = 36,
+       ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT        = 37,
+       ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT        = 38,
+       ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT    = 39,
 
        /* Last allowed bit for __ETHTOOL_LINK_MODE_LEGACY_MASK is bit
         * 31. Please do NOT define any SUPPORTED_* or ADVERTISED_*
@@ -1361,7 +1370,7 @@ enum ethtool_link_mode_bit_indices {
         */
 
        __ETHTOOL_LINK_MODE_LAST
-         = ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT,
+         = ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT,
 };
 
 #define __ETHTOOL_LINK_MODE_LEGACY_MASK(base_name)     \
index c4b2a3f..50ff21f 100644 (file)
@@ -61,5 +61,6 @@ struct nvme_passthru_cmd {
 #define NVME_IOCTL_IO_CMD      _IOWR('N', 0x43, struct nvme_passthru_cmd)
 #define NVME_IOCTL_RESET       _IO('N', 0x44)
 #define NVME_IOCTL_SUBSYS_RESET        _IO('N', 0x45)
+#define NVME_IOCTL_RESCAN      _IO('N', 0x46)
 
 #endif /* _UAPI_LINUX_NVME_IOCTL_H */
index 43fc8d2..36ce552 100644 (file)
@@ -862,6 +862,7 @@ enum perf_event_type {
 };
 
 #define PERF_MAX_STACK_DEPTH           127
+#define PERF_MAX_CONTEXTS_PER_STACK      8
 
 enum perf_callchain_context {
        PERF_CONTEXT_HV                 = (__u64)-32,
index eba5914..f4297c8 100644 (file)
@@ -145,6 +145,8 @@ enum {
        TCA_POLICE_PEAKRATE,
        TCA_POLICE_AVRATE,
        TCA_POLICE_RESULT,
+       TCA_POLICE_TM,
+       TCA_POLICE_PAD,
        __TCA_POLICE_MAX
 #define TCA_POLICE_RESULT TCA_POLICE_RESULT
 };
@@ -173,7 +175,7 @@ enum {
        TCA_U32_DIVISOR,
        TCA_U32_SEL,
        TCA_U32_POLICE,
-       TCA_U32_ACT,   
+       TCA_U32_ACT,
        TCA_U32_INDEV,
        TCA_U32_PCNT,
        TCA_U32_MARK,
index 763bb69..0ec1da2 100644 (file)
@@ -228,7 +228,7 @@ struct nand_oobfree {
  * complete set of ECC information. The ioctl truncates the larger internal
  * structure to retain binary compatibility with the static declaration of the
  * ioctl. Note that the "MTD_MAX_..._ENTRIES" macros represent the max size of
- * the user struct, not the MAX size of the internal struct nand_ecclayout.
+ * the user struct, not the MAX size of the internal OOB layout representation.
  */
 struct nand_ecclayout_user {
        __u32 eccbytes;
index a533cec..98bebf8 100644 (file)
@@ -66,7 +66,7 @@
  * The major version changes when data structures change in an incompatible
  * way. The driver must be the same for initialization to succeed.
  */
-#define HFI1_USER_SWMAJOR 5
+#define HFI1_USER_SWMAJOR 6
 
 /*
  * Minor version differences are always compatible
  * may not be implemented; the user code must deal with this if it
  * cares, or it must abort after initialization reports the difference.
  */
-#define HFI1_USER_SWMINOR 0
+#define HFI1_USER_SWMINOR 1
+
+/*
+ * We will encode the major/minor inside a single 32bit version number.
+ */
+#define HFI1_SWMAJOR_SHIFT 16
 
 /*
  * Set of HW and driver capability/feature bits.
 #define HFI1_RCVHDR_ENTSIZE_16   (1UL << 1)
 #define HFI1_RCVDHR_ENTSIZE_32   (1UL << 2)
 
-/*
- * If the unit is specified via open, HFI choice is fixed.  If port is
- * specified, it's also fixed.  Otherwise we try to spread contexts
- * across ports and HFIs, using different algorithms.  WITHIN is
- * the old default, prior to this mechanism.
- */
-#define HFI1_ALG_ACROSS 0 /* round robin contexts across HFIs, then
-                         * ports; this is the default */
-#define HFI1_ALG_WITHIN 1 /* use all contexts on an HFI (round robin
-                         * active ports within), then next HFI */
-#define HFI1_ALG_COUNT  2 /* number of algorithm choices */
-
-
 /* User commands. */
 #define HFI1_CMD_ASSIGN_CTXT     1     /* allocate HFI and context */
 #define HFI1_CMD_CTXT_INFO       2     /* find out what resources we got */
 #define HFI1_CMD_TID_UPDATE      4     /* update expected TID entries */
 #define HFI1_CMD_TID_FREE        5     /* free expected TID entries */
 #define HFI1_CMD_CREDIT_UPD      6     /* force an update of PIO credit */
-#define HFI1_CMD_SDMA_STATUS_UPD 7      /* force update of SDMA status ring */
 
 #define HFI1_CMD_RECV_CTRL       8     /* control receipt of packets */
 #define HFI1_CMD_POLL_TYPE       9     /* set the kind of polling we want */
 #define HFI1_CMD_SET_PKEY        11     /* set context's pkey */
 #define HFI1_CMD_CTXT_RESET      12     /* reset context's HW send context */
 #define HFI1_CMD_TID_INVAL_READ  13     /* read TID cache invalidations */
-/* separate EPROM commands from normal PSM commands */
-#define HFI1_CMD_EP_INFO         64      /* read EPROM device ID */
-#define HFI1_CMD_EP_ERASE_CHIP   65      /* erase whole EPROM */
-/* range 66-74 no longer used */
-#define HFI1_CMD_EP_ERASE_RANGE  75      /* erase EPROM range */
-#define HFI1_CMD_EP_READ_RANGE   76      /* read EPROM range */
-#define HFI1_CMD_EP_WRITE_RANGE  77      /* write EPROM range */
+#define HFI1_CMD_GET_VERS       14     /* get the version of the user cdev */
+
+/*
+ * User IOCTLs cannot go above 128; if they do, see common.h and change the
+ * base for the snoop ioctl.
+ */
+#define IB_IOCTL_MAGIC 0x1b /* See Documentation/ioctl/ioctl-number.txt */
+
+/*
+ * Make the ioctls occupy the last 0xf0-0xff portion of the IB range
+ */
+#define __NUM(cmd) (HFI1_CMD_##cmd + 0xe0)
+
+struct hfi1_cmd;
+#define HFI1_IOCTL_ASSIGN_CTXT \
+       _IOWR(IB_IOCTL_MAGIC, __NUM(ASSIGN_CTXT), struct hfi1_user_info)
+#define HFI1_IOCTL_CTXT_INFO \
+       _IOW(IB_IOCTL_MAGIC, __NUM(CTXT_INFO), struct hfi1_ctxt_info)
+#define HFI1_IOCTL_USER_INFO \
+       _IOW(IB_IOCTL_MAGIC, __NUM(USER_INFO), struct hfi1_base_info)
+#define HFI1_IOCTL_TID_UPDATE \
+       _IOWR(IB_IOCTL_MAGIC, __NUM(TID_UPDATE), struct hfi1_tid_info)
+#define HFI1_IOCTL_TID_FREE \
+       _IOWR(IB_IOCTL_MAGIC, __NUM(TID_FREE), struct hfi1_tid_info)
+#define HFI1_IOCTL_CREDIT_UPD \
+       _IO(IB_IOCTL_MAGIC, __NUM(CREDIT_UPD))
+#define HFI1_IOCTL_RECV_CTRL \
+       _IOW(IB_IOCTL_MAGIC, __NUM(RECV_CTRL), int)
+#define HFI1_IOCTL_POLL_TYPE \
+       _IOW(IB_IOCTL_MAGIC, __NUM(POLL_TYPE), int)
+#define HFI1_IOCTL_ACK_EVENT \
+       _IOW(IB_IOCTL_MAGIC, __NUM(ACK_EVENT), unsigned long)
+#define HFI1_IOCTL_SET_PKEY \
+       _IOW(IB_IOCTL_MAGIC, __NUM(SET_PKEY), __u16)
+#define HFI1_IOCTL_CTXT_RESET \
+       _IO(IB_IOCTL_MAGIC, __NUM(CTXT_RESET))
+#define HFI1_IOCTL_TID_INVAL_READ \
+       _IOWR(IB_IOCTL_MAGIC, __NUM(TID_INVAL_READ), struct hfi1_tid_info)
+#define HFI1_IOCTL_GET_VERS \
+       _IOR(IB_IOCTL_MAGIC, __NUM(GET_VERS), int)
 
 #define _HFI1_EVENT_FROZEN_BIT         0
 #define _HFI1_EVENT_LINKDOWN_BIT       1
@@ -199,9 +223,7 @@ struct hfi1_user_info {
         * Should be set to HFI1_USER_SWVERSION.
         */
        __u32 userversion;
-       __u16 pad;
-       /* HFI selection algorithm, if unit has not selected */
-       __u16 hfi1_alg;
+       __u32 pad;
        /*
         * If two or more processes wish to share a context, each process
         * must set the subcontext_cnt and subcontext_id to the same
@@ -243,12 +265,6 @@ struct hfi1_tid_info {
        __u32 length;
 };
 
-struct hfi1_cmd {
-       __u32 type;        /* command type */
-       __u32 len;         /* length of struct pointed to by add */
-       __u64 addr;        /* pointer to user structure */
-};
-
 enum hfi1_sdma_comp_state {
        FREE = 0,
        QUEUED,
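Taken together, the hunks above pack the ABI version into a single 32-bit word and move the user commands onto real ioctl numbers at the top of the 0x1b ('IB') magic range. A small sketch of how the numbers compose; the HFI1_USER_SWVERSION packing shown here is an assumption inferred from HFI1_SWMAJOR_SHIFT, since the diff does not show that macro:

#include <stdio.h>

#define HFI1_SWMAJOR_SHIFT 16
#define HFI1_USER_SWMAJOR 6
#define HFI1_USER_SWMINOR 1
#define HFI1_CMD_ASSIGN_CTXT 1

int main(void)
{
	/* assumed composition of the single 32-bit version word */
	unsigned int swversion = (HFI1_USER_SWMAJOR << HFI1_SWMAJOR_SHIFT) |
				 HFI1_USER_SWMINOR;

	/* __NUM(ASSIGN_CTXT) == 1 + 0xe0 == 0xe1, inside the 0xe0-0xff window */
	printf("swversion=%#x assign_ctxt nr=%#x\n",
	       swversion, HFI1_CMD_ASSIGN_CTXT + 0xe0);
	return 0;
}
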
index 6e373d1..02fe839 100644 (file)
@@ -135,10 +135,12 @@ enum {
  * Local service operations:
  *   RESOLVE - The client requests the local service to resolve a path.
  *   SET_TIMEOUT - The local service requests the client to set the timeout.
+ *   IP_RESOLVE - The client requests the local service to resolve an IP to GID.
  */
 enum {
        RDMA_NL_LS_OP_RESOLVE = 0,
        RDMA_NL_LS_OP_SET_TIMEOUT,
+       RDMA_NL_LS_OP_IP_RESOLVE,
        RDMA_NL_LS_NUM_OPS
 };
 
@@ -176,6 +178,10 @@ struct rdma_ls_resolve_header {
        __u8 path_use;
 };
 
+struct rdma_ls_ip_resolve_header {
+       __u32 ifindex;
+};
+
 /* Local service attribute type */
 #define RDMA_NLA_F_MANDATORY   (1 << 13)
 #define RDMA_NLA_TYPE_MASK     (~(NLA_F_NESTED | NLA_F_NET_BYTEORDER | \
@@ -193,6 +199,8 @@ struct rdma_ls_resolve_header {
  *   TCLASS          u8
  *   PKEY            u16                        cpu
  *   QOS_CLASS       u16                        cpu
+ *   IPV4            u32                        BE
+ *   IPV6            u8[16]                     BE
  */
 enum {
        LS_NLA_TYPE_UNSPEC = 0,
@@ -204,6 +212,8 @@ enum {
        LS_NLA_TYPE_TCLASS,
        LS_NLA_TYPE_PKEY,
        LS_NLA_TYPE_QOS_CLASS,
+       LS_NLA_TYPE_IPV4,
+       LS_NLA_TYPE_IPV6,
        LS_NLA_TYPE_MAX
 };
 
index c4cc1e4..e4701a3 100644 (file)
 #define SND_SOC_TPLG_STREAM_PLAYBACK   0
 #define SND_SOC_TPLG_STREAM_CAPTURE    1
 
+/* vendor tuple types */
+#define SND_SOC_TPLG_TUPLE_TYPE_UUID   0
+#define SND_SOC_TPLG_TUPLE_TYPE_STRING 1
+#define SND_SOC_TPLG_TUPLE_TYPE_BOOL   2
+#define SND_SOC_TPLG_TUPLE_TYPE_BYTE   3
+#define SND_SOC_TPLG_TUPLE_TYPE_WORD   4
+#define SND_SOC_TPLG_TUPLE_TYPE_SHORT  5
+
 /*
  * Block Header.
  * This header precedes all object and object arrays below.
@@ -132,6 +140,35 @@ struct snd_soc_tplg_hdr {
        __le32 count;           /* number of elements in block */
 } __attribute__((packed));
 
+/* vendor tuple for uuid */
+struct snd_soc_tplg_vendor_uuid_elem {
+       __le32 token;
+       char uuid[16];
+} __attribute__((packed));
+
+/* vendor tuple for a bool/byte/short/word value */
+struct snd_soc_tplg_vendor_value_elem {
+       __le32 token;
+       __le32 value;
+} __attribute__((packed));
+
+/* vendor tuple for string */
+struct snd_soc_tplg_vendor_string_elem {
+       __le32 token;
+       char string[SNDRV_CTL_ELEM_ID_NAME_MAXLEN];
+} __attribute__((packed));
+
+struct snd_soc_tplg_vendor_array {
+       __le32 size;    /* size in bytes of the array, including all elements */
+       __le32 type;    /* SND_SOC_TPLG_TUPLE_TYPE_ */
+       __le32 num_elems;       /* number of elements in array */
+       union {
+               struct snd_soc_tplg_vendor_uuid_elem uuid[0];
+               struct snd_soc_tplg_vendor_value_elem value[0];
+               struct snd_soc_tplg_vendor_string_elem string[0];
+       };
+} __attribute__((packed));
+
 /*
  * Private data.
  * All topology objects may have private data that can be used by the driver or
@@ -139,7 +176,10 @@ struct snd_soc_tplg_hdr {
  */
 struct snd_soc_tplg_private {
        __le32 size;    /* in bytes of private data */
-       char data[0];
+       union {
+               char data[0];
+               struct snd_soc_tplg_vendor_array array[0];
+       };
 } __attribute__((packed));
 
 /*
@@ -383,7 +423,7 @@ struct snd_soc_tplg_pcm {
        __le32 size;            /* in bytes of this structure */
        char pcm_name[SNDRV_CTL_ELEM_ID_NAME_MAXLEN];
        char dai_name[SNDRV_CTL_ELEM_ID_NAME_MAXLEN];
-       __le32 pcm_id;          /* unique ID - used to match */
+       __le32 pcm_id;          /* unique ID - used to match with DAI link */
        __le32 dai_id;          /* unique ID - used to match */
        __le32 playback;        /* supports playback mode */
        __le32 capture;         /* supports capture mode */
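With the union added to snd_soc_tplg_private, a topology parser can treat the private region as a sequence of vendor arrays and pick the element flavour from the type field. A hedged kernel-side sketch (not the actual soc-topology parser) that only walks the arrays:

/* Hedged sketch: walk vendor tuple arrays inside topology private data.
 * Which union member is valid follows from arr->type; the on-disk values
 * are little-endian, hence the le32_to_cpu() conversions.
 */
static void walk_vendor_tuples(const struct snd_soc_tplg_private *priv)
{
	const char *p = priv->data;
	const char *end = priv->data + le32_to_cpu(priv->size);

	while (p + sizeof(struct snd_soc_tplg_vendor_array) <= end) {
		const struct snd_soc_tplg_vendor_array *arr = (const void *)p;
		u32 size = le32_to_cpu(arr->size);

		if (size < sizeof(*arr))
			break;	/* malformed array, stop walking */

		switch (le32_to_cpu(arr->type)) {
		case SND_SOC_TPLG_TUPLE_TYPE_UUID:
			/* arr->uuid[0 .. num_elems - 1] are valid */
			break;
		case SND_SOC_TPLG_TUPLE_TYPE_STRING:
			/* arr->string[0 .. num_elems - 1] are valid */
			break;
		default:
			/* bool/byte/short/word tuples all use arr->value[] */
			break;
		}
		p += size;
	}
}
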
index ad66589..3a2a794 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/videodev2.h>
 #include <linux/bitmap.h>
 #include <linux/fb.h>
+#include <linux/of.h>
 #include <media/v4l2-mediabus.h>
 #include <video/videomode.h>
 
@@ -345,6 +346,7 @@ struct ipu_client_platformdata {
        int dc;
        int dp;
        int dma[2];
+       struct device_node *of_node;
 };
 
 #endif /* __DRM_IPU_H__ */
index a9c4aef..f755a60 100644 (file)
@@ -1306,6 +1306,17 @@ source "usr/Kconfig"
 
 endif
 
+choice
+       prompt "Compiler optimization level"
+       default CC_OPTIMIZE_FOR_PERFORMANCE
+
+config CC_OPTIMIZE_FOR_PERFORMANCE
+       bool "Optimize for performance"
+       help
+         This is the default optimization level for the kernel, building
+         with the "-O2" compiler flag for best performance and most
+         helpful compile-time warnings.
+
 config CC_OPTIMIZE_FOR_SIZE
        bool "Optimize for size"
        help
@@ -1314,6 +1325,8 @@ config CC_OPTIMIZE_FOR_SIZE
 
          If unsure, say N.
 
+endchoice
+
 config SYSCTL
        bool
 
@@ -2049,6 +2062,22 @@ config MODULE_COMPRESS_XZ
 
 endchoice
 
+config TRIM_UNUSED_KSYMS
+       bool "Trim unused exported kernel symbols"
+       depends on MODULES && !UNUSED_SYMBOLS
+       help
+         The kernel and some modules make many symbols available for
+         other modules to use via EXPORT_SYMBOL() and variants. Depending
+         on the set of modules being selected in your kernel configuration,
+         many of those exported symbols might never be used.
+
+         This option allows for unused exported symbols to be dropped from
+         the build. In turn, this provides the compiler more opportunities
+         (especially when using LTO) for optimizing the code and reducing
+         binary size.  This might have some security advantages as well.
+
+         If unsure, say N.
+
 endif # MODULES
 
 config MODULES_TREE_LOOKUP
index bc0f9e0..4c17fda 100644 (file)
@@ -607,6 +607,7 @@ asmlinkage __visible void __init start_kernel(void)
                initrd_start = 0;
        }
 #endif
+       page_ext_init();
        debug_objects_mem_init();
        kmemleak_init();
        setup_per_cpu_pageset();
@@ -1003,8 +1004,6 @@ static noinline void __init kernel_init_freeable(void)
        sched_init_smp();
 
        page_alloc_init_late();
-       /* Initialize page ext after all struct pages are initializaed */
-       page_ext_init();
 
        do_basic_setup();
 
index 04be702..318858e 100644 (file)
@@ -365,7 +365,6 @@ static struct file_system_type bpf_fs_type = {
        .name           = "bpf",
        .mount          = bpf_mount,
        .kill_sb        = kill_litter_super,
-       .fs_flags       = FS_USERNS_MOUNT,
 };
 
 MODULE_ALIAS_FS("bpf");
index c8ee352..080a2df 100644 (file)
@@ -136,7 +136,8 @@ u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
                               BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
                return -EINVAL;
 
-       trace = get_perf_callchain(regs, init_nr, kernel, user, false, false);
+       trace = get_perf_callchain(regs, init_nr, kernel, user,
+                                  sysctl_perf_event_max_stack, false, false);
 
        if (unlikely(!trace))
                /* couldn't fetch the stack trace */
index b9325e7..179ef46 100644 (file)
@@ -19,11 +19,13 @@ struct callchain_cpus_entries {
 };
 
 int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
+int sysctl_perf_event_max_contexts_per_stack __read_mostly = PERF_MAX_CONTEXTS_PER_STACK;
 
 static inline size_t perf_callchain_entry__sizeof(void)
 {
        return (sizeof(struct perf_callchain_entry) +
-               sizeof(__u64) * sysctl_perf_event_max_stack);
+               sizeof(__u64) * (sysctl_perf_event_max_stack +
+                                sysctl_perf_event_max_contexts_per_stack));
 }
 
 static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
@@ -32,12 +34,12 @@ static DEFINE_MUTEX(callchain_mutex);
 static struct callchain_cpus_entries *callchain_cpus_entries;
 
 
-__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
+__weak void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
                                  struct pt_regs *regs)
 {
 }
 
-__weak void perf_callchain_user(struct perf_callchain_entry *entry,
+__weak void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
                                struct pt_regs *regs)
 {
 }
@@ -176,14 +178,15 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
        if (!kernel && !user)
                return NULL;
 
-       return get_perf_callchain(regs, 0, kernel, user, crosstask, true);
+       return get_perf_callchain(regs, 0, kernel, user, sysctl_perf_event_max_stack, crosstask, true);
 }
 
 struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
-                  bool crosstask, bool add_mark)
+                  u32 max_stack, bool crosstask, bool add_mark)
 {
        struct perf_callchain_entry *entry;
+       struct perf_callchain_entry_ctx ctx;
        int rctx;
 
        entry = get_callchain_entry(&rctx);
@@ -193,12 +196,16 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
        if (!entry)
                goto exit_put;
 
-       entry->nr = init_nr;
+       ctx.entry     = entry;
+       ctx.max_stack = max_stack;
+       ctx.nr        = entry->nr = init_nr;
+       ctx.contexts       = 0;
+       ctx.contexts_maxed = false;
 
        if (kernel && !user_mode(regs)) {
                if (add_mark)
-                       perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
-               perf_callchain_kernel(entry, regs);
+                       perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL);
+               perf_callchain_kernel(&ctx, regs);
        }
 
        if (user) {
@@ -214,8 +221,8 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
                                goto exit_put;
 
                        if (add_mark)
-                               perf_callchain_store(entry, PERF_CONTEXT_USER);
-                       perf_callchain_user(entry, regs);
+                               perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
+                       perf_callchain_user(&ctx, regs);
                }
        }
 
@@ -225,10 +232,15 @@ exit_put:
        return entry;
 }
 
+/*
+ * Used for sysctl_perf_event_max_stack and
+ * sysctl_perf_event_max_contexts_per_stack.
+ */
 int perf_event_max_stack_handler(struct ctl_table *table, int write,
                                 void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-       int new_value = sysctl_perf_event_max_stack, ret;
+       int *value = table->data;
+       int new_value = *value, ret;
        struct ctl_table new_table = *table;
 
        new_table.data = &new_value;
@@ -240,7 +252,7 @@ int perf_event_max_stack_handler(struct ctl_table *table, int write,
        if (atomic_read(&nr_callchain_events))
                ret = -EBUSY;
        else
-               sysctl_perf_event_max_stack = new_value;
+               *value = new_value;
 
        mutex_unlock(&callchain_mutex);
 
index 47887bb..5c2c355 100644 (file)
@@ -736,6 +736,7 @@ void mmput(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(mmput);
 
+#ifdef CONFIG_MMU
 static void mmput_async_fn(struct work_struct *work)
 {
        struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
@@ -749,6 +750,7 @@ void mmput_async(struct mm_struct *mm)
                schedule_work(&mm->async_put_work);
        }
 }
+#endif
 
 /**
  * set_mm_exe_file - change a reference to the mm's executable file
index c92e448..1276aab 100644 (file)
@@ -37,6 +37,7 @@ config ARCH_HAS_GCOV_PROFILE_ALL
 
 config GCOV_PROFILE_ALL
        bool "Profile entire Kernel"
+       depends on !COMPILE_TEST
        depends on GCOV_KERNEL
        depends on ARCH_HAS_GCOV_PROFILE_ALL
        default n
index c427422..89b49f6 100644 (file)
@@ -125,7 +125,7 @@ int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest)
 
        domain = data->domain;
        if (WARN_ON(domain == NULL))
-               return;
+               return -EINVAL;
 
        if (!irq_domain_is_ipi(domain)) {
                pr_warn("Trying to destroy a non IPI domain!\n");
index f231e0b..bec0b64 100644 (file)
@@ -37,6 +37,7 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
        free_percpu(brw->fast_read_ctr);
        brw->fast_read_ctr = NULL; /* catch use after free bugs */
 }
+EXPORT_SYMBOL_GPL(percpu_free_rwsem);
 
 /*
  * This is the fast-path for down_read/up_read. If it succeeds we rely
index c817216..2e853ad 100644 (file)
@@ -173,6 +173,22 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
 
 EXPORT_SYMBOL(down_write_nested);
 
+int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
+{
+       might_sleep();
+       rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
+
+       if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) {
+               rwsem_release(&sem->dep_map, 1, _RET_IP_);
+               return -EINTR;
+       }
+
+       rwsem_set_owner(sem);
+       return 0;
+}
+
+EXPORT_SYMBOL(down_write_killable_nested);
+
 void up_read_non_owner(struct rw_semaphore *sem)
 {
        __up_read(sem);
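down_write_killable_nested() mirrors down_write_nested() but allows a fatal signal to abort the wait, in which case it returns -EINTR. A hedged kernel-context sketch of a caller; the double-lock scenario is hypothetical:

#include <linux/rwsem.h>

/* Hedged usage sketch: take the inner rwsem killably under a lockdep
 * nesting annotation and unwind cleanly if a fatal signal arrives.
 */
static int example_lock_pair(struct rw_semaphore *outer,
			     struct rw_semaphore *inner)
{
	int ret;

	down_write(outer);
	ret = down_write_killable_nested(inner, SINGLE_DEPTH_NESTING);
	if (ret) {
		up_write(outer);
		return ret;	/* -EINTR */
	}
	/* ... critical section ... */
	up_write(inner);
	up_write(outer);
	return 0;
}
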
index 4d73a83..f66162f 100644 (file)
@@ -311,7 +311,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
        pid->level = ns->level;
        for (i = ns->level; i >= 0; i--) {
                nr = alloc_pidmap(tmp);
-               if (IS_ERR_VALUE(nr)) {
+               if (nr < 0) {
                        retval = nr;
                        goto out_free;
                }
index 404c078..7f2cae4 100644 (file)
@@ -1768,13 +1768,15 @@ void sched_ttwu_pending(void)
        cookie = lockdep_pin_lock(&rq->lock);
 
        while (llist) {
+               int wake_flags = 0;
+
                p = llist_entry(llist, struct task_struct, wake_entry);
                llist = llist_next(llist);
-               /*
-                * See ttwu_queue(); we only call ttwu_queue_remote() when
-                * its a x-cpu wakeup.
-                */
-               ttwu_do_activate(rq, p, WF_MIGRATED, cookie);
+
+               if (p->sched_remote_wakeup)
+                       wake_flags = WF_MIGRATED;
+
+               ttwu_do_activate(rq, p, wake_flags, cookie);
        }
 
        lockdep_unpin_lock(&rq->lock, cookie);
@@ -1819,10 +1821,12 @@ void scheduler_ipi(void)
        irq_exit();
 }
 
-static void ttwu_queue_remote(struct task_struct *p, int cpu)
+static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
 {
        struct rq *rq = cpu_rq(cpu);
 
+       p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
+
        if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
                if (!set_nr_if_polling(rq->idle))
                        smp_send_reschedule(cpu);
@@ -1869,7 +1873,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
 #if defined(CONFIG_SMP)
        if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
                sched_clock_cpu(cpu); /* sync clocks x-cpu */
-               ttwu_queue_remote(p, cpu);
+               ttwu_queue_remote(p, cpu, wake_flags);
                return;
        }
 #endif
index 154ae3a..14c4aa2 100644 (file)
@@ -9,6 +9,8 @@
  * published by the Free Software Foundation.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/cpufreq.h>
 #include <linux/module.h>
 #include <linux/slab.h>
@@ -388,7 +390,7 @@ static int sugov_init(struct cpufreq_policy *policy)
        mutex_unlock(&global_tunables_lock);
 
        sugov_policy_free(sg_policy);
-       pr_err("cpufreq: schedutil governor initialization failed (error %d)\n", ret);
+       pr_err("initialization failed (error %d)\n", ret);
        return ret;
 }
 
index 2effd84..87b2fc3 100644 (file)
@@ -1149,13 +1149,22 @@ static struct ctl_table kern_table[] = {
        },
        {
                .procname       = "perf_event_max_stack",
-               .data           = NULL, /* filled in by handler */
+               .data           = &sysctl_perf_event_max_stack,
                .maxlen         = sizeof(sysctl_perf_event_max_stack),
                .mode           = 0644,
                .proc_handler   = perf_event_max_stack_handler,
                .extra1         = &zero,
                .extra2         = &six_hundred_forty_kb,
        },
+       {
+               .procname       = "perf_event_max_contexts_per_stack",
+               .data           = &sysctl_perf_event_max_contexts_per_stack,
+               .maxlen         = sizeof(sysctl_perf_event_max_contexts_per_stack),
+               .mode           = 0644,
+               .proc_handler   = perf_event_max_stack_handler,
+               .extra1         = &zero,
+               .extra2         = &one_thousand,
+       },
 #endif
 #ifdef CONFIG_KMEMCHECK
        {
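Both knobs share perf_event_max_stack_handler(), which now operates on table->data, and they surface as /proc/sys/kernel/perf_event_max_stack and /proc/sys/kernel/perf_event_max_contexts_per_stack. A minimal userspace sketch for tuning the new knob; the write is rejected with EBUSY while any event using callchains exists:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/perf_event_max_contexts_per_stack", "w");

	/* 16 is just an example value within the 0..1000 range enforced above */
	if (!f || fprintf(f, "16\n") < 0 || fclose(f) != 0) {
		perror("perf_event_max_contexts_per_stack");
		return 1;
	}
	return 0;
}
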
index 8c7392c..e99df0f 100644 (file)
@@ -425,6 +425,7 @@ void destroy_hrtimer_on_stack(struct hrtimer *timer)
 {
        debug_object_free(timer, &hrtimer_debug_descr);
 }
+EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack);
 
 #else
 static inline void debug_hrtimer_init(struct hrtimer *timer) { }
index e707ab3..b9cfdbf 100644 (file)
@@ -1841,6 +1841,9 @@ config TEST_BITMAP
 
          If unsure, say N.
 
+config TEST_UUID
+       tristate "Test functions located in the uuid module at runtime"
+
 config TEST_RHASHTABLE
        tristate "Perform selftest on resizable hash table"
        default n
@@ -1849,6 +1852,17 @@ config TEST_RHASHTABLE
 
          If unsure, say N.
 
+config TEST_HASH
+       tristate "Perform selftest on hash functions"
+       default n
+       help
+         Enable this option to test the kernel's integer (<linux/hash.h>)
+         and string (<linux/stringhash.h>) hash functions on boot
+         (or module load).
+
+         This is intended to help people writing architecture-specific
+         optimized versions.  If unsure, say N.
+
 endmenu # runtime tests
 
 config PROVIDE_OHCI1394_DMA_INIT
index 42b6918..ff6a7a6 100644 (file)
@@ -48,6 +48,7 @@ obj-$(CONFIG_TEST_HEXDUMP) += test_hexdump.o
 obj-y += kstrtox.o
 obj-$(CONFIG_TEST_BPF) += test_bpf.o
 obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o
+obj-$(CONFIG_TEST_HASH) += test_hash.o
 obj-$(CONFIG_TEST_KASAN) += test_kasan.o
 obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o
 obj-$(CONFIG_TEST_LKM) += test_module.o
@@ -57,6 +58,7 @@ obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_keys.o
 obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_key_base.o
 obj-$(CONFIG_TEST_PRINTF) += test_printf.o
 obj-$(CONFIG_TEST_BITMAP) += test_bitmap.o
+obj-$(CONFIG_TEST_UUID) += test_uuid.o
 
 ifeq ($(CONFIG_DEBUG_KOBJECT),y)
 CFLAGS_kobject.o += -DDEBUG
index 4a1515f..51a76af 100644 (file)
@@ -657,9 +657,9 @@ static struct dma_debug_entry *dma_entry_alloc(void)
        spin_lock_irqsave(&free_entries_lock, flags);
 
        if (list_empty(&free_entries)) {
-               pr_err("DMA-API: debugging out of memory - disabling\n");
                global_disable = true;
                spin_unlock_irqrestore(&free_entries_lock, flags);
+               pr_err("DMA-API: debugging out of memory - disabling\n");
                return NULL;
        }
 
index 28cb431..0cd5227 100644 (file)
 #define iterate_and_advance(i, n, v, I, B, K) {                        \
        if (unlikely(i->count < n))                             \
                n = i->count;                                   \
-       if (n) {                                                \
+       if (i->count) {                                         \
                size_t skip = i->iov_offset;                    \
                if (unlikely(i->type & ITER_BVEC)) {            \
                        const struct bio_vec *bvec;             \
diff --git a/lib/test_hash.c b/lib/test_hash.c
new file mode 100644 (file)
index 0000000..c9549c8
--- /dev/null
@@ -0,0 +1,250 @@
+/*
+ * Test cases for <linux/hash.h> and <linux/stringhash.h>
+ * This just verifies that various ways of computing a hash
+ * produce the same thing and that, where a k-bit hash value is
+ * requested, the result fits in the requested number of bits.
+ *
+ * We fill a buffer with a 255-byte null-terminated string,
+ * and use both full_name_hash() and hashlen_string() to hash the
+ * substrings from i to j, where 0 <= i < j < 256.
+ *
+ * The returned values are used to check that __hash_32() and
+ * __hash_32_generic() compute the same thing.  Likewise hash_32()
+ * and hash_64().
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt "\n"
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/hash.h>
+#include <linux/stringhash.h>
+#include <linux/printk.h>
+
+/* 32-bit XORSHIFT generator.  Seed must not be zero. */
+static u32 __init __attribute_const__
+xorshift(u32 seed)
+{
+       seed ^= seed << 13;
+       seed ^= seed >> 17;
+       seed ^= seed << 5;
+       return seed;
+}
+
+/* Given a non-zero x, returns a non-zero byte. */
+static u8 __init __attribute_const__
+mod255(u32 x)
+{
+       x = (x & 0xffff) + (x >> 16);   /* 1 <= x <= 0x1fffe */
+       x = (x & 0xff) + (x >> 8);      /* 1 <= x <= 0x2fd */
+       x = (x & 0xff) + (x >> 8);      /* 1 <= x <= 0x100 */
+       x = (x & 0xff) + (x >> 8);      /* 1 <= x <= 0xff */
+       return x;
+}
+
+/* Fill the buffer with non-zero bytes. */
+static void __init
+fill_buf(char *buf, size_t len, u32 seed)
+{
+       size_t i;
+
+       for (i = 0; i < len; i++) {
+               seed = xorshift(seed);
+               buf[i] = mod255(seed);
+       }
+}
+
+/*
+ * Test the various integer hash functions.  h64 (or its low-order bits)
+ * is the integer to hash.  hash_or accumulates the OR of the hash values,
+ * which are later checked to see that they cover all the requested bits.
+ *
+ * Because these functions (as opposed to the string hashes) are all
+ * inline, the code being tested is actually in the module, and you can
+ * recompile and re-test the module without rebooting.
+ */
+static bool __init
+test_int_hash(unsigned long long h64, u32 hash_or[2][33])
+{
+       int k;
+       u32 h0 = (u32)h64, h1, h2;
+
+       /* Test __hash32 */
+       hash_or[0][0] |= h1 = __hash_32(h0);
+#ifdef HAVE_ARCH__HASH_32
+       hash_or[1][0] |= h2 = __hash_32_generic(h0);
+#if HAVE_ARCH__HASH_32 == 1
+       if (h1 != h2) {
+               pr_err("__hash_32(%#x) = %#x != __hash_32_generic() = %#x",
+                       h0, h1, h2);
+               return false;
+       }
+#endif
+#endif
+
+       /* Test k = 1..32 bits */
+       for (k = 1; k <= 32; k++) {
+               u32 const m = ((u32)2 << (k-1)) - 1;    /* Low k bits set */
+
+               /* Test hash_32 */
+               hash_or[0][k] |= h1 = hash_32(h0, k);
+               if (h1 > m) {
+                       pr_err("hash_32(%#x, %d) = %#x > %#x", h0, k, h1, m);
+                       return false;
+               }
+#ifdef HAVE_ARCH_HASH_32
+               h2 = hash_32_generic(h0, k);
+#if HAVE_ARCH_HASH_32 == 1
+               if (h1 != h2) {
+                       pr_err("hash_32(%#x, %d) = %#x != hash_32_generic() "
+                               " = %#x", h0, k, h1, h2);
+                       return false;
+               }
+#else
+               if (h2 > m) {
+                       pr_err("hash_32_generic(%#x, %d) = %#x > %#x",
+                               h0, k, h1, m);
+                       return false;
+               }
+#endif
+#endif
+               /* Test hash_64 */
+               hash_or[1][k] |= h1 = hash_64(h64, k);
+               if (h1 > m) {
+                       pr_err("hash_64(%#llx, %d) = %#x > %#x", h64, k, h1, m);
+                       return false;
+               }
+#ifdef HAVE_ARCH_HASH_64
+               h2 = hash_64_generic(h64, k);
+#if HAVE_ARCH_HASH_64 == 1
+               if (h1 != h2) {
+                       pr_err("hash_64(%#llx, %d) = %#x != hash_64_generic() "
+                               "= %#x", h64, k, h1, h2);
+                       return false;
+               }
+#else
+               if (h2 > m) {
+                       pr_err("hash_64_generic(%#llx, %d) = %#x > %#x",
+                               h64, k, h1, m);
+                       return false;
+               }
+#endif
+#endif
+       }
+
+       (void)h2;       /* Suppress unused variable warning */
+       return true;
+}
+
+#define SIZE 256       /* Run time is cubic in SIZE */
+
+static int __init
+test_hash_init(void)
+{
+       char buf[SIZE+1];
+       u32 string_or = 0, hash_or[2][33] = { 0 };
+       unsigned tests = 0;
+       unsigned long long h64 = 0;
+       int i, j;
+
+       fill_buf(buf, SIZE, 1);
+
+       /* Test every possible non-empty substring in the buffer. */
+       for (j = SIZE; j > 0; --j) {
+               buf[j] = '\0';
+
+               for (i = 0; i <= j; i++) {
+                       u64 hashlen = hashlen_string(buf+i);
+                       u32 h0 = full_name_hash(buf+i, j-i);
+
+                       /* Check that hashlen_string gets the length right */
+                       if (hashlen_len(hashlen) != j-i) {
+                               pr_err("hashlen_string(%d..%d) returned length"
+                                       " %u, expected %d",
+                                       i, j, hashlen_len(hashlen), j-i);
+                               return -EINVAL;
+                       }
+                       /* Check that the hashes match */
+                       if (hashlen_hash(hashlen) != h0) {
+                               pr_err("hashlen_string(%d..%d) = %08x != "
+                                       "full_name_hash() = %08x",
+                                       i, j, hashlen_hash(hashlen), h0);
+                               return -EINVAL;
+                       }
+
+                       string_or |= h0;
+                       h64 = h64 << 32 | h0;   /* For use with hash_64 */
+                       if (!test_int_hash(h64, hash_or))
+                               return -EINVAL;
+                       tests++;
+               } /* i */
+       } /* j */
+
+       /* The OR of all the hash values should cover all the bits */
+       if (~string_or) {
+               pr_err("OR of all string hash results = %#x != %#x",
+                       string_or, -1u);
+               return -EINVAL;
+       }
+       if (~hash_or[0][0]) {
+               pr_err("OR of all __hash_32 results = %#x != %#x",
+                       hash_or[0][0], -1u);
+               return -EINVAL;
+       }
+#ifdef HAVE_ARCH__HASH_32
+#if HAVE_ARCH__HASH_32 != 1    /* Test is pointless if results match */
+       if (~hash_or[1][0]) {
+               pr_err("OR of all __hash_32_generic results = %#x != %#x",
+                       hash_or[1][0], -1u);
+               return -EINVAL;
+       }
+#endif
+#endif
+
+       /* Likewise for all the i-bit hash values */
+       for (i = 1; i <= 32; i++) {
+               u32 const m = ((u32)2 << (i-1)) - 1;    /* Low i bits set */
+
+               if (hash_or[0][i] != m) {
+                       pr_err("OR of all hash_32(%d) results = %#x "
+                               "(%#x expected)", i, hash_or[0][i], m);
+                       return -EINVAL;
+               }
+               if (hash_or[1][i] != m) {
+                       pr_err("OR of all hash_64(%d) results = %#x "
+                               "(%#x expected)", i, hash_or[1][i], m);
+                       return -EINVAL;
+               }
+       }
+
+       /* Issue notices about skipped tests. */
+#ifndef HAVE_ARCH__HASH_32
+       pr_info("__hash_32() has no arch implementation to test.");
+#elif HAVE_ARCH__HASH_32 != 1
+       pr_info("__hash_32() is arch-specific; not compared to generic.");
+#endif
+#ifndef HAVE_ARCH_HASH_32
+       pr_info("hash_32() has no arch implementation to test.");
+#elif HAVE_ARCH_HASH_32 != 1
+       pr_info("hash_32() is arch-specific; not compared to generic.");
+#endif
+#ifndef HAVE_ARCH_HASH_64
+       pr_info("hash_64() has no arch implementation to test.");
+#elif HAVE_ARCH_HASH_64 != 1
+       pr_info("hash_64() is arch-specific; not compared to generic.");
+#endif
+
+       pr_notice("%u tests passed.", tests);
+
+       return 0;
+}
+
+static void __exit test_hash_exit(void)
+{
+}
+
+module_init(test_hash_init);   /* Does everything */
+module_exit(test_hash_exit);   /* Does nothing */
+
+MODULE_LICENSE("GPL");
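For orientation, a short kernel-context sketch of the interfaces the test exercises; it is illustrative only and assumes the 4.7-era signatures used above (no salt argument yet):

#include <linux/hash.h>
#include <linux/stringhash.h>

static void hash_example(const char *name)
{
	/* hashlen_string() packs { length, hash } into one u64 */
	u64 hashlen = hashlen_string(name);
	/* same value full_name_hash(name, hashlen_len(hashlen)) would give */
	u32 h = hashlen_hash(hashlen);
	/* hash_32()/hash_64() fold an integer down to k bits, here 8 */
	u32 bucket = hash_32(h, 8);

	(void)bucket;
}
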
diff --git a/lib/test_uuid.c b/lib/test_uuid.c
new file mode 100644 (file)
index 0000000..547d312
--- /dev/null
@@ -0,0 +1,133 @@
+/*
+ * Test cases for lib/uuid.c module.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/uuid.h>
+
+struct test_uuid_data {
+       const char *uuid;
+       uuid_le le;
+       uuid_be be;
+};
+
+static const struct test_uuid_data test_uuid_test_data[] = {
+       {
+               .uuid = "c33f4995-3701-450e-9fbf-206a2e98e576",
+               .le = UUID_LE(0xc33f4995, 0x3701, 0x450e, 0x9f, 0xbf, 0x20, 0x6a, 0x2e, 0x98, 0xe5, 0x76),
+               .be = UUID_BE(0xc33f4995, 0x3701, 0x450e, 0x9f, 0xbf, 0x20, 0x6a, 0x2e, 0x98, 0xe5, 0x76),
+       },
+       {
+               .uuid = "64b4371c-77c1-48f9-8221-29f054fc023b",
+               .le = UUID_LE(0x64b4371c, 0x77c1, 0x48f9, 0x82, 0x21, 0x29, 0xf0, 0x54, 0xfc, 0x02, 0x3b),
+               .be = UUID_BE(0x64b4371c, 0x77c1, 0x48f9, 0x82, 0x21, 0x29, 0xf0, 0x54, 0xfc, 0x02, 0x3b),
+       },
+       {
+               .uuid = "0cb4ddff-a545-4401-9d06-688af53e7f84",
+               .le = UUID_LE(0x0cb4ddff, 0xa545, 0x4401, 0x9d, 0x06, 0x68, 0x8a, 0xf5, 0x3e, 0x7f, 0x84),
+               .be = UUID_BE(0x0cb4ddff, 0xa545, 0x4401, 0x9d, 0x06, 0x68, 0x8a, 0xf5, 0x3e, 0x7f, 0x84),
+       },
+};
+
+static const char * const test_uuid_wrong_data[] = {
+       "c33f4995-3701-450e-9fbf206a2e98e576 ", /* no hyphen(s) */
+       "64b4371c-77c1-48f9-8221-29f054XX023b", /* invalid character(s) */
+       "0cb4ddff-a545-4401-9d06-688af53e",     /* not enough data */
+};
+
+static unsigned total_tests __initdata;
+static unsigned failed_tests __initdata;
+
+static void __init test_uuid_failed(const char *prefix, bool wrong, bool be,
+                                   const char *data, const char *actual)
+{
+       pr_err("%s test #%u %s %s data: '%s'\n",
+              prefix,
+              total_tests,
+              wrong ? "passed on wrong" : "failed on",
+              be ? "BE" : "LE",
+              data);
+       if (actual && *actual)
+               pr_err("%s test #%u actual data: '%s'\n",
+                      prefix,
+                      total_tests,
+                      actual);
+       failed_tests++;
+}
+
+static void __init test_uuid_test(const struct test_uuid_data *data)
+{
+       uuid_le le;
+       uuid_be be;
+       char buf[48];
+
+       /* LE */
+       total_tests++;
+       if (uuid_le_to_bin(data->uuid, &le))
+               test_uuid_failed("conversion", false, false, data->uuid, NULL);
+
+       total_tests++;
+       if (uuid_le_cmp(data->le, le)) {
+               sprintf(buf, "%pUl", &le);
+               test_uuid_failed("cmp", false, false, data->uuid, buf);
+       }
+
+       /* BE */
+       total_tests++;
+       if (uuid_be_to_bin(data->uuid, &be))
+               test_uuid_failed("conversion", false, true, data->uuid, NULL);
+
+       total_tests++;
+       if (uuid_be_cmp(data->be, be)) {
+               sprintf(buf, "%pUb", &be);
+               test_uuid_failed("cmp", false, true, data->uuid, buf);
+       }
+}
+
+static void __init test_uuid_wrong(const char *data)
+{
+       uuid_le le;
+       uuid_be be;
+
+       /* LE */
+       total_tests++;
+       if (!uuid_le_to_bin(data, &le))
+               test_uuid_failed("negative", true, false, data, NULL);
+
+       /* BE */
+       total_tests++;
+       if (!uuid_be_to_bin(data, &be))
+               test_uuid_failed("negative", true, true, data, NULL);
+}
+
+static int __init test_uuid_init(void)
+{
+       unsigned int i;
+
+       for (i = 0; i < ARRAY_SIZE(test_uuid_test_data); i++)
+               test_uuid_test(&test_uuid_test_data[i]);
+
+       for (i = 0; i < ARRAY_SIZE(test_uuid_wrong_data); i++)
+               test_uuid_wrong(test_uuid_wrong_data[i]);
+
+       if (failed_tests == 0)
+               pr_info("all %u tests passed\n", total_tests);
+       else
+               pr_err("failed %u out of %u tests\n", failed_tests, total_tests);
+
+       return failed_tests ? -EINVAL : 0;
+}
+module_init(test_uuid_init);
+
+static void __exit test_uuid_exit(void)
+{
+       /* do nothing */
+}
+module_exit(test_uuid_exit);
+
+MODULE_AUTHOR("Andy Shevchenko <andriy.shevchenko@linux.intel.com>");
+MODULE_LICENSE("Dual BSD/GPL");
index e116ae5..37687af 100644 (file)
@@ -106,8 +106,8 @@ static int __uuid_to_bin(const char *uuid, __u8 b[16], const u8 ei[16])
                return -EINVAL;
 
        for (i = 0; i < 16; i++) {
-               int hi = hex_to_bin(uuid[si[i]] + 0);
-               int lo = hex_to_bin(uuid[si[i]] + 1);
+               int hi = hex_to_bin(uuid[si[i] + 0]);
+               int lo = hex_to_bin(uuid[si[i] + 1]);
 
                b[ei[i]] = (hi << 4) | lo;
        }
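The fix moves the "+ 0"/"+ 1" inside the subscript: the old code read the first hex digit and then added the offset to its character value, so the low nibble of every byte was decoded from the wrong data. A self-contained illustration of the corrected per-byte step, where hex_to_bin_demo stands in for the kernel helper and si is the offset of the byte's first hex digit:

#include <stdio.h>

static int hex_to_bin_demo(char c)	/* stand-in for the kernel's hex_to_bin() */
{
	if (c >= '0' && c <= '9')
		return c - '0';
	if (c >= 'a' && c <= 'f')
		return c - 'a' + 10;
	return -1;
}

int main(void)
{
	const char *uuid = "c33f4995-3701-450e-9fbf-206a2e98e576";
	int si = 0;	/* offset of this byte's first hex digit */
	int hi = hex_to_bin_demo(uuid[si + 0]);	/* first digit  */
	int lo = hex_to_bin_demo(uuid[si + 1]);	/* second digit */

	printf("first byte = %#04x\n", (hi << 4) | lo);	/* prints 0xc3 */
	return 0;
}
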
index 2664c11..3e2daef 100644 (file)
@@ -648,7 +648,8 @@ config DEFERRED_STRUCT_PAGE_INIT
        bool "Defer initialisation of struct pages to kthreads"
        default n
        depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
-       depends on MEMORY_HOTPLUG
+       depends on NO_BOOTMEM && MEMORY_HOTPLUG
+       depends on !FLATMEM
        help
          Ordinarily all struct pages are initialised during early boot in a
          single thread. On very large machines this can take a considerable
index ea506eb..bd0e141 100644 (file)
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -183,7 +183,8 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
                return -EINVAL;
 
        /* ensure minimal alignment required by mm core */
-       alignment = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order);
+       alignment = PAGE_SIZE <<
+                       max_t(unsigned long, MAX_ORDER - 1, pageblock_order);
 
        /* alignment should be aligned with order_per_bit */
        if (!IS_ALIGNED(alignment >> PAGE_SHIFT, 1 << order_per_bit))
@@ -266,8 +267,8 @@ int __init cma_declare_contiguous(phys_addr_t base,
         * migratetype page by page allocator's buddy algorithm. In the case,
         * you couldn't get a contiguous memory, which is not what we want.
         */
-       alignment = max(alignment,
-               (phys_addr_t)PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order));
+       alignment = max(alignment,  (phys_addr_t)PAGE_SIZE <<
+                         max_t(unsigned long, MAX_ORDER - 1, pageblock_order));
        base = ALIGN(base, alignment);
        size = ALIGN(size, alignment);
        limit &= ~(alignment - 1);
index 9665b1d..00ae878 100644 (file)
@@ -143,13 +143,15 @@ static void page_cache_tree_delete(struct address_space *mapping,
                        return;
 
        /*
-        * Track node that only contains shadow entries.
+        * Track node that only contains shadow entries. DAX mappings contain
+        * no shadow entries and may contain other exceptional entries so skip
+        * those.
         *
         * Avoid acquiring the list_lru lock if already tracked.  The
         * list_empty() test is safe as node->private_list is
         * protected by mapping->tree_lock.
         */
-       if (!workingset_node_pages(node) &&
+       if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
            list_empty(&node->private_list)) {
                node->private_data = mapping;
                list_lru_add(&workingset_shadow_nodes, &node->private_list);
@@ -580,14 +582,24 @@ static int page_cache_tree_insert(struct address_space *mapping,
                if (!radix_tree_exceptional_entry(p))
                        return -EEXIST;
 
-               if (WARN_ON(dax_mapping(mapping)))
-                       return -EINVAL;
-
-               if (shadowp)
-                       *shadowp = p;
                mapping->nrexceptional--;
-               if (node)
-                       workingset_node_shadows_dec(node);
+               if (!dax_mapping(mapping)) {
+                       if (shadowp)
+                               *shadowp = p;
+                       if (node)
+                               workingset_node_shadows_dec(node);
+               } else {
+                       /* DAX can replace empty locked entry with a hole */
+                       WARN_ON_ONCE(p !=
+                               (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
+                                        RADIX_DAX_ENTRY_LOCK));
+                       /* DAX accounts exceptional entries as normal pages */
+                       if (node)
+                               workingset_node_pages_dec(node);
+                       /* Wakeup waiters for exceptional entry lock */
+                       dax_wake_mapping_entry_waiter(mapping, page->index,
+                                                     false);
+               }
        }
        radix_tree_replace_slot(slot, page);
        mapping->nrpages++;
index 7f7ac51..fb87923 100644 (file)
@@ -77,7 +77,6 @@ struct kasan_alloc_meta {
        struct kasan_track track;
        u32 state : 2;  /* enum kasan_state */
        u32 alloc_size : 30;
-       u32 reserved;
 };
 
 struct qlist_node {
index cf428d7..58c69c9 100644 (file)
@@ -1108,6 +1108,8 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
                limit = READ_ONCE(memcg->memsw.limit);
                if (count <= limit)
                        margin = min(margin, limit - count);
+               else
+                       margin = 0;
        }
 
        return margin;
@@ -1302,6 +1304,8 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
                                mem_cgroup_iter_break(memcg, iter);
                                if (chosen)
                                        put_task_struct(chosen);
+                               /* Set a dummy value to return "true". */
+                               chosen = (void *) 1;
                                goto unlock;
                        case OOM_SCAN_OK:
                                break;
@@ -2892,6 +2896,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
         * ordering is imposed by list_lru_node->lock taken by
         * memcg_drain_all_list_lrus().
         */
+       rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */
        css_for_each_descendant_pre(css, &memcg->css) {
                child = mem_cgroup_from_css(css);
                BUG_ON(child->kmemcg_id != kmemcg_id);
@@ -2899,6 +2904,8 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
                if (!memcg->use_hierarchy)
                        break;
        }
+       rcu_read_unlock();
+
        memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
 
        memcg_free_cache_id(kmemcg_id);
@@ -4305,24 +4312,6 @@ static int mem_cgroup_do_precharge(unsigned long count)
        return 0;
 }
 
-/**
- * get_mctgt_type - get target type of moving charge
- * @vma: the vma the pte to be checked belongs
- * @addr: the address corresponding to the pte to be checked
- * @ptent: the pte to be checked
- * @target: the pointer the target page or swap ent will be stored(can be NULL)
- *
- * Returns
- *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
- *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
- *     move charge. if @target is not NULL, the page is stored in target->page
- *     with extra refcnt got(Callers should handle it).
- *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
- *     target for charge migration. if @target is not NULL, the entry is stored
- *     in target->ent.
- *
- * Called with pte lock held.
- */
 union mc_target {
        struct page     *page;
        swp_entry_t     ent;
@@ -4511,6 +4500,25 @@ out:
        return ret;
 }
 
+/**
+ * get_mctgt_type - get target type of moving charge
+ * @vma: the vma the pte to be checked belongs
+ * @addr: the address corresponding to the pte to be checked
+ * @ptent: the pte to be checked
+ * @target: the pointer the target page or swap ent will be stored(can be NULL)
+ *
+ * Returns
+ *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
+ *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
+ *     move charge. if @target is not NULL, the page is stored in target->page
+ *     with extra refcnt got(Callers should handle it).
+ *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
+ *     target for charge migration. if @target is not NULL, the entry is stored
+ *     in target->ent.
+ *
+ * Called with pte lock held.
+ */
+
 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
                unsigned long addr, pte_t ptent, union mc_target *target)
 {
index a1b93d9..15322b7 100644 (file)
@@ -63,6 +63,7 @@
 #include <linux/dma-debug.h>
 #include <linux/debugfs.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/dax.h>
 
 #include <asm/io.h>
 #include <asm/mmu_context.h>
@@ -2492,8 +2493,6 @@ void unmap_mapping_range(struct address_space *mapping,
        if (details.last_index < details.first_index)
                details.last_index = ULONG_MAX;
 
-
-       /* DAX uses i_mmap_lock to serialise file truncate vs page fault */
        i_mmap_lock_write(mapping);
        if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
                unmap_mapping_range_tree(&mapping->i_mmap, &details);
@@ -2825,7 +2824,8 @@ oom:
  */
 static int __do_fault(struct vm_area_struct *vma, unsigned long address,
                        pgoff_t pgoff, unsigned int flags,
-                       struct page *cow_page, struct page **page)
+                       struct page *cow_page, struct page **page,
+                       void **entry)
 {
        struct vm_fault vmf;
        int ret;
@@ -2840,8 +2840,10 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
        ret = vma->vm_ops->fault(vma, &vmf);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
                return ret;
-       if (!vmf.page)
-               goto out;
+       if (ret & VM_FAULT_DAX_LOCKED) {
+               *entry = vmf.entry;
+               return ret;
+       }
 
        if (unlikely(PageHWPoison(vmf.page))) {
                if (ret & VM_FAULT_LOCKED)
@@ -2855,7 +2857,6 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
        else
                VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
 
- out:
        *page = vmf.page;
        return ret;
 }
@@ -3048,7 +3049,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                pte_unmap_unlock(pte, ptl);
        }
 
-       ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
+       ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
                return ret;
 
@@ -3071,6 +3072,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
 {
        struct page *fault_page, *new_page;
+       void *fault_entry;
        struct mem_cgroup *memcg;
        spinlock_t *ptl;
        pte_t *pte;
@@ -3088,26 +3090,24 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                return VM_FAULT_OOM;
        }
 
-       ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page);
+       ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page,
+                        &fault_entry);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
                goto uncharge_out;
 
-       if (fault_page)
+       if (!(ret & VM_FAULT_DAX_LOCKED))
                copy_user_highpage(new_page, fault_page, address, vma);
        __SetPageUptodate(new_page);
 
        pte = pte_offset_map_lock(mm, pmd, address, &ptl);
        if (unlikely(!pte_same(*pte, orig_pte))) {
                pte_unmap_unlock(pte, ptl);
-               if (fault_page) {
+               if (!(ret & VM_FAULT_DAX_LOCKED)) {
                        unlock_page(fault_page);
                        put_page(fault_page);
                } else {
-                       /*
-                        * The fault handler has no page to lock, so it holds
-                        * i_mmap_lock for read to protect against truncate.
-                        */
-                       i_mmap_unlock_read(vma->vm_file->f_mapping);
+                       dax_unlock_mapping_entry(vma->vm_file->f_mapping,
+                                                pgoff);
                }
                goto uncharge_out;
        }
@@ -3115,15 +3115,11 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        mem_cgroup_commit_charge(new_page, memcg, false, false);
        lru_cache_add_active_or_unevictable(new_page, vma);
        pte_unmap_unlock(pte, ptl);
-       if (fault_page) {
+       if (!(ret & VM_FAULT_DAX_LOCKED)) {
                unlock_page(fault_page);
                put_page(fault_page);
        } else {
-               /*
-                * The fault handler has no page to lock, so it holds
-                * i_mmap_lock for read to protect against truncate.
-                */
-               i_mmap_unlock_read(vma->vm_file->f_mapping);
+               dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff);
        }
        return ret;
 uncharge_out:
@@ -3143,7 +3139,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        int dirtied = 0;
        int ret, tmp;
 
-       ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
+       ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
                return ret;
 
index caf2a14..e3cbdca 100644 (file)
@@ -263,7 +263,7 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
 }
 #endif /* !CONFIG_SPARSEMEM_VMEMMAP */
 
-void register_page_bootmem_info_node(struct pglist_data *pgdat)
+void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
 {
        unsigned long i, pfn, end_pfn, nr_pages;
        int node = pgdat->node_id;
@@ -300,7 +300,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
                 * multiple nodes we check that this pfn does not already
                 * reside in some other nodes.
                 */
-               if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
+               if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node))
                        register_page_bootmem_info_section(pfn);
        }
 }
index d3d9a94..de2c176 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -168,7 +168,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
        return next;
 }
 
-static unsigned long do_brk(unsigned long addr, unsigned long len);
+static int do_brk(unsigned long addr, unsigned long len);
 
 SYSCALL_DEFINE1(brk, unsigned long, brk)
 {
@@ -224,7 +224,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
                goto out;
 
        /* Ok, looks good - let it rip. */
-       if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
+       if (do_brk(oldbrk, newbrk-oldbrk) < 0)
                goto out;
 
 set_brk:
@@ -2625,7 +2625,7 @@ static inline void verify_mm_writelocked(struct mm_struct *mm)
  *  anonymous maps.  eventually we may be able to do some
  *  brk-specific accounting here.
  */
-static unsigned long do_brk(unsigned long addr, unsigned long len)
+static int do_brk(unsigned long addr, unsigned long len)
 {
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma, *prev;
@@ -2636,7 +2636,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
 
        len = PAGE_ALIGN(len);
        if (!len)
-               return addr;
+               return 0;
 
        flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
 
@@ -2703,13 +2703,13 @@ out:
        if (flags & VM_LOCKED)
                mm->locked_vm += (len >> PAGE_SHIFT);
        vma->vm_flags |= VM_SOFTDIRTY;
-       return addr;
+       return 0;
 }
 
-unsigned long vm_brk(unsigned long addr, unsigned long len)
+int vm_brk(unsigned long addr, unsigned long len)
 {
        struct mm_struct *mm = current->mm;
-       unsigned long ret;
+       int ret;
        bool populate;
 
        if (down_write_killable(&mm->mmap_sem))
@@ -2718,7 +2718,7 @@ unsigned long vm_brk(unsigned long addr, unsigned long len)
        ret = do_brk(addr, len);
        populate = ((mm->def_flags & VM_LOCKED) != 0);
        up_write(&mm->mmap_sem);
-       if (populate)
+       if (populate && !ret)
                mm_populate(addr, len);
        return ret;
 }
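With do_brk() and vm_brk() now returning 0 or a negative errno instead of the mapped address, in-kernel callers check for a non-zero return rather than comparing addresses. A hedged sketch of a caller; the function and its use are hypothetical:

#include <linux/mm.h>

/* Hedged caller sketch: a non-zero return is an errno, and as the hunk
 * above shows, population is skipped when the mapping failed.
 */
static int map_bss_example(unsigned long start, unsigned long len)
{
	int error = vm_brk(start, len);

	if (error)
		return error;	/* e.g. -ENOMEM, or -EINTR from the killable mmap_sem */
	return 0;
}
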
index c8bd59a..c2e5888 100644 (file)
@@ -1682,7 +1682,7 @@ void exit_mmap(struct mm_struct *mm)
        }
 }
 
-unsigned long vm_brk(unsigned long addr, unsigned long len)
+int vm_brk(unsigned long addr, unsigned long len)
 {
        return -ENOMEM;
 }
index 5bb2f76..acbc432 100644 (file)
@@ -443,12 +443,28 @@ static bool __oom_reap_task(struct task_struct *tsk)
 {
        struct mmu_gather tlb;
        struct vm_area_struct *vma;
-       struct mm_struct *mm;
+       struct mm_struct *mm = NULL;
        struct task_struct *p;
        struct zap_details details = {.check_swap_entries = true,
                                      .ignore_dirty = true};
        bool ret = true;
 
+       /*
+        * We have to make sure to not race with the victim exit path
+        * and cause premature new oom victim selection:
+        * __oom_reap_task              exit_mm
+        *   atomic_inc_not_zero
+        *                                mmput
+        *                                  atomic_dec_and_test
+        *                                exit_oom_victim
+        *                              [...]
+        *                              out_of_memory
+        *                                select_bad_process
+        *                                  # no TIF_MEMDIE task selects new victim
+        *  unmap_page_range # frees some memory
+        */
+       mutex_lock(&oom_lock);
+
        /*
         * Make sure we find the associated mm_struct even when the particular
         * thread has already terminated and cleared its mm.
@@ -457,19 +473,19 @@ static bool __oom_reap_task(struct task_struct *tsk)
         */
        p = find_lock_task_mm(tsk);
        if (!p)
-               return true;
+               goto unlock_oom;
 
        mm = p->mm;
        if (!atomic_inc_not_zero(&mm->mm_users)) {
                task_unlock(p);
-               return true;
+               goto unlock_oom;
        }
 
        task_unlock(p);
 
        if (!down_read_trylock(&mm->mmap_sem)) {
                ret = false;
-               goto out;
+               goto unlock_oom;
        }
 
        tlb_gather_mmu(&tlb, mm, 0, -1);
@@ -511,13 +527,15 @@ static bool __oom_reap_task(struct task_struct *tsk)
         * to release its memory.
         */
        set_bit(MMF_OOM_REAPED, &mm->flags);
-out:
+unlock_oom:
+       mutex_unlock(&oom_lock);
        /*
         * Drop our reference but make sure the mmput slow path is called from a
         * different context because we shouldn't risk we get stuck there and
         * put the oom_reaper out of the way.
         */
-       mmput_async(mm);
+       if (mm)
+               mmput_async(mm);
        return ret;
 }
 
@@ -607,12 +625,8 @@ void try_oom_reaper(struct task_struct *tsk)
        if (atomic_read(&mm->mm_users) > 1) {
                rcu_read_lock();
                for_each_process(p) {
-                       bool exiting;
-
                        if (!process_shares_mm(p, mm))
                                continue;
-                       if (same_thread_group(p, tsk))
-                               continue;
                        if (fatal_signal_pending(p))
                                continue;
 
@@ -620,10 +634,7 @@ void try_oom_reaper(struct task_struct *tsk)
                         * If the task is exiting make sure the whole thread group
                         * is exiting and cannot access mm anymore.
                         */
-                       spin_lock_irq(&p->sighand->siglock);
-                       exiting = signal_group_exit(p->signal);
-                       spin_unlock_irq(&p->sighand->siglock);
-                       if (exiting)
+                       if (signal_group_exit(p->signal))
                                continue;
 
                        /* Give up */
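
The oom_kill.c hunks above take oom_lock before looking up the victim's mm so
the reaper cannot race with the exit path (per the comment added above), and
only drop the mm reference when one was actually taken. A simplified sketch of
that shape, using get_task_mm()/mmput() instead of the open-coded lookup in
the patch:

    static bool reap_task_sketch(struct task_struct *tsk)
    {
            struct mm_struct *mm = NULL;
            bool ret = true;

            mutex_lock(&oom_lock);          /* serialize against the exit path */

            mm = get_task_mm(tsk);          /* NULL if the task already dropped its mm */
            if (!mm)
                    goto unlock;

            if (!down_read_trylock(&mm->mmap_sem)) {
                    ret = false;            /* caller retries later */
                    goto unlock;
            }
            /* ... unmap reclaimable ranges ... */
            up_read(&mm->mmap_sem);
    unlock:
            mutex_unlock(&oom_lock);
            if (mm)
                    mmput(mm);              /* only if a reference was taken */
            return ret;
    }
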
index f8f3bfc..6903b69 100644 (file)
@@ -656,6 +656,9 @@ static inline void set_page_guard(struct zone *zone, struct page *page,
                return;
 
        page_ext = lookup_page_ext(page);
+       if (unlikely(!page_ext))
+               return;
+
        __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
 
        INIT_LIST_HEAD(&page->lru);
@@ -673,6 +676,9 @@ static inline void clear_page_guard(struct zone *zone, struct page *page,
                return;
 
        page_ext = lookup_page_ext(page);
+       if (unlikely(!page_ext))
+               return;
+
        __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
 
        set_page_private(page, 0);
@@ -2609,11 +2615,12 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
                                page = list_last_entry(list, struct page, lru);
                        else
                                page = list_first_entry(list, struct page, lru);
-               } while (page && check_new_pcp(page));
 
-               __dec_zone_state(zone, NR_ALLOC_BATCH);
-               list_del(&page->lru);
-               pcp->count--;
+                       __dec_zone_state(zone, NR_ALLOC_BATCH);
+                       list_del(&page->lru);
+                       pcp->count--;
+
+               } while (check_new_pcp(page));
        } else {
                /*
                 * We most definitely don't want callers attempting to
@@ -3023,6 +3030,7 @@ reset_fair:
                apply_fair = false;
                fair_skipped = false;
                reset_alloc_batches(ac->preferred_zoneref->zone);
+               z = ac->preferred_zoneref;
                goto zonelist_scan;
        }
 
@@ -3596,6 +3604,17 @@ retry:
         */
        alloc_flags = gfp_to_alloc_flags(gfp_mask);
 
+       /*
+        * Reset the zonelist iterators if memory policies can be ignored.
+        * These allocations are high priority and system rather than user
+        * orientated.
+        */
+       if ((alloc_flags & ALLOC_NO_WATERMARKS) || !(alloc_flags & ALLOC_CPUSET)) {
+               ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
+               ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
+                                       ac->high_zoneidx, ac->nodemask);
+       }
+
        /* This is the last chance, in general, before the goto nopage. */
        page = get_page_from_freelist(gfp_mask, order,
                                alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
@@ -3604,12 +3623,6 @@ retry:
 
        /* Allocate without watermarks if the context allows */
        if (alloc_flags & ALLOC_NO_WATERMARKS) {
-               /*
-                * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
-                * the allocation is high priority and these type of
-                * allocations are system rather than user orientated
-                */
-               ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
                page = get_page_from_freelist(gfp_mask, order,
                                                ALLOC_NO_WATERMARKS, ac);
                if (page)
@@ -3808,7 +3821,11 @@ retry_cpuset:
        /* Dirty zone balancing only done in the fast path */
        ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
 
-       /* The preferred zone is used for statistics later */
+       /*
+        * The preferred zone is used for statistics but crucially it is
+        * also used as the starting point for the zonelist iterator. It
+        * may get reset for allocations that ignore memory policies.
+        */
        ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
                                        ac.high_zoneidx, ac.nodemask);
        if (!ac.preferred_zoneref) {
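
The buffered_rmqueue() hunk above moves the list removal and accounting inside
the do/while loop, so a page that fails check_new_pcp() is consumed instead of
being picked again on every iteration. A standalone illustration of the same
"unlink first, then validate" loop (all names invented for the example):

    #include <stdio.h>
    #include <stdbool.h>

    struct item { int id; bool bad; struct item *next; };

    /* Pop entries until one passes validation; bad entries are dropped,
     * not retried, because each is unlinked before it is checked. */
    static struct item *take_item(struct item **list)
    {
            struct item *it;

            do {
                    it = *list;
                    if (!it)
                            return NULL;
                    *list = it->next;       /* unlink first ... */
            } while (it->bad);              /* ... then validate */

            return it;
    }

    int main(void)
    {
            struct item c = { .id = 3, .bad = false, .next = NULL };
            struct item b = { .id = 2, .bad = true,  .next = &c };
            struct item a = { .id = 1, .bad = true,  .next = &b };
            struct item *head = &a;
            struct item *it = take_item(&head);

            printf("took item %d\n", it ? it->id : -1);     /* prints: took item 3 */
            return 0;
    }
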
index 2d864e6..44a4c02 100644 (file)
@@ -390,8 +390,10 @@ void __init page_ext_init(void)
                         * We know some arch can have a nodes layout such as
                         * -------------pfn-------------->
                         * N0 | N1 | N2 | N0 | N1 | N2|....
+                        *
+                        * Take into account DEFERRED_STRUCT_PAGE_INIT.
                         */
-                       if (pfn_to_nid(pfn) != nid)
+                       if (early_pfn_to_nid(pfn) != nid)
                                continue;
                        if (init_section_page_ext(pfn, nid))
                                goto oom;
index 792b56d..c6cda3e 100644 (file)
@@ -55,6 +55,8 @@ void __reset_page_owner(struct page *page, unsigned int order)
 
        for (i = 0; i < (1 << order); i++) {
                page_ext = lookup_page_ext(page + i);
+               if (unlikely(!page_ext))
+                       continue;
                __clear_bit(PAGE_EXT_OWNER, &page_ext->flags);
        }
 }
@@ -62,6 +64,7 @@ void __reset_page_owner(struct page *page, unsigned int order)
 void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
 {
        struct page_ext *page_ext = lookup_page_ext(page);
+
        struct stack_trace trace = {
                .nr_entries = 0,
                .max_entries = ARRAY_SIZE(page_ext->trace_entries),
@@ -69,6 +72,9 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
                .skip = 3,
        };
 
+       if (unlikely(!page_ext))
+               return;
+
        save_stack_trace(&trace);
 
        page_ext->order = order;
@@ -82,6 +88,8 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
 void __set_page_owner_migrate_reason(struct page *page, int reason)
 {
        struct page_ext *page_ext = lookup_page_ext(page);
+       if (unlikely(!page_ext))
+               return;
 
        page_ext->last_migrate_reason = reason;
 }
@@ -89,6 +97,12 @@ void __set_page_owner_migrate_reason(struct page *page, int reason)
 gfp_t __get_page_owner_gfp(struct page *page)
 {
        struct page_ext *page_ext = lookup_page_ext(page);
+       if (unlikely(!page_ext))
+               /*
+                * The caller just returns 0 if there is no valid gfp,
+                * so return 0 here too.
+                */
+               return 0;
 
        return page_ext->gfp_mask;
 }
@@ -99,6 +113,9 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage)
        struct page_ext *new_ext = lookup_page_ext(newpage);
        int i;
 
+       if (unlikely(!old_ext || !new_ext))
+               return;
+
        new_ext->order = old_ext->order;
        new_ext->gfp_mask = old_ext->gfp_mask;
        new_ext->nr_entries = old_ext->nr_entries;
@@ -193,6 +210,11 @@ void __dump_page_owner(struct page *page)
        gfp_t gfp_mask = page_ext->gfp_mask;
        int mt = gfpflags_to_migratetype(gfp_mask);
 
+       if (unlikely(!page_ext)) {
+               pr_alert("No page extension available.\n");
+               return;
+       }
+
        if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
                pr_alert("page_owner info is not active (free page?)\n");
                return;
@@ -251,6 +273,8 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
                }
 
                page_ext = lookup_page_ext(page);
+               if (unlikely(!page_ext))
+                       continue;
 
                /*
                 * Some pages could be missed by concurrent allocation or free,
@@ -317,6 +341,8 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
                                continue;
 
                        page_ext = lookup_page_ext(page);
+                       if (unlikely(!page_ext))
+                               continue;
 
                        /* Maybe overlapping zone */
                        if (test_bit(PAGE_EXT_OWNER, &page_ext->flags))
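
Every page_owner.c hunk above adds the same guard: lookup_page_ext() can
return NULL (for example before the extensions are initialized), so callers
must bail out before dereferencing the result. A minimal sketch of the
pattern, using a hypothetical helper that is not part of the patch:

    static void mark_owned(struct page *page, unsigned int order)
    {
            struct page_ext *page_ext = lookup_page_ext(page);

            if (unlikely(!page_ext))
                    return;                 /* no extension yet: skip silently */

            page_ext->order = order;
            __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
    }
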
index 1eae5fa..2e647c6 100644 (file)
@@ -54,6 +54,9 @@ static inline void set_page_poison(struct page *page)
        struct page_ext *page_ext;
 
        page_ext = lookup_page_ext(page);
+       if (unlikely(!page_ext))
+               return;
+
        __set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
 }
 
@@ -62,6 +65,9 @@ static inline void clear_page_poison(struct page *page)
        struct page_ext *page_ext;
 
        page_ext = lookup_page_ext(page);
+       if (unlikely(!page_ext))
+               return;
+
        __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
 }
 
@@ -70,7 +76,7 @@ bool page_is_poisoned(struct page *page)
        struct page_ext *page_ext;
 
        page_ext = lookup_page_ext(page);
-       if (!page_ext)
+       if (unlikely(!page_ext))
                return false;
 
        return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
index 8a83993..0ea5d90 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1098,6 +1098,8 @@ void page_move_anon_rmap(struct page *page,
 
        VM_BUG_ON_PAGE(!PageLocked(page), page);
        VM_BUG_ON_VMA(!anon_vma, vma);
+       if (IS_ENABLED(CONFIG_DEBUG_VM) && PageTransHuge(page))
+               address &= HPAGE_PMD_MASK;
        VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page);
 
        anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
index e418a99..a361449 100644 (file)
@@ -2645,10 +2645,11 @@ static int shmem_xattr_handler_get(const struct xattr_handler *handler,
 }
 
 static int shmem_xattr_handler_set(const struct xattr_handler *handler,
-                                  struct dentry *dentry, const char *name,
-                                  const void *value, size_t size, int flags)
+                                  struct dentry *unused, struct inode *inode,
+                                  const char *name, const void *value,
+                                  size_t size, int flags)
 {
-       struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
+       struct shmem_inode_info *info = SHMEM_I(inode);
 
        name = xattr_full_name(handler, name);
        return simple_xattr_set(&info->xattrs, name, value, size, flags);
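
The shmem hunk above follows the xattr handler interface change in which
->set() receives the inode explicitly instead of deriving it from the dentry.
A sketch of the same shape for a hypothetical filesystem (examplefs, its
inode-info struct and EXAMPLEFS_I() are invented; xattr_full_name() and
simple_xattr_set() are the generic helpers already used above):

    struct examplefs_inode_info {
            struct simple_xattrs xattrs;
            /* ... */
    };

    static int examplefs_xattr_set(const struct xattr_handler *handler,
                                   struct dentry *unused, struct inode *inode,
                                   const char *name, const void *value,
                                   size_t size, int flags)
    {
            struct examplefs_inode_info *info = EXAMPLEFS_I(inode);

            name = xattr_full_name(handler, name);
            return simple_xattr_set(&info->xattrs, name, value, size, flags);
    }
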
index b002728..4064f8f 100644 (file)
@@ -34,40 +34,38 @@ static void clear_exceptional_entry(struct address_space *mapping,
        if (shmem_mapping(mapping))
                return;
 
-       spin_lock_irq(&mapping->tree_lock);
-
        if (dax_mapping(mapping)) {
-               if (radix_tree_delete_item(&mapping->page_tree, index, entry))
-                       mapping->nrexceptional--;
-       } else {
-               /*
-                * Regular page slots are stabilized by the page lock even
-                * without the tree itself locked.  These unlocked entries
-                * need verification under the tree lock.
-                */
-               if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
-                                       &slot))
-                       goto unlock;
-               if (*slot != entry)
-                       goto unlock;
-               radix_tree_replace_slot(slot, NULL);
-               mapping->nrexceptional--;
-               if (!node)
-                       goto unlock;
-               workingset_node_shadows_dec(node);
-               /*
-                * Don't track node without shadow entries.
-                *
-                * Avoid acquiring the list_lru lock if already untracked.
-                * The list_empty() test is safe as node->private_list is
-                * protected by mapping->tree_lock.
-                */
-               if (!workingset_node_shadows(node) &&
-                   !list_empty(&node->private_list))
-                       list_lru_del(&workingset_shadow_nodes,
-                                       &node->private_list);
-               __radix_tree_delete_node(&mapping->page_tree, node);
+               dax_delete_mapping_entry(mapping, index);
+               return;
        }
+       spin_lock_irq(&mapping->tree_lock);
+       /*
+        * Regular page slots are stabilized by the page lock even
+        * without the tree itself locked.  These unlocked entries
+        * need verification under the tree lock.
+        */
+       if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
+                               &slot))
+               goto unlock;
+       if (*slot != entry)
+               goto unlock;
+       radix_tree_replace_slot(slot, NULL);
+       mapping->nrexceptional--;
+       if (!node)
+               goto unlock;
+       workingset_node_shadows_dec(node);
+       /*
+        * Don't track node without shadow entries.
+        *
+        * Avoid acquiring the list_lru lock if already untracked.
+        * The list_empty() test is safe as node->private_list is
+        * protected by mapping->tree_lock.
+        */
+       if (!workingset_node_shadows(node) &&
+           !list_empty(&node->private_list))
+               list_lru_del(&workingset_shadow_nodes,
+                               &node->private_list);
+       __radix_tree_delete_node(&mapping->page_tree, node);
 unlock:
        spin_unlock_irq(&mapping->tree_lock);
 }
index cf7ad1a..e11475c 100644 (file)
@@ -1105,7 +1105,7 @@ EXPORT_SYMBOL_GPL(vm_unmap_aliases);
  */
 void vm_unmap_ram(const void *mem, unsigned int count)
 {
-       unsigned long size = count << PAGE_SHIFT;
+       unsigned long size = (unsigned long)count << PAGE_SHIFT;
        unsigned long addr = (unsigned long)mem;
 
        BUG_ON(!addr);
@@ -1140,7 +1140,7 @@ EXPORT_SYMBOL(vm_unmap_ram);
  */
 void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
 {
-       unsigned long size = count << PAGE_SHIFT;
+       unsigned long size = (unsigned long)count << PAGE_SHIFT;
        unsigned long addr;
        void *mem;
 
@@ -1574,14 +1574,15 @@ void *vmap(struct page **pages, unsigned int count,
                unsigned long flags, pgprot_t prot)
 {
        struct vm_struct *area;
+       unsigned long size;             /* In bytes */
 
        might_sleep();
 
        if (count > totalram_pages)
                return NULL;
 
-       area = get_vm_area_caller((count << PAGE_SHIFT), flags,
-                                       __builtin_return_address(0));
+       size = (unsigned long)count << PAGE_SHIFT;
+       area = get_vm_area_caller(size, flags, __builtin_return_address(0));
        if (!area)
                return NULL;
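
The vmalloc.c hunks widen count before shifting because count << PAGE_SHIFT is
evaluated in 32-bit arithmetic when count is an unsigned int, so large
mappings silently wrap. A standalone demonstration (LP64 assumed, PAGE_SHIFT
hard-coded to 12 for the example):

    #include <stdio.h>

    int main(void)
    {
            unsigned int count = 1u << 21;  /* 2^21 pages = 8 GiB with 4 KiB pages */
            unsigned int shift = 12;

            unsigned long wrong = count << shift;                 /* wraps to 0 in 32 bits */
            unsigned long right = (unsigned long)count << shift;  /* widened first */

            printf("wrong=%lu right=%lu\n", wrong, right);        /* wrong=0 right=8589934592 */
            return 0;
    }
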
 
index 77e42ef..cb2a67b 100644 (file)
@@ -1061,6 +1061,8 @@ static void pagetypeinfo_showmixedcount_print(struct seq_file *m,
                                continue;
 
                        page_ext = lookup_page_ext(page);
+                       if (unlikely(!page_ext))
+                               continue;
 
                        if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
                                continue;
index 34917d5..8f9e89c 100644 (file)
@@ -412,7 +412,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
                /* HEADLESS page stored */
                bud = HEADLESS;
        } else {
-               bud = (handle - zhdr->first_num) & BUDDY_MASK;
+               bud = handle_to_buddy(handle);
 
                switch (bud) {
                case FIRST:
@@ -572,15 +572,19 @@ next:
                        pool->pages_nr--;
                        spin_unlock(&pool->lock);
                        return 0;
-               } else if (zhdr->first_chunks != 0 &&
-                          zhdr->last_chunks != 0 && zhdr->middle_chunks != 0) {
-                       /* Full, add to buddied list */
-                       list_add(&zhdr->buddy, &pool->buddied);
-               } else if (!test_bit(PAGE_HEADLESS, &page->private)) {
-                       z3fold_compact_page(zhdr);
-                       /* add to unbuddied list */
-                       freechunks = num_free_chunks(zhdr);
-                       list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
+               } else if (!test_bit(PAGE_HEADLESS, &page->private)) {
+                       if (zhdr->first_chunks != 0 &&
+                           zhdr->last_chunks != 0 &&
+                           zhdr->middle_chunks != 0) {
+                               /* Full, add to buddied list */
+                               list_add(&zhdr->buddy, &pool->buddied);
+                       } else {
+                               z3fold_compact_page(zhdr);
+                               /* add to unbuddied list */
+                               freechunks = num_free_chunks(zhdr);
+                               list_add(&zhdr->buddy,
+                                        &pool->unbuddied[freechunks]);
+                       }
                }
 
                /* add to beginning of LRU */
index 72698db..b6d4f25 100644 (file)
@@ -45,6 +45,8 @@
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
@@ -483,16 +485,16 @@ static inline unsigned long zs_stat_get(struct size_class *class,
 
 #ifdef CONFIG_ZSMALLOC_STAT
 
-static int __init zs_stat_init(void)
+static void __init zs_stat_init(void)
 {
-       if (!debugfs_initialized())
-               return -ENODEV;
+       if (!debugfs_initialized()) {
+               pr_warn("debugfs not available, stat dir not created\n");
+               return;
+       }
 
        zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
        if (!zs_stat_root)
-               return -ENOMEM;
-
-       return 0;
+               pr_warn("debugfs 'zsmalloc' stat dir creation failed\n");
 }
 
 static void __exit zs_stat_exit(void)
@@ -577,8 +579,10 @@ static void zs_pool_stat_create(struct zs_pool *pool, const char *name)
 {
        struct dentry *entry;
 
-       if (!zs_stat_root)
+       if (!zs_stat_root) {
+               pr_warn("no root stat dir, not creating <%s> stat dir\n", name);
                return;
+       }
 
        entry = debugfs_create_dir(name, zs_stat_root);
        if (!entry) {
@@ -592,7 +596,8 @@ static void zs_pool_stat_create(struct zs_pool *pool, const char *name)
        if (!entry) {
                pr_warn("%s: debugfs file entry <%s> creation failed\n",
                                name, "classes");
-               return;
+               debugfs_remove_recursive(pool->stat_dentry);
+               pool->stat_dentry = NULL;
        }
 }
 
@@ -602,9 +607,8 @@ static void zs_pool_stat_destroy(struct zs_pool *pool)
 }
 
 #else /* CONFIG_ZSMALLOC_STAT */
-static int __init zs_stat_init(void)
+static void __init zs_stat_init(void)
 {
-       return 0;
 }
 
 static void __exit zs_stat_exit(void)
@@ -2011,17 +2015,10 @@ static int __init zs_init(void)
        zpool_register_driver(&zs_zpool_driver);
 #endif
 
-       ret = zs_stat_init();
-       if (ret) {
-               pr_err("zs stat initialization failed\n");
-               goto stat_fail;
-       }
+       zs_stat_init();
+
        return 0;
 
-stat_fail:
-#ifdef CONFIG_ZPOOL
-       zpool_unregister_driver(&zs_zpool_driver);
-#endif
 notifier_fail:
        zs_unregister_cpu_notifier();
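
The zsmalloc.c hunks also add the usual pr_fmt() idiom: defined before the
printk declarations are pulled in, it makes every pr_warn()/pr_err() in the
file carry the module name automatically. Sketch:

    /* Must appear before any direct or indirect include of <linux/printk.h>. */
    #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

    #include <linux/module.h>

    static void report_missing_debugfs(void)
    {
            /* emits e.g. "zsmalloc: debugfs not available, stat dir not created" */
            pr_warn("debugfs not available, stat dir not created\n");
    }
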
 
index a1e273a..82a116b 100644 (file)
@@ -290,6 +290,10 @@ static void vlan_sync_address(struct net_device *dev,
        if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr))
                return;
 
+       /* vlan continues to inherit address of lower device */
+       if (vlan_dev_inherit_address(vlandev, dev))
+               goto out;
+
        /* vlan address was different from the old address and is equal to
         * the new address */
        if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) &&
@@ -302,6 +306,7 @@ static void vlan_sync_address(struct net_device *dev,
            !ether_addr_equal(vlandev->dev_addr, dev->dev_addr))
                dev_uc_add(dev, vlandev->dev_addr);
 
+out:
        ether_addr_copy(vlan->real_dev_addr, dev->dev_addr);
 }
 
index 9d010a0..cc15579 100644 (file)
@@ -109,6 +109,8 @@ int vlan_check_real_dev(struct net_device *real_dev,
 void vlan_setup(struct net_device *dev);
 int register_vlan_dev(struct net_device *dev);
 void unregister_vlan_dev(struct net_device *dev, struct list_head *head);
+bool vlan_dev_inherit_address(struct net_device *dev,
+                             struct net_device *real_dev);
 
 static inline u32 vlan_get_ingress_priority(struct net_device *dev,
                                            u16 vlan_tci)
index e7e6257..86ae75b 100644 (file)
@@ -245,6 +245,17 @@ void vlan_dev_get_realdev_name(const struct net_device *dev, char *result)
        strncpy(result, vlan_dev_priv(dev)->real_dev->name, 23);
 }
 
+bool vlan_dev_inherit_address(struct net_device *dev,
+                             struct net_device *real_dev)
+{
+       if (dev->addr_assign_type != NET_ADDR_STOLEN)
+               return false;
+
+       ether_addr_copy(dev->dev_addr, real_dev->dev_addr);
+       call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+       return true;
+}
+
 static int vlan_dev_open(struct net_device *dev)
 {
        struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
@@ -255,7 +266,8 @@ static int vlan_dev_open(struct net_device *dev)
            !(vlan->flags & VLAN_FLAG_LOOSE_BINDING))
                return -ENETDOWN;
 
-       if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr)) {
+       if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr) &&
+           !vlan_dev_inherit_address(dev, real_dev)) {
                err = dev_uc_add(real_dev, dev->dev_addr);
                if (err < 0)
                        goto out;
@@ -560,8 +572,10 @@ static int vlan_dev_init(struct net_device *dev)
        /* ipv6 shared card related stuff */
        dev->dev_id = real_dev->dev_id;
 
-       if (is_zero_ether_addr(dev->dev_addr))
-               eth_hw_addr_inherit(dev, real_dev);
+       if (is_zero_ether_addr(dev->dev_addr)) {
+               ether_addr_copy(dev->dev_addr, real_dev->dev_addr);
+               dev->addr_assign_type = NET_ADDR_STOLEN;
+       }
        if (is_zero_ether_addr(dev->broadcast))
                memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len);
 
index ea79ee9..3fc94a4 100644 (file)
@@ -518,10 +518,10 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req)
                if (err)
                        goto out_err;
 
-               if (p9_is_proto_dotu(c))
+               if (p9_is_proto_dotu(c) && ecode < 512)
                        err = -ecode;
 
-               if (!err || !IS_ERR_VALUE(err)) {
+               if (!err) {
                        err = p9_errstr2errno(ename, strlen(ename));
 
                        p9_debug(P9_DEBUG_9P, "<<< RERROR (%d) %s\n",
@@ -605,10 +605,10 @@ static int p9_check_zc_errors(struct p9_client *c, struct p9_req_t *req,
                if (err)
                        goto out_err;
 
-               if (p9_is_proto_dotu(c))
+               if (p9_is_proto_dotu(c) && ecode < 512)
                        err = -ecode;
 
-               if (!err || !IS_ERR_VALUE(err)) {
+               if (!err) {
                        err = p9_errstr2errno(ename, strlen(ename));
 
                        p9_debug(P9_DEBUG_9P, "<<< RERROR (%d) %s\n",
index 4fd6af4..adb6e3d 100644 (file)
@@ -124,7 +124,7 @@ as_indicate_complete:
                break;
        case as_addparty:
        case as_dropparty:
-               sk->sk_err_soft = msg->reply;
+               sk->sk_err_soft = -msg->reply;
                                        /* < 0 failure, otherwise ep_ref */
                clear_bit(ATM_VF_WAITING, &vcc->flags);
                break;
index 3fa0a9e..878563a 100644 (file)
@@ -546,7 +546,7 @@ static int svc_addparty(struct socket *sock, struct sockaddr *sockaddr,
                schedule();
        }
        finish_wait(sk_sleep(sk), &wait);
-       error = xchg(&sk->sk_err_soft, 0);
+       error = -xchg(&sk->sk_err_soft, 0);
 out:
        release_sock(sk);
        return error;
@@ -573,7 +573,7 @@ static int svc_dropparty(struct socket *sock, int ep_ref)
                error = -EUNATCH;
                goto out;
        }
-       error = xchg(&sk->sk_err_soft, 0);
+       error = -xchg(&sk->sk_err_soft, 0);
 out:
        release_sock(sk);
        return error;
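
The atm hunks above align sk_err_soft with the usual socket convention: the
field holds a positive errno while in-kernel return values are negative, so
writers negate on store and readers negate on read. A simplified sketch that
ignores the positive ep_ref case mentioned in the comment above:

    static void record_soft_error(struct sock *sk, int err)    /* err is 0 or negative */
    {
            sk->sk_err_soft = -err;                 /* e.g. -EUNATCH stored as EUNATCH */
    }

    static int consume_soft_error(struct sock *sk)
    {
            return -xchg(&sk->sk_err_soft, 0);      /* back to -EUNATCH, or 0 if none */
    }
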
index dcc18c6..55d2bfe 100644 (file)
@@ -651,7 +651,7 @@ EXPORT_SYMBOL(ceph_destroy_client);
 /*
  * true if we have the mon map (and have thus joined the cluster)
  */
-static int have_mon_and_osd_map(struct ceph_client *client)
+static bool have_mon_and_osd_map(struct ceph_client *client)
 {
        return client->monc.monmap && client->monc.monmap->epoch &&
               client->osdc.osdmap && client->osdc.osdmap->epoch;
index 139a9cb..3773a4f 100644 (file)
@@ -27,6 +27,22 @@ __CEPH_FORALL_OSD_OPS(GENERATE_CASE)
        }
 }
 
+const char *ceph_osd_watch_op_name(int o)
+{
+       switch (o) {
+       case CEPH_OSD_WATCH_OP_UNWATCH:
+               return "unwatch";
+       case CEPH_OSD_WATCH_OP_WATCH:
+               return "watch";
+       case CEPH_OSD_WATCH_OP_RECONNECT:
+               return "reconnect";
+       case CEPH_OSD_WATCH_OP_PING:
+               return "ping";
+       default:
+               return "???";
+       }
+}
+
 const char *ceph_osd_state_name(int s)
 {
        switch (s) {
index b902fbc..e77b04c 100644 (file)
@@ -54,24 +54,25 @@ static int osdmap_show(struct seq_file *s, void *p)
 {
        int i;
        struct ceph_client *client = s->private;
-       struct ceph_osdmap *map = client->osdc.osdmap;
+       struct ceph_osd_client *osdc = &client->osdc;
+       struct ceph_osdmap *map = osdc->osdmap;
        struct rb_node *n;
 
        if (map == NULL)
                return 0;
 
-       seq_printf(s, "epoch %d\n", map->epoch);
-       seq_printf(s, "flags%s%s\n",
-                  (map->flags & CEPH_OSDMAP_NEARFULL) ?  " NEARFULL" : "",
-                  (map->flags & CEPH_OSDMAP_FULL) ?  " FULL" : "");
+       down_read(&osdc->lock);
+       seq_printf(s, "epoch %d flags 0x%x\n", map->epoch, map->flags);
 
        for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) {
-               struct ceph_pg_pool_info *pool =
+               struct ceph_pg_pool_info *pi =
                        rb_entry(n, struct ceph_pg_pool_info, node);
 
-               seq_printf(s, "pool %lld pg_num %u (%d) read_tier %lld write_tier %lld\n",
-                          pool->id, pool->pg_num, pool->pg_num_mask,
-                          pool->read_tier, pool->write_tier);
+               seq_printf(s, "pool %lld '%s' type %d size %d min_size %d pg_num %u pg_num_mask %d flags 0x%llx lfor %u read_tier %lld write_tier %lld\n",
+                          pi->id, pi->name, pi->type, pi->size, pi->min_size,
+                          pi->pg_num, pi->pg_num_mask, pi->flags,
+                          pi->last_force_request_resend, pi->read_tier,
+                          pi->write_tier);
        }
        for (i = 0; i < map->max_osd; i++) {
                struct ceph_entity_addr *addr = &map->osd_addr[i];
@@ -103,6 +104,7 @@ static int osdmap_show(struct seq_file *s, void *p)
                           pg->pgid.seed, pg->primary_temp.osd);
        }
 
+       up_read(&osdc->lock);
        return 0;
 }
 
@@ -126,6 +128,7 @@ static int monc_show(struct seq_file *s, void *p)
                                        CEPH_SUBSCRIBE_ONETIME ?  "" : "+"));
                seq_putc(s, '\n');
        }
+       seq_printf(s, "fs_cluster_id %d\n", monc->fs_cluster_id);
 
        for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
                __u16 op;
@@ -143,43 +146,113 @@ static int monc_show(struct seq_file *s, void *p)
        return 0;
 }
 
-static int osdc_show(struct seq_file *s, void *pp)
+static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t)
 {
-       struct ceph_client *client = s->private;
-       struct ceph_osd_client *osdc = &client->osdc;
-       struct rb_node *p;
+       int i;
 
-       mutex_lock(&osdc->request_mutex);
-       for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
-               struct ceph_osd_request *req;
-               unsigned int i;
-               int opcode;
+       seq_printf(s, "osd%d\t%llu.%x\t[", t->osd, t->pgid.pool, t->pgid.seed);
+       for (i = 0; i < t->up.size; i++)
+               seq_printf(s, "%s%d", (!i ? "" : ","), t->up.osds[i]);
+       seq_printf(s, "]/%d\t[", t->up.primary);
+       for (i = 0; i < t->acting.size; i++)
+               seq_printf(s, "%s%d", (!i ? "" : ","), t->acting.osds[i]);
+       seq_printf(s, "]/%d\t%*pE\t0x%x", t->acting.primary,
+                  t->target_oid.name_len, t->target_oid.name, t->flags);
+       if (t->paused)
+               seq_puts(s, "\tP");
+}
 
-               req = rb_entry(p, struct ceph_osd_request, r_node);
+static void dump_request(struct seq_file *s, struct ceph_osd_request *req)
+{
+       int i;
 
-               seq_printf(s, "%lld\tosd%d\t%lld.%x\t", req->r_tid,
-                          req->r_osd ? req->r_osd->o_osd : -1,
-                          req->r_pgid.pool, req->r_pgid.seed);
+       seq_printf(s, "%llu\t", req->r_tid);
+       dump_target(s, &req->r_t);
 
-               seq_printf(s, "%.*s", req->r_base_oid.name_len,
-                          req->r_base_oid.name);
+       seq_printf(s, "\t%d\t%u'%llu", req->r_attempts,
+                  le32_to_cpu(req->r_replay_version.epoch),
+                  le64_to_cpu(req->r_replay_version.version));
 
-               if (req->r_reassert_version.epoch)
-                       seq_printf(s, "\t%u'%llu",
-                          (unsigned int)le32_to_cpu(req->r_reassert_version.epoch),
-                          le64_to_cpu(req->r_reassert_version.version));
-               else
-                       seq_printf(s, "\t");
+       for (i = 0; i < req->r_num_ops; i++) {
+               struct ceph_osd_req_op *op = &req->r_ops[i];
+
+               seq_printf(s, "%s%s", (i == 0 ? "\t" : ","),
+                          ceph_osd_op_name(op->op));
+               if (op->op == CEPH_OSD_OP_WATCH)
+                       seq_printf(s, "-%s",
+                                  ceph_osd_watch_op_name(op->watch.op));
+       }
+
+       seq_putc(s, '\n');
+}
+
+static void dump_requests(struct seq_file *s, struct ceph_osd *osd)
+{
+       struct rb_node *n;
+
+       mutex_lock(&osd->lock);
+       for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
+               struct ceph_osd_request *req =
+                   rb_entry(n, struct ceph_osd_request, r_node);
+
+               dump_request(s, req);
+       }
+
+       mutex_unlock(&osd->lock);
+}
 
-               for (i = 0; i < req->r_num_ops; i++) {
-                       opcode = req->r_ops[i].op;
-                       seq_printf(s, "%s%s", (i == 0 ? "\t" : ","),
-                                  ceph_osd_op_name(opcode));
-               }
+static void dump_linger_request(struct seq_file *s,
+                               struct ceph_osd_linger_request *lreq)
+{
+       seq_printf(s, "%llu\t", lreq->linger_id);
+       dump_target(s, &lreq->t);
+
+       seq_printf(s, "\t%u\t%s%s/%d\n", lreq->register_gen,
+                  lreq->is_watch ? "W" : "N", lreq->committed ? "C" : "",
+                  lreq->last_error);
+}
+
+static void dump_linger_requests(struct seq_file *s, struct ceph_osd *osd)
+{
+       struct rb_node *n;
+
+       mutex_lock(&osd->lock);
+       for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) {
+               struct ceph_osd_linger_request *lreq =
+                   rb_entry(n, struct ceph_osd_linger_request, node);
+
+               dump_linger_request(s, lreq);
+       }
+
+       mutex_unlock(&osd->lock);
+}
 
-               seq_printf(s, "\n");
+static int osdc_show(struct seq_file *s, void *pp)
+{
+       struct ceph_client *client = s->private;
+       struct ceph_osd_client *osdc = &client->osdc;
+       struct rb_node *n;
+
+       down_read(&osdc->lock);
+       seq_printf(s, "REQUESTS %d homeless %d\n",
+                  atomic_read(&osdc->num_requests),
+                  atomic_read(&osdc->num_homeless));
+       for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+               struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+               dump_requests(s, osd);
        }
-       mutex_unlock(&osdc->request_mutex);
+       dump_requests(s, &osdc->homeless_osd);
+
+       seq_puts(s, "LINGER REQUESTS\n");
+       for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+               struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+               dump_linger_requests(s, osd);
+       }
+       dump_linger_requests(s, &osdc->homeless_osd);
+
+       up_read(&osdc->lock);
        return 0;
 }
 
index cf638c0..37c38a7 100644 (file)
@@ -260,20 +260,26 @@ static void __send_subscribe(struct ceph_mon_client *monc)
        BUG_ON(num < 1); /* monmap sub is always there */
        ceph_encode_32(&p, num);
        for (i = 0; i < ARRAY_SIZE(monc->subs); i++) {
-               const char *s = ceph_sub_str[i];
+               char buf[32];
+               int len;
 
                if (!monc->subs[i].want)
                        continue;
 
-               dout("%s %s start %llu flags 0x%x\n", __func__, s,
+               len = sprintf(buf, "%s", ceph_sub_str[i]);
+               if (i == CEPH_SUB_MDSMAP &&
+                   monc->fs_cluster_id != CEPH_FS_CLUSTER_ID_NONE)
+                       len += sprintf(buf + len, ".%d", monc->fs_cluster_id);
+
+               dout("%s %s start %llu flags 0x%x\n", __func__, buf,
                     le64_to_cpu(monc->subs[i].item.start),
                     monc->subs[i].item.flags);
-               ceph_encode_string(&p, end, s, strlen(s));
+               ceph_encode_string(&p, end, buf, len);
                memcpy(p, &monc->subs[i].item, sizeof(monc->subs[i].item));
                p += sizeof(monc->subs[i].item);
        }
 
-       BUG_ON(p != (end - 35 - (ARRAY_SIZE(monc->subs) - num) * 19));
+       BUG_ON(p > end);
        msg->front.iov_len = p - msg->front.iov_base;
        msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
        ceph_msg_revoke(msg);
@@ -376,19 +382,13 @@ void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch)
 }
 EXPORT_SYMBOL(ceph_monc_got_map);
 
-/*
- * Register interest in the next osdmap
- */
-void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
+void ceph_monc_renew_subs(struct ceph_mon_client *monc)
 {
-       dout("%s have %u\n", __func__, monc->subs[CEPH_SUB_OSDMAP].have);
        mutex_lock(&monc->mutex);
-       if (__ceph_monc_want_map(monc, CEPH_SUB_OSDMAP,
-                                monc->subs[CEPH_SUB_OSDMAP].have + 1, false))
-               __send_subscribe(monc);
+       __send_subscribe(monc);
        mutex_unlock(&monc->mutex);
 }
-EXPORT_SYMBOL(ceph_monc_request_next_osdmap);
+EXPORT_SYMBOL(ceph_monc_renew_subs);
 
 /*
  * Wait for an osdmap with a given epoch.
@@ -478,51 +478,17 @@ out:
 /*
  * generic requests (currently statfs, mon_get_version)
  */
-static struct ceph_mon_generic_request *__lookup_generic_req(
-       struct ceph_mon_client *monc, u64 tid)
-{
-       struct ceph_mon_generic_request *req;
-       struct rb_node *n = monc->generic_request_tree.rb_node;
-
-       while (n) {
-               req = rb_entry(n, struct ceph_mon_generic_request, node);
-               if (tid < req->tid)
-                       n = n->rb_left;
-               else if (tid > req->tid)
-                       n = n->rb_right;
-               else
-                       return req;
-       }
-       return NULL;
-}
-
-static void __insert_generic_request(struct ceph_mon_client *monc,
-                           struct ceph_mon_generic_request *new)
-{
-       struct rb_node **p = &monc->generic_request_tree.rb_node;
-       struct rb_node *parent = NULL;
-       struct ceph_mon_generic_request *req = NULL;
-
-       while (*p) {
-               parent = *p;
-               req = rb_entry(parent, struct ceph_mon_generic_request, node);
-               if (new->tid < req->tid)
-                       p = &(*p)->rb_left;
-               else if (new->tid > req->tid)
-                       p = &(*p)->rb_right;
-               else
-                       BUG();
-       }
-
-       rb_link_node(&new->node, parent, p);
-       rb_insert_color(&new->node, &monc->generic_request_tree);
-}
+DEFINE_RB_FUNCS(generic_request, struct ceph_mon_generic_request, tid, node)
 
 static void release_generic_request(struct kref *kref)
 {
        struct ceph_mon_generic_request *req =
                container_of(kref, struct ceph_mon_generic_request, kref);
 
+       dout("%s greq %p request %p reply %p\n", __func__, req, req->request,
+            req->reply);
+       WARN_ON(!RB_EMPTY_NODE(&req->node));
+
        if (req->reply)
                ceph_msg_put(req->reply);
        if (req->request)
@@ -533,7 +499,8 @@ static void release_generic_request(struct kref *kref)
 
 static void put_generic_request(struct ceph_mon_generic_request *req)
 {
-       kref_put(&req->kref, release_generic_request);
+       if (req)
+               kref_put(&req->kref, release_generic_request);
 }
 
 static void get_generic_request(struct ceph_mon_generic_request *req)
@@ -541,6 +508,103 @@ static void get_generic_request(struct ceph_mon_generic_request *req)
        kref_get(&req->kref);
 }
 
+static struct ceph_mon_generic_request *
+alloc_generic_request(struct ceph_mon_client *monc, gfp_t gfp)
+{
+       struct ceph_mon_generic_request *req;
+
+       req = kzalloc(sizeof(*req), gfp);
+       if (!req)
+               return NULL;
+
+       req->monc = monc;
+       kref_init(&req->kref);
+       RB_CLEAR_NODE(&req->node);
+       init_completion(&req->completion);
+
+       dout("%s greq %p\n", __func__, req);
+       return req;
+}
+
+static void register_generic_request(struct ceph_mon_generic_request *req)
+{
+       struct ceph_mon_client *monc = req->monc;
+
+       WARN_ON(req->tid);
+
+       get_generic_request(req);
+       req->tid = ++monc->last_tid;
+       insert_generic_request(&monc->generic_request_tree, req);
+}
+
+static void send_generic_request(struct ceph_mon_client *monc,
+                                struct ceph_mon_generic_request *req)
+{
+       WARN_ON(!req->tid);
+
+       dout("%s greq %p tid %llu\n", __func__, req, req->tid);
+       req->request->hdr.tid = cpu_to_le64(req->tid);
+       ceph_con_send(&monc->con, ceph_msg_get(req->request));
+}
+
+static void __finish_generic_request(struct ceph_mon_generic_request *req)
+{
+       struct ceph_mon_client *monc = req->monc;
+
+       dout("%s greq %p tid %llu\n", __func__, req, req->tid);
+       erase_generic_request(&monc->generic_request_tree, req);
+
+       ceph_msg_revoke(req->request);
+       ceph_msg_revoke_incoming(req->reply);
+}
+
+static void finish_generic_request(struct ceph_mon_generic_request *req)
+{
+       __finish_generic_request(req);
+       put_generic_request(req);
+}
+
+static void complete_generic_request(struct ceph_mon_generic_request *req)
+{
+       if (req->complete_cb)
+               req->complete_cb(req);
+       else
+               complete_all(&req->completion);
+       put_generic_request(req);
+}
+
+void cancel_generic_request(struct ceph_mon_generic_request *req)
+{
+       struct ceph_mon_client *monc = req->monc;
+       struct ceph_mon_generic_request *lookup_req;
+
+       dout("%s greq %p tid %llu\n", __func__, req, req->tid);
+
+       mutex_lock(&monc->mutex);
+       lookup_req = lookup_generic_request(&monc->generic_request_tree,
+                                           req->tid);
+       if (lookup_req) {
+               WARN_ON(lookup_req != req);
+               finish_generic_request(req);
+       }
+
+       mutex_unlock(&monc->mutex);
+}
+
+static int wait_generic_request(struct ceph_mon_generic_request *req)
+{
+       int ret;
+
+       dout("%s greq %p tid %llu\n", __func__, req, req->tid);
+       ret = wait_for_completion_interruptible(&req->completion);
+       if (ret)
+               cancel_generic_request(req);
+       else
+               ret = req->result; /* completed */
+
+       return ret;
+}
+
 static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
                                         struct ceph_msg_header *hdr,
                                         int *skip)
@@ -551,7 +615,7 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
        struct ceph_msg *m;
 
        mutex_lock(&monc->mutex);
-       req = __lookup_generic_req(monc, tid);
+       req = lookup_generic_request(&monc->generic_request_tree, tid);
        if (!req) {
                dout("get_generic_reply %lld dne\n", tid);
                *skip = 1;
@@ -570,42 +634,6 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
        return m;
 }
 
-static int __do_generic_request(struct ceph_mon_client *monc, u64 tid,
-                               struct ceph_mon_generic_request *req)
-{
-       int err;
-
-       /* register request */
-       req->tid = tid != 0 ? tid : ++monc->last_tid;
-       req->request->hdr.tid = cpu_to_le64(req->tid);
-       __insert_generic_request(monc, req);
-       monc->num_generic_requests++;
-       ceph_con_send(&monc->con, ceph_msg_get(req->request));
-       mutex_unlock(&monc->mutex);
-
-       err = wait_for_completion_interruptible(&req->completion);
-
-       mutex_lock(&monc->mutex);
-       rb_erase(&req->node, &monc->generic_request_tree);
-       monc->num_generic_requests--;
-
-       if (!err)
-               err = req->result;
-       return err;
-}
-
-static int do_generic_request(struct ceph_mon_client *monc,
-                             struct ceph_mon_generic_request *req)
-{
-       int err;
-
-       mutex_lock(&monc->mutex);
-       err = __do_generic_request(monc, 0, req);
-       mutex_unlock(&monc->mutex);
-
-       return err;
-}
-
 /*
  * statfs
  */
@@ -616,22 +644,24 @@ static void handle_statfs_reply(struct ceph_mon_client *monc,
        struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
        u64 tid = le64_to_cpu(msg->hdr.tid);
 
+       dout("%s msg %p tid %llu\n", __func__, msg, tid);
+
        if (msg->front.iov_len != sizeof(*reply))
                goto bad;
-       dout("handle_statfs_reply %p tid %llu\n", msg, tid);
 
        mutex_lock(&monc->mutex);
-       req = __lookup_generic_req(monc, tid);
-       if (req) {
-               *(struct ceph_statfs *)req->buf = reply->st;
-               req->result = 0;
-               get_generic_request(req);
+       req = lookup_generic_request(&monc->generic_request_tree, tid);
+       if (!req) {
+               mutex_unlock(&monc->mutex);
+               return;
        }
+
+       req->result = 0;
+       *req->u.st = reply->st; /* struct */
+       __finish_generic_request(req);
        mutex_unlock(&monc->mutex);
-       if (req) {
-               complete_all(&req->completion);
-               put_generic_request(req);
-       }
+
+       complete_generic_request(req);
        return;
 
 bad:
@@ -646,38 +676,38 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
 {
        struct ceph_mon_generic_request *req;
        struct ceph_mon_statfs *h;
-       int err;
+       int ret = -ENOMEM;
 
-       req = kzalloc(sizeof(*req), GFP_NOFS);
+       req = alloc_generic_request(monc, GFP_NOFS);
        if (!req)
-               return -ENOMEM;
-
-       kref_init(&req->kref);
-       req->buf = buf;
-       init_completion(&req->completion);
+               goto out;
 
-       err = -ENOMEM;
        req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS,
                                    true);
        if (!req->request)
                goto out;
-       req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS,
-                                 true);
+
+       req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 64, GFP_NOFS, true);
        if (!req->reply)
                goto out;
 
+       req->u.st = buf;
+
+       mutex_lock(&monc->mutex);
+       register_generic_request(req);
        /* fill out request */
        h = req->request->front.iov_base;
        h->monhdr.have_version = 0;
        h->monhdr.session_mon = cpu_to_le16(-1);
        h->monhdr.session_mon_tid = 0;
        h->fsid = monc->monmap->fsid;
+       send_generic_request(monc, req);
+       mutex_unlock(&monc->mutex);
 
-       err = do_generic_request(monc, req);
-
+       ret = wait_generic_request(req);
 out:
        put_generic_request(req);
-       return err;
+       return ret;
 }
 EXPORT_SYMBOL(ceph_monc_do_statfs);
 
@@ -690,7 +720,7 @@ static void handle_get_version_reply(struct ceph_mon_client *monc,
        void *end = p + msg->front_alloc_len;
        u64 handle;
 
-       dout("%s %p tid %llu\n", __func__, msg, tid);
+       dout("%s msg %p tid %llu\n", __func__, msg, tid);
 
        ceph_decode_need(&p, end, 2*sizeof(u64), bad);
        handle = ceph_decode_64(&p);
@@ -698,77 +728,111 @@ static void handle_get_version_reply(struct ceph_mon_client *monc,
                goto bad;
 
        mutex_lock(&monc->mutex);
-       req = __lookup_generic_req(monc, handle);
-       if (req) {
-               *(u64 *)req->buf = ceph_decode_64(&p);
-               req->result = 0;
-               get_generic_request(req);
+       req = lookup_generic_request(&monc->generic_request_tree, handle);
+       if (!req) {
+               mutex_unlock(&monc->mutex);
+               return;
        }
+
+       req->result = 0;
+       req->u.newest = ceph_decode_64(&p);
+       __finish_generic_request(req);
        mutex_unlock(&monc->mutex);
-       if (req) {
-               complete_all(&req->completion);
-               put_generic_request(req);
-       }
 
+       complete_generic_request(req);
        return;
+
 bad:
        pr_err("corrupt mon_get_version reply, tid %llu\n", tid);
        ceph_msg_dump(msg);
 }
 
-/*
- * Send MMonGetVersion and wait for the reply.
- *
- * @what: one of "mdsmap", "osdmap" or "monmap"
- */
-int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what,
-                            u64 *newest)
+static struct ceph_mon_generic_request *
+__ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
+                       ceph_monc_callback_t cb, u64 private_data)
 {
        struct ceph_mon_generic_request *req;
-       void *p, *end;
-       u64 tid;
-       int err;
 
-       req = kzalloc(sizeof(*req), GFP_NOFS);
+       req = alloc_generic_request(monc, GFP_NOIO);
        if (!req)
-               return -ENOMEM;
-
-       kref_init(&req->kref);
-       req->buf = newest;
-       init_completion(&req->completion);
+               goto err_put_req;
 
        req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION,
                                    sizeof(u64) + sizeof(u32) + strlen(what),
-                                   GFP_NOFS, true);
-       if (!req->request) {
-               err = -ENOMEM;
-               goto out;
-       }
+                                   GFP_NOIO, true);
+       if (!req->request)
+               goto err_put_req;
 
-       req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 1024,
-                                 GFP_NOFS, true);
-       if (!req->reply) {
-               err = -ENOMEM;
-               goto out;
-       }
+       req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 32, GFP_NOIO,
+                                 true);
+       if (!req->reply)
+               goto err_put_req;
 
-       p = req->request->front.iov_base;
-       end = p + req->request->front_alloc_len;
+       req->complete_cb = cb;
+       req->private_data = private_data;
 
-       /* fill out request */
        mutex_lock(&monc->mutex);
-       tid = ++monc->last_tid;
-       ceph_encode_64(&p, tid); /* handle */
-       ceph_encode_string(&p, end, what, strlen(what));
+       register_generic_request(req);
+       {
+               void *p = req->request->front.iov_base;
+               void *const end = p + req->request->front_alloc_len;
+
+               ceph_encode_64(&p, req->tid); /* handle */
+               ceph_encode_string(&p, end, what, strlen(what));
+               WARN_ON(p != end);
+       }
+       send_generic_request(monc, req);
+       mutex_unlock(&monc->mutex);
 
-       err = __do_generic_request(monc, tid, req);
+       return req;
 
-       mutex_unlock(&monc->mutex);
-out:
+err_put_req:
        put_generic_request(req);
-       return err;
+       return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Send MMonGetVersion and wait for the reply.
+ *
+ * @what: one of "mdsmap", "osdmap" or "monmap"
+ */
+int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
+                         u64 *newest)
+{
+       struct ceph_mon_generic_request *req;
+       int ret;
+
+       req = __ceph_monc_get_version(monc, what, NULL, 0);
+       if (IS_ERR(req))
+               return PTR_ERR(req);
+
+       ret = wait_generic_request(req);
+       if (!ret)
+               *newest = req->u.newest;
+
+       put_generic_request(req);
+       return ret;
 }
-EXPORT_SYMBOL(ceph_monc_do_get_version);
+EXPORT_SYMBOL(ceph_monc_get_version);
+
+/*
+ * Send MMonGetVersion without waiting for the reply; @cb is invoked when it
+ * arrives.
+ *
+ * @what: one of "mdsmap", "osdmap" or "monmap"
+ */
+int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
+                               ceph_monc_callback_t cb, u64 private_data)
+{
+       struct ceph_mon_generic_request *req;
+
+       req = __ceph_monc_get_version(monc, what, cb, private_data);
+       if (IS_ERR(req))
+               return PTR_ERR(req);
+
+       put_generic_request(req);
+       return 0;
+}
+EXPORT_SYMBOL(ceph_monc_get_version_async);
 
 /*
  * Resend pending generic requests.
@@ -890,7 +954,7 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
        if (!monc->m_subscribe_ack)
                goto out_auth;
 
-       monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS,
+       monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 128, GFP_NOFS,
                                         true);
        if (!monc->m_subscribe)
                goto out_subscribe_ack;
@@ -914,9 +978,10 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 
        INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
        monc->generic_request_tree = RB_ROOT;
-       monc->num_generic_requests = 0;
        monc->last_tid = 0;
 
+       monc->fs_cluster_id = CEPH_FS_CLUSTER_ID_NONE;
+
        return 0;
 
 out_auth_reply:
@@ -954,6 +1019,8 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
 
        ceph_auth_destroy(monc->auth);
 
+       WARN_ON(!RB_EMPTY_ROOT(&monc->generic_request_tree));
+
        ceph_msg_put(monc->m_auth);
        ceph_msg_put(monc->m_auth_reply);
        ceph_msg_put(monc->m_subscribe);
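
The mon_client.c rework above replaces do_get_version() with a synchronous
ceph_monc_get_version() and an asynchronous ceph_monc_get_version_async(). A
hypothetical caller sketch, assuming (as complete_generic_request() above
implies) that the callback receives the completed generic request:

    static void mdsmap_version_cb(struct ceph_mon_generic_request *greq)
    {
            pr_info("mdsmap newest %llu (result %d)\n", greq->u.newest, greq->result);
    }

    static int query_versions(struct ceph_mon_client *monc)
    {
            u64 newest;
            int ret;

            ret = ceph_monc_get_version(monc, "osdmap", &newest);   /* blocks for the reply */
            if (ret)
                    return ret;

            /* fire and forget; the reply is handled in mdsmap_version_cb() */
            return ceph_monc_get_version_async(monc, "mdsmap", mdsmap_version_cb, 0);
    }
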
index 40a53a7..8946959 100644 (file)
 #include <linux/ceph/auth.h>
 #include <linux/ceph/pagelist.h>
 
-#define OSD_OP_FRONT_LEN       4096
 #define OSD_OPREPLY_FRONT_LEN  512
 
 static struct kmem_cache       *ceph_osd_request_cache;
 
 static const struct ceph_connection_operations osd_con_ops;
 
-static void __send_queued(struct ceph_osd_client *osdc);
-static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd);
-static void __register_request(struct ceph_osd_client *osdc,
-                              struct ceph_osd_request *req);
-static void __unregister_request(struct ceph_osd_client *osdc,
-                                struct ceph_osd_request *req);
-static void __unregister_linger_request(struct ceph_osd_client *osdc,
-                                       struct ceph_osd_request *req);
-static void __enqueue_request(struct ceph_osd_request *req);
-static void __send_request(struct ceph_osd_client *osdc,
-                          struct ceph_osd_request *req);
-
 /*
  * Implement client access to distributed object storage cluster.
  *
@@ -56,6 +43,52 @@ static void __send_request(struct ceph_osd_client *osdc,
  * channel with an OSD is reset.
  */
 
+static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req);
+static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req);
+static void link_linger(struct ceph_osd *osd,
+                       struct ceph_osd_linger_request *lreq);
+static void unlink_linger(struct ceph_osd *osd,
+                         struct ceph_osd_linger_request *lreq);
+
+#if 1
+static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem)
+{
+       bool wrlocked = true;
+
+       if (unlikely(down_read_trylock(sem))) {
+               wrlocked = false;
+               up_read(sem);
+       }
+
+       return wrlocked;
+}
+static inline void verify_osdc_locked(struct ceph_osd_client *osdc)
+{
+       WARN_ON(!rwsem_is_locked(&osdc->lock));
+}
+static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc)
+{
+       WARN_ON(!rwsem_is_wrlocked(&osdc->lock));
+}
+static inline void verify_osd_locked(struct ceph_osd *osd)
+{
+       struct ceph_osd_client *osdc = osd->o_osdc;
+
+       WARN_ON(!(mutex_is_locked(&osd->lock) &&
+                 rwsem_is_locked(&osdc->lock)) &&
+               !rwsem_is_wrlocked(&osdc->lock));
+}
+static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq)
+{
+       WARN_ON(!mutex_is_locked(&lreq->lock));
+}
+#else
+static inline void verify_osdc_locked(struct ceph_osd_client *osdc) { }
+static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc) { }
+static inline void verify_osd_locked(struct ceph_osd *osd) { }
+static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq) { }
+#endif
+
 /*
  * calculate the mapping of a file extent onto an object, and fill out the
  * request accordingly.  shorten extent as necessary if it crosses an
@@ -144,14 +177,6 @@ osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req,
 }
 EXPORT_SYMBOL(osd_req_op_extent_osd_data);
 
-struct ceph_osd_data *
-osd_req_op_cls_response_data(struct ceph_osd_request *osd_req,
-                       unsigned int which)
-{
-       return osd_req_op_data(osd_req, which, cls, response_data);
-}
-EXPORT_SYMBOL(osd_req_op_cls_response_data);   /* ??? */
-
 void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req,
                        unsigned int which, struct page **pages,
                        u64 length, u32 alignment,
@@ -218,6 +243,8 @@ void osd_req_op_cls_request_data_pagelist(
 
        osd_data = osd_req_op_data(osd_req, which, cls, request_data);
        ceph_osd_data_pagelist_init(osd_data, pagelist);
+       osd_req->r_ops[which].cls.indata_len += pagelist->length;
+       osd_req->r_ops[which].indata_len += pagelist->length;
 }
 EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist);
 
@@ -230,6 +257,8 @@ void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req,
        osd_data = osd_req_op_data(osd_req, which, cls, request_data);
        ceph_osd_data_pages_init(osd_data, pages, length, alignment,
                                pages_from_pool, own_pages);
+       osd_req->r_ops[which].cls.indata_len += length;
+       osd_req->r_ops[which].indata_len += length;
 }
 EXPORT_SYMBOL(osd_req_op_cls_request_data_pages);
 
@@ -302,14 +331,76 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
        case CEPH_OSD_OP_STAT:
                ceph_osd_data_release(&op->raw_data_in);
                break;
+       case CEPH_OSD_OP_NOTIFY_ACK:
+               ceph_osd_data_release(&op->notify_ack.request_data);
+               break;
+       case CEPH_OSD_OP_NOTIFY:
+               ceph_osd_data_release(&op->notify.request_data);
+               ceph_osd_data_release(&op->notify.response_data);
+               break;
        default:
                break;
        }
 }
 
+/*
+ * Assumes @t is zero-initialized.
+ */
+static void target_init(struct ceph_osd_request_target *t)
+{
+       ceph_oid_init(&t->base_oid);
+       ceph_oloc_init(&t->base_oloc);
+       ceph_oid_init(&t->target_oid);
+       ceph_oloc_init(&t->target_oloc);
+
+       ceph_osds_init(&t->acting);
+       ceph_osds_init(&t->up);
+       t->size = -1;
+       t->min_size = -1;
+
+       t->osd = CEPH_HOMELESS_OSD;
+}
+
+static void target_copy(struct ceph_osd_request_target *dest,
+                       const struct ceph_osd_request_target *src)
+{
+       ceph_oid_copy(&dest->base_oid, &src->base_oid);
+       ceph_oloc_copy(&dest->base_oloc, &src->base_oloc);
+       ceph_oid_copy(&dest->target_oid, &src->target_oid);
+       ceph_oloc_copy(&dest->target_oloc, &src->target_oloc);
+
+       dest->pgid = src->pgid; /* struct */
+       dest->pg_num = src->pg_num;
+       dest->pg_num_mask = src->pg_num_mask;
+       ceph_osds_copy(&dest->acting, &src->acting);
+       ceph_osds_copy(&dest->up, &src->up);
+       dest->size = src->size;
+       dest->min_size = src->min_size;
+       dest->sort_bitwise = src->sort_bitwise;
+
+       dest->flags = src->flags;
+       dest->paused = src->paused;
+
+       dest->osd = src->osd;
+}
+
+static void target_destroy(struct ceph_osd_request_target *t)
+{
+       ceph_oid_destroy(&t->base_oid);
+       ceph_oid_destroy(&t->target_oid);
+}
+
 /*
  * requests
  */
+static void request_release_checks(struct ceph_osd_request *req)
+{
+       WARN_ON(!RB_EMPTY_NODE(&req->r_node));
+       WARN_ON(!RB_EMPTY_NODE(&req->r_mc_node));
+       WARN_ON(!list_empty(&req->r_unsafe_item));
+       WARN_ON(req->r_osd);
+}
+
 static void ceph_osdc_release_request(struct kref *kref)
 {
        struct ceph_osd_request *req = container_of(kref,
@@ -318,24 +409,19 @@ static void ceph_osdc_release_request(struct kref *kref)
 
        dout("%s %p (r_request %p r_reply %p)\n", __func__, req,
             req->r_request, req->r_reply);
-       WARN_ON(!RB_EMPTY_NODE(&req->r_node));
-       WARN_ON(!list_empty(&req->r_req_lru_item));
-       WARN_ON(!list_empty(&req->r_osd_item));
-       WARN_ON(!list_empty(&req->r_linger_item));
-       WARN_ON(!list_empty(&req->r_linger_osd_item));
-       WARN_ON(req->r_osd);
+       request_release_checks(req);
 
        if (req->r_request)
                ceph_msg_put(req->r_request);
-       if (req->r_reply) {
-               ceph_msg_revoke_incoming(req->r_reply);
+       if (req->r_reply)
                ceph_msg_put(req->r_reply);
-       }
 
        for (which = 0; which < req->r_num_ops; which++)
                osd_req_op_data_release(req, which);
 
+       target_destroy(&req->r_t);
        ceph_put_snap_context(req->r_snapc);
+
        if (req->r_mempool)
                mempool_free(req, req->r_osdc->req_mempool);
        else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS)
@@ -354,12 +440,66 @@ EXPORT_SYMBOL(ceph_osdc_get_request);
 
 void ceph_osdc_put_request(struct ceph_osd_request *req)
 {
-       dout("%s %p (was %d)\n", __func__, req,
-            atomic_read(&req->r_kref.refcount));
-       kref_put(&req->r_kref, ceph_osdc_release_request);
+       if (req) {
+               dout("%s %p (was %d)\n", __func__, req,
+                    atomic_read(&req->r_kref.refcount));
+               kref_put(&req->r_kref, ceph_osdc_release_request);
+       }
 }
 EXPORT_SYMBOL(ceph_osdc_put_request);
 
+static void request_init(struct ceph_osd_request *req)
+{
+       /* req only, each op is zeroed in _osd_req_op_init() */
+       memset(req, 0, sizeof(*req));
+
+       kref_init(&req->r_kref);
+       init_completion(&req->r_completion);
+       init_completion(&req->r_safe_completion);
+       RB_CLEAR_NODE(&req->r_node);
+       RB_CLEAR_NODE(&req->r_mc_node);
+       INIT_LIST_HEAD(&req->r_unsafe_item);
+
+       target_init(&req->r_t);
+}
+
+/*
+ * This is ugly, but it allows us to reuse linger registration and ping
+ * requests, keeping the structure of the code around send_linger{_ping}()
+ * reasonable.  Setting up a min_nr=2 mempool for each linger request
+ * and dealing with copying ops (this blasts req only, watch op remains
+ * intact) isn't any better.
+ */
+static void request_reinit(struct ceph_osd_request *req)
+{
+       struct ceph_osd_client *osdc = req->r_osdc;
+       bool mempool = req->r_mempool;
+       unsigned int num_ops = req->r_num_ops;
+       u64 snapid = req->r_snapid;
+       struct ceph_snap_context *snapc = req->r_snapc;
+       bool linger = req->r_linger;
+       struct ceph_msg *request_msg = req->r_request;
+       struct ceph_msg *reply_msg = req->r_reply;
+
+       dout("%s req %p\n", __func__, req);
+       WARN_ON(atomic_read(&req->r_kref.refcount) != 1);
+       request_release_checks(req);
+
+       WARN_ON(atomic_read(&request_msg->kref.refcount) != 1);
+       WARN_ON(atomic_read(&reply_msg->kref.refcount) != 1);
+       target_destroy(&req->r_t);
+
+       request_init(req);
+       req->r_osdc = osdc;
+       req->r_mempool = mempool;
+       req->r_num_ops = num_ops;
+       req->r_snapid = snapid;
+       req->r_snapc = snapc;
+       req->r_linger = linger;
+       req->r_request = request_msg;
+       req->r_reply = reply_msg;
+}
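/*
 * request_reinit() above zeroes the request via request_init() and then
 * restores only what the linger machinery needs to keep: the osdc and
 * mempool back-pointers, op count, snapid/snap context, linger flag and
 * the already-allocated request/reply messages.  Tid, target and rbtree
 * linkage start from scratch, so the request can be re-registered under
 * a new tid without reallocating its messages.
 */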
+
 struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
                                               struct ceph_snap_context *snapc,
                                               unsigned int num_ops,
@@ -367,8 +507,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
                                               gfp_t gfp_flags)
 {
        struct ceph_osd_request *req;
-       struct ceph_msg *msg;
-       size_t msg_size;
 
        if (use_mempool) {
                BUG_ON(num_ops > CEPH_OSD_SLAB_OPS);
@@ -383,73 +521,65 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
        if (unlikely(!req))
                return NULL;
 
-       /* req only, each op is zeroed in _osd_req_op_init() */
-       memset(req, 0, sizeof(*req));
-
+       request_init(req);
        req->r_osdc = osdc;
        req->r_mempool = use_mempool;
        req->r_num_ops = num_ops;
+       req->r_snapid = CEPH_NOSNAP;
+       req->r_snapc = ceph_get_snap_context(snapc);
 
-       kref_init(&req->r_kref);
-       init_completion(&req->r_completion);
-       init_completion(&req->r_safe_completion);
-       RB_CLEAR_NODE(&req->r_node);
-       INIT_LIST_HEAD(&req->r_unsafe_item);
-       INIT_LIST_HEAD(&req->r_linger_item);
-       INIT_LIST_HEAD(&req->r_linger_osd_item);
-       INIT_LIST_HEAD(&req->r_req_lru_item);
-       INIT_LIST_HEAD(&req->r_osd_item);
-
-       req->r_base_oloc.pool = -1;
-       req->r_target_oloc.pool = -1;
+       dout("%s req %p\n", __func__, req);
+       return req;
+}
+EXPORT_SYMBOL(ceph_osdc_alloc_request);
 
-       msg_size = OSD_OPREPLY_FRONT_LEN;
-       if (num_ops > CEPH_OSD_SLAB_OPS) {
-               /* ceph_osd_op and rval */
-               msg_size += (num_ops - CEPH_OSD_SLAB_OPS) *
-                           (sizeof(struct ceph_osd_op) + 4);
-       }
+int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
+{
+       struct ceph_osd_client *osdc = req->r_osdc;
+       struct ceph_msg *msg;
+       int msg_size;
 
-       /* create reply message */
-       if (use_mempool)
-               msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
-       else
-               msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size,
-                                  gfp_flags, true);
-       if (!msg) {
-               ceph_osdc_put_request(req);
-               return NULL;
-       }
-       req->r_reply = msg;
+       WARN_ON(ceph_oid_empty(&req->r_base_oid));
 
+       /* create request message */
        msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */
        msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */
        msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
        msg_size += 1 + 8 + 4 + 4; /* pgid */
-       msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */
-       msg_size += 2 + num_ops * sizeof(struct ceph_osd_op);
+       msg_size += 4 + req->r_base_oid.name_len; /* oid */
+       msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op);
        msg_size += 8; /* snapid */
        msg_size += 8; /* snap_seq */
-       msg_size += 4 + 8 * (snapc ? snapc->num_snaps : 0); /* snaps */
+       msg_size += 4 + 8 * (req->r_snapc ? req->r_snapc->num_snaps : 0);
        msg_size += 4; /* retry_attempt */
 
-       /* create request message; allow space for oid */
-       if (use_mempool)
+       if (req->r_mempool)
                msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
        else
-               msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags, true);
-       if (!msg) {
-               ceph_osdc_put_request(req);
-               return NULL;
-       }
+               msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp, true);
+       if (!msg)
+               return -ENOMEM;
 
        memset(msg->front.iov_base, 0, msg->front.iov_len);
-
        req->r_request = msg;
 
-       return req;
+       /* create reply message */
+       msg_size = OSD_OPREPLY_FRONT_LEN;
+       msg_size += req->r_base_oid.name_len;
+       msg_size += req->r_num_ops * sizeof(struct ceph_osd_op);
+
+       if (req->r_mempool)
+               msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
+       else
+               msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, gfp, true);
+       if (!msg)
+               return -ENOMEM;
+
+       req->r_reply = msg;
+
+       return 0;
 }
-EXPORT_SYMBOL(ceph_osdc_alloc_request);
+EXPORT_SYMBOL(ceph_osdc_alloc_messages);
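/*
 * Message allocation is now a separate step so the request and reply
 * fronts can be sized from the actual oid length and op count instead of
 * CEPH_MAX_OID_NAME_LEN.  Caller sketch (mirrors ceph_osdc_new_request()
 * below; pool_id and the object name are placeholders, error handling
 * elided):
 *
 *	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOFS);
 *	osd_req_op_init(req, 0, CEPH_OSD_OP_STAT, 0);
 *	req->r_base_oloc.pool = pool_id;
 *	ceph_oid_printf(&req->r_base_oid, "%s", "my_object");
 *	ret = ceph_osdc_alloc_messages(req, GFP_NOFS);
 */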
 
 static bool osd_req_opcode_valid(u16 opcode)
 {
@@ -587,8 +717,6 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
 
        osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);
 
-       op->cls.argc = 0;       /* currently unused */
-
        op->indata_len = payload_len;
 }
 EXPORT_SYMBOL(osd_req_op_cls_init);
@@ -627,21 +755,19 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
 }
 EXPORT_SYMBOL(osd_req_op_xattr_init);
 
-void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
-                               unsigned int which, u16 opcode,
-                               u64 cookie, u64 version, int flag)
+/*
+ * @watch_opcode: CEPH_OSD_WATCH_OP_*
+ */
+static void osd_req_op_watch_init(struct ceph_osd_request *req, int which,
+                                 u64 cookie, u8 watch_opcode)
 {
-       struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
-                                                     opcode, 0);
-
-       BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH);
+       struct ceph_osd_req_op *op;
 
+       op = _osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0);
        op->watch.cookie = cookie;
-       op->watch.ver = version;
-       if (opcode == CEPH_OSD_OP_WATCH && flag)
-               op->watch.flag = (u8)1;
+       op->watch.op = watch_opcode;
+       op->watch.gen = 0;
 }
-EXPORT_SYMBOL(osd_req_op_watch_init);
 
 void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
                                unsigned int which,
@@ -686,16 +812,9 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
        }
 }
 
-static u64 osd_req_encode_op(struct ceph_osd_request *req,
-                             struct ceph_osd_op *dst, unsigned int which)
+static u32 osd_req_encode_op(struct ceph_osd_op *dst,
+                            const struct ceph_osd_req_op *src)
 {
-       struct ceph_osd_req_op *src;
-       struct ceph_osd_data *osd_data;
-       u64 request_data_len = 0;
-       u64 data_length;
-
-       BUG_ON(which >= req->r_num_ops);
-       src = &req->r_ops[which];
        if (WARN_ON(!osd_req_opcode_valid(src->op))) {
                pr_err("unrecognized osd opcode %d\n", src->op);
 
@@ -704,57 +823,36 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
 
        switch (src->op) {
        case CEPH_OSD_OP_STAT:
-               osd_data = &src->raw_data_in;
-               ceph_osdc_msg_data_add(req->r_reply, osd_data);
                break;
        case CEPH_OSD_OP_READ:
        case CEPH_OSD_OP_WRITE:
        case CEPH_OSD_OP_WRITEFULL:
        case CEPH_OSD_OP_ZERO:
        case CEPH_OSD_OP_TRUNCATE:
-               if (src->op == CEPH_OSD_OP_WRITE ||
-                   src->op == CEPH_OSD_OP_WRITEFULL)
-                       request_data_len = src->extent.length;
                dst->extent.offset = cpu_to_le64(src->extent.offset);
                dst->extent.length = cpu_to_le64(src->extent.length);
                dst->extent.truncate_size =
                        cpu_to_le64(src->extent.truncate_size);
                dst->extent.truncate_seq =
                        cpu_to_le32(src->extent.truncate_seq);
-               osd_data = &src->extent.osd_data;
-               if (src->op == CEPH_OSD_OP_WRITE ||
-                   src->op == CEPH_OSD_OP_WRITEFULL)
-                       ceph_osdc_msg_data_add(req->r_request, osd_data);
-               else
-                       ceph_osdc_msg_data_add(req->r_reply, osd_data);
                break;
        case CEPH_OSD_OP_CALL:
                dst->cls.class_len = src->cls.class_len;
                dst->cls.method_len = src->cls.method_len;
-               osd_data = &src->cls.request_info;
-               ceph_osdc_msg_data_add(req->r_request, osd_data);
-               BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGELIST);
-               request_data_len = osd_data->pagelist->length;
-
-               osd_data = &src->cls.request_data;
-               data_length = ceph_osd_data_length(osd_data);
-               if (data_length) {
-                       BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE);
-                       dst->cls.indata_len = cpu_to_le32(data_length);
-                       ceph_osdc_msg_data_add(req->r_request, osd_data);
-                       src->indata_len += data_length;
-                       request_data_len += data_length;
-               }
-               osd_data = &src->cls.response_data;
-               ceph_osdc_msg_data_add(req->r_reply, osd_data);
+               dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
                break;
        case CEPH_OSD_OP_STARTSYNC:
                break;
-       case CEPH_OSD_OP_NOTIFY_ACK:
        case CEPH_OSD_OP_WATCH:
                dst->watch.cookie = cpu_to_le64(src->watch.cookie);
-               dst->watch.ver = cpu_to_le64(src->watch.ver);
-               dst->watch.flag = src->watch.flag;
+               dst->watch.ver = cpu_to_le64(0);
+               dst->watch.op = src->watch.op;
+               dst->watch.gen = cpu_to_le32(src->watch.gen);
+               break;
+       case CEPH_OSD_OP_NOTIFY_ACK:
+               break;
+       case CEPH_OSD_OP_NOTIFY:
+               dst->notify.cookie = cpu_to_le64(src->notify.cookie);
                break;
        case CEPH_OSD_OP_SETALLOCHINT:
                dst->alloc_hint.expected_object_size =
@@ -768,9 +866,6 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
                dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
                dst->xattr.cmp_op = src->xattr.cmp_op;
                dst->xattr.cmp_mode = src->xattr.cmp_mode;
-               osd_data = &src->xattr.osd_data;
-               ceph_osdc_msg_data_add(req->r_request, osd_data);
-               request_data_len = osd_data->pagelist->length;
                break;
        case CEPH_OSD_OP_CREATE:
        case CEPH_OSD_OP_DELETE:
@@ -787,7 +882,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
        dst->flags = cpu_to_le32(src->flags);
        dst->payload_len = cpu_to_le32(src->indata_len);
 
-       return request_data_len;
+       return src->indata_len;
 }
 
 /*
@@ -824,17 +919,15 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 
        req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool,
                                        GFP_NOFS);
-       if (!req)
-               return ERR_PTR(-ENOMEM);
-
-       req->r_flags = flags;
+       if (!req) {
+               r = -ENOMEM;
+               goto fail;
+       }
 
        /* calculate max write size */
        r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen);
-       if (r < 0) {
-               ceph_osdc_put_request(req);
-               return ERR_PTR(r);
-       }
+       if (r)
+               goto fail;
 
        if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) {
                osd_req_op_init(req, which, opcode, 0);
@@ -854,194 +947,71 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
                                       truncate_size, truncate_seq);
        }
 
+       req->r_flags = flags;
        req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout);
+       ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum);
 
-       snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name),
-                "%llx.%08llx", vino.ino, objnum);
-       req->r_base_oid.name_len = strlen(req->r_base_oid.name);
+       req->r_snapid = vino.snap;
+       if (flags & CEPH_OSD_FLAG_WRITE)
+               req->r_data_offset = off;
+
+       r = ceph_osdc_alloc_messages(req, GFP_NOFS);
+       if (r)
+               goto fail;
 
        return req;
+
+fail:
+       ceph_osdc_put_request(req);
+       return ERR_PTR(r);
 }
 EXPORT_SYMBOL(ceph_osdc_new_request);
 
 /*
  * We keep osd requests in an rbtree, sorted by ->r_tid.
  */
-static void __insert_request(struct ceph_osd_client *osdc,
-                            struct ceph_osd_request *new)
-{
-       struct rb_node **p = &osdc->requests.rb_node;
-       struct rb_node *parent = NULL;
-       struct ceph_osd_request *req = NULL;
-
-       while (*p) {
-               parent = *p;
-               req = rb_entry(parent, struct ceph_osd_request, r_node);
-               if (new->r_tid < req->r_tid)
-                       p = &(*p)->rb_left;
-               else if (new->r_tid > req->r_tid)
-                       p = &(*p)->rb_right;
-               else
-                       BUG();
-       }
-
-       rb_link_node(&new->r_node, parent, p);
-       rb_insert_color(&new->r_node, &osdc->requests);
-}
-
-static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
-                                                u64 tid)
-{
-       struct ceph_osd_request *req;
-       struct rb_node *n = osdc->requests.rb_node;
-
-       while (n) {
-               req = rb_entry(n, struct ceph_osd_request, r_node);
-               if (tid < req->r_tid)
-                       n = n->rb_left;
-               else if (tid > req->r_tid)
-                       n = n->rb_right;
-               else
-                       return req;
-       }
-       return NULL;
-}
+DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node)
+DEFINE_RB_FUNCS(request_mc, struct ceph_osd_request, r_tid, r_mc_node)
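/*
 * DEFINE_RB_FUNCS() replaces the open-coded rbtree insert/lookup walkers
 * that used to live here.  Judging by the call sites later in this file,
 * each invocation generates insert_<name>(), lookup_<name>() and
 * erase_<name>() helpers keyed on the given field, e.g.:
 *
 *	insert_request(&osd->o_requests, req);		keyed by r_tid
 *	erase_request(&osd->o_requests, req);
 *	req = lookup_request_mc(&osdc->map_checks, tid);
 */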
 
-static struct ceph_osd_request *
-__lookup_request_ge(struct ceph_osd_client *osdc,
-                   u64 tid)
+static bool osd_homeless(struct ceph_osd *osd)
 {
-       struct ceph_osd_request *req;
-       struct rb_node *n = osdc->requests.rb_node;
-
-       while (n) {
-               req = rb_entry(n, struct ceph_osd_request, r_node);
-               if (tid < req->r_tid) {
-                       if (!n->rb_left)
-                               return req;
-                       n = n->rb_left;
-               } else if (tid > req->r_tid) {
-                       n = n->rb_right;
-               } else {
-                       return req;
-               }
-       }
-       return NULL;
+       return osd->o_osd == CEPH_HOMELESS_OSD;
 }
 
-static void __kick_linger_request(struct ceph_osd_request *req)
+static bool osd_registered(struct ceph_osd *osd)
 {
-       struct ceph_osd_client *osdc = req->r_osdc;
-       struct ceph_osd *osd = req->r_osd;
-
-       /*
-        * Linger requests need to be resent with a new tid to avoid
-        * the dup op detection logic on the OSDs.  Achieve this with
-        * a re-register dance instead of open-coding.
-        */
-       ceph_osdc_get_request(req);
-       if (!list_empty(&req->r_linger_item))
-               __unregister_linger_request(osdc, req);
-       else
-               __unregister_request(osdc, req);
-       __register_request(osdc, req);
-       ceph_osdc_put_request(req);
-
-       /*
-        * Unless request has been registered as both normal and
-        * lingering, __unregister{,_linger}_request clears r_osd.
-        * However, here we need to preserve r_osd to make sure we
-        * requeue on the same OSD.
-        */
-       WARN_ON(req->r_osd || !osd);
-       req->r_osd = osd;
+       verify_osdc_locked(osd->o_osdc);
 
-       dout("%s requeueing %p tid %llu\n", __func__, req, req->r_tid);
-       __enqueue_request(req);
+       return !RB_EMPTY_NODE(&osd->o_node);
 }
 
 /*
- * Resubmit requests pending on the given osd.
+ * Assumes @osd is zero-initialized.
  */
-static void __kick_osd_requests(struct ceph_osd_client *osdc,
-                               struct ceph_osd *osd)
+static void osd_init(struct ceph_osd *osd)
 {
-       struct ceph_osd_request *req, *nreq;
-       LIST_HEAD(resend);
-       LIST_HEAD(resend_linger);
-       int err;
-
-       dout("%s osd%d\n", __func__, osd->o_osd);
-       err = __reset_osd(osdc, osd);
-       if (err)
-               return;
-
-       /*
-        * Build up a list of requests to resend by traversing the
-        * osd's list of requests.  Requests for a given object are
-        * sent in tid order, and that is also the order they're
-        * kept on this list.  Therefore all requests that are in
-        * flight will be found first, followed by all requests that
-        * have not yet been sent.  And to resend requests while
-        * preserving this order we will want to put any sent
-        * requests back on the front of the osd client's unsent
-        * list.
-        *
-        * So we build a separate ordered list of already-sent
-        * requests for the affected osd and splice it onto the
-        * front of the osd client's unsent list.  Once we've seen a
-        * request that has not yet been sent we're done.  Those
-        * requests are already sitting right where they belong.
-        */
-       list_for_each_entry(req, &osd->o_requests, r_osd_item) {
-               if (!req->r_sent)
-                       break;
-
-               if (!req->r_linger) {
-                       dout("%s requeueing %p tid %llu\n", __func__, req,
-                            req->r_tid);
-                       list_move_tail(&req->r_req_lru_item, &resend);
-                       req->r_flags |= CEPH_OSD_FLAG_RETRY;
-               } else {
-                       list_move_tail(&req->r_req_lru_item, &resend_linger);
-               }
-       }
-       list_splice(&resend, &osdc->req_unsent);
-
-       /*
-        * Both registered and not yet registered linger requests are
-        * enqueued with a new tid on the same OSD.  We add/move them
-        * to req_unsent/o_requests at the end to keep things in tid
-        * order.
-        */
-       list_for_each_entry_safe(req, nreq, &osd->o_linger_requests,
-                                r_linger_osd_item) {
-               WARN_ON(!list_empty(&req->r_req_lru_item));
-               __kick_linger_request(req);
-       }
-
-       list_for_each_entry_safe(req, nreq, &resend_linger, r_req_lru_item)
-               __kick_linger_request(req);
+       atomic_set(&osd->o_ref, 1);
+       RB_CLEAR_NODE(&osd->o_node);
+       osd->o_requests = RB_ROOT;
+       osd->o_linger_requests = RB_ROOT;
+       INIT_LIST_HEAD(&osd->o_osd_lru);
+       INIT_LIST_HEAD(&osd->o_keepalive_item);
+       osd->o_incarnation = 1;
+       mutex_init(&osd->lock);
 }
 
-/*
- * If the osd connection drops, we need to resubmit all requests.
- */
-static void osd_reset(struct ceph_connection *con)
+static void osd_cleanup(struct ceph_osd *osd)
 {
-       struct ceph_osd *osd = con->private;
-       struct ceph_osd_client *osdc;
-
-       if (!osd)
-               return;
-       dout("osd_reset osd%d\n", osd->o_osd);
-       osdc = osd->o_osdc;
-       down_read(&osdc->map_sem);
-       mutex_lock(&osdc->request_mutex);
-       __kick_osd_requests(osdc, osd);
-       __send_queued(osdc);
-       mutex_unlock(&osdc->request_mutex);
-       up_read(&osdc->map_sem);
+       WARN_ON(!RB_EMPTY_NODE(&osd->o_node));
+       WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
+       WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
+       WARN_ON(!list_empty(&osd->o_osd_lru));
+       WARN_ON(!list_empty(&osd->o_keepalive_item));
+
+       if (osd->o_auth.authorizer) {
+               WARN_ON(osd_homeless(osd));
+               ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
+       }
 }
 
 /*
@@ -1051,22 +1021,15 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
 {
        struct ceph_osd *osd;
 
-       osd = kzalloc(sizeof(*osd), GFP_NOFS);
-       if (!osd)
-               return NULL;
+       WARN_ON(onum == CEPH_HOMELESS_OSD);
 
-       atomic_set(&osd->o_ref, 1);
+       osd = kzalloc(sizeof(*osd), GFP_NOIO | __GFP_NOFAIL);
+       osd_init(osd);
        osd->o_osdc = osdc;
        osd->o_osd = onum;
-       RB_CLEAR_NODE(&osd->o_node);
-       INIT_LIST_HEAD(&osd->o_requests);
-       INIT_LIST_HEAD(&osd->o_linger_requests);
-       INIT_LIST_HEAD(&osd->o_osd_lru);
-       osd->o_incarnation = 1;
 
        ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr);
 
-       INIT_LIST_HEAD(&osd->o_keepalive_item);
        return osd;
 }
 
@@ -1087,114 +1050,115 @@ static void put_osd(struct ceph_osd *osd)
        dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
             atomic_read(&osd->o_ref) - 1);
        if (atomic_dec_and_test(&osd->o_ref)) {
-               if (osd->o_auth.authorizer)
-                       ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
+               osd_cleanup(osd);
                kfree(osd);
        }
 }
 
-/*
- * remove an osd from our map
- */
-static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
-{
-       dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
-       WARN_ON(!list_empty(&osd->o_requests));
-       WARN_ON(!list_empty(&osd->o_linger_requests));
-
-       list_del_init(&osd->o_osd_lru);
-       rb_erase(&osd->o_node, &osdc->osds);
-       RB_CLEAR_NODE(&osd->o_node);
-}
-
-static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
-{
-       dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
-
-       if (!RB_EMPTY_NODE(&osd->o_node)) {
-               ceph_con_close(&osd->o_con);
-               __remove_osd(osdc, osd);
-               put_osd(osd);
-       }
-}
+DEFINE_RB_FUNCS(osd, struct ceph_osd, o_osd, o_node)
 
-static void remove_all_osds(struct ceph_osd_client *osdc)
+static void __move_osd_to_lru(struct ceph_osd *osd)
 {
-       dout("%s %p\n", __func__, osdc);
-       mutex_lock(&osdc->request_mutex);
-       while (!RB_EMPTY_ROOT(&osdc->osds)) {
-               struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
-                                               struct ceph_osd, o_node);
-               remove_osd(osdc, osd);
-       }
-       mutex_unlock(&osdc->request_mutex);
-}
+       struct ceph_osd_client *osdc = osd->o_osdc;
 
-static void __move_osd_to_lru(struct ceph_osd_client *osdc,
-                             struct ceph_osd *osd)
-{
-       dout("%s %p\n", __func__, osd);
+       dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
        BUG_ON(!list_empty(&osd->o_osd_lru));
 
+       spin_lock(&osdc->osd_lru_lock);
        list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
+       spin_unlock(&osdc->osd_lru_lock);
+
        osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl;
 }
 
-static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc,
-                                 struct ceph_osd *osd)
+static void maybe_move_osd_to_lru(struct ceph_osd *osd)
 {
-       dout("%s %p\n", __func__, osd);
-
-       if (list_empty(&osd->o_requests) &&
-           list_empty(&osd->o_linger_requests))
-               __move_osd_to_lru(osdc, osd);
+       if (RB_EMPTY_ROOT(&osd->o_requests) &&
+           RB_EMPTY_ROOT(&osd->o_linger_requests))
+               __move_osd_to_lru(osd);
 }
 
 static void __remove_osd_from_lru(struct ceph_osd *osd)
 {
-       dout("__remove_osd_from_lru %p\n", osd);
+       struct ceph_osd_client *osdc = osd->o_osdc;
+
+       dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+       spin_lock(&osdc->osd_lru_lock);
        if (!list_empty(&osd->o_osd_lru))
                list_del_init(&osd->o_osd_lru);
+       spin_unlock(&osdc->osd_lru_lock);
 }
 
-static void remove_old_osds(struct ceph_osd_client *osdc)
+/*
+ * Close the connection and assign any leftover requests to the
+ * homeless session.
+ */
+static void close_osd(struct ceph_osd *osd)
 {
-       struct ceph_osd *osd, *nosd;
+       struct ceph_osd_client *osdc = osd->o_osdc;
+       struct rb_node *n;
 
-       dout("__remove_old_osds %p\n", osdc);
-       mutex_lock(&osdc->request_mutex);
-       list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
-               if (time_before(jiffies, osd->lru_ttl))
-                       break;
-               remove_osd(osdc, osd);
+       verify_osdc_wrlocked(osdc);
+       dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+       ceph_con_close(&osd->o_con);
+
+       for (n = rb_first(&osd->o_requests); n; ) {
+               struct ceph_osd_request *req =
+                   rb_entry(n, struct ceph_osd_request, r_node);
+
+               n = rb_next(n); /* unlink_request() */
+
+               dout(" reassigning req %p tid %llu\n", req, req->r_tid);
+               unlink_request(osd, req);
+               link_request(&osdc->homeless_osd, req);
+       }
+       for (n = rb_first(&osd->o_linger_requests); n; ) {
+               struct ceph_osd_linger_request *lreq =
+                   rb_entry(n, struct ceph_osd_linger_request, node);
+
+               n = rb_next(n); /* unlink_linger() */
+
+               dout(" reassigning lreq %p linger_id %llu\n", lreq,
+                    lreq->linger_id);
+               unlink_linger(osd, lreq);
+               link_linger(&osdc->homeless_osd, lreq);
        }
-       mutex_unlock(&osdc->request_mutex);
+
+       __remove_osd_from_lru(osd);
+       erase_osd(&osdc->osds, osd);
+       put_osd(osd);
 }
 
 /*
  * reset osd connect
  */
-static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+static int reopen_osd(struct ceph_osd *osd)
 {
        struct ceph_entity_addr *peer_addr;
 
-       dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
-       if (list_empty(&osd->o_requests) &&
-           list_empty(&osd->o_linger_requests)) {
-               remove_osd(osdc, osd);
+       dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+       if (RB_EMPTY_ROOT(&osd->o_requests) &&
+           RB_EMPTY_ROOT(&osd->o_linger_requests)) {
+               close_osd(osd);
                return -ENODEV;
        }
 
-       peer_addr = &osdc->osdmap->osd_addr[osd->o_osd];
+       peer_addr = &osd->o_osdc->osdmap->osd_addr[osd->o_osd];
        if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) &&
                        !ceph_con_opened(&osd->o_con)) {
-               struct ceph_osd_request *req;
+               struct rb_node *n;
 
                dout("osd addr hasn't changed and connection never opened, "
                     "letting msgr retry\n");
                /* touch each r_stamp for handle_timeout()'s benefit */
-               list_for_each_entry(req, &osd->o_requests, r_osd_item)
+               for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
+                       struct ceph_osd_request *req =
+                           rb_entry(n, struct ceph_osd_request, r_node);
                        req->r_stamp = jiffies;
+               }
 
                return -EAGAIN;
        }
@@ -1206,455 +1170,1369 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
        return 0;
 }
 
-static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
+static struct ceph_osd *lookup_create_osd(struct ceph_osd_client *osdc, int o,
+                                         bool wrlocked)
 {
-       struct rb_node **p = &osdc->osds.rb_node;
-       struct rb_node *parent = NULL;
-       struct ceph_osd *osd = NULL;
+       struct ceph_osd *osd;
 
-       dout("__insert_osd %p osd%d\n", new, new->o_osd);
-       while (*p) {
-               parent = *p;
-               osd = rb_entry(parent, struct ceph_osd, o_node);
-               if (new->o_osd < osd->o_osd)
-                       p = &(*p)->rb_left;
-               else if (new->o_osd > osd->o_osd)
-                       p = &(*p)->rb_right;
-               else
-                       BUG();
-       }
+       if (wrlocked)
+               verify_osdc_wrlocked(osdc);
+       else
+               verify_osdc_locked(osdc);
 
-       rb_link_node(&new->o_node, parent, p);
-       rb_insert_color(&new->o_node, &osdc->osds);
+       if (o != CEPH_HOMELESS_OSD)
+               osd = lookup_osd(&osdc->osds, o);
+       else
+               osd = &osdc->homeless_osd;
+       if (!osd) {
+               if (!wrlocked)
+                       return ERR_PTR(-EAGAIN);
+
+               osd = create_osd(osdc, o);
+               insert_osd(&osdc->osds, osd);
+               ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd,
+                             &osdc->osdmap->osd_addr[osd->o_osd]);
+       }
+
+       dout("%s osdc %p osd%d -> osd %p\n", __func__, osdc, o, osd);
+       return osd;
 }
 
-static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
+/*
+ * Create request <-> OSD session relation.
+ *
+ * @req has to be assigned a tid, @osd may be homeless.
+ */
+static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req)
 {
-       struct ceph_osd *osd;
-       struct rb_node *n = osdc->osds.rb_node;
-
-       while (n) {
-               osd = rb_entry(n, struct ceph_osd, o_node);
-               if (o < osd->o_osd)
-                       n = n->rb_left;
-               else if (o > osd->o_osd)
-                       n = n->rb_right;
-               else
-                       return osd;
-       }
-       return NULL;
+       verify_osd_locked(osd);
+       WARN_ON(!req->r_tid || req->r_osd);
+       dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
+            req, req->r_tid);
+
+       if (!osd_homeless(osd))
+               __remove_osd_from_lru(osd);
+       else
+               atomic_inc(&osd->o_osdc->num_homeless);
+
+       get_osd(osd);
+       insert_request(&osd->o_requests, req);
+       req->r_osd = osd;
 }
 
-static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
+static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req)
 {
-       schedule_delayed_work(&osdc->timeout_work,
-                             osdc->client->options->osd_keepalive_timeout);
+       verify_osd_locked(osd);
+       WARN_ON(req->r_osd != osd);
+       dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
+            req, req->r_tid);
+
+       req->r_osd = NULL;
+       erase_request(&osd->o_requests, req);
+       put_osd(osd);
+
+       if (!osd_homeless(osd))
+               maybe_move_osd_to_lru(osd);
+       else
+               atomic_dec(&osd->o_osdc->num_homeless);
 }
 
-static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
+static bool __pool_full(struct ceph_pg_pool_info *pi)
 {
-       cancel_delayed_work(&osdc->timeout_work);
+       return pi->flags & CEPH_POOL_FLAG_FULL;
 }
 
-/*
- * Register request, assign tid.  If this is the first request, set up
- * the timeout event.
- */
-static void __register_request(struct ceph_osd_client *osdc,
-                              struct ceph_osd_request *req)
+static bool have_pool_full(struct ceph_osd_client *osdc)
 {
-       req->r_tid = ++osdc->last_tid;
-       req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
-       dout("__register_request %p tid %lld\n", req, req->r_tid);
-       __insert_request(osdc, req);
-       ceph_osdc_get_request(req);
-       osdc->num_requests++;
-       if (osdc->num_requests == 1) {
-               dout(" first request, scheduling timeout\n");
-               __schedule_osd_timeout(osdc);
+       struct rb_node *n;
+
+       for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
+               struct ceph_pg_pool_info *pi =
+                   rb_entry(n, struct ceph_pg_pool_info, node);
+
+               if (__pool_full(pi))
+                       return true;
        }
+
+       return false;
+}
+
+static bool pool_full(struct ceph_osd_client *osdc, s64 pool_id)
+{
+       struct ceph_pg_pool_info *pi;
+
+       pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
+       if (!pi)
+               return false;
+
+       return __pool_full(pi);
 }
 
 /*
- * called under osdc->request_mutex
+ * Returns whether a request should be blocked from being sent
+ * based on the current osdmap and osd_client settings.
  */
-static void __unregister_request(struct ceph_osd_client *osdc,
-                                struct ceph_osd_request *req)
+static bool target_should_be_paused(struct ceph_osd_client *osdc,
+                                   const struct ceph_osd_request_target *t,
+                                   struct ceph_pg_pool_info *pi)
 {
-       if (RB_EMPTY_NODE(&req->r_node)) {
-               dout("__unregister_request %p tid %lld not registered\n",
-                       req, req->r_tid);
-               return;
+       bool pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
+       bool pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) ||
+                      ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+                      __pool_full(pi);
+
+       WARN_ON(pi->id != t->base_oloc.pool);
+       return (t->flags & CEPH_OSD_FLAG_READ && pauserd) ||
+              (t->flags & CEPH_OSD_FLAG_WRITE && pausewr);
+}
+
+enum calc_target_result {
+       CALC_TARGET_NO_ACTION = 0,
+       CALC_TARGET_NEED_RESEND,
+       CALC_TARGET_POOL_DNE,
+};
+
+static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
+                                          struct ceph_osd_request_target *t,
+                                          u32 *last_force_resend,
+                                          bool any_change)
+{
+       struct ceph_pg_pool_info *pi;
+       struct ceph_pg pgid, last_pgid;
+       struct ceph_osds up, acting;
+       bool force_resend = false;
+       bool need_check_tiering = false;
+       bool need_resend = false;
+       bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE);
+       enum calc_target_result ct_res;
+       int ret;
+
+       pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool);
+       if (!pi) {
+               t->osd = CEPH_HOMELESS_OSD;
+               ct_res = CALC_TARGET_POOL_DNE;
+               goto out;
        }
 
-       dout("__unregister_request %p tid %lld\n", req, req->r_tid);
-       rb_erase(&req->r_node, &osdc->requests);
-       RB_CLEAR_NODE(&req->r_node);
-       osdc->num_requests--;
+       if (osdc->osdmap->epoch == pi->last_force_request_resend) {
+               if (last_force_resend &&
+                   *last_force_resend < pi->last_force_request_resend) {
+                       *last_force_resend = pi->last_force_request_resend;
+                       force_resend = true;
+               } else if (!last_force_resend) {
+                       force_resend = true;
+               }
+       }
+       if (ceph_oid_empty(&t->target_oid) || force_resend) {
+               ceph_oid_copy(&t->target_oid, &t->base_oid);
+               need_check_tiering = true;
+       }
+       if (ceph_oloc_empty(&t->target_oloc) || force_resend) {
+               ceph_oloc_copy(&t->target_oloc, &t->base_oloc);
+               need_check_tiering = true;
+       }
 
-       if (req->r_osd) {
-               /* make sure the original request isn't in flight. */
-               ceph_msg_revoke(req->r_request);
+       if (need_check_tiering &&
+           (t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
+               if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0)
+                       t->target_oloc.pool = pi->read_tier;
+               if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0)
+                       t->target_oloc.pool = pi->write_tier;
+       }
 
-               list_del_init(&req->r_osd_item);
-               maybe_move_osd_to_lru(osdc, req->r_osd);
-               if (list_empty(&req->r_linger_osd_item))
-                       req->r_osd = NULL;
+       ret = ceph_object_locator_to_pg(osdc->osdmap, &t->target_oid,
+                                       &t->target_oloc, &pgid);
+       if (ret) {
+               WARN_ON(ret != -ENOENT);
+               t->osd = CEPH_HOMELESS_OSD;
+               ct_res = CALC_TARGET_POOL_DNE;
+               goto out;
+       }
+       last_pgid.pool = pgid.pool;
+       last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask);
+
+       ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting);
+       if (any_change &&
+           ceph_is_new_interval(&t->acting,
+                                &acting,
+                                &t->up,
+                                &up,
+                                t->size,
+                                pi->size,
+                                t->min_size,
+                                pi->min_size,
+                                t->pg_num,
+                                pi->pg_num,
+                                t->sort_bitwise,
+                                sort_bitwise,
+                                &last_pgid))
+               force_resend = true;
+
+       if (t->paused && !target_should_be_paused(osdc, t, pi)) {
+               t->paused = false;
+               need_resend = true;
        }
 
-       list_del_init(&req->r_req_lru_item);
-       ceph_osdc_put_request(req);
+       if (ceph_pg_compare(&t->pgid, &pgid) ||
+           ceph_osds_changed(&t->acting, &acting, any_change) ||
+           force_resend) {
+               t->pgid = pgid; /* struct */
+               ceph_osds_copy(&t->acting, &acting);
+               ceph_osds_copy(&t->up, &up);
+               t->size = pi->size;
+               t->min_size = pi->min_size;
+               t->pg_num = pi->pg_num;
+               t->pg_num_mask = pi->pg_num_mask;
+               t->sort_bitwise = sort_bitwise;
+
+               t->osd = acting.primary;
+               need_resend = true;
+       }
+
+       ct_res = need_resend ? CALC_TARGET_NEED_RESEND : CALC_TARGET_NO_ACTION;
+out:
+       dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd);
+       return ct_res;
+}
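/*
 * How the calc_target() results are handled, as far as this diff shows:
 * CALC_TARGET_NO_ACTION leaves the request where it is,
 * CALC_TARGET_NEED_RESEND means the pg mapping or pause state changed and
 * the request must be (re)queued to t->osd, and CALC_TARGET_POOL_DNE parks
 * the request on the homeless session and schedules a map check (see
 * send_map_check() in __submit_request() below) so it can be failed with
 * -ENOENT once a new enough map confirms the pool is gone.
 */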
+
+static void setup_request_data(struct ceph_osd_request *req,
+                              struct ceph_msg *msg)
+{
+       u32 data_len = 0;
+       int i;
+
+       if (!list_empty(&msg->data))
+               return;
+
+       WARN_ON(msg->data_length);
+       for (i = 0; i < req->r_num_ops; i++) {
+               struct ceph_osd_req_op *op = &req->r_ops[i];
+
+               switch (op->op) {
+               /* request */
+               case CEPH_OSD_OP_WRITE:
+               case CEPH_OSD_OP_WRITEFULL:
+                       WARN_ON(op->indata_len != op->extent.length);
+                       ceph_osdc_msg_data_add(msg, &op->extent.osd_data);
+                       break;
+               case CEPH_OSD_OP_SETXATTR:
+               case CEPH_OSD_OP_CMPXATTR:
+                       WARN_ON(op->indata_len != op->xattr.name_len +
+                                                 op->xattr.value_len);
+                       ceph_osdc_msg_data_add(msg, &op->xattr.osd_data);
+                       break;
+               case CEPH_OSD_OP_NOTIFY_ACK:
+                       ceph_osdc_msg_data_add(msg,
+                                              &op->notify_ack.request_data);
+                       break;
+
+               /* reply */
+               case CEPH_OSD_OP_STAT:
+                       ceph_osdc_msg_data_add(req->r_reply,
+                                              &op->raw_data_in);
+                       break;
+               case CEPH_OSD_OP_READ:
+                       ceph_osdc_msg_data_add(req->r_reply,
+                                              &op->extent.osd_data);
+                       break;
+
+               /* both */
+               case CEPH_OSD_OP_CALL:
+                       WARN_ON(op->indata_len != op->cls.class_len +
+                                                 op->cls.method_len +
+                                                 op->cls.indata_len);
+                       ceph_osdc_msg_data_add(msg, &op->cls.request_info);
+                       /* optional, can be NONE */
+                       ceph_osdc_msg_data_add(msg, &op->cls.request_data);
+                       /* optional, can be NONE */
+                       ceph_osdc_msg_data_add(req->r_reply,
+                                              &op->cls.response_data);
+                       break;
+               case CEPH_OSD_OP_NOTIFY:
+                       ceph_osdc_msg_data_add(msg,
+                                              &op->notify.request_data);
+                       ceph_osdc_msg_data_add(req->r_reply,
+                                              &op->notify.response_data);
+                       break;
+               }
+
+               data_len += op->indata_len;
+       }
+
+       WARN_ON(data_len != msg->data_length);
+}
+
+static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg)
+{
+       void *p = msg->front.iov_base;
+       void *const end = p + msg->front_alloc_len;
+       u32 data_len = 0;
+       int i;
+
+       if (req->r_flags & CEPH_OSD_FLAG_WRITE) {
+               /* snapshots aren't writeable */
+               WARN_ON(req->r_snapid != CEPH_NOSNAP);
+       } else {
+               WARN_ON(req->r_mtime.tv_sec || req->r_mtime.tv_nsec ||
+                       req->r_data_offset || req->r_snapc);
+       }
+
+       setup_request_data(req, msg);
+
+       ceph_encode_32(&p, 1); /* client_inc, always 1 */
+       ceph_encode_32(&p, req->r_osdc->osdmap->epoch);
+       ceph_encode_32(&p, req->r_flags);
+       ceph_encode_timespec(p, &req->r_mtime);
+       p += sizeof(struct ceph_timespec);
+       /* aka reassert_version */
+       memcpy(p, &req->r_replay_version, sizeof(req->r_replay_version));
+       p += sizeof(req->r_replay_version);
+
+       /* oloc */
+       ceph_encode_8(&p, 4);
+       ceph_encode_8(&p, 4);
+       ceph_encode_32(&p, 8 + 4 + 4);
+       ceph_encode_64(&p, req->r_t.target_oloc.pool);
+       ceph_encode_32(&p, -1); /* preferred */
+       ceph_encode_32(&p, 0); /* key len */
+
+       /* pgid */
+       ceph_encode_8(&p, 1);
+       ceph_encode_64(&p, req->r_t.pgid.pool);
+       ceph_encode_32(&p, req->r_t.pgid.seed);
+       ceph_encode_32(&p, -1); /* preferred */
+
+       /* oid */
+       ceph_encode_32(&p, req->r_t.target_oid.name_len);
+       memcpy(p, req->r_t.target_oid.name, req->r_t.target_oid.name_len);
+       p += req->r_t.target_oid.name_len;
+
+       /* ops, can imply data */
+       ceph_encode_16(&p, req->r_num_ops);
+       for (i = 0; i < req->r_num_ops; i++) {
+               data_len += osd_req_encode_op(p, &req->r_ops[i]);
+               p += sizeof(struct ceph_osd_op);
+       }
 
-       if (osdc->num_requests == 0) {
-               dout(" no requests, canceling timeout\n");
-               __cancel_osd_timeout(osdc);
+       ceph_encode_64(&p, req->r_snapid); /* snapid */
+       if (req->r_snapc) {
+               ceph_encode_64(&p, req->r_snapc->seq);
+               ceph_encode_32(&p, req->r_snapc->num_snaps);
+               for (i = 0; i < req->r_snapc->num_snaps; i++)
+                       ceph_encode_64(&p, req->r_snapc->snaps[i]);
+       } else {
+               ceph_encode_64(&p, 0); /* snap_seq */
+               ceph_encode_32(&p, 0); /* snaps len */
        }
+
+       ceph_encode_32(&p, req->r_attempts); /* retry_attempt */
+
+       BUG_ON(p > end);
+       msg->front.iov_len = p - msg->front.iov_base;
+       msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */
+       msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+       msg->hdr.data_len = cpu_to_le32(data_len);
+       /*
+        * The header "data_off" is a hint to the receiver allowing it
+        * to align received data into its buffers such that there's no
+        * need to re-copy it before writing it to disk (direct I/O).
+        */
+       msg->hdr.data_off = cpu_to_le16(req->r_data_offset);
+
+       dout("%s req %p oid %s oid_len %d front %zu data %u\n", __func__,
+            req, req->r_t.target_oid.name, req->r_t.target_oid.name_len,
+            msg->front.iov_len, data_len);
 }
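/*
 * Front layout produced by encode_request() (MOSDOp v4), in encoding
 * order: client_inc, osdmap epoch, flags, mtime, reassert_version, object
 * locator, pgid, oid, op count + ops, snapid, snap seq + snaps,
 * retry_attempt -- matching the sizing done in ceph_osdc_alloc_messages().
 */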
 
 /*
- * Cancel a previously queued request message
+ * @req has to be assigned a tid and registered.
  */
-static void __cancel_request(struct ceph_osd_request *req)
+static void send_request(struct ceph_osd_request *req)
 {
-       if (req->r_sent && req->r_osd) {
+       struct ceph_osd *osd = req->r_osd;
+
+       verify_osd_locked(osd);
+       WARN_ON(osd->o_osd != req->r_t.osd);
+
+       /*
+        * We may have a previously queued request message hanging
+        * around.  Cancel it to avoid corrupting the msgr.
+        */
+       if (req->r_sent)
                ceph_msg_revoke(req->r_request);
-               req->r_sent = 0;
+
+       req->r_flags |= CEPH_OSD_FLAG_KNOWN_REDIR;
+       if (req->r_attempts)
+               req->r_flags |= CEPH_OSD_FLAG_RETRY;
+       else
+               WARN_ON(req->r_flags & CEPH_OSD_FLAG_RETRY);
+
+       encode_request(req, req->r_request);
+
+       dout("%s req %p tid %llu to pg %llu.%x osd%d flags 0x%x attempt %d\n",
+            __func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed,
+            req->r_t.osd, req->r_flags, req->r_attempts);
+
+       req->r_t.paused = false;
+       req->r_stamp = jiffies;
+       req->r_attempts++;
+
+       req->r_sent = osd->o_incarnation;
+       req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
+       ceph_con_send(&osd->o_con, ceph_msg_get(req->r_request));
+}
+
+static void maybe_request_map(struct ceph_osd_client *osdc)
+{
+       bool continuous = false;
+
+       verify_osdc_locked(osdc);
+       WARN_ON(!osdc->osdmap->epoch);
+
+       if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+           ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD) ||
+           ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) {
+               dout("%s osdc %p continuous\n", __func__, osdc);
+               continuous = true;
+       } else {
+               dout("%s osdc %p onetime\n", __func__, osdc);
        }
+
+       if (ceph_monc_want_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
+                              osdc->osdmap->epoch + 1, continuous))
+               ceph_monc_renew_subs(&osdc->client->monc);
 }
 
-static void __register_linger_request(struct ceph_osd_client *osdc,
-                                   struct ceph_osd_request *req)
+static void send_map_check(struct ceph_osd_request *req);
+
+static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
 {
-       dout("%s %p tid %llu\n", __func__, req, req->r_tid);
-       WARN_ON(!req->r_linger);
+       struct ceph_osd_client *osdc = req->r_osdc;
+       struct ceph_osd *osd;
+       enum calc_target_result ct_res;
+       bool need_send = false;
+       bool promoted = false;
+
+       WARN_ON(req->r_tid || req->r_got_reply);
+       dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
+
+again:
+       ct_res = calc_target(osdc, &req->r_t, &req->r_last_force_resend, false);
+       if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked)
+               goto promote;
+
+       osd = lookup_create_osd(osdc, req->r_t.osd, wrlocked);
+       if (IS_ERR(osd)) {
+               WARN_ON(PTR_ERR(osd) != -EAGAIN || wrlocked);
+               goto promote;
+       }
 
+       if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
+           ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) {
+               dout("req %p pausewr\n", req);
+               req->r_t.paused = true;
+               maybe_request_map(osdc);
+       } else if ((req->r_flags & CEPH_OSD_FLAG_READ) &&
+                  ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) {
+               dout("req %p pauserd\n", req);
+               req->r_t.paused = true;
+               maybe_request_map(osdc);
+       } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
+                  !(req->r_flags & (CEPH_OSD_FLAG_FULL_TRY |
+                                    CEPH_OSD_FLAG_FULL_FORCE)) &&
+                  (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+                   pool_full(osdc, req->r_t.base_oloc.pool))) {
+               dout("req %p full/pool_full\n", req);
+               pr_warn_ratelimited("FULL or reached pool quota\n");
+               req->r_t.paused = true;
+               maybe_request_map(osdc);
+       } else if (!osd_homeless(osd)) {
+               need_send = true;
+       } else {
+               maybe_request_map(osdc);
+       }
+
+       mutex_lock(&osd->lock);
+       /*
+        * Assign the tid atomically with send_request() to protect
+        * multiple writes to the same object from racing with each
+        * other, resulting in out of order ops on the OSDs.
+        */
+       req->r_tid = atomic64_inc_return(&osdc->last_tid);
+       link_request(osd, req);
+       if (need_send)
+               send_request(req);
+       mutex_unlock(&osd->lock);
+
+       if (ct_res == CALC_TARGET_POOL_DNE)
+               send_map_check(req);
+
+       if (promoted)
+               downgrade_write(&osdc->lock);
+       return;
+
+promote:
+       up_read(&osdc->lock);
+       down_write(&osdc->lock);
+       wrlocked = true;
+       promoted = true;
+       goto again;
+}
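/*
 * __submit_request() is normally entered with osdc->lock held for read.
 * If the pool does not exist or the OSD session has to be created, it
 * "promotes": drops the read lock, retakes it for write and retries from
 * the again: label; before returning it downgrades back to read so the
 * caller's unlock still matches.
 */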
+
+static void account_request(struct ceph_osd_request *req)
+{
+       unsigned int mask = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
+
+       if (req->r_flags & CEPH_OSD_FLAG_READ) {
+               WARN_ON(req->r_flags & mask);
+               req->r_flags |= CEPH_OSD_FLAG_ACK;
+       } else if (req->r_flags & CEPH_OSD_FLAG_WRITE)
+               WARN_ON(!(req->r_flags & mask));
+       else
+               WARN_ON(1);
+
+       WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask);
+       atomic_inc(&req->r_osdc->num_requests);
+}
+
+static void submit_request(struct ceph_osd_request *req, bool wrlocked)
+{
        ceph_osdc_get_request(req);
-       list_add_tail(&req->r_linger_item, &osdc->req_linger);
-       if (req->r_osd)
-               list_add_tail(&req->r_linger_osd_item,
-                             &req->r_osd->o_linger_requests);
+       account_request(req);
+       __submit_request(req, wrlocked);
 }
 
-static void __unregister_linger_request(struct ceph_osd_client *osdc,
-                                       struct ceph_osd_request *req)
+static void __finish_request(struct ceph_osd_request *req)
 {
-       WARN_ON(!req->r_linger);
+       struct ceph_osd_client *osdc = req->r_osdc;
+       struct ceph_osd *osd = req->r_osd;
 
-       if (list_empty(&req->r_linger_item)) {
-               dout("%s %p tid %llu not registered\n", __func__, req,
-                    req->r_tid);
+       verify_osd_locked(osd);
+       dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
+
+       WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid));
+       unlink_request(osd, req);
+       atomic_dec(&osdc->num_requests);
+
+       /*
+        * If an OSD has failed or returned and a request has been sent
+        * twice, it's possible to get a reply and end up here while the
+        * request message is queued for delivery.  We will ignore the
+        * reply, so not a big deal, but better to try and catch it.
+        */
+       ceph_msg_revoke(req->r_request);
+       ceph_msg_revoke_incoming(req->r_reply);
+}
+
+static void finish_request(struct ceph_osd_request *req)
+{
+       __finish_request(req);
+       ceph_osdc_put_request(req);
+}
+
+static void __complete_request(struct ceph_osd_request *req)
+{
+       if (req->r_callback)
+               req->r_callback(req);
+       else
+               complete_all(&req->r_completion);
+}
+
+/*
+ * Note that this is open-coded in handle_reply(), which has to deal
+ * with ack vs commit, dup acks, etc.
+ */
+static void complete_request(struct ceph_osd_request *req, int err)
+{
+       dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err);
+
+       req->r_result = err;
+       __finish_request(req);
+       __complete_request(req);
+       complete_all(&req->r_safe_completion);
+       ceph_osdc_put_request(req);
+}
+
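+/*
+ * Pool existence ("map check") machinery: when a target pool may not
+ * exist, ask the monitor for the newest osdmap epoch and use it as an
+ * upper bound for deciding whether the pool is really gone.
+ */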
+static void cancel_map_check(struct ceph_osd_request *req)
+{
+       struct ceph_osd_client *osdc = req->r_osdc;
+       struct ceph_osd_request *lookup_req;
+
+       verify_osdc_wrlocked(osdc);
+
+       lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
+       if (!lookup_req)
                return;
+
+       WARN_ON(lookup_req != req);
+       erase_request_mc(&osdc->map_checks, req);
+       ceph_osdc_put_request(req);
+}
+
+static void cancel_request(struct ceph_osd_request *req)
+{
+       dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
+
+       cancel_map_check(req);
+       finish_request(req);
+}
+
+static void check_pool_dne(struct ceph_osd_request *req)
+{
+       struct ceph_osd_client *osdc = req->r_osdc;
+       struct ceph_osdmap *map = osdc->osdmap;
+
+       verify_osdc_wrlocked(osdc);
+       WARN_ON(!map->epoch);
+
+       if (req->r_attempts) {
+               /*
+                * We sent a request earlier, which means that
+                * previously the pool existed, and now it does not
+                * (i.e., it was deleted).
+                */
+               req->r_map_dne_bound = map->epoch;
+               dout("%s req %p tid %llu pool disappeared\n", __func__, req,
+                    req->r_tid);
+       } else {
+               dout("%s req %p tid %llu map_dne_bound %u have %u\n", __func__,
+                    req, req->r_tid, req->r_map_dne_bound, map->epoch);
        }
 
-       dout("%s %p tid %llu\n", __func__, req, req->r_tid);
-       list_del_init(&req->r_linger_item);
+       if (req->r_map_dne_bound) {
+               if (map->epoch >= req->r_map_dne_bound) {
+                       /* we had a new enough map */
+                       pr_info_ratelimited("tid %llu pool does not exist\n",
+                                           req->r_tid);
+                       complete_request(req, -ENOENT);
+               }
+       } else {
+               send_map_check(req);
+       }
+}
+
+static void map_check_cb(struct ceph_mon_generic_request *greq)
+{
+       struct ceph_osd_client *osdc = &greq->monc->client->osdc;
+       struct ceph_osd_request *req;
+       u64 tid = greq->private_data;
+
+       WARN_ON(greq->result || !greq->u.newest);
 
-       if (req->r_osd) {
-               list_del_init(&req->r_linger_osd_item);
-               maybe_move_osd_to_lru(osdc, req->r_osd);
-               if (list_empty(&req->r_osd_item))
-                       req->r_osd = NULL;
+       down_write(&osdc->lock);
+       req = lookup_request_mc(&osdc->map_checks, tid);
+       if (!req) {
+               dout("%s tid %llu dne\n", __func__, tid);
+               goto out_unlock;
        }
+
+       dout("%s req %p tid %llu map_dne_bound %u newest %llu\n", __func__,
+            req, req->r_tid, req->r_map_dne_bound, greq->u.newest);
+       if (!req->r_map_dne_bound)
+               req->r_map_dne_bound = greq->u.newest;
+       erase_request_mc(&osdc->map_checks, req);
+       check_pool_dne(req);
+
        ceph_osdc_put_request(req);
+out_unlock:
+       up_write(&osdc->lock);
 }
 
-void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
-                                 struct ceph_osd_request *req)
+static void send_map_check(struct ceph_osd_request *req)
 {
-       if (!req->r_linger) {
-               dout("set_request_linger %p\n", req);
-               req->r_linger = 1;
+       struct ceph_osd_client *osdc = req->r_osdc;
+       struct ceph_osd_request *lookup_req;
+       int ret;
+
+       verify_osdc_wrlocked(osdc);
+
+       lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
+       if (lookup_req) {
+               WARN_ON(lookup_req != req);
+               return;
        }
+
+       ceph_osdc_get_request(req);
+       insert_request_mc(&osdc->map_checks, req);
+       ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
+                                         map_check_cb, req->r_tid);
+       WARN_ON(ret);
 }
-EXPORT_SYMBOL(ceph_osdc_set_request_linger);
 
 /*
- * Returns whether a request should be blocked from being sent
- * based on the current osdmap and osd_client settings.
- *
- * Caller should hold map_sem for read.
+ * lingering requests, watch/notify v2 infrastructure
  */
-static bool __req_should_be_paused(struct ceph_osd_client *osdc,
-                                  struct ceph_osd_request *req)
+static void linger_release(struct kref *kref)
 {
-       bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
-       bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
-               ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
-       return (req->r_flags & CEPH_OSD_FLAG_READ && pauserd) ||
-               (req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr);
+       struct ceph_osd_linger_request *lreq =
+           container_of(kref, struct ceph_osd_linger_request, kref);
+
+       dout("%s lreq %p reg_req %p ping_req %p\n", __func__, lreq,
+            lreq->reg_req, lreq->ping_req);
+       WARN_ON(!RB_EMPTY_NODE(&lreq->node));
+       WARN_ON(!RB_EMPTY_NODE(&lreq->osdc_node));
+       WARN_ON(!RB_EMPTY_NODE(&lreq->mc_node));
+       WARN_ON(!list_empty(&lreq->scan_item));
+       WARN_ON(!list_empty(&lreq->pending_lworks));
+       WARN_ON(lreq->osd);
+
+       if (lreq->reg_req)
+               ceph_osdc_put_request(lreq->reg_req);
+       if (lreq->ping_req)
+               ceph_osdc_put_request(lreq->ping_req);
+       target_destroy(&lreq->t);
+       kfree(lreq);
 }
 
+static void linger_put(struct ceph_osd_linger_request *lreq)
+{
+       if (lreq)
+               kref_put(&lreq->kref, linger_release);
+}
+
+static struct ceph_osd_linger_request *
+linger_get(struct ceph_osd_linger_request *lreq)
+{
+       kref_get(&lreq->kref);
+       return lreq;
+}
+
+static struct ceph_osd_linger_request *
+linger_alloc(struct ceph_osd_client *osdc)
+{
+       struct ceph_osd_linger_request *lreq;
+
+       lreq = kzalloc(sizeof(*lreq), GFP_NOIO);
+       if (!lreq)
+               return NULL;
+
+       kref_init(&lreq->kref);
+       mutex_init(&lreq->lock);
+       RB_CLEAR_NODE(&lreq->node);
+       RB_CLEAR_NODE(&lreq->osdc_node);
+       RB_CLEAR_NODE(&lreq->mc_node);
+       INIT_LIST_HEAD(&lreq->scan_item);
+       INIT_LIST_HEAD(&lreq->pending_lworks);
+       init_completion(&lreq->reg_commit_wait);
+       init_completion(&lreq->notify_finish_wait);
+
+       lreq->osdc = osdc;
+       target_init(&lreq->t);
+
+       dout("%s lreq %p\n", __func__, lreq);
+       return lreq;
+}
+
+DEFINE_RB_INSDEL_FUNCS(linger, struct ceph_osd_linger_request, linger_id, node)
+DEFINE_RB_FUNCS(linger_osdc, struct ceph_osd_linger_request, linger_id, osdc_node)
+DEFINE_RB_FUNCS(linger_mc, struct ceph_osd_linger_request, linger_id, mc_node)
+
 /*
- * Calculate mapping of a request to a PG.  Takes tiering into account.
+ * Create linger request <-> OSD session relation.
+ *
+ * @lreq has to be registered, @osd may be homeless.
  */
-static int __calc_request_pg(struct ceph_osdmap *osdmap,
-                            struct ceph_osd_request *req,
-                            struct ceph_pg *pg_out)
+static void link_linger(struct ceph_osd *osd,
+                       struct ceph_osd_linger_request *lreq)
 {
-       bool need_check_tiering;
+       verify_osd_locked(osd);
+       WARN_ON(!lreq->linger_id || lreq->osd);
+       dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
+            osd->o_osd, lreq, lreq->linger_id);
 
-       need_check_tiering = false;
-       if (req->r_target_oloc.pool == -1) {
-               req->r_target_oloc = req->r_base_oloc; /* struct */
-               need_check_tiering = true;
+       if (!osd_homeless(osd))
+               __remove_osd_from_lru(osd);
+       else
+               atomic_inc(&osd->o_osdc->num_homeless);
+
+       get_osd(osd);
+       insert_linger(&osd->o_linger_requests, lreq);
+       lreq->osd = osd;
+}
+
+static void unlink_linger(struct ceph_osd *osd,
+                         struct ceph_osd_linger_request *lreq)
+{
+       verify_osd_locked(osd);
+       WARN_ON(lreq->osd != osd);
+       dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
+            osd->o_osd, lreq, lreq->linger_id);
+
+       lreq->osd = NULL;
+       erase_linger(&osd->o_linger_requests, lreq);
+       put_osd(osd);
+
+       if (!osd_homeless(osd))
+               maybe_move_osd_to_lru(osd);
+       else
+               atomic_dec(&osd->o_osdc->num_homeless);
+}
+
+static bool __linger_registered(struct ceph_osd_linger_request *lreq)
+{
+       verify_osdc_locked(lreq->osdc);
+
+       return !RB_EMPTY_NODE(&lreq->osdc_node);
+}
+
+static bool linger_registered(struct ceph_osd_linger_request *lreq)
+{
+       struct ceph_osd_client *osdc = lreq->osdc;
+       bool registered;
+
+       down_read(&osdc->lock);
+       registered = __linger_registered(lreq);
+       up_read(&osdc->lock);
+
+       return registered;
+}
+
+static void linger_register(struct ceph_osd_linger_request *lreq)
+{
+       struct ceph_osd_client *osdc = lreq->osdc;
+
+       verify_osdc_wrlocked(osdc);
+       WARN_ON(lreq->linger_id);
+
+       linger_get(lreq);
+       lreq->linger_id = ++osdc->last_linger_id;
+       insert_linger_osdc(&osdc->linger_requests, lreq);
+}
+
+static void linger_unregister(struct ceph_osd_linger_request *lreq)
+{
+       struct ceph_osd_client *osdc = lreq->osdc;
+
+       verify_osdc_wrlocked(osdc);
+
+       erase_linger_osdc(&osdc->linger_requests, lreq);
+       linger_put(lreq);
+}
+
+static void cancel_linger_request(struct ceph_osd_request *req)
+{
+       struct ceph_osd_linger_request *lreq = req->r_priv;
+
+       WARN_ON(!req->r_linger);
+       cancel_request(req);
+       linger_put(lreq);
+}
+
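+/*
+ * Notifications and watch errors are delivered to the registered
+ * callbacks from osdc->notify_wq via struct linger_work.
+ */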
+struct linger_work {
+       struct work_struct work;
+       struct ceph_osd_linger_request *lreq;
+       struct list_head pending_item;
+       unsigned long queued_stamp;
+
+       union {
+               struct {
+                       u64 notify_id;
+                       u64 notifier_id;
+                       void *payload; /* points into @msg front */
+                       size_t payload_len;
+
+                       struct ceph_msg *msg; /* for ceph_msg_put() */
+               } notify;
+               struct {
+                       int err;
+               } error;
+       };
+};
+
+static struct linger_work *lwork_alloc(struct ceph_osd_linger_request *lreq,
+                                      work_func_t workfn)
+{
+       struct linger_work *lwork;
+
+       lwork = kzalloc(sizeof(*lwork), GFP_NOIO);
+       if (!lwork)
+               return NULL;
+
+       INIT_WORK(&lwork->work, workfn);
+       INIT_LIST_HEAD(&lwork->pending_item);
+       lwork->lreq = linger_get(lreq);
+
+       return lwork;
+}
+
+static void lwork_free(struct linger_work *lwork)
+{
+       struct ceph_osd_linger_request *lreq = lwork->lreq;
+
+       mutex_lock(&lreq->lock);
+       list_del(&lwork->pending_item);
+       mutex_unlock(&lreq->lock);
+
+       linger_put(lreq);
+       kfree(lwork);
+}
+
+static void lwork_queue(struct linger_work *lwork)
+{
+       struct ceph_osd_linger_request *lreq = lwork->lreq;
+       struct ceph_osd_client *osdc = lreq->osdc;
+
+       verify_lreq_locked(lreq);
+       WARN_ON(!list_empty(&lwork->pending_item));
+
+       lwork->queued_stamp = jiffies;
+       list_add_tail(&lwork->pending_item, &lreq->pending_lworks);
+       queue_work(osdc->notify_wq, &lwork->work);
+}
+
+static void do_watch_notify(struct work_struct *w)
+{
+       struct linger_work *lwork = container_of(w, struct linger_work, work);
+       struct ceph_osd_linger_request *lreq = lwork->lreq;
+
+       if (!linger_registered(lreq)) {
+               dout("%s lreq %p not registered\n", __func__, lreq);
+               goto out;
        }
-       if (req->r_target_oid.name_len == 0) {
-               ceph_oid_copy(&req->r_target_oid, &req->r_base_oid);
-               need_check_tiering = true;
+
+       WARN_ON(!lreq->is_watch);
+       dout("%s lreq %p notify_id %llu notifier_id %llu payload_len %zu\n",
+            __func__, lreq, lwork->notify.notify_id, lwork->notify.notifier_id,
+            lwork->notify.payload_len);
+       lreq->wcb(lreq->data, lwork->notify.notify_id, lreq->linger_id,
+                 lwork->notify.notifier_id, lwork->notify.payload,
+                 lwork->notify.payload_len);
+
+out:
+       ceph_msg_put(lwork->notify.msg);
+       lwork_free(lwork);
+}
+
+static void do_watch_error(struct work_struct *w)
+{
+       struct linger_work *lwork = container_of(w, struct linger_work, work);
+       struct ceph_osd_linger_request *lreq = lwork->lreq;
+
+       if (!linger_registered(lreq)) {
+               dout("%s lreq %p not registered\n", __func__, lreq);
+               goto out;
        }
 
-       if (need_check_tiering &&
-           (req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
-               struct ceph_pg_pool_info *pi;
-
-               pi = ceph_pg_pool_by_id(osdmap, req->r_target_oloc.pool);
-               if (pi) {
-                       if ((req->r_flags & CEPH_OSD_FLAG_READ) &&
-                           pi->read_tier >= 0)
-                               req->r_target_oloc.pool = pi->read_tier;
-                       if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
-                           pi->write_tier >= 0)
-                               req->r_target_oloc.pool = pi->write_tier;
+       dout("%s lreq %p err %d\n", __func__, lreq, lwork->error.err);
+       lreq->errcb(lreq->data, lreq->linger_id, lwork->error.err);
+
+out:
+       lwork_free(lwork);
+}
+
+static void queue_watch_error(struct ceph_osd_linger_request *lreq)
+{
+       struct linger_work *lwork;
+
+       lwork = lwork_alloc(lreq, do_watch_error);
+       if (!lwork) {
+               pr_err("failed to allocate error-lwork\n");
+               return;
+       }
+
+       lwork->error.err = lreq->last_error;
+       lwork_queue(lwork);
+}
+
+static void linger_reg_commit_complete(struct ceph_osd_linger_request *lreq,
+                                      int result)
+{
+       if (!completion_done(&lreq->reg_commit_wait)) {
+               lreq->reg_commit_error = (result <= 0 ? result : 0);
+               complete_all(&lreq->reg_commit_wait);
+       }
+}
+
+static void linger_commit_cb(struct ceph_osd_request *req)
+{
+       struct ceph_osd_linger_request *lreq = req->r_priv;
+
+       mutex_lock(&lreq->lock);
+       dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq,
+            lreq->linger_id, req->r_result);
+       WARN_ON(!__linger_registered(lreq));
+       linger_reg_commit_complete(lreq, req->r_result);
+       lreq->committed = true;
+
+       if (!lreq->is_watch) {
+               struct ceph_osd_data *osd_data =
+                   osd_req_op_data(req, 0, notify, response_data);
+               void *p = page_address(osd_data->pages[0]);
+
+               WARN_ON(req->r_ops[0].op != CEPH_OSD_OP_NOTIFY ||
+                       osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
+
+               /* make note of the notify_id */
+               if (req->r_ops[0].outdata_len >= sizeof(u64)) {
+                       lreq->notify_id = ceph_decode_64(&p);
+                       dout("lreq %p notify_id %llu\n", lreq,
+                            lreq->notify_id);
+               } else {
+                       dout("lreq %p no notify_id\n", lreq);
                }
-               /* !pi is caught in ceph_oloc_oid_to_pg() */
        }
 
-       return ceph_oloc_oid_to_pg(osdmap, &req->r_target_oloc,
-                                  &req->r_target_oid, pg_out);
+       mutex_unlock(&lreq->lock);
+       linger_put(lreq);
 }
 
-static void __enqueue_request(struct ceph_osd_request *req)
+static int normalize_watch_error(int err)
 {
-       struct ceph_osd_client *osdc = req->r_osdc;
+       /*
+        * Translate ENOENT -> ENOTCONN so that a delete->disconnection
+        * notification and a failure to reconnect because we raced with
+        * the delete appear the same to the user.
+        */
+       if (err == -ENOENT)
+               err = -ENOTCONN;
 
-       dout("%s %p tid %llu to osd%d\n", __func__, req, req->r_tid,
-            req->r_osd ? req->r_osd->o_osd : -1);
+       return err;
+}
+
+static void linger_reconnect_cb(struct ceph_osd_request *req)
+{
+       struct ceph_osd_linger_request *lreq = req->r_priv;
+
+       mutex_lock(&lreq->lock);
+       dout("%s lreq %p linger_id %llu result %d last_error %d\n", __func__,
+            lreq, lreq->linger_id, req->r_result, lreq->last_error);
+       if (req->r_result < 0) {
+               if (!lreq->last_error) {
+                       lreq->last_error = normalize_watch_error(req->r_result);
+                       queue_watch_error(lreq);
+               }
+       }
 
-       if (req->r_osd) {
-               __remove_osd_from_lru(req->r_osd);
-               list_add_tail(&req->r_osd_item, &req->r_osd->o_requests);
-               list_move_tail(&req->r_req_lru_item, &osdc->req_unsent);
+       mutex_unlock(&lreq->lock);
+       linger_put(lreq);
+}
+
+static void send_linger(struct ceph_osd_linger_request *lreq)
+{
+       struct ceph_osd_request *req = lreq->reg_req;
+       struct ceph_osd_req_op *op = &req->r_ops[0];
+
+       verify_osdc_wrlocked(req->r_osdc);
+       dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
+
+       if (req->r_osd)
+               cancel_linger_request(req);
+
+       request_reinit(req);
+       ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
+       ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
+       req->r_flags = lreq->t.flags;
+       req->r_mtime = lreq->mtime;
+
+       mutex_lock(&lreq->lock);
+       if (lreq->is_watch && lreq->committed) {
+               WARN_ON(op->op != CEPH_OSD_OP_WATCH ||
+                       op->watch.cookie != lreq->linger_id);
+               op->watch.op = CEPH_OSD_WATCH_OP_RECONNECT;
+               op->watch.gen = ++lreq->register_gen;
+               dout("lreq %p reconnect register_gen %u\n", lreq,
+                    op->watch.gen);
+               req->r_callback = linger_reconnect_cb;
        } else {
-               list_move_tail(&req->r_req_lru_item, &osdc->req_notarget);
+               if (!lreq->is_watch)
+                       lreq->notify_id = 0;
+               else
+                       WARN_ON(op->watch.op != CEPH_OSD_WATCH_OP_WATCH);
+               dout("lreq %p register\n", lreq);
+               req->r_callback = linger_commit_cb;
        }
+       mutex_unlock(&lreq->lock);
+
+       req->r_priv = linger_get(lreq);
+       req->r_linger = true;
+
+       submit_request(req, true);
 }
 
-/*
- * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
- * (as needed), and set the request r_osd appropriately.  If there is
- * no up osd, set r_osd to NULL.  Move the request to the appropriate list
- * (unsent, homeless) or leave on in-flight lru.
- *
- * Return 0 if unchanged, 1 if changed, or negative on error.
- *
- * Caller should hold map_sem for read and request_mutex.
- */
-static int __map_request(struct ceph_osd_client *osdc,
-                        struct ceph_osd_request *req, int force_resend)
+static void linger_ping_cb(struct ceph_osd_request *req)
 {
-       struct ceph_pg pgid;
-       int acting[CEPH_PG_MAX_SIZE];
-       int num, o;
-       int err;
-       bool was_paused;
-
-       dout("map_request %p tid %lld\n", req, req->r_tid);
-
-       err = __calc_request_pg(osdc->osdmap, req, &pgid);
-       if (err) {
-               list_move(&req->r_req_lru_item, &osdc->req_notarget);
-               return err;
-       }
-       req->r_pgid = pgid;
-
-       num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o);
-       if (num < 0)
-               num = 0;
-
-       was_paused = req->r_paused;
-       req->r_paused = __req_should_be_paused(osdc, req);
-       if (was_paused && !req->r_paused)
-               force_resend = 1;
-
-       if ((!force_resend &&
-            req->r_osd && req->r_osd->o_osd == o &&
-            req->r_sent >= req->r_osd->o_incarnation &&
-            req->r_num_pg_osds == num &&
-            memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
-           (req->r_osd == NULL && o == -1) ||
-           req->r_paused)
-               return 0;  /* no change */
-
-       dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n",
-            req->r_tid, pgid.pool, pgid.seed, o,
-            req->r_osd ? req->r_osd->o_osd : -1);
-
-       /* record full pg acting set */
-       memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
-       req->r_num_pg_osds = num;
-
-       if (req->r_osd) {
-               __cancel_request(req);
-               list_del_init(&req->r_osd_item);
-               list_del_init(&req->r_linger_osd_item);
-               req->r_osd = NULL;
-       }
-
-       req->r_osd = __lookup_osd(osdc, o);
-       if (!req->r_osd && o >= 0) {
-               err = -ENOMEM;
-               req->r_osd = create_osd(osdc, o);
-               if (!req->r_osd) {
-                       list_move(&req->r_req_lru_item, &osdc->req_notarget);
-                       goto out;
+       struct ceph_osd_linger_request *lreq = req->r_priv;
+
+       mutex_lock(&lreq->lock);
+       dout("%s lreq %p linger_id %llu result %d ping_sent %lu last_error %d\n",
+            __func__, lreq, lreq->linger_id, req->r_result, lreq->ping_sent,
+            lreq->last_error);
+       if (lreq->register_gen == req->r_ops[0].watch.gen) {
+               if (!req->r_result) {
+                       lreq->watch_valid_thru = lreq->ping_sent;
+               } else if (!lreq->last_error) {
+                       lreq->last_error = normalize_watch_error(req->r_result);
+                       queue_watch_error(lreq);
                }
+       } else {
+               dout("lreq %p register_gen %u ignoring old pong %u\n", lreq,
+                    lreq->register_gen, req->r_ops[0].watch.gen);
+       }
 
-               dout("map_request osd %p is osd%d\n", req->r_osd, o);
-               __insert_osd(osdc, req->r_osd);
+       mutex_unlock(&lreq->lock);
+       linger_put(lreq);
+}
+
+static void send_linger_ping(struct ceph_osd_linger_request *lreq)
+{
+       struct ceph_osd_client *osdc = lreq->osdc;
+       struct ceph_osd_request *req = lreq->ping_req;
+       struct ceph_osd_req_op *op = &req->r_ops[0];
 
-               ceph_con_open(&req->r_osd->o_con,
-                             CEPH_ENTITY_TYPE_OSD, o,
-                             &osdc->osdmap->osd_addr[o]);
+       if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) {
+               dout("%s PAUSERD\n", __func__);
+               return;
        }
 
-       __enqueue_request(req);
-       err = 1;   /* osd or pg changed */
+       lreq->ping_sent = jiffies;
+       dout("%s lreq %p linger_id %llu ping_sent %lu register_gen %u\n",
+            __func__, lreq, lreq->linger_id, lreq->ping_sent,
+            lreq->register_gen);
 
-out:
-       return err;
+       if (req->r_osd)
+               cancel_linger_request(req);
+
+       request_reinit(req);
+       target_copy(&req->r_t, &lreq->t);
+
+       WARN_ON(op->op != CEPH_OSD_OP_WATCH ||
+               op->watch.cookie != lreq->linger_id ||
+               op->watch.op != CEPH_OSD_WATCH_OP_PING);
+       op->watch.gen = lreq->register_gen;
+       req->r_callback = linger_ping_cb;
+       req->r_priv = linger_get(lreq);
+       req->r_linger = true;
+
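+       /*
+        * Open-coded submit_request(): the ping must go to the OSD the
+        * watch is linked to, so skip the usual target (re)calculation.
+        */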
+       ceph_osdc_get_request(req);
+       account_request(req);
+       req->r_tid = atomic64_inc_return(&osdc->last_tid);
+       link_request(lreq->osd, req);
+       send_request(req);
+}
+
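+/*
+ * Pick an OSD session for @lreq (possibly the homeless one) and send
+ * the initial watch/notify registration.
+ */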
+static void linger_submit(struct ceph_osd_linger_request *lreq)
+{
+       struct ceph_osd_client *osdc = lreq->osdc;
+       struct ceph_osd *osd;
+
+       calc_target(osdc, &lreq->t, &lreq->last_force_resend, false);
+       osd = lookup_create_osd(osdc, lreq->t.osd, true);
+       link_linger(osd, lreq);
+
+       send_linger(lreq);
+}
+
+static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq)
+{
+       struct ceph_osd_client *osdc = lreq->osdc;
+       struct ceph_osd_linger_request *lookup_lreq;
+
+       verify_osdc_wrlocked(osdc);
+
+       lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
+                                      lreq->linger_id);
+       if (!lookup_lreq)
+               return;
+
+       WARN_ON(lookup_lreq != lreq);
+       erase_linger_mc(&osdc->linger_map_checks, lreq);
+       linger_put(lreq);
 }
 
 /*
- * caller should hold map_sem (for read) and request_mutex
+ * @lreq has to be both registered and linked.
  */
-static void __send_request(struct ceph_osd_client *osdc,
-                          struct ceph_osd_request *req)
+static void __linger_cancel(struct ceph_osd_linger_request *lreq)
+{
+       if (lreq->is_watch && lreq->ping_req->r_osd)
+               cancel_linger_request(lreq->ping_req);
+       if (lreq->reg_req->r_osd)
+               cancel_linger_request(lreq->reg_req);
+       cancel_linger_map_check(lreq);
+       unlink_linger(lreq->osd, lreq);
+       linger_unregister(lreq);
+}
+
+static void linger_cancel(struct ceph_osd_linger_request *lreq)
+{
+       struct ceph_osd_client *osdc = lreq->osdc;
+
+       down_write(&osdc->lock);
+       if (__linger_registered(lreq))
+               __linger_cancel(lreq);
+       up_write(&osdc->lock);
+}
+
+static void send_linger_map_check(struct ceph_osd_linger_request *lreq);
+
+static void check_linger_pool_dne(struct ceph_osd_linger_request *lreq)
 {
-       void *p;
+       struct ceph_osd_client *osdc = lreq->osdc;
+       struct ceph_osdmap *map = osdc->osdmap;
+
+       verify_osdc_wrlocked(osdc);
+       WARN_ON(!map->epoch);
+
+       if (lreq->register_gen) {
+               lreq->map_dne_bound = map->epoch;
+               dout("%s lreq %p linger_id %llu pool disappeared\n", __func__,
+                    lreq, lreq->linger_id);
+       } else {
+               dout("%s lreq %p linger_id %llu map_dne_bound %u have %u\n",
+                    __func__, lreq, lreq->linger_id, lreq->map_dne_bound,
+                    map->epoch);
+       }
+
+       if (lreq->map_dne_bound) {
+               if (map->epoch >= lreq->map_dne_bound) {
+                       /* we had a new enough map */
+                       pr_info("linger_id %llu pool does not exist\n",
+                               lreq->linger_id);
+                       linger_reg_commit_complete(lreq, -ENOENT);
+                       __linger_cancel(lreq);
+               }
+       } else {
+               send_linger_map_check(lreq);
+       }
+}
+
+static void linger_map_check_cb(struct ceph_mon_generic_request *greq)
+{
+       struct ceph_osd_client *osdc = &greq->monc->client->osdc;
+       struct ceph_osd_linger_request *lreq;
+       u64 linger_id = greq->private_data;
+
+       WARN_ON(greq->result || !greq->u.newest);
 
-       dout("send_request %p tid %llu to osd%d flags %d pg %lld.%x\n",
-            req, req->r_tid, req->r_osd->o_osd, req->r_flags,
-            (unsigned long long)req->r_pgid.pool, req->r_pgid.seed);
+       down_write(&osdc->lock);
+       lreq = lookup_linger_mc(&osdc->linger_map_checks, linger_id);
+       if (!lreq) {
+               dout("%s linger_id %llu dne\n", __func__, linger_id);
+               goto out_unlock;
+       }
 
-       /* fill in message content that changes each time we send it */
-       put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch);
-       put_unaligned_le32(req->r_flags, req->r_request_flags);
-       put_unaligned_le64(req->r_target_oloc.pool, req->r_request_pool);
-       p = req->r_request_pgid;
-       ceph_encode_64(&p, req->r_pgid.pool);
-       ceph_encode_32(&p, req->r_pgid.seed);
-       put_unaligned_le64(1, req->r_request_attempts);  /* FIXME */
-       memcpy(req->r_request_reassert_version, &req->r_reassert_version,
-              sizeof(req->r_reassert_version));
+       dout("%s lreq %p linger_id %llu map_dne_bound %u newest %llu\n",
+            __func__, lreq, lreq->linger_id, lreq->map_dne_bound,
+            greq->u.newest);
+       if (!lreq->map_dne_bound)
+               lreq->map_dne_bound = greq->u.newest;
+       erase_linger_mc(&osdc->linger_map_checks, lreq);
+       check_linger_pool_dne(lreq);
 
-       req->r_stamp = jiffies;
-       list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
+       linger_put(lreq);
+out_unlock:
+       up_write(&osdc->lock);
+}
+
+static void send_linger_map_check(struct ceph_osd_linger_request *lreq)
+{
+       struct ceph_osd_client *osdc = lreq->osdc;
+       struct ceph_osd_linger_request *lookup_lreq;
+       int ret;
 
-       ceph_msg_get(req->r_request); /* send consumes a ref */
+       verify_osdc_wrlocked(osdc);
 
-       req->r_sent = req->r_osd->o_incarnation;
+       lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
+                                      lreq->linger_id);
+       if (lookup_lreq) {
+               WARN_ON(lookup_lreq != lreq);
+               return;
+       }
 
-       ceph_con_send(&req->r_osd->o_con, req->r_request);
+       linger_get(lreq);
+       insert_linger_mc(&osdc->linger_map_checks, lreq);
+       ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
+                                         linger_map_check_cb, lreq->linger_id);
+       WARN_ON(ret);
 }
 
-/*
- * Send any requests in the queue (req_unsent).
- */
-static void __send_queued(struct ceph_osd_client *osdc)
+static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq)
 {
-       struct ceph_osd_request *req, *tmp;
+       int ret;
 
-       dout("__send_queued\n");
-       list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item)
-               __send_request(osdc, req);
+       dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
+       ret = wait_for_completion_interruptible(&lreq->reg_commit_wait);
+       return ret ?: lreq->reg_commit_error;
 }
 
-/*
- * Caller should hold map_sem for read and request_mutex.
- */
-static int __ceph_osdc_start_request(struct ceph_osd_client *osdc,
-                                    struct ceph_osd_request *req,
-                                    bool nofail)
-{
-       int rc;
-
-       __register_request(osdc, req);
-       req->r_sent = 0;
-       req->r_got_reply = 0;
-       rc = __map_request(osdc, req, 0);
-       if (rc < 0) {
-               if (nofail) {
-                       dout("osdc_start_request failed map, "
-                               " will retry %lld\n", req->r_tid);
-                       rc = 0;
-               } else {
-                       __unregister_request(osdc, req);
-               }
-               return rc;
-       }
-
-       if (req->r_osd == NULL) {
-               dout("send_request %p no up osds in pg\n", req);
-               ceph_monc_request_next_osdmap(&osdc->client->monc);
-       } else {
-               __send_queued(osdc);
-       }
+static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq)
+{
+       int ret;
 
-       return 0;
+       dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
+       ret = wait_for_completion_interruptible(&lreq->notify_finish_wait);
+       return ret ?: lreq->notify_finish_error;
 }
 
 /*
- * Timeout callback, called every N seconds when 1 or more osd
- * requests has been active for more than N seconds.  When this
- * happens, we ping all OSDs with requests who have timed out to
- * ensure any communications channel reset is detected.  Reset the
- * request timeouts another N seconds in the future as we go.
- * Reschedule the timeout event another N seconds in future (unless
- * there are no open requests).
+ * Timeout callback, called every N seconds.  When 1 or more OSD
+ * requests have been active for more than N seconds, we send a keepalive
+ * (tag + timestamp) to their OSDs to ensure any communications channel
+ * reset is detected.
  */
 static void handle_timeout(struct work_struct *work)
 {
        struct ceph_osd_client *osdc =
                container_of(work, struct ceph_osd_client, timeout_work.work);
        struct ceph_options *opts = osdc->client->options;
-       struct ceph_osd_request *req;
-       struct ceph_osd *osd;
-       struct list_head slow_osds;
-       dout("timeout\n");
-       down_read(&osdc->map_sem);
+       unsigned long cutoff = jiffies - opts->osd_keepalive_timeout;
+       LIST_HEAD(slow_osds);
+       struct rb_node *n, *p;
 
-       ceph_monc_request_next_osdmap(&osdc->client->monc);
-
-       mutex_lock(&osdc->request_mutex);
+       dout("%s osdc %p\n", __func__, osdc);
+       down_write(&osdc->lock);
 
        /*
         * ping osds that are a bit slow.  this ensures that if there
         * is a break in the TCP connection we will notice, and reopen
         * a connection with that osd (from the fault callback).
         */
-       INIT_LIST_HEAD(&slow_osds);
-       list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
-               if (time_before(jiffies,
-                               req->r_stamp + opts->osd_keepalive_timeout))
-                       break;
+       for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+               struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+               bool found = false;
+
+               for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
+                       struct ceph_osd_request *req =
+                           rb_entry(p, struct ceph_osd_request, r_node);
+
+                       if (time_before(req->r_stamp, cutoff)) {
+                               dout(" req %p tid %llu on osd%d is laggy\n",
+                                    req, req->r_tid, osd->o_osd);
+                               found = true;
+                       }
+               }
+               for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) {
+                       struct ceph_osd_linger_request *lreq =
+                           rb_entry(p, struct ceph_osd_linger_request, node);
+
+                       dout(" lreq %p linger_id %llu is served by osd%d\n",
+                            lreq, lreq->linger_id, osd->o_osd);
+                       found = true;
+
+                       mutex_lock(&lreq->lock);
+                       if (lreq->is_watch && lreq->committed && !lreq->last_error)
+                               send_linger_ping(lreq);
+                       mutex_unlock(&lreq->lock);
+               }
 
-               osd = req->r_osd;
-               BUG_ON(!osd);
-               dout(" tid %llu is slow, will send keepalive on osd%d\n",
-                    req->r_tid, osd->o_osd);
-               list_move_tail(&osd->o_keepalive_item, &slow_osds);
+               if (found)
+                       list_move_tail(&osd->o_keepalive_item, &slow_osds);
        }
+
+       if (atomic_read(&osdc->num_homeless) || !list_empty(&slow_osds))
+               maybe_request_map(osdc);
+
        while (!list_empty(&slow_osds)) {
-               osd = list_entry(slow_osds.next, struct ceph_osd,
-                                o_keepalive_item);
+               struct ceph_osd *osd = list_first_entry(&slow_osds,
+                                                       struct ceph_osd,
+                                                       o_keepalive_item);
                list_del_init(&osd->o_keepalive_item);
                ceph_con_keepalive(&osd->o_con);
        }
 
-       __schedule_osd_timeout(osdc);
-       __send_queued(osdc);
-       mutex_unlock(&osdc->request_mutex);
-       up_read(&osdc->map_sem);
+       up_write(&osdc->lock);
+       schedule_delayed_work(&osdc->timeout_work,
+                             osdc->client->options->osd_keepalive_timeout);
 }
 
 static void handle_osds_timeout(struct work_struct *work)
@@ -1663,12 +2541,20 @@ static void handle_osds_timeout(struct work_struct *work)
                container_of(work, struct ceph_osd_client,
                             osds_timeout_work.work);
        unsigned long delay = osdc->client->options->osd_idle_ttl / 4;
+       struct ceph_osd *osd, *nosd;
+
+       dout("%s osdc %p\n", __func__, osdc);
+       down_write(&osdc->lock);
+       list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
+               if (time_before(jiffies, osd->lru_ttl))
+                       break;
 
-       dout("osds timeout\n");
-       down_read(&osdc->map_sem);
-       remove_old_osds(osdc);
-       up_read(&osdc->map_sem);
+               WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
+               WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
+               close_osd(osd);
+       }
 
+       up_write(&osdc->lock);
        schedule_delayed_work(&osdc->osds_timeout_work,
                              round_jiffies_relative(delay));
 }
@@ -1776,107 +2662,76 @@ e_inval:
        goto out;
 }
 
-static void complete_request(struct ceph_osd_request *req)
-{
-       complete_all(&req->r_safe_completion);  /* fsync waiter */
-}
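+/* decoded fields of an incoming MOSDOpReply message */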
+struct MOSDOpReply {
+       struct ceph_pg pgid;
+       u64 flags;
+       int result;
+       u32 epoch;
+       int num_ops;
+       u32 outdata_len[CEPH_OSD_MAX_OPS];
+       s32 rval[CEPH_OSD_MAX_OPS];
+       int retry_attempt;
+       struct ceph_eversion replay_version;
+       u64 user_version;
+       struct ceph_request_redirect redirect;
+};
 
-/*
- * handle osd op reply.  either call the callback if it is specified,
- * or do the completion to wake up the waiting thread.
- */
-static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
+static int decode_MOSDOpReply(const struct ceph_msg *msg, struct MOSDOpReply *m)
 {
-       void *p, *end;
-       struct ceph_osd_request *req;
-       struct ceph_request_redirect redir;
-       u64 tid;
-       int object_len;
-       unsigned int numops;
-       int payload_len, flags;
-       s32 result;
-       s32 retry_attempt;
-       struct ceph_pg pg;
-       int err;
-       u32 reassert_epoch;
-       u64 reassert_version;
-       u32 osdmap_epoch;
-       int already_completed;
-       u32 bytes;
+       void *p = msg->front.iov_base;
+       void *const end = p + msg->front.iov_len;
+       u16 version = le16_to_cpu(msg->hdr.version);
+       struct ceph_eversion bad_replay_version;
        u8 decode_redir;
-       unsigned int i;
-
-       tid = le64_to_cpu(msg->hdr.tid);
-       dout("handle_reply %p tid %llu\n", msg, tid);
+       u32 len;
+       int ret;
+       int i;
 
-       p = msg->front.iov_base;
-       end = p + msg->front.iov_len;
+       ceph_decode_32_safe(&p, end, len, e_inval);
+       ceph_decode_need(&p, end, len, e_inval);
+       p += len; /* skip oid */
 
-       ceph_decode_need(&p, end, 4, bad);
-       object_len = ceph_decode_32(&p);
-       ceph_decode_need(&p, end, object_len, bad);
-       p += object_len;
+       ret = ceph_decode_pgid(&p, end, &m->pgid);
+       if (ret)
+               return ret;
 
-       err = ceph_decode_pgid(&p, end, &pg);
-       if (err)
-               goto bad;
+       ceph_decode_64_safe(&p, end, m->flags, e_inval);
+       ceph_decode_32_safe(&p, end, m->result, e_inval);
+       ceph_decode_need(&p, end, sizeof(bad_replay_version), e_inval);
+       memcpy(&bad_replay_version, p, sizeof(bad_replay_version));
+       p += sizeof(bad_replay_version);
+       ceph_decode_32_safe(&p, end, m->epoch, e_inval);
 
-       ceph_decode_need(&p, end, 8 + 4 + 4 + 8 + 4, bad);
-       flags = ceph_decode_64(&p);
-       result = ceph_decode_32(&p);
-       reassert_epoch = ceph_decode_32(&p);
-       reassert_version = ceph_decode_64(&p);
-       osdmap_epoch = ceph_decode_32(&p);
-
-       /* lookup */
-       down_read(&osdc->map_sem);
-       mutex_lock(&osdc->request_mutex);
-       req = __lookup_request(osdc, tid);
-       if (req == NULL) {
-               dout("handle_reply tid %llu dne\n", tid);
-               goto bad_mutex;
-       }
-       ceph_osdc_get_request(req);
+       ceph_decode_32_safe(&p, end, m->num_ops, e_inval);
+       if (m->num_ops > ARRAY_SIZE(m->outdata_len))
+               goto e_inval;
 
-       dout("handle_reply %p tid %llu req %p result %d\n", msg, tid,
-            req, result);
-
-       ceph_decode_need(&p, end, 4, bad_put);
-       numops = ceph_decode_32(&p);
-       if (numops > CEPH_OSD_MAX_OPS)
-               goto bad_put;
-       if (numops != req->r_num_ops)
-               goto bad_put;
-       payload_len = 0;
-       ceph_decode_need(&p, end, numops * sizeof(struct ceph_osd_op), bad_put);
-       for (i = 0; i < numops; i++) {
+       ceph_decode_need(&p, end, m->num_ops * sizeof(struct ceph_osd_op),
+                        e_inval);
+       for (i = 0; i < m->num_ops; i++) {
                struct ceph_osd_op *op = p;
-               int len;
 
-               len = le32_to_cpu(op->payload_len);
-               req->r_ops[i].outdata_len = len;
-               dout(" op %d has %d bytes\n", i, len);
-               payload_len += len;
+               m->outdata_len[i] = le32_to_cpu(op->payload_len);
                p += sizeof(*op);
        }
-       bytes = le32_to_cpu(msg->hdr.data_len);
-       if (payload_len != bytes) {
-               pr_warn("sum of op payload lens %d != data_len %d\n",
-                       payload_len, bytes);
-               goto bad_put;
-       }
 
-       ceph_decode_need(&p, end, 4 + numops * 4, bad_put);
-       retry_attempt = ceph_decode_32(&p);
-       for (i = 0; i < numops; i++)
-               req->r_ops[i].rval = ceph_decode_32(&p);
+       ceph_decode_32_safe(&p, end, m->retry_attempt, e_inval);
+       for (i = 0; i < m->num_ops; i++)
+               ceph_decode_32_safe(&p, end, m->rval[i], e_inval);
 
-       if (le16_to_cpu(msg->hdr.version) >= 6) {
-               p += 8 + 4; /* skip replay_version */
-               p += 8; /* skip user_version */
+       if (version >= 5) {
+               ceph_decode_need(&p, end, sizeof(m->replay_version), e_inval);
+               memcpy(&m->replay_version, p, sizeof(m->replay_version));
+               p += sizeof(m->replay_version);
+               ceph_decode_64_safe(&p, end, m->user_version, e_inval);
+       } else {
+               m->replay_version = bad_replay_version; /* struct */
+               m->user_version = le64_to_cpu(m->replay_version.version);
+       }
 
-               if (le16_to_cpu(msg->hdr.version) >= 7)
-                       ceph_decode_8_safe(&p, end, decode_redir, bad_put);
+       if (version >= 6) {
+               if (version >= 7)
+                       ceph_decode_8_safe(&p, end, decode_redir, e_inval);
                else
                        decode_redir = 1;
        } else {
@@ -1884,228 +2739,410 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
        }
 
        if (decode_redir) {
-               err = ceph_redirect_decode(&p, end, &redir);
-               if (err)
-                       goto bad_put;
+               ret = ceph_redirect_decode(&p, end, &m->redirect);
+               if (ret)
+                       return ret;
        } else {
-               redir.oloc.pool = -1;
+               ceph_oloc_init(&m->redirect.oloc);
        }
 
-       if (redir.oloc.pool != -1) {
-               dout("redirect pool %lld\n", redir.oloc.pool);
+       return 0;
 
-               __unregister_request(osdc, req);
+e_inval:
+       return -EINVAL;
+}
 
-               req->r_target_oloc = redir.oloc; /* struct */
+/*
+ * We are done with @req if
+ *   - @m is a safe reply, or
+ *   - @m is an unsafe reply and we didn't want a safe one
+ */
+static bool done_request(const struct ceph_osd_request *req,
+                        const struct MOSDOpReply *m)
+{
+       return (m->result < 0 ||
+               (m->flags & CEPH_OSD_FLAG_ONDISK) ||
+               !(req->r_flags & CEPH_OSD_FLAG_ONDISK));
+}
 
-               /*
-                * Start redirect requests with nofail=true.  If
-                * mapping fails, request will end up on the notarget
-                * list, waiting for the new osdmap (which can take
-                * a while), even though the original request mapped
-                * successfully.  In the future we might want to follow
-                * original request's nofail setting here.
-                */
-               err = __ceph_osdc_start_request(osdc, req, true);
-               BUG_ON(err);
+/*
+ * handle osd op reply.  either call the callback if it is specified,
+ * or do the completion to wake up the waiting thread.
+ *
+ * ->r_unsafe_callback is set? yes                     no
+ *
+ * first reply is OK (needed   r_cb/r_completion,      r_cb/r_completion,
+ * any or needed/got safe)     r_safe_completion       r_safe_completion
+ *
+ * first reply is unsafe       r_unsafe_cb(true)       (nothing)
+ *
+ * when we get the safe reply  r_unsafe_cb(false),     r_cb/r_completion,
+ *                             r_safe_completion       r_safe_completion
+ */
+static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
+{
+       struct ceph_osd_client *osdc = osd->o_osdc;
+       struct ceph_osd_request *req;
+       struct MOSDOpReply m;
+       u64 tid = le64_to_cpu(msg->hdr.tid);
+       u32 data_len = 0;
+       bool already_acked;
+       int ret;
+       int i;
 
-               goto out_unlock;
-       }
+       dout("%s msg %p tid %llu\n", __func__, msg, tid);
 
-       already_completed = req->r_got_reply;
-       if (!req->r_got_reply) {
-               req->r_result = result;
-               dout("handle_reply result %d bytes %d\n", req->r_result,
-                    bytes);
-               if (req->r_result == 0)
-                       req->r_result = bytes;
+       down_read(&osdc->lock);
+       if (!osd_registered(osd)) {
+               dout("%s osd%d unknown\n", __func__, osd->o_osd);
+               goto out_unlock_osdc;
+       }
+       WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num));
 
-               /* in case this is a write and we need to replay, */
-               req->r_reassert_version.epoch = cpu_to_le32(reassert_epoch);
-               req->r_reassert_version.version = cpu_to_le64(reassert_version);
+       mutex_lock(&osd->lock);
+       req = lookup_request(&osd->o_requests, tid);
+       if (!req) {
+               dout("%s osd%d tid %llu unknown\n", __func__, osd->o_osd, tid);
+               goto out_unlock_session;
+       }
 
-               req->r_got_reply = 1;
-       } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
-               dout("handle_reply tid %llu dup ack\n", tid);
-               goto out_unlock;
+       ret = decode_MOSDOpReply(msg, &m);
+       if (ret) {
+               pr_err("failed to decode MOSDOpReply for tid %llu: %d\n",
+                      req->r_tid, ret);
+               ceph_msg_dump(msg);
+               goto fail_request;
+       }
+       dout("%s req %p tid %llu flags 0x%llx pgid %llu.%x epoch %u attempt %d v %u'%llu uv %llu\n",
+            __func__, req, req->r_tid, m.flags, m.pgid.pool, m.pgid.seed,
+            m.epoch, m.retry_attempt, le32_to_cpu(m.replay_version.epoch),
+            le64_to_cpu(m.replay_version.version), m.user_version);
+
+       if (m.retry_attempt >= 0) {
+               if (m.retry_attempt != req->r_attempts - 1) {
+                       dout("req %p tid %llu retry_attempt %d != %d, ignoring\n",
+                            req, req->r_tid, m.retry_attempt,
+                            req->r_attempts - 1);
+                       goto out_unlock_session;
+               }
+       } else {
+               WARN_ON(1); /* MOSDOpReply v4 is assumed */
        }
 
-       dout("handle_reply tid %llu flags %d\n", tid, flags);
+       if (!ceph_oloc_empty(&m.redirect.oloc)) {
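+               /* the OSD asked us to resend this request to a different pool */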
+               dout("req %p tid %llu redirect pool %lld\n", req, req->r_tid,
+                    m.redirect.oloc.pool);
+               unlink_request(osd, req);
+               mutex_unlock(&osd->lock);
+
+               ceph_oloc_copy(&req->r_t.target_oloc, &m.redirect.oloc);
+               req->r_flags |= CEPH_OSD_FLAG_REDIRECTED;
+               req->r_tid = 0;
+               __submit_request(req, false);
+               goto out_unlock_osdc;
+       }
 
-       if (req->r_linger && (flags & CEPH_OSD_FLAG_ONDISK))
-               __register_linger_request(osdc, req);
+       if (m.num_ops != req->r_num_ops) {
+               pr_err("num_ops %d != %d for tid %llu\n", m.num_ops,
+                      req->r_num_ops, req->r_tid);
+               goto fail_request;
+       }
+       for (i = 0; i < req->r_num_ops; i++) {
+               dout(" req %p tid %llu op %d rval %d len %u\n", req,
+                    req->r_tid, i, m.rval[i], m.outdata_len[i]);
+               req->r_ops[i].rval = m.rval[i];
+               req->r_ops[i].outdata_len = m.outdata_len[i];
+               data_len += m.outdata_len[i];
+       }
+       if (data_len != le32_to_cpu(msg->hdr.data_len)) {
+               pr_err("sum of lens %u != %u for tid %llu\n", data_len,
+                      le32_to_cpu(msg->hdr.data_len), req->r_tid);
+               goto fail_request;
+       }
+       dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__,
+            req, req->r_tid, req->r_got_reply, m.result, data_len);
+
+       already_acked = req->r_got_reply;
+       if (!already_acked) {
+               req->r_result = m.result ?: data_len;
+               req->r_replay_version = m.replay_version; /* struct */
+               req->r_got_reply = true;
+       } else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) {
+               dout("req %p tid %llu dup ack\n", req, req->r_tid);
+               goto out_unlock_session;
+       }
 
-       /* either this is a read, or we got the safe response */
-       if (result < 0 ||
-           (flags & CEPH_OSD_FLAG_ONDISK) ||
-           ((flags & CEPH_OSD_FLAG_WRITE) == 0))
-               __unregister_request(osdc, req);
+       if (done_request(req, &m)) {
+               __finish_request(req);
+               if (req->r_linger) {
+                       WARN_ON(req->r_unsafe_callback);
+                       dout("req %p tid %llu cb (locked)\n", req, req->r_tid);
+                       __complete_request(req);
+               }
+       }
 
-       mutex_unlock(&osdc->request_mutex);
-       up_read(&osdc->map_sem);
+       mutex_unlock(&osd->lock);
+       up_read(&osdc->lock);
 
-       if (!already_completed) {
-               if (req->r_unsafe_callback &&
-                   result >= 0 && !(flags & CEPH_OSD_FLAG_ONDISK))
+       if (done_request(req, &m)) {
+               if (already_acked && req->r_unsafe_callback) {
+                       dout("req %p tid %llu safe-cb\n", req, req->r_tid);
+                       req->r_unsafe_callback(req, false);
+               } else if (!req->r_linger) {
+                       dout("req %p tid %llu cb\n", req, req->r_tid);
+                       __complete_request(req);
+               }
+               if (m.flags & CEPH_OSD_FLAG_ONDISK)
+                       complete_all(&req->r_safe_completion);
+               ceph_osdc_put_request(req);
+       } else {
+               if (req->r_unsafe_callback) {
+                       dout("req %p tid %llu unsafe-cb\n", req, req->r_tid);
                        req->r_unsafe_callback(req, true);
-               if (req->r_callback)
-                       req->r_callback(req, msg);
-               else
-                       complete_all(&req->r_completion);
+               } else {
+                       WARN_ON(1);
+               }
        }
 
-       if (flags & CEPH_OSD_FLAG_ONDISK) {
-               if (req->r_unsafe_callback && already_completed)
-                       req->r_unsafe_callback(req, false);
-               complete_request(req);
+       return;
+
+fail_request:
+       complete_request(req, -EIO);
+out_unlock_session:
+       mutex_unlock(&osd->lock);
+out_unlock_osdc:
+       up_read(&osdc->lock);
+}
+
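+/* remember each pool's full state so that pool_cleared_full() can detect a cleared flag */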
+static void set_pool_was_full(struct ceph_osd_client *osdc)
+{
+       struct rb_node *n;
+
+       for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
+               struct ceph_pg_pool_info *pi =
+                   rb_entry(n, struct ceph_pg_pool_info, node);
+
+               pi->was_full = __pool_full(pi);
        }
+}
 
-out:
-       dout("req=%p req->r_linger=%d\n", req, req->r_linger);
-       ceph_osdc_put_request(req);
-       return;
-out_unlock:
-       mutex_unlock(&osdc->request_mutex);
-       up_read(&osdc->map_sem);
-       goto out;
+static bool pool_cleared_full(struct ceph_osd_client *osdc, s64 pool_id)
+{
+       struct ceph_pg_pool_info *pi;
 
-bad_put:
-       req->r_result = -EIO;
-       __unregister_request(osdc, req);
-       if (req->r_callback)
-               req->r_callback(req, msg);
-       else
-               complete_all(&req->r_completion);
-       complete_request(req);
-       ceph_osdc_put_request(req);
-bad_mutex:
-       mutex_unlock(&osdc->request_mutex);
-       up_read(&osdc->map_sem);
-bad:
-       pr_err("corrupt osd_op_reply got %d %d\n",
-              (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len));
-       ceph_msg_dump(msg);
+       pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
+       if (!pi)
+               return false;
+
+       return pi->was_full && !__pool_full(pi);
 }
 
-static void reset_changed_osds(struct ceph_osd_client *osdc)
+static enum calc_target_result
+recalc_linger_target(struct ceph_osd_linger_request *lreq)
 {
-       struct rb_node *p, *n;
+       struct ceph_osd_client *osdc = lreq->osdc;
+       enum calc_target_result ct_res;
 
-       dout("%s %p\n", __func__, osdc);
-       for (p = rb_first(&osdc->osds); p; p = n) {
-               struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node);
+       ct_res = calc_target(osdc, &lreq->t, &lreq->last_force_resend, true);
+       if (ct_res == CALC_TARGET_NEED_RESEND) {
+               struct ceph_osd *osd;
 
-               n = rb_next(p);
-               if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
-                   memcmp(&osd->o_con.peer_addr,
-                          ceph_osd_addr(osdc->osdmap,
-                                        osd->o_osd),
-                          sizeof(struct ceph_entity_addr)) != 0)
-                       __reset_osd(osdc, osd);
+               osd = lookup_create_osd(osdc, lreq->t.osd, true);
+               if (osd != lreq->osd) {
+                       unlink_linger(lreq->osd, lreq);
+                       link_linger(osd, lreq);
+               }
        }
+
+       return ct_res;
 }
 
 /*
- * Requeue requests whose mapping to an OSD has changed.  If requests map to
- * no osd, request a new map.
- *
- * Caller should hold map_sem for read.
+ * Requeue requests whose mapping to an OSD has changed.
  */
-static void kick_requests(struct ceph_osd_client *osdc, bool force_resend,
-                         bool force_resend_writes)
+static void scan_requests(struct ceph_osd *osd,
+                         bool force_resend,
+                         bool cleared_full,
+                         bool check_pool_cleared_full,
+                         struct rb_root *need_resend,
+                         struct list_head *need_resend_linger)
 {
-       struct ceph_osd_request *req, *nreq;
-       struct rb_node *p;
-       int needmap = 0;
-       int err;
-       bool force_resend_req;
+       struct ceph_osd_client *osdc = osd->o_osdc;
+       struct rb_node *n;
+       bool force_resend_writes;
+
+       for (n = rb_first(&osd->o_linger_requests); n; ) {
+               struct ceph_osd_linger_request *lreq =
+                   rb_entry(n, struct ceph_osd_linger_request, node);
+               enum calc_target_result ct_res;
+
+               n = rb_next(n); /* recalc_linger_target() */
+
+               dout("%s lreq %p linger_id %llu\n", __func__, lreq,
+                    lreq->linger_id);
+               ct_res = recalc_linger_target(lreq);
+               switch (ct_res) {
+               case CALC_TARGET_NO_ACTION:
+                       force_resend_writes = cleared_full ||
+                           (check_pool_cleared_full &&
+                            pool_cleared_full(osdc, lreq->t.base_oloc.pool));
+                       if (!force_resend && !force_resend_writes)
+                               break;
+
+                       /* fall through */
+               case CALC_TARGET_NEED_RESEND:
+                       cancel_linger_map_check(lreq);
+                       /*
+                        * scan_requests() for the previous epoch(s)
+                        * may have already added it to the list, since
+                        * it's not unlinked here.
+                        */
+                       if (list_empty(&lreq->scan_item))
+                               list_add_tail(&lreq->scan_item, need_resend_linger);
+                       break;
+               case CALC_TARGET_POOL_DNE:
+                       check_linger_pool_dne(lreq);
+                       break;
+               }
+       }
+
+       for (n = rb_first(&osd->o_requests); n; ) {
+               struct ceph_osd_request *req =
+                   rb_entry(n, struct ceph_osd_request, r_node);
+               enum calc_target_result ct_res;
+
+               n = rb_next(n); /* unlink_request(), check_pool_dne() */
+
+               dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
+               ct_res = calc_target(osdc, &req->r_t,
+                                    &req->r_last_force_resend, false);
+               switch (ct_res) {
+               case CALC_TARGET_NO_ACTION:
+                       force_resend_writes = cleared_full ||
+                           (check_pool_cleared_full &&
+                            pool_cleared_full(osdc, req->r_t.base_oloc.pool));
+                       if (!force_resend &&
+                           (!(req->r_flags & CEPH_OSD_FLAG_WRITE) ||
+                            !force_resend_writes))
+                               break;
+
+                       /* fall through */
+               case CALC_TARGET_NEED_RESEND:
+                       cancel_map_check(req);
+                       unlink_request(osd, req);
+                       insert_request(need_resend, req);
+                       break;
+               case CALC_TARGET_POOL_DNE:
+                       check_pool_dne(req);
+                       break;
+               }
+       }
+}
+
+static int handle_one_map(struct ceph_osd_client *osdc,
+                         void *p, void *end, bool incremental,
+                         struct rb_root *need_resend,
+                         struct list_head *need_resend_linger)
+{
+       struct ceph_osdmap *newmap;
+       struct rb_node *n;
+       bool skipped_map = false;
+       bool was_full;
 
-       dout("kick_requests %s %s\n", force_resend ? " (force resend)" : "",
-               force_resend_writes ? " (force resend writes)" : "");
-       mutex_lock(&osdc->request_mutex);
-       for (p = rb_first(&osdc->requests); p; ) {
-               req = rb_entry(p, struct ceph_osd_request, r_node);
-               p = rb_next(p);
+       was_full = ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL);
+       set_pool_was_full(osdc);
 
+       if (incremental)
+               newmap = osdmap_apply_incremental(&p, end, osdc->osdmap);
+       else
+               newmap = ceph_osdmap_decode(&p, end);
+       if (IS_ERR(newmap))
+               return PTR_ERR(newmap);
+
+       if (newmap != osdc->osdmap) {
                /*
-                * For linger requests that have not yet been
-                * registered, move them to the linger list; they'll
-                * be sent to the osd in the loop below.  Unregister
-                * the request before re-registering it as a linger
-                * request to ensure the __map_request() below
-                * will decide it needs to be sent.
+                * Preserve ->was_full before destroying the old map.
+                * For pools that weren't in the old map, ->was_full
+                * should be false.
                 */
-               if (req->r_linger && list_empty(&req->r_linger_item)) {
-                       dout("%p tid %llu restart on osd%d\n",
-                            req, req->r_tid,
-                            req->r_osd ? req->r_osd->o_osd : -1);
-                       ceph_osdc_get_request(req);
-                       __unregister_request(osdc, req);
-                       __register_linger_request(osdc, req);
-                       ceph_osdc_put_request(req);
-                       continue;
+               for (n = rb_first(&newmap->pg_pools); n; n = rb_next(n)) {
+                       struct ceph_pg_pool_info *pi =
+                           rb_entry(n, struct ceph_pg_pool_info, node);
+                       struct ceph_pg_pool_info *old_pi;
+
+                       old_pi = ceph_pg_pool_by_id(osdc->osdmap, pi->id);
+                       if (old_pi)
+                               pi->was_full = old_pi->was_full;
+                       else
+                               WARN_ON(pi->was_full);
                }
 
-               force_resend_req = force_resend ||
-                       (force_resend_writes &&
-                               req->r_flags & CEPH_OSD_FLAG_WRITE);
-               err = __map_request(osdc, req, force_resend_req);
-               if (err < 0)
-                       continue;  /* error */
-               if (req->r_osd == NULL) {
-                       dout("%p tid %llu maps to no osd\n", req, req->r_tid);
-                       needmap++;  /* request a newer map */
-               } else if (err > 0) {
-                       if (!req->r_linger) {
-                               dout("%p tid %llu requeued on osd%d\n", req,
-                                    req->r_tid,
-                                    req->r_osd ? req->r_osd->o_osd : -1);
-                               req->r_flags |= CEPH_OSD_FLAG_RETRY;
-                       }
+               if (osdc->osdmap->epoch &&
+                   osdc->osdmap->epoch + 1 < newmap->epoch) {
+                       WARN_ON(incremental);
+                       skipped_map = true;
                }
+
+               ceph_osdmap_destroy(osdc->osdmap);
+               osdc->osdmap = newmap;
        }
 
-       list_for_each_entry_safe(req, nreq, &osdc->req_linger,
-                                r_linger_item) {
-               dout("linger req=%p req->r_osd=%p\n", req, req->r_osd);
-
-               err = __map_request(osdc, req,
-                                   force_resend || force_resend_writes);
-               dout("__map_request returned %d\n", err);
-               if (err < 0)
-                       continue;  /* hrm! */
-               if (req->r_osd == NULL || err > 0) {
-                       if (req->r_osd == NULL) {
-                               dout("lingering %p tid %llu maps to no osd\n",
-                                    req, req->r_tid);
-                               /*
-                                * A homeless lingering request makes
-                                * no sense, as it's job is to keep
-                                * a particular OSD connection open.
-                                * Request a newer map and kick the
-                                * request, knowing that it won't be
-                                * resent until we actually get a map
-                                * that can tell us where to send it.
-                                */
-                               needmap++;
-                       }
+       was_full &= !ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL);
+       scan_requests(&osdc->homeless_osd, skipped_map, was_full, true,
+                     need_resend, need_resend_linger);
 
-                       dout("kicking lingering %p tid %llu osd%d\n", req,
-                            req->r_tid, req->r_osd ? req->r_osd->o_osd : -1);
-                       __register_request(osdc, req);
-                       __unregister_linger_request(osdc, req);
+       for (n = rb_first(&osdc->osds); n; ) {
+               struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+               n = rb_next(n); /* close_osd() */
+
+               scan_requests(osd, skipped_map, was_full, true, need_resend,
+                             need_resend_linger);
+               if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
+                   memcmp(&osd->o_con.peer_addr,
+                          ceph_osd_addr(osdc->osdmap, osd->o_osd),
+                          sizeof(struct ceph_entity_addr)))
+                       close_osd(osd);
+       }
+
+       return 0;
+}
+
+static void kick_requests(struct ceph_osd_client *osdc,
+                         struct rb_root *need_resend,
+                         struct list_head *need_resend_linger)
+{
+       struct ceph_osd_linger_request *lreq, *nlreq;
+       struct rb_node *n;
+
+       for (n = rb_first(need_resend); n; ) {
+               struct ceph_osd_request *req =
+                   rb_entry(n, struct ceph_osd_request, r_node);
+               struct ceph_osd *osd;
+
+               n = rb_next(n);
+               erase_request(need_resend, req); /* before link_request() */
+
+               WARN_ON(req->r_osd);
+               calc_target(osdc, &req->r_t, NULL, false);
+               osd = lookup_create_osd(osdc, req->r_t.osd, true);
+               link_request(osd, req);
+               if (!req->r_linger) {
+                       if (!osd_homeless(osd) && !req->r_t.paused)
+                               send_request(req);
+               } else {
+                       cancel_linger_request(req);
                }
        }
-       reset_changed_osds(osdc);
-       mutex_unlock(&osdc->request_mutex);
 
-       if (needmap) {
-               dout("%d requests for down osds, need new map\n", needmap);
-               ceph_monc_request_next_osdmap(&osdc->client->monc);
+       list_for_each_entry_safe(lreq, nlreq, need_resend_linger, scan_item) {
+               if (!osd_homeless(lreq->osd))
+                       send_linger(lreq);
+
+               list_del_init(&lreq->scan_item);
        }
 }
 
-
 /*
  * Process updated osd map.
  *
@@ -2115,27 +3152,31 @@ static void kick_requests(struct ceph_osd_client *osdc, bool force_resend,
  */
 void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 {
-       void *p, *end, *next;
+       void *p = msg->front.iov_base;
+       void *const end = p + msg->front.iov_len;
        u32 nr_maps, maplen;
        u32 epoch;
-       struct ceph_osdmap *newmap = NULL, *oldmap;
-       int err;
        struct ceph_fsid fsid;
-       bool was_full;
+       struct rb_root need_resend = RB_ROOT;
+       LIST_HEAD(need_resend_linger);
+       bool handled_incremental = false;
+       bool was_pauserd, was_pausewr;
+       bool pauserd, pausewr;
+       int err;
 
-       dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
-       p = msg->front.iov_base;
-       end = p + msg->front.iov_len;
+       dout("%s have %u\n", __func__, osdc->osdmap->epoch);
+       down_write(&osdc->lock);
 
        /* verify fsid */
        ceph_decode_need(&p, end, sizeof(fsid), bad);
        ceph_decode_copy(&p, &fsid, sizeof(fsid));
        if (ceph_check_fsid(osdc->client, &fsid) < 0)
-               return;
-
-       down_write(&osdc->map_sem);
+               goto bad;
 
-       was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
+       was_pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
+       was_pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) ||
+                     ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+                     have_pool_full(osdc);
 
        /* incremental maps */
        ceph_decode_32_safe(&p, end, nr_maps, bad);
@@ -2145,34 +3186,23 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
                epoch = ceph_decode_32(&p);
                maplen = ceph_decode_32(&p);
                ceph_decode_need(&p, end, maplen, bad);
-               next = p + maplen;
-               if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
+               if (osdc->osdmap->epoch &&
+                   osdc->osdmap->epoch + 1 == epoch) {
                        dout("applying incremental map %u len %d\n",
                             epoch, maplen);
-                       newmap = osdmap_apply_incremental(&p, next,
-                                                         osdc->osdmap,
-                                                         &osdc->client->msgr);
-                       if (IS_ERR(newmap)) {
-                               err = PTR_ERR(newmap);
+                       err = handle_one_map(osdc, p, p + maplen, true,
+                                            &need_resend, &need_resend_linger);
+                       if (err)
                                goto bad;
-                       }
-                       BUG_ON(!newmap);
-                       if (newmap != osdc->osdmap) {
-                               ceph_osdmap_destroy(osdc->osdmap);
-                               osdc->osdmap = newmap;
-                       }
-                       was_full = was_full ||
-                               ceph_osdmap_flag(osdc->osdmap,
-                                                CEPH_OSDMAP_FULL);
-                       kick_requests(osdc, 0, was_full);
+                       handled_incremental = true;
                } else {
                        dout("ignoring incremental map %u len %d\n",
                             epoch, maplen);
                }
-               p = next;
+               p += maplen;
                nr_maps--;
        }
-       if (newmap)
+       if (handled_incremental)
                goto done;
 
        /* full maps */
@@ -2186,455 +3216,647 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
                if (nr_maps > 1) {
                        dout("skipping non-latest full map %u len %d\n",
                             epoch, maplen);
-               } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
+               } else if (osdc->osdmap->epoch >= epoch) {
                        dout("skipping full map %u len %d, "
                             "older than our %u\n", epoch, maplen,
                             osdc->osdmap->epoch);
                } else {
-                       int skipped_map = 0;
+                       dout("taking full map %u len %d\n", epoch, maplen);
+                       err = handle_one_map(osdc, p, p + maplen, false,
+                                            &need_resend, &need_resend_linger);
+                       if (err)
+                               goto bad;
+               }
+               p += maplen;
+               nr_maps--;
+       }
+
+done:
+       /*
+        * subscribe to subsequent osdmap updates if full to ensure
+        * we find out when we are no longer full and stop returning
+        * ENOSPC.
+        */
+       pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD);
+       pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) ||
+                 ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+                 have_pool_full(osdc);
+       if (was_pauserd || was_pausewr || pauserd || pausewr)
+               maybe_request_map(osdc);
+
+       kick_requests(osdc, &need_resend, &need_resend_linger);
+
+       ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
+                         osdc->osdmap->epoch);
+       up_write(&osdc->lock);
+       wake_up_all(&osdc->client->auth_wq);
+       return;
+
+bad:
+       pr_err("osdc handle_map corrupt msg\n");
+       ceph_msg_dump(msg);
+       up_write(&osdc->lock);
+}
+
+/*
+ * Resubmit requests pending on the given osd.
+ */
+static void kick_osd_requests(struct ceph_osd *osd)
+{
+       struct rb_node *n;
+
+       for (n = rb_first(&osd->o_requests); n; ) {
+               struct ceph_osd_request *req =
+                   rb_entry(n, struct ceph_osd_request, r_node);
+
+               n = rb_next(n); /* cancel_linger_request() */
+
+               if (!req->r_linger) {
+                       if (!req->r_t.paused)
+                               send_request(req);
+               } else {
+                       cancel_linger_request(req);
+               }
+       }
+       for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) {
+               struct ceph_osd_linger_request *lreq =
+                   rb_entry(n, struct ceph_osd_linger_request, node);
+
+               send_linger(lreq);
+       }
+}
+
+/*
+ * If the osd connection drops, we need to resubmit all requests.
+ */
+static void osd_fault(struct ceph_connection *con)
+{
+       struct ceph_osd *osd = con->private;
+       struct ceph_osd_client *osdc = osd->o_osdc;
+
+       dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+       down_write(&osdc->lock);
+       if (!osd_registered(osd)) {
+               dout("%s osd%d unknown\n", __func__, osd->o_osd);
+               goto out_unlock;
+       }
+
+       if (!reopen_osd(osd))
+               kick_osd_requests(osd);
+       maybe_request_map(osdc);
+
+out_unlock:
+       up_write(&osdc->lock);
+}
+
+/*
+ * Process osd watch notifications
+ */
+static void handle_watch_notify(struct ceph_osd_client *osdc,
+                               struct ceph_msg *msg)
+{
+       void *p = msg->front.iov_base;
+       void *const end = p + msg->front.iov_len;
+       struct ceph_osd_linger_request *lreq;
+       struct linger_work *lwork;
+       u8 proto_ver, opcode;
+       u64 cookie, notify_id;
+       u64 notifier_id = 0;
+       s32 return_code = 0;
+       void *payload = NULL;
+       u32 payload_len = 0;
+
+       ceph_decode_8_safe(&p, end, proto_ver, bad);
+       ceph_decode_8_safe(&p, end, opcode, bad);
+       ceph_decode_64_safe(&p, end, cookie, bad);
+       p += 8; /* skip ver */
+       ceph_decode_64_safe(&p, end, notify_id, bad);
+
+       if (proto_ver >= 1) {
+               ceph_decode_32_safe(&p, end, payload_len, bad);
+               ceph_decode_need(&p, end, payload_len, bad);
+               payload = p;
+               p += payload_len;
+       }
+
+       if (le16_to_cpu(msg->hdr.version) >= 2)
+               ceph_decode_32_safe(&p, end, return_code, bad);
+
+       if (le16_to_cpu(msg->hdr.version) >= 3)
+               ceph_decode_64_safe(&p, end, notifier_id, bad);
 
-                       dout("taking full map %u len %d\n", epoch, maplen);
-                       newmap = ceph_osdmap_decode(&p, p+maplen);
-                       if (IS_ERR(newmap)) {
-                               err = PTR_ERR(newmap);
-                               goto bad;
-                       }
-                       BUG_ON(!newmap);
-                       oldmap = osdc->osdmap;
-                       osdc->osdmap = newmap;
-                       if (oldmap) {
-                               if (oldmap->epoch + 1 < newmap->epoch)
-                                       skipped_map = 1;
-                               ceph_osdmap_destroy(oldmap);
+       down_read(&osdc->lock);
+       lreq = lookup_linger_osdc(&osdc->linger_requests, cookie);
+       if (!lreq) {
+               dout("%s opcode %d cookie %llu dne\n", __func__, opcode,
+                    cookie);
+               goto out_unlock_osdc;
+       }
+
+       mutex_lock(&lreq->lock);
+       dout("%s opcode %d cookie %llu lreq %p is_watch %d\n", __func__,
+            opcode, cookie, lreq, lreq->is_watch);
+       if (opcode == CEPH_WATCH_EVENT_DISCONNECT) {
+               if (!lreq->last_error) {
+                       lreq->last_error = -ENOTCONN;
+                       queue_watch_error(lreq);
+               }
+       } else if (!lreq->is_watch) {
+               /* CEPH_WATCH_EVENT_NOTIFY_COMPLETE */
+               if (lreq->notify_id && lreq->notify_id != notify_id) {
+                       dout("lreq %p notify_id %llu != %llu, ignoring\n", lreq,
+                            lreq->notify_id, notify_id);
+               } else if (!completion_done(&lreq->notify_finish_wait)) {
+                       struct ceph_msg_data *data =
+                           list_first_entry_or_null(&msg->data,
+                                                    struct ceph_msg_data,
+                                                    links);
+
+                       if (data) {
+                               if (lreq->preply_pages) {
+                                       WARN_ON(data->type !=
+                                                       CEPH_MSG_DATA_PAGES);
+                                       *lreq->preply_pages = data->pages;
+                                       *lreq->preply_len = data->length;
+                               } else {
+                                       ceph_release_page_vector(data->pages,
+                                              calc_pages_for(0, data->length));
+                               }
                        }
-                       was_full = was_full ||
-                               ceph_osdmap_flag(osdc->osdmap,
-                                                CEPH_OSDMAP_FULL);
-                       kick_requests(osdc, skipped_map, was_full);
+                       lreq->notify_finish_error = return_code;
+                       complete_all(&lreq->notify_finish_wait);
+               }
+       } else {
+               /* CEPH_WATCH_EVENT_NOTIFY */
+               lwork = lwork_alloc(lreq, do_watch_notify);
+               if (!lwork) {
+                       pr_err("failed to allocate notify-lwork\n");
+                       goto out_unlock_lreq;
                }
-               p += maplen;
-               nr_maps--;
-       }
 
-       if (!osdc->osdmap)
-               goto bad;
-done:
-       downgrade_write(&osdc->map_sem);
-       ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
-                         osdc->osdmap->epoch);
+               lwork->notify.notify_id = notify_id;
+               lwork->notify.notifier_id = notifier_id;
+               lwork->notify.payload = payload;
+               lwork->notify.payload_len = payload_len;
+               lwork->notify.msg = ceph_msg_get(msg);
+               lwork_queue(lwork);
+       }
 
-       /*
-        * subscribe to subsequent osdmap updates if full to ensure
-        * we find out when we are no longer full and stop returning
-        * ENOSPC.
-        */
-       if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
-               ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) ||
-               ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR))
-               ceph_monc_request_next_osdmap(&osdc->client->monc);
-
-       mutex_lock(&osdc->request_mutex);
-       __send_queued(osdc);
-       mutex_unlock(&osdc->request_mutex);
-       up_read(&osdc->map_sem);
-       wake_up_all(&osdc->client->auth_wq);
+out_unlock_lreq:
+       mutex_unlock(&lreq->lock);
+out_unlock_osdc:
+       up_read(&osdc->lock);
        return;
 
 bad:
-       pr_err("osdc handle_map corrupt msg\n");
-       ceph_msg_dump(msg);
-       up_write(&osdc->map_sem);
+       pr_err("osdc handle_watch_notify corrupt msg\n");
 }
 
 /*
- * watch/notify callback event infrastructure
- *
- * These callbacks are used both for watch and notify operations.
+ * Register request, send initial attempt.
  */
-static void __release_event(struct kref *kref)
+int ceph_osdc_start_request(struct ceph_osd_client *osdc,
+                           struct ceph_osd_request *req,
+                           bool nofail)
 {
-       struct ceph_osd_event *event =
-               container_of(kref, struct ceph_osd_event, kref);
+       down_read(&osdc->lock);
+       submit_request(req, false);
+       up_read(&osdc->lock);
 
-       dout("__release_event %p\n", event);
-       kfree(event);
+       return 0;
 }
+EXPORT_SYMBOL(ceph_osdc_start_request);
 
-static void get_event(struct ceph_osd_event *event)
+/*
+ * Unregister a registered request.  The request is not completed (i.e.
+ * no callbacks or wakeups) - higher layers are supposed to know what
+ * they are canceling.
+ */
+void ceph_osdc_cancel_request(struct ceph_osd_request *req)
 {
-       kref_get(&event->kref);
-}
+       struct ceph_osd_client *osdc = req->r_osdc;
 
-void ceph_osdc_put_event(struct ceph_osd_event *event)
-{
-       kref_put(&event->kref, __release_event);
+       down_write(&osdc->lock);
+       if (req->r_osd)
+               cancel_request(req);
+       up_write(&osdc->lock);
 }
-EXPORT_SYMBOL(ceph_osdc_put_event);
+EXPORT_SYMBOL(ceph_osdc_cancel_request);
 
-static void __insert_event(struct ceph_osd_client *osdc,
-                            struct ceph_osd_event *new)
+/*
+ * @timeout: in jiffies, 0 means "wait forever"
+ */
+static int wait_request_timeout(struct ceph_osd_request *req,
+                               unsigned long timeout)
 {
-       struct rb_node **p = &osdc->event_tree.rb_node;
-       struct rb_node *parent = NULL;
-       struct ceph_osd_event *event = NULL;
+       long left;
 
-       while (*p) {
-               parent = *p;
-               event = rb_entry(parent, struct ceph_osd_event, node);
-               if (new->cookie < event->cookie)
-                       p = &(*p)->rb_left;
-               else if (new->cookie > event->cookie)
-                       p = &(*p)->rb_right;
-               else
-                       BUG();
+       dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
+       left = wait_for_completion_killable_timeout(&req->r_completion,
+                                               ceph_timeout_jiffies(timeout));
+       if (left <= 0) {
+               left = left ?: -ETIMEDOUT;
+               ceph_osdc_cancel_request(req);
+
+               /* kludge - need to wake ceph_osdc_sync() */
+               complete_all(&req->r_safe_completion);
+       } else {
+               left = req->r_result; /* completed */
        }
 
-       rb_link_node(&new->node, parent, p);
-       rb_insert_color(&new->node, &osdc->event_tree);
+       return left;
 }
 
-static struct ceph_osd_event *__find_event(struct ceph_osd_client *osdc,
-                                               u64 cookie)
+/*
+ * wait for a request to complete
+ */
+int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
+                          struct ceph_osd_request *req)
 {
-       struct rb_node **p = &osdc->event_tree.rb_node;
-       struct rb_node *parent = NULL;
-       struct ceph_osd_event *event = NULL;
-
-       while (*p) {
-               parent = *p;
-               event = rb_entry(parent, struct ceph_osd_event, node);
-               if (cookie < event->cookie)
-                       p = &(*p)->rb_left;
-               else if (cookie > event->cookie)
-                       p = &(*p)->rb_right;
-               else
-                       return event;
-       }
-       return NULL;
+       return wait_request_timeout(req, 0);
 }
+EXPORT_SYMBOL(ceph_osdc_wait_request);
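
(Editorial aside, not part of the patch: a minimal sketch of the submit-and-wait pattern the rewritten API expects from callers, mirroring ceph_osdc_readpages()/ceph_osdc_writepages() further down in this diff; the helper name example_sync_op is an assumption.)

/* Illustrative sketch only - not part of this patch. */
static int example_sync_op(struct ceph_osd_client *osdc,
                           struct ceph_osd_request *req)
{
        int ret;

        ret = ceph_osdc_start_request(osdc, req, false);
        if (!ret)
                ret = ceph_osdc_wait_request(osdc, req); /* req->r_result or error */

        ceph_osdc_put_request(req);
        return ret;
}
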
 
-static void __remove_event(struct ceph_osd_event *event)
+/*
+ * sync - wait for all in-flight requests to flush.  avoid starvation.
+ */
+void ceph_osdc_sync(struct ceph_osd_client *osdc)
 {
-       struct ceph_osd_client *osdc = event->osdc;
+       struct rb_node *n, *p;
+       u64 last_tid = atomic64_read(&osdc->last_tid);
 
-       if (!RB_EMPTY_NODE(&event->node)) {
-               dout("__remove_event removed %p\n", event);
-               rb_erase(&event->node, &osdc->event_tree);
-               ceph_osdc_put_event(event);
-       } else {
-               dout("__remove_event didn't remove %p\n", event);
-       }
-}
+again:
+       down_read(&osdc->lock);
+       for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+               struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
 
-int ceph_osdc_create_event(struct ceph_osd_client *osdc,
-                          void (*event_cb)(u64, u64, u8, void *),
-                          void *data, struct ceph_osd_event **pevent)
-{
-       struct ceph_osd_event *event;
+               mutex_lock(&osd->lock);
+               for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
+                       struct ceph_osd_request *req =
+                           rb_entry(p, struct ceph_osd_request, r_node);
 
-       event = kmalloc(sizeof(*event), GFP_NOIO);
-       if (!event)
-               return -ENOMEM;
+                       if (req->r_tid > last_tid)
+                               break;
 
-       dout("create_event %p\n", event);
-       event->cb = event_cb;
-       event->one_shot = 0;
-       event->data = data;
-       event->osdc = osdc;
-       INIT_LIST_HEAD(&event->osd_node);
-       RB_CLEAR_NODE(&event->node);
-       kref_init(&event->kref);   /* one ref for us */
-       kref_get(&event->kref);    /* one ref for the caller */
-
-       spin_lock(&osdc->event_lock);
-       event->cookie = ++osdc->event_count;
-       __insert_event(osdc, event);
-       spin_unlock(&osdc->event_lock);
-
-       *pevent = event;
-       return 0;
+                       if (!(req->r_flags & CEPH_OSD_FLAG_WRITE))
+                               continue;
+
+                       ceph_osdc_get_request(req);
+                       mutex_unlock(&osd->lock);
+                       up_read(&osdc->lock);
+                       dout("%s waiting on req %p tid %llu last_tid %llu\n",
+                            __func__, req, req->r_tid, last_tid);
+                       wait_for_completion(&req->r_safe_completion);
+                       ceph_osdc_put_request(req);
+                       goto again;
+               }
+
+               mutex_unlock(&osd->lock);
+       }
+
+       up_read(&osdc->lock);
+       dout("%s done last_tid %llu\n", __func__, last_tid);
 }
-EXPORT_SYMBOL(ceph_osdc_create_event);
+EXPORT_SYMBOL(ceph_osdc_sync);
 
-void ceph_osdc_cancel_event(struct ceph_osd_event *event)
+static struct ceph_osd_request *
+alloc_linger_request(struct ceph_osd_linger_request *lreq)
 {
-       struct ceph_osd_client *osdc = event->osdc;
+       struct ceph_osd_request *req;
 
-       dout("cancel_event %p\n", event);
-       spin_lock(&osdc->event_lock);
-       __remove_event(event);
-       spin_unlock(&osdc->event_lock);
-       ceph_osdc_put_event(event); /* caller's */
-}
-EXPORT_SYMBOL(ceph_osdc_cancel_event);
+       req = ceph_osdc_alloc_request(lreq->osdc, NULL, 1, false, GFP_NOIO);
+       if (!req)
+               return NULL;
 
+       ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
+       ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
 
-static void do_event_work(struct work_struct *work)
-{
-       struct ceph_osd_event_work *event_work =
-               container_of(work, struct ceph_osd_event_work, work);
-       struct ceph_osd_event *event = event_work->event;
-       u64 ver = event_work->ver;
-       u64 notify_id = event_work->notify_id;
-       u8 opcode = event_work->opcode;
+       if (ceph_osdc_alloc_messages(req, GFP_NOIO)) {
+               ceph_osdc_put_request(req);
+               return NULL;
+       }
 
-       dout("do_event_work completing %p\n", event);
-       event->cb(ver, notify_id, opcode, event->data);
-       dout("do_event_work completed %p\n", event);
-       ceph_osdc_put_event(event);
-       kfree(event_work);
+       return req;
 }
 
-
 /*
- * Process osd watch notifications
+ * Returns a handle, caller owns a ref.
  */
-static void handle_watch_notify(struct ceph_osd_client *osdc,
-                               struct ceph_msg *msg)
+struct ceph_osd_linger_request *
+ceph_osdc_watch(struct ceph_osd_client *osdc,
+               struct ceph_object_id *oid,
+               struct ceph_object_locator *oloc,
+               rados_watchcb2_t wcb,
+               rados_watcherrcb_t errcb,
+               void *data)
 {
-       void *p, *end;
-       u8 proto_ver;
-       u64 cookie, ver, notify_id;
-       u8 opcode;
-       struct ceph_osd_event *event;
-       struct ceph_osd_event_work *event_work;
+       struct ceph_osd_linger_request *lreq;
+       int ret;
 
-       p = msg->front.iov_base;
-       end = p + msg->front.iov_len;
+       lreq = linger_alloc(osdc);
+       if (!lreq)
+               return ERR_PTR(-ENOMEM);
 
-       ceph_decode_8_safe(&p, end, proto_ver, bad);
-       ceph_decode_8_safe(&p, end, opcode, bad);
-       ceph_decode_64_safe(&p, end, cookie, bad);
-       ceph_decode_64_safe(&p, end, ver, bad);
-       ceph_decode_64_safe(&p, end, notify_id, bad);
+       lreq->is_watch = true;
+       lreq->wcb = wcb;
+       lreq->errcb = errcb;
+       lreq->data = data;
+       lreq->watch_valid_thru = jiffies;
+
+       ceph_oid_copy(&lreq->t.base_oid, oid);
+       ceph_oloc_copy(&lreq->t.base_oloc, oloc);
+       lreq->t.flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
+       lreq->mtime = CURRENT_TIME;
+
+       lreq->reg_req = alloc_linger_request(lreq);
+       if (!lreq->reg_req) {
+               ret = -ENOMEM;
+               goto err_put_lreq;
+       }
 
-       spin_lock(&osdc->event_lock);
-       event = __find_event(osdc, cookie);
-       if (event) {
-               BUG_ON(event->one_shot);
-               get_event(event);
-       }
-       spin_unlock(&osdc->event_lock);
-       dout("handle_watch_notify cookie %lld ver %lld event %p\n",
-            cookie, ver, event);
-       if (event) {
-               event_work = kmalloc(sizeof(*event_work), GFP_NOIO);
-               if (!event_work) {
-                       pr_err("couldn't allocate event_work\n");
-                       ceph_osdc_put_event(event);
-                       return;
-               }
-               INIT_WORK(&event_work->work, do_event_work);
-               event_work->event = event;
-               event_work->ver = ver;
-               event_work->notify_id = notify_id;
-               event_work->opcode = opcode;
+       lreq->ping_req = alloc_linger_request(lreq);
+       if (!lreq->ping_req) {
+               ret = -ENOMEM;
+               goto err_put_lreq;
+       }
 
-               queue_work(osdc->notify_wq, &event_work->work);
+       down_write(&osdc->lock);
+       linger_register(lreq); /* before osd_req_op_* */
+       osd_req_op_watch_init(lreq->reg_req, 0, lreq->linger_id,
+                             CEPH_OSD_WATCH_OP_WATCH);
+       osd_req_op_watch_init(lreq->ping_req, 0, lreq->linger_id,
+                             CEPH_OSD_WATCH_OP_PING);
+       linger_submit(lreq);
+       up_write(&osdc->lock);
+
+       ret = linger_reg_commit_wait(lreq);
+       if (ret) {
+               linger_cancel(lreq);
+               goto err_put_lreq;
        }
 
-       return;
+       return lreq;
 
-bad:
-       pr_err("osdc handle_watch_notify corrupt msg\n");
+err_put_lreq:
+       linger_put(lreq);
+       return ERR_PTR(ret);
 }
+EXPORT_SYMBOL(ceph_osdc_watch);
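
(Editorial aside, not part of the patch: a sketch of how a caller might use the new linger-based watch API, loosely modelled on rbd; the callback and helper names are assumptions, and teardown is ceph_osdc_unwatch().)

/* Illustrative sketch only - not part of this patch. */
static void example_watch_cb(void *arg, u64 notify_id, u64 cookie,
                             u64 notifier_id, void *data, size_t data_len)
{
        /* handle the notify, then ack it so the notifier can complete */
}

static void example_watch_errcb(void *arg, u64 cookie, int err)
{
        /* watch lost - caller is expected to unwatch and re-watch */
}

static struct ceph_osd_linger_request *
example_watch(struct ceph_osd_client *osdc, struct ceph_object_id *oid,
              struct ceph_object_locator *oloc, void *arg)
{
        /* returns a handle (caller owns a ref) or ERR_PTR() */
        return ceph_osdc_watch(osdc, oid, oloc, example_watch_cb,
                               example_watch_errcb, arg);
}
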
 
 /*
- * build new request AND message
+ * Releases a ref.
  *
+ * Times out after mount_timeout to preserve rbd unmap behaviour
+ * introduced in 2894e1d76974 ("rbd: timeout watch teardown on unmap
+ * with mount_timeout").
  */
-void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
-                               struct ceph_snap_context *snapc, u64 snap_id,
-                               struct timespec *mtime)
-{
-       struct ceph_msg *msg = req->r_request;
-       void *p;
-       size_t msg_size;
-       int flags = req->r_flags;
-       u64 data_len;
-       unsigned int i;
-
-       req->r_snapid = snap_id;
-       req->r_snapc = ceph_get_snap_context(snapc);
+int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
+                     struct ceph_osd_linger_request *lreq)
+{
+       struct ceph_options *opts = osdc->client->options;
+       struct ceph_osd_request *req;
+       int ret;
 
-       /* encode request */
-       msg->hdr.version = cpu_to_le16(4);
-
-       p = msg->front.iov_base;
-       ceph_encode_32(&p, 1);   /* client_inc  is always 1 */
-       req->r_request_osdmap_epoch = p;
-       p += 4;
-       req->r_request_flags = p;
-       p += 4;
-       if (req->r_flags & CEPH_OSD_FLAG_WRITE)
-               ceph_encode_timespec(p, mtime);
-       p += sizeof(struct ceph_timespec);
-       req->r_request_reassert_version = p;
-       p += sizeof(struct ceph_eversion); /* will get filled in */
+       req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
+       if (!req)
+               return -ENOMEM;
 
-       /* oloc */
-       ceph_encode_8(&p, 4);
-       ceph_encode_8(&p, 4);
-       ceph_encode_32(&p, 8 + 4 + 4);
-       req->r_request_pool = p;
-       p += 8;
-       ceph_encode_32(&p, -1);  /* preferred */
-       ceph_encode_32(&p, 0);   /* key len */
+       ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
+       ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
+       req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
+       req->r_mtime = CURRENT_TIME;
+       osd_req_op_watch_init(req, 0, lreq->linger_id,
+                             CEPH_OSD_WATCH_OP_UNWATCH);
 
-       ceph_encode_8(&p, 1);
-       req->r_request_pgid = p;
-       p += 8 + 4;
-       ceph_encode_32(&p, -1);  /* preferred */
+       ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+       if (ret)
+               goto out_put_req;
 
-       /* oid */
-       ceph_encode_32(&p, req->r_base_oid.name_len);
-       memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len);
-       dout("oid '%.*s' len %d\n", req->r_base_oid.name_len,
-            req->r_base_oid.name, req->r_base_oid.name_len);
-       p += req->r_base_oid.name_len;
-
-       /* ops--can imply data */
-       ceph_encode_16(&p, (u16)req->r_num_ops);
-       data_len = 0;
-       for (i = 0; i < req->r_num_ops; i++) {
-               data_len += osd_req_encode_op(req, p, i);
-               p += sizeof(struct ceph_osd_op);
-       }
+       ceph_osdc_start_request(osdc, req, false);
+       linger_cancel(lreq);
+       linger_put(lreq);
+       ret = wait_request_timeout(req, opts->mount_timeout);
 
-       /* snaps */
-       ceph_encode_64(&p, req->r_snapid);
-       ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0);
-       ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0);
-       if (req->r_snapc) {
-               for (i = 0; i < snapc->num_snaps; i++) {
-                       ceph_encode_64(&p, req->r_snapc->snaps[i]);
-               }
-       }
+out_put_req:
+       ceph_osdc_put_request(req);
+       return ret;
+}
+EXPORT_SYMBOL(ceph_osdc_unwatch);
 
-       req->r_request_attempts = p;
-       p += 4;
+static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which,
+                                     u64 notify_id, u64 cookie, void *payload,
+                                     size_t payload_len)
+{
+       struct ceph_osd_req_op *op;
+       struct ceph_pagelist *pl;
+       int ret;
 
-       /* data */
-       if (flags & CEPH_OSD_FLAG_WRITE) {
-               u16 data_off;
+       op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0);
 
-               /*
-                * The header "data_off" is a hint to the receiver
-                * allowing it to align received data into its
-                * buffers such that there's no need to re-copy
-                * it before writing it to disk (direct I/O).
-                */
-               data_off = (u16) (off & 0xffff);
-               req->r_request->hdr.data_off = cpu_to_le16(data_off);
-       }
-       req->r_request->hdr.data_len = cpu_to_le32(data_len);
+       pl = kmalloc(sizeof(*pl), GFP_NOIO);
+       if (!pl)
+               return -ENOMEM;
 
-       BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
-       msg_size = p - msg->front.iov_base;
-       msg->front.iov_len = msg_size;
-       msg->hdr.front_len = cpu_to_le32(msg_size);
+       ceph_pagelist_init(pl);
+       ret = ceph_pagelist_encode_64(pl, notify_id);
+       ret |= ceph_pagelist_encode_64(pl, cookie);
+       if (payload) {
+               ret |= ceph_pagelist_encode_32(pl, payload_len);
+               ret |= ceph_pagelist_append(pl, payload, payload_len);
+       } else {
+               ret |= ceph_pagelist_encode_32(pl, 0);
+       }
+       if (ret) {
+               ceph_pagelist_release(pl);
+               return -ENOMEM;
+       }
 
-       dout("build_request msg_size was %d\n", (int)msg_size);
+       ceph_osd_data_pagelist_init(&op->notify_ack.request_data, pl);
+       op->indata_len = pl->length;
+       return 0;
 }
-EXPORT_SYMBOL(ceph_osdc_build_request);
 
-/*
- * Register request, send initial attempt.
- */
-int ceph_osdc_start_request(struct ceph_osd_client *osdc,
-                           struct ceph_osd_request *req,
-                           bool nofail)
+int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
+                        struct ceph_object_id *oid,
+                        struct ceph_object_locator *oloc,
+                        u64 notify_id,
+                        u64 cookie,
+                        void *payload,
+                        size_t payload_len)
 {
-       int rc;
+       struct ceph_osd_request *req;
+       int ret;
+
+       req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
+       if (!req)
+               return -ENOMEM;
 
-       down_read(&osdc->map_sem);
-       mutex_lock(&osdc->request_mutex);
+       ceph_oid_copy(&req->r_base_oid, oid);
+       ceph_oloc_copy(&req->r_base_oloc, oloc);
+       req->r_flags = CEPH_OSD_FLAG_READ;
 
-       rc = __ceph_osdc_start_request(osdc, req, nofail);
+       ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+       if (ret)
+               goto out_put_req;
+
+       ret = osd_req_op_notify_ack_init(req, 0, notify_id, cookie, payload,
+                                        payload_len);
+       if (ret)
+               goto out_put_req;
 
-       mutex_unlock(&osdc->request_mutex);
-       up_read(&osdc->map_sem);
+       ceph_osdc_start_request(osdc, req, false);
+       ret = ceph_osdc_wait_request(osdc, req);
 
-       return rc;
+out_put_req:
+       ceph_osdc_put_request(req);
+       return ret;
 }
-EXPORT_SYMBOL(ceph_osdc_start_request);
+EXPORT_SYMBOL(ceph_osdc_notify_ack);
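
(Editorial aside, not part of the patch: a sketch of acknowledging a notify from a watch callback, as rbd does for header updates; the wrapper name is an assumption.)

/* Illustrative sketch only - not part of this patch. */
static void example_ack_notify(struct ceph_osd_client *osdc,
                               struct ceph_object_id *oid,
                               struct ceph_object_locator *oloc,
                               u64 notify_id, u64 cookie)
{
        int ret;

        /* empty payload: just tell the notifier the event was seen */
        ret = ceph_osdc_notify_ack(osdc, oid, oloc, notify_id, cookie,
                                   NULL, 0);
        if (ret)
                pr_err("notify_ack failed: %d\n", ret);
}
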
 
-/*
- * Unregister a registered request.  The request is not completed (i.e.
- * no callbacks or wakeups) - higher layers are supposed to know what
- * they are canceling.
- */
-void ceph_osdc_cancel_request(struct ceph_osd_request *req)
+static int osd_req_op_notify_init(struct ceph_osd_request *req, int which,
+                                 u64 cookie, u32 prot_ver, u32 timeout,
+                                 void *payload, size_t payload_len)
 {
-       struct ceph_osd_client *osdc = req->r_osdc;
+       struct ceph_osd_req_op *op;
+       struct ceph_pagelist *pl;
+       int ret;
 
-       mutex_lock(&osdc->request_mutex);
-       if (req->r_linger)
-               __unregister_linger_request(osdc, req);
-       __unregister_request(osdc, req);
-       mutex_unlock(&osdc->request_mutex);
+       op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0);
+       op->notify.cookie = cookie;
 
-       dout("%s %p tid %llu canceled\n", __func__, req, req->r_tid);
+       pl = kmalloc(sizeof(*pl), GFP_NOIO);
+       if (!pl)
+               return -ENOMEM;
+
+       ceph_pagelist_init(pl);
+       ret = ceph_pagelist_encode_32(pl, 1); /* prot_ver */
+       ret |= ceph_pagelist_encode_32(pl, timeout);
+       ret |= ceph_pagelist_encode_32(pl, payload_len);
+       ret |= ceph_pagelist_append(pl, payload, payload_len);
+       if (ret) {
+               ceph_pagelist_release(pl);
+               return -ENOMEM;
+       }
+
+       ceph_osd_data_pagelist_init(&op->notify.request_data, pl);
+       op->indata_len = pl->length;
+       return 0;
 }
-EXPORT_SYMBOL(ceph_osdc_cancel_request);
 
 /*
- * wait for a request to complete
+ * @timeout: in seconds
+ *
+ * @preply_{pages,len} are initialized both on success and error.
+ * The caller is responsible for:
+ *
+ *     ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len))
  */
-int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
-                          struct ceph_osd_request *req)
+int ceph_osdc_notify(struct ceph_osd_client *osdc,
+                    struct ceph_object_id *oid,
+                    struct ceph_object_locator *oloc,
+                    void *payload,
+                    size_t payload_len,
+                    u32 timeout,
+                    struct page ***preply_pages,
+                    size_t *preply_len)
 {
-       int rc;
+       struct ceph_osd_linger_request *lreq;
+       struct page **pages;
+       int ret;
 
-       dout("%s %p tid %llu\n", __func__, req, req->r_tid);
+       WARN_ON(!timeout);
+       if (preply_pages) {
+               *preply_pages = NULL;
+               *preply_len = 0;
+       }
 
-       rc = wait_for_completion_interruptible(&req->r_completion);
-       if (rc < 0) {
-               dout("%s %p tid %llu interrupted\n", __func__, req, req->r_tid);
-               ceph_osdc_cancel_request(req);
-               complete_request(req);
-               return rc;
+       lreq = linger_alloc(osdc);
+       if (!lreq)
+               return -ENOMEM;
+
+       lreq->preply_pages = preply_pages;
+       lreq->preply_len = preply_len;
+
+       ceph_oid_copy(&lreq->t.base_oid, oid);
+       ceph_oloc_copy(&lreq->t.base_oloc, oloc);
+       lreq->t.flags = CEPH_OSD_FLAG_READ;
+
+       lreq->reg_req = alloc_linger_request(lreq);
+       if (!lreq->reg_req) {
+               ret = -ENOMEM;
+               goto out_put_lreq;
+       }
+
+       /* for notify_id */
+       pages = ceph_alloc_page_vector(1, GFP_NOIO);
+       if (IS_ERR(pages)) {
+               ret = PTR_ERR(pages);
+               goto out_put_lreq;
+       }
+
+       down_write(&osdc->lock);
+       linger_register(lreq); /* before osd_req_op_* */
+       ret = osd_req_op_notify_init(lreq->reg_req, 0, lreq->linger_id, 1,
+                                    timeout, payload, payload_len);
+       if (ret) {
+               linger_unregister(lreq);
+               up_write(&osdc->lock);
+               ceph_release_page_vector(pages, 1);
+               goto out_put_lreq;
        }
+       ceph_osd_data_pages_init(osd_req_op_data(lreq->reg_req, 0, notify,
+                                                response_data),
+                                pages, PAGE_SIZE, 0, false, true);
+       linger_submit(lreq);
+       up_write(&osdc->lock);
+
+       ret = linger_reg_commit_wait(lreq);
+       if (!ret)
+               ret = linger_notify_finish_wait(lreq);
+       else
+               dout("lreq %p failed to initiate notify %d\n", lreq, ret);
 
-       dout("%s %p tid %llu result %d\n", __func__, req, req->r_tid,
-            req->r_result);
-       return req->r_result;
+       linger_cancel(lreq);
+out_put_lreq:
+       linger_put(lreq);
+       return ret;
 }
-EXPORT_SYMBOL(ceph_osdc_wait_request);
+EXPORT_SYMBOL(ceph_osdc_notify);
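
(Editorial aside, not part of the patch: a sketch of issuing a notify and releasing the reply page vector as the comment above requires; the 5-second timeout and helper name are assumptions.)

/* Illustrative sketch only - not part of this patch. */
static int example_notify(struct ceph_osd_client *osdc,
                          struct ceph_object_id *oid,
                          struct ceph_object_locator *oloc,
                          void *payload, size_t payload_len)
{
        struct page **reply_pages = NULL;
        size_t reply_len = 0;
        int ret;

        ret = ceph_osdc_notify(osdc, oid, oloc, payload, payload_len,
                               5 /* seconds */, &reply_pages, &reply_len);
        /* reply_pages/reply_len are set on both success and error */
        if (reply_pages)
                ceph_release_page_vector(reply_pages,
                                         calc_pages_for(0, reply_len));
        return ret;
}
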
 
 /*
- * sync - wait for all in-flight requests to flush.  avoid starvation.
+ * Return the number of milliseconds since the watch was last
+ * confirmed, or an error.  If there is an error, the watch is no
+ * longer valid, and should be destroyed with ceph_osdc_unwatch().
  */
-void ceph_osdc_sync(struct ceph_osd_client *osdc)
+int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
+                         struct ceph_osd_linger_request *lreq)
 {
-       struct ceph_osd_request *req;
-       u64 last_tid, next_tid = 0;
-
-       mutex_lock(&osdc->request_mutex);
-       last_tid = osdc->last_tid;
-       while (1) {
-               req = __lookup_request_ge(osdc, next_tid);
-               if (!req)
-                       break;
-               if (req->r_tid > last_tid)
-                       break;
-
-               next_tid = req->r_tid + 1;
-               if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
-                       continue;
+       unsigned long stamp, age;
+       int ret;
 
-               ceph_osdc_get_request(req);
-               mutex_unlock(&osdc->request_mutex);
-               dout("sync waiting on tid %llu (last is %llu)\n",
-                    req->r_tid, last_tid);
-               wait_for_completion(&req->r_safe_completion);
-               mutex_lock(&osdc->request_mutex);
-               ceph_osdc_put_request(req);
+       down_read(&osdc->lock);
+       mutex_lock(&lreq->lock);
+       stamp = lreq->watch_valid_thru;
+       if (!list_empty(&lreq->pending_lworks)) {
+               struct linger_work *lwork =
+                   list_first_entry(&lreq->pending_lworks,
+                                    struct linger_work,
+                                    pending_item);
+
+               if (time_before(lwork->queued_stamp, stamp))
+                       stamp = lwork->queued_stamp;
        }
-       mutex_unlock(&osdc->request_mutex);
-       dout("sync done (thru tid %llu)\n", last_tid);
+       age = jiffies - stamp;
+       dout("%s lreq %p linger_id %llu age %lu last_error %d\n", __func__,
+            lreq, lreq->linger_id, age, lreq->last_error);
+       /* we are truncating to msecs, so return a safe upper bound */
+       ret = lreq->last_error ?: 1 + jiffies_to_msecs(age);
+
+       mutex_unlock(&lreq->lock);
+       up_read(&osdc->lock);
+       return ret;
 }
-EXPORT_SYMBOL(ceph_osdc_sync);
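
(Editorial aside, not part of the patch: a sketch of interpreting ceph_osdc_watch_check()'s return value, assuming the caller re-establishes the watch itself after tearing it down.)

/* Illustrative sketch only - not part of this patch. */
static int example_check_watch(struct ceph_osd_client *osdc,
                               struct ceph_osd_linger_request *handle)
{
        int ret = ceph_osdc_watch_check(osdc, handle);

        if (ret < 0) {
                /* watch no longer valid: destroy it, then re-watch */
                return ceph_osdc_unwatch(osdc, handle);
        }

        pr_debug("watch last confirmed %d ms ago\n", ret);
        return 0;
}
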
 
 /*
  * Call all pending notify callbacks - for use after a watch is
@@ -2646,6 +3868,13 @@ void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc)
 }
 EXPORT_SYMBOL(ceph_osdc_flush_notifies);
 
+void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc)
+{
+       down_read(&osdc->lock);
+       maybe_request_map(osdc);
+       up_read(&osdc->lock);
+}
+EXPORT_SYMBOL(ceph_osdc_maybe_request_map);
 
 /*
  * init, shutdown
@@ -2656,43 +3885,35 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 
        dout("init\n");
        osdc->client = client;
-       osdc->osdmap = NULL;
-       init_rwsem(&osdc->map_sem);
-       init_completion(&osdc->map_waiters);
-       osdc->last_requested_map = 0;
-       mutex_init(&osdc->request_mutex);
-       osdc->last_tid = 0;
+       init_rwsem(&osdc->lock);
        osdc->osds = RB_ROOT;
        INIT_LIST_HEAD(&osdc->osd_lru);
-       osdc->requests = RB_ROOT;
-       INIT_LIST_HEAD(&osdc->req_lru);
-       INIT_LIST_HEAD(&osdc->req_unsent);
-       INIT_LIST_HEAD(&osdc->req_notarget);
-       INIT_LIST_HEAD(&osdc->req_linger);
-       osdc->num_requests = 0;
+       spin_lock_init(&osdc->osd_lru_lock);
+       osd_init(&osdc->homeless_osd);
+       osdc->homeless_osd.o_osdc = osdc;
+       osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD;
+       osdc->linger_requests = RB_ROOT;
+       osdc->map_checks = RB_ROOT;
+       osdc->linger_map_checks = RB_ROOT;
        INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
        INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
-       spin_lock_init(&osdc->event_lock);
-       osdc->event_tree = RB_ROOT;
-       osdc->event_count = 0;
-
-       schedule_delayed_work(&osdc->osds_timeout_work,
-           round_jiffies_relative(osdc->client->options->osd_idle_ttl));
 
        err = -ENOMEM;
+       osdc->osdmap = ceph_osdmap_alloc();
+       if (!osdc->osdmap)
+               goto out;
+
        osdc->req_mempool = mempool_create_slab_pool(10,
                                                     ceph_osd_request_cache);
        if (!osdc->req_mempool)
-               goto out;
+               goto out_map;
 
        err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP,
-                               OSD_OP_FRONT_LEN, 10, true,
-                               "osd_op");
+                               PAGE_SIZE, 10, true, "osd_op");
        if (err < 0)
                goto out_mempool;
        err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY,
-                               OSD_OPREPLY_FRONT_LEN, 10, true,
-                               "osd_op_reply");
+                               PAGE_SIZE, 10, true, "osd_op_reply");
        if (err < 0)
                goto out_msgpool;
 
@@ -2701,6 +3922,11 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
        if (!osdc->notify_wq)
                goto out_msgpool_reply;
 
+       schedule_delayed_work(&osdc->timeout_work,
+                             osdc->client->options->osd_keepalive_timeout);
+       schedule_delayed_work(&osdc->osds_timeout_work,
+           round_jiffies_relative(osdc->client->options->osd_idle_ttl));
+
        return 0;
 
 out_msgpool_reply:
@@ -2709,6 +3935,8 @@ out_msgpool:
        ceph_msgpool_destroy(&osdc->msgpool_op);
 out_mempool:
        mempool_destroy(osdc->req_mempool);
+out_map:
+       ceph_osdmap_destroy(osdc->osdmap);
 out:
        return err;
 }
@@ -2719,11 +3947,25 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
        destroy_workqueue(osdc->notify_wq);
        cancel_delayed_work_sync(&osdc->timeout_work);
        cancel_delayed_work_sync(&osdc->osds_timeout_work);
-       if (osdc->osdmap) {
-               ceph_osdmap_destroy(osdc->osdmap);
-               osdc->osdmap = NULL;
+
+       down_write(&osdc->lock);
+       while (!RB_EMPTY_ROOT(&osdc->osds)) {
+               struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
+                                               struct ceph_osd, o_node);
+               close_osd(osd);
        }
-       remove_all_osds(osdc);
+       up_write(&osdc->lock);
+       WARN_ON(atomic_read(&osdc->homeless_osd.o_ref) != 1);
+       osd_cleanup(&osdc->homeless_osd);
+
+       WARN_ON(!list_empty(&osdc->osd_lru));
+       WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_requests));
+       WARN_ON(!RB_EMPTY_ROOT(&osdc->map_checks));
+       WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_map_checks));
+       WARN_ON(atomic_read(&osdc->num_requests));
+       WARN_ON(atomic_read(&osdc->num_homeless));
+
+       ceph_osdmap_destroy(osdc->osdmap);
        mempool_destroy(osdc->req_mempool);
        ceph_msgpool_destroy(&osdc->msgpool_op);
        ceph_msgpool_destroy(&osdc->msgpool_op_reply);
@@ -2752,15 +3994,12 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
                return PTR_ERR(req);
 
        /* it may be a short read due to an object boundary */
-
        osd_req_op_extent_osd_data_pages(req, 0,
                                pages, *plen, page_align, false, false);
 
        dout("readpages  final extent is %llu~%llu (%llu bytes align %d)\n",
             off, *plen, *plen, page_align);
 
-       ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
-
        rc = ceph_osdc_start_request(osdc, req, false);
        if (!rc)
                rc = ceph_osdc_wait_request(osdc, req);
@@ -2786,7 +4025,6 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
        int rc = 0;
        int page_align = off & ~PAGE_MASK;
 
-       BUG_ON(vino.snap != CEPH_NOSNAP);       /* snapshots aren't writeable */
        req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
                                    CEPH_OSD_OP_WRITE,
                                    CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
@@ -2800,8 +4038,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
                                false, false);
        dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
 
-       ceph_osdc_build_request(req, off, snapc, CEPH_NOSNAP, mtime);
-
+       req->r_mtime = *mtime;
        rc = ceph_osdc_start_request(osdc, req, true);
        if (!rc)
                rc = ceph_osdc_wait_request(osdc, req);
@@ -2841,19 +4078,15 @@ EXPORT_SYMBOL(ceph_osdc_cleanup);
 static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
 {
        struct ceph_osd *osd = con->private;
-       struct ceph_osd_client *osdc;
+       struct ceph_osd_client *osdc = osd->o_osdc;
        int type = le16_to_cpu(msg->hdr.type);
 
-       if (!osd)
-               goto out;
-       osdc = osd->o_osdc;
-
        switch (type) {
        case CEPH_MSG_OSD_MAP:
                ceph_osdc_handle_map(osdc, msg);
                break;
        case CEPH_MSG_OSD_OPREPLY:
-               handle_reply(osdc, msg);
+               handle_reply(osd, msg);
                break;
        case CEPH_MSG_WATCH_NOTIFY:
                handle_watch_notify(osdc, msg);
@@ -2863,7 +4096,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
                pr_err("received unknown message type %d %s\n", type,
                       ceph_msg_type_name(type));
        }
-out:
+
        ceph_msg_put(msg);
 }
 
@@ -2878,21 +4111,27 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 {
        struct ceph_osd *osd = con->private;
        struct ceph_osd_client *osdc = osd->o_osdc;
-       struct ceph_msg *m;
+       struct ceph_msg *m = NULL;
        struct ceph_osd_request *req;
        int front_len = le32_to_cpu(hdr->front_len);
        int data_len = le32_to_cpu(hdr->data_len);
-       u64 tid;
+       u64 tid = le64_to_cpu(hdr->tid);
+
+       down_read(&osdc->lock);
+       if (!osd_registered(osd)) {
+               dout("%s osd%d unknown, skipping\n", __func__, osd->o_osd);
+               *skip = 1;
+               goto out_unlock_osdc;
+       }
+       WARN_ON(osd->o_osd != le64_to_cpu(hdr->src.num));
 
-       tid = le64_to_cpu(hdr->tid);
-       mutex_lock(&osdc->request_mutex);
-       req = __lookup_request(osdc, tid);
+       mutex_lock(&osd->lock);
+       req = lookup_request(&osd->o_requests, tid);
        if (!req) {
                dout("%s osd%d tid %llu unknown, skipping\n", __func__,
                     osd->o_osd, tid);
-               m = NULL;
                *skip = 1;
-               goto out;
+               goto out_unlock_session;
        }
 
        ceph_msg_revoke_incoming(req->r_reply);
@@ -2904,7 +4143,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
                m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS,
                                 false);
                if (!m)
-                       goto out;
+                       goto out_unlock_session;
                ceph_msg_put(req->r_reply);
                req->r_reply = m;
        }
@@ -2915,14 +4154,49 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
                        req->r_reply->data_length);
                m = NULL;
                *skip = 1;
-               goto out;
+               goto out_unlock_session;
        }
 
        m = ceph_msg_get(req->r_reply);
        dout("get_reply tid %lld %p\n", tid, m);
 
-out:
-       mutex_unlock(&osdc->request_mutex);
+out_unlock_session:
+       mutex_unlock(&osd->lock);
+out_unlock_osdc:
+       up_read(&osdc->lock);
+       return m;
+}
+
+/*
+ * TODO: switch to a msg-owned pagelist
+ */
+static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr)
+{
+       struct ceph_msg *m;
+       int type = le16_to_cpu(hdr->type);
+       u32 front_len = le32_to_cpu(hdr->front_len);
+       u32 data_len = le32_to_cpu(hdr->data_len);
+
+       m = ceph_msg_new(type, front_len, GFP_NOIO, false);
+       if (!m)
+               return NULL;
+
+       if (data_len) {
+               struct page **pages;
+               struct ceph_osd_data osd_data;
+
+               pages = ceph_alloc_page_vector(calc_pages_for(0, data_len),
+                                              GFP_NOIO);
+               if (!pages) {
+                       ceph_msg_put(m);
+                       return NULL;
+               }
+
+               ceph_osd_data_pages_init(&osd_data, pages, data_len, 0, false,
+                                        false);
+               ceph_osdc_msg_data_add(m, &osd_data);
+       }
+
        return m;
 }
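Since the incoming messages that use the helper above can carry a data payload, it preallocates a page vector sized with calc_pages_for(0, data_len), which with a zero offset is simply DIV_ROUND_UP(data_len, PAGE_SIZE): a 10000-byte payload on 4 KiB pages gets 3 pages. As the alloc_msg() hunk below shows, it backs CEPH_MSG_OSD_MAP and CEPH_MSG_WATCH_NOTIFY replies.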
 
@@ -2932,18 +4206,17 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
 {
        struct ceph_osd *osd = con->private;
        int type = le16_to_cpu(hdr->type);
-       int front = le32_to_cpu(hdr->front_len);
 
        *skip = 0;
        switch (type) {
        case CEPH_MSG_OSD_MAP:
        case CEPH_MSG_WATCH_NOTIFY:
-               return ceph_msg_new(type, front, GFP_NOFS, false);
+               return alloc_msg_with_page_vector(hdr);
        case CEPH_MSG_OSD_OPREPLY:
                return get_reply(con, hdr, skip);
        default:
-               pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
-                       osd->o_osd);
+               pr_warn("%s osd%d unknown msg type %d, skipping\n", __func__,
+                       osd->o_osd, type);
                *skip = 1;
                return NULL;
        }
@@ -3047,5 +4320,5 @@ static const struct ceph_connection_operations osd_con_ops = {
        .alloc_msg = alloc_msg,
        .sign_message = osd_sign_message,
        .check_message_signature = osd_check_message_signature,
-       .fault = osd_reset,
+       .fault = osd_fault,
 };
index 243574c..03062bb 100644 (file)
@@ -380,23 +380,24 @@ bad:
        return ERR_PTR(err);
 }
 
-/*
- * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
- * to a set of osds) and primary_temp (explicit primary setting)
- */
-static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
+int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs)
 {
-       if (l.pool < r.pool)
+       if (lhs->pool < rhs->pool)
                return -1;
-       if (l.pool > r.pool)
+       if (lhs->pool > rhs->pool)
                return 1;
-       if (l.seed < r.seed)
+       if (lhs->seed < rhs->seed)
                return -1;
-       if (l.seed > r.seed)
+       if (lhs->seed > rhs->seed)
                return 1;
+
        return 0;
 }
 
+/*
+ * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
+ * to a set of osds) and primary_temp (explicit primary setting)
+ */
 static int __insert_pg_mapping(struct ceph_pg_mapping *new,
                               struct rb_root *root)
 {
@@ -409,7 +410,7 @@ static int __insert_pg_mapping(struct ceph_pg_mapping *new,
        while (*p) {
                parent = *p;
                pg = rb_entry(parent, struct ceph_pg_mapping, node);
-               c = pgid_cmp(new->pgid, pg->pgid);
+               c = ceph_pg_compare(&new->pgid, &pg->pgid);
                if (c < 0)
                        p = &(*p)->rb_left;
                else if (c > 0)
@@ -432,7 +433,7 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
 
        while (n) {
                pg = rb_entry(n, struct ceph_pg_mapping, node);
-               c = pgid_cmp(pgid, pg->pgid);
+               c = ceph_pg_compare(&pgid, &pg->pgid);
                if (c < 0) {
                        n = n->rb_left;
                } else if (c > 0) {
@@ -596,7 +597,9 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
        *p += 4;  /* skip crash_replay_interval */
 
        if (ev >= 7)
-               *p += 1;  /* skip min_size */
+               pi->min_size = ceph_decode_8(p);
+       else
+               pi->min_size = pi->size - pi->size / 2;
 
        if (ev >= 8)
                *p += 8 + 8;  /* skip quota_max_* */
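For pool encodings older than v7 there is no min_size on the wire, so the fallback pi->min_size = pi->size - pi->size / 2 above reproduces ceil(size / 2) with integer division: a 3-replica pool defaults to min_size 2, a 2-replica pool to min_size 1.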
@@ -616,6 +619,50 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
                pi->write_tier = -1;
        }
 
+       if (ev >= 10) {
+               /* skip properties */
+               num = ceph_decode_32(p);
+               while (num--) {
+                       len = ceph_decode_32(p);
+                       *p += len; /* key */
+                       len = ceph_decode_32(p);
+                       *p += len; /* val */
+               }
+       }
+
+       if (ev >= 11) {
+               /* skip hit_set_params */
+               *p += 1 + 1; /* versions */
+               len = ceph_decode_32(p);
+               *p += len;
+
+               *p += 4; /* skip hit_set_period */
+               *p += 4; /* skip hit_set_count */
+       }
+
+       if (ev >= 12)
+               *p += 4; /* skip stripe_width */
+
+       if (ev >= 13) {
+               *p += 8; /* skip target_max_bytes */
+               *p += 8; /* skip target_max_objects */
+               *p += 4; /* skip cache_target_dirty_ratio_micro */
+               *p += 4; /* skip cache_target_full_ratio_micro */
+               *p += 4; /* skip cache_min_flush_age */
+               *p += 4; /* skip cache_min_evict_age */
+       }
+
+       if (ev >= 14) {
+               /* skip erasure_code_profile */
+               len = ceph_decode_32(p);
+               *p += len;
+       }
+
+       if (ev >= 15)
+               pi->last_force_request_resend = ceph_decode_32(p);
+       else
+               pi->last_force_request_resend = 0;
+
        /* ignore the rest */
 
        *p = pool_end;
@@ -660,6 +707,23 @@ bad:
 /*
  * osd map
  */
+struct ceph_osdmap *ceph_osdmap_alloc(void)
+{
+       struct ceph_osdmap *map;
+
+       map = kzalloc(sizeof(*map), GFP_NOIO);
+       if (!map)
+               return NULL;
+
+       map->pg_pools = RB_ROOT;
+       map->pool_max = -1;
+       map->pg_temp = RB_ROOT;
+       map->primary_temp = RB_ROOT;
+       mutex_init(&map->crush_scratch_mutex);
+
+       return map;
+}
+
 void ceph_osdmap_destroy(struct ceph_osdmap *map)
 {
        dout("osdmap_destroy %p\n", map);
@@ -1183,14 +1247,10 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
        struct ceph_osdmap *map;
        int ret;
 
-       map = kzalloc(sizeof(*map), GFP_NOFS);
+       map = ceph_osdmap_alloc();
        if (!map)
                return ERR_PTR(-ENOMEM);
 
-       map->pg_temp = RB_ROOT;
-       map->primary_temp = RB_ROOT;
-       mutex_init(&map->crush_scratch_mutex);
-
        ret = osdmap_decode(p, end, map);
        if (ret) {
                ceph_osdmap_destroy(map);
@@ -1204,8 +1264,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
  * decode and apply an incremental map update.
  */
 struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
-                                            struct ceph_osdmap *map,
-                                            struct ceph_messenger *msgr)
+                                            struct ceph_osdmap *map)
 {
        struct crush_map *newcrush = NULL;
        struct ceph_fsid fsid;
@@ -1381,8 +1440,252 @@ bad:
        return ERR_PTR(err);
 }
 
+void ceph_oid_copy(struct ceph_object_id *dest,
+                  const struct ceph_object_id *src)
+{
+       WARN_ON(!ceph_oid_empty(dest));
+
+       if (src->name != src->inline_name) {
+               /* very rare, see ceph_object_id definition */
+               dest->name = kmalloc(src->name_len + 1,
+                                    GFP_NOIO | __GFP_NOFAIL);
+       }
+
+       memcpy(dest->name, src->name, src->name_len + 1);
+       dest->name_len = src->name_len;
+}
+EXPORT_SYMBOL(ceph_oid_copy);
+
+static __printf(2, 0)
+int oid_printf_vargs(struct ceph_object_id *oid, const char *fmt, va_list ap)
+{
+       int len;
+
+       WARN_ON(!ceph_oid_empty(oid));
+
+       len = vsnprintf(oid->inline_name, sizeof(oid->inline_name), fmt, ap);
+       if (len >= sizeof(oid->inline_name))
+               return len;
+
+       oid->name_len = len;
+       return 0;
+}
+
+/*
+ * If oid doesn't fit into inline buffer, BUG.
+ */
+void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...)
+{
+       va_list ap;
+
+       va_start(ap, fmt);
+       BUG_ON(oid_printf_vargs(oid, fmt, ap));
+       va_end(ap);
+}
+EXPORT_SYMBOL(ceph_oid_printf);
+
+static __printf(3, 0)
+int oid_aprintf_vargs(struct ceph_object_id *oid, gfp_t gfp,
+                     const char *fmt, va_list ap)
+{
+       va_list aq;
+       int len;
+
+       va_copy(aq, ap);
+       len = oid_printf_vargs(oid, fmt, aq);
+       va_end(aq);
+
+       if (len) {
+               char *external_name;
+
+               external_name = kmalloc(len + 1, gfp);
+               if (!external_name)
+                       return -ENOMEM;
+
+               oid->name = external_name;
+               WARN_ON(vsnprintf(oid->name, len + 1, fmt, ap) != len);
+               oid->name_len = len;
+       }
+
+       return 0;
+}
+
+/*
+ * If oid doesn't fit into inline buffer, allocate.
+ */
+int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
+                    const char *fmt, ...)
+{
+       va_list ap;
+       int ret;
+
+       va_start(ap, fmt);
+       ret = oid_aprintf_vargs(oid, gfp, fmt, ap);
+       va_end(ap);
+
+       return ret;
+}
+EXPORT_SYMBOL(ceph_oid_aprintf);
+
+void ceph_oid_destroy(struct ceph_object_id *oid)
+{
+       if (oid->name != oid->inline_name)
+               kfree(oid->name);
+}
+EXPORT_SYMBOL(ceph_oid_destroy);
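A minimal usage sketch of the ceph_object_id helpers above (illustrative only, not part of the patch; it assumes the ceph_oid_init() inline initializer added alongside these helpers in the same series):

        struct ceph_object_id oid;

        ceph_oid_init(&oid);            /* name points at the inline buffer */
        ceph_oid_printf(&oid, "%llx.%08llx", 1ull, 2ull); /* short names stay inline */
        /* ... use oid.name / oid.name_len ... */
        ceph_oid_destroy(&oid);         /* frees only externally allocated names */

ceph_oid_aprintf() is the variant to use when the name may not fit the inline buffer, since it can fall back to a gfp-controlled allocation.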
+
+/*
+ * osds only
+ */
+static bool __osds_equal(const struct ceph_osds *lhs,
+                        const struct ceph_osds *rhs)
+{
+       if (lhs->size == rhs->size &&
+           !memcmp(lhs->osds, rhs->osds, rhs->size * sizeof(rhs->osds[0])))
+               return true;
+
+       return false;
+}
+
+/*
+ * osds + primary
+ */
+static bool osds_equal(const struct ceph_osds *lhs,
+                      const struct ceph_osds *rhs)
+{
+       if (__osds_equal(lhs, rhs) &&
+           lhs->primary == rhs->primary)
+               return true;
+
+       return false;
+}
+
+static bool osds_valid(const struct ceph_osds *set)
+{
+       /* non-empty set */
+       if (set->size > 0 && set->primary >= 0)
+               return true;
+
+       /* empty can_shift_osds set */
+       if (!set->size && set->primary == -1)
+               return true;
+
+       /* empty !can_shift_osds set - all NONE */
+       if (set->size > 0 && set->primary == -1) {
+               int i;
+
+               for (i = 0; i < set->size; i++) {
+                       if (set->osds[i] != CRUSH_ITEM_NONE)
+                               break;
+               }
+               if (i == set->size)
+                       return true;
+       }
+
+       return false;
+}
+
+void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src)
+{
+       memcpy(dest->osds, src->osds, src->size * sizeof(src->osds[0]));
+       dest->size = src->size;
+       dest->primary = src->primary;
+}
+
+static bool is_split(const struct ceph_pg *pgid,
+                    u32 old_pg_num,
+                    u32 new_pg_num)
+{
+       int old_bits = calc_bits_of(old_pg_num);
+       int old_mask = (1 << old_bits) - 1;
+       int n;
+
+       WARN_ON(pgid->seed >= old_pg_num);
+       if (new_pg_num <= old_pg_num)
+               return false;
+
+       for (n = 1; ; n++) {
+               int next_bit = n << (old_bits - 1);
+               u32 s = next_bit | pgid->seed;
+
+               if (s < old_pg_num || s == pgid->seed)
+                       continue;
+               if (s >= new_pg_num)
+                       break;
+
+               s = ceph_stable_mod(s, old_pg_num, old_mask);
+               if (s == pgid->seed)
+                       return true;
+       }
+
+       return false;
+}
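A concrete instance of the split check above: growing a pool from pg_num 4 to 8, with seed 1. calc_bits_of(4) is 3, so the first candidate is s = 4 | 1 = 5; 5 lies inside the new pg_num, and ceph_stable_mod(5, 4, 7) folds it back to 1, so is_split() returns true: PG x.1 splits into children x.1 and x.5.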
+
+bool ceph_is_new_interval(const struct ceph_osds *old_acting,
+                         const struct ceph_osds *new_acting,
+                         const struct ceph_osds *old_up,
+                         const struct ceph_osds *new_up,
+                         int old_size,
+                         int new_size,
+                         int old_min_size,
+                         int new_min_size,
+                         u32 old_pg_num,
+                         u32 new_pg_num,
+                         bool old_sort_bitwise,
+                         bool new_sort_bitwise,
+                         const struct ceph_pg *pgid)
+{
+       return !osds_equal(old_acting, new_acting) ||
+              !osds_equal(old_up, new_up) ||
+              old_size != new_size ||
+              old_min_size != new_min_size ||
+              is_split(pgid, old_pg_num, new_pg_num) ||
+              old_sort_bitwise != new_sort_bitwise;
+}
+
+static int calc_pg_rank(int osd, const struct ceph_osds *acting)
+{
+       int i;
+
+       for (i = 0; i < acting->size; i++) {
+               if (acting->osds[i] == osd)
+                       return i;
+       }
+
+       return -1;
+}
+
+static bool primary_changed(const struct ceph_osds *old_acting,
+                           const struct ceph_osds *new_acting)
+{
+       if (!old_acting->size && !new_acting->size)
+               return false; /* both still empty */
 
+       if (!old_acting->size ^ !new_acting->size)
+               return true; /* was empty, now not, or vice versa */
 
+       if (old_acting->primary != new_acting->primary)
+               return true; /* primary changed */
+
+       if (calc_pg_rank(old_acting->primary, old_acting) !=
+           calc_pg_rank(new_acting->primary, new_acting))
+               return true;
+
+       return false; /* same primary (tho replicas may have changed) */
+}
+
+bool ceph_osds_changed(const struct ceph_osds *old_acting,
+                      const struct ceph_osds *new_acting,
+                      bool any_change)
+{
+       if (primary_changed(old_acting, new_acting))
+               return true;
+
+       if (any_change && !__osds_equal(old_acting, new_acting))
+               return true;
+
+       return false;
+}
 
 /*
  * calculate file layout from given offset, length.
@@ -1455,30 +1758,71 @@ invalid:
 EXPORT_SYMBOL(ceph_calc_file_object_mapping);
 
 /*
- * Calculate mapping of a (oloc, oid) pair to a PG.  Should only be
- * called with target's (oloc, oid), since tiering isn't taken into
- * account.
+ * Map an object into a PG.
+ *
+ * Should only be called with target_oid and target_oloc (as opposed to
+ * base_oid and base_oloc), since tiering isn't taken into account.
  */
-int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
-                       struct ceph_object_locator *oloc,
-                       struct ceph_object_id *oid,
-                       struct ceph_pg *pg_out)
+int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
+                             struct ceph_object_id *oid,
+                             struct ceph_object_locator *oloc,
+                             struct ceph_pg *raw_pgid)
 {
        struct ceph_pg_pool_info *pi;
 
-       pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool);
+       pi = ceph_pg_pool_by_id(osdmap, oloc->pool);
        if (!pi)
-               return -EIO;
+               return -ENOENT;
 
-       pg_out->pool = oloc->pool;
-       pg_out->seed = ceph_str_hash(pi->object_hash, oid->name,
-                                    oid->name_len);
+       raw_pgid->pool = oloc->pool;
+       raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name,
+                                      oid->name_len);
 
-       dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name,
-            pg_out->pool, pg_out->seed);
+       dout("%s %s -> raw_pgid %llu.%x\n", __func__, oid->name,
+            raw_pgid->pool, raw_pgid->seed);
        return 0;
 }
-EXPORT_SYMBOL(ceph_oloc_oid_to_pg);
+EXPORT_SYMBOL(ceph_object_locator_to_pg);
+
+/*
+ * Map a raw PG (full precision ps) into an actual PG.
+ */
+static void raw_pg_to_pg(struct ceph_pg_pool_info *pi,
+                        const struct ceph_pg *raw_pgid,
+                        struct ceph_pg *pgid)
+{
+       pgid->pool = raw_pgid->pool;
+       pgid->seed = ceph_stable_mod(raw_pgid->seed, pi->pg_num,
+                                    pi->pg_num_mask);
+}
+
+/*
+ * Map a raw PG (full precision ps) into a placement ps (placement
+ * seed).  Include pool id in that value so that different pools don't
+ * use the same seeds.
+ */
+static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi,
+                        const struct ceph_pg *raw_pgid)
+{
+       if (pi->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
+               /* hash pool id and seed so that pool PGs do not overlap */
+               return crush_hash32_2(CRUSH_HASH_RJENKINS1,
+                                     ceph_stable_mod(raw_pgid->seed,
+                                                     pi->pgp_num,
+                                                     pi->pgp_num_mask),
+                                     raw_pgid->pool);
+       } else {
+               /*
+                * legacy behavior: add ps and pool together.  this is
+                * not a great approach because the PGs from each pool
+                * will overlap on top of each other: 0.5 == 1.4 ==
+                * 2.3 == ...
+                */
+               return ceph_stable_mod(raw_pgid->seed, pi->pgp_num,
+                                      pi->pgp_num_mask) +
+                      (unsigned)raw_pgid->pool;
+       }
+}
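To make the legacy overlap concrete: with pgp_num = 8 (mask 7) in every pool, PG 0.5 yields pps 5 + 0 = 5, PG 1.4 yields 4 + 1 = 5 and PG 2.3 yields 3 + 2 = 5, which is exactly the 0.5 == 1.4 == 2.3 collision the comment warns about; the HASHPSPOOL branch avoids it by feeding the pool id through crush_hash32_2().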
 
 static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
                    int *result, int result_max,
@@ -1497,84 +1841,92 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
 }
 
 /*
- * Calculate raw (crush) set for given pgid.
+ * Calculate raw set (CRUSH output) for given PG.  The result may
+ * contain nonexistent OSDs.  ->primary is undefined for a raw set.
  *
- * Return raw set length, or error.
+ * Placement seed (CRUSH input) is returned through @ppps.
  */
-static int pg_to_raw_osds(struct ceph_osdmap *osdmap,
-                         struct ceph_pg_pool_info *pool,
-                         struct ceph_pg pgid, u32 pps, int *osds)
+static void pg_to_raw_osds(struct ceph_osdmap *osdmap,
+                          struct ceph_pg_pool_info *pi,
+                          const struct ceph_pg *raw_pgid,
+                          struct ceph_osds *raw,
+                          u32 *ppps)
 {
+       u32 pps = raw_pg_to_pps(pi, raw_pgid);
        int ruleno;
        int len;
 
-       /* crush */
-       ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset,
-                                pool->type, pool->size);
+       ceph_osds_init(raw);
+       if (ppps)
+               *ppps = pps;
+
+       ruleno = crush_find_rule(osdmap->crush, pi->crush_ruleset, pi->type,
+                                pi->size);
        if (ruleno < 0) {
                pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n",
-                      pgid.pool, pool->crush_ruleset, pool->type,
-                      pool->size);
-               return -ENOENT;
+                      pi->id, pi->crush_ruleset, pi->type, pi->size);
+               return;
        }
 
-       len = do_crush(osdmap, ruleno, pps, osds,
-                      min_t(int, pool->size, CEPH_PG_MAX_SIZE),
+       len = do_crush(osdmap, ruleno, pps, raw->osds,
+                      min_t(int, pi->size, ARRAY_SIZE(raw->osds)),
                       osdmap->osd_weight, osdmap->max_osd);
        if (len < 0) {
                pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
-                      len, ruleno, pgid.pool, pool->crush_ruleset,
-                      pool->type, pool->size);
-               return len;
+                      len, ruleno, pi->id, pi->crush_ruleset, pi->type,
+                      pi->size);
+               return;
        }
 
-       return len;
+       raw->size = len;
 }
 
 /*
- * Given raw set, calculate up set and up primary.
+ * Given raw set, calculate up set and up primary.  By definition of an
+ * up set, the result won't contain nonexistent or down OSDs.
  *
- * Return up set length.  *primary is set to up primary osd id, or -1
- * if up set is empty.
+ * This is done in-place - on return @set is the up set.  If it's
+ * empty, ->primary will remain undefined.
  */
-static int raw_to_up_osds(struct ceph_osdmap *osdmap,
-                         struct ceph_pg_pool_info *pool,
-                         int *osds, int len, int *primary)
+static void raw_to_up_osds(struct ceph_osdmap *osdmap,
+                          struct ceph_pg_pool_info *pi,
+                          struct ceph_osds *set)
 {
-       int up_primary = -1;
        int i;
 
-       if (ceph_can_shift_osds(pool)) {
+       /* ->primary is undefined for a raw set */
+       BUG_ON(set->primary != -1);
+
+       if (ceph_can_shift_osds(pi)) {
                int removed = 0;
 
-               for (i = 0; i < len; i++) {
-                       if (ceph_osd_is_down(osdmap, osds[i])) {
+               /* shift left */
+               for (i = 0; i < set->size; i++) {
+                       if (ceph_osd_is_down(osdmap, set->osds[i])) {
                                removed++;
                                continue;
                        }
                        if (removed)
-                               osds[i - removed] = osds[i];
+                               set->osds[i - removed] = set->osds[i];
                }
-
-               len -= removed;
-               if (len > 0)
-                       up_primary = osds[0];
+               set->size -= removed;
+               if (set->size > 0)
+                       set->primary = set->osds[0];
        } else {
-               for (i = len - 1; i >= 0; i--) {
-                       if (ceph_osd_is_down(osdmap, osds[i]))
-                               osds[i] = CRUSH_ITEM_NONE;
+               /* set down/dne devices to NONE */
+               for (i = set->size - 1; i >= 0; i--) {
+                       if (ceph_osd_is_down(osdmap, set->osds[i]))
+                               set->osds[i] = CRUSH_ITEM_NONE;
                        else
-                               up_primary = osds[i];
+                               set->primary = set->osds[i];
                }
        }
-
-       *primary = up_primary;
-       return len;
 }
 
-static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
-                                  struct ceph_pg_pool_info *pool,
-                                  int *osds, int len, int *primary)
+static void apply_primary_affinity(struct ceph_osdmap *osdmap,
+                                  struct ceph_pg_pool_info *pi,
+                                  u32 pps,
+                                  struct ceph_osds *up)
 {
        int i;
        int pos = -1;
@@ -1586,8 +1938,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
        if (!osdmap->osd_primary_affinity)
                return;
 
-       for (i = 0; i < len; i++) {
-               int osd = osds[i];
+       for (i = 0; i < up->size; i++) {
+               int osd = up->osds[i];
 
                if (osd != CRUSH_ITEM_NONE &&
                    osdmap->osd_primary_affinity[osd] !=
@@ -1595,7 +1947,7 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
                        break;
                }
        }
-       if (i == len)
+       if (i == up->size)
                return;
 
        /*
@@ -1603,8 +1955,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
         * osd into the hash/rng so that a proportional fraction of an
         * osd's pgs get rejected as primary.
         */
-       for (i = 0; i < len; i++) {
-               int osd = osds[i];
+       for (i = 0; i < up->size; i++) {
+               int osd = up->osds[i];
                u32 aff;
 
                if (osd == CRUSH_ITEM_NONE)
@@ -1629,135 +1981,110 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
        if (pos < 0)
                return;
 
-       *primary = osds[pos];
+       up->primary = up->osds[pos];
 
-       if (ceph_can_shift_osds(pool) && pos > 0) {
+       if (ceph_can_shift_osds(pi) && pos > 0) {
                /* move the new primary to the front */
                for (i = pos; i > 0; i--)
-                       osds[i] = osds[i - 1];
-               osds[0] = *primary;
+                       up->osds[i] = up->osds[i - 1];
+               up->osds[0] = up->primary;
        }
 }
 
 /*
- * Given up set, apply pg_temp and primary_temp mappings.
+ * Get pg_temp and primary_temp mappings for given PG.
  *
- * Return acting set length.  *primary is set to acting primary osd id,
- * or -1 if acting set is empty.
+ * Note that a PG may have none, only pg_temp, only primary_temp or
+ * both pg_temp and primary_temp mappings.  This means @temp isn't
+ * always a valid OSD set on return: in the "only primary_temp" case,
+ * @temp will have its ->primary >= 0 but ->size == 0.
  */
-static int apply_temps(struct ceph_osdmap *osdmap,
-                      struct ceph_pg_pool_info *pool, struct ceph_pg pgid,
-                      int *osds, int len, int *primary)
+static void get_temp_osds(struct ceph_osdmap *osdmap,
+                         struct ceph_pg_pool_info *pi,
+                         const struct ceph_pg *raw_pgid,
+                         struct ceph_osds *temp)
 {
+       struct ceph_pg pgid;
        struct ceph_pg_mapping *pg;
-       int temp_len;
-       int temp_primary;
        int i;
 
-       /* raw_pg -> pg */
-       pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num,
-                                   pool->pg_num_mask);
+       raw_pg_to_pg(pi, raw_pgid, &pgid);
+       ceph_osds_init(temp);
 
        /* pg_temp? */
        pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
        if (pg) {
-               temp_len = 0;
-               temp_primary = -1;
-
                for (i = 0; i < pg->pg_temp.len; i++) {
                        if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
-                               if (ceph_can_shift_osds(pool))
+                               if (ceph_can_shift_osds(pi))
                                        continue;
-                               else
-                                       osds[temp_len++] = CRUSH_ITEM_NONE;
+
+                               temp->osds[temp->size++] = CRUSH_ITEM_NONE;
                        } else {
-                               osds[temp_len++] = pg->pg_temp.osds[i];
+                               temp->osds[temp->size++] = pg->pg_temp.osds[i];
                        }
                }
 
                /* apply pg_temp's primary */
-               for (i = 0; i < temp_len; i++) {
-                       if (osds[i] != CRUSH_ITEM_NONE) {
-                               temp_primary = osds[i];
+               for (i = 0; i < temp->size; i++) {
+                       if (temp->osds[i] != CRUSH_ITEM_NONE) {
+                               temp->primary = temp->osds[i];
                                break;
                        }
                }
-       } else {
-               temp_len = len;
-               temp_primary = *primary;
        }
 
        /* primary_temp? */
        pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid);
        if (pg)
-               temp_primary = pg->primary_temp.osd;
-
-       *primary = temp_primary;
-       return temp_len;
+               temp->primary = pg->primary_temp.osd;
 }
 
 /*
- * Calculate acting set for given pgid.
+ * Map a PG to its acting set as well as its up set.
  *
- * Return acting set length, or error.  *primary is set to acting
- * primary osd id, or -1 if acting set is empty or on error.
+ * Acting set is used for data mapping purposes, while up set can be
+ * recorded for detecting interval changes and deciding whether to
+ * resend a request.
  */
-int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
-                       int *osds, int *primary)
+void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
+                              const struct ceph_pg *raw_pgid,
+                              struct ceph_osds *up,
+                              struct ceph_osds *acting)
 {
-       struct ceph_pg_pool_info *pool;
+       struct ceph_pg_pool_info *pi;
        u32 pps;
-       int len;
 
-       pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
-       if (!pool) {
-               *primary = -1;
-               return -ENOENT;
+       pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool);
+       if (!pi) {
+               ceph_osds_init(up);
+               ceph_osds_init(acting);
+               goto out;
        }
 
-       if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
-               /* hash pool id and seed so that pool PGs do not overlap */
-               pps = crush_hash32_2(CRUSH_HASH_RJENKINS1,
-                                    ceph_stable_mod(pgid.seed, pool->pgp_num,
-                                                    pool->pgp_num_mask),
-                                    pgid.pool);
-       } else {
-               /*
-                * legacy behavior: add ps and pool together.  this is
-                * not a great approach because the PGs from each pool
-                * will overlap on top of each other: 0.5 == 1.4 ==
-                * 2.3 == ...
-                */
-               pps = ceph_stable_mod(pgid.seed, pool->pgp_num,
-                                     pool->pgp_num_mask) +
-                       (unsigned)pgid.pool;
-       }
-
-       len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds);
-       if (len < 0) {
-               *primary = -1;
-               return len;
+       pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps);
+       raw_to_up_osds(osdmap, pi, up);
+       apply_primary_affinity(osdmap, pi, pps, up);
+       get_temp_osds(osdmap, pi, raw_pgid, acting);
+       if (!acting->size) {
+               memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0]));
+               acting->size = up->size;
+               if (acting->primary == -1)
+                       acting->primary = up->primary;
        }
-
-       len = raw_to_up_osds(osdmap, pool, osds, len, primary);
-
-       apply_primary_affinity(osdmap, pps, pool, osds, len, primary);
-
-       len = apply_temps(osdmap, pool, pgid, osds, len, primary);
-
-       return len;
+out:
+       WARN_ON(!osds_valid(up) || !osds_valid(acting));
 }
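Putting the new mapping helpers together, a minimal sketch of how a caller resolves an object to its acting primary with this API (illustrative only; osdmap, oid and oloc are assumed to be set up by the caller):

        struct ceph_pg raw_pgid;
        struct ceph_osds up, acting;
        int ret;

        ret = ceph_object_locator_to_pg(osdmap, &oid, &oloc, &raw_pgid);
        if (ret)
                return ret;     /* -ENOENT: pool does not exist */

        ceph_pg_to_up_acting_osds(osdmap, &raw_pgid, &up, &acting);
        /* acting.primary is the OSD to address, or -1 if the PG maps nowhere */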
 
 /*
- * Return primary osd for given pgid, or -1 if none.
+ * Return acting primary for given PG, or -1 if none.
  */
-int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
+int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
+                             const struct ceph_pg *raw_pgid)
 {
-       int osds[CEPH_PG_MAX_SIZE];
-       int primary;
-
-       ceph_calc_pg_acting(osdmap, pgid, osds, &primary);
+       struct ceph_osds up, acting;
 
-       return primary;
+       ceph_pg_to_up_acting_osds(osdmap, raw_pgid, &up, &acting);
+       return acting.primary;
 }
-EXPORT_SYMBOL(ceph_calc_pg_primary);
+EXPORT_SYMBOL(ceph_pg_to_acting_primary);
index 941c284..2cab489 100644 (file)
@@ -55,18 +55,21 @@ int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp)
        spin_lock_irqsave(&bm_pool->lock, flags);
        if (bm_pool->buf_num == bm_pool->size) {
                pr_warn("pool already filled\n");
+               spin_unlock_irqrestore(&bm_pool->lock, flags);
                return bm_pool->buf_num;
        }
 
        if (buf_num + bm_pool->buf_num > bm_pool->size) {
                pr_warn("cannot allocate %d buffers for pool\n",
                        buf_num);
+               spin_unlock_irqrestore(&bm_pool->lock, flags);
                return 0;
        }
 
        if ((buf_num + bm_pool->buf_num) < bm_pool->buf_num) {
                pr_warn("Adding %d buffers to the %d current buffers will overflow\n",
                        buf_num,  bm_pool->buf_num);
+               spin_unlock_irqrestore(&bm_pool->lock, flags);
                return 0;
        }
 
index 8604ae2..8b02df0 100644 (file)
@@ -2245,10 +2245,8 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
        hrtimer_set_expires(&t.timer, spin_until);
 
        remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer));
-       if (remaining <= 0) {
-               pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay);
-               return;
-       }
+       if (remaining <= 0)
+               goto out;
 
        start_time = ktime_get();
        if (remaining < 100000) {
@@ -2273,7 +2271,9 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
        }
 
        pkt_dev->idle_acc += ktime_to_ns(ktime_sub(end_time, start_time));
+out:
        pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay);
+       destroy_hrtimer_on_stack(&t.timer);
 }
 
 static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev)
index ca207db..116187b 100644 (file)
@@ -1289,8 +1289,8 @@ ieee802154_llsec_parse_dev_addr(struct nlattr *nla,
                                     nl802154_dev_addr_policy))
                return -EINVAL;
 
-       if (!attrs[NL802154_DEV_ADDR_ATTR_PAN_ID] &&
-           !attrs[NL802154_DEV_ADDR_ATTR_MODE] &&
+       if (!attrs[NL802154_DEV_ADDR_ATTR_PAN_ID] ||
+           !attrs[NL802154_DEV_ADDR_ATTR_MODE] ||
            !(attrs[NL802154_DEV_ADDR_ATTR_SHORT] ||
              attrs[NL802154_DEV_ADDR_ATTR_EXTENDED]))
                return -EINVAL;
index 377424e..d39e9e4 100644 (file)
@@ -1681,6 +1681,14 @@ static __net_init int inet_init_net(struct net *net)
         */
        net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1);
        net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0);
+
+       /* Default values for sysctl-controlled parameters.
+        * We set them here, in case sysctl is not compiled.
+        */
+       net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
+       net->ipv4.sysctl_ip_dynaddr = 0;
+       net->ipv4.sysctl_ip_early_demux = 1;
+
        return 0;
 }
 
index bb04195..1cb67de 100644 (file)
@@ -999,10 +999,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
        if (!net->ipv4.sysctl_local_reserved_ports)
                goto err_ports;
 
-       net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
-       net->ipv4.sysctl_ip_dynaddr = 0;
-       net->ipv4.sysctl_ip_early_demux = 1;
-
        return 0;
 
 err_ports:
index 3f84113..2343e4f 100644 (file)
@@ -232,6 +232,15 @@ config IPV6_GRE
 
          Saying M here will produce a module called ip6_gre. If unsure, say N.
 
+config IPV6_FOU
+       tristate
+       default NET_FOU && IPV6
+
+config IPV6_FOU_TUNNEL
+       tristate
+       default NET_FOU_IP_TUNNELS && IPV6_FOU
+       select IPV6_TUNNEL
+
 config IPV6_MULTIPLE_TABLES
        bool "IPv6: Multiple Routing Tables"
        select FIB_RULES
index 7ec3129..6d8ea09 100644 (file)
@@ -42,7 +42,7 @@ obj-$(CONFIG_IPV6_VTI) += ip6_vti.o
 obj-$(CONFIG_IPV6_SIT) += sit.o
 obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o
 obj-$(CONFIG_IPV6_GRE) += ip6_gre.o
-obj-$(CONFIG_NET_FOU) += fou6.o
+obj-$(CONFIG_IPV6_FOU) += fou6.o
 
 obj-y += addrconf_core.o exthdrs_core.o ip6_checksum.o ip6_icmp.o
 obj-$(CONFIG_INET) += output_core.o protocol.o $(ipv6-offload)
index c972d0b..9ea249b 100644 (file)
@@ -69,7 +69,7 @@ int gue6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
 }
 EXPORT_SYMBOL(gue6_build_header);
 
-#ifdef CONFIG_NET_FOU_IP_TUNNELS
+#if IS_ENABLED(CONFIG_IPV6_FOU_TUNNEL)
 
 static const struct ip6_tnl_encap_ops fou_ip6tun_ops = {
        .encap_hlen = fou_encap_hlen,
index af503f5..f4ac284 100644 (file)
@@ -712,6 +712,7 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu)
        fl6->daddr = p->raddr;
        fl6->flowi6_oif = p->link;
        fl6->flowlabel = 0;
+       fl6->flowi6_proto = IPPROTO_GRE;
 
        if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS))
                fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo;
@@ -1027,6 +1028,8 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
 
        dev->hard_header_len = LL_MAX_HEADER + t_hlen;
        dev->mtu = ETH_DATA_LEN - t_hlen;
+       if (dev->type == ARPHRD_ETHER)
+               dev->mtu -= ETH_HLEN;
        if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
                dev->mtu -= 8;
 
index c6f5df1..6c54e03 100644 (file)
@@ -128,6 +128,7 @@ static inline struct sock *l2tp_ip6_bind_lookup(struct net *net,
  */
 static int l2tp_ip6_recv(struct sk_buff *skb)
 {
+       struct net *net = dev_net(skb->dev);
        struct sock *sk;
        u32 session_id;
        u32 tunnel_id;
@@ -154,7 +155,7 @@ static int l2tp_ip6_recv(struct sk_buff *skb)
        }
 
        /* Ok, this is a data packet. Lookup the session. */
-       session = l2tp_session_find(&init_net, NULL, session_id);
+       session = l2tp_session_find(net, NULL, session_id);
        if (session == NULL)
                goto discard;
 
@@ -188,14 +189,14 @@ pass_up:
                goto discard;
 
        tunnel_id = ntohl(*(__be32 *) &skb->data[4]);
-       tunnel = l2tp_tunnel_find(&init_net, tunnel_id);
+       tunnel = l2tp_tunnel_find(net, tunnel_id);
        if (tunnel != NULL)
                sk = tunnel->sock;
        else {
                struct ipv6hdr *iph = ipv6_hdr(skb);
 
                read_lock_bh(&l2tp_ip6_lock);
-               sk = __l2tp_ip6_bind_lookup(&init_net, &iph->daddr,
+               sk = __l2tp_ip6_bind_lookup(net, &iph->daddr,
                                            0, tunnel_id);
                read_unlock_bh(&l2tp_ip6_lock);
        }
@@ -263,6 +264,7 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct sockaddr_l2tpip6 *addr = (struct sockaddr_l2tpip6 *) uaddr;
+       struct net *net = sock_net(sk);
        __be32 v4addr = 0;
        int addr_type;
        int err;
@@ -286,7 +288,7 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 
        err = -EADDRINUSE;
        read_lock_bh(&l2tp_ip6_lock);
-       if (__l2tp_ip6_bind_lookup(&init_net, &addr->l2tp_addr,
+       if (__l2tp_ip6_bind_lookup(net, &addr->l2tp_addr,
                                   sk->sk_bound_dev_if, addr->l2tp_conn_id))
                goto out_in_use;
        read_unlock_bh(&l2tp_ip6_lock);
@@ -456,7 +458,7 @@ static int l2tp_ip6_backlog_recv(struct sock *sk, struct sk_buff *skb)
        return 0;
 
 drop:
-       IP_INC_STATS(&init_net, IPSTATS_MIB_INDISCARDS);
+       IP_INC_STATS(sock_net(sk), IPSTATS_MIB_INDISCARDS);
        kfree_skb(skb);
        return -1;
 }
index 5dba899..1824708 100644 (file)
@@ -444,10 +444,9 @@ static void lapb_state3_machine(struct lapb_cb *lapb, struct sk_buff *skb,
                break;
 
        case LAPB_FRMR:
-               lapb_dbg(1, "(%p) S3 RX FRMR(%d) %02X %02X %02X %02X %02X\n",
+               lapb_dbg(1, "(%p) S3 RX FRMR(%d) %5ph\n",
                         lapb->dev, frame->pf,
-                        skb->data[0], skb->data[1], skb->data[2],
-                        skb->data[3], skb->data[4]);
+                        skb->data);
                lapb_establish_data_link(lapb);
                lapb_dbg(0, "(%p) S3 -> S1\n", lapb->dev);
                lapb_requeue_frames(lapb);
index ba4d015..482c94d 100644 (file)
@@ -148,9 +148,7 @@ void lapb_transmit_buffer(struct lapb_cb *lapb, struct sk_buff *skb, int type)
                }
        }
 
-       lapb_dbg(2, "(%p) S%d TX %02X %02X %02X\n",
-                lapb->dev, lapb->state,
-                skb->data[0], skb->data[1], skb->data[2]);
+       lapb_dbg(2, "(%p) S%d TX %3ph\n", lapb->dev, lapb->state, skb->data);
 
        if (!lapb_data_transmit(lapb, skb))
                kfree_skb(skb);
index 9d0a426..3c1914d 100644 (file)
@@ -113,9 +113,7 @@ int lapb_decode(struct lapb_cb *lapb, struct sk_buff *skb,
 {
        frame->type = LAPB_ILLEGAL;
 
-       lapb_dbg(2, "(%p) S%d RX %02X %02X %02X\n",
-                lapb->dev, lapb->state,
-                skb->data[0], skb->data[1], skb->data[2]);
+       lapb_dbg(2, "(%p) S%d RX %3ph\n", lapb->dev, lapb->state, skb->data);
 
        /* We always need to look at 2 bytes, sometimes we need
         * to look at 3 and those cases are handled below.
@@ -284,10 +282,9 @@ void lapb_transmit_frmr(struct lapb_cb *lapb)
                dptr++;
                *dptr++ = lapb->frmr_type;
 
-               lapb_dbg(1, "(%p) S%d TX FRMR %02X %02X %02X %02X %02X\n",
+               lapb_dbg(1, "(%p) S%d TX FRMR %5ph\n",
                         lapb->dev, lapb->state,
-                        skb->data[1], skb->data[2], skb->data[3],
-                        skb->data[4], skb->data[5]);
+                        &skb->data[1]);
        } else {
                dptr    = skb_put(skb, 4);
                *dptr++ = LAPB_FRMR;
@@ -299,9 +296,8 @@ void lapb_transmit_frmr(struct lapb_cb *lapb)
                dptr++;
                *dptr++ = lapb->frmr_type;
 
-               lapb_dbg(1, "(%p) S%d TX FRMR %02X %02X %02X\n",
-                        lapb->dev, lapb->state, skb->data[1],
-                        skb->data[2], skb->data[3]);
+               lapb_dbg(1, "(%p) S%d TX FRMR %3ph\n",
+                        lapb->dev, lapb->state, &skb->data[1]);
        }
 
        lapb_transmit_buffer(lapb, skb, LAPB_RESPONSE);
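The lapb logging changes above rely on the kernel's %*ph printk extension, which dumps a small buffer as space-separated hex bytes, for example:

        u8 hdr[3] = { 0x01, 0x3f, 0x73 };

        printk(KERN_DEBUG "RX %3ph\n", hdr);    /* prints "RX 01 3f 73" */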
index 879185f..9a3eb7a 100644 (file)
@@ -137,11 +137,23 @@ static bool is_flow_key_valid(const struct sw_flow_key *key)
        return !!key->eth.type;
 }
 
+static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr,
+                            __be16 ethertype)
+{
+       if (skb->ip_summed == CHECKSUM_COMPLETE) {
+               __be16 diff[] = { ~(hdr->h_proto), ethertype };
+
+               skb->csum = ~csum_partial((char *)diff, sizeof(diff),
+                                       ~skb->csum);
+       }
+
+       hdr->h_proto = ethertype;
+}
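The CHECKSUM_COMPLETE branch above is the standard incremental checksum update of RFC 1624: when a 16-bit field changes from m to m', the new sum is HC' = ~(~HC + ~m + m'), which is what csum_partial() over the { ~old, new } pair, seeded with ~skb->csum, computes before the final complement.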
+
 static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
                     const struct ovs_action_push_mpls *mpls)
 {
        __be32 *new_mpls_lse;
-       struct ethhdr *hdr;
 
        /* Networking stack do not allow simultaneous Tunnel and MPLS GSO. */
        if (skb->encapsulation)
@@ -160,9 +172,7 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
 
        skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN);
 
-       hdr = eth_hdr(skb);
-       hdr->h_proto = mpls->mpls_ethertype;
-
+       update_ethertype(skb, eth_hdr(skb), mpls->mpls_ethertype);
        if (!skb->inner_protocol)
                skb_set_inner_protocol(skb, skb->protocol);
        skb->protocol = mpls->mpls_ethertype;
@@ -193,7 +203,7 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key,
         * field correctly in the presence of VLAN tags.
         */
        hdr = (struct ethhdr *)(skb_mpls_header(skb) - ETH_HLEN);
-       hdr->h_proto = ethertype;
+       update_ethertype(skb, hdr, ethertype);
        if (eth_p_mpls(skb->protocol))
                skb->protocol = ethertype;
 
index 330f14e..b884dae 100644 (file)
@@ -239,6 +239,8 @@ override:
        police->tcfp_t_c = ktime_get_ns();
        police->tcf_index = parm->index ? parm->index :
                tcf_hash_new_index(tn);
+       police->tcf_tm.install = jiffies;
+       police->tcf_tm.lastuse = jiffies;
        h = tcf_hash(police->tcf_index, POL_TAB_MASK);
        spin_lock_bh(&hinfo->lock);
        hlist_add_head(&police->tcf_head, &hinfo->htab[h]);
@@ -268,6 +270,7 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a,
        spin_lock(&police->tcf_lock);
 
        bstats_update(&police->tcf_bstats, skb);
+       tcf_lastuse_update(&police->tcf_tm);
 
        if (police->tcfp_ewma_rate &&
            police->tcf_rate_est.bps >= police->tcfp_ewma_rate) {
@@ -327,6 +330,7 @@ tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
                .refcnt = police->tcf_refcnt - ref,
                .bindcnt = police->tcf_bindcnt - bind,
        };
+       struct tcf_t t;
 
        if (police->rate_present)
                psched_ratecfg_getrate(&opt.rate, &police->rate);
@@ -340,6 +344,13 @@ tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
        if (police->tcfp_ewma_rate &&
            nla_put_u32(skb, TCA_POLICE_AVRATE, police->tcfp_ewma_rate))
                goto nla_put_failure;
+
+       t.install = jiffies_to_clock_t(jiffies - police->tcf_tm.install);
+       t.lastuse = jiffies_to_clock_t(jiffies - police->tcf_tm.lastuse);
+       t.expires = jiffies_to_clock_t(police->tcf_tm.expires);
+       if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD))
+               goto nla_put_failure;
+
        return skb->len;
 
 nla_put_failure:
index 64f71a2..ddf047d 100644 (file)
@@ -607,6 +607,10 @@ void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires, bool thr
        if (throttle)
                qdisc_throttled(wd->qdisc);
 
+       if (wd->last_expires == expires)
+               return;
+
+       wd->last_expires = expires;
        hrtimer_start(&wd->timer,
                      ns_to_ktime(expires),
                      HRTIMER_MODE_ABS_PINNED);
index f6bf581..d4b4218 100644 (file)
@@ -928,17 +928,10 @@ ok:
                }
        }
        qdisc_qstats_overlimit(sch);
-       if (likely(next_event > q->now)) {
-               if (!test_bit(__QDISC_STATE_DEACTIVATED,
-                             &qdisc_root_sleeping(q->watchdog.qdisc)->state)) {
-                       ktime_t time = ns_to_ktime(next_event);
-                       qdisc_throttled(q->watchdog.qdisc);
-                       hrtimer_start(&q->watchdog.timer, time,
-                                     HRTIMER_MODE_ABS_PINNED);
-               }
-       } else {
+       if (likely(next_event > q->now))
+               qdisc_watchdog_schedule_ns(&q->watchdog, next_event, true);
+       else
                schedule_work(&q->work);
-       }
 fin:
        return skb;
 }
index 8e3e769..1ce724b 100644 (file)
@@ -356,6 +356,9 @@ static int sctp_ep_dump(struct sctp_endpoint *ep, void *p)
        if (cb->args[4] < cb->args[1])
                goto next;
 
+       if ((r->idiag_states & ~TCPF_LISTEN) && !list_empty(&ep->asocs))
+               goto next;
+
        if (r->sdiag_family != AF_UNSPEC &&
            sk->sk_family != r->sdiag_family)
                goto next;
index 777d032..67154b8 100644 (file)
@@ -4220,6 +4220,7 @@ int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
                info->sctpi_s_disable_fragments = sp->disable_fragments;
                info->sctpi_s_v4mapped = sp->v4mapped;
                info->sctpi_s_frag_interleave = sp->frag_interleave;
+               info->sctpi_s_type = sp->type;
 
                return 0;
        }
index 02f5367..040ff62 100644 (file)
@@ -543,7 +543,7 @@ rpcauth_cache_enforce_limit(void)
  */
 struct rpc_cred *
 rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
-               int flags)
+               int flags, gfp_t gfp)
 {
        LIST_HEAD(free);
        struct rpc_cred_cache *cache = auth->au_credcache;
@@ -580,7 +580,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
        if (flags & RPCAUTH_LOOKUP_RCU)
                return ERR_PTR(-ECHILD);
 
-       new = auth->au_ops->crcreate(auth, acred, flags);
+       new = auth->au_ops->crcreate(auth, acred, flags, gfp);
        if (IS_ERR(new)) {
                cred = new;
                goto out;
@@ -703,8 +703,7 @@ rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags)
                new = rpcauth_bind_new_cred(task, lookupflags);
        if (IS_ERR(new))
                return PTR_ERR(new);
-       if (req->rq_cred != NULL)
-               put_rpccred(req->rq_cred);
+       put_rpccred(req->rq_cred);
        req->rq_cred = new;
        return 0;
 }
@@ -712,6 +711,8 @@ rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags)
 void
 put_rpccred(struct rpc_cred *cred)
 {
+       if (cred == NULL)
+               return;
        /* Fast path for unhashed credentials */
        if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) == 0) {
                if (atomic_dec_and_test(&cred->cr_count))
index 41248b1..54dd3fd 100644 (file)
@@ -38,6 +38,13 @@ struct rpc_cred *rpc_lookup_cred(void)
 }
 EXPORT_SYMBOL_GPL(rpc_lookup_cred);
 
+struct rpc_cred *
+rpc_lookup_generic_cred(struct auth_cred *acred, int flags, gfp_t gfp)
+{
+       return rpcauth_lookup_credcache(&generic_auth, acred, flags, gfp);
+}
+EXPORT_SYMBOL_GPL(rpc_lookup_generic_cred);
+
 struct rpc_cred *rpc_lookup_cred_nonblock(void)
 {
        return rpcauth_lookupcred(&generic_auth, RPCAUTH_LOOKUP_RCU);
@@ -77,15 +84,15 @@ static struct rpc_cred *generic_bind_cred(struct rpc_task *task,
 static struct rpc_cred *
 generic_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 {
-       return rpcauth_lookup_credcache(&generic_auth, acred, flags);
+       return rpcauth_lookup_credcache(&generic_auth, acred, flags, GFP_KERNEL);
 }
 
 static struct rpc_cred *
-generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp)
 {
        struct generic_cred *gcred;
 
-       gcred = kmalloc(sizeof(*gcred), GFP_KERNEL);
+       gcred = kmalloc(sizeof(*gcred), gfp);
        if (gcred == NULL)
                return ERR_PTR(-ENOMEM);
 
index 15612ff..e64ae93 100644 (file)
@@ -1299,11 +1299,11 @@ gss_destroy_cred(struct rpc_cred *cred)
 static struct rpc_cred *
 gss_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 {
-       return rpcauth_lookup_credcache(auth, acred, flags);
+       return rpcauth_lookup_credcache(auth, acred, flags, GFP_NOFS);
 }
 
 static struct rpc_cred *
-gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp)
 {
        struct gss_auth *gss_auth = container_of(auth, struct gss_auth, rpc_auth);
        struct gss_cred *cred = NULL;
@@ -1313,7 +1313,7 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
                __func__, from_kuid(&init_user_ns, acred->uid),
                auth->au_flavor);
 
-       if (!(cred = kzalloc(sizeof(*cred), GFP_NOFS)))
+       if (!(cred = kzalloc(sizeof(*cred), gfp)))
                goto out_err;
 
        rpcauth_init_cred(&cred->gc_base, acred, auth, &gss_credops);
index 1095be9..e085f5a 100644 (file)
@@ -569,10 +569,9 @@ gss_svc_searchbyctx(struct cache_detail *cd, struct xdr_netobj *handle)
        struct rsc *found;
 
        memset(&rsci, 0, sizeof(rsci));
-       if (dup_to_netobj(&rsci.handle, handle->data, handle->len))
-               return NULL;
+       rsci.handle.data = handle->data;
+       rsci.handle.len = handle->len;
        found = rsc_lookup(cd, &rsci);
-       rsc_free(&rsci);
        if (!found)
                return NULL;
        if (cache_check(cd, &found->h, NULL))
@@ -857,8 +856,8 @@ unwrap_integ_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct g
                goto out;
        if (svc_getnl(&buf->head[0]) != seq)
                goto out;
-       /* trim off the mic at the end before returning */
-       xdr_buf_trim(buf, mic.len + 4);
+       /* trim off the mic and padding at the end before returning */
+       xdr_buf_trim(buf, round_up_to_quad(mic.len) + 4);
        stat = 0;
 out:
        kfree(mic.data);
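round_up_to_quad() pads the MIC length to XDR's 4-byte alignment before trimming, and the extra 4 bytes account for the on-the-wire length word: a 37-byte MIC now trims 40 + 4 = 44 bytes, where the old code trimmed only 41 and left the padding behind.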
index 0d3dd36..9f65452 100644 (file)
@@ -52,11 +52,11 @@ unx_destroy(struct rpc_auth *auth)
 static struct rpc_cred *
 unx_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 {
-       return rpcauth_lookup_credcache(auth, acred, flags);
+       return rpcauth_lookup_credcache(auth, acred, flags, GFP_NOFS);
 }
 
 static struct rpc_cred *
-unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp)
 {
        struct unx_cred *cred;
        unsigned int groups = 0;
@@ -66,7 +66,7 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
                        from_kuid(&init_user_ns, acred->uid),
                        from_kgid(&init_user_ns, acred->gid));
 
-       if (!(cred = kmalloc(sizeof(*cred), GFP_NOFS)))
+       if (!(cred = kmalloc(sizeof(*cred), gfp)))
                return ERR_PTR(-ENOMEM);
 
        rpcauth_init_cred(&cred->uc_base, acred, auth, &unix_credops);
index 7e0c9bf..06b4df9 100644 (file)
@@ -1413,6 +1413,23 @@ size_t rpc_max_payload(struct rpc_clnt *clnt)
 }
 EXPORT_SYMBOL_GPL(rpc_max_payload);
 
+/**
+ * rpc_max_bc_payload - Get maximum backchannel payload size, in bytes
+ * @clnt: RPC client to query
+ */
+size_t rpc_max_bc_payload(struct rpc_clnt *clnt)
+{
+       struct rpc_xprt *xprt;
+       size_t ret;
+
+       rcu_read_lock();
+       xprt = rcu_dereference(clnt->cl_xprt);
+       ret = xprt->ops->bc_maxpayload(xprt);
+       rcu_read_unlock();
+       return ret;
+}
+EXPORT_SYMBOL_GPL(rpc_max_bc_payload);
+
 /**
  * rpc_get_timeout - Get timeout for transport in units of HZ
  * @clnt: RPC client to query
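
rpc_max_bc_payload() gives upper layers a transport-specific ceiling for backchannel messages instead of a hard-coded guess. A hedged sketch of how a caller might use it; the structure and function names below are illustrative assumptions, not code from this series:

	/* Hypothetical caller: size backchannel session attributes from the
	 * transport's advertised limit rather than a fixed constant.
	 */
	struct example_bc_attrs {
		size_t max_rqst_sz;
		size_t max_resp_sz;
	};

	static void example_init_bc_attrs(struct rpc_clnt *clnt,
					  struct example_bc_attrs *attrs)
	{
		size_t max = rpc_max_bc_payload(clnt);

		attrs->max_rqst_sz = max;	/* largest callback Call we accept */
		attrs->max_resp_sz = max;	/* largest callback Reply we send */
	}
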
index 7422f28..f5572e3 100644 (file)
@@ -244,13 +244,12 @@ void svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *new)
        svc_xprt_received(new);
 }
 
-int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
+int _svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
                    struct net *net, const int family,
                    const unsigned short port, int flags)
 {
        struct svc_xprt_class *xcl;
 
-       dprintk("svc: creating transport %s[%d]\n", xprt_name, port);
        spin_lock(&svc_xprt_class_lock);
        list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) {
                struct svc_xprt *newxprt;
@@ -274,12 +273,28 @@ int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
        }
  err:
        spin_unlock(&svc_xprt_class_lock);
-       dprintk("svc: transport %s not found\n", xprt_name);
-
        /* This errno is exposed to user space.  Provide a reasonable
         * perror msg for a bad transport. */
        return -EPROTONOSUPPORT;
 }
+
+int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
+                   struct net *net, const int family,
+                   const unsigned short port, int flags)
+{
+       int err;
+
+       dprintk("svc: creating transport %s[%d]\n", xprt_name, port);
+       err = _svc_create_xprt(serv, xprt_name, net, family, port, flags);
+       if (err == -EPROTONOSUPPORT) {
+               request_module("svc%s", xprt_name);
+               err = _svc_create_xprt(serv, xprt_name, net, family, port, flags);
+       }
+       if (err)
+               dprintk("svc: transport %s not found, err %d\n",
+                       xprt_name, err);
+       return err;
+}
 EXPORT_SYMBOL_GPL(svc_create_xprt);
 
 /*
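
The retry above only succeeds if the kernel can resolve the requested module name. A sketch of the module-side counterpart this relies on (assumed, not shown in this diff): the RPC-over-RDMA server module advertising the alias that request_module("svc%s", "rdma") asks for.

	/* Assumed, illustrative only: an alias of this form in the transport
	 * module lets the second _svc_create_xprt() attempt find a freshly
	 * loaded "rdma" transport class.
	 */
	MODULE_ALIAS("svcrdma");
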
index 6bdb386..c4f3cc0 100644 (file)
@@ -797,6 +797,8 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
                xdr_set_iov(xdr, buf->head, buf->len);
        else if (buf->page_len != 0)
                xdr_set_page_base(xdr, 0, buf->len);
+       else
+               xdr_set_iov(xdr, buf->head, buf->len);
        if (p != NULL && p > xdr->p && xdr->end >= p) {
                xdr->nwords -= p - xdr->p;
                xdr->p = p;
index 2dcd764..87762d9 100644 (file)
@@ -191,6 +191,22 @@ int xprt_rdma_bc_up(struct svc_serv *serv, struct net *net)
        return 0;
 }
 
+/**
+ * xprt_rdma_bc_maxpayload - Return maximum backchannel message size
+ * @xprt: transport
+ *
+ * Returns maximum size, in bytes, of a backchannel message
+ */
+size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt)
+{
+       struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+       struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
+       size_t maxmsg;
+
+       maxmsg = min_t(unsigned int, cdata->inline_rsize, cdata->inline_wsize);
+       return maxmsg - RPCRDMA_HDRLEN_MIN;
+}
+
 /**
  * rpcrdma_bc_marshal_reply - Send backwards direction reply
  * @rqst: buffer containing RPC reply data
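
The limit above is straightforward arithmetic on the connection's inline sizes. A worked example with assumed values (illustrative only):

	/* Illustrative only: with 1024-byte inline send and receive sizes
	 * (assumed) and an RPCRDMA_HDRLEN_MIN of 28 bytes, the transport
	 * advertises
	 *
	 *	min(1024, 1024) - 28 = 996
	 *
	 * bytes as its maximum backchannel payload.
	 */
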
index b289e10..6326ebe 100644 (file)
 /* Maximum scatter/gather per FMR */
 #define RPCRDMA_MAX_FMR_SGES   (64)
 
+static struct workqueue_struct *fmr_recovery_wq;
+
+#define FMR_RECOVERY_WQ_FLAGS          (WQ_UNBOUND)
+
+int
+fmr_alloc_recovery_wq(void)
+{
+       fmr_recovery_wq = alloc_workqueue("fmr_recovery", WQ_UNBOUND, 0);
+       return !fmr_recovery_wq ? -ENOMEM : 0;
+}
+
+void
+fmr_destroy_recovery_wq(void)
+{
+       struct workqueue_struct *wq;
+
+       if (!fmr_recovery_wq)
+               return;
+
+       wq = fmr_recovery_wq;
+       fmr_recovery_wq = NULL;
+       destroy_workqueue(wq);
+}
+
+static int
+__fmr_unmap(struct rpcrdma_mw *mw)
+{
+       LIST_HEAD(l);
+
+       list_add(&mw->fmr.fmr->list, &l);
+       return ib_unmap_fmr(&l);
+}
+
+/* Deferred reset of a single FMR. Generate a fresh rkey by
+ * replacing the MR. There's no recovery if this fails.
+ */
+static void
+__fmr_recovery_worker(struct work_struct *work)
+{
+       struct rpcrdma_mw *mw = container_of(work, struct rpcrdma_mw,
+                                           mw_work);
+       struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
+
+       __fmr_unmap(mw);
+       rpcrdma_put_mw(r_xprt, mw);
+       return;
+}
+
+/* A broken MR was discovered in a context that can't sleep.
+ * Defer recovery to the recovery worker.
+ */
+static void
+__fmr_queue_recovery(struct rpcrdma_mw *mw)
+{
+       INIT_WORK(&mw->mw_work, __fmr_recovery_worker);
+       queue_work(fmr_recovery_wq, &mw->mw_work);
+}
+
 static int
 fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
            struct rpcrdma_create_data_internal *cdata)
 {
+       rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1,
+                                                     RPCRDMA_MAX_DATA_SEGS /
+                                                     RPCRDMA_MAX_FMR_SGES));
        return 0;
 }
 
@@ -48,7 +109,7 @@ static size_t
 fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
 {
        return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
-                    rpcrdma_max_segments(r_xprt) * RPCRDMA_MAX_FMR_SGES);
+                    RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES);
 }
 
 static int
@@ -89,6 +150,7 @@ fmr_op_init(struct rpcrdma_xprt *r_xprt)
                if (IS_ERR(r->fmr.fmr))
                        goto out_fmr_err;
 
+               r->mw_xprt = r_xprt;
                list_add(&r->mw_list, &buf->rb_mws);
                list_add(&r->mw_all, &buf->rb_all);
        }
@@ -104,15 +166,6 @@ out:
        return rc;
 }
 
-static int
-__fmr_unmap(struct rpcrdma_mw *r)
-{
-       LIST_HEAD(l);
-
-       list_add(&r->fmr.fmr->list, &l);
-       return ib_unmap_fmr(&l);
-}
-
 /* Use the ib_map_phys_fmr() verb to register a memory region
  * for remote access via RDMA READ or RDMA WRITE.
  */
@@ -183,15 +236,10 @@ static void
 __fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
 {
        struct ib_device *device = r_xprt->rx_ia.ri_device;
-       struct rpcrdma_mw *mw = seg->rl_mw;
        int nsegs = seg->mr_nsegs;
 
-       seg->rl_mw = NULL;
-
        while (nsegs--)
                rpcrdma_unmap_one(device, seg++);
-
-       rpcrdma_put_mw(r_xprt, mw);
 }
 
 /* Invalidate all memory regions that were registered for "req".
@@ -234,42 +282,50 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
                seg = &req->rl_segments[i];
 
                __fmr_dma_unmap(r_xprt, seg);
+               rpcrdma_put_mw(r_xprt, seg->rl_mw);
 
                i += seg->mr_nsegs;
                seg->mr_nsegs = 0;
+               seg->rl_mw = NULL;
        }
 
        req->rl_nchunks = 0;
 }
 
-/* Use the ib_unmap_fmr() verb to prevent further remote
- * access via RDMA READ or RDMA WRITE.
+/* Use a slow, safe mechanism to invalidate all memory regions
+ * that were registered for "req".
+ *
+ * In the asynchronous case, DMA unmapping occurs first here
+ * because the rpcrdma_mr_seg is released immediately after this
+ * call. Its contents won't be available in __fmr_dma_unmap later.
+ * FIXME.
  */
-static int
-fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
+static void
+fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+                 bool sync)
 {
-       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-       struct rpcrdma_mr_seg *seg1 = seg;
-       struct rpcrdma_mw *mw = seg1->rl_mw;
-       int rc, nsegs = seg->mr_nsegs;
+       struct rpcrdma_mr_seg *seg;
+       struct rpcrdma_mw *mw;
+       unsigned int i;
 
-       dprintk("RPC:       %s: FMR %p\n", __func__, mw);
+       for (i = 0; req->rl_nchunks; req->rl_nchunks--) {
+               seg = &req->rl_segments[i];
+               mw = seg->rl_mw;
 
-       seg1->rl_mw = NULL;
-       while (seg1->mr_nsegs--)
-               rpcrdma_unmap_one(ia->ri_device, seg++);
-       rc = __fmr_unmap(mw);
-       if (rc)
-               goto out_err;
-       rpcrdma_put_mw(r_xprt, mw);
-       return nsegs;
+               if (sync) {
+                       /* Order: invalidate the FMR before DMA-unmapping its pages */
+                       __fmr_unmap(mw);
+                       __fmr_dma_unmap(r_xprt, seg);
+                       rpcrdma_put_mw(r_xprt, mw);
+               } else {
+                       __fmr_dma_unmap(r_xprt, seg);
+                       __fmr_queue_recovery(mw);
+               }
 
-out_err:
-       /* The FMR is abandoned, but remains in rb_all. fmr_op_destroy
-        * will attempt to release it when the transport is destroyed.
-        */
-       dprintk("RPC:       %s: ib_unmap_fmr status %i\n", __func__, rc);
-       return nsegs;
+               i += seg->mr_nsegs;
+               seg->mr_nsegs = 0;
+               seg->rl_mw = NULL;
+       }
 }
 
 static void
@@ -295,7 +351,7 @@ fmr_op_destroy(struct rpcrdma_buffer *buf)
 const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
        .ro_map                         = fmr_op_map,
        .ro_unmap_sync                  = fmr_op_unmap_sync,
-       .ro_unmap                       = fmr_op_unmap,
+       .ro_unmap_safe                  = fmr_op_unmap_safe,
        .ro_open                        = fmr_op_open,
        .ro_maxpages                    = fmr_op_maxpages,
        .ro_init                        = fmr_op_init,
index 94c3fa9..c094754 100644 (file)
@@ -98,6 +98,47 @@ frwr_destroy_recovery_wq(void)
        destroy_workqueue(wq);
 }
 
+static int
+__frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
+{
+       struct rpcrdma_frmr *f = &r->frmr;
+       int rc;
+
+       rc = ib_dereg_mr(f->fr_mr);
+       if (rc) {
+               pr_warn("rpcrdma: ib_dereg_mr status %d, frwr %p orphaned\n",
+                       rc, r);
+               return rc;
+       }
+
+       f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG,
+                              ia->ri_max_frmr_depth);
+       if (IS_ERR(f->fr_mr)) {
+               pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n",
+                       PTR_ERR(f->fr_mr), r);
+               return PTR_ERR(f->fr_mr);
+       }
+
+       dprintk("RPC:       %s: recovered FRMR %p\n", __func__, r);
+       f->fr_state = FRMR_IS_INVALID;
+       return 0;
+}
+
+static void
+__frwr_reset_and_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
+{
+       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+       struct rpcrdma_frmr *f = &mw->frmr;
+       int rc;
+
+       rc = __frwr_reset_mr(ia, mw);
+       ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents, f->fr_dir);
+       if (rc)
+               return;
+
+       rpcrdma_put_mw(r_xprt, mw);
+}
+
 /* Deferred reset of a single FRMR. Generate a fresh rkey by
  * replacing the MR.
  *
@@ -109,26 +150,10 @@ static void
 __frwr_recovery_worker(struct work_struct *work)
 {
        struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw,
-                                           frmr.fr_work);
-       struct rpcrdma_xprt *r_xprt = r->frmr.fr_xprt;
-       unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
-       struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
-
-       if (ib_dereg_mr(r->frmr.fr_mr))
-               goto out_fail;
+                                           mw_work);
 
-       r->frmr.fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth);
-       if (IS_ERR(r->frmr.fr_mr))
-               goto out_fail;
-
-       dprintk("RPC:       %s: recovered FRMR %p\n", __func__, r);
-       r->frmr.fr_state = FRMR_IS_INVALID;
-       rpcrdma_put_mw(r_xprt, r);
+       __frwr_reset_and_unmap(r->mw_xprt, r);
        return;
-
-out_fail:
-       pr_warn("RPC:       %s: FRMR %p unrecovered\n",
-               __func__, r);
 }
 
 /* A broken MR was discovered in a context that can't sleep.
@@ -137,8 +162,8 @@ out_fail:
 static void
 __frwr_queue_recovery(struct rpcrdma_mw *r)
 {
-       INIT_WORK(&r->frmr.fr_work, __frwr_recovery_worker);
-       queue_work(frwr_recovery_wq, &r->frmr.fr_work);
+       INIT_WORK(&r->mw_work, __frwr_recovery_worker);
+       queue_work(frwr_recovery_wq, &r->mw_work);
 }
 
 static int
@@ -152,11 +177,11 @@ __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
        if (IS_ERR(f->fr_mr))
                goto out_mr_err;
 
-       f->sg = kcalloc(depth, sizeof(*f->sg), GFP_KERNEL);
-       if (!f->sg)
+       f->fr_sg = kcalloc(depth, sizeof(*f->fr_sg), GFP_KERNEL);
+       if (!f->fr_sg)
                goto out_list_err;
 
-       sg_init_table(f->sg, depth);
+       sg_init_table(f->fr_sg, depth);
 
        init_completion(&f->fr_linv_done);
 
@@ -185,7 +210,7 @@ __frwr_release(struct rpcrdma_mw *r)
        if (rc)
                dprintk("RPC:       %s: ib_dereg_mr status %i\n",
                        __func__, rc);
-       kfree(r->frmr.sg);
+       kfree(r->frmr.fr_sg);
 }
 
 static int
@@ -231,6 +256,9 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
                                               depth;
        }
 
+       rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1,
+                                                     RPCRDMA_MAX_DATA_SEGS /
+                                                     ia->ri_max_frmr_depth));
        return 0;
 }
 
@@ -243,7 +271,7 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt)
        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 
        return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
-                    rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth);
+                    RPCRDMA_MAX_HDR_SEGS * ia->ri_max_frmr_depth);
 }
 
 static void
@@ -350,9 +378,9 @@ frwr_op_init(struct rpcrdma_xprt *r_xprt)
                        return rc;
                }
 
+               r->mw_xprt = r_xprt;
                list_add(&r->mw_list, &buf->rb_mws);
                list_add(&r->mw_all, &buf->rb_all);
-               r->frmr.fr_xprt = r_xprt;
        }
 
        return 0;
@@ -396,12 +424,12 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 
        for (i = 0; i < nsegs;) {
                if (seg->mr_page)
-                       sg_set_page(&frmr->sg[i],
+                       sg_set_page(&frmr->fr_sg[i],
                                    seg->mr_page,
                                    seg->mr_len,
                                    offset_in_page(seg->mr_offset));
                else
-                       sg_set_buf(&frmr->sg[i], seg->mr_offset,
+                       sg_set_buf(&frmr->fr_sg[i], seg->mr_offset,
                                   seg->mr_len);
 
                ++seg;
@@ -412,25 +440,26 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
                    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
                        break;
        }
-       frmr->sg_nents = i;
+       frmr->fr_nents = i;
+       frmr->fr_dir = direction;
 
-       dma_nents = ib_dma_map_sg(device, frmr->sg, frmr->sg_nents, direction);
+       dma_nents = ib_dma_map_sg(device, frmr->fr_sg, frmr->fr_nents, direction);
        if (!dma_nents) {
                pr_err("RPC:       %s: failed to dma map sg %p sg_nents %u\n",
-                      __func__, frmr->sg, frmr->sg_nents);
+                      __func__, frmr->fr_sg, frmr->fr_nents);
                return -ENOMEM;
        }
 
-       n = ib_map_mr_sg(mr, frmr->sg, frmr->sg_nents, NULL, PAGE_SIZE);
-       if (unlikely(n != frmr->sg_nents)) {
+       n = ib_map_mr_sg(mr, frmr->fr_sg, frmr->fr_nents, NULL, PAGE_SIZE);
+       if (unlikely(n != frmr->fr_nents)) {
                pr_err("RPC:       %s: failed to map mr %p (%u/%u)\n",
-                      __func__, frmr->fr_mr, n, frmr->sg_nents);
+                      __func__, frmr->fr_mr, n, frmr->fr_nents);
                rc = n < 0 ? n : -EINVAL;
                goto out_senderr;
        }
 
        dprintk("RPC:       %s: Using frmr %p to map %u segments (%u bytes)\n",
-               __func__, mw, frmr->sg_nents, mr->length);
+               __func__, mw, frmr->fr_nents, mr->length);
 
        key = (u8)(mr->rkey & 0x000000FF);
        ib_update_fast_reg_key(mr, ++key);
@@ -452,18 +481,16 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
        if (rc)
                goto out_senderr;
 
-       seg1->mr_dir = direction;
        seg1->rl_mw = mw;
        seg1->mr_rkey = mr->rkey;
        seg1->mr_base = mr->iova;
-       seg1->mr_nsegs = frmr->sg_nents;
+       seg1->mr_nsegs = frmr->fr_nents;
        seg1->mr_len = mr->length;
 
-       return frmr->sg_nents;
+       return frmr->fr_nents;
 
 out_senderr:
        dprintk("RPC:       %s: ib_post_send status %i\n", __func__, rc);
-       ib_dma_unmap_sg(device, frmr->sg, dma_nents, direction);
        __frwr_queue_recovery(mw);
        return rc;
 }
@@ -487,24 +514,6 @@ __frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg)
        return invalidate_wr;
 }
 
-static void
-__frwr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
-                int rc)
-{
-       struct ib_device *device = r_xprt->rx_ia.ri_device;
-       struct rpcrdma_mw *mw = seg->rl_mw;
-       struct rpcrdma_frmr *f = &mw->frmr;
-
-       seg->rl_mw = NULL;
-
-       ib_dma_unmap_sg(device, f->sg, f->sg_nents, seg->mr_dir);
-
-       if (!rc)
-               rpcrdma_put_mw(r_xprt, mw);
-       else
-               __frwr_queue_recovery(mw);
-}
-
 /* Invalidate all memory regions that were registered for "req".
  *
  * Sleeps until it is safe for the host CPU to access the
@@ -518,6 +527,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
        struct rpcrdma_mr_seg *seg;
        unsigned int i, nchunks;
        struct rpcrdma_frmr *f;
+       struct rpcrdma_mw *mw;
        int rc;
 
        dprintk("RPC:       %s: req %p\n", __func__, req);
@@ -558,11 +568,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
         * unless ri_id->qp is a valid pointer.
         */
        rc = ib_post_send(ia->ri_id->qp, invalidate_wrs, &bad_wr);
-       if (rc) {
-               pr_warn("%s: ib_post_send failed %i\n", __func__, rc);
-               rdma_disconnect(ia->ri_id);
-               goto unmap;
-       }
+       if (rc)
+               goto reset_mrs;
 
        wait_for_completion(&f->fr_linv_done);
 
@@ -572,56 +579,65 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 unmap:
        for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
                seg = &req->rl_segments[i];
+               mw = seg->rl_mw;
+               seg->rl_mw = NULL;
 
-               __frwr_dma_unmap(r_xprt, seg, rc);
+               ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents,
+                               f->fr_dir);
+               rpcrdma_put_mw(r_xprt, mw);
 
                i += seg->mr_nsegs;
                seg->mr_nsegs = 0;
        }
 
        req->rl_nchunks = 0;
-}
+       return;
 
-/* Post a LOCAL_INV Work Request to prevent further remote access
- * via RDMA READ or RDMA WRITE.
- */
-static int
-frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
-{
-       struct rpcrdma_mr_seg *seg1 = seg;
-       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-       struct rpcrdma_mw *mw = seg1->rl_mw;
-       struct rpcrdma_frmr *frmr = &mw->frmr;
-       struct ib_send_wr *invalidate_wr, *bad_wr;
-       int rc, nsegs = seg->mr_nsegs;
+reset_mrs:
+       pr_warn("%s: ib_post_send failed %i\n", __func__, rc);
 
-       dprintk("RPC:       %s: FRMR %p\n", __func__, mw);
+       /* Find and reset the MRs in the LOCAL_INV WRs that did not
+        * get posted. This is synchronous, and slow.
+        */
+       for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
+               seg = &req->rl_segments[i];
+               mw = seg->rl_mw;
+               f = &mw->frmr;
 
-       seg1->rl_mw = NULL;
-       frmr->fr_state = FRMR_IS_INVALID;
-       invalidate_wr = &mw->frmr.fr_invwr;
+               if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) {
+                       __frwr_reset_mr(ia, mw);
+                       bad_wr = bad_wr->next;
+               }
 
-       memset(invalidate_wr, 0, sizeof(*invalidate_wr));
-       frmr->fr_cqe.done = frwr_wc_localinv;
-       invalidate_wr->wr_cqe = &frmr->fr_cqe;
-       invalidate_wr->opcode = IB_WR_LOCAL_INV;
-       invalidate_wr->ex.invalidate_rkey = frmr->fr_mr->rkey;
-       DECR_CQCOUNT(&r_xprt->rx_ep);
+               i += seg->mr_nsegs;
+       }
+       goto unmap;
+}
 
-       ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir);
-       read_lock(&ia->ri_qplock);
-       rc = ib_post_send(ia->ri_id->qp, invalidate_wr, &bad_wr);
-       read_unlock(&ia->ri_qplock);
-       if (rc)
-               goto out_err;
+/* Use a slow, safe mechanism to invalidate all memory regions
+ * that were registered for "req".
+ */
+static void
+frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+                  bool sync)
+{
+       struct rpcrdma_mr_seg *seg;
+       struct rpcrdma_mw *mw;
+       unsigned int i;
 
-       rpcrdma_put_mw(r_xprt, mw);
-       return nsegs;
+       for (i = 0; req->rl_nchunks; req->rl_nchunks--) {
+               seg = &req->rl_segments[i];
+               mw = seg->rl_mw;
 
-out_err:
-       dprintk("RPC:       %s: ib_post_send status %i\n", __func__, rc);
-       __frwr_queue_recovery(mw);
-       return nsegs;
+               if (sync)
+                       __frwr_reset_and_unmap(r_xprt, mw);
+               else
+                       __frwr_queue_recovery(mw);
+
+               i += seg->mr_nsegs;
+               seg->mr_nsegs = 0;
+               seg->rl_mw = NULL;
+       }
 }
 
 static void
@@ -643,7 +659,7 @@ frwr_op_destroy(struct rpcrdma_buffer *buf)
 const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
        .ro_map                         = frwr_op_map,
        .ro_unmap_sync                  = frwr_op_unmap_sync,
-       .ro_unmap                       = frwr_op_unmap,
+       .ro_unmap_safe                  = frwr_op_unmap_safe,
        .ro_open                        = frwr_op_open,
        .ro_maxpages                    = frwr_op_maxpages,
        .ro_init                        = frwr_op_init,
index 481b9b6..3750596 100644 (file)
@@ -36,8 +36,11 @@ physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
                       __func__, PTR_ERR(mr));
                return -ENOMEM;
        }
-
        ia->ri_dma_mr = mr;
+
+       rpcrdma_set_max_header_sizes(ia, cdata, min_t(unsigned int,
+                                                     RPCRDMA_MAX_DATA_SEGS,
+                                                     RPCRDMA_MAX_HDR_SEGS));
        return 0;
 }
 
@@ -47,7 +50,7 @@ static size_t
 physical_op_maxpages(struct rpcrdma_xprt *r_xprt)
 {
        return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
-                    rpcrdma_max_segments(r_xprt));
+                    RPCRDMA_MAX_HDR_SEGS);
 }
 
 static int
@@ -71,17 +74,6 @@ physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
        return 1;
 }
 
-/* Unmap a memory region, but leave it registered.
- */
-static int
-physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
-{
-       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-
-       rpcrdma_unmap_one(ia->ri_device, seg);
-       return 1;
-}
-
 /* DMA unmap all memory regions that were mapped for "req".
  */
 static void
@@ -94,6 +86,25 @@ physical_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
                rpcrdma_unmap_one(device, &req->rl_segments[i++]);
 }
 
+/* Use a slow, safe mechanism to invalidate all memory regions
+ * that were registered for "req".
+ *
+ * For physical memory registration, there is no good way to
+ * fence a single MR that has been advertised to the server. The
+ * client has already handed the server an R_key that cannot be
+ * invalidated and is shared by all MRs on this connection.
+ * Tearing down the PD might be the only safe choice, but it's
+ * not clear that a freshly acquired DMA R_key would be different
+ * than the one used by the PD that was just destroyed.
+ * FIXME.
+ */
+static void
+physical_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+                      bool sync)
+{
+       physical_op_unmap_sync(r_xprt, req);
+}
+
 static void
 physical_op_destroy(struct rpcrdma_buffer *buf)
 {
@@ -102,7 +113,7 @@ physical_op_destroy(struct rpcrdma_buffer *buf)
 const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
        .ro_map                         = physical_op_map,
        .ro_unmap_sync                  = physical_op_unmap_sync,
-       .ro_unmap                       = physical_op_unmap,
+       .ro_unmap_safe                  = physical_op_unmap_safe,
        .ro_open                        = physical_op_open,
        .ro_maxpages                    = physical_op_maxpages,
        .ro_init                        = physical_op_init,
index 888823b..35a8109 100644 (file)
@@ -61,26 +61,84 @@ enum rpcrdma_chunktype {
        rpcrdma_replych
 };
 
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 static const char transfertypes[][12] = {
-       "pure inline",  /* no chunks */
-       " read chunk",  /* some argument via rdma read */
-       "*read chunk",  /* entire request via rdma read */
-       "write chunk",  /* some result via rdma write */
+       "inline",       /* no chunks */
+       "read list",    /* some argument via rdma read */
+       "*read list",   /* entire request via rdma read */
+       "write list",   /* some result via rdma write */
        "reply chunk"   /* entire reply via rdma write */
 };
-#endif
+
+/* Returns size of largest RPC-over-RDMA header in a Call message
+ *
+ * The largest Call header contains a full-size Read list and a
+ * minimal Reply chunk.
+ */
+static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs)
+{
+       unsigned int size;
+
+       /* Fixed header fields and list discriminators */
+       size = RPCRDMA_HDRLEN_MIN;
+
+       /* Maximum Read list size */
+       maxsegs += 2;   /* segment for head and tail buffers */
+       size = maxsegs * sizeof(struct rpcrdma_read_chunk);
+
+       /* Minimal Read chunk size */
+       size += sizeof(__be32); /* segment count */
+       size += sizeof(struct rpcrdma_segment);
+       size += sizeof(__be32); /* list discriminator */
+
+       dprintk("RPC:       %s: max call header size = %u\n",
+               __func__, size);
+       return size;
+}
+
+/* Returns size of largest RPC-over-RDMA header in a Reply message
+ *
+ * There is only one Write list or one Reply chunk per Reply
+ * message.  The larger list is the Write list.
+ */
+static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
+{
+       unsigned int size;
+
+       /* Fixed header fields and list discriminators */
+       size = RPCRDMA_HDRLEN_MIN;
+
+       /* Maximum Write list size */
+       maxsegs += 2;   /* segment for head and tail buffers */
+       size = sizeof(__be32);          /* segment count */
+       size += maxsegs * sizeof(struct rpcrdma_segment);
+       size += sizeof(__be32); /* list discriminator */
+
+       dprintk("RPC:       %s: max reply header size = %u\n",
+               __func__, size);
+       return size;
+}
+
+void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *ia,
+                                 struct rpcrdma_create_data_internal *cdata,
+                                 unsigned int maxsegs)
+{
+       ia->ri_max_inline_write = cdata->inline_wsize -
+                                 rpcrdma_max_call_header_size(maxsegs);
+       ia->ri_max_inline_read = cdata->inline_rsize -
+                                rpcrdma_max_reply_header_size(maxsegs);
+}
 
 /* The client can send a request inline as long as the RPCRDMA header
  * plus the RPC call fit under the transport's inline limit. If the
  * combined call message size exceeds that limit, the client must use
  * the read chunk list for this operation.
  */
-static bool rpcrdma_args_inline(struct rpc_rqst *rqst)
+static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
+                               struct rpc_rqst *rqst)
 {
-       unsigned int callsize = RPCRDMA_HDRLEN_MIN + rqst->rq_snd_buf.len;
+       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 
-       return callsize <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst);
+       return rqst->rq_snd_buf.len <= ia->ri_max_inline_write;
 }
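
With the per-connection limits computed by rpcrdma_set_max_header_sizes(), the inline test becomes a single comparison against the negotiated inline size minus the worst-case header. A worked sketch with assumed numbers (illustrative only; bool and size_t come from <linux/types.h> in kernel code):

	/* Illustrative only: assuming a 1024-byte negotiated inline send size
	 * and a worst-case Call header of 220 bytes, an RPC is sent purely
	 * inline when its send buffer fits in the remaining 804 bytes.
	 */
	static bool example_args_inline(size_t snd_len)
	{
		size_t inline_wsize = 1024;	/* assumed negotiated value */
		size_t max_call_hdr = 220;	/* assumed worst-case header */

		return snd_len <= inline_wsize - max_call_hdr;
	}
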
 
 /* The client can't know how large the actual reply will be. Thus it
@@ -89,11 +147,12 @@ static bool rpcrdma_args_inline(struct rpc_rqst *rqst)
  * limit, the client must provide a write list or a reply chunk for
  * this request.
  */
-static bool rpcrdma_results_inline(struct rpc_rqst *rqst)
+static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
+                                  struct rpc_rqst *rqst)
 {
-       unsigned int repsize = RPCRDMA_HDRLEN_MIN + rqst->rq_rcv_buf.buflen;
+       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 
-       return repsize <= RPCRDMA_INLINE_READ_THRESHOLD(rqst);
+       return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read;
 }
 
 static int
@@ -226,23 +285,16 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
        return n;
 }
 
-/*
- * Create read/write chunk lists, and reply chunks, for RDMA
- *
- *   Assume check against THRESHOLD has been done, and chunks are required.
- *   Assume only encoding one list entry for read|write chunks. The NFSv3
- *     protocol is simple enough to allow this as it only has a single "bulk
- *     result" in each procedure - complicated NFSv4 COMPOUNDs are not. (The
- *     RDMA/Sessions NFSv4 proposal addresses this for future v4 revs.)
- *
- * When used for a single reply chunk (which is a special write
- * chunk used for the entire reply, rather than just the data), it
- * is used primarily for READDIR and READLINK which would otherwise
- * be severely size-limited by a small rdma inline read max. The server
- * response will come back as an RDMA Write, followed by a message
- * of type RDMA_NOMSG carrying the xid and length. As a result, reply
- * chunks do not provide data alignment, however they do not require
- * "fixup" (moving the response to the upper layer buffer) either.
+static inline __be32 *
+xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr_seg *seg)
+{
+       *iptr++ = cpu_to_be32(seg->mr_rkey);
+       *iptr++ = cpu_to_be32(seg->mr_len);
+       return xdr_encode_hyper(iptr, seg->mr_base);
+}
+
+/* XDR-encode the Read list. Supports encoding a list of read
+ * segments that belong to a single read chunk.
  *
  * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
  *
@@ -250,131 +302,190 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
  *   N elements, position P (same P for all chunks of same arg!):
  *    1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
  *
+ * Returns a pointer to the XDR word in the RDMA header following
+ * the end of the Read list, or an error pointer.
+ */
+static __be32 *
+rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
+                        struct rpcrdma_req *req, struct rpc_rqst *rqst,
+                        __be32 *iptr, enum rpcrdma_chunktype rtype)
+{
+       struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+       unsigned int pos;
+       int n, nsegs;
+
+       if (rtype == rpcrdma_noch) {
+               *iptr++ = xdr_zero;     /* item not present */
+               return iptr;
+       }
+
+       pos = rqst->rq_snd_buf.head[0].iov_len;
+       if (rtype == rpcrdma_areadch)
+               pos = 0;
+       nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg,
+                                    RPCRDMA_MAX_SEGS - req->rl_nchunks);
+       if (nsegs < 0)
+               return ERR_PTR(nsegs);
+
+       do {
+               n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, false);
+               if (n <= 0)
+                       return ERR_PTR(n);
+
+               *iptr++ = xdr_one;      /* item present */
+
+               /* All read segments in this chunk
+                * have the same "position".
+                */
+               *iptr++ = cpu_to_be32(pos);
+               iptr = xdr_encode_rdma_segment(iptr, seg);
+
+               dprintk("RPC: %5u %s: read segment pos %u "
+                       "%d@0x%016llx:0x%08x (%s)\n",
+                       rqst->rq_task->tk_pid, __func__, pos,
+                       seg->mr_len, (unsigned long long)seg->mr_base,
+                       seg->mr_rkey, n < nsegs ? "more" : "last");
+
+               r_xprt->rx_stats.read_chunk_count++;
+               req->rl_nchunks++;
+               seg += n;
+               nsegs -= n;
+       } while (nsegs);
+       req->rl_nextseg = seg;
+
+       /* Finish Read list */
+       *iptr++ = xdr_zero;     /* Next item not present */
+       return iptr;
+}
+
+/* XDR-encode the Write list. Supports encoding a list containing
+ * one array of plain segments that belong to a single write chunk.
+ *
+ * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
+ *
  *  Write chunklist (a list of (one) counted array):
  *   N elements:
  *    1 - N - HLOO - HLOO - ... - HLOO - 0
  *
+ * Returns a pointer to the XDR word in the RDMA header following
+ * the end of the Write list, or an error pointer.
+ */
+static __be32 *
+rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+                         struct rpc_rqst *rqst, __be32 *iptr,
+                         enum rpcrdma_chunktype wtype)
+{
+       struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+       int n, nsegs, nchunks;
+       __be32 *segcount;
+
+       if (wtype != rpcrdma_writech) {
+               *iptr++ = xdr_zero;     /* no Write list present */
+               return iptr;
+       }
+
+       nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf,
+                                    rqst->rq_rcv_buf.head[0].iov_len,
+                                    wtype, seg,
+                                    RPCRDMA_MAX_SEGS - req->rl_nchunks);
+       if (nsegs < 0)
+               return ERR_PTR(nsegs);
+
+       *iptr++ = xdr_one;      /* Write list present */
+       segcount = iptr++;      /* save location of segment count */
+
+       nchunks = 0;
+       do {
+               n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true);
+               if (n <= 0)
+                       return ERR_PTR(n);
+
+               iptr = xdr_encode_rdma_segment(iptr, seg);
+
+               dprintk("RPC: %5u %s: write segment "
+                       "%d@0x%016llx:0x%08x (%s)\n",
+                       rqst->rq_task->tk_pid, __func__,
+                       seg->mr_len, (unsigned long long)seg->mr_base,
+                       seg->mr_rkey, n < nsegs ? "more" : "last");
+
+               r_xprt->rx_stats.write_chunk_count++;
+               r_xprt->rx_stats.total_rdma_request += seg->mr_len;
+               req->rl_nchunks++;
+               nchunks++;
+               seg   += n;
+               nsegs -= n;
+       } while (nsegs);
+       req->rl_nextseg = seg;
+
+       /* Update count of segments in this Write chunk */
+       *segcount = cpu_to_be32(nchunks);
+
+       /* Finish Write list */
+       *iptr++ = xdr_zero;     /* Next item not present */
+       return iptr;
+}
+
+/* XDR-encode the Reply chunk. Supports encoding an array of plain
+ * segments that belong to a single write (reply) chunk.
+ *
+ * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
+ *
  *  Reply chunk (a counted array):
  *   N elements:
  *    1 - N - HLOO - HLOO - ... - HLOO
  *
- * Returns positive RPC/RDMA header size, or negative errno.
+ * Returns a pointer to the XDR word in the RDMA header following
+ * the end of the Reply chunk, or an error pointer.
  */
-
-static ssize_t
-rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
-               struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type)
+static __be32 *
+rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
+                          struct rpcrdma_req *req, struct rpc_rqst *rqst,
+                          __be32 *iptr, enum rpcrdma_chunktype wtype)
 {
-       struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
-       struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
-       int n, nsegs, nchunks = 0;
-       unsigned int pos;
-       struct rpcrdma_mr_seg *seg = req->rl_segments;
-       struct rpcrdma_read_chunk *cur_rchunk = NULL;
-       struct rpcrdma_write_array *warray = NULL;
-       struct rpcrdma_write_chunk *cur_wchunk = NULL;
-       __be32 *iptr = headerp->rm_body.rm_chunks;
-       int (*map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool);
-
-       if (type == rpcrdma_readch || type == rpcrdma_areadch) {
-               /* a read chunk - server will RDMA Read our memory */
-               cur_rchunk = (struct rpcrdma_read_chunk *) iptr;
-       } else {
-               /* a write or reply chunk - server will RDMA Write our memory */
-               *iptr++ = xdr_zero;     /* encode a NULL read chunk list */
-               if (type == rpcrdma_replych)
-                       *iptr++ = xdr_zero;     /* a NULL write chunk list */
-               warray = (struct rpcrdma_write_array *) iptr;
-               cur_wchunk = (struct rpcrdma_write_chunk *) (warray + 1);
-       }
+       struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+       int n, nsegs, nchunks;
+       __be32 *segcount;
 
-       if (type == rpcrdma_replych || type == rpcrdma_areadch)
-               pos = 0;
-       else
-               pos = target->head[0].iov_len;
+       if (wtype != rpcrdma_replych) {
+               *iptr++ = xdr_zero;     /* no Reply chunk present */
+               return iptr;
+       }
 
-       nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS);
+       nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg,
+                                    RPCRDMA_MAX_SEGS - req->rl_nchunks);
        if (nsegs < 0)
-               return nsegs;
+               return ERR_PTR(nsegs);
 
-       map = r_xprt->rx_ia.ri_ops->ro_map;
+       *iptr++ = xdr_one;      /* Reply chunk present */
+       segcount = iptr++;      /* save location of segment count */
+
+       nchunks = 0;
        do {
-               n = map(r_xprt, seg, nsegs, cur_wchunk != NULL);
+               n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true);
                if (n <= 0)
-                       goto out;
-               if (cur_rchunk) {       /* read */
-                       cur_rchunk->rc_discrim = xdr_one;
-                       /* all read chunks have the same "position" */
-                       cur_rchunk->rc_position = cpu_to_be32(pos);
-                       cur_rchunk->rc_target.rs_handle =
-                                               cpu_to_be32(seg->mr_rkey);
-                       cur_rchunk->rc_target.rs_length =
-                                               cpu_to_be32(seg->mr_len);
-                       xdr_encode_hyper(
-                                       (__be32 *)&cur_rchunk->rc_target.rs_offset,
-                                       seg->mr_base);
-                       dprintk("RPC:       %s: read chunk "
-                               "elem %d@0x%llx:0x%x pos %u (%s)\n", __func__,
-                               seg->mr_len, (unsigned long long)seg->mr_base,
-                               seg->mr_rkey, pos, n < nsegs ? "more" : "last");
-                       cur_rchunk++;
-                       r_xprt->rx_stats.read_chunk_count++;
-               } else {                /* write/reply */
-                       cur_wchunk->wc_target.rs_handle =
-                                               cpu_to_be32(seg->mr_rkey);
-                       cur_wchunk->wc_target.rs_length =
-                                               cpu_to_be32(seg->mr_len);
-                       xdr_encode_hyper(
-                                       (__be32 *)&cur_wchunk->wc_target.rs_offset,
-                                       seg->mr_base);
-                       dprintk("RPC:       %s: %s chunk "
-                               "elem %d@0x%llx:0x%x (%s)\n", __func__,
-                               (type == rpcrdma_replych) ? "reply" : "write",
-                               seg->mr_len, (unsigned long long)seg->mr_base,
-                               seg->mr_rkey, n < nsegs ? "more" : "last");
-                       cur_wchunk++;
-                       if (type == rpcrdma_replych)
-                               r_xprt->rx_stats.reply_chunk_count++;
-                       else
-                               r_xprt->rx_stats.write_chunk_count++;
-                       r_xprt->rx_stats.total_rdma_request += seg->mr_len;
-               }
+                       return ERR_PTR(n);
+
+               iptr = xdr_encode_rdma_segment(iptr, seg);
+
+               dprintk("RPC: %5u %s: reply segment "
+                       "%d@0x%016llx:0x%08x (%s)\n",
+                       rqst->rq_task->tk_pid, __func__,
+                       seg->mr_len, (unsigned long long)seg->mr_base,
+                       seg->mr_rkey, n < nsegs ? "more" : "last");
+
+               r_xprt->rx_stats.reply_chunk_count++;
+               r_xprt->rx_stats.total_rdma_request += seg->mr_len;
+               req->rl_nchunks++;
                nchunks++;
                seg   += n;
                nsegs -= n;
        } while (nsegs);
+       req->rl_nextseg = seg;
 
-       /* success. all failures return above */
-       req->rl_nchunks = nchunks;
-
-       /*
-        * finish off header. If write, marshal discrim and nchunks.
-        */
-       if (cur_rchunk) {
-               iptr = (__be32 *) cur_rchunk;
-               *iptr++ = xdr_zero;     /* finish the read chunk list */
-               *iptr++ = xdr_zero;     /* encode a NULL write chunk list */
-               *iptr++ = xdr_zero;     /* encode a NULL reply chunk */
-       } else {
-               warray->wc_discrim = xdr_one;
-               warray->wc_nchunks = cpu_to_be32(nchunks);
-               iptr = (__be32 *) cur_wchunk;
-               if (type == rpcrdma_writech) {
-                       *iptr++ = xdr_zero; /* finish the write chunk list */
-                       *iptr++ = xdr_zero; /* encode a NULL reply chunk */
-               }
-       }
-
-       /*
-        * Return header size.
-        */
-       return (unsigned char *)iptr - (unsigned char *)headerp;
+       /* Update count of segments in the Reply chunk */
+       *segcount = cpu_to_be32(nchunks);
 
-out:
-       for (pos = 0; nchunks--;)
-               pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
-                                                     &req->rl_segments[pos]);
-       return n;
+       return iptr;
 }
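
Taken together, the three encoders above lay the chunk lists out as a flat run of XDR words immediately after the fixed RPC-over-RDMA header fields. An illustrative layout (not from the patch) for a Call carrying a single two-segment Read chunk at XDR position 36, with no Write list and no Reply chunk:

	/* Each cell is one 32-bit XDR word; HLOO = Handle32 Length32 Offset64,
	 * with the 64-bit offset occupying two words:
	 *
	 *   Read list:    1 | 36 | H | L | O O | 1 | 36 | H | L | O O | 0
	 *   Write list:   0
	 *   Reply chunk:  0
	 *
	 * rpcrdma_encode_read_list() emits everything up to and including the
	 * first terminating 0; the Write list and Reply chunk encoders each
	 * contribute a single xdr_zero discriminator because those chunk
	 * types are absent from this Call.
	 */
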
 
 /*
@@ -440,13 +551,10 @@ static void rpcrdma_inline_pullup(struct rpc_rqst *rqst)
  * Marshal a request: the primary job of this routine is to choose
  * the transfer modes. See comments below.
  *
- * Uses multiple RDMA IOVs for a request:
- *  [0] -- RPC RDMA header, which uses memory from the *start* of the
- *         preregistered buffer that already holds the RPC data in
- *         its middle.
- *  [1] -- the RPC header/data, marshaled by RPC and the NFS protocol.
- *  [2] -- optional padding.
- *  [3] -- if padded, header only in [1] and data here.
+ * Prepares up to two IOVs per Call message:
+ *
+ *  [0] -- RPC RDMA header
+ *  [1] -- the RPC header/data
  *
  * Returns zero on success, otherwise a negative errno.
  */
@@ -457,24 +565,17 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
        struct rpc_xprt *xprt = rqst->rq_xprt;
        struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
        struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
-       char *base;
-       size_t rpclen;
-       ssize_t hdrlen;
        enum rpcrdma_chunktype rtype, wtype;
        struct rpcrdma_msg *headerp;
+       ssize_t hdrlen;
+       size_t rpclen;
+       __be32 *iptr;
 
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
        if (test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state))
                return rpcrdma_bc_marshal_reply(rqst);
 #endif
 
-       /*
-        * rpclen gets amount of data in first buffer, which is the
-        * pre-registered buffer.
-        */
-       base = rqst->rq_svec[0].iov_base;
-       rpclen = rqst->rq_svec[0].iov_len;
-
        headerp = rdmab_to_msg(req->rl_rdmabuf);
        /* don't byte-swap XID, it's already done in request */
        headerp->rm_xid = rqst->rq_xid;
@@ -485,15 +586,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
        /*
         * Chunks needed for results?
         *
-        * o Read ops return data as write chunk(s), header as inline.
         * o If the expected result is under the inline threshold, all ops
         *   return as inline.
+        * o Large read ops return data as write chunk(s), header as
+        *   inline.
         * o Large non-read ops return as a single reply chunk.
         */
-       if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
-               wtype = rpcrdma_writech;
-       else if (rpcrdma_results_inline(rqst))
+       if (rpcrdma_results_inline(r_xprt, rqst))
                wtype = rpcrdma_noch;
+       else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
+               wtype = rpcrdma_writech;
        else
                wtype = rpcrdma_replych;
 
@@ -511,10 +613,14 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
         * that both has a data payload, and whose non-data arguments
         * by themselves are larger than the inline threshold.
         */
-       if (rpcrdma_args_inline(rqst)) {
+       if (rpcrdma_args_inline(r_xprt, rqst)) {
                rtype = rpcrdma_noch;
+               rpcrdma_inline_pullup(rqst);
+               rpclen = rqst->rq_svec[0].iov_len;
        } else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
                rtype = rpcrdma_readch;
+               rpclen = rqst->rq_svec[0].iov_len;
+               rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
        } else {
                r_xprt->rx_stats.nomsg_call_count++;
                headerp->rm_type = htonl(RDMA_NOMSG);
@@ -522,57 +628,50 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
                rpclen = 0;
        }
 
-       /* The following simplification is not true forever */
-       if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
-               wtype = rpcrdma_noch;
-       if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
-               dprintk("RPC:       %s: cannot marshal multiple chunk lists\n",
-                       __func__);
-               return -EIO;
-       }
-
-       hdrlen = RPCRDMA_HDRLEN_MIN;
-
-       /*
-        * Pull up any extra send data into the preregistered buffer.
-        * When padding is in use and applies to the transfer, insert
-        * it and change the message type.
+       /* This implementation supports the following combinations
+        * of chunk lists in one RPC-over-RDMA Call message:
+        *
+        *   - Read list
+        *   - Write list
+        *   - Reply chunk
+        *   - Read list + Reply chunk
+        *
+        * It might not yet support the following combinations:
+        *
+        *   - Read list + Write list
+        *
+        * It does not support the following combinations:
+        *
+        *   - Write list + Reply chunk
+        *   - Read list + Write list + Reply chunk
+        *
+        * This implementation supports only a single chunk in each
+        * Read or Write list. Thus for example the client cannot
+        * send a Call message with a Position Zero Read chunk and a
+        * regular Read chunk at the same time.
         */
-       if (rtype == rpcrdma_noch) {
-
-               rpcrdma_inline_pullup(rqst);
-
-               headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
-               headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
-               headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
-               /* new length after pullup */
-               rpclen = rqst->rq_svec[0].iov_len;
-       } else if (rtype == rpcrdma_readch)
-               rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
-       if (rtype != rpcrdma_noch) {
-               hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
-                                              headerp, rtype);
-               wtype = rtype;  /* simplify dprintk */
-
-       } else if (wtype != rpcrdma_noch) {
-               hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
-                                              headerp, wtype);
-       }
-       if (hdrlen < 0)
-               return hdrlen;
+       req->rl_nchunks = 0;
+       req->rl_nextseg = req->rl_segments;
+       iptr = headerp->rm_body.rm_chunks;
+       iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype);
+       if (IS_ERR(iptr))
+               goto out_unmap;
+       iptr = rpcrdma_encode_write_list(r_xprt, req, rqst, iptr, wtype);
+       if (IS_ERR(iptr))
+               goto out_unmap;
+       iptr = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, iptr, wtype);
+       if (IS_ERR(iptr))
+               goto out_unmap;
+       hdrlen = (unsigned char *)iptr - (unsigned char *)headerp;
+
+       if (hdrlen + rpclen > RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
+               goto out_overflow;
+
+       dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n",
+               rqst->rq_task->tk_pid, __func__,
+               transfertypes[rtype], transfertypes[wtype],
+               hdrlen, rpclen);
 
-       dprintk("RPC:       %s: %s: hdrlen %zd rpclen %zd"
-               " headerp 0x%p base 0x%p lkey 0x%x\n",
-               __func__, transfertypes[wtype], hdrlen, rpclen,
-               headerp, base, rdmab_lkey(req->rl_rdmabuf));
-
-       /*
-        * initialize send_iov's - normally only two: rdma chunk header and
-        * single preregistered RPC header buffer, but if padding is present,
-        * then use a preregistered (and zeroed) pad buffer between the RPC
-        * header and any write data. In all non-rdma cases, any following
-        * data has been copied into the RPC header buffer.
-        */
        req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
        req->rl_send_iov[0].length = hdrlen;
        req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);
@@ -587,6 +686,18 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 
        req->rl_niovs = 2;
        return 0;
+
+out_overflow:
+       pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n",
+               hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]);
+       /* Terminate this RPC. Chunks registered above will be
+        * released by xprt_release -> xprt_rdma_free.
+        */
+       return -EIO;
+
+out_unmap:
+       r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
+       return PTR_ERR(iptr);
 }
 
 /*
index 765bca4..0ba9887 100644 (file)
@@ -145,19 +145,32 @@ static __be32 *decode_reply_array(__be32 *va, __be32 *vaend)
        return (__be32 *)&ary->wc_array[nchunks];
 }
 
-int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp)
+/**
+ * svc_rdma_xdr_decode_req - Parse incoming RPC-over-RDMA header
+ * @rq_arg: Receive buffer
+ *
+ * On entry, xdr->head[0].iov_base points to first byte in the
+ * RPC-over-RDMA header.
+ *
+ * On successful exit, head[0] points to first byte past the
+ * RPC-over-RDMA header. For RDMA_MSG, this is the RPC message.
+ * The length of the RPC-over-RDMA header is returned.
+ */
+int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg)
 {
+       struct rpcrdma_msg *rmsgp;
        __be32 *va, *vaend;
        unsigned int len;
        u32 hdr_len;
 
        /* Verify that there's enough bytes for header + something */
-       if (rqstp->rq_arg.len <= RPCRDMA_HDRLEN_ERR) {
+       if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) {
                dprintk("svcrdma: header too short = %d\n",
-                       rqstp->rq_arg.len);
+                       rq_arg->len);
                return -EINVAL;
        }
 
+       rmsgp = (struct rpcrdma_msg *)rq_arg->head[0].iov_base;
        if (rmsgp->rm_vers != rpcrdma_version) {
                dprintk("%s: bad version %u\n", __func__,
                        be32_to_cpu(rmsgp->rm_vers));
@@ -189,10 +202,10 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp)
                        be32_to_cpu(rmsgp->rm_body.rm_padded.rm_thresh);
 
                va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
-               rqstp->rq_arg.head[0].iov_base = va;
+               rq_arg->head[0].iov_base = va;
                len = (u32)((unsigned long)va - (unsigned long)rmsgp);
-               rqstp->rq_arg.head[0].iov_len -= len;
-               if (len > rqstp->rq_arg.len)
+               rq_arg->head[0].iov_len -= len;
+               if (len > rq_arg->len)
                        return -EINVAL;
                return len;
        default:
@@ -205,7 +218,7 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp)
         * chunk list and a reply chunk list.
         */
        va = &rmsgp->rm_body.rm_chunks[0];
-       vaend = (__be32 *)((unsigned long)rmsgp + rqstp->rq_arg.len);
+       vaend = (__be32 *)((unsigned long)rmsgp + rq_arg->len);
        va = decode_read_list(va, vaend);
        if (!va) {
                dprintk("svcrdma: failed to decode read list\n");
@@ -222,10 +235,9 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp)
                return -EINVAL;
        }
 
-       rqstp->rq_arg.head[0].iov_base = va;
+       rq_arg->head[0].iov_base = va;
        hdr_len = (unsigned long)va - (unsigned long)rmsgp;
-       rqstp->rq_arg.head[0].iov_len -= hdr_len;
-
+       rq_arg->head[0].iov_len -= hdr_len;
        return hdr_len;
 }
 
index fbe7444..2c25606 100644 (file)
@@ -447,10 +447,8 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
        head->arg.len = rqstp->rq_arg.len;
        head->arg.buflen = rqstp->rq_arg.buflen;
 
-       ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
-       position = be32_to_cpu(ch->rc_position);
-
        /* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */
+       position = be32_to_cpu(ch->rc_position);
        if (position == 0) {
                head->arg.pages = &head->pages[0];
                page_offset = head->byte_len;
@@ -488,7 +486,7 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
        if (page_offset & 3) {
                u32 pad = 4 - (page_offset & 3);
 
-               head->arg.page_len += pad;
+               head->arg.tail[0].iov_len += pad;
                head->arg.len += pad;
                head->arg.buflen += pad;
                page_offset += pad;
@@ -510,11 +508,10 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
        return ret;
 }
 
-static int rdma_read_complete(struct svc_rqst *rqstp,
-                             struct svc_rdma_op_ctxt *head)
+static void rdma_read_complete(struct svc_rqst *rqstp,
+                              struct svc_rdma_op_ctxt *head)
 {
        int page_no;
-       int ret;
 
        /* Copy RPC pages */
        for (page_no = 0; page_no < head->count; page_no++) {
@@ -550,23 +547,6 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
        rqstp->rq_arg.tail[0] = head->arg.tail[0];
        rqstp->rq_arg.len = head->arg.len;
        rqstp->rq_arg.buflen = head->arg.buflen;
-
-       /* Free the context */
-       svc_rdma_put_context(head, 0);
-
-       /* XXX: What should this be? */
-       rqstp->rq_prot = IPPROTO_MAX;
-       svc_xprt_copy_addrs(rqstp, rqstp->rq_xprt);
-
-       ret = rqstp->rq_arg.head[0].iov_len
-               + rqstp->rq_arg.page_len
-               + rqstp->rq_arg.tail[0].iov_len;
-       dprintk("svcrdma: deferred read ret=%d, rq_arg.len=%u, "
-               "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len=%zu\n",
-               ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base,
-               rqstp->rq_arg.head[0].iov_len);
-
-       return ret;
 }
 
 /* By convention, backchannel calls arrive via rdma_msg type
@@ -624,7 +604,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
                                  dto_q);
                list_del_init(&ctxt->dto_q);
                spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
-               return rdma_read_complete(rqstp, ctxt);
+               rdma_read_complete(rqstp, ctxt);
+               goto complete;
        } else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
                ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next,
                                  struct svc_rdma_op_ctxt,
@@ -655,7 +636,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 
        /* Decode the RDMA header. */
        rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
-       ret = svc_rdma_xdr_decode_req(rmsgp, rqstp);
+       ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg);
        if (ret < 0)
                goto out_err;
        if (ret == 0)
@@ -682,6 +663,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
                return 0;
        }
 
+complete:
        ret = rqstp->rq_arg.head[0].iov_len
                + rqstp->rq_arg.page_len
                + rqstp->rq_arg.tail[0].iov_len;
index 4f1b1c4..54d5333 100644 (file)
@@ -463,25 +463,21 @@ static int send_reply(struct svcxprt_rdma *rdma,
                      struct svc_rqst *rqstp,
                      struct page *page,
                      struct rpcrdma_msg *rdma_resp,
-                     struct svc_rdma_op_ctxt *ctxt,
                      struct svc_rdma_req_map *vec,
                      int byte_count)
 {
+       struct svc_rdma_op_ctxt *ctxt;
        struct ib_send_wr send_wr;
        u32 xdr_off;
        int sge_no;
        int sge_bytes;
        int page_no;
        int pages;
-       int ret;
-
-       ret = svc_rdma_repost_recv(rdma, GFP_KERNEL);
-       if (ret) {
-               svc_rdma_put_context(ctxt, 0);
-               return -ENOTCONN;
-       }
+       int ret = -EIO;
 
        /* Prepare the context */
+       ctxt = svc_rdma_get_context(rdma);
+       ctxt->direction = DMA_TO_DEVICE;
        ctxt->pages[0] = page;
        ctxt->count = 1;
 
@@ -565,8 +561,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
  err:
        svc_rdma_unmap_dma(ctxt);
        svc_rdma_put_context(ctxt, 1);
-       pr_err("svcrdma: failed to send reply, rc=%d\n", ret);
-       return -EIO;
+       return ret;
 }
 
 void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
@@ -585,7 +580,6 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
        int ret;
        int inline_bytes;
        struct page *res_page;
-       struct svc_rdma_op_ctxt *ctxt;
        struct svc_rdma_req_map *vec;
 
        dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
@@ -598,8 +592,6 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
        rp_ary = svc_rdma_get_reply_array(rdma_argp, wr_ary);
 
        /* Build an req vec for the XDR */
-       ctxt = svc_rdma_get_context(rdma);
-       ctxt->direction = DMA_TO_DEVICE;
        vec = svc_rdma_get_req_map(rdma);
        ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec, wr_ary != NULL);
        if (ret)
@@ -635,7 +627,12 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
                inline_bytes -= ret;
        }
 
-       ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, vec,
+       /* Post a fresh Receive buffer _before_ sending the reply */
+       ret = svc_rdma_post_recv(rdma, GFP_KERNEL);
+       if (ret)
+               goto err1;
+
+       ret = send_reply(rdma, rqstp, res_page, rdma_resp, vec,
                         inline_bytes);
        if (ret < 0)
                goto err1;
@@ -648,7 +645,8 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
        put_page(res_page);
  err0:
        svc_rdma_put_req_map(rdma, vec);
-       svc_rdma_put_context(ctxt, 0);
+       pr_err("svcrdma: Could not send reply, err=%d. Closing transport.\n",
+              ret);
        set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
        return -ENOTCONN;
 }
index 9066896..dd94401 100644 (file)
@@ -789,7 +789,7 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
        int ret;
 
        dprintk("svcrdma: Creating RDMA socket\n");
-       if (sa->sa_family != AF_INET) {
+       if ((sa->sa_family != AF_INET) && (sa->sa_family != AF_INET6)) {
                dprintk("svcrdma: Address family %d is not supported.\n", sa->sa_family);
                return ERR_PTR(-EAFNOSUPPORT);
        }
@@ -805,6 +805,16 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
                goto err0;
        }
 
+       /* Allow both IPv4 and IPv6 sockets to bind a single port
+        * at the same time.
+        */
+#if IS_ENABLED(CONFIG_IPV6)
+       ret = rdma_set_afonly(listen_id, 1);
+       if (ret) {
+               dprintk("svcrdma: rdma_set_afonly failed = %d\n", ret);
+               goto err1;
+       }
+#endif
        ret = rdma_bind_addr(listen_id, sa);
        if (ret) {
                dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret);
@@ -1073,7 +1083,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
                newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV;
 
        /* Post receive buffers */
-       for (i = 0; i < newxprt->sc_rq_depth; i++) {
+       for (i = 0; i < newxprt->sc_max_requests; i++) {
                ret = svc_rdma_post_recv(newxprt, GFP_KERNEL);
                if (ret) {
                        dprintk("svcrdma: failure posting receive buffers\n");
@@ -1170,6 +1180,9 @@ static void __svc_rdma_free(struct work_struct *work)
 
        dprintk("svcrdma: %s(%p)\n", __func__, rdma);
 
+       if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
+               ib_drain_qp(rdma->sc_qp);
+
        /* We should only be called from kref_put */
        if (atomic_read(&xprt->xpt_ref.refcount) != 0)
                pr_err("svcrdma: sc_xprt still in use? (%d)\n",
index b1b009f..99d2e5b 100644 (file)
@@ -73,6 +73,8 @@ static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
 
 static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE;
 static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE;
+static unsigned int min_inline_size = RPCRDMA_MIN_INLINE;
+static unsigned int max_inline_size = RPCRDMA_MAX_INLINE;
 static unsigned int zero;
 static unsigned int max_padding = PAGE_SIZE;
 static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS;
@@ -96,6 +98,8 @@ static struct ctl_table xr_tunables_table[] = {
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
+               .extra1         = &min_inline_size,
+               .extra2         = &max_inline_size,
        },
        {
                .procname       = "rdma_max_inline_write",
@@ -103,6 +107,8 @@ static struct ctl_table xr_tunables_table[] = {
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
+               .extra1         = &min_inline_size,
+               .extra2         = &max_inline_size,
        },
        {
                .procname       = "rdma_inline_write_padding",
@@ -508,6 +514,7 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
 out:
        dprintk("RPC:       %s: size %zd, request 0x%p\n", __func__, size, req);
        req->rl_connect_cookie = 0;     /* our reserved value */
+       req->rl_task = task;
        return req->rl_sendbuf->rg_base;
 
 out_rdmabuf:
@@ -564,7 +571,6 @@ xprt_rdma_free(void *buffer)
        struct rpcrdma_req *req;
        struct rpcrdma_xprt *r_xprt;
        struct rpcrdma_regbuf *rb;
-       int i;
 
        if (buffer == NULL)
                return;
@@ -578,11 +584,8 @@ xprt_rdma_free(void *buffer)
 
        dprintk("RPC:       %s: called on 0x%p\n", __func__, req->rl_reply);
 
-       for (i = 0; req->rl_nchunks;) {
-               --req->rl_nchunks;
-               i += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
-                                                   &req->rl_segments[i]);
-       }
+       r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req,
+                                           !RPC_IS_ASYNC(req->rl_task));
 
        rpcrdma_buffer_put(req);
 }
@@ -707,6 +710,7 @@ static struct rpc_xprt_ops xprt_rdma_procs = {
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
        .bc_setup               = xprt_rdma_bc_setup,
        .bc_up                  = xprt_rdma_bc_up,
+       .bc_maxpayload          = xprt_rdma_bc_maxpayload,
        .bc_free_rqst           = xprt_rdma_bc_free_rqst,
        .bc_destroy             = xprt_rdma_bc_destroy,
 #endif
index f5ed9f9..b044d98 100644 (file)
@@ -203,15 +203,6 @@ out_fail:
        goto out_schedule;
 }
 
-static void
-rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
-{
-       struct ib_wc wc;
-
-       while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0)
-               rpcrdma_receive_wc(NULL, &wc);
-}
-
 static int
 rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
 {
@@ -373,23 +364,6 @@ out:
        return ERR_PTR(rc);
 }
 
-/*
- * Drain any cq, prior to teardown.
- */
-static void
-rpcrdma_clean_cq(struct ib_cq *cq)
-{
-       struct ib_wc wc;
-       int count = 0;
-
-       while (1 == ib_poll_cq(cq, 1, &wc))
-               ++count;
-
-       if (count)
-               dprintk("RPC:       %s: flushed %d events (last 0x%x)\n",
-                       __func__, count, wc.opcode);
-}
-
 /*
  * Exported functions.
  */
@@ -459,7 +433,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
        dprintk("RPC:       %s: memory registration strategy is '%s'\n",
                __func__, ia->ri_ops->ro_displayname);
 
-       rwlock_init(&ia->ri_qplock);
        return 0;
 
 out3:
@@ -515,7 +488,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
                        __func__);
                return -ENOMEM;
        }
-       max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS;
+       max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS - 1;
 
        /* check provider's send/recv wr limits */
        if (cdata->max_requests > max_qp_wr)
@@ -526,11 +499,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
        ep->rep_attr.srq = NULL;
        ep->rep_attr.cap.max_send_wr = cdata->max_requests;
        ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
+       ep->rep_attr.cap.max_send_wr += 1;      /* drain cqe */
        rc = ia->ri_ops->ro_open(ia, ep, cdata);
        if (rc)
                return rc;
        ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
        ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
+       ep->rep_attr.cap.max_recv_wr += 1;      /* drain cqe */
        ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS;
        ep->rep_attr.cap.max_recv_sge = 1;
        ep->rep_attr.cap.max_inline_data = 0;
@@ -578,6 +553,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
        ep->rep_attr.recv_cq = recvcq;
 
        /* Initialize cma parameters */
+       memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma));
 
        /* RPC/RDMA does not use private data */
        ep->rep_remote_cma.private_data = NULL;
@@ -591,7 +567,16 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
                ep->rep_remote_cma.responder_resources =
                                                ia->ri_device->attrs.max_qp_rd_atom;
 
-       ep->rep_remote_cma.retry_count = 7;
+       /* Limit transport retries so client can detect server
+        * GID changes quickly. RPC layer handles re-establishing
+        * transport connection and retransmission.
+        */
+       ep->rep_remote_cma.retry_count = 6;
+
+       /* RPC-over-RDMA handles its own flow control. In addition,
+        * make all RNR NAKs visible so we know that RPC-over-RDMA
+        * flow control is working correctly (no NAKs should be seen).
+        */
        ep->rep_remote_cma.flow_control = 0;
        ep->rep_remote_cma.rnr_retry_count = 0;
 
@@ -622,13 +607,8 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 
        cancel_delayed_work_sync(&ep->rep_connect_worker);
 
-       if (ia->ri_id->qp)
-               rpcrdma_ep_disconnect(ep, ia);
-
-       rpcrdma_clean_cq(ep->rep_attr.recv_cq);
-       rpcrdma_clean_cq(ep->rep_attr.send_cq);
-
        if (ia->ri_id->qp) {
+               rpcrdma_ep_disconnect(ep, ia);
                rdma_destroy_qp(ia->ri_id);
                ia->ri_id->qp = NULL;
        }
@@ -659,7 +639,6 @@ retry:
                dprintk("RPC:       %s: reconnecting...\n", __func__);
 
                rpcrdma_ep_disconnect(ep, ia);
-               rpcrdma_flush_cqs(ep);
 
                xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
                id = rpcrdma_create_id(xprt, ia,
@@ -692,10 +671,8 @@ retry:
                        goto out;
                }
 
-               write_lock(&ia->ri_qplock);
                old = ia->ri_id;
                ia->ri_id = id;
-               write_unlock(&ia->ri_qplock);
 
                rdma_destroy_qp(old);
                rpcrdma_destroy_id(old);
@@ -785,7 +762,6 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 {
        int rc;
 
-       rpcrdma_flush_cqs(ep);
        rc = rdma_disconnect(ia->ri_id);
        if (!rc) {
                /* returns without wait if not connected */
@@ -797,6 +773,8 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
                dprintk("RPC:       %s: rdma_disconnect %i\n", __func__, rc);
                ep->rep_connected = rc;
        }
+
+       ib_drain_qp(ia->ri_id->qp);
 }
 
 struct rpcrdma_req *
@@ -1271,25 +1249,3 @@ out_rc:
        rpcrdma_recv_buffer_put(rep);
        return rc;
 }
-
-/* How many chunk list items fit within our inline buffers?
- */
-unsigned int
-rpcrdma_max_segments(struct rpcrdma_xprt *r_xprt)
-{
-       struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
-       int bytes, segments;
-
-       bytes = min_t(unsigned int, cdata->inline_wsize, cdata->inline_rsize);
-       bytes -= RPCRDMA_HDRLEN_MIN;
-       if (bytes < sizeof(struct rpcrdma_segment) * 2) {
-               pr_warn("RPC:       %s: inline threshold too small\n",
-                       __func__);
-               return 0;
-       }
-
-       segments = 1 << (fls(bytes / sizeof(struct rpcrdma_segment)) - 1);
-       dprintk("RPC:       %s: max chunk list size = %d segments\n",
-               __func__, segments);
-       return segments;
-}
index 2ebc743..95cdc66 100644 (file)
@@ -65,7 +65,6 @@
  */
 struct rpcrdma_ia {
        const struct rpcrdma_memreg_ops *ri_ops;
-       rwlock_t                ri_qplock;
        struct ib_device        *ri_device;
        struct rdma_cm_id       *ri_id;
        struct ib_pd            *ri_pd;
@@ -73,6 +72,8 @@ struct rpcrdma_ia {
        struct completion       ri_done;
        int                     ri_async_rc;
        unsigned int            ri_max_frmr_depth;
+       unsigned int            ri_max_inline_write;
+       unsigned int            ri_max_inline_read;
        struct ib_qp_attr       ri_qp_attr;
        struct ib_qp_init_attr  ri_qp_init_attr;
 };
@@ -144,6 +145,26 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
 
 #define RPCRDMA_DEF_GFP                (GFP_NOIO | __GFP_NOWARN)
 
+/* To ensure a transport can always make forward progress,
+ * the number of RDMA segments allowed in header chunk lists
+ * is capped at 8. This prevents less-capable devices and
+ * memory registrations from overrunning the Send buffer
+ * while building chunk lists.
+ *
+ * Elements of the Read list take up more room than the
+ * Write list or Reply chunk. 8 read segments means the Read
+ * list (or Write list or Reply chunk) cannot consume more
+ * than
+ *
+ * ((8 + 2) * read segment size) + 1 XDR words, or 244 bytes.
+ *
+ * And the fixed part of the header is another 24 bytes.
+ *
+ * The smallest inline threshold is 1024 bytes, ensuring that
+ * at least 750 bytes are available for RPC messages.
+ */
+#define RPCRDMA_MAX_HDR_SEGS   (8)
+
 /*
  * struct rpcrdma_rep -- this structure encapsulates state required to recv
  * and complete a reply, asychronously. It needs several pieces of
@@ -162,7 +183,9 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
  */
 
 #define RPCRDMA_MAX_DATA_SEGS  ((1 * 1024 * 1024) / PAGE_SIZE)
-#define RPCRDMA_MAX_SEGS       (RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */
+
+/* data segments + head/tail for Call + head/tail for Reply */
+#define RPCRDMA_MAX_SEGS       (RPCRDMA_MAX_DATA_SEGS + 4)
 
 struct rpcrdma_buffer;
 
@@ -198,14 +221,13 @@ enum rpcrdma_frmr_state {
 };
 
 struct rpcrdma_frmr {
-       struct scatterlist              *sg;
-       int                             sg_nents;
+       struct scatterlist              *fr_sg;
+       int                             fr_nents;
+       enum dma_data_direction         fr_dir;
        struct ib_mr                    *fr_mr;
        struct ib_cqe                   fr_cqe;
        enum rpcrdma_frmr_state         fr_state;
        struct completion               fr_linv_done;
-       struct work_struct              fr_work;
-       struct rpcrdma_xprt             *fr_xprt;
        union {
                struct ib_reg_wr        fr_regwr;
                struct ib_send_wr       fr_invwr;
@@ -222,6 +244,8 @@ struct rpcrdma_mw {
                struct rpcrdma_fmr      fmr;
                struct rpcrdma_frmr     frmr;
        };
+       struct work_struct      mw_work;
+       struct rpcrdma_xprt     *mw_xprt;
        struct list_head        mw_list;
        struct list_head        mw_all;
 };
@@ -270,12 +294,14 @@ struct rpcrdma_req {
        unsigned int            rl_niovs;
        unsigned int            rl_nchunks;
        unsigned int            rl_connect_cookie;
+       struct rpc_task         *rl_task;
        struct rpcrdma_buffer   *rl_buffer;
        struct rpcrdma_rep      *rl_reply;/* holder for reply buffer */
        struct ib_sge           rl_send_iov[RPCRDMA_MAX_IOVS];
        struct rpcrdma_regbuf   *rl_rdmabuf;
        struct rpcrdma_regbuf   *rl_sendbuf;
        struct rpcrdma_mr_seg   rl_segments[RPCRDMA_MAX_SEGS];
+       struct rpcrdma_mr_seg   *rl_nextseg;
 
        struct ib_cqe           rl_cqe;
        struct list_head        rl_all;
@@ -372,8 +398,8 @@ struct rpcrdma_memreg_ops {
                                  struct rpcrdma_mr_seg *, int, bool);
        void            (*ro_unmap_sync)(struct rpcrdma_xprt *,
                                         struct rpcrdma_req *);
-       int             (*ro_unmap)(struct rpcrdma_xprt *,
-                                   struct rpcrdma_mr_seg *);
+       void            (*ro_unmap_safe)(struct rpcrdma_xprt *,
+                                        struct rpcrdma_req *, bool);
        int             (*ro_open)(struct rpcrdma_ia *,
                                   struct rpcrdma_ep *,
                                   struct rpcrdma_create_data_internal *);
@@ -456,7 +482,6 @@ struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *,
 void rpcrdma_free_regbuf(struct rpcrdma_ia *,
                         struct rpcrdma_regbuf *);
 
-unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *);
 int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int);
 
 int frwr_alloc_recovery_wq(void);
@@ -519,6 +544,9 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *);
  * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
  */
 int rpcrdma_marshal_req(struct rpc_rqst *);
+void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *,
+                                 struct rpcrdma_create_data_internal *,
+                                 unsigned int);
 
 /* RPC/RDMA module init - xprtrdma/transport.c
  */
@@ -534,6 +562,7 @@ void xprt_rdma_cleanup(void);
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
 int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int);
 int xprt_rdma_bc_up(struct svc_serv *, struct net *);
+size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *);
 int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int);
 void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *);
 int rpcrdma_bc_marshal_reply(struct rpc_rqst *);
index b90c539..2d3e0c4 100644 (file)
@@ -1364,6 +1364,11 @@ static int xs_tcp_bc_up(struct svc_serv *serv, struct net *net)
                return ret;
        return 0;
 }
+
+static size_t xs_tcp_bc_maxpayload(struct rpc_xprt *xprt)
+{
+       return PAGE_SIZE;
+}
 #else
 static inline int _xs_tcp_read_data(struct rpc_xprt *xprt,
                                        struct xdr_skb_reader *desc)
@@ -2661,6 +2666,7 @@ static struct rpc_xprt_ops xs_tcp_ops = {
 #ifdef CONFIG_SUNRPC_BACKCHANNEL
        .bc_setup               = xprt_setup_bc,
        .bc_up                  = xs_tcp_bc_up,
+       .bc_maxpayload          = xs_tcp_bc_maxpayload,
        .bc_free_rqst           = xprt_free_bc_rqst,
        .bc_destroy             = xprt_destroy_bc,
 #endif
index 4dfc5c1..f795b1d 100644 (file)
@@ -346,9 +346,15 @@ static int tipc_nl_compat_bearer_dump(struct tipc_nl_compat_msg *msg,
                                      struct nlattr **attrs)
 {
        struct nlattr *bearer[TIPC_NLA_BEARER_MAX + 1];
+       int err;
+
+       if (!attrs[TIPC_NLA_BEARER])
+               return -EINVAL;
 
-       nla_parse_nested(bearer, TIPC_NLA_BEARER_MAX, attrs[TIPC_NLA_BEARER],
-                        NULL);
+       err = nla_parse_nested(bearer, TIPC_NLA_BEARER_MAX,
+                              attrs[TIPC_NLA_BEARER], NULL);
+       if (err)
+               return err;
 
        return tipc_add_tlv(msg->rep, TIPC_TLV_BEARER_NAME,
                            nla_data(bearer[TIPC_NLA_BEARER_NAME]),
@@ -460,14 +466,31 @@ static int tipc_nl_compat_link_stat_dump(struct tipc_nl_compat_msg *msg,
        struct nlattr *link[TIPC_NLA_LINK_MAX + 1];
        struct nlattr *prop[TIPC_NLA_PROP_MAX + 1];
        struct nlattr *stats[TIPC_NLA_STATS_MAX + 1];
+       int err;
 
-       nla_parse_nested(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK], NULL);
+       if (!attrs[TIPC_NLA_LINK])
+               return -EINVAL;
 
-       nla_parse_nested(prop, TIPC_NLA_PROP_MAX, link[TIPC_NLA_LINK_PROP],
-                        NULL);
+       err = nla_parse_nested(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK],
+                              NULL);
+       if (err)
+               return err;
+
+       if (!link[TIPC_NLA_LINK_PROP])
+               return -EINVAL;
 
-       nla_parse_nested(stats, TIPC_NLA_STATS_MAX, link[TIPC_NLA_LINK_STATS],
-                        NULL);
+       err = nla_parse_nested(prop, TIPC_NLA_PROP_MAX,
+                              link[TIPC_NLA_LINK_PROP], NULL);
+       if (err)
+               return err;
+
+       if (!link[TIPC_NLA_LINK_STATS])
+               return -EINVAL;
+
+       err = nla_parse_nested(stats, TIPC_NLA_STATS_MAX,
+                              link[TIPC_NLA_LINK_STATS], NULL);
+       if (err)
+               return err;
 
        name = (char *)TLV_DATA(msg->req);
        if (strcmp(name, nla_data(link[TIPC_NLA_LINK_NAME])) != 0)
@@ -569,8 +592,15 @@ static int tipc_nl_compat_link_dump(struct tipc_nl_compat_msg *msg,
 {
        struct nlattr *link[TIPC_NLA_LINK_MAX + 1];
        struct tipc_link_info link_info;
+       int err;
 
-       nla_parse_nested(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK], NULL);
+       if (!attrs[TIPC_NLA_LINK])
+               return -EINVAL;
+
+       err = nla_parse_nested(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK],
+                              NULL);
+       if (err)
+               return err;
 
        link_info.dest = nla_get_flag(link[TIPC_NLA_LINK_DEST]);
        link_info.up = htonl(nla_get_flag(link[TIPC_NLA_LINK_UP]));
@@ -758,12 +788,23 @@ static int tipc_nl_compat_name_table_dump(struct tipc_nl_compat_msg *msg,
        u32 node, depth, type, lowbound, upbound;
        static const char * const scope_str[] = {"", " zone", " cluster",
                                                 " node"};
+       int err;
 
-       nla_parse_nested(nt, TIPC_NLA_NAME_TABLE_MAX,
-                        attrs[TIPC_NLA_NAME_TABLE], NULL);
+       if (!attrs[TIPC_NLA_NAME_TABLE])
+               return -EINVAL;
 
-       nla_parse_nested(publ, TIPC_NLA_PUBL_MAX, nt[TIPC_NLA_NAME_TABLE_PUBL],
-                        NULL);
+       err = nla_parse_nested(nt, TIPC_NLA_NAME_TABLE_MAX,
+                              attrs[TIPC_NLA_NAME_TABLE], NULL);
+       if (err)
+               return err;
+
+       if (!nt[TIPC_NLA_NAME_TABLE_PUBL])
+               return -EINVAL;
+
+       err = nla_parse_nested(publ, TIPC_NLA_PUBL_MAX,
+                              nt[TIPC_NLA_NAME_TABLE_PUBL], NULL);
+       if (err)
+               return err;
 
        ntq = (struct tipc_name_table_query *)TLV_DATA(msg->req);
 
@@ -815,8 +856,15 @@ static int __tipc_nl_compat_publ_dump(struct tipc_nl_compat_msg *msg,
 {
        u32 type, lower, upper;
        struct nlattr *publ[TIPC_NLA_PUBL_MAX + 1];
+       int err;
 
-       nla_parse_nested(publ, TIPC_NLA_PUBL_MAX, attrs[TIPC_NLA_PUBL], NULL);
+       if (!attrs[TIPC_NLA_PUBL])
+               return -EINVAL;
+
+       err = nla_parse_nested(publ, TIPC_NLA_PUBL_MAX, attrs[TIPC_NLA_PUBL],
+                              NULL);
+       if (err)
+               return err;
 
        type = nla_get_u32(publ[TIPC_NLA_PUBL_TYPE]);
        lower = nla_get_u32(publ[TIPC_NLA_PUBL_LOWER]);
@@ -876,7 +924,13 @@ static int tipc_nl_compat_sk_dump(struct tipc_nl_compat_msg *msg,
        u32 sock_ref;
        struct nlattr *sock[TIPC_NLA_SOCK_MAX + 1];
 
-       nla_parse_nested(sock, TIPC_NLA_SOCK_MAX, attrs[TIPC_NLA_SOCK], NULL);
+       if (!attrs[TIPC_NLA_SOCK])
+               return -EINVAL;
+
+       err = nla_parse_nested(sock, TIPC_NLA_SOCK_MAX, attrs[TIPC_NLA_SOCK],
+                              NULL);
+       if (err)
+               return err;
 
        sock_ref = nla_get_u32(sock[TIPC_NLA_SOCK_REF]);
        tipc_tlv_sprintf(msg->rep, "%u:", sock_ref);
@@ -917,9 +971,15 @@ static int tipc_nl_compat_media_dump(struct tipc_nl_compat_msg *msg,
                                     struct nlattr **attrs)
 {
        struct nlattr *media[TIPC_NLA_MEDIA_MAX + 1];
+       int err;
+
+       if (!attrs[TIPC_NLA_MEDIA])
+               return -EINVAL;
 
-       nla_parse_nested(media, TIPC_NLA_MEDIA_MAX, attrs[TIPC_NLA_MEDIA],
-                        NULL);
+       err = nla_parse_nested(media, TIPC_NLA_MEDIA_MAX, attrs[TIPC_NLA_MEDIA],
+                              NULL);
+       if (err)
+               return err;
 
        return tipc_add_tlv(msg->rep, TIPC_TLV_MEDIA_NAME,
                            nla_data(media[TIPC_NLA_MEDIA_NAME]),
@@ -931,8 +991,15 @@ static int tipc_nl_compat_node_dump(struct tipc_nl_compat_msg *msg,
 {
        struct tipc_node_info node_info;
        struct nlattr *node[TIPC_NLA_NODE_MAX + 1];
+       int err;
 
-       nla_parse_nested(node, TIPC_NLA_NODE_MAX, attrs[TIPC_NLA_NODE], NULL);
+       if (!attrs[TIPC_NLA_NODE])
+               return -EINVAL;
+
+       err = nla_parse_nested(node, TIPC_NLA_NODE_MAX, attrs[TIPC_NLA_NODE],
+                              NULL);
+       if (err)
+               return err;
 
        node_info.addr = htonl(nla_get_u32(node[TIPC_NLA_NODE_ADDR]));
        node_info.up = htonl(nla_get_flag(node[TIPC_NLA_NODE_UP]));
@@ -971,8 +1038,16 @@ static int tipc_nl_compat_net_dump(struct tipc_nl_compat_msg *msg,
 {
        __be32 id;
        struct nlattr *net[TIPC_NLA_NET_MAX + 1];
+       int err;
+
+       if (!attrs[TIPC_NLA_NET])
+               return -EINVAL;
+
+       err = nla_parse_nested(net, TIPC_NLA_NET_MAX, attrs[TIPC_NLA_NET],
+                              NULL);
+       if (err)
+               return err;
 
-       nla_parse_nested(net, TIPC_NLA_NET_MAX, attrs[TIPC_NLA_NET], NULL);
        id = htonl(nla_get_u32(net[TIPC_NLA_NET_ID]));
 
        return tipc_add_tlv(msg->rep, TIPC_TLV_UNSIGNED, &id, sizeof(id));
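
Every TIPC compat dump hunk above applies the same hardening: check that the top-level attribute is present, then propagate nla_parse_nested() failures instead of discarding them. A stand-alone sketch of the pattern, using the bearer case from the first hunk (kernel context assumed; not compilable outside the tree):

static int example_compat_dump(struct tipc_nl_compat_msg *msg,
                               struct nlattr **attrs)
{
        struct nlattr *bearer[TIPC_NLA_BEARER_MAX + 1];
        int err;

        /* the nested attribute may be missing entirely */
        if (!attrs[TIPC_NLA_BEARER])
                return -EINVAL;

        /* parsing can fail; return the error rather than ignoring it */
        err = nla_parse_nested(bearer, TIPC_NLA_BEARER_MAX,
                               attrs[TIPC_NLA_BEARER], NULL);
        if (err)
                return err;

        /* only now is bearer[] safe to dereference */
        return tipc_add_tlv(msg->rep, TIPC_TLV_BEARER_NAME,
                            nla_data(bearer[TIPC_NLA_BEARER_NAME]),
                            nla_len(bearer[TIPC_NLA_BEARER_NAME]));
}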
index b2ab2a9..0f82314 100644 (file)
@@ -7,6 +7,7 @@ quote   := "
 squote  := '
 empty   :=
 space   := $(empty) $(empty)
+space_escape := _-_SPACE_-_
 
 ###
 # Name of target with a '.' as filename prefix. foo/bar.o => foo/.bar.o
@@ -226,10 +227,10 @@ objectify = $(foreach o,$(1),$(if $(filter /%,$(o)),$(o),$(obj)/$(o)))
 # See Documentation/kbuild/makefiles.txt for more info
 
 ifneq ($(KBUILD_NOCMDDEP),1)
-# Check if both arguments has same arguments. Result is empty string if equal.
-# User may override this check using make KBUILD_NOCMDDEP=1
-arg-check = $(strip $(filter-out $(cmd_$(1)), $(cmd_$@)) \
-                    $(filter-out $(cmd_$@),   $(cmd_$(1))) )
+# Check if both arguments are the same including their order. Result is empty
+# string if equal. User may override this check using make KBUILD_NOCMDDEP=1
+arg-check = $(filter-out $(subst $(space),$(space_escape),$(strip $(cmd_$@))), \
+                         $(subst $(space),$(space_escape),$(strip $(cmd_$1))))
 else
 arg-check = $(if $(strip $(cmd_$@)),,1)
 endif
@@ -256,10 +257,42 @@ if_changed = $(if $(strip $(any-prereq) $(arg-check)),                       \
 # Execute the command and also postprocess generated .d dependencies file.
 if_changed_dep = $(if $(strip $(any-prereq) $(arg-check) ),                  \
        @set -e;                                                             \
+       $(cmd_and_fixdep), @:)
+
+ifndef CONFIG_TRIM_UNUSED_KSYMS
+
+cmd_and_fixdep =                                                             \
        $(echo-cmd) $(cmd_$(1));                                             \
        scripts/basic/fixdep $(depfile) $@ '$(make-cmd)' > $(dot-target).tmp;\
        rm -f $(depfile);                                                    \
-       mv -f $(dot-target).tmp $(dot-target).cmd, @:)
+       mv -f $(dot-target).tmp $(dot-target).cmd;
+
+else
+
+# Filter out exported kernel symbol names from the preprocessor output.
+# See also __KSYM_DEPS__ in include/linux/export.h.
+# We disable the depfile generation here, so as not to overwrite the existing
+# depfile while fixdep is parsing it.
+flags_nodeps = $(filter-out -Wp$(comma)-M%, $($(1)))
+ksym_dep_filter =                                                            \
+       case "$(1)" in                                                       \
+         cc_*_c|cpp_i_c)                                                    \
+           $(CPP) $(call flags_nodeps,c_flags) -D__KSYM_DEPS__ $< ;;        \
+         as_*_S|cpp_s_S)                                                    \
+           $(CPP) $(call flags_nodeps,a_flags) -D__KSYM_DEPS__ $< ;;        \
+         boot*|build*|*cpp_lds_S|dtc|host*|vdso*) : ;;                      \
+         *) echo "Don't know how to preprocess $(1)" >&2; false ;;          \
+       esac | tr ";" "\n" | sed -rn 's/^.*=== __KSYM_(.*) ===.*$$/KSYM_\1/p'
+
+cmd_and_fixdep =                                                             \
+       $(echo-cmd) $(cmd_$(1));                                             \
+       $(ksym_dep_filter) |                                                 \
+               scripts/basic/fixdep -e $(depfile) $@ '$(make-cmd)'          \
+                       > $(dot-target).tmp;                                 \
+       rm -f $(depfile);                                                    \
+       mv -f $(dot-target).tmp $(dot-target).cmd;
+
+endif
 
 # Usage: $(call if_changed_rule,foo)
 # Will check if $(cmd_foo) or any of the prerequisites changed,
@@ -341,8 +374,6 @@ endif
 #
 ###############################################################################
 #
-space_escape := %%%SPACE%%%
-#
 define config_filename
 ifneq ($$(CONFIG_$(1)),"")
 $(1)_FILENAME := $$(subst \\,\,$$(subst \$$(quote),$$(quote),$$(subst $$(space_escape),\$$(space),$$(patsubst "%",%,$$(subst $$(space),$$(space_escape),$$(CONFIG_$(1)))))))
index e1bc190..0d1ca5b 100644 (file)
@@ -152,11 +152,11 @@ cmd_cc_s_c       = $(CC) $(c_flags) $(DISABLE_LTO) -fverbose-asm -S -o $@ $<
 $(obj)/%.s: $(src)/%.c FORCE
        $(call if_changed_dep,cc_s_c)
 
-quiet_cmd_cc_i_c = CPP $(quiet_modtag) $@
-cmd_cc_i_c       = $(CPP) $(c_flags)   -o $@ $<
+quiet_cmd_cpp_i_c = CPP $(quiet_modtag) $@
+cmd_cpp_i_c       = $(CPP) $(c_flags) -o $@ $<
 
 $(obj)/%.i: $(src)/%.c FORCE
-       $(call if_changed_dep,cc_i_c)
+       $(call if_changed_dep,cpp_i_c)
 
 cmd_gensymtypes =                                                           \
     $(CPP) -D__GENKSYMS__ $(c_flags) $< |                                   \
@@ -266,26 +266,24 @@ endif # CONFIG_STACK_VALIDATION
 
 define rule_cc_o_c
        $(call echo-cmd,checksrc) $(cmd_checksrc)                         \
-       $(call echo-cmd,cc_o_c) $(cmd_cc_o_c);                            \
+       $(call cmd_and_fixdep,cc_o_c)                                     \
        $(cmd_modversions)                                                \
-       $(cmd_objtool)                                            \
-       $(call echo-cmd,record_mcount)                                    \
-       $(cmd_record_mcount)                                              \
-       scripts/basic/fixdep $(depfile) $@ '$(call make-cmd,cc_o_c)' >    \
-                                                     $(dot-target).tmp;  \
-       rm -f $(depfile);                                                 \
-       mv -f $(dot-target).tmp $(dot-target).cmd
+       $(cmd_objtool)                                                    \
+       $(call echo-cmd,record_mcount) $(cmd_record_mcount)
 endef
 
 define rule_as_o_S
-       $(call echo-cmd,as_o_S) $(cmd_as_o_S);                            \
-       $(cmd_objtool)                                            \
-       scripts/basic/fixdep $(depfile) $@ '$(call make-cmd,as_o_S)' >    \
-                                                     $(dot-target).tmp;  \
-       rm -f $(depfile);                                                 \
-       mv -f $(dot-target).tmp $(dot-target).cmd
+       $(call cmd_and_fixdep,as_o_S)                                     \
+       $(cmd_objtool)
 endef
 
+# List module undefined symbols (or empty line if not enabled)
+ifdef CONFIG_TRIM_UNUSED_KSYMS
+cmd_undef_syms = $(NM) $@ | sed -n 's/^ \+U //p' | xargs echo
+else
+cmd_undef_syms = echo
+endif
+
 # Built-in and composite module parts
 $(obj)/%.o: $(src)/%.c $(recordmcount_source) $(objtool_obj) FORCE
        $(call cmd,force_checksrc)
@@ -296,7 +294,8 @@ $(obj)/%.o: $(src)/%.c $(recordmcount_source) $(objtool_obj) FORCE
 $(single-used-m): $(obj)/%.o: $(src)/%.c $(recordmcount_source) $(objtool_obj) FORCE
        $(call cmd,force_checksrc)
        $(call if_changed_rule,cc_o_c)
-       @{ echo $(@:.o=.ko); echo $@; } > $(MODVERDIR)/$(@F:.o=.mod)
+       @{ echo $(@:.o=.ko); echo $@; \
+          $(cmd_undef_syms); } > $(MODVERDIR)/$(@F:.o=.mod)
 
 quiet_cmd_cc_lst_c = MKLST   $@
       cmd_cc_lst_c = $(CC) $(c_flags) -g -c -o $*.o $< && \
@@ -314,11 +313,11 @@ modkern_aflags := $(KBUILD_AFLAGS_KERNEL) $(AFLAGS_KERNEL)
 $(real-objs-m)      : modkern_aflags := $(KBUILD_AFLAGS_MODULE) $(AFLAGS_MODULE)
 $(real-objs-m:.o=.s): modkern_aflags := $(KBUILD_AFLAGS_MODULE) $(AFLAGS_MODULE)
 
-quiet_cmd_as_s_S = CPP $(quiet_modtag) $@
-cmd_as_s_S       = $(CPP) $(a_flags)   -o $@ $<
+quiet_cmd_cpp_s_S = CPP $(quiet_modtag) $@
+cmd_cpp_s_S       = $(CPP) $(a_flags) -o $@ $<
 
 $(obj)/%.s: $(src)/%.S FORCE
-       $(call if_changed_dep,as_s_S)
+       $(call if_changed_dep,cpp_s_S)
 
 quiet_cmd_as_o_S = AS $(quiet_modtag)  $@
 cmd_as_o_S       = $(CC) $(a_flags) -c -o $@ $<
@@ -426,7 +425,8 @@ $(call multi_depend, $(multi-used-y), .o, -objs -y)
 
 $(multi-used-m): FORCE
        $(call if_changed,link_multi-m)
-       @{ echo $(@:.o=.ko); echo $(link_multi_deps); } > $(MODVERDIR)/$(@F:.o=.mod)
+       @{ echo $(@:.o=.ko); echo $(link_multi_deps); \
+          $(cmd_undef_syms); } > $(MODVERDIR)/$(@F:.o=.mod)
 $(call multi_depend, $(multi-used-m), .o, -objs -y -m)
 
 targets += $(multi-used-y) $(multi-used-m)
index f9e47a7..53449a6 100644 (file)
@@ -24,6 +24,7 @@ warning-1 += $(call cc-option, -Wmissing-prototypes)
 warning-1 += -Wold-style-definition
 warning-1 += $(call cc-option, -Wmissing-include-dirs)
 warning-1 += $(call cc-option, -Wunused-but-set-variable)
+warning-1 += $(call cc-option, -Wunused-const-variable)
 warning-1 += $(call cc-disable-warning, missing-field-initializers)
 warning-1 += $(call cc-disable-warning, sign-compare)
 
index ed1b7c4..e7df0f5 100644 (file)
@@ -96,10 +96,10 @@ obj-dirs    := $(addprefix $(obj)/,$(obj-dirs))
 # Note: Files that end up in two or more modules are compiled without the
 #       KBUILD_MODNAME definition. The reason is that any made-up name would
 #       differ in different configs.
-name-fix = $(subst $(comma),_,$(subst -,_,$1))
-basename_flags = -D"KBUILD_BASENAME=KBUILD_STR($(call name-fix,$(basetarget)))"
+name-fix = $(squote)$(quote)$(subst $(comma),_,$(subst -,_,$1))$(quote)$(squote)
+basename_flags = -DKBUILD_BASENAME=$(call name-fix,$(basetarget))
 modname_flags  = $(if $(filter 1,$(words $(modname))),\
-                 -D"KBUILD_MODNAME=KBUILD_STR($(call name-fix,$(modname)))")
+                 -DKBUILD_MODNAME=$(call name-fix,$(modname)))
 
 orig_c_flags   = $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) $(KBUILD_SUBDIR_CCFLAGS) \
                  $(ccflags-y) $(CFLAGS_$(basetarget).o)
@@ -162,7 +162,7 @@ endif
 
 c_flags        = -Wp,-MD,$(depfile) $(NOSTDINC_FLAGS) $(LINUXINCLUDE)     \
                 $(__c_flags) $(modkern_cflags)                           \
-                -D"KBUILD_STR(s)=\#s" $(basename_flags) $(modname_flags)
+                $(basename_flags) $(modname_flags)
 
 a_flags        = -Wp,-MD,$(depfile) $(NOSTDINC_FLAGS) $(LINUXINCLUDE)     \
                 $(__a_flags) $(modkern_aflags)
diff --git a/scripts/adjust_autoksyms.sh b/scripts/adjust_autoksyms.sh
new file mode 100755 (executable)
index 0000000..8dc1918
--- /dev/null
@@ -0,0 +1,101 @@
+#!/bin/sh
+
+# Script to create/update include/generated/autoksyms.h and dependency files
+#
+# Copyright:   (C) 2016  Linaro Limited
+# Created by:  Nicolas Pitre, January 2016
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+# Create/update the include/generated/autoksyms.h file from the list
+# of all module's needed symbols as recorded on the third line of
+# .tmp_versions/*.mod files.
+#
+# For each symbol being added or removed, the corresponding dependency
+# file's timestamp is updated to force a rebuild of the affected source
+# file. All arguments passed to this script are assumed to be a command
+# to be exec'd to trigger a rebuild of those files.
+
+set -e
+
+cur_ksyms_file="include/generated/autoksyms.h"
+new_ksyms_file="include/generated/autoksyms.h.tmpnew"
+
+info() {
+       if [ "$quiet" != "silent_" ]; then
+               printf "  %-7s %s\n" "$1" "$2"
+       fi
+}
+
+info "CHK" "$cur_ksyms_file"
+
+# Use "make V=1" to debug this script.
+case "$KBUILD_VERBOSE" in
+*1*)
+       set -x
+       ;;
+esac
+
+# We need access to CONFIG_ symbols
+case "${KCONFIG_CONFIG}" in
+*/*)
+       . "${KCONFIG_CONFIG}"
+       ;;
+*)
+       # Force using a file from the current directory
+       . "./${KCONFIG_CONFIG}"
+esac
+
+# In case it doesn't exist yet...
+if [ -e "$cur_ksyms_file" ]; then touch "$cur_ksyms_file"; fi
+
+# Generate a new ksym list file with symbols needed by the current
+# set of modules.
+cat > "$new_ksyms_file" << EOT
+/*
+ * Automatically generated file; DO NOT EDIT.
+ */
+
+EOT
+sed -ns -e '3{s/ /\n/g;/^$/!p;}' "$MODVERDIR"/*.mod | sort -u |
+while read sym; do
+       if [ -n "$CONFIG_HAVE_UNDERSCORE_SYMBOL_PREFIX" ]; then
+               sym="${sym#_}"
+       fi
+       echo "#define __KSYM_${sym} 1"
+done >> "$new_ksyms_file"
+
+# Special case for modversions (see modpost.c)
+if [ -n "$CONFIG_MODVERSIONS" ]; then
+       echo "#define __KSYM_module_layout 1" >> "$new_ksyms_file"
+fi
+
+# Extract changes between old and new list and touch corresponding
+# dependency files.
+changed=$(
+count=0
+sort "$cur_ksyms_file" "$new_ksyms_file" | uniq -u |
+sed -n 's/^#define __KSYM_\(.*\) 1/\1/p' | tr "A-Z_" "a-z/" |
+while read sympath; do
+       if [ -z "$sympath" ]; then continue; fi
+       depfile="include/config/ksym/${sympath}.h"
+       mkdir -p "$(dirname "$depfile")"
+       touch "$depfile"
+       echo $((count += 1))
+done | tail -1 )
+changed=${changed:-0}
+
+if [ $changed -gt 0 ]; then
+       # Replace the old list with the new one
+       old=$(grep -c "^#define __KSYM_" "$cur_ksyms_file" || true)
+       new=$(grep -c "^#define __KSYM_" "$new_ksyms_file" || true)
+       info "KSYMS" "symbols: before=$old, after=$new, changed=$changed"
+       info "UPD" "$cur_ksyms_file"
+       mv -f "$new_ksyms_file" "$cur_ksyms_file"
+       # Then trigger a rebuild of affected source files
+       exec $@
+else
+       rm -f "$new_ksyms_file"
+fi
index caef815..746ec1e 100644 (file)
 #define INT_NFIG ntohl(0x4e464947)
 #define INT_FIG_ ntohl(0x4649475f)
 
+int insert_extra_deps;
 char *target;
 char *depfile;
 char *cmdline;
 
 static void usage(void)
 {
-       fprintf(stderr, "Usage: fixdep <depfile> <target> <cmdline>\n");
+       fprintf(stderr, "Usage: fixdep [-e] <depfile> <target> <cmdline>\n");
+       fprintf(stderr, " -e  insert extra dependencies given on stdin\n");
        exit(1);
 }
 
@@ -138,6 +140,40 @@ static void print_cmdline(void)
        printf("cmd_%s := %s\n\n", target, cmdline);
 }
 
+/*
+ * Print out a dependency path from a symbol name
+ */
+static void print_config(const char *m, int slen)
+{
+       int c, i;
+
+       printf("    $(wildcard include/config/");
+       for (i = 0; i < slen; i++) {
+               c = m[i];
+               if (c == '_')
+                       c = '/';
+               else
+                       c = tolower(c);
+               putchar(c);
+       }
+       printf(".h) \\\n");
+}
+
+static void do_extra_deps(void)
+{
+       if (insert_extra_deps) {
+               char buf[80];
+               while(fgets(buf, sizeof(buf), stdin)) {
+                       int len = strlen(buf);
+                       if (len < 2 || buf[len-1] != '\n') {
+                               fprintf(stderr, "fixdep: bad data on stdin\n");
+                               exit(1);
+                       }
+                       print_config(buf, len-1);
+               }
+       }
+}
+
 struct item {
        struct item     *next;
        unsigned int    len;
@@ -197,23 +233,12 @@ static void define_config(const char *name, int len, unsigned int hash)
 static void use_config(const char *m, int slen)
 {
        unsigned int hash = strhash(m, slen);
-       int c, i;
 
        if (is_defined_config(m, slen, hash))
            return;
 
        define_config(m, slen, hash);
-
-       printf("    $(wildcard include/config/");
-       for (i = 0; i < slen; i++) {
-               c = m[i];
-               if (c == '_')
-                       c = '/';
-               else
-                       c = tolower(c);
-               putchar(c);
-       }
-       printf(".h) \\\n");
+       print_config(m, slen);
 }
 
 static void parse_config_file(const char *map, size_t len)
@@ -250,7 +275,7 @@ static void parse_config_file(const char *map, size_t len)
        }
 }
 
-/* test is s ends in sub */
+/* test if s ends in sub */
 static int strrcmp(const char *s, const char *sub)
 {
        int slen = strlen(s);
@@ -333,6 +358,7 @@ static void parse_dep_file(void *map, size_t len)
 
                        /* Ignore certain dependencies */
                        if (strrcmp(s, "include/generated/autoconf.h") &&
+                           strrcmp(s, "include/generated/autoksyms.h") &&
                            strrcmp(s, "arch/um/include/uml-config.h") &&
                            strrcmp(s, "include/linux/kconfig.h") &&
                            strrcmp(s, ".ver")) {
@@ -378,6 +404,8 @@ static void parse_dep_file(void *map, size_t len)
                exit(1);
        }
 
+       do_extra_deps();
+
        printf("\n%s: $(deps_%s)\n\n", target, target);
        printf("$(deps_%s):\n", target);
 }
@@ -434,7 +462,10 @@ int main(int argc, char *argv[])
 {
        traps();
 
-       if (argc != 4)
+       if (argc == 5 && !strcmp(argv[1], "-e")) {
+               insert_extra_deps = 1;
+               argv++;
+       } else if (argc != 4)
                usage();
 
        depfile = argv[1];
index 6750595..4904ced 100755 (executable)
@@ -2454,6 +2454,7 @@ sub process {
 
 # Check for git id commit length and improperly formed commit descriptions
                if ($in_commit_log && !$commit_log_possible_stack_dump &&
+                   $line !~ /^\s*(?:Link|Patchwork|http|BugLink):/i &&
                    ($line =~ /\bcommit\s+[0-9a-f]{5,}\b/i ||
                     ($line =~ /\b[0-9a-f]{12,40}\b/i &&
                      $line !~ /[\<\[][0-9a-f]{12,40}[\>\]]/i &&
index b2d7581..dd85a45 100755 (executable)
@@ -98,7 +98,7 @@ run_cmd() {
 }
 
 kill_running() {
-       for i in $(seq $(( NPROC - 1 )) ); do
+       for i in $(seq $(( NPROC - 1 )) ); do
                if [ $VERBOSE -eq 2 ] ; then
                        echo "Killing ${SPATCH_PID[$i]}"
                fi
index 8ee0ac3..eb6bd9e 100644 (file)
@@ -106,7 +106,7 @@ position j0, j1, j2;
 @match_function_and_data_after_init_timer_context
 depends on !patch &&
 !match_immediate_function_data_after_init_timer_context &&
-(context || org || report)@
+ (context || org || report)@
 expression a, b, e1, e2, e3, e4, e5;
 position j0, j1, j2;
 @@
@@ -127,7 +127,7 @@ position j0, j1, j2;
 @r3_context depends on !patch &&
 !match_immediate_function_data_after_init_timer_context &&
 !match_function_and_data_after_init_timer_context &&
-(context || org || report)@
+ (context || org || report)@
 expression c, e6, e7;
 position r1.p;
 position j0, j1;
diff --git a/scripts/coccinelle/misc/compare_const_fl.cocci b/scripts/coccinelle/misc/compare_const_fl.cocci
deleted file mode 100644 (file)
index b5d4bab..0000000
+++ /dev/null
@@ -1,171 +0,0 @@
-/// Move constants to the right of binary operators.
-//# Depends on personal taste in some cases.
-///
-// Confidence: Moderate
-// Copyright: (C) 2015 Copyright: (C) 2015 Julia Lawall, Inria. GPLv2.
-// URL: http://coccinelle.lip6.fr/
-// Options: --no-includes --include-headers
-
-virtual patch
-virtual context
-virtual org
-virtual report
-
-@r1 depends on patch && !context && !org && !report
- disable bitor_comm, neg_if_exp@
-constant c,c1;
-local idexpression i;
-expression e,e1,e2;
-binary operator b = {==,!=,&,|};
-type t;
-@@
-
-(
-c b (c1)
-|
-sizeof(t) b e1
-|
-sizeof e b e1
-|
-i b e1
-|
-c | e1 | e2 | ...
-|
-c | (e ? e1 : e2)
-|
-- c
-+ e
-b
-- e
-+ c
-)
-
-@r2 depends on patch && !context && !org && !report
- disable gtr_lss, gtr_lss_eq, not_int2@
-constant c,c1;
-expression e,e1,e2;
-binary operator b;
-binary operator b1 = {<,<=},b2 = {<,<=};
-binary operator b3 = {>,>=},b4 = {>,>=};
-local idexpression i;
-type t;
-@@
-
-(
-c b c1
-|
-sizeof(t) b e1
-|
-sizeof e b e1
-|
- (e1 b1 e) && (e b2 e2)
-|
- (e1 b3 e) && (e b4 e2)
-|
-i b e
-|
-- c < e
-+ e > c
-|
-- c <= e
-+ e >= c
-|
-- c > e
-+ e < c
-|
-- c >= e
-+ e <= c
-)
-
-// ----------------------------------------------------------------------------
-
-@r1_context depends on !patch && (context || org || report)
- disable bitor_comm, neg_if_exp exists@
-type t;
-binary operator b = {==,!=,&,|};
-constant c, c1;
-expression e, e1, e2;
-local idexpression i;
-position j0;
-@@
-
-(
-c b (c1)
-|
-sizeof(t) b e1
-|
-sizeof e b e1
-|
-i b e1
-|
-c | e1 | e2 | ...
-|
-c | (e ? e1 : e2)
-|
-* c@j0 b e
-)
-
-@r2_context depends on !patch && (context || org || report)
- disable gtr_lss, gtr_lss_eq, not_int2 exists@
-type t;
-binary operator b, b1 = {<,<=}, b2 = {<,<=}, b3 = {>,>=}, b4 = {>,>=};
-constant c, c1;
-expression e, e1, e2;
-local idexpression i;
-position j0;
-@@
-
-(
-c b c1
-|
-sizeof(t) b e1
-|
-sizeof e b e1
-|
- (e1 b1 e) && (e b2 e2)
-|
- (e1 b3 e) && (e b4 e2)
-|
-i b e
-|
-* c@j0 < e
-|
-* c@j0 <= e
-|
-* c@j0 > e
-|
-* c@j0 >= e
-)
-
-// ----------------------------------------------------------------------------
-
-@script:python r1_org depends on org@
-j0 << r1_context.j0;
-@@
-
-msg = "Move constant to right."
-coccilib.org.print_todo(j0[0], msg)
-
-@script:python r2_org depends on org@
-j0 << r2_context.j0;
-@@
-
-msg = "Move constant to right."
-coccilib.org.print_todo(j0[0], msg)
-
-// ----------------------------------------------------------------------------
-
-@script:python r1_report depends on report@
-j0 << r1_context.j0;
-@@
-
-msg = "Move constant to right."
-coccilib.report.print_report(j0[0], msg)
-
-@script:python r2_report depends on report@
-j0 << r2_context.j0;
-@@
-
-msg = "Move constant to right."
-coccilib.report.print_report(j0[0], msg)
-
index dafaf96..06121ce 100644 (file)
@@ -873,5 +873,8 @@ int main(int argc, char **argv)
                        (double)nsyms / (double)HASH_BUCKETS);
        }
 
+       if (dumpfile)
+               fclose(dumpfile);
+
        return errors != 0;
 }
index dd243d2..297b079 100644 (file)
@@ -375,7 +375,9 @@ load:
                                continue;
                } else {
                        if (line[0] != '\r' && line[0] != '\n')
-                               conf_warning("unexpected data");
+                               conf_warning("unexpected data: %.*s",
+                                            (int)strcspn(line, "\r\n"), line);
+
                        continue;
                }
 setsym:
index 25cf0c2..2432298 100644 (file)
@@ -209,12 +209,26 @@ static void sym_set_all_changed(void)
 static void sym_calc_visibility(struct symbol *sym)
 {
        struct property *prop;
+       struct symbol *choice_sym = NULL;
        tristate tri;
 
        /* any prompt visible? */
        tri = no;
+
+       if (sym_is_choice_value(sym))
+               choice_sym = prop_get_symbol(sym_get_choice_prop(sym));
+
        for_all_prompts(sym, prop) {
                prop->visible.tri = expr_calc_value(prop->visible.expr);
+               /*
+                * Tristate choice_values with visibility 'mod' are
+                * not visible if the corresponding choice's value is
+                * 'yes'.
+                */
+               if (choice_sym && sym->type == S_TRISTATE &&
+                   prop->visible.tri == mod && choice_sym->curr.tri == yes)
+                       prop->visible.tri = no;
+
                tri = EXPR_OR(tri, prop->visible.tri);
        }
        if (tri == mod && (sym->type != S_TRISTATE || modules_val == no))
index c2c7389..71b4a8a 100644 (file)
@@ -52,7 +52,7 @@ rpm-pkg rpm: FORCE
        $(call cmd,src_tar,$(KERNELPATH),kernel.spec)
        $(CONFIG_SHELL) $(srctree)/scripts/mkversion > $(objtree)/.tmp_version
        mv -f $(objtree)/.tmp_version $(objtree)/.version
-       rpmbuild --target $(UTS_MACHINE) -ta $(KERNELPATH).tar.gz
+       rpmbuild $(RPMOPTS) --target $(UTS_MACHINE) -ta $(KERNELPATH).tar.gz
        rm $(KERNELPATH).tar.gz kernel.spec
 
 # binrpm-pkg
@@ -63,7 +63,7 @@ binrpm-pkg: FORCE
        $(CONFIG_SHELL) $(srctree)/scripts/mkversion > $(objtree)/.tmp_version
        mv -f $(objtree)/.tmp_version $(objtree)/.version
 
-       rpmbuild --define "_builddir $(objtree)" --target \
+       rpmbuild $(RPMOPTS) --define "_builddir $(objtree)" --target \
                $(UTS_MACHINE) -bb $(objtree)/binkernel.spec
        rm binkernel.spec
 
index 6c3b038..86e56fe 100755 (executable)
@@ -322,7 +322,10 @@ fi
 
 # Build kernel header package
 (cd $srctree; find . -name Makefile\* -o -name Kconfig\* -o -name \*.pl) > "$objtree/debian/hdrsrcfiles"
-(cd $srctree; find arch/$SRCARCH/include include scripts -type f) >> "$objtree/debian/hdrsrcfiles"
+if grep -q '^CONFIG_STACK_VALIDATION=y' $KCONFIG_CONFIG ; then
+       (cd $srctree; find tools/objtool -type f -executable) >> "$objtree/debian/hdrsrcfiles"
+fi
+(cd $srctree; find arch/*/include include scripts -type f) >> "$objtree/debian/hdrsrcfiles"
 (cd $srctree; find arch/$SRCARCH -name module.lds -o -name Kbuild.platforms -o -name Platform) >> "$objtree/debian/hdrsrcfiles"
 (cd $srctree; find $(find arch/$SRCARCH -name include -o -name scripts -type d) -type f) >> "$objtree/debian/hdrsrcfiles"
 (cd $objtree; find arch/$SRCARCH/include Module.symvers include scripts -type f) >> "$objtree/debian/hdrobjfiles"
index b6de63c..57673ba 100755 (executable)
@@ -143,6 +143,11 @@ echo "if [ -x /sbin/new-kernel-pkg ]; then"
 echo "new-kernel-pkg --remove $KERNELRELEASE --rminitrd --initrdfile=/boot/initramfs-$KERNELRELEASE.img"
 echo "fi"
 echo ""
+echo "%postun"
+echo "if [ -x /sbin/update-bootloader ]; then"
+echo "/sbin/update-bootloader --remove $KERNELRELEASE"
+echo "fi"
+echo ""
 echo "%files"
 echo '%defattr (-, root, root)'
 echo "/lib/modules/$KERNELRELEASE"
index c8783b3..36c80bf 100644 (file)
@@ -134,7 +134,7 @@ COMPAT_SYSCALL_DEFINE5(keyctl, u32, option,
 
        case KEYCTL_DH_COMPUTE:
                return keyctl_dh_compute(compat_ptr(arg2), compat_ptr(arg3),
-                                        arg4);
+                                        arg4, compat_ptr(arg5));
 
        default:
                return -EOPNOTSUPP;
index 880505a..531ed2e 100644 (file)
@@ -78,7 +78,8 @@ error:
 }
 
 long keyctl_dh_compute(struct keyctl_dh_params __user *params,
-                      char __user *buffer, size_t buflen)
+                      char __user *buffer, size_t buflen,
+                      void __user *reserved)
 {
        long ret;
        MPI base, private, prime, result;
@@ -97,6 +98,11 @@ long keyctl_dh_compute(struct keyctl_dh_params __user *params,
                goto out;
        }
 
+       if (reserved) {
+               ret = -EINVAL;
+               goto out;
+       }
+
        keylen = mpi_from_key(pcopy.prime, buflen, &prime);
        if (keylen < 0 || !prime) {
                /* buflen == 0 may be used to query the required buffer size,
index 8ec7a52..a705a7d 100644 (file)
@@ -260,10 +260,11 @@ static inline long keyctl_get_persistent(uid_t uid, key_serial_t destring)
 
 #ifdef CONFIG_KEY_DH_OPERATIONS
 extern long keyctl_dh_compute(struct keyctl_dh_params __user *, char __user *,
-                             size_t);
+                             size_t, void __user *);
 #else
 static inline long keyctl_dh_compute(struct keyctl_dh_params __user *params,
-                                    char __user *buffer, size_t buflen)
+                                    char __user *buffer, size_t buflen,
+                                    void __user *reserved)
 {
        return -EOPNOTSUPP;
 }
index 3b135a0..d580ad0 100644 (file)
@@ -1688,8 +1688,8 @@ SYSCALL_DEFINE5(keyctl, int, option, unsigned long, arg2, unsigned long, arg3,
 
        case KEYCTL_DH_COMPUTE:
                return keyctl_dh_compute((struct keyctl_dh_params __user *) arg2,
-                                        (char __user *) arg3,
-                                        (size_t) arg4);
+                                        (char __user *) arg3, (size_t) arg4,
+                                        (void __user *) arg5);
 
        default:
                return -EOPNOTSUPP;
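
The keyctl hunks above thread a fifth, reserved argument through KEYCTL_DH_COMPUTE and reject any non-NULL value with -EINVAL. A hedged user-space sketch of a conforming call; the struct keyctl_dh_params layout and the syscall plumbing come from the UAPI headers, not from this diff:

#include <linux/keyctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
        struct keyctl_dh_params params = {
                .private = 0,  /* key serials for the private key, prime and base */
                .prime   = 0,
                .base    = 0,
        };
        char buf[256];
        long ret;

        /* fifth argument is reserved: anything but NULL now fails with EINVAL */
        ret = syscall(__NR_keyctl, KEYCTL_DH_COMPUTE,
                      &params, buf, sizeof(buf), NULL);
        if (ret < 0)
                perror("KEYCTL_DH_COMPUTE");
        return 0;
}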
index ff2b8c3..6777295 100644 (file)
@@ -3514,7 +3514,7 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode)
                         */
                        if (isp->smk_flags & SMK_INODE_CHANGED) {
                                isp->smk_flags &= ~SMK_INODE_CHANGED;
-                               rc = inode->i_op->setxattr(dp,
+                               rc = inode->i_op->setxattr(dp, inode,
                                        XATTR_NAME_SMACKTRANSMUTE,
                                        TRANS_TRUE, TRANS_TRUE_SIZE,
                                        0);
index 9b756b1..0309f21 100644 (file)
@@ -19,6 +19,9 @@
 #include <linux/ratelimit.h>
 #include <linux/workqueue.h>
 #include <linux/string_helpers.h>
+#include <linux/task_work.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
 
 #define YAMA_SCOPE_DISABLED    0
 #define YAMA_SCOPE_RELATIONAL  1
@@ -42,20 +45,71 @@ static DEFINE_SPINLOCK(ptracer_relations_lock);
 static void yama_relation_cleanup(struct work_struct *work);
 static DECLARE_WORK(yama_relation_work, yama_relation_cleanup);
 
-static void report_access(const char *access, struct task_struct *target,
-                         struct task_struct *agent)
+struct access_report_info {
+       struct callback_head work;
+       const char *access;
+       struct task_struct *target;
+       struct task_struct *agent;
+};
+
+static void __report_access(struct callback_head *work)
 {
+       struct access_report_info *info =
+               container_of(work, struct access_report_info, work);
        char *target_cmd, *agent_cmd;
 
-       target_cmd = kstrdup_quotable_cmdline(target, GFP_ATOMIC);
-       agent_cmd = kstrdup_quotable_cmdline(agent, GFP_ATOMIC);
+       target_cmd = kstrdup_quotable_cmdline(info->target, GFP_KERNEL);
+       agent_cmd = kstrdup_quotable_cmdline(info->agent, GFP_KERNEL);
 
        pr_notice_ratelimited(
                "ptrace %s of \"%s\"[%d] was attempted by \"%s\"[%d]\n",
-               access, target_cmd, target->pid, agent_cmd, agent->pid);
+               info->access, target_cmd, info->target->pid, agent_cmd,
+               info->agent->pid);
 
        kfree(agent_cmd);
        kfree(target_cmd);
+
+       put_task_struct(info->agent);
+       put_task_struct(info->target);
+       kfree(info);
+}
+
+/* defers execution because cmdline access can sleep */
+static void report_access(const char *access, struct task_struct *target,
+                               struct task_struct *agent)
+{
+       struct access_report_info *info;
+       char agent_comm[sizeof(agent->comm)];
+
+       assert_spin_locked(&target->alloc_lock); /* for target->comm */
+
+       if (current->flags & PF_KTHREAD) {
+               /* I don't think kthreads call task_work_run() before exiting.
+                * Imagine angry ranting about procfs here.
+                */
+               pr_notice_ratelimited(
+                   "ptrace %s of \"%s\"[%d] was attempted by \"%s\"[%d]\n",
+                   access, target->comm, target->pid,
+                   get_task_comm(agent_comm, agent), agent->pid);
+               return;
+       }
+
+       info = kmalloc(sizeof(*info), GFP_ATOMIC);
+       if (!info)
+               return;
+       init_task_work(&info->work, __report_access);
+       get_task_struct(target);
+       get_task_struct(agent);
+       info->access = access;
+       info->target = target;
+       info->agent = agent;
+       if (task_work_add(current, &info->work, true) == 0)
+               return; /* success */
+
+       WARN(1, "report_access called from exiting task");
+       put_task_struct(target);
+       put_task_struct(agent);
+       kfree(info);
 }
 
 /**
@@ -351,8 +405,11 @@ int yama_ptrace_traceme(struct task_struct *parent)
                break;
        }
 
-       if (rc)
+       if (rc) {
+               task_lock(current);
                report_access("traceme", current, parent);
+               task_unlock(current);
+       }
 
        return rc;
 }
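The Yama rework above moves the ratelimited ptrace-access report out of atomic context: kstrdup_quotable_cmdline() reads the target's command line and may sleep, so report_access() now only snapshots the tasks and queues a callback with task_work_add(), and the string building runs later in __report_access() with GFP_KERNEL; kernel threads, which never run their queued task work, keep a direct comm-based fallback. A condensed sketch of that deferral pattern (struct and function names simplified from the hunk):

	#include <linux/sched.h>
	#include <linux/slab.h>
	#include <linux/task_work.h>

	struct deferred_report {
		struct callback_head work;	/* container_of() recovers us */
		struct task_struct *target;
	};

	static void do_report(struct callback_head *work)
	{
		struct deferred_report *r =
			container_of(work, struct deferred_report, work);

		/* Runs when current returns to userspace: sleeping is fine here. */
		pr_notice("deferred report for \"%s\"[%d]\n",
			  r->target->comm, r->target->pid);
		put_task_struct(r->target);
		kfree(r);
	}

	static void queue_report(struct task_struct *target)
	{
		struct deferred_report *r = kmalloc(sizeof(*r), GFP_ATOMIC);

		if (!r)
			return;
		init_task_work(&r->work, do_report);
		get_task_struct(target);
		r->target = target;
		if (task_work_add(current, &r->work, true)) {
			/* current is exiting and will never run the work */
			put_task_struct(target);
			kfree(r);
		}
	}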
index 002f153..d53c25e 100644 (file)
@@ -335,6 +335,7 @@ static void alc_fill_eapd_coef(struct hda_codec *codec)
        case 0x10ec0283:
        case 0x10ec0286:
        case 0x10ec0288:
+       case 0x10ec0295:
        case 0x10ec0298:
                alc_update_coef_idx(codec, 0x10, 1<<9, 0);
                break;
@@ -907,6 +908,7 @@ static struct alc_codec_rename_pci_table rename_pci_tbl[] = {
        { 0x10ec0298, 0x1028, 0, "ALC3266" },
        { 0x10ec0256, 0x1028, 0, "ALC3246" },
        { 0x10ec0225, 0x1028, 0, "ALC3253" },
+       { 0x10ec0295, 0x1028, 0, "ALC3254" },
        { 0x10ec0670, 0x1025, 0, "ALC669X" },
        { 0x10ec0676, 0x1025, 0, "ALC679X" },
        { 0x10ec0282, 0x1043, 0, "ALC3229" },
@@ -3697,6 +3699,7 @@ static void alc_headset_mode_unplugged(struct hda_codec *codec)
                alc_process_coef_fw(codec, coef0668);
                break;
        case 0x10ec0225:
+       case 0x10ec0295:
                alc_process_coef_fw(codec, coef0225);
                break;
        }
@@ -3797,6 +3800,7 @@ static void alc_headset_mode_mic_in(struct hda_codec *codec, hda_nid_t hp_pin,
                snd_hda_set_pin_ctl_cache(codec, mic_pin, PIN_VREF50);
                break;
        case 0x10ec0225:
+       case 0x10ec0295:
                alc_update_coef_idx(codec, 0x45, 0x3f<<10, 0x31<<10);
                snd_hda_set_pin_ctl_cache(codec, hp_pin, 0);
                alc_process_coef_fw(codec, coef0225);
@@ -3854,6 +3858,7 @@ static void alc_headset_mode_default(struct hda_codec *codec)
 
        switch (codec->core.vendor_id) {
        case 0x10ec0225:
+       case 0x10ec0295:
                alc_process_coef_fw(codec, coef0225);
                break;
        case 0x10ec0255:
@@ -3957,6 +3962,7 @@ static void alc_headset_mode_ctia(struct hda_codec *codec)
                alc_process_coef_fw(codec, coef0688);
                break;
        case 0x10ec0225:
+       case 0x10ec0295:
                alc_process_coef_fw(codec, coef0225);
                break;
        }
@@ -4038,6 +4044,7 @@ static void alc_headset_mode_omtp(struct hda_codec *codec)
                alc_process_coef_fw(codec, coef0688);
                break;
        case 0x10ec0225:
+       case 0x10ec0295:
                alc_process_coef_fw(codec, coef0225);
                break;
        }
@@ -4121,6 +4128,7 @@ static void alc_determine_headset_type(struct hda_codec *codec)
                is_ctia = (val & 0x1c02) == 0x1c02;
                break;
        case 0x10ec0225:
+       case 0x10ec0295:
                alc_process_coef_fw(codec, coef0225);
                msleep(800);
                val = alc_read_coef_idx(codec, 0x46);
@@ -5466,8 +5474,9 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
        SND_PCI_QUIRK(0x1028, 0x06de, "Dell", ALC293_FIXUP_DISABLE_AAMIX_MULTIJACK),
        SND_PCI_QUIRK(0x1028, 0x06df, "Dell", ALC293_FIXUP_DISABLE_AAMIX_MULTIJACK),
        SND_PCI_QUIRK(0x1028, 0x06e0, "Dell", ALC293_FIXUP_DISABLE_AAMIX_MULTIJACK),
-       SND_PCI_QUIRK(0x1028, 0x0704, "Dell XPS 13", ALC256_FIXUP_DELL_XPS_13_HEADPHONE_NOISE),
+       SND_PCI_QUIRK(0x1028, 0x0704, "Dell XPS 13 9350", ALC256_FIXUP_DELL_XPS_13_HEADPHONE_NOISE),
        SND_PCI_QUIRK(0x1028, 0x0725, "Dell Inspiron 3162", ALC255_FIXUP_DELL_SPK_NOISE),
+       SND_PCI_QUIRK(0x1028, 0x075b, "Dell XPS 13 9360", ALC256_FIXUP_DELL_XPS_13_HEADPHONE_NOISE),
        SND_PCI_QUIRK(0x1028, 0x164a, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE),
        SND_PCI_QUIRK(0x1028, 0x164b, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE),
        SND_PCI_QUIRK(0x103c, 0x1586, "HP", ALC269_FIXUP_HP_MUTE_LED_MIC2),
@@ -5710,6 +5719,9 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = {
        SND_HDA_PIN_QUIRK(0x10ec0255, 0x1028, "Dell", ALC255_FIXUP_DELL2_MIC_NO_PRESENCE,
                {0x14, 0x90170110},
                {0x21, 0x02211020}),
+       SND_HDA_PIN_QUIRK(0x10ec0255, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE,
+               {0x14, 0x90170130},
+               {0x21, 0x02211040}),
        SND_HDA_PIN_QUIRK(0x10ec0255, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE,
                {0x12, 0x90a60140},
                {0x14, 0x90170110},
@@ -6033,6 +6045,7 @@ static int patch_alc269(struct hda_codec *codec)
                alc_update_coef_idx(codec, 0x36, 1 << 13, 1 << 5); /* Switch pcbeep path to Line in path*/
                break;
        case 0x10ec0225:
+       case 0x10ec0295:
                spec->codec_variant = ALC269_TYPE_ALC225;
                break;
        case 0x10ec0234:
@@ -6979,6 +6992,7 @@ static const struct hda_device_id snd_hda_id_realtek[] = {
        HDA_CODEC_ENTRY(0x10ec0292, "ALC292", patch_alc269),
        HDA_CODEC_ENTRY(0x10ec0293, "ALC293", patch_alc269),
        HDA_CODEC_ENTRY(0x10ec0294, "ALC294", patch_alc269),
+       HDA_CODEC_ENTRY(0x10ec0295, "ALC295", patch_alc269),
        HDA_CODEC_ENTRY(0x10ec0298, "ALC298", patch_alc269),
        HDA_CODEC_REV_ENTRY(0x10ec0861, 0x100340, "ALC660", patch_alc861),
        HDA_CODEC_ENTRY(0x10ec0660, "ALC660-VD", patch_alc861vd),
index b3afae9..4d82a58 100644 (file)
@@ -43,6 +43,7 @@ config SND_SOC_ALL_CODECS
        select SND_SOC_AK5386
        select SND_SOC_ALC5623 if I2C
        select SND_SOC_ALC5632 if I2C
+       select SND_SOC_BT_SCO
        select SND_SOC_CQ0093VC if MFD_DAVINCI_VOICECODEC
        select SND_SOC_CS35L32 if I2C
        select SND_SOC_CS42L51_I2C if I2C
@@ -64,7 +65,6 @@ config SND_SOC_ALL_CODECS
        select SND_SOC_DA732X if I2C
        select SND_SOC_DA9055 if I2C
        select SND_SOC_DMIC
-       select SND_SOC_BT_SCO
        select SND_SOC_ES8328_SPI if SPI_MASTER
        select SND_SOC_ES8328_I2C if I2C
        select SND_SOC_GTM601
@@ -79,6 +79,7 @@ config SND_SOC_ALL_CODECS
        select SND_SOC_MAX98090 if I2C
        select SND_SOC_MAX98095 if I2C
        select SND_SOC_MAX98357A if GPIOLIB
+       select SND_SOC_MAX98371 if I2C
        select SND_SOC_MAX9867 if I2C
        select SND_SOC_MAX98925 if I2C
        select SND_SOC_MAX98926 if I2C
@@ -126,12 +127,14 @@ config SND_SOC_ALL_CODECS
        select SND_SOC_TAS2552 if I2C
        select SND_SOC_TAS5086 if I2C
        select SND_SOC_TAS571X if I2C
+       select SND_SOC_TAS5720 if I2C
        select SND_SOC_TFA9879 if I2C
        select SND_SOC_TLV320AIC23_I2C if I2C
        select SND_SOC_TLV320AIC23_SPI if SPI_MASTER
        select SND_SOC_TLV320AIC26 if SPI_MASTER
        select SND_SOC_TLV320AIC31XX if I2C
-       select SND_SOC_TLV320AIC32X4 if I2C
+       select SND_SOC_TLV320AIC32X4_I2C if I2C
+       select SND_SOC_TLV320AIC32X4_SPI if SPI_MASTER
        select SND_SOC_TLV320AIC3X if I2C
        select SND_SOC_TPA6130A2 if I2C
        select SND_SOC_TLV320DAC33 if I2C
@@ -367,6 +370,9 @@ config SND_SOC_ALC5623
 config SND_SOC_ALC5632
        tristate
 
+config SND_SOC_BT_SCO
+       tristate
+
 config SND_SOC_CQ0093VC
        tristate
 
@@ -473,9 +479,6 @@ config SND_SOC_DA732X
 config SND_SOC_DA9055
        tristate
 
-config SND_SOC_BT_SCO
-       tristate
-
 config SND_SOC_DMIC
        tristate
 
@@ -529,6 +532,9 @@ config SND_SOC_MAX98095
 config SND_SOC_MAX98357A
        tristate
 
+config SND_SOC_MAX98371
+       tristate
+
 config SND_SOC_MAX9867
        tristate
 
@@ -748,9 +754,16 @@ config SND_SOC_TAS5086
        depends on I2C
 
 config SND_SOC_TAS571X
-       tristate "Texas Instruments TAS5711/TAS5717/TAS5719 power amplifiers"
+       tristate "Texas Instruments TAS5711/TAS5717/TAS5719/TAS5721 power amplifiers"
        depends on I2C
 
+config SND_SOC_TAS5720
+       tristate "Texas Instruments TAS5720 Mono Audio amplifier"
+       depends on I2C
+       help
+         Enable support for Texas Instruments TAS5720L/M high-efficiency mono
+         Class-D audio power amplifiers.
+
 config SND_SOC_TFA9879
        tristate "NXP Semiconductors TFA9879 amplifier"
        depends on I2C
@@ -780,6 +793,16 @@ config SND_SOC_TLV320AIC31XX
 config SND_SOC_TLV320AIC32X4
        tristate
 
+config SND_SOC_TLV320AIC32X4_I2C
+       tristate
+       depends on I2C
+       select SND_SOC_TLV320AIC32X4
+
+config SND_SOC_TLV320AIC32X4_SPI
+       tristate
+       depends on SPI_MASTER
+       select SND_SOC_TLV320AIC32X4
+
 config SND_SOC_TLV320AIC3X
        tristate "Texas Instruments TLV320AIC3x CODECs"
        depends on I2C
@@ -920,7 +943,8 @@ config SND_SOC_WM8955
        tristate
 
 config SND_SOC_WM8960
-       tristate
+       tristate "Wolfson Microelectronics WM8960 CODEC"
+       depends on I2C
 
 config SND_SOC_WM8961
        tristate
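The TLV320AIC32X4 entry is split into hidden SND_SOC_TLV320AIC32X4_I2C/_SPI glue symbols that each select the bus-independent core, mirroring the tlv320aic32x4-i2c.o and tlv320aic32x4-spi.o objects added to the Makefile below. A generic sketch of that bus-glue pattern, with hypothetical core entry points (the real shared functions live in the codec core and are not shown in this diff):

	/* Hypothetical I2C glue for a bus-agnostic codec core: the glue only
	 * builds the regmap for its bus and defers everything else to shared
	 * core probe/remove helpers (all "example_*" names are invented).
	 */
	#include <linux/err.h>
	#include <linux/i2c.h>
	#include <linux/module.h>
	#include <linux/regmap.h>

	extern const struct regmap_config example_core_regmap_cfg;
	extern int example_core_probe(struct device *dev, struct regmap *regmap);
	extern int example_core_remove(struct device *dev);

	static int example_i2c_probe(struct i2c_client *i2c,
				     const struct i2c_device_id *id)
	{
		struct regmap *regmap =
			devm_regmap_init_i2c(i2c, &example_core_regmap_cfg);

		if (IS_ERR(regmap))
			return PTR_ERR(regmap);
		return example_core_probe(&i2c->dev, regmap);
	}

	static int example_i2c_remove(struct i2c_client *i2c)
	{
		return example_core_remove(&i2c->dev);
	}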
index b7b9941..0f548fd 100644 (file)
@@ -32,6 +32,7 @@ snd-soc-ak4642-objs := ak4642.o
 snd-soc-ak4671-objs := ak4671.o
 snd-soc-ak5386-objs := ak5386.o
 snd-soc-arizona-objs := arizona.o
+snd-soc-bt-sco-objs := bt-sco.o
 snd-soc-cq93vc-objs := cq93vc.o
 snd-soc-cs35l32-objs := cs35l32.o
 snd-soc-cs42l51-objs := cs42l51.o
@@ -55,7 +56,6 @@ snd-soc-da7218-objs := da7218.o
 snd-soc-da7219-objs := da7219.o da7219-aad.o
 snd-soc-da732x-objs := da732x.o
 snd-soc-da9055-objs := da9055.o
-snd-soc-bt-sco-objs := bt-sco.o
 snd-soc-dmic-objs := dmic.o
 snd-soc-es8328-objs := es8328.o
 snd-soc-es8328-i2c-objs := es8328-i2c.o
@@ -74,6 +74,7 @@ snd-soc-max98088-objs := max98088.o
 snd-soc-max98090-objs := max98090.o
 snd-soc-max98095-objs := max98095.o
 snd-soc-max98357a-objs := max98357a.o
+snd-soc-max98371-objs := max98371.o
 snd-soc-max9867-objs := max9867.o
 snd-soc-max98925-objs := max98925.o
 snd-soc-max98926-objs := max98926.o
@@ -131,6 +132,7 @@ snd-soc-stac9766-objs := stac9766.o
 snd-soc-sti-sas-objs := sti-sas.o
 snd-soc-tas5086-objs := tas5086.o
 snd-soc-tas571x-objs := tas571x.o
+snd-soc-tas5720-objs := tas5720.o
 snd-soc-tfa9879-objs := tfa9879.o
 snd-soc-tlv320aic23-objs := tlv320aic23.o
 snd-soc-tlv320aic23-i2c-objs := tlv320aic23-i2c.o
@@ -138,6 +140,8 @@ snd-soc-tlv320aic23-spi-objs := tlv320aic23-spi.o
 snd-soc-tlv320aic26-objs := tlv320aic26.o
 snd-soc-tlv320aic31xx-objs := tlv320aic31xx.o
 snd-soc-tlv320aic32x4-objs := tlv320aic32x4.o
+snd-soc-tlv320aic32x4-i2c-objs := tlv320aic32x4-i2c.o
+snd-soc-tlv320aic32x4-spi-objs := tlv320aic32x4-spi.o
 snd-soc-tlv320aic3x-objs := tlv320aic3x.o
 snd-soc-tlv320dac33-objs := tlv320dac33.o
 snd-soc-ts3a227e-objs := ts3a227e.o
@@ -243,6 +247,7 @@ obj-$(CONFIG_SND_SOC_AK5386)        += snd-soc-ak5386.o
 obj-$(CONFIG_SND_SOC_ALC5623)    += snd-soc-alc5623.o
 obj-$(CONFIG_SND_SOC_ALC5632)  += snd-soc-alc5632.o
 obj-$(CONFIG_SND_SOC_ARIZONA)  += snd-soc-arizona.o
+obj-$(CONFIG_SND_SOC_BT_SCO)   += snd-soc-bt-sco.o
 obj-$(CONFIG_SND_SOC_CQ0093VC) += snd-soc-cq93vc.o
 obj-$(CONFIG_SND_SOC_CS35L32)  += snd-soc-cs35l32.o
 obj-$(CONFIG_SND_SOC_CS42L51)  += snd-soc-cs42l51.o
@@ -266,7 +271,6 @@ obj-$(CONFIG_SND_SOC_DA7218)        += snd-soc-da7218.o
 obj-$(CONFIG_SND_SOC_DA7219)   += snd-soc-da7219.o
 obj-$(CONFIG_SND_SOC_DA732X)   += snd-soc-da732x.o
 obj-$(CONFIG_SND_SOC_DA9055)   += snd-soc-da9055.o
-obj-$(CONFIG_SND_SOC_BT_SCO)   += snd-soc-bt-sco.o
 obj-$(CONFIG_SND_SOC_DMIC)     += snd-soc-dmic.o
 obj-$(CONFIG_SND_SOC_ES8328)   += snd-soc-es8328.o
 obj-$(CONFIG_SND_SOC_ES8328_I2C)+= snd-soc-es8328-i2c.o
@@ -339,6 +343,7 @@ obj-$(CONFIG_SND_SOC_STI_SAS)       += snd-soc-sti-sas.o
 obj-$(CONFIG_SND_SOC_TAS2552)  += snd-soc-tas2552.o
 obj-$(CONFIG_SND_SOC_TAS5086)  += snd-soc-tas5086.o
 obj-$(CONFIG_SND_SOC_TAS571X)  += snd-soc-tas571x.o
+obj-$(CONFIG_SND_SOC_TAS5720)  += snd-soc-tas5720.o
 obj-$(CONFIG_SND_SOC_TFA9879)  += snd-soc-tfa9879.o
 obj-$(CONFIG_SND_SOC_TLV320AIC23)      += snd-soc-tlv320aic23.o
 obj-$(CONFIG_SND_SOC_TLV320AIC23_I2C)  += snd-soc-tlv320aic23-i2c.o
@@ -346,6 +351,8 @@ obj-$(CONFIG_SND_SOC_TLV320AIC23_SPI)       += snd-soc-tlv320aic23-spi.o
 obj-$(CONFIG_SND_SOC_TLV320AIC26)      += snd-soc-tlv320aic26.o
 obj-$(CONFIG_SND_SOC_TLV320AIC31XX)     += snd-soc-tlv320aic31xx.o
 obj-$(CONFIG_SND_SOC_TLV320AIC32X4)     += snd-soc-tlv320aic32x4.o
+obj-$(CONFIG_SND_SOC_TLV320AIC32X4_I2C)        += snd-soc-tlv320aic32x4-i2c.o
+obj-$(CONFIG_SND_SOC_TLV320AIC32X4_SPI)        += snd-soc-tlv320aic32x4-spi.o
 obj-$(CONFIG_SND_SOC_TLV320AIC3X)      += snd-soc-tlv320aic3x.o
 obj-$(CONFIG_SND_SOC_TLV320DAC33)      += snd-soc-tlv320dac33.o
 obj-$(CONFIG_SND_SOC_TS3A227E) += snd-soc-ts3a227e.o
index 1ee8506..4d8b9e4 100644 (file)
@@ -560,6 +560,7 @@ static const struct regmap_config ak4642_regmap = {
        .max_register           = FIL1_3,
        .reg_defaults           = ak4642_reg,
        .num_reg_defaults       = NUM_AK4642_REG_DEFAULTS,
+       .cache_type             = REGCACHE_RBTREE,
 };
 
 static const struct regmap_config ak4643_regmap = {
@@ -568,6 +569,7 @@ static const struct regmap_config ak4643_regmap = {
        .max_register           = SPK_MS,
        .reg_defaults           = ak4643_reg,
        .num_reg_defaults       = ARRAY_SIZE(ak4643_reg),
+       .cache_type             = REGCACHE_RBTREE,
 };
 
 static const struct regmap_config ak4648_regmap = {
@@ -576,6 +578,7 @@ static const struct regmap_config ak4648_regmap = {
        .max_register           = EQ_FBEQE,
        .reg_defaults           = ak4648_reg,
        .num_reg_defaults       = ARRAY_SIZE(ak4648_reg),
+       .cache_type             = REGCACHE_RBTREE,
 };
 
 static const struct ak4642_drvdata ak4642_drvdata = {
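The ak4642/ak4643/ak4648 regmap configs gain .cache_type = REGCACHE_RBTREE, so reads of non-volatile registers can be answered from an in-memory cache and the register state can be replayed into the chip after it loses power. A generic sketch of how a driver typically uses that cache across a power cycle (the resume helper is illustrative, not part of this hunk):

	#include <linux/device.h>
	#include <linux/regmap.h>

	/* Illustrative resume path for a codec with a cached regmap. */
	static int example_codec_resume(struct device *dev)
	{
		struct regmap *map = dev_get_regmap(dev, NULL);
		int ret;

		/* Allow bus access again, then write back every cached register
		 * that differs from its hardware default.
		 */
		regcache_cache_only(map, false);
		ret = regcache_sync(map);
		if (ret)
			dev_err(dev, "regcache_sync failed: %d\n", ret);
		return ret;
	}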
diff --git a/sound/soc/codecs/max98371.c b/sound/soc/codecs/max98371.c
new file mode 100644 (file)
index 0000000..cf0a39b
--- /dev/null
@@ -0,0 +1,441 @@
+/*
+ * max98371.c -- ALSA SoC Stereo MAX98371 driver
+ *
+ * Copyright 2015-16 Maxim Integrated Products
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/i2c.h>
+#include <linux/module.h>
+#include <linux/regmap.h>
+#include <linux/slab.h>
+#include <sound/pcm.h>
+#include <sound/pcm_params.h>
+#include <sound/soc.h>
+#include <sound/tlv.h>
+#include "max98371.h"
+
+static const char *const monomix_text[] = {
+       "Left", "Right", "LeftRightDiv2",
+};
+
+static const char *const hpf_cutoff_txt[] = {
+       "Disable", "DC Block", "50Hz",
+       "100Hz", "200Hz", "400Hz", "800Hz",
+};
+
+static SOC_ENUM_SINGLE_DECL(max98371_monomix, MAX98371_MONOMIX_CFG, 0,
+               monomix_text);
+
+static SOC_ENUM_SINGLE_DECL(max98371_hpf_cutoff, MAX98371_HPF, 0,
+               hpf_cutoff_txt);
+
+static const DECLARE_TLV_DB_RANGE(max98371_dht_min_gain,
+       0, 1, TLV_DB_SCALE_ITEM(537, 66, 0),
+       2, 3, TLV_DB_SCALE_ITEM(677, 82, 0),
+       4, 5, TLV_DB_SCALE_ITEM(852, 104, 0),
+       6, 7, TLV_DB_SCALE_ITEM(1072, 131, 0),
+       8, 9, TLV_DB_SCALE_ITEM(1350, 165, 0),
+       10, 11, TLV_DB_SCALE_ITEM(1699, 101, 0),
+);
+
+static const DECLARE_TLV_DB_RANGE(max98371_dht_max_gain,
+       0, 1, TLV_DB_SCALE_ITEM(537, 66, 0),
+       2, 3, TLV_DB_SCALE_ITEM(677, 82, 0),
+       4, 5, TLV_DB_SCALE_ITEM(852, 104, 0),
+       6, 7, TLV_DB_SCALE_ITEM(1072, 131, 0),
+       8, 9, TLV_DB_SCALE_ITEM(1350, 165, 0),
+       10, 11, TLV_DB_SCALE_ITEM(1699, 208, 0),
+);
+
+static const DECLARE_TLV_DB_RANGE(max98371_dht_rot_gain,
+       0, 1, TLV_DB_SCALE_ITEM(-50, -50, 0),
+       2, 6, TLV_DB_SCALE_ITEM(-100, -100, 0),
+       7, 8, TLV_DB_SCALE_ITEM(-800, -200, 0),
+       9, 11, TLV_DB_SCALE_ITEM(-1200, -300, 0),
+       12, 13, TLV_DB_SCALE_ITEM(-2000, -200, 0),
+       14, 15, TLV_DB_SCALE_ITEM(-2500, -500, 0),
+);
+
+static const struct reg_default max98371_reg[] = {
+       { 0x01, 0x00 },
+       { 0x02, 0x00 },
+       { 0x03, 0x00 },
+       { 0x04, 0x00 },
+       { 0x05, 0x00 },
+       { 0x06, 0x00 },
+       { 0x07, 0x00 },
+       { 0x08, 0x00 },
+       { 0x09, 0x00 },
+       { 0x0A, 0x00 },
+       { 0x10, 0x06 },
+       { 0x11, 0x08 },
+       { 0x14, 0x80 },
+       { 0x15, 0x00 },
+       { 0x16, 0x00 },
+       { 0x18, 0x00 },
+       { 0x19, 0x00 },
+       { 0x1C, 0x00 },
+       { 0x1D, 0x00 },
+       { 0x1E, 0x00 },
+       { 0x1F, 0x00 },
+       { 0x20, 0x00 },
+       { 0x21, 0x00 },
+       { 0x22, 0x00 },
+       { 0x23, 0x00 },
+       { 0x24, 0x00 },
+       { 0x25, 0x00 },
+       { 0x26, 0x00 },
+       { 0x27, 0x00 },
+       { 0x28, 0x00 },
+       { 0x29, 0x00 },
+       { 0x2A, 0x00 },
+       { 0x2B, 0x00 },
+       { 0x2C, 0x00 },
+       { 0x2D, 0x00 },
+       { 0x2E, 0x0B },
+       { 0x31, 0x00 },
+       { 0x32, 0x18 },
+       { 0x33, 0x00 },
+       { 0x34, 0x00 },
+       { 0x36, 0x00 },
+       { 0x37, 0x00 },
+       { 0x38, 0x00 },
+       { 0x39, 0x00 },
+       { 0x3A, 0x00 },
+       { 0x3B, 0x00 },
+       { 0x3C, 0x00 },
+       { 0x3D, 0x00 },
+       { 0x3E, 0x00 },
+       { 0x3F, 0x00 },
+       { 0x40, 0x00 },
+       { 0x41, 0x00 },
+       { 0x42, 0x00 },
+       { 0x43, 0x00 },
+       { 0x4A, 0x00 },
+       { 0x4B, 0x00 },
+       { 0x4C, 0x00 },
+       { 0x4D, 0x00 },
+       { 0x4E, 0x00 },
+       { 0x50, 0x00 },
+       { 0x51, 0x00 },
+       { 0x55, 0x00 },
+       { 0x58, 0x00 },
+       { 0x59, 0x00 },
+       { 0x5C, 0x00 },
+       { 0xFF, 0x43 },
+};
+
+static bool max98371_volatile_register(struct device *dev, unsigned int reg)
+{
+       switch (reg) {
+       case MAX98371_IRQ_CLEAR1:
+       case MAX98371_IRQ_CLEAR2:
+       case MAX98371_IRQ_CLEAR3:
+       case MAX98371_VERSION:
+               return true;
+       default:
+               return false;
+       }
+}
+
+static bool max98371_readable_register(struct device *dev, unsigned int reg)
+{
+       switch (reg) {
+       case MAX98371_SOFT_RESET:
+               return false;
+       default:
+               return true;
+       }
+};
+
+static const DECLARE_TLV_DB_RANGE(max98371_gain_tlv,
+       0, 7, TLV_DB_SCALE_ITEM(0, 50, 0),
+       8, 10, TLV_DB_SCALE_ITEM(400, 100, 0)
+);
+
+static const DECLARE_TLV_DB_RANGE(max98371_noload_gain_tlv,
+       0, 11, TLV_DB_SCALE_ITEM(950, 100, 0),
+);
+
+static const DECLARE_TLV_DB_SCALE(digital_tlv, -6300, 50, 1);
+
+static const struct snd_kcontrol_new max98371_snd_controls[] = {
+       SOC_SINGLE_TLV("Speaker Volume", MAX98371_GAIN,
+                       MAX98371_GAIN_SHIFT, (1<<MAX98371_GAIN_WIDTH)-1, 0,
+                       max98371_gain_tlv),
+       SOC_SINGLE_TLV("Digital Volume", MAX98371_DIGITAL_GAIN, 0,
+                       (1<<MAX98371_DIGITAL_GAIN_WIDTH)-1, 1, digital_tlv),
+       SOC_SINGLE_TLV("Speaker DHT Max Volume", MAX98371_GAIN,
+                       0, (1<<MAX98371_DHT_MAX_WIDTH)-1, 0,
+                       max98371_dht_max_gain),
+       SOC_SINGLE_TLV("Speaker DHT Min Volume", MAX98371_DHT_GAIN,
+                       0, (1<<MAX98371_DHT_GAIN_WIDTH)-1, 0,
+                       max98371_dht_min_gain),
+       SOC_SINGLE_TLV("Speaker DHT Rotation Volume", MAX98371_DHT_GAIN,
+                       0, (1<<MAX98371_DHT_ROT_WIDTH)-1, 0,
+                       max98371_dht_rot_gain),
+       SOC_SINGLE("DHT Attack Step", MAX98371_DHT, MAX98371_DHT_STEP, 3, 0),
+       SOC_SINGLE("DHT Attack Rate", MAX98371_DHT, 0, 7, 0),
+       SOC_ENUM("Monomix Select", max98371_monomix),
+       SOC_ENUM("HPF Cutoff", max98371_hpf_cutoff),
+};
+
+static int max98371_dai_set_fmt(struct snd_soc_dai *codec_dai,
+               unsigned int fmt)
+{
+       struct snd_soc_codec *codec = codec_dai->codec;
+       struct max98371_priv *max98371 = snd_soc_codec_get_drvdata(codec);
+       unsigned int val = 0;
+
+       switch (fmt & SND_SOC_DAIFMT_MASTER_MASK) {
+       case SND_SOC_DAIFMT_CBS_CFS:
+               break;
+       default:
+               dev_err(codec->dev, "DAI clock mode unsupported");
+               return -EINVAL;
+       }
+
+       switch (fmt & SND_SOC_DAIFMT_FORMAT_MASK) {
+       case SND_SOC_DAIFMT_I2S:
+               val |= 0;
+               break;
+       case SND_SOC_DAIFMT_RIGHT_J:
+               val |= MAX98371_DAI_RIGHT;
+               break;
+       case SND_SOC_DAIFMT_LEFT_J:
+               val |= MAX98371_DAI_LEFT;
+               break;
+       default:
+               dev_err(codec->dev, "DAI wrong mode unsupported");
+               return -EINVAL;
+       }
+       regmap_update_bits(max98371->regmap, MAX98371_FMT,
+                       MAX98371_FMT_MODE_MASK, val);
+       return 0;
+}
+
+static int max98371_dai_hw_params(struct snd_pcm_substream *substream,
+               struct snd_pcm_hw_params *params,
+               struct snd_soc_dai *dai)
+{
+       struct snd_soc_codec *codec = dai->codec;
+       struct max98371_priv *max98371 = snd_soc_codec_get_drvdata(codec);
+       int blr_clk_ratio, ch_size, channels = params_channels(params);
+       int rate = params_rate(params);
+
+       switch (params_format(params)) {
+       case SNDRV_PCM_FORMAT_S8:
+               regmap_update_bits(max98371->regmap, MAX98371_FMT,
+                               MAX98371_FMT_MASK, MAX98371_DAI_CHANSZ_16);
+               ch_size = 8;
+               break;
+       case SNDRV_PCM_FORMAT_S16_LE:
+               regmap_update_bits(max98371->regmap, MAX98371_FMT,
+                               MAX98371_FMT_MASK, MAX98371_DAI_CHANSZ_16);
+               ch_size = 16;
+               break;
+       case SNDRV_PCM_FORMAT_S24_LE:
+               regmap_update_bits(max98371->regmap, MAX98371_FMT,
+                               MAX98371_FMT_MASK, MAX98371_DAI_CHANSZ_32);
+               ch_size = 24;
+               break;
+       case SNDRV_PCM_FORMAT_S32_LE:
+               regmap_update_bits(max98371->regmap, MAX98371_FMT,
+                               MAX98371_FMT_MASK, MAX98371_DAI_CHANSZ_32);
+               ch_size = 32;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       /* BCLK/LRCLK ratio calculation */
+       blr_clk_ratio = channels * ch_size;
+       switch (blr_clk_ratio) {
+       case 32:
+               regmap_update_bits(max98371->regmap,
+                       MAX98371_DAI_CLK,
+                       MAX98371_DAI_BSEL_MASK, MAX98371_DAI_BSEL_32);
+               break;
+       case 48:
+               regmap_update_bits(max98371->regmap,
+                       MAX98371_DAI_CLK,
+                       MAX98371_DAI_BSEL_MASK, MAX98371_DAI_BSEL_48);
+               break;
+       case 64:
+               regmap_update_bits(max98371->regmap,
+                       MAX98371_DAI_CLK,
+                       MAX98371_DAI_BSEL_MASK, MAX98371_DAI_BSEL_64);
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       switch (rate) {
+       case 32000:
+               regmap_update_bits(max98371->regmap,
+                       MAX98371_SPK_SR,
+                       MAX98371_SPK_SR_MASK, MAX98371_SPK_SR_32);
+               break;
+       case 44100:
+               regmap_update_bits(max98371->regmap,
+                       MAX98371_SPK_SR,
+                       MAX98371_SPK_SR_MASK, MAX98371_SPK_SR_44);
+               break;
+       case 48000:
+               regmap_update_bits(max98371->regmap,
+                       MAX98371_SPK_SR,
+                       MAX98371_SPK_SR_MASK, MAX98371_SPK_SR_48);
+               break;
+       case 88200:
+               regmap_update_bits(max98371->regmap,
+                       MAX98371_SPK_SR,
+                       MAX98371_SPK_SR_MASK, MAX98371_SPK_SR_88);
+               break;
+       case 96000:
+               regmap_update_bits(max98371->regmap,
+                       MAX98371_SPK_SR,
+                       MAX98371_SPK_SR_MASK, MAX98371_SPK_SR_96);
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       /* enable both RX channels */
+       regmap_update_bits(max98371->regmap, MAX98371_MONOMIX_SRC,
+                       MAX98371_MONOMIX_SRC_MASK, MONOMIX_RX_0_1);
+       regmap_update_bits(max98371->regmap, MAX98371_DAI_CHANNEL,
+                       MAX98371_CHANNEL_MASK, MAX98371_CHANNEL_MASK);
+       return 0;
+}
+
+static const struct snd_soc_dapm_widget max98371_dapm_widgets[] = {
+       SND_SOC_DAPM_DAC("DAC", NULL, MAX98371_SPK_ENABLE, 0, 0),
+       SND_SOC_DAPM_SUPPLY("Global Enable", MAX98371_GLOBAL_ENABLE,
+               0, 0, NULL, 0),
+       SND_SOC_DAPM_OUTPUT("SPK_OUT"),
+};
+
+static const struct snd_soc_dapm_route max98371_audio_map[] = {
+       {"DAC", NULL, "HiFi Playback"},
+       {"SPK_OUT", NULL, "DAC"},
+       {"SPK_OUT", NULL, "Global Enable"},
+};
+
+#define MAX98371_RATES SNDRV_PCM_RATE_8000_48000
+#define MAX98371_FORMATS (SNDRV_PCM_FMTBIT_S8 | SNDRV_PCM_FMTBIT_S16_BE | \
+               SNDRV_PCM_FMTBIT_S24_BE | SNDRV_PCM_FMTBIT_S32_BE)
+
+static const struct snd_soc_dai_ops max98371_dai_ops = {
+       .set_fmt = max98371_dai_set_fmt,
+       .hw_params = max98371_dai_hw_params,
+};
+
+static struct snd_soc_dai_driver max98371_dai[] = {
+       {
+               .name = "max98371-aif1",
+               .playback = {
+                       .stream_name = "HiFi Playback",
+                       .channels_min = 1,
+                       .channels_max = 2,
+                       .rates = SNDRV_PCM_RATE_8000_48000,
+                       .formats = MAX98371_FORMATS,
+               },
+               .ops = &max98371_dai_ops,
+       }
+};
+
+static const struct snd_soc_codec_driver max98371_codec = {
+       .controls = max98371_snd_controls,
+       .num_controls = ARRAY_SIZE(max98371_snd_controls),
+       .dapm_routes = max98371_audio_map,
+       .num_dapm_routes = ARRAY_SIZE(max98371_audio_map),
+       .dapm_widgets = max98371_dapm_widgets,
+       .num_dapm_widgets = ARRAY_SIZE(max98371_dapm_widgets),
+};
+
+static const struct regmap_config max98371_regmap = {
+       .reg_bits         = 8,
+       .val_bits         = 8,
+       .max_register     = MAX98371_VERSION,
+       .reg_defaults     = max98371_reg,
+       .num_reg_defaults = ARRAY_SIZE(max98371_reg),
+       .volatile_reg     = max98371_volatile_register,
+       .readable_reg     = max98371_readable_register,
+       .cache_type       = REGCACHE_RBTREE,
+};
+
+static int max98371_i2c_probe(struct i2c_client *i2c,
+               const struct i2c_device_id *id)
+{
+       struct max98371_priv *max98371;
+       int ret, reg;
+
+       max98371 = devm_kzalloc(&i2c->dev,
+                       sizeof(*max98371), GFP_KERNEL);
+       if (!max98371)
+               return -ENOMEM;
+
+       i2c_set_clientdata(i2c, max98371);
+       max98371->regmap = devm_regmap_init_i2c(i2c, &max98371_regmap);
+       if (IS_ERR(max98371->regmap)) {
+               ret = PTR_ERR(max98371->regmap);
+               dev_err(&i2c->dev,
+                               "Failed to allocate regmap: %d\n", ret);
+               return ret;
+       }
+
+       ret = regmap_read(max98371->regmap, MAX98371_VERSION, &reg);
+       if (ret < 0) {
+               dev_info(&i2c->dev, "device error %d\n", ret);
+               return ret;
+       }
+       dev_info(&i2c->dev, "device version %x\n", reg);
+
+       ret = snd_soc_register_codec(&i2c->dev, &max98371_codec,
+                       max98371_dai, ARRAY_SIZE(max98371_dai));
+       if (ret < 0) {
+               dev_err(&i2c->dev, "Failed to register codec: %d\n", ret);
+               return ret;
+       }
+       return ret;
+}
+
+static int max98371_i2c_remove(struct i2c_client *client)
+{
+       snd_soc_unregister_codec(&client->dev);
+       return 0;
+}
+
+static const struct i2c_device_id max98371_i2c_id[] = {
+       { "max98371", 0 },
+};
+
+MODULE_DEVICE_TABLE(i2c, max98371_i2c_id);
+
+static const struct of_device_id max98371_of_match[] = {
+       { .compatible = "maxim,max98371", },
+       { }
+};
+MODULE_DEVICE_TABLE(of, max98371_of_match);
+
+static struct i2c_driver max98371_i2c_driver = {
+       .driver = {
+               .name = "max98371",
+               .owner = THIS_MODULE,
+               .pm = NULL,
+               .of_match_table = of_match_ptr(max98371_of_match),
+       },
+       .probe  = max98371_i2c_probe,
+       .remove = max98371_i2c_remove,
+       .id_table = max98371_i2c_id,
+};
+
+module_i2c_driver(max98371_i2c_driver);
+
+MODULE_AUTHOR("anish kumar <yesanishhere@gmail.com>");
+MODULE_DESCRIPTION("ALSA SoC MAX98371 driver");
+MODULE_LICENSE("GPL");
diff --git a/sound/soc/codecs/max98371.h b/sound/soc/codecs/max98371.h
new file mode 100644 (file)
index 0000000..9f63309
--- /dev/null
@@ -0,0 +1,67 @@
+/*
+ * max98371.h -- MAX98371 ALSA SoC Audio driver
+ *
+ * Copyright 2011-2012 Maxim Integrated Products
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _MAX98371_H
+#define _MAX98371_H
+
+#define MAX98371_IRQ_CLEAR1                    0x01
+#define MAX98371_IRQ_CLEAR2                    0x02
+#define MAX98371_IRQ_CLEAR3                    0x03
+#define MAX98371_DAI_CLK                       0x10
+#define MAX98371_DAI_BSEL_MASK                 0xF
+#define MAX98371_DAI_BSEL_32                   2
+#define MAX98371_DAI_BSEL_48                   3
+#define MAX98371_DAI_BSEL_64                   4
+#define MAX98371_SPK_SR                                0x11
+#define MAX98371_SPK_SR_MASK                   0xF
+#define MAX98371_SPK_SR_32                     6
+#define MAX98371_SPK_SR_44                     7
+#define MAX98371_SPK_SR_48                     8
+#define MAX98371_SPK_SR_88                     10
+#define MAX98371_SPK_SR_96                     11
+#define MAX98371_DAI_CHANNEL                   0x15
+#define MAX98371_CHANNEL_MASK                  0x3
+#define MAX98371_MONOMIX_SRC                   0x18
+#define MAX98371_MONOMIX_CFG                   0x19
+#define MAX98371_HPF                           0x1C
+#define MAX98371_MONOMIX_SRC_MASK              0xFF
+#define MONOMIX_RX_0_1                         ((0x1)<<(4))
+#define M98371_DAI_CHANNEL_I2S                 0x3
+#define MAX98371_DIGITAL_GAIN                  0x2D
+#define MAX98371_DIGITAL_GAIN_WIDTH            0x7
+#define MAX98371_GAIN                          0x2E
+#define MAX98371_GAIN_SHIFT                    0x4
+#define MAX98371_GAIN_WIDTH                    0x4
+#define MAX98371_DHT_MAX_WIDTH                 4
+#define MAX98371_FMT                           0x14
+#define MAX98371_CHANSZ_WIDTH                  6
+#define MAX98371_FMT_MASK                      ((0x3)<<(MAX98371_CHANSZ_WIDTH))
+#define MAX98371_FMT_MODE_MASK                 ((0x7)<<(3))
+#define MAX98371_DAI_LEFT                      ((0x1)<<(3))
+#define MAX98371_DAI_RIGHT                     ((0x2)<<(3))
+#define MAX98371_DAI_CHANSZ_16                  ((1)<<(MAX98371_CHANSZ_WIDTH))
+#define MAX98371_DAI_CHANSZ_24                  ((2)<<(MAX98371_CHANSZ_WIDTH))
+#define MAX98371_DAI_CHANSZ_32                  ((3)<<(MAX98371_CHANSZ_WIDTH))
+#define MAX98371_DHT  0x32
+#define MAX98371_DHT_STEP                      0x3
+#define MAX98371_DHT_GAIN                      0x31
+#define MAX98371_DHT_GAIN_WIDTH                        0x4
+#define MAX98371_DHT_ROT_WIDTH                 0x4
+#define MAX98371_SPK_ENABLE                    0x4A
+#define MAX98371_GLOBAL_ENABLE                 0x50
+#define MAX98371_SOFT_RESET                    0x51
+#define MAX98371_VERSION                       0xFF
+
+
+struct max98371_priv {
+       struct regmap *regmap;
+       struct snd_soc_codec *codec;
+};
+#endif
index a1aaffc..f80cfe4 100644 (file)
@@ -276,6 +276,8 @@ static int rt298_jack_detect(struct rt298_priv *rt298, bool *hp, bool *mic)
                } else {
                        *mic = false;
                        regmap_write(rt298->regmap, RT298_SET_MIC1, 0x20);
+                       regmap_update_bits(rt298->regmap,
+                               RT298_CBJ_CTRL1, 0x0400, 0x0000);
                }
        } else {
                regmap_read(rt298->regmap, RT298_GET_HP_SENSE, &buf);
@@ -482,6 +484,26 @@ static int rt298_adc_event(struct snd_soc_dapm_widget *w,
                snd_soc_update_bits(codec,
                        VERB_CMD(AC_VERB_SET_AMP_GAIN_MUTE, nid, 0),
                        0x7080, 0x7000);
+                /* If MCLK doesn't exist, reset AD filter */
+               if (!(snd_soc_read(codec, RT298_VAD_CTRL) & 0x200)) {
+                       pr_info("NO MCLK\n");
+                       switch (nid) {
+                       case RT298_ADC_IN1:
+                               snd_soc_update_bits(codec,
+                                       RT298_D_FILTER_CTRL, 0x2, 0x2);
+                               mdelay(10);
+                               snd_soc_update_bits(codec,
+                                       RT298_D_FILTER_CTRL, 0x2, 0x0);
+                               break;
+                       case RT298_ADC_IN2:
+                               snd_soc_update_bits(codec,
+                                       RT298_D_FILTER_CTRL, 0x4, 0x4);
+                               mdelay(10);
+                               snd_soc_update_bits(codec,
+                                       RT298_D_FILTER_CTRL, 0x4, 0x0);
+                               break;
+                       }
+               }
                break;
        case SND_SOC_DAPM_PRE_PMD:
                snd_soc_update_bits(codec,
@@ -520,30 +542,12 @@ static int rt298_mic1_event(struct snd_soc_dapm_widget *w,
        return 0;
 }
 
-static int rt298_vref_event(struct snd_soc_dapm_widget *w,
-                            struct snd_kcontrol *kcontrol, int event)
-{
-       struct snd_soc_codec *codec = snd_soc_dapm_to_codec(w->dapm);
-
-       switch (event) {
-       case SND_SOC_DAPM_PRE_PMU:
-               snd_soc_update_bits(codec,
-                       RT298_CBJ_CTRL1, 0x0400, 0x0000);
-               mdelay(50);
-               break;
-       default:
-               return 0;
-       }
-
-       return 0;
-}
-
 static const struct snd_soc_dapm_widget rt298_dapm_widgets[] = {
 
        SND_SOC_DAPM_SUPPLY_S("HV", 1, RT298_POWER_CTRL1,
                12, 1, NULL, 0),
        SND_SOC_DAPM_SUPPLY("VREF", RT298_POWER_CTRL1,
-               0, 1, rt298_vref_event, SND_SOC_DAPM_PRE_PMU),
+               0, 1, NULL, 0),
        SND_SOC_DAPM_SUPPLY_S("BG_MBIAS", 1, RT298_POWER_CTRL2,
                1, 0, NULL, 0),
        SND_SOC_DAPM_SUPPLY_S("LDO1", 1, RT298_POWER_CTRL2,
@@ -934,18 +938,9 @@ static int rt298_set_bias_level(struct snd_soc_codec *codec,
                }
                break;
 
-       case SND_SOC_BIAS_ON:
-               mdelay(30);
-               snd_soc_update_bits(codec,
-                       RT298_CBJ_CTRL1, 0x0400, 0x0400);
-
-               break;
-
        case SND_SOC_BIAS_STANDBY:
                snd_soc_write(codec,
                        RT298_SET_AUDIO_POWER, AC_PWRST_D3);
-               snd_soc_update_bits(codec,
-                       RT298_CBJ_CTRL1, 0x0400, 0x0000);
                break;
 
        default:
index d66f884..3638f3d 100644 (file)
 #define RT298_A_BIAS_CTRL2     0x02
 #define RT298_POWER_CTRL1      0x03
 #define RT298_A_BIAS_CTRL3     0x04
+#define RT298_D_FILTER_CTRL    0x05
 #define RT298_POWER_CTRL2      0x08
 #define RT298_I2S_CTRL1                0x09
 #define RT298_I2S_CTRL2                0x0a
 #define RT298_IRQ_CTRL         0x33
 #define RT298_WIND_FILTER_CTRL 0x46
 #define RT298_PLL_CTRL1                0x49
+#define RT298_VAD_CTRL         0x4e
 #define RT298_CBJ_CTRL1                0x4f
 #define RT298_CBJ_CTRL2                0x50
 #define RT298_PLL_CTRL         0x63
index 6021226..da9483c 100644 (file)
@@ -1241,60 +1241,46 @@ static int rt5677_dmic_use_asrc(struct snd_soc_dapm_widget *source,
                regmap_read(rt5677->regmap, RT5677_ASRC_5, &asrc_setting);
                asrc_setting = (asrc_setting & RT5677_AD_STO1_CLK_SEL_MASK) >>
                                RT5677_AD_STO1_CLK_SEL_SFT;
-               if (asrc_setting >= RT5677_CLK_SEL_I2S1_ASRC &&
-                       asrc_setting <= RT5677_CLK_SEL_I2S6_ASRC)
-                       return 1;
                break;
 
        case 10:
                regmap_read(rt5677->regmap, RT5677_ASRC_5, &asrc_setting);
                asrc_setting = (asrc_setting & RT5677_AD_STO2_CLK_SEL_MASK) >>
                                RT5677_AD_STO2_CLK_SEL_SFT;
-               if (asrc_setting >= RT5677_CLK_SEL_I2S1_ASRC &&
-                       asrc_setting <= RT5677_CLK_SEL_I2S6_ASRC)
-                       return 1;
                break;
 
        case 9:
                regmap_read(rt5677->regmap, RT5677_ASRC_5, &asrc_setting);
                asrc_setting = (asrc_setting & RT5677_AD_STO3_CLK_SEL_MASK) >>
                                RT5677_AD_STO3_CLK_SEL_SFT;
-               if (asrc_setting >= RT5677_CLK_SEL_I2S1_ASRC &&
-                       asrc_setting <= RT5677_CLK_SEL_I2S6_ASRC)
-                       return 1;
                break;
 
        case 8:
                regmap_read(rt5677->regmap, RT5677_ASRC_5, &asrc_setting);
                asrc_setting = (asrc_setting & RT5677_AD_STO4_CLK_SEL_MASK) >>
                        RT5677_AD_STO4_CLK_SEL_SFT;
-               if (asrc_setting >= RT5677_CLK_SEL_I2S1_ASRC &&
-                       asrc_setting <= RT5677_CLK_SEL_I2S6_ASRC)
-                       return 1;
                break;
 
        case 7:
                regmap_read(rt5677->regmap, RT5677_ASRC_6, &asrc_setting);
                asrc_setting = (asrc_setting & RT5677_AD_MONOL_CLK_SEL_MASK) >>
                        RT5677_AD_MONOL_CLK_SEL_SFT;
-               if (asrc_setting >= RT5677_CLK_SEL_I2S1_ASRC &&
-                       asrc_setting <= RT5677_CLK_SEL_I2S6_ASRC)
-                       return 1;
                break;
 
        case 6:
                regmap_read(rt5677->regmap, RT5677_ASRC_6, &asrc_setting);
                asrc_setting = (asrc_setting & RT5677_AD_MONOR_CLK_SEL_MASK) >>
                        RT5677_AD_MONOR_CLK_SEL_SFT;
-               if (asrc_setting >= RT5677_CLK_SEL_I2S1_ASRC &&
-                       asrc_setting <= RT5677_CLK_SEL_I2S6_ASRC)
-                       return 1;
                break;
 
        default:
-               break;
+               return 0;
        }
 
+       if (asrc_setting >= RT5677_CLK_SEL_I2S1_ASRC &&
+           asrc_setting <= RT5677_CLK_SEL_I2S6_ASRC)
+               return 1;
+
        return 0;
 }
 
index 39307ad..b8d19b7 100644 (file)
@@ -4,6 +4,9 @@
  * Copyright (C) 2015 Google, Inc.
  * Copyright (c) 2013 Daniel Mack <zonque@gmail.com>
  *
+ * TAS5721 support:
+ * Copyright (C) 2016 Petr Kulhavy, Barix AG <petr@barix.com>
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -57,6 +60,10 @@ static int tas571x_register_size(struct tas571x_private *priv, unsigned int reg)
        case TAS571X_CH1_VOL_REG:
        case TAS571X_CH2_VOL_REG:
                return priv->chip->vol_reg_size;
+       case TAS571X_INPUT_MUX_REG:
+       case TAS571X_CH4_SRC_SELECT_REG:
+       case TAS571X_PWM_MUX_REG:
+               return 4;
        default:
                return 1;
        }
@@ -167,6 +174,23 @@ static int tas571x_hw_params(struct snd_pcm_substream *substream,
                                  TAS571X_SDI_FMT_MASK, val);
 }
 
+static int tas571x_mute(struct snd_soc_dai *dai, int mute)
+{
+       struct snd_soc_codec *codec = dai->codec;
+       u8 sysctl2;
+       int ret;
+
+       sysctl2 = mute ? TAS571X_SYS_CTRL_2_SDN_MASK : 0;
+
+       ret = snd_soc_update_bits(codec,
+                                 TAS571X_SYS_CTRL_2_REG,
+                                 TAS571X_SYS_CTRL_2_SDN_MASK,
+                                 sysctl2);
+       usleep_range(1000, 2000);
+
+       return ret;
+}
+
 static int tas571x_set_bias_level(struct snd_soc_codec *codec,
                                  enum snd_soc_bias_level level)
 {
@@ -214,6 +238,7 @@ static int tas571x_set_bias_level(struct snd_soc_codec *codec,
 static const struct snd_soc_dai_ops tas571x_dai_ops = {
        .set_fmt        = tas571x_set_dai_fmt,
        .hw_params      = tas571x_hw_params,
+       .digital_mute   = tas571x_mute,
 };
 
 static const char *const tas5711_supply_names[] = {
@@ -241,6 +266,26 @@ static const struct snd_kcontrol_new tas5711_controls[] = {
                   1, 1),
 };
 
+static const struct regmap_range tas571x_readonly_regs_range[] = {
+       regmap_reg_range(TAS571X_CLK_CTRL_REG,  TAS571X_DEV_ID_REG),
+};
+
+static const struct regmap_range tas571x_volatile_regs_range[] = {
+       regmap_reg_range(TAS571X_CLK_CTRL_REG,  TAS571X_ERR_STATUS_REG),
+       regmap_reg_range(TAS571X_OSC_TRIM_REG,  TAS571X_OSC_TRIM_REG),
+};
+
+static const struct regmap_access_table tas571x_write_regs = {
+       .no_ranges =    tas571x_readonly_regs_range,
+       .n_no_ranges =  ARRAY_SIZE(tas571x_readonly_regs_range),
+};
+
+static const struct regmap_access_table tas571x_volatile_regs = {
+       .yes_ranges =   tas571x_volatile_regs_range,
+       .n_yes_ranges = ARRAY_SIZE(tas571x_volatile_regs_range),
+
+};
+
 static const struct reg_default tas5711_reg_defaults[] = {
        { 0x04, 0x05 },
        { 0x05, 0x40 },
@@ -260,6 +305,8 @@ static const struct regmap_config tas5711_regmap_config = {
        .reg_defaults                   = tas5711_reg_defaults,
        .num_reg_defaults               = ARRAY_SIZE(tas5711_reg_defaults),
        .cache_type                     = REGCACHE_RBTREE,
+       .wr_table                       = &tas571x_write_regs,
+       .volatile_table                 = &tas571x_volatile_regs,
 };
 
 static const struct tas571x_chip tas5711_chip = {
@@ -314,6 +361,8 @@ static const struct regmap_config tas5717_regmap_config = {
        .reg_defaults                   = tas5717_reg_defaults,
        .num_reg_defaults               = ARRAY_SIZE(tas5717_reg_defaults),
        .cache_type                     = REGCACHE_RBTREE,
+       .wr_table                       = &tas571x_write_regs,
+       .volatile_table                 = &tas571x_volatile_regs,
 };
 
 /* This entry is reused for tas5719 as the software interface is identical. */
@@ -326,6 +375,77 @@ static const struct tas571x_chip tas5717_chip = {
        .vol_reg_size                   = 2,
 };
 
+static const char *const tas5721_supply_names[] = {
+       "AVDD",
+       "DVDD",
+       "DRVDD",
+       "PVDD",
+};
+
+static const struct snd_kcontrol_new tas5721_controls[] = {
+       SOC_SINGLE_TLV("Master Volume",
+                      TAS571X_MVOL_REG,
+                      0, 0xff, 1, tas5711_volume_tlv),
+       SOC_DOUBLE_R_TLV("Speaker Volume",
+                        TAS571X_CH1_VOL_REG,
+                        TAS571X_CH2_VOL_REG,
+                        0, 0xff, 1, tas5711_volume_tlv),
+       SOC_DOUBLE("Speaker Switch",
+                  TAS571X_SOFT_MUTE_REG,
+                  TAS571X_SOFT_MUTE_CH1_SHIFT, TAS571X_SOFT_MUTE_CH2_SHIFT,
+                  1, 1),
+};
+
+static const struct reg_default tas5721_reg_defaults[] = {
+       {TAS571X_CLK_CTRL_REG,          0x6c},
+       {TAS571X_DEV_ID_REG,            0x00},
+       {TAS571X_ERR_STATUS_REG,        0x00},
+       {TAS571X_SYS_CTRL_1_REG,        0xa0},
+       {TAS571X_SDI_REG,               0x05},
+       {TAS571X_SYS_CTRL_2_REG,        0x40},
+       {TAS571X_SOFT_MUTE_REG,         0x00},
+       {TAS571X_MVOL_REG,              0xff},
+       {TAS571X_CH1_VOL_REG,           0x30},
+       {TAS571X_CH2_VOL_REG,           0x30},
+       {TAS571X_CH3_VOL_REG,           0x30},
+       {TAS571X_VOL_CFG_REG,           0x91},
+       {TAS571X_MODULATION_LIMIT_REG,  0x02},
+       {TAS571X_IC_DELAY_CH1_REG,      0xac},
+       {TAS571X_IC_DELAY_CH2_REG,      0x54},
+       {TAS571X_IC_DELAY_CH3_REG,      0xac},
+       {TAS571X_IC_DELAY_CH4_REG,      0x54},
+       {TAS571X_PWM_CH_SDN_GROUP_REG,  0x30},
+       {TAS571X_START_STOP_PERIOD_REG, 0x0f},
+       {TAS571X_OSC_TRIM_REG,          0x82},
+       {TAS571X_BKND_ERR_REG,          0x02},
+       {TAS571X_INPUT_MUX_REG,         0x17772},
+       {TAS571X_CH4_SRC_SELECT_REG,    0x4303},
+       {TAS571X_PWM_MUX_REG,           0x1021345},
+};
+
+static const struct regmap_config tas5721_regmap_config = {
+       .reg_bits                       = 8,
+       .val_bits                       = 32,
+       .max_register                   = 0xff,
+       .reg_read                       = tas571x_reg_read,
+       .reg_write                      = tas571x_reg_write,
+       .reg_defaults                   = tas5721_reg_defaults,
+       .num_reg_defaults               = ARRAY_SIZE(tas5721_reg_defaults),
+       .cache_type                     = REGCACHE_RBTREE,
+       .wr_table                       = &tas571x_write_regs,
+       .volatile_table                 = &tas571x_volatile_regs,
+};
+
+
+static const struct tas571x_chip tas5721_chip = {
+       .supply_names                   = tas5721_supply_names,
+       .num_supply_names               = ARRAY_SIZE(tas5721_supply_names),
+       .controls                       = tas5711_controls,
+       .num_controls                   = ARRAY_SIZE(tas5711_controls),
+       .regmap_config                  = &tas5721_regmap_config,
+       .vol_reg_size                   = 1,
+};
+
 static const struct snd_soc_dapm_widget tas571x_dapm_widgets[] = {
        SND_SOC_DAPM_DAC("DACL", NULL, SND_SOC_NOPM, 0, 0),
        SND_SOC_DAPM_DAC("DACR", NULL, SND_SOC_NOPM, 0, 0),
@@ -386,11 +506,10 @@ static int tas571x_i2c_probe(struct i2c_client *client,
        i2c_set_clientdata(client, priv);
 
        of_id = of_match_device(tas571x_of_match, dev);
-       if (!of_id) {
-               dev_err(dev, "Unknown device type\n");
-               return -EINVAL;
-       }
-       priv->chip = of_id->data;
+       if (of_id)
+               priv->chip = of_id->data;
+       else
+               priv->chip = (void *) id->driver_data;
 
        priv->mclk = devm_clk_get(dev, "mclk");
        if (IS_ERR(priv->mclk) && PTR_ERR(priv->mclk) != -ENOENT) {
@@ -445,10 +564,6 @@ static int tas571x_i2c_probe(struct i2c_client *client,
        if (ret)
                return ret;
 
-       ret = regmap_update_bits(priv->regmap, TAS571X_SYS_CTRL_2_REG,
-                                TAS571X_SYS_CTRL_2_SDN_MASK, 0);
-       if (ret)
-               return ret;
 
        memcpy(&priv->codec_driver, &tas571x_codec, sizeof(priv->codec_driver));
        priv->codec_driver.controls = priv->chip->controls;
@@ -486,14 +601,16 @@ static const struct of_device_id tas571x_of_match[] = {
        { .compatible = "ti,tas5711", .data = &tas5711_chip, },
        { .compatible = "ti,tas5717", .data = &tas5717_chip, },
        { .compatible = "ti,tas5719", .data = &tas5717_chip, },
+       { .compatible = "ti,tas5721", .data = &tas5721_chip, },
        { }
 };
 MODULE_DEVICE_TABLE(of, tas571x_of_match);
 
 static const struct i2c_device_id tas571x_i2c_id[] = {
-       { "tas5711", 0 },
-       { "tas5717", 0 },
-       { "tas5719", 0 },
+       { "tas5711", (kernel_ulong_t) &tas5711_chip },
+       { "tas5717", (kernel_ulong_t) &tas5717_chip },
+       { "tas5719", (kernel_ulong_t) &tas5717_chip },
+       { "tas5721", (kernel_ulong_t) &tas5721_chip },
        { }
 };
 MODULE_DEVICE_TABLE(i2c, tas571x_i2c_id);
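Besides the TAS5721 additions, the probe change above means the chip descriptor can now come either from the OF match table or from i2c_device_id.driver_data, so the driver also works when the device is instantiated without a device-tree node. A hypothetical board-file instantiation under that assumption (bus number and address are examples only):

	#include <linux/i2c.h>
	#include <linux/init.h>

	/* Example: bind a TAS5721 at address 0x1b on I2C bus 0; the matching
	 * tas571x_i2c_id entry supplies &tas5721_chip via driver_data.
	 */
	static struct i2c_board_info tas5721_board_info __initdata = {
		I2C_BOARD_INFO("tas5721", 0x1b),
	};

	static int __init example_board_init(void)
	{
		return i2c_register_board_info(0, &tas5721_board_info, 1);
	}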
index 0aee471..cf800c3 100644 (file)
 #define _TAS571X_H
 
 /* device registers */
+#define TAS571X_CLK_CTRL_REG           0x00
+#define TAS571X_DEV_ID_REG             0x01
+#define TAS571X_ERR_STATUS_REG         0x02
+#define TAS571X_SYS_CTRL_1_REG         0x03
 #define TAS571X_SDI_REG                        0x04
 #define TAS571X_SDI_FMT_MASK           0x0f
 
 #define TAS571X_MVOL_REG               0x07
 #define TAS571X_CH1_VOL_REG            0x08
 #define TAS571X_CH2_VOL_REG            0x09
+#define TAS571X_CH3_VOL_REG            0x0a
+#define TAS571X_VOL_CFG_REG            0x0e
+#define TAS571X_MODULATION_LIMIT_REG   0x10
+#define TAS571X_IC_DELAY_CH1_REG       0x11
+#define TAS571X_IC_DELAY_CH2_REG       0x12
+#define TAS571X_IC_DELAY_CH3_REG       0x13
+#define TAS571X_IC_DELAY_CH4_REG       0x14
 
+#define TAS571X_PWM_CH_SDN_GROUP_REG   0x19    /* N/A on TAS5717, TAS5719 */
+#define TAS571X_PWM_CH1_SDN_MASK       (1<<0)
+#define TAS571X_PWM_CH2_SDN_SHIFT      (1<<1)
+#define TAS571X_PWM_CH3_SDN_SHIFT      (1<<2)
+#define TAS571X_PWM_CH4_SDN_SHIFT      (1<<3)
+
+#define TAS571X_START_STOP_PERIOD_REG  0x1a
 #define TAS571X_OSC_TRIM_REG           0x1b
+#define TAS571X_BKND_ERR_REG           0x1c
+#define TAS571X_INPUT_MUX_REG          0x20
+#define TAS571X_CH4_SRC_SELECT_REG     0x21
+#define TAS571X_PWM_MUX_REG            0x25
 
 #endif /* _TAS571X_H */
diff --git a/sound/soc/codecs/tas5720.c b/sound/soc/codecs/tas5720.c
new file mode 100644 (file)
index 0000000..f54fb46
--- /dev/null
@@ -0,0 +1,620 @@
+/*
+ * tas5720.c - ALSA SoC Texas Instruments TAS5720 Mono Audio Amplifier
+ *
+ * Copyright (C)2015-2016 Texas Instruments Incorporated -  http://www.ti.com
+ *
+ * Author: Andreas Dannenberg <dannenberg@ti.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/device.h>
+#include <linux/i2c.h>
+#include <linux/pm_runtime.h>
+#include <linux/regmap.h>
+#include <linux/slab.h>
+#include <linux/regulator/consumer.h>
+#include <linux/delay.h>
+
+#include <sound/pcm.h>
+#include <sound/pcm_params.h>
+#include <sound/soc.h>
+#include <sound/soc-dapm.h>
+#include <sound/tlv.h>
+
+#include "tas5720.h"
+
+/* Define how often to check (and clear) the fault status register (in ms) */
+#define TAS5720_FAULT_CHECK_INTERVAL           200
+
+static const char * const tas5720_supply_names[] = {
+       "dvdd",         /* Digital power supply. Connect to 3.3-V supply. */
+       "pvdd",         /* Class-D amp and analog power supply (connected). */
+};
+
+#define TAS5720_NUM_SUPPLIES   ARRAY_SIZE(tas5720_supply_names)
+
+struct tas5720_data {
+       struct snd_soc_codec *codec;
+       struct regmap *regmap;
+       struct i2c_client *tas5720_client;
+       struct regulator_bulk_data supplies[TAS5720_NUM_SUPPLIES];
+       struct delayed_work fault_check_work;
+       unsigned int last_fault;
+};
+
+static int tas5720_hw_params(struct snd_pcm_substream *substream,
+                            struct snd_pcm_hw_params *params,
+                            struct snd_soc_dai *dai)
+{
+       struct snd_soc_codec *codec = dai->codec;
+       unsigned int rate = params_rate(params);
+       bool ssz_ds;
+       int ret;
+
+       switch (rate) {
+       case 44100:
+       case 48000:
+               ssz_ds = false;
+               break;
+       case 88200:
+       case 96000:
+               ssz_ds = true;
+               break;
+       default:
+               dev_err(codec->dev, "unsupported sample rate: %u\n", rate);
+               return -EINVAL;
+       }
+
+       ret = snd_soc_update_bits(codec, TAS5720_DIGITAL_CTRL1_REG,
+                                 TAS5720_SSZ_DS, ssz_ds);
+       if (ret < 0) {
+               dev_err(codec->dev, "error setting sample rate: %d\n", ret);
+               return ret;
+       }
+
+       return 0;
+}
+
+static int tas5720_set_dai_fmt(struct snd_soc_dai *dai, unsigned int fmt)
+{
+       struct snd_soc_codec *codec = dai->codec;
+       u8 serial_format;
+       int ret;
+
+       if ((fmt & SND_SOC_DAIFMT_MASTER_MASK) != SND_SOC_DAIFMT_CBS_CFS) {
+               dev_vdbg(codec->dev, "DAI Format master is not found\n");
+               return -EINVAL;
+       }
+
+       switch (fmt & (SND_SOC_DAIFMT_FORMAT_MASK |
+                      SND_SOC_DAIFMT_INV_MASK)) {
+       case (SND_SOC_DAIFMT_I2S | SND_SOC_DAIFMT_NB_NF):
+               /* 1st data bit occurs one BCLK cycle after the frame sync */
+               serial_format = TAS5720_SAIF_I2S;
+               break;
+       case (SND_SOC_DAIFMT_DSP_A | SND_SOC_DAIFMT_NB_NF):
+               /*
+                * Note that although the TAS5720 does not have a dedicated DSP
+                * mode it doesn't care about the LRCLK duty cycle during TDM
+                * operation. Therefore we can use the device's I2S mode with
+                * its delaying of the 1st data bit to receive DSP_A formatted
+                * data. See device datasheet for additional details.
+                */
+               serial_format = TAS5720_SAIF_I2S;
+               break;
+       case (SND_SOC_DAIFMT_DSP_B | SND_SOC_DAIFMT_NB_NF):
+               /*
+                * Similar to DSP_A, we can use the fact that the TAS5720 does
+                * not care about the LRCLK duty cycle during TDM to receive
+                * DSP_B formatted data in LEFTJ mode (no delaying of the 1st
+                * data bit).
+                */
+               serial_format = TAS5720_SAIF_LEFTJ;
+               break;
+       case (SND_SOC_DAIFMT_LEFT_J | SND_SOC_DAIFMT_NB_NF):
+               /* No delay after the frame sync */
+               serial_format = TAS5720_SAIF_LEFTJ;
+               break;
+       default:
+               dev_vdbg(codec->dev, "DAI Format is not found\n");
+               return -EINVAL;
+       }
+
+       ret = snd_soc_update_bits(codec, TAS5720_DIGITAL_CTRL1_REG,
+                                 TAS5720_SAIF_FORMAT_MASK,
+                                 serial_format);
+       if (ret < 0) {
+               dev_err(codec->dev, "error setting SAIF format: %d\n", ret);
+               return ret;
+       }
+
+       return 0;
+}
+
+static int tas5720_set_dai_tdm_slot(struct snd_soc_dai *dai,
+                                   unsigned int tx_mask, unsigned int rx_mask,
+                                   int slots, int slot_width)
+{
+       struct snd_soc_codec *codec = dai->codec;
+       unsigned int first_slot;
+       int ret;
+
+       if (!tx_mask) {
+               dev_err(codec->dev, "tx masks must not be 0\n");
+               return -EINVAL;
+       }
+
+       /*
+        * Determine the first slot that is being requested. We will only
+        * use the first slot that is found since the TAS5720 is a mono
+        * amplifier.
+        */
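+       /* __ffs() returns the zero-based index of the lowest set bit */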
+       first_slot = __ffs(tx_mask);
+
+       if (first_slot > 7) {
+               dev_err(codec->dev, "slot selection out of bounds (%u)\n",
+                       first_slot);
+               return -EINVAL;
+       }
+
+       /* Enable manual TDM slot selection (instead of I2C ID based) */
+       ret = snd_soc_update_bits(codec, TAS5720_DIGITAL_CTRL1_REG,
+                                 TAS5720_TDM_CFG_SRC, TAS5720_TDM_CFG_SRC);
+       if (ret < 0)
+               goto error_snd_soc_update_bits;
+
+       /* Configure the TDM slot to process audio from */
+       ret = snd_soc_update_bits(codec, TAS5720_DIGITAL_CTRL2_REG,
+                                 TAS5720_TDM_SLOT_SEL_MASK, first_slot);
+       if (ret < 0)
+               goto error_snd_soc_update_bits;
+
+       return 0;
+
+error_snd_soc_update_bits:
+       dev_err(codec->dev, "error configuring TDM mode: %d\n", ret);
+       return ret;
+}
+
+static int tas5720_mute(struct snd_soc_dai *dai, int mute)
+{
+       struct snd_soc_codec *codec = dai->codec;
+       int ret;
+
+       ret = snd_soc_update_bits(codec, TAS5720_DIGITAL_CTRL2_REG,
+                                 TAS5720_MUTE, mute ? TAS5720_MUTE : 0);
+       if (ret < 0) {
+               dev_err(codec->dev, "error (un-)muting device: %d\n", ret);
+               return ret;
+       }
+
+       return 0;
+}
+
+static void tas5720_fault_check_work(struct work_struct *work)
+{
+       struct tas5720_data *tas5720 = container_of(work, struct tas5720_data,
+                       fault_check_work.work);
+       struct device *dev = tas5720->codec->dev;
+       unsigned int curr_fault;
+       int ret;
+
+       ret = regmap_read(tas5720->regmap, TAS5720_FAULT_REG, &curr_fault);
+       if (ret < 0) {
+               dev_err(dev, "failed to read FAULT register: %d\n", ret);
+               goto out;
+       }
+
+       /* Check/handle all errors except SAIF clock errors */
+       curr_fault &= TAS5720_OCE | TAS5720_DCE | TAS5720_OTE;
+
+       /*
+        * Only flag errors once for a given occurrence. This is needed as
+        * the TAS5720 takes some time to clear the fault condition internally,
+        * during which we don't want to bombard the system with the same
+        * error message over and over.
+        */
+       if ((curr_fault & TAS5720_OCE) && !(tas5720->last_fault & TAS5720_OCE))
+               dev_crit(dev, "experienced an over current hardware fault\n");
+
+       if ((curr_fault & TAS5720_DCE) && !(tas5720->last_fault & TAS5720_DCE))
+               dev_crit(dev, "experienced a DC detection fault\n");
+
+       if ((curr_fault & TAS5720_OTE) && !(tas5720->last_fault & TAS5720_OTE))
+               dev_crit(dev, "experienced an over temperature fault\n");
+
+       /* Store current fault value so we can detect any changes next time */
+       tas5720->last_fault = curr_fault;
+
+       if (!curr_fault)
+               goto out;
+
+       /*
+        * Periodically toggle SDZ (shutdown bit) H->L->H to clear any latching
+        * faults as long as a fault condition persists. Always go through the
+        * full sequence regardless of the first return value, to minimize the
+        * chance of the device ending up in shutdown mode.
+        */
+       ret = regmap_write_bits(tas5720->regmap, TAS5720_POWER_CTRL_REG,
+                               TAS5720_SDZ, 0);
+       if (ret < 0)
+               dev_err(dev, "failed to write POWER_CTRL register: %d\n", ret);
+
+       ret = regmap_write_bits(tas5720->regmap, TAS5720_POWER_CTRL_REG,
+                               TAS5720_SDZ, TAS5720_SDZ);
+       if (ret < 0)
+               dev_err(dev, "failed to write POWER_CTRL register: %d\n", ret);
+
+out:
+       /* Schedule the next fault check at the specified interval */
+       schedule_delayed_work(&tas5720->fault_check_work,
+                             msecs_to_jiffies(TAS5720_FAULT_CHECK_INTERVAL));
+}
+
+static int tas5720_codec_probe(struct snd_soc_codec *codec)
+{
+       struct tas5720_data *tas5720 = snd_soc_codec_get_drvdata(codec);
+       unsigned int device_id;
+       int ret;
+
+       tas5720->codec = codec;
+
+       ret = regulator_bulk_enable(ARRAY_SIZE(tas5720->supplies),
+                                   tas5720->supplies);
+       if (ret != 0) {
+               dev_err(codec->dev, "failed to enable supplies: %d\n", ret);
+               return ret;
+       }
+
+       ret = regmap_read(tas5720->regmap, TAS5720_DEVICE_ID_REG, &device_id);
+       if (ret < 0) {
+               dev_err(codec->dev, "failed to read device ID register: %d\n",
+                       ret);
+               goto probe_fail;
+       }
+
+       if (device_id != TAS5720_DEVICE_ID) {
+               dev_err(codec->dev, "wrong device ID; expected: %u, read: %u\n",
+                       TAS5720_DEVICE_ID, device_id);
+               ret = -ENODEV;
+               goto probe_fail;
+       }
+
+       /* Set device to mute */
+       ret = snd_soc_update_bits(codec, TAS5720_DIGITAL_CTRL2_REG,
+                                 TAS5720_MUTE, TAS5720_MUTE);
+       if (ret < 0)
+               goto error_snd_soc_update_bits;
+
+       /*
+        * Enter shutdown mode - our default when not playing audio - to
+        * minimize current consumption. On the TAS5720 there is no real
+        * downside to doing so as all device registers are preserved and the
+        * wakeup of the codec, which we do through a DAPM widget, is rather
+        * quick.
+        */
+       ret = snd_soc_update_bits(codec, TAS5720_POWER_CTRL_REG,
+                                 TAS5720_SDZ, 0);
+       if (ret < 0)
+               goto error_snd_soc_update_bits;
+
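+       /* Periodic fault checking is started/stopped from the DAC DAPM event */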
+       INIT_DELAYED_WORK(&tas5720->fault_check_work, tas5720_fault_check_work);
+
+       return 0;
+
+error_snd_soc_update_bits:
+       dev_err(codec->dev, "error configuring device registers: %d\n", ret);
+
+probe_fail:
+       regulator_bulk_disable(ARRAY_SIZE(tas5720->supplies),
+                              tas5720->supplies);
+       return ret;
+}
+
+static int tas5720_codec_remove(struct snd_soc_codec *codec)
+{
+       struct tas5720_data *tas5720 = snd_soc_codec_get_drvdata(codec);
+       int ret;
+
+       cancel_delayed_work_sync(&tas5720->fault_check_work);
+
+       ret = regulator_bulk_disable(ARRAY_SIZE(tas5720->supplies),
+                                    tas5720->supplies);
+       if (ret < 0)
+               dev_err(codec->dev, "failed to disable supplies: %d\n", ret);
+
+       return ret;
+};
+
+static int tas5720_dac_event(struct snd_soc_dapm_widget *w,
+                            struct snd_kcontrol *kcontrol, int event)
+{
+       struct snd_soc_codec *codec = snd_soc_dapm_to_codec(w->dapm);
+       struct tas5720_data *tas5720 = snd_soc_codec_get_drvdata(codec);
+       int ret;
+
+       if (event & SND_SOC_DAPM_POST_PMU) {
+               /* Take TAS5720 out of shutdown mode */
+               ret = snd_soc_update_bits(codec, TAS5720_POWER_CTRL_REG,
+                                         TAS5720_SDZ, TAS5720_SDZ);
+               if (ret < 0) {
+                       dev_err(codec->dev, "error waking codec: %d\n", ret);
+                       return ret;
+               }
+
+               /*
+                * Observe the codec shutdown-to-active time. The datasheet
+                * only lists a nominal value, but we use it as-is without
+                * additional padding to minimize the delay introduced in
+                * starting playback (the ASoC framework performs other setup
+                * that adds further delay, so we should always be safe).
+                */
+               msleep(25);
+
+               /* Turn on TAS5720 periodic fault checking/handling */
+               tas5720->last_fault = 0;
+               schedule_delayed_work(&tas5720->fault_check_work,
+                               msecs_to_jiffies(TAS5720_FAULT_CHECK_INTERVAL));
+       } else if (event & SND_SOC_DAPM_PRE_PMD) {
+               /* Disable TAS5720 periodic fault checking/handling */
+               cancel_delayed_work_sync(&tas5720->fault_check_work);
+
+               /* Place TAS5720 in shutdown mode to minimize current draw */
+               ret = snd_soc_update_bits(codec, TAS5720_POWER_CTRL_REG,
+                                         TAS5720_SDZ, 0);
+               if (ret < 0) {
+                       dev_err(codec->dev, "error shutting down codec: %d\n",
+                               ret);
+                       return ret;
+               }
+       }
+
+       return 0;
+}
+
+#ifdef CONFIG_PM
+static int tas5720_suspend(struct snd_soc_codec *codec)
+{
+       struct tas5720_data *tas5720 = snd_soc_codec_get_drvdata(codec);
+       int ret;
+
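+       /*
+        * Route register accesses to the cache and mark it dirty so that
+        * regcache_sync() rewrites everything on resume.
+        */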
+       regcache_cache_only(tas5720->regmap, true);
+       regcache_mark_dirty(tas5720->regmap);
+
+       ret = regulator_bulk_disable(ARRAY_SIZE(tas5720->supplies),
+                                    tas5720->supplies);
+       if (ret < 0)
+               dev_err(codec->dev, "failed to disable supplies: %d\n", ret);
+
+       return ret;
+}
+
+static int tas5720_resume(struct snd_soc_codec *codec)
+{
+       struct tas5720_data *tas5720 = snd_soc_codec_get_drvdata(codec);
+       int ret;
+
+       ret = regulator_bulk_enable(ARRAY_SIZE(tas5720->supplies),
+                                   tas5720->supplies);
+       if (ret < 0) {
+               dev_err(codec->dev, "failed to enable supplies: %d\n", ret);
+               return ret;
+       }
+
+       regcache_cache_only(tas5720->regmap, false);
+
+       ret = regcache_sync(tas5720->regmap);
+       if (ret < 0) {
+               dev_err(codec->dev, "failed to sync regcache: %d\n", ret);
+               return ret;
+       }
+
+       return 0;
+}
+#else
+#define tas5720_suspend NULL
+#define tas5720_resume NULL
+#endif
+
+static bool tas5720_is_volatile_reg(struct device *dev, unsigned int reg)
+{
+       switch (reg) {
+       case TAS5720_DEVICE_ID_REG:
+       case TAS5720_FAULT_REG:
+               return true;
+       default:
+               return false;
+       }
+}
+
+static const struct regmap_config tas5720_regmap_config = {
+       .reg_bits = 8,
+       .val_bits = 8,
+
+       .max_register = TAS5720_MAX_REG,
+       .cache_type = REGCACHE_RBTREE,
+       .volatile_reg = tas5720_is_volatile_reg,
+};
+
+/*
+ * DAC analog gain. There are four discrete values to select from, ranging
+ * from 19.2 dB to 26.3 dB.
+ */
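+/* TLV_DB_SCALE_ITEM() takes values in 0.01 dB units, so 1920 == 19.2 dB */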
+static const DECLARE_TLV_DB_RANGE(dac_analog_tlv,
+       0x0, 0x0, TLV_DB_SCALE_ITEM(1920, 0, 0),
+       0x1, 0x1, TLV_DB_SCALE_ITEM(2070, 0, 0),
+       0x2, 0x2, TLV_DB_SCALE_ITEM(2350, 0, 0),
+       0x3, 0x3, TLV_DB_SCALE_ITEM(2630, 0, 0),
+);
+
+/*
+ * DAC digital volumes. From -103.5 to 24 dB in 0.5 dB steps. Note that
+ * setting the gain below -100 dB (register value <0x7) is effectively a MUTE
+ * as per device datasheet.
+ */
+static DECLARE_TLV_DB_SCALE(dac_tlv, -10350, 50, 0);
+
+static const struct snd_kcontrol_new tas5720_snd_controls[] = {
+       SOC_SINGLE_TLV("Speaker Driver Playback Volume",
+                      TAS5720_VOLUME_CTRL_REG, 0, 0xff, 0, dac_tlv),
+       SOC_SINGLE_TLV("Speaker Driver Analog Gain", TAS5720_ANALOG_CTRL_REG,
+                      TAS5720_ANALOG_GAIN_SHIFT, 3, 0, dac_analog_tlv),
+};
+
+static const struct snd_soc_dapm_widget tas5720_dapm_widgets[] = {
+       SND_SOC_DAPM_AIF_IN("DAC IN", "Playback", 0, SND_SOC_NOPM, 0, 0),
+       SND_SOC_DAPM_DAC_E("DAC", NULL, SND_SOC_NOPM, 0, 0, tas5720_dac_event,
+                          SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_PRE_PMD),
+       SND_SOC_DAPM_OUTPUT("OUT")
+};
+
+static const struct snd_soc_dapm_route tas5720_audio_map[] = {
+       { "DAC", NULL, "DAC IN" },
+       { "OUT", NULL, "DAC" },
+};
+
+static struct snd_soc_codec_driver soc_codec_dev_tas5720 = {
+       .probe = tas5720_codec_probe,
+       .remove = tas5720_codec_remove,
+       .suspend = tas5720_suspend,
+       .resume = tas5720_resume,
+
+       .controls = tas5720_snd_controls,
+       .num_controls = ARRAY_SIZE(tas5720_snd_controls),
+       .dapm_widgets = tas5720_dapm_widgets,
+       .num_dapm_widgets = ARRAY_SIZE(tas5720_dapm_widgets),
+       .dapm_routes = tas5720_audio_map,
+       .num_dapm_routes = ARRAY_SIZE(tas5720_audio_map),
+};
+
+/* PCM rates supported by the TAS5720 driver */
+#define TAS5720_RATES  (SNDRV_PCM_RATE_44100 | SNDRV_PCM_RATE_48000 |\
+                        SNDRV_PCM_RATE_88200 | SNDRV_PCM_RATE_96000)
+
+/* Formats supported by TAS5720 driver */
+#define TAS5720_FORMATS (SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S18_3LE |\
+                        SNDRV_PCM_FMTBIT_S20_3LE | SNDRV_PCM_FMTBIT_S24_LE)
+
+static struct snd_soc_dai_ops tas5720_speaker_dai_ops = {
+       .hw_params      = tas5720_hw_params,
+       .set_fmt        = tas5720_set_dai_fmt,
+       .set_tdm_slot   = tas5720_set_dai_tdm_slot,
+       .digital_mute   = tas5720_mute,
+};
+
+/*
+ * TAS5720 DAI structure
+ *
+ * Note that we are advertising .playback.channels_max = 2 despite this being
+ * a mono amplifier. The reason is that some serial ports such as TI's McASP
+ * module have a minimum number of channels (2) that they can output.
+ * Advertising more channels than we have allows us to interface with such a
+ * serial port without any real negative side effects, as the TAS5720 simply
+ * ignores any extra channel(s) aside from the one channel that is configured
+ * to be played back.
+ */
+static struct snd_soc_dai_driver tas5720_dai[] = {
+       {
+               .name = "tas5720-amplifier",
+               .playback = {
+                       .stream_name = "Playback",
+                       .channels_min = 1,
+                       .channels_max = 2,
+                       .rates = TAS5720_RATES,
+                       .formats = TAS5720_FORMATS,
+               },
+               .ops = &tas5720_speaker_dai_ops,
+       },
+};
+
+static int tas5720_probe(struct i2c_client *client,
+                        const struct i2c_device_id *id)
+{
+       struct device *dev = &client->dev;
+       struct tas5720_data *data;
+       int ret;
+       int i;
+
+       data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
+       if (!data)
+               return -ENOMEM;
+
+       data->tas5720_client = client;
+       data->regmap = devm_regmap_init_i2c(client, &tas5720_regmap_config);
+       if (IS_ERR(data->regmap)) {
+               ret = PTR_ERR(data->regmap);
+               dev_err(dev, "failed to allocate register map: %d\n", ret);
+               return ret;
+       }
+
+       for (i = 0; i < ARRAY_SIZE(data->supplies); i++)
+               data->supplies[i].supply = tas5720_supply_names[i];
+
+       ret = devm_regulator_bulk_get(dev, ARRAY_SIZE(data->supplies),
+                                     data->supplies);
+       if (ret != 0) {
+               dev_err(dev, "failed to request supplies: %d\n", ret);
+               return ret;
+       }
+
+       dev_set_drvdata(dev, data);
+
+       ret = snd_soc_register_codec(&client->dev,
+                                    &soc_codec_dev_tas5720,
+                                    tas5720_dai, ARRAY_SIZE(tas5720_dai));
+       if (ret < 0) {
+               dev_err(dev, "failed to register codec: %d\n", ret);
+               return ret;
+       }
+
+       return 0;
+}
+
+static int tas5720_remove(struct i2c_client *client)
+{
+       struct device *dev = &client->dev;
+
+       snd_soc_unregister_codec(dev);
+
+       return 0;
+}
+
+static const struct i2c_device_id tas5720_id[] = {
+       { "tas5720", 0 },
+       { }
+};
+MODULE_DEVICE_TABLE(i2c, tas5720_id);
+
+#if IS_ENABLED(CONFIG_OF)
+static const struct of_device_id tas5720_of_match[] = {
+       { .compatible = "ti,tas5720", },
+       { },
+};
+MODULE_DEVICE_TABLE(of, tas5720_of_match);
+#endif
+
+static struct i2c_driver tas5720_i2c_driver = {
+       .driver = {
+               .name = "tas5720",
+               .of_match_table = of_match_ptr(tas5720_of_match),
+       },
+       .probe = tas5720_probe,
+       .remove = tas5720_remove,
+       .id_table = tas5720_id,
+};
+
+module_i2c_driver(tas5720_i2c_driver);
+
+MODULE_AUTHOR("Andreas Dannenberg <dannenberg@ti.com>");
+MODULE_DESCRIPTION("TAS5720 Audio amplifier driver");
+MODULE_LICENSE("GPL");
diff --git a/sound/soc/codecs/tas5720.h b/sound/soc/codecs/tas5720.h
new file mode 100644 (file)
index 0000000..3d077c7
--- /dev/null
@@ -0,0 +1,90 @@
+/*
+ * tas5720.h - ALSA SoC Texas Instruments TAS5720 Mono Audio Amplifier
+ *
+ * Copyright (C) 2015-2016 Texas Instruments Incorporated - http://www.ti.com
+ *
+ * Author: Andreas Dannenberg <dannenberg@ti.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef __TAS5720_H__
+#define __TAS5720_H__
+
+/* Register Address Map */
+#define TAS5720_DEVICE_ID_REG          0x00
+#define TAS5720_POWER_CTRL_REG         0x01
+#define TAS5720_DIGITAL_CTRL1_REG      0x02
+#define TAS5720_DIGITAL_CTRL2_REG      0x03
+#define TAS5720_VOLUME_CTRL_REG                0x04
+#define TAS5720_ANALOG_CTRL_REG                0x06
+#define TAS5720_FAULT_REG              0x08
+#define TAS5720_DIGITAL_CLIP2_REG      0x10
+#define TAS5720_DIGITAL_CLIP1_REG      0x11
+#define TAS5720_MAX_REG                        TAS5720_DIGITAL_CLIP1_REG
+
+/* TAS5720_DEVICE_ID_REG */
+#define TAS5720_DEVICE_ID              0x01
+
+/* TAS5720_POWER_CTRL_REG */
+#define TAS5720_DIG_CLIP_MASK          GENMASK(7, 2)
+#define TAS5720_SLEEP                  BIT(1)
+#define TAS5720_SDZ                    BIT(0)
+
+/* TAS5720_DIGITAL_CTRL1_REG */
+#define TAS5720_HPF_BYPASS             BIT(7)
+#define TAS5720_TDM_CFG_SRC            BIT(6)
+#define TAS5720_SSZ_DS                 BIT(3)
+#define TAS5720_SAIF_RIGHTJ_24BIT      (0x0)
+#define TAS5720_SAIF_RIGHTJ_20BIT      (0x1)
+#define TAS5720_SAIF_RIGHTJ_18BIT      (0x2)
+#define TAS5720_SAIF_RIGHTJ_16BIT      (0x3)
+#define TAS5720_SAIF_I2S               (0x4)
+#define TAS5720_SAIF_LEFTJ             (0x5)
+#define TAS5720_SAIF_FORMAT_MASK       GENMASK(2, 0)
+
+/* TAS5720_DIGITAL_CTRL2_REG */
+#define TAS5720_MUTE                   BIT(4)
+#define TAS5720_TDM_SLOT_SEL_MASK      GENMASK(2, 0)
+
+/* TAS5720_ANALOG_CTRL_REG */
+#define TAS5720_PWM_RATE_6_3_FSYNC     (0x0 << 4)
+#define TAS5720_PWM_RATE_8_4_FSYNC     (0x1 << 4)
+#define TAS5720_PWM_RATE_10_5_FSYNC    (0x2 << 4)
+#define TAS5720_PWM_RATE_12_6_FSYNC    (0x3 << 4)
+#define TAS5720_PWM_RATE_14_7_FSYNC    (0x4 << 4)
+#define TAS5720_PWM_RATE_16_8_FSYNC    (0x5 << 4)
+#define TAS5720_PWM_RATE_20_10_FSYNC   (0x6 << 4)
+#define TAS5720_PWM_RATE_24_12_FSYNC   (0x7 << 4)
+#define TAS5720_PWM_RATE_MASK          GENMASK(6, 4)
+#define TAS5720_ANALOG_GAIN_19_2DBV    (0x0 << 2)
+#define TAS5720_ANALOG_GAIN_20_7DBV    (0x1 << 2)
+#define TAS5720_ANALOG_GAIN_23_5DBV    (0x2 << 2)
+#define TAS5720_ANALOG_GAIN_26_3DBV    (0x3 << 2)
+#define TAS5720_ANALOG_GAIN_MASK       GENMASK(3, 2)
+#define TAS5720_ANALOG_GAIN_SHIFT      (0x2)
+
+/* TAS5720_FAULT_REG */
+#define TAS5720_OC_THRESH_100PCT       (0x0 << 4)
+#define TAS5720_OC_THRESH_75PCT                (0x1 << 4)
+#define TAS5720_OC_THRESH_50PCT                (0x2 << 4)
+#define TAS5720_OC_THRESH_25PCT                (0x3 << 4)
+#define TAS5720_OC_THRESH_MASK         GENMASK(5, 4)
+#define TAS5720_CLKE                   BIT(3)
+#define TAS5720_OCE                    BIT(2)
+#define TAS5720_DCE                    BIT(1)
+#define TAS5720_OTE                    BIT(0)
+#define TAS5720_FAULT_MASK             GENMASK(3, 0)
+
+/* TAS5720_DIGITAL_CLIP1_REG */
+#define TAS5720_CLIP1_MASK             GENMASK(7, 2)
+#define TAS5720_CLIP1_SHIFT            (0x2)
+
+#endif /* __TAS5720_H__ */
index ee4def4..3c5e1df 100644 (file)
@@ -28,6 +28,7 @@
 #include <linux/i2c.h>
 #include <linux/gpio.h>
 #include <linux/regulator/consumer.h>
+#include <linux/acpi.h>
 #include <linux/of.h>
 #include <linux/of_gpio.h>
 #include <linux/slab.h>
@@ -1280,10 +1281,19 @@ static const struct i2c_device_id aic31xx_i2c_id[] = {
 };
 MODULE_DEVICE_TABLE(i2c, aic31xx_i2c_id);
 
+#ifdef CONFIG_ACPI
+static const struct acpi_device_id aic31xx_acpi_match[] = {
+       { "10TI3100", 0 },
+       { }
+};
+MODULE_DEVICE_TABLE(acpi, aic31xx_acpi_match);
+#endif
+
 static struct i2c_driver aic31xx_i2c_driver = {
        .driver = {
                .name   = "tlv320aic31xx-codec",
                .of_match_table = of_match_ptr(tlv320aic31xx_of_match),
+               .acpi_match_table = ACPI_PTR(aic31xx_acpi_match),
        },
        .probe          = aic31xx_i2c_probe,
        .remove         = aic31xx_i2c_remove,
diff --git a/sound/soc/codecs/tlv320aic32x4-i2c.c b/sound/soc/codecs/tlv320aic32x4-i2c.c
new file mode 100644 (file)
index 0000000..59606cf
--- /dev/null
@@ -0,0 +1,74 @@
+/*
+ * linux/sound/soc/codecs/tlv320aic32x4-i2c.c
+ *
+ * Copyright 2011 NW Digital Radio
+ *
+ * Author: Jeremy McDermond <nh6z@nh6z.net>
+ *
+ * Based on sound/soc/codecs/wm8974 and TI driver for kernel 2.6.27.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/i2c.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/regmap.h>
+#include <sound/soc.h>
+
+#include "tlv320aic32x4.h"
+
+static int aic32x4_i2c_probe(struct i2c_client *i2c,
+                            const struct i2c_device_id *id)
+{
+       struct regmap *regmap;
+       struct regmap_config config;
+
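+       /*
+        * Start from the shared aic32x4 regmap config and add the I2C bus
+        * specifics: 8-bit register addresses and 8-bit register values.
+        */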
+       config = aic32x4_regmap_config;
+       config.reg_bits = 8;
+       config.val_bits = 8;
+
+       regmap = devm_regmap_init_i2c(i2c, &config);
+       return aic32x4_probe(&i2c->dev, regmap);
+}
+
+static int aic32x4_i2c_remove(struct i2c_client *i2c)
+{
+       return aic32x4_remove(&i2c->dev);
+}
+
+static const struct i2c_device_id aic32x4_i2c_id[] = {
+       { "tlv320aic32x4", 0 },
+       { /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(i2c, aic32x4_i2c_id);
+
+static const struct of_device_id aic32x4_of_id[] = {
+       { .compatible = "ti,tlv320aic32x4", },
+       { /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, aic32x4_of_id);
+
+static struct i2c_driver aic32x4_i2c_driver = {
+       .driver = {
+               .name = "tlv320aic32x4",
+               .of_match_table = aic32x4_of_id,
+       },
+       .probe =    aic32x4_i2c_probe,
+       .remove =   aic32x4_i2c_remove,
+       .id_table = aic32x4_i2c_id,
+};
+
+module_i2c_driver(aic32x4_i2c_driver);
+
+MODULE_DESCRIPTION("ASoC TLV320AIC32x4 codec driver I2C");
+MODULE_AUTHOR("Jeremy McDermond <nh6z@nh6z.net>");
+MODULE_LICENSE("GPL");
diff --git a/sound/soc/codecs/tlv320aic32x4-spi.c b/sound/soc/codecs/tlv320aic32x4-spi.c
new file mode 100644 (file)
index 0000000..724fcdd
--- /dev/null
@@ -0,0 +1,76 @@
+/*
+ * linux/sound/soc/codecs/tlv320aic32x4-spi.c
+ *
+ * Copyright 2011 NW Digital Radio
+ *
+ * Author: Jeremy McDermond <nh6z@nh6z.net>
+ *
+ * Based on sound/soc/codecs/wm8974 and TI driver for kernel 2.6.27.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/spi/spi.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/regmap.h>
+#include <sound/soc.h>
+
+#include "tlv320aic32x4.h"
+
+static int aic32x4_spi_probe(struct spi_device *spi)
+{
+       struct regmap *regmap;
+       struct regmap_config config;
+
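+       /*
+        * Start from the shared aic32x4 regmap config; on SPI the first byte
+        * carries a 7-bit register address plus one pad bit, with the LSB
+        * used as the read flag (read_flag_mask = 0x01).
+        */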
+       config = aic32x4_regmap_config;
+       config.reg_bits = 7;
+       config.pad_bits = 1;
+       config.val_bits = 8;
+       config.read_flag_mask = 0x01;
+
+       regmap = devm_regmap_init_spi(spi, &config);
+       return aic32x4_probe(&spi->dev, regmap);
+}
+
+static int aic32x4_spi_remove(struct spi_device *spi)
+{
+       return aic32x4_remove(&spi->dev);
+}
+
+static const struct spi_device_id aic32x4_spi_id[] = {
+       { "tlv320aic32x4", 0 },
+       { /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(spi, aic32x4_spi_id);
+
+static const struct of_device_id aic32x4_of_id[] = {
+       { .compatible = "ti,tlv320aic32x4", },
+       { /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, aic32x4_of_id);
+
+static struct spi_driver aic32x4_spi_driver = {
+       .driver = {
+               .name = "tlv320aic32x4",
+               .owner = THIS_MODULE,
+               .of_match_table = aic32x4_of_id,
+       },
+       .probe =    aic32x4_spi_probe,
+       .remove =   aic32x4_spi_remove,
+       .id_table = aic32x4_spi_id,
+};
+
+module_spi_driver(aic32x4_spi_driver);
+
+MODULE_DESCRIPTION("ASoC TLV320AIC32x4 codec driver SPI");
+MODULE_AUTHOR("Jeremy McDermond <nh6z@nh6z.net>");
+MODULE_LICENSE("GPL");
index f2d3191..85d4978 100644 (file)
@@ -30,7 +30,6 @@
 #include <linux/pm.h>
 #include <linux/gpio.h>
 #include <linux/of_gpio.h>
-#include <linux/i2c.h>
 #include <linux/cdev.h>
 #include <linux/slab.h>
 #include <linux/clk.h>
@@ -160,7 +159,10 @@ static const struct aic32x4_rate_divs aic32x4_divs[] = {
        /* 48k rate */
        {AIC32X4_FREQ_12000000, 48000, 1, 8, 1920, 128, 2, 8, 128, 2, 8, 4},
        {AIC32X4_FREQ_24000000, 48000, 2, 8, 1920, 128, 8, 2, 64, 8, 4, 4},
-       {AIC32X4_FREQ_25000000, 48000, 2, 7, 8643, 128, 8, 2, 64, 8, 4, 4}
+       {AIC32X4_FREQ_25000000, 48000, 2, 7, 8643, 128, 8, 2, 64, 8, 4, 4},
+
+       /* 96k rate */
+       {AIC32X4_FREQ_25000000, 96000, 2, 7, 8643, 64, 4, 4, 64, 4, 4, 1},
 };
 
 static const struct snd_kcontrol_new hpl_output_mixer_controls[] = {
@@ -181,16 +183,71 @@ static const struct snd_kcontrol_new lor_output_mixer_controls[] = {
        SOC_DAPM_SINGLE("R_DAC Switch", AIC32X4_LORROUTE, 3, 1, 0),
 };
 
-static const struct snd_kcontrol_new left_input_mixer_controls[] = {
-       SOC_DAPM_SINGLE("IN1_L P Switch", AIC32X4_LMICPGAPIN, 6, 1, 0),
-       SOC_DAPM_SINGLE("IN2_L P Switch", AIC32X4_LMICPGAPIN, 4, 1, 0),
-       SOC_DAPM_SINGLE("IN3_L P Switch", AIC32X4_LMICPGAPIN, 2, 1, 0),
+static const char * const resistor_text[] = {
+       "Off", "10 kOhm", "20 kOhm", "40 kOhm",
 };
 
-static const struct snd_kcontrol_new right_input_mixer_controls[] = {
-       SOC_DAPM_SINGLE("IN1_R P Switch", AIC32X4_RMICPGAPIN, 6, 1, 0),
-       SOC_DAPM_SINGLE("IN2_R P Switch", AIC32X4_RMICPGAPIN, 4, 1, 0),
-       SOC_DAPM_SINGLE("IN3_R P Switch", AIC32X4_RMICPGAPIN, 2, 1, 0),
+/* Left mixer pins */
+static SOC_ENUM_SINGLE_DECL(in1l_lpga_p_enum, AIC32X4_LMICPGAPIN, 6, resistor_text);
+static SOC_ENUM_SINGLE_DECL(in2l_lpga_p_enum, AIC32X4_LMICPGAPIN, 4, resistor_text);
+static SOC_ENUM_SINGLE_DECL(in3l_lpga_p_enum, AIC32X4_LMICPGAPIN, 2, resistor_text);
+static SOC_ENUM_SINGLE_DECL(in1r_lpga_p_enum, AIC32X4_LMICPGAPIN, 0, resistor_text);
+
+static SOC_ENUM_SINGLE_DECL(cml_lpga_n_enum, AIC32X4_LMICPGANIN, 6, resistor_text);
+static SOC_ENUM_SINGLE_DECL(in2r_lpga_n_enum, AIC32X4_LMICPGANIN, 4, resistor_text);
+static SOC_ENUM_SINGLE_DECL(in3r_lpga_n_enum, AIC32X4_LMICPGANIN, 2, resistor_text);
+
+static const struct snd_kcontrol_new in1l_to_lmixer_controls[] = {
+       SOC_DAPM_ENUM("IN1_L L+ Switch", in1l_lpga_p_enum),
+};
+static const struct snd_kcontrol_new in2l_to_lmixer_controls[] = {
+       SOC_DAPM_ENUM("IN2_L L+ Switch", in2l_lpga_p_enum),
+};
+static const struct snd_kcontrol_new in3l_to_lmixer_controls[] = {
+       SOC_DAPM_ENUM("IN3_L L+ Switch", in3l_lpga_p_enum),
+};
+static const struct snd_kcontrol_new in1r_to_lmixer_controls[] = {
+       SOC_DAPM_ENUM("IN1_R L+ Switch", in1r_lpga_p_enum),
+};
+static const struct snd_kcontrol_new cml_to_lmixer_controls[] = {
+       SOC_DAPM_ENUM("CM_L L- Switch", cml_lpga_n_enum),
+};
+static const struct snd_kcontrol_new in2r_to_lmixer_controls[] = {
+       SOC_DAPM_ENUM("IN2_R L- Switch", in2r_lpga_n_enum),
+};
+static const struct snd_kcontrol_new in3r_to_lmixer_controls[] = {
+       SOC_DAPM_ENUM("IN3_R L- Switch", in3r_lpga_n_enum),
+};
+
+/*  Right mixer pins */
+static SOC_ENUM_SINGLE_DECL(in1r_rpga_p_enum, AIC32X4_RMICPGAPIN, 6, resistor_text);
+static SOC_ENUM_SINGLE_DECL(in2r_rpga_p_enum, AIC32X4_RMICPGAPIN, 4, resistor_text);
+static SOC_ENUM_SINGLE_DECL(in3r_rpga_p_enum, AIC32X4_RMICPGAPIN, 2, resistor_text);
+static SOC_ENUM_SINGLE_DECL(in2l_rpga_p_enum, AIC32X4_RMICPGAPIN, 0, resistor_text);
+static SOC_ENUM_SINGLE_DECL(cmr_rpga_n_enum, AIC32X4_RMICPGANIN, 6, resistor_text);
+static SOC_ENUM_SINGLE_DECL(in1l_rpga_n_enum, AIC32X4_RMICPGANIN, 4, resistor_text);
+static SOC_ENUM_SINGLE_DECL(in3l_rpga_n_enum, AIC32X4_RMICPGANIN, 2, resistor_text);
+
+static const struct snd_kcontrol_new in1r_to_rmixer_controls[] = {
+       SOC_DAPM_ENUM("IN1_R R+ Switch", in1r_rpga_p_enum),
+};
+static const struct snd_kcontrol_new in2r_to_rmixer_controls[] = {
+       SOC_DAPM_ENUM("IN2_R R+ Switch", in2r_rpga_p_enum),
+};
+static const struct snd_kcontrol_new in3r_to_rmixer_controls[] = {
+       SOC_DAPM_ENUM("IN3_R R+ Switch", in3r_rpga_p_enum),
+};
+static const struct snd_kcontrol_new in2l_to_rmixer_controls[] = {
+       SOC_DAPM_ENUM("IN2_L R+ Switch", in2l_rpga_p_enum),
+};
+static const struct snd_kcontrol_new cmr_to_rmixer_controls[] = {
+       SOC_DAPM_ENUM("CM_R R- Switch", cmr_rpga_n_enum),
+};
+static const struct snd_kcontrol_new in1l_to_rmixer_controls[] = {
+       SOC_DAPM_ENUM("IN1_L R- Switch", in1l_rpga_n_enum),
+};
+static const struct snd_kcontrol_new in3l_to_rmixer_controls[] = {
+       SOC_DAPM_ENUM("IN3_L R- Switch", in3l_rpga_n_enum),
 };
 
 static const struct snd_soc_dapm_widget aic32x4_dapm_widgets[] = {
@@ -214,14 +271,39 @@ static const struct snd_soc_dapm_widget aic32x4_dapm_widgets[] = {
                           &lor_output_mixer_controls[0],
                           ARRAY_SIZE(lor_output_mixer_controls)),
        SND_SOC_DAPM_PGA("LOR Power", AIC32X4_OUTPWRCTL, 2, 0, NULL, 0),
-       SND_SOC_DAPM_MIXER("Left Input Mixer", SND_SOC_NOPM, 0, 0,
-                          &left_input_mixer_controls[0],
-                          ARRAY_SIZE(left_input_mixer_controls)),
-       SND_SOC_DAPM_MIXER("Right Input Mixer", SND_SOC_NOPM, 0, 0,
-                          &right_input_mixer_controls[0],
-                          ARRAY_SIZE(right_input_mixer_controls)),
-       SND_SOC_DAPM_ADC("Left ADC", "Left Capture", AIC32X4_ADCSETUP, 7, 0),
+
        SND_SOC_DAPM_ADC("Right ADC", "Right Capture", AIC32X4_ADCSETUP, 6, 0),
+       SND_SOC_DAPM_MUX("IN1_R to Right Mixer Positive Resistor", SND_SOC_NOPM, 0, 0,
+                       in1r_to_rmixer_controls),
+       SND_SOC_DAPM_MUX("IN2_R to Right Mixer Positive Resistor", SND_SOC_NOPM, 0, 0,
+                       in2r_to_rmixer_controls),
+       SND_SOC_DAPM_MUX("IN3_R to Right Mixer Positive Resistor", SND_SOC_NOPM, 0, 0,
+                       in3r_to_rmixer_controls),
+       SND_SOC_DAPM_MUX("IN2_L to Right Mixer Positive Resistor", SND_SOC_NOPM, 0, 0,
+                       in2l_to_rmixer_controls),
+       SND_SOC_DAPM_MUX("CM_R to Right Mixer Negative Resistor", SND_SOC_NOPM, 0, 0,
+                       cmr_to_rmixer_controls),
+       SND_SOC_DAPM_MUX("IN1_L to Right Mixer Negative Resistor", SND_SOC_NOPM, 0, 0,
+                       in1l_to_rmixer_controls),
+       SND_SOC_DAPM_MUX("IN3_L to Right Mixer Negative Resistor", SND_SOC_NOPM, 0, 0,
+                       in3l_to_rmixer_controls),
+
+       SND_SOC_DAPM_ADC("Left ADC", "Left Capture", AIC32X4_ADCSETUP, 7, 0),
+       SND_SOC_DAPM_MUX("IN1_L to Left Mixer Positive Resistor", SND_SOC_NOPM, 0, 0,
+                       in1l_to_lmixer_controls),
+       SND_SOC_DAPM_MUX("IN2_L to Left Mixer Positive Resistor", SND_SOC_NOPM, 0, 0,
+                       in2l_to_lmixer_controls),
+       SND_SOC_DAPM_MUX("IN3_L to Left Mixer Positive Resistor", SND_SOC_NOPM, 0, 0,
+                       in3l_to_lmixer_controls),
+       SND_SOC_DAPM_MUX("IN1_R to Left Mixer Positive Resistor", SND_SOC_NOPM, 0, 0,
+                       in1r_to_lmixer_controls),
+       SND_SOC_DAPM_MUX("CM_L to Left Mixer Negative Resistor", SND_SOC_NOPM, 0, 0,
+                       cml_to_lmixer_controls),
+       SND_SOC_DAPM_MUX("IN2_R to Left Mixer Negative Resistor", SND_SOC_NOPM, 0, 0,
+                       in2r_to_lmixer_controls),
+       SND_SOC_DAPM_MUX("IN3_R to Left Mixer Negative Resistor", SND_SOC_NOPM, 0, 0,
+                       in3r_to_lmixer_controls),
+
        SND_SOC_DAPM_MICBIAS("Mic Bias", AIC32X4_MICBIAS, 6, 0),
 
        SND_SOC_DAPM_OUTPUT("HPL"),
@@ -261,19 +343,77 @@ static const struct snd_soc_dapm_route aic32x4_dapm_routes[] = {
        {"LOR Power", NULL, "LOR Output Mixer"},
        {"LOR", NULL, "LOR Power"},
 
-       /* Left input */
-       {"Left Input Mixer", "IN1_L P Switch", "IN1_L"},
-       {"Left Input Mixer", "IN2_L P Switch", "IN2_L"},
-       {"Left Input Mixer", "IN3_L P Switch", "IN3_L"},
-
-       {"Left ADC", NULL, "Left Input Mixer"},
-
        /* Right Input */
-       {"Right Input Mixer", "IN1_R P Switch", "IN1_R"},
-       {"Right Input Mixer", "IN2_R P Switch", "IN2_R"},
-       {"Right Input Mixer", "IN3_R P Switch", "IN3_R"},
-
-       {"Right ADC", NULL, "Right Input Mixer"},
+       {"Right ADC", NULL, "IN1_R to Right Mixer Positive Resistor"},
+       {"IN1_R to Right Mixer Positive Resistor", "10 kOhm", "IN1_R"},
+       {"IN1_R to Right Mixer Positive Resistor", "20 kOhm", "IN1_R"},
+       {"IN1_R to Right Mixer Positive Resistor", "40 kOhm", "IN1_R"},
+
+       {"Right ADC", NULL, "IN2_R to Right Mixer Positive Resistor"},
+       {"IN2_R to Right Mixer Positive Resistor", "10 kOhm", "IN2_R"},
+       {"IN2_R to Right Mixer Positive Resistor", "20 kOhm", "IN2_R"},
+       {"IN2_R to Right Mixer Positive Resistor", "40 kOhm", "IN2_R"},
+
+       {"Right ADC", NULL, "IN3_R to Right Mixer Positive Resistor"},
+       {"IN3_R to Right Mixer Positive Resistor", "10 kOhm", "IN3_R"},
+       {"IN3_R to Right Mixer Positive Resistor", "20 kOhm", "IN3_R"},
+       {"IN3_R to Right Mixer Positive Resistor", "40 kOhm", "IN3_R"},
+
+       {"Right ADC", NULL, "IN2_L to Right Mixer Positive Resistor"},
+       {"IN2_L to Right Mixer Positive Resistor", "10 kOhm", "IN2_L"},
+       {"IN2_L to Right Mixer Positive Resistor", "20 kOhm", "IN2_L"},
+       {"IN2_L to Right Mixer Positive Resistor", "40 kOhm", "IN2_L"},
+
+       {"Right ADC", NULL, "CM_R to Right Mixer Negative Resistor"},
+       {"CM_R to Right Mixer Negative Resistor", "10 kOhm", "CM_R"},
+       {"CM_R to Right Mixer Negative Resistor", "20 kOhm", "CM_R"},
+       {"CM_R to Right Mixer Negative Resistor", "40 kOhm", "CM_R"},
+
+       {"Right ADC", NULL, "IN1_L to Right Mixer Negative Resistor"},
+       {"IN1_L to Right Mixer Negative Resistor", "10 kOhm", "IN1_L"},
+       {"IN1_L to Right Mixer Negative Resistor", "20 kOhm", "IN1_L"},
+       {"IN1_L to Right Mixer Negative Resistor", "40 kOhm", "IN1_L"},
+
+       {"Right ADC", NULL, "IN3_L to Right Mixer Negative Resistor"},
+       {"IN3_L to Right Mixer Negative Resistor", "10 kOhm", "IN3_L"},
+       {"IN3_L to Right Mixer Negative Resistor", "20 kOhm", "IN3_L"},
+       {"IN3_L to Right Mixer Negative Resistor", "40 kOhm", "IN3_L"},
+
+       /* Left Input */
+       {"Left ADC", NULL, "IN1_L to Left Mixer Positive Resistor"},
+       {"IN1_L to Left Mixer Positive Resistor", "10 kOhm", "IN1_L"},
+       {"IN1_L to Left Mixer Positive Resistor", "20 kOhm", "IN1_L"},
+       {"IN1_L to Left Mixer Positive Resistor", "40 kOhm", "IN1_L"},
+
+       {"Left ADC", NULL, "IN2_L to Left Mixer Positive Resistor"},
+       {"IN2_L to Left Mixer Positive Resistor", "10 kOhm", "IN2_L"},
+       {"IN2_L to Left Mixer Positive Resistor", "20 kOhm", "IN2_L"},
+       {"IN2_L to Left Mixer Positive Resistor", "40 kOhm", "IN2_L"},
+
+       {"Left ADC", NULL, "IN3_L to Left Mixer Positive Resistor"},
+       {"IN3_L to Left Mixer Positive Resistor", "10 kOhm", "IN3_L"},
+       {"IN3_L to Left Mixer Positive Resistor", "20 kOhm", "IN3_L"},
+       {"IN3_L to Left Mixer Positive Resistor", "40 kOhm", "IN3_L"},
+
+       {"Left ADC", NULL, "IN1_R to Left Mixer Positive Resistor"},
+       {"IN1_R to Left Mixer Positive Resistor", "10 kOhm", "IN1_R"},
+       {"IN1_R to Left Mixer Positive Resistor", "20 kOhm", "IN1_R"},
+       {"IN1_R to Left Mixer Positive Resistor", "40 kOhm", "IN1_R"},
+
+       {"Left ADC", NULL, "CM_L to Left Mixer Negative Resistor"},
+       {"CM_L to Left Mixer Negative Resistor", "10 kOhm", "CM_L"},
+       {"CM_L to Left Mixer Negative Resistor", "20 kOhm", "CM_L"},
+       {"CM_L to Left Mixer Negative Resistor", "40 kOhm", "CM_L"},
+
+       {"Left ADC", NULL, "IN2_R to Left Mixer Negative Resistor"},
+       {"IN2_R to Left Mixer Negative Resistor", "10 kOhm", "IN2_R"},
+       {"IN2_R to Left Mixer Negative Resistor", "20 kOhm", "IN2_R"},
+       {"IN2_R to Left Mixer Negative Resistor", "40 kOhm", "IN2_R"},
+
+       {"Left ADC", NULL, "IN3_R to Left Mixer Negative Resistor"},
+       {"IN3_R to Left Mixer Negative Resistor", "10 kOhm", "IN3_R"},
+       {"IN3_R to Left Mixer Negative Resistor", "20 kOhm", "IN3_R"},
+       {"IN3_R to Left Mixer Negative Resistor", "40 kOhm", "IN3_R"},
 };
 
 static const struct regmap_range_cfg aic32x4_regmap_pages[] = {
@@ -287,14 +427,12 @@ static const struct regmap_range_cfg aic32x4_regmap_pages[] = {
        },
 };
 
-static const struct regmap_config aic32x4_regmap = {
-       .reg_bits = 8,
-       .val_bits = 8,
-
+const struct regmap_config aic32x4_regmap_config = {
        .max_register = AIC32X4_RMICPGAVOL,
        .ranges = aic32x4_regmap_pages,
        .num_ranges = ARRAY_SIZE(aic32x4_regmap_pages),
 };
+EXPORT_SYMBOL(aic32x4_regmap_config);
 
 static inline int aic32x4_get_divs(int mclk, int rate)
 {
@@ -567,7 +705,7 @@ static int aic32x4_set_bias_level(struct snd_soc_codec *codec,
        return 0;
 }
 
-#define AIC32X4_RATES  SNDRV_PCM_RATE_8000_48000
+#define AIC32X4_RATES  SNDRV_PCM_RATE_8000_96000
 #define AIC32X4_FORMATS        (SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S20_3LE \
                         | SNDRV_PCM_FMTBIT_S24_3LE | SNDRV_PCM_FMTBIT_S32_LE)
 
@@ -596,7 +734,7 @@ static struct snd_soc_dai_driver aic32x4_dai = {
        .symmetric_rates = 1,
 };
 
-static int aic32x4_probe(struct snd_soc_codec *codec)
+static int aic32x4_codec_probe(struct snd_soc_codec *codec)
 {
        struct aic32x4_priv *aic32x4 = snd_soc_codec_get_drvdata(codec);
        u32 tmp_reg;
@@ -655,7 +793,7 @@ static int aic32x4_probe(struct snd_soc_codec *codec)
 }
 
 static struct snd_soc_codec_driver soc_codec_dev_aic32x4 = {
-       .probe = aic32x4_probe,
+       .probe = aic32x4_codec_probe,
        .set_bias_level = aic32x4_set_bias_level,
        .suspend_bias_off = true,
 
@@ -777,24 +915,22 @@ error_ldo:
        return ret;
 }
 
-static int aic32x4_i2c_probe(struct i2c_client *i2c,
-                            const struct i2c_device_id *id)
+int aic32x4_probe(struct device *dev, struct regmap *regmap)
 {
-       struct aic32x4_pdata *pdata = i2c->dev.platform_data;
        struct aic32x4_priv *aic32x4;
-       struct device_node *np = i2c->dev.of_node;
+       struct aic32x4_pdata *pdata = dev->platform_data;
+       struct device_node *np = dev->of_node;
        int ret;
 
-       aic32x4 = devm_kzalloc(&i2c->dev, sizeof(struct aic32x4_priv),
+       if (IS_ERR(regmap))
+               return PTR_ERR(regmap);
+
+       aic32x4 = devm_kzalloc(dev, sizeof(struct aic32x4_priv),
                               GFP_KERNEL);
        if (aic32x4 == NULL)
                return -ENOMEM;
 
-       aic32x4->regmap = devm_regmap_init_i2c(i2c, &aic32x4_regmap);
-       if (IS_ERR(aic32x4->regmap))
-               return PTR_ERR(aic32x4->regmap);
-
-       i2c_set_clientdata(i2c, aic32x4);
+       dev_set_drvdata(dev, aic32x4);
 
        if (pdata) {
                aic32x4->power_cfg = pdata->power_cfg;
@@ -804,7 +940,7 @@ static int aic32x4_i2c_probe(struct i2c_client *i2c,
        } else if (np) {
                ret = aic32x4_parse_dt(aic32x4, np);
                if (ret) {
-                       dev_err(&i2c->dev, "Failed to parse DT node\n");
+                       dev_err(dev, "Failed to parse DT node\n");
                        return ret;
                }
        } else {
@@ -814,71 +950,48 @@ static int aic32x4_i2c_probe(struct i2c_client *i2c,
                aic32x4->rstn_gpio = -1;
        }
 
-       aic32x4->mclk = devm_clk_get(&i2c->dev, "mclk");
+       aic32x4->mclk = devm_clk_get(dev, "mclk");
        if (IS_ERR(aic32x4->mclk)) {
-               dev_err(&i2c->dev, "Failed getting the mclk. The current implementation does not support the usage of this codec without mclk\n");
+               dev_err(dev, "Failed getting the mclk. The current implementation does not support the usage of this codec without mclk\n");
                return PTR_ERR(aic32x4->mclk);
        }
 
        if (gpio_is_valid(aic32x4->rstn_gpio)) {
-               ret = devm_gpio_request_one(&i2c->dev, aic32x4->rstn_gpio,
+               ret = devm_gpio_request_one(dev, aic32x4->rstn_gpio,
                                GPIOF_OUT_INIT_LOW, "tlv320aic32x4 rstn");
                if (ret != 0)
                        return ret;
        }
 
-       ret = aic32x4_setup_regulators(&i2c->dev, aic32x4);
+       ret = aic32x4_setup_regulators(dev, aic32x4);
        if (ret) {
-               dev_err(&i2c->dev, "Failed to setup regulators\n");
+               dev_err(dev, "Failed to setup regulators\n");
                return ret;
        }
 
-       ret = snd_soc_register_codec(&i2c->dev,
+       ret = snd_soc_register_codec(dev,
                        &soc_codec_dev_aic32x4, &aic32x4_dai, 1);
        if (ret) {
-               dev_err(&i2c->dev, "Failed to register codec\n");
+               dev_err(dev, "Failed to register codec\n");
                aic32x4_disable_regulators(aic32x4);
                return ret;
        }
 
-       i2c_set_clientdata(i2c, aic32x4);
-
        return 0;
 }
+EXPORT_SYMBOL(aic32x4_probe);
 
-static int aic32x4_i2c_remove(struct i2c_client *client)
+int aic32x4_remove(struct device *dev)
 {
-       struct aic32x4_priv *aic32x4 = i2c_get_clientdata(client);
+       struct aic32x4_priv *aic32x4 = dev_get_drvdata(dev);
 
        aic32x4_disable_regulators(aic32x4);
 
-       snd_soc_unregister_codec(&client->dev);
+       snd_soc_unregister_codec(dev);
+
        return 0;
 }
-
-static const struct i2c_device_id aic32x4_i2c_id[] = {
-       { "tlv320aic32x4", 0 },
-       { }
-};
-MODULE_DEVICE_TABLE(i2c, aic32x4_i2c_id);
-
-static const struct of_device_id aic32x4_of_id[] = {
-       { .compatible = "ti,tlv320aic32x4", },
-       { /* senitel */ }
-};
-MODULE_DEVICE_TABLE(of, aic32x4_of_id);
-
-static struct i2c_driver aic32x4_i2c_driver = {
-       .driver = {
-               .name = "tlv320aic32x4",
-               .of_match_table = aic32x4_of_id,
-       },
-       .probe =    aic32x4_i2c_probe,
-       .remove =   aic32x4_i2c_remove,
-       .id_table = aic32x4_i2c_id,
-};
-
-module_i2c_driver(aic32x4_i2c_driver);
+EXPORT_SYMBOL(aic32x4_remove);
 
 MODULE_DESCRIPTION("ASoC tlv320aic32x4 codec driver");
 MODULE_AUTHOR("Javier Martin <javier.martin@vista-silicon.com>");
index 995f033..a197dd5 100644 (file)
 #ifndef _TLV320AIC32X4_H
 #define _TLV320AIC32X4_H
 
+struct device;
+struct regmap_config;
+
+extern const struct regmap_config aic32x4_regmap_config;
+int aic32x4_probe(struct device *dev, struct regmap *regmap);
+int aic32x4_remove(struct device *dev);
+
 /* tlv320aic32x4 register space (in decimal to match datasheet) */
 
 #define AIC32X4_PAGE1          128
index bc3de2e..1f70810 100644 (file)
@@ -824,7 +824,7 @@ static int twl6040_set_bias_level(struct snd_soc_codec *codec,
 {
        struct twl6040 *twl6040 = codec->control_data;
        struct twl6040_data *priv = snd_soc_codec_get_drvdata(codec);
-       int ret;
+       int ret = 0;
 
        switch (level) {
        case SND_SOC_BIAS_ON:
@@ -832,12 +832,16 @@ static int twl6040_set_bias_level(struct snd_soc_codec *codec,
        case SND_SOC_BIAS_PREPARE:
                break;
        case SND_SOC_BIAS_STANDBY:
-               if (priv->codec_powered)
+               if (priv->codec_powered) {
+                       /* Select low power PLL in standby */
+                       ret = twl6040_set_pll(twl6040, TWL6040_SYSCLK_SEL_LPPLL,
+                                             32768, 19200000);
                        break;
+               }
 
                ret = twl6040_power(twl6040, 1);
                if (ret)
-                       return ret;
+                       break;
 
                priv->codec_powered = 1;
 
@@ -853,7 +857,7 @@ static int twl6040_set_bias_level(struct snd_soc_codec *codec,
                break;
        }
 
-       return 0;
+       return ret;
 }
 
 static int twl6040_startup(struct snd_pcm_substream *substream,
@@ -983,9 +987,9 @@ static void twl6040_mute_path(struct snd_soc_codec *codec, enum twl6040_dai_id i
                if (mute) {
                        /* Power down drivers and DACs */
                        hflctl &= ~(TWL6040_HFDACENA | TWL6040_HFPGAENA |
-                                   TWL6040_HFDRVENA);
+                                   TWL6040_HFDRVENA | TWL6040_HFSWENA);
                        hfrctl &= ~(TWL6040_HFDACENA | TWL6040_HFPGAENA |
-                                   TWL6040_HFDRVENA);
+                                   TWL6040_HFDRVENA | TWL6040_HFSWENA);
                }
 
                twl6040_reg_write(twl6040, TWL6040_REG_HFLCTL, hflctl);
index fc164d6..f3109da 100644 (file)
@@ -3793,9 +3793,8 @@ static int wm8962_runtime_resume(struct device *dev)
        ret = regulator_bulk_enable(ARRAY_SIZE(wm8962->supplies),
                                    wm8962->supplies);
        if (ret != 0) {
-               dev_err(dev,
-                       "Failed to enable supplies: %d\n", ret);
-               return ret;
+               dev_err(dev, "Failed to enable supplies: %d\n", ret);
+               goto disable_clock;
        }
 
        regcache_cache_only(wm8962->regmap, false);
@@ -3833,6 +3832,10 @@ static int wm8962_runtime_resume(struct device *dev)
        msleep(5);
 
        return 0;
+
+disable_clock:
+       clk_disable_unprepare(wm8962->pdata.mclk);
+       return ret;
 }
 
 static int wm8962_runtime_suspend(struct device *dev)
index 910aafd..e63a318 100644 (file)
@@ -16,9 +16,9 @@
 #include <asm/types.h>
 #include <sound/soc.h>
 
-#define WM8962_SYSCLK_MCLK 1
-#define WM8962_SYSCLK_FLL  2
-#define WM8962_SYSCLK_PLL3 3
+#define WM8962_SYSCLK_MCLK 0
+#define WM8962_SYSCLK_FLL  1
+#define WM8962_SYSCLK_PLL3 2
 
 #define WM8962_FLL  1
 
index 2389ab4..466492b 100644 (file)
@@ -643,6 +643,7 @@ MODULE_DEVICE_TABLE(of, asoc_simple_of_match);
 static struct platform_driver asoc_simple_card = {
        .driver = {
                .name = "asoc-simple-card",
+               .pm = &snd_soc_pm_ops,
                .of_match_table = asoc_simple_of_match,
        },
        .probe = asoc_simple_card_probe,
index 132bb83..bc3c7b5 100644 (file)
@@ -1,6 +1,7 @@
 config SND_KIRKWOOD_SOC
        tristate "SoC Audio for the Marvell Kirkwood and Dove chips"
        depends on ARCH_DOVE || ARCH_MVEBU || COMPILE_TEST
+       depends on HAS_DMA
        help
          Say Y or M if you want to add support for codecs attached to
          the Kirkwood I2S interface. You will also need to select the
index f7e789e..3abf51c 100644 (file)
@@ -43,6 +43,7 @@ config SND_SOC_MT8173_RT5650_RT5676
        depends on SND_SOC_MEDIATEK && I2C
        select SND_SOC_RT5645
        select SND_SOC_RT5677
+       select SND_SOC_HDMI_CODEC
        help
          This adds ASoC driver for Mediatek MT8173 boards
          with the RT5650 and RT5676 codecs.
index 5c4c58c..bb59392 100644 (file)
@@ -134,7 +134,9 @@ static struct snd_soc_dai_link_component mt8173_rt5650_rt5676_codecs[] = {
 enum {
        DAI_LINK_PLAYBACK,
        DAI_LINK_CAPTURE,
+       DAI_LINK_HDMI,
        DAI_LINK_CODEC_I2S,
+       DAI_LINK_HDMI_I2S,
        DAI_LINK_INTERCODEC
 };
 
@@ -161,6 +163,16 @@ static struct snd_soc_dai_link mt8173_rt5650_rt5676_dais[] = {
                .dynamic = 1,
                .dpcm_capture = 1,
        },
+       [DAI_LINK_HDMI] = {
+               .name = "HDMI",
+               .stream_name = "HDMI PCM",
+               .cpu_dai_name = "HDMI",
+               .codec_name = "snd-soc-dummy",
+               .codec_dai_name = "snd-soc-dummy-dai",
+               .trigger = {SND_SOC_DPCM_TRIGGER_POST, SND_SOC_DPCM_TRIGGER_POST},
+               .dynamic = 1,
+               .dpcm_playback = 1,
+       },
 
        /* Back End DAI links */
        [DAI_LINK_CODEC_I2S] = {
@@ -177,6 +189,13 @@ static struct snd_soc_dai_link mt8173_rt5650_rt5676_dais[] = {
                .dpcm_playback = 1,
                .dpcm_capture = 1,
        },
+       [DAI_LINK_HDMI_I2S] = {
+               .name = "HDMI BE",
+               .cpu_dai_name = "HDMIO",
+               .no_pcm = 1,
+               .codec_dai_name = "i2s-hifi",
+               .dpcm_playback = 1,
+       },
        /* rt5676 <-> rt5650 intercodec link: Sets rt5676 I2S2 as master */
        [DAI_LINK_INTERCODEC] = {
                .name = "rt5650_rt5676 intercodec",
@@ -251,6 +270,14 @@ static int mt8173_rt5650_rt5676_dev_probe(struct platform_device *pdev)
        mt8173_rt5650_rt5676_dais[DAI_LINK_INTERCODEC].codec_of_node =
                mt8173_rt5650_rt5676_codecs[1].of_node;
 
+       mt8173_rt5650_rt5676_dais[DAI_LINK_HDMI_I2S].codec_of_node =
+               of_parse_phandle(pdev->dev.of_node, "mediatek,audio-codec", 2);
+       if (!mt8173_rt5650_rt5676_dais[DAI_LINK_HDMI_I2S].codec_of_node) {
+               dev_err(&pdev->dev,
+                       "Property 'audio-codec' missing or invalid\n");
+               return -EINVAL;
+       }
+
        card->dev = &pdev->dev;
        platform_set_drvdata(pdev, card);
 
index bb09bb1..a27a667 100644 (file)
@@ -85,12 +85,29 @@ static int mt8173_rt5650_init(struct snd_soc_pcm_runtime *runtime)
 {
        struct snd_soc_card *card = runtime->card;
        struct snd_soc_codec *codec = runtime->codec_dais[0]->codec;
+       const char *codec_capture_dai = runtime->codec_dais[1]->name;
        int ret;
 
        rt5645_sel_asrc_clk_src(codec,
-                               RT5645_DA_STEREO_FILTER |
-                               RT5645_AD_STEREO_FILTER,
+                               RT5645_DA_STEREO_FILTER,
                                RT5645_CLK_SEL_I2S1_ASRC);
+
+       if (!strcmp(codec_capture_dai, "rt5645-aif1")) {
+               rt5645_sel_asrc_clk_src(codec,
+                                       RT5645_AD_STEREO_FILTER,
+                                       RT5645_CLK_SEL_I2S1_ASRC);
+       } else if (!strcmp(codec_capture_dai, "rt5645-aif2")) {
+               rt5645_sel_asrc_clk_src(codec,
+                                       RT5645_AD_STEREO_FILTER,
+                                       RT5645_CLK_SEL_I2S2_ASRC);
+       } else {
+               dev_warn(card->dev,
+                        "Only one DAI codec found in DTS, enabling rt5645 AD filter\n");
+               rt5645_sel_asrc_clk_src(codec,
+                                       RT5645_AD_STEREO_FILTER,
+                                       RT5645_CLK_SEL_I2S1_ASRC);
+       }
+
        /* enable jack detection */
        ret = snd_soc_card_jack_new(card, "Headset Jack",
                                    SND_JACK_HEADPHONE | SND_JACK_MICROPHONE |
@@ -110,6 +127,11 @@ static int mt8173_rt5650_init(struct snd_soc_pcm_runtime *runtime)
 
 static struct snd_soc_dai_link_component mt8173_rt5650_codecs[] = {
        {
+               /* Playback */
+               .dai_name = "rt5645-aif1",
+       },
+       {
+               /* Capture */
                .dai_name = "rt5645-aif1",
        },
 };
@@ -149,7 +171,7 @@ static struct snd_soc_dai_link mt8173_rt5650_dais[] = {
                .cpu_dai_name = "I2S",
                .no_pcm = 1,
                .codecs = mt8173_rt5650_codecs,
-               .num_codecs = 1,
+               .num_codecs = 2,
                .init = mt8173_rt5650_init,
                .dai_fmt = SND_SOC_DAIFMT_I2S | SND_SOC_DAIFMT_NB_NF |
                           SND_SOC_DAIFMT_CBS_CFS,
@@ -177,6 +199,8 @@ static int mt8173_rt5650_dev_probe(struct platform_device *pdev)
 {
        struct snd_soc_card *card = &mt8173_rt5650_card;
        struct device_node *platform_node;
+       struct device_node *np;
+       const char *codec_capture_dai;
        int i, ret;
 
        platform_node = of_parse_phandle(pdev->dev.of_node,
@@ -199,6 +223,26 @@ static int mt8173_rt5650_dev_probe(struct platform_device *pdev)
                        "Property 'audio-codec' missing or invalid\n");
                return -EINVAL;
        }
+       mt8173_rt5650_codecs[1].of_node = mt8173_rt5650_codecs[0].of_node;
+
+       if (of_find_node_by_name(platform_node, "codec-capture")) {
+               np = of_get_child_by_name(pdev->dev.of_node, "codec-capture");
+               if (!np) {
+                       dev_err(&pdev->dev,
+                               "%s: Can't find codec-capture DT node\n",
+                               __func__);
+                       return -EINVAL;
+               }
+               ret = snd_soc_of_get_dai_name(np, &codec_capture_dai);
+               if (ret < 0) {
+                       dev_err(&pdev->dev,
+                               "%s codec_capture_dai name fail %d\n",
+                               __func__, ret);
+                       return ret;
+               }
+               mt8173_rt5650_codecs[1].dai_name = codec_capture_dai;
+       }
+
        card->dev = &pdev->dev;
        platform_set_drvdata(pdev, card);
 
index f1c58a2..2b5df2e 100644 (file)
 #define AFE_TDM_CON1_WLEN_32BIT                (0x2 << 8)
 #define AFE_TDM_CON1_MSB_ALIGNED       (0x1 << 4)
 #define AFE_TDM_CON1_1_BCK_DELAY       (0x1 << 3)
+#define AFE_TDM_CON1_LRCK_INV          (0x1 << 2)
 #define AFE_TDM_CON1_BCK_INV           (0x1 << 1)
 #define AFE_TDM_CON1_EN                        (0x1 << 0)
 
@@ -449,6 +450,7 @@ static int mtk_afe_hdmi_prepare(struct snd_pcm_substream *substream,
                              runtime->rate * runtime->channels * 32);
 
        val = AFE_TDM_CON1_BCK_INV |
+             AFE_TDM_CON1_LRCK_INV |
              AFE_TDM_CON1_1_BCK_DELAY |
              AFE_TDM_CON1_MSB_ALIGNED | /* I2S mode */
              AFE_TDM_CON1_WLEN_32BIT |
index c7563e2..4a16e77 100644 (file)
@@ -260,6 +260,10 @@ static void omap_st_on(struct omap_mcbsp *mcbsp)
        if (mcbsp->pdata->enable_st_clock)
                mcbsp->pdata->enable_st_clock(mcbsp->id, 1);
 
+       /* Disable Sidetone clock auto-gating for normal operation */
+       w = MCBSP_ST_READ(mcbsp, SYSCONFIG);
+       MCBSP_ST_WRITE(mcbsp, SYSCONFIG, w & ~(ST_AUTOIDLE));
+
        /* Enable McBSP Sidetone */
        w = MCBSP_READ(mcbsp, SSELCR);
        MCBSP_WRITE(mcbsp, SSELCR, w | SIDETONEEN);
@@ -279,6 +283,10 @@ static void omap_st_off(struct omap_mcbsp *mcbsp)
        w = MCBSP_READ(mcbsp, SSELCR);
        MCBSP_WRITE(mcbsp, SSELCR, w & ~(SIDETONEEN));
 
+       /* Enable Sidetone clock auto-gating to reduce power consumption */
+       w = MCBSP_ST_READ(mcbsp, SYSCONFIG);
+       MCBSP_ST_WRITE(mcbsp, SYSCONFIG, w | ST_AUTOIDLE);
+
        if (mcbsp->pdata->enable_st_clock)
                mcbsp->pdata->enable_st_clock(mcbsp->id, 0);
 }
index 99381a2..a84f677 100644 (file)
@@ -82,6 +82,8 @@ static int omap_pcm_hw_params(struct snd_pcm_substream *substream,
        struct dma_chan *chan;
        int err = 0;
 
+       memset(&config, 0x00, sizeof(config));
+
        dma_data = snd_soc_dai_get_dma_data(rtd->cpu_dai, substream);
 
        /* return if this is a bufferless transfer e.g.
index ec522e9..b6cb995 100644 (file)
@@ -133,3 +133,4 @@ module_platform_driver(mmp_driver);
 MODULE_AUTHOR("Leo Yan <leoy@marvell.com>");
 MODULE_DESCRIPTION("ALSA SoC Brownstone");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:brownstone-audio");
index 5c8f9db..d1661fa 100644 (file)
@@ -207,3 +207,4 @@ module_platform_driver(mioa701_wm9713_driver);
 MODULE_AUTHOR("Robert Jarzmik (rjarzmik@free.fr)");
 MODULE_DESCRIPTION("ALSA SoC WM9713 MIO A701");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:mioa701-wm9713");
index 51e790d..96df9b2 100644 (file)
@@ -248,3 +248,4 @@ module_platform_driver(mmp_pcm_driver);
 MODULE_AUTHOR("Leo Yan <leoy@marvell.com>");
 MODULE_DESCRIPTION("MMP Soc Audio DMA module");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:mmp-pcm-audio");
index eca60c2..ca8b23f 100644 (file)
@@ -482,3 +482,4 @@ module_platform_driver(asoc_mmp_sspa_driver);
 MODULE_AUTHOR("Leo Yan <leoy@marvell.com>");
 MODULE_DESCRIPTION("MMP SSPA SoC Interface");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:mmp-sspa-dai");
index 4e74d95..bcc81e9 100644 (file)
@@ -161,3 +161,4 @@ module_platform_driver(palm27x_wm9712_driver);
 MODULE_AUTHOR("Marek Vasut <marek.vasut@gmail.com>");
 MODULE_DESCRIPTION("ALSA SoC Palm T|X, T5 and LifeDrive");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:palm27x-asoc");
index da03fad..3cad990 100644 (file)
@@ -833,3 +833,4 @@ module_platform_driver(asoc_ssp_driver);
 MODULE_AUTHOR("Mark Brown <broonie@opensource.wolfsonmicro.com>");
 MODULE_DESCRIPTION("PXA SSP/PCM SoC Interface");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:pxa-ssp-dai");
index f3de615..9615e6d 100644 (file)
@@ -287,3 +287,4 @@ module_platform_driver(pxa2xx_ac97_driver);
 MODULE_AUTHOR("Nicolas Pitre");
 MODULE_DESCRIPTION("AC97 driver for the Intel PXA2xx chip");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:pxa2xx-ac97");
index 9f39039..410d48b 100644 (file)
@@ -117,3 +117,4 @@ module_platform_driver(pxa_pcm_driver);
 MODULE_AUTHOR("Nicolas Pitre");
 MODULE_DESCRIPTION("Intel PXA2xx PCM DMA module");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:pxa-pcm-audio");
index 6e86654..db000c6 100644 (file)
@@ -474,7 +474,7 @@ static int lpass_platform_pcm_new(struct snd_soc_pcm_runtime *soc_runtime)
        struct lpass_data *drvdata =
                snd_soc_platform_get_drvdata(soc_runtime->platform);
        struct lpass_variant *v = drvdata->variant;
-       int ret;
+       int ret = -EINVAL;
        struct lpass_pcm_data *data;
        size_t size = lpass_platform_pcm_hardware.buffer_bytes_max;
 
@@ -491,7 +491,7 @@ static int lpass_platform_pcm_new(struct snd_soc_pcm_runtime *soc_runtime)
                        data->rdma_ch = v->alloc_dma_channel(drvdata,
                                                SNDRV_PCM_STREAM_PLAYBACK);
 
-               if (IS_ERR_VALUE(data->rdma_ch))
+               if (data->rdma_ch < 0)
                        return data->rdma_ch;
 
                drvdata->substream[data->rdma_ch] = psubstream;
@@ -518,8 +518,10 @@ static int lpass_platform_pcm_new(struct snd_soc_pcm_runtime *soc_runtime)
                        data->wrdma_ch = v->alloc_dma_channel(drvdata,
                                                SNDRV_PCM_STREAM_CAPTURE);
 
-               if (IS_ERR_VALUE(data->wrdma_ch))
+               if (data->wrdma_ch < 0) {
+                       ret = data->wrdma_ch;
                        goto capture_alloc_err;
+               }
 
                drvdata->substream[data->wrdma_ch] = csubstream;
 
index 606399d..49354d1 100644 (file)
@@ -492,9 +492,7 @@ static void rsnd_adg_get_clkout(struct rsnd_priv *priv,
         */
        if (!count) {
                clk = clk_register_fixed_rate(dev, clkout_name[CLKOUT],
-                                             parent_clk_name,
-                                             (parent_clk_name) ?
-                                             0 : CLK_IS_ROOT, req_rate);
+                                             parent_clk_name, 0, req_rate);
                if (!IS_ERR(clk)) {
                        adg->clkout[CLKOUT] = clk;
                        of_clk_add_provider(np, of_clk_src_simple_get, clk);
@@ -506,9 +504,7 @@ static void rsnd_adg_get_clkout(struct rsnd_priv *priv,
        else {
                for (i = 0; i < CLKOUTMAX; i++) {
                        clk = clk_register_fixed_rate(dev, clkout_name[i],
-                                                     parent_clk_name,
-                                                     (parent_clk_name) ?
-                                                     0 : CLK_IS_ROOT,
+                                                     parent_clk_name, 0,
                                                      req_rate);
                        if (!IS_ERR(clk)) {
                                adg->onecell.clks       = adg->clkout;
index 7658e8f..6bc93cb 100644 (file)
@@ -316,11 +316,15 @@ static u32 rsnd_dmapp_get_id(struct rsnd_dai_stream *io,
                size = ARRAY_SIZE(gen2_id_table_cmd);
        }
 
-       if (!entry)
-               return 0xFF;
+       if ((!entry) || (size <= id)) {
+               struct device *dev = rsnd_priv_to_dev(rsnd_io_to_priv(io));
 
-       if (size <= id)
-               return 0xFF;
+               dev_err(dev, "unknown connection (%s[%d])\n",
+                       rsnd_mod_name(mod), rsnd_mod_id(mod));
+
+               /* use non-prohibited SRS number as error */
+               return 0x00; /* SSI00 */
+       }
 
        return entry[id];
 }
index fc89a67..a8f61d7 100644 (file)
@@ -276,8 +276,9 @@ struct rsnd_mod {
 /*
  * status
  *
- * 0xH0000CB0
+ * 0xH0000CBA
  *
+ * A   0: probe        1: remove
  * B   0: init         1: quit
  * C   0: start        1: stop
  *
@@ -287,19 +288,19 @@ struct rsnd_mod {
  * H   0: fallback
  * H   0: hw_params
  */
+#define __rsnd_mod_shift_probe         0
+#define __rsnd_mod_shift_remove                0
 #define __rsnd_mod_shift_init          4
 #define __rsnd_mod_shift_quit          4
 #define __rsnd_mod_shift_start         8
 #define __rsnd_mod_shift_stop          8
-#define __rsnd_mod_shift_probe         28 /* always called */
-#define __rsnd_mod_shift_remove                28 /* always called */
 #define __rsnd_mod_shift_irq           28 /* always called */
 #define __rsnd_mod_shift_pcm_new       28 /* always called */
 #define __rsnd_mod_shift_fallback      28 /* always called */
 #define __rsnd_mod_shift_hw_params     28 /* always called */
 
-#define __rsnd_mod_add_probe           0
-#define __rsnd_mod_add_remove          0
+#define __rsnd_mod_add_probe            1
+#define __rsnd_mod_add_remove          -1
 #define __rsnd_mod_add_init             1
 #define __rsnd_mod_add_quit            -1
 #define __rsnd_mod_add_start            1
@@ -310,7 +311,7 @@ struct rsnd_mod {
 #define __rsnd_mod_add_hw_params       0
 
 #define __rsnd_mod_call_probe          0
-#define __rsnd_mod_call_remove         0
+#define __rsnd_mod_call_remove         1
 #define __rsnd_mod_call_init           0
 #define __rsnd_mod_call_quit           1
 #define __rsnd_mod_call_start          0
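
The shift/add/call triplets above gate each pair of balanced callbacks through one 4-bit counter in the status word: a callback may only run when its nibble holds the expected "call" value, and running it moves the nibble by "add", so with this change remove can no longer fire before probe. A minimal standalone C sketch of that gating idea, assuming a simplified mod_status_step() helper rather than the kernel's actual rsnd_mod_call() macro:

#include <stdbool.h>
#include <stdint.h>

/*
 * Run a callback only if the 4-bit counter at "shift" equals "call",
 * then move the counter by "add" (+1 for probe/init/start, -1 for
 * remove/quit/stop, 0 for the always-called entries at shift 28).
 */
static bool mod_status_step(uint32_t *status, int shift, int call, int add)
{
	int cur = (*status >> shift) & 0xf;

	if (cur != call)
		return false;	/* e.g. remove requested before probe ran */

	*status &= ~(0xfu << shift);
	*status |= ((uint32_t)(cur + add) & 0xf) << shift;
	return true;
}

/*
 * probe:  shift 0, call 0, add +1  -> nibble A goes 0 -> 1
 * remove: shift 0, call 1, add -1  -> nibble A goes 1 -> 0
 */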
index 15d6ffe..e39f916 100644 (file)
@@ -572,6 +572,9 @@ int rsnd_src_probe(struct rsnd_priv *priv)
 
        i = 0;
        for_each_child_of_node(node, np) {
+               if (!of_device_is_available(np))
+                       goto skip;
+
                src = rsnd_src_get(priv, i);
 
                snprintf(name, RSND_SRC_NAME_SIZE, "%s.%d",
@@ -595,6 +598,7 @@ int rsnd_src_probe(struct rsnd_priv *priv)
                if (ret)
                        goto rsnd_src_probe_done;
 
+skip:
                i++;
        }
 
index 1cf94d7..ee7f15a 100644 (file)
@@ -1023,6 +1023,11 @@ static int soc_tplg_kcontrol_elems_load(struct soc_tplg *tplg,
 
                control_hdr = (struct snd_soc_tplg_ctl_hdr *)tplg->pos;
 
+               if (control_hdr->size != sizeof(*control_hdr)) {
+                       dev_err(tplg->dev, "ASoC: invalid control size\n");
+                       return -EINVAL;
+               }
+
                switch (control_hdr->ops.info) {
                case SND_SOC_TPLG_CTL_VOLSW:
                case SND_SOC_TPLG_CTL_STROBE:
@@ -1476,6 +1481,8 @@ widget:
        widget->dobj.type = SND_SOC_DOBJ_WIDGET;
        widget->dobj.ops = tplg->ops;
        widget->dobj.index = tplg->index;
+       kfree(template.sname);
+       kfree(template.name);
        list_add(&widget->dobj.list, &tplg->comp->dobj_list);
        return 0;
 
@@ -1499,10 +1506,17 @@ static int soc_tplg_dapm_widget_elems_load(struct soc_tplg *tplg,
 
        for (i = 0; i < count; i++) {
                widget = (struct snd_soc_tplg_dapm_widget *) tplg->pos;
+               if (widget->size != sizeof(*widget)) {
+                       dev_err(tplg->dev, "ASoC: invalid widget size\n");
+                       return -EINVAL;
+               }
+
                ret = soc_tplg_dapm_widget_create(tplg, widget);
-               if (ret < 0)
+               if (ret < 0) {
                        dev_err(tplg->dev, "ASoC: failed to load widget %s\n",
                                widget->name);
+                       return ret;
+               }
        }
 
        return 0;
@@ -1586,6 +1600,7 @@ static int soc_tplg_dai_create(struct soc_tplg *tplg,
        return snd_soc_register_dai(tplg->comp, dai_drv);
 }
 
+/* create the FE DAI link */
 static int soc_tplg_link_create(struct soc_tplg *tplg,
        struct snd_soc_tplg_pcm *pcm)
 {
@@ -1598,6 +1613,16 @@ static int soc_tplg_link_create(struct soc_tplg *tplg,
 
        link->name = pcm->pcm_name;
        link->stream_name = pcm->pcm_name;
+       link->id = pcm->pcm_id;
+
+       link->cpu_dai_name = pcm->dai_name;
+       link->codec_name = "snd-soc-dummy";
+       link->codec_dai_name = "snd-soc-dummy-dai";
+
+       /* enable DPCM */
+       link->dynamic = 1;
+       link->dpcm_playback = pcm->playback;
+       link->dpcm_capture = pcm->capture;
 
        /* pass control to component driver for optional further init */
        ret = soc_tplg_dai_link_load(tplg, link);
@@ -1639,8 +1664,6 @@ static int soc_tplg_pcm_elems_load(struct soc_tplg *tplg,
        if (tplg->pass != SOC_TPLG_PASS_PCM_DAI)
                return 0;
 
-       pcm = (struct snd_soc_tplg_pcm *)tplg->pos;
-
        if (soc_tplg_check_elem_count(tplg,
                sizeof(struct snd_soc_tplg_pcm), count,
                hdr->payload_size, "PCM DAI")) {
@@ -1650,7 +1673,13 @@ static int soc_tplg_pcm_elems_load(struct soc_tplg *tplg,
        }
 
        /* create the FE DAIs and DAI links */
+       pcm = (struct snd_soc_tplg_pcm *)tplg->pos;
        for (i = 0; i < count; i++) {
+               if (pcm->size != sizeof(*pcm)) {
+                       dev_err(tplg->dev, "ASoC: invalid pcm size\n");
+                       return -EINVAL;
+               }
+
                soc_tplg_pcm_create(tplg, pcm);
                pcm++;
        }
@@ -1670,6 +1699,11 @@ static int soc_tplg_manifest_load(struct soc_tplg *tplg,
                return 0;
 
        manifest = (struct snd_soc_tplg_manifest *)tplg->pos;
+       if (manifest->size != sizeof(*manifest)) {
+               dev_err(tplg->dev, "ASoC: invalid manifest size\n");
+               return -EINVAL;
+       }
+
        tplg->pos += sizeof(struct snd_soc_tplg_manifest);
 
        if (tplg->comp && tplg->ops && tplg->ops->manifest)
@@ -1686,6 +1720,14 @@ static int soc_valid_header(struct soc_tplg *tplg,
        if (soc_tplg_get_hdr_offset(tplg) >= tplg->fw->size)
                return 0;
 
+       if (hdr->size != sizeof(*hdr)) {
+               dev_err(tplg->dev,
+                       "ASoC: invalid header size for type %d at offset 0x%lx size 0x%zx.\n",
+                       hdr->type, soc_tplg_get_hdr_offset(tplg),
+                       tplg->fw->size);
+               return -EINVAL;
+       }
+
        /* big endian firmware objects not supported atm */
        if (hdr->magic == cpu_to_be32(SND_SOC_TPLG_MAGIC)) {
                dev_err(tplg->dev,
index 39bcefe..488ef4e 100644 (file)
 
 #include "uniperif.h"
 
+/*
+ * User frame size shall be 2, 4, 6 or 8 32-bit words long
+ * (i.e. 8, 16, 24 or 32 bytes)
+ * This constraint comes from allowed values for
+ * UNIPERIF_I2S_FMT_NUM_CH register
+ */
+#define UNIPERIF_MAX_FRAME_SZ 0x20
+#define UNIPERIF_ALLOWED_FRAME_SZ (0x08 | 0x10 | 0x18 | UNIPERIF_MAX_FRAME_SZ)
+
+int sti_uniperiph_set_tdm_slot(struct snd_soc_dai *dai, unsigned int tx_mask,
+                              unsigned int rx_mask, int slots,
+                              int slot_width)
+{
+       struct sti_uniperiph_data *priv = snd_soc_dai_get_drvdata(dai);
+       struct uniperif *uni = priv->dai_data.uni;
+       int i, frame_size, avail_slots;
+
+       if (!UNIPERIF_TYPE_IS_TDM(uni)) {
+               dev_err(uni->dev, "cpu dai not in tdm mode\n");
+               return -EINVAL;
+       }
+
+       /* store info in unip context */
+       uni->tdm_slot.slots = slots;
+       uni->tdm_slot.slot_width = slot_width;
+       /* unip is unidirectional */
+       uni->tdm_slot.mask = (tx_mask != 0) ? tx_mask : rx_mask;
+
+       /* number of available timeslots */
+       for (i = 0, avail_slots = 0; i < uni->tdm_slot.slots; i++) {
+               if ((uni->tdm_slot.mask >> i) & 0x01)
+                       avail_slots++;
+       }
+       uni->tdm_slot.avail_slots = avail_slots;
+
+       /* frame size in bytes */
+       frame_size = uni->tdm_slot.avail_slots * uni->tdm_slot.slot_width / 8;
+
+       /* check frame size is allowed */
+       if ((frame_size > UNIPERIF_MAX_FRAME_SZ) ||
+           (frame_size & ~(int)UNIPERIF_ALLOWED_FRAME_SZ)) {
+               dev_err(uni->dev, "frame size not allowed: %d bytes\n",
+                       frame_size);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+int sti_uniperiph_fix_tdm_chan(struct snd_pcm_hw_params *params,
+                              struct snd_pcm_hw_rule *rule)
+{
+       struct uniperif *uni = rule->private;
+       struct snd_interval t;
+
+       t.min = uni->tdm_slot.avail_slots;
+       t.max = uni->tdm_slot.avail_slots;
+       t.openmin = 0;
+       t.openmax = 0;
+       t.integer = 0;
+
+       return snd_interval_refine(hw_param_interval(params, rule->var), &t);
+}
+
+int sti_uniperiph_fix_tdm_format(struct snd_pcm_hw_params *params,
+                                struct snd_pcm_hw_rule *rule)
+{
+       struct uniperif *uni = rule->private;
+       struct snd_mask *maskp = hw_param_mask(params, rule->var);
+       u64 format;
+
+       switch (uni->tdm_slot.slot_width) {
+       case 16:
+               format = SNDRV_PCM_FMTBIT_S16_LE;
+               break;
+       case 32:
+               format = SNDRV_PCM_FMTBIT_S32_LE;
+               break;
+       default:
+               dev_err(uni->dev, "format not supported: %d bits\n",
+                       uni->tdm_slot.slot_width);
+               return -EINVAL;
+       }
+
+       maskp->bits[0] &= (u_int32_t)format;
+       maskp->bits[1] &= (u_int32_t)(format >> 32);
+       /* clear remaining indexes */
+       memset(maskp->bits + 2, 0, (SNDRV_MASK_MAX - 64) / 8);
+
+       if (!maskp->bits[0] && !maskp->bits[1])
+               return -EINVAL;
+
+       return 0;
+}
+
+int sti_uniperiph_get_tdm_word_pos(struct uniperif *uni,
+                                  unsigned int *word_pos)
+{
+       int slot_width = uni->tdm_slot.slot_width / 8;
+       int slots_num = uni->tdm_slot.slots;
+       unsigned int slots_mask = uni->tdm_slot.mask;
+       int i, j, k;
+       unsigned int word16_pos[4];
+
+       /* word16_pos:
+        * word16_pos[0] = WORDX_LSB
+        * word16_pos[1] = WORDX_MSB,
+        * word16_pos[2] = WORDX+1_LSB
+        * word16_pos[3] = WORDX+1_MSB
+        */
+
+       /* set unip word position */
+       for (i = 0, j = 0, k = 0; (i < slots_num) && (k < WORD_MAX); i++) {
+               if ((slots_mask >> i) & 0x01) {
+                       word16_pos[j] = i * slot_width;
+
+                       if (slot_width == 4) {
+                               word16_pos[j + 1] = word16_pos[j] + 2;
+                               j++;
+                       }
+                       j++;
+
+                       if (j > 3) {
+                               word_pos[k] = word16_pos[1] |
+                                             (word16_pos[0] << 8) |
+                                             (word16_pos[3] << 16) |
+                                             (word16_pos[2] << 24);
+                               j = 0;
+                               k++;
+                       }
+               }
+       }
+
+       return 0;
+}
+
 /*
  * sti_uniperiph_dai_create_ctrl
  * This function is used to create Ctrl associated to DAI but also pcm device.
@@ -45,10 +181,16 @@ int sti_uniperiph_dai_hw_params(struct snd_pcm_substream *substream,
                                struct snd_pcm_hw_params *params,
                                struct snd_soc_dai *dai)
 {
+       struct sti_uniperiph_data *priv = snd_soc_dai_get_drvdata(dai);
+       struct uniperif *uni = priv->dai_data.uni;
        struct snd_dmaengine_dai_dma_data *dma_data;
        int transfer_size;
 
-       transfer_size = params_channels(params) * UNIPERIF_FIFO_FRAMES;
+       if (uni->info->type == SND_ST_UNIPERIF_TYPE_TDM)
+               /* transfer size = user frame size (in 32-bits FIFO cell) */
+               transfer_size = snd_soc_params_to_frame_size(params) / 32;
+       else
+               transfer_size = params_channels(params) * UNIPERIF_FIFO_FRAMES;
 
        dma_data = snd_soc_dai_get_dma_data(dai, substream);
        dma_data->maxburst = transfer_size;
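
UNIPERIF_ALLOWED_FRAME_SZ above is 0x08 | 0x10 | 0x18 | 0x20 = 0x38, so the check in sti_uniperiph_set_tdm_slot() accepts exactly the frame sizes 8, 16, 24 and 32 bytes (2, 4, 6 or 8 32-bit words, as the comment in the hunk states). A small self-contained sketch of that arithmetic, with example slot counts chosen only for illustration:

#include <stdio.h>

#define UNIPERIF_MAX_FRAME_SZ 0x20
#define UNIPERIF_ALLOWED_FRAME_SZ (0x08 | 0x10 | 0x18 | UNIPERIF_MAX_FRAME_SZ)

int main(void)
{
	/*
	 * 4 active slots of 32 bits -> 4 * 32 / 8 = 16 bytes: accepted.
	 * 3 active slots of 32 bits -> 12 bytes (0x0c): bit 2 falls outside
	 * the 0x38 mask, so the frame size is rejected.
	 */
	for (int frame_size = 4; frame_size <= 40; frame_size += 4) {
		int bad = (frame_size > UNIPERIF_MAX_FRAME_SZ) ||
			  (frame_size & ~UNIPERIF_ALLOWED_FRAME_SZ);

		printf("%2d bytes: %s\n", frame_size, bad ? "rejected" : "ok");
	}
	return 0;
}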
index f0fd5a9..eb9933c 100644 (file)
@@ -25,7 +25,7 @@
        writel_relaxed((((value) & mask) << shift), ip->base + offset)
 
 /*
- * AUD_UNIPERIF_SOFT_RST reg
+ * UNIPERIF_SOFT_RST reg
  */
 
 #define UNIPERIF_SOFT_RST_OFFSET(ip) 0x0000
@@ -50,7 +50,7 @@
                UNIPERIF_SOFT_RST_SOFT_RST_MASK(ip))
 
 /*
- * AUD_UNIPERIF_FIFO_DATA reg
+ * UNIPERIF_FIFO_DATA reg
  */
 
 #define UNIPERIF_FIFO_DATA_OFFSET(ip) 0x0004
@@ -58,7 +58,7 @@
        writel_relaxed(value, ip->base + UNIPERIF_FIFO_DATA_OFFSET(ip))
 
 /*
- * AUD_UNIPERIF_CHANNEL_STA_REGN reg
+ * UNIPERIF_CHANNEL_STA_REGN reg
  */
 
 #define UNIPERIF_CHANNEL_STA_REGN(ip, n) (0x0060 + (4 * n))
        writel_relaxed(value, ip->base + UNIPERIF_CHANNEL_STA_REG5_OFFSET(ip))
 
 /*
- *  AUD_UNIPERIF_ITS reg
+ *  UNIPERIF_ITS reg
  */
 
 #define UNIPERIF_ITS_OFFSET(ip) 0x000C
                0 : (BIT(UNIPERIF_ITS_UNDERFLOW_REC_FAILED_SHIFT(ip))))
 
 /*
- *  AUD_UNIPERIF_ITS_BCLR reg
+ *  UNIPERIF_ITS_BCLR reg
  */
 
 /* FIFO_ERROR */
        writel_relaxed(value, ip->base + UNIPERIF_ITS_BCLR_OFFSET(ip))
 
 /*
- *  AUD_UNIPERIF_ITM reg
+ *  UNIPERIF_ITM reg
  */
 
 #define UNIPERIF_ITM_OFFSET(ip) 0x0018
                0 : (BIT(UNIPERIF_ITM_UNDERFLOW_REC_FAILED_SHIFT(ip))))
 
 /*
- *  AUD_UNIPERIF_ITM_BCLR reg
+ *  UNIPERIF_ITM_BCLR reg
  */
 
 #define UNIPERIF_ITM_BCLR_OFFSET(ip) 0x001c
                UNIPERIF_ITM_BCLR_DMA_ERROR_MASK(ip))
 
 /*
- *  AUD_UNIPERIF_ITM_BSET reg
+ *  UNIPERIF_ITM_BSET reg
  */
 
 #define UNIPERIF_ITM_BSET_OFFSET(ip) 0x0020
        SET_UNIPERIF_REG(ip, \
                UNIPERIF_CTRL_OFFSET(ip), \
                UNIPERIF_CTRL_READER_OUT_SEL_SHIFT(ip), \
-               CORAUD_UNIPERIF_CTRL_READER_OUT_SEL_MASK(ip), 1)
+               UNIPERIF_CTRL_READER_OUT_SEL_MASK(ip), 1)
 
 /* UNDERFLOW_REC_WINDOW */
 #define UNIPERIF_CTRL_UNDERFLOW_REC_WINDOW_SHIFT(ip) 20
                UNIPERIF_STATUS_1_UNDERFLOW_DURATION_MASK(ip), value)
 
 /*
- * AUD_UNIPERIF_CHANNEL_STA_REGN reg
+ * UNIPERIF_CHANNEL_STA_REGN reg
  */
 
 #define UNIPERIF_CHANNEL_STA_REGN(ip, n) (0x0060 + (4 * n))
                        UNIPERIF_CHANNEL_STA_REGN(ip, n))
 
 /*
- * AUD_UNIPERIF_USER_VALIDITY reg
+ * UNIPERIF_USER_VALIDITY reg
  */
 
 #define UNIPERIF_USER_VALIDITY_OFFSET(ip) 0x0090
                UNIPERIF_DBG_STANDBY_LEFT_SP_SHIFT(ip), \
                UNIPERIF_DBG_STANDBY_LEFT_SP_MASK(ip), value)
 
+/*
+ * UNIPERIF_TDM_ENABLE
+ */
+#define UNIPERIF_TDM_ENABLE_OFFSET(ip) 0x0118
+#define GET_UNIPERIF_TDM_ENABLE(ip) \
+       readl_relaxed(ip->base + UNIPERIF_TDM_ENABLE_OFFSET(ip))
+#define SET_UNIPERIF_TDM_ENABLE(ip, value) \
+       writel_relaxed(value, ip->base + UNIPERIF_TDM_ENABLE_OFFSET(ip))
+
+/* TDM_ENABLE */
+#define UNIPERIF_TDM_ENABLE_EN_TDM_SHIFT(ip) 0x0
+#define UNIPERIF_TDM_ENABLE_EN_TDM_MASK(ip) 0x1
+#define GET_UNIPERIF_TDM_ENABLE_EN_TDM(ip) \
+               GET_UNIPERIF_REG(ip, \
+               UNIPERIF_TDM_ENABLE_OFFSET(ip), \
+               UNIPERIF_TDM_ENABLE_EN_TDM_SHIFT(ip), \
+               UNIPERIF_TDM_ENABLE_EN_TDM_MASK(ip))
+#define SET_UNIPERIF_TDM_ENABLE_TDM_ENABLE(ip) \
+               SET_UNIPERIF_REG(ip, \
+               UNIPERIF_TDM_ENABLE_OFFSET(ip), \
+               UNIPERIF_TDM_ENABLE_EN_TDM_SHIFT(ip), \
+               UNIPERIF_TDM_ENABLE_EN_TDM_MASK(ip), 1)
+#define SET_UNIPERIF_TDM_ENABLE_TDM_DISABLE(ip) \
+               SET_UNIPERIF_REG(ip, \
+               UNIPERIF_TDM_ENABLE_OFFSET(ip), \
+               UNIPERIF_TDM_ENABLE_EN_TDM_SHIFT(ip), \
+               UNIPERIF_TDM_ENABLE_EN_TDM_MASK(ip), 0)
+
+/*
+ * UNIPERIF_TDM_FS_REF_FREQ
+ */
+#define UNIPERIF_TDM_FS_REF_FREQ_OFFSET(ip) 0x011c
+#define GET_UNIPERIF_TDM_FS_REF_FREQ(ip) \
+       readl_relaxed(ip->base + UNIPERIF_TDM_FS_REF_FREQ_OFFSET(ip))
+#define SET_UNIPERIF_TDM_FS_REF_FREQ(ip, value) \
+       writel_relaxed(value, ip->base + \
+                       UNIPERIF_TDM_FS_REF_FREQ_OFFSET(ip))
+
+/* REF_FREQ */
+#define UNIPERIF_TDM_FS_REF_FREQ_REF_FREQ_SHIFT(ip) 0x0
+#define VALUE_UNIPERIF_TDM_FS_REF_FREQ_8KHZ(ip) 0
+#define VALUE_UNIPERIF_TDM_FS_REF_FREQ_16KHZ(ip) 1
+#define VALUE_UNIPERIF_TDM_FS_REF_FREQ_32KHZ(ip) 2
+#define VALUE_UNIPERIF_TDM_FS_REF_FREQ_48KHZ(ip) 3
+#define UNIPERIF_TDM_FS_REF_FREQ_REF_FREQ_MASK(ip) 0x3
+#define GET_UNIPERIF_TDM_FS_REF_FREQ_REF_FREQ(ip) \
+               GET_UNIPERIF_REG(ip, \
+               UNIPERIF_TDM_FS_REF_FREQ_OFFSET(ip), \
+               UNIPERIF_TDM_FS_REF_FREQ_REF_FREQ_SHIFT(ip), \
+               UNIPERIF_TDM_FS_REF_FREQ_REF_FREQ_MASK(ip))
+#define SET_UNIPERIF_TDM_FS_REF_FREQ_8KHZ(ip) \
+               SET_UNIPERIF_REG(ip, \
+               UNIPERIF_TDM_FS_REF_FREQ_OFFSET(ip), \
+               UNIPERIF_TDM_FS_REF_FREQ_REF_FREQ_SHIFT(ip), \
+               UNIPERIF_TDM_FS_REF_FREQ_REF_FREQ_MASK(ip), \
+               VALUE_UNIPERIF_TDM_FS_REF_FREQ_8KHZ(ip))
+#define SET_UNIPERIF_TDM_FS_REF_FREQ_16KHZ(ip) \
+               SET_UNIPERIF_REG(ip, \
+               UNIPERIF_TDM_FS_REF_FREQ_OFFSET(ip), \
+               UNIPERIF_TDM_FS_REF_FREQ_REF_FREQ_SHIFT(ip), \
+               UNIPERIF_TDM_FS_REF_FREQ_REF_FREQ_MASK(ip), \
+               VALUE_UNIPERIF_TDM_FS_REF_FREQ_16KHZ(ip))
+#define SET_UNIPERIF_TDM_FS_REF_FREQ_32KHZ(ip) \
+               SET_UNIPERIF_REG(ip, \
+               UNIPERIF_TDM_FS_REF_FREQ_OFFSET(ip), \
+               UNIPERIF_TDM_FS_REF_FREQ_REF_FREQ_SHIFT(ip), \
+               UNIPERIF_TDM_FS_REF_FREQ_REF_FREQ_MASK(ip), \
+               VALUE_UNIPERIF_TDM_FS_REF_FREQ_32KHZ(ip))
+#define SET_UNIPERIF_TDM_FS_REF_FREQ_48KHZ(ip) \
+               SET_UNIPERIF_REG(ip, \
+               UNIPERIF_TDM_FS_REF_FREQ_OFFSET(ip), \
+               UNIPERIF_TDM_FS_REF_FREQ_REF_FREQ_SHIFT(ip), \
+               UNIPERIF_TDM_FS_REF_FREQ_REF_FREQ_MASK(ip), \
+               VALUE_UNIPERIF_TDM_FS_REF_FREQ_48KHZ(ip))
+
+/*
+ * UNIPERIF_TDM_FS_REF_DIV
+ */
+#define UNIPERIF_TDM_FS_REF_DIV_OFFSET(ip) 0x0120
+#define GET_UNIPERIF_TDM_FS_REF_DIV(ip) \
+       readl_relaxed(ip->base + UNIPERIF_TDM_FS_REF_DIV_OFFSET(ip))
+#define SET_UNIPERIF_TDM_FS_REF_DIV(ip, value) \
+               writel_relaxed(value, ip->base + \
+                       UNIPERIF_TDM_FS_REF_DIV_OFFSET(ip))
+
+/* NUM_TIMESLOT */
+#define UNIPERIF_TDM_FS_REF_DIV_NUM_TIMESLOT_SHIFT(ip) 0x0
+#define UNIPERIF_TDM_FS_REF_DIV_NUM_TIMESLOT_MASK(ip) 0xff
+#define GET_UNIPERIF_TDM_FS_REF_DIV_NUM_TIMESLOT(ip) \
+               GET_UNIPERIF_REG(ip, \
+               UNIPERIF_TDM_FS_REF_DIV_OFFSET(ip), \
+               UNIPERIF_TDM_FS_REF_DIV_NUM_TIMESLOT_SHIFT(ip), \
+               UNIPERIF_TDM_FS_REF_DIV_NUM_TIMESLOT_MASK(ip))
+#define SET_UNIPERIF_TDM_FS_REF_DIV_NUM_TIMESLOT(ip, value) \
+               SET_UNIPERIF_REG(ip, \
+               UNIPERIF_TDM_FS_REF_DIV_OFFSET(ip), \
+               UNIPERIF_TDM_FS_REF_DIV_NUM_TIMESLOT_SHIFT(ip), \
+               UNIPERIF_TDM_FS_REF_DIV_NUM_TIMESLOT_MASK(ip), value)
+
+/*
+ * UNIPERIF_TDM_WORD_POS_X_Y
+ * 32 bits of UNIPERIF_TDM_WORD_POS_X_Y register shall be set in 1 shot
+ */
+#define UNIPERIF_TDM_WORD_POS_1_2_OFFSET(ip) 0x013c
+#define UNIPERIF_TDM_WORD_POS_3_4_OFFSET(ip) 0x0140
+#define UNIPERIF_TDM_WORD_POS_5_6_OFFSET(ip) 0x0144
+#define UNIPERIF_TDM_WORD_POS_7_8_OFFSET(ip) 0x0148
+#define GET_UNIPERIF_TDM_WORD_POS(ip, words) \
+       readl_relaxed(ip->base + UNIPERIF_TDM_WORD_POS_##words##_OFFSET(ip))
+#define SET_UNIPERIF_TDM_WORD_POS(ip, words, value) \
+               writel_relaxed(value, ip->base + \
+               UNIPERIF_TDM_WORD_POS_##words##_OFFSET(ip))
 /*
  * uniperipheral IP capabilities
  */
 #define UNIPERIF_FIFO_SIZE             70 /* FIFO is 70 cells deep */
 #define UNIPERIF_FIFO_FRAMES           4  /* FDMA trigger limit in frames */
 
+#define UNIPERIF_TYPE_IS_HDMI(p) \
+       ((p)->info->type == SND_ST_UNIPERIF_TYPE_HDMI)
+#define UNIPERIF_TYPE_IS_PCM(p) \
+       ((p)->info->type == SND_ST_UNIPERIF_TYPE_PCM)
+#define UNIPERIF_TYPE_IS_SPDIF(p) \
+       ((p)->info->type == SND_ST_UNIPERIF_TYPE_SPDIF)
+#define UNIPERIF_TYPE_IS_IEC958(p) \
+       (UNIPERIF_TYPE_IS_HDMI(p) || \
+               UNIPERIF_TYPE_IS_SPDIF(p))
+#define UNIPERIF_TYPE_IS_TDM(p) \
+       ((p)->info->type == SND_ST_UNIPERIF_TYPE_TDM)
+
 /*
  * Uniperipheral IP revisions
  */
@@ -1125,10 +1249,11 @@ enum uniperif_version {
 };
 
 enum uniperif_type {
-       SND_ST_UNIPERIF_PLAYER_TYPE_NONE,
-       SND_ST_UNIPERIF_PLAYER_TYPE_HDMI,
-       SND_ST_UNIPERIF_PLAYER_TYPE_PCM,
-       SND_ST_UNIPERIF_PLAYER_TYPE_SPDIF
+       SND_ST_UNIPERIF_TYPE_NONE,
+       SND_ST_UNIPERIF_TYPE_HDMI,
+       SND_ST_UNIPERIF_TYPE_PCM,
+       SND_ST_UNIPERIF_TYPE_SPDIF,
+       SND_ST_UNIPERIF_TYPE_TDM
 };
 
 enum uniperif_state {
@@ -1145,9 +1270,17 @@ enum uniperif_iec958_encoding_mode {
        UNIPERIF_IEC958_ENCODING_MODE_ENCODED
 };
 
+enum uniperif_word_pos {
+       WORD_1_2,
+       WORD_3_4,
+       WORD_5_6,
+       WORD_7_8,
+       WORD_MAX
+};
+
 struct uniperif_info {
        int id; /* instance value of the uniperipheral IP */
-       enum uniperif_type player_type;
+       enum uniperif_type type;
        int underflow_enabled;          /* Underflow recovery mode */
 };
 
@@ -1156,12 +1289,20 @@ struct uniperif_iec958_settings {
        struct snd_aes_iec958 iec958;
 };
 
+struct dai_tdm_slot {
+       unsigned int mask;
+       int slots;
+       int slot_width;
+       unsigned int avail_slots;
+};
+
 struct uniperif {
        /* System information */
        struct uniperif_info *info;
        struct device *dev;
        int ver; /* IP version, used by register access macros */
        struct regmap_field *clk_sel;
+       struct regmap_field *valid_sel;
 
        /* capabilities */
        const struct snd_pcm_hardware *hw;
@@ -1192,6 +1333,7 @@ struct uniperif {
 
        /* dai properties */
        unsigned int daifmt;
+       struct dai_tdm_slot tdm_slot;
 
        /* DAI callbacks */
        const struct snd_soc_dai_ops *dai_ops;
@@ -1209,6 +1351,28 @@ struct sti_uniperiph_data {
        struct sti_uniperiph_dai dai_data;
 };
 
+static const struct snd_pcm_hardware uni_tdm_hw = {
+       .info = SNDRV_PCM_INFO_INTERLEAVED | SNDRV_PCM_INFO_BLOCK_TRANSFER |
+               SNDRV_PCM_INFO_PAUSE | SNDRV_PCM_INFO_MMAP |
+               SNDRV_PCM_INFO_MMAP_VALID,
+
+       .formats = SNDRV_PCM_FMTBIT_S32_LE | SNDRV_PCM_FMTBIT_S16_LE,
+
+       .rates = SNDRV_PCM_RATE_CONTINUOUS,
+       .rate_min = 8000,
+       .rate_max = 48000,
+
+       .channels_min = 1,
+       .channels_max = 32,
+
+       .periods_min = 2,
+       .periods_max = 10,
+
+       .period_bytes_min = 128,
+       .period_bytes_max = 64 * PAGE_SIZE,
+       .buffer_bytes_max = 256 * PAGE_SIZE
+};
+
 /* uniperiph player*/
 int uni_player_init(struct platform_device *pdev,
                    struct uniperif *uni_player);
@@ -1226,4 +1390,28 @@ int sti_uniperiph_dai_hw_params(struct snd_pcm_substream *substream,
                                struct snd_pcm_hw_params *params,
                                struct snd_soc_dai *dai);
 
+static inline int sti_uniperiph_get_user_frame_size(
+       struct snd_pcm_runtime *runtime)
+{
+       return (runtime->channels * snd_pcm_format_width(runtime->format) / 8);
+}
+
+static inline int sti_uniperiph_get_unip_tdm_frame_size(struct uniperif *uni)
+{
+       return (uni->tdm_slot.slots * uni->tdm_slot.slot_width / 8);
+}
+
+int sti_uniperiph_set_tdm_slot(struct snd_soc_dai *dai, unsigned int tx_mask,
+                              unsigned int rx_mask, int slots,
+                              int slot_width);
+
+int sti_uniperiph_get_tdm_word_pos(struct uniperif *uni,
+                                  unsigned int *word_pos);
+
+int sti_uniperiph_fix_tdm_chan(struct snd_pcm_hw_params *params,
+                              struct snd_pcm_hw_rule *rule);
+
+int sti_uniperiph_fix_tdm_format(struct snd_pcm_hw_params *params,
+                                struct snd_pcm_hw_rule *rule);
+
 #endif
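
Each UNIPERIF_TDM_WORD_POS_X_Y register above packs the byte offsets of two slots' 16-bit halves into one 32-bit write. A short sketch, assuming 32-bit slots with all timeslots active and mirroring the packing done by sti_uniperiph_get_tdm_word_pos() in sti_uniperiph.c, that reproduces the default 0x04060002 / 0x0C0E080A / ... values used by the player and reader code further down:

#include <stdio.h>
#include <stdint.h>

/*
 * Recompute the default TDM_WORD_POS values for 32-bit slots with all
 * timeslots active: each register packs two slots as
 * msb0 | (lsb0 << 8) | (msb1 << 16) | (lsb1 << 24), where lsbN/msbN are
 * the byte offsets of slot N's 16-bit halves.
 */
int main(void)
{
	int slot_width = 4;	/* bytes per slot (32-bit slots) */

	for (int k = 0; k < 4; k++) {
		int s0 = 2 * k, s1 = 2 * k + 1;	/* two slots per register */
		uint32_t lsb0 = s0 * slot_width, msb0 = lsb0 + 2;
		uint32_t lsb1 = s1 * slot_width, msb1 = lsb1 + 2;
		uint32_t word_pos = msb0 | (lsb0 << 8) |
				    (msb1 << 16) | (lsb1 << 24);

		printf("WORD_POS_%d_%d = 0x%08X\n",
		       s0 + 1, s1 + 1, (unsigned)word_pos);
		/* prints 0x04060002, 0x0C0E080A, 0x14161012, 0x1C1E181A */
	}
	return 0;
}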
index 7aca6b9..ee1c7c2 100644 (file)
 
 /* sys config registers definitions */
 #define SYS_CFG_AUDIO_GLUE 0xA4
-#define SYS_CFG_AUDI0_GLUE_PCM_CLKX 8
 
 /*
  * Driver specific types.
  */
-#define UNIPERIF_PLAYER_TYPE_IS_HDMI(p) \
-       ((p)->info->player_type == SND_ST_UNIPERIF_PLAYER_TYPE_HDMI)
-#define UNIPERIF_PLAYER_TYPE_IS_PCM(p) \
-       ((p)->info->player_type == SND_ST_UNIPERIF_PLAYER_TYPE_PCM)
-#define UNIPERIF_PLAYER_TYPE_IS_SPDIF(p) \
-       ((p)->info->player_type == SND_ST_UNIPERIF_PLAYER_TYPE_SPDIF)
-#define UNIPERIF_PLAYER_TYPE_IS_IEC958(p) \
-       (UNIPERIF_PLAYER_TYPE_IS_HDMI(p) || \
-               UNIPERIF_PLAYER_TYPE_IS_SPDIF(p))
 
 #define UNIPERIF_PLAYER_CLK_ADJ_MIN  -999999
 #define UNIPERIF_PLAYER_CLK_ADJ_MAX  1000000
+#define UNIPERIF_PLAYER_I2S_OUT 1 /* player id connected to I2S/TDM TX bus */
 
 /*
  * Note: snd_pcm_hardware is linked to DMA controller but is declared here to
@@ -444,18 +435,11 @@ static int uni_player_prepare_pcm(struct uniperif *player,
 
        /* Force slot width to 32 in I2S mode (HW constraint) */
        if ((player->daifmt & SND_SOC_DAIFMT_FORMAT_MASK) ==
-               SND_SOC_DAIFMT_I2S) {
+               SND_SOC_DAIFMT_I2S)
                slot_width = 32;
-       } else {
-               switch (runtime->format) {
-               case SNDRV_PCM_FORMAT_S16_LE:
-                       slot_width = 16;
-                       break;
-               default:
-                       slot_width = 32;
-                       break;
-               }
-       }
+       else
+               slot_width = snd_pcm_format_width(runtime->format);
+
        output_frame_size = slot_width * runtime->channels;
 
        clk_div = player->mclk / runtime->rate;
@@ -530,7 +514,6 @@ static int uni_player_prepare_pcm(struct uniperif *player,
        SET_UNIPERIF_CONFIG_ONE_BIT_AUD_DISABLE(player);
 
        SET_UNIPERIF_I2S_FMT_ORDER_MSB(player);
-       SET_UNIPERIF_I2S_FMT_SCLK_EDGE_FALLING(player);
 
        /* No iec958 formatting as outputting to DAC  */
        SET_UNIPERIF_CTRL_SPDIF_FMT_OFF(player);
@@ -538,6 +521,55 @@ static int uni_player_prepare_pcm(struct uniperif *player,
        return 0;
 }
 
+static int uni_player_prepare_tdm(struct uniperif *player,
+                                 struct snd_pcm_runtime *runtime)
+{
+       int tdm_frame_size; /* unip tdm frame size in bytes */
+       int user_frame_size; /* user tdm frame size in bytes */
+       /* default unip TDM_WORD_POS_X_Y */
+       unsigned int word_pos[4] = {
+               0x04060002, 0x0C0E080A, 0x14161012, 0x1C1E181A};
+       int freq, ret;
+
+       tdm_frame_size =
+               sti_uniperiph_get_unip_tdm_frame_size(player);
+       user_frame_size =
+               sti_uniperiph_get_user_frame_size(runtime);
+
+       /* fix 16/0 format */
+       SET_UNIPERIF_CONFIG_MEM_FMT_16_0(player);
+       SET_UNIPERIF_I2S_FMT_DATA_SIZE_32(player);
+
+       /* number of words inserted on the TDM line */
+       SET_UNIPERIF_I2S_FMT_NUM_CH(player, user_frame_size / 4 / 2);
+
+       SET_UNIPERIF_I2S_FMT_ORDER_MSB(player);
+       SET_UNIPERIF_I2S_FMT_ALIGN_LEFT(player);
+
+       /* Enable the tdm functionality */
+       SET_UNIPERIF_TDM_ENABLE_TDM_ENABLE(player);
+
+       /* number of 8 bits timeslots avail in unip tdm frame */
+       SET_UNIPERIF_TDM_FS_REF_DIV_NUM_TIMESLOT(player, tdm_frame_size);
+
+       /* set the timeslot allocation for words in FIFO */
+       sti_uniperiph_get_tdm_word_pos(player, word_pos);
+       SET_UNIPERIF_TDM_WORD_POS(player, 1_2, word_pos[WORD_1_2]);
+       SET_UNIPERIF_TDM_WORD_POS(player, 3_4, word_pos[WORD_3_4]);
+       SET_UNIPERIF_TDM_WORD_POS(player, 5_6, word_pos[WORD_5_6]);
+       SET_UNIPERIF_TDM_WORD_POS(player, 7_8, word_pos[WORD_7_8]);
+
+       /* set unip clk rate (not done via set_sysclk ops) */
+       freq = runtime->rate * tdm_frame_size * 8;
+       mutex_lock(&player->ctrl_lock);
+       ret = uni_player_clk_set_rate(player, freq);
+       if (!ret)
+               player->mclk = freq;
+       mutex_unlock(&player->ctrl_lock);
+
+       return 0;
+}
+
 /*
  * ALSA uniperipheral iec958 controls
  */
@@ -668,11 +700,29 @@ static int uni_player_startup(struct snd_pcm_substream *substream,
 {
        struct sti_uniperiph_data *priv = snd_soc_dai_get_drvdata(dai);
        struct uniperif *player = priv->dai_data.uni;
+       int ret;
+
        player->substream = substream;
 
        player->clk_adj = 0;
 
-       return 0;
+       if (!UNIPERIF_TYPE_IS_TDM(player))
+               return 0;
+
+       /* refine hw constraint in tdm mode */
+       ret = snd_pcm_hw_rule_add(substream->runtime, 0,
+                                 SNDRV_PCM_HW_PARAM_CHANNELS,
+                                 sti_uniperiph_fix_tdm_chan,
+                                 player, SNDRV_PCM_HW_PARAM_CHANNELS,
+                                 -1);
+       if (ret < 0)
+               return ret;
+
+       return snd_pcm_hw_rule_add(substream->runtime, 0,
+                                  SNDRV_PCM_HW_PARAM_FORMAT,
+                                  sti_uniperiph_fix_tdm_format,
+                                  player, SNDRV_PCM_HW_PARAM_FORMAT,
+                                  -1);
 }
 
 static int uni_player_set_sysclk(struct snd_soc_dai *dai, int clk_id,
@@ -682,7 +732,7 @@ static int uni_player_set_sysclk(struct snd_soc_dai *dai, int clk_id,
        struct uniperif *player = priv->dai_data.uni;
        int ret;
 
-       if (dir == SND_SOC_CLOCK_IN)
+       if (UNIPERIF_TYPE_IS_TDM(player) || (dir == SND_SOC_CLOCK_IN))
                return 0;
 
        if (clk_id != 0)
@@ -714,7 +764,13 @@ static int uni_player_prepare(struct snd_pcm_substream *substream,
        }
 
        /* Calculate transfer size (in fifo cells and bytes) for frame count */
-       transfer_size = runtime->channels * UNIPERIF_FIFO_FRAMES;
+       if (player->info->type == SND_ST_UNIPERIF_TYPE_TDM) {
+               /* transfer size = user frame size (in 32 bits FIFO cell) */
+               transfer_size =
+                       sti_uniperiph_get_user_frame_size(runtime) / 4;
+       } else {
+               transfer_size = runtime->channels * UNIPERIF_FIFO_FRAMES;
+       }
 
        /* Calculate number of empty cells available before asserting DREQ */
        if (player->ver < SND_ST_UNIPERIF_VERSION_UNI_PLR_TOP_1_0) {
@@ -738,16 +794,19 @@ static int uni_player_prepare(struct snd_pcm_substream *substream,
        SET_UNIPERIF_CONFIG_DMA_TRIG_LIMIT(player, trigger_limit);
 
        /* Uniperipheral setup depends on player type */
-       switch (player->info->player_type) {
-       case SND_ST_UNIPERIF_PLAYER_TYPE_HDMI:
+       switch (player->info->type) {
+       case SND_ST_UNIPERIF_TYPE_HDMI:
                ret = uni_player_prepare_iec958(player, runtime);
                break;
-       case SND_ST_UNIPERIF_PLAYER_TYPE_PCM:
+       case SND_ST_UNIPERIF_TYPE_PCM:
                ret = uni_player_prepare_pcm(player, runtime);
                break;
-       case SND_ST_UNIPERIF_PLAYER_TYPE_SPDIF:
+       case SND_ST_UNIPERIF_TYPE_SPDIF:
                ret = uni_player_prepare_iec958(player, runtime);
                break;
+       case SND_ST_UNIPERIF_TYPE_TDM:
+               ret = uni_player_prepare_tdm(player, runtime);
+               break;
        default:
                dev_err(player->dev, "invalid player type");
                return -EINVAL;
@@ -852,8 +911,8 @@ static int uni_player_start(struct uniperif *player)
         * will not take effect and will hang the player.
         */
        if (player->ver < SND_ST_UNIPERIF_VERSION_UNI_PLR_TOP_1_0)
-               if (UNIPERIF_PLAYER_TYPE_IS_IEC958(player))
-                               SET_UNIPERIF_CTRL_SPDIF_FMT_ON(player);
+               if (UNIPERIF_TYPE_IS_IEC958(player))
+                       SET_UNIPERIF_CTRL_SPDIF_FMT_ON(player);
 
        /* Force channel status update (no update if clk disable) */
        if (player->ver < SND_ST_UNIPERIF_VERSION_UNI_PLR_TOP_1_0)
@@ -954,27 +1013,30 @@ static void uni_player_shutdown(struct snd_pcm_substream *substream,
        player->substream = NULL;
 }
 
-static int uni_player_parse_dt_clk_glue(struct platform_device *pdev,
-                                       struct uniperif *player)
+static int uni_player_parse_dt_audio_glue(struct platform_device *pdev,
+                                         struct uniperif *player)
 {
-       int bit_offset;
        struct device_node *node = pdev->dev.of_node;
        struct regmap *regmap;
-
-       bit_offset = SYS_CFG_AUDI0_GLUE_PCM_CLKX + player->info->id;
+       struct reg_field regfield[2] = {
+               /* PCM_CLK_SEL */
+               REG_FIELD(SYS_CFG_AUDIO_GLUE,
+                         8 + player->info->id,
+                         8 + player->info->id),
+               /* PCMP_VALID_SEL */
+               REG_FIELD(SYS_CFG_AUDIO_GLUE, 0, 1)
+       };
 
        regmap = syscon_regmap_lookup_by_phandle(node, "st,syscfg");
 
-       if (regmap) {
-               struct reg_field regfield =
-                       REG_FIELD(SYS_CFG_AUDIO_GLUE, bit_offset, bit_offset);
-
-               player->clk_sel = regmap_field_alloc(regmap, regfield);
-       } else {
+       if (!regmap) {
                dev_err(&pdev->dev, "sti-audio-clk-glue syscf not found\n");
                return -EINVAL;
        }
 
+       player->clk_sel = regmap_field_alloc(regmap, regfield[0]);
+       player->valid_sel = regmap_field_alloc(regmap, regfield[1]);
+
        return 0;
 }
 
@@ -1012,19 +1074,21 @@ static int uni_player_parse_dt(struct platform_device *pdev,
        }
 
        if (strcasecmp(mode, "hdmi") == 0)
-               info->player_type = SND_ST_UNIPERIF_PLAYER_TYPE_HDMI;
+               info->type = SND_ST_UNIPERIF_TYPE_HDMI;
        else if (strcasecmp(mode, "pcm") == 0)
-               info->player_type = SND_ST_UNIPERIF_PLAYER_TYPE_PCM;
+               info->type = SND_ST_UNIPERIF_TYPE_PCM;
        else if (strcasecmp(mode, "spdif") == 0)
-               info->player_type = SND_ST_UNIPERIF_PLAYER_TYPE_SPDIF;
+               info->type = SND_ST_UNIPERIF_TYPE_SPDIF;
+       else if (strcasecmp(mode, "tdm") == 0)
+               info->type = SND_ST_UNIPERIF_TYPE_TDM;
        else
-               info->player_type = SND_ST_UNIPERIF_PLAYER_TYPE_NONE;
+               info->type = SND_ST_UNIPERIF_TYPE_NONE;
 
        /* Save the info structure */
        player->info = info;
 
-       /* Get the PCM_CLK_SEL bit from audio-glue-ctrl SoC register */
-       if (uni_player_parse_dt_clk_glue(pdev, player))
+       /* Get PCM_CLK_SEL & PCMP_VALID_SEL from audio-glue-ctrl SoC reg */
+       if (uni_player_parse_dt_audio_glue(pdev, player))
                return -EINVAL;
 
        return 0;
@@ -1037,7 +1101,8 @@ static const struct snd_soc_dai_ops uni_player_dai_ops = {
                .trigger = uni_player_trigger,
                .hw_params = sti_uniperiph_dai_hw_params,
                .set_fmt = sti_uniperiph_dai_set_fmt,
-               .set_sysclk = uni_player_set_sysclk
+               .set_sysclk = uni_player_set_sysclk,
+               .set_tdm_slot = sti_uniperiph_set_tdm_slot
 };
 
 int uni_player_init(struct platform_device *pdev,
@@ -1047,7 +1112,6 @@ int uni_player_init(struct platform_device *pdev,
 
        player->dev = &pdev->dev;
        player->state = UNIPERIF_STATE_STOPPED;
-       player->hw = &uni_player_pcm_hw;
        player->dai_ops = &uni_player_dai_ops;
 
        ret = uni_player_parse_dt(pdev, player);
@@ -1057,6 +1121,11 @@ int uni_player_init(struct platform_device *pdev,
                return ret;
        }
 
+       if (UNIPERIF_TYPE_IS_TDM(player))
+               player->hw = &uni_tdm_hw;
+       else
+               player->hw = &uni_player_pcm_hw;
+
        /* Get uniperif resource */
        player->clk = of_clk_get(pdev->dev.of_node, 0);
        if (IS_ERR(player->clk))
@@ -1073,6 +1142,17 @@ int uni_player_init(struct platform_device *pdev,
                }
        }
 
+       /* connect to I2S/TDM TX bus */
+       if (player->valid_sel &&
+           (player->info->id == UNIPERIF_PLAYER_I2S_OUT)) {
+               ret = regmap_field_write(player->valid_sel, player->info->id);
+               if (ret) {
+                       dev_err(player->dev,
+                               "%s: unable to connect to tdm bus", __func__);
+                       return ret;
+               }
+       }
+
        ret = devm_request_irq(&pdev->dev, player->irq,
                               uni_player_irq_handler, IRQF_SHARED,
                               dev_name(&pdev->dev), player);
@@ -1087,7 +1167,7 @@ int uni_player_init(struct platform_device *pdev,
        SET_UNIPERIF_CTRL_SPDIF_LAT_OFF(player);
        SET_UNIPERIF_CONFIG_IDLE_MOD_DISABLE(player);
 
-       if (UNIPERIF_PLAYER_TYPE_IS_IEC958(player)) {
+       if (UNIPERIF_TYPE_IS_IEC958(player)) {
                /* Set default iec958 status bits  */
 
                /* Consumer, PCM, copyright, 2ch, mode 0 */
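
In uni_player_prepare_tdm() above, the uniperipheral clock is set to the frame rate times the TDM frame size in bits (freq = rate * tdm_frame_size * 8). A quick worked example with assumed numbers, purely for illustration:

#include <stdio.h>

int main(void)
{
	int rate = 48000;		/* frames per second */
	int tdm_frame_size = 16;	/* unip TDM frame size in bytes,
					 * e.g. 4 slots of 32 bits */
	int freq = rate * tdm_frame_size * 8;	/* bit clock in Hz */

	printf("uniperif clock: %d Hz\n", freq);	/* 6144000, i.e. 6.144 MHz */
	return 0;
}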
index 8a0eb20..eb74a32 100644 (file)
@@ -73,55 +73,10 @@ static irqreturn_t uni_reader_irq_handler(int irq, void *dev_id)
        return ret;
 }
 
-static int uni_reader_prepare(struct snd_pcm_substream *substream,
-                             struct snd_soc_dai *dai)
+static int uni_reader_prepare_pcm(struct snd_pcm_runtime *runtime,
+                                 struct uniperif *reader)
 {
-       struct sti_uniperiph_data *priv = snd_soc_dai_get_drvdata(dai);
-       struct uniperif *reader = priv->dai_data.uni;
-       struct snd_pcm_runtime *runtime = substream->runtime;
-       int transfer_size, trigger_limit;
        int slot_width;
-       int count = 10;
-
-       /* The reader should be stopped */
-       if (reader->state != UNIPERIF_STATE_STOPPED) {
-               dev_err(reader->dev, "%s: invalid reader state %d", __func__,
-                       reader->state);
-               return -EINVAL;
-       }
-
-       /* Calculate transfer size (in fifo cells and bytes) for frame count */
-       transfer_size = runtime->channels * UNIPERIF_FIFO_FRAMES;
-
-       /* Calculate number of empty cells available before asserting DREQ */
-       if (reader->ver < SND_ST_UNIPERIF_VERSION_UNI_PLR_TOP_1_0)
-               trigger_limit = UNIPERIF_FIFO_SIZE - transfer_size;
-       else
-               /*
-                * Since SND_ST_UNIPERIF_VERSION_UNI_PLR_TOP_1_0
-                * FDMA_TRIGGER_LIMIT also controls when the state switches
-                * from OFF or STANDBY to AUDIO DATA.
-                */
-               trigger_limit = transfer_size;
-
-       /* Trigger limit must be an even number */
-       if ((!trigger_limit % 2) ||
-           (trigger_limit != 1 && transfer_size % 2) ||
-           (trigger_limit > UNIPERIF_CONFIG_DMA_TRIG_LIMIT_MASK(reader))) {
-               dev_err(reader->dev, "invalid trigger limit %d", trigger_limit);
-               return -EINVAL;
-       }
-
-       SET_UNIPERIF_CONFIG_DMA_TRIG_LIMIT(reader, trigger_limit);
-
-       switch (reader->daifmt & SND_SOC_DAIFMT_INV_MASK) {
-       case SND_SOC_DAIFMT_IB_IF:
-       case SND_SOC_DAIFMT_NB_IF:
-               SET_UNIPERIF_I2S_FMT_LR_POL_HIG(reader);
-               break;
-       default:
-               SET_UNIPERIF_I2S_FMT_LR_POL_LOW(reader);
-       }
 
        /* Force slot width to 32 in I2S mode */
        if ((reader->daifmt & SND_SOC_DAIFMT_FORMAT_MASK)
@@ -173,6 +128,109 @@ static int uni_reader_prepare(struct snd_pcm_substream *substream,
                return -EINVAL;
        }
 
+       /* Number of channels must be even */
+       if ((runtime->channels % 2) || (runtime->channels < 2) ||
+           (runtime->channels > 10)) {
+               dev_err(reader->dev, "%s: invalid nb of channels", __func__);
+               return -EINVAL;
+       }
+
+       SET_UNIPERIF_I2S_FMT_NUM_CH(reader, runtime->channels / 2);
+       SET_UNIPERIF_I2S_FMT_ORDER_MSB(reader);
+
+       return 0;
+}
+
+static int uni_reader_prepare_tdm(struct snd_pcm_runtime *runtime,
+                                 struct uniperif *reader)
+{
+       int frame_size; /* user tdm frame size in bytes */
+       /* default unip TDM_WORD_POS_X_Y */
+       unsigned int word_pos[4] = {
+               0x04060002, 0x0C0E080A, 0x14161012, 0x1C1E181A};
+
+       frame_size = sti_uniperiph_get_user_frame_size(runtime);
+
+       /* fix 16/0 format */
+       SET_UNIPERIF_CONFIG_MEM_FMT_16_0(reader);
+       SET_UNIPERIF_I2S_FMT_DATA_SIZE_32(reader);
+
+       /* number of words inserted on the TDM line */
+       SET_UNIPERIF_I2S_FMT_NUM_CH(reader, frame_size / 4 / 2);
+
+       SET_UNIPERIF_I2S_FMT_ORDER_MSB(reader);
+       SET_UNIPERIF_I2S_FMT_ALIGN_LEFT(reader);
+       SET_UNIPERIF_TDM_ENABLE_TDM_ENABLE(reader);
+
+       /*
+        * set the timeslots allocation for words in FIFO
+        *
+        * HW bug: (LSB word < MSB word) => this config is not possible
+        *         So if we want (LSB word < MSB word), then it shall be
+        *         handled by the user
+        */
+       sti_uniperiph_get_tdm_word_pos(reader, word_pos);
+       SET_UNIPERIF_TDM_WORD_POS(reader, 1_2, word_pos[WORD_1_2]);
+       SET_UNIPERIF_TDM_WORD_POS(reader, 3_4, word_pos[WORD_3_4]);
+       SET_UNIPERIF_TDM_WORD_POS(reader, 5_6, word_pos[WORD_5_6]);
+       SET_UNIPERIF_TDM_WORD_POS(reader, 7_8, word_pos[WORD_7_8]);
+
+       return 0;
+}
+
+static int uni_reader_prepare(struct snd_pcm_substream *substream,
+                             struct snd_soc_dai *dai)
+{
+       struct sti_uniperiph_data *priv = snd_soc_dai_get_drvdata(dai);
+       struct uniperif *reader = priv->dai_data.uni;
+       struct snd_pcm_runtime *runtime = substream->runtime;
+       int transfer_size, trigger_limit, ret;
+       int count = 10;
+
+       /* The reader should be stopped */
+       if (reader->state != UNIPERIF_STATE_STOPPED) {
+               dev_err(reader->dev, "%s: invalid reader state %d", __func__,
+                       reader->state);
+               return -EINVAL;
+       }
+
+       /* Calculate transfer size (in fifo cells and bytes) for frame count */
+       if (reader->info->type == SND_ST_UNIPERIF_TYPE_TDM) {
+               /* transfer size = user frame size (in 32 bits FIFO cell) */
+               transfer_size =
+                       sti_uniperiph_get_user_frame_size(runtime) / 4;
+       } else {
+               transfer_size = runtime->channels * UNIPERIF_FIFO_FRAMES;
+       }
+
+       /* Calculate number of empty cells available before asserting DREQ */
+       if (reader->ver < SND_ST_UNIPERIF_VERSION_UNI_PLR_TOP_1_0)
+               trigger_limit = UNIPERIF_FIFO_SIZE - transfer_size;
+       else
+               /*
+                * Since SND_ST_UNIPERIF_VERSION_UNI_PLR_TOP_1_0
+                * FDMA_TRIGGER_LIMIT also controls when the state switches
+                * from OFF or STANDBY to AUDIO DATA.
+                */
+               trigger_limit = transfer_size;
+
+       /* Trigger limit must be an even number */
+       if ((!trigger_limit % 2) ||
+           (trigger_limit != 1 && transfer_size % 2) ||
+           (trigger_limit > UNIPERIF_CONFIG_DMA_TRIG_LIMIT_MASK(reader))) {
+               dev_err(reader->dev, "invalid trigger limit %d", trigger_limit);
+               return -EINVAL;
+       }
+
+       SET_UNIPERIF_CONFIG_DMA_TRIG_LIMIT(reader, trigger_limit);
+
+       if (UNIPERIF_TYPE_IS_TDM(reader))
+               ret = uni_reader_prepare_tdm(runtime, reader);
+       else
+               ret = uni_reader_prepare_pcm(runtime, reader);
+       if (ret)
+               return ret;
+
        switch (reader->daifmt & SND_SOC_DAIFMT_FORMAT_MASK) {
        case SND_SOC_DAIFMT_I2S:
                SET_UNIPERIF_I2S_FMT_ALIGN_LEFT(reader);
@@ -191,21 +249,26 @@ static int uni_reader_prepare(struct snd_pcm_substream *substream,
                return -EINVAL;
        }
 
-       SET_UNIPERIF_I2S_FMT_ORDER_MSB(reader);
-
-       /* Data clocking (changing) on the rising edge */
-       SET_UNIPERIF_I2S_FMT_SCLK_EDGE_RISING(reader);
-
-       /* Number of channels must be even */
-
-       if ((runtime->channels % 2) || (runtime->channels < 2) ||
-           (runtime->channels > 10)) {
-               dev_err(reader->dev, "%s: invalid nb of channels", __func__);
-               return -EINVAL;
+       /* Data clocking (changing) on the rising/falling edge */
+       switch (reader->daifmt & SND_SOC_DAIFMT_INV_MASK) {
+       case SND_SOC_DAIFMT_NB_NF:
+               SET_UNIPERIF_I2S_FMT_LR_POL_LOW(reader);
+               SET_UNIPERIF_I2S_FMT_SCLK_EDGE_RISING(reader);
+               break;
+       case SND_SOC_DAIFMT_NB_IF:
+               SET_UNIPERIF_I2S_FMT_LR_POL_HIG(reader);
+               SET_UNIPERIF_I2S_FMT_SCLK_EDGE_RISING(reader);
+               break;
+       case SND_SOC_DAIFMT_IB_NF:
+               SET_UNIPERIF_I2S_FMT_LR_POL_LOW(reader);
+               SET_UNIPERIF_I2S_FMT_SCLK_EDGE_FALLING(reader);
+               break;
+       case SND_SOC_DAIFMT_IB_IF:
+               SET_UNIPERIF_I2S_FMT_LR_POL_HIG(reader);
+               SET_UNIPERIF_I2S_FMT_SCLK_EDGE_FALLING(reader);
+               break;
        }
 
-       SET_UNIPERIF_I2S_FMT_NUM_CH(reader, runtime->channels / 2);
-
        /* Clear any pending interrupts */
        SET_UNIPERIF_ITS_BCLR(reader, GET_UNIPERIF_ITS(reader));
 
@@ -293,6 +356,32 @@ static int  uni_reader_trigger(struct snd_pcm_substream *substream,
        }
 }
 
+static int uni_reader_startup(struct snd_pcm_substream *substream,
+                             struct snd_soc_dai *dai)
+{
+       struct sti_uniperiph_data *priv = snd_soc_dai_get_drvdata(dai);
+       struct uniperif *reader = priv->dai_data.uni;
+       int ret;
+
+       if (!UNIPERIF_TYPE_IS_TDM(reader))
+               return 0;
+
+       /* refine hw constraint in tdm mode */
+       ret = snd_pcm_hw_rule_add(substream->runtime, 0,
+                                 SNDRV_PCM_HW_PARAM_CHANNELS,
+                                 sti_uniperiph_fix_tdm_chan,
+                                 reader, SNDRV_PCM_HW_PARAM_CHANNELS,
+                                 -1);
+       if (ret < 0)
+               return ret;
+
+       return snd_pcm_hw_rule_add(substream->runtime, 0,
+                                  SNDRV_PCM_HW_PARAM_FORMAT,
+                                  sti_uniperiph_fix_tdm_format,
+                                  reader, SNDRV_PCM_HW_PARAM_FORMAT,
+                                  -1);
+}
+
 static void uni_reader_shutdown(struct snd_pcm_substream *substream,
                                struct snd_soc_dai *dai)
 {
@@ -310,6 +399,7 @@ static int uni_reader_parse_dt(struct platform_device *pdev,
 {
        struct uniperif_info *info;
        struct device_node *node = pdev->dev.of_node;
+       const char *mode;
 
        /* Allocate memory for the info structure */
        info = devm_kzalloc(&pdev->dev, sizeof(*info), GFP_KERNEL);
@@ -322,6 +412,17 @@ static int uni_reader_parse_dt(struct platform_device *pdev,
                return -EINVAL;
        }
 
+       /* Read the device mode property */
+       if (of_property_read_string(node, "st,mode", &mode)) {
+               dev_err(&pdev->dev, "uniperipheral mode not defined");
+               return -EINVAL;
+       }
+
+       if (strcasecmp(mode, "tdm") == 0)
+               info->type = SND_ST_UNIPERIF_TYPE_TDM;
+       else
+               info->type = SND_ST_UNIPERIF_TYPE_PCM;
+
        /* Save the info structure */
        reader->info = info;
 
@@ -329,11 +430,13 @@ static int uni_reader_parse_dt(struct platform_device *pdev,
 }
 
 static const struct snd_soc_dai_ops uni_reader_dai_ops = {
+               .startup = uni_reader_startup,
                .shutdown = uni_reader_shutdown,
                .prepare = uni_reader_prepare,
                .trigger = uni_reader_trigger,
                .hw_params = sti_uniperiph_dai_hw_params,
                .set_fmt = sti_uniperiph_dai_set_fmt,
+               .set_tdm_slot = sti_uniperiph_set_tdm_slot
 };
 
 int uni_reader_init(struct platform_device *pdev,
@@ -343,7 +446,6 @@ int uni_reader_init(struct platform_device *pdev,
 
        reader->dev = &pdev->dev;
        reader->state = UNIPERIF_STATE_STOPPED;
-       reader->hw = &uni_reader_pcm_hw;
        reader->dai_ops = &uni_reader_dai_ops;
 
        ret = uni_reader_parse_dt(pdev, reader);
@@ -352,6 +454,11 @@ int uni_reader_init(struct platform_device *pdev,
                return ret;
        }
 
+       if (UNIPERIF_TYPE_IS_TDM(reader))
+               reader->hw = &uni_tdm_hw;
+       else
+               reader->hw = &uni_reader_pcm_hw;
+
        ret = devm_request_irq(&pdev->dev, reader->irq,
                               uni_reader_irq_handler, IRQF_SHARED,
                               dev_name(&pdev->dev), reader);
index 6bf68fe..f10b64d 100644 (file)
@@ -16,6 +16,7 @@ help:
        @echo '  gpio                   - GPIO tools'
        @echo '  hv                     - tools used when in Hyper-V clients'
        @echo '  iio                    - IIO tools'
+       @echo '  kvm_stat               - top-like utility for displaying kvm statistics'
        @echo '  lguest                 - a minimal 32-bit x86 hypervisor'
        @echo '  net                    - misc networking tools'
        @echo '  perf                   - Linux performance measurement and analysis tool'
@@ -110,10 +111,13 @@ tmon_install:
 freefall_install:
        $(call descend,laptop/$(@:_install=),install)
 
+kvm_stat_install:
+       $(call descend,kvm/$(@:_install=),install)
+
 install: acpi_install cgroup_install cpupower_install hv_install firewire_install lguest_install \
                perf_install selftests_install turbostat_install usb_install \
                virtio_install vm_install net_install x86_energy_perf_policy_install \
-               tmon_install freefall_install objtool_install
+               tmon_install freefall_install objtool_install kvm_stat_install
 
 acpi_clean:
        $(call descend,power/acpi,clean)
index ee566e8..27f3583 100644 (file)
@@ -58,8 +58,8 @@ quiet_cmd_mkdir = MKDIR    $(dir $@)
 quiet_cmd_cc_o_c = CC       $@
       cmd_cc_o_c = $(CC) $(c_flags) -c -o $@ $<
 
-quiet_cmd_cc_i_c = CPP      $@
-      cmd_cc_i_c = $(CC) $(c_flags) -E -o $@ $<
+quiet_cmd_cpp_i_c = CPP      $@
+      cmd_cpp_i_c = $(CC) $(c_flags) -E -o $@ $<
 
 quiet_cmd_cc_s_c = AS       $@
       cmd_cc_s_c = $(CC) $(c_flags) -S -o $@ $<
@@ -83,11 +83,11 @@ $(OUTPUT)%.o: %.S FORCE
 
 $(OUTPUT)%.i: %.c FORCE
        $(call rule_mkdir)
-       $(call if_changed_dep,cc_i_c)
+       $(call if_changed_dep,cpp_i_c)
 
 $(OUTPUT)%.s: %.S FORCE
        $(call rule_mkdir)
-       $(call if_changed_dep,cc_i_c)
+       $(call if_changed_dep,cpp_i_c)
 
 $(OUTPUT)%.s: %.c FORCE
        $(call rule_mkdir)
diff --git a/tools/kvm/kvm_stat/Makefile b/tools/kvm/kvm_stat/Makefile
new file mode 100644 (file)
index 0000000..5b1cba5
--- /dev/null
@@ -0,0 +1,41 @@
+include ../../scripts/Makefile.include
+include ../../scripts/utilities.mak
+BINDIR=usr/bin
+MANDIR=usr/share/man
+MAN1DIR=$(MANDIR)/man1
+
+MAN1=kvm_stat.1
+
+A2X=a2x
+a2x_path := $(call get-executable,$(A2X))
+
+all: man
+
+ifneq ($(findstring $(MAKEFLAGS),s),s)
+  ifneq ($(V),1)
+     QUIET_A2X = @echo '  A2X     '$@;
+  endif
+endif
+
+%.1: %.txt
+ifeq ($(a2x_path),)
+       $(error "You need to install asciidoc for man pages")
+else
+       $(QUIET_A2X)$(A2X) --doctype manpage --format manpage $<
+endif
+
+clean:
+       rm -f $(MAN1)
+
+man: $(MAN1)
+
+install-man: man
+       install -d -m 755 $(INSTALL_ROOT)/$(MAN1DIR)
+       install -m 644 kvm_stat.1 $(INSTALL_ROOT)/$(MAN1DIR)
+
+install-tools:
+       install -d -m 755 $(INSTALL_ROOT)/$(BINDIR)
+       install -m 755 -p "kvm_stat" "$(INSTALL_ROOT)/$(BINDIR)/$(TARGET)"
+
+install: install-tools install-man
+.PHONY: all clean man install-tools install-man install
diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat
new file mode 100755 (executable)
index 0000000..581278c
--- /dev/null
@@ -0,0 +1,1127 @@
+#!/usr/bin/python
+#
+# top-like utility for displaying kvm statistics
+#
+# Copyright 2006-2008 Qumranet Technologies
+# Copyright 2008-2011 Red Hat, Inc.
+#
+# Authors:
+#  Avi Kivity <avi@redhat.com>
+#
+# This work is licensed under the terms of the GNU GPL, version 2.  See
+# the COPYING file in the top-level directory.
+"""The kvm_stat module outputs statistics about running KVM VMs
+
+Three different ways of output formatting are available:
+- as a top-like text ui
+- in a key -> value format
+- in an all keys, all values format
+
+The data is sampled from the KVM's debugfs entries and its perf events.
+"""
+
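+# Illustrative summary (not part of the original script) of how the three
+# output formats map to the command line handled in get_options()/main():
+#   kvm_stat          curses text ui (default)
+#   kvm_stat -1       one-second batch of 'key value' pairs
+#   kvm_stat -l       vmstat-like logging mode
+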
+import curses
+import sys
+import os
+import time
+import optparse
+import ctypes
+import fcntl
+import resource
+import struct
+import re
+from collections import defaultdict
+from time import sleep
+
+VMX_EXIT_REASONS = {
+    'EXCEPTION_NMI':        0,
+    'EXTERNAL_INTERRUPT':   1,
+    'TRIPLE_FAULT':         2,
+    'PENDING_INTERRUPT':    7,
+    'NMI_WINDOW':           8,
+    'TASK_SWITCH':          9,
+    'CPUID':                10,
+    'HLT':                  12,
+    'INVLPG':               14,
+    'RDPMC':                15,
+    'RDTSC':                16,
+    'VMCALL':               18,
+    'VMCLEAR':              19,
+    'VMLAUNCH':             20,
+    'VMPTRLD':              21,
+    'VMPTRST':              22,
+    'VMREAD':               23,
+    'VMRESUME':             24,
+    'VMWRITE':              25,
+    'VMOFF':                26,
+    'VMON':                 27,
+    'CR_ACCESS':            28,
+    'DR_ACCESS':            29,
+    'IO_INSTRUCTION':       30,
+    'MSR_READ':             31,
+    'MSR_WRITE':            32,
+    'INVALID_STATE':        33,
+    'MWAIT_INSTRUCTION':    36,
+    'MONITOR_INSTRUCTION':  39,
+    'PAUSE_INSTRUCTION':    40,
+    'MCE_DURING_VMENTRY':   41,
+    'TPR_BELOW_THRESHOLD':  43,
+    'APIC_ACCESS':          44,
+    'EPT_VIOLATION':        48,
+    'EPT_MISCONFIG':        49,
+    'WBINVD':               54,
+    'XSETBV':               55,
+    'APIC_WRITE':           56,
+    'INVPCID':              58,
+}
+
+SVM_EXIT_REASONS = {
+    'READ_CR0':       0x000,
+    'READ_CR3':       0x003,
+    'READ_CR4':       0x004,
+    'READ_CR8':       0x008,
+    'WRITE_CR0':      0x010,
+    'WRITE_CR3':      0x013,
+    'WRITE_CR4':      0x014,
+    'WRITE_CR8':      0x018,
+    'READ_DR0':       0x020,
+    'READ_DR1':       0x021,
+    'READ_DR2':       0x022,
+    'READ_DR3':       0x023,
+    'READ_DR4':       0x024,
+    'READ_DR5':       0x025,
+    'READ_DR6':       0x026,
+    'READ_DR7':       0x027,
+    'WRITE_DR0':      0x030,
+    'WRITE_DR1':      0x031,
+    'WRITE_DR2':      0x032,
+    'WRITE_DR3':      0x033,
+    'WRITE_DR4':      0x034,
+    'WRITE_DR5':      0x035,
+    'WRITE_DR6':      0x036,
+    'WRITE_DR7':      0x037,
+    'EXCP_BASE':      0x040,
+    'INTR':           0x060,
+    'NMI':            0x061,
+    'SMI':            0x062,
+    'INIT':           0x063,
+    'VINTR':          0x064,
+    'CR0_SEL_WRITE':  0x065,
+    'IDTR_READ':      0x066,
+    'GDTR_READ':      0x067,
+    'LDTR_READ':      0x068,
+    'TR_READ':        0x069,
+    'IDTR_WRITE':     0x06a,
+    'GDTR_WRITE':     0x06b,
+    'LDTR_WRITE':     0x06c,
+    'TR_WRITE':       0x06d,
+    'RDTSC':          0x06e,
+    'RDPMC':          0x06f,
+    'PUSHF':          0x070,
+    'POPF':           0x071,
+    'CPUID':          0x072,
+    'RSM':            0x073,
+    'IRET':           0x074,
+    'SWINT':          0x075,
+    'INVD':           0x076,
+    'PAUSE':          0x077,
+    'HLT':            0x078,
+    'INVLPG':         0x079,
+    'INVLPGA':        0x07a,
+    'IOIO':           0x07b,
+    'MSR':            0x07c,
+    'TASK_SWITCH':    0x07d,
+    'FERR_FREEZE':    0x07e,
+    'SHUTDOWN':       0x07f,
+    'VMRUN':          0x080,
+    'VMMCALL':        0x081,
+    'VMLOAD':         0x082,
+    'VMSAVE':         0x083,
+    'STGI':           0x084,
+    'CLGI':           0x085,
+    'SKINIT':         0x086,
+    'RDTSCP':         0x087,
+    'ICEBP':          0x088,
+    'WBINVD':         0x089,
+    'MONITOR':        0x08a,
+    'MWAIT':          0x08b,
+    'MWAIT_COND':     0x08c,
+    'XSETBV':         0x08d,
+    'NPF':            0x400,
+}
+
+# EC definition of HSR (from arch/arm64/include/asm/kvm_arm.h)
+AARCH64_EXIT_REASONS = {
+    'UNKNOWN':      0x00,
+    'WFI':          0x01,
+    'CP15_32':      0x03,
+    'CP15_64':      0x04,
+    'CP14_MR':      0x05,
+    'CP14_LS':      0x06,
+    'FP_ASIMD':     0x07,
+    'CP10_ID':      0x08,
+    'CP14_64':      0x0C,
+    'ILL_ISS':      0x0E,
+    'SVC32':        0x11,
+    'HVC32':        0x12,
+    'SMC32':        0x13,
+    'SVC64':        0x15,
+    'HVC64':        0x16,
+    'SMC64':        0x17,
+    'SYS64':        0x18,
+    'IABT':         0x20,
+    'IABT_HYP':     0x21,
+    'PC_ALIGN':     0x22,
+    'DABT':         0x24,
+    'DABT_HYP':     0x25,
+    'SP_ALIGN':     0x26,
+    'FP_EXC32':     0x28,
+    'FP_EXC64':     0x2C,
+    'SERROR':       0x2F,
+    'BREAKPT':      0x30,
+    'BREAKPT_HYP':  0x31,
+    'SOFTSTP':      0x32,
+    'SOFTSTP_HYP':  0x33,
+    'WATCHPT':      0x34,
+    'WATCHPT_HYP':  0x35,
+    'BKPT32':       0x38,
+    'VECTOR32':     0x3A,
+    'BRK64':        0x3C,
+}
+
+# From include/uapi/linux/kvm.h, KVM_EXIT_xxx
+USERSPACE_EXIT_REASONS = {
+    'UNKNOWN':          0,
+    'EXCEPTION':        1,
+    'IO':               2,
+    'HYPERCALL':        3,
+    'DEBUG':            4,
+    'HLT':              5,
+    'MMIO':             6,
+    'IRQ_WINDOW_OPEN':  7,
+    'SHUTDOWN':         8,
+    'FAIL_ENTRY':       9,
+    'INTR':             10,
+    'SET_TPR':          11,
+    'TPR_ACCESS':       12,
+    'S390_SIEIC':       13,
+    'S390_RESET':       14,
+    'DCR':              15,
+    'NMI':              16,
+    'INTERNAL_ERROR':   17,
+    'OSI':              18,
+    'PAPR_HCALL':       19,
+    'S390_UCONTROL':    20,
+    'WATCHDOG':         21,
+    'S390_TSCH':        22,
+    'EPR':              23,
+    'SYSTEM_EVENT':     24,
+}
+
+IOCTL_NUMBERS = {
+    'SET_FILTER':  0x40082406,
+    'ENABLE':      0x00002400,
+    'DISABLE':     0x00002401,
+    'RESET':       0x00002403,
+}
+
+class Arch(object):
+    """Encapsulates global architecture specific data.
+
+    Contains the performance event open syscall and ioctl numbers, as
+    well as the VM exit reasons for the architecture it runs on.
+
+    """
+    @staticmethod
+    def get_arch():
+        machine = os.uname()[4]
+
+        if machine.startswith('ppc'):
+            return ArchPPC()
+        elif machine.startswith('aarch64'):
+            return ArchA64()
+        elif machine.startswith('s390'):
+            return ArchS390()
+        else:
+            # X86_64
+            for line in open('/proc/cpuinfo'):
+                if not line.startswith('flags'):
+                    continue
+
+                flags = line.split()
+                if 'vmx' in flags:
+                    return ArchX86(VMX_EXIT_REASONS)
+                if 'svm' in flags:
+                    return ArchX86(SVM_EXIT_REASONS)
+                return
+
+class ArchX86(Arch):
+    def __init__(self, exit_reasons):
+        self.sc_perf_evt_open = 298
+        self.ioctl_numbers = IOCTL_NUMBERS
+        self.exit_reasons = exit_reasons
+
+class ArchPPC(Arch):
+    def __init__(self):
+        self.sc_perf_evt_open = 319
+        self.ioctl_numbers = IOCTL_NUMBERS
+        self.ioctl_numbers['ENABLE'] = 0x20002400
+        self.ioctl_numbers['DISABLE'] = 0x20002401
+        self.ioctl_numbers['RESET'] = 0x20002403
+
+        # PPC comes in 32 and 64 bit and some generated ioctl
+        # numbers depend on the wordsize.
+        char_ptr_size = ctypes.sizeof(ctypes.c_char_p)
+        self.ioctl_numbers['SET_FILTER'] = 0x80002406 | char_ptr_size << 16
+        self.exit_reasons = {}
+
+class ArchA64(Arch):
+    def __init__(self):
+        self.sc_perf_evt_open = 241
+        self.ioctl_numbers = IOCTL_NUMBERS
+        self.exit_reasons = AARCH64_EXIT_REASONS
+
+class ArchS390(Arch):
+    def __init__(self):
+        self.sc_perf_evt_open = 331
+        self.ioctl_numbers = IOCTL_NUMBERS
+        self.exit_reasons = None
+
+ARCH = Arch.get_arch()
+
+
+def walkdir(path):
+    """Returns os.walk() data for specified directory.
+
+    As it is only a wrapper it returns the same 3-tuple of (dirpath,
+    dirnames, filenames).
+    """
+    return next(os.walk(path))
+
+
+def parse_int_list(list_string):
+    """Returns an int list from a string of comma separated integers and
+    integer ranges."""
+    integers = []
+    members = list_string.split(',')
+
+    for member in members:
+        if '-' not in member:
+            integers.append(int(member))
+        else:
+            int_range = member.split('-')
+            integers.extend(range(int(int_range[0]),
+                                  int(int_range[1]) + 1))
+
+    return integers
+
+
+def get_online_cpus():
+    """Returns a list of cpu id integers."""
+    with open('/sys/devices/system/cpu/online') as cpu_list:
+        cpu_string = cpu_list.readline()
+        return parse_int_list(cpu_string)
+
+
+def get_filters():
+    """Returns a dict of trace events, their filter ids and
+    the values that can be filtered.
+
+    Trace events can be filtered for special values by setting a
+    filter string via an ioctl. The string normally has the format
+    identifier==value. For each filter a new event will be created so
+    that the events can be distinguished.
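+    (For example, 'exit_reason==32' singles out MSR_WRITE vm exits on VMX.)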
+
+    """
+    filters = {}
+    filters['kvm_userspace_exit'] = ('reason', USERSPACE_EXIT_REASONS)
+    if ARCH.exit_reasons:
+        filters['kvm_exit'] = ('exit_reason', ARCH.exit_reasons)
+    return filters
+
+libc = ctypes.CDLL('libc.so.6', use_errno=True)
+syscall = libc.syscall
+
+class perf_event_attr(ctypes.Structure):
+    """Struct that holds the necessary data to set up a trace event.
+
+    For an extensive explanation see perf_event_open(2) and
+    include/uapi/linux/perf_event.h, struct perf_event_attr
+
+    All fields that are not initialized in the constructor are 0.
+
+    """
+    _fields_ = [('type', ctypes.c_uint32),
+                ('size', ctypes.c_uint32),
+                ('config', ctypes.c_uint64),
+                ('sample_freq', ctypes.c_uint64),
+                ('sample_type', ctypes.c_uint64),
+                ('read_format', ctypes.c_uint64),
+                ('flags', ctypes.c_uint64),
+                ('wakeup_events', ctypes.c_uint32),
+                ('bp_type', ctypes.c_uint32),
+                ('bp_addr', ctypes.c_uint64),
+                ('bp_len', ctypes.c_uint64),
+                ]
+
+    def __init__(self):
+        super(self.__class__, self).__init__()
+        self.type = PERF_TYPE_TRACEPOINT
+        self.size = ctypes.sizeof(self)
+        self.read_format = PERF_FORMAT_GROUP
+
+def perf_event_open(attr, pid, cpu, group_fd, flags):
+    """Wrapper for the sys_perf_evt_open() syscall.
+
+    Used to set up performance events, returns a file descriptor or -1
+    on error.
+
+    Attributes are:
+    - syscall number
+    - struct perf_event_attr *
+    - pid or -1 to monitor all pids
+    - cpu number or -1 to monitor all cpus
+    - The file descriptor of the group leader or -1 to create a group.
+    - flags
+
+    """
+    return syscall(ARCH.sc_perf_evt_open, ctypes.pointer(attr),
+                   ctypes.c_int(pid), ctypes.c_int(cpu),
+                   ctypes.c_int(group_fd), ctypes.c_long(flags))
+
+PERF_TYPE_TRACEPOINT = 2
+PERF_FORMAT_GROUP = 1 << 3
+
+PATH_DEBUGFS_TRACING = '/sys/kernel/debug/tracing'
+PATH_DEBUGFS_KVM = '/sys/kernel/debug/kvm'
+
+class Group(object):
+    """Represents a perf event group."""
+
+    def __init__(self):
+        self.events = []
+
+    def add_event(self, event):
+        self.events.append(event)
+
+    def read(self):
+        """Returns a dict with 'event name: value' for all events in the
+        group.
+
+        Values are read by reading from the file descriptor of the
+        event that is the group leader. See perf_event_open(2) for
+        details.
+
+        Read format for the used event configuration is:
+        struct read_format {
+            u64 nr; /* The number of events */
+            struct {
+                u64 value; /* The value of the event */
+            } values[nr];
+        };
+
+        """
+        length = 8 * (1 + len(self.events))
+        read_format = 'xxxxxxxx' + 'Q' * len(self.events)
+        return dict(zip([event.name for event in self.events],
+                        struct.unpack(read_format,
+                                      os.read(self.events[0].fd, length))))
+
+class Event(object):
+    """Represents a performance event and manages its life cycle."""
+    def __init__(self, name, group, trace_cpu, trace_pid, trace_point,
+                 trace_filter, trace_set='kvm'):
+        self.name = name
+        self.fd = None
+        self.setup_event(group, trace_cpu, trace_pid, trace_point,
+                         trace_filter, trace_set)
+
+    def __del__(self):
+        """Closes the event's file descriptor.
+
+        As no python file object was created for the file descriptor,
+        python will not reference count the descriptor and will not
+        close it itself automatically, so we do it.
+
+        """
+        if self.fd:
+            os.close(self.fd)
+
+    def setup_event_attribute(self, trace_set, trace_point):
+        """Returns an initialized ctype perf_event_attr struct."""
+
+        id_path = os.path.join(PATH_DEBUGFS_TRACING, 'events', trace_set,
+                               trace_point, 'id')
+
+        event_attr = perf_event_attr()
+        event_attr.config = int(open(id_path).read())
+        return event_attr
+
+    def setup_event(self, group, trace_cpu, trace_pid, trace_point,
+                    trace_filter, trace_set):
+        """Sets up the perf event in Linux.
+
+        Issues the syscall to register the event in the kernel and
+        then sets the optional filter.
+
+        """
+
+        event_attr = self.setup_event_attribute(trace_set, trace_point)
+
+        # First event will be group leader.
+        group_leader = -1
+
+        # All others have to pass the leader's descriptor instead.
+        if group.events:
+            group_leader = group.events[0].fd
+
+        fd = perf_event_open(event_attr, trace_pid,
+                             trace_cpu, group_leader, 0)
+        if fd == -1:
+            err = ctypes.get_errno()
+            raise OSError(err, os.strerror(err),
+                          'while calling sys_perf_event_open().')
+
+        if trace_filter:
+            fcntl.ioctl(fd, ARCH.ioctl_numbers['SET_FILTER'],
+                        trace_filter)
+
+        self.fd = fd
+
+    def enable(self):
+        """Enables the trace event in the kernel.
+
+        Enabling the group leader makes reading counters from it and the
+        events under it possible.
+
+        """
+        fcntl.ioctl(self.fd, ARCH.ioctl_numbers['ENABLE'], 0)
+
+    def disable(self):
+        """Disables the trace event in the kernel.
+
+        Disabling the group leader makes reading all counters under it
+        impossible.
+
+        """
+        fcntl.ioctl(self.fd, ARCH.ioctl_numbers['DISABLE'], 0)
+
+    def reset(self):
+        """Resets the count of the trace event in the kernel."""
+        fcntl.ioctl(self.fd, ARCH.ioctl_numbers['RESET'], 0)
+
+class TracepointProvider(object):
+    """Data provider for the stats class.
+
+    Manages the events/groups from which it acquires its data.
+
+    """
+    def __init__(self):
+        self.group_leaders = []
+        self.filters = get_filters()
+        self._fields = self.get_available_fields()
+        self._pid = 0
+
+    def get_available_fields(self):
+        """Returns a list of available event's of format 'event name(filter
+        name)'.
+
+        All available events have directories under
+        /sys/kernel/debug/tracing/events/ which export information
+        about the specific event. Therefore, listing the dirs gives us
+        a list of all available events.
+
+        Some events like the vm exit reasons can be filtered for
+        specific values. To account for that, the routine below
+        creates special fields with the following format:
+        event name(filter name)
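+        (e.g. 'kvm_exit(EPT_VIOLATION)' or 'kvm_exit(HLT)' on a VMX host)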
+
+        """
+        path = os.path.join(PATH_DEBUGFS_TRACING, 'events', 'kvm')
+        fields = walkdir(path)[1]
+        extra = []
+        for field in fields:
+            if field in self.filters:
+                filter_name_, filter_dicts = self.filters[field]
+                for name in filter_dicts:
+                    extra.append(field + '(' + name + ')')
+        fields += extra
+        return fields
+
+    def setup_traces(self):
+        """Creates all event and group objects needed to be able to retrieve
+        data."""
+        if self._pid > 0:
+            # Fetch list of all threads of the monitored pid, as qemu
+            # starts a thread for each vcpu.
+            path = os.path.join('/proc', str(self._pid), 'task')
+            groupids = walkdir(path)[1]
+        else:
+            groupids = get_online_cpus()
+
+        # The constant is needed as a buffer for python libs, std
+        # streams and other files that the script opens.
+        newlim = len(groupids) * len(self._fields) + 50
+        try:
+            softlim_, hardlim = resource.getrlimit(resource.RLIMIT_NOFILE)
+
+            if hardlim < newlim:
+                # Now we need CAP_SYS_RESOURCE, to increase the hard limit.
+                resource.setrlimit(resource.RLIMIT_NOFILE, (newlim, newlim))
+            else:
+                # Raising the soft limit is sufficient.
+                resource.setrlimit(resource.RLIMIT_NOFILE, (newlim, hardlim))
+
+        except ValueError:
+            sys.exit("NOFILE rlimit could not be raised to {0}".format(newlim))
+
+        for groupid in groupids:
+            group = Group()
+            for name in self._fields:
+                tracepoint = name
+                tracefilter = None
+                match = re.match(r'(.*)\((.*)\)', name)
+                if match:
+                    tracepoint, sub = match.groups()
+                    tracefilter = ('%s==%d\0' %
+                                   (self.filters[tracepoint][0],
+                                    self.filters[tracepoint][1][sub]))
+
+                # From perf_event_open(2):
+                # pid > 0 and cpu == -1
+                # This measures the specified process/thread on any CPU.
+                #
+                # pid == -1 and cpu >= 0
+                # This measures all processes/threads on the specified CPU.
+                trace_cpu = groupid if self._pid == 0 else -1
+                trace_pid = int(groupid) if self._pid != 0 else -1
+
+                group.add_event(Event(name=name,
+                                      group=group,
+                                      trace_cpu=trace_cpu,
+                                      trace_pid=trace_pid,
+                                      trace_point=tracepoint,
+                                      trace_filter=tracefilter))
+
+            self.group_leaders.append(group)
+
+    def available_fields(self):
+        return self.get_available_fields()
+
+    @property
+    def fields(self):
+        return self._fields
+
+    @fields.setter
+    def fields(self, fields):
+        """Enables/disables the (un)wanted events"""
+        self._fields = fields
+        for group in self.group_leaders:
+            for index, event in enumerate(group.events):
+                if event.name in fields:
+                    event.reset()
+                    event.enable()
+                else:
+                    # Do not disable the group leader.
+                    # It would disable all of its events.
+                    if index != 0:
+                        event.disable()
+
+    @property
+    def pid(self):
+        return self._pid
+
+    @pid.setter
+    def pid(self, pid):
+        """Changes the monitored pid by setting new traces."""
+        self._pid = pid
+        # The garbage collector will get rid of all Event/Group
+        # objects and open files after removing the references.
+        self.group_leaders = []
+        self.setup_traces()
+        self.fields = self._fields
+
+    def read(self):
+        """Returns 'event name: current value' for all enabled events."""
+        ret = defaultdict(int)
+        for group in self.group_leaders:
+            for name, val in group.read().iteritems():
+                if name in self._fields:
+                    ret[name] += val
+        return ret
+
+class DebugfsProvider(object):
+    """Provides data from the files that KVM creates in the kvm debugfs
+    folder."""
+    def __init__(self):
+        self._fields = self.get_available_fields()
+        self._pid = 0
+        self.do_read = True
+
+    def get_available_fields(self):
+        """"Returns a list of available fields.
+
+        The fields are all available KVM debugfs files
+
+        """
+        return walkdir(PATH_DEBUGFS_KVM)[2]
+
+    @property
+    def fields(self):
+        return self._fields
+
+    @fields.setter
+    def fields(self, fields):
+        self._fields = fields
+
+    @property
+    def pid(self):
+        return self._pid
+
+    @pid.setter
+    def pid(self, pid):
+        if pid != 0:
+            self._pid = pid
+
+            vms = walkdir(PATH_DEBUGFS_KVM)[1]
+            if len(vms) == 0:
+                self.do_read = False
+
+            self.paths = filter(lambda x: "{}-".format(pid) in x, vms)
+
+        else:
+            self.paths = ['']
+            self.do_read = True
+
+    def read(self):
+        """Returns a dict with format:'file name / field -> current value'."""
+        results = {}
+
+        # If no debugfs filtering support is available, then don't read.
+        if not self.do_read:
+            return results
+
+        for path in self.paths:
+            for field in self._fields:
+                results[field] = results.get(field, 0) \
+                                 + self.read_field(field, path)
+
+        return results
+
+    def read_field(self, field, path):
+        """Returns the value of a single field from a specific VM."""
+        try:
+            return int(open(os.path.join(PATH_DEBUGFS_KVM,
+                                         path,
+                                         field))
+                       .read())
+        except IOError:
+            return 0
+
+class Stats(object):
+    """Manages the data providers and the data they provide.
+
+    It is used to set filters on the provider's data and collect all
+    provider data.
+
+    """
+    def __init__(self, providers, pid, fields=None):
+        self.providers = providers
+        self._pid_filter = pid
+        self._fields_filter = fields
+        self.values = {}
+        self.update_provider_pid()
+        self.update_provider_filters()
+
+    def update_provider_filters(self):
+        """Propagates fields filters to providers."""
+        def wanted(key):
+            if not self._fields_filter:
+                return True
+            return re.match(self._fields_filter, key) is not None
+
+        # As we reset the counters when updating the fields we can
+        # also clear the cache of old values.
+        self.values = {}
+        for provider in self.providers:
+            provider_fields = [key for key in provider.get_available_fields()
+                               if wanted(key)]
+            provider.fields = provider_fields
+
+    def update_provider_pid(self):
+        """Propagates pid filters to providers."""
+        for provider in self.providers:
+            provider.pid = self._pid_filter
+
+    @property
+    def fields_filter(self):
+        return self._fields_filter
+
+    @fields_filter.setter
+    def fields_filter(self, fields_filter):
+        self._fields_filter = fields_filter
+        self.update_provider_filters()
+
+    @property
+    def pid_filter(self):
+        return self._pid_filter
+
+    @pid_filter.setter
+    def pid_filter(self, pid):
+        self._pid_filter = pid
+        self.values = {}
+        self.update_provider_pid()
+
+    def get(self):
+        """Returns a dict with field -> (value, delta to last value) of all
+        provider data."""
+        for provider in self.providers:
+            new = provider.read()
+            for key in provider.fields:
+                oldval = self.values.get(key, (0, 0))
+                newval = new.get(key, 0)
+                newdelta = None
+                if oldval is not None:
+                    newdelta = newval - oldval[0]
+                self.values[key] = (newval, newdelta)
+        return self.values
+
+LABEL_WIDTH = 40
+NUMBER_WIDTH = 10
+
+class Tui(object):
+    """Instruments curses to draw a nice text ui."""
+    def __init__(self, stats):
+        self.stats = stats
+        self.screen = None
+        self.drilldown = False
+        self.update_drilldown()
+
+    def __enter__(self):
+        """Initialises curses for later use.  Based on curses.wrapper
+           implementation from the Python standard library."""
+        self.screen = curses.initscr()
+        curses.noecho()
+        curses.cbreak()
+
+        # The try/catch works around a minor bit of
+        # over-conscientiousness in the curses module; the error
+        # return from C start_color() is ignorable.
+        try:
+            curses.start_color()
+        except:
+            pass
+
+        curses.use_default_colors()
+        return self
+
+    def __exit__(self, *exception):
+        """Resets the terminal to its normal state.  Based on curses.wrappre
+           implementation from the Python standard library."""
+        if self.screen:
+            self.screen.keypad(0)
+            curses.echo()
+            curses.nocbreak()
+            curses.endwin()
+
+    def update_drilldown(self):
+        """Sets or removes a filter that only allows fields without braces."""
+        if not self.stats.fields_filter:
+            self.stats.fields_filter = r'^[^\(]*$'
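+            # e.g. 'kvm_exit' stays visible while 'kvm_exit(HLT)' is hidden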
+
+        elif self.stats.fields_filter == r'^[^\(]*$':
+            self.stats.fields_filter = None
+
+    def update_pid(self, pid):
+        """Propagates pid selection to stats object."""
+        self.stats.pid_filter = pid
+
+    def refresh(self, sleeptime):
+        """Refreshes on-screen data."""
+        self.screen.erase()
+        if self.stats.pid_filter > 0:
+            self.screen.addstr(0, 0, 'kvm statistics - pid {0}'
+                               .format(self.stats.pid_filter),
+                               curses.A_BOLD)
+        else:
+            self.screen.addstr(0, 0, 'kvm statistics - summary', curses.A_BOLD)
+        self.screen.addstr(2, 1, 'Event')
+        self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH -
+                           len('Total'), 'Total')
+        self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH + 8 -
+                           len('Current'), 'Current')
+        row = 3
+        stats = self.stats.get()
+        def sortkey(x):
+            if stats[x][1]:
+                return (-stats[x][1], -stats[x][0])
+            else:
+                return (0, -stats[x][0])
+        for key in sorted(stats.keys(), key=sortkey):
+
+            if row >= self.screen.getmaxyx()[0]:
+                break
+            values = stats[key]
+            if not values[0] and not values[1]:
+                break
+            col = 1
+            self.screen.addstr(row, col, key)
+            col += LABEL_WIDTH
+            self.screen.addstr(row, col, '%10d' % (values[0],))
+            col += NUMBER_WIDTH
+            if values[1] is not None:
+                self.screen.addstr(row, col, '%8d' % (values[1] / sleeptime,))
+            row += 1
+        self.screen.refresh()
+
+    def show_filter_selection(self):
+        """Draws filter selection mask.
+
+        Asks for a valid regex and sets the fields filter accordingly.
+
+        """
+        while True:
+            self.screen.erase()
+            self.screen.addstr(0, 0,
+                               "Show statistics for events matching a regex.",
+                               curses.A_BOLD)
+            self.screen.addstr(2, 0,
+                               "Current regex: {0}"
+                               .format(self.stats.fields_filter))
+            self.screen.addstr(3, 0, "New regex: ")
+            curses.echo()
+            regex = self.screen.getstr()
+            curses.noecho()
+            if len(regex) == 0:
+                return
+            try:
+                re.compile(regex)
+                self.stats.fields_filter = regex
+                return
+            except re.error:
+                continue
+
+    def show_vm_selection(self):
+        """Draws PID selection mask.
+
+        Asks for a pid until a valid pid or 0 has been entered.
+
+        """
+        while True:
+            self.screen.erase()
+            self.screen.addstr(0, 0,
+                               'Show statistics for specific pid.',
+                               curses.A_BOLD)
+            self.screen.addstr(1, 0,
+                               'This might limit the shown data to the trace '
+                               'statistics.')
+
+            curses.echo()
+            self.screen.addstr(3, 0, "Pid [0 or pid]: ")
+            pid = self.screen.getstr()
+            curses.noecho()
+
+            try:
+                pid = int(pid)
+
+                if pid == 0:
+                    self.update_pid(pid)
+                    break
+                else:
+                    if not os.path.isdir(os.path.join('/proc/', str(pid))):
+                        continue
+                    else:
+                        self.update_pid(pid)
+                        break
+
+            except ValueError:
+                continue
+
+    def show_stats(self):
+        """Refreshes the screen and processes user input."""
+        sleeptime = 0.25
+        while True:
+            self.refresh(sleeptime)
+            curses.halfdelay(int(sleeptime * 10))
+            sleeptime = 3
+            try:
+                char = self.screen.getkey()
+                if char == 'x':
+                    self.drilldown = not self.drilldown
+                    self.update_drilldown()
+                if char == 'q':
+                    break
+                if char == 'f':
+                    self.show_filter_selection()
+                if char == 'p':
+                    self.show_vm_selection()
+            except KeyboardInterrupt:
+                break
+            except curses.error:
+                continue
+
+def batch(stats):
+    """Prints statistics in a key, value format."""
+    s = stats.get()
+    time.sleep(1)
+    s = stats.get()
+    for key in sorted(s.keys()):
+        values = s[key]
+        print '%-42s%10d%10d' % (key, values[0], values[1])
+
+def log(stats):
+    """Prints statistics as reiterating key block, multiple value blocks."""
+    keys = sorted(stats.get().iterkeys())
+    def banner():
+        for k in keys:
+            print '%s' % k,
+        print
+    def statline():
+        s = stats.get()
+        for k in keys:
+            print ' %9d' % s[k][1],
+        print
+    line = 0
+    banner_repeat = 20
+    while True:
+        time.sleep(1)
+        if line % banner_repeat == 0:
+            banner()
+        statline()
+        line += 1
+
+def get_options():
+    """Returns processed program arguments."""
+    description_text = """
+This script displays various statistics about VMs running under KVM.
+The statistics are gathered from the KVM debugfs entries and / or the
+currently available perf traces.
+
+The monitoring takes additional cpu cycles and might affect the VM's
+performance.
+
+Requirements:
+- Access to:
+    /sys/kernel/debug/kvm
+    /sys/kernel/debug/trace/events/*
+    /proc/pid/task
+- /proc/sys/kernel/perf_event_paranoid < 1 if user has no
+  CAP_SYS_ADMIN and perf events are used.
+- CAP_SYS_RESOURCE if the hard limit is not high enough to allow
+  the large number of files that are possibly opened.
+"""
+
+    class PlainHelpFormatter(optparse.IndentedHelpFormatter):
+        def format_description(self, description):
+            if description:
+                return description + "\n"
+            else:
+                return ""
+
+    optparser = optparse.OptionParser(description=description_text,
+                                      formatter=PlainHelpFormatter())
+    optparser.add_option('-1', '--once', '--batch',
+                         action='store_true',
+                         default=False,
+                         dest='once',
+                         help='run in batch mode for one second',
+                         )
+    optparser.add_option('-l', '--log',
+                         action='store_true',
+                         default=False,
+                         dest='log',
+                         help='run in logging mode (like vmstat)',
+                         )
+    optparser.add_option('-t', '--tracepoints',
+                         action='store_true',
+                         default=False,
+                         dest='tracepoints',
+                         help='retrieve statistics from tracepoints',
+                         )
+    optparser.add_option('-d', '--debugfs',
+                         action='store_true',
+                         default=False,
+                         dest='debugfs',
+                         help='retrieve statistics from debugfs',
+                         )
+    optparser.add_option('-f', '--fields',
+                         action='store',
+                         default=None,
+                         dest='fields',
+                         help='fields to display (regex)',
+                         )
+    optparser.add_option('-p', '--pid',
+                        action='store',
+                        default=0,
+                        type=int,
+                        dest='pid',
+                        help='restrict statistics to pid',
+                        )
+    (options, _) = optparser.parse_args(sys.argv)
+    return options
+
+def get_providers(options):
+    """Returns a list of data providers depending on the passed options."""
+    providers = []
+
+    if options.tracepoints:
+        providers.append(TracepointProvider())
+    if options.debugfs:
+        providers.append(DebugfsProvider())
+    if len(providers) == 0:
+        providers.append(TracepointProvider())
+
+    return providers
+
+def check_access(options):
+    """Exits if the current user can't access all needed directories."""
+    if not os.path.exists('/sys/kernel/debug'):
+        sys.stderr.write('Please enable CONFIG_DEBUG_FS in your kernel.')
+        sys.exit(1)
+
+    if not os.path.exists(PATH_DEBUGFS_KVM):
+        sys.stderr.write("Please make sure, that debugfs is mounted and "
+                         "readable by the current user:\n"
+                         "('mount -t debugfs debugfs /sys/kernel/debug')\n"
+                         "Also ensure that the kvm modules are loaded.\n")
+        sys.exit(1)
+
+    if not os.path.exists(PATH_DEBUGFS_TRACING) and (options.tracepoints
+                                                     or not options.debugfs):
+        sys.stderr.write("Please enable CONFIG_TRACING in your kernel "
+                         "when using the option -t (default).\n"
+                         "If it is enabled, make {0} readable by the "
+                         "current user.\n"
+                         .format(PATH_DEBUGFS_TRACING))
+        if options.tracepoints:
+            sys.exit(1)
+
+        sys.stderr.write("Falling back to debugfs statistics!\n")
+        options.debugfs = True
+        sleep(5)
+
+    return options
+
+def main():
+    options = get_options()
+    options = check_access(options)
+
+    if (options.pid > 0 and
+        not os.path.isdir(os.path.join('/proc/',
+                                       str(options.pid)))):
+        sys.stderr.write('Did you use an (unsupported) tid instead of a pid?\n')
+        sys.exit('Specified pid does not exist.')
+
+    providers = get_providers(options)
+    stats = Stats(providers, options.pid, fields=options.fields)
+
+    if options.log:
+        log(stats)
+    elif not options.once:
+        with Tui(stats) as tui:
+            tui.show_stats()
+    else:
+        batch(stats)
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/kvm/kvm_stat/kvm_stat.txt b/tools/kvm/kvm_stat/kvm_stat.txt
new file mode 100644 (file)
index 0000000..b92a153
--- /dev/null
@@ -0,0 +1,63 @@
+kvm_stat(1)
+===========
+
+NAME
+----
+kvm_stat - Report KVM kernel module event counters
+
+SYNOPSIS
+--------
+[verse]
+'kvm_stat' [OPTION]...
+
+DESCRIPTION
+-----------
+kvm_stat prints counts of KVM kernel module trace events.  These events signify
+state transitions such as guest mode entry and exit.
+
+This tool is useful for observing guest behavior from the host perspective.
+Often conclusions about performance or buggy behavior can be drawn from the
+output.
+
+The set of KVM kernel module trace events may be specific to the kernel version
+or architecture.  It is best to check the KVM kernel module source code for the
+meaning of events.
+
+OPTIONS
+-------
+-1::
+--once::
+--batch::
+       run in batch mode for one second
+
+-l::
+--log::
+       run in logging mode (like vmstat)
+
+-t::
+--tracepoints::
+       retrieve statistics from tracepoints
+
+-d::
+--debugfs::
+       retrieve statistics from debugfs
+
+-p<pid>::
+--pid=<pid>::
+       limit statistics to one virtual machine (pid)
+
+-f<fields>::
+--fields=<fields>::
+       fields to display (regex)
+
+-h::
+--help::
+       show help message
+
+SEE ALSO
+--------
+'perf'(1), 'trace-cmd'(1)
+
+AUTHOR
+------
+Stefan Hajnoczi <stefanha@redhat.com>
index 6765c7e..f094f3c 100644 (file)
@@ -30,6 +30,10 @@ INCLUDES := -I$(srctree)/tools/include
 CFLAGS   += -Wall -Werror $(EXTRA_WARNINGS) -fomit-frame-pointer -O2 -g $(INCLUDES)
 LDFLAGS  += -lelf $(LIBSUBCMD)
 
+# Allow old libelf to be used:
+elfshdr := $(shell echo '\#include <libelf.h>' | $(CC) $(CFLAGS) -x c -E - | grep elf_getshdr)
+CFLAGS += $(if $(elfshdr),,-DLIBELF_USE_DEPRECATED)
+
 AWK = awk
 export srctree OUTPUT CFLAGS ARCH AWK
 include $(srctree)/tools/build/Makefile.include
index 7f3e00a..aa1ff65 100644 (file)
 #include <linux/list.h>
 #include <linux/hashtable.h>
 
+#ifdef LIBELF_USE_DEPRECATED
+# define elf_getshdrnum    elf_getshnum
+# define elf_getshdrstrndx elf_getshstrndx
+#endif
+
 struct section {
        struct list_head list;
        GElf_Shdr sh;
index ebaf849..9cbddc2 100644 (file)
@@ -103,12 +103,13 @@ OPTIONS
 
        If --branch-stack option is used, following sort keys are also
        available:
-       dso_from, dso_to, symbol_from, symbol_to, mispredict.
 
        - dso_from: name of library or module branched from
        - dso_to: name of library or module branched to
        - symbol_from: name of function branched from
        - symbol_to: name of function branched to
+       - srcline_from: source file and line branched from
+       - srcline_to: source file and line branched to
        - mispredict: "N" for predicted branch, "Y" for mispredicted branch
        - in_tx: branch in TSX transaction
        - abort: TSX transaction abort.
@@ -248,7 +249,7 @@ OPTIONS
        Note that when using the --itrace option the synthesized callchain size
        will override this value if the synthesized callchain size is bigger.
 
-       Default: /proc/sys/kernel/perf_event_max_stack when present, 127 otherwise.
+       Default: 127
 
 -G::
 --inverted::
index a856a10..4fc44c7 100644 (file)
@@ -267,7 +267,7 @@ include::itrace.txt[]
         Note that when using the --itrace option the synthesized callchain size
         will override this value if the synthesized callchain size is bigger.
 
-        Default: /proc/sys/kernel/perf_event_max_stack when present, 127 otherwise.
+        Default: 127
 
 --ns::
        Use 9 decimal places when displaying time (i.e. show the nanoseconds)
index 6afe201..1ab0782 100644 (file)
@@ -143,7 +143,8 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
         Implies '--call-graph dwarf' when --call-graph not present on the
         command line, on systems where DWARF unwinding was built in.
 
-        Default: /proc/sys/kernel/perf_event_max_stack when present, 127 otherwise.
+        Default: /proc/sys/kernel/perf_event_max_stack when present for
+                 live sessions (without --input/-i), 127 otherwise.
 
 --min-stack::
         Set the stack depth limit when parsing the callchain, anything
index 8141583..25c8173 100644 (file)
@@ -324,8 +324,9 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __maybe_unused)
        OPT_BOOLEAN(0, "skip-missing", &annotate.skip_missing,
                    "Skip symbols that cannot be annotated"),
        OPT_STRING('C', "cpu", &annotate.cpu_list, "cpu", "list of cpus to profile"),
-       OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
-                  "Look for files with symbols relative to this directory"),
+       OPT_CALLBACK(0, "symfs", NULL, "directory",
+                    "Look for files with symbols relative to this directory",
+                    symbol__config_symfs),
        OPT_BOOLEAN(0, "source", &symbol_conf.annotate_src,
                    "Interleave source code with assembly code (default)"),
        OPT_BOOLEAN(0, "asm-raw", &symbol_conf.annotate_asm_raw,
index 632efc6..d75bded 100644 (file)
@@ -119,8 +119,8 @@ static int build_id_cache__add_kcore(const char *filename, bool force)
        if (build_id_cache__kcore_buildid(from_dir, sbuildid) < 0)
                return -1;
 
-       scnprintf(to_dir, sizeof(to_dir), "%s/[kernel.kcore]/%s",
-                 buildid_dir, sbuildid);
+       scnprintf(to_dir, sizeof(to_dir), "%s/%s/%s",
+                 buildid_dir, DSO__NAME_KCORE, sbuildid);
 
        if (!force &&
            !build_id_cache__kcore_existing(from_dir, to_dir, sizeof(to_dir))) {
@@ -131,8 +131,8 @@ static int build_id_cache__add_kcore(const char *filename, bool force)
        if (build_id_cache__kcore_dir(dir, sizeof(dir)))
                return -1;
 
-       scnprintf(to_dir, sizeof(to_dir), "%s/[kernel.kcore]/%s/%s",
-                 buildid_dir, sbuildid, dir);
+       scnprintf(to_dir, sizeof(to_dir), "%s/%s/%s/%s",
+                 buildid_dir, DSO__NAME_KCORE, sbuildid, dir);
 
        if (mkdir_p(to_dir, 0755))
                return -1;
index 9ce354f..f7645a4 100644 (file)
@@ -812,8 +812,9 @@ static const struct option options[] = {
        OPT_STRING_NOEMPTY('t', "field-separator", &symbol_conf.field_sep, "separator",
                   "separator for columns, no spaces will be added between "
                   "columns '.' is reserved."),
-       OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
-                   "Look for files with symbols relative to this directory"),
+       OPT_CALLBACK(0, "symfs", NULL, "directory",
+                    "Look for files with symbols relative to this directory",
+                    symbol__config_symfs),
        OPT_UINTEGER('o', "order", &sort_compute, "Specify compute sorting."),
        OPT_CALLBACK(0, "percentage", NULL, "relative|absolute",
                     "How to display percentage of filtered entries", parse_filter_percentage),
index f3679c4..dc3fcb5 100644 (file)
@@ -40,6 +40,7 @@
 #include <unistd.h>
 #include <sched.h>
 #include <sys/mman.h>
+#include <asm/bug.h>
 
 
 struct record {
@@ -82,27 +83,87 @@ static int process_synthesized_event(struct perf_tool *tool,
        return record__write(rec, event, event->header.size);
 }
 
+static int
+backward_rb_find_range(void *buf, int mask, u64 head, u64 *start, u64 *end)
+{
+       struct perf_event_header *pheader;
+       u64 evt_head = head;
+       int size = mask + 1;
+
+       pr_debug2("backward_rb_find_range: buf=%p, head=%"PRIx64"\n", buf, head);
+       pheader = (struct perf_event_header *)(buf + (head & mask));
+       *start = head;
+       while (true) {
+               if (evt_head - head >= (unsigned int)size) {
+                       pr_debug("Finished reading backward ring buffer: rewind\n");
+                       if (evt_head - head > (unsigned int)size)
+                               evt_head -= pheader->size;
+                       *end = evt_head;
+                       return 0;
+               }
+
+               pheader = (struct perf_event_header *)(buf + (evt_head & mask));
+
+               if (pheader->size == 0) {
+                       pr_debug("Finished reading backward ring buffer: get start\n");
+                       *end = evt_head;
+                       return 0;
+               }
+
+               evt_head += pheader->size;
+               pr_debug3("move evt_head: %"PRIx64"\n", evt_head);
+       }
+       WARN_ONCE(1, "Shouldn't get here\n");
+       return -1;
+}
+
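+/*
+ * Note (illustrative, not part of the original change): rb_find_range()
+ * yields the half-open range [start, end) that record__mmap_read() copies
+ * out.  For a forward ring buffer this is simply [old, head); for a
+ * backward ring buffer the records are walked from 'head' via their header
+ * sizes until a zero-sized header or a full wrap is seen.
+ */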
+static int
+rb_find_range(struct perf_evlist *evlist,
+             void *data, int mask, u64 head, u64 old,
+             u64 *start, u64 *end)
+{
+       if (!evlist->backward) {
+               *start = old;
+               *end = head;
+               return 0;
+       }
+
+       return backward_rb_find_range(data, mask, head, start, end);
+}
+
 static int record__mmap_read(struct record *rec, int idx)
 {
        struct perf_mmap *md = &rec->evlist->mmap[idx];
        u64 head = perf_mmap__read_head(md);
        u64 old = md->prev;
+       u64 end = head, start = old;
        unsigned char *data = md->base + page_size;
        unsigned long size;
        void *buf;
        int rc = 0;
 
-       if (old == head)
+       if (rb_find_range(rec->evlist, data, md->mask, head,
+                         old, &start, &end))
+               return -1;
+
+       if (start == end)
                return 0;
 
        rec->samples++;
 
-       size = head - old;
+       size = end - start;
+       if (size > (unsigned long)(md->mask) + 1) {
+               WARN_ONCE(1, "failed to keep up with mmap data. (warn only once)\n");
+
+               md->prev = head;
+               perf_evlist__mmap_consume(rec->evlist, idx);
+               return 0;
+       }
 
-       if ((old & md->mask) + size != (head & md->mask)) {
-               buf = &data[old & md->mask];
-               size = md->mask + 1 - (old & md->mask);
-               old += size;
+       if ((start & md->mask) + size != (end & md->mask)) {
+               buf = &data[start & md->mask];
+               size = md->mask + 1 - (start & md->mask);
+               start += size;
 
                if (record__write(rec, buf, size) < 0) {
                        rc = -1;
@@ -110,16 +171,16 @@ static int record__mmap_read(struct record *rec, int idx)
                }
        }
 
-       buf = &data[old & md->mask];
-       size = head - old;
-       old += size;
+       buf = &data[start & md->mask];
+       size = end - start;
+       start += size;
 
        if (record__write(rec, buf, size) < 0) {
                rc = -1;
                goto out;
        }
 
-       md->prev = old;
+       md->prev = head;
        perf_evlist__mmap_consume(rec->evlist, idx);
 out:
        return rc;
index 87d40e3..a87cb33 100644 (file)
@@ -691,7 +691,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
                        .ordered_events  = true,
                        .ordering_requires_timestamps = true,
                },
-               .max_stack               = sysctl_perf_event_max_stack,
+               .max_stack               = PERF_MAX_STACK_DEPTH,
                .pretty_printing_style   = "normal",
                .socket_filter           = -1,
        };
@@ -770,8 +770,9 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
                   "columns '.' is reserved."),
        OPT_BOOLEAN('U', "hide-unresolved", &symbol_conf.hide_unresolved,
                    "Only display entries resolved to a symbol"),
-       OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
-                   "Look for files with symbols relative to this directory"),
+       OPT_CALLBACK(0, "symfs", NULL, "directory",
+                    "Look for files with symbols relative to this directory",
+                    symbol__config_symfs),
        OPT_STRING('C', "cpu", &report.cpu_list, "cpu",
                   "list of cpus to profile"),
        OPT_BOOLEAN('I', "show-info", &report.show_full_info,
index efca816..e3ce2f3 100644 (file)
@@ -2010,8 +2010,9 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
                   "file", "kallsyms pathname"),
        OPT_BOOLEAN('G', "hide-call-graph", &no_callchain,
                    "When printing symbols do not display call chain"),
-       OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
-                   "Look for files with symbols relative to this directory"),
+       OPT_CALLBACK(0, "symfs", NULL, "directory",
+                    "Look for files with symbols relative to this directory",
+                    symbol__config_symfs),
        OPT_CALLBACK('F', "fields", NULL, "str",
                     "comma separated output fields prepend with 'type:'. "
                     "Valid types: hw,sw,trace,raw. "
@@ -2067,8 +2068,6 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
                NULL
        };
 
-       scripting_max_stack = sysctl_perf_event_max_stack;
-
        setup_scripting();
 
        argc = parse_options_subcommand(argc, argv, options, script_subcommands, script_usage,
index e459b68..ee7ada7 100644 (file)
@@ -66,6 +66,7 @@
 #include <stdlib.h>
 #include <sys/prctl.h>
 #include <locale.h>
+#include <math.h>
 
 #define DEFAULT_SEPARATOR      " "
 #define CNTR_NOT_SUPPORTED     "<not supported>"
@@ -991,12 +992,12 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
        const char *fmt;
 
        if (csv_output) {
-               fmt = sc != 1.0 ?  "%.2f%s" : "%.0f%s";
+               fmt = floor(sc) != sc ?  "%.2f%s" : "%.0f%s";
        } else {
                if (big_num)
-                       fmt = sc != 1.0 ? "%'18.2f%s" : "%'18.0f%s";
+                       fmt = floor(sc) != sc ? "%'18.2f%s" : "%'18.0f%s";
                else
-                       fmt = sc != 1.0 ? "%18.2f%s" : "%18.0f%s";
+                       fmt = floor(sc) != sc ? "%18.2f%s" : "%18.0f%s";
        }
 
        aggr_printout(evsel, id, nr);
@@ -1909,6 +1910,9 @@ static int add_default_attributes(void)
        }
 
        if (!evsel_list->nr_entries) {
+               if (target__has_cpu(&target))
+                       default_attrs0[0].config = PERF_COUNT_SW_CPU_CLOCK;
+
                if (perf_evlist__add_default_attrs(evsel_list, default_attrs0) < 0)
                        return -1;
                if (pmu_have_event("cpu", "stalled-cycles-frontend")) {
@@ -2000,7 +2004,7 @@ static int process_stat_round_event(struct perf_tool *tool __maybe_unused,
                                    union perf_event *event,
                                    struct perf_session *session)
 {
-       struct stat_round_event *round = &event->stat_round;
+       struct stat_round_event *stat_round = &event->stat_round;
        struct perf_evsel *counter;
        struct timespec tsh, *ts = NULL;
        const char **argv = session->header.env.cmdline_argv;
@@ -2009,12 +2013,12 @@ static int process_stat_round_event(struct perf_tool *tool __maybe_unused,
        evlist__for_each(evsel_list, counter)
                perf_stat_process_counter(&stat_config, counter);
 
-       if (round->type == PERF_STAT_ROUND_TYPE__FINAL)
-               update_stats(&walltime_nsecs_stats, round->time);
+       if (stat_round->type == PERF_STAT_ROUND_TYPE__FINAL)
+               update_stats(&walltime_nsecs_stats, stat_round->time);
 
-       if (stat_config.interval && round->time) {
-               tsh.tv_sec  = round->time / NSECS_PER_SEC;
-               tsh.tv_nsec = round->time % NSECS_PER_SEC;
+       if (stat_config.interval && stat_round->time) {
+               tsh.tv_sec  = stat_round->time / NSECS_PER_SEC;
+               tsh.tv_nsec = stat_round->time % NSECS_PER_SEC;
                ts = &tsh;
        }
 
index 40cc9bb..733a554 100644 (file)
@@ -1945,8 +1945,9 @@ int cmd_timechart(int argc, const char **argv,
        OPT_CALLBACK('p', "process", NULL, "process",
                      "process selector. Pass a pid or process name.",
                       parse_process),
-       OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
-                   "Look for files with symbols relative to this directory"),
+       OPT_CALLBACK(0, "symfs", NULL, "directory",
+                    "Look for files with symbols relative to this directory",
+                    symbol__config_symfs),
        OPT_INTEGER('n', "proc-num", &tchart.proc_num,
                    "min. number of tasks to print"),
        OPT_BOOLEAN('t', "topology", &tchart.topology,
index 1793da5..2a6cc25 100644 (file)
@@ -732,7 +732,7 @@ static void perf_event__process_sample(struct perf_tool *tool,
        if (machine__resolve(machine, &al, sample) < 0)
                return;
 
-       if (!top->kptr_restrict_warned &&
+       if (!machine->kptr_restrict_warned &&
            symbol_conf.kptr_restrict &&
            al.cpumode == PERF_RECORD_MISC_KERNEL) {
                ui__warning(
@@ -743,7 +743,7 @@ static void perf_event__process_sample(struct perf_tool *tool,
                          " modules" : "");
                if (use_browser <= 0)
                        sleep(5);
-               top->kptr_restrict_warned = true;
+               machine->kptr_restrict_warned = true;
        }
 
        if (al.sym == NULL) {
@@ -759,7 +759,7 @@ static void perf_event__process_sample(struct perf_tool *tool,
                 * --hide-kernel-symbols, even if the user specifies an
                 * invalid --vmlinux ;-)
                 */
-               if (!top->kptr_restrict_warned && !top->vmlinux_warned &&
+               if (!machine->kptr_restrict_warned && !top->vmlinux_warned &&
                    al.map == machine->vmlinux_maps[MAP__FUNCTION] &&
                    RB_EMPTY_ROOT(&al.map->dso->symbols[MAP__FUNCTION])) {
                        if (symbol_conf.vmlinux_name) {
index 6e5c325..5c50fe7 100644 (file)
@@ -576,84 +576,54 @@ static struct syscall_fmt {
        bool       hexret;
 } syscall_fmts[] = {
        { .name     = "access",     .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */
-                            [1] = SCA_ACCMODE,  /* mode */ }, },
+         .arg_scnprintf = { [1] = SCA_ACCMODE,  /* mode */ }, },
        { .name     = "arch_prctl", .errmsg = true, .alias = "prctl", },
        { .name     = "bpf",        .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
        { .name     = "brk",        .hexret = true,
          .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
-       { .name     = "chdir",      .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
-       { .name     = "chmod",      .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
-       { .name     = "chroot",     .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
+       { .name     = "chdir",      .errmsg = true, },
+       { .name     = "chmod",      .errmsg = true, },
+       { .name     = "chroot",     .errmsg = true, },
        { .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
        { .name     = "clone",      .errpid = true, },
        { .name     = "close",      .errmsg = true,
          .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
        { .name     = "connect",    .errmsg = true, },
-       { .name     = "creat",      .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
-       { .name     = "dup",        .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
-       { .name     = "dup2",       .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
-       { .name     = "dup3",       .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+       { .name     = "creat",      .errmsg = true, },
+       { .name     = "dup",        .errmsg = true, },
+       { .name     = "dup2",       .errmsg = true, },
+       { .name     = "dup3",       .errmsg = true, },
        { .name     = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
        { .name     = "eventfd2",   .errmsg = true,
          .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
-       { .name     = "faccessat",  .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
-                            [1] = SCA_FILENAME, /* filename */ }, },
-       { .name     = "fadvise64",  .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
-       { .name     = "fallocate",  .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
-       { .name     = "fchdir",     .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
-       { .name     = "fchmod",     .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+       { .name     = "faccessat",  .errmsg = true, },
+       { .name     = "fadvise64",  .errmsg = true, },
+       { .name     = "fallocate",  .errmsg = true, },
+       { .name     = "fchdir",     .errmsg = true, },
+       { .name     = "fchmod",     .errmsg = true, },
        { .name     = "fchmodat",   .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
-                            [1] = SCA_FILENAME, /* filename */ }, },
-       { .name     = "fchown",     .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+         .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
+       { .name     = "fchown",     .errmsg = true, },
        { .name     = "fchownat",   .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
-                            [1] = SCA_FILENAME, /* filename */ }, },
+         .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
        { .name     = "fcntl",      .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */
-                            [1] = SCA_STRARRAY, /* cmd */ },
+         .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
          .arg_parm      = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
-       { .name     = "fdatasync",  .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+       { .name     = "fdatasync",  .errmsg = true, },
        { .name     = "flock",      .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */
-                            [1] = SCA_FLOCK, /* cmd */ }, },
-       { .name     = "fsetxattr",  .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
-       { .name     = "fstat",      .errmsg = true, .alias = "newfstat",
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
-       { .name     = "fstatat",    .errmsg = true, .alias = "newfstatat",
-         .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
-                            [1] = SCA_FILENAME, /* filename */ }, },
-       { .name     = "fstatfs",    .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
-       { .name     = "fsync",    .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
-       { .name     = "ftruncate", .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+         .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
+       { .name     = "fsetxattr",  .errmsg = true, },
+       { .name     = "fstat",      .errmsg = true, .alias = "newfstat", },
+       { .name     = "fstatat",    .errmsg = true, .alias = "newfstatat", },
+       { .name     = "fstatfs",    .errmsg = true, },
+       { .name     = "fsync",    .errmsg = true, },
+       { .name     = "ftruncate", .errmsg = true, },
        { .name     = "futex",      .errmsg = true,
          .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
        { .name     = "futimesat", .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
-                            [1] = SCA_FILENAME, /* filename */ }, },
-       { .name     = "getdents",   .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
-       { .name     = "getdents64", .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+         .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
+       { .name     = "getdents",   .errmsg = true, },
+       { .name     = "getdents64", .errmsg = true, },
        { .name     = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
        { .name     = "getpid",     .errpid = true, },
        { .name     = "getpgid",    .errpid = true, },
@@ -661,12 +631,10 @@ static struct syscall_fmt {
        { .name     = "getrandom",  .errmsg = true,
          .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
        { .name     = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
-       { .name     = "getxattr",    .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
-       { .name     = "inotify_add_watch",          .errmsg = true,
-         .arg_scnprintf = { [1] = SCA_FILENAME, /* pathname */ }, },
+       { .name     = "getxattr",   .errmsg = true, },
+       { .name     = "inotify_add_watch",          .errmsg = true, },
        { .name     = "ioctl",      .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */
+         .arg_scnprintf = {
 #if defined(__i386__) || defined(__x86_64__)
 /*
  * FIXME: Make this available to all arches.
@@ -680,41 +648,28 @@ static struct syscall_fmt {
        { .name     = "keyctl",     .errmsg = true, STRARRAY(0, option, keyctl_options), },
        { .name     = "kill",       .errmsg = true,
          .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
-       { .name     = "lchown",    .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
-       { .name     = "lgetxattr",  .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
+       { .name     = "lchown",    .errmsg = true, },
+       { .name     = "lgetxattr",  .errmsg = true, },
        { .name     = "linkat",     .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
-       { .name     = "listxattr",  .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
-       { .name     = "llistxattr", .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
-       { .name     = "lremovexattr",  .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
+       { .name     = "listxattr",  .errmsg = true, },
+       { .name     = "llistxattr", .errmsg = true, },
+       { .name     = "lremovexattr",  .errmsg = true, },
        { .name     = "lseek",      .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */
-                            [2] = SCA_STRARRAY, /* whence */ },
+         .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
          .arg_parm      = { [2] = &strarray__whences, /* whence */ }, },
-       { .name     = "lsetxattr",  .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
-       { .name     = "lstat",      .errmsg = true, .alias = "newlstat",
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
-       { .name     = "lsxattr",    .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
+       { .name     = "lsetxattr",  .errmsg = true, },
+       { .name     = "lstat",      .errmsg = true, .alias = "newlstat", },
+       { .name     = "lsxattr",    .errmsg = true, },
        { .name     = "madvise",    .errmsg = true,
          .arg_scnprintf = { [0] = SCA_HEX,      /* start */
                             [2] = SCA_MADV_BHV, /* behavior */ }, },
-       { .name     = "mkdir",    .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
+       { .name     = "mkdir",    .errmsg = true, },
        { .name     = "mkdirat",    .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
-                            [1] = SCA_FILENAME, /* pathname */ }, },
-       { .name     = "mknod",      .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
+         .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
+       { .name     = "mknod",      .errmsg = true, },
        { .name     = "mknodat",    .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
-                            [1] = SCA_FILENAME, /* filename */ }, },
+         .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
        { .name     = "mlock",      .errmsg = true,
          .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
        { .name     = "mlockall",   .errmsg = true,
@@ -722,8 +677,7 @@ static struct syscall_fmt {
        { .name     = "mmap",       .hexret = true,
          .arg_scnprintf = { [0] = SCA_HEX,       /* addr */
                             [2] = SCA_MMAP_PROT, /* prot */
-                            [3] = SCA_MMAP_FLAGS, /* flags */
-                            [4] = SCA_FD,        /* fd */ }, },
+                            [3] = SCA_MMAP_FLAGS, /* flags */ }, },
        { .name     = "mprotect",   .errmsg = true,
          .arg_scnprintf = { [0] = SCA_HEX, /* start */
                             [2] = SCA_MMAP_PROT, /* prot */ }, },
@@ -740,17 +694,14 @@ static struct syscall_fmt {
        { .name     = "name_to_handle_at", .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
        { .name     = "newfstatat", .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
-                            [1] = SCA_FILENAME, /* filename */ }, },
+         .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
        { .name     = "open",       .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME,   /* filename */
-                            [1] = SCA_OPEN_FLAGS, /* flags */ }, },
+         .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
        { .name     = "open_by_handle_at", .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
                             [2] = SCA_OPEN_FLAGS, /* flags */ }, },
        { .name     = "openat",     .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
-                            [1] = SCA_FILENAME, /* filename */
                             [2] = SCA_OPEN_FLAGS, /* flags */ }, },
        { .name     = "perf_event_open", .errmsg = true,
          .arg_scnprintf = { [2] = SCA_INT, /* cpu */
@@ -760,39 +711,26 @@ static struct syscall_fmt {
          .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
        { .name     = "poll",       .errmsg = true, .timeout = true, },
        { .name     = "ppoll",      .errmsg = true, .timeout = true, },
-       { .name     = "pread",      .errmsg = true, .alias = "pread64",
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
-       { .name     = "preadv",     .errmsg = true, .alias = "pread",
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+       { .name     = "pread",      .errmsg = true, .alias = "pread64", },
+       { .name     = "preadv",     .errmsg = true, .alias = "pread", },
        { .name     = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
-       { .name     = "pwrite",     .errmsg = true, .alias = "pwrite64",
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
-       { .name     = "pwritev",    .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
-       { .name     = "read",       .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
-       { .name     = "readlink",   .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* path */ }, },
+       { .name     = "pwrite",     .errmsg = true, .alias = "pwrite64", },
+       { .name     = "pwritev",    .errmsg = true, },
+       { .name     = "read",       .errmsg = true, },
+       { .name     = "readlink",   .errmsg = true, },
        { .name     = "readlinkat", .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
-                            [1] = SCA_FILENAME, /* pathname */ }, },
-       { .name     = "readv",      .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+         .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
+       { .name     = "readv",      .errmsg = true, },
        { .name     = "recvfrom",   .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */
-                            [3] = SCA_MSG_FLAGS, /* flags */ }, },
+         .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
        { .name     = "recvmmsg",   .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */
-                            [3] = SCA_MSG_FLAGS, /* flags */ }, },
+         .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
        { .name     = "recvmsg",    .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */
-                            [2] = SCA_MSG_FLAGS, /* flags */ }, },
-       { .name     = "removexattr", .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
+         .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
+       { .name     = "removexattr", .errmsg = true, },
        { .name     = "renameat",   .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
-       { .name     = "rmdir",    .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
+       { .name     = "rmdir",    .errmsg = true, },
        { .name     = "rt_sigaction", .errmsg = true,
          .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
        { .name     = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
@@ -807,22 +745,17 @@ static struct syscall_fmt {
                             [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
        { .name     = "select",     .errmsg = true, .timeout = true, },
        { .name     = "sendmmsg",    .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */
-                            [3] = SCA_MSG_FLAGS, /* flags */ }, },
+         .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
        { .name     = "sendmsg",    .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */
-                            [2] = SCA_MSG_FLAGS, /* flags */ }, },
+         .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
        { .name     = "sendto",     .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */
-                            [3] = SCA_MSG_FLAGS, /* flags */ }, },
+         .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
        { .name     = "set_tid_address", .errpid = true, },
        { .name     = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
        { .name     = "setpgid",    .errmsg = true, },
        { .name     = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
-       { .name     = "setxattr",   .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
-       { .name     = "shutdown",   .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+       { .name     = "setxattr",   .errmsg = true, },
+       { .name     = "shutdown",   .errmsg = true, },
        { .name     = "socket",     .errmsg = true,
          .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
                             [1] = SCA_SK_TYPE, /* type */ },
@@ -831,10 +764,8 @@ static struct syscall_fmt {
          .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
                             [1] = SCA_SK_TYPE, /* type */ },
          .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
-       { .name     = "stat",       .errmsg = true, .alias = "newstat",
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
-       { .name     = "statfs",     .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
+       { .name     = "stat",       .errmsg = true, .alias = "newstat", },
+       { .name     = "statfs",     .errmsg = true, },
        { .name     = "swapoff",    .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
        { .name     = "swapon",     .errmsg = true,
@@ -845,29 +776,21 @@ static struct syscall_fmt {
          .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
        { .name     = "tkill",      .errmsg = true,
          .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
-       { .name     = "truncate",   .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* path */ }, },
+       { .name     = "truncate",   .errmsg = true, },
        { .name     = "uname",      .errmsg = true, .alias = "newuname", },
        { .name     = "unlinkat",   .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
-                            [1] = SCA_FILENAME, /* pathname */ }, },
-       { .name     = "utime",  .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
+         .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
+       { .name     = "utime",  .errmsg = true, },
        { .name     = "utimensat",  .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */
-                            [1] = SCA_FILENAME, /* filename */ }, },
-       { .name     = "utimes",  .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
-       { .name     = "vmsplice",  .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+         .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
+       { .name     = "utimes",  .errmsg = true, },
+       { .name     = "vmsplice",  .errmsg = true, },
        { .name     = "wait4",      .errpid = true,
          .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
        { .name     = "waitid",     .errpid = true,
          .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
-       { .name     = "write",      .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
-       { .name     = "writev",     .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+       { .name     = "write",      .errmsg = true, },
+       { .name     = "writev",     .errmsg = true, },
 };
 
 static int syscall_fmt__cmp(const void *name, const void *fmtp)
@@ -1160,6 +1083,24 @@ static int trace__tool_process(struct perf_tool *tool,
        return trace__process_event(trace, machine, event, sample);
 }
 
+static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
+{
+       struct machine *machine = vmachine;
+
+       if (machine->kptr_restrict_warned)
+               return NULL;
+
+       if (symbol_conf.kptr_restrict) {
+               pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
+                          "Check /proc/sys/kernel/kptr_restrict.\n\n"
+                          "Kernel samples will not be resolved.\n");
+               machine->kptr_restrict_warned = true;
+               return NULL;
+       }
+
+       return machine__resolve_kernel_addr(vmachine, addrp, modp);
+}
+
 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
 {
        int err = symbol__init(NULL);
@@ -1171,7 +1112,7 @@ static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
        if (trace->host == NULL)
                return -ENOMEM;
 
-       if (trace_event__register_resolver(trace->host, machine__resolve_kernel_addr) < 0)
+       if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
                return -errno;
 
        err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
@@ -1186,7 +1127,7 @@ static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
 static int syscall__set_arg_fmts(struct syscall *sc)
 {
        struct format_field *field;
-       int idx = 0;
+       int idx = 0, len;
 
        sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
        if (sc->arg_scnprintf == NULL)
@@ -1198,12 +1139,31 @@ static int syscall__set_arg_fmts(struct syscall *sc)
        for (field = sc->args; field; field = field->next) {
                if (sc->fmt && sc->fmt->arg_scnprintf[idx])
                        sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
+               else if (strcmp(field->type, "const char *") == 0 &&
+                        (strcmp(field->name, "filename") == 0 ||
+                         strcmp(field->name, "path") == 0 ||
+                         strcmp(field->name, "pathname") == 0))
+                       sc->arg_scnprintf[idx] = SCA_FILENAME;
                else if (field->flags & FIELD_IS_POINTER)
                        sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
                else if (strcmp(field->type, "pid_t") == 0)
                        sc->arg_scnprintf[idx] = SCA_PID;
                else if (strcmp(field->type, "umode_t") == 0)
                        sc->arg_scnprintf[idx] = SCA_MODE_T;
+               else if ((strcmp(field->type, "int") == 0 ||
+                         strcmp(field->type, "unsigned int") == 0 ||
+                         strcmp(field->type, "long") == 0) &&
+                        (len = strlen(field->name)) >= 2 &&
+                        strcmp(field->name + len - 2, "fd") == 0) {
+                       /*
+                        * /sys/kernel/tracing/events/syscalls/sys_enter*
+                        * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
+                        * 65 int
+                        * 23 unsigned int
+                        * 7 unsigned long
+                        */
+                       sc->arg_scnprintf[idx] = SCA_FD;
+               }
                ++idx;
        }
 
@@ -1534,7 +1494,7 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
        if (sc->is_exit) {
                if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
                        trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
-                       fprintf(trace->output, "%-70s\n", ttrace->entry_str);
+                       fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
                }
        } else {
                ttrace->entry_pending = true;
@@ -2887,12 +2847,12 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
                mmap_pages_user_set = false;
 
        if (trace.max_stack == UINT_MAX) {
-               trace.max_stack = sysctl_perf_event_max_stack;
+               trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
                max_stack_user_set = false;
        }
 
 #ifdef HAVE_DWARF_UNWIND_SUPPORT
-       if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled)
+       if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
                record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
 #endif
 
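What makes the large syscall_fmts[] trim above safe is the new fallback logic in syscall__set_arg_fmts(): a "const char *" field named filename, path or pathname is formatted as SCA_FILENAME, and an int/unsigned int/long field whose name ends in "fd" is formatted as SCA_FD, so the per-syscall entries no longer need to spell those out. A minimal standalone restatement of that heuristic follows; the helper names are invented here for illustration and are not perf's.

#include <stdio.h>
#include <stdbool.h>
#include <string.h>

/* Invented helpers restating the heuristic; perf keeps this logic
 * inline in syscall__set_arg_fmts(). */
static bool field_is_filename(const char *type, const char *name)
{
        return strcmp(type, "const char *") == 0 &&
               (strcmp(name, "filename") == 0 ||
                strcmp(name, "path") == 0 ||
                strcmp(name, "pathname") == 0);
}

static bool field_is_fd(const char *type, const char *name)
{
        size_t len = strlen(name);

        if (strcmp(type, "int") && strcmp(type, "unsigned int") &&
            strcmp(type, "long"))
                return false;
        return len >= 2 && strcmp(name + len - 2, "fd") == 0;
}

int main(void)
{
        printf("%d %d %d\n",
               field_is_filename("const char *", "pathname"),   /* 1 */
               field_is_fd("unsigned int", "fd"),               /* 1 */
               field_is_fd("int", "flags"));                    /* 0 */
        return 0;
}
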
index 7970008..15982ce 100644 (file)
@@ -549,6 +549,9 @@ int main(int argc, const char **argv)
        if (sysctl__read_int("kernel/perf_event_max_stack", &value) == 0)
                sysctl_perf_event_max_stack = value;
 
+       if (sysctl__read_int("kernel/perf_event_max_contexts_per_stack", &value) == 0)
+               sysctl_perf_event_max_contexts_per_stack = value;
+
        cmd = extract_argv0_path(argv[0]);
        if (!cmd)
                cmd = "perf-help";
index 4db73d5..7e5a1e8 100644 (file)
@@ -354,9 +354,6 @@ static struct ins_ops nop_ops = {
        .scnprintf = nop__scnprintf,
 };
 
-/*
- * Must be sorted by name!
- */
 static struct ins instructions[] = {
        { .name = "add",   .ops  = &mov_ops, },
        { .name = "addl",  .ops  = &mov_ops, },
@@ -372,8 +369,8 @@ static struct ins instructions[] = {
        { .name = "bgt",   .ops  = &jump_ops, },
        { .name = "bhi",   .ops  = &jump_ops, },
        { .name = "bl",    .ops  = &call_ops, },
-       { .name = "blt",   .ops  = &jump_ops, },
        { .name = "bls",   .ops  = &jump_ops, },
+       { .name = "blt",   .ops  = &jump_ops, },
        { .name = "blx",   .ops  = &call_ops, },
        { .name = "bne",   .ops  = &jump_ops, },
 #endif
@@ -449,18 +446,39 @@ static struct ins instructions[] = {
        { .name = "xbeginq", .ops  = &jump_ops, },
 };
 
-static int ins__cmp(const void *name, const void *insp)
+static int ins__key_cmp(const void *name, const void *insp)
 {
        const struct ins *ins = insp;
 
        return strcmp(name, ins->name);
 }
 
+static int ins__cmp(const void *a, const void *b)
+{
+       const struct ins *ia = a;
+       const struct ins *ib = b;
+
+       return strcmp(ia->name, ib->name);
+}
+
+static void ins__sort(void)
+{
+       const int nmemb = ARRAY_SIZE(instructions);
+
+       qsort(instructions, nmemb, sizeof(struct ins), ins__cmp);
+}
+
 static struct ins *ins__find(const char *name)
 {
        const int nmemb = ARRAY_SIZE(instructions);
+       static bool sorted;
+
+       if (!sorted) {
+               ins__sort();
+               sorted = true;
+       }
 
-       return bsearch(name, instructions, nmemb, sizeof(struct ins), ins__cmp);
+       return bsearch(name, instructions, nmemb, sizeof(struct ins), ins__key_cmp);
 }
 
 int symbol__annotate_init(struct map *map __maybe_unused, struct symbol *sym)
@@ -1122,7 +1140,7 @@ int symbol__annotate(struct symbol *sym, struct map *map, size_t privsize)
        } else if (dso__is_kcore(dso)) {
                goto fallback;
        } else if (readlink(symfs_filename, command, sizeof(command)) < 0 ||
-                  strstr(command, "[kernel.kallsyms]") ||
+                  strstr(command, DSO__NAME_KALLSYMS) ||
                   access(symfs_filename, R_OK)) {
                free(filename);
 fallback:
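
The annotate change removes the "must be sorted by name" maintenance burden: instructions[] is now sorted once, lazily, on the first lookup, and bsearch() keeps working because the key comparator (ins__key_cmp) compares a bare name against an entry while the sort comparator (ins__cmp) compares two entries. The same pattern in a tiny standalone form, with invented names:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>

struct op { const char *name; };

static struct op ops[] = {
        { "sub" }, { "add" }, { "mov" }, { "jmp" },
};

/* element vs element, for qsort() */
static int op__cmp(const void *a, const void *b)
{
        return strcmp(((const struct op *)a)->name,
                      ((const struct op *)b)->name);
}

/* key vs element, for bsearch() */
static int op__key_cmp(const void *name, const void *entry)
{
        return strcmp(name, ((const struct op *)entry)->name);
}

static struct op *op__find(const char *name)
{
        const size_t nmemb = sizeof(ops) / sizeof(ops[0]);
        static bool sorted;

        if (!sorted) {
                qsort(ops, nmemb, sizeof(ops[0]), op__cmp);
                sorted = true;
        }
        return bsearch(name, ops, nmemb, sizeof(ops[0]), op__key_cmp);
}

int main(void)
{
        printf("%s\n", op__find("mov") ? "found mov" : "mov missing");
        return 0;
}
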
index bff425e..67e5966 100644 (file)
@@ -256,7 +256,7 @@ static int machine__write_buildid_table(struct machine *machine, int fd)
                size_t name_len;
                bool in_kernel = false;
 
-               if (!pos->hit)
+               if (!pos->hit && !dso__is_vdso(pos))
                        continue;
 
                if (dso__is_vdso(pos)) {
index 8d96c80..c9a6dc1 100644 (file)
@@ -298,8 +298,7 @@ static struct call_path *call_path_from_sample(struct db_export *dbe,
         */
        callchain_param.order = ORDER_CALLER;
        err = thread__resolve_callchain(thread, &callchain_cursor, evsel,
-                                       sample, NULL, NULL,
-                                       sysctl_perf_event_max_stack);
+                                       sample, NULL, NULL, PERF_MAX_STACK_DEPTH);
        if (err) {
                callchain_param.order = saved_order;
                return NULL;
index 3357479..5d286f5 100644 (file)
@@ -7,6 +7,7 @@
 #include "auxtrace.h"
 #include "util.h"
 #include "debug.h"
+#include "vdso.h"
 
 char dso__symtab_origin(const struct dso *dso)
 {
@@ -62,9 +63,7 @@ int dso__read_binary_type_filename(const struct dso *dso,
                }
                break;
        case DSO_BINARY_TYPE__BUILD_ID_CACHE:
-               /* skip the locally configured cache if a symfs is given */
-               if (symbol_conf.symfs[0] ||
-                   (dso__build_id_filename(dso, filename, size) == NULL))
+               if (dso__build_id_filename(dso, filename, size) == NULL)
                        ret = -1;
                break;
 
@@ -1169,7 +1168,7 @@ bool __dsos__read_build_ids(struct list_head *head, bool with_hits)
        struct dso *pos;
 
        list_for_each_entry(pos, head, node) {
-               if (with_hits && !pos->hit)
+               if (with_hits && !pos->hit && !dso__is_vdso(pos))
                        continue;
                if (pos->has_build_id) {
                        have_build_id = true;
index c4bfe11..e82ba90 100644 (file)
@@ -44,6 +44,7 @@ void perf_evlist__init(struct perf_evlist *evlist, struct cpu_map *cpus,
        perf_evlist__set_maps(evlist, cpus, threads);
        fdarray__init(&evlist->pollfd, 64);
        evlist->workload.pid = -1;
+       evlist->backward = false;
 }
 
 struct perf_evlist *perf_evlist__new(void)
@@ -679,6 +680,33 @@ static struct perf_evsel *perf_evlist__event2evsel(struct perf_evlist *evlist,
        return NULL;
 }
 
+static int perf_evlist__set_paused(struct perf_evlist *evlist, bool value)
+{
+       int i;
+
+       for (i = 0; i < evlist->nr_mmaps; i++) {
+               int fd = evlist->mmap[i].fd;
+               int err;
+
+               if (fd < 0)
+                       continue;
+               err = ioctl(fd, PERF_EVENT_IOC_PAUSE_OUTPUT, value ? 1 : 0);
+               if (err)
+                       return err;
+       }
+       return 0;
+}
+
+int perf_evlist__pause(struct perf_evlist *evlist)
+{
+       return perf_evlist__set_paused(evlist, true);
+}
+
+int perf_evlist__resume(struct perf_evlist *evlist)
+{
+       return perf_evlist__set_paused(evlist, false);
+}
+
 /* When check_messup is true, 'end' must points to a good entry */
 static union perf_event *
 perf_mmap__read(struct perf_mmap *md, bool check_messup, u64 start,
@@ -881,6 +909,7 @@ static void __perf_evlist__munmap(struct perf_evlist *evlist, int idx)
        if (evlist->mmap[idx].base != NULL) {
                munmap(evlist->mmap[idx].base, evlist->mmap_len);
                evlist->mmap[idx].base = NULL;
+               evlist->mmap[idx].fd = -1;
                atomic_set(&evlist->mmap[idx].refcnt, 0);
        }
        auxtrace_mmap__munmap(&evlist->mmap[idx].auxtrace_mmap);
@@ -901,10 +930,14 @@ void perf_evlist__munmap(struct perf_evlist *evlist)
 
 static int perf_evlist__alloc_mmap(struct perf_evlist *evlist)
 {
+       int i;
+
        evlist->nr_mmaps = cpu_map__nr(evlist->cpus);
        if (cpu_map__empty(evlist->cpus))
                evlist->nr_mmaps = thread_map__nr(evlist->threads);
        evlist->mmap = zalloc(evlist->nr_mmaps * sizeof(struct perf_mmap));
+       for (i = 0; i < evlist->nr_mmaps; i++)
+               evlist->mmap[i].fd = -1;
        return evlist->mmap != NULL ? 0 : -ENOMEM;
 }
 
@@ -941,6 +974,7 @@ static int __perf_evlist__mmap(struct perf_evlist *evlist, int idx,
                evlist->mmap[idx].base = NULL;
                return -1;
        }
+       evlist->mmap[idx].fd = fd;
 
        if (auxtrace_mmap__mmap(&evlist->mmap[idx].auxtrace_mmap,
                                &mp->auxtrace_mp, evlist->mmap[idx].base, fd))
index 85d1b59..d740fb8 100644 (file)
@@ -28,6 +28,7 @@ struct record_opts;
 struct perf_mmap {
        void             *base;
        int              mask;
+       int              fd;
        atomic_t         refcnt;
        u64              prev;
        struct auxtrace_mmap auxtrace_mmap;
@@ -43,6 +44,7 @@ struct perf_evlist {
        bool             overwrite;
        bool             enabled;
        bool             has_user_cpus;
+       bool             backward;
        size_t           mmap_len;
        int              id_pos;
        int              is_pos;
@@ -135,6 +137,8 @@ void perf_evlist__mmap_read_catchup(struct perf_evlist *evlist, int idx);
 
 void perf_evlist__mmap_consume(struct perf_evlist *evlist, int idx);
 
+int perf_evlist__pause(struct perf_evlist *evlist);
+int perf_evlist__resume(struct perf_evlist *evlist);
 int perf_evlist__open(struct perf_evlist *evlist);
 void perf_evlist__close(struct perf_evlist *evlist);
 
index 52c7d88..5d7037e 100644 (file)
@@ -37,6 +37,7 @@ static struct {
        bool clockid;
        bool clockid_wrong;
        bool lbr_flags;
+       bool write_backward;
 } perf_missing_features;
 
 static clockid_t clockid;
@@ -1376,6 +1377,8 @@ fallback_missing_features:
        if (perf_missing_features.lbr_flags)
                evsel->attr.branch_sample_type &= ~(PERF_SAMPLE_BRANCH_NO_FLAGS |
                                     PERF_SAMPLE_BRANCH_NO_CYCLES);
+       if (perf_missing_features.write_backward)
+               evsel->attr.write_backward = false;
 retry_sample_id:
        if (perf_missing_features.sample_id_all)
                evsel->attr.sample_id_all = 0;
@@ -1438,6 +1441,12 @@ retry_open:
                                err = -EINVAL;
                                goto out_close;
                        }
+
+                       if (evsel->overwrite &&
+                           perf_missing_features.write_backward) {
+                               err = -EINVAL;
+                               goto out_close;
+                       }
                }
        }
 
@@ -1500,6 +1509,10 @@ try_fallback:
                          PERF_SAMPLE_BRANCH_NO_FLAGS))) {
                perf_missing_features.lbr_flags = true;
                goto fallback_missing_features;
+       } else if (!perf_missing_features.write_backward &&
+                       evsel->attr.write_backward) {
+               perf_missing_features.write_backward = true;
+               goto fallback_missing_features;
        }
 
 out_close:
index 8a644fe..c1f1015 100644 (file)
@@ -112,6 +112,7 @@ struct perf_evsel {
        bool                    tracking;
        bool                    per_pkg;
        bool                    precise_max;
+       bool                    overwrite;
        /* parse modifier helper */
        int                     exclude_GH;
        int                     nr_members;
index cfab531..d1f19e0 100644 (file)
@@ -117,6 +117,13 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
                        hists__new_col_len(hists, HISTC_SYMBOL_TO, symlen);
                        hists__set_unres_dso_col_len(hists, HISTC_DSO_TO);
                }
+
+               if (h->branch_info->srcline_from)
+                       hists__new_col_len(hists, HISTC_SRCLINE_FROM,
+                                       strlen(h->branch_info->srcline_from));
+               if (h->branch_info->srcline_to)
+                       hists__new_col_len(hists, HISTC_SRCLINE_TO,
+                                       strlen(h->branch_info->srcline_to));
        }
 
        if (h->mem_info) {
@@ -1042,6 +1049,8 @@ void hist_entry__delete(struct hist_entry *he)
        if (he->branch_info) {
                map__zput(he->branch_info->from.map);
                map__zput(he->branch_info->to.map);
+               free_srcline(he->branch_info->srcline_from);
+               free_srcline(he->branch_info->srcline_to);
                zfree(&he->branch_info);
        }
 
index 0f84bfb..7b54ccf 100644 (file)
@@ -52,6 +52,8 @@ enum hist_column {
        HISTC_MEM_IADDR_SYMBOL,
        HISTC_TRANSACTION,
        HISTC_CYCLES,
+       HISTC_SRCLINE_FROM,
+       HISTC_SRCLINE_TO,
        HISTC_TRACE,
        HISTC_NR_COLS, /* Last entry */
 };
index f9644f7..b177218 100644 (file)
@@ -43,6 +43,7 @@ int machine__init(struct machine *machine, const char *root_dir, pid_t pid)
 
        machine->symbol_filter = NULL;
        machine->id_hdr_size = 0;
+       machine->kptr_restrict_warned = false;
        machine->comm_exec = false;
        machine->kernel_start = 0;
 
@@ -709,7 +710,7 @@ static struct dso *machine__get_kernel(struct machine *machine)
        if (machine__is_host(machine)) {
                vmlinux_name = symbol_conf.vmlinux_name;
                if (!vmlinux_name)
-                       vmlinux_name = "[kernel.kallsyms]";
+                       vmlinux_name = DSO__NAME_KALLSYMS;
 
                kernel = machine__findnew_kernel(machine, vmlinux_name,
                                                 "[kernel]", DSO_TYPE_KERNEL);
@@ -1135,10 +1136,10 @@ int machine__create_kernel_maps(struct machine *machine)
 {
        struct dso *kernel = machine__get_kernel(machine);
        const char *name;
-       u64 addr = machine__get_running_kernel_start(machine, &name);
+       u64 addr;
        int ret;
 
-       if (!addr || kernel == NULL)
+       if (kernel == NULL)
                return -1;
 
        ret = __machine__create_kernel_maps(machine, kernel);
@@ -1160,8 +1161,9 @@ int machine__create_kernel_maps(struct machine *machine)
         */
        map_groups__fixup_end(&machine->kmaps);
 
-       if (maps__set_kallsyms_ref_reloc_sym(machine->vmlinux_maps, name,
-                                            addr)) {
+       addr = machine__get_running_kernel_start(machine, &name);
+       if (!addr) {
+       } else if (maps__set_kallsyms_ref_reloc_sym(machine->vmlinux_maps, name, addr)) {
                machine__destroy_kernel_maps(machine);
                return -1;
        }
@@ -1769,11 +1771,6 @@ static int resolve_lbr_callchain_sample(struct thread *thread,
                 */
                int mix_chain_nr = i + 1 + lbr_nr + 1;
 
-               if (mix_chain_nr > (int)sysctl_perf_event_max_stack + PERF_MAX_BRANCH_DEPTH) {
-                       pr_warning("corrupted callchain. skipping...\n");
-                       return 0;
-               }
-
                for (j = 0; j < mix_chain_nr; j++) {
                        if (callchain_param.order == ORDER_CALLEE) {
                                if (j < i + 1)
@@ -1811,9 +1808,9 @@ static int thread__resolve_callchain_sample(struct thread *thread,
 {
        struct branch_stack *branch = sample->branch_stack;
        struct ip_callchain *chain = sample->callchain;
-       int chain_nr = min(max_stack, (int)chain->nr);
+       int chain_nr = chain->nr;
        u8 cpumode = PERF_RECORD_MISC_USER;
-       int i, j, err;
+       int i, j, err, nr_entries;
        int skip_idx = -1;
        int first_call = 0;
 
@@ -1828,8 +1825,7 @@ static int thread__resolve_callchain_sample(struct thread *thread,
         * Based on DWARF debug information, some architectures skip
         * a callchain entry saved by the kernel.
         */
-       if (chain->nr < sysctl_perf_event_max_stack)
-               skip_idx = arch_skip_callchain_idx(thread, chain);
+       skip_idx = arch_skip_callchain_idx(thread, chain);
 
        /*
         * Add branches to call stack for easier browsing. This gives
@@ -1889,12 +1885,8 @@ static int thread__resolve_callchain_sample(struct thread *thread,
        }
 
 check_calls:
-       if (chain->nr > sysctl_perf_event_max_stack && (int)chain->nr > max_stack) {
-               pr_warning("corrupted callchain. skipping...\n");
-               return 0;
-       }
-
-       for (i = first_call; i < chain_nr; i++) {
+       for (i = first_call, nr_entries = 0;
+            i < chain_nr && nr_entries < max_stack; i++) {
                u64 ip;
 
                if (callchain_param.order == ORDER_CALLEE)
@@ -1908,6 +1900,9 @@ check_calls:
 #endif
                ip = chain->ips[j];
 
+               if (ip < PERF_CONTEXT_MAX)
+                       ++nr_entries;
+
                err = add_callchain_ip(thread, cursor, parent, root_al, &cpumode, ip);
 
                if (err)
index 83f4679..41ac9cf 100644 (file)
@@ -28,6 +28,7 @@ struct machine {
        pid_t             pid;
        u16               id_hdr_size;
        bool              comm_exec;
+       bool              kptr_restrict_warned;
        char              *root_dir;
        struct rb_root    threads;
        pthread_rwlock_t  threads_lock;
index 62c7f69..5d1eb1c 100644 (file)
@@ -264,8 +264,7 @@ static SV *perl_process_callchain(struct perf_sample *sample,
                goto exit;
 
        if (thread__resolve_callchain(al->thread, &callchain_cursor, evsel,
-                                     sample, NULL, NULL,
-                                     sysctl_perf_event_max_stack) != 0) {
+                                     sample, NULL, NULL, scripting_max_stack) != 0) {
                pr_err("Failed to resolve callchain. Skipping\n");
                goto exit;
        }
index 20e69ed..c4e9bd7 100644 (file)
@@ -353,6 +353,88 @@ struct sort_entry sort_srcline = {
        .se_width_idx   = HISTC_SRCLINE,
 };
 
+/* --sort srcline_from */
+
+static int64_t
+sort__srcline_from_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+       if (!left->branch_info->srcline_from) {
+               struct map *map = left->branch_info->from.map;
+               if (!map)
+                       left->branch_info->srcline_from = SRCLINE_UNKNOWN;
+               else
+                       left->branch_info->srcline_from = get_srcline(map->dso,
+                                          map__rip_2objdump(map,
+                                                            left->branch_info->from.al_addr),
+                                                        left->branch_info->from.sym, true);
+       }
+       if (!right->branch_info->srcline_from) {
+               struct map *map = right->branch_info->from.map;
+               if (!map)
+                       right->branch_info->srcline_from = SRCLINE_UNKNOWN;
+               else
+                       right->branch_info->srcline_from = get_srcline(map->dso,
+                                            map__rip_2objdump(map,
+                                                              right->branch_info->from.al_addr),
+                                                    right->branch_info->from.sym, true);
+       }
+       return strcmp(right->branch_info->srcline_from, left->branch_info->srcline_from);
+}
+
+static int hist_entry__srcline_from_snprintf(struct hist_entry *he, char *bf,
+                                       size_t size, unsigned int width)
+{
+       return repsep_snprintf(bf, size, "%-*.*s", width, width, he->branch_info->srcline_from);
+}
+
+struct sort_entry sort_srcline_from = {
+       .se_header      = "From Source:Line",
+       .se_cmp         = sort__srcline_from_cmp,
+       .se_snprintf    = hist_entry__srcline_from_snprintf,
+       .se_width_idx   = HISTC_SRCLINE_FROM,
+};
+
+/* --sort srcline_to */
+
+static int64_t
+sort__srcline_to_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+       if (!left->branch_info->srcline_to) {
+               struct map *map = left->branch_info->to.map;
+               if (!map)
+                       left->branch_info->srcline_to = SRCLINE_UNKNOWN;
+               else
+                       left->branch_info->srcline_to = get_srcline(map->dso,
+                                          map__rip_2objdump(map,
+                                                            left->branch_info->to.al_addr),
+                                                        left->branch_info->from.sym, true);
+       }
+       if (!right->branch_info->srcline_to) {
+               struct map *map = right->branch_info->to.map;
+               if (!map)
+                       right->branch_info->srcline_to = SRCLINE_UNKNOWN;
+               else
+                       right->branch_info->srcline_to = get_srcline(map->dso,
+                                            map__rip_2objdump(map,
+                                                              right->branch_info->to.al_addr),
+                                                    right->branch_info->to.sym, true);
+       }
+       return strcmp(right->branch_info->srcline_to, left->branch_info->srcline_to);
+}
+
+static int hist_entry__srcline_to_snprintf(struct hist_entry *he, char *bf,
+                                       size_t size, unsigned int width)
+{
+       return repsep_snprintf(bf, size, "%-*.*s", width, width, he->branch_info->srcline_to);
+}
+
+struct sort_entry sort_srcline_to = {
+       .se_header      = "To Source:Line",
+       .se_cmp         = sort__srcline_to_cmp,
+       .se_snprintf    = hist_entry__srcline_to_snprintf,
+       .se_width_idx   = HISTC_SRCLINE_TO,
+};
+
 /* --sort srcfile */
 
 static char no_srcfile[1];
@@ -1347,6 +1429,8 @@ static struct sort_dimension bstack_sort_dimensions[] = {
        DIM(SORT_IN_TX, "in_tx", sort_in_tx),
        DIM(SORT_ABORT, "abort", sort_abort),
        DIM(SORT_CYCLES, "cycles", sort_cycles),
+       DIM(SORT_SRCLINE_FROM, "srcline_from", sort_srcline_from),
+       DIM(SORT_SRCLINE_TO, "srcline_to", sort_srcline_to),
 };
 
 #undef DIM
index 42927f4..ebb59ca 100644 (file)
@@ -215,6 +215,8 @@ enum sort_type {
        SORT_ABORT,
        SORT_IN_TX,
        SORT_CYCLES,
+       SORT_SRCLINE_FROM,
+       SORT_SRCLINE_TO,
 
        /* memory mode specific sort keys */
        __SORT_MEMORY_MODE,
index fdb7196..aa9efe0 100644 (file)
@@ -94,7 +94,8 @@ void perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 *count,
 {
        int ctx = evsel_context(counter);
 
-       if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK))
+       if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK) ||
+           perf_evsel__match(counter, SOFTWARE, SW_CPU_CLOCK))
                update_stats(&runtime_nsecs_stats[cpu], count[0]);
        else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
                update_stats(&runtime_cycles_stats[ctx][cpu], count[0]);
@@ -188,7 +189,7 @@ static void print_stalled_cycles_backend(int cpu,
 
        color = get_ratio_color(GRC_STALLED_CYCLES_BE, ratio);
 
-       out->print_metric(out->ctx, color, "%6.2f%%", "backend cycles idle", ratio);
+       out->print_metric(out->ctx, color, "%7.2f%%", "backend cycles idle", ratio);
 }
 
 static void print_branch_misses(int cpu,
@@ -444,7 +445,8 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
                        ratio = total / avg;
 
                print_metric(ctxp, NULL, "%8.0f", "cycles / elision", ratio);
-       } else if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK)) {
+       } else if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK) ||
+                  perf_evsel__match(evsel, SOFTWARE, SW_CPU_CLOCK)) {
                if ((ratio = avg_stats(&walltime_nsecs_stats)) != 0)
                        print_metric(ctxp, NULL, "%8.3f", "CPUs utilized",
                                     avg / ratio);
index 7fb3330..20f9cb3 100644 (file)
@@ -1662,8 +1662,8 @@ static char *dso__find_kallsyms(struct dso *dso, struct map *map)
 
        build_id__sprintf(dso->build_id, sizeof(dso->build_id), sbuild_id);
 
-       scnprintf(path, sizeof(path), "%s/[kernel.kcore]/%s", buildid_dir,
-                 sbuild_id);
+       scnprintf(path, sizeof(path), "%s/%s/%s", buildid_dir,
+                 DSO__NAME_KCORE, sbuild_id);
 
        /* Use /proc/kallsyms if possible */
        if (is_host) {
@@ -1699,8 +1699,8 @@ static char *dso__find_kallsyms(struct dso *dso, struct map *map)
        if (!find_matching_kcore(map, path, sizeof(path)))
                return strdup(path);
 
-       scnprintf(path, sizeof(path), "%s/[kernel.kallsyms]/%s",
-                 buildid_dir, sbuild_id);
+       scnprintf(path, sizeof(path), "%s/%s/%s",
+                 buildid_dir, DSO__NAME_KALLSYMS, sbuild_id);
 
        if (access(path, F_OK)) {
                pr_err("No kallsyms or vmlinux with build-id %s was found\n",
@@ -1769,7 +1769,7 @@ do_kallsyms:
 
        if (err > 0 && !dso__is_kcore(dso)) {
                dso->binary_type = DSO_BINARY_TYPE__KALLSYMS;
-               dso__set_long_name(dso, "[kernel.kallsyms]", false);
+               dso__set_long_name(dso, DSO__NAME_KALLSYMS, false);
                map__fixup_start(map);
                map__fixup_end(map);
        }
@@ -2033,3 +2033,26 @@ void symbol__exit(void)
        symbol_conf.sym_list = symbol_conf.dso_list = symbol_conf.comm_list = NULL;
        symbol_conf.initialized = false;
 }
+
+int symbol__config_symfs(const struct option *opt __maybe_unused,
+                        const char *dir, int unset __maybe_unused)
+{
+       char *bf = NULL;
+       int ret;
+
+       symbol_conf.symfs = strdup(dir);
+       if (symbol_conf.symfs == NULL)
+               return -ENOMEM;
+
+       /* skip the locally configured cache if a symfs is given, and
+        * config buildid dir to symfs/.debug
+        */
+       ret = asprintf(&bf, "%s/%s", dir, ".debug");
+       if (ret < 0)
+               return -ENOMEM;
+
+       set_buildid_dir(bf);
+
+       free(bf);
+       return 0;
+}
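
symbol__config_symfs() above does two things: it records the --symfs root and redirects the build-id cache to "<symfs>/.debug", which is why the symfs special case could be dropped from the DSO_BINARY_TYPE__BUILD_ID_CACHE handling earlier in this diff. A sketch of just the path derivation outside perf follows; set_buildid_dir() is perf-internal, so this only prints the result.

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
        const char *dir = argc > 1 ? argv[1] : "/tmp/symfs";  /* stand-in for --symfs */
        char *bf = NULL;

        if (asprintf(&bf, "%s/%s", dir, ".debug") < 0)
                return 1;
        printf("build-id cache would move to %s\n", bf);
        free(bf);
        return 0;
}
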
index 2b5e4ed..b10d558 100644 (file)
@@ -44,6 +44,9 @@ Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
 #define DMGL_ANSI        (1 << 1)       /* Include const, volatile, etc */
 #endif
 
+#define DSO__NAME_KALLSYMS     "[kernel.kallsyms]"
+#define DSO__NAME_KCORE                "[kernel.kcore]"
+
 /** struct symbol - symtab entry
  *
  * @ignore - resolvable but tools ignore it (e.g. idle routines)
@@ -183,6 +186,8 @@ struct branch_info {
        struct addr_map_symbol from;
        struct addr_map_symbol to;
        struct branch_flags flags;
+       char                    *srcline_from;
+       char                    *srcline_to;
 };
 
 struct mem_info {
@@ -287,6 +292,8 @@ bool symbol_type__is_a(char symbol_type, enum map_type map_type);
 bool symbol__restricted_filename(const char *filename,
                                 const char *restricted_filename);
 bool symbol__is_idle(struct symbol *sym);
+int symbol__config_symfs(const struct option *opt __maybe_unused,
+                        const char *dir, int unset __maybe_unused);
 
 int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss,
                  struct symsrc *runtime_ss, symbol_filter_t filter,
index f92c37a..b2940c8 100644 (file)
@@ -27,7 +27,6 @@ struct perf_top {
        int                max_stack;
        bool               hide_kernel_symbols, hide_user_symbols, zero;
        bool               use_tui, use_stdio;
-       bool               kptr_restrict_warned;
        bool               vmlinux_warned;
        bool               dump_symtab;
        struct hist_entry  *sym_filter_entry;
index eab077a..23504ad 100644 (file)
@@ -33,7 +33,8 @@ struct callchain_param        callchain_param = {
 unsigned int page_size;
 int cacheline_size;
 
-unsigned int sysctl_perf_event_max_stack = PERF_MAX_STACK_DEPTH;
+int sysctl_perf_event_max_stack = PERF_MAX_STACK_DEPTH;
+int sysctl_perf_event_max_contexts_per_stack = PERF_MAX_CONTEXTS_PER_STACK;
 
 bool test_attr__enabled;
 
index 7651633..1e8c316 100644 (file)
@@ -261,7 +261,8 @@ void sighandler_dump_stack(int sig);
 
 extern unsigned int page_size;
 extern int cacheline_size;
-extern unsigned int sysctl_perf_event_max_stack;
+extern int sysctl_perf_event_max_stack;
+extern int sysctl_perf_event_max_contexts_per_stack;
 
 struct parse_tag {
        char tag;
index 7947e56..2e58549 100644 (file)
@@ -1234,6 +1234,10 @@ TEST_F(TRACE_poke, getpid_runs_normally)
 # define ARCH_REGS     struct user_pt_regs
 # define SYSCALL_NUM   regs[8]
 # define SYSCALL_RET   regs[0]
+#elif defined(__hppa__)
+# define ARCH_REGS     struct user_regs_struct
+# define SYSCALL_NUM   gr[20]
+# define SYSCALL_RET   gr[28]
 #elif defined(__powerpc__)
 # define ARCH_REGS     struct pt_regs
 # define SYSCALL_NUM   gpr[0]
@@ -1303,7 +1307,7 @@ void change_syscall(struct __test_metadata *_metadata,
        EXPECT_EQ(0, ret);
 
 #if defined(__x86_64__) || defined(__i386__) || defined(__powerpc__) || \
-    defined(__s390__)
+    defined(__s390__) || defined(__hppa__)
        {
                regs.SYSCALL_NUM = syscall;
        }
@@ -1505,6 +1509,8 @@ TEST_F(TRACE_syscall, syscall_dropped)
 #  define __NR_seccomp 383
 # elif defined(__aarch64__)
 #  define __NR_seccomp 277
+# elif defined(__hppa__)
+#  define __NR_seccomp 338
 # elif defined(__powerpc__)
 #  define __NR_seccomp 358
 # elif defined(__s390__)
index c879572..0bc737a 100644 (file)
@@ -30,7 +30,9 @@
 #define MAP_HUGE_1GB    (30 << MAP_HUGE_SHIFT)
 #define MAP_HUGE_SHIFT  26
 #define MAP_HUGE_MASK   0x3f
+#if !defined(MAP_HUGETLB)
 #define MAP_HUGETLB    0x40000
+#endif
 
 #define SHM_HUGETLB     04000   /* segment will use huge TLB pages */
 #define SHM_HUGE_SHIFT  26
index feaa64a..6ba7455 100644 (file)
@@ -1,6 +1,6 @@
 all:
 
-all: ring virtio_ring_0_9 virtio_ring_poll
+all: ring virtio_ring_0_9 virtio_ring_poll virtio_ring_inorder
 
 CFLAGS += -Wall
 CFLAGS += -pthread -O2 -ggdb
@@ -10,13 +10,16 @@ main.o: main.c main.h
 ring.o: ring.c main.h
 virtio_ring_0_9.o: virtio_ring_0_9.c main.h
 virtio_ring_poll.o: virtio_ring_poll.c virtio_ring_0_9.c main.h
+virtio_ring_inorder.o: virtio_ring_inorder.c virtio_ring_0_9.c main.h
 ring: ring.o main.o
 virtio_ring_0_9: virtio_ring_0_9.o main.o
 virtio_ring_poll: virtio_ring_poll.o main.o
+virtio_ring_inorder: virtio_ring_inorder.o main.o
 clean:
        -rm main.o
        -rm ring.o ring
        -rm virtio_ring_0_9.o virtio_ring_0_9
        -rm virtio_ring_poll.o virtio_ring_poll
+       -rm virtio_ring_inorder.o virtio_ring_inorder
 
 .PHONY: all clean
index 3a5ff43..147abb4 100644 (file)
@@ -115,7 +115,7 @@ static void run_guest(void)
                do {
                        if (started < bufs &&
                            started - completed < max_outstanding) {
-                               r = add_inbuf(0, NULL, "Hello, world!");
+                               r = add_inbuf(0, "Buffer\n", "Hello, world!");
                                if (__builtin_expect(r == 0, true)) {
                                        ++started;
                                        if (!--tokick) {
index 47c9a1a..7618662 100644 (file)
@@ -26,6 +26,14 @@ struct vring ring;
  * high bits of ring id ^ 0x8000).
  */
 /* #ifdef RING_POLL */
+/* enabling the below activates experimental in-order code
+ * (which skips ring updates and reads and writes len in descriptor).
+ */
+/* #ifdef INORDER */
+
+#if defined(RING_POLL) && defined(INORDER)
+#error "RING_POLL and INORDER are mutually exclusive"
+#endif
 
 /* how much padding is needed to avoid false cache sharing */
 #define HOST_GUEST_PADDING 0x80
@@ -35,7 +43,11 @@ struct guest {
        unsigned short last_used_idx;
        unsigned short num_free;
        unsigned short kicked_avail_idx;
+#ifndef INORDER
        unsigned short free_head;
+#else
+       unsigned short reserved_free_head;
+#endif
        unsigned char reserved[HOST_GUEST_PADDING - 10];
 } guest;
 
@@ -66,8 +78,10 @@ void alloc_ring(void)
        guest.avail_idx = 0;
        guest.kicked_avail_idx = -1;
        guest.last_used_idx = 0;
+#ifndef INORDER
        /* Put everything in free lists. */
        guest.free_head = 0;
+#endif
        for (i = 0; i < ring_size - 1; i++)
                ring.desc[i].next = i + 1;
        host.used_idx = 0;
@@ -84,13 +98,20 @@ void alloc_ring(void)
 /* guest side */
 int add_inbuf(unsigned len, void *buf, void *datap)
 {
-       unsigned head, avail;
+       unsigned head;
+#ifndef INORDER
+       unsigned avail;
+#endif
        struct vring_desc *desc;
 
        if (!guest.num_free)
                return -1;
 
+#ifdef INORDER
+       head = (ring_size - 1) & (guest.avail_idx++);
+#else
        head = guest.free_head;
+#endif
        guest.num_free--;
 
        desc = ring.desc;
@@ -102,7 +123,9 @@ int add_inbuf(unsigned len, void *buf, void *datap)
         * descriptors.
         */
        desc[head].flags &= ~VRING_DESC_F_NEXT;
+#ifndef INORDER
        guest.free_head = desc[head].next;
+#endif
 
        data[head].data = datap;
 
@@ -113,8 +136,12 @@ int add_inbuf(unsigned len, void *buf, void *datap)
        ring.avail->ring[avail & (ring_size - 1)] =
                (head | (avail & ~(ring_size - 1))) ^ 0x8000;
 #else
+#ifndef INORDER
+       /* Barrier A (for pairing) */
+       smp_release();
        avail = (ring_size - 1) & (guest.avail_idx++);
        ring.avail->ring[avail] = head;
+#endif
        /* Barrier A (for pairing) */
        smp_release();
 #endif
@@ -141,15 +168,27 @@ void *get_buf(unsigned *lenp, void **bufp)
                return NULL;
        /* Barrier B (for pairing) */
        smp_acquire();
+#ifdef INORDER
+       head = (ring_size - 1) & guest.last_used_idx;
+       index = head;
+#else
        head = (ring_size - 1) & guest.last_used_idx;
        index = ring.used->ring[head].id;
 #endif
+
+#endif
+#ifdef INORDER
+       *lenp = ring.desc[index].len;
+#else
        *lenp = ring.used->ring[head].len;
+#endif
        datap = data[index].data;
        *bufp = (void*)(unsigned long)ring.desc[index].addr;
        data[index].data = NULL;
+#ifndef INORDER
        ring.desc[index].next = guest.free_head;
        guest.free_head = index;
+#endif
        guest.num_free++;
        guest.last_used_idx++;
        return datap;
@@ -283,16 +322,24 @@ bool use_buf(unsigned *lenp, void **bufp)
        smp_acquire();
 
        used_idx &= ring_size - 1;
+#ifdef INORDER
+       head = used_idx;
+#else
        head = ring.avail->ring[used_idx];
+#endif
        desc = &ring.desc[head];
 #endif
 
        *lenp = desc->len;
        *bufp = (void *)(unsigned long)desc->addr;
 
+#ifdef INORDER
+       desc->len = desc->len - 1;
+#else
        /* now update used ring */
        ring.used->ring[used_idx].id = head;
        ring.used->ring[used_idx].len = desc->len - 1;
+#endif
        /* Barrier B (for pairing) */
        smp_release();
        host.used_idx++;
diff --git a/tools/virtio/ringtest/virtio_ring_inorder.c b/tools/virtio/ringtest/virtio_ring_inorder.c
new file mode 100644 (file)
index 0000000..2438ca5
--- /dev/null
@@ -0,0 +1,2 @@
+#define INORDER 1
+#include "virtio_ring_0_9.c"
index 409db33..e2d5b6f 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
 #include <linux/interrupt.h>
+#include <linux/irq.h>
 
 #include <clocksource/arm_arch_timer.h>
 #include <asm/arch_timer.h>
@@ -174,10 +175,10 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level)
 
        timer->active_cleared_last = false;
        timer->irq.level = new_level;
-       trace_kvm_timer_update_irq(vcpu->vcpu_id, timer->map->virt_irq,
+       trace_kvm_timer_update_irq(vcpu->vcpu_id, timer->irq.irq,
                                   timer->irq.level);
        ret = kvm_vgic_inject_mapped_irq(vcpu->kvm, vcpu->vcpu_id,
-                                        timer->map,
+                                        timer->irq.irq,
                                         timer->irq.level);
        WARN_ON(ret);
 }
@@ -196,7 +197,7 @@ static int kvm_timer_update_state(struct kvm_vcpu *vcpu)
         * because the guest would never see the interrupt.  Instead wait
         * until we call this function from kvm_timer_flush_hwstate.
         */
-       if (!vgic_initialized(vcpu->kvm))
+       if (!vgic_initialized(vcpu->kvm) || !timer->enabled)
                return -ENODEV;
 
        if (kvm_timer_should_fire(vcpu) != timer->irq.level)
@@ -274,10 +275,8 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
        * to ensure that hardware interrupts from the timer triggers a guest
        * exit.
        */
-       if (timer->irq.level || kvm_vgic_map_is_active(vcpu, timer->map))
-               phys_active = true;
-       else
-               phys_active = false;
+       phys_active = timer->irq.level ||
+                       kvm_vgic_map_is_active(vcpu, timer->irq.irq);
 
        /*
         * We want to avoid hitting the (re)distributor as much as
@@ -302,7 +301,7 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
        if (timer->active_cleared_last && !phys_active)
                return;
 
-       ret = irq_set_irqchip_state(timer->map->irq,
+       ret = irq_set_irqchip_state(host_vtimer_irq,
                                    IRQCHIP_STATE_ACTIVE,
                                    phys_active);
        WARN_ON(ret);
@@ -334,7 +333,6 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
                         const struct kvm_irq_level *irq)
 {
        struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
-       struct irq_phys_map *map;
 
        /*
         * The vcpu timer irq number cannot be determined in
@@ -353,15 +351,6 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
        timer->cntv_ctl = 0;
        kvm_timer_update_state(vcpu);
 
-       /*
-        * Tell the VGIC that the virtual interrupt is tied to a
-        * physical interrupt. We do that once per VCPU.
-        */
-       map = kvm_vgic_map_phys_irq(vcpu, irq->irq, host_vtimer_irq);
-       if (WARN_ON(IS_ERR(map)))
-               return PTR_ERR(map);
-
-       timer->map = map;
        return 0;
 }
 
@@ -487,14 +476,43 @@ void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu)
        struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
 
        timer_disarm(timer);
-       if (timer->map)
-               kvm_vgic_unmap_phys_irq(vcpu, timer->map);
+       kvm_vgic_unmap_phys_irq(vcpu, timer->irq.irq);
 }
 
-void kvm_timer_enable(struct kvm *kvm)
+int kvm_timer_enable(struct kvm_vcpu *vcpu)
 {
-       if (kvm->arch.timer.enabled)
-               return;
+       struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+       struct irq_desc *desc;
+       struct irq_data *data;
+       int phys_irq;
+       int ret;
+
+       if (timer->enabled)
+               return 0;
+
+       /*
+        * Find the physical IRQ number corresponding to the host_vtimer_irq
+        */
+       desc = irq_to_desc(host_vtimer_irq);
+       if (!desc) {
+               kvm_err("%s: no interrupt descriptor\n", __func__);
+               return -EINVAL;
+       }
+
+       data = irq_desc_get_irq_data(desc);
+       while (data->parent_data)
+               data = data->parent_data;
+
+       phys_irq = data->hwirq;
+
+       /*
+        * Tell the VGIC that the virtual interrupt is tied to a
+        * physical interrupt. We do that once per VCPU.
+        */
+       ret = kvm_vgic_map_phys_irq(vcpu, timer->irq.irq, phys_irq);
+       if (ret)
+               return ret;
+
 
        /*
         * There is a potential race here between VCPUs starting for the first
@@ -505,7 +523,9 @@ void kvm_timer_enable(struct kvm *kvm)
         * the arch timers are enabled.
         */
        if (timecounter && wqueue)
-               kvm->arch.timer.enabled = 1;
+               timer->enabled = 1;
+
+       return 0;
 }
 
 void kvm_timer_init(struct kvm *kvm)
index ea00d69..798866a 100644 (file)
 /* vcpu is already in the HYP VA space */
 void __hyp_text __timer_save_state(struct kvm_vcpu *vcpu)
 {
-       struct kvm *kvm = kern_hyp_va(vcpu->kvm);
        struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
        u64 val;
 
-       if (kvm->arch.timer.enabled) {
+       if (timer->enabled) {
                timer->cntv_ctl = read_sysreg_el0(cntv_ctl);
                timer->cntv_cval = read_sysreg_el0(cntv_cval);
        }
@@ -60,7 +59,7 @@ void __hyp_text __timer_restore_state(struct kvm_vcpu *vcpu)
        val |= CNTHCTL_EL1PCTEN;
        write_sysreg(val, cnthctl_el2);
 
-       if (kvm->arch.timer.enabled) {
+       if (timer->enabled) {
                write_sysreg(kvm->arch.timer.cntvoff, cntvoff_el2);
                write_sysreg_el0(timer->cntv_cval, cntv_cval);
                isb();
index 674bdf8..3a3a699 100644 (file)
 
 #include <asm/kvm_hyp.h>
 
+#ifdef CONFIG_KVM_NEW_VGIC
+extern struct vgic_global kvm_vgic_global_state;
+#define vgic_v2_params kvm_vgic_global_state
+#else
+extern struct vgic_params vgic_v2_params;
+#endif
+
 static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu,
                                            void __iomem *base)
 {
        struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-       int nr_lr = vcpu->arch.vgic_cpu.nr_lr;
+       int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr;
        u32 eisr0, eisr1;
        int i;
        bool expect_mi;
@@ -67,7 +74,7 @@ static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu,
 static void __hyp_text save_elrsr(struct kvm_vcpu *vcpu, void __iomem *base)
 {
        struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-       int nr_lr = vcpu->arch.vgic_cpu.nr_lr;
+       int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr;
        u32 elrsr0, elrsr1;
 
        elrsr0 = readl_relaxed(base + GICH_ELRSR0);
@@ -86,19 +93,18 @@ static void __hyp_text save_elrsr(struct kvm_vcpu *vcpu, void __iomem *base)
 static void __hyp_text save_lrs(struct kvm_vcpu *vcpu, void __iomem *base)
 {
        struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-       int nr_lr = vcpu->arch.vgic_cpu.nr_lr;
+       int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr;
        int i;
 
        for (i = 0; i < nr_lr; i++) {
                if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i)))
                        continue;
 
-               if (cpu_if->vgic_elrsr & (1UL << i)) {
+               if (cpu_if->vgic_elrsr & (1UL << i))
                        cpu_if->vgic_lr[i] &= ~GICH_LR_STATE;
-                       continue;
-               }
+               else
+                       cpu_if->vgic_lr[i] = readl_relaxed(base + GICH_LR0 + (i * 4));
 
-               cpu_if->vgic_lr[i] = readl_relaxed(base + GICH_LR0 + (i * 4));
                writel_relaxed(0, base + GICH_LR0 + (i * 4));
        }
 }
@@ -141,13 +147,13 @@ void __hyp_text __vgic_v2_restore_state(struct kvm_vcpu *vcpu)
        struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
        struct vgic_dist *vgic = &kvm->arch.vgic;
        void __iomem *base = kern_hyp_va(vgic->vctrl_base);
-       int i, nr_lr;
+       int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr;
+       int i;
        u64 live_lrs = 0;
 
        if (!base)
                return;
 
-       nr_lr = vcpu->arch.vgic_cpu.nr_lr;
 
        for (i = 0; i < nr_lr; i++)
                if (cpu_if->vgic_lr[i] & GICH_LR_STATE)
index 575c7aa..a027569 100644 (file)
@@ -436,7 +436,14 @@ static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu)
        return 0;
 }
 
-static bool irq_is_valid(struct kvm *kvm, int irq, bool is_ppi)
+#define irq_is_ppi(irq) ((irq) >= VGIC_NR_SGIS && (irq) < VGIC_NR_PRIVATE_IRQS)
+
+/*
+ * For one VM the interrupt type must be the same for each vcpu.
+ * As a PPI, the interrupt number is the same for all vcpus,
+ * while as an SPI it must be a separate number per vcpu.
+ */
+static bool pmu_irq_is_valid(struct kvm *kvm, int irq)
 {
        int i;
        struct kvm_vcpu *vcpu;
@@ -445,7 +452,7 @@ static bool irq_is_valid(struct kvm *kvm, int irq, bool is_ppi)
                if (!kvm_arm_pmu_irq_initialized(vcpu))
                        continue;
 
-               if (is_ppi) {
+               if (irq_is_ppi(irq)) {
                        if (vcpu->arch.pmu.irq_num != irq)
                                return false;
                } else {
@@ -457,7 +464,6 @@ static bool irq_is_valid(struct kvm *kvm, int irq, bool is_ppi)
        return true;
 }
 
-
 int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
 {
        switch (attr->attr) {
@@ -471,14 +477,11 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
                if (get_user(irq, uaddr))
                        return -EFAULT;
 
-               /*
-                * The PMU overflow interrupt could be a PPI or SPI, but for one
-                * VM the interrupt type must be same for each vcpu. As a PPI,
-                * the interrupt number is the same for all vcpus, while as an
-                * SPI it must be a separate number per vcpu.
-                */
-               if (irq < VGIC_NR_SGIS || irq >= vcpu->kvm->arch.vgic.nr_irqs ||
-                   !irq_is_valid(vcpu->kvm, irq, irq < VGIC_NR_PRIVATE_IRQS))
+               /* The PMU overflow interrupt can be a PPI or a valid SPI. */
+               if (!(irq_is_ppi(irq) || vgic_valid_spi(vcpu->kvm, irq)))
+                       return -EINVAL;
+
+               if (!pmu_irq_is_valid(vcpu->kvm, irq))
                        return -EINVAL;
 
                if (kvm_arm_pmu_irq_initialized(vcpu))
index 7e826c9..334cd7a 100644 (file)
@@ -171,7 +171,7 @@ static const struct vgic_ops vgic_v2_ops = {
        .enable                 = vgic_v2_enable,
 };
 
-static struct vgic_params vgic_v2_params;
+struct vgic_params __section(.hyp.text) vgic_v2_params;
 
 static void vgic_cpu_init_lrs(void *params)
 {
@@ -201,6 +201,8 @@ int vgic_v2_probe(const struct gic_kvm_info *gic_kvm_info,
        const struct resource *vctrl_res = &gic_kvm_info->vctrl;
        const struct resource *vcpu_res = &gic_kvm_info->vcpu;
 
+       memset(vgic, 0, sizeof(*vgic));
+
        if (!gic_kvm_info->maint_irq) {
                kvm_err("error getting vgic maintenance irq\n");
                ret = -ENXIO;
index c02a1b1..75b02fa 100644 (file)
 #include <asm/kvm_asm.h>
 #include <asm/kvm_mmu.h>
 
-/* These are for GICv2 emulation only */
-#define GICH_LR_VIRTUALID              (0x3ffUL << 0)
-#define GICH_LR_PHYSID_CPUID_SHIFT     (10)
-#define GICH_LR_PHYSID_CPUID           (7UL << GICH_LR_PHYSID_CPUID_SHIFT)
-#define ICH_LR_VIRTUALID_MASK          (BIT_ULL(32) - 1)
-
 static u32 ich_vtr_el2;
 
 static struct vgic_lr vgic_v3_get_lr(const struct kvm_vcpu *vcpu, int lr)
@@ -43,7 +37,7 @@ static struct vgic_lr vgic_v3_get_lr(const struct kvm_vcpu *vcpu, int lr)
        u64 val = vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr];
 
        if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
-               lr_desc.irq = val & ICH_LR_VIRTUALID_MASK;
+               lr_desc.irq = val & ICH_LR_VIRTUAL_ID_MASK;
        else
                lr_desc.irq = val & GICH_LR_VIRTUALID;
 
index 60668a7..c3bfbb9 100644 (file)
@@ -690,12 +690,11 @@ bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio,
  */
 void vgic_unqueue_irqs(struct kvm_vcpu *vcpu)
 {
-       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
        u64 elrsr = vgic_get_elrsr(vcpu);
        unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr);
        int i;
 
-       for_each_clear_bit(i, elrsr_ptr, vgic_cpu->nr_lr) {
+       for_each_clear_bit(i, elrsr_ptr, vgic->nr_lr) {
                struct vgic_lr lr = vgic_get_lr(vcpu, i);
 
                /*
@@ -820,7 +819,6 @@ static int vgic_handle_mmio_access(struct kvm_vcpu *vcpu,
        struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
        struct vgic_io_device *iodev = container_of(this,
                                                    struct vgic_io_device, dev);
-       struct kvm_run *run = vcpu->run;
        const struct vgic_io_range *range;
        struct kvm_exit_mmio mmio;
        bool updated_state;
@@ -849,12 +847,6 @@ static int vgic_handle_mmio_access(struct kvm_vcpu *vcpu,
                updated_state = false;
        }
        spin_unlock(&dist->lock);
-       run->mmio.is_write      = is_write;
-       run->mmio.len           = len;
-       run->mmio.phys_addr     = addr;
-       memcpy(run->mmio.data, val, len);
-
-       kvm_handle_mmio_return(vcpu, run);
 
        if (updated_state)
                vgic_kick_vcpus(vcpu->kvm);
@@ -1102,18 +1094,18 @@ static bool dist_active_irq(struct kvm_vcpu *vcpu)
        return test_bit(vcpu->vcpu_id, dist->irq_active_on_cpu);
 }
 
-bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, struct irq_phys_map *map)
+bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq)
 {
        int i;
 
-       for (i = 0; i < vcpu->arch.vgic_cpu.nr_lr; i++) {
+       for (i = 0; i < vgic->nr_lr; i++) {
                struct vgic_lr vlr = vgic_get_lr(vcpu, i);
 
-               if (vlr.irq == map->virt_irq && vlr.state & LR_STATE_ACTIVE)
+               if (vlr.irq == virt_irq && vlr.state & LR_STATE_ACTIVE)
                        return true;
        }
 
-       return vgic_irq_is_active(vcpu, map->virt_irq);
+       return vgic_irq_is_active(vcpu, virt_irq);
 }
 
 /*
@@ -1521,7 +1513,6 @@ static int vgic_validate_injection(struct kvm_vcpu *vcpu, int irq, int level)
 }
 
 static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
-                                  struct irq_phys_map *map,
                                   unsigned int irq_num, bool level)
 {
        struct vgic_dist *dist = &kvm->arch.vgic;
@@ -1660,14 +1651,14 @@ int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
        if (map)
                return -EINVAL;
 
-       return vgic_update_irq_pending(kvm, cpuid, NULL, irq_num, level);
+       return vgic_update_irq_pending(kvm, cpuid, irq_num, level);
 }
 
 /**
  * kvm_vgic_inject_mapped_irq - Inject a physically mapped IRQ to the vgic
  * @kvm:     The VM structure pointer
  * @cpuid:   The CPU for PPIs
- * @map:     Pointer to a irq_phys_map structure describing the mapping
+ * @virt_irq: The virtual IRQ to be injected
  * @level:   Edge-triggered:  true:  to trigger the interrupt
  *                           false: to ignore the call
  *          Level-sensitive  true:  raise the input signal
@@ -1678,7 +1669,7 @@ int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
  * being HIGH and 0 being LOW and all devices being active-HIGH.
  */
 int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid,
-                              struct irq_phys_map *map, bool level)
+                              unsigned int virt_irq, bool level)
 {
        int ret;
 
@@ -1686,7 +1677,7 @@ int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid,
        if (ret)
                return ret;
 
-       return vgic_update_irq_pending(kvm, cpuid, map, map->virt_irq, level);
+       return vgic_update_irq_pending(kvm, cpuid, virt_irq, level);
 }
 
 static irqreturn_t vgic_maintenance_handler(int irq, void *data)
@@ -1712,43 +1703,28 @@ static struct list_head *vgic_get_irq_phys_map_list(struct kvm_vcpu *vcpu,
 /**
  * kvm_vgic_map_phys_irq - map a virtual IRQ to a physical IRQ
  * @vcpu: The VCPU pointer
- * @virt_irq: The virtual irq number
- * @irq: The Linux IRQ number
+ * @virt_irq: The virtual IRQ number for the guest
+ * @phys_irq: The hardware IRQ number of the host
  *
  * Establish a mapping between a guest visible irq (@virt_irq) and a
- * Linux irq (@irq). On injection, @virt_irq will be associated with
- * the physical interrupt represented by @irq. This mapping can be
+ * hardware irq (@phys_irq). On injection, @virt_irq will be associated with
+ * the physical interrupt represented by @phys_irq. This mapping can be
  * established multiple times as long as the parameters are the same.
  *
- * Returns a valid pointer on success, and an error pointer otherwise
+ * Returns 0 on success or an error value otherwise.
  */
-struct irq_phys_map *kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu,
-                                          int virt_irq, int irq)
+int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, int virt_irq, int phys_irq)
 {
        struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
        struct list_head *root = vgic_get_irq_phys_map_list(vcpu, virt_irq);
        struct irq_phys_map *map;
        struct irq_phys_map_entry *entry;
-       struct irq_desc *desc;
-       struct irq_data *data;
-       int phys_irq;
-
-       desc = irq_to_desc(irq);
-       if (!desc) {
-               kvm_err("%s: no interrupt descriptor\n", __func__);
-               return ERR_PTR(-EINVAL);
-       }
-
-       data = irq_desc_get_irq_data(desc);
-       while (data->parent_data)
-               data = data->parent_data;
-
-       phys_irq = data->hwirq;
+       int ret = 0;
 
        /* Create a new mapping */
        entry = kzalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry)
-               return ERR_PTR(-ENOMEM);
+               return -ENOMEM;
 
        spin_lock(&dist->irq_phys_map_lock);
 
@@ -1756,9 +1732,8 @@ struct irq_phys_map *kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu,
        map = vgic_irq_map_search(vcpu, virt_irq);
        if (map) {
                /* Make sure this mapping matches */
-               if (map->phys_irq != phys_irq   ||
-                   map->irq      != irq)
-                       map = ERR_PTR(-EINVAL);
+               if (map->phys_irq != phys_irq)
+                       ret = -EINVAL;
 
                /* Found an existing, valid mapping */
                goto out;
@@ -1767,7 +1742,6 @@ struct irq_phys_map *kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu,
        map           = &entry->map;
        map->virt_irq = virt_irq;
        map->phys_irq = phys_irq;
-       map->irq      = irq;
 
        list_add_tail_rcu(&entry->entry, root);
 
@@ -1775,9 +1749,9 @@ out:
        spin_unlock(&dist->irq_phys_map_lock);
        /* If we've found a hit in the existing list, free the useless
         * entry */
-       if (IS_ERR(map) || map != &entry->map)
+       if (ret || map != &entry->map)
                kfree(entry);
-       return map;
+       return ret;
 }
 
 static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu,
@@ -1813,25 +1787,22 @@ static void vgic_free_phys_irq_map_rcu(struct rcu_head *rcu)
 /**
  * kvm_vgic_unmap_phys_irq - Remove a virtual to physical IRQ mapping
  * @vcpu: The VCPU pointer
- * @map: The pointer to a mapping obtained through kvm_vgic_map_phys_irq
+ * @virt_irq: The virtual IRQ number to be unmapped
  *
  * Remove an existing mapping between virtual and physical interrupts.
  */
-int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, struct irq_phys_map *map)
+int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq)
 {
        struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
        struct irq_phys_map_entry *entry;
        struct list_head *root;
 
-       if (!map)
-               return -EINVAL;
-
-       root = vgic_get_irq_phys_map_list(vcpu, map->virt_irq);
+       root = vgic_get_irq_phys_map_list(vcpu, virt_irq);
 
        spin_lock(&dist->irq_phys_map_lock);
 
        list_for_each_entry(entry, root, entry) {
-               if (&entry->map == map) {
+               if (entry->map.virt_irq == virt_irq) {
                        list_del_rcu(&entry->entry);
                        call_rcu(&entry->rcu, vgic_free_phys_irq_map_rcu);
                        break;
@@ -1887,13 +1858,6 @@ static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs)
                return -ENOMEM;
        }
 
-       /*
-        * Store the number of LRs per vcpu, so we don't have to go
-        * all the way to the distributor structure to find out. Only
-        * assembly code should use this one.
-        */
-       vgic_cpu->nr_lr = vgic->nr_lr;
-
        return 0;
 }
 
diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c
new file mode 100644 (file)
index 0000000..a1442f7
--- /dev/null
@@ -0,0 +1,452 @@
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/uaccess.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+#include <linux/kvm_host.h>
+#include <kvm/arm_vgic.h>
+#include <asm/kvm_mmu.h>
+#include "vgic.h"
+
+/*
+ * Initialization rules: there are multiple stages to the vgic
+ * initialization, both for the distributor and the CPU interfaces.
+ *
+ * Distributor:
+ *
+ * - kvm_vgic_early_init(): initialization of static data that doesn't
+ *   depend on any sizing information or emulation type. No allocation
+ *   is allowed there.
+ *
+ * - vgic_init(): allocation and initialization of the generic data
+ *   structures that depend on sizing information (number of CPUs,
+ *   number of interrupts). Also initializes the vcpu specific data
+ *   structures. Can be executed lazily for GICv2.
+ *
+ * CPU Interface:
+ *
+ * - kvm_vgic_cpu_early_init(): initialization of static data that
+ *   doesn't depend on any sizing information or emulation type. No
+ *   allocation is allowed there.
+ */
+
+/* EARLY INIT */
+
+/*
+ * These two functions should not be needed anymore, but they
+ * are still called from arm.c.
+ */
+void kvm_vgic_early_init(struct kvm *kvm)
+{
+}
+
+void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu)
+{
+}
+
+/* CREATION */
+
+/**
+ * kvm_vgic_create: triggered by the instantiation of the VGIC device by
+ * user space, either through the legacy KVM_CREATE_IRQCHIP ioctl (v2 only)
+ * or through the generic KVM_CREATE_DEVICE API ioctl.
+ * irqchip_in_kernel() tells you if this function succeeded or not.
+ * @kvm: kvm struct pointer
+ * @type: KVM_DEV_TYPE_ARM_VGIC_V[23]
+ */
+int kvm_vgic_create(struct kvm *kvm, u32 type)
+{
+       int i, vcpu_lock_idx = -1, ret;
+       struct kvm_vcpu *vcpu;
+
+       mutex_lock(&kvm->lock);
+
+       if (irqchip_in_kernel(kvm)) {
+               ret = -EEXIST;
+               goto out;
+       }
+
+       /*
+        * This function is also called by the KVM_CREATE_IRQCHIP handler,
+        * which had no chance yet to check the availability of the GICv2
+        * emulation. So check this here again. KVM_CREATE_DEVICE does
+        * the proper checks already.
+        */
+       if (type == KVM_DEV_TYPE_ARM_VGIC_V2 &&
+               !kvm_vgic_global_state.can_emulate_gicv2) {
+               ret = -ENODEV;
+               goto out;
+       }
+
+       /*
+        * Any time a vcpu is run, vcpu_load is called which tries to grab the
+        * vcpu->mutex.  By grabbing the vcpu->mutex of all VCPUs we ensure
+        * that no other VCPUs are run while we create the vgic.
+        */
+       ret = -EBUSY;
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               if (!mutex_trylock(&vcpu->mutex))
+                       goto out_unlock;
+               vcpu_lock_idx = i;
+       }
+
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               if (vcpu->arch.has_run_once)
+                       goto out_unlock;
+       }
+       ret = 0;
+
+       if (type == KVM_DEV_TYPE_ARM_VGIC_V2)
+               kvm->arch.max_vcpus = VGIC_V2_MAX_CPUS;
+       else
+               kvm->arch.max_vcpus = VGIC_V3_MAX_CPUS;
+
+       if (atomic_read(&kvm->online_vcpus) > kvm->arch.max_vcpus) {
+               ret = -E2BIG;
+               goto out_unlock;
+       }
+
+       kvm->arch.vgic.in_kernel = true;
+       kvm->arch.vgic.vgic_model = type;
+
+       /*
+        * kvm_vgic_global_state.vctrl_base is set on vgic probe (kvm_arch_init);
+        * it is stored in the distributor struct for asm save/restore purposes.
+        */
+       kvm->arch.vgic.vctrl_base = kvm_vgic_global_state.vctrl_base;
+
+       kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
+       kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
+       kvm->arch.vgic.vgic_redist_base = VGIC_ADDR_UNDEF;
+
+out_unlock:
+       for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
+               vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
+               mutex_unlock(&vcpu->mutex);
+       }
+
+out:
+       mutex_unlock(&kvm->lock);
+       return ret;
+}
+
+/* INIT/DESTROY */
+
+/**
+ * kvm_vgic_dist_init: initialize the dist data structures
+ * @kvm: kvm struct pointer
+ * @nr_spis: number of spis, frozen by caller
+ */
+static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0);
+       int i;
+
+       dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL);
+       if (!dist->spis)
+               return  -ENOMEM;
+
+       /*
+        * In the following code we do not take the irq struct lock since
+        * no other action on irq structs can happen while the VGIC is
+        * not initialized yet:
+        * If someone wants to inject an interrupt or does an MMIO access, we
+        * require prior initialization in case of a virtual GICv3 or trigger
+        * initialization when using a virtual GICv2.
+        */
+       for (i = 0; i < nr_spis; i++) {
+               struct vgic_irq *irq = &dist->spis[i];
+
+               irq->intid = i + VGIC_NR_PRIVATE_IRQS;
+               INIT_LIST_HEAD(&irq->ap_list);
+               spin_lock_init(&irq->irq_lock);
+               irq->vcpu = NULL;
+               irq->target_vcpu = vcpu0;
+               if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2)
+                       irq->targets = 0;
+               else
+                       irq->mpidr = 0;
+       }
+       return 0;
+}
+
+/**
+ * kvm_vgic_vcpu_init: initialize the vcpu data structures and
+ * enable the VCPU interface
+ * @vcpu: the VCPU whose VGIC should be initialized
+ */
+static void kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+       int i;
+
+       INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
+       spin_lock_init(&vgic_cpu->ap_list_lock);
+
+       /*
+        * Enable and configure all SGIs to be edge-triggered and
+        * configure all PPIs as level-triggered.
+        */
+       for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) {
+               struct vgic_irq *irq = &vgic_cpu->private_irqs[i];
+
+               INIT_LIST_HEAD(&irq->ap_list);
+               spin_lock_init(&irq->irq_lock);
+               irq->intid = i;
+               irq->vcpu = NULL;
+               irq->target_vcpu = vcpu;
+               irq->targets = 1U << vcpu->vcpu_id;
+               if (vgic_irq_is_sgi(i)) {
+                       /* SGIs */
+                       irq->enabled = 1;
+                       irq->config = VGIC_CONFIG_EDGE;
+               } else {
+                       /* PPIs */
+                       irq->config = VGIC_CONFIG_LEVEL;
+               }
+       }
+       if (kvm_vgic_global_state.type == VGIC_V2)
+               vgic_v2_enable(vcpu);
+       else
+               vgic_v3_enable(vcpu);
+}
+
+/*
+ * vgic_init: allocates and initializes dist and vcpu data structures
+ * depending on two dimensioning parameters:
+ * - the number of spis
+ * - the number of vcpus
+ * The function is generally called when nr_spis has been explicitly set
+ * by the guest through the KVM DEVICE API. If not, nr_spis is set to 256.
+ * vgic_initialized() returns true when this function has succeeded.
+ * Must be called with kvm->lock held!
+ */
+int vgic_init(struct kvm *kvm)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       struct kvm_vcpu *vcpu;
+       int ret = 0, i;
+
+       if (vgic_initialized(kvm))
+               return 0;
+
+       /* freeze the number of spis */
+       if (!dist->nr_spis)
+               dist->nr_spis = VGIC_NR_IRQS_LEGACY - VGIC_NR_PRIVATE_IRQS;
+
+       ret = kvm_vgic_dist_init(kvm, dist->nr_spis);
+       if (ret)
+               goto out;
+
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               kvm_vgic_vcpu_init(vcpu);
+
+       dist->initialized = true;
+out:
+       return ret;
+}
+
+static void kvm_vgic_dist_destroy(struct kvm *kvm)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+
+       mutex_lock(&kvm->lock);
+
+       dist->ready = false;
+       dist->initialized = false;
+
+       kfree(dist->spis);
+       kfree(dist->redist_iodevs);
+       dist->nr_spis = 0;
+
+       mutex_unlock(&kvm->lock);
+}
+
+void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+       INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
+}
+
+void kvm_vgic_destroy(struct kvm *kvm)
+{
+       struct kvm_vcpu *vcpu;
+       int i;
+
+       kvm_vgic_dist_destroy(kvm);
+
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               kvm_vgic_vcpu_destroy(vcpu);
+}
+
+/**
+ * vgic_lazy_init: Lazy init is only allowed if the GIC exposed to the guest
+ * is a GICv2. A GICv3 must be explicitly initialized by the guest using the
+ * KVM_DEV_ARM_VGIC_GRP_CTRL KVM_DEVICE group.
+ * @kvm: kvm struct pointer
+ */
+int vgic_lazy_init(struct kvm *kvm)
+{
+       int ret = 0;
+
+       if (unlikely(!vgic_initialized(kvm))) {
+               /*
+                * We only provide the automatic initialization of the VGIC
+                * for the legacy case of a GICv2. Any other type must
+                * be explicitly initialized once set up with the respective
+                * KVM device call.
+                */
+               if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2)
+                       return -EBUSY;
+
+               mutex_lock(&kvm->lock);
+               ret = vgic_init(kvm);
+               mutex_unlock(&kvm->lock);
+       }
+
+       return ret;
+}
+
+/* RESOURCE MAPPING */
+
+/**
+ * Map the MMIO regions depending on the VGIC model exposed to the guest;
+ * called on the first VCPU run.
+ * Also map the virtual CPU interface into the VM.
+ * v2/v3 derivatives call vgic_init if not already done.
+ * vgic_ready() returns true if this function has succeeded.
+ * @kvm: kvm struct pointer
+ */
+int kvm_vgic_map_resources(struct kvm *kvm)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       int ret = 0;
+
+       mutex_lock(&kvm->lock);
+       if (!irqchip_in_kernel(kvm))
+               goto out;
+
+       if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2)
+               ret = vgic_v2_map_resources(kvm);
+       else
+               ret = vgic_v3_map_resources(kvm);
+out:
+       mutex_unlock(&kvm->lock);
+       return ret;
+}
+
+/* GENERIC PROBE */
+
+static void vgic_init_maintenance_interrupt(void *info)
+{
+       enable_percpu_irq(kvm_vgic_global_state.maint_irq, 0);
+}
+
+static int vgic_cpu_notify(struct notifier_block *self,
+                          unsigned long action, void *cpu)
+{
+       switch (action) {
+       case CPU_STARTING:
+       case CPU_STARTING_FROZEN:
+               vgic_init_maintenance_interrupt(NULL);
+               break;
+       case CPU_DYING:
+       case CPU_DYING_FROZEN:
+               disable_percpu_irq(kvm_vgic_global_state.maint_irq);
+               break;
+       }
+
+       return NOTIFY_OK;
+}
+
+static struct notifier_block vgic_cpu_nb = {
+       .notifier_call = vgic_cpu_notify,
+};
+
+static irqreturn_t vgic_maintenance_handler(int irq, void *data)
+{
+       /*
+        * We cannot rely on the vgic maintenance interrupt to be
+        * delivered synchronously. This means we can only use it to
+        * exit the VM, and we perform the handling of EOIed
+        * interrupts on the exit path (see vgic_process_maintenance).
+        */
+       return IRQ_HANDLED;
+}
+
+/**
+ * kvm_vgic_hyp_init: populates the kvm_vgic_global_state variable
+ * according to the host GIC model. Accordingly calls either
+ * vgic_v2/v3_probe which registers the KVM_DEVICE that can be
+ * instantiated by a guest later on.
+ */
+int kvm_vgic_hyp_init(void)
+{
+       const struct gic_kvm_info *gic_kvm_info;
+       int ret;
+
+       gic_kvm_info = gic_get_kvm_info();
+       if (!gic_kvm_info)
+               return -ENODEV;
+
+       if (!gic_kvm_info->maint_irq) {
+               kvm_err("No vgic maintenance irq\n");
+               return -ENXIO;
+       }
+
+       switch (gic_kvm_info->type) {
+       case GIC_V2:
+               ret = vgic_v2_probe(gic_kvm_info);
+               break;
+       case GIC_V3:
+               ret = vgic_v3_probe(gic_kvm_info);
+               break;
+       default:
+               ret = -ENODEV;
+       };
+
+       if (ret)
+               return ret;
+
+       kvm_vgic_global_state.maint_irq = gic_kvm_info->maint_irq;
+       ret = request_percpu_irq(kvm_vgic_global_state.maint_irq,
+                                vgic_maintenance_handler,
+                                "vgic", kvm_get_running_vcpus());
+       if (ret) {
+               kvm_err("Cannot register interrupt %d\n",
+                       kvm_vgic_global_state.maint_irq);
+               return ret;
+       }
+
+       ret = __register_cpu_notifier(&vgic_cpu_nb);
+       if (ret) {
+               kvm_err("Cannot register vgic CPU notifier\n");
+               goto out_free_irq;
+       }
+
+       on_each_cpu(vgic_init_maintenance_interrupt, NULL, 1);
+
+       kvm_info("vgic interrupt IRQ%d\n", kvm_vgic_global_state.maint_irq);
+       return 0;
+
+out_free_irq:
+       free_percpu_irq(kvm_vgic_global_state.maint_irq,
+                       kvm_get_running_vcpus());
+       return ret;
+}
diff --git a/virt/kvm/arm/vgic/vgic-irqfd.c b/virt/kvm/arm/vgic/vgic-irqfd.c
new file mode 100644 (file)
index 0000000..c675513
--- /dev/null
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <trace/events/kvm.h>
+
+int kvm_irq_map_gsi(struct kvm *kvm,
+                   struct kvm_kernel_irq_routing_entry *entries,
+                   int gsi)
+{
+       return 0;
+}
+
+int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned int irqchip,
+                        unsigned int pin)
+{
+       return pin;
+}
+
+int kvm_set_irq(struct kvm *kvm, int irq_source_id,
+               u32 irq, int level, bool line_status)
+{
+       unsigned int spi = irq + VGIC_NR_PRIVATE_IRQS;
+
+       trace_kvm_set_irq(irq, level, irq_source_id);
+
+       BUG_ON(!vgic_initialized(kvm));
+
+       return kvm_vgic_inject_irq(kvm, 0, spi, level);
+}
+
+/* MSI not implemented yet */
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
+               struct kvm *kvm, int irq_source_id,
+               int level, bool line_status)
+{
+       return 0;
+}
diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c
new file mode 100644 (file)
index 0000000..0130c4b
--- /dev/null
@@ -0,0 +1,431 @@
+/*
+ * VGIC: KVM DEVICE API
+ *
+ * Copyright (C) 2015 ARM Ltd.
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#include <linux/kvm_host.h>
+#include <kvm/arm_vgic.h>
+#include <linux/uaccess.h>
+#include <asm/kvm_mmu.h>
+#include "vgic.h"
+
+/* common helpers */
+
+static int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
+                            phys_addr_t addr, phys_addr_t alignment)
+{
+       if (addr & ~KVM_PHYS_MASK)
+               return -E2BIG;
+
+       if (!IS_ALIGNED(addr, alignment))
+               return -EINVAL;
+
+       if (!IS_VGIC_ADDR_UNDEF(*ioaddr))
+               return -EEXIST;
+
+       return 0;
+}
+
+/**
+ * kvm_vgic_addr - set or get vgic VM base addresses
+ * @kvm:   pointer to the vm struct
+ * @type:  the VGIC addr type, one of KVM_VGIC_V[23]_ADDR_TYPE_XXX
+ * @addr:  pointer to address value
+ * @write: if true set the address in the VM address space, if false read the
+ *          address
+ *
+ * Set or get the vgic base addresses for the distributor and the virtual CPU
+ * interface in the VM physical address space.  These addresses are properties
+ * of the emulated core/SoC and therefore user space initially knows this
+ * information.
+ * Check them for sanity (alignment, double assignment). We can't check for
+ * overlapping regions in case of a virtual GICv3 here, since we don't know
+ * the number of VCPUs yet, so we defer this check to map_resources().
+ */
+int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write)
+{
+       int r = 0;
+       struct vgic_dist *vgic = &kvm->arch.vgic;
+       int type_needed;
+       phys_addr_t *addr_ptr, alignment;
+
+       mutex_lock(&kvm->lock);
+       switch (type) {
+       case KVM_VGIC_V2_ADDR_TYPE_DIST:
+               type_needed = KVM_DEV_TYPE_ARM_VGIC_V2;
+               addr_ptr = &vgic->vgic_dist_base;
+               alignment = SZ_4K;
+               break;
+       case KVM_VGIC_V2_ADDR_TYPE_CPU:
+               type_needed = KVM_DEV_TYPE_ARM_VGIC_V2;
+               addr_ptr = &vgic->vgic_cpu_base;
+               alignment = SZ_4K;
+               break;
+#ifdef CONFIG_KVM_ARM_VGIC_V3
+       case KVM_VGIC_V3_ADDR_TYPE_DIST:
+               type_needed = KVM_DEV_TYPE_ARM_VGIC_V3;
+               addr_ptr = &vgic->vgic_dist_base;
+               alignment = SZ_64K;
+               break;
+       case KVM_VGIC_V3_ADDR_TYPE_REDIST:
+               type_needed = KVM_DEV_TYPE_ARM_VGIC_V3;
+               addr_ptr = &vgic->vgic_redist_base;
+               alignment = SZ_64K;
+               break;
+#endif
+       default:
+               r = -ENODEV;
+               goto out;
+       }
+
+       if (vgic->vgic_model != type_needed) {
+               r = -ENODEV;
+               goto out;
+       }
+
+       if (write) {
+               r = vgic_check_ioaddr(kvm, addr_ptr, *addr, alignment);
+               if (!r)
+                       *addr_ptr = *addr;
+       } else {
+               *addr = *addr_ptr;
+       }
+
+out:
+       mutex_unlock(&kvm->lock);
+       return r;
+}
+
+static int vgic_set_common_attr(struct kvm_device *dev,
+                               struct kvm_device_attr *attr)
+{
+       int r;
+
+       switch (attr->group) {
+       case KVM_DEV_ARM_VGIC_GRP_ADDR: {
+               u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+               u64 addr;
+               unsigned long type = (unsigned long)attr->attr;
+
+               if (copy_from_user(&addr, uaddr, sizeof(addr)))
+                       return -EFAULT;
+
+               r = kvm_vgic_addr(dev->kvm, type, &addr, true);
+               return (r == -ENODEV) ? -ENXIO : r;
+       }
+       case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
+               u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+               u32 val;
+               int ret = 0;
+
+               if (get_user(val, uaddr))
+                       return -EFAULT;
+
+               /*
+                * We require:
+                * - at least 32 SPIs on top of the 16 SGIs and 16 PPIs
+                * - at most 1024 interrupts
+                * - a multiple of 32 interrupts
+                */
+               if (val < (VGIC_NR_PRIVATE_IRQS + 32) ||
+                   val > VGIC_MAX_RESERVED ||
+                   (val & 31))
+                       return -EINVAL;
+
+               mutex_lock(&dev->kvm->lock);
+
+               if (vgic_ready(dev->kvm) || dev->kvm->arch.vgic.nr_spis)
+                       ret = -EBUSY;
+               else
+                       dev->kvm->arch.vgic.nr_spis =
+                               val - VGIC_NR_PRIVATE_IRQS;
+
+               mutex_unlock(&dev->kvm->lock);
+
+               return ret;
+       }
+       case KVM_DEV_ARM_VGIC_GRP_CTRL: {
+               switch (attr->attr) {
+               case KVM_DEV_ARM_VGIC_CTRL_INIT:
+                       mutex_lock(&dev->kvm->lock);
+                       r = vgic_init(dev->kvm);
+                       mutex_unlock(&dev->kvm->lock);
+                       return r;
+               }
+               break;
+       }
+       }
+
+       return -ENXIO;
+}
+
+static int vgic_get_common_attr(struct kvm_device *dev,
+                               struct kvm_device_attr *attr)
+{
+       int r = -ENXIO;
+
+       switch (attr->group) {
+       case KVM_DEV_ARM_VGIC_GRP_ADDR: {
+               u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+               u64 addr;
+               unsigned long type = (unsigned long)attr->attr;
+
+               r = kvm_vgic_addr(dev->kvm, type, &addr, false);
+               if (r)
+                       return (r == -ENODEV) ? -ENXIO : r;
+
+               if (copy_to_user(uaddr, &addr, sizeof(addr)))
+                       return -EFAULT;
+               break;
+       }
+       case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
+               u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+
+               r = put_user(dev->kvm->arch.vgic.nr_spis +
+                            VGIC_NR_PRIVATE_IRQS, uaddr);
+               break;
+       }
+       }
+
+       return r;
+}
+
+static int vgic_create(struct kvm_device *dev, u32 type)
+{
+       return kvm_vgic_create(dev->kvm, type);
+}
+
+static void vgic_destroy(struct kvm_device *dev)
+{
+       kfree(dev);
+}
+
+void kvm_register_vgic_device(unsigned long type)
+{
+       switch (type) {
+       case KVM_DEV_TYPE_ARM_VGIC_V2:
+               kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
+                                       KVM_DEV_TYPE_ARM_VGIC_V2);
+               break;
+#ifdef CONFIG_KVM_ARM_VGIC_V3
+       case KVM_DEV_TYPE_ARM_VGIC_V3:
+               kvm_register_device_ops(&kvm_arm_vgic_v3_ops,
+                                       KVM_DEV_TYPE_ARM_VGIC_V3);
+               break;
+#endif
+       }
+}
+
+/** vgic_attr_regs_access: allows user space to read/write VGIC registers
+ *
+ * @dev: kvm device handle
+ * @attr: kvm device attribute
+ * @reg: address the value is read or written
+ * @is_write: write flag
+ *
+ */
+static int vgic_attr_regs_access(struct kvm_device *dev,
+                                struct kvm_device_attr *attr,
+                                u32 *reg, bool is_write)
+{
+       gpa_t addr;
+       int cpuid, ret, c;
+       struct kvm_vcpu *vcpu, *tmp_vcpu;
+       int vcpu_lock_idx = -1;
+
+       cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >>
+                KVM_DEV_ARM_VGIC_CPUID_SHIFT;
+       vcpu = kvm_get_vcpu(dev->kvm, cpuid);
+       addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
+
+       mutex_lock(&dev->kvm->lock);
+
+       ret = vgic_init(dev->kvm);
+       if (ret)
+               goto out;
+
+       if (cpuid >= atomic_read(&dev->kvm->online_vcpus)) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       /*
+        * Any time a vcpu is run, vcpu_load is called which tries to grab the
+        * vcpu->mutex.  By grabbing the vcpu->mutex of all VCPUs we ensure
+        * that no other VCPUs are run and fiddle with the vgic state while we
+        * access it.
+        */
+       ret = -EBUSY;
+       kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm) {
+               if (!mutex_trylock(&tmp_vcpu->mutex))
+                       goto out;
+               vcpu_lock_idx = c;
+       }
+
+       switch (attr->group) {
+       case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
+               ret = vgic_v2_cpuif_uaccess(vcpu, is_write, addr, reg);
+               break;
+       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+               ret = vgic_v2_dist_uaccess(vcpu, is_write, addr, reg);
+               break;
+       default:
+               ret = -EINVAL;
+               break;
+       }
+
+out:
+       for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
+               tmp_vcpu = kvm_get_vcpu(dev->kvm, vcpu_lock_idx);
+               mutex_unlock(&tmp_vcpu->mutex);
+       }
+
+       mutex_unlock(&dev->kvm->lock);
+       return ret;
+}
+
+/* V2 ops */
+
+static int vgic_v2_set_attr(struct kvm_device *dev,
+                           struct kvm_device_attr *attr)
+{
+       int ret;
+
+       ret = vgic_set_common_attr(dev, attr);
+       if (ret != -ENXIO)
+               return ret;
+
+       switch (attr->group) {
+       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+       case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
+               u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+               u32 reg;
+
+               if (get_user(reg, uaddr))
+                       return -EFAULT;
+
+               return vgic_attr_regs_access(dev, attr, &reg, true);
+       }
+       }
+
+       return -ENXIO;
+}
+
+static int vgic_v2_get_attr(struct kvm_device *dev,
+                           struct kvm_device_attr *attr)
+{
+       int ret;
+
+       ret = vgic_get_common_attr(dev, attr);
+       if (ret != -ENXIO)
+               return ret;
+
+       switch (attr->group) {
+       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+       case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
+               u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+               u32 reg = 0;
+
+               ret = vgic_attr_regs_access(dev, attr, &reg, false);
+               if (ret)
+                       return ret;
+               return put_user(reg, uaddr);
+       }
+       }
+
+       return -ENXIO;
+}
+
+static int vgic_v2_has_attr(struct kvm_device *dev,
+                           struct kvm_device_attr *attr)
+{
+       switch (attr->group) {
+       case KVM_DEV_ARM_VGIC_GRP_ADDR:
+               switch (attr->attr) {
+               case KVM_VGIC_V2_ADDR_TYPE_DIST:
+               case KVM_VGIC_V2_ADDR_TYPE_CPU:
+                       return 0;
+               }
+               break;
+       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+       case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
+               return vgic_v2_has_attr_regs(dev, attr);
+       case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
+               return 0;
+       case KVM_DEV_ARM_VGIC_GRP_CTRL:
+               switch (attr->attr) {
+               case KVM_DEV_ARM_VGIC_CTRL_INIT:
+                       return 0;
+               }
+       }
+       return -ENXIO;
+}
+
+struct kvm_device_ops kvm_arm_vgic_v2_ops = {
+       .name = "kvm-arm-vgic-v2",
+       .create = vgic_create,
+       .destroy = vgic_destroy,
+       .set_attr = vgic_v2_set_attr,
+       .get_attr = vgic_v2_get_attr,
+       .has_attr = vgic_v2_has_attr,
+};
+
+/* V3 ops */
+
+#ifdef CONFIG_KVM_ARM_VGIC_V3
+
+static int vgic_v3_set_attr(struct kvm_device *dev,
+                           struct kvm_device_attr *attr)
+{
+       return vgic_set_common_attr(dev, attr);
+}
+
+static int vgic_v3_get_attr(struct kvm_device *dev,
+                           struct kvm_device_attr *attr)
+{
+       return vgic_get_common_attr(dev, attr);
+}
+
+static int vgic_v3_has_attr(struct kvm_device *dev,
+                           struct kvm_device_attr *attr)
+{
+       switch (attr->group) {
+       case KVM_DEV_ARM_VGIC_GRP_ADDR:
+               switch (attr->attr) {
+               case KVM_VGIC_V3_ADDR_TYPE_DIST:
+               case KVM_VGIC_V3_ADDR_TYPE_REDIST:
+                       return 0;
+               }
+               break;
+       case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
+               return 0;
+       case KVM_DEV_ARM_VGIC_GRP_CTRL:
+               switch (attr->attr) {
+               case KVM_DEV_ARM_VGIC_CTRL_INIT:
+                       return 0;
+               }
+       }
+       return -ENXIO;
+}
+
+struct kvm_device_ops kvm_arm_vgic_v3_ops = {
+       .name = "kvm-arm-vgic-v3",
+       .create = vgic_create,
+       .destroy = vgic_destroy,
+       .set_attr = vgic_v3_set_attr,
+       .get_attr = vgic_v3_get_attr,
+       .has_attr = vgic_v3_has_attr,
+};
+
+#endif /* CONFIG_KVM_ARM_VGIC_V3 */
+
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c
new file mode 100644 (file)
index 0000000..a213936
--- /dev/null
@@ -0,0 +1,446 @@
+/*
+ * VGICv2 MMIO handling functions
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/irqchip/arm-gic.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <kvm/iodev.h>
+#include <kvm/arm_vgic.h>
+
+#include "vgic.h"
+#include "vgic-mmio.h"
+
+static unsigned long vgic_mmio_read_v2_misc(struct kvm_vcpu *vcpu,
+                                           gpa_t addr, unsigned int len)
+{
+       u32 value;
+
+       switch (addr & 0x0c) {
+       case GIC_DIST_CTRL:
+               value = vcpu->kvm->arch.vgic.enabled ? GICD_ENABLE : 0;
+               break;
+       case GIC_DIST_CTR:
+               value = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
+               value = (value >> 5) - 1;
+               value |= (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5;
+               break;
+       case GIC_DIST_IIDR:
+               value = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
+               break;
+       default:
+               return 0;
+       }
+
+       return value;
+}
+
+static void vgic_mmio_write_v2_misc(struct kvm_vcpu *vcpu,
+                                   gpa_t addr, unsigned int len,
+                                   unsigned long val)
+{
+       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+       bool was_enabled = dist->enabled;
+
+       switch (addr & 0x0c) {
+       case GIC_DIST_CTRL:
+               dist->enabled = val & GICD_ENABLE;
+               if (!was_enabled && dist->enabled)
+                       vgic_kick_vcpus(vcpu->kvm);
+               break;
+       case GIC_DIST_CTR:
+       case GIC_DIST_IIDR:
+               /* Nothing to do */
+               return;
+       }
+}
+
+static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu,
+                                gpa_t addr, unsigned int len,
+                                unsigned long val)
+{
+       int nr_vcpus = atomic_read(&source_vcpu->kvm->online_vcpus);
+       int intid = val & 0xf;
+       int targets = (val >> 16) & 0xff;
+       int mode = (val >> 24) & 0x03;
+       int c;
+       struct kvm_vcpu *vcpu;
+
+       switch (mode) {
+       case 0x0:               /* as specified by targets */
+               break;
+       case 0x1:
+               targets = (1U << nr_vcpus) - 1;                 /* all, ... */
+               targets &= ~(1U << source_vcpu->vcpu_id);       /* but self */
+               break;
+       case 0x2:               /* this very vCPU only */
+               targets = (1U << source_vcpu->vcpu_id);
+               break;
+       case 0x3:               /* reserved */
+               return;
+       }
+
+       kvm_for_each_vcpu(c, vcpu, source_vcpu->kvm) {
+               struct vgic_irq *irq;
+
+               if (!(targets & (1U << c)))
+                       continue;
+
+               irq = vgic_get_irq(source_vcpu->kvm, vcpu, intid);
+
+               spin_lock(&irq->irq_lock);
+               irq->pending = true;
+               irq->source |= 1U << source_vcpu->vcpu_id;
+
+               vgic_queue_irq_unlock(source_vcpu->kvm, irq);
+       }
+}
+
+static unsigned long vgic_mmio_read_target(struct kvm_vcpu *vcpu,
+                                          gpa_t addr, unsigned int len)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
+       int i;
+       u64 val = 0;
+
+       for (i = 0; i < len; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               val |= (u64)irq->targets << (i * 8);
+       }
+
+       return val;
+}
+
+static void vgic_mmio_write_target(struct kvm_vcpu *vcpu,
+                                  gpa_t addr, unsigned int len,
+                                  unsigned long val)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
+       int i;
+
+       /* GICD_ITARGETSR[0-7] are read-only */
+       if (intid < VGIC_NR_PRIVATE_IRQS)
+               return;
+
+       for (i = 0; i < len; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid + i);
+               int target;
+
+               spin_lock(&irq->irq_lock);
+
+               irq->targets = (val >> (i * 8)) & 0xff;
+               target = irq->targets ? __ffs(irq->targets) : 0;
+               irq->target_vcpu = kvm_get_vcpu(vcpu->kvm, target);
+
+               spin_unlock(&irq->irq_lock);
+       }
+}
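+
+/*
+ * For example, writing 0x04 to the GICD_ITARGETSR byte of an SPI routes that
+ * SPI to VCPU 2: the byte is stored as the target mask and the lowest set
+ * bit selects the owning VCPU.
+ */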
+
+static unsigned long vgic_mmio_read_sgipend(struct kvm_vcpu *vcpu,
+                                           gpa_t addr, unsigned int len)
+{
+       u32 intid = addr & 0x0f;
+       int i;
+       u64 val = 0;
+
+       for (i = 0; i < len; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               val |= (u64)irq->source << (i * 8);
+       }
+       return val;
+}
+
+static void vgic_mmio_write_sgipendc(struct kvm_vcpu *vcpu,
+                                    gpa_t addr, unsigned int len,
+                                    unsigned long val)
+{
+       u32 intid = addr & 0x0f;
+       int i;
+
+       for (i = 0; i < len; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               spin_lock(&irq->irq_lock);
+
+               irq->source &= ~((val >> (i * 8)) & 0xff);
+               if (!irq->source)
+                       irq->pending = false;
+
+               spin_unlock(&irq->irq_lock);
+       }
+}
+
+static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu,
+                                    gpa_t addr, unsigned int len,
+                                    unsigned long val)
+{
+       u32 intid = addr & 0x0f;
+       int i;
+
+       for (i = 0; i < len; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               spin_lock(&irq->irq_lock);
+
+               irq->source |= (val >> (i * 8)) & 0xff;
+
+               if (irq->source) {
+                       irq->pending = true;
+                       vgic_queue_irq_unlock(vcpu->kvm, irq);
+               } else {
+                       spin_unlock(&irq->irq_lock);
+               }
+       }
+}
+
+static void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
+{
+       if (kvm_vgic_global_state.type == VGIC_V2)
+               vgic_v2_set_vmcr(vcpu, vmcr);
+       else
+               vgic_v3_set_vmcr(vcpu, vmcr);
+}
+
+static void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
+{
+       if (kvm_vgic_global_state.type == VGIC_V2)
+               vgic_v2_get_vmcr(vcpu, vmcr);
+       else
+               vgic_v3_get_vmcr(vcpu, vmcr);
+}
+
+#define GICC_ARCH_VERSION_V2   0x2
+
+/* These are for userland accesses only, there is no guest-facing emulation. */
+static unsigned long vgic_mmio_read_vcpuif(struct kvm_vcpu *vcpu,
+                                          gpa_t addr, unsigned int len)
+{
+       struct vgic_vmcr vmcr;
+       u32 val;
+
+       vgic_get_vmcr(vcpu, &vmcr);
+
+       switch (addr & 0xff) {
+       case GIC_CPU_CTRL:
+               val = vmcr.ctlr;
+               break;
+       case GIC_CPU_PRIMASK:
+               val = vmcr.pmr;
+               break;
+       case GIC_CPU_BINPOINT:
+               val = vmcr.bpr;
+               break;
+       case GIC_CPU_ALIAS_BINPOINT:
+               val = vmcr.abpr;
+               break;
+       case GIC_CPU_IDENT:
+               val = ((PRODUCT_ID_KVM << 20) |
+                      (GICC_ARCH_VERSION_V2 << 16) |
+                      IMPLEMENTER_ARM);
+               break;
+       default:
+               return 0;
+       }
+
+       return val;
+}
+
+static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu,
+                                  gpa_t addr, unsigned int len,
+                                  unsigned long val)
+{
+       struct vgic_vmcr vmcr;
+
+       vgic_get_vmcr(vcpu, &vmcr);
+
+       switch (addr & 0xff) {
+       case GIC_CPU_CTRL:
+               vmcr.ctlr = val;
+               break;
+       case GIC_CPU_PRIMASK:
+               vmcr.pmr = val;
+               break;
+       case GIC_CPU_BINPOINT:
+               vmcr.bpr = val;
+               break;
+       case GIC_CPU_ALIAS_BINPOINT:
+               vmcr.abpr = val;
+               break;
+       }
+
+       vgic_set_vmcr(vcpu, &vmcr);
+}
+
+static const struct vgic_register_region vgic_v2_dist_registers[] = {
+       REGISTER_DESC_WITH_LENGTH(GIC_DIST_CTRL,
+               vgic_mmio_read_v2_misc, vgic_mmio_write_v2_misc, 12,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_IGROUP,
+               vgic_mmio_read_rao, vgic_mmio_write_wi, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_SET,
+               vgic_mmio_read_enable, vgic_mmio_write_senable, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_CLEAR,
+               vgic_mmio_read_enable, vgic_mmio_write_cenable, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_SET,
+               vgic_mmio_read_pending, vgic_mmio_write_spending, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_CLEAR,
+               vgic_mmio_read_pending, vgic_mmio_write_cpending, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_SET,
+               vgic_mmio_read_active, vgic_mmio_write_sactive, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_CLEAR,
+               vgic_mmio_read_active, vgic_mmio_write_cactive, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PRI,
+               vgic_mmio_read_priority, vgic_mmio_write_priority, 8,
+               VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_TARGET,
+               vgic_mmio_read_target, vgic_mmio_write_target, 8,
+               VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_CONFIG,
+               vgic_mmio_read_config, vgic_mmio_write_config, 2,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GIC_DIST_SOFTINT,
+               vgic_mmio_read_raz, vgic_mmio_write_sgir, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GIC_DIST_SGI_PENDING_CLEAR,
+               vgic_mmio_read_sgipend, vgic_mmio_write_sgipendc, 16,
+               VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+       REGISTER_DESC_WITH_LENGTH(GIC_DIST_SGI_PENDING_SET,
+               vgic_mmio_read_sgipend, vgic_mmio_write_sgipends, 16,
+               VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+};
+
+static const struct vgic_register_region vgic_v2_cpu_registers[] = {
+       REGISTER_DESC_WITH_LENGTH(GIC_CPU_CTRL,
+               vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GIC_CPU_PRIMASK,
+               vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GIC_CPU_BINPOINT,
+               vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GIC_CPU_ALIAS_BINPOINT,
+               vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GIC_CPU_ACTIVEPRIO,
+               vgic_mmio_read_raz, vgic_mmio_write_wi, 16,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GIC_CPU_IDENT,
+               vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
+               VGIC_ACCESS_32bit),
+};
+
+unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev)
+{
+       dev->regions = vgic_v2_dist_registers;
+       dev->nr_regions = ARRAY_SIZE(vgic_v2_dist_registers);
+
+       kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops);
+
+       return SZ_4K;
+}
+
+int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+       int nr_irqs = dev->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
+       const struct vgic_register_region *regions;
+       gpa_t addr;
+       int nr_regions, i, len;
+
+       addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
+
+       switch (attr->group) {
+       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+               regions = vgic_v2_dist_registers;
+               nr_regions = ARRAY_SIZE(vgic_v2_dist_registers);
+               break;
+       case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
+               regions = vgic_v2_cpu_registers;
+               nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers);
+               break;
+       default:
+               return -ENXIO;
+       }
+
+       /* We only support aligned 32-bit accesses. */
+       if (addr & 3)
+               return -ENXIO;
+
+       for (i = 0; i < nr_regions; i++) {
+               if (regions[i].bits_per_irq)
+                       len = (regions[i].bits_per_irq * nr_irqs) / 8;
+               else
+                       len = regions[i].len;
+
+               if (regions[i].reg_offset <= addr &&
+                   regions[i].reg_offset + len > addr)
+                       return 0;
+       }
+
+       return -ENXIO;
+}
+
+/*
+ * When userland accesses the VGIC register handlers, we need to create a
+ * usable struct vgic_io_device to pass to the handlers, and we have to set up
+ * a buffer similar to what a guest MMIO access would have produced, including
+ * doing endian conversions on BE systems.
+ */
+static int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev,
+                       bool is_write, int offset, u32 *val)
+{
+       unsigned int len = 4;
+       u8 buf[4];
+       int ret;
+
+       if (is_write) {
+               vgic_data_host_to_mmio_bus(buf, len, *val);
+               ret = kvm_io_gic_ops.write(vcpu, &dev->dev, offset, len, buf);
+       } else {
+               ret = kvm_io_gic_ops.read(vcpu, &dev->dev, offset, len, buf);
+               if (!ret)
+                       *val = vgic_data_mmio_bus_to_host(buf, len);
+       }
+
+       return ret;
+}
+
+int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+                         int offset, u32 *val)
+{
+       struct vgic_io_device dev = {
+               .regions = vgic_v2_cpu_registers,
+               .nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers),
+       };
+
+       return vgic_uaccess(vcpu, &dev, is_write, offset, val);
+}
+
+int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+                        int offset, u32 *val)
+{
+       struct vgic_io_device dev = {
+               .regions = vgic_v2_dist_registers,
+               .nr_regions = ARRAY_SIZE(vgic_v2_dist_registers),
+       };
+
+       return vgic_uaccess(vcpu, &dev, is_write, offset, val);
+}
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c
new file mode 100644 (file)
index 0000000..a0c515a
--- /dev/null
@@ -0,0 +1,455 @@
+/*
+ * VGICv3 MMIO handling functions
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/irqchip/arm-gic-v3.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <kvm/iodev.h>
+#include <kvm/arm_vgic.h>
+
+#include <asm/kvm_emulate.h>
+
+#include "vgic.h"
+#include "vgic-mmio.h"
+
+/* extract @num bytes at byte offset @offset in @data */
+static unsigned long extract_bytes(unsigned long data, unsigned int offset,
+                                  unsigned int num)
+{
+       return (data >> (offset * 8)) & GENMASK_ULL(num * 8 - 1, 0);
+}
+
+static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu,
+                                           gpa_t addr, unsigned int len)
+{
+       u32 value = 0;
+
+       switch (addr & 0x0c) {
+       case GICD_CTLR:
+               if (vcpu->kvm->arch.vgic.enabled)
+                       value |= GICD_CTLR_ENABLE_SS_G1;
+               value |= GICD_CTLR_ARE_NS | GICD_CTLR_DS;
+               break;
+       case GICD_TYPER:
+               value = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
+               value = (value >> 5) - 1;
+               value |= (INTERRUPT_ID_BITS_SPIS - 1) << 19;
+               break;
+       case GICD_IIDR:
+               value = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
+               break;
+       default:
+               return 0;
+       }
+
+       return value;
+}
+
+static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu,
+                                   gpa_t addr, unsigned int len,
+                                   unsigned long val)
+{
+       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+       bool was_enabled = dist->enabled;
+
+       switch (addr & 0x0c) {
+       case GICD_CTLR:
+               dist->enabled = val & GICD_CTLR_ENABLE_SS_G1;
+
+               if (!was_enabled && dist->enabled)
+                       vgic_kick_vcpus(vcpu->kvm);
+               break;
+       case GICD_TYPER:
+       case GICD_IIDR:
+               return;
+       }
+}
+
+static unsigned long vgic_mmio_read_irouter(struct kvm_vcpu *vcpu,
+                                           gpa_t addr, unsigned int len)
+{
+       int intid = VGIC_ADDR_TO_INTID(addr, 64);
+       struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid);
+
+       if (!irq)
+               return 0;
+
+       /* The upper word is RAZ for us. */
+       if (addr & 4)
+               return 0;
+
+       return extract_bytes(READ_ONCE(irq->mpidr), addr & 7, len);
+}
+
+static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu,
+                                   gpa_t addr, unsigned int len,
+                                   unsigned long val)
+{
+       int intid = VGIC_ADDR_TO_INTID(addr, 64);
+       struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid);
+
+       if (!irq)
+               return;
+
+       /* The upper word is WI for us since we don't implement Aff3. */
+       if (addr & 4)
+               return;
+
+       spin_lock(&irq->irq_lock);
+
+       /* We only care about and preserve Aff0, Aff1 and Aff2. */
+       irq->mpidr = val & GENMASK(23, 0);
+       irq->target_vcpu = kvm_mpidr_to_vcpu(vcpu->kvm, irq->mpidr);
+
+       spin_unlock(&irq->irq_lock);
+}
+
+static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
+                                             gpa_t addr, unsigned int len)
+{
+       unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
+       int target_vcpu_id = vcpu->vcpu_id;
+       u64 value;
+
+       value = (mpidr & GENMASK(23, 0)) << 32;
+       value |= ((target_vcpu_id & 0xffff) << 8);
+       if (target_vcpu_id == atomic_read(&vcpu->kvm->online_vcpus) - 1)
+               value |= GICR_TYPER_LAST;
+
+       return extract_bytes(value, addr & 7, len);
+}
+
+static unsigned long vgic_mmio_read_v3r_iidr(struct kvm_vcpu *vcpu,
+                                            gpa_t addr, unsigned int len)
+{
+       return (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
+}
+
+static unsigned long vgic_mmio_read_v3_idregs(struct kvm_vcpu *vcpu,
+                                             gpa_t addr, unsigned int len)
+{
+       switch (addr & 0xffff) {
+       case GICD_PIDR2:
+               /* report a GICv3 compliant implementation */
+               return 0x3b;
+       }
+
+       return 0;
+}
+
+/*
+ * The GICv3 per-IRQ registers are split: PPIs and SGIs are controlled via
+ * registers in the redistributors, while SPIs are covered by registers in
+ * the distributor block. Accesses to the private IRQ range of these
+ * distributor registers are therefore ignored (RAZ/WI). We take some special
+ * care here to adjust the calculation of the register offset accordingly.
+ */
+#define REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(off, rd, wr, bpi, acc)  \
+       {                                                               \
+               .reg_offset = off,                                      \
+               .bits_per_irq = bpi,                                    \
+               .len = (bpi * VGIC_NR_PRIVATE_IRQS) / 8,                \
+               .access_flags = acc,                                    \
+               .read = vgic_mmio_read_raz,                             \
+               .write = vgic_mmio_write_wi,                            \
+       }, {                                                            \
+               .reg_offset = off + (bpi * VGIC_NR_PRIVATE_IRQS) / 8,   \
+               .bits_per_irq = bpi,                                    \
+               .len = (bpi * (1024 - VGIC_NR_PRIVATE_IRQS)) / 8,       \
+               .access_flags = acc,                                    \
+               .read = rd,                                             \
+               .write = wr,                                            \
+       }
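+
+/*
+ * As a concrete illustration of the split above (assuming VGIC_NR_PRIVATE_IRQS
+ * is 32): for GICD_ISENABLER, which uses one bit per IRQ, this macro emits a
+ * 4-byte RAZ/WI region at the GICD_ISENABLER offset covering the 32 private
+ * IRQs, immediately followed by a 124-byte region backed by the real handlers
+ * that covers the remaining 992 possible SPIs.
+ */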
+
+static const struct vgic_register_region vgic_v3_dist_registers[] = {
+       REGISTER_DESC_WITH_LENGTH(GICD_CTLR,
+               vgic_mmio_read_v3_misc, vgic_mmio_write_v3_misc, 16,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGROUPR,
+               vgic_mmio_read_rao, vgic_mmio_write_wi, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISENABLER,
+               vgic_mmio_read_enable, vgic_mmio_write_senable, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICENABLER,
+               vgic_mmio_read_enable, vgic_mmio_write_cenable, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISPENDR,
+               vgic_mmio_read_pending, vgic_mmio_write_spending, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICPENDR,
+               vgic_mmio_read_pending, vgic_mmio_write_cpending, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISACTIVER,
+               vgic_mmio_read_active, vgic_mmio_write_sactive, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICACTIVER,
+               vgic_mmio_read_active, vgic_mmio_write_cactive, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IPRIORITYR,
+               vgic_mmio_read_priority, vgic_mmio_write_priority, 8,
+               VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ITARGETSR,
+               vgic_mmio_read_raz, vgic_mmio_write_wi, 8,
+               VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICFGR,
+               vgic_mmio_read_config, vgic_mmio_write_config, 2,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGRPMODR,
+               vgic_mmio_read_raz, vgic_mmio_write_wi, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IROUTER,
+               vgic_mmio_read_irouter, vgic_mmio_write_irouter, 64,
+               VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICD_IDREGS,
+               vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48,
+               VGIC_ACCESS_32bit),
+};
+
+static const struct vgic_register_region vgic_v3_rdbase_registers[] = {
+       REGISTER_DESC_WITH_LENGTH(GICR_CTLR,
+               vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_IIDR,
+               vgic_mmio_read_v3r_iidr, vgic_mmio_write_wi, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_TYPER,
+               vgic_mmio_read_v3r_typer, vgic_mmio_write_wi, 8,
+               VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_PROPBASER,
+               vgic_mmio_read_raz, vgic_mmio_write_wi, 8,
+               VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_PENDBASER,
+               vgic_mmio_read_raz, vgic_mmio_write_wi, 8,
+               VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_IDREGS,
+               vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48,
+               VGIC_ACCESS_32bit),
+};
+
+static const struct vgic_register_region vgic_v3_sgibase_registers[] = {
+       REGISTER_DESC_WITH_LENGTH(GICR_IGROUPR0,
+               vgic_mmio_read_rao, vgic_mmio_write_wi, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_ISENABLER0,
+               vgic_mmio_read_enable, vgic_mmio_write_senable, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_ICENABLER0,
+               vgic_mmio_read_enable, vgic_mmio_write_cenable, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_ISPENDR0,
+               vgic_mmio_read_pending, vgic_mmio_write_spending, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_ICPENDR0,
+               vgic_mmio_read_pending, vgic_mmio_write_cpending, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_ISACTIVER0,
+               vgic_mmio_read_active, vgic_mmio_write_sactive, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_ICACTIVER0,
+               vgic_mmio_read_active, vgic_mmio_write_cactive, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_IPRIORITYR0,
+               vgic_mmio_read_priority, vgic_mmio_write_priority, 32,
+               VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_ICFGR0,
+               vgic_mmio_read_config, vgic_mmio_write_config, 8,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_IGRPMODR0,
+               vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_NSACR,
+               vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
+               VGIC_ACCESS_32bit),
+};
+
+unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev)
+{
+       dev->regions = vgic_v3_dist_registers;
+       dev->nr_regions = ARRAY_SIZE(vgic_v3_dist_registers);
+
+       kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops);
+
+       return SZ_64K;
+}
+
+int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t redist_base_address)
+{
+       int nr_vcpus = atomic_read(&kvm->online_vcpus);
+       struct kvm_vcpu *vcpu;
+       struct vgic_io_device *devices;
+       int c, ret = 0;
+
+       devices = kmalloc(sizeof(struct vgic_io_device) * nr_vcpus * 2,
+                         GFP_KERNEL);
+       if (!devices)
+               return -ENOMEM;
+
+       kvm_for_each_vcpu(c, vcpu, kvm) {
+               gpa_t rd_base = redist_base_address + c * SZ_64K * 2;
+               gpa_t sgi_base = rd_base + SZ_64K;
+               struct vgic_io_device *rd_dev = &devices[c * 2];
+               struct vgic_io_device *sgi_dev = &devices[c * 2 + 1];
+
+               kvm_iodevice_init(&rd_dev->dev, &kvm_io_gic_ops);
+               rd_dev->base_addr = rd_base;
+               rd_dev->regions = vgic_v3_rdbase_registers;
+               rd_dev->nr_regions = ARRAY_SIZE(vgic_v3_rdbase_registers);
+               rd_dev->redist_vcpu = vcpu;
+
+               mutex_lock(&kvm->slots_lock);
+               ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, rd_base,
+                                             SZ_64K, &rd_dev->dev);
+               mutex_unlock(&kvm->slots_lock);
+
+               if (ret)
+                       break;
+
+               kvm_iodevice_init(&sgi_dev->dev, &kvm_io_gic_ops);
+               sgi_dev->base_addr = sgi_base;
+               sgi_dev->regions = vgic_v3_sgibase_registers;
+               sgi_dev->nr_regions = ARRAY_SIZE(vgic_v3_sgibase_registers);
+               sgi_dev->redist_vcpu = vcpu;
+
+               mutex_lock(&kvm->slots_lock);
+               ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, sgi_base,
+                                             SZ_64K, &sgi_dev->dev);
+               mutex_unlock(&kvm->slots_lock);
+               if (ret) {
+                       kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
+                                                 &rd_dev->dev);
+                       break;
+               }
+       }
+
+       if (ret) {
+               /* Registration for index c failed; unwind from the previous one. */
+               for (c--; c >= 0; c--) {
+                       kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
+                                                 &devices[c * 2].dev);
+                       kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
+                                                 &devices[c * 2 + 1].dev);
+               }
+               kfree(devices);
+       } else {
+               kvm->arch.vgic.redist_iodevs = devices;
+       }
+
+       return ret;
+}
+
+/*
+ * Compare a given affinity (level 1-3 and a level 0 mask, from the SGI
+ * generation register ICC_SGI1R_EL1) with a given VCPU.
+ * If the VCPU's MPIDR matches, return the level0 affinity, otherwise
+ * return -1.
+ */
+static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu)
+{
+       unsigned long affinity;
+       int level0;
+
+       /*
+        * Split the current VCPU's MPIDR into affinity level 0 and the
+        * rest, as this is what we have to compare against.
+        */
+       affinity = kvm_vcpu_get_mpidr_aff(vcpu);
+       level0 = MPIDR_AFFINITY_LEVEL(affinity, 0);
+       affinity &= ~MPIDR_LEVEL_MASK;
+
+       /* bail out if the upper three levels don't match */
+       if (sgi_aff != affinity)
+               return -1;
+
+       /* Is this VCPU's bit set in the mask? */
+       if (!(sgi_cpu_mask & BIT(level0)))
+               return -1;
+
+       return level0;
+}
+
+/*
+ * The ICC_SGI* registers encode the affinity differently from the MPIDR,
+ * so provide a wrapper to use the existing defines to isolate a certain
+ * affinity level.
+ */
+#define SGI_AFFINITY_LEVEL(reg, level) \
+       ((((reg) & ICC_SGI1R_AFFINITY_## level ##_MASK) \
+       >> ICC_SGI1R_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level))
+
+/**
+ * vgic_v3_dispatch_sgi - handle SGI requests from VCPUs
+ * @vcpu: The VCPU requesting a SGI
+ * @reg: The value written into the ICC_SGI1R_EL1 register by that VCPU
+ *
+ * With GICv3 (and ARE=1), CPUs trigger SGIs by writing to a system register.
+ * That write traps into sys_regs.c, which calls this function.
+ * The ICC_SGI1R_EL1 register contains the upper three affinity levels of the
+ * target processors as well as a bitmask of 16 Aff0 CPUs.
+ * If the interrupt routing mode bit is not set, we iterate over all VCPUs to
+ * check for matching ones. If this bit is set, we signal all VCPUs except the
+ * calling one.
+ */
+void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
+{
+       struct kvm *kvm = vcpu->kvm;
+       struct kvm_vcpu *c_vcpu;
+       u16 target_cpus;
+       u64 mpidr;
+       int sgi, c;
+       int vcpu_id = vcpu->vcpu_id;
+       bool broadcast;
+
+       sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT;
+       broadcast = reg & BIT(ICC_SGI1R_IRQ_ROUTING_MODE_BIT);
+       target_cpus = (reg & ICC_SGI1R_TARGET_LIST_MASK) >> ICC_SGI1R_TARGET_LIST_SHIFT;
+       mpidr = SGI_AFFINITY_LEVEL(reg, 3);
+       mpidr |= SGI_AFFINITY_LEVEL(reg, 2);
+       mpidr |= SGI_AFFINITY_LEVEL(reg, 1);
+
+       /*
+        * We iterate over all VCPUs to find the MPIDRs matching the request.
+        * If we have handled one CPU, we clear its bit to detect early
+        * if we are already finished. This avoids iterating through all
+        * VCPUs when most of the time we just signal a single VCPU.
+        */
+       kvm_for_each_vcpu(c, c_vcpu, kvm) {
+               struct vgic_irq *irq;
+
+               /* Exit early if we have dealt with all requested CPUs */
+               if (!broadcast && target_cpus == 0)
+                       break;
+
+               /* Don't signal the calling VCPU */
+               if (broadcast && c == vcpu_id)
+                       continue;
+
+               if (!broadcast) {
+                       int level0;
+
+                       level0 = match_mpidr(mpidr, target_cpus, c_vcpu);
+                       if (level0 == -1)
+                               continue;
+
+                       /* remove this matching VCPU from the mask */
+                       target_cpus &= ~BIT(level0);
+               }
+
+               irq = vgic_get_irq(vcpu->kvm, c_vcpu, sgi);
+
+               spin_lock(&irq->irq_lock);
+               irq->pending = true;
+
+               vgic_queue_irq_unlock(vcpu->kvm, irq);
+       }
+}
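+
+/*
+ * For example, a guest write to ICC_SGI1R_EL1 with the SGI ID field set to 3,
+ * the routing mode (broadcast) bit clear, all affinity fields zero and bit 1
+ * set in the target list makes SGI 3 pending on the single VCPU whose MPIDR
+ * is Aff3.Aff2.Aff1.Aff0 = 0.0.0.1.
+ */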
diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
new file mode 100644 (file)
index 0000000..9f6fab7
--- /dev/null
@@ -0,0 +1,524 @@
+/*
+ * VGIC MMIO handling functions
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/bitops.h>
+#include <linux/bsearch.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <kvm/iodev.h>
+#include <kvm/arm_vgic.h>
+
+#include "vgic.h"
+#include "vgic-mmio.h"
+
+unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu,
+                                gpa_t addr, unsigned int len)
+{
+       return 0;
+}
+
+unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu,
+                                gpa_t addr, unsigned int len)
+{
+       return -1UL;
+}
+
+void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr,
+                       unsigned int len, unsigned long val)
+{
+       /* Ignore */
+}
+
+/*
+ * Read accesses to both GICD_ICENABLER and GICD_ISENABLER return the value
+ * of the enabled bit, so there is only one function for both here.
+ */
+unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu,
+                                   gpa_t addr, unsigned int len)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+       u32 value = 0;
+       int i;
+
+       /* Loop over all IRQs affected by this read */
+       for (i = 0; i < len * 8; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               if (irq->enabled)
+                       value |= (1U << i);
+       }
+
+       return value;
+}
+
+void vgic_mmio_write_senable(struct kvm_vcpu *vcpu,
+                            gpa_t addr, unsigned int len,
+                            unsigned long val)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+       int i;
+
+       for_each_set_bit(i, &val, len * 8) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               spin_lock(&irq->irq_lock);
+               irq->enabled = true;
+               vgic_queue_irq_unlock(vcpu->kvm, irq);
+       }
+}
+
+void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu,
+                            gpa_t addr, unsigned int len,
+                            unsigned long val)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+       int i;
+
+       for_each_set_bit(i, &val, len * 8) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               spin_lock(&irq->irq_lock);
+
+               irq->enabled = false;
+
+               spin_unlock(&irq->irq_lock);
+       }
+}
+
+unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
+                                    gpa_t addr, unsigned int len)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+       u32 value = 0;
+       int i;
+
+       /* Loop over all IRQs affected by this read */
+       for (i = 0; i < len * 8; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               if (irq->pending)
+                       value |= (1U << i);
+       }
+
+       return value;
+}
+
+void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
+                             gpa_t addr, unsigned int len,
+                             unsigned long val)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+       int i;
+
+       for_each_set_bit(i, &val, len * 8) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               spin_lock(&irq->irq_lock);
+               irq->pending = true;
+               if (irq->config == VGIC_CONFIG_LEVEL)
+                       irq->soft_pending = true;
+
+               vgic_queue_irq_unlock(vcpu->kvm, irq);
+       }
+}
+
+void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
+                             gpa_t addr, unsigned int len,
+                             unsigned long val)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+       int i;
+
+       for_each_set_bit(i, &val, len * 8) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               spin_lock(&irq->irq_lock);
+
+               if (irq->config == VGIC_CONFIG_LEVEL) {
+                       irq->soft_pending = false;
+                       irq->pending = irq->line_level;
+               } else {
+                       irq->pending = false;
+               }
+
+               spin_unlock(&irq->irq_lock);
+       }
+}
+
+unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
+                                   gpa_t addr, unsigned int len)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+       u32 value = 0;
+       int i;
+
+       /* Loop over all IRQs affected by this read */
+       for (i = 0; i < len * 8; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               if (irq->active)
+                       value |= (1U << i);
+       }
+
+       return value;
+}
+
+static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
+                                   bool new_active_state)
+{
+       spin_lock(&irq->irq_lock);
+       /*
+        * If this virtual IRQ was written into a list register, we
+        * have to make sure the CPU that runs the VCPU thread has
+        * synced back LR state to the struct vgic_irq.  We can only
+        * know this for sure, when either this irq is not assigned to
+        * anyone's AP list anymore, or the VCPU thread is not
+        * running on any CPUs.
+        *
+        * In the opposite case, we know the VCPU thread may be on its
+        * way back from the guest and still has to sync back this
+        * IRQ, so we release and re-acquire the spin_lock to let the
+        * other thread sync back the IRQ.
+        */
+       while (irq->vcpu && /* IRQ may have state in an LR somewhere */
+              irq->vcpu->cpu != -1) /* VCPU thread is running */
+               cond_resched_lock(&irq->irq_lock);
+
+       irq->active = new_active_state;
+       if (new_active_state)
+               vgic_queue_irq_unlock(vcpu->kvm, irq);
+       else
+               spin_unlock(&irq->irq_lock);
+}
+
+/*
+ * If we are fiddling with an IRQ's active state, we have to make sure the IRQ
+ * is not queued on some running VCPU's LRs, because then the change to the
+ * active state can be overwritten when the VCPU's state is synced coming back
+ * from the guest.
+ *
+ * For shared interrupts, we have to stop all the VCPUs because interrupts can
+ * be migrated while we don't hold the IRQ locks and we don't want to be
+ * chasing moving targets.
+ *
+ * For private interrupts, we only have to make sure the single and only VCPU
+ * that can potentially queue the IRQ is stopped.
+ */
+static void vgic_change_active_prepare(struct kvm_vcpu *vcpu, u32 intid)
+{
+       if (intid < VGIC_NR_PRIVATE_IRQS)
+               kvm_arm_halt_vcpu(vcpu);
+       else
+               kvm_arm_halt_guest(vcpu->kvm);
+}
+
+/* See vgic_change_active_prepare */
+static void vgic_change_active_finish(struct kvm_vcpu *vcpu, u32 intid)
+{
+       if (intid < VGIC_NR_PRIVATE_IRQS)
+               kvm_arm_resume_vcpu(vcpu);
+       else
+               kvm_arm_resume_guest(vcpu->kvm);
+}
+
+void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
+                            gpa_t addr, unsigned int len,
+                            unsigned long val)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+       int i;
+
+       vgic_change_active_prepare(vcpu, intid);
+       for_each_set_bit(i, &val, len * 8) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+               vgic_mmio_change_active(vcpu, irq, false);
+       }
+       vgic_change_active_finish(vcpu, intid);
+}
+
+void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
+                            gpa_t addr, unsigned int len,
+                            unsigned long val)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+       int i;
+
+       vgic_change_active_prepare(vcpu, intid);
+       for_each_set_bit(i, &val, len * 8) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+               vgic_mmio_change_active(vcpu, irq, true);
+       }
+       vgic_change_active_finish(vcpu, intid);
+}
+
+unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu,
+                                     gpa_t addr, unsigned int len)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
+       int i;
+       u64 val = 0;
+
+       for (i = 0; i < len; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               val |= (u64)irq->priority << (i * 8);
+       }
+
+       return val;
+}
+
+/*
+ * We currently don't handle changing the priority of an interrupt that
+ * is already pending on a VCPU. If there is a need for this, we would
+ * need to make this VCPU exit and re-evaluate the priorities, potentially
+ * leading to this interrupt getting presented now to the guest (if it has
+ * been masked by the priority mask before).
+ */
+void vgic_mmio_write_priority(struct kvm_vcpu *vcpu,
+                             gpa_t addr, unsigned int len,
+                             unsigned long val)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
+       int i;
+
+       for (i = 0; i < len; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               spin_lock(&irq->irq_lock);
+               /* Narrow the priority range to what we actually support */
+               irq->priority = (val >> (i * 8)) & GENMASK(7, 8 - VGIC_PRI_BITS);
+               spin_unlock(&irq->irq_lock);
+       }
+}
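+
+/*
+ * To illustrate the narrowing above, assuming VGIC_PRI_BITS is 5:
+ * GENMASK(7, 3) keeps only the upper five bits of each written byte, so a
+ * guest write of 0xff is stored (and later read back) as 0xf8, matching
+ * hardware that implements 32 priority levels.
+ */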
+
+unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu,
+                                   gpa_t addr, unsigned int len)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 2);
+       u32 value = 0;
+       int i;
+
+       for (i = 0; i < len * 4; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               if (irq->config == VGIC_CONFIG_EDGE)
+                       value |= (2U << (i * 2));
+       }
+
+       return value;
+}
+
+void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
+                           gpa_t addr, unsigned int len,
+                           unsigned long val)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 2);
+       int i;
+
+       for (i = 0; i < len * 4; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               /*
+                * The configuration cannot be changed for SGIs in general;
+                * for PPIs this is IMPLEMENTATION DEFINED. The arch timer
+                * code relies on PPIs being level triggered, so we also
+                * make them read-only here.
+                */
+               if (intid + i < VGIC_NR_PRIVATE_IRQS)
+                       continue;
+
+               spin_lock(&irq->irq_lock);
+               if (test_bit(i * 2 + 1, &val)) {
+                       irq->config = VGIC_CONFIG_EDGE;
+               } else {
+                       irq->config = VGIC_CONFIG_LEVEL;
+                       irq->pending = irq->line_level | irq->soft_pending;
+               }
+               spin_unlock(&irq->irq_lock);
+       }
+}
+
+static int match_region(const void *key, const void *elt)
+{
+       const unsigned int offset = (unsigned long)key;
+       const struct vgic_register_region *region = elt;
+
+       if (offset < region->reg_offset)
+               return -1;
+
+       if (offset >= region->reg_offset + region->len)
+               return 1;
+
+       return 0;
+}
+
+/* Find the proper register handler entry given a certain address offset. */
+static const struct vgic_register_region *
+vgic_find_mmio_region(const struct vgic_register_region *region, int nr_regions,
+                     unsigned int offset)
+{
+       return bsearch((void *)(uintptr_t)offset, region, nr_regions,
+                      sizeof(region[0]), match_region);
+}
+
+/*
+ * kvm_mmio_read_buf() returns a value in a format where it can be converted
+ * to a byte array and be directly observed as the guest wanted it to appear
+ * in memory if it had done the store itself, which is LE for the GIC, as the
+ * guest knows the GIC is always LE.
+ *
+ * We convert this value to the CPU's native format to deal with it as a data
+ * value.
+ */
+unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len)
+{
+       unsigned long data = kvm_mmio_read_buf(val, len);
+
+       switch (len) {
+       case 1:
+               return data;
+       case 2:
+               return le16_to_cpu(data);
+       case 4:
+               return le32_to_cpu(data);
+       default:
+               return le64_to_cpu(data);
+       }
+}
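+
+/*
+ * For example, a 32-bit guest store of 0x00000001 to an enable register
+ * arrives here as the byte sequence 01 00 00 00 (the GIC's LE layout);
+ * le32_to_cpu() then hands the host the plain value 1, regardless of
+ * whether the host CPU runs little- or big-endian.
+ */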
+
+/*
+ * kvm_mmio_write_buf() expects a value in a format such that if converted to
+ * a byte array it is observed as the guest would see it if it could perform
+ * the load directly.  Since the GIC is LE, and the guest knows this, the
+ * guest expects a value in little endian format.
+ *
+ * We convert the data value from the CPU's native format to LE so that the
+ * value is returned in the proper format.
+ */
+void vgic_data_host_to_mmio_bus(void *buf, unsigned int len,
+                               unsigned long data)
+{
+       switch (len) {
+       case 1:
+               break;
+       case 2:
+               data = cpu_to_le16(data);
+               break;
+       case 4:
+               data = cpu_to_le32(data);
+               break;
+       default:
+               data = cpu_to_le64(data);
+       }
+
+       kvm_mmio_write_buf(buf, len, data);
+}
+
+static
+struct vgic_io_device *kvm_to_vgic_iodev(const struct kvm_io_device *dev)
+{
+       return container_of(dev, struct vgic_io_device, dev);
+}
+
+static bool check_region(const struct vgic_register_region *region,
+                        gpa_t addr, int len)
+{
+       if ((region->access_flags & VGIC_ACCESS_8bit) && len == 1)
+               return true;
+       if ((region->access_flags & VGIC_ACCESS_32bit) &&
+           len == sizeof(u32) && !(addr & 3))
+               return true;
+       if ((region->access_flags & VGIC_ACCESS_64bit) &&
+           len == sizeof(u64) && !(addr & 7))
+               return true;
+
+       return false;
+}
+
+static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+                             gpa_t addr, int len, void *val)
+{
+       struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
+       const struct vgic_register_region *region;
+       struct kvm_vcpu *r_vcpu;
+       unsigned long data;
+
+       region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions,
+                                      addr - iodev->base_addr);
+       if (!region || !check_region(region, addr, len)) {
+               memset(val, 0, len);
+               return 0;
+       }
+
+       r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu;
+       data = region->read(r_vcpu, addr, len);
+       vgic_data_host_to_mmio_bus(val, len, data);
+       return 0;
+}
+
+static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+                              gpa_t addr, int len, const void *val)
+{
+       struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
+       const struct vgic_register_region *region;
+       struct kvm_vcpu *r_vcpu;
+       unsigned long data = vgic_data_mmio_bus_to_host(val, len);
+
+       region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions,
+                                      addr - iodev->base_addr);
+       if (!region)
+               return 0;
+
+       if (!check_region(region, addr, len))
+               return 0;
+
+       r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu;
+       region->write(r_vcpu, addr, len, data);
+       return 0;
+}
+
+struct kvm_io_device_ops kvm_io_gic_ops = {
+       .read = dispatch_mmio_read,
+       .write = dispatch_mmio_write,
+};
+
+int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
+                            enum vgic_type type)
+{
+       struct vgic_io_device *io_device = &kvm->arch.vgic.dist_iodev;
+       int ret = 0;
+       unsigned int len;
+
+       switch (type) {
+       case VGIC_V2:
+               len = vgic_v2_init_dist_iodev(io_device);
+               break;
+#ifdef CONFIG_KVM_ARM_VGIC_V3
+       case VGIC_V3:
+               len = vgic_v3_init_dist_iodev(io_device);
+               break;
+#endif
+       default:
+               BUG_ON(1);
+       }
+
+       io_device->base_addr = dist_base_address;
+       io_device->redist_vcpu = NULL;
+
+       mutex_lock(&kvm->slots_lock);
+       ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, dist_base_address,
+                                     len, &io_device->dev);
+       mutex_unlock(&kvm->slots_lock);
+
+       return ret;
+}
diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h
new file mode 100644 (file)
index 0000000..8509014
--- /dev/null
@@ -0,0 +1,150 @@
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef __KVM_ARM_VGIC_MMIO_H__
+#define __KVM_ARM_VGIC_MMIO_H__
+
+struct vgic_register_region {
+       unsigned int reg_offset;
+       unsigned int len;
+       unsigned int bits_per_irq;
+       unsigned int access_flags;
+       unsigned long (*read)(struct kvm_vcpu *vcpu, gpa_t addr,
+                             unsigned int len);
+       void (*write)(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len,
+                     unsigned long val);
+};
+
+extern struct kvm_io_device_ops kvm_io_gic_ops;
+
+#define VGIC_ACCESS_8bit       1
+#define VGIC_ACCESS_32bit      2
+#define VGIC_ACCESS_64bit      4
+
+/*
+ * Generate a mask that covers the number of bytes required to address
+ * up to 1024 interrupts, each represented by <bits> bits. This assumes
+ * that <bits> is a power of two.
+ */
+#define VGIC_ADDR_IRQ_MASK(bits) (((bits) * 1024 / 8) - 1)
+
+/*
+ * (addr & mask) gives us the byte offset of the INT ID, so we want to
+ * divide this by 'bytes per irq' to get the INT ID, which is given
+ * by '(bits) / 8'.  But we do this with fixed-point arithmetic and
+ * take advantage of the fact that division by a fraction equals
+ * multiplication by the inverted fraction, and scale up both the
+ * numerator and denominator by 8 to support at most 64 bits per IRQ:
+ */
+#define VGIC_ADDR_TO_INTID(addr, bits)  (((addr) & VGIC_ADDR_IRQ_MASK(bits)) * \
+                                       64 / (bits) / 8)
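+
+/*
+ * Worked example of the mapping above: for the enable registers (one bit per
+ * IRQ) a byte offset of 0x4 yields (0x4 * 64) / 1 / 8 = 32, i.e. the word at
+ * offset 4 starts at INTID 32; for the priority registers (eight bits per
+ * IRQ) the same offset yields (0x4 * 64) / 8 / 8 = 4, i.e. the byte at
+ * offset 4 holds the priority of INTID 4.
+ */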
+
+/*
+ * Some VGIC registers store per-IRQ information, with a different number
+ * of bits per IRQ. For those registers this macro is used.
+ * The _WITH_LENGTH version instantiates registers with a fixed length
+ * and is mutually exclusive with the _PER_IRQ version.
+ */
+#define REGISTER_DESC_WITH_BITS_PER_IRQ(off, rd, wr, bpi, acc)         \
+       {                                                               \
+               .reg_offset = off,                                      \
+               .bits_per_irq = bpi,                                    \
+               .len = bpi * 1024 / 8,                                  \
+               .access_flags = acc,                                    \
+               .read = rd,                                             \
+               .write = wr,                                            \
+       }
+
+#define REGISTER_DESC_WITH_LENGTH(off, rd, wr, length, acc)            \
+       {                                                               \
+               .reg_offset = off,                                      \
+               .bits_per_irq = 0,                                      \
+               .len = length,                                          \
+               .access_flags = acc,                                    \
+               .read = rd,                                             \
+               .write = wr,                                            \
+       }
+
+int kvm_vgic_register_mmio_region(struct kvm *kvm, struct kvm_vcpu *vcpu,
+                                 struct vgic_register_region *reg_desc,
+                                 struct vgic_io_device *region,
+                                 int nr_irqs, bool offset_private);
+
+unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len);
+
+void vgic_data_host_to_mmio_bus(void *buf, unsigned int len,
+                               unsigned long data);
+
+unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu,
+                                gpa_t addr, unsigned int len);
+
+unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu,
+                                gpa_t addr, unsigned int len);
+
+void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr,
+                       unsigned int len, unsigned long val);
+
+unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu,
+                                   gpa_t addr, unsigned int len);
+
+void vgic_mmio_write_senable(struct kvm_vcpu *vcpu,
+                            gpa_t addr, unsigned int len,
+                            unsigned long val);
+
+void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu,
+                            gpa_t addr, unsigned int len,
+                            unsigned long val);
+
+unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
+                                    gpa_t addr, unsigned int len);
+
+void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
+                             gpa_t addr, unsigned int len,
+                             unsigned long val);
+
+void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
+                             gpa_t addr, unsigned int len,
+                             unsigned long val);
+
+unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
+                                   gpa_t addr, unsigned int len);
+
+void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
+                            gpa_t addr, unsigned int len,
+                            unsigned long val);
+
+void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
+                            gpa_t addr, unsigned int len,
+                            unsigned long val);
+
+unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu,
+                                     gpa_t addr, unsigned int len);
+
+void vgic_mmio_write_priority(struct kvm_vcpu *vcpu,
+                             gpa_t addr, unsigned int len,
+                             unsigned long val);
+
+unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu,
+                                   gpa_t addr, unsigned int len);
+
+void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
+                           gpa_t addr, unsigned int len,
+                           unsigned long val);
+
+unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev);
+
+unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev);
+
+#endif
diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c
new file mode 100644 (file)
index 0000000..e31405e
--- /dev/null
@@ -0,0 +1,356 @@
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/irqchip/arm-gic.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <kvm/arm_vgic.h>
+#include <asm/kvm_mmu.h>
+
+#include "vgic.h"
+
+/*
+ * Call this function to convert a u64 value to an unsigned long * bitmask
+ * in a way that works on both 32-bit and 64-bit LE and BE platforms.
+ *
+ * Warning: Calling this function may modify *val.
+ */
+static unsigned long *u64_to_bitmask(u64 *val)
+{
+#if defined(CONFIG_CPU_BIG_ENDIAN) && BITS_PER_LONG == 32
+       *val = (*val >> 32) | (*val << 32);
+#endif
+       return (unsigned long *)val;
+}
+
+void vgic_v2_process_maintenance(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2;
+
+       if (cpuif->vgic_misr & GICH_MISR_EOI) {
+               u64 eisr = cpuif->vgic_eisr;
+               unsigned long *eisr_bmap = u64_to_bitmask(&eisr);
+               int lr;
+
+               for_each_set_bit(lr, eisr_bmap, kvm_vgic_global_state.nr_lr) {
+                       u32 intid = cpuif->vgic_lr[lr] & GICH_LR_VIRTUALID;
+
+                       WARN_ON(cpuif->vgic_lr[lr] & GICH_LR_STATE);
+
+                       kvm_notify_acked_irq(vcpu->kvm, 0,
+                                            intid - VGIC_NR_PRIVATE_IRQS);
+               }
+       }
+
+       /* check and disable underflow maintenance IRQ */
+       cpuif->vgic_hcr &= ~GICH_HCR_UIE;
+
+       /*
+        * In the next iterations of the vcpu loop, if we sync the
+        * vgic state after flushing it, but before entering the guest
+        * (this happens for pending signals and vmid rollovers), then
+        * make sure we don't pick up any old maintenance interrupts
+        * here.
+        */
+       cpuif->vgic_eisr = 0;
+}
+
+void vgic_v2_set_underflow(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2;
+
+       cpuif->vgic_hcr |= GICH_HCR_UIE;
+}
+
+/*
+ * transfer the content of the LRs back into the corresponding ap_list:
+ * - active bit is transferred as is
+ * - pending bit is
+ *   - transferred as is in case of edge sensitive IRQs
+ *   - set to the line-level (resample time) for level sensitive IRQs
+ */
+void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2;
+       int lr;
+
+       for (lr = 0; lr < vcpu->arch.vgic_cpu.used_lrs; lr++) {
+               u32 val = cpuif->vgic_lr[lr];
+               u32 intid = val & GICH_LR_VIRTUALID;
+               struct vgic_irq *irq;
+
+               irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
+
+               spin_lock(&irq->irq_lock);
+
+               /* Always preserve the active bit */
+               irq->active = !!(val & GICH_LR_ACTIVE_BIT);
+
+               /* Edge is the only case where we preserve the pending bit */
+               if (irq->config == VGIC_CONFIG_EDGE &&
+                   (val & GICH_LR_PENDING_BIT)) {
+                       irq->pending = true;
+
+                       if (vgic_irq_is_sgi(intid)) {
+                               u32 cpuid = val & GICH_LR_PHYSID_CPUID;
+
+                               cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
+                               irq->source |= (1 << cpuid);
+                       }
+               }
+
+               /*
+                * Clear soft pending state when level irqs have been acked.
+                * Always regenerate the pending state.
+                */
+               if (irq->config == VGIC_CONFIG_LEVEL) {
+                       if (!(val & GICH_LR_PENDING_BIT))
+                               irq->soft_pending = false;
+
+                       irq->pending = irq->line_level || irq->soft_pending;
+               }
+
+               spin_unlock(&irq->irq_lock);
+       }
+}
+
+/*
+ * Populates the particular LR with the state of a given IRQ:
+ * - for an edge sensitive IRQ the pending state is cleared in struct vgic_irq
+ * - for a level sensitive IRQ the pending state value is unchanged;
+ *   it is dictated directly by the input level
+ *
+ * If @irq describes an SGI with multiple sources, we choose the
+ * lowest-numbered source VCPU and clear that bit in the source bitmap.
+ *
+ * The irq_lock must be held by the caller.
+ */
+void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
+{
+       u32 val = irq->intid;
+
+       if (irq->pending) {
+               val |= GICH_LR_PENDING_BIT;
+
+               if (irq->config == VGIC_CONFIG_EDGE)
+                       irq->pending = false;
+
+               if (vgic_irq_is_sgi(irq->intid)) {
+                       u32 src = ffs(irq->source);
+
+                       BUG_ON(!src);
+                       val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT;
+                       irq->source &= ~(1 << (src - 1));
+                       if (irq->source)
+                               irq->pending = true;
+               }
+       }
+
+       if (irq->active)
+               val |= GICH_LR_ACTIVE_BIT;
+
+       if (irq->hw) {
+               val |= GICH_LR_HW;
+               val |= irq->hwintid << GICH_LR_PHYSID_CPUID_SHIFT;
+       } else {
+               if (irq->config == VGIC_CONFIG_LEVEL)
+                       val |= GICH_LR_EOI;
+       }
+
+       /* The GICv2 LR only holds five bits of priority. */
+       val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT;
+
+       vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val;
+}
+
+void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr)
+{
+       vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = 0;
+}
+
+void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
+{
+       u32 vmcr;
+
+       vmcr  = (vmcrp->ctlr << GICH_VMCR_CTRL_SHIFT) & GICH_VMCR_CTRL_MASK;
+       vmcr |= (vmcrp->abpr << GICH_VMCR_ALIAS_BINPOINT_SHIFT) &
+               GICH_VMCR_ALIAS_BINPOINT_MASK;
+       vmcr |= (vmcrp->bpr << GICH_VMCR_BINPOINT_SHIFT) &
+               GICH_VMCR_BINPOINT_MASK;
+       vmcr |= (vmcrp->pmr << GICH_VMCR_PRIMASK_SHIFT) &
+               GICH_VMCR_PRIMASK_MASK;
+
+       vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = vmcr;
+}
+
+void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
+{
+       u32 vmcr = vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr;
+
+       vmcrp->ctlr = (vmcr & GICH_VMCR_CTRL_MASK) >>
+                       GICH_VMCR_CTRL_SHIFT;
+       vmcrp->abpr = (vmcr & GICH_VMCR_ALIAS_BINPOINT_MASK) >>
+                       GICH_VMCR_ALIAS_BINPOINT_SHIFT;
+       vmcrp->bpr  = (vmcr & GICH_VMCR_BINPOINT_MASK) >>
+                       GICH_VMCR_BINPOINT_SHIFT;
+       vmcrp->pmr  = (vmcr & GICH_VMCR_PRIMASK_MASK) >>
+                       GICH_VMCR_PRIMASK_SHIFT;
+}
+
+void vgic_v2_enable(struct kvm_vcpu *vcpu)
+{
+       /*
+        * By forcing VMCR to zero, the GIC will restore the binary
+        * points to their reset values. Anything else resets to zero
+        * anyway.
+        */
+       vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0;
+       vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr = ~0;
+
+       /* Get the show on the road... */
+       vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN;
+}
+
+/* check for overlapping regions and for regions crossing the end of memory */
+static bool vgic_v2_check_base(gpa_t dist_base, gpa_t cpu_base)
+{
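+       /* Reject regions that wrap past the end of the address space. */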
+       if (dist_base + KVM_VGIC_V2_DIST_SIZE < dist_base)
+               return false;
+       if (cpu_base + KVM_VGIC_V2_CPU_SIZE < cpu_base)
+               return false;
+
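+       /* The frames don't overlap if one ends at or before the other starts. */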
+       if (dist_base + KVM_VGIC_V2_DIST_SIZE <= cpu_base)
+               return true;
+       if (cpu_base + KVM_VGIC_V2_CPU_SIZE <= dist_base)
+               return true;
+
+       return false;
+}
+
+int vgic_v2_map_resources(struct kvm *kvm)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       int ret = 0;
+
+       if (vgic_ready(kvm))
+               goto out;
+
+       if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
+           IS_VGIC_ADDR_UNDEF(dist->vgic_cpu_base)) {
+               kvm_err("Need to set vgic cpu and dist addresses first\n");
+               ret = -ENXIO;
+               goto out;
+       }
+
+       if (!vgic_v2_check_base(dist->vgic_dist_base, dist->vgic_cpu_base)) {
+               kvm_err("VGIC CPU and dist frames overlap\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
+       /*
+        * Initialize the vgic if this hasn't already been done on demand by
+        * accessing the vgic state from userspace.
+        */
+       ret = vgic_init(kvm);
+       if (ret) {
+               kvm_err("Unable to initialize VGIC dynamic data structures\n");
+               goto out;
+       }
+
+       ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V2);
+       if (ret) {
+               kvm_err("Unable to register VGIC MMIO regions\n");
+               goto out;
+       }
+
+       ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base,
+                                   kvm_vgic_global_state.vcpu_base,
+                                   KVM_VGIC_V2_CPU_SIZE, true);
+       if (ret) {
+               kvm_err("Unable to remap VGIC CPU to VCPU\n");
+               goto out;
+       }
+
+       dist->ready = true;
+
+out:
+       if (ret)
+               kvm_vgic_destroy(kvm);
+       return ret;
+}
+
+/**
+ * vgic_v2_probe - probe for a GICv2 compatible interrupt controller
+ * @info:      pointer to the GIC KVM info structure
+ *
+ * Returns 0 if a GICv2 has been found, returns an error code otherwise
+ */
+int vgic_v2_probe(const struct gic_kvm_info *info)
+{
+       int ret;
+       u32 vtr;
+
+       if (!info->vctrl.start) {
+               kvm_err("GICH not present in the firmware table\n");
+               return -ENXIO;
+       }
+
+       if (!PAGE_ALIGNED(info->vcpu.start)) {
+               kvm_err("GICV physical address 0x%llx not page aligned\n",
+                       (unsigned long long)info->vcpu.start);
+               return -ENXIO;
+       }
+
+       if (!PAGE_ALIGNED(resource_size(&info->vcpu))) {
+               kvm_err("GICV size 0x%llx not a multiple of page size 0x%lx\n",
+                       (unsigned long long)resource_size(&info->vcpu),
+                       PAGE_SIZE);
+               return -ENXIO;
+       }
+
+       kvm_vgic_global_state.vctrl_base = ioremap(info->vctrl.start,
+                                                  resource_size(&info->vctrl));
+       if (!kvm_vgic_global_state.vctrl_base) {
+               kvm_err("Cannot ioremap GICH\n");
+               return -ENOMEM;
+       }
+
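+       /* GICH_VTR.ListRegs encodes the number of implemented LRs minus one. */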
+       vtr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VTR);
+       kvm_vgic_global_state.nr_lr = (vtr & 0x3f) + 1;
+
+       ret = create_hyp_io_mappings(kvm_vgic_global_state.vctrl_base,
+                                    kvm_vgic_global_state.vctrl_base +
+                                        resource_size(&info->vctrl),
+                                    info->vctrl.start);
+
+       if (ret) {
+               kvm_err("Cannot map VCTRL into hyp\n");
+               iounmap(kvm_vgic_global_state.vctrl_base);
+               return ret;
+       }
+
+       kvm_vgic_global_state.can_emulate_gicv2 = true;
+       kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
+
+       kvm_vgic_global_state.vcpu_base = info->vcpu.start;
+       kvm_vgic_global_state.type = VGIC_V2;
+       kvm_vgic_global_state.max_gic_vcpus = VGIC_V2_MAX_CPUS;
+
+       kvm_info("vgic-v2@%llx\n", info->vctrl.start);
+
+       return 0;
+}
diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c
new file mode 100644 (file)
index 0000000..346b4ad
--- /dev/null
@@ -0,0 +1,334 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/irqchip/arm-gic-v3.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <kvm/arm_vgic.h>
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_asm.h>
+
+#include "vgic.h"
+
+void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3;
+       u32 model = vcpu->kvm->arch.vgic.vgic_model;
+
+       if (cpuif->vgic_misr & ICH_MISR_EOI) {
+               unsigned long eisr_bmap = cpuif->vgic_eisr;
+               int lr;
+
+               for_each_set_bit(lr, &eisr_bmap, kvm_vgic_global_state.nr_lr) {
+                       u32 intid;
+                       u64 val = cpuif->vgic_lr[lr];
+
+                       if (model == KVM_DEV_TYPE_ARM_VGIC_V3)
+                               intid = val & ICH_LR_VIRTUAL_ID_MASK;
+                       else
+                               intid = val & GICH_LR_VIRTUALID;
+
+                       WARN_ON(cpuif->vgic_lr[lr] & ICH_LR_STATE);
+
+                       kvm_notify_acked_irq(vcpu->kvm, 0,
+                                            intid - VGIC_NR_PRIVATE_IRQS);
+               }
+
+               /*
+                * In the next iterations of the vcpu loop, if we sync
+                * the vgic state after flushing it, but before
+                * entering the guest (this happens for pending
+                * signals and vmid rollovers), then make sure we
+                * don't pick up any old maintenance interrupts here.
+                */
+               cpuif->vgic_eisr = 0;
+       }
+
+       cpuif->vgic_hcr &= ~ICH_HCR_UIE;
+}
+
+void vgic_v3_set_underflow(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3;
+
+       cpuif->vgic_hcr |= ICH_HCR_UIE;
+}
+
+void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3;
+       u32 model = vcpu->kvm->arch.vgic.vgic_model;
+       int lr;
+
+       for (lr = 0; lr < vcpu->arch.vgic_cpu.used_lrs; lr++) {
+               u64 val = cpuif->vgic_lr[lr];
+               u32 intid;
+               struct vgic_irq *irq;
+
+               if (model == KVM_DEV_TYPE_ARM_VGIC_V3)
+                       intid = val & ICH_LR_VIRTUAL_ID_MASK;
+               else
+                       intid = val & GICH_LR_VIRTUALID;
+               irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
+
+               spin_lock(&irq->irq_lock);
+
+               /* Always preserve the active bit */
+               irq->active = !!(val & ICH_LR_ACTIVE_BIT);
+
+               /* Edge is the only case where we preserve the pending bit */
+               if (irq->config == VGIC_CONFIG_EDGE &&
+                   (val & ICH_LR_PENDING_BIT)) {
+                       irq->pending = true;
+
+                       if (vgic_irq_is_sgi(intid) &&
+                           model == KVM_DEV_TYPE_ARM_VGIC_V2) {
+                               u32 cpuid = val & GICH_LR_PHYSID_CPUID;
+
+                               cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
+                               irq->source |= (1 << cpuid);
+                       }
+               }
+
+               /*
+                * Clear soft pending state when level irqs have been acked.
+                * Always regenerate the pending state.
+                */
+               if (irq->config == VGIC_CONFIG_LEVEL) {
+                       if (!(val & ICH_LR_PENDING_BIT))
+                               irq->soft_pending = false;
+
+                       irq->pending = irq->line_level || irq->soft_pending;
+               }
+
+               spin_unlock(&irq->irq_lock);
+       }
+}
+
+/* Requires the irq to be locked already */
+void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
+{
+       u32 model = vcpu->kvm->arch.vgic.vgic_model;
+       u64 val = irq->intid;
+
+       if (irq->pending) {
+               val |= ICH_LR_PENDING_BIT;
+
+               if (irq->config == VGIC_CONFIG_EDGE)
+                       irq->pending = false;
+
+               if (vgic_irq_is_sgi(irq->intid) &&
+                   model == KVM_DEV_TYPE_ARM_VGIC_V2) {
+                       u32 src = ffs(irq->source);
+
+                       BUG_ON(!src);
+                       val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT;
+                       irq->source &= ~(1 << (src - 1));
+                       if (irq->source)
+                               irq->pending = true;
+               }
+       }
+
+       if (irq->active)
+               val |= ICH_LR_ACTIVE_BIT;
+
+       if (irq->hw) {
+               val |= ICH_LR_HW;
+               val |= ((u64)irq->hwintid) << ICH_LR_PHYS_ID_SHIFT;
+       } else {
+               if (irq->config == VGIC_CONFIG_LEVEL)
+                       val |= ICH_LR_EOI;
+       }
+
+       /*
+        * We currently only support Group1 interrupts, which is a
+        * known defect. This needs to be addressed at some point.
+        */
+       if (model == KVM_DEV_TYPE_ARM_VGIC_V3)
+               val |= ICH_LR_GROUP;
+
+       val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT;
+
+       vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val;
+}
+
+void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr)
+{
+       vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = 0;
+}
+
+void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
+{
+       u32 vmcr;
+
+       vmcr  = (vmcrp->ctlr << ICH_VMCR_CTLR_SHIFT) & ICH_VMCR_CTLR_MASK;
+       vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK;
+       vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK;
+       vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK;
+
+       vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr = vmcr;
+}
+
+void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
+{
+       u32 vmcr = vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr;
+
+       vmcrp->ctlr = (vmcr & ICH_VMCR_CTLR_MASK) >> ICH_VMCR_CTLR_SHIFT;
+       vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT;
+       vmcrp->bpr  = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT;
+       vmcrp->pmr  = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT;
+}
+
+void vgic_v3_enable(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3;
+
+       /*
+        * By forcing VMCR to zero, the GIC will restore the binary
+        * points to their reset values. Anything else resets to zero
+        * anyway.
+        */
+       vgic_v3->vgic_vmcr = 0;
+       vgic_v3->vgic_elrsr = ~0;
+
+       /*
+        * If we are emulating a GICv3, we do it in a non-GICv2-compatible
+        * way, so we force SRE to 1 to demonstrate this to the guest.
+        * This goes with the spec allowing the value to be RAO/WI.
+        */
+       if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
+               vgic_v3->vgic_sre = ICC_SRE_EL1_SRE;
+       else
+               vgic_v3->vgic_sre = 0;
+
+       /* Get the show on the road... */
+       vgic_v3->vgic_hcr = ICH_HCR_EN;
+}
+
+/* check for overlapping regions and for regions crossing the end of memory */
+static bool vgic_v3_check_base(struct kvm *kvm)
+{
+       struct vgic_dist *d = &kvm->arch.vgic;
+       gpa_t redist_size = KVM_VGIC_V3_REDIST_SIZE;
+
+       redist_size *= atomic_read(&kvm->online_vcpus);
+
+       if (d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE < d->vgic_dist_base)
+               return false;
+       if (d->vgic_redist_base + redist_size < d->vgic_redist_base)
+               return false;
+
+       if (d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE <= d->vgic_redist_base)
+               return true;
+       if (d->vgic_redist_base + redist_size <= d->vgic_dist_base)
+               return true;
+
+       return false;
+}
+
+int vgic_v3_map_resources(struct kvm *kvm)
+{
+       int ret = 0;
+       struct vgic_dist *dist = &kvm->arch.vgic;
+
+       if (vgic_ready(kvm))
+               goto out;
+
+       if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
+           IS_VGIC_ADDR_UNDEF(dist->vgic_redist_base)) {
+               kvm_err("Need to set vgic distributor addresses first\n");
+               ret = -ENXIO;
+               goto out;
+       }
+
+       if (!vgic_v3_check_base(kvm)) {
+               kvm_err("VGIC redist and dist frames overlap\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
+       /*
+        * For a VGICv3 we require the userland to explicitly initialize
+        * the VGIC before we need to use it.
+        */
+       if (!vgic_initialized(kvm)) {
+               ret = -EBUSY;
+               goto out;
+       }
+
+       ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V3);
+       if (ret) {
+               kvm_err("Unable to register VGICv3 dist MMIO regions\n");
+               goto out;
+       }
+
+       ret = vgic_register_redist_iodevs(kvm, dist->vgic_redist_base);
+       if (ret) {
+               kvm_err("Unable to register VGICv3 redist MMIO regions\n");
+               goto out;
+       }
+
+       dist->ready = true;
+
+out:
+       if (ret)
+               kvm_vgic_destroy(kvm);
+       return ret;
+}
+
+/**
+ * vgic_v3_probe - probe for a GICv3 compatible interrupt controller
+ * @info:      pointer to the GIC KVM info structure
+ *
+ * Returns 0 if a GICv3 has been found, returns an error code otherwise
+ */
+int vgic_v3_probe(const struct gic_kvm_info *info)
+{
+       u32 ich_vtr_el2 = kvm_call_hyp(__vgic_v3_get_ich_vtr_el2);
+
+       /*
+        * The ListRegs field is 5 bits, but there is an architectural
+        * maximum of 16 list registers. Just ignore bit 4...
+        */
+       kvm_vgic_global_state.nr_lr = (ich_vtr_el2 & 0xf) + 1;
+       kvm_vgic_global_state.can_emulate_gicv2 = false;
+
+       if (!info->vcpu.start) {
+               kvm_info("GICv3: no GICV resource entry\n");
+               kvm_vgic_global_state.vcpu_base = 0;
+       } else if (!PAGE_ALIGNED(info->vcpu.start)) {
+               pr_warn("GICV physical address 0x%llx not page aligned\n",
+                       (unsigned long long)info->vcpu.start);
+               kvm_vgic_global_state.vcpu_base = 0;
+       } else if (!PAGE_ALIGNED(resource_size(&info->vcpu))) {
+               pr_warn("GICV size 0x%llx not a multiple of page size 0x%lx\n",
+                       (unsigned long long)resource_size(&info->vcpu),
+                       PAGE_SIZE);
+               kvm_vgic_global_state.vcpu_base = 0;
+       } else {
+               kvm_vgic_global_state.vcpu_base = info->vcpu.start;
+               kvm_vgic_global_state.can_emulate_gicv2 = true;
+               kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
+               kvm_info("vgic-v2@%llx\n", info->vcpu.start);
+       }
+       if (kvm_vgic_global_state.vcpu_base == 0)
+               kvm_info("disabling GICv2 emulation\n");
+       kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3);
+
+       kvm_vgic_global_state.vctrl_base = NULL;
+       kvm_vgic_global_state.type = VGIC_V3;
+       kvm_vgic_global_state.max_gic_vcpus = VGIC_V3_MAX_CPUS;
+
+       return 0;
+}
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
new file mode 100644 (file)
index 0000000..69b61ab
--- /dev/null
@@ -0,0 +1,619 @@
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/list_sort.h>
+
+#include "vgic.h"
+
+#define CREATE_TRACE_POINTS
+#include "../trace.h"
+
+#ifdef CONFIG_DEBUG_SPINLOCK
+#define DEBUG_SPINLOCK_BUG_ON(p) BUG_ON(p)
+#else
+#define DEBUG_SPINLOCK_BUG_ON(p)
+#endif
+
+struct vgic_global __section(.hyp.text) kvm_vgic_global_state;
+
+/*
+ * Locking order is always:
+ *   vgic_cpu->ap_list_lock
+ *     vgic_irq->irq_lock
+ *
+ * (that is, always take the ap_list_lock before the struct vgic_irq lock).
+ *
+ * When taking more than one ap_list_lock at the same time, always take the
+ * lowest numbered VCPU's ap_list_lock first, so:
+ *   vcpuX->vcpu_id < vcpuY->vcpu_id:
+ *     spin_lock(vcpuX->arch.vgic_cpu.ap_list_lock);
+ *     spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock);
+ */
+
+struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
+                             u32 intid)
+{
+       /* SGIs and PPIs */
+       if (intid <= VGIC_MAX_PRIVATE)
+               return &vcpu->arch.vgic_cpu.private_irqs[intid];
+
+       /* SPIs */
+       if (intid <= VGIC_MAX_SPI)
+               return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS];
+
+       /* LPIs are not yet covered */
+       if (intid >= VGIC_MIN_LPI)
+               return NULL;
+
+       WARN(1, "Looking up struct vgic_irq for reserved INTID");
+       return NULL;
+}
+
+/**
+ * vgic_target_oracle - compute the target vcpu for an irq
+ *
+ * @irq:       The irq to route. Must be already locked.
+ *
+ * Based on the current state of the interrupt (enabled, pending,
+ * active, vcpu and target_vcpu), compute the next vcpu this should be
+ * given to. Return NULL if this shouldn't be injected at all.
+ *
+ * Requires the IRQ lock to be held.
+ */
+static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq)
+{
+       DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&irq->irq_lock));
+
+       /* If the interrupt is active, it must stay on the current vcpu */
+       if (irq->active)
+               return irq->vcpu ? : irq->target_vcpu;
+
+       /*
+        * If the IRQ is not active but enabled and pending, we should direct
+        * it to its configured target VCPU.
+        * If the distributor is disabled, pending interrupts shouldn't be
+        * forwarded.
+        */
+       if (irq->enabled && irq->pending) {
+               if (unlikely(irq->target_vcpu &&
+                            !irq->target_vcpu->kvm->arch.vgic.enabled))
+                       return NULL;
+
+               return irq->target_vcpu;
+       }
+
+       /*
+        * If the IRQ is neither active nor both pending and enabled, it
+        * should not be queued to any VCPU.
+        */
+       return NULL;
+}
+
+/*
+ * The order of items in the ap_lists defines how we'll pack things in LRs as
+ * well, the first items in the list being the first things populated in the
+ * LRs.
+ *
+ * A hard rule is that active interrupts can never be pushed out of the LRs
+ * (and therefore take priority) since we cannot reliably trap on deactivation
+ * of IRQs and therefore they have to be present in the LRs.
+ *
+ * Otherwise things should be sorted by the priority field and the GIC
+ * hardware support will take care of preemption of priority groups etc.
+ *
+ * Return negative if "a" sorts before "b", 0 to preserve order, and positive
+ * to sort "b" before "a".
+ */
+static int vgic_irq_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+       struct vgic_irq *irqa = container_of(a, struct vgic_irq, ap_list);
+       struct vgic_irq *irqb = container_of(b, struct vgic_irq, ap_list);
+       bool penda, pendb;
+       int ret;
+
+       spin_lock(&irqa->irq_lock);
+       spin_lock_nested(&irqb->irq_lock, SINGLE_DEPTH_NESTING);
+
+       if (irqa->active || irqb->active) {
+               ret = (int)irqb->active - (int)irqa->active;
+               goto out;
+       }
+
+       penda = irqa->enabled && irqa->pending;
+       pendb = irqb->enabled && irqb->pending;
+
+       if (!penda || !pendb) {
+               ret = (int)pendb - (int)penda;
+               goto out;
+       }
+
+       /* Both pending and enabled, sort by priority */
+       ret = irqa->priority - irqb->priority;
+out:
+       spin_unlock(&irqb->irq_lock);
+       spin_unlock(&irqa->irq_lock);
+       return ret;
+}
+
+/* Must be called with the ap_list_lock held */
+static void vgic_sort_ap_list(struct kvm_vcpu *vcpu)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+       DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&vgic_cpu->ap_list_lock));
+
+       list_sort(NULL, &vgic_cpu->ap_list_head, vgic_irq_cmp);
+}
+
+/*
+ * Only valid injection if changing level for level-triggered IRQs or for a
+ * rising edge.
+ */
+static bool vgic_validate_injection(struct vgic_irq *irq, bool level)
+{
+       switch (irq->config) {
+       case VGIC_CONFIG_LEVEL:
+               return irq->line_level != level;
+       case VGIC_CONFIG_EDGE:
+               return level;
+       }
+
+       return false;
+}
+
+/*
+ * Check whether an IRQ needs to (and can) be queued to a VCPU's ap list.
+ * Do the queuing if necessary, taking the right locks in the right order.
+ * Returns true when the IRQ was queued, false otherwise.
+ *
+ * Needs to be entered with the IRQ lock already held, but will return
+ * with all locks dropped.
+ */
+bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq)
+{
+       struct kvm_vcpu *vcpu;
+
+       DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&irq->irq_lock));
+
+retry:
+       vcpu = vgic_target_oracle(irq);
+       if (irq->vcpu || !vcpu) {
+               /*
+                * If this IRQ is already on a VCPU's ap_list, then it
+                * cannot be moved or modified and there is no more work for
+                * us to do.
+                *
+                * Otherwise, if the irq is not pending and enabled, it does
+                * not need to be inserted into an ap_list and there is also
+                * no more work for us to do.
+                */
+               spin_unlock(&irq->irq_lock);
+               return false;
+       }
+
+       /*
+        * We must unlock the irq lock to take the ap_list_lock where
+        * we are going to insert this new pending interrupt.
+        */
+       spin_unlock(&irq->irq_lock);
+
+       /* Another CPU may change the IRQ state here; we re-check it below. */
+
+       spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
+       spin_lock(&irq->irq_lock);
+
+       /*
+        * Did something change behind our backs?
+        *
+        * There are two cases:
+        * 1) The irq lost its pending state or was disabled behind our
+        *    backs and/or it was queued to another VCPU's ap_list.
+        * 2) Someone changed the affinity on this irq behind our
+        *    backs and we are now holding the wrong ap_list_lock.
+        *
+        * In both cases, drop the locks and retry.
+        */
+
+       if (unlikely(irq->vcpu || vcpu != vgic_target_oracle(irq))) {
+               spin_unlock(&irq->irq_lock);
+               spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
+
+               spin_lock(&irq->irq_lock);
+               goto retry;
+       }
+
+       list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head);
+       irq->vcpu = vcpu;
+
+       spin_unlock(&irq->irq_lock);
+       spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
+
+       kvm_vcpu_kick(vcpu);
+
+       return true;
+}
+
+static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
+                                  unsigned int intid, bool level,
+                                  bool mapped_irq)
+{
+       struct kvm_vcpu *vcpu;
+       struct vgic_irq *irq;
+       int ret;
+
+       trace_vgic_update_irq_pending(cpuid, intid, level);
+
+       ret = vgic_lazy_init(kvm);
+       if (ret)
+               return ret;
+
+       vcpu = kvm_get_vcpu(kvm, cpuid);
+       if (!vcpu && intid < VGIC_NR_PRIVATE_IRQS)
+               return -EINVAL;
+
+       irq = vgic_get_irq(kvm, vcpu, intid);
+       if (!irq)
+               return -EINVAL;
+
+       if (irq->hw != mapped_irq)
+               return -EINVAL;
+
+       spin_lock(&irq->irq_lock);
+
+       if (!vgic_validate_injection(irq, level)) {
+               /* Nothing to see here, move along... */
+               spin_unlock(&irq->irq_lock);
+               return 0;
+       }
+
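+       /*
+        * Latch the new state: level IRQs track the line level (plus any
+        * software-pended state), edge IRQs simply become pending.
+        */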
+       if (irq->config == VGIC_CONFIG_LEVEL) {
+               irq->line_level = level;
+               irq->pending = level || irq->soft_pending;
+       } else {
+               irq->pending = true;
+       }
+
+       vgic_queue_irq_unlock(kvm, irq);
+
+       return 0;
+}
+
+/**
+ * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic
+ * @kvm:     The VM structure pointer
+ * @cpuid:   The CPU for PPIs
+ * @intid:   The INTID to inject a new state to.
+ * @level:   Edge-triggered:  true:  to trigger the interrupt
+ *                            false: to ignore the call
+ *           Level-sensitive: true:  raise the input signal
+ *                            false: lower the input signal
+ *
+ * The VGIC is not concerned with devices being active-LOW or active-HIGH for
+ * level-sensitive interrupts.  You can think of the level parameter as 1
+ * being HIGH and 0 being LOW and all devices being active-HIGH.
+ */
+int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
+                       bool level)
+{
+       return vgic_update_irq_pending(kvm, cpuid, intid, level, false);
+}
+
+int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, unsigned int intid,
+                              bool level)
+{
+       return vgic_update_irq_pending(kvm, cpuid, intid, level, true);
+}
+
+int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq)
+{
+       struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq);
+
+       BUG_ON(!irq);
+
+       spin_lock(&irq->irq_lock);
+
+       irq->hw = true;
+       irq->hwintid = phys_irq;
+
+       spin_unlock(&irq->irq_lock);
+
+       return 0;
+}
+
+int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq)
+{
+       struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq);
+
+       BUG_ON(!irq);
+
+       if (!vgic_initialized(vcpu->kvm))
+               return -EAGAIN;
+
+       spin_lock(&irq->irq_lock);
+
+       irq->hw = false;
+       irq->hwintid = 0;
+
+       spin_unlock(&irq->irq_lock);
+
+       return 0;
+}
+
+/**
+ * vgic_prune_ap_list - Remove non-relevant interrupts from the list
+ *
+ * @vcpu: The VCPU pointer
+ *
+ * Go over the list of "interesting" interrupts, and prune those that we
+ * won't have to consider in the near future.
+ */
+static void vgic_prune_ap_list(struct kvm_vcpu *vcpu)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+       struct vgic_irq *irq, *tmp;
+
+retry:
+       spin_lock(&vgic_cpu->ap_list_lock);
+
+       list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) {
+               struct kvm_vcpu *target_vcpu, *vcpuA, *vcpuB;
+
+               spin_lock(&irq->irq_lock);
+
+               BUG_ON(vcpu != irq->vcpu);
+
+               target_vcpu = vgic_target_oracle(irq);
+
+               if (!target_vcpu) {
+                       /*
+                        * We don't need to process this interrupt any
+                        * further, move it off the list.
+                        */
+                       list_del(&irq->ap_list);
+                       irq->vcpu = NULL;
+                       spin_unlock(&irq->irq_lock);
+                       continue;
+               }
+
+               if (target_vcpu == vcpu) {
+                       /* We're on the right CPU */
+                       spin_unlock(&irq->irq_lock);
+                       continue;
+               }
+
+               /* This interrupt looks like it has to be migrated. */
+
+               spin_unlock(&irq->irq_lock);
+               spin_unlock(&vgic_cpu->ap_list_lock);
+
+               /*
+                * Ensure locking order by always locking the smallest
+                * ID first.
+                */
+               if (vcpu->vcpu_id < target_vcpu->vcpu_id) {
+                       vcpuA = vcpu;
+                       vcpuB = target_vcpu;
+               } else {
+                       vcpuA = target_vcpu;
+                       vcpuB = vcpu;
+               }
+
+               spin_lock(&vcpuA->arch.vgic_cpu.ap_list_lock);
+               spin_lock_nested(&vcpuB->arch.vgic_cpu.ap_list_lock,
+                                SINGLE_DEPTH_NESTING);
+               spin_lock(&irq->irq_lock);
+
+               /*
+                * If the affinity has been preserved, move the
+                * interrupt around. Otherwise, it means things have
+                * changed while the interrupt was unlocked, and we
+                * need to replay this.
+                *
+                * In all cases, we cannot trust the list not to have
+                * changed, so we restart from the beginning.
+                */
+               if (target_vcpu == vgic_target_oracle(irq)) {
+                       struct vgic_cpu *new_cpu = &target_vcpu->arch.vgic_cpu;
+
+                       list_del(&irq->ap_list);
+                       irq->vcpu = target_vcpu;
+                       list_add_tail(&irq->ap_list, &new_cpu->ap_list_head);
+               }
+
+               spin_unlock(&irq->irq_lock);
+               spin_unlock(&vcpuB->arch.vgic_cpu.ap_list_lock);
+               spin_unlock(&vcpuA->arch.vgic_cpu.ap_list_lock);
+               goto retry;
+       }
+
+       spin_unlock(&vgic_cpu->ap_list_lock);
+}
+
+static inline void vgic_process_maintenance_interrupt(struct kvm_vcpu *vcpu)
+{
+       if (kvm_vgic_global_state.type == VGIC_V2)
+               vgic_v2_process_maintenance(vcpu);
+       else
+               vgic_v3_process_maintenance(vcpu);
+}
+
+static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu)
+{
+       if (kvm_vgic_global_state.type == VGIC_V2)
+               vgic_v2_fold_lr_state(vcpu);
+       else
+               vgic_v3_fold_lr_state(vcpu);
+}
+
+/* Requires the irq_lock to be held. */
+static inline void vgic_populate_lr(struct kvm_vcpu *vcpu,
+                                   struct vgic_irq *irq, int lr)
+{
+       DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&irq->irq_lock));
+
+       if (kvm_vgic_global_state.type == VGIC_V2)
+               vgic_v2_populate_lr(vcpu, irq, lr);
+       else
+               vgic_v3_populate_lr(vcpu, irq, lr);
+}
+
+static inline void vgic_clear_lr(struct kvm_vcpu *vcpu, int lr)
+{
+       if (kvm_vgic_global_state.type == VGIC_V2)
+               vgic_v2_clear_lr(vcpu, lr);
+       else
+               vgic_v3_clear_lr(vcpu, lr);
+}
+
+static inline void vgic_set_underflow(struct kvm_vcpu *vcpu)
+{
+       if (kvm_vgic_global_state.type == VGIC_V2)
+               vgic_v2_set_underflow(vcpu);
+       else
+               vgic_v3_set_underflow(vcpu);
+}
+
+/* Requires the ap_list_lock to be held. */
+static int compute_ap_list_depth(struct kvm_vcpu *vcpu)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+       struct vgic_irq *irq;
+       int count = 0;
+
+       DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&vgic_cpu->ap_list_lock));
+
+       list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
+               spin_lock(&irq->irq_lock);
+               /* GICv2 SGIs can count for more than one... */
+               if (vgic_irq_is_sgi(irq->intid) && irq->source)
+                       count += hweight8(irq->source);
+               else
+                       count++;
+               spin_unlock(&irq->irq_lock);
+       }
+       return count;
+}
+
+/* Requires the VCPU's ap_list_lock to be held. */
+static void vgic_flush_lr_state(struct kvm_vcpu *vcpu)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+       struct vgic_irq *irq;
+       int count = 0;
+
+       DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&vgic_cpu->ap_list_lock));
+
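+       /*
+        * More pending interrupts than LRs: arm the underflow maintenance
+        * interrupt and sort the list so the most urgent entries get an LR.
+        */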
+       if (compute_ap_list_depth(vcpu) > kvm_vgic_global_state.nr_lr) {
+               vgic_set_underflow(vcpu);
+               vgic_sort_ap_list(vcpu);
+       }
+
+       list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
+               spin_lock(&irq->irq_lock);
+
+               if (unlikely(vgic_target_oracle(irq) != vcpu))
+                       goto next;
+
+               /*
+                * If we get an SGI with multiple sources, try to get
+                * them in all at once.
+                */
+               do {
+                       vgic_populate_lr(vcpu, irq, count++);
+               } while (irq->source && count < kvm_vgic_global_state.nr_lr);
+
+next:
+               spin_unlock(&irq->irq_lock);
+
+               if (count == kvm_vgic_global_state.nr_lr)
+                       break;
+       }
+
+       vcpu->arch.vgic_cpu.used_lrs = count;
+
+       /* Nuke remaining LRs */
+       for ( ; count < kvm_vgic_global_state.nr_lr; count++)
+               vgic_clear_lr(vcpu, count);
+}
+
+/* Sync back the hardware VGIC state into our emulation after a guest's run. */
+void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
+{
+       vgic_process_maintenance_interrupt(vcpu);
+       vgic_fold_lr_state(vcpu);
+       vgic_prune_ap_list(vcpu);
+}
+
+/* Flush our emulation state into the GIC hardware before entering the guest. */
+void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
+{
+       spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
+       vgic_flush_lr_state(vcpu);
+       spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
+}
+
+int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+       struct vgic_irq *irq;
+       bool pending = false;
+
+       if (!vcpu->kvm->arch.vgic.enabled)
+               return false;
+
+       spin_lock(&vgic_cpu->ap_list_lock);
+
+       list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
+               spin_lock(&irq->irq_lock);
+               pending = irq->pending && irq->enabled;
+               spin_unlock(&irq->irq_lock);
+
+               if (pending)
+                       break;
+       }
+
+       spin_unlock(&vgic_cpu->ap_list_lock);
+
+       return pending;
+}
+
+void vgic_kick_vcpus(struct kvm *kvm)
+{
+       struct kvm_vcpu *vcpu;
+       int c;
+
+       /*
+        * We've injected an interrupt, time to find out who deserves
+        * a good kick...
+        */
+       kvm_for_each_vcpu(c, vcpu, kvm) {
+               if (kvm_vgic_vcpu_pending_irq(vcpu))
+                       kvm_vcpu_kick(vcpu);
+       }
+}
+
+bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq)
+{
+       struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq);
+       bool map_is_active;
+
+       spin_lock(&irq->irq_lock);
+       map_is_active = irq->hw && irq->active;
+       spin_unlock(&irq->irq_lock);
+
+       return map_is_active;
+}
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
new file mode 100644 (file)
index 0000000..7b300ca
--- /dev/null
@@ -0,0 +1,131 @@
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef __KVM_ARM_VGIC_NEW_H__
+#define __KVM_ARM_VGIC_NEW_H__
+
+#include <linux/irqchip/arm-gic-common.h>
+
+#define PRODUCT_ID_KVM         0x4b    /* ASCII code K */
+#define IMPLEMENTER_ARM                0x43b
+
+#define VGIC_ADDR_UNDEF                (-1)
+#define IS_VGIC_ADDR_UNDEF(_x)  ((_x) == VGIC_ADDR_UNDEF)
+
+#define INTERRUPT_ID_BITS_SPIS 10
+#define VGIC_PRI_BITS          5
+
+#define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS)
+
+struct vgic_vmcr {
+       u32     ctlr;
+       u32     abpr;
+       u32     bpr;
+       u32     pmr;
+};
+
+struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
+                             u32 intid);
+bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq);
+void vgic_kick_vcpus(struct kvm *kvm);
+
+void vgic_v2_process_maintenance(struct kvm_vcpu *vcpu);
+void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu);
+void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
+void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr);
+void vgic_v2_set_underflow(struct kvm_vcpu *vcpu);
+int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr);
+int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+                        int offset, u32 *val);
+int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+                         int offset, u32 *val);
+void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+void vgic_v2_enable(struct kvm_vcpu *vcpu);
+int vgic_v2_probe(const struct gic_kvm_info *info);
+int vgic_v2_map_resources(struct kvm *kvm);
+int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
+                            enum vgic_type);
+
+#ifdef CONFIG_KVM_ARM_VGIC_V3
+void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu);
+void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu);
+void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
+void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr);
+void vgic_v3_set_underflow(struct kvm_vcpu *vcpu);
+void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+void vgic_v3_enable(struct kvm_vcpu *vcpu);
+int vgic_v3_probe(const struct gic_kvm_info *info);
+int vgic_v3_map_resources(struct kvm *kvm);
+int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t dist_base_address);
+#else
+static inline void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline void vgic_v3_populate_lr(struct kvm_vcpu *vcpu,
+                                      struct vgic_irq *irq, int lr)
+{
+}
+
+static inline void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr)
+{
+}
+
+static inline void vgic_v3_set_underflow(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline
+void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
+{
+}
+
+static inline
+void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
+{
+}
+
+static inline void vgic_v3_enable(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline int vgic_v3_probe(const struct gic_kvm_info *info)
+{
+       return -ENODEV;
+}
+
+static inline int vgic_v3_map_resources(struct kvm *kvm)
+{
+       return -ENODEV;
+}
+
+static inline int vgic_register_redist_iodevs(struct kvm *kvm,
+                                             gpa_t dist_base_address)
+{
+       return -ENODEV;
+}
+#endif
+
+void kvm_register_vgic_device(unsigned long type);
+int vgic_lazy_init(struct kvm *kvm);
+int vgic_init(struct kvm *kvm);
+
+#endif
index fe84e1a..8db197b 100644 (file)
@@ -40,7 +40,7 @@ int kvm_irq_map_gsi(struct kvm *kvm,
 
        irq_rt = srcu_dereference_check(kvm->irq_routing, &kvm->irq_srcu,
                                        lockdep_is_held(&kvm->irq_lock));
-       if (gsi < irq_rt->nr_rt_entries) {
+       if (irq_rt && gsi < irq_rt->nr_rt_entries) {
                hlist_for_each_entry(e, &irq_rt->map[gsi], link) {
                        entries[n] = *e;
                        ++n;
index dd4ac9d..02e98f3 100644 (file)
@@ -63,6 +63,9 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/kvm.h>
 
+/* Worst case buffer size needed for holding an integer. */
+#define ITOA_MAX_LEN 12
+
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
@@ -100,6 +103,9 @@ static __read_mostly struct preempt_ops kvm_preempt_ops;
 struct dentry *kvm_debugfs_dir;
 EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
 
+static int kvm_debugfs_num_entries;
+static const struct file_operations *stat_fops_per_vm[];
+
 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
                           unsigned long arg);
 #ifdef CONFIG_KVM_COMPAT
@@ -542,6 +548,58 @@ static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
        kvfree(slots);
 }
 
+static void kvm_destroy_vm_debugfs(struct kvm *kvm)
+{
+       int i;
+
+       if (!kvm->debugfs_dentry)
+               return;
+
+       debugfs_remove_recursive(kvm->debugfs_dentry);
+
+       for (i = 0; i < kvm_debugfs_num_entries; i++)
+               kfree(kvm->debugfs_stat_data[i]);
+       kfree(kvm->debugfs_stat_data);
+}
+
+static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
+{
+       char dir_name[ITOA_MAX_LEN * 2];
+       struct kvm_stat_data *stat_data;
+       struct kvm_stats_debugfs_item *p;
+
+       if (!debugfs_initialized())
+               return 0;
+
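+       /* The per-VM directory is named "<creator pid>-<vm fd>". */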
+       snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
+       kvm->debugfs_dentry = debugfs_create_dir(dir_name,
+                                                kvm_debugfs_dir);
+       if (!kvm->debugfs_dentry)
+               return -ENOMEM;
+
+       kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
+                                        sizeof(*kvm->debugfs_stat_data),
+                                        GFP_KERNEL);
+       if (!kvm->debugfs_stat_data)
+               return -ENOMEM;
+
+       for (p = debugfs_entries; p->name; p++) {
+               stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL);
+               if (!stat_data)
+                       return -ENOMEM;
+
+               stat_data->kvm = kvm;
+               stat_data->offset = p->offset;
+               kvm->debugfs_stat_data[p - debugfs_entries] = stat_data;
+               if (!debugfs_create_file(p->name, 0444,
+                                        kvm->debugfs_dentry,
+                                        stat_data,
+                                        stat_fops_per_vm[p->kind]))
+                       return -ENOMEM;
+       }
+       return 0;
+}
+
 static struct kvm *kvm_create_vm(unsigned long type)
 {
        int r, i;
@@ -647,6 +705,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
        int i;
        struct mm_struct *mm = kvm->mm;
 
+       kvm_destroy_vm_debugfs(kvm);
        kvm_arch_sync_events(kvm);
        spin_lock(&kvm_lock);
        list_del(&kvm->vm_list);
@@ -2876,7 +2935,7 @@ static long kvm_vm_ioctl(struct file *filp,
        case KVM_SET_GSI_ROUTING: {
                struct kvm_irq_routing routing;
                struct kvm_irq_routing __user *urouting;
-               struct kvm_irq_routing_entry *entries;
+               struct kvm_irq_routing_entry *entries = NULL;
 
                r = -EFAULT;
                if (copy_from_user(&routing, argp, sizeof(routing)))
@@ -2886,15 +2945,17 @@ static long kvm_vm_ioctl(struct file *filp,
                        goto out;
                if (routing.flags)
                        goto out;
-               r = -ENOMEM;
-               entries = vmalloc(routing.nr * sizeof(*entries));
-               if (!entries)
-                       goto out;
-               r = -EFAULT;
-               urouting = argp;
-               if (copy_from_user(entries, urouting->entries,
-                                  routing.nr * sizeof(*entries)))
-                       goto out_free_irq_routing;
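+               /* A zero routing.nr installs an empty routing table. */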
+               if (routing.nr) {
+                       r = -ENOMEM;
+                       entries = vmalloc(routing.nr * sizeof(*entries));
+                       if (!entries)
+                               goto out;
+                       r = -EFAULT;
+                       urouting = argp;
+                       if (copy_from_user(entries, urouting->entries,
+                                          routing.nr * sizeof(*entries)))
+                               goto out_free_irq_routing;
+               }
                r = kvm_set_irq_routing(kvm, entries, routing.nr,
                                        routing.flags);
 out_free_irq_routing:
@@ -2999,8 +3060,15 @@ static int kvm_dev_ioctl_create_vm(unsigned long type)
        }
 #endif
        r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR | O_CLOEXEC);
-       if (r < 0)
+       if (r < 0) {
+               kvm_put_kvm(kvm);
+               return r;
+       }
+
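+       /* Create the per-VM debugfs entries now that the VM's fd is known. */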
+       if (kvm_create_vm_debugfs(kvm, r) < 0) {
                kvm_put_kvm(kvm);
+               return -ENOMEM;
+       }
 
        return r;
 }
@@ -3425,15 +3493,114 @@ static struct notifier_block kvm_cpu_notifier = {
        .notifier_call = kvm_cpu_hotplug,
 };
 
+static int kvm_debugfs_open(struct inode *inode, struct file *file,
+                          int (*get)(void *, u64 *), int (*set)(void *, u64),
+                          const char *fmt)
+{
+       struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
+                                         inode->i_private;
+
+       /* The debugfs files are a reference to the kvm struct which
+        * is still valid when kvm_destroy_vm is called.
+        * To avoid the race between open and the removal of the debugfs
+        * directory we test against the users count.
+        */
+       if (!atomic_add_unless(&stat_data->kvm->users_count, 1, 0))
+               return -ENOENT;
+
+       if (simple_attr_open(inode, file, get, set, fmt)) {
+               kvm_put_kvm(stat_data->kvm);
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+static int kvm_debugfs_release(struct inode *inode, struct file *file)
+{
+       struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
+                                         inode->i_private;
+
+       simple_attr_release(inode, file);
+       kvm_put_kvm(stat_data->kvm);
+
+       return 0;
+}
+
+static int vm_stat_get_per_vm(void *data, u64 *val)
+{
+       struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
+
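+       /* Each VM stat is a u32 stored at a fixed offset inside struct kvm. */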
+       *val = *(u32 *)((void *)stat_data->kvm + stat_data->offset);
+
+       return 0;
+}
+
+static int vm_stat_get_per_vm_open(struct inode *inode, struct file *file)
+{
+       __simple_attr_check_format("%llu\n", 0ull);
+       return kvm_debugfs_open(inode, file, vm_stat_get_per_vm,
+                               NULL, "%llu\n");
+}
+
+static const struct file_operations vm_stat_get_per_vm_fops = {
+       .owner   = THIS_MODULE,
+       .open    = vm_stat_get_per_vm_open,
+       .release = kvm_debugfs_release,
+       .read    = simple_attr_read,
+       .write   = simple_attr_write,
+       .llseek  = generic_file_llseek,
+};
+
+static int vcpu_stat_get_per_vm(void *data, u64 *val)
+{
+       int i;
+       struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
+       struct kvm_vcpu *vcpu;
+
+       *val = 0;
+
+       kvm_for_each_vcpu(i, vcpu, stat_data->kvm)
+               *val += *(u32 *)((void *)vcpu + stat_data->offset);
+
+       return 0;
+}
+
+static int vcpu_stat_get_per_vm_open(struct inode *inode, struct file *file)
+{
+       __simple_attr_check_format("%llu\n", 0ull);
+       return kvm_debugfs_open(inode, file, vcpu_stat_get_per_vm,
+                                NULL, "%llu\n");
+}
+
+static const struct file_operations vcpu_stat_get_per_vm_fops = {
+       .owner   = THIS_MODULE,
+       .open    = vcpu_stat_get_per_vm_open,
+       .release = kvm_debugfs_release,
+       .read    = simple_attr_read,
+       .write   = simple_attr_write,
+       .llseek  = generic_file_llseek,
+};
+
+static const struct file_operations *stat_fops_per_vm[] = {
+       [KVM_STAT_VCPU] = &vcpu_stat_get_per_vm_fops,
+       [KVM_STAT_VM]   = &vm_stat_get_per_vm_fops,
+};
+
 static int vm_stat_get(void *_offset, u64 *val)
 {
        unsigned offset = (long)_offset;
        struct kvm *kvm;
+       struct kvm_stat_data stat_tmp = {.offset = offset};
+       u64 tmp_val;
 
        *val = 0;
        spin_lock(&kvm_lock);
-       list_for_each_entry(kvm, &vm_list, vm_list)
-               *val += *(u32 *)((void *)kvm + offset);
+       list_for_each_entry(kvm, &vm_list, vm_list) {
+               stat_tmp.kvm = kvm;
+               vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
+               *val += tmp_val;
+       }
        spin_unlock(&kvm_lock);
        return 0;
 }
@@ -3444,15 +3611,16 @@ static int vcpu_stat_get(void *_offset, u64 *val)
 {
        unsigned offset = (long)_offset;
        struct kvm *kvm;
-       struct kvm_vcpu *vcpu;
-       int i;
+       struct kvm_stat_data stat_tmp = {.offset = offset};
+       u64 tmp_val;
 
        *val = 0;
        spin_lock(&kvm_lock);
-       list_for_each_entry(kvm, &vm_list, vm_list)
-               kvm_for_each_vcpu(i, vcpu, kvm)
-                       *val += *(u32 *)((void *)vcpu + offset);
-
+       list_for_each_entry(kvm, &vm_list, vm_list) {
+               stat_tmp.kvm = kvm;
+               vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
+               *val += tmp_val;
+       }
        spin_unlock(&kvm_lock);
        return 0;
 }
@@ -3473,7 +3641,8 @@ static int kvm_init_debug(void)
        if (kvm_debugfs_dir == NULL)
                goto out;
 
-       for (p = debugfs_entries; p->name; ++p) {
+       kvm_debugfs_num_entries = 0;
+       for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) {
                if (!debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
                                         (void *)(long)p->offset,
                                         stat_fops[p->kind]))